Doug / smarc-fsl-linux-kernel

1

/*
 * CFQ, or complete fairness queueing, disk scheduler.
 *
 * Based on ideas from a previously unfinished io
 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
 *
 */

9

#include <linux/module.h>

9

#include <linux/module.h>

10

#include <linux/slab.h>

10

#include <linux/slab.h>

11

#include <linux/blkdev.h>

11

#include <linux/blkdev.h>

12

#include <linux/elevator.h>

12

#include <linux/elevator.h>

13

#include <linux/jiffies.h>

13

#include <linux/jiffies.h>

14

#include <linux/rbtree.h>

14

#include <linux/rbtree.h>

15

#include <linux/ioprio.h>

15

#include <linux/ioprio.h>

16

#include <linux/blktrace_api.h>

16

#include <linux/blktrace_api.h>

17

#include "blk.h"

17

#include "blk.h"

18

#include "blk-cgroup.h"

18

#include "blk-cgroup.h"

19

20

static struct blkio_policy_type blkio_policy_cfq __maybe_unused;

20

static struct blkio_policy_type blkio_policy_cfq __maybe_unused;

21

22

/*

22

/*

23

* tunables

23

* tunables

24

*/

24

*/

25

/* max queue in one round of service */

25

/* max queue in one round of service */

26

static const int cfq_quantum = 8;

26

static const int cfq_quantum = 8;

27

static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

27

static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };

28

/* maximum backwards seek, in KiB */

28

/* maximum backwards seek, in KiB */

29

static const int cfq_back_max = 16 * 1024;

29

static const int cfq_back_max = 16 * 1024;

30

/* penalty of a backwards seek */

30

/* penalty of a backwards seek */

31

static const int cfq_back_penalty = 2;

31

static const int cfq_back_penalty = 2;

32

static const int cfq_slice_sync = HZ / 10;

32

static const int cfq_slice_sync = HZ / 10;

33

static int cfq_slice_async = HZ / 25;

33

static int cfq_slice_async = HZ / 25;

34

static const int cfq_slice_async_rq = 2;

34

static const int cfq_slice_async_rq = 2;

35

static int cfq_slice_idle = HZ / 125;

35

static int cfq_slice_idle = HZ / 125;

36

static int cfq_group_idle = HZ / 125;

36

static int cfq_group_idle = HZ / 125;

37

static const int cfq_target_latency = HZ * 3/10; /* 300 ms */

37

static const int cfq_target_latency = HZ * 3/10; /* 300 ms */

38

static const int cfq_hist_divisor = 4;

38

static const int cfq_hist_divisor = 4;

39

40

/*
 * offset from end of service tree
 */
#define CFQ_IDLE_DELAY (HZ / 5)

/*
 * below this threshold, we consider thinktime immediate
 */
#define CFQ_MIN_TT (2)

#define CFQ_SLICE_SCALE (5)
#define CFQ_HW_QUEUE_MIN (5)
#define CFQ_SERVICE_SHIFT 12

53

54

/*
 * Sector-count thresholds, cast to sector_t.  Each expansion is fully
 * parenthesized so it behaves as a single operand (e.g. under sizeof or
 * next to a postfix operator).
 */
#define CFQQ_SEEK_THR ((sector_t)(8 * 100))
#define CFQQ_CLOSE_THR ((sector_t)(8 * 1024))
#define CFQQ_SECT_THR_NONROT ((sector_t)(2 * 32))
/* true when more than 32/8 bits are set in the queue's seek_history */
#define CFQQ_SEEKY(cfqq) (hweight32((cfqq)->seek_history) > 32/8)

/*
 * Accessors for the scheduler-private data attached to a request.
 * The outer parentheses let callers write RQ_CFQQ(rq)->field directly.
 */
#define RQ_CIC(rq) icq_to_cic((rq)->elv.icq)
#define RQ_CFQQ(rq) ((struct cfq_queue *)((rq)->elv.priv[0]))
#define RQ_CFQG(rq) ((struct cfq_group *)((rq)->elv.priv[1]))

62

63

static struct kmem_cache *cfq_pool;

#define CFQ_PRIO_LISTS IOPRIO_BE_NR
/* ioprio class predicates on a cfq_queue */
#define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
#define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)

/* a sample count is considered valid once it exceeds 80 */
#define sample_valid(samples) ((samples) > 80)
/* map an rb_node embedded as cfq_group.rb_node back to its cfq_group */
#define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)

71

72

/*
 * Timing statistics record; embedded in cfq_rb_root, cfq_group and
 * cfq_io_cq.  Presumably tracks "think time" (cf. CFQ_MIN_TT) - confirm
 * against the code that updates it.
 */
struct cfq_ttime {
	/* CFQ_RB_ROOT initialises this to the current jiffies value */
	unsigned long last_end_request;

	unsigned long ttime_total;
	unsigned long ttime_samples;
	unsigned long ttime_mean;
};

79

80

/*

80

/*

81

* Most of our rbtree usage is for sorting with min extraction, so

81

* Most of our rbtree usage is for sorting with min extraction, so

82

* if we cache the leftmost node we don't have to walk down the tree

82

* if we cache the leftmost node we don't have to walk down the tree

83

* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should

83

* to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should

84

* move this into the elevator for the rq sorting as well.

84

* move this into the elevator for the rq sorting as well.

85

*/

85

*/

86

struct cfq_rb_root {

86

struct cfq_rb_root {

87

struct rb_root rb;

87

struct rb_root rb;

88

struct rb_node *left;

88

struct rb_node *left;

89

unsigned count;

89

unsigned count;

90

unsigned total_weight;

90

unsigned total_weight;

91

u64 min_vdisktime;

91

u64 min_vdisktime;

92

struct cfq_ttime ttime;

92

struct cfq_ttime ttime;

93

};

93

};

94

#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \

94

#define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, \

95

.ttime = {.last_end_request = jiffies,},}

95

.ttime = {.last_end_request = jiffies,},}

96

97

/*

97

/*

98

* Per process-grouping structure

98

* Per process-grouping structure

99

*/

99

*/

100

struct cfq_queue {

100

struct cfq_queue {

101

/* reference count */

101

/* reference count */

102

int ref;

102

int ref;

103

/* various state flags, see below */

103

/* various state flags, see below */

104

unsigned int flags;

104

unsigned int flags;

105

/* parent cfq_data */

105

/* parent cfq_data */

106

struct cfq_data *cfqd;

106

struct cfq_data *cfqd;

107

/* service_tree member */

107

/* service_tree member */

108

struct rb_node rb_node;

108

struct rb_node rb_node;

109

/* service_tree key */

109

/* service_tree key */

110

unsigned long rb_key;

110

unsigned long rb_key;

111

/* prio tree member */

111

/* prio tree member */

112

struct rb_node p_node;

112

struct rb_node p_node;

113

/* prio tree root we belong to, if any */

113

/* prio tree root we belong to, if any */

114

struct rb_root *p_root;

114

struct rb_root *p_root;

115

/* sorted list of pending requests */

115

/* sorted list of pending requests */

116

struct rb_root sort_list;

116

struct rb_root sort_list;

117

/* if fifo isn't expired, next request to serve */

117

/* if fifo isn't expired, next request to serve */

118

struct request *next_rq;

118

struct request *next_rq;

119

/* requests queued in sort_list */

119

/* requests queued in sort_list */

120

int queued[2];

120

int queued[2];

121

/* currently allocated requests */

121

/* currently allocated requests */

122

int allocated[2];

122

int allocated[2];

123

/* fifo list of requests in sort_list */

123

/* fifo list of requests in sort_list */

124

struct list_head fifo;

124

struct list_head fifo;

125

126

/* time when queue got scheduled in to dispatch first request. */

126

/* time when queue got scheduled in to dispatch first request. */

127

unsigned long dispatch_start;

127

unsigned long dispatch_start;

128

unsigned int allocated_slice;

128

unsigned int allocated_slice;

129

unsigned int slice_dispatch;

129

unsigned int slice_dispatch;

130

/* time when first request from queue completed and slice started. */

130

/* time when first request from queue completed and slice started. */

131

unsigned long slice_start;

131

unsigned long slice_start;

132

unsigned long slice_end;

132

unsigned long slice_end;

133

long slice_resid;

133

long slice_resid;

134

135

/* pending priority requests */

135

/* pending priority requests */

136

int prio_pending;

136

int prio_pending;

137

/* number of requests that are on the dispatch list or inside driver */

137

/* number of requests that are on the dispatch list or inside driver */

138

int dispatched;

138

int dispatched;

139

140

/* io prio of this group */

140

/* io prio of this group */

141

unsigned short ioprio, org_ioprio;

141

unsigned short ioprio, org_ioprio;

142

unsigned short ioprio_class;

142

unsigned short ioprio_class;

143

144

pid_t pid;

144

pid_t pid;

145

146

u32 seek_history;

146

u32 seek_history;

147

sector_t last_request_pos;

147

sector_t last_request_pos;

148

149

struct cfq_rb_root *service_tree;

149

struct cfq_rb_root *service_tree;

150

struct cfq_queue *new_cfqq;

150

struct cfq_queue *new_cfqq;

151

struct cfq_group *cfqg;

151

struct cfq_group *cfqg;

152

/* Number of sectors dispatched from queue in single dispatch round */

152

/* Number of sectors dispatched from queue in single dispatch round */

153

unsigned long nr_sectors;

153

unsigned long nr_sectors;

154

};

154

};

155

156

/*
 * First index in the service_trees.
 * IDLE is handled separately (service_tree_for() maps it to the
 * dedicated service_tree_idle instead of indexing service_trees).
 * CFQ_PRIO_NR is the number of priority classes.
 */
enum wl_prio_t {
	BE_WORKLOAD = 0,
	RT_WORKLOAD = 1,
	IDLE_WORKLOAD = 2,
	CFQ_PRIO_NR,
};

166

167

/*
 * Second index in the service_trees.
 */
enum wl_type_t {
	ASYNC_WORKLOAD = 0,
	SYNC_NOIDLE_WORKLOAD = 1,
	SYNC_WORKLOAD = 2
};

175

176

/*
 * Per-group statistics.  The structure is empty unless
 * CONFIG_CFQ_GROUP_IOSCHED is set; the debugging counters additionally
 * require CONFIG_DEBUG_BLK_CGROUP.
 */
struct cfqg_stats {
#ifdef CONFIG_CFQ_GROUP_IOSCHED
	/* total bytes transferred */
	struct blkg_rwstat service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat serviced;
	/* number of ios merged */
	struct blkg_rwstat merged;
	/* total time spent on device in ns, may not be accurate w/ queueing */
	struct blkg_rwstat service_time;
	/* total time spent waiting in scheduler queue in ns */
	struct blkg_rwstat wait_time;
	/* number of IOs queued up */
	struct blkg_rwstat queued;
	/* total sectors transferred */
	struct blkg_stat sectors;
	/* total disk time and nr sectors dispatched by this group */
	struct blkg_stat time;
#ifdef CONFIG_DEBUG_BLK_CGROUP
	/* time not charged to this cgroup */
	struct blkg_stat unaccounted_time;
	/* sum of number of ios queued across all samples */
	struct blkg_stat avg_queue_size_sum;
	/* count of samples taken for average */
	struct blkg_stat avg_queue_size_samples;
	/* how many times this group has been removed from service tree */
	struct blkg_stat dequeue;
	/* total time spent waiting for it to be assigned a timeslice. */
	struct blkg_stat group_wait_time;
	/* time spent idling for this blkio_group */
	struct blkg_stat idle_time;
	/* total time with empty current active q with other requests queued */
	struct blkg_stat empty_time;
	/* fields after this shouldn't be cleared on stat reset */
	uint64_t start_group_wait_time;
	uint64_t start_idle_time;
	uint64_t start_empty_time;
	/* CFQG_stats_* bits */
	uint16_t flags;
#endif	/* CONFIG_DEBUG_BLK_CGROUP */
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */
};

217

218

/* This is per cgroup per device grouping structure */

218

/* This is per cgroup per device grouping structure */

219

struct cfq_group {

219

struct cfq_group {

220

/* group service_tree member */

220

/* group service_tree member */

221

struct rb_node rb_node;

221

struct rb_node rb_node;

222

223

/* group service_tree key */

223

/* group service_tree key */

224

u64 vdisktime;

224

u64 vdisktime;

225

unsigned int weight;

225

unsigned int weight;

226

unsigned int new_weight;

226

unsigned int new_weight;

227

unsigned int dev_weight;

227

unsigned int dev_weight;

228

229

/* number of cfqq currently on this group */

229

/* number of cfqq currently on this group */

230

int nr_cfqq;

230

int nr_cfqq;

231

232

/*

232

/*

233

* Per group busy queues average. Useful for workload slice calc. We

233

* Per group busy queues average. Useful for workload slice calc. We

234

* create the array for each prio class but at run time it is used

234

* create the array for each prio class but at run time it is used

235

* only for RT and BE class and slot for IDLE class remains unused.

235

* only for RT and BE class and slot for IDLE class remains unused.

236

* This is primarily done to avoid confusion and a gcc warning.

236

* This is primarily done to avoid confusion and a gcc warning.

237

*/

237

*/

238

unsigned int busy_queues_avg[CFQ_PRIO_NR];

238

unsigned int busy_queues_avg[CFQ_PRIO_NR];

239

/*

239

/*

240

* rr lists of queues with requests. We maintain service trees for

240

* rr lists of queues with requests. We maintain service trees for

241

* RT and BE classes. These trees are subdivided in subclasses

241

* RT and BE classes. These trees are subdivided in subclasses

242

* of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE

242

* of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE

243

* class there is no subclassification and all the cfq queues go on

243

* class there is no subclassification and all the cfq queues go on

244

* a single tree service_tree_idle.

244

* a single tree service_tree_idle.

245

* Counts are embedded in the cfq_rb_root

245

* Counts are embedded in the cfq_rb_root

246

*/

246

*/

247

struct cfq_rb_root service_trees[2][3];

247

struct cfq_rb_root service_trees[2][3];

248

struct cfq_rb_root service_tree_idle;

248

struct cfq_rb_root service_tree_idle;

249

250

unsigned long saved_workload_slice;

250

unsigned long saved_workload_slice;

251

enum wl_type_t saved_workload;

251

enum wl_type_t saved_workload;

252

enum wl_prio_t saved_serving_prio;

252

enum wl_prio_t saved_serving_prio;

253

254

/* number of requests that are on the dispatch list or inside driver */

254

/* number of requests that are on the dispatch list or inside driver */

255

int dispatched;

255

int dispatched;

256

struct cfq_ttime ttime;

256

struct cfq_ttime ttime;

257

struct cfqg_stats stats;

257

struct cfqg_stats stats;

258

};

258

};

259

260

struct cfq_io_cq {

260

struct cfq_io_cq {

261

struct io_cq icq; /* must be the first member */

261

struct io_cq icq; /* must be the first member */

262

struct cfq_queue *cfqq[2];

262

struct cfq_queue *cfqq[2];

263

struct cfq_ttime ttime;

263

struct cfq_ttime ttime;

264

int ioprio; /* the current ioprio */

264

int ioprio; /* the current ioprio */

265

#ifdef CONFIG_CFQ_GROUP_IOSCHED

265

#ifdef CONFIG_CFQ_GROUP_IOSCHED

266

uint64_t blkcg_id; /* the current blkcg ID */

266

uint64_t blkcg_id; /* the current blkcg ID */

267

#endif

267

#endif

268

};

268

};

269

270

/*

270

/*

271

* Per block device queue structure

271

* Per block device queue structure

272

*/

272

*/

273

struct cfq_data {

273

struct cfq_data {

274

struct request_queue *queue;

274

struct request_queue *queue;

275

/* Root service tree for cfq_groups */

275

/* Root service tree for cfq_groups */

276

struct cfq_rb_root grp_service_tree;

276

struct cfq_rb_root grp_service_tree;

277

struct cfq_group *root_group;

277

struct cfq_group *root_group;

278

279

/*

279

/*

280

* The priority currently being served

280

* The priority currently being served

281

*/

281

*/

282

enum wl_prio_t serving_prio;

282

enum wl_prio_t serving_prio;

283

enum wl_type_t serving_type;

283

enum wl_type_t serving_type;

284

unsigned long workload_expires;

284

unsigned long workload_expires;

285

struct cfq_group *serving_group;

285

struct cfq_group *serving_group;

286

287

/*

287

/*

288

* Each priority tree is sorted by next_request position. These

288

* Each priority tree is sorted by next_request position. These

289

* trees are used when determining if two or more queues are

289

* trees are used when determining if two or more queues are

290

* interleaving requests (see cfq_close_cooperator).

290

* interleaving requests (see cfq_close_cooperator).

291

*/

291

*/

292

struct rb_root prio_trees[CFQ_PRIO_LISTS];

292

struct rb_root prio_trees[CFQ_PRIO_LISTS];

293

294

unsigned int busy_queues;

294

unsigned int busy_queues;

295

unsigned int busy_sync_queues;

295

unsigned int busy_sync_queues;

296

297

int rq_in_driver;

297

int rq_in_driver;

298

int rq_in_flight[2];

298

int rq_in_flight[2];

299

300

/*

300

/*

301

* queue-depth detection

301

* queue-depth detection

302

*/

302

*/

303

int rq_queued;

303

int rq_queued;

304

int hw_tag;

304

int hw_tag;

305

/*

305

/*

306

* hw_tag can be

306

* hw_tag can be

307

* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)

307

* -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)

308

* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)

308

* 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)

309

* 0 => no NCQ

309

* 0 => no NCQ

310

*/

310

*/

311

int hw_tag_est_depth;

311

int hw_tag_est_depth;

312

unsigned int hw_tag_samples;

312

unsigned int hw_tag_samples;

313

314

/*

314

/*

315

* idle window management

315

* idle window management

316

*/

316

*/

317

struct timer_list idle_slice_timer;

317

struct timer_list idle_slice_timer;

318

struct work_struct unplug_work;

318

struct work_struct unplug_work;

319

320

struct cfq_queue *active_queue;

320

struct cfq_queue *active_queue;

321

struct cfq_io_cq *active_cic;

321

struct cfq_io_cq *active_cic;

322

323

/*

323

/*

324

* async queue for each priority case

324

* async queue for each priority case

325

*/

325

*/

326

struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];

326

struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];

327

struct cfq_queue *async_idle_cfqq;

327

struct cfq_queue *async_idle_cfqq;

328

329

sector_t last_position;

329

sector_t last_position;

330

331

/*

331

/*

332

* tunables, see top of file

332

* tunables, see top of file

333

*/

333

*/

334

unsigned int cfq_quantum;

334

unsigned int cfq_quantum;

335

unsigned int cfq_fifo_expire[2];

335

unsigned int cfq_fifo_expire[2];

336

unsigned int cfq_back_penalty;

336

unsigned int cfq_back_penalty;

337

unsigned int cfq_back_max;

337

unsigned int cfq_back_max;

338

unsigned int cfq_slice[2];

338

unsigned int cfq_slice[2];

339

unsigned int cfq_slice_async_rq;

339

unsigned int cfq_slice_async_rq;

340

unsigned int cfq_slice_idle;

340

unsigned int cfq_slice_idle;

341

unsigned int cfq_group_idle;

341

unsigned int cfq_group_idle;

342

unsigned int cfq_latency;

342

unsigned int cfq_latency;

343

344

/*

344

/*

345

* Fallback dummy cfqq for extreme OOM conditions

345

* Fallback dummy cfqq for extreme OOM conditions

346

*/

346

*/

347

struct cfq_queue oom_cfqq;

347

struct cfq_queue oom_cfqq;

348

349

unsigned long last_delayed_sync;

349

unsigned long last_delayed_sync;

350

};

350

};

351

352

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

352

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);

353

354

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,

354

static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,

355

enum wl_prio_t prio,

355

enum wl_prio_t prio,

356

enum wl_type_t type)

356

enum wl_type_t type)

357

{

357

{

358

if (!cfqg)

358

if (!cfqg)

359

return NULL;

359

return NULL;

360

361

if (prio == IDLE_WORKLOAD)

361

if (prio == IDLE_WORKLOAD)

362

return &cfqg->service_tree_idle;

362

return &cfqg->service_tree_idle;

363

364

return &cfqg->service_trees[prio][type];

364

return &cfqg->service_trees[prio][type];

365

}

365

}

366

367

/*
 * Bit numbers (not masks) for cfq_queue.flags; the accessors generated
 * by CFQ_CFQQ_FNS() shift 1 by these values.
 */
enum cfqq_state_flags {
	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be split */
	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
};

382

383

#define CFQ_CFQQ_FNS(name) \

383

#define CFQ_CFQQ_FNS(name) \

384

static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \

384

static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \

385

{ \

385

{ \

386

(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \

386

(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \

387

} \

387

} \

388

static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \

388

static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \

389

{ \

389

{ \

390

(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \

390

(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \

391

} \

391

} \

392

static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \

392

static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \

393

{ \

393

{ \

394

return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \

394

return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \

395

}

395

}

396

397

CFQ_CFQQ_FNS(on_rr);

397

CFQ_CFQQ_FNS(on_rr);

398

CFQ_CFQQ_FNS(wait_request);

398

CFQ_CFQQ_FNS(wait_request);

399

CFQ_CFQQ_FNS(must_dispatch);

399

CFQ_CFQQ_FNS(must_dispatch);

400

CFQ_CFQQ_FNS(must_alloc_slice);

400

CFQ_CFQQ_FNS(must_alloc_slice);

401

CFQ_CFQQ_FNS(fifo_expire);

401

CFQ_CFQQ_FNS(fifo_expire);

402

CFQ_CFQQ_FNS(idle_window);

402

CFQ_CFQQ_FNS(idle_window);

403

CFQ_CFQQ_FNS(prio_changed);

403

CFQ_CFQQ_FNS(prio_changed);

404

CFQ_CFQQ_FNS(slice_new);

404

CFQ_CFQQ_FNS(slice_new);

405

CFQ_CFQQ_FNS(sync);

405

CFQ_CFQQ_FNS(sync);

406

CFQ_CFQQ_FNS(coop);

406

CFQ_CFQQ_FNS(coop);

407

CFQ_CFQQ_FNS(split_coop);

407

CFQ_CFQQ_FNS(split_coop);

408

CFQ_CFQQ_FNS(deep);

408

CFQ_CFQQ_FNS(deep);

409

CFQ_CFQQ_FNS(wait_busy);

409

CFQ_CFQQ_FNS(wait_busy);

410

#undef CFQ_CFQQ_FNS

410

#undef CFQ_CFQQ_FNS

411

412

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

412

#if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)

413

414

/* cfqg stats flags: bit numbers (not masks) for cfqg_stats.flags */
enum cfqg_stats_flags {
	CFQG_stats_waiting = 0,
	CFQG_stats_idling,
	CFQG_stats_empty,
};

420

421

#define CFQG_FLAG_FNS(name) \

421

#define CFQG_FLAG_FNS(name) \

422

static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \

422

static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats) \

423

{ \

423

{ \

424

stats->flags |= (1 << CFQG_stats_##name); \

424

stats->flags |= (1 << CFQG_stats_##name); \

425

} \

425

} \

426

static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \

426

static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats) \

427

{ \

427

{ \

428

stats->flags &= ~(1 << CFQG_stats_##name); \

428

stats->flags &= ~(1 << CFQG_stats_##name); \

429

} \

429

} \

430

static inline int cfqg_stats_##name(struct cfqg_stats *stats) \

430

static inline int cfqg_stats_##name(struct cfqg_stats *stats) \

431

{ \

431

{ \

432

return (stats->flags & (1 << CFQG_stats_##name)) != 0; \

432

return (stats->flags & (1 << CFQG_stats_##name)) != 0; \

433

} \

433

} \

434

435

CFQG_FLAG_FNS(waiting)

435

CFQG_FLAG_FNS(waiting)

436

CFQG_FLAG_FNS(idling)

436

CFQG_FLAG_FNS(idling)

437

CFQG_FLAG_FNS(empty)

437

CFQG_FLAG_FNS(empty)

438

#undef CFQG_FLAG_FNS

438

#undef CFQG_FLAG_FNS

439

440

/* This should be called with the queue_lock held. */

440

/* This should be called with the queue_lock held. */

441

static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)

441

static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)

442

{

442

{

443

unsigned long long now;

443

unsigned long long now;

444

445

if (!cfqg_stats_waiting(stats))

445

if (!cfqg_stats_waiting(stats))

446

return;

446

return;

447

448

now = sched_clock();

448

now = sched_clock();

449

if (time_after64(now, stats->start_group_wait_time))

449

if (time_after64(now, stats->start_group_wait_time))

450

blkg_stat_add(&stats->group_wait_time,

450

blkg_stat_add(&stats->group_wait_time,

451

now - stats->start_group_wait_time);

451

now - stats->start_group_wait_time);

452

cfqg_stats_clear_waiting(stats);

452

cfqg_stats_clear_waiting(stats);

453

}

453

}

454

455

/* This should be called with the queue_lock held. */

455

/* This should be called with the queue_lock held. */

456

static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,

456

static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,

457

struct cfq_group *curr_cfqg)

457

struct cfq_group *curr_cfqg)

458

{

458

{

459

struct cfqg_stats *stats = &cfqg->stats;

459

struct cfqg_stats *stats = &cfqg->stats;

460

461

if (cfqg_stats_waiting(stats))

461

if (cfqg_stats_waiting(stats))

462

return;

462

return;

463

if (cfqg == curr_cfqg)

463

if (cfqg == curr_cfqg)

464

return;

464

return;

465

stats->start_group_wait_time = sched_clock();

465

stats->start_group_wait_time = sched_clock();

466

cfqg_stats_mark_waiting(stats);

466

cfqg_stats_mark_waiting(stats);

467

}

467

}

468

469

/* This should be called with the queue_lock held. */

469

/* This should be called with the queue_lock held. */

470

static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)

470

static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)

471

{

471

{

472

unsigned long long now;

472

unsigned long long now;

473

474

if (!cfqg_stats_empty(stats))

474

if (!cfqg_stats_empty(stats))

475

return;

475

return;

476

477

now = sched_clock();

477

now = sched_clock();

478

if (time_after64(now, stats->start_empty_time))

478

if (time_after64(now, stats->start_empty_time))

479

blkg_stat_add(&stats->empty_time,

479

blkg_stat_add(&stats->empty_time,

480

now - stats->start_empty_time);

480

now - stats->start_empty_time);

481

cfqg_stats_clear_empty(stats);

481

cfqg_stats_clear_empty(stats);

482

}

482

}

483

484

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)

484

static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)

485

{

485

{

486

blkg_stat_add(&cfqg->stats.dequeue, 1);

486

blkg_stat_add(&cfqg->stats.dequeue, 1);

487

}

487

}

488

489

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)

489

static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)

490

{

490

{

491

struct cfqg_stats *stats = &cfqg->stats;

491

struct cfqg_stats *stats = &cfqg->stats;

492

493

if (blkg_rwstat_sum(&stats->queued))

493

if (blkg_rwstat_sum(&stats->queued))

494

return;

494

return;

495

496

/*

496

/*

497

* group is already marked empty. This can happen if cfqq got new

497

* group is already marked empty. This can happen if cfqq got new

498

* request in parent group and moved to this group while being added

498

* request in parent group and moved to this group while being added

499

* to service tree. Just ignore the event and move on.

499

* to service tree. Just ignore the event and move on.

500

*/

500

*/

501

if (cfqg_stats_empty(stats))

501

if (cfqg_stats_empty(stats))

502

return;

502

return;

503

504

stats->start_empty_time = sched_clock();

504

stats->start_empty_time = sched_clock();

505

cfqg_stats_mark_empty(stats);

505

cfqg_stats_mark_empty(stats);

506

}

506

}

507

508

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)

508

static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)

509

{

509

{

510

struct cfqg_stats *stats = &cfqg->stats;

510

struct cfqg_stats *stats = &cfqg->stats;

511

512

if (cfqg_stats_idling(stats)) {

512

if (cfqg_stats_idling(stats)) {

513

unsigned long long now = sched_clock();

513

unsigned long long now = sched_clock();

514

515

if (time_after64(now, stats->start_idle_time))

515

if (time_after64(now, stats->start_idle_time))

516

blkg_stat_add(&stats->idle_time,

516

blkg_stat_add(&stats->idle_time,

517

now - stats->start_idle_time);

517

now - stats->start_idle_time);

518

cfqg_stats_clear_idling(stats);

518

cfqg_stats_clear_idling(stats);

519

}

519

}

520

}

520

}

521

522

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)

522

static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)

523

{

523

{

524

struct cfqg_stats *stats = &cfqg->stats;

524

struct cfqg_stats *stats = &cfqg->stats;

525

526

BUG_ON(cfqg_stats_idling(stats));

526

BUG_ON(cfqg_stats_idling(stats));

527

528

stats->start_idle_time = sched_clock();

528

stats->start_idle_time = sched_clock();

529

cfqg_stats_mark_idling(stats);

529

cfqg_stats_mark_idling(stats);

530

}

530

}

531

532

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)

532

static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)

533

{

533

{

534

struct cfqg_stats *stats = &cfqg->stats;

534

struct cfqg_stats *stats = &cfqg->stats;

535

536

blkg_stat_add(&stats->avg_queue_size_sum,

536

blkg_stat_add(&stats->avg_queue_size_sum,

537

blkg_rwstat_sum(&stats->queued));

537

blkg_rwstat_sum(&stats->queued));

538

blkg_stat_add(&stats->avg_queue_size_samples, 1);

538

blkg_stat_add(&stats->avg_queue_size_samples, 1);

539

cfqg_stats_update_group_wait_time(stats);

539

cfqg_stats_update_group_wait_time(stats);

540

}

540

}

541

542

#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

542

#else /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

543

544

/*
 * No-op stand-ins for the debug statistics helpers, compiled in the #else
 * branch of the CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP section.
 */
static inline void
cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
				     struct cfq_group *curr_cfqg)
{
}

static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
{
}

static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
{
}

static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
{
}

static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
{
}

static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
{
}

static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
{
}

551

552

#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

552

#endif /* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */

553

554

#ifdef CONFIG_CFQ_GROUP_IOSCHED

554

#ifdef CONFIG_CFQ_GROUP_IOSCHED

555

556

static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)

556

static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)

557

{

557

{

558

return blkg_to_pdata(blkg, &blkio_policy_cfq);

558

return blkg_to_pdata(blkg, &blkio_policy_cfq);

559

}

559

}

560

561

/* Map @cfqg back to the blkio_group it is policy data of. */
static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
{
	struct blkio_group *blkg = pdata_to_blkg(cfqg);

	return blkg;
}

565

566

/*
 * Take a reference on the blkio_group backing @cfqg.
 *
 * The function returns void, so the blkg_get() call is a plain statement
 * rather than the operand of a return: ISO C does not allow a return
 * statement with an expression in a void function.
 */
static inline void cfqg_get(struct cfq_group *cfqg)
{
	blkg_get(cfqg_to_blkg(cfqg));
}

570

571

/*
 * Drop a reference on the blkio_group backing @cfqg.
 *
 * The function returns void, so the blkg_put() call is a plain statement
 * rather than the operand of a return: ISO C does not allow a return
 * statement with an expression in a void function.
 */
static inline void cfqg_put(struct cfq_group *cfqg)
{
	blkg_put(cfqg_to_blkg(cfqg));
}

575

576

/*
 * Emit a blktrace message for @cfqq on @cfqd's request queue.  The message
 * is prefixed with "cfq<pid>", 'S' or 'A' depending on cfq_cfqq_sync(), and
 * the blkg path of the queue's group, followed by the caller's @fmt/@args.
 */
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
			blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args)

580

581

/*
 * Emit a blktrace message for @cfqg on @cfqd's request queue, prefixed with
 * the group's blkg path and followed by the caller's @fmt/@args.
 *
 * The last line deliberately carries no line-continuation, so the macro
 * cannot absorb whatever line happens to follow it.
 */
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
			blkg_path(cfqg_to_blkg((cfqg))), ##args)

584

585

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,

585

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,

586

struct cfq_group *curr_cfqg, int rw)

586

struct cfq_group *curr_cfqg, int rw)

587

{

587

{

588

blkg_rwstat_add(&cfqg->stats.queued, rw, 1);

588

blkg_rwstat_add(&cfqg->stats.queued, rw, 1);

589

cfqg_stats_end_empty_time(&cfqg->stats);

589

cfqg_stats_end_empty_time(&cfqg->stats);

590

cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);

590

cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);

591

}

591

}

592

593

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,

593

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,

594

unsigned long time, unsigned long unaccounted_time)

594

unsigned long time, unsigned long unaccounted_time)

595

{

595

{

596

blkg_stat_add(&cfqg->stats.time, time);

596

blkg_stat_add(&cfqg->stats.time, time);

597

#ifdef CONFIG_DEBUG_BLK_CGROUP

597

#ifdef CONFIG_DEBUG_BLK_CGROUP

598

blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);

598

blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);

599

#endif

599

#endif

600

}

600

}

601

602

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)

602

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)

603

{

603

{

604

blkg_rwstat_add(&cfqg->stats.queued, rw, -1);

604

blkg_rwstat_add(&cfqg->stats.queued, rw, -1);

605

}

605

}

606

607

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)

607

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)

608

{

608

{

609

blkg_rwstat_add(&cfqg->stats.merged, rw, 1);

609

blkg_rwstat_add(&cfqg->stats.merged, rw, 1);

610

}

610

}

611

612

static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,

612

static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,

613

uint64_t bytes, int rw)

613

uint64_t bytes, int rw)

614

{

614

{

615

blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);

615

blkg_stat_add(&cfqg->stats.sectors, bytes >> 9);

616

blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);

616

blkg_rwstat_add(&cfqg->stats.serviced, rw, 1);

617

blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);

617

blkg_rwstat_add(&cfqg->stats.service_bytes, rw, bytes);

618

}

618

}

619

620

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,

620

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,

621

uint64_t start_time, uint64_t io_start_time, int rw)

621

uint64_t start_time, uint64_t io_start_time, int rw)

622

{

622

{

623

struct cfqg_stats *stats = &cfqg->stats;

623

struct cfqg_stats *stats = &cfqg->stats;

624

unsigned long long now = sched_clock();

624

unsigned long long now = sched_clock();

625

626

if (time_after64(now, io_start_time))

626

if (time_after64(now, io_start_time))

627

blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);

627

blkg_rwstat_add(&stats->service_time, rw, now - io_start_time);

628

if (time_after64(io_start_time, start_time))

628

if (time_after64(io_start_time, start_time))

629

blkg_rwstat_add(&stats->wait_time, rw,

629

blkg_rwstat_add(&stats->wait_time, rw,

630

io_start_time - start_time);

630

io_start_time - start_time);

631

}

631

}

632

633

static void cfqg_stats_reset(struct blkio_group *blkg)

633

static void cfqg_stats_reset(struct blkio_group *blkg)

634

{

634

{

635

struct cfq_group *cfqg = blkg_to_cfqg(blkg);

635

struct cfq_group *cfqg = blkg_to_cfqg(blkg);

636

struct cfqg_stats *stats = &cfqg->stats;

636

struct cfqg_stats *stats = &cfqg->stats;

637

638

/* queued stats shouldn't be cleared */

638

/* queued stats shouldn't be cleared */

639

blkg_rwstat_reset(&stats->service_bytes);

639

blkg_rwstat_reset(&stats->service_bytes);

640

blkg_rwstat_reset(&stats->serviced);

640

blkg_rwstat_reset(&stats->serviced);

641

blkg_rwstat_reset(&stats->merged);

641

blkg_rwstat_reset(&stats->merged);

642

blkg_rwstat_reset(&stats->service_time);

642

blkg_rwstat_reset(&stats->service_time);

643

blkg_rwstat_reset(&stats->wait_time);

643

blkg_rwstat_reset(&stats->wait_time);

644

blkg_stat_reset(&stats->time);

644

blkg_stat_reset(&stats->time);

645

#ifdef CONFIG_DEBUG_BLK_CGROUP

645

#ifdef CONFIG_DEBUG_BLK_CGROUP

646

blkg_stat_reset(&stats->unaccounted_time);

646

blkg_stat_reset(&stats->unaccounted_time);

647

blkg_stat_reset(&stats->avg_queue_size_sum);

647

blkg_stat_reset(&stats->avg_queue_size_sum);

648

blkg_stat_reset(&stats->avg_queue_size_samples);

648

blkg_stat_reset(&stats->avg_queue_size_samples);

649

blkg_stat_reset(&stats->dequeue);

649

blkg_stat_reset(&stats->dequeue);

650

blkg_stat_reset(&stats->group_wait_time);

650

blkg_stat_reset(&stats->group_wait_time);

651

blkg_stat_reset(&stats->idle_time);

651

blkg_stat_reset(&stats->idle_time);

652

blkg_stat_reset(&stats->empty_time);

652

blkg_stat_reset(&stats->empty_time);

653

#endif

653

#endif

654

}

654

}

655

656

#else /* CONFIG_CFQ_GROUP_IOSCHED */

656

#else /* CONFIG_CFQ_GROUP_IOSCHED */

657

658

/*
 * Stand-ins used in the #else branch of CONFIG_CFQ_GROUP_IOSCHED: the
 * conversion helpers return NULL, the refcount and statistics helpers do
 * nothing, and group logging compiles away.
 */
static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
{
	return NULL;
}

static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
{
	return NULL;
}

static inline void cfqg_get(struct cfq_group *cfqg)
{
}

static inline void cfqg_put(struct cfq_group *cfqg)
{
}

/* Per-queue trace message without a group path: "cfq<pid> " + fmt. */
#define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
#define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)

static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
			struct cfq_group *curr_cfqg, int rw)
{
}

static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
			unsigned long time, unsigned long unaccounted_time)
{
}

static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
{
}

static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
{
}

static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
			uint64_t bytes, int rw)
{
}

static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
			uint64_t start_time, uint64_t io_start_time, int rw)
{
}

677

678

#endif /* CONFIG_CFQ_GROUP_IOSCHED */

678

#endif /* CONFIG_CFQ_GROUP_IOSCHED */

679

680

/* Emit a blktrace message on @cfqd's request queue, prefixed with "cfq ". */
#define cfq_log(cfqd, fmt, args...)	\
	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)

682

683

/*
 * Traverses through cfq group service trees.
 *
 * For every i below IDLE_WORKLOAD, @st walks
 * (cfqg)->service_trees[i][0 .. SYNC_WORKLOAD]; for i == IDLE_WORKLOAD, @st
 * visits (cfqg)->service_tree_idle exactly once.  @i and @j are
 * caller-provided loop counters and @st is the caller-provided cursor.
 *
 * @cfqg is parenthesized so that any pointer expression may be passed, and
 * the last line carries no line-continuation so the macro cannot absorb the
 * line that follows it.
 */
#define for_each_cfqg_st(cfqg, i, j, st) \
	for (i = 0; i <= IDLE_WORKLOAD; i++) \
		for (j = 0, st = i < IDLE_WORKLOAD ? \
				&(cfqg)->service_trees[i][j] : \
				&(cfqg)->service_tree_idle; \
		     (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
		     (i == IDLE_WORKLOAD && j == 0); \
		     j++, st = i < IDLE_WORKLOAD ? \
				&(cfqg)->service_trees[i][j] : NULL)

692

693

static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,

693

static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,

694

struct cfq_ttime *ttime, bool group_idle)

694

struct cfq_ttime *ttime, bool group_idle)

695

{

695

{

696

unsigned long slice;

696

unsigned long slice;

697

if (!sample_valid(ttime->ttime_samples))

697

if (!sample_valid(ttime->ttime_samples))

698

return false;

698

return false;

699

if (group_idle)

699

if (group_idle)

700

slice = cfqd->cfq_group_idle;

700

slice = cfqd->cfq_group_idle;

701

else

701

else

702

slice = cfqd->cfq_slice_idle;

702

slice = cfqd->cfq_slice_idle;

703

return ttime->ttime_mean > slice;

703

return ttime->ttime_mean > slice;

704

}

704

}

705

706

static inline bool iops_mode(struct cfq_data *cfqd)

706

static inline bool iops_mode(struct cfq_data *cfqd)

707

{

707

{

708

/*

708

/*

709

* If we are not idling on queues and it is a NCQ drive, parallel

709

* If we are not idling on queues and it is a NCQ drive, parallel

710

* execution of requests is on and measuring time is not possible

710

* execution of requests is on and measuring time is not possible

711

* in most of the cases until and unless we drive shallower queue

711

* in most of the cases until and unless we drive shallower queue

712

* depths and that becomes a performance bottleneck. In such cases

712

* depths and that becomes a performance bottleneck. In such cases

713

* switch to start providing fairness in terms of number of IOs.

713

* switch to start providing fairness in terms of number of IOs.

714

*/

714

*/

715

if (!cfqd->cfq_slice_idle && cfqd->hw_tag)

715

if (!cfqd->cfq_slice_idle && cfqd->hw_tag)

716

return true;

716

return true;

717

else

717

else

718

return false;

718

return false;

719

}

719

}

720

721

static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)

721

static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)

722

{

722

{

723

if (cfq_class_idle(cfqq))

723

if (cfq_class_idle(cfqq))

724

return IDLE_WORKLOAD;

724

return IDLE_WORKLOAD;

725

if (cfq_class_rt(cfqq))

725

if (cfq_class_rt(cfqq))

726

return RT_WORKLOAD;

726

return RT_WORKLOAD;

727

return BE_WORKLOAD;

727

return BE_WORKLOAD;

728

}

728

}

729

730

731

static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)

731

static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)

732

{

732

{

733

if (!cfq_cfqq_sync(cfqq))

733

if (!cfq_cfqq_sync(cfqq))

734

return ASYNC_WORKLOAD;

734

return ASYNC_WORKLOAD;

735

if (!cfq_cfqq_idle_window(cfqq))

735

if (!cfq_cfqq_idle_window(cfqq))

736

return SYNC_NOIDLE_WORKLOAD;

736

return SYNC_NOIDLE_WORKLOAD;

737

return SYNC_WORKLOAD;

737

return SYNC_WORKLOAD;

738

}

738

}

739

740

static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,

740

static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,

741

struct cfq_data *cfqd,

741

struct cfq_data *cfqd,

742

struct cfq_group *cfqg)

742

struct cfq_group *cfqg)

743

{

743

{

744

if (wl == IDLE_WORKLOAD)

744

if (wl == IDLE_WORKLOAD)

745

return cfqg->service_tree_idle.count;

745

return cfqg->service_tree_idle.count;

746

747

return cfqg->service_trees[wl][ASYNC_WORKLOAD].count

747

return cfqg->service_trees[wl][ASYNC_WORKLOAD].count

748

+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count

748

+ cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count

749

+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;

749

+ cfqg->service_trees[wl][SYNC_WORKLOAD].count;

750

}

750

}

751

752

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,

752

static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,

753

struct cfq_group *cfqg)

753

struct cfq_group *cfqg)

754

{

754

{

755

return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count

755

return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count

756

+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;

756

+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;

757

}

757

}

758

759

/* Forward declarations; the definitions are not in this part of the file. */
static void cfq_dispatch_insert(struct request_queue *, struct request *);
static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
				       struct cfq_io_cq *cic, struct bio *bio,
				       gfp_t gfp_mask);

763

764

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)

764

static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)

765

{

765

{

766

/* cic->icq is the first member, %NULL will convert to %NULL */

766

/* cic->icq is the first member, %NULL will convert to %NULL */

767

return container_of(icq, struct cfq_io_cq, icq);

767

return container_of(icq, struct cfq_io_cq, icq);

768

}

768

}

769

770

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,

770

static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,

771

struct io_context *ioc)

771

struct io_context *ioc)

772

{

772

{

773

if (ioc)

773

if (ioc)

774

return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));

774

return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));

775

return NULL;

775

return NULL;

776

}

776

}

777

778

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)

778

static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)

779

{

779

{

780

return cic->cfqq[is_sync];

780

return cic->cfqq[is_sync];

781

}

781

}

782

783

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,

783

static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,

784

bool is_sync)

784

bool is_sync)

785

{

785

{

786

cic->cfqq[is_sync] = cfqq;

786

cic->cfqq[is_sync] = cfqq;

787

}

787

}

788

789

/*
 * Return the cfq_data reached from @cic through its icq's queue and that
 * queue's elevator (->elevator_data).
 */
static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
{
	return cic->icq.q->elevator->elevator_data;
}

793

794

/*

794

/*

795

* We regard a request as SYNC, if it's either a read or has the SYNC bit

795

* We regard a request as SYNC, if it's either a read or has the SYNC bit

796

* set (in which case it could also be direct WRITE).

796

* set (in which case it could also be direct WRITE).

797

*/

797

*/

798

static inline bool cfq_bio_sync(struct bio *bio)

798

static inline bool cfq_bio_sync(struct bio *bio)

799

{

799

{

800

return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);

800

return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);

801

}

801

}

802

803

/*

803

/*

804

* scheduler run of queue, if there are requests pending and no one in the

804

* scheduler run of queue, if there are requests pending and no one in the

805

* driver that will restart queueing

805

* driver that will restart queueing

806

*/

806

*/

807

static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)

807

static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)

808

{

808

{

809

if (cfqd->busy_queues) {

809

if (cfqd->busy_queues) {

810

cfq_log(cfqd, "schedule dispatch");

810

cfq_log(cfqd, "schedule dispatch");

811

kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);

811

kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);

812

}

812

}

813

}

813

}

814

815

/*

815

/*

816

* Scale schedule slice based on io priority. Use the sync time slice only

816

* Scale schedule slice based on io priority. Use the sync time slice only

817

* if a queue is marked sync and has sync io queued. A sync queue with async

817

* if a queue is marked sync and has sync io queued. A sync queue with async

818

* io only, should not get full sync slice length.

818

* io only, should not get full sync slice length.

819

*/

819

*/

820

static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,

820

static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,

821

unsigned short prio)

821

unsigned short prio)

822

{

822

{

823

const int base_slice = cfqd->cfq_slice[sync];

823

const int base_slice = cfqd->cfq_slice[sync];

824

825

WARN_ON(prio >= IOPRIO_BE_NR);

825

WARN_ON(prio >= IOPRIO_BE_NR);

826

827

return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));

827

return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));

828

}

828

}

829

830

static inline int

830

static inline int

831

cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

831

cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

832

{

832

{

833

return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);

833

return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);

834

}

834

}

835

836

static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)

836

static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)

837

{

837

{

838

u64 d = delta << CFQ_SERVICE_SHIFT;

838

u64 d = delta << CFQ_SERVICE_SHIFT;

839

840

d = d * CFQ_WEIGHT_DEFAULT;

840

d = d * CFQ_WEIGHT_DEFAULT;

841

do_div(d, cfqg->weight);

841

do_div(d, cfqg->weight);

842

return d;

842

return d;

843

}

843

}

844

845

/*
 * Return the later of two virtual disk times.  The comparison uses the
 * signed difference so it stays correct across u64 wraparound.
 */
static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	if ((s64)(vdisktime - min_vdisktime) > 0)
		return vdisktime;

	return min_vdisktime;
}

853

854

/*
 * Return the earlier of two virtual disk times.  The comparison uses the
 * signed difference so it stays correct across u64 wraparound.
 */
static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
{
	if ((s64)(vdisktime - min_vdisktime) < 0)
		return vdisktime;

	return min_vdisktime;
}

862

863

static void update_min_vdisktime(struct cfq_rb_root *st)

863

static void update_min_vdisktime(struct cfq_rb_root *st)

864

{

864

{

865

struct cfq_group *cfqg;

865

struct cfq_group *cfqg;

866

867

if (st->left) {

867

if (st->left) {

868

cfqg = rb_entry_cfqg(st->left);

868

cfqg = rb_entry_cfqg(st->left);

869

st->min_vdisktime = max_vdisktime(st->min_vdisktime,

869

st->min_vdisktime = max_vdisktime(st->min_vdisktime,

870

cfqg->vdisktime);

870

cfqg->vdisktime);

871

}

871

}

872

}

872

}

873

874

/*

874

/*

875

* get averaged number of queues of RT/BE priority.

875

* get averaged number of queues of RT/BE priority.

876

* average is updated, with a formula that gives more weight to higher numbers,

876

* average is updated, with a formula that gives more weight to higher numbers,

877

* to quickly follows sudden increases and decrease slowly

877

* to quickly follows sudden increases and decrease slowly

878

*/

878

*/

879

880

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,

880

static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,

881

struct cfq_group *cfqg, bool rt)

881

struct cfq_group *cfqg, bool rt)

882

{

882

{

883

unsigned min_q, max_q;

883

unsigned min_q, max_q;

884

unsigned mult = cfq_hist_divisor - 1;

884

unsigned mult = cfq_hist_divisor - 1;

885

unsigned round = cfq_hist_divisor / 2;

885

unsigned round = cfq_hist_divisor / 2;

886

unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

886

unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);

887

888

min_q = min(cfqg->busy_queues_avg[rt], busy);

888

min_q = min(cfqg->busy_queues_avg[rt], busy);

889

max_q = max(cfqg->busy_queues_avg[rt], busy);

889

max_q = max(cfqg->busy_queues_avg[rt], busy);

890

cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /

890

cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /

891

cfq_hist_divisor;

891

cfq_hist_divisor;

892

return cfqg->busy_queues_avg[rt];

892

return cfqg->busy_queues_avg[rt];

893

}

893

}

894

895

static inline unsigned

895

static inline unsigned

896

cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)

896

cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)

897

{

897

{

898

struct cfq_rb_root *st = &cfqd->grp_service_tree;

898

struct cfq_rb_root *st = &cfqd->grp_service_tree;

899

900

return cfq_target_latency * cfqg->weight / st->total_weight;

900

return cfq_target_latency * cfqg->weight / st->total_weight;

901

}

901

}

902

903

static inline unsigned

903

static inline unsigned

904

cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

904

cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

905

{

905

{

906

unsigned slice = cfq_prio_to_slice(cfqd, cfqq);

906

unsigned slice = cfq_prio_to_slice(cfqd, cfqq);

907

if (cfqd->cfq_latency) {

907

if (cfqd->cfq_latency) {

908

/*

908

/*

909

* interested queues (we consider only the ones with the same

909

* interested queues (we consider only the ones with the same

910

* priority class in the cfq group)

910

* priority class in the cfq group)

911

*/

911

*/

912

unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,

912

unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,

913

cfq_class_rt(cfqq));

913

cfq_class_rt(cfqq));

914

unsigned sync_slice = cfqd->cfq_slice[1];

914

unsigned sync_slice = cfqd->cfq_slice[1];

915

unsigned expect_latency = sync_slice * iq;

915

unsigned expect_latency = sync_slice * iq;

916

unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

916

unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);

917

918

if (expect_latency > group_slice) {

918

if (expect_latency > group_slice) {

919

unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;

919

unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;

920

/* scale low_slice according to IO priority

920

/* scale low_slice according to IO priority

921

* and sync vs async */

921

* and sync vs async */

922

unsigned low_slice =

922

unsigned low_slice =

923

min(slice, base_low_slice * slice / sync_slice);

923

min(slice, base_low_slice * slice / sync_slice);

924

/* the adapted slice value is scaled to fit all iqs

924

/* the adapted slice value is scaled to fit all iqs

925

* into the target latency */

925

* into the target latency */

926

slice = max(slice * group_slice / expect_latency,

926

slice = max(slice * group_slice / expect_latency,

927

low_slice);

927

low_slice);

928

}

928

}

929

}

929

}

930

return slice;

930

return slice;

931

}

931

}

932

933

static inline void

933

static inline void

934

cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

934

cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)

935

{

935

{

936

unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);

936

unsigned slice = cfq_scaled_cfqq_slice(cfqd, cfqq);

937

938

cfqq->slice_start = jiffies;

938

cfqq->slice_start = jiffies;

939

cfqq->slice_end = jiffies + slice;

939

cfqq->slice_end = jiffies + slice;

940

cfqq->allocated_slice = slice;

940

cfqq->allocated_slice = slice;

941

cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);

941

cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);

942

}

942

}

943

944

/*

944

/*

945

* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end

945

* We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end

946

* isn't valid until the first request from the dispatch is activated

946

* isn't valid until the first request from the dispatch is activated

947

* and the slice time set.

947

* and the slice time set.

948

*/

948

*/

949

static inline bool cfq_slice_used(struct cfq_queue *cfqq)

949

static inline bool cfq_slice_used(struct cfq_queue *cfqq)

950

{

950

{

951

if (cfq_cfqq_slice_new(cfqq))

951

if (cfq_cfqq_slice_new(cfqq))

952

return false;

952

return false;

953

if (time_before(jiffies, cfqq->slice_end))

953

if (time_before(jiffies, cfqq->slice_end))

954

return false;

954

return false;

955

956

return true;

956

return true;

957

}

957

}

958

959

/*

959

/*

960

* Lifted from AS - choose which of rq1 and rq2 that is best served now.

960

* Lifted from AS - choose which of rq1 and rq2 that is best served now.

961

* We choose the request that is closest to the head right now. Distance

961

* We choose the request that is closest to the head right now. Distance

962

* behind the head is penalized and only allowed to a certain extent.

962

* behind the head is penalized and only allowed to a certain extent.

963

*/

963

*/

964

static struct request *

964

static struct request *

965

cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)

965

cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)

966

{

966

{

967

sector_t s1, s2, d1 = 0, d2 = 0;

967

sector_t s1, s2, d1 = 0, d2 = 0;

968

unsigned long back_max;

968

unsigned long back_max;

969

#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */

969

#define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */

970

#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */

970

#define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */

971

unsigned wrap = 0; /* bit mask: requests behind the disk head? */

971

unsigned wrap = 0; /* bit mask: requests behind the disk head? */

972

973

if (rq1 == NULL || rq1 == rq2)

973

if (rq1 == NULL || rq1 == rq2)

974

return rq2;

974

return rq2;

975

if (rq2 == NULL)

975

if (rq2 == NULL)

976

return rq1;

976

return rq1;

977

978

if (rq_is_sync(rq1) != rq_is_sync(rq2))

978

if (rq_is_sync(rq1) != rq_is_sync(rq2))

979

return rq_is_sync(rq1) ? rq1 : rq2;

979

return rq_is_sync(rq1) ? rq1 : rq2;

980

981

if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)

981

if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)

982

return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;

982

return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;

983

984

s1 = blk_rq_pos(rq1);

984

s1 = blk_rq_pos(rq1);

985

s2 = blk_rq_pos(rq2);

985

s2 = blk_rq_pos(rq2);

986

987

/*

987

/*

988

* by definition, 1KiB is 2 sectors

988

* by definition, 1KiB is 2 sectors

989

*/

989

*/

990

back_max = cfqd->cfq_back_max * 2;

990

back_max = cfqd->cfq_back_max * 2;

991

992

/*

992

/*

993

* Strict one way elevator _except_ in the case where we allow

993

* Strict one way elevator _except_ in the case where we allow

994

* short backward seeks which are biased as twice the cost of a

994

* short backward seeks which are biased as twice the cost of a

995

* similar forward seek.

995

* similar forward seek.

996

*/

996

*/

997

if (s1 >= last)

997

if (s1 >= last)

998

d1 = s1 - last;

998

d1 = s1 - last;

999

else if (s1 + back_max >= last)

999

else if (s1 + back_max >= last)

1000

d1 = (last - s1) * cfqd->cfq_back_penalty;

1000

d1 = (last - s1) * cfqd->cfq_back_penalty;

1001

else

1001

else

1002

wrap |= CFQ_RQ1_WRAP;

1002

wrap |= CFQ_RQ1_WRAP;

1003

1004

if (s2 >= last)

1004

if (s2 >= last)

1005

d2 = s2 - last;

1005

d2 = s2 - last;

1006

else if (s2 + back_max >= last)

1006

else if (s2 + back_max >= last)

1007

d2 = (last - s2) * cfqd->cfq_back_penalty;

1007

d2 = (last - s2) * cfqd->cfq_back_penalty;

1008

else

1008

else

1009

wrap |= CFQ_RQ2_WRAP;

1009

wrap |= CFQ_RQ2_WRAP;

1010

1011

/* Found required data */

1011

/* Found required data */

1012

1013

/*

1013

/*

1014

* By doing switch() on the bit mask "wrap" we avoid having to

1014

* By doing switch() on the bit mask "wrap" we avoid having to

1015

* check two variables for all permutations: --> faster!

1015

* check two variables for all permutations: --> faster!

1016

*/

1016

*/

1017

switch (wrap) {

1017

switch (wrap) {

1018

case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1018

case 0: /* common case for CFQ: rq1 and rq2 not wrapped */

1019

if (d1 < d2)

1019

if (d1 < d2)

1020

return rq1;

1020

return rq1;

1021

else if (d2 < d1)

1021

else if (d2 < d1)

1022

return rq2;

1022

return rq2;

1023

else {

1023

else {

1024

if (s1 >= s2)

1024

if (s1 >= s2)

1025

return rq1;

1025

return rq1;

1026

else

1026

else

1027

return rq2;

1027

return rq2;

1028

}

1028

}

1029

1030

case CFQ_RQ2_WRAP:

1030

case CFQ_RQ2_WRAP:

1031

return rq1;

1031

return rq1;

1032

case CFQ_RQ1_WRAP:

1032

case CFQ_RQ1_WRAP:

1033

return rq2;

1033

return rq2;

1034

case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */

1034

case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */

1035

default:

1035

default:

1036

/*

1036

/*

1037

* Since both rqs are wrapped,

1037

* Since both rqs are wrapped,

1038

* start with the one that's further behind head

1038

* start with the one that's further behind head

1039

* (--> only *one* back seek required),

1039

* (--> only *one* back seek required),

1040

* since back seek takes more time than forward.

1040

* since back seek takes more time than forward.

1041

*/

1041

*/

1042

if (s1 <= s2)

1042

if (s1 <= s2)

1043

return rq1;

1043

return rq1;

1044

else

1044

else

1045

return rq2;

1045

return rq2;

1046

}

1046

}

1047

}

1047

}

1048

1049

/*

1049

/*

1050

* The below is leftmost cache rbtree addon

1050

* The below is leftmost cache rbtree addon

1051

*/

1051

*/

1052

static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)

1052

static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)

1053

{

1053

{

1054

/* Service tree is empty */

1054

/* Service tree is empty */

1055

if (!root->count)

1055

if (!root->count)

1056

return NULL;

1056

return NULL;

1057

1058

if (!root->left)

1058

if (!root->left)

1059

root->left = rb_first(&root->rb);

1059

root->left = rb_first(&root->rb);

1060

1061

if (root->left)

1061

if (root->left)

1062

return rb_entry(root->left, struct cfq_queue, rb_node);

1062

return rb_entry(root->left, struct cfq_queue, rb_node);

1063

1064

return NULL;

1064

return NULL;

1065

}

1065

}

1066

1067

static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)

1067

static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)

1068

{

1068

{

1069

if (!root->left)

1069

if (!root->left)

1070

root->left = rb_first(&root->rb);

1070

root->left = rb_first(&root->rb);

1071

1072

if (root->left)

1072

if (root->left)

1073

return rb_entry_cfqg(root->left);

1073

return rb_entry_cfqg(root->left);

1074

1075

return NULL;

1075

return NULL;

1076

}

1076

}

1077

1078

/*
 * Unlink @n from @root and then clear the node with RB_CLEAR_NODE(),
 * so that later RB_EMPTY_NODE() checks on @n see it as unlinked.
 */
static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

1083

1084

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)

1084

static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)

1085

{

1085

{

1086

if (root->left == n)

1086

if (root->left == n)

1087

root->left = NULL;

1087

root->left = NULL;

1088

rb_erase_init(n, &root->rb);

1088

rb_erase_init(n, &root->rb);

1089

--root->count;

1089

--root->count;

1090

}

1090

}

1091

1092

/*

1092

/*

1093

* would be nice to take fifo expire time into account as well

1093

* would be nice to take fifo expire time into account as well

1094

*/

1094

*/

1095

static struct request *

1095

static struct request *

1096

cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,

1096

cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,

1097

struct request *last)

1097

struct request *last)

1098

{

1098

{

1099

struct rb_node *rbnext = rb_next(&last->rb_node);

1099

struct rb_node *rbnext = rb_next(&last->rb_node);

1100

struct rb_node *rbprev = rb_prev(&last->rb_node);

1100

struct rb_node *rbprev = rb_prev(&last->rb_node);

1101

struct request *next = NULL, *prev = NULL;

1101

struct request *next = NULL, *prev = NULL;

1102

1103

BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1103

BUG_ON(RB_EMPTY_NODE(&last->rb_node));

1104

1105

if (rbprev)

1105

if (rbprev)

1106

prev = rb_entry_rq(rbprev);

1106

prev = rb_entry_rq(rbprev);

1107

1108

if (rbnext)

1108

if (rbnext)

1109

next = rb_entry_rq(rbnext);

1109

next = rb_entry_rq(rbnext);

1110

else {

1110

else {

1111

rbnext = rb_first(&cfqq->sort_list);

1111

rbnext = rb_first(&cfqq->sort_list);

1112

if (rbnext && rbnext != &last->rb_node)

1112

if (rbnext && rbnext != &last->rb_node)

1113

next = rb_entry_rq(rbnext);

1113

next = rb_entry_rq(rbnext);

1114

}

1114

}

1115

1116

return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));

1116

return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));

1117

}

1117

}

1118

1119

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,

1119

static unsigned long cfq_slice_offset(struct cfq_data *cfqd,

1120

struct cfq_queue *cfqq)

1120

struct cfq_queue *cfqq)

1121

{

1121

{

1122

/*

1122

/*

1123

* just an approximation, should be ok.

1123

* just an approximation, should be ok.

1124

*/

1124

*/

1125

return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -

1125

return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -

1126

cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));

1126

cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));

1127

}

1127

}

1128

1129

static inline s64

1129

static inline s64

1130

cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)

1130

cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)

1131

{

1131

{

1132

return cfqg->vdisktime - st->min_vdisktime;

1132

return cfqg->vdisktime - st->min_vdisktime;

1133

}

1133

}

1134

1135

static void

1135

static void

1136

__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)

1136

__cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)

1137

{

1137

{

1138

struct rb_node **node = &st->rb.rb_node;

1138

struct rb_node **node = &st->rb.rb_node;

1139

struct rb_node *parent = NULL;

1139

struct rb_node *parent = NULL;

1140

struct cfq_group *__cfqg;

1140

struct cfq_group *__cfqg;

1141

s64 key = cfqg_key(st, cfqg);

1141

s64 key = cfqg_key(st, cfqg);

1142

int left = 1;

1142

int left = 1;

1143

1144

while (*node != NULL) {

1144

while (*node != NULL) {

1145

parent = *node;

1145

parent = *node;

1146

__cfqg = rb_entry_cfqg(parent);

1146

__cfqg = rb_entry_cfqg(parent);

1147

1148

if (key < cfqg_key(st, __cfqg))

1148

if (key < cfqg_key(st, __cfqg))

1149

node = &parent->rb_left;

1149

node = &parent->rb_left;

1150

else {

1150

else {

1151

node = &parent->rb_right;

1151

node = &parent->rb_right;

1152

left = 0;

1152

left = 0;

1153

}

1153

}

1154

}

1154

}

1155

1156

if (left)

1156

if (left)

1157

st->left = &cfqg->rb_node;

1157

st->left = &cfqg->rb_node;

1158

1159

rb_link_node(&cfqg->rb_node, parent, node);

1159

rb_link_node(&cfqg->rb_node, parent, node);

1160

rb_insert_color(&cfqg->rb_node, &st->rb);

1160

rb_insert_color(&cfqg->rb_node, &st->rb);

1161

}

1161

}

1162

1163

static void

1163

static void

1164

cfq_update_group_weight(struct cfq_group *cfqg)

1164

cfq_update_group_weight(struct cfq_group *cfqg)

1165

{

1165

{

1166

BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

1166

BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

1167

if (cfqg->new_weight) {

1167

if (cfqg->new_weight) {

1168

cfqg->weight = cfqg->new_weight;

1168

cfqg->weight = cfqg->new_weight;

1169

cfqg->new_weight = 0;

1169

cfqg->new_weight = 0;

1170

}

1170

}

1171

}

1171

}

1172

1173

static void

1173

static void

1174

cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)

1174

cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)

1175

{

1175

{

1176

BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

1176

BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));

1177

1178

cfq_update_group_weight(cfqg);

1178

cfq_update_group_weight(cfqg);

1179

__cfq_group_service_tree_add(st, cfqg);

1179

__cfq_group_service_tree_add(st, cfqg);

1180

st->total_weight += cfqg->weight;

1180

st->total_weight += cfqg->weight;

1181

}

1181

}

1182

1183

static void

1183

static void

1184

cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)

1184

cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)

1185

{

1185

{

1186

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1186

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1187

struct cfq_group *__cfqg;

1187

struct cfq_group *__cfqg;

1188

struct rb_node *n;

1188

struct rb_node *n;

1189

1190

cfqg->nr_cfqq++;

1190

cfqg->nr_cfqq++;

1191

if (!RB_EMPTY_NODE(&cfqg->rb_node))

1191

if (!RB_EMPTY_NODE(&cfqg->rb_node))

1192

return;

1192

return;

1193

1194

/*

1194

/*

1195

* Currently put the group at the end. Later implement something

1195

* Currently put the group at the end. Later implement something

1196

* so that groups get lesser vtime based on their weights, so that

1196

* so that groups get lesser vtime based on their weights, so that

1197

* if group does not loose all if it was not continuously backlogged.

1197

* if group does not loose all if it was not continuously backlogged.

1198

*/

1198

*/

1199

n = rb_last(&st->rb);

1199

n = rb_last(&st->rb);

1200

if (n) {

1200

if (n) {

1201

__cfqg = rb_entry_cfqg(n);

1201

__cfqg = rb_entry_cfqg(n);

1202

cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;

1202

cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;

1203

} else

1203

} else

1204

cfqg->vdisktime = st->min_vdisktime;

1204

cfqg->vdisktime = st->min_vdisktime;

1205

cfq_group_service_tree_add(st, cfqg);

1205

cfq_group_service_tree_add(st, cfqg);

1206

}

1206

}

1207

1208

static void

1208

static void

1209

cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)

1209

cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)

1210

{

1210

{

1211

st->total_weight -= cfqg->weight;

1211

st->total_weight -= cfqg->weight;

1212

if (!RB_EMPTY_NODE(&cfqg->rb_node))

1212

if (!RB_EMPTY_NODE(&cfqg->rb_node))

1213

cfq_rb_erase(&cfqg->rb_node, st);

1213

cfq_rb_erase(&cfqg->rb_node, st);

1214

}

1214

}

1215

1216

static void

1216

static void

1217

cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)

1217

cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)

1218

{

1218

{

1219

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1219

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1220

1221

BUG_ON(cfqg->nr_cfqq < 1);

1221

BUG_ON(cfqg->nr_cfqq < 1);

1222

cfqg->nr_cfqq--;

1222

cfqg->nr_cfqq--;

1223

1224

/* If there are other cfq queues under this group, don't delete it */

1224

/* If there are other cfq queues under this group, don't delete it */

1225

if (cfqg->nr_cfqq)

1225

if (cfqg->nr_cfqq)

1226

return;

1226

return;

1227

1228

cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");

1228

cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");

1229

cfq_group_service_tree_del(st, cfqg);

1229

cfq_group_service_tree_del(st, cfqg);

1230

cfqg->saved_workload_slice = 0;

1230

cfqg->saved_workload_slice = 0;

1231

cfqg_stats_update_dequeue(cfqg);

1231

cfqg_stats_update_dequeue(cfqg);

1232

}

1232

}

1233

1234

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,

1234

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,

1235

unsigned int *unaccounted_time)

1235

unsigned int *unaccounted_time)

1236

{

1236

{

1237

unsigned int slice_used;

1237

unsigned int slice_used;

1238

1239

/*

1239

/*

1240

* Queue got expired before even a single request completed or

1240

* Queue got expired before even a single request completed or

1241

* got expired immediately after first request completion.

1241

* got expired immediately after first request completion.

1242

*/

1242

*/

1243

if (!cfqq->slice_start || cfqq->slice_start == jiffies) {

1243

if (!cfqq->slice_start || cfqq->slice_start == jiffies) {

1244

/*

1244

/*

1245

* Also charge the seek time incurred to the group, otherwise

1245

* Also charge the seek time incurred to the group, otherwise

1246

* if there are mutiple queues in the group, each can dispatch

1246

* if there are mutiple queues in the group, each can dispatch

1247

* a single request on seeky media and cause lots of seek time

1247

* a single request on seeky media and cause lots of seek time

1248

* and group will never know it.

1248

* and group will never know it.

1249

*/

1249

*/

1250

slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),

1250

slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),

1251

1);

1251

1);

1252

} else {

1252

} else {

1253

slice_used = jiffies - cfqq->slice_start;

1253

slice_used = jiffies - cfqq->slice_start;

1254

if (slice_used > cfqq->allocated_slice) {

1254

if (slice_used > cfqq->allocated_slice) {

1255

*unaccounted_time = slice_used - cfqq->allocated_slice;

1255

*unaccounted_time = slice_used - cfqq->allocated_slice;

1256

slice_used = cfqq->allocated_slice;

1256

slice_used = cfqq->allocated_slice;

1257

}

1257

}

1258

if (time_after(cfqq->slice_start, cfqq->dispatch_start))

1258

if (time_after(cfqq->slice_start, cfqq->dispatch_start))

1259

*unaccounted_time += cfqq->slice_start -

1259

*unaccounted_time += cfqq->slice_start -

1260

cfqq->dispatch_start;

1260

cfqq->dispatch_start;

1261

}

1261

}

1262

1263

return slice_used;

1263

return slice_used;

1264

}

1264

}

1265

1266

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,

1266

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,

1267

struct cfq_queue *cfqq)

1267

struct cfq_queue *cfqq)

1268

{

1268

{

1269

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1269

struct cfq_rb_root *st = &cfqd->grp_service_tree;

1270

unsigned int used_sl, charge, unaccounted_sl = 0;

1270

unsigned int used_sl, charge, unaccounted_sl = 0;

1271

int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)

1271

int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)

1272

- cfqg->service_tree_idle.count;

1272

- cfqg->service_tree_idle.count;

1273

1274

BUG_ON(nr_sync < 0);

1274

BUG_ON(nr_sync < 0);

1275

used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);

1275

used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);

1276

1277

if (iops_mode(cfqd))

1277

if (iops_mode(cfqd))

1278

charge = cfqq->slice_dispatch;

1278

charge = cfqq->slice_dispatch;

1279

else if (!cfq_cfqq_sync(cfqq) && !nr_sync)

1279

else if (!cfq_cfqq_sync(cfqq) && !nr_sync)

1280

charge = cfqq->allocated_slice;

1280

charge = cfqq->allocated_slice;

1281

1282

/* Can't update vdisktime while group is on service tree */

1282

/* Can't update vdisktime while group is on service tree */

1283

cfq_group_service_tree_del(st, cfqg);

1283

cfq_group_service_tree_del(st, cfqg);

1284

cfqg->vdisktime += cfq_scale_slice(charge, cfqg);

1284

cfqg->vdisktime += cfq_scale_slice(charge, cfqg);

1285

/* If a new weight was requested, update now, off tree */

1285

/* If a new weight was requested, update now, off tree */

1286

cfq_group_service_tree_add(st, cfqg);

1286

cfq_group_service_tree_add(st, cfqg);

1287

1288

/* This group is being expired. Save the context */

1288

/* This group is being expired. Save the context */

1289

if (time_after(cfqd->workload_expires, jiffies)) {

1289

if (time_after(cfqd->workload_expires, jiffies)) {

1290

cfqg->saved_workload_slice = cfqd->workload_expires

1290

cfqg->saved_workload_slice = cfqd->workload_expires

1291

- jiffies;

1291

- jiffies;

1292

cfqg->saved_workload = cfqd->serving_type;

1292

cfqg->saved_workload = cfqd->serving_type;

1293

cfqg->saved_serving_prio = cfqd->serving_prio;

1293

cfqg->saved_serving_prio = cfqd->serving_prio;

1294

} else

1294

} else

1295

cfqg->saved_workload_slice = 0;

1295

cfqg->saved_workload_slice = 0;

1296

1297

cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,

1297

cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,

1298

st->min_vdisktime);

1298

st->min_vdisktime);

1299

cfq_log_cfqq(cfqq->cfqd, cfqq,

1299

cfq_log_cfqq(cfqq->cfqd, cfqq,

1300

"sl_used=%u disp=%u charge=%u iops=%u sect=%lu",

1300

"sl_used=%u disp=%u charge=%u iops=%u sect=%lu",

1301

used_sl, cfqq->slice_dispatch, charge,

1301

used_sl, cfqq->slice_dispatch, charge,

1302

iops_mode(cfqd), cfqq->nr_sectors);

1302

iops_mode(cfqd), cfqq->nr_sectors);

1303

cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);

1303

cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);

1304

cfqg_stats_set_start_empty_time(cfqg);

1304

cfqg_stats_set_start_empty_time(cfqg);

1305

}

1305

}

1306

1307

/**

1307

/**

1308

* cfq_init_cfqg_base - initialize base part of a cfq_group

1308

* cfq_init_cfqg_base - initialize base part of a cfq_group

1309

* @cfqg: cfq_group to initialize

1309

* @cfqg: cfq_group to initialize

1310

*

1310

*

1311

* Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED

1311

* Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED

1312

* is enabled or not.

1312

* is enabled or not.

1313

*/

1313

*/

1314

static void cfq_init_cfqg_base(struct cfq_group *cfqg)

1314

static void cfq_init_cfqg_base(struct cfq_group *cfqg)

1315

{

1315

{

1316

struct cfq_rb_root *st;

1316

struct cfq_rb_root *st;

1317

int i, j;

1317

int i, j;

1318

1319

for_each_cfqg_st(cfqg, i, j, st)

1319

for_each_cfqg_st(cfqg, i, j, st)

1320

*st = CFQ_RB_ROOT;

1320

*st = CFQ_RB_ROOT;

1321

RB_CLEAR_NODE(&cfqg->rb_node);

1321

RB_CLEAR_NODE(&cfqg->rb_node);

1322

1323

cfqg->ttime.last_end_request = jiffies;

1323

cfqg->ttime.last_end_request = jiffies;

1324

}

1324

}

1325

1326

#ifdef CONFIG_CFQ_GROUP_IOSCHED

1326

#ifdef CONFIG_CFQ_GROUP_IOSCHED

1327

static void cfq_init_blkio_group(struct blkio_group *blkg)

1327

static void cfq_init_blkio_group(struct blkio_group *blkg)

1328

{

1328

{

1329

struct cfq_group *cfqg = blkg_to_cfqg(blkg);

1329

struct cfq_group *cfqg = blkg_to_cfqg(blkg);

1330

1331

cfq_init_cfqg_base(cfqg);

1331

cfq_init_cfqg_base(cfqg);

1332

cfqg->weight = blkg->blkcg->cfq_weight;

1332

cfqg->weight = blkg->blkcg->cfq_weight;

1333

}

1333

}

1334

1335

/*

1335

/*

1336

* Search for the cfq group current task belongs to. request_queue lock must

1336

* Search for the cfq group current task belongs to. request_queue lock must

1337

* be held.

1337

* be held.

1338

*/

1338

*/

1339

static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,

1339

static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,

1340

struct blkio_cgroup *blkcg)

1340

struct blkio_cgroup *blkcg)

1341

{

1341

{

1342

struct request_queue *q = cfqd->queue;

1342

struct request_queue *q = cfqd->queue;

1343

struct cfq_group *cfqg = NULL;

1343

struct cfq_group *cfqg = NULL;

1344

1345

/* avoid lookup for the common case where there's no blkio cgroup */

1345

/* avoid lookup for the common case where there's no blkio cgroup */

1346

if (blkcg == &blkio_root_cgroup) {

1346

if (blkcg == &blkio_root_cgroup) {

1347

cfqg = cfqd->root_group;

1347

cfqg = cfqd->root_group;

1348

} else {

1348

} else {

1349

struct blkio_group *blkg;

1349

struct blkio_group *blkg;

1350

1351

blkg = blkg_lookup_create(blkcg, q, false);

1351

blkg = blkg_lookup_create(blkcg, q, false);

1352

if (!IS_ERR(blkg))

1352

if (!IS_ERR(blkg))

1353

cfqg = blkg_to_cfqg(blkg);

1353

cfqg = blkg_to_cfqg(blkg);

1354

}

1354

}

1355

1356

return cfqg;

1356

return cfqg;

1357

}

1357

}

1358

1359

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)

1359

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)

1360

{

1360

{

1361

/* Currently, all async queues are mapped to root group */

1361

/* Currently, all async queues are mapped to root group */

1362

if (!cfq_cfqq_sync(cfqq))

1362

if (!cfq_cfqq_sync(cfqq))

1363

cfqg = cfqq->cfqd->root_group;

1363

cfqg = cfqq->cfqd->root_group;

1364

1365

cfqq->cfqg = cfqg;

1365

cfqq->cfqg = cfqg;

1366

/* cfqq reference on cfqg */

1366

/* cfqq reference on cfqg */

1367

cfqg_get(cfqg);

1367

cfqg_get(cfqg);

1368

}

1368

}

1369

1370

static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)

1370

static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)

1371

{

1371

{

1372

struct cfq_group *cfqg = pdata;

1372

struct cfq_group *cfqg = pdata;

1373

1374

if (!cfqg->dev_weight)

1374

if (!cfqg->dev_weight)

1375

return 0;

1375

return 0;

1376

return __blkg_prfill_u64(sf, pdata, cfqg->dev_weight);

1376

return __blkg_prfill_u64(sf, pdata, cfqg->dev_weight);

1377

}

1377

}

1378

1379

static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,

1379

static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,

1380

struct seq_file *sf)

1380

struct seq_file *sf)

1381

{

1381

{

1382

blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),

1382

blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),

1383

cfqg_prfill_weight_device, &blkio_policy_cfq, 0,

1383

cfqg_prfill_weight_device, &blkio_policy_cfq, 0,

1384

false);

1384

false);

1385

return 0;

1385

return 0;

1386

}

1386

}

1387

1388

static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,

1388

static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,

1389

struct seq_file *sf)

1389

struct seq_file *sf)

1390

{

1390

{

1391

seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);

1391

seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);

1392

return 0;

1392

return 0;

1393

}

1393

}

1394

1395

static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,

1395

static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,

1396

const char *buf)

1396

const char *buf)

1397

{

1397

{

1398

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1398

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1399

struct blkg_conf_ctx ctx;

1399

struct blkg_conf_ctx ctx;

1400

struct cfq_group *cfqg;

1400

struct cfq_group *cfqg;

1401

int ret;

1401

int ret;

1402

1403

ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);

1403

ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);

1404

if (ret)

1404

if (ret)

1405

return ret;

1405

return ret;

1406

1407

ret = -EINVAL;

1407

ret = -EINVAL;

1408

cfqg = blkg_to_cfqg(ctx.blkg);

1408

cfqg = blkg_to_cfqg(ctx.blkg);

1409

if (cfqg && (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN &&

1409

if (cfqg && (!ctx.v || (ctx.v >= CFQ_WEIGHT_MIN &&

1410

ctx.v <= CFQ_WEIGHT_MAX))) {

1410

ctx.v <= CFQ_WEIGHT_MAX))) {

1411

cfqg->dev_weight = ctx.v;

1411

cfqg->dev_weight = ctx.v;

1412

cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;

1412

cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;

1413

ret = 0;

1413

ret = 0;

1414

}

1414

}

1415

1416

blkg_conf_finish(&ctx);

1416

blkg_conf_finish(&ctx);

1417

return ret;

1417

return ret;

1418

}

1418

}

1419

1420

/*
 * Write handler for the cgroup-wide weight. Rejects values outside
 * [CFQ_WEIGHT_MIN, CFQ_WEIGHT_MAX] with -EINVAL. Otherwise, under the
 * cgroup lock, stores the new weight and queues it as new_weight on
 * every group of the cgroup that has no device-specific weight.
 */
static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
	struct blkio_group *blkg;
	struct hlist_node *pos;

	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
		return -EINVAL;

	spin_lock_irq(&blkcg->lock);

	blkcg->cfq_weight = (unsigned int)val;

	hlist_for_each_entry(blkg, pos, &blkcg->blkg_list, blkcg_node) {
		struct cfq_group *cfqg = blkg_to_cfqg(blkg);

		/* a per-device weight takes precedence: leave it alone */
		if (!cfqg || cfqg->dev_weight)
			continue;

		cfqg->new_weight = blkcg->cfq_weight;
	}

	spin_unlock_irq(&blkcg->lock);
	return 0;
}

1442

1443

static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,

1443

static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,

1444

struct seq_file *sf)

1444

struct seq_file *sf)

1445

{

1445

{

1446

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1446

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1447

1448

blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,

1448

blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,

1449

cft->private, false);

1449

cft->private, false);

1450

return 0;

1450

return 0;

1451

}

1451

}

1452

1453

static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,

1453

static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,

1454

struct seq_file *sf)

1454

struct seq_file *sf)

1455

{

1455

{

1456

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1456

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1457

1458

blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,

1458

blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,

1459

cft->private, true);

1459

cft->private, true);

1460

return 0;

1460

return 0;

1461

}

1461

}

1462

1463

#ifdef CONFIG_DEBUG_BLK_CGROUP

1463

#ifdef CONFIG_DEBUG_BLK_CGROUP

1464

static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)

1464

static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)

1465

{

1465

{

1466

struct cfq_group *cfqg = pdata;

1466

struct cfq_group *cfqg = pdata;

1467

u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);

1467

u64 samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);

1468

u64 v = 0;

1468

u64 v = 0;

1469

1470

if (samples) {

1470

if (samples) {

1471

v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);

1471

v = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);

1472

do_div(v, samples);

1472

do_div(v, samples);

1473

}

1473

}

1474

__blkg_prfill_u64(sf, pdata, v);

1474

__blkg_prfill_u64(sf, pdata, v);

1475

return 0;

1475

return 0;

1476

}

1476

}

1477

1478

/* print avg_queue_size */

1478

/* print avg_queue_size */

1479

static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,

1479

static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,

1480

struct seq_file *sf)

1480

struct seq_file *sf)

1481

{

1481

{

1482

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1482

struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);

1483

1484

blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,

1484

blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,

1485

&blkio_policy_cfq, 0, false);

1485

&blkio_policy_cfq, 0, false);

1486

return 0;

1486

return 0;

1487

}

1487

}

1488

#endif /* CONFIG_DEBUG_BLK_CGROUP */

1488

#endif /* CONFIG_DEBUG_BLK_CGROUP */

1489

1490

static struct cftype cfq_blkcg_files[] = {

1490

static struct cftype cfq_blkcg_files[] = {

1491

{

1491

{

1492

.name = "weight_device",

1492

.name = "weight_device",

1493

.read_seq_string = cfqg_print_weight_device,

1493

.read_seq_string = cfqg_print_weight_device,

1494

.write_string = cfqg_set_weight_device,

1494

.write_string = cfqg_set_weight_device,

1495

.max_write_len = 256,

1495

.max_write_len = 256,

1496

},

1496

},

1497

{

1497

{

1498

.name = "weight",

1498

.name = "weight",

1499

.read_seq_string = cfq_print_weight,

1499

.read_seq_string = cfq_print_weight,

1500

.write_u64 = cfq_set_weight,

1500

.write_u64 = cfq_set_weight,

1501

},

1501

},

1502

{

1502

{

1503

.name = "time",

1503

.name = "time",

1504

.private = offsetof(struct cfq_group, stats.time),

1504

.private = offsetof(struct cfq_group, stats.time),

1505

.read_seq_string = cfqg_print_stat,

1505

.read_seq_string = cfqg_print_stat,

1506

},

1506

},

1507

{

1507

{

1508

.name = "sectors",

1508

.name = "sectors",

1509

.private = offsetof(struct cfq_group, stats.sectors),

1509

.private = offsetof(struct cfq_group, stats.sectors),

1510

.read_seq_string = cfqg_print_stat,

1510

.read_seq_string = cfqg_print_stat,

1511

},

1511

},

1512

{

1512

{

1513

.name = "io_service_bytes",

1513

.name = "io_service_bytes",

1514

.private = offsetof(struct cfq_group, stats.service_bytes),

1514

.private = offsetof(struct cfq_group, stats.service_bytes),

1515

.read_seq_string = cfqg_print_rwstat,

1515

.read_seq_string = cfqg_print_rwstat,

1516

},

1516

},

1517

{

1517

{

1518

.name = "io_serviced",

1518

.name = "io_serviced",

1519

.private = offsetof(struct cfq_group, stats.serviced),

1519

.private = offsetof(struct cfq_group, stats.serviced),

1520

.read_seq_string = cfqg_print_rwstat,

1520

.read_seq_string = cfqg_print_rwstat,

1521

},

1521

},

1522

{

1522

{

1523

.name = "io_service_time",

1523

.name = "io_service_time",

1524

.private = offsetof(struct cfq_group, stats.service_time),

1524

.private = offsetof(struct cfq_group, stats.service_time),

1525

.read_seq_string = cfqg_print_rwstat,

1525

.read_seq_string = cfqg_print_rwstat,

1526

},

1526

},

1527

{

1527

{

1528

.name = "io_wait_time",

1528

.name = "io_wait_time",

1529

.private = offsetof(struct cfq_group, stats.wait_time),

1529

.private = offsetof(struct cfq_group, stats.wait_time),

1530

.read_seq_string = cfqg_print_rwstat,

1530

.read_seq_string = cfqg_print_rwstat,

1531

},

1531

},

1532

{

1532

{

1533

.name = "io_merged",

1533

.name = "io_merged",

1534

.private = offsetof(struct cfq_group, stats.merged),

1534

.private = offsetof(struct cfq_group, stats.merged),

1535

.read_seq_string = cfqg_print_rwstat,

1535

.read_seq_string = cfqg_print_rwstat,

1536

},

1536

},

1537

{

1537

{

1538

.name = "io_queued",

1538

.name = "io_queued",

1539

.private = offsetof(struct cfq_group, stats.queued),

1539

.private = offsetof(struct cfq_group, stats.queued),

1540

.read_seq_string = cfqg_print_rwstat,

1540

.read_seq_string = cfqg_print_rwstat,

1541

},

1541

},

1542

#ifdef CONFIG_DEBUG_BLK_CGROUP

1542

#ifdef CONFIG_DEBUG_BLK_CGROUP

1543

{

1543

{

1544

.name = "avg_queue_size",

1544

.name = "avg_queue_size",

1545

.read_seq_string = cfqg_print_avg_queue_size,

1545

.read_seq_string = cfqg_print_avg_queue_size,

1546

},

1546

},

1547

{

1547

{

1548

.name = "group_wait_time",

1548

.name = "group_wait_time",

1549

.private = offsetof(struct cfq_group, stats.group_wait_time),

1549

.private = offsetof(struct cfq_group, stats.group_wait_time),

1550

.read_seq_string = cfqg_print_stat,

1550

.read_seq_string = cfqg_print_stat,

1551

},

1551

},

1552

{

1552

{

1553

.name = "idle_time",

1553

.name = "idle_time",

1554

.private = offsetof(struct cfq_group, stats.idle_time),

1554

.private = offsetof(struct cfq_group, stats.idle_time),

1555

.read_seq_string = cfqg_print_stat,

1555

.read_seq_string = cfqg_print_stat,

1556

},

1556

},

1557

{

1557

{

1558

.name = "empty_time",

1558

.name = "empty_time",

1559

.private = offsetof(struct cfq_group, stats.empty_time),

1559

.private = offsetof(struct cfq_group, stats.empty_time),

1560

.read_seq_string = cfqg_print_stat,

1560

.read_seq_string = cfqg_print_stat,

1561

},

1561

},

1562

{

1562

{

1563

.name = "dequeue",

1563

.name = "dequeue",

1564

.private = offsetof(struct cfq_group, stats.dequeue),

1564

.private = offsetof(struct cfq_group, stats.dequeue),

1565

.read_seq_string = cfqg_print_stat,

1565

.read_seq_string = cfqg_print_stat,

1566

},

1566

},

1567

{

1567

{

1568

.name = "unaccounted_time",

1568

.name = "unaccounted_time",

1569

.private = offsetof(struct cfq_group, stats.unaccounted_time),

1569

.private = offsetof(struct cfq_group, stats.unaccounted_time),

1570

.read_seq_string = cfqg_print_stat,

1570

.read_seq_string = cfqg_print_stat,

1571

},

1571

},

1572

#endif /* CONFIG_DEBUG_BLK_CGROUP */

1572

#endif /* CONFIG_DEBUG_BLK_CGROUP */

1573

{ } /* terminate */

1573

{ } /* terminate */

1574

};

1574

};

1575

#else /* GROUP_IOSCHED */

1575

#else /* GROUP_IOSCHED */

1576

static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,

1576

static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,

1577

struct blkio_cgroup *blkcg)

1577

struct blkio_cgroup *blkcg)

1578

{

1578

{

1579

return cfqd->root_group;

1579

return cfqd->root_group;

1580

}

1580

}

1581

1582

static inline void

1582

static inline void

1583

cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {

1583

cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {

1584

cfqq->cfqg = cfqg;

1584

cfqq->cfqg = cfqg;

1585

}

1585

}

1586

1587

#endif /* GROUP_IOSCHED */

1587

#endif /* GROUP_IOSCHED */

1588

1589

/*

1589

/*

1590

* The cfqd->service_trees holds all pending cfq_queue's that have

1590

* The cfqd->service_trees holds all pending cfq_queue's that have

1591

* requests waiting to be processed. It is sorted in the order that

1591

* requests waiting to be processed. It is sorted in the order that

1592

* we will service the queues.

1592

* we will service the queues.

1593

*/

1593

*/

1594

static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,

1594

static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,

1595

bool add_front)

1595

bool add_front)

1596

{

1596

{

1597

struct rb_node **p, *parent;

1597

struct rb_node **p, *parent;

1598

struct cfq_queue *__cfqq;

1598

struct cfq_queue *__cfqq;

1599

unsigned long rb_key;

1599

unsigned long rb_key;

1600

struct cfq_rb_root *service_tree;

1600

struct cfq_rb_root *service_tree;

1601

int left;

1601

int left;

1602

int new_cfqq = 1;

1602

int new_cfqq = 1;

1603

1604

service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),

1604

service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),

1605

cfqq_type(cfqq));

1605

cfqq_type(cfqq));

1606

if (cfq_class_idle(cfqq)) {

1606

if (cfq_class_idle(cfqq)) {

1607

rb_key = CFQ_IDLE_DELAY;

1607

rb_key = CFQ_IDLE_DELAY;

1608

parent = rb_last(&service_tree->rb);

1608

parent = rb_last(&service_tree->rb);

1609

if (parent && parent != &cfqq->rb_node) {

1609

if (parent && parent != &cfqq->rb_node) {

1610

__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

1610

__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

1611

rb_key += __cfqq->rb_key;

1611

rb_key += __cfqq->rb_key;

1612

} else

1612

} else

1613

rb_key += jiffies;

1613

rb_key += jiffies;

1614

} else if (!add_front) {

1614

} else if (!add_front) {

1615

/*

1615

/*

1616

* Get our rb key offset. Subtract any residual slice

1616

* Get our rb key offset. Subtract any residual slice

1617

* value carried from last service. A negative resid

1617

* value carried from last service. A negative resid

1618

* count indicates slice overrun, and this should position

1618

* count indicates slice overrun, and this should position

1619

* the next service time further away in the tree.

1619

* the next service time further away in the tree.

1620

*/

1620

*/

1621

rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;

1621

rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;

1622

rb_key -= cfqq->slice_resid;

1622

rb_key -= cfqq->slice_resid;

1623

cfqq->slice_resid = 0;

1623

cfqq->slice_resid = 0;

1624

} else {

1624

} else {

1625

rb_key = -HZ;

1625

rb_key = -HZ;

1626

__cfqq = cfq_rb_first(service_tree);

1626

__cfqq = cfq_rb_first(service_tree);

1627

rb_key += __cfqq ? __cfqq->rb_key : jiffies;

1627

rb_key += __cfqq ? __cfqq->rb_key : jiffies;

1628

}

1628

}

1629

1630

if (!RB_EMPTY_NODE(&cfqq->rb_node)) {

1630

if (!RB_EMPTY_NODE(&cfqq->rb_node)) {

1631

new_cfqq = 0;

1631

new_cfqq = 0;

1632

/*

1632

/*

1633

* same position, nothing more to do

1633

* same position, nothing more to do

1634

*/

1634

*/

1635

if (rb_key == cfqq->rb_key &&

1635

if (rb_key == cfqq->rb_key &&

1636

cfqq->service_tree == service_tree)

1636

cfqq->service_tree == service_tree)

1637

return;

1637

return;

1638

1639

cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);

1639

cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);

1640

cfqq->service_tree = NULL;

1640

cfqq->service_tree = NULL;

1641

}

1641

}

1642

1643

left = 1;

1643

left = 1;

1644

parent = NULL;

1644

parent = NULL;

1645

cfqq->service_tree = service_tree;

1645

cfqq->service_tree = service_tree;

1646

p = &service_tree->rb.rb_node;

1646

p = &service_tree->rb.rb_node;

1647

while (*p) {

1647

while (*p) {

1648

struct rb_node **n;

1648

struct rb_node **n;

1649

1650

parent = *p;

1650

parent = *p;

1651

__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

1651

__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

1652

1653

/*

1653

/*

1654

* sort by key, that represents service time.

1654

* sort by key, that represents service time.

1655

*/

1655

*/

1656

if (time_before(rb_key, __cfqq->rb_key))

1656

if (time_before(rb_key, __cfqq->rb_key))

1657

n = &(*p)->rb_left;

1657

n = &(*p)->rb_left;

1658

else {

1658

else {

1659

n = &(*p)->rb_right;

1659

n = &(*p)->rb_right;

1660

left = 0;

1660

left = 0;

1661

}

1661

}

1662

1663

p = n;

1663

p = n;

1664

}

1664

}

1665

1666

if (left)

1666

if (left)

1667

service_tree->left = &cfqq->rb_node;

1667

service_tree->left = &cfqq->rb_node;

1668

1669

cfqq->rb_key = rb_key;

1669

cfqq->rb_key = rb_key;

1670

rb_link_node(&cfqq->rb_node, parent, p);

1670

rb_link_node(&cfqq->rb_node, parent, p);

1671

rb_insert_color(&cfqq->rb_node, &service_tree->rb);

1671

rb_insert_color(&cfqq->rb_node, &service_tree->rb);

1672

service_tree->count++;

1672

service_tree->count++;

1673

if (add_front || !new_cfqq)

1673

if (add_front || !new_cfqq)

1674

return;

1674

return;

1675

cfq_group_notify_queue_add(cfqd, cfqq->cfqg);

1675

cfq_group_notify_queue_add(cfqd, cfqq->cfqg);

1676

}

1676

}

1677

1678

static struct cfq_queue *

1678

static struct cfq_queue *

1679

cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,

1679

cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,

1680

sector_t sector, struct rb_node **ret_parent,

1680

sector_t sector, struct rb_node **ret_parent,

1681

struct rb_node ***rb_link)

1681

struct rb_node ***rb_link)

1682

{

1682

{

1683

struct rb_node **p, *parent;

1683

struct rb_node **p, *parent;

1684

struct cfq_queue *cfqq = NULL;

1684

struct cfq_queue *cfqq = NULL;

1685

1686

parent = NULL;

1686

parent = NULL;

1687

p = &root->rb_node;

1687

p = &root->rb_node;

1688

while (*p) {

1688

while (*p) {

1689

struct rb_node **n;

1689

struct rb_node **n;

1690

1691

parent = *p;

1691

parent = *p;

1692

cfqq = rb_entry(parent, struct cfq_queue, p_node);

1692

cfqq = rb_entry(parent, struct cfq_queue, p_node);

1693

1694

/*

1694

/*

1695

* Sort strictly based on sector. Smallest to the left,

1695

* Sort strictly based on sector. Smallest to the left,

1696

* largest to the right.

1696

* largest to the right.

1697

*/

1697

*/

1698

if (sector > blk_rq_pos(cfqq->next_rq))

1698

if (sector > blk_rq_pos(cfqq->next_rq))

1699

n = &(*p)->rb_right;

1699

n = &(*p)->rb_right;

1700

else if (sector < blk_rq_pos(cfqq->next_rq))

1700

else if (sector < blk_rq_pos(cfqq->next_rq))

1701

n = &(*p)->rb_left;

1701

n = &(*p)->rb_left;

1702

else

1702

else

1703

break;

1703

break;

1704

p = n;

1704

p = n;

1705

cfqq = NULL;

1705

cfqq = NULL;

1706

}

1706

}

1707

1708

*ret_parent = parent;

1708

*ret_parent = parent;

1709

if (rb_link)

1709

if (rb_link)

1710

*rb_link = p;

1710

*rb_link = p;

1711

return cfqq;

1711

return cfqq;

1712

}

1712

}

1713

1714

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1714

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1715

{

1715

{

1716

struct rb_node **p, *parent;

1716

struct rb_node **p, *parent;

1717

struct cfq_queue *__cfqq;

1717

struct cfq_queue *__cfqq;

1718

1719

if (cfqq->p_root) {

1719

if (cfqq->p_root) {

1720

rb_erase(&cfqq->p_node, cfqq->p_root);

1720

rb_erase(&cfqq->p_node, cfqq->p_root);

1721

cfqq->p_root = NULL;

1721

cfqq->p_root = NULL;

1722

}

1722

}

1723

1724

if (cfq_class_idle(cfqq))

1724

if (cfq_class_idle(cfqq))

1725

return;

1725

return;

1726

if (!cfqq->next_rq)

1726

if (!cfqq->next_rq)

1727

return;

1727

return;

1728

1729

cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];

1729

cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];

1730

__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,

1730

__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,

1731

blk_rq_pos(cfqq->next_rq), &parent, &p);

1731

blk_rq_pos(cfqq->next_rq), &parent, &p);

1732

if (!__cfqq) {

1732

if (!__cfqq) {

1733

rb_link_node(&cfqq->p_node, parent, p);

1733

rb_link_node(&cfqq->p_node, parent, p);

1734

rb_insert_color(&cfqq->p_node, cfqq->p_root);

1734

rb_insert_color(&cfqq->p_node, cfqq->p_root);

1735

} else

1735

} else

1736

cfqq->p_root = NULL;

1736

cfqq->p_root = NULL;

1737

}

1737

}

1738

1739

/*
 * Refresh cfqq's position in both the service tree and the prio tree.
 * Does nothing unless the queue is already flagged as being on the RR
 * list.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (!cfq_cfqq_on_rr(cfqq))
		return;

	cfq_service_tree_add(cfqd, cfqq, 0);
	cfq_prio_tree_add(cfqd, cfqq);
}

1752

1753

/*

1753

/*

1754

* add to busy list of queues for service, trying to be fair in ordering

1754

* add to busy list of queues for service, trying to be fair in ordering

1755

* the pending list according to last request service

1755

* the pending list according to last request service

1756

*/

1756

*/

1757

static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1757

static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1758

{

1758

{

1759

cfq_log_cfqq(cfqd, cfqq, "add_to_rr");

1759

cfq_log_cfqq(cfqd, cfqq, "add_to_rr");

1760

BUG_ON(cfq_cfqq_on_rr(cfqq));

1760

BUG_ON(cfq_cfqq_on_rr(cfqq));

1761

cfq_mark_cfqq_on_rr(cfqq);

1761

cfq_mark_cfqq_on_rr(cfqq);

1762

cfqd->busy_queues++;

1762

cfqd->busy_queues++;

1763

if (cfq_cfqq_sync(cfqq))

1763

if (cfq_cfqq_sync(cfqq))

1764

cfqd->busy_sync_queues++;

1764

cfqd->busy_sync_queues++;

1765

1766

cfq_resort_rr_list(cfqd, cfqq);

1766

cfq_resort_rr_list(cfqd, cfqq);

1767

}

1767

}

1768

1769

/*

1769

/*

1770

* Called when the cfqq no longer has requests pending, remove it from

1770

* Called when the cfqq no longer has requests pending, remove it from

1771

* the service tree.

1771

* the service tree.

1772

*/

1772

*/

1773

static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1773

static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)

1774

{

1774

{

1775

cfq_log_cfqq(cfqd, cfqq, "del_from_rr");

1775

cfq_log_cfqq(cfqd, cfqq, "del_from_rr");

1776

BUG_ON(!cfq_cfqq_on_rr(cfqq));

1776

BUG_ON(!cfq_cfqq_on_rr(cfqq));

1777

cfq_clear_cfqq_on_rr(cfqq);

1777

cfq_clear_cfqq_on_rr(cfqq);

1778

1779

if (!RB_EMPTY_NODE(&cfqq->rb_node)) {

1779

if (!RB_EMPTY_NODE(&cfqq->rb_node)) {

1780

cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);

1780

cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);

1781

cfqq->service_tree = NULL;

1781

cfqq->service_tree = NULL;

1782

}

1782

}

1783

if (cfqq->p_root) {

1783

if (cfqq->p_root) {

1784

rb_erase(&cfqq->p_node, cfqq->p_root);

1784

rb_erase(&cfqq->p_node, cfqq->p_root);

1785

cfqq->p_root = NULL;

1785

cfqq->p_root = NULL;

1786

}

1786

}

1787

1788

cfq_group_notify_queue_del(cfqd, cfqq->cfqg);

1788

cfq_group_notify_queue_del(cfqd, cfqq->cfqg);

1789

BUG_ON(!cfqd->busy_queues);

1789

BUG_ON(!cfqd->busy_queues);

1790

cfqd->busy_queues--;

1790

cfqd->busy_queues--;

1791

if (cfq_cfqq_sync(cfqq))

1791

if (cfq_cfqq_sync(cfqq))

1792

cfqd->busy_sync_queues--;

1792

cfqd->busy_sync_queues--;

1793

}

1793

}

1794

1795

/*

1795

/*

1796

* rb tree support functions

1796

* rb tree support functions

1797

*/

1797

*/

1798

static void cfq_del_rq_rb(struct request *rq)

1798

static void cfq_del_rq_rb(struct request *rq)

1799

{

1799

{

1800

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1800

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1801

const int sync = rq_is_sync(rq);

1801

const int sync = rq_is_sync(rq);

1802

1803

BUG_ON(!cfqq->queued[sync]);

1803

BUG_ON(!cfqq->queued[sync]);

1804

cfqq->queued[sync]--;

1804

cfqq->queued[sync]--;

1805

1806

elv_rb_del(&cfqq->sort_list, rq);

1806

elv_rb_del(&cfqq->sort_list, rq);

1807

1808

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {

1808

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {

1809

/*

1809

/*

1810

* Queue will be deleted from service tree when we actually

1810

* Queue will be deleted from service tree when we actually

1811

* expire it later. Right now just remove it from prio tree

1811

* expire it later. Right now just remove it from prio tree

1812

* as it is empty.

1812

* as it is empty.

1813

*/

1813

*/

1814

if (cfqq->p_root) {

1814

if (cfqq->p_root) {

1815

rb_erase(&cfqq->p_node, cfqq->p_root);

1815

rb_erase(&cfqq->p_node, cfqq->p_root);

1816

cfqq->p_root = NULL;

1816

cfqq->p_root = NULL;

1817

}

1817

}

1818

}

1818

}

1819

}

1819

}

1820

1821

static void cfq_add_rq_rb(struct request *rq)

1821

static void cfq_add_rq_rb(struct request *rq)

1822

{

1822

{

1823

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1823

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1824

struct cfq_data *cfqd = cfqq->cfqd;

1824

struct cfq_data *cfqd = cfqq->cfqd;

1825

struct request *prev;

1825

struct request *prev;

1826

1827

cfqq->queued[rq_is_sync(rq)]++;

1827

cfqq->queued[rq_is_sync(rq)]++;

1828

1829

elv_rb_add(&cfqq->sort_list, rq);

1829

elv_rb_add(&cfqq->sort_list, rq);

1830

1831

if (!cfq_cfqq_on_rr(cfqq))

1831

if (!cfq_cfqq_on_rr(cfqq))

1832

cfq_add_cfqq_rr(cfqd, cfqq);

1832

cfq_add_cfqq_rr(cfqd, cfqq);

1833

1834

/*

1834

/*

1835

* check if this request is a better next-serve candidate

1835

* check if this request is a better next-serve candidate

1836

*/

1836

*/

1837

prev = cfqq->next_rq;

1837

prev = cfqq->next_rq;

1838

cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);

1838

cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);

1839

1840

/*

1840

/*

1841

* adjust priority tree position, if ->next_rq changes

1841

* adjust priority tree position, if ->next_rq changes

1842

*/

1842

*/

1843

if (prev != cfqq->next_rq)

1843

if (prev != cfqq->next_rq)

1844

cfq_prio_tree_add(cfqd, cfqq);

1844

cfq_prio_tree_add(cfqd, cfqq);

1845

1846

BUG_ON(!cfqq->next_rq);

1846

BUG_ON(!cfqq->next_rq);

1847

}

1847

}

1848

1849

static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)

1849

static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)

1850

{

1850

{

1851

elv_rb_del(&cfqq->sort_list, rq);

1851

elv_rb_del(&cfqq->sort_list, rq);

1852

cfqq->queued[rq_is_sync(rq)]--;

1852

cfqq->queued[rq_is_sync(rq)]--;

1853

cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);

1853

cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);

1854

cfq_add_rq_rb(rq);

1854

cfq_add_rq_rb(rq);

1855

cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,

1855

cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,

1856

rq->cmd_flags);

1856

rq->cmd_flags);

1857

}

1857

}

1858

1859

static struct request *

1859

static struct request *

1860

cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)

1860

cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)

1861

{

1861

{

1862

struct task_struct *tsk = current;

1862

struct task_struct *tsk = current;

1863

struct cfq_io_cq *cic;

1863

struct cfq_io_cq *cic;

1864

struct cfq_queue *cfqq;

1864

struct cfq_queue *cfqq;

1865

1866

cic = cfq_cic_lookup(cfqd, tsk->io_context);

1866

cic = cfq_cic_lookup(cfqd, tsk->io_context);

1867

if (!cic)

1867

if (!cic)

1868

return NULL;

1868

return NULL;

1869

1870

cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));

1870

cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));

1871

if (cfqq) {

1871

if (cfqq) {

1872

sector_t sector = bio->bi_sector + bio_sectors(bio);

1872

sector_t sector = bio->bi_sector + bio_sectors(bio);

1873

1874

return elv_rb_find(&cfqq->sort_list, sector);

1874

return elv_rb_find(&cfqq->sort_list, sector);

1875

}

1875

}

1876

1877

return NULL;

1877

return NULL;

1878

}

1878

}

1879

1880

static void cfq_activate_request(struct request_queue *q, struct request *rq)

1880

static void cfq_activate_request(struct request_queue *q, struct request *rq)

1881

{

1881

{

1882

struct cfq_data *cfqd = q->elevator->elevator_data;

1882

struct cfq_data *cfqd = q->elevator->elevator_data;

1883

1884

cfqd->rq_in_driver++;

1884

cfqd->rq_in_driver++;

1885

cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",

1885

cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",

1886

cfqd->rq_in_driver);

1886

cfqd->rq_in_driver);

1887

1888

cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

1888

cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);

1889

}

1889

}

1890

1891

static void cfq_deactivate_request(struct request_queue *q, struct request *rq)

1891

static void cfq_deactivate_request(struct request_queue *q, struct request *rq)

1892

{

1892

{

1893

struct cfq_data *cfqd = q->elevator->elevator_data;

1893

struct cfq_data *cfqd = q->elevator->elevator_data;

1894

1895

WARN_ON(!cfqd->rq_in_driver);

1895

WARN_ON(!cfqd->rq_in_driver);

1896

cfqd->rq_in_driver--;

1896

cfqd->rq_in_driver--;

1897

cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",

1897

cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",

1898

cfqd->rq_in_driver);

1898

cfqd->rq_in_driver);

1899

}

1899

}

1900

1901

static void cfq_remove_request(struct request *rq)

1901

static void cfq_remove_request(struct request *rq)

1902

{

1902

{

1903

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1903

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1904

1905

if (cfqq->next_rq == rq)

1905

if (cfqq->next_rq == rq)

1906

cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

1906

cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

1907

1908

list_del_init(&rq->queuelist);

1908

list_del_init(&rq->queuelist);

1909

cfq_del_rq_rb(rq);

1909

cfq_del_rq_rb(rq);

1910

1911

cfqq->cfqd->rq_queued--;

1911

cfqq->cfqd->rq_queued--;

1912

cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);

1912

cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);

1913

if (rq->cmd_flags & REQ_PRIO) {

1913

if (rq->cmd_flags & REQ_PRIO) {

1914

WARN_ON(!cfqq->prio_pending);

1914

WARN_ON(!cfqq->prio_pending);

1915

cfqq->prio_pending--;

1915

cfqq->prio_pending--;

1916

}

1916

}

1917

}

1917

}

1918

1919

static int cfq_merge(struct request_queue *q, struct request **req,

1919

static int cfq_merge(struct request_queue *q, struct request **req,

1920

struct bio *bio)

1920

struct bio *bio)

1921

{

1921

{

1922

struct cfq_data *cfqd = q->elevator->elevator_data;

1922

struct cfq_data *cfqd = q->elevator->elevator_data;

1923

struct request *__rq;

1923

struct request *__rq;

1924

1925

__rq = cfq_find_rq_fmerge(cfqd, bio);

1925

__rq = cfq_find_rq_fmerge(cfqd, bio);

1926

if (__rq && elv_rq_merge_ok(__rq, bio)) {

1926

if (__rq && elv_rq_merge_ok(__rq, bio)) {

1927

*req = __rq;

1927

*req = __rq;

1928

return ELEVATOR_FRONT_MERGE;

1928

return ELEVATOR_FRONT_MERGE;

1929

}

1929

}

1930

1931

return ELEVATOR_NO_MERGE;

1931

return ELEVATOR_NO_MERGE;

1932

}

1932

}

1933

1934

static void cfq_merged_request(struct request_queue *q, struct request *req,

1934

static void cfq_merged_request(struct request_queue *q, struct request *req,

1935

int type)

1935

int type)

1936

{

1936

{

1937

if (type == ELEVATOR_FRONT_MERGE) {

1937

if (type == ELEVATOR_FRONT_MERGE) {

1938

struct cfq_queue *cfqq = RQ_CFQQ(req);

1938

struct cfq_queue *cfqq = RQ_CFQQ(req);

1939

1940

cfq_reposition_rq_rb(cfqq, req);

1940

cfq_reposition_rq_rb(cfqq, req);

1941

}

1941

}

1942

}

1942

}

1943

1944

static void cfq_bio_merged(struct request_queue *q, struct request *req,

1944

static void cfq_bio_merged(struct request_queue *q, struct request *req,

1945

struct bio *bio)

1945

struct bio *bio)

1946

{

1946

{

1947

cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);

1947

cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);

1948

}

1948

}

1949

1950

static void

1950

static void

1951

cfq_merged_requests(struct request_queue *q, struct request *rq,

1951

cfq_merged_requests(struct request_queue *q, struct request *rq,

1952

struct request *next)

1952

struct request *next)

1953

{

1953

{

1954

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1954

struct cfq_queue *cfqq = RQ_CFQQ(rq);

1955

struct cfq_data *cfqd = q->elevator->elevator_data;

1955

struct cfq_data *cfqd = q->elevator->elevator_data;

1956

1957

/*

1957

/*

1958

* reposition in fifo if next is older than rq

1958

* reposition in fifo if next is older than rq

1959

*/

1959

*/

1960

if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

1960

if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&

1961

time_before(rq_fifo_time(next), rq_fifo_time(rq))) {

1961

time_before(rq_fifo_time(next), rq_fifo_time(rq))) {

1962

list_move(&rq->queuelist, &next->queuelist);

1962

list_move(&rq->queuelist, &next->queuelist);

1963

rq_set_fifo_time(rq, rq_fifo_time(next));

1963

rq_set_fifo_time(rq, rq_fifo_time(next));

1964

}

1964

}

1965

1966

if (cfqq->next_rq == next)

1966

if (cfqq->next_rq == next)

1967

cfqq->next_rq = rq;

1967

cfqq->next_rq = rq;

1968

cfq_remove_request(next);

1968

cfq_remove_request(next);

1969

cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);

1969

cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);

1970

1971

cfqq = RQ_CFQQ(next);

1971

cfqq = RQ_CFQQ(next);

1972

/*

1972

/*

1973

* all requests of this queue are merged to other queues, delete it

1973

* all requests of this queue are merged to other queues, delete it

1974

* from the service tree. If it's the active_queue,

1974

* from the service tree. If it's the active_queue,

1975

* cfq_dispatch_requests() will choose to expire it or do idle

1975

* cfq_dispatch_requests() will choose to expire it or do idle

1976

*/

1976

*/

1977

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&

1977

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&

1978

cfqq != cfqd->active_queue)

1978

cfqq != cfqd->active_queue)

1979

cfq_del_cfqq_rr(cfqd, cfqq);

1979

cfq_del_cfqq_rr(cfqd, cfqq);

1980

}

1980

}

1981

1982

static int cfq_allow_merge(struct request_queue *q, struct request *rq,

1982

static int cfq_allow_merge(struct request_queue *q, struct request *rq,

1983

struct bio *bio)

1983

struct bio *bio)

1984

{

1984

{

1985

struct cfq_data *cfqd = q->elevator->elevator_data;

1985

struct cfq_data *cfqd = q->elevator->elevator_data;

1986

struct cfq_io_cq *cic;

1986

struct cfq_io_cq *cic;

1987

struct cfq_queue *cfqq;

1987

struct cfq_queue *cfqq;

1988

1989

/*

1989

/*

1990

* Disallow merge of a sync bio into an async request.

1990

* Disallow merge of a sync bio into an async request.

1991

*/

1991

*/

1992

if (cfq_bio_sync(bio) && !rq_is_sync(rq))

1992

if (cfq_bio_sync(bio) && !rq_is_sync(rq))

1993

return false;

1993

return false;

1994

1995

/*

1995

/*

1996

* Lookup the cfqq that this bio will be queued with and allow

1996

* Lookup the cfqq that this bio will be queued with and allow

1997

* merge only if rq is queued there.

1997

* merge only if rq is queued there.

1998

*/

1998

*/

1999

cic = cfq_cic_lookup(cfqd, current->io_context);

1999

cic = cfq_cic_lookup(cfqd, current->io_context);

2000

if (!cic)

2000

if (!cic)

2001

return false;

2001

return false;

2002

2003

cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));

2003

cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));

2004

return cfqq == RQ_CFQQ(rq);

2004

return cfqq == RQ_CFQQ(rq);

2005

}

2005

}

2006

2007

static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2007

static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2008

{

2008

{

2009

del_timer(&cfqd->idle_slice_timer);

2009

del_timer(&cfqd->idle_slice_timer);

2010

cfqg_stats_update_idle_time(cfqq->cfqg);

2010

cfqg_stats_update_idle_time(cfqq->cfqg);

2011

}

2011

}

2012

2013

static void __cfq_set_active_queue(struct cfq_data *cfqd,

2013

static void __cfq_set_active_queue(struct cfq_data *cfqd,

2014

struct cfq_queue *cfqq)

2014

struct cfq_queue *cfqq)

2015

{

2015

{

2016

if (cfqq) {

2016

if (cfqq) {

2017

cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",

2017

cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",

2018

cfqd->serving_prio, cfqd->serving_type);

2018

cfqd->serving_prio, cfqd->serving_type);

2019

cfqg_stats_update_avg_queue_size(cfqq->cfqg);

2019

cfqg_stats_update_avg_queue_size(cfqq->cfqg);

2020

cfqq->slice_start = 0;

2020

cfqq->slice_start = 0;

2021

cfqq->dispatch_start = jiffies;

2021

cfqq->dispatch_start = jiffies;

2022

cfqq->allocated_slice = 0;

2022

cfqq->allocated_slice = 0;

2023

cfqq->slice_end = 0;

2023

cfqq->slice_end = 0;

2024

cfqq->slice_dispatch = 0;

2024

cfqq->slice_dispatch = 0;

2025

cfqq->nr_sectors = 0;

2025

cfqq->nr_sectors = 0;

2026

2027

cfq_clear_cfqq_wait_request(cfqq);

2027

cfq_clear_cfqq_wait_request(cfqq);

2028

cfq_clear_cfqq_must_dispatch(cfqq);

2028

cfq_clear_cfqq_must_dispatch(cfqq);

2029

cfq_clear_cfqq_must_alloc_slice(cfqq);

2029

cfq_clear_cfqq_must_alloc_slice(cfqq);

2030

cfq_clear_cfqq_fifo_expire(cfqq);

2030

cfq_clear_cfqq_fifo_expire(cfqq);

2031

cfq_mark_cfqq_slice_new(cfqq);

2031

cfq_mark_cfqq_slice_new(cfqq);

2032

2033

cfq_del_timer(cfqd, cfqq);

2033

cfq_del_timer(cfqd, cfqq);

2034

}

2034

}

2035

2036

cfqd->active_queue = cfqq;

2036

cfqd->active_queue = cfqq;

2037

}

2037

}

2038

2039

/*

2039

/*

2040

* current cfqq expired its slice (or was too idle), select new one

2040

* current cfqq expired its slice (or was too idle), select new one

2041

*/

2041

*/

2042

static void

2042

static void

2043

__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,

2043

__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,

2044

bool timed_out)

2044

bool timed_out)

2045

{

2045

{

2046

cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

2046

cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

2047

2048

if (cfq_cfqq_wait_request(cfqq))

2048

if (cfq_cfqq_wait_request(cfqq))

2049

cfq_del_timer(cfqd, cfqq);

2049

cfq_del_timer(cfqd, cfqq);

2050

2051

cfq_clear_cfqq_wait_request(cfqq);

2051

cfq_clear_cfqq_wait_request(cfqq);

2052

cfq_clear_cfqq_wait_busy(cfqq);

2052

cfq_clear_cfqq_wait_busy(cfqq);

2053

2054

/*

2054

/*

2055

* If this cfqq is shared between multiple processes, check to

2055

* If this cfqq is shared between multiple processes, check to

2056

* make sure that those processes are still issuing I/Os within

2056

* make sure that those processes are still issuing I/Os within

2057

* the mean seek distance. If not, it may be time to break the

2057

* the mean seek distance. If not, it may be time to break the

2058

* queues apart again.

2058

* queues apart again.

2059

*/

2059

*/

2060

if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))

2060

if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))

2061

cfq_mark_cfqq_split_coop(cfqq);

2061

cfq_mark_cfqq_split_coop(cfqq);

2062

2063

/*

2063

/*

2064

* store what was left of this slice, if the queue idled/timed out

2064

* store what was left of this slice, if the queue idled/timed out

2065

*/

2065

*/

2066

if (timed_out) {

2066

if (timed_out) {

2067

if (cfq_cfqq_slice_new(cfqq))

2067

if (cfq_cfqq_slice_new(cfqq))

2068

cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);

2068

cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);

2069

else

2069

else

2070

cfqq->slice_resid = cfqq->slice_end - jiffies;

2070

cfqq->slice_resid = cfqq->slice_end - jiffies;

2071

cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);

2071

cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);

2072

}

2072

}

2073

2074

cfq_group_served(cfqd, cfqq->cfqg, cfqq);

2074

cfq_group_served(cfqd, cfqq->cfqg, cfqq);

2075

2076

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))

2076

if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))

2077

cfq_del_cfqq_rr(cfqd, cfqq);

2077

cfq_del_cfqq_rr(cfqd, cfqq);

2078

2079

cfq_resort_rr_list(cfqd, cfqq);

2079

cfq_resort_rr_list(cfqd, cfqq);

2080

2081

if (cfqq == cfqd->active_queue)

2081

if (cfqq == cfqd->active_queue)

2082

cfqd->active_queue = NULL;

2082

cfqd->active_queue = NULL;

2083

2084

if (cfqd->active_cic) {

2084

if (cfqd->active_cic) {

2085

put_io_context(cfqd->active_cic->icq.ioc);

2085

put_io_context(cfqd->active_cic->icq.ioc);

2086

cfqd->active_cic = NULL;

2086

cfqd->active_cic = NULL;

2087

}

2087

}

2088

}

2088

}

2089

2090

/*
 * Expire the slice of whatever queue is currently active on cfqd.
 * Does nothing when there is no active queue.
 */
static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
{
	struct cfq_queue *active = cfqd->active_queue;

	if (!active)
		return;

	__cfq_slice_expired(cfqd, active, timed_out);
}

2097

2098

/*

2098

/*

2099

* Get next queue for service. Unless we have a queue preemption,

2099

* Get next queue for service. Unless we have a queue preemption,

2100

* we'll simply select the first cfqq in the service tree.

2100

* we'll simply select the first cfqq in the service tree.

2101

*/

2101

*/

2102

static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)

2102

static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)

2103

{

2103

{

2104

struct cfq_rb_root *service_tree =

2104

struct cfq_rb_root *service_tree =

2105

service_tree_for(cfqd->serving_group, cfqd->serving_prio,

2105

service_tree_for(cfqd->serving_group, cfqd->serving_prio,

2106

cfqd->serving_type);

2106

cfqd->serving_type);

2107

2108

if (!cfqd->rq_queued)

2108

if (!cfqd->rq_queued)

2109

return NULL;

2109

return NULL;

2110

2111

/* There is nothing to dispatch */

2111

/* There is nothing to dispatch */

2112

if (!service_tree)

2112

if (!service_tree)

2113

return NULL;

2113

return NULL;

2114

if (RB_EMPTY_ROOT(&service_tree->rb))

2114

if (RB_EMPTY_ROOT(&service_tree->rb))

2115

return NULL;

2115

return NULL;

2116

return cfq_rb_first(service_tree);

2116

return cfq_rb_first(service_tree);

2117

}

2117

}

2118

2119

static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)

2119

static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)

2120

{

2120

{

2121

struct cfq_group *cfqg;

2121

struct cfq_group *cfqg;

2122

struct cfq_queue *cfqq;

2122

struct cfq_queue *cfqq;

2123

int i, j;

2123

int i, j;

2124

struct cfq_rb_root *st;

2124

struct cfq_rb_root *st;

2125

2126

if (!cfqd->rq_queued)

2126

if (!cfqd->rq_queued)

2127

return NULL;

2127

return NULL;

2128

2129

cfqg = cfq_get_next_cfqg(cfqd);

2129

cfqg = cfq_get_next_cfqg(cfqd);

2130

if (!cfqg)

2130

if (!cfqg)

2131

return NULL;

2131

return NULL;

2132

2133

for_each_cfqg_st(cfqg, i, j, st)

2133

for_each_cfqg_st(cfqg, i, j, st)

2134

if ((cfqq = cfq_rb_first(st)) != NULL)

2134

if ((cfqq = cfq_rb_first(st)) != NULL)

2135

return cfqq;

2135

return cfqq;

2136

return NULL;

2136

return NULL;

2137

}

2137

}

2138

2139

/*

2139

/*

2140

* Get and set a new active queue for service.

2140

* Get and set a new active queue for service.

2141

*/

2141

*/

2142

static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,

2142

static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,

2143

struct cfq_queue *cfqq)

2143

struct cfq_queue *cfqq)

2144

{

2144

{

2145

if (!cfqq)

2145

if (!cfqq)

2146

cfqq = cfq_get_next_queue(cfqd);

2146

cfqq = cfq_get_next_queue(cfqd);

2147

2148

__cfq_set_active_queue(cfqd, cfqq);

2148

__cfq_set_active_queue(cfqd, cfqq);

2149

return cfqq;

2149

return cfqq;

2150

}

2150

}

2151

2152

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,

2152

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,

2153

struct request *rq)

2153

struct request *rq)

2154

{

2154

{

2155

if (blk_rq_pos(rq) >= cfqd->last_position)

2155

if (blk_rq_pos(rq) >= cfqd->last_position)

2156

return blk_rq_pos(rq) - cfqd->last_position;

2156

return blk_rq_pos(rq) - cfqd->last_position;

2157

else

2157

else

2158

return cfqd->last_position - blk_rq_pos(rq);

2158

return cfqd->last_position - blk_rq_pos(rq);

2159

}

2159

}

2160

2161

static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,

2161

static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,

2162

struct request *rq)

2162

struct request *rq)

2163

{

2163

{

2164

return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;

2164

return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;

2165

}

2165

}

2166

2167

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,

2167

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,

2168

struct cfq_queue *cur_cfqq)

2168

struct cfq_queue *cur_cfqq)

2169

{

2169

{

2170

struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];

2170

struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];

2171

struct rb_node *parent, *node;

2171

struct rb_node *parent, *node;

2172

struct cfq_queue *__cfqq;

2172

struct cfq_queue *__cfqq;

2173

sector_t sector = cfqd->last_position;

2173

sector_t sector = cfqd->last_position;

2174

2175

if (RB_EMPTY_ROOT(root))

2175

if (RB_EMPTY_ROOT(root))

2176

return NULL;

2176

return NULL;

2177

2178

/*

2178

/*

2179

* First, if we find a request starting at the end of the last

2179

* First, if we find a request starting at the end of the last

2180

* request, choose it.

2180

* request, choose it.

2181

*/

2181

*/

2182

__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);

2182

__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);

2183

if (__cfqq)

2183

if (__cfqq)

2184

return __cfqq;

2184

return __cfqq;

2185

2186

/*

2186

/*

2187

* If the exact sector wasn't found, the parent of the NULL leaf

2187

* If the exact sector wasn't found, the parent of the NULL leaf

2188

* will contain the closest sector.

2188

* will contain the closest sector.

2189

*/

2189

*/

2190

__cfqq = rb_entry(parent, struct cfq_queue, p_node);

2190

__cfqq = rb_entry(parent, struct cfq_queue, p_node);

2191

if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))

2191

if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))

2192

return __cfqq;

2192

return __cfqq;

2193

2194

if (blk_rq_pos(__cfqq->next_rq) < sector)

2194

if (blk_rq_pos(__cfqq->next_rq) < sector)

2195

node = rb_next(&__cfqq->p_node);

2195

node = rb_next(&__cfqq->p_node);

2196

else

2196

else

2197

node = rb_prev(&__cfqq->p_node);

2197

node = rb_prev(&__cfqq->p_node);

2198

if (!node)

2198

if (!node)

2199

return NULL;

2199

return NULL;

2200

2201

__cfqq = rb_entry(node, struct cfq_queue, p_node);

2201

__cfqq = rb_entry(node, struct cfq_queue, p_node);

2202

if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))

2202

if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))

2203

return __cfqq;

2203

return __cfqq;

2204

2205

return NULL;

2205

return NULL;

2206

}

2206

}

2207

2208

/*

2208

/*

2209

* cfqd - obvious

2209

* cfqd - obvious

2210

* cur_cfqq - passed in so that we don't decide that the current queue is

2210

* cur_cfqq - passed in so that we don't decide that the current queue is

2211

* closely cooperating with itself.

2211

* closely cooperating with itself.

2212

*

2212

*

2213

* So, basically we're assuming that that cur_cfqq has dispatched at least

2213

* So, basically we're assuming that that cur_cfqq has dispatched at least

2214

* one request, and that cfqd->last_position reflects a position on the disk

2214

* one request, and that cfqd->last_position reflects a position on the disk

2215

* associated with the I/O issued by cur_cfqq. I'm not sure this is a valid

2215

* associated with the I/O issued by cur_cfqq. I'm not sure this is a valid

2216

* assumption.

2216

* assumption.

2217

*/

2217

*/

2218

static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,

2218

static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,

2219

struct cfq_queue *cur_cfqq)

2219

struct cfq_queue *cur_cfqq)

2220

{

2220

{

2221

struct cfq_queue *cfqq;

2221

struct cfq_queue *cfqq;

2222

2223

if (cfq_class_idle(cur_cfqq))

2223

if (cfq_class_idle(cur_cfqq))

2224

return NULL;

2224

return NULL;

2225

if (!cfq_cfqq_sync(cur_cfqq))

2225

if (!cfq_cfqq_sync(cur_cfqq))

2226

return NULL;

2226

return NULL;

2227

if (CFQQ_SEEKY(cur_cfqq))

2227

if (CFQQ_SEEKY(cur_cfqq))

2228

return NULL;

2228

return NULL;

2229

2230

/*

2230

/*

2231

* Don't search priority tree if it's the only queue in the group.

2231

* Don't search priority tree if it's the only queue in the group.

2232

*/

2232

*/

2233

if (cur_cfqq->cfqg->nr_cfqq == 1)

2233

if (cur_cfqq->cfqg->nr_cfqq == 1)

2234

return NULL;

2234

return NULL;

2235

2236

/*

2236

/*

2237

* We should notice if some of the queues are cooperating, eg

2237

* We should notice if some of the queues are cooperating, eg

2238

* working closely on the same area of the disk. In that case,

2238

* working closely on the same area of the disk. In that case,

2239

* we can group them together and don't waste time idling.

2239

* we can group them together and don't waste time idling.

2240

*/

2240

*/

2241

cfqq = cfqq_close(cfqd, cur_cfqq);

2241

cfqq = cfqq_close(cfqd, cur_cfqq);

2242

if (!cfqq)

2242

if (!cfqq)

2243

return NULL;

2243

return NULL;

2244

2245

/* If new queue belongs to different cfq_group, don't choose it */

2245

/* If new queue belongs to different cfq_group, don't choose it */

2246

if (cur_cfqq->cfqg != cfqq->cfqg)

2246

if (cur_cfqq->cfqg != cfqq->cfqg)

2247

return NULL;

2247

return NULL;

2248

2249

/*

2249

/*

2250

* It only makes sense to merge sync queues.

2250

* It only makes sense to merge sync queues.

2251

*/

2251

*/

2252

if (!cfq_cfqq_sync(cfqq))

2252

if (!cfq_cfqq_sync(cfqq))

2253

return NULL;

2253

return NULL;

2254

if (CFQQ_SEEKY(cfqq))

2254

if (CFQQ_SEEKY(cfqq))

2255

return NULL;

2255

return NULL;

2256

2257

/*

2257

/*

2258

* Do not merge queues of different priority classes

2258

* Do not merge queues of different priority classes

2259

*/

2259

*/

2260

if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))

2260

if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))

2261

return NULL;

2261

return NULL;

2262

2263

return cfqq;

2263

return cfqq;

2264

}

2264

}

2265

2266

/*

2266

/*

2267

* Determine whether we should enforce idle window for this queue.

2267

* Determine whether we should enforce idle window for this queue.

2268

*/

2268

*/

2269

2270

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2270

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2271

{

2271

{

2272

enum wl_prio_t prio = cfqq_prio(cfqq);

2272

enum wl_prio_t prio = cfqq_prio(cfqq);

2273

struct cfq_rb_root *service_tree = cfqq->service_tree;

2273

struct cfq_rb_root *service_tree = cfqq->service_tree;

2274

2275

BUG_ON(!service_tree);

2275

BUG_ON(!service_tree);

2276

BUG_ON(!service_tree->count);

2276

BUG_ON(!service_tree->count);

2277

2278

if (!cfqd->cfq_slice_idle)

2278

if (!cfqd->cfq_slice_idle)

2279

return false;

2279

return false;

2280

2281

/* We never do for idle class queues. */

2281

/* We never do for idle class queues. */

2282

if (prio == IDLE_WORKLOAD)

2282

if (prio == IDLE_WORKLOAD)

2283

return false;

2283

return false;

2284

2285

/* We do for queues that were marked with idle window flag. */

2285

/* We do for queues that were marked with idle window flag. */

2286

if (cfq_cfqq_idle_window(cfqq) &&

2286

if (cfq_cfqq_idle_window(cfqq) &&

2287

!(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))

2287

!(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))

2288

return true;

2288

return true;

2289

2290

/*

2290

/*

2291

* Otherwise, we do only if they are the last ones

2291

* Otherwise, we do only if they are the last ones

2292

* in their service tree.

2292

* in their service tree.

2293

*/

2293

*/

2294

if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&

2294

if (service_tree->count == 1 && cfq_cfqq_sync(cfqq) &&

2295

!cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))

2295

!cfq_io_thinktime_big(cfqd, &service_tree->ttime, false))

2296

return true;

2296

return true;

2297

cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",

2297

cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",

2298

service_tree->count);

2298

service_tree->count);

2299

return false;

2299

return false;

2300

}

2300

}

2301

2302

static void cfq_arm_slice_timer(struct cfq_data *cfqd)

2302

static void cfq_arm_slice_timer(struct cfq_data *cfqd)

2303

{

2303

{

2304

struct cfq_queue *cfqq = cfqd->active_queue;

2304

struct cfq_queue *cfqq = cfqd->active_queue;

2305

struct cfq_io_cq *cic;

2305

struct cfq_io_cq *cic;

2306

unsigned long sl, group_idle = 0;

2306

unsigned long sl, group_idle = 0;

2307

2308

/*

2308

/*

2309

* SSD device without seek penalty, disable idling. But only do so

2309

* SSD device without seek penalty, disable idling. But only do so

2310

* for devices that support queuing, otherwise we still have a problem

2310

* for devices that support queuing, otherwise we still have a problem

2311

* with sync vs async workloads.

2311

* with sync vs async workloads.

2312

*/

2312

*/

2313

if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)

2313

if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)

2314

return;

2314

return;

2315

2316

WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));

2316

WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));

2317

WARN_ON(cfq_cfqq_slice_new(cfqq));

2317

WARN_ON(cfq_cfqq_slice_new(cfqq));

2318

2319

/*

2319

/*

2320

* idle is disabled, either manually or by past process history

2320

* idle is disabled, either manually or by past process history

2321

*/

2321

*/

2322

if (!cfq_should_idle(cfqd, cfqq)) {

2322

if (!cfq_should_idle(cfqd, cfqq)) {

2323

/* no queue idling. Check for group idling */

2323

/* no queue idling. Check for group idling */

2324

if (cfqd->cfq_group_idle)

2324

if (cfqd->cfq_group_idle)

2325

group_idle = cfqd->cfq_group_idle;

2325

group_idle = cfqd->cfq_group_idle;

2326

else

2326

else

2327

return;

2327

return;

2328

}

2328

}

2329

2330

/*

2330

/*

2331

* still active requests from this queue, don't idle

2331

* still active requests from this queue, don't idle

2332

*/

2332

*/

2333

if (cfqq->dispatched)

2333

if (cfqq->dispatched)

2334

return;

2334

return;

2335

2336

/*

2336

/*

2337

* task has exited, don't wait

2337

* task has exited, don't wait

2338

*/

2338

*/

2339

cic = cfqd->active_cic;

2339

cic = cfqd->active_cic;

2340

if (!cic || !atomic_read(&cic->icq.ioc->active_ref))

2340

if (!cic || !atomic_read(&cic->icq.ioc->active_ref))

2341

return;

2341

return;

2342

2343

/*

2343

/*

2344

* If our average think time is larger than the remaining time

2344

* If our average think time is larger than the remaining time

2345

* slice, then don't idle. This avoids overrunning the allotted

2345

* slice, then don't idle. This avoids overrunning the allotted

2346

* time slice.

2346

* time slice.

2347

*/

2347

*/

2348

if (sample_valid(cic->ttime.ttime_samples) &&

2348

if (sample_valid(cic->ttime.ttime_samples) &&

2349

(cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {

2349

(cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {

2350

cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",

2350

cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",

2351

cic->ttime.ttime_mean);

2351

cic->ttime.ttime_mean);

2352

return;

2352

return;

2353

}

2353

}

2354

2355

/* There are other queues in the group, don't do group idle */

2355

/* There are other queues in the group, don't do group idle */

2356

if (group_idle && cfqq->cfqg->nr_cfqq > 1)

2356

if (group_idle && cfqq->cfqg->nr_cfqq > 1)

2357

return;

2357

return;

2358

2359

cfq_mark_cfqq_wait_request(cfqq);

2359

cfq_mark_cfqq_wait_request(cfqq);

2360

2361

if (group_idle)

2361

if (group_idle)

2362

sl = cfqd->cfq_group_idle;

2362

sl = cfqd->cfq_group_idle;

2363

else

2363

else

2364

sl = cfqd->cfq_slice_idle;

2364

sl = cfqd->cfq_slice_idle;

2365

2366

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);

2366

mod_timer(&cfqd->idle_slice_timer, jiffies + sl);

2367

cfqg_stats_set_start_idle_time(cfqq->cfqg);

2367

cfqg_stats_set_start_idle_time(cfqq->cfqg);

2368

cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,

2368

cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,

2369

group_idle ? 1 : 0);

2369

group_idle ? 1 : 0);

2370

}

2370

}

2371

2372

/*

2372

/*

2373

* Move request from internal lists to the request queue dispatch list.

2373

* Move request from internal lists to the request queue dispatch list.

2374

*/

2374

*/

2375

static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)

2375

static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)

2376

{

2376

{

2377

struct cfq_data *cfqd = q->elevator->elevator_data;

2377

struct cfq_data *cfqd = q->elevator->elevator_data;

2378

struct cfq_queue *cfqq = RQ_CFQQ(rq);

2378

struct cfq_queue *cfqq = RQ_CFQQ(rq);

2379

2380

cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

2380

cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

2381

2382

cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);

2382

cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);

2383

cfq_remove_request(rq);

2383

cfq_remove_request(rq);

2384

cfqq->dispatched++;

2384

cfqq->dispatched++;

2385

(RQ_CFQG(rq))->dispatched++;

2385

(RQ_CFQG(rq))->dispatched++;

2386

elv_dispatch_sort(q, rq);

2386

elv_dispatch_sort(q, rq);

2387

2388

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;

2388

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;

2389

cfqq->nr_sectors += blk_rq_sectors(rq);

2389

cfqq->nr_sectors += blk_rq_sectors(rq);

2390

cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);

2390

cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);

2391

}

2391

}

2392

2393

/*

2393

/*

2394

* return expired entry, or NULL to just start from scratch in rbtree

2394

* return expired entry, or NULL to just start from scratch in rbtree

2395

*/

2395

*/

2396

static struct request *cfq_check_fifo(struct cfq_queue *cfqq)

2396

static struct request *cfq_check_fifo(struct cfq_queue *cfqq)

2397

{

2397

{

2398

struct request *rq = NULL;

2398

struct request *rq = NULL;

2399

2400

if (cfq_cfqq_fifo_expire(cfqq))

2400

if (cfq_cfqq_fifo_expire(cfqq))

2401

return NULL;

2401

return NULL;

2402

2403

cfq_mark_cfqq_fifo_expire(cfqq);

2403

cfq_mark_cfqq_fifo_expire(cfqq);

2404

2405

if (list_empty(&cfqq->fifo))

2405

if (list_empty(&cfqq->fifo))

2406

return NULL;

2406

return NULL;

2407

2408

rq = rq_entry_fifo(cfqq->fifo.next);

2408

rq = rq_entry_fifo(cfqq->fifo.next);

2409

if (time_before(jiffies, rq_fifo_time(rq)))

2409

if (time_before(jiffies, rq_fifo_time(rq)))

2410

rq = NULL;

2410

rq = NULL;

2411

2412

cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);

2412

cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);

2413

return rq;

2413

return rq;

2414

}

2414

}

2415

2416

static inline int

2416

static inline int

2417

cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2417

cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2418

{

2418

{

2419

const int base_rq = cfqd->cfq_slice_async_rq;

2419

const int base_rq = cfqd->cfq_slice_async_rq;

2420

2421

WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

2421

WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

2422

2423

return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);

2423

return 2 * base_rq * (IOPRIO_BE_NR - cfqq->ioprio);

2424

}

2424

}

2425

2426

/*

2426

/*

2427

* Must be called with the queue_lock held.

2427

* Must be called with the queue_lock held.

2428

*/

2428

*/

2429

static int cfqq_process_refs(struct cfq_queue *cfqq)

2429

static int cfqq_process_refs(struct cfq_queue *cfqq)

2430

{

2430

{

2431

int process_refs, io_refs;

2431

int process_refs, io_refs;

2432

2433

io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];

2433

io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];

2434

process_refs = cfqq->ref - io_refs;

2434

process_refs = cfqq->ref - io_refs;

2435

BUG_ON(process_refs < 0);

2435

BUG_ON(process_refs < 0);

2436

return process_refs;

2436

return process_refs;

2437

}

2437

}

2438

2439

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)

2439

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)

2440

{

2440

{

2441

int process_refs, new_process_refs;

2441

int process_refs, new_process_refs;

2442

struct cfq_queue *__cfqq;

2442

struct cfq_queue *__cfqq;

2443

2444

/*

2444

/*

2445

* If there are no process references on the new_cfqq, then it is

2445

* If there are no process references on the new_cfqq, then it is

2446

* unsafe to follow the ->new_cfqq chain as other cfqq's in the

2446

* unsafe to follow the ->new_cfqq chain as other cfqq's in the

2447

* chain may have dropped their last reference (not just their

2447

* chain may have dropped their last reference (not just their

2448

* last process reference).

2448

* last process reference).

2449

*/

2449

*/

2450

if (!cfqq_process_refs(new_cfqq))

2450

if (!cfqq_process_refs(new_cfqq))

2451

return;

2451

return;

2452

2453

/* Avoid a circular list and skip interim queue merges */

2453

/* Avoid a circular list and skip interim queue merges */

2454

while ((__cfqq = new_cfqq->new_cfqq)) {

2454

while ((__cfqq = new_cfqq->new_cfqq)) {

2455

if (__cfqq == cfqq)

2455

if (__cfqq == cfqq)

2456

return;

2456

return;

2457

new_cfqq = __cfqq;

2457

new_cfqq = __cfqq;

2458

}

2458

}

2459

2460

process_refs = cfqq_process_refs(cfqq);

2460

process_refs = cfqq_process_refs(cfqq);

2461

new_process_refs = cfqq_process_refs(new_cfqq);

2461

new_process_refs = cfqq_process_refs(new_cfqq);

2462

/*

2462

/*

2463

* If the process for the cfqq has gone away, there is no

2463

* If the process for the cfqq has gone away, there is no

2464

* sense in merging the queues.

2464

* sense in merging the queues.

2465

*/

2465

*/

2466

if (process_refs == 0 || new_process_refs == 0)

2466

if (process_refs == 0 || new_process_refs == 0)

2467

return;

2467

return;

2468

2469

/*

2469

/*

2470

* Merge in the direction of the lesser amount of work.

2470

* Merge in the direction of the lesser amount of work.

2471

*/

2471

*/

2472

if (new_process_refs >= process_refs) {

2472

if (new_process_refs >= process_refs) {

2473

cfqq->new_cfqq = new_cfqq;

2473

cfqq->new_cfqq = new_cfqq;

2474

new_cfqq->ref += process_refs;

2474

new_cfqq->ref += process_refs;

2475

} else {

2475

} else {

2476

new_cfqq->new_cfqq = cfqq;

2476

new_cfqq->new_cfqq = cfqq;

2477

cfqq->ref += new_process_refs;

2477

cfqq->ref += new_process_refs;

2478

}

2478

}

2479

}

2479

}

2480

2481

static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,

2481

static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,

2482

struct cfq_group *cfqg, enum wl_prio_t prio)

2482

struct cfq_group *cfqg, enum wl_prio_t prio)

2483

{

2483

{

2484

struct cfq_queue *queue;

2484

struct cfq_queue *queue;

2485

int i;

2485

int i;

2486

bool key_valid = false;

2486

bool key_valid = false;

2487

unsigned long lowest_key = 0;

2487

unsigned long lowest_key = 0;

2488

enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

2488

enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

2489

2490

for (i = 0; i <= SYNC_WORKLOAD; ++i) {

2490

for (i = 0; i <= SYNC_WORKLOAD; ++i) {

2491

/* select the one with lowest rb_key */

2491

/* select the one with lowest rb_key */

2492

queue = cfq_rb_first(service_tree_for(cfqg, prio, i));

2492

queue = cfq_rb_first(service_tree_for(cfqg, prio, i));

2493

if (queue &&

2493

if (queue &&

2494

(!key_valid || time_before(queue->rb_key, lowest_key))) {

2494

(!key_valid || time_before(queue->rb_key, lowest_key))) {

2495

lowest_key = queue->rb_key;

2495

lowest_key = queue->rb_key;

2496

cur_best = i;

2496

cur_best = i;

2497

key_valid = true;

2497

key_valid = true;

2498

}

2498

}

2499

}

2499

}

2500

2501

return cur_best;

2501

return cur_best;

2502

}

2502

}

2503

2504

static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)

2504

static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)

2505

{

2505

{

2506

unsigned slice;

2506

unsigned slice;

2507

unsigned count;

2507

unsigned count;

2508

struct cfq_rb_root *st;

2508

struct cfq_rb_root *st;

2509

unsigned group_slice;

2509

unsigned group_slice;

2510

enum wl_prio_t original_prio = cfqd->serving_prio;

2510

enum wl_prio_t original_prio = cfqd->serving_prio;

2511

2512

/* Choose next priority. RT > BE > IDLE */

2512

/* Choose next priority. RT > BE > IDLE */

2513

if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))

2513

if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))

2514

cfqd->serving_prio = RT_WORKLOAD;

2514

cfqd->serving_prio = RT_WORKLOAD;

2515

else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))

2515

else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))

2516

cfqd->serving_prio = BE_WORKLOAD;

2516

cfqd->serving_prio = BE_WORKLOAD;

2517

else {

2517

else {

2518

cfqd->serving_prio = IDLE_WORKLOAD;

2518

cfqd->serving_prio = IDLE_WORKLOAD;

2519

cfqd->workload_expires = jiffies + 1;

2519

cfqd->workload_expires = jiffies + 1;

2520

return;

2520

return;

2521

}

2521

}

2522

2523

if (original_prio != cfqd->serving_prio)

2523

if (original_prio != cfqd->serving_prio)

2524

goto new_workload;

2524

goto new_workload;

2525

2526

/*

2526

/*

2527

* For RT and BE, we have to choose also the type

2527

* For RT and BE, we have to choose also the type

2528

* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload

2528

* (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload

2529

* expiration time

2529

* expiration time

2530

*/

2530

*/

2531

st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);

2531

st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);

2532

count = st->count;

2532

count = st->count;

2533

2534

/*

2534

/*

2535

* check workload expiration, and that we still have other queues ready

2535

* check workload expiration, and that we still have other queues ready

2536

*/

2536

*/

2537

if (count && !time_after(jiffies, cfqd->workload_expires))

2537

if (count && !time_after(jiffies, cfqd->workload_expires))

2538

return;

2538

return;

2539

2540

new_workload:

2540

new_workload:

2541

/* otherwise select new workload type */

2541

/* otherwise select new workload type */

2542

cfqd->serving_type =

2542

cfqd->serving_type =

2543

cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);

2543

cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);

2544

st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);

2544

st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);

2545

count = st->count;

2545

count = st->count;

2546

2547

/*

2547

/*

2548

* the workload slice is computed as a fraction of target latency

2548

* the workload slice is computed as a fraction of target latency

2549

* proportional to the number of queues in that workload, over

2549

* proportional to the number of queues in that workload, over

2550

* all the queues in the same priority class

2550

* all the queues in the same priority class

2551

*/

2551

*/

2552

group_slice = cfq_group_slice(cfqd, cfqg);

2552

group_slice = cfq_group_slice(cfqd, cfqg);

2553

2554

slice = group_slice * count /

2554

slice = group_slice * count /

2555

max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],

2555

max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],

2556

cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));

2556

cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));

2557

2558

if (cfqd->serving_type == ASYNC_WORKLOAD) {

2558

if (cfqd->serving_type == ASYNC_WORKLOAD) {

2559

unsigned int tmp;

2559

unsigned int tmp;

2560

2561

/*

2561

/*

2562

* Async queues are currently system wide. Just taking

2562

* Async queues are currently system wide. Just taking

2563

* proportion of queues with-in same group will lead to higher

2563

* proportion of queues with-in same group will lead to higher

2564

* async ratio system wide as generally root group is going

2564

* async ratio system wide as generally root group is going

2565

* to have higher weight. A more accurate thing would be to

2565

* to have higher weight. A more accurate thing would be to

2566

* calculate system wide asnc/sync ratio.

2566

* calculate system wide asnc/sync ratio.

2567

*/

2567

*/

2568

tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);

2568

tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);

2569

tmp = tmp/cfqd->busy_queues;

2569

tmp = tmp/cfqd->busy_queues;

2570

slice = min_t(unsigned, slice, tmp);

2570

slice = min_t(unsigned, slice, tmp);

2571

2572

/* async workload slice is scaled down according to

2572

/* async workload slice is scaled down according to

2573

* the sync/async slice ratio. */

2573

* the sync/async slice ratio. */

2574

slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];

2574

slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];

2575

} else

2575

} else

2576

/* sync workload slice is at least 2 * cfq_slice_idle */

2576

/* sync workload slice is at least 2 * cfq_slice_idle */

2577

slice = max(slice, 2 * cfqd->cfq_slice_idle);

2577

slice = max(slice, 2 * cfqd->cfq_slice_idle);

2578

2579

slice = max_t(unsigned, slice, CFQ_MIN_TT);

2579

slice = max_t(unsigned, slice, CFQ_MIN_TT);

2580

cfq_log(cfqd, "workload slice:%d", slice);

2580

cfq_log(cfqd, "workload slice:%d", slice);

2581

cfqd->workload_expires = jiffies + slice;

2581

cfqd->workload_expires = jiffies + slice;

2582

}

2582

}

2583

2584

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)

2584

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)

2585

{

2585

{

2586

struct cfq_rb_root *st = &cfqd->grp_service_tree;

2586

struct cfq_rb_root *st = &cfqd->grp_service_tree;

2587

struct cfq_group *cfqg;

2587

struct cfq_group *cfqg;

2588

2589

if (RB_EMPTY_ROOT(&st->rb))

2589

if (RB_EMPTY_ROOT(&st->rb))

2590

return NULL;

2590

return NULL;

2591

cfqg = cfq_rb_first_group(st);

2591

cfqg = cfq_rb_first_group(st);

2592

update_min_vdisktime(st);

2592

update_min_vdisktime(st);

2593

return cfqg;

2593

return cfqg;

2594

}

2594

}

2595

2596

static void cfq_choose_cfqg(struct cfq_data *cfqd)

2596

static void cfq_choose_cfqg(struct cfq_data *cfqd)

2597

{

2597

{

2598

struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);

2598

struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);

2599

2600

cfqd->serving_group = cfqg;

2600

cfqd->serving_group = cfqg;

2601

2602

/* Restore the workload type data */

2602

/* Restore the workload type data */

2603

if (cfqg->saved_workload_slice) {

2603

if (cfqg->saved_workload_slice) {

2604

cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;

2604

cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;

2605

cfqd->serving_type = cfqg->saved_workload;

2605

cfqd->serving_type = cfqg->saved_workload;

2606

cfqd->serving_prio = cfqg->saved_serving_prio;

2606

cfqd->serving_prio = cfqg->saved_serving_prio;

2607

} else

2607

} else

2608

cfqd->workload_expires = jiffies - 1;

2608

cfqd->workload_expires = jiffies - 1;

2609

2610

choose_service_tree(cfqd, cfqg);

2610

choose_service_tree(cfqd, cfqg);

2611

}

2611

}

2612

2613

/*

2613

/*

2614

* Select a queue for service. If we have a current active queue,

2614

* Select a queue for service. If we have a current active queue,

2615

* check whether to continue servicing it, or retrieve and set a new one.

2615

* check whether to continue servicing it, or retrieve and set a new one.

2616

*/

2616

*/

2617

static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)

2617

static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)

2618

{

2618

{

2619

struct cfq_queue *cfqq, *new_cfqq = NULL;

2619

struct cfq_queue *cfqq, *new_cfqq = NULL;

2620

2621

cfqq = cfqd->active_queue;

2621

cfqq = cfqd->active_queue;

2622

if (!cfqq)

2622

if (!cfqq)

2623

goto new_queue;

2623

goto new_queue;

2624

2625

if (!cfqd->rq_queued)

2625

if (!cfqd->rq_queued)

2626

return NULL;

2626

return NULL;

2627

2628

/*

2628

/*

2629

* We were waiting for group to get backlogged. Expire the queue

2629

* We were waiting for group to get backlogged. Expire the queue

2630

*/

2630

*/

2631

if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))

2631

if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))

2632

goto expire;

2632

goto expire;

2633

2634

/*

2634

/*

2635

* The active queue has run out of time, expire it and select new.

2635

* The active queue has run out of time, expire it and select new.

2636

*/

2636

*/

2637

if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {

2637

if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {

2638

/*

2638

/*

2639

* If slice had not expired at the completion of last request

2639

* If slice had not expired at the completion of last request

2640

* we might not have turned on wait_busy flag. Don't expire

2640

* we might not have turned on wait_busy flag. Don't expire

2641

* the queue yet. Allow the group to get backlogged.

2641

* the queue yet. Allow the group to get backlogged.

2642

*

2642

*

2643

* The very fact that we have used the slice, that means we

2643

* The very fact that we have used the slice, that means we

2644

* have been idling all along on this queue and it should be

2644

* have been idling all along on this queue and it should be

2645

* ok to wait for this request to complete.

2645

* ok to wait for this request to complete.

2646

*/

2646

*/

2647

if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)

2647

if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)

2648

&& cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {

2648

&& cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {

2649

cfqq = NULL;

2649

cfqq = NULL;

2650

goto keep_queue;

2650

goto keep_queue;

2651

} else

2651

} else

2652

goto check_group_idle;

2652

goto check_group_idle;

2653

}

2653

}

2654

2655

/*

2655

/*

2656

* The active queue has requests and isn't expired, allow it to

2656

* The active queue has requests and isn't expired, allow it to

2657

* dispatch.

2657

* dispatch.

2658

*/

2658

*/

2659

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

2659

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

2660

goto keep_queue;

2660

goto keep_queue;

2661

2662

/*

2662

/*

2663

* If another queue has a request waiting within our mean seek

2663

* If another queue has a request waiting within our mean seek

2664

* distance, let it run. The expire code will check for close

2664

* distance, let it run. The expire code will check for close

2665

* cooperators and put the close queue at the front of the service

2665

* cooperators and put the close queue at the front of the service

2666

* tree. If possible, merge the expiring queue with the new cfqq.

2666

* tree. If possible, merge the expiring queue with the new cfqq.

2667

*/

2667

*/

2668

new_cfqq = cfq_close_cooperator(cfqd, cfqq);

2668

new_cfqq = cfq_close_cooperator(cfqd, cfqq);

2669

if (new_cfqq) {

2669

if (new_cfqq) {

2670

if (!cfqq->new_cfqq)

2670

if (!cfqq->new_cfqq)

2671

cfq_setup_merge(cfqq, new_cfqq);

2671

cfq_setup_merge(cfqq, new_cfqq);

2672

goto expire;

2672

goto expire;

2673

}

2673

}

2674

2675

/*

2675

/*

2676

* No requests pending. If the active queue still has requests in

2676

* No requests pending. If the active queue still has requests in

2677

* flight or is idling for a new request, allow either of these

2677

* flight or is idling for a new request, allow either of these

2678

* conditions to happen (or time out) before selecting a new queue.

2678

* conditions to happen (or time out) before selecting a new queue.

2679

*/

2679

*/

2680

if (timer_pending(&cfqd->idle_slice_timer)) {

2680

if (timer_pending(&cfqd->idle_slice_timer)) {

2681

cfqq = NULL;

2681

cfqq = NULL;

2682

goto keep_queue;

2682

goto keep_queue;

2683

}

2683

}

2684

2685

/*

2685

/*

2686

* This is a deep seek queue, but the device is much faster than

2686

* This is a deep seek queue, but the device is much faster than

2687

* the queue can deliver, don't idle

2687

* the queue can deliver, don't idle

2688

**/

2688

**/

2689

if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&

2689

if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&

2690

(cfq_cfqq_slice_new(cfqq) ||

2690

(cfq_cfqq_slice_new(cfqq) ||

2691

(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {

2691

(cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {

2692

cfq_clear_cfqq_deep(cfqq);

2692

cfq_clear_cfqq_deep(cfqq);

2693

cfq_clear_cfqq_idle_window(cfqq);

2693

cfq_clear_cfqq_idle_window(cfqq);

2694

}

2694

}

2695

2696

if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {

2696

if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {

2697

cfqq = NULL;

2697

cfqq = NULL;

2698

goto keep_queue;

2698

goto keep_queue;

2699

}

2699

}

2700

2701

/*

2701

/*

2702

* If group idle is enabled and there are requests dispatched from

2702

* If group idle is enabled and there are requests dispatched from

2703

* this group, wait for requests to complete.

2703

* this group, wait for requests to complete.

2704

*/

2704

*/

2705

check_group_idle:

2705

check_group_idle:

2706

if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&

2706

if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&

2707

cfqq->cfqg->dispatched &&

2707

cfqq->cfqg->dispatched &&

2708

!cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {

2708

!cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {

2709

cfqq = NULL;

2709

cfqq = NULL;

2710

goto keep_queue;

2710

goto keep_queue;

2711

}

2711

}

2712

2713

expire:

2713

expire:

2714

cfq_slice_expired(cfqd, 0);

2714

cfq_slice_expired(cfqd, 0);

2715

new_queue:

2715

new_queue:

2716

/*

2716

/*

2717

* Current queue expired. Check if we have to switch to a new

2717

* Current queue expired. Check if we have to switch to a new

2718

* service tree

2718

* service tree

2719

*/

2719

*/

2720

if (!new_cfqq)

2720

if (!new_cfqq)

2721

cfq_choose_cfqg(cfqd);

2721

cfq_choose_cfqg(cfqd);

2722

2723

cfqq = cfq_set_active_queue(cfqd, new_cfqq);

2723

cfqq = cfq_set_active_queue(cfqd, new_cfqq);

2724

keep_queue:

2724

keep_queue:

2725

return cfqq;

2725

return cfqq;

2726

}

2726

}

2727

2728

static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)

2728

static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)

2729

{

2729

{

2730

int dispatched = 0;

2730

int dispatched = 0;

2731

2732

while (cfqq->next_rq) {

2732

while (cfqq->next_rq) {

2733

cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);

2733

cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);

2734

dispatched++;

2734

dispatched++;

2735

}

2735

}

2736

2737

BUG_ON(!list_empty(&cfqq->fifo));

2737

BUG_ON(!list_empty(&cfqq->fifo));

2738

2739

/* By default cfqq is not expired if it is empty. Do it explicitly */

2739

/* By default cfqq is not expired if it is empty. Do it explicitly */

2740

__cfq_slice_expired(cfqq->cfqd, cfqq, 0);

2740

__cfq_slice_expired(cfqq->cfqd, cfqq, 0);

2741

return dispatched;

2741

return dispatched;

2742

}

2742

}

2743

2744

/*

2744

/*

2745

* Drain our current requests. Used for barriers and when switching

2745

* Drain our current requests. Used for barriers and when switching

2746

* io schedulers on-the-fly.

2746

* io schedulers on-the-fly.

2747

*/

2747

*/

2748

static int cfq_forced_dispatch(struct cfq_data *cfqd)

2748

static int cfq_forced_dispatch(struct cfq_data *cfqd)

2749

{

2749

{

2750

struct cfq_queue *cfqq;

2750

struct cfq_queue *cfqq;

2751

int dispatched = 0;

2751

int dispatched = 0;

2752

2753

/* Expire the timeslice of the current active queue first */

2753

/* Expire the timeslice of the current active queue first */

2754

cfq_slice_expired(cfqd, 0);

2754

cfq_slice_expired(cfqd, 0);

2755

while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {

2755

while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {

2756

__cfq_set_active_queue(cfqd, cfqq);

2756

__cfq_set_active_queue(cfqd, cfqq);

2757

dispatched += __cfq_forced_dispatch_cfqq(cfqq);

2757

dispatched += __cfq_forced_dispatch_cfqq(cfqq);

2758

}

2758

}

2759

2760

BUG_ON(cfqd->busy_queues);

2760

BUG_ON(cfqd->busy_queues);

2761

2762

cfq_log(cfqd, "forced_dispatch=%d", dispatched);

2762

cfq_log(cfqd, "forced_dispatch=%d", dispatched);

2763

return dispatched;

2763

return dispatched;

2764

}

2764

}

2765

2766

static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,

2766

static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,

2767

struct cfq_queue *cfqq)

2767

struct cfq_queue *cfqq)

2768

{

2768

{

2769

/* the queue hasn't finished any request, can't estimate */

2769

/* the queue hasn't finished any request, can't estimate */

2770

if (cfq_cfqq_slice_new(cfqq))

2770

if (cfq_cfqq_slice_new(cfqq))

2771

return true;

2771

return true;

2772

if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,

2772

if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,

2773

cfqq->slice_end))

2773

cfqq->slice_end))

2774

return true;

2774

return true;

2775

2776

return false;

2776

return false;

2777

}

2777

}

2778

2779

static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2779

static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2780

{

2780

{

2781

unsigned int max_dispatch;

2781

unsigned int max_dispatch;

2782

2783

/*

2783

/*

2784

* Drain async requests before we start sync IO

2784

* Drain async requests before we start sync IO

2785

*/

2785

*/

2786

if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])

2786

if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])

2787

return false;

2787

return false;

2788

2789

/*

2789

/*

2790

* If this is an async queue and we have sync IO in flight, let it wait

2790

* If this is an async queue and we have sync IO in flight, let it wait

2791

*/

2791

*/

2792

if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))

2792

if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))

2793

return false;

2793

return false;

2794

2795

max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);

2795

max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);

2796

if (cfq_class_idle(cfqq))

2796

if (cfq_class_idle(cfqq))

2797

max_dispatch = 1;

2797

max_dispatch = 1;

2798

2799

/*

2799

/*

2800

* Does this cfqq already have too much IO in flight?

2800

* Does this cfqq already have too much IO in flight?

2801

*/

2801

*/

2802

if (cfqq->dispatched >= max_dispatch) {

2802

if (cfqq->dispatched >= max_dispatch) {

2803

bool promote_sync = false;

2803

bool promote_sync = false;

2804

/*

2804

/*

2805

* idle queue must always only have a single IO in flight

2805

* idle queue must always only have a single IO in flight

2806

*/

2806

*/

2807

if (cfq_class_idle(cfqq))

2807

if (cfq_class_idle(cfqq))

2808

return false;

2808

return false;

2809

2810

/*

2810

/*

2811

* If there is only one sync queue

2811

* If there is only one sync queue

2812

* we can ignore async queue here and give the sync

2812

* we can ignore async queue here and give the sync

2813

* queue no dispatch limit. The reason is a sync queue can

2813

* queue no dispatch limit. The reason is a sync queue can

2814

* preempt async queue, limiting the sync queue doesn't make

2814

* preempt async queue, limiting the sync queue doesn't make

2815

* sense. This is useful for aiostress test.

2815

* sense. This is useful for aiostress test.

2816

*/

2816

*/

2817

if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)

2817

if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)

2818

promote_sync = true;

2818

promote_sync = true;

2819

2820

/*

2820

/*

2821

* We have other queues, don't allow more IO from this one

2821

* We have other queues, don't allow more IO from this one

2822

*/

2822

*/

2823

if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&

2823

if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&

2824

!promote_sync)

2824

!promote_sync)

2825

return false;

2825

return false;

2826

2827

/*

2827

/*

2828

* Sole queue user, no limit

2828

* Sole queue user, no limit

2829

*/

2829

*/

2830

if (cfqd->busy_queues == 1 || promote_sync)

2830

if (cfqd->busy_queues == 1 || promote_sync)

2831

max_dispatch = -1;

2831

max_dispatch = -1;

2832

else

2832

else

2833

/*

2833

/*

2834

* Normally we start throttling cfqq when cfq_quantum/2

2834

* Normally we start throttling cfqq when cfq_quantum/2

2835

* requests have been dispatched. But we can drive

2835

* requests have been dispatched. But we can drive

2836

* deeper queue depths at the beginning of slice

2836

* deeper queue depths at the beginning of slice

2837

* subjected to upper limit of cfq_quantum.

2837

* subjected to upper limit of cfq_quantum.

2838

* */

2838

* */

2839

max_dispatch = cfqd->cfq_quantum;

2839

max_dispatch = cfqd->cfq_quantum;

2840

}

2840

}

2841

2842

/*

2842

/*

2843

* Async queues must wait a bit before being allowed dispatch.

2843

* Async queues must wait a bit before being allowed dispatch.

2844

* We also ramp up the dispatch depth gradually for async IO,

2844

* We also ramp up the dispatch depth gradually for async IO,

2845

* based on the last sync IO we serviced

2845

* based on the last sync IO we serviced

2846

*/

2846

*/

2847

if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {

2847

if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {

2848

unsigned long last_sync = jiffies - cfqd->last_delayed_sync;

2848

unsigned long last_sync = jiffies - cfqd->last_delayed_sync;

2849

unsigned int depth;

2849

unsigned int depth;

2850

2851

depth = last_sync / cfqd->cfq_slice[1];

2851

depth = last_sync / cfqd->cfq_slice[1];

2852

if (!depth && !cfqq->dispatched)

2852

if (!depth && !cfqq->dispatched)

2853

depth = 1;

2853

depth = 1;

2854

if (depth < max_dispatch)

2854

if (depth < max_dispatch)

2855

max_dispatch = depth;

2855

max_dispatch = depth;

2856

}

2856

}

2857

2858

/*

2858

/*

2859

* If we're below the current max, allow a dispatch

2859

* If we're below the current max, allow a dispatch

2860

*/

2860

*/

2861

return cfqq->dispatched < max_dispatch;

2861

return cfqq->dispatched < max_dispatch;

2862

}

2862

}

2863

2864

/*

2864

/*

2865

* Dispatch a request from cfqq, moving them to the request queue

2865

* Dispatch a request from cfqq, moving them to the request queue

2866

* dispatch list.

2866

* dispatch list.

2867

*/

2867

*/

2868

static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2868

static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2869

{

2869

{

2870

struct request *rq;

2870

struct request *rq;

2871

2872

BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

2872

BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

2873

2874

if (!cfq_may_dispatch(cfqd, cfqq))

2874

if (!cfq_may_dispatch(cfqd, cfqq))

2875

return false;

2875

return false;

2876

2877

/*

2877

/*

2878

* follow expired path, else get first next available

2878

* follow expired path, else get first next available

2879

*/

2879

*/

2880

rq = cfq_check_fifo(cfqq);

2880

rq = cfq_check_fifo(cfqq);

2881

if (!rq)

2881

if (!rq)

2882

rq = cfqq->next_rq;

2882

rq = cfqq->next_rq;

2883

2884

/*

2884

/*

2885

* insert request into driver dispatch list

2885

* insert request into driver dispatch list

2886

*/

2886

*/

2887

cfq_dispatch_insert(cfqd->queue, rq);

2887

cfq_dispatch_insert(cfqd->queue, rq);

2888

2889

if (!cfqd->active_cic) {

2889

if (!cfqd->active_cic) {

2890

struct cfq_io_cq *cic = RQ_CIC(rq);

2890

struct cfq_io_cq *cic = RQ_CIC(rq);

2891

2892

atomic_long_inc(&cic->icq.ioc->refcount);

2892

atomic_long_inc(&cic->icq.ioc->refcount);

2893

cfqd->active_cic = cic;

2893

cfqd->active_cic = cic;

2894

}

2894

}

2895

2896

return true;

2896

return true;

2897

}

2897

}

2898

2899

/*

2899

/*

2900

* Find the cfqq that we need to service and move a request from that to the

2900

* Find the cfqq that we need to service and move a request from that to the

2901

* dispatch list

2901

* dispatch list

2902

*/

2902

*/

2903

static int cfq_dispatch_requests(struct request_queue *q, int force)

2903

static int cfq_dispatch_requests(struct request_queue *q, int force)

2904

{

2904

{

2905

struct cfq_data *cfqd = q->elevator->elevator_data;

2905

struct cfq_data *cfqd = q->elevator->elevator_data;

2906

struct cfq_queue *cfqq;

2906

struct cfq_queue *cfqq;

2907

2908

if (!cfqd->busy_queues)

2908

if (!cfqd->busy_queues)

2909

return 0;

2909

return 0;

2910

2911

if (unlikely(force))

2911

if (unlikely(force))

2912

return cfq_forced_dispatch(cfqd);

2912

return cfq_forced_dispatch(cfqd);

2913

2914

cfqq = cfq_select_queue(cfqd);

2914

cfqq = cfq_select_queue(cfqd);

2915

if (!cfqq)

2915

if (!cfqq)

2916

return 0;

2916

return 0;

2917

2918

/*

2918

/*

2919

* Dispatch a request from this cfqq, if it is allowed

2919

* Dispatch a request from this cfqq, if it is allowed

2920

*/

2920

*/

2921

if (!cfq_dispatch_request(cfqd, cfqq))

2921

if (!cfq_dispatch_request(cfqd, cfqq))

2922

return 0;

2922

return 0;

2923

2924

cfqq->slice_dispatch++;

2924

cfqq->slice_dispatch++;

2925

cfq_clear_cfqq_must_dispatch(cfqq);

2925

cfq_clear_cfqq_must_dispatch(cfqq);

2926

2927

/*

2927

/*

2928

* expire an async queue immediately if it has used up its slice. idle

2928

* expire an async queue immediately if it has used up its slice. idle

2929

* queue always expire after 1 dispatch round.

2929

* queue always expire after 1 dispatch round.

2930

*/

2930

*/

2931

if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&

2931

if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&

2932

cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||

2932

cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||

2933

cfq_class_idle(cfqq))) {

2933

cfq_class_idle(cfqq))) {

2934

cfqq->slice_end = jiffies + 1;

2934

cfqq->slice_end = jiffies + 1;

2935

cfq_slice_expired(cfqd, 0);

2935

cfq_slice_expired(cfqd, 0);

2936

}

2936

}

2937

2938

cfq_log_cfqq(cfqd, cfqq, "dispatched a request");

2938

cfq_log_cfqq(cfqd, cfqq, "dispatched a request");

2939

return 1;

2939

return 1;

2940

}

2940

}

2941

2942

/*

2942

/*

2943

* task holds one reference to the queue, dropped when task exits. each rq

2943

* task holds one reference to the queue, dropped when task exits. each rq

2944

* in-flight on this queue also holds a reference, dropped when rq is freed.

2944

* in-flight on this queue also holds a reference, dropped when rq is freed.

2945

*

2945

*

2946

* Each cfq queue took a reference on the parent group. Drop it now.

2946

* Each cfq queue took a reference on the parent group. Drop it now.

2947

* queue lock must be held here.

2947

* queue lock must be held here.

2948

*/

2948

*/

2949

static void cfq_put_queue(struct cfq_queue *cfqq)

2949

static void cfq_put_queue(struct cfq_queue *cfqq)

2950

{

2950

{

2951

struct cfq_data *cfqd = cfqq->cfqd;

2951

struct cfq_data *cfqd = cfqq->cfqd;

2952

struct cfq_group *cfqg;

2952

struct cfq_group *cfqg;

2953

2954

BUG_ON(cfqq->ref <= 0);

2954

BUG_ON(cfqq->ref <= 0);

2955

2956

cfqq->ref--;

2956

cfqq->ref--;

2957

if (cfqq->ref)

2957

if (cfqq->ref)

2958

return;

2958

return;

2959

2960

cfq_log_cfqq(cfqd, cfqq, "put_queue");

2960

cfq_log_cfqq(cfqd, cfqq, "put_queue");

2961

BUG_ON(rb_first(&cfqq->sort_list));

2961

BUG_ON(rb_first(&cfqq->sort_list));

2962

BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);

2962

BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);

2963

cfqg = cfqq->cfqg;

2963

cfqg = cfqq->cfqg;

2964

2965

if (unlikely(cfqd->active_queue == cfqq)) {

2965

if (unlikely(cfqd->active_queue == cfqq)) {

2966

__cfq_slice_expired(cfqd, cfqq, 0);

2966

__cfq_slice_expired(cfqd, cfqq, 0);

2967

cfq_schedule_dispatch(cfqd);

2967

cfq_schedule_dispatch(cfqd);

2968

}

2968

}

2969

2970

BUG_ON(cfq_cfqq_on_rr(cfqq));

2970

BUG_ON(cfq_cfqq_on_rr(cfqq));

2971

kmem_cache_free(cfq_pool, cfqq);

2971

kmem_cache_free(cfq_pool, cfqq);

2972

cfqg_put(cfqg);

2972

cfqg_put(cfqg);

2973

}

2973

}

2974

2975

static void cfq_put_cooperator(struct cfq_queue *cfqq)

2975

static void cfq_put_cooperator(struct cfq_queue *cfqq)

2976

{

2976

{

2977

struct cfq_queue *__cfqq, *next;

2977

struct cfq_queue *__cfqq, *next;

2978

2979

/*

2979

/*

2980

* If this queue was scheduled to merge with another queue, be

2980

* If this queue was scheduled to merge with another queue, be

2981

* sure to drop the reference taken on that queue (and others in

2981

* sure to drop the reference taken on that queue (and others in

2982

* the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.

2982

* the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.

2983

*/

2983

*/

2984

__cfqq = cfqq->new_cfqq;

2984

__cfqq = cfqq->new_cfqq;

2985

while (__cfqq) {

2985

while (__cfqq) {

2986

if (__cfqq == cfqq) {

2986

if (__cfqq == cfqq) {

2987

WARN(1, "cfqq->new_cfqq loop detected\n");

2987

WARN(1, "cfqq->new_cfqq loop detected\n");

2988

break;

2988

break;

2989

}

2989

}

2990

next = __cfqq->new_cfqq;

2990

next = __cfqq->new_cfqq;

2991

cfq_put_queue(__cfqq);

2991

cfq_put_queue(__cfqq);

2992

__cfqq = next;

2992

__cfqq = next;

2993

}

2993

}

2994

}

2994

}

2995

2996

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2996

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)

2997

{

2997

{

2998

if (unlikely(cfqq == cfqd->active_queue)) {

2998

if (unlikely(cfqq == cfqd->active_queue)) {

2999

__cfq_slice_expired(cfqd, cfqq, 0);

2999

__cfq_slice_expired(cfqd, cfqq, 0);

3000

cfq_schedule_dispatch(cfqd);

3000

cfq_schedule_dispatch(cfqd);

3001

}

3001

}

3002

3003

cfq_put_cooperator(cfqq);

3003

cfq_put_cooperator(cfqq);

3004

3005

cfq_put_queue(cfqq);

3005

cfq_put_queue(cfqq);

3006

}

3006

}

3007

3008

static void cfq_init_icq(struct io_cq *icq)

3008

static void cfq_init_icq(struct io_cq *icq)

3009

{

3009

{

3010

struct cfq_io_cq *cic = icq_to_cic(icq);

3010

struct cfq_io_cq *cic = icq_to_cic(icq);

3011

3012

cic->ttime.last_end_request = jiffies;

3012

cic->ttime.last_end_request = jiffies;

3013

}

3013

}

3014

3015

static void cfq_exit_icq(struct io_cq *icq)

3015

static void cfq_exit_icq(struct io_cq *icq)

3016

{

3016

{

3017

struct cfq_io_cq *cic = icq_to_cic(icq);

3017

struct cfq_io_cq *cic = icq_to_cic(icq);

3018

struct cfq_data *cfqd = cic_to_cfqd(cic);

3018

struct cfq_data *cfqd = cic_to_cfqd(cic);

3019

3020

if (cic->cfqq[BLK_RW_ASYNC]) {

3020

if (cic->cfqq[BLK_RW_ASYNC]) {

3021

cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);

3021

cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);

3022

cic->cfqq[BLK_RW_ASYNC] = NULL;

3022

cic->cfqq[BLK_RW_ASYNC] = NULL;

3023

}

3023

}

3024

3025

if (cic->cfqq[BLK_RW_SYNC]) {

3025

if (cic->cfqq[BLK_RW_SYNC]) {

3026

cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);

3026

cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);

3027

cic->cfqq[BLK_RW_SYNC] = NULL;

3027

cic->cfqq[BLK_RW_SYNC] = NULL;

3028

}

3028

}

3029

}

3029

}

3030

3031

static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)

3031

static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)

3032

{

3032

{

3033

struct task_struct *tsk = current;

3033

struct task_struct *tsk = current;

3034

int ioprio_class;

3034

int ioprio_class;

3035

3036

if (!cfq_cfqq_prio_changed(cfqq))

3036

if (!cfq_cfqq_prio_changed(cfqq))

3037

return;

3037

return;

3038

3039

ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);

3039

ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);

3040

switch (ioprio_class) {

3040

switch (ioprio_class) {

3041

default:

3041

default:

3042

printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);

3042

printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);

3043

case IOPRIO_CLASS_NONE:

3043

case IOPRIO_CLASS_NONE:

3044

/*

3044

/*

3045

* no prio set, inherit CPU scheduling settings

3045

* no prio set, inherit CPU scheduling settings

3046

*/

3046

*/

3047

cfqq->ioprio = task_nice_ioprio(tsk);

3047

cfqq->ioprio = task_nice_ioprio(tsk);

3048

cfqq->ioprio_class = task_nice_ioclass(tsk);

3048

cfqq->ioprio_class = task_nice_ioclass(tsk);

3049

break;

3049

break;

3050

case IOPRIO_CLASS_RT:

3050

case IOPRIO_CLASS_RT:

3051

cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3051

cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3052

cfqq->ioprio_class = IOPRIO_CLASS_RT;

3052

cfqq->ioprio_class = IOPRIO_CLASS_RT;

3053

break;

3053

break;

3054

case IOPRIO_CLASS_BE:

3054

case IOPRIO_CLASS_BE:

3055

cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3055

cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3056

cfqq->ioprio_class = IOPRIO_CLASS_BE;

3056

cfqq->ioprio_class = IOPRIO_CLASS_BE;

3057

break;

3057

break;

3058

case IOPRIO_CLASS_IDLE:

3058

case IOPRIO_CLASS_IDLE:

3059

cfqq->ioprio_class = IOPRIO_CLASS_IDLE;

3059

cfqq->ioprio_class = IOPRIO_CLASS_IDLE;

3060

cfqq->ioprio = 7;

3060

cfqq->ioprio = 7;

3061

cfq_clear_cfqq_idle_window(cfqq);

3061

cfq_clear_cfqq_idle_window(cfqq);

3062

break;

3062

break;

3063

}

3063

}

3064

3065

/*

3065

/*

3066

* keep track of original prio settings in case we have to temporarily

3066

* keep track of original prio settings in case we have to temporarily

3067

* elevate the priority of this queue

3067

* elevate the priority of this queue

3068

*/

3068

*/

3069

cfqq->org_ioprio = cfqq->ioprio;

3069

cfqq->org_ioprio = cfqq->ioprio;

3070

cfq_clear_cfqq_prio_changed(cfqq);

3070

cfq_clear_cfqq_prio_changed(cfqq);

3071

}

3071

}

3072

3073

static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)

3073

static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)

3074

{

3074

{

3075

int ioprio = cic->icq.ioc->ioprio;

3075

int ioprio = cic->icq.ioc->ioprio;

3076

struct cfq_data *cfqd = cic_to_cfqd(cic);

3076

struct cfq_data *cfqd = cic_to_cfqd(cic);

3077

struct cfq_queue *cfqq;

3077

struct cfq_queue *cfqq;

3078

3079

/*

3079

/*

3080

* Check whether ioprio has changed. The condition may trigger

3080

* Check whether ioprio has changed. The condition may trigger

3081

* spuriously on a newly created cic but there's no harm.

3081

* spuriously on a newly created cic but there's no harm.

3082

*/

3082

*/

3083

if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))

3083

if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))

3084

return;

3084

return;

3085

3086

cfqq = cic->cfqq[BLK_RW_ASYNC];

3086

cfqq = cic->cfqq[BLK_RW_ASYNC];

3087

if (cfqq) {

3087

if (cfqq) {

3088

struct cfq_queue *new_cfqq;

3088

struct cfq_queue *new_cfqq;

3089

new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,

3089

new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,

3090

GFP_ATOMIC);

3090

GFP_ATOMIC);

3091

if (new_cfqq) {

3091

if (new_cfqq) {

3092

cic->cfqq[BLK_RW_ASYNC] = new_cfqq;

3092

cic->cfqq[BLK_RW_ASYNC] = new_cfqq;

3093

cfq_put_queue(cfqq);

3093

cfq_put_queue(cfqq);

3094

}

3094

}

3095

}

3095

}

3096

3097

cfqq = cic->cfqq[BLK_RW_SYNC];

3097

cfqq = cic->cfqq[BLK_RW_SYNC];

3098

if (cfqq)

3098

if (cfqq)

3099

cfq_mark_cfqq_prio_changed(cfqq);

3099

cfq_mark_cfqq_prio_changed(cfqq);

3100

3101

cic->ioprio = ioprio;

3101

cic->ioprio = ioprio;

3102

}

3102

}

3103

3104

static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3104

static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3105

pid_t pid, bool is_sync)

3105

pid_t pid, bool is_sync)

3106

{

3106

{

3107

RB_CLEAR_NODE(&cfqq->rb_node);

3107

RB_CLEAR_NODE(&cfqq->rb_node);

3108

RB_CLEAR_NODE(&cfqq->p_node);

3108

RB_CLEAR_NODE(&cfqq->p_node);

3109

INIT_LIST_HEAD(&cfqq->fifo);

3109

INIT_LIST_HEAD(&cfqq->fifo);

3110

3111

cfqq->ref = 0;

3111

cfqq->ref = 0;

3112

cfqq->cfqd = cfqd;

3112

cfqq->cfqd = cfqd;

3113

3114

cfq_mark_cfqq_prio_changed(cfqq);

3114

cfq_mark_cfqq_prio_changed(cfqq);

3115

3116

if (is_sync) {

3116

if (is_sync) {

3117

if (!cfq_class_idle(cfqq))

3117

if (!cfq_class_idle(cfqq))

3118

cfq_mark_cfqq_idle_window(cfqq);

3118

cfq_mark_cfqq_idle_window(cfqq);

3119

cfq_mark_cfqq_sync(cfqq);

3119

cfq_mark_cfqq_sync(cfqq);

3120

}

3120

}

3121

cfqq->pid = pid;

3121

cfqq->pid = pid;

3122

}

3122

}

3123

3124

#ifdef CONFIG_CFQ_GROUP_IOSCHED
/*
 * Pick up a change of the blkio cgroup that @bio belongs to.
 *
 * If the cgroup id differs from the one cached in cic->blkcg_id, the cic's
 * reference on its sync queue is dropped and the slot cleared, and the new
 * id is cached.
 */
static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	struct cfq_queue *old_sync;
	uint64_t cur_id;

	/* the blkcg is only dereferenced inside the RCU read section */
	rcu_read_lock();
	cur_id = bio_blkio_cgroup(bio)->id;
	rcu_read_unlock();

	/*
	 * A freshly created cic can make the comparison below fire
	 * spuriously; that is harmless.
	 */
	if (unlikely(!cfqd))
		return;
	if (likely(cic->blkcg_id == cur_id))
		return;

	old_sync = cic_to_cfqq(cic, 1);
	if (old_sync) {
		/*
		 * Let go of the sync queue.  A replacement in the new group
		 * gets assigned when the next request shows up.
		 */
		cfq_log_cfqq(cfqd, old_sync, "changed cgroup");
		cic_set_cfqq(cic, NULL, 1);
		cfq_put_queue(old_sync);
	}

	cic->blkcg_id = cur_id;
}
#else
/* Without group scheduling there is no cgroup membership to track. */
static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
#endif	/* CONFIG_CFQ_GROUP_IOSCHED */

3158

3159

static struct cfq_queue *

3159

static struct cfq_queue *

3160

cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,

3160

cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,

3161

struct bio *bio, gfp_t gfp_mask)

3161

struct bio *bio, gfp_t gfp_mask)

3162

{

3162

{

3163

struct blkio_cgroup *blkcg;

3163

struct blkio_cgroup *blkcg;

3164

struct cfq_queue *cfqq, *new_cfqq = NULL;

3164

struct cfq_queue *cfqq, *new_cfqq = NULL;

3165

struct cfq_group *cfqg;

3165

struct cfq_group *cfqg;

3166

3167

retry:

3167

retry:

3168

rcu_read_lock();

3168

rcu_read_lock();

3169

3170

blkcg = bio_blkio_cgroup(bio);

3170

blkcg = bio_blkio_cgroup(bio);

3171

cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);

3171

cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);

3172

cfqq = cic_to_cfqq(cic, is_sync);

3172

cfqq = cic_to_cfqq(cic, is_sync);

3173

3174

/*

3174

/*

3175

* Always try a new alloc if we fell back to the OOM cfqq

3175

* Always try a new alloc if we fell back to the OOM cfqq

3176

* originally, since it should just be a temporary situation.

3176

* originally, since it should just be a temporary situation.

3177

*/

3177

*/

3178

if (!cfqq || cfqq == &cfqd->oom_cfqq) {

3178

if (!cfqq || cfqq == &cfqd->oom_cfqq) {

3179

cfqq = NULL;

3179

cfqq = NULL;

3180

if (new_cfqq) {

3180

if (new_cfqq) {

3181

cfqq = new_cfqq;

3181

cfqq = new_cfqq;

3182

new_cfqq = NULL;

3182

new_cfqq = NULL;

3183

} else if (gfp_mask & __GFP_WAIT) {

3183

} else if (gfp_mask & __GFP_WAIT) {

3184

rcu_read_unlock();

3184

rcu_read_unlock();

3185

spin_unlock_irq(cfqd->queue->queue_lock);

3185

spin_unlock_irq(cfqd->queue->queue_lock);

3186

new_cfqq = kmem_cache_alloc_node(cfq_pool,

3186

new_cfqq = kmem_cache_alloc_node(cfq_pool,

3187

gfp_mask | __GFP_ZERO,

3187

gfp_mask | __GFP_ZERO,

3188

cfqd->queue->node);

3188

cfqd->queue->node);

3189

spin_lock_irq(cfqd->queue->queue_lock);

3189

spin_lock_irq(cfqd->queue->queue_lock);

3190

if (new_cfqq)

3190

if (new_cfqq)

3191

goto retry;

3191

goto retry;

3192

} else {

3192

} else {

3193

cfqq = kmem_cache_alloc_node(cfq_pool,

3193

cfqq = kmem_cache_alloc_node(cfq_pool,

3194

gfp_mask | __GFP_ZERO,

3194

gfp_mask | __GFP_ZERO,

3195

cfqd->queue->node);

3195

cfqd->queue->node);

3196

}

3196

}

3197

3198

if (cfqq) {

3198

if (cfqq) {

3199

cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);

3199

cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);

3200

cfq_init_prio_data(cfqq, cic);

3200

cfq_init_prio_data(cfqq, cic);

3201

cfq_link_cfqq_cfqg(cfqq, cfqg);

3201

cfq_link_cfqq_cfqg(cfqq, cfqg);

3202

cfq_log_cfqq(cfqd, cfqq, "alloced");

3202

cfq_log_cfqq(cfqd, cfqq, "alloced");

3203

} else

3203

} else

3204

cfqq = &cfqd->oom_cfqq;

3204

cfqq = &cfqd->oom_cfqq;

3205

}

3205

}

3206

3207

if (new_cfqq)

3207

if (new_cfqq)

3208

kmem_cache_free(cfq_pool, new_cfqq);

3208

kmem_cache_free(cfq_pool, new_cfqq);

3209

3210

rcu_read_unlock();

3210

rcu_read_unlock();

3211

return cfqq;

3211

return cfqq;

3212

}

3212

}

3213

3214

static struct cfq_queue **

3214

static struct cfq_queue **

3215

cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)

3215

cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)

3216

{

3216

{

3217

switch (ioprio_class) {

3217

switch (ioprio_class) {

3218

case IOPRIO_CLASS_RT:

3218

case IOPRIO_CLASS_RT:

3219

return &cfqd->async_cfqq[0][ioprio];

3219

return &cfqd->async_cfqq[0][ioprio];

3220

case IOPRIO_CLASS_NONE:

3220

case IOPRIO_CLASS_NONE:

3221

ioprio = IOPRIO_NORM;

3221

ioprio = IOPRIO_NORM;

3222

/* fall through */

3222

/* fall through */

3223

case IOPRIO_CLASS_BE:

3223

case IOPRIO_CLASS_BE:

3224

return &cfqd->async_cfqq[1][ioprio];

3224

return &cfqd->async_cfqq[1][ioprio];

3225

case IOPRIO_CLASS_IDLE:

3225

case IOPRIO_CLASS_IDLE:

3226

return &cfqd->async_idle_cfqq;

3226

return &cfqd->async_idle_cfqq;

3227

default:

3227

default:

3228

BUG();

3228

BUG();

3229

}

3229

}

3230

}

3230

}

3231

3232

static struct cfq_queue *

3232

static struct cfq_queue *

3233

cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,

3233

cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,

3234

struct bio *bio, gfp_t gfp_mask)

3234

struct bio *bio, gfp_t gfp_mask)

3235

{

3235

{

3236

const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);

3236

const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);

3237

const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3237

const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);

3238

struct cfq_queue **async_cfqq = NULL;

3238

struct cfq_queue **async_cfqq = NULL;

3239

struct cfq_queue *cfqq = NULL;

3239

struct cfq_queue *cfqq = NULL;

3240

3241

if (!is_sync) {

3241

if (!is_sync) {

3242

async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);

3242

async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);

3243

cfqq = *async_cfqq;

3243

cfqq = *async_cfqq;

3244

}

3244

}

3245

3246

if (!cfqq)

3246

if (!cfqq)

3247

cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);

3247

cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);

3248

3249

/*

3249

/*

3250

* pin the queue now that it's allocated, scheduler exit will prune it

3250

* pin the queue now that it's allocated, scheduler exit will prune it

3251

*/

3251

*/

3252

if (!is_sync && !(*async_cfqq)) {

3252

if (!is_sync && !(*async_cfqq)) {

3253

cfqq->ref++;

3253

cfqq->ref++;

3254

*async_cfqq = cfqq;

3254

*async_cfqq = cfqq;

3255

}

3255

}

3256

3257

cfqq->ref++;

3257

cfqq->ref++;

3258

return cfqq;

3258

return cfqq;

3259

}

3259

}

3260

3261

static void

3261

static void

3262

__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)

3262

__cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)

3263

{

3263

{

3264

unsigned long elapsed = jiffies - ttime->last_end_request;

3264

unsigned long elapsed = jiffies - ttime->last_end_request;

3265

elapsed = min(elapsed, 2UL * slice_idle);

3265

elapsed = min(elapsed, 2UL * slice_idle);

3266

3267

ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;

3267

ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;

3268

ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;

3268

ttime->ttime_total = (7*ttime->ttime_total + 256*elapsed) / 8;

3269

ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;

3269

ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;

3270

}

3270

}

3271

3272

static void

3272

static void

3273

cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3273

cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3274

struct cfq_io_cq *cic)

3274

struct cfq_io_cq *cic)

3275

{

3275

{

3276

if (cfq_cfqq_sync(cfqq)) {

3276

if (cfq_cfqq_sync(cfqq)) {

3277

__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);

3277

__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);

3278

__cfq_update_io_thinktime(&cfqq->service_tree->ttime,

3278

__cfq_update_io_thinktime(&cfqq->service_tree->ttime,

3279

cfqd->cfq_slice_idle);

3279

cfqd->cfq_slice_idle);

3280

}

3280

}

3281

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3281

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3282

__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);

3282

__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);

3283

#endif

3283

#endif

3284

}

3284

}

3285

3286

static void

3286

static void

3287

cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3287

cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3288

struct request *rq)

3288

struct request *rq)

3289

{

3289

{

3290

sector_t sdist = 0;

3290

sector_t sdist = 0;

3291

sector_t n_sec = blk_rq_sectors(rq);

3291

sector_t n_sec = blk_rq_sectors(rq);

3292

if (cfqq->last_request_pos) {

3292

if (cfqq->last_request_pos) {

3293

if (cfqq->last_request_pos < blk_rq_pos(rq))

3293

if (cfqq->last_request_pos < blk_rq_pos(rq))

3294

sdist = blk_rq_pos(rq) - cfqq->last_request_pos;

3294

sdist = blk_rq_pos(rq) - cfqq->last_request_pos;

3295

else

3295

else

3296

sdist = cfqq->last_request_pos - blk_rq_pos(rq);

3296

sdist = cfqq->last_request_pos - blk_rq_pos(rq);

3297

}

3297

}

3298

3299

cfqq->seek_history <<= 1;

3299

cfqq->seek_history <<= 1;

3300

if (blk_queue_nonrot(cfqd->queue))

3300

if (blk_queue_nonrot(cfqd->queue))

3301

cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);

3301

cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);

3302

else

3302

else

3303

cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);

3303

cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);

3304

}

3304

}

3305

3306

/*

3306

/*

3307

* Disable idle window if the process thinks too long or seeks so much that

3307

* Disable idle window if the process thinks too long or seeks so much that

3308

* it doesn't matter

3308

* it doesn't matter

3309

*/

3309

*/

3310

static void

3310

static void

3311

cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3311

cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3312

struct cfq_io_cq *cic)

3312

struct cfq_io_cq *cic)

3313

{

3313

{

3314

int old_idle, enable_idle;

3314

int old_idle, enable_idle;

3315

3316

/*

3316

/*

3317

* Don't idle for async or idle io prio class

3317

* Don't idle for async or idle io prio class

3318

*/

3318

*/

3319

if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))

3319

if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))

3320

return;

3320

return;

3321

3322

enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

3322

enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

3323

3324

if (cfqq->queued[0] + cfqq->queued[1] >= 4)

3324

if (cfqq->queued[0] + cfqq->queued[1] >= 4)

3325

cfq_mark_cfqq_deep(cfqq);

3325

cfq_mark_cfqq_deep(cfqq);

3326

3327

if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))

3327

if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))

3328

enable_idle = 0;

3328

enable_idle = 0;

3329

else if (!atomic_read(&cic->icq.ioc->active_ref) ||

3329

else if (!atomic_read(&cic->icq.ioc->active_ref) ||

3330

!cfqd->cfq_slice_idle ||

3330

!cfqd->cfq_slice_idle ||

3331

(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))

3331

(!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))

3332

enable_idle = 0;

3332

enable_idle = 0;

3333

else if (sample_valid(cic->ttime.ttime_samples)) {

3333

else if (sample_valid(cic->ttime.ttime_samples)) {

3334

if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)

3334

if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)

3335

enable_idle = 0;

3335

enable_idle = 0;

3336

else

3336

else

3337

enable_idle = 1;

3337

enable_idle = 1;

3338

}

3338

}

3339

3340

if (old_idle != enable_idle) {

3340

if (old_idle != enable_idle) {

3341

cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);

3341

cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);

3342

if (enable_idle)

3342

if (enable_idle)

3343

cfq_mark_cfqq_idle_window(cfqq);

3343

cfq_mark_cfqq_idle_window(cfqq);

3344

else

3344

else

3345

cfq_clear_cfqq_idle_window(cfqq);

3345

cfq_clear_cfqq_idle_window(cfqq);

3346

}

3346

}

3347

}

3347

}

3348

3349

/*

3349

/*

3350

* Check if new_cfqq should preempt the currently active queue. Return 0 for

3350

* Check if new_cfqq should preempt the currently active queue. Return 0 for

3351

* no or if we aren't sure, a 1 will cause a preempt.

3351

* no or if we aren't sure, a 1 will cause a preempt.

3352

*/

3352

*/

3353

static bool

3353

static bool

3354

cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,

3354

cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,

3355

struct request *rq)

3355

struct request *rq)

3356

{

3356

{

3357

struct cfq_queue *cfqq;

3357

struct cfq_queue *cfqq;

3358

3359

cfqq = cfqd->active_queue;

3359

cfqq = cfqd->active_queue;

3360

if (!cfqq)

3360

if (!cfqq)

3361

return false;

3361

return false;

3362

3363

if (cfq_class_idle(new_cfqq))

3363

if (cfq_class_idle(new_cfqq))

3364

return false;

3364

return false;

3365

3366

if (cfq_class_idle(cfqq))

3366

if (cfq_class_idle(cfqq))

3367

return true;

3367

return true;

3368

3369

/*

3369

/*

3370

* Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.

3370

* Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.

3371

*/

3371

*/

3372

if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))

3372

if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))

3373

return false;

3373

return false;

3374

3375

/*

3375

/*

3376

* if the new request is sync, but the currently running queue is

3376

* if the new request is sync, but the currently running queue is

3377

* not, let the sync request have priority.

3377

* not, let the sync request have priority.

3378

*/

3378

*/

3379

if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))

3379

if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))

3380

return true;

3380

return true;

3381

3382

if (new_cfqq->cfqg != cfqq->cfqg)

3382

if (new_cfqq->cfqg != cfqq->cfqg)

3383

return false;

3383

return false;

3384

3385

if (cfq_slice_used(cfqq))

3385

if (cfq_slice_used(cfqq))

3386

return true;

3386

return true;

3387

3388

/* Allow preemption only if we are idling on sync-noidle tree */

3388

/* Allow preemption only if we are idling on sync-noidle tree */

3389

if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&

3389

if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&

3390

cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&

3390

cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&

3391

new_cfqq->service_tree->count == 2 &&

3391

new_cfqq->service_tree->count == 2 &&

3392

RB_EMPTY_ROOT(&cfqq->sort_list))

3392

RB_EMPTY_ROOT(&cfqq->sort_list))

3393

return true;

3393

return true;

3394

3395

/*

3395

/*

3396

* So both queues are sync. Let the new request get disk time if

3396

* So both queues are sync. Let the new request get disk time if

3397

* it's a metadata request and the current queue is doing regular IO.

3397

* it's a metadata request and the current queue is doing regular IO.

3398

*/

3398

*/

3399

if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)

3399

if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)

3400

return true;

3400

return true;

3401

3402

/*

3402

/*

3403

* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.

3403

* Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.

3404

*/

3404

*/

3405

if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))

3405

if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))

3406

return true;

3406

return true;

3407

3408

/* An idle queue should not be idle now for some reason */

3408

/* An idle queue should not be idle now for some reason */

3409

if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))

3409

if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))

3410

return true;

3410

return true;

3411

3412

if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))

3412

if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))

3413

return false;

3413

return false;

3414

3415

/*

3415

/*

3416

* if this request is as-good as one we would expect from the

3416

* if this request is as-good as one we would expect from the

3417

* current cfqq, let it preempt

3417

* current cfqq, let it preempt

3418

*/

3418

*/

3419

if (cfq_rq_close(cfqd, cfqq, rq))

3419

if (cfq_rq_close(cfqd, cfqq, rq))

3420

return true;

3420

return true;

3421

3422

return false;

3422

return false;

3423

}

3423

}

3424

3425

/*

3425

/*

3426

* cfqq preempts the active queue. if we allowed preempt with no slice left,

3426

* cfqq preempts the active queue. if we allowed preempt with no slice left,

3427

* let it have half of its nominal slice.

3427

* let it have half of its nominal slice.

3428

*/

3428

*/

3429

static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)

3429

static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)

3430

{

3430

{

3431

enum wl_type_t old_type = cfqq_type(cfqd->active_queue);

3431

enum wl_type_t old_type = cfqq_type(cfqd->active_queue);

3432

3433

cfq_log_cfqq(cfqd, cfqq, "preempt");

3433

cfq_log_cfqq(cfqd, cfqq, "preempt");

3434

cfq_slice_expired(cfqd, 1);

3434

cfq_slice_expired(cfqd, 1);

3435

3436

/*

3436

/*

3437

* workload type is changed, don't save slice, otherwise preempt

3437

* workload type is changed, don't save slice, otherwise preempt

3438

* doesn't happen

3438

* doesn't happen

3439

*/

3439

*/

3440

if (old_type != cfqq_type(cfqq))

3440

if (old_type != cfqq_type(cfqq))

3441

cfqq->cfqg->saved_workload_slice = 0;

3441

cfqq->cfqg->saved_workload_slice = 0;

3442

3443

/*

3443

/*

3444

* Put the new queue at the front of the of the current list,

3444

* Put the new queue at the front of the of the current list,

3445

* so we know that it will be selected next.

3445

* so we know that it will be selected next.

3446

*/

3446

*/

3447

BUG_ON(!cfq_cfqq_on_rr(cfqq));

3447

BUG_ON(!cfq_cfqq_on_rr(cfqq));

3448

3449

cfq_service_tree_add(cfqd, cfqq, 1);

3449

cfq_service_tree_add(cfqd, cfqq, 1);

3450

3451

cfqq->slice_end = 0;

3451

cfqq->slice_end = 0;

3452

cfq_mark_cfqq_slice_new(cfqq);

3452

cfq_mark_cfqq_slice_new(cfqq);

3453

}

3453

}

3454

3455

/*

3455

/*

3456

* Called when a new fs request (rq) is added (to cfqq). Check if there's

3456

* Called when a new fs request (rq) is added (to cfqq). Check if there's

3457

* something we should do about it

3457

* something we should do about it

3458

*/

3458

*/

3459

static void

3459

static void

3460

cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3460

cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,

3461

struct request *rq)

3461

struct request *rq)

3462

{

3462

{

3463

struct cfq_io_cq *cic = RQ_CIC(rq);

3463

struct cfq_io_cq *cic = RQ_CIC(rq);

3464

3465

cfqd->rq_queued++;

3465

cfqd->rq_queued++;

3466

if (rq->cmd_flags & REQ_PRIO)

3466

if (rq->cmd_flags & REQ_PRIO)

3467

cfqq->prio_pending++;

3467

cfqq->prio_pending++;

3468

3469

cfq_update_io_thinktime(cfqd, cfqq, cic);

3469

cfq_update_io_thinktime(cfqd, cfqq, cic);

3470

cfq_update_io_seektime(cfqd, cfqq, rq);

3470

cfq_update_io_seektime(cfqd, cfqq, rq);

3471

cfq_update_idle_window(cfqd, cfqq, cic);

3471

cfq_update_idle_window(cfqd, cfqq, cic);

3472

3473

cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

3473

cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

3474

3475

if (cfqq == cfqd->active_queue) {

3475

if (cfqq == cfqd->active_queue) {

3476

/*

3476

/*

3477

* Remember that we saw a request from this process, but

3477

* Remember that we saw a request from this process, but

3478

* don't start queuing just yet. Otherwise we risk seeing lots

3478

* don't start queuing just yet. Otherwise we risk seeing lots

3479

* of tiny requests, because we disrupt the normal plugging

3479

* of tiny requests, because we disrupt the normal plugging

3480

* and merging. If the request is already larger than a single

3480

* and merging. If the request is already larger than a single

3481

* page, let it rip immediately. For that case we assume that

3481

* page, let it rip immediately. For that case we assume that

3482

* merging is already done. Ditto for a busy system that

3482

* merging is already done. Ditto for a busy system that

3483

* has other work pending, don't risk delaying until the

3483

* has other work pending, don't risk delaying until the

3484

* idle timer unplug to continue working.

3484

* idle timer unplug to continue working.

3485

*/

3485

*/

3486

if (cfq_cfqq_wait_request(cfqq)) {

3486

if (cfq_cfqq_wait_request(cfqq)) {

3487

if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||

3487

if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||

3488

cfqd->busy_queues > 1) {

3488

cfqd->busy_queues > 1) {

3489

cfq_del_timer(cfqd, cfqq);

3489

cfq_del_timer(cfqd, cfqq);

3490

cfq_clear_cfqq_wait_request(cfqq);

3490

cfq_clear_cfqq_wait_request(cfqq);

3491

__blk_run_queue(cfqd->queue);

3491

__blk_run_queue(cfqd->queue);

3492

} else {

3492

} else {

3493

cfqg_stats_update_idle_time(cfqq->cfqg);

3493

cfqg_stats_update_idle_time(cfqq->cfqg);

3494

cfq_mark_cfqq_must_dispatch(cfqq);

3494

cfq_mark_cfqq_must_dispatch(cfqq);

3495

}

3495

}

3496

}

3496

}

3497

} else if (cfq_should_preempt(cfqd, cfqq, rq)) {

3497

} else if (cfq_should_preempt(cfqd, cfqq, rq)) {

3498

/*

3498

/*

3499

* not the active queue - expire current slice if it is

3499

* not the active queue - expire current slice if it is

3500

* idle and has expired it's mean thinktime or this new queue

3500

* idle and has expired it's mean thinktime or this new queue

3501

* has some old slice time left and is of higher priority or

3501

* has some old slice time left and is of higher priority or

3502

* this new queue is RT and the current one is BE

3502

* this new queue is RT and the current one is BE

3503

*/

3503

*/

3504

cfq_preempt_queue(cfqd, cfqq);

3504

cfq_preempt_queue(cfqd, cfqq);

3505

__blk_run_queue(cfqd->queue);

3505

__blk_run_queue(cfqd->queue);

3506

}

3506

}

3507

}

3507

}

3508

3509

static void cfq_insert_request(struct request_queue *q, struct request *rq)

3509

static void cfq_insert_request(struct request_queue *q, struct request *rq)

3510

{

3510

{

3511

struct cfq_data *cfqd = q->elevator->elevator_data;

3511

struct cfq_data *cfqd = q->elevator->elevator_data;

3512

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3512

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3513

3514

cfq_log_cfqq(cfqd, cfqq, "insert_request");

3514

cfq_log_cfqq(cfqd, cfqq, "insert_request");

3515

cfq_init_prio_data(cfqq, RQ_CIC(rq));

3515

cfq_init_prio_data(cfqq, RQ_CIC(rq));

3516

3517

rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);

3517

rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);

3518

list_add_tail(&rq->queuelist, &cfqq->fifo);

3518

list_add_tail(&rq->queuelist, &cfqq->fifo);

3519

cfq_add_rq_rb(rq);

3519

cfq_add_rq_rb(rq);

3520

cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,

3520

cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,

3521

rq->cmd_flags);

3521

rq->cmd_flags);

3522

cfq_rq_enqueued(cfqd, cfqq, rq);

3522

cfq_rq_enqueued(cfqd, cfqq, rq);

3523

}

3523

}

3524

3525

/*

3525

/*

3526

* Update hw_tag based on peak queue depth over 50 samples under

3526

* Update hw_tag based on peak queue depth over 50 samples under

3527

* sufficient load.

3527

* sufficient load.

3528

*/

3528

*/

3529

static void cfq_update_hw_tag(struct cfq_data *cfqd)

3529

static void cfq_update_hw_tag(struct cfq_data *cfqd)

3530

{

3530

{

3531

struct cfq_queue *cfqq = cfqd->active_queue;

3531

struct cfq_queue *cfqq = cfqd->active_queue;

3532

3533

if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)

3533

if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)

3534

cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

3534

cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

3535

3536

if (cfqd->hw_tag == 1)

3536

if (cfqd->hw_tag == 1)

3537

return;

3537

return;

3538

3539

if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&

3539

if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&

3540

cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)

3540

cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)

3541

return;

3541

return;

3542

3543

/*

3543

/*

3544

* If active queue hasn't enough requests and can idle, cfq might not

3544

* If active queue hasn't enough requests and can idle, cfq might not

3545

* dispatch sufficient requests to hardware. Don't zero hw_tag in this

3545

* dispatch sufficient requests to hardware. Don't zero hw_tag in this

3546

* case

3546

* case

3547

*/

3547

*/

3548

if (cfqq && cfq_cfqq_idle_window(cfqq) &&

3548

if (cfqq && cfq_cfqq_idle_window(cfqq) &&

3549

cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <

3549

cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <

3550

CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)

3550

CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)

3551

return;

3551

return;

3552

3553

if (cfqd->hw_tag_samples++ < 50)

3553

if (cfqd->hw_tag_samples++ < 50)

3554

return;

3554

return;

3555

3556

if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)

3556

if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)

3557

cfqd->hw_tag = 1;

3557

cfqd->hw_tag = 1;

3558

else

3558

else

3559

cfqd->hw_tag = 0;

3559

cfqd->hw_tag = 0;

3560

}

3560

}

3561

3562

static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)

3562

static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)

3563

{

3563

{

3564

struct cfq_io_cq *cic = cfqd->active_cic;

3564

struct cfq_io_cq *cic = cfqd->active_cic;

3565

3566

/* If the queue already has requests, don't wait */

3566

/* If the queue already has requests, don't wait */

3567

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

3567

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

3568

return false;

3568

return false;

3569

3570

/* If there are other queues in the group, don't wait */

3570

/* If there are other queues in the group, don't wait */

3571

if (cfqq->cfqg->nr_cfqq > 1)

3571

if (cfqq->cfqg->nr_cfqq > 1)

3572

return false;

3572

return false;

3573

3574

/* the only queue in the group, but think time is big */

3574

/* the only queue in the group, but think time is big */

3575

if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))

3575

if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))

3576

return false;

3576

return false;

3577

3578

if (cfq_slice_used(cfqq))

3578

if (cfq_slice_used(cfqq))

3579

return true;

3579

return true;

3580

3581

/* if slice left is less than think time, wait busy */

3581

/* if slice left is less than think time, wait busy */

3582

if (cic && sample_valid(cic->ttime.ttime_samples)

3582

if (cic && sample_valid(cic->ttime.ttime_samples)

3583

&& (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))

3583

&& (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))

3584

return true;

3584

return true;

3585

3586

/*

3586

/*

3587

* If think times is less than a jiffy than ttime_mean=0 and above

3587

* If think times is less than a jiffy than ttime_mean=0 and above

3588

* will not be true. It might happen that slice has not expired yet

3588

* will not be true. It might happen that slice has not expired yet

3589

* but will expire soon (4-5 ns) during select_queue(). To cover the

3589

* but will expire soon (4-5 ns) during select_queue(). To cover the

3590

* case where think time is less than a jiffy, mark the queue wait

3590

* case where think time is less than a jiffy, mark the queue wait

3591

* busy if only 1 jiffy is left in the slice.

3591

* busy if only 1 jiffy is left in the slice.

3592

*/

3592

*/

3593

if (cfqq->slice_end - jiffies == 1)

3593

if (cfqq->slice_end - jiffies == 1)

3594

return true;

3594

return true;

3595

3596

return false;

3596

return false;

3597

}

3597

}

3598

3599

static void cfq_completed_request(struct request_queue *q, struct request *rq)

3599

static void cfq_completed_request(struct request_queue *q, struct request *rq)

3600

{

3600

{

3601

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3601

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3602

struct cfq_data *cfqd = cfqq->cfqd;

3602

struct cfq_data *cfqd = cfqq->cfqd;

3603

const int sync = rq_is_sync(rq);

3603

const int sync = rq_is_sync(rq);

3604

unsigned long now;

3604

unsigned long now;

3605

3606

now = jiffies;

3606

now = jiffies;

3607

cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",

3607

cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",

3608

!!(rq->cmd_flags & REQ_NOIDLE));

3608

!!(rq->cmd_flags & REQ_NOIDLE));

3609

3610

cfq_update_hw_tag(cfqd);

3610

cfq_update_hw_tag(cfqd);

3611

3612

WARN_ON(!cfqd->rq_in_driver);

3612

WARN_ON(!cfqd->rq_in_driver);

3613

WARN_ON(!cfqq->dispatched);

3613

WARN_ON(!cfqq->dispatched);

3614

cfqd->rq_in_driver--;

3614

cfqd->rq_in_driver--;

3615

cfqq->dispatched--;

3615

cfqq->dispatched--;

3616

(RQ_CFQG(rq))->dispatched--;

3616

(RQ_CFQG(rq))->dispatched--;

3617

cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),

3617

cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),

3618

rq_io_start_time_ns(rq), rq->cmd_flags);

3618

rq_io_start_time_ns(rq), rq->cmd_flags);

3619

3620

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;

3620

cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;

3621

3622

if (sync) {

3622

if (sync) {

3623

struct cfq_rb_root *service_tree;

3623

struct cfq_rb_root *service_tree;

3624

3625

RQ_CIC(rq)->ttime.last_end_request = now;

3625

RQ_CIC(rq)->ttime.last_end_request = now;

3626

3627

if (cfq_cfqq_on_rr(cfqq))

3627

if (cfq_cfqq_on_rr(cfqq))

3628

service_tree = cfqq->service_tree;

3628

service_tree = cfqq->service_tree;

3629

else

3629

else

3630

service_tree = service_tree_for(cfqq->cfqg,

3630

service_tree = service_tree_for(cfqq->cfqg,

3631

cfqq_prio(cfqq), cfqq_type(cfqq));

3631

cfqq_prio(cfqq), cfqq_type(cfqq));

3632

service_tree->ttime.last_end_request = now;

3632

service_tree->ttime.last_end_request = now;

3633

if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))

3633

if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))

3634

cfqd->last_delayed_sync = now;

3634

cfqd->last_delayed_sync = now;

3635

}

3635

}

3636

3637

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3637

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3638

cfqq->cfqg->ttime.last_end_request = now;

3638

cfqq->cfqg->ttime.last_end_request = now;

3639

#endif

3639

#endif

3640

3641

/*

3641

/*

3642

* If this is the active queue, check if it needs to be expired,

3642

* If this is the active queue, check if it needs to be expired,

3643

* or if we want to idle in case it has no pending requests.

3643

* or if we want to idle in case it has no pending requests.

3644

*/

3644

*/

3645

if (cfqd->active_queue == cfqq) {

3645

if (cfqd->active_queue == cfqq) {

3646

const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

3646

const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);

3647

3648

if (cfq_cfqq_slice_new(cfqq)) {

3648

if (cfq_cfqq_slice_new(cfqq)) {

3649

cfq_set_prio_slice(cfqd, cfqq);

3649

cfq_set_prio_slice(cfqd, cfqq);

3650

cfq_clear_cfqq_slice_new(cfqq);

3650

cfq_clear_cfqq_slice_new(cfqq);

3651

}

3651

}

3652

3653

/*

3653

/*

3654

* Should we wait for next request to come in before we expire

3654

* Should we wait for next request to come in before we expire

3655

* the queue.

3655

* the queue.

3656

*/

3656

*/

3657

if (cfq_should_wait_busy(cfqd, cfqq)) {

3657

if (cfq_should_wait_busy(cfqd, cfqq)) {

3658

unsigned long extend_sl = cfqd->cfq_slice_idle;

3658

unsigned long extend_sl = cfqd->cfq_slice_idle;

3659

if (!cfqd->cfq_slice_idle)

3659

if (!cfqd->cfq_slice_idle)

3660

extend_sl = cfqd->cfq_group_idle;

3660

extend_sl = cfqd->cfq_group_idle;

3661

cfqq->slice_end = jiffies + extend_sl;

3661

cfqq->slice_end = jiffies + extend_sl;

3662

cfq_mark_cfqq_wait_busy(cfqq);

3662

cfq_mark_cfqq_wait_busy(cfqq);

3663

cfq_log_cfqq(cfqd, cfqq, "will busy wait");

3663

cfq_log_cfqq(cfqd, cfqq, "will busy wait");

3664

}

3664

}

3665

3666

/*

3666

/*

3667

* Idling is not enabled on:

3667

* Idling is not enabled on:

3668

* - expired queues

3668

* - expired queues

3669

* - idle-priority queues

3669

* - idle-priority queues

3670

* - async queues

3670

* - async queues

3671

* - queues with still some requests queued

3671

* - queues with still some requests queued

3672

* - when there is a close cooperator

3672

* - when there is a close cooperator

3673

*/

3673

*/

3674

if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))

3674

if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))

3675

cfq_slice_expired(cfqd, 1);

3675

cfq_slice_expired(cfqd, 1);

3676

else if (sync && cfqq_empty &&

3676

else if (sync && cfqq_empty &&

3677

!cfq_close_cooperator(cfqd, cfqq)) {

3677

!cfq_close_cooperator(cfqd, cfqq)) {

3678

cfq_arm_slice_timer(cfqd);

3678

cfq_arm_slice_timer(cfqd);

3679

}

3679

}

3680

}

3680

}

3681

3682

if (!cfqd->rq_in_driver)

3682

if (!cfqd->rq_in_driver)

3683

cfq_schedule_dispatch(cfqd);

3683

cfq_schedule_dispatch(cfqd);

3684

}

3684

}

3685

3686

static inline int __cfq_may_queue(struct cfq_queue *cfqq)

3686

static inline int __cfq_may_queue(struct cfq_queue *cfqq)

3687

{

3687

{

3688

if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {

3688

if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {

3689

cfq_mark_cfqq_must_alloc_slice(cfqq);

3689

cfq_mark_cfqq_must_alloc_slice(cfqq);

3690

return ELV_MQUEUE_MUST;

3690

return ELV_MQUEUE_MUST;

3691

}

3691

}

3692

3693

return ELV_MQUEUE_MAY;

3693

return ELV_MQUEUE_MAY;

3694

}

3694

}

3695

3696

static int cfq_may_queue(struct request_queue *q, int rw)

3696

static int cfq_may_queue(struct request_queue *q, int rw)

3697

{

3697

{

3698

struct cfq_data *cfqd = q->elevator->elevator_data;

3698

struct cfq_data *cfqd = q->elevator->elevator_data;

3699

struct task_struct *tsk = current;

3699

struct task_struct *tsk = current;

3700

struct cfq_io_cq *cic;

3700

struct cfq_io_cq *cic;

3701

struct cfq_queue *cfqq;

3701

struct cfq_queue *cfqq;

3702

3703

/*

3703

/*

3704

* don't force setup of a queue from here, as a call to may_queue

3704

* don't force setup of a queue from here, as a call to may_queue

3705

* does not necessarily imply that a request actually will be queued.

3705

* does not necessarily imply that a request actually will be queued.

3706

* so just lookup a possibly existing queue, or return 'may queue'

3706

* so just lookup a possibly existing queue, or return 'may queue'

3707

* if that fails

3707

* if that fails

3708

*/

3708

*/

3709

cic = cfq_cic_lookup(cfqd, tsk->io_context);

3709

cic = cfq_cic_lookup(cfqd, tsk->io_context);

3710

if (!cic)

3710

if (!cic)

3711

return ELV_MQUEUE_MAY;

3711

return ELV_MQUEUE_MAY;

3712

3713

cfqq = cic_to_cfqq(cic, rw_is_sync(rw));

3713

cfqq = cic_to_cfqq(cic, rw_is_sync(rw));

3714

if (cfqq) {

3714

if (cfqq) {

3715

cfq_init_prio_data(cfqq, cic);

3715

cfq_init_prio_data(cfqq, cic);

3716

3717

return __cfq_may_queue(cfqq);

3717

return __cfq_may_queue(cfqq);

3718

}

3718

}

3719

3720

return ELV_MQUEUE_MAY;

3720

return ELV_MQUEUE_MAY;

3721

}

3721

}

3722

3723

/*

3723

/*

3724

* queue lock held here

3724

* queue lock held here

3725

*/

3725

*/

3726

static void cfq_put_request(struct request *rq)

3726

static void cfq_put_request(struct request *rq)

3727

{

3727

{

3728

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3728

struct cfq_queue *cfqq = RQ_CFQQ(rq);

3729

3730

if (cfqq) {

3730

if (cfqq) {

3731

const int rw = rq_data_dir(rq);

3731

const int rw = rq_data_dir(rq);

3732

3733

BUG_ON(!cfqq->allocated[rw]);

3733

BUG_ON(!cfqq->allocated[rw]);

3734

cfqq->allocated[rw]--;

3734

cfqq->allocated[rw]--;

3735

3736

/* Put down rq reference on cfqg */

3736

/* Put down rq reference on cfqg */

3737

cfqg_put(RQ_CFQG(rq));

3737

cfqg_put(RQ_CFQG(rq));

3738

rq->elv.priv[0] = NULL;

3738

rq->elv.priv[0] = NULL;

3739

rq->elv.priv[1] = NULL;

3739

rq->elv.priv[1] = NULL;

3740

3741

cfq_put_queue(cfqq);

3741

cfq_put_queue(cfqq);

3742

}

3742

}

3743

}

3743

}

3744

3745

static struct cfq_queue *

3745

static struct cfq_queue *

3746

cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,

3746

cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,

3747

struct cfq_queue *cfqq)

3747

struct cfq_queue *cfqq)

3748

{

3748

{

3749

cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);

3749

cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);

3750

cic_set_cfqq(cic, cfqq->new_cfqq, 1);

3750

cic_set_cfqq(cic, cfqq->new_cfqq, 1);

3751

cfq_mark_cfqq_coop(cfqq->new_cfqq);

3751

cfq_mark_cfqq_coop(cfqq->new_cfqq);

3752

cfq_put_queue(cfqq);

3752

cfq_put_queue(cfqq);

3753

return cic_to_cfqq(cic, 1);

3753

return cic_to_cfqq(cic, 1);

3754

}

3754

}

3755

3756

/*

3756

/*

3757

* Returns NULL if a new cfqq should be allocated, or the old cfqq if this

3757

* Returns NULL if a new cfqq should be allocated, or the old cfqq if this

3758

* was the last process referring to said cfqq.

3758

* was the last process referring to said cfqq.

3759

*/

3759

*/

3760

static struct cfq_queue *

3760

static struct cfq_queue *

3761

split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)

3761

split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)

3762

{

3762

{

3763

if (cfqq_process_refs(cfqq) == 1) {

3763

if (cfqq_process_refs(cfqq) == 1) {

3764

cfqq->pid = current->pid;

3764

cfqq->pid = current->pid;

3765

cfq_clear_cfqq_coop(cfqq);

3765

cfq_clear_cfqq_coop(cfqq);

3766

cfq_clear_cfqq_split_coop(cfqq);

3766

cfq_clear_cfqq_split_coop(cfqq);

3767

return cfqq;

3767

return cfqq;

3768

}

3768

}

3769

3770

cic_set_cfqq(cic, NULL, 1);

3770

cic_set_cfqq(cic, NULL, 1);

3771

3772

cfq_put_cooperator(cfqq);

3772

cfq_put_cooperator(cfqq);

3773

3774

cfq_put_queue(cfqq);

3774

cfq_put_queue(cfqq);

3775

return NULL;

3775

return NULL;

3776

}

3776

}

3777

/*

3777

/*

3778

* Allocate cfq data structures associated with this request.

3778

* Allocate cfq data structures associated with this request.

3779

*/

3779

*/

3780

static int

3780

static int

3781

cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,

3781

cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,

3782

gfp_t gfp_mask)

3782

gfp_t gfp_mask)

3783

{

3783

{

3784

struct cfq_data *cfqd = q->elevator->elevator_data;

3784

struct cfq_data *cfqd = q->elevator->elevator_data;

3785

struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);

3785

struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);

3786

const int rw = rq_data_dir(rq);

3786

const int rw = rq_data_dir(rq);

3787

const bool is_sync = rq_is_sync(rq);

3787

const bool is_sync = rq_is_sync(rq);

3788

struct cfq_queue *cfqq;

3788

struct cfq_queue *cfqq;

3789

3790

might_sleep_if(gfp_mask & __GFP_WAIT);

3790

might_sleep_if(gfp_mask & __GFP_WAIT);

3791

3792

spin_lock_irq(q->queue_lock);

3792

spin_lock_irq(q->queue_lock);

3793

3794

check_ioprio_changed(cic, bio);

3794

check_ioprio_changed(cic, bio);

3795

check_blkcg_changed(cic, bio);

3795

check_blkcg_changed(cic, bio);

3796

new_queue:

3796

new_queue:

3797

cfqq = cic_to_cfqq(cic, is_sync);

3797

cfqq = cic_to_cfqq(cic, is_sync);

3798

if (!cfqq || cfqq == &cfqd->oom_cfqq) {

3798

if (!cfqq || cfqq == &cfqd->oom_cfqq) {

3799

cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);

3799

cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);

3800

cic_set_cfqq(cic, cfqq, is_sync);

3800

cic_set_cfqq(cic, cfqq, is_sync);

3801

} else {

3801

} else {

3802

/*

3802

/*

3803

* If the queue was seeky for too long, break it apart.

3803

* If the queue was seeky for too long, break it apart.

3804

*/

3804

*/

3805

if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {

3805

if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {

3806

cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");

3806

cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");

3807

cfqq = split_cfqq(cic, cfqq);

3807

cfqq = split_cfqq(cic, cfqq);

3808

if (!cfqq)

3808

if (!cfqq)

3809

goto new_queue;

3809

goto new_queue;

3810

}

3810

}

3811

3812

/*

3812

/*

3813

* Check to see if this queue is scheduled to merge with

3813

* Check to see if this queue is scheduled to merge with

3814

* another, closely cooperating queue. The merging of

3814

* another, closely cooperating queue. The merging of

3815

* queues happens here as it must be done in process context.

3815

* queues happens here as it must be done in process context.

3816

* The reference on new_cfqq was taken in merge_cfqqs.

3816

* The reference on new_cfqq was taken in merge_cfqqs.

3817

*/

3817

*/

3818

if (cfqq->new_cfqq)

3818

if (cfqq->new_cfqq)

3819

cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);

3819

cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);

3820

}

3820

}

3821

3822

cfqq->allocated[rw]++;

3822

cfqq->allocated[rw]++;

3823

3824

cfqq->ref++;

3824

cfqq->ref++;

3825

cfqg_get(cfqq->cfqg);

3825

cfqg_get(cfqq->cfqg);

3826

rq->elv.priv[0] = cfqq;

3826

rq->elv.priv[0] = cfqq;

3827

rq->elv.priv[1] = cfqq->cfqg;

3827

rq->elv.priv[1] = cfqq->cfqg;

3828

spin_unlock_irq(q->queue_lock);

3828

spin_unlock_irq(q->queue_lock);

3829

return 0;

3829

return 0;

3830

}

3830

}

3831

3832

static void cfq_kick_queue(struct work_struct *work)

3832

static void cfq_kick_queue(struct work_struct *work)

3833

{

3833

{

3834

struct cfq_data *cfqd =

3834

struct cfq_data *cfqd =

3835

container_of(work, struct cfq_data, unplug_work);

3835

container_of(work, struct cfq_data, unplug_work);

3836

struct request_queue *q = cfqd->queue;

3836

struct request_queue *q = cfqd->queue;

3837

3838

spin_lock_irq(q->queue_lock);

3838

spin_lock_irq(q->queue_lock);

3839

__blk_run_queue(cfqd->queue);

3839

__blk_run_queue(cfqd->queue);

3840

spin_unlock_irq(q->queue_lock);

3840

spin_unlock_irq(q->queue_lock);

3841

}

3841

}

3842

3843

/*

3843

/*

3844

* Timer running if the active_queue is currently idling inside its time slice

3844

* Timer running if the active_queue is currently idling inside its time slice

3845

*/

3845

*/

3846

static void cfq_idle_slice_timer(unsigned long data)

3846

static void cfq_idle_slice_timer(unsigned long data)

3847

{

3847

{

3848

struct cfq_data *cfqd = (struct cfq_data *) data;

3848

struct cfq_data *cfqd = (struct cfq_data *) data;

3849

struct cfq_queue *cfqq;

3849

struct cfq_queue *cfqq;

3850

unsigned long flags;

3850

unsigned long flags;

3851

int timed_out = 1;

3851

int timed_out = 1;

3852

3853

cfq_log(cfqd, "idle timer fired");

3853

cfq_log(cfqd, "idle timer fired");

3854

3855

spin_lock_irqsave(cfqd->queue->queue_lock, flags);

3855

spin_lock_irqsave(cfqd->queue->queue_lock, flags);

3856

3857

cfqq = cfqd->active_queue;

3857

cfqq = cfqd->active_queue;

3858

if (cfqq) {

3858

if (cfqq) {

3859

timed_out = 0;

3859

timed_out = 0;

3860

3861

/*

3861

/*

3862

* We saw a request before the queue expired, let it through

3862

* We saw a request before the queue expired, let it through

3863

*/

3863

*/

3864

if (cfq_cfqq_must_dispatch(cfqq))

3864

if (cfq_cfqq_must_dispatch(cfqq))

3865

goto out_kick;

3865

goto out_kick;

3866

3867

/*

3867

/*

3868

* expired

3868

* expired

3869

*/

3869

*/

3870

if (cfq_slice_used(cfqq))

3870

if (cfq_slice_used(cfqq))

3871

goto expire;

3871

goto expire;

3872

3873

/*

3873

/*

3874

* only expire and reinvoke request handler, if there are

3874

* only expire and reinvoke request handler, if there are

3875

* other queues with pending requests

3875

* other queues with pending requests

3876

*/

3876

*/

3877

if (!cfqd->busy_queues)

3877

if (!cfqd->busy_queues)

3878

goto out_cont;

3878

goto out_cont;

3879

3880

/*

3880

/*

3881

* not expired and it has a request pending, let it dispatch

3881

* not expired and it has a request pending, let it dispatch

3882

*/

3882

*/

3883

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

3883

if (!RB_EMPTY_ROOT(&cfqq->sort_list))

3884

goto out_kick;

3884

goto out_kick;

3885

3886

/*

3886

/*

3887

* Queue depth flag is reset only when the idle didn't succeed

3887

* Queue depth flag is reset only when the idle didn't succeed

3888

*/

3888

*/

3889

cfq_clear_cfqq_deep(cfqq);

3889

cfq_clear_cfqq_deep(cfqq);

3890

}

3890

}

3891

expire:

3891

expire:

3892

cfq_slice_expired(cfqd, timed_out);

3892

cfq_slice_expired(cfqd, timed_out);

3893

out_kick:

3893

out_kick:

3894

cfq_schedule_dispatch(cfqd);

3894

cfq_schedule_dispatch(cfqd);

3895

out_cont:

3895

out_cont:

3896

spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);

3896

spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);

3897

}

3897

}

3898

3899

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)

3899

static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)

3900

{

3900

{

3901

del_timer_sync(&cfqd->idle_slice_timer);

3901

del_timer_sync(&cfqd->idle_slice_timer);

3902

cancel_work_sync(&cfqd->unplug_work);

3902

cancel_work_sync(&cfqd->unplug_work);

3903

}

3903

}

3904

3905

static void cfq_put_async_queues(struct cfq_data *cfqd)

3905

static void cfq_put_async_queues(struct cfq_data *cfqd)

3906

{

3906

{

3907

int i;

3907

int i;

3908

3909

for (i = 0; i < IOPRIO_BE_NR; i++) {

3909

for (i = 0; i < IOPRIO_BE_NR; i++) {

3910

if (cfqd->async_cfqq[0][i])

3910

if (cfqd->async_cfqq[0][i])

3911

cfq_put_queue(cfqd->async_cfqq[0][i]);

3911

cfq_put_queue(cfqd->async_cfqq[0][i]);

3912

if (cfqd->async_cfqq[1][i])

3912

if (cfqd->async_cfqq[1][i])

3913

cfq_put_queue(cfqd->async_cfqq[1][i]);

3913

cfq_put_queue(cfqd->async_cfqq[1][i]);

3914

}

3914

}

3915

3916

if (cfqd->async_idle_cfqq)

3916

if (cfqd->async_idle_cfqq)

3917

cfq_put_queue(cfqd->async_idle_cfqq);

3917

cfq_put_queue(cfqd->async_idle_cfqq);

3918

}

3918

}

3919

3920

static void cfq_exit_queue(struct elevator_queue *e)

3920

static void cfq_exit_queue(struct elevator_queue *e)

3921

{

3921

{

3922

struct cfq_data *cfqd = e->elevator_data;

3922

struct cfq_data *cfqd = e->elevator_data;

3923

struct request_queue *q = cfqd->queue;

3923

struct request_queue *q = cfqd->queue;

3924

3925

cfq_shutdown_timer_wq(cfqd);

3925

cfq_shutdown_timer_wq(cfqd);

3926

3927

spin_lock_irq(q->queue_lock);

3927

spin_lock_irq(q->queue_lock);

3928

3929

if (cfqd->active_queue)

3929

if (cfqd->active_queue)

3930

__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

3930

__cfq_slice_expired(cfqd, cfqd->active_queue, 0);

3931

3932

cfq_put_async_queues(cfqd);

3932

cfq_put_async_queues(cfqd);

3933

3934

spin_unlock_irq(q->queue_lock);

3934

spin_unlock_irq(q->queue_lock);

3935

3936

cfq_shutdown_timer_wq(cfqd);

3936

cfq_shutdown_timer_wq(cfqd);

3937

3938

#ifndef CONFIG_CFQ_GROUP_IOSCHED

3938

#ifndef CONFIG_CFQ_GROUP_IOSCHED

3939

kfree(cfqd->root_group);

3939

kfree(cfqd->root_group);

3940

#endif

3940

#endif

3941

update_root_blkg_pd(q, &blkio_policy_cfq);

3941

update_root_blkg_pd(q, &blkio_policy_cfq);

3942

kfree(cfqd);

3942

kfree(cfqd);

3943

}

3943

}

3944

3945

static int cfq_init_queue(struct request_queue *q)

3945

static int cfq_init_queue(struct request_queue *q)

3946

{

3946

{

3947

struct cfq_data *cfqd;

3947

struct cfq_data *cfqd;

3948

struct blkio_group *blkg __maybe_unused;

3948

struct blkio_group *blkg __maybe_unused;

3949

int i;

3949

int i;

3950

3951

cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);

3951

cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);

3952

if (!cfqd)

3952

if (!cfqd)

3953

return -ENOMEM;

3953

return -ENOMEM;

3954

3955

cfqd->queue = q;

3955

cfqd->queue = q;

3956

q->elevator->elevator_data = cfqd;

3956

q->elevator->elevator_data = cfqd;

3957

3958

/* Init root service tree */

3958

/* Init root service tree */

3959

cfqd->grp_service_tree = CFQ_RB_ROOT;

3959

cfqd->grp_service_tree = CFQ_RB_ROOT;

3960

3961

/* Init root group and prefer root group over other groups by default */

3961

/* Init root group and prefer root group over other groups by default */

3962

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3962

#ifdef CONFIG_CFQ_GROUP_IOSCHED

3963

rcu_read_lock();

3963

rcu_read_lock();

3964

spin_lock_irq(q->queue_lock);

3964

spin_lock_irq(q->queue_lock);

3965

3966

blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);

3966

blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);

3967

if (!IS_ERR(blkg))

3967

if (!IS_ERR(blkg)) {

3968

q->root_blkg = blkg;

3968

cfqd->root_group = blkg_to_cfqg(blkg);

3969

cfqd->root_group = blkg_to_cfqg(blkg);

3970

}

3969

3971

3970

spin_unlock_irq(q->queue_lock);

3972

spin_unlock_irq(q->queue_lock);

3971

rcu_read_unlock();

3973

rcu_read_unlock();

3972

#else

3974

#else

3973

cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),

3975

cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),

3974

GFP_KERNEL, cfqd->queue->node);

3976

GFP_KERNEL, cfqd->queue->node);

3975

if (cfqd->root_group)

3977

if (cfqd->root_group)

3976

cfq_init_cfqg_base(cfqd->root_group);

3978

cfq_init_cfqg_base(cfqd->root_group);

3977

#endif

3979

#endif

3978

if (!cfqd->root_group) {

3980

if (!cfqd->root_group) {

3979

kfree(cfqd);

3981

kfree(cfqd);

3980

return -ENOMEM;

3982

return -ENOMEM;

3981

}

3983

}

3982

3984

3983

cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;

3985

cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;

3984

3986

3985

/*

3987

/*

3986

* Not strictly needed (since RB_ROOT just clears the node and we

3988

* Not strictly needed (since RB_ROOT just clears the node and we

3987

* zeroed cfqd on alloc), but better be safe in case someone decides

3989

* zeroed cfqd on alloc), but better be safe in case someone decides

3988

* to add magic to the rb code

3990

* to add magic to the rb code

3989

*/

3991

*/

3990

for (i = 0; i < CFQ_PRIO_LISTS; i++)

3992

for (i = 0; i < CFQ_PRIO_LISTS; i++)

3991

cfqd->prio_trees[i] = RB_ROOT;

3993

cfqd->prio_trees[i] = RB_ROOT;

3992

3994

3993

/*

3995

/*

3994

* Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.

3996

* Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.

3995

* Grab a permanent reference to it, so that the normal code flow

3997

* Grab a permanent reference to it, so that the normal code flow

3996

* will not attempt to free it. oom_cfqq is linked to root_group

3998

* will not attempt to free it. oom_cfqq is linked to root_group

3997

* but shouldn't hold a reference as it'll never be unlinked. Lose

3999

* but shouldn't hold a reference as it'll never be unlinked. Lose

3998

* the reference from linking right away.

4000

* the reference from linking right away.

3999

*/

4001

*/

4000

cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);

4002

cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);

4001

cfqd->oom_cfqq.ref++;

4003

cfqd->oom_cfqq.ref++;

4002

4004

4003

spin_lock_irq(q->queue_lock);

4005

spin_lock_irq(q->queue_lock);

4004

cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);

4006

cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);

4005

cfqg_put(cfqd->root_group);

4007

cfqg_put(cfqd->root_group);

4006

spin_unlock_irq(q->queue_lock);

4008

spin_unlock_irq(q->queue_lock);

4007

4009

4008

init_timer(&cfqd->idle_slice_timer);

4010

init_timer(&cfqd->idle_slice_timer);

4009

cfqd->idle_slice_timer.function = cfq_idle_slice_timer;

4011

cfqd->idle_slice_timer.function = cfq_idle_slice_timer;

4010

cfqd->idle_slice_timer.data = (unsigned long) cfqd;

4012

cfqd->idle_slice_timer.data = (unsigned long) cfqd;

4011

4013

4012

INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

4014

INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);

4013

4015

4014

cfqd->cfq_quantum = cfq_quantum;

4016

cfqd->cfq_quantum = cfq_quantum;

4015

cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];

4017

cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];

4016

cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];

4018

cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];

4017

cfqd->cfq_back_max = cfq_back_max;

4019

cfqd->cfq_back_max = cfq_back_max;

4018

cfqd->cfq_back_penalty = cfq_back_penalty;

4020

cfqd->cfq_back_penalty = cfq_back_penalty;

4019

cfqd->cfq_slice[0] = cfq_slice_async;

4021

cfqd->cfq_slice[0] = cfq_slice_async;

4020

cfqd->cfq_slice[1] = cfq_slice_sync;

4022

cfqd->cfq_slice[1] = cfq_slice_sync;

4021

cfqd->cfq_slice_async_rq = cfq_slice_async_rq;

4023

cfqd->cfq_slice_async_rq = cfq_slice_async_rq;

4022

cfqd->cfq_slice_idle = cfq_slice_idle;

4024

cfqd->cfq_slice_idle = cfq_slice_idle;

4023

cfqd->cfq_group_idle = cfq_group_idle;

4025

cfqd->cfq_group_idle = cfq_group_idle;

4024

cfqd->cfq_latency = 1;

4026

cfqd->cfq_latency = 1;

4025

cfqd->hw_tag = -1;

4027

cfqd->hw_tag = -1;

4026

/*

4028

/*

4027

* we optimistically start assuming sync ops weren't delayed in last

4029

* we optimistically start assuming sync ops weren't delayed in last

4028

* second, in order to have larger depth for async operations.

4030

* second, in order to have larger depth for async operations.

4029

*/

4031

*/

4030

cfqd->last_delayed_sync = jiffies - HZ;

4032

cfqd->last_delayed_sync = jiffies - HZ;

4031

return 0;

4033

return 0;

4032

}

4034

}

4033

4035

4034

/*
 * sysfs parts below -->
 */

/*
 * Format @var as an unsigned decimal number followed by a newline into
 * @page.
 *
 * Returns the number of characters written (excluding the terminating
 * NUL), as reported by sprintf().
 *
 * @var is unsigned, so it is printed with %u; %d would render values
 * above INT_MAX as negative numbers.
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%u\n", var);
}

4042

4044

4043

static ssize_t

4045

static ssize_t

4044

cfq_var_store(unsigned int *var, const char *page, size_t count)

4046

cfq_var_store(unsigned int *var, const char *page, size_t count)

4045

{

4047

{

4046

char *p = (char *) page;

4048

char *p = (char *) page;

4047

4049

4048

*var = simple_strtoul(p, &p, 10);

4050

*var = simple_strtoul(p, &p, 10);

4049

return count;

4051

return count;

4050

}

4052

}

4051

4053

4052

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \

4054

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \

4053

static ssize_t __FUNC(struct elevator_queue *e, char *page) \

4055

static ssize_t __FUNC(struct elevator_queue *e, char *page) \

4054

{ \

4056

{ \

4055

struct cfq_data *cfqd = e->elevator_data; \

4057

struct cfq_data *cfqd = e->elevator_data; \

4056

unsigned int __data = __VAR; \

4058

unsigned int __data = __VAR; \

4057

if (__CONV) \

4059

if (__CONV) \

4058

__data = jiffies_to_msecs(__data); \

4060

__data = jiffies_to_msecs(__data); \

4059

return cfq_var_show(__data, (page)); \

4061

return cfq_var_show(__data, (page)); \

4060

}

4062

}

4061

SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);

4063

SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);

4062

SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);

4064

SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);

4063

SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);

4065

SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);

4064

SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);

4066

SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);

4065

SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);

4067

SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);

4066

SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);

4068

SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);

4067

SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);

4069

SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);

4068

SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);

4070

SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);

4069

SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);

4071

SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);

4070

SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);

4072

SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);

4071

SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);

4073

SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);

4072

#undef SHOW_FUNCTION

4074

#undef SHOW_FUNCTION

4073

4075

4074

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \

4076

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \

4075

static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \

4077

static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \

4076

{ \

4078

{ \

4077

struct cfq_data *cfqd = e->elevator_data; \

4079

struct cfq_data *cfqd = e->elevator_data; \

4078

unsigned int __data; \

4080

unsigned int __data; \

4079

int ret = cfq_var_store(&__data, (page), count); \

4081

int ret = cfq_var_store(&__data, (page), count); \

4080

if (__data < (MIN)) \

4082

if (__data < (MIN)) \

4081

__data = (MIN); \

4083

__data = (MIN); \

4082

else if (__data > (MAX)) \

4084

else if (__data > (MAX)) \

4083

__data = (MAX); \

4085

__data = (MAX); \

4084

if (__CONV) \

4086

if (__CONV) \

4085

*(__PTR) = msecs_to_jiffies(__data); \

4087

*(__PTR) = msecs_to_jiffies(__data); \

4086

else \

4088

else \

4087

*(__PTR) = __data; \

4089

*(__PTR) = __data; \

4088

return ret; \

4090

return ret; \

4089

}

4091

}

4090

STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);

4092

STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);

4091

STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,

4093

STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,

4092

UINT_MAX, 1);

4094

UINT_MAX, 1);

4093

STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,

4095

STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,

4094

UINT_MAX, 1);

4096

UINT_MAX, 1);

4095

STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);

4097

STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);

4096

STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,

4098

STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,

4097

UINT_MAX, 0);

4099

UINT_MAX, 0);

4098

STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);

4100

STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);

4099

STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);

4101

STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);

4100

STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);

4102

STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);

4101

STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);

4103

STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);

4102

STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,

4104

STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,

4103

UINT_MAX, 0);

4105

UINT_MAX, 0);

4104

STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);

4106

STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);

4105

#undef STORE_FUNCTION

4107

#undef STORE_FUNCTION

4106

4108

4107

#define CFQ_ATTR(name) \

4109

#define CFQ_ATTR(name) \

4108

__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

4110

__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

4109

4111

4110

static struct elv_fs_entry cfq_attrs[] = {

4112

static struct elv_fs_entry cfq_attrs[] = {

4111

CFQ_ATTR(quantum),

4113

CFQ_ATTR(quantum),

4112

CFQ_ATTR(fifo_expire_sync),

4114

CFQ_ATTR(fifo_expire_sync),

4113

CFQ_ATTR(fifo_expire_async),

4115

CFQ_ATTR(fifo_expire_async),

4114

CFQ_ATTR(back_seek_max),

4116

CFQ_ATTR(back_seek_max),

4115

CFQ_ATTR(back_seek_penalty),

4117

CFQ_ATTR(back_seek_penalty),

4116

CFQ_ATTR(slice_sync),

4118

CFQ_ATTR(slice_sync),

4117

CFQ_ATTR(slice_async),

4119

CFQ_ATTR(slice_async),

4118

CFQ_ATTR(slice_async_rq),

4120

CFQ_ATTR(slice_async_rq),

4119

CFQ_ATTR(slice_idle),

4121

CFQ_ATTR(slice_idle),

4120

CFQ_ATTR(group_idle),

4122

CFQ_ATTR(group_idle),

4121

CFQ_ATTR(low_latency),

4123

CFQ_ATTR(low_latency),

4122

__ATTR_NULL

4124

__ATTR_NULL

4123

};

4125

};

4124

4126

4125

static struct elevator_type iosched_cfq = {

4127

static struct elevator_type iosched_cfq = {

4126

.ops = {

4128

.ops = {

4127

.elevator_merge_fn = cfq_merge,

4129

.elevator_merge_fn = cfq_merge,

4128

.elevator_merged_fn = cfq_merged_request,

4130

.elevator_merged_fn = cfq_merged_request,

4129

.elevator_merge_req_fn = cfq_merged_requests,

4131

.elevator_merge_req_fn = cfq_merged_requests,

4130

.elevator_allow_merge_fn = cfq_allow_merge,

4132

.elevator_allow_merge_fn = cfq_allow_merge,

4131

.elevator_bio_merged_fn = cfq_bio_merged,

4133

.elevator_bio_merged_fn = cfq_bio_merged,

4132

.elevator_dispatch_fn = cfq_dispatch_requests,

4134

.elevator_dispatch_fn = cfq_dispatch_requests,

4133

.elevator_add_req_fn = cfq_insert_request,

4135

.elevator_add_req_fn = cfq_insert_request,

4134

.elevator_activate_req_fn = cfq_activate_request,

4136

.elevator_activate_req_fn = cfq_activate_request,

4135

.elevator_deactivate_req_fn = cfq_deactivate_request,

4137

.elevator_deactivate_req_fn = cfq_deactivate_request,

4136

.elevator_completed_req_fn = cfq_completed_request,

4138

.elevator_completed_req_fn = cfq_completed_request,

4137

.elevator_former_req_fn = elv_rb_former_request,

4139

.elevator_former_req_fn = elv_rb_former_request,

4138

.elevator_latter_req_fn = elv_rb_latter_request,

4140

.elevator_latter_req_fn = elv_rb_latter_request,

4139

.elevator_init_icq_fn = cfq_init_icq,

4141

.elevator_init_icq_fn = cfq_init_icq,

4140

.elevator_exit_icq_fn = cfq_exit_icq,

4142

.elevator_exit_icq_fn = cfq_exit_icq,

4141

.elevator_set_req_fn = cfq_set_request,

4143

.elevator_set_req_fn = cfq_set_request,

4142

.elevator_put_req_fn = cfq_put_request,

4144

.elevator_put_req_fn = cfq_put_request,

4143

.elevator_may_queue_fn = cfq_may_queue,

4145

.elevator_may_queue_fn = cfq_may_queue,

4144

.elevator_init_fn = cfq_init_queue,

4146

.elevator_init_fn = cfq_init_queue,

4145

.elevator_exit_fn = cfq_exit_queue,

4147

.elevator_exit_fn = cfq_exit_queue,

4146

},

4148

},

4147

.icq_size = sizeof(struct cfq_io_cq),

4149

.icq_size = sizeof(struct cfq_io_cq),

4148

.icq_align = __alignof__(struct cfq_io_cq),

4150

.icq_align = __alignof__(struct cfq_io_cq),

4149

.elevator_attrs = cfq_attrs,

4151

.elevator_attrs = cfq_attrs,

4150

.elevator_name = "cfq",

4152

.elevator_name = "cfq",

4151

.elevator_owner = THIS_MODULE,

4153

.elevator_owner = THIS_MODULE,

4152

};

4154

};

4153

4155

4154

#ifdef CONFIG_CFQ_GROUP_IOSCHED

4156

#ifdef CONFIG_CFQ_GROUP_IOSCHED

4155

static struct blkio_policy_type blkio_policy_cfq = {

4157

static struct blkio_policy_type blkio_policy_cfq = {

4156

.ops = {

4158

.ops = {

4157

.blkio_init_group_fn = cfq_init_blkio_group,

4159

.blkio_init_group_fn = cfq_init_blkio_group,

4158

.blkio_reset_group_stats_fn = cfqg_stats_reset,

4160

.blkio_reset_group_stats_fn = cfqg_stats_reset,

4159

},

4161

},

4160

.pdata_size = sizeof(struct cfq_group),

4162

.pdata_size = sizeof(struct cfq_group),

4161

.cftypes = cfq_blkcg_files,

4163

.cftypes = cfq_blkcg_files,

4162

};

4164

};

4163

#endif

4165

#endif

4164

4166

4165

static int __init cfq_init(void)

4167

static int __init cfq_init(void)

4166

{

4168

{

4167

int ret;

4169

int ret;

4168

4170

4169

/*

4171

/*

4170

* could be 0 on HZ < 1000 setups

4172

* could be 0 on HZ < 1000 setups

4171

*/

4173

*/

4172

if (!cfq_slice_async)

4174

if (!cfq_slice_async)

4173

cfq_slice_async = 1;

4175

cfq_slice_async = 1;

4174

if (!cfq_slice_idle)

4176

if (!cfq_slice_idle)

4175

cfq_slice_idle = 1;

4177

cfq_slice_idle = 1;

4176

4178

4177

#ifdef CONFIG_CFQ_GROUP_IOSCHED

4179

#ifdef CONFIG_CFQ_GROUP_IOSCHED

4178

if (!cfq_group_idle)

4180

if (!cfq_group_idle)

4179

cfq_group_idle = 1;

4181

cfq_group_idle = 1;

4180

#else

4182

#else

4181

cfq_group_idle = 0;

4183

cfq_group_idle = 0;

4182

#endif

4184

#endif

4183

4185

4184

ret = blkio_policy_register(&blkio_policy_cfq);

4186

ret = blkio_policy_register(&blkio_policy_cfq);

4185

if (ret)

4187

if (ret)

4186

return ret;

4188

return ret;

4187

4189

4188

cfq_pool = KMEM_CACHE(cfq_queue, 0);

4190

cfq_pool = KMEM_CACHE(cfq_queue, 0);

4189

if (!cfq_pool)

4191

if (!cfq_pool)

4190

goto err_pol_unreg;

4192

goto err_pol_unreg;

4191

4193

4192

ret = elv_register(&iosched_cfq);

4194

ret = elv_register(&iosched_cfq);

4193

if (ret)

4195

if (ret)

4194

goto err_free_pool;

4196

goto err_free_pool;

4195

4197

4196

return 0;

4198

return 0;

4197

4199

4198

err_free_pool:

4200

err_free_pool:

4199

kmem_cache_destroy(cfq_pool);

4201

kmem_cache_destroy(cfq_pool);

4200

err_pol_unreg:

4202

err_pol_unreg:

4201

blkio_policy_unregister(&blkio_policy_cfq);

4203

blkio_policy_unregister(&blkio_policy_cfq);

4202

return ret;

4204

return ret;

4203

}

4205

}

4204

4206

4205

static void __exit cfq_exit(void)

4207

static void __exit cfq_exit(void)

4206

{

4208

{

4207

blkio_policy_unregister(&blkio_policy_cfq);

4209

blkio_policy_unregister(&blkio_policy_cfq);

4208

elv_unregister(&iosched_cfq);

4210

elv_unregister(&iosched_cfq);

4209

kmem_cache_destroy(cfq_pool);

4211

kmem_cache_destroy(cfq_pool);

4210

}

4212

}

4211

4213

4212

module_init(cfq_init);

4214

module_init(cfq_init);

4213

module_exit(cfq_exit);

4215

module_exit(cfq_exit);

4214

4216

4215

MODULE_AUTHOR("Jens Axboe");

4217

MODULE_AUTHOR("Jens Axboe");

4216

MODULE_LICENSE("GPL");

4218

MODULE_LICENSE("GPL");

4217

MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");

4219

MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");

4218

4220

GITLAB

blkcg: add request_queue->root_blkg

 /*
  * Interface for controlling IO bandwidth on a request queue
  *
  * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
  */
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/bio.h>
 #include <linux/blktrace_api.h>
 #include "blk-cgroup.h"
 #include "blk.h"
 /* Max dispatch from a group in 1 round */
 static int throtl_grp_quantum = 8;
 /* Total max dispatch from all groups in one round */
 static int throtl_quantum = 32;
 /* Throttling is performed over 100ms slice and after that slice is renewed */
 static unsigned long throtl_slice = HZ/10;	/* 100 ms */
 /* Declared early because blkg_to_tg() below takes its address */
 static struct blkio_policy_type blkio_policy_throtl;
 /* A workqueue to queue throttle related work */
 static struct workqueue_struct *kthrotld_workqueue;
 /* Forward declaration: throtl_schedule_next_dispatch() calls it before its definition */
 static void throtl_schedule_delayed_work(struct throtl_data *td,
 				unsigned long delay);
 /* Service tree of throttle groups; nodes are ordered by throtl_grp->disptime */
 struct throtl_rb_root {
 	/* rbtree holding the throtl_grp->rb_node members */
 	struct rb_root rb;
 	/* cached first node; reset to NULL on erase, refilled by throtl_rb_first() */
 	struct rb_node *left;
 	/* number of groups currently on the tree */
 	unsigned int count;
 	/* disptime of the first group, refreshed by update_min_dispatch_time() */
 	unsigned long min_disptime;
 };
 /* Compound-literal initializer for an empty service tree */
 #define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
 			.count = 0, .min_disptime = 0}
 /* Convert a service-tree rb_node back to the throtl_grp embedding it */
 #define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)
 /*
  * Per-cpu group stats. Both counters are bumped together in
  * throtl_update_dispatch_stats() and cleared in throtl_reset_group_stats().
  */
 struct tg_stats_cpu {
 	/* total bytes transferred */
 	struct blkg_rwstat		service_bytes;
 	/* total IOs serviced, post merge */
 	struct blkg_rwstat		serviced;
 };
 /*
  * Throttling state of one blkio_group; obtained from the group through
  * blkg_to_tg(). Two-element arrays are indexed by IO direction.
  */
 struct throtl_grp {
 	/* active throtl group service_tree member */
 	struct rb_node rb_node;
 	/*
 	 * Dispatch time in jiffies. This is the estimated time when group
 	 * will unthrottle and is ready to dispatch more bio. It is used as
 	 * key to sort active groups in service tree.
 	 */
 	unsigned long disptime;
 	/* bitmask of (1 << THROTL_TG_FLAG_*), see THROTL_TG_FNS() */
 	unsigned int flags;
 	/* Two lists for READ and WRITE */
 	struct bio_list bio_lists[2];
 	/* Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
 	/* bytes per second rate limits; -1 means no limit */
 	uint64_t bps[2];
 	/* IOPS limits; -1 means no limit */
 	unsigned int iops[2];
 	/* Number of bytes dispatched in current slice */
 	uint64_t bytes_disp[2];
 	/* Number of bio's dispatched in current slice */
 	unsigned int io_disp[2];
 	/* When did we start a new slice */
 	unsigned long slice_start[2];
 	/* End of the current slice, in jiffies */
 	unsigned long slice_end[2];
 	/* Some throttle limits got updated for the group */
 	int limits_changed;
 	/* Per cpu stats pointer; NULL until tg_stats_alloc_fn() has filled it in */
 	struct tg_stats_cpu __percpu *stats_cpu;
 	/* List of tgs waiting for per cpu stats memory to be allocated */
 	struct list_head stats_alloc_node;
 };
 /* Per request_queue throttling state (reached through q->td) */
 struct throtl_data
 {
 	/* service tree for active throtl groups */
 	struct throtl_rb_root tg_service_tree;
-	struct throtl_grp *root_tg;
 	/* queue this throttling data belongs to */
 	struct request_queue *queue;
 	/* Total Number of queued bios on READ and WRITE lists */
 	unsigned int nr_queued[2];
 	/*
 	 * number of total undestroyed groups
 	 */
 	unsigned int nr_undestroyed_grps;
 	/* Work for dispatching throttled bios */
 	struct delayed_work throtl_work;
 	/* set when some group's limits changed; consumed by throtl_process_limit_change() */
 	int limits_changed;
 };
 /*
  * list and work item to allocate percpu group stats.
  * tg_stats_alloc_lock protects tg_stats_alloc_list; groups are added in
  * throtl_init_blkio_group() and removed by the work item or on group exit.
  */
 static DEFINE_SPINLOCK(tg_stats_alloc_lock);
 static LIST_HEAD(tg_stats_alloc_list);
 static void tg_stats_alloc_fn(struct work_struct *);
 static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);
 /* Return the throttle policy's per-group data attached to @blkg */
 static inline struct throtl_grp *blkg_to_tg(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_pdata(blkg, &blkio_policy_throtl);
 	return tg;
 }
 /* Inverse of blkg_to_tg(): return the blkio_group that owns @tg */
 static inline struct blkio_group *tg_to_blkg(struct throtl_grp *tg)
 {
 	struct blkio_group *blkg = pdata_to_blkg(tg);
 	return blkg;
 }
 /* Return the throttle group of the root blkg of @td's request queue */
+static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
+{
+	return blkg_to_tg(td->queue->root_blkg);
+}
 /* Bit numbers for throtl_grp->flags; accessors are generated by THROTL_TG_FNS() */
 enum tg_state_flags {
 	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
 };
 /*
  * Generate throtl_mark_tg_<name>(), throtl_clear_tg_<name>() and
  * throtl_tg_<name>() which set, clear and test bit THROTL_TG_FLAG_<name>
  * in tg->flags. The test helper returns 1 when the bit is set, else 0.
  */
 #define THROTL_TG_FNS(name)						\
 static inline void throtl_mark_tg_##name(struct throtl_grp *tg)		\
 {									\
 	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
 }									\
 static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
 {									\
 	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
 }									\
 static inline int throtl_tg_##name(const struct throtl_grp *tg)		\
 {									\
 	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
 }
 /* Instantiate the accessors for the on_rr flag */
 THROTL_TG_FNS(on_rr);
 /*
  * Emit a blktrace message on @td's queue, prefixed with "throtl " and the
  * cgroup path of @tg. The expansion is one blk_add_trace_msg() invocation
  * with no trailing semicolon and no trailing line continuation, so the
  * caller supplies the ';' and the macro is safe as an if/else body.
  */
 #define throtl_log_tg(td, tg, fmt, args...)				\
 	blk_add_trace_msg((td)->queue, "throtl %s " fmt,		\
 			  blkg_path(tg_to_blkg(tg)), ##args)
 #define throtl_log(td, fmt, args...)	\
 	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)
 /* Sum of the bios queued on @td in both directions */
 static inline unsigned int total_nr_queued(struct throtl_data *td)
 {
 	unsigned int first = td->nr_queued[0];
 	unsigned int second = td->nr_queued[1];
 	return first + second;
 }
 /*
  * Work item that hands out per-cpu stats to the groups queued on
  * tg_stats_alloc_list. It runs on system_nrt_wq and keeps going until the
  * list is drained; if an allocation fails it re-queues itself to retry
  * 10ms later.
  */
 static void tg_stats_alloc_fn(struct work_struct *work)
 {
 	/* spare buffer kept across iterations and invocations */
 	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
 	struct delayed_work *dwork = to_delayed_work(work);
 	bool more;
 	do {
 		if (!stats_cpu)
 			stats_cpu = alloc_percpu(struct tg_stats_cpu);
 		if (!stats_cpu) {
 			/* allocation failed, try again after some time */
 			queue_delayed_work(system_nrt_wq, dwork,
 					   msecs_to_jiffies(10));
 			return;
 		}
 		spin_lock_irq(&tg_stats_alloc_lock);
 		if (!list_empty(&tg_stats_alloc_list)) {
 			struct throtl_grp *tg;
 			tg = list_first_entry(&tg_stats_alloc_list,
 					      struct throtl_grp,
 					      stats_alloc_node);
 			/* give the buffer to the group, keep whatever it had */
 			swap(tg->stats_cpu, stats_cpu);
 			list_del_init(&tg->stats_alloc_node);
 		}
 		more = !list_empty(&tg_stats_alloc_list);
 		spin_unlock_irq(&tg_stats_alloc_lock);
 	} while (more);
 }
 /*
  * Initialise the throttle data of a new @blkg: empty bio lists, no limits
  * in either direction, and a request for per-cpu stats allocation.
  */
 static void throtl_init_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	RB_CLEAR_NODE(&tg->rb_node);
 	bio_list_init(&tg->bio_lists[0]);
 	bio_list_init(&tg->bio_lists[1]);
 	tg->limits_changed = false;
 	/* -1 stands for "unlimited" */
 	tg->bps[READ] = -1;
 	tg->iops[READ] = -1;
 	tg->bps[WRITE] = -1;
 	tg->iops[WRITE] = -1;
 	/*
 	 * tg->stats_cpu needs a per-cpu allocation, which the percpu
 	 * allocator does not allow from the IO path. Put the group on
 	 * tg_stats_alloc_list and let the work item do the allocation.
 	 */
 	spin_lock(&tg_stats_alloc_lock);
 	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
 	queue_delayed_work(system_nrt_wq, &tg_stats_alloc_work, 0);
 	spin_unlock(&tg_stats_alloc_lock);
 }
 /*
  * Tear down the throttle data of @blkg: take the group off the pending
  * stats-allocation list, then release its per-cpu stats.
  */
 static void throtl_exit_blkio_group(struct blkio_group *blkg)
 {
 	struct throtl_grp *grp = blkg_to_tg(blkg);
 	spin_lock(&tg_stats_alloc_lock);
 	list_del_init(&grp->stats_alloc_node);
 	spin_unlock(&tg_stats_alloc_lock);
 	free_percpu(grp->stats_cpu);
 }
 /* Zero the per-cpu dispatch counters of @blkg on every possible cpu */
 static void throtl_reset_group_stats(struct blkio_group *blkg)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	int cpu;
 	/* nothing to reset until the per-cpu stats have been allocated */
 	if (!tg->stats_cpu)
 		return;
 	for_each_possible_cpu(cpu) {
 		struct tg_stats_cpu *percpu_stats;
 		percpu_stats = per_cpu_ptr(tg->stats_cpu, cpu);
 		blkg_rwstat_reset(&percpu_stats->service_bytes);
 		blkg_rwstat_reset(&percpu_stats->serviced);
 	}
 }
 /*
  * Find the throttle group of @blkcg on @td's queue without creating one.
  * The root cgroup is answered directly with the queue's root group.
  */
 static struct
 throtl_grp *throtl_lookup_tg(struct throtl_data *td, struct blkio_cgroup *blkcg)
 {
 	/*
 	 * This is the common case when there are no blkio cgroups.
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup)
-		return td->root_tg;
+		return td_root_tg(td);
 	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
 }
 /*
  * Find the throttle group of @blkcg on @td's queue, creating it if needed.
  * Returns the root group when lookup/creation fails on a live queue, and
  * NULL when it fails and the queue is dead.
  */
 static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
 						  struct blkio_cgroup *blkcg)
 {
 	struct request_queue *q = td->queue;
 	struct throtl_grp *tg = NULL;
 	/*
 	 * This is the common case when there are no blkio cgroups.
 	 * Avoid lookup in this case
 	 */
 	if (blkcg == &blkio_root_cgroup) {
-		tg = td->root_tg;
+		tg = td_root_tg(td);
 	} else {
 		struct blkio_group *blkg;
 		blkg = blkg_lookup_create(blkcg, q, false);
 		/* on an error pointer, fall back to root_tg if @q is alive */
 		if (!IS_ERR(blkg))
 			tg = blkg_to_tg(blkg);
 		else if (!blk_queue_dead(q))
-			tg = td->root_tg;
+			tg = td_root_tg(td);
 	}
 	return tg;
 }
 /*
  * Return the first group on the service tree, or NULL if the tree is
  * empty. The first node is cached in root->left and looked up on demand.
  */
 static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
 {
 	/* Service tree is empty */
 	if (root->count == 0)
 		return NULL;
 	if (root->left == NULL)
 		root->left = rb_first(&root->rb);
 	return root->left ? rb_entry_tg(root->left) : NULL;
 }
 /* Remove @n from @root and clear the node so it reads as not on any tree */
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
 	RB_CLEAR_NODE(n);
 }
 /*
  * Remove @n from the service tree, dropping the cached first node if @n
  * was it, and account for the removal in root->count.
  */
 static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
 {
 	if (n == root->left)
 		root->left = NULL;
 	rb_erase_init(n, &root->rb);
 	root->count--;
 }
 /* Refresh st->min_disptime from the first group; left alone if tree is empty */
 static void update_min_dispatch_time(struct throtl_rb_root *st)
 {
 	struct throtl_grp *first = throtl_rb_first(st);
 	if (first)
 		st->min_disptime = first->disptime;
 }
 static void
 tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
 	struct throtl_grp *__tg;
 	unsigned long key = tg->disptime;
 	int left = 1;
 	while (*node != NULL) {
 		parent = *node;
 		__tg = rb_entry_tg(parent);
 		if (time_before(key, __tg->disptime))
 			node = &parent->rb_left;
 		else {
 			node = &parent->rb_right;
 			left = 0;
 		}
 	}
 	if (left)
 		st->left = &tg->rb_node;
 	rb_link_node(&tg->rb_node, parent, node);
 	rb_insert_color(&tg->rb_node, &st->rb);
 }
 /* Unconditionally put @tg on @td's service tree and flag it as on_rr */
 static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
 	struct throtl_rb_root *tree = &td->tg_service_tree;
 	tg_service_tree_add(tree, tg);
 	tree->count++;
 	throtl_mark_tg_on_rr(tg);
 }
 /* Put @tg on the service tree unless it is already there */
 static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
 	if (throtl_tg_on_rr(tg))
 		return;
 	__throtl_enqueue_tg(td, tg);
 }
 /* Unconditionally take @tg off @td's service tree and clear its on_rr flag */
 static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
 	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
 	throtl_clear_tg_on_rr(tg);
 }
 /* Take @tg off the service tree if it is currently on it */
 static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
 {
 	if (!throtl_tg_on_rr(tg))
 		return;
 	__throtl_dequeue_tg(td, tg);
 }
 /*
  * If bios are still queued, arm the dispatch work for the earliest group
  * dispatch time on the service tree (immediately if that time has passed).
  */
 static void throtl_schedule_next_dispatch(struct throtl_data *td)
 {
 	struct throtl_rb_root *st = &td->tg_service_tree;
 	unsigned long delay;
 	/* Nothing pending, nothing to schedule */
 	if (!total_nr_queued(td))
 		return;
 	BUG_ON(!st->count);
 	update_min_dispatch_time(st);
 	if (time_before_eq(st->min_disptime, jiffies))
 		delay = 0;
 	else
 		delay = st->min_disptime - jiffies;
 	throtl_schedule_delayed_work(td, delay);
 }
 /*
  * Begin a fresh slice for direction @rw: reset the dispatched byte and IO
  * counters and make the slice run from now for throtl_slice jiffies.
  */
 static inline void
 throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
 	tg->io_disp[rw] = 0;
 	tg->bytes_disp[rw] = 0;
 	tg->slice_start[rw] = jiffies;
 	tg->slice_end[rw] = jiffies + throtl_slice;
 	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
 			tg->slice_end[rw], jiffies);
 }
 /* Set the slice end for @rw to @jiffy_end rounded up to a throtl_slice multiple */
 static inline void throtl_set_slice_end(struct throtl_data *td,
 		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 {
 	unsigned long rounded_end = roundup(jiffy_end, throtl_slice);
 	tg->slice_end[rw] = rounded_end;
 }
 /*
  * Same as throtl_set_slice_end(), and additionally logs the resulting
  * slice boundaries to blktrace.
  */
 static inline void throtl_extend_slice(struct throtl_data *td,
 		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
 {
 	throtl_set_slice_end(td, tg, rw, jiffy_end);
 	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', tg->slice_start[rw],
 			tg->slice_end[rw], jiffies);
 }
 /*
  * Tell whether the current slice for @rw is over, i.e. jiffies no longer
  * lies within [slice_start, slice_end].
  */
 static bool
 throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
 	return !time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]);
 }
 /*
  * Trim the used slices and adjust slice start accordingly.
  * Called from tg_dispatch_one_bio() after a bio has been charged: whole
  * throtl_slice intervals that have already elapsed are cut off the front
  * of the slice and the budget they carried is subtracted from the
  * dispatched byte/IO counters (clamped at 0).
  */
 static inline void
 throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
 {
 	unsigned long nr_slices, time_elapsed, io_trim;
 	u64 bytes_trim, tmp;
 	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
 	/*
 	 * If bps are unlimited (-1), then time slice don't get
 	 * renewed. Don't try to trim the slice if slice is used. A new
 	 * slice will start when appropriate.
 	 */
 	if (throtl_slice_used(td, tg, rw))
 		return;
 	/*
 	 * A bio has been dispatched. Also adjust slice_end. It might happen
 	 * that initially cgroup limit was very low resulting in high
 	 * slice_end, but later limit was bumped up and bio was dispatched
 	 * sooner, then we need to reduce slice_end. A high bogus slice_end
 	 * is bad because it does not allow new slice to start.
 	 */
 	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);
 	time_elapsed = jiffies - tg->slice_start[rw];
 	/* number of complete slices that have elapsed */
 	nr_slices = time_elapsed / throtl_slice;
 	if (!nr_slices)
 		return;
 	/*
 	 * Budget covered by the elapsed slices: rate * jiffies / HZ.
 	 * NOTE(review): with a limit of -1 (unlimited) these products wrap;
 	 * confirm that trimming by the wrapped value is acceptable.
 	 */
 	tmp = tg->bps[rw] * throtl_slice * nr_slices;
 	do_div(tmp, HZ);
 	bytes_trim = tmp;
 	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;
 	if (!bytes_trim && !io_trim)
 		return;
 	if (tg->bytes_disp[rw] >= bytes_trim)
 		tg->bytes_disp[rw] -= bytes_trim;
 	else
 		tg->bytes_disp[rw] = 0;
 	if (tg->io_disp[rw] >= io_trim)
 		tg->io_disp[rw] -= io_trim;
 	else
 		tg->io_disp[rw] = 0;
 	tg->slice_start[rw] += nr_slices * throtl_slice;
 	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
 			" start=%lu end=%lu jiffies=%lu",
 			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
 			tg->slice_start[rw], tg->slice_end[rw], jiffies);
 }
 /*
  * Check whether one more IO in @bio's direction fits the group's IOPS
  * limit for the time elapsed in the current slice (rounded up to whole
  * throtl_slice intervals).
  * Returns 1 and stores 0 in *@wait if it fits; otherwise returns 0 and
  * stores an approximate number of jiffies to wait. @wait may be NULL.
  */
 static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
 		struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	unsigned int io_allowed;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 	u64 tmp;
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
 		jiffy_elapsed_rnd = throtl_slice;
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 	/*
 	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
 	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
 	 * will allow dispatch after 1 second and after that slice should
 	 * have been trimmed.
 	 */
 	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
 	do_div(tmp, HZ);
 	/* clamp the 64-bit result into the unsigned int counter range */
 	if (tmp > UINT_MAX)
 		io_allowed = UINT_MAX;
 	else
 		io_allowed = tmp;
 	if (tg->io_disp[rw] + 1 <= io_allowed) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 	/* Calc approx time to dispatch */
 	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;
 	/* the time already spent in this slice counts towards the wait */
 	if (jiffy_wait > jiffy_elapsed)
 		jiffy_wait = jiffy_wait - jiffy_elapsed;
 	else
 		jiffy_wait = 1;
 	if (wait)
 		*wait = jiffy_wait;
 	return 0;
 }
 /*
  * Check whether @bio's bytes fit the group's bytes-per-second limit for
  * the time elapsed in the current slice (rounded up to whole throtl_slice
  * intervals).
  * Returns 1 and stores 0 in *@wait if they fit; otherwise returns 0 and
  * stores an approximate number of jiffies to wait. @wait may be NULL.
  */
 static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
 		struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	u64 bytes_allowed, extra_bytes, tmp;
 	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
 	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
 	/* Slice has just started. Consider one slice interval */
 	if (!jiffy_elapsed)
 		jiffy_elapsed_rnd = throtl_slice;
 	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);
 	/* byte budget for the rounded elapsed time: bps * jiffies / HZ */
 	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
 	do_div(tmp, HZ);
 	bytes_allowed = tmp;
 	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 	/* Calc approx time to dispatch */
 	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
 	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);
 	/* never report a zero wait for a bio that is over the limit */
 	if (!jiffy_wait)
 		jiffy_wait = 1;
 	/*
 	 * This wait time is without taking into consideration the rounding
 	 * up we did. Add that time also.
 	 */
 	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
 	if (wait)
 		*wait = jiffy_wait;
 	return 0;
 }
 /* True when @tg has neither a bps nor an iops limit (both -1) for @rw */
 static bool tg_no_rule_group(struct throtl_grp *tg, bool rw)
 {
 	return tg->bps[rw] == -1 && tg->iops[rw] == -1;
 }
 /*
  * Returns whether one can dispatch a bio or not. Also returns approx number
  * of jiffies to wait before this bio is with-in IO rate and can be dispatched
  *
  * Side effects: starts a new slice or extends the current one for @bio's
  * direction, and on a refusal extends the slice to cover the wait time.
  * @wait may be NULL.
  */
 static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
 				struct bio *bio, unsigned long *wait)
 {
 	bool rw = bio_data_dir(bio);
 	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
 	/*
 	 * Currently whole state machine of group depends on first bio
 	 * queued in the group bio list. So one should not be calling
 	 * this function with a different bio if there are other bios
 	 * queued.
 	 */
 	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));
 	/* If tg->bps = -1, then BW is unlimited */
 	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 	/*
 	 * If previous slice expired, start a new one otherwise renew/extend
 	 * existing slice to make sure it is at least throtl_slice interval
 	 * long since now.
 	 */
 	if (throtl_slice_used(td, tg, rw))
 		throtl_start_new_slice(td, tg, rw);
 	else {
 		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
 			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
 	}
 	/*
 	 * && short-circuits: when the bps check fails the iops check is not
 	 * run and iops_wait keeps its initial 0.
 	 */
 	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
 	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
 		if (wait)
 			*wait = 0;
 		return 1;
 	}
 	max_wait = max(bps_wait, iops_wait);
 	if (wait)
 		*wait = max_wait;
 	/* keep the slice alive until the bio becomes dispatchable */
 	if (time_before(tg->slice_end[rw], jiffies + max_wait))
 		throtl_extend_slice(td, tg, rw, jiffies + max_wait);
 	return 0;
 }
 /*
  * Account one dispatched IO of @bytes bytes with flags @rw in the current
  * cpu's stats of @blkg.
  */
 static void throtl_update_dispatch_stats(struct blkio_group *blkg, u64 bytes,
 					 int rw)
 {
 	struct throtl_grp *tg = blkg_to_tg(blkg);
 	struct tg_stats_cpu *cur;
 	unsigned long irq_flags;
 	/* If per cpu stats are not allocated yet, don't do any accounting. */
 	if (!tg->stats_cpu)
 		return;
 	/*
 	 * Interrupts are disabled so that two updates on the same cpu
 	 * exclude each other. Probably unnecessary on 64bit; that case is
 	 * not optimized yet.
 	 */
 	local_irq_save(irq_flags);
 	cur = this_cpu_ptr(tg->stats_cpu);
 	blkg_rwstat_add(&cur->serviced, rw, 1);
 	blkg_rwstat_add(&cur->service_bytes, rw, bytes);
 	local_irq_restore(irq_flags);
 }
 /*
  * Charge @bio to @tg: bump the slice's IO and byte counters for the bio's
  * direction and record it in the per-cpu dispatch stats.
  */
 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
 {
 	bool dir = bio_data_dir(bio);
 	/* Charge the bio to the group */
 	tg->io_disp[dir]++;
 	tg->bytes_disp[dir] += bio->bi_size;
 	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
 }
 /*
  * Queue @bio on @tg's list for its direction, pin the group with a blkg
  * reference, update the queued counts and make sure @tg is on the
  * service tree.
  */
 static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
 			struct bio *bio)
 {
 	bool dir = bio_data_dir(bio);
 	bio_list_add(&tg->bio_lists[dir], bio);
 	/* Take a bio reference on tg */
 	blkg_get(tg_to_blkg(tg));
 	tg->nr_queued[dir]++;
 	td->nr_queued[dir]++;
 	throtl_enqueue_tg(td, tg);
 }
 /*
  * Recompute @tg's dispatch time as now plus the shorter of the waits of
  * its first queued READ and WRITE bio, then reinsert the group into the
  * service tree at the new position.
  */
 static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
 {
 	unsigned long read_wait = -1, write_wait = -1;
 	unsigned long new_disptime;
 	struct bio *head;
 	head = bio_list_peek(&tg->bio_lists[READ]);
 	if (head)
 		tg_may_dispatch(td, tg, head, &read_wait);
 	head = bio_list_peek(&tg->bio_lists[WRITE]);
 	if (head)
 		tg_may_dispatch(td, tg, head, &write_wait);
 	new_disptime = jiffies + min(read_wait, write_wait);
 	/* Update dispatch time */
 	throtl_dequeue_tg(td, tg);
 	tg->disptime = new_disptime;
 	throtl_enqueue_tg(td, tg);
 }
 /*
  * Move the first queued bio of direction @rw from @tg onto @bl.
  * Undoes the accounting and the blkg reference taken in
  * throtl_add_bio_tg(), charges the bio to the group, marks it
  * REQ_THROTTLED and finally trims the group's slice.
  */
 static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
 				bool rw, struct bio_list *bl)
 {
 	struct bio *bio;
 	bio = bio_list_pop(&tg->bio_lists[rw]);
 	tg->nr_queued[rw]--;
 	/* Drop bio reference on blkg */
 	blkg_put(tg_to_blkg(tg));
 	/* nr_queued is unsigned, so this fires only when it is already 0 */
 	BUG_ON(td->nr_queued[rw] <= 0);
 	td->nr_queued[rw]--;
 	throtl_charge_bio(tg, bio);
 	bio_list_add(bl, bio);
 	bio->bi_rw |= REQ_THROTTLED;
 	throtl_trim_slice(td, tg, rw);
 }
 /*
  * Dispatch as many bios from @tg onto @bl as its limits allow, bounded by
  * throtl_grp_quantum split 75% READS / 25% WRITES. Returns the number of
  * bios dispatched.
  */
 static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
 				struct bio_list *bl)
 {
 	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
 	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
 	unsigned int nr_reads = 0, nr_writes = 0;
 	struct bio *bio;
 	/* READs first */
 	for (;;) {
 		bio = bio_list_peek(&tg->bio_lists[READ]);
 		if (!bio || !tg_may_dispatch(td, tg, bio, NULL))
 			break;
 		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 		if (++nr_reads >= max_nr_reads)
 			break;
 	}
 	/* then WRITEs */
 	for (;;) {
 		bio = bio_list_peek(&tg->bio_lists[WRITE]);
 		if (!bio || !tg_may_dispatch(td, tg, bio, NULL))
 			break;
 		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
 		if (++nr_writes >= max_nr_writes)
 			break;
 	}
 	return nr_reads + nr_writes;
 }
 static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
 {
 	unsigned int nr_disp = 0;
 	struct throtl_grp *tg;
 	struct throtl_rb_root *st = &td->tg_service_tree;
 	while (1) {
 		tg = throtl_rb_first(st);
 		if (!tg)
 			break;
 		if (time_before(jiffies, tg->disptime))
 			break;
 		throtl_dequeue_tg(td, tg);
 		nr_disp += throtl_dispatch_tg(td, tg, bl);
 		if (tg->nr_queued[0] || tg->nr_queued[1]) {
 			tg_update_disptime(td, tg);
 			throtl_enqueue_tg(td, tg);
 		}
 		if (nr_disp >= throtl_quantum)
 			break;
 	}
 	return nr_disp;
 }
 /*
  * Apply pending limit changes: for every group on the queue whose
  * limits_changed flag is set, clear the flag, restart both slices and, if
  * the group is on the service tree, recompute its dispatch time.
  */
 static void throtl_process_limit_change(struct throtl_data *td)
 {
 	struct request_queue *q = td->queue;
 	struct blkio_group *blkg, *next;
 	if (!td->limits_changed)
 		return;
 	xchg(&td->limits_changed, false);
 	throtl_log(td, "limits changed");
 	list_for_each_entry_safe(blkg, next, &q->blkg_list, q_node) {
 		struct throtl_grp *tg = blkg_to_tg(blkg);
 		/* cheap read first, then atomically claim the flag */
 		if (!tg->limits_changed || !xchg(&tg->limits_changed, false))
 			continue;
 		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
 			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
 			tg->iops[READ], tg->iops[WRITE]);
 		/*
 		 * Restart the slices for both READ and WRITES, so that IO
 		 * dispatched recently under the old limits is not accounted
 		 * against a limit that was suddenly lowered.
 		 */
 		throtl_start_new_slice(td, tg, 0);
 		throtl_start_new_slice(td, tg, 1);
 		if (throtl_tg_on_rr(tg))
 			tg_update_disptime(td, tg);
 	}
 }
 /*
  * Dispatch throttled bios. Should be called without queue lock held.
  * Takes q->queue_lock itself to process limit changes and collect due
  * bios, then submits the collected bios after dropping the lock.
  * Returns the number of bios dispatched.
  */
 static int throtl_dispatch(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 	unsigned int nr_disp = 0;
 	/* only initialized and used when bios are queued (nr_disp stays 0 otherwise) */
 	struct bio_list bio_list_on_stack;
 	struct bio *bio;
 	struct blk_plug plug;
 	spin_lock_irq(q->queue_lock);
 	throtl_process_limit_change(td);
 	if (!total_nr_queued(td))
 		goto out;
 	bio_list_init(&bio_list_on_stack);
 	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
 			total_nr_queued(td), td->nr_queued[READ],
 			td->nr_queued[WRITE]);
 	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);
 	if (nr_disp)
 		throtl_log(td, "bios disp=%u", nr_disp);
 	throtl_schedule_next_dispatch(td);
 out:
 	spin_unlock_irq(q->queue_lock);
 	/*
 	 * If we dispatched some requests, unplug the queue to make sure
 	 * immediate dispatch
 	 */
 	if (nr_disp) {
 		blk_start_plug(&plug);
 		while((bio = bio_list_pop(&bio_list_on_stack)))
 			generic_make_request(bio);
 		blk_finish_plug(&plug);
 	}
 	return nr_disp;
 }
 /* Work callback of throtl_data->throtl_work: run a dispatch round on its queue */
 void blk_throtl_work(struct work_struct *work)
 {
 	struct throtl_data *td;
 	td = container_of(work, struct throtl_data, throtl_work.work);
 	throtl_dispatch(td->queue);
 }
 /*
  * (Re)arm @td's dispatch work to run after @delay jiffies. Does nothing
  * unless bios are queued or a limit change is pending.
  * Call with queue lock held.
  */
 static void
 throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
 {
 	struct delayed_work *dwork = &td->throtl_work;
 	/* schedule work if limits changed even if no bio is queued */
 	if (!total_nr_queued(td) && !td->limits_changed)
 		return;
 	/*
 	 * A previously scheduled run may still be pending for some later
 	 * time. Cancel that and schedule a new one.
 	 */
 	__cancel_delayed_work(dwork);
 	queue_delayed_work(kthrotld_workqueue, dwork, delay);
 	throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
 			delay, jiffies);
 }
 static u64 tg_prfill_cpu_rwstat(struct seq_file *sf, void *pdata, int off)
 {
 	struct throtl_grp *tg = pdata;
 	struct blkg_rwstat rwstat = { }, tmp;
 	int i, cpu;
 	for_each_possible_cpu(cpu) {
 		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);
 		tmp = blkg_rwstat_read((void *)sc + off);
 		for (i = 0; i < BLKG_RWSTAT_NR; i++)
 			rwstat.cnt[i] += tmp.cnt[i];
 	}
 	return __blkg_prfill_rwstat(sf, pdata, &rwstat);
 }
 /*
  * cgroup read handler: print the per-cpu rwstat selected by cft->private
  * for every group of the cgroup. Always returns 0.
  */
 static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			       struct seq_file *sf)
 {
 	blkcg_print_blkgs(sf, cgroup_to_blkio_cgroup(cgrp),
 			  tg_prfill_cpu_rwstat, &blkio_policy_throtl,
 			  cft->private, true);
 	return 0;
 }
 /*
  * Print the u64 limit stored @off bytes into @pdata. A value of -1
  * (no limit) prints nothing and yields 0.
  */
 static u64 tg_prfill_conf_u64(struct seq_file *sf, void *pdata, int off)
 {
 	u64 limit = *(u64 *)(pdata + off);
 	return limit == -1 ? 0 : __blkg_prfill_u64(sf, pdata, limit);
 }
 /*
  * Print the unsigned int limit stored @off bytes into @pdata. A value of
  * -1 (no limit) prints nothing and yields 0.
  */
 static u64 tg_prfill_conf_uint(struct seq_file *sf, void *pdata, int off)
 {
 	unsigned int limit = *(unsigned int *)(pdata + off);
 	return limit == -1 ? 0 : __blkg_prfill_u64(sf, pdata, limit);
 }
 /*
  * cgroup read handler: print the u64 limit selected by cft->private for
  * every group of the cgroup. Always returns 0.
  */
 static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 			     struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, tg_prfill_conf_u64, &blkio_policy_throtl,
 			  cft->private, false);
 	return 0;
 }
 /*
  * cgroup read handler: print the unsigned int limit selected by
  * cft->private for every group of the cgroup. Always returns 0.
  */
 static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
 			      struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, tg_prfill_conf_uint, &blkio_policy_throtl,
 			  cft->private, false);
 	return 0;
 }
 /*
  * Common write path for the throttle limit files.
  *
  * @buf is parsed by blkg_conf_prep(); the resulting value is stored in the
  * throtl_grp field at offset cft->private, as a u64 when @is_u64 is true
  * and as an unsigned int otherwise.  A written 0 is stored as -1.  Both the
  * group and its throtl_data are then flagged limits_changed and the
  * throttle work is scheduled with zero delay.
  *
  * Returns 0 on success, the blkg_conf_prep() error, or -EINVAL when the
  * blkg has no throtl_grp.
  */
 static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
 		       bool is_u64)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct throtl_data *td;
 	struct throtl_grp *tg;
 	void *field;
 	int ret;
 	ret = blkg_conf_prep(blkcg, &blkio_policy_throtl, buf, &ctx);
 	if (ret)
 		return ret;
 	tg = blkg_to_tg(ctx.blkg);
 	if (!tg) {
 		ret = -EINVAL;
 		goto finish;
 	}
 	td = ctx.blkg->q->td;
 	field = (void *)tg + cft->private;
 	if (!ctx.v)
 		ctx.v = -1;
 	if (is_u64)
 		*(u64 *)field = ctx.v;
 	else
 		*(unsigned int *)field = ctx.v;
 	/* XXX: we don't need the following deferred processing */
 	xchg(&tg->limits_changed, true);
 	xchg(&td->limits_changed, true);
 	throtl_schedule_delayed_work(td, 0);
 	ret = 0;
 finish:
 	blkg_conf_finish(&ctx);
 	return ret;
 }
 static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
 			   const char *buf)
 {
 	return tg_set_conf(cgrp, cft, buf, true);
 }
 static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
 			    const char *buf)
 {
 	return tg_set_conf(cgrp, cft, buf, false);
 }
 /*
  * cgroup control files exported by the throttle policy.
  *
  * For the four *_device limit files, .private is the offset of the limit
  * inside struct throtl_grp.  For the two statistics files it is the offset
  * of the counter inside struct tg_stats_cpu; those files are read-only
  * (no .write_string handler).
  */
 static struct cftype throtl_files[] = {
 	{
 		.name = "throttle.read_bps_device",
 		.private = offsetof(struct throtl_grp, bps[READ]),
 		.read_seq_string = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_bps_device",
 		.private = offsetof(struct throtl_grp, bps[WRITE]),
 		.read_seq_string = tg_print_conf_u64,
 		.write_string = tg_set_conf_u64,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.read_iops_device",
 		.private = offsetof(struct throtl_grp, iops[READ]),
 		.read_seq_string = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.write_iops_device",
 		.private = offsetof(struct throtl_grp, iops[WRITE]),
 		.read_seq_string = tg_print_conf_uint,
 		.write_string = tg_set_conf_uint,
 		.max_write_len = 256,
 	},
 	{
 		.name = "throttle.io_service_bytes",
 		.private = offsetof(struct tg_stats_cpu, service_bytes),
 		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{
 		.name = "throttle.io_serviced",
 		.private = offsetof(struct tg_stats_cpu, serviced),
 		.read_seq_string = tg_print_cpu_rwstat,
 	},
 	{ }	/* terminate */
 };
 /* Cancel @q's throttle work and wait for a running instance to finish. */
 static void throtl_shutdown_wq(struct request_queue *q)
 {
 	struct delayed_work *dwork = &q->td->throtl_work;
 	cancel_delayed_work_sync(dwork);
 }
 /*
  * Throttle policy descriptor: group init/exit/stat-reset callbacks, the
  * size of the per-group private data (one struct throtl_grp) and the
  * cgroup files defined in throtl_files[].  Registered from throtl_init().
  */
 static struct blkio_policy_type blkio_policy_throtl = {
 	.ops = {
 		.blkio_init_group_fn = throtl_init_blkio_group,
 		.blkio_exit_group_fn = throtl_exit_blkio_group,
 		.blkio_reset_group_stats_fn = throtl_reset_group_stats,
 	},
 	.pdata_size = sizeof(struct throtl_grp),
 	.cftypes = throtl_files,
 };
 /*
  * Decide whether @bio, submitted to @q, has to be throttled.
  *
  * A bio already marked REQ_THROTTLED has the flag cleared and is let
  * through.  Otherwise the bio's throttle group is looked up; a group
  * without rules for this direction only gets its dispatch stats updated.
  * In all other cases the queue lock is taken and the bio is either charged
  * to the group (within limits, nothing queued in that direction) or queued
  * on the group.
  *
  * Returns true if the bio was queued on a throttle group, false otherwise.
  */
 bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_grp *tg;
 	bool rw = bio_data_dir(bio), update_disptime = true;
 	struct blkio_cgroup *blkcg;
 	bool throttled = false;
 	if (bio->bi_rw & REQ_THROTTLED) {
 		bio->bi_rw &= ~REQ_THROTTLED;
 		goto out;
 	}
 	/* bio_associate_current() needs ioc, try creating */
 	create_io_context(GFP_ATOMIC, q->node);
 	/*
 	 * A throtl_grp pointer retrieved under rcu can be used to access
 	 * basic fields like stats and io rates. If a group has no rules,
 	 * just update the dispatch stats in lockless manner and return.
 	 */
 	rcu_read_lock();
 	blkcg = bio_blkio_cgroup(bio);
 	tg = throtl_lookup_tg(td, blkcg);
 	if (tg) {
 		if (tg_no_rule_group(tg, rw)) {
 			throtl_update_dispatch_stats(tg_to_blkg(tg),
 						     bio->bi_size, bio->bi_rw);
 			goto out_unlock_rcu;
 		}
 	}
 	/*
 	 * Either group has not been allocated yet or it is not an unlimited
 	 * IO group
 	 */
 	spin_lock_irq(q->queue_lock);
 	tg = throtl_lookup_create_tg(td, blkcg);
 	if (unlikely(!tg))
 		goto out_unlock;
 	if (tg->nr_queued[rw]) {
 		/*
 		 * There is already another bio queued in same dir. No
 		 * need to update dispatch time.
 		 */
 		update_disptime = false;
 		goto queue_bio;
 	}
 	/* Bio is within the rate limit of the group */
 	if (tg_may_dispatch(td, tg, bio, NULL)) {
 		throtl_charge_bio(tg, bio);
 		/*
 		 * We need to trim slice even when bios are not being queued
 		 * otherwise it might happen that a bio is not queued for
 		 * a long time and slice keeps on extending and trim is not
 		 * called for a long time. Now if limits are reduced suddenly
 		 * we take into account all the IO dispatched so far at new
 		 * low rate and newly queued IO gets a really long dispatch
 		 * time.
 		 *
 		 * So keep on trimming slice even if bio is not queued.
 		 */
 		throtl_trim_slice(td, tg, rw);
 		goto out_unlock;
 	}
 queue_bio:
 	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
 			" iodisp=%u iops=%u queued=%d/%d",
 			rw == READ ? 'R' : 'W',
 			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
 			tg->io_disp[rw], tg->iops[rw],
 			tg->nr_queued[READ], tg->nr_queued[WRITE]);
 	bio_associate_current(bio);
 	throtl_add_bio_tg(q->td, tg, bio);
 	throttled = true;
 	/* only the first bio queued in a direction changes the dispatch time */
 	if (update_disptime) {
 		tg_update_disptime(td, tg);
 		throtl_schedule_next_dispatch(td);
 	}
 out_unlock:
 	spin_unlock_irq(q->queue_lock);
 out_unlock_rcu:
 	rcu_read_unlock();
 out:
 	return throttled;
 }
 /**
  * blk_throtl_drain - drain throttled bios
  * @q: request_queue to drain throttled bios for
  *
  * Dispatch all currently throttled bios on @q through ->make_request_fn().
  *
  * Every group on the service tree is dequeued and its READ then WRITE bios
  * are moved to a local list.  The queue lock, which must be held on entry,
  * is dropped while that list is resubmitted and re-taken before returning.
  */
 void blk_throtl_drain(struct request_queue *q)
 	__releases(q->queue_lock) __acquires(q->queue_lock)
 {
 	struct throtl_data *td = q->td;
 	struct throtl_rb_root *tree = &td->tg_service_tree;
 	struct bio_list pending;
 	struct throtl_grp *grp;
 	struct bio *bio;
 	WARN_ON_ONCE(!queue_is_locked(q));
 	bio_list_init(&pending);
 	for (grp = throtl_rb_first(tree); grp; grp = throtl_rb_first(tree)) {
 		throtl_dequeue_tg(td, grp);
 		while ((bio = bio_list_peek(&grp->bio_lists[READ])) != NULL)
 			tg_dispatch_one_bio(td, grp, bio_data_dir(bio),
 					    &pending);
 		while ((bio = bio_list_peek(&grp->bio_lists[WRITE])) != NULL)
 			tg_dispatch_one_bio(td, grp, bio_data_dir(bio),
 					    &pending);
 	}
 	spin_unlock_irq(q->queue_lock);
 	for (bio = bio_list_pop(&pending); bio; bio = bio_list_pop(&pending))
 		generic_make_request(bio);
 	spin_lock_irq(q->queue_lock);
 }
 /*
  * Allocate and attach the throttle state for @q and create the root
  * block group for it.
  *
  * Returns 0 on success, -ENOMEM if the throtl_data allocation fails or no
  * root group could be set up.
  *
  * NOTE(review): on the root-group failure path td is freed while q->td
  * still points at it -- confirm that callers never touch q->td after a
  * failed init.
  */
 int blk_throtl_init(struct request_queue *q)
 {
 	struct throtl_data *td;
 	struct blkio_group *blkg;
 	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
 	if (!td)
 		return -ENOMEM;
 	td->tg_service_tree = THROTL_RB_ROOT;
 	td->limits_changed = false;
 	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);
 	q->td = td;
 	td->queue = q;
 	/* alloc and init root group. */
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
 	if (!IS_ERR(blkg))
-		td->root_tg = blkg_to_tg(blkg);
+		q->root_blkg = blkg;
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
-	if (!td->root_tg) {
+	if (!q->root_blkg) {
 		kfree(td);
 		return -ENOMEM;
 	}
 	return 0;
 }
 /*
  * Tear down the throttle state of @q: stop the throttle work, then free
  * the throtl_data.  It is a BUG if @q has no throtl_data.
  */
 void blk_throtl_exit(struct request_queue *q)
 {
 	struct throtl_data *td = q->td;
 	BUG_ON(!td);
 	throtl_shutdown_wq(q);
 	kfree(td);
 }
 /*
  * Module init: create the "kthrotld" workqueue (panics if that fails) and
  * register the throttle policy, returning the registration result.
  */
 static int __init throtl_init(void)
 {
 	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
 	if (kthrotld_workqueue == NULL)
 		panic("Failed to create kthrotld\n");
 	return blkio_policy_register(&blkio_policy_throtl);
 }
 module_init(throtl_init);

 /*
  *  CFQ, or complete fairness queueing, disk scheduler.
  *
  *  Based on ideas from a previously unfinished io
  *  scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
  *
  *  Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
  */
 #include <linux/module.h>
 #include <linux/slab.h>
 #include <linux/blkdev.h>
 #include <linux/elevator.h>
 #include <linux/jiffies.h>
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
 #include "blk.h"
 #include "blk-cgroup.h"
 /*
  * Tentative definition of the CFQ blkio policy; blkg_to_cfqg() below takes
  * its address.  __maybe_unused silences the warning in configurations
  * where nothing references it.
  */
 static struct blkio_policy_type blkio_policy_cfq __maybe_unused;
 /*
  * tunables
  *
  * File-scope defaults.  Values written in terms of HZ are in jiffies.
  * struct cfq_data carries per-queue fields with the same names.
  */
 /* max queue in one round of service */
 static const int cfq_quantum = 8;
 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
 /* maximum backwards seek, in KiB */
 static const int cfq_back_max = 16 * 1024;
 /* penalty of a backwards seek */
 static const int cfq_back_penalty = 2;
 static const int cfq_slice_sync = HZ / 10;
 static int cfq_slice_async = HZ / 25;
 static const int cfq_slice_async_rq = 2;
 static int cfq_slice_idle = HZ / 125;
 static int cfq_group_idle = HZ / 125;
 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
 static const int cfq_hist_divisor = 4;
 /*
  * offset from end of service tree
  */
 #define CFQ_IDLE_DELAY		(HZ / 5)
 /*
  * below this threshold, we consider thinktime immediate
  */
 #define CFQ_MIN_TT		(2)
 #define CFQ_SLICE_SCALE		(5)
 #define CFQ_HW_QUEUE_MIN	(5)
 #define CFQ_SERVICE_SHIFT       12
 /*
  * Every expansion below is fully parenthesized so that the macro behaves
  * as a single primary expression: a bare cast binds looser than "->",
  * "[]" or "()", which would make e.g. RQ_CFQQ(rq)->field misparse.
  */
 #define CFQQ_SEEK_THR		((sector_t)(8 * 100))
 #define CFQQ_CLOSE_THR		((sector_t)(8 * 1024))
 #define CFQQ_SECT_THR_NONROT	((sector_t)(2 * 32))
 /* true when more than 32/8 bits are set in the queue's seek_history */
 #define CFQQ_SEEKY(cfqq)	(hweight32((cfqq)->seek_history) > 32/8)
 /* per-request scheduler data: the icq, and the queue/group in elv.priv[] */
 #define RQ_CIC(rq)		icq_to_cic((rq)->elv.icq)
 #define RQ_CFQQ(rq)		((struct cfq_queue *) ((rq)->elv.priv[0]))
 #define RQ_CFQG(rq)		((struct cfq_group *) ((rq)->elv.priv[1]))
 static struct kmem_cache *cfq_pool;
 #define CFQ_PRIO_LISTS		IOPRIO_BE_NR
 #define cfq_class_idle(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
 #define cfq_class_rt(cfqq)	((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
 /* a statistic is trusted only once more than 80 samples were collected */
 #define sample_valid(samples)	((samples) > 80)
 #define rb_entry_cfqg(node)	rb_entry((node), struct cfq_group, rb_node)
 /*
  * Think-time statistics.  Embedded in cfq_rb_root, cfq_group and
  * cfq_io_cq.  cfq_io_thinktime_big() compares ttime_mean with an idle
  * window once ttime_samples is large enough to be trusted.
  */
 struct cfq_ttime {
 	unsigned long last_end_request;
 	unsigned long ttime_total;
 	unsigned long ttime_samples;
 	unsigned long ttime_mean;
 };
 /*
  * Most of our rbtree usage is for sorting with min extraction, so
  * if we cache the leftmost node we don't have to walk down the tree
  * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
  * move this into the elevator for the rq sorting as well.
  */
 struct cfq_rb_root {
 	struct rb_root rb;
 	/* cached leftmost node, see above */
 	struct rb_node *left;
 	unsigned count;
 	unsigned total_weight;
 	u64 min_vdisktime;
 	struct cfq_ttime ttime;
 };
 /* Initializer: empty tree, think-time clock started at the current jiffies */
 #define CFQ_RB_ROOT	(struct cfq_rb_root) { .rb = RB_ROOT, \
 			.ttime = {.last_end_request = jiffies,},}
 /*
  * Per process-grouping structure
  */
 struct cfq_queue {
 	/* reference count */
 	int ref;
 	/* various state flags, see below */
 	unsigned int flags;
 	/* parent cfq_data */
 	struct cfq_data *cfqd;
 	/* service_tree member */
 	struct rb_node rb_node;
 	/* service_tree key */
 	unsigned long rb_key;
 	/* prio tree member */
 	struct rb_node p_node;
 	/* prio tree root we belong to, if any */
 	struct rb_root *p_root;
 	/* sorted list of pending requests */
 	struct rb_root sort_list;
 	/* if fifo isn't expired, next request to serve */
 	struct request *next_rq;
 	/* requests queued in sort_list */
 	int queued[2];
 	/* currently allocated requests */
 	int allocated[2];
 	/* fifo list of requests in sort_list */
 	struct list_head fifo;
 	/* time when queue got scheduled in to dispatch first request. */
 	unsigned long dispatch_start;
 	unsigned int allocated_slice;
 	unsigned int slice_dispatch;
 	/* time when first request from queue completed and slice started. */
 	unsigned long slice_start;
 	unsigned long slice_end;
 	long slice_resid;
 	/* pending priority requests */
 	int prio_pending;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	/* io prio of this group */
 	unsigned short ioprio, org_ioprio;
 	unsigned short ioprio_class;
 	pid_t pid;
 	/* bit history tested by CFQQ_SEEKY() */
 	u32 seek_history;
 	sector_t last_request_pos;
 	struct cfq_rb_root *service_tree;
 	struct cfq_queue *new_cfqq;
 	/* group this queue belongs to */
 	struct cfq_group *cfqg;
 	/* Number of sectors dispatched from queue in single dispatch round */
 	unsigned long nr_sectors;
 };
 /*
  * First index in the service_trees.
  * BE and RT index cfq_group->service_trees[][]; IDLE is handled separately
  * through cfq_group->service_tree_idle (see service_tree_for()).
  * CFQ_PRIO_NR is the number of classes.
  */
 enum wl_prio_t {
 	BE_WORKLOAD = 0,
 	RT_WORKLOAD = 1,
 	IDLE_WORKLOAD = 2,
 	CFQ_PRIO_NR,
 };
 /*
  * Second index in the service_trees.
  * cfqq_type() maps a queue to one of these: not sync -> ASYNC, sync
  * without idle window -> SYNC_NOIDLE, otherwise SYNC.
  */
 enum wl_type_t {
 	ASYNC_WORKLOAD = 0,
 	SYNC_NOIDLE_WORKLOAD = 1,
 	SYNC_WORKLOAD = 2
 };
 /*
  * Per-group statistics.  The struct is empty unless
  * CONFIG_CFQ_GROUP_IOSCHED is set; the second half additionally needs
  * CONFIG_DEBUG_BLK_CGROUP.
  */
 struct cfqg_stats {
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	/* total bytes transferred */
 	struct blkg_rwstat		service_bytes;
 	/* total IOs serviced, post merge */
 	struct blkg_rwstat		serviced;
 	/* number of ios merged */
 	struct blkg_rwstat		merged;
 	/* total time spent on device in ns, may not be accurate w/ queueing */
 	struct blkg_rwstat		service_time;
 	/* total time spent waiting in scheduler queue in ns */
 	struct blkg_rwstat		wait_time;
 	/* number of IOs queued up */
 	struct blkg_rwstat		queued;
 	/* total sectors transferred */
 	struct blkg_stat		sectors;
 	/* total disk time and nr sectors dispatched by this group */
 	struct blkg_stat		time;
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* time not charged to this cgroup */
 	struct blkg_stat		unaccounted_time;
 	/* sum of number of ios queued across all samples */
 	struct blkg_stat		avg_queue_size_sum;
 	/* count of samples taken for average */
 	struct blkg_stat		avg_queue_size_samples;
 	/* how many times this group has been removed from service tree */
 	struct blkg_stat		dequeue;
 	/* total time spent waiting for it to be assigned a timeslice. */
 	struct blkg_stat		group_wait_time;
 	/* time spent idling for this blkio_group */
 	struct blkg_stat		idle_time;
 	/* total time with empty current active q with other requests queued */
 	struct blkg_stat		empty_time;
 	/* fields after this shouldn't be cleared on stat reset */
 	/* sched_clock() stamps taken when the matching flag bit was set */
 	uint64_t			start_group_wait_time;
 	uint64_t			start_idle_time;
 	uint64_t			start_empty_time;
 	/* bitmask of enum cfqg_stats_flags */
 	uint16_t			flags;
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 };
 /* This is per cgroup per device grouping structure */
 struct cfq_group {
 	/* group service_tree member */
 	struct rb_node rb_node;
 	/* group service_tree key */
 	u64 vdisktime;
 	/* weight is the divisor in cfq_scale_slice() and cfq_group_slice() */
 	unsigned int weight;
 	unsigned int new_weight;
 	unsigned int dev_weight;
 	/* number of cfqq currently on this group */
 	int nr_cfqq;
 	/*
 	 * Per group busy queues average. Useful for workload slice calc. We
 	 * create the array for each prio class but at run time it is used
 	 * only for RT and BE class and slot for IDLE class remains unused.
 	 * This is primarily done to avoid confusion and a gcc warning.
 	 */
 	unsigned int busy_queues_avg[CFQ_PRIO_NR];
 	/*
 	 * rr lists of queues with requests. We maintain service trees for
 	 * RT and BE classes. These trees are subdivided in subclasses
 	 * of SYNC, SYNC_NOIDLE and ASYNC based on workload type. For IDLE
 	 * class there is no subclassification and all the cfq queues go on
 	 * a single tree service_tree_idle.
 	 * Counts are embedded in the cfq_rb_root
 	 */
 	struct cfq_rb_root service_trees[2][3];
 	struct cfq_rb_root service_tree_idle;
 	unsigned long saved_workload_slice;
 	enum wl_type_t saved_workload;
 	enum wl_prio_t saved_serving_prio;
 	/* number of requests that are on the dispatch list or inside driver */
 	int dispatched;
 	struct cfq_ttime ttime;
 	struct cfqg_stats stats;
 };
 /*
  * CFQ's io_cq wrapper.  icq must stay the first member: icq_to_cic()
  * relies on a NULL io_cq converting to a NULL cfq_io_cq.  cfqq[] is
  * indexed by is_sync (see cic_to_cfqq()).
  */
 struct cfq_io_cq {
 	struct io_cq		icq;		/* must be the first member */
 	struct cfq_queue	*cfqq[2];
 	struct cfq_ttime	ttime;
 	int			ioprio;		/* the current ioprio */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	uint64_t		blkcg_id;	/* the current blkcg ID */
 #endif
 };
 /*
  * Per block device queue structure
  */
 struct cfq_data {
 	/* request queue this scheduler instance is attached to */
 	struct request_queue *queue;
 	/* Root service tree for cfq_groups */
 	struct cfq_rb_root grp_service_tree;
 	struct cfq_group *root_group;
 	/*
 	 * The priority currently being served
 	 */
 	enum wl_prio_t serving_prio;
 	enum wl_type_t serving_type;
 	unsigned long workload_expires;
 	struct cfq_group *serving_group;
 	/*
 	 * Each priority tree is sorted by next_request position.  These
 	 * trees are used when determining if two or more queues are
 	 * interleaving requests (see cfq_close_cooperator).
 	 */
 	struct rb_root prio_trees[CFQ_PRIO_LISTS];
 	unsigned int busy_queues;
 	unsigned int busy_sync_queues;
 	int rq_in_driver;
 	int rq_in_flight[2];
 	/*
 	 * queue-depth detection
 	 */
 	int rq_queued;
 	int hw_tag;
 	/*
 	 * hw_tag can be
 	 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
 	 *  1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
 	 *  0 => no NCQ
 	 */
 	int hw_tag_est_depth;
 	unsigned int hw_tag_samples;
 	/*
 	 * idle window management
 	 */
 	struct timer_list idle_slice_timer;
 	/* work item queued by cfq_schedule_dispatch() */
 	struct work_struct unplug_work;
 	struct cfq_queue *active_queue;
 	struct cfq_io_cq *active_cic;
 	/*
 	 * async queue for each priority case
 	 */
 	struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
 	struct cfq_queue *async_idle_cfqq;
 	sector_t last_position;
 	/*
 	 * tunables, see top of file
 	 */
 	unsigned int cfq_quantum;
 	unsigned int cfq_fifo_expire[2];
 	unsigned int cfq_back_penalty;
 	unsigned int cfq_back_max;
 	/* base slice length, indexed by sync (see cfq_prio_slice()) */
 	unsigned int cfq_slice[2];
 	unsigned int cfq_slice_async_rq;
 	unsigned int cfq_slice_idle;
 	unsigned int cfq_group_idle;
 	unsigned int cfq_latency;
 	/*
 	 * Fallback dummy cfqq for extreme OOM conditions
 	 */
 	struct cfq_queue oom_cfqq;
 	unsigned long last_delayed_sync;
 };
 /* Forward declaration; the definition is not in this part of the file. */
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
 /*
  * Pick the service tree of @cfqg for a priority class and workload type.
  * IDLE_WORKLOAD always maps to the single service_tree_idle (@type is
  * ignored).  Returns NULL when @cfqg is NULL.
  */
 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
 					    enum wl_prio_t prio,
 					    enum wl_type_t type)
 {
 	if (cfqg == NULL)
 		return NULL;
 	return prio == IDLE_WORKLOAD ? &cfqg->service_tree_idle
 				     : &cfqg->service_trees[prio][type];
 }
 /*
  * Bit numbers (not masks) within cfq_queue->flags.  The accessors
  * generated by CFQ_CFQQ_FNS() shift 1 by these values.
  */
 enum cfqq_state_flags {
 	CFQ_CFQQ_FLAG_on_rr = 0,	/* on round-robin busy list */
 	CFQ_CFQQ_FLAG_wait_request,	/* waiting for a request */
 	CFQ_CFQQ_FLAG_must_dispatch,	/* must be allowed a dispatch */
 	CFQ_CFQQ_FLAG_must_alloc_slice,	/* per-slice must_alloc flag */
 	CFQ_CFQQ_FLAG_fifo_expire,	/* FIFO checked in this slice */
 	CFQ_CFQQ_FLAG_idle_window,	/* slice idling enabled */
 	CFQ_CFQQ_FLAG_prio_changed,	/* task priority has changed */
 	CFQ_CFQQ_FLAG_slice_new,	/* no requests dispatched in slice */
 	CFQ_CFQQ_FLAG_sync,		/* synchronous queue */
 	CFQ_CFQQ_FLAG_coop,		/* cfqq is shared */
 	CFQ_CFQQ_FLAG_split_coop,	/* shared cfqq will be splitted */
 	CFQ_CFQQ_FLAG_deep,		/* sync cfqq experienced large depth */
 	CFQ_CFQQ_FLAG_wait_busy,	/* Waiting for next request */
 };
 /*
  * CFQ_CFQQ_FNS(name) generates three accessors for the cfq_queue flag bit
  * CFQ_CFQQ_FLAG_<name>:
  *   cfq_mark_cfqq_<name>()  - set the bit
  *   cfq_clear_cfqq_<name>() - clear the bit
  *   cfq_cfqq_<name>()       - return 1 if the bit is set, 0 otherwise
  * The accessors are plain read-modify-write operations on ->flags.
  */
 #define CFQ_CFQQ_FNS(name)						\
 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq)		\
 {									\
 	(cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name);			\
 }									\
 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq)	\
 {									\
 	(cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name);			\
 }									\
 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq)		\
 {									\
 	return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0;	\
 }
 /* one accessor triple per entry of enum cfqq_state_flags */
 CFQ_CFQQ_FNS(on_rr);
 CFQ_CFQQ_FNS(wait_request);
 CFQ_CFQQ_FNS(must_dispatch);
 CFQ_CFQQ_FNS(must_alloc_slice);
 CFQ_CFQQ_FNS(fifo_expire);
 CFQ_CFQQ_FNS(idle_window);
 CFQ_CFQQ_FNS(prio_changed);
 CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 CFQ_CFQQ_FNS(coop);
 CFQ_CFQQ_FNS(split_coop);
 CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
 #if defined(CONFIG_CFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
 /* cfqg stats flags: bit numbers (not masks) within cfqg_stats->flags */
 enum cfqg_stats_flags {
 	CFQG_stats_waiting = 0,
 	CFQG_stats_idling,
 	CFQG_stats_empty,
 };
 #define CFQG_FLAG_FNS(name)						\
 static inline void cfqg_stats_mark_##name(struct cfqg_stats *stats)	\
 {									\
 	stats->flags |= (1 << CFQG_stats_##name);			\
 }									\
 static inline void cfqg_stats_clear_##name(struct cfqg_stats *stats)	\
 {									\
 	stats->flags &= ~(1 << CFQG_stats_##name);			\
 }									\
 static inline int cfqg_stats_##name(struct cfqg_stats *stats)		\
 {									\
 	return (stats->flags & (1 << CFQG_stats_##name)) != 0;		\
 }									\
 CFQG_FLAG_FNS(waiting)
 CFQG_FLAG_FNS(idling)
 CFQG_FLAG_FNS(empty)
 #undef CFQG_FLAG_FNS
 /*
  * If the group is flagged as waiting, account the time elapsed since
  * start_group_wait_time and clear the flag.
  * This should be called with the queue_lock held.
  */
 static void cfqg_stats_update_group_wait_time(struct cfqg_stats *stats)
 {
 	if (cfqg_stats_waiting(stats)) {
 		unsigned long long now = sched_clock();
 		/* skip the sample if the clock did not move forward */
 		if (time_after64(now, stats->start_group_wait_time))
 			blkg_stat_add(&stats->group_wait_time,
 				      now - stats->start_group_wait_time);
 		cfqg_stats_clear_waiting(stats);
 	}
 }
 /*
  * Start the group-wait clock for @cfqg unless it is already running or
  * @cfqg is @curr_cfqg.
  * This should be called with the queue_lock held.
  */
 static void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg,
 						 struct cfq_group *curr_cfqg)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 	if (cfqg_stats_waiting(stats) || cfqg == curr_cfqg)
 		return;
 	stats->start_group_wait_time = sched_clock();
 	cfqg_stats_mark_waiting(stats);
 }
 /*
  * If the group is flagged as empty, account the time elapsed since
  * start_empty_time and clear the flag.
  * This should be called with the queue_lock held.
  */
 static void cfqg_stats_end_empty_time(struct cfqg_stats *stats)
 {
 	if (cfqg_stats_empty(stats)) {
 		unsigned long long now = sched_clock();
 		/* skip the sample if the clock did not move forward */
 		if (time_after64(now, stats->start_empty_time))
 			blkg_stat_add(&stats->empty_time,
 				      now - stats->start_empty_time);
 		cfqg_stats_clear_empty(stats);
 	}
 }
 static void cfqg_stats_update_dequeue(struct cfq_group *cfqg)
 {
 	blkg_stat_add(&cfqg->stats.dequeue, 1);
 }
 /*
  * Start the empty-time clock for @cfqg, but only when nothing is queued
  * and the clock is not already running.
  */
 static void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 	/*
 	 * Nothing to do while IO is queued.  The group may also already be
 	 * marked empty: this can happen if cfqq got new request in parent
 	 * group and moved to this group while being added to service tree.
 	 * Just ignore the event and move on.
 	 */
 	if (blkg_rwstat_sum(&stats->queued) || cfqg_stats_empty(stats))
 		return;
 	stats->start_empty_time = sched_clock();
 	cfqg_stats_mark_empty(stats);
 }
 /*
  * If the group is flagged as idling, account the time elapsed since
  * start_idle_time and clear the flag.
  */
 static void cfqg_stats_update_idle_time(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *stats = &cfqg->stats;
 	unsigned long long now;
 	if (!cfqg_stats_idling(stats))
 		return;
 	now = sched_clock();
 	/* skip the sample if the clock did not move forward */
 	if (time_after64(now, stats->start_idle_time))
 		blkg_stat_add(&stats->idle_time,
 			      now - stats->start_idle_time);
 	cfqg_stats_clear_idling(stats);
 }
 /*
  * Start the idle-time clock for @cfqg.  It is a BUG to call this while the
  * group is already flagged as idling.
  */
 static void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *st = &cfqg->stats;
 	BUG_ON(cfqg_stats_idling(st));
 	st->start_idle_time = sched_clock();
 	cfqg_stats_mark_idling(st);
 }
 /*
  * Take one average-queue-size sample (current number of queued IOs) and
  * close any pending group-wait interval.
  */
 static void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg)
 {
 	struct cfqg_stats *st = &cfqg->stats;
 	blkg_stat_add(&st->avg_queue_size_sum, blkg_rwstat_sum(&st->queued));
 	blkg_stat_add(&st->avg_queue_size_samples, 1);
 	cfqg_stats_update_group_wait_time(st);
 }
 #else	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 /*
  * No-op replacements used when the debug statistics are compiled out, so
  * callers do not need their own #ifdefs.
  */
 static inline void cfqg_stats_set_start_group_wait_time(struct cfq_group *cfqg, struct cfq_group *curr_cfqg) { }
 static inline void cfqg_stats_end_empty_time(struct cfqg_stats *stats) { }
 static inline void cfqg_stats_update_dequeue(struct cfq_group *cfqg) { }
 static inline void cfqg_stats_set_start_empty_time(struct cfq_group *cfqg) { }
 static inline void cfqg_stats_update_idle_time(struct cfq_group *cfqg) { }
 static inline void cfqg_stats_set_start_idle_time(struct cfq_group *cfqg) { }
 static inline void cfqg_stats_update_avg_queue_size(struct cfq_group *cfqg) { }
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 /* Return the CFQ policy data (cfq_group) attached to @blkg. */
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_pdata(blkg, &blkio_policy_cfq);
 	return cfqg;
 }
 /* Return the blkio_group that @cfqg is the policy data of. */
 static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg)
 {
 	struct blkio_group *blkg = pdata_to_blkg(cfqg);
 	return blkg;
 }
 /*
  * Take a reference on the blkio_group backing @cfqg.
  *
  * The call is a plain statement: "return <expr>;" is not permitted in a
  * function returning void in ISO C.
  */
 static inline void cfqg_get(struct cfq_group *cfqg)
 {
 	blkg_get(cfqg_to_blkg(cfqg));
 }
 /*
  * Drop a reference on the blkio_group backing @cfqg.
  *
  * The call is a plain statement: "return <expr>;" is not permitted in a
  * function returning void in ISO C.
  */
 static inline void cfqg_put(struct cfq_group *cfqg)
 {
 	blkg_put(cfqg_to_blkg(cfqg));
 }
 /*
  * Blktrace message helpers for the group-scheduling build.
  *
  * cfq_log_cfqq() prefixes the message with "cfq<pid><S|A> <blkg path>",
  * where S/A tells whether the queue is sync or async.
  * cfq_log_cfqg() prefixes the message with the group's blkg path.
  *
  * Neither definition may end in a backslash: a trailing line continuation
  * splices the following source line into the macro body.
  */
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
 			cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
 			blkg_path(cfqg_to_blkg((cfqq)->cfqg)), ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)				\
 	blk_add_trace_msg((cfqd)->queue, "%s " fmt,			\
 			blkg_path(cfqg_to_blkg((cfqg))), ##args)
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 					    struct cfq_group *curr_cfqg, int rw)
 {
 	blkg_rwstat_add(&cfqg->stats.queued, rw, 1);
 	cfqg_stats_end_empty_time(&cfqg->stats);
 	cfqg_stats_set_start_group_wait_time(cfqg, curr_cfqg);
 }
 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 			unsigned long time, unsigned long unaccounted_time)
 {
 	blkg_stat_add(&cfqg->stats.time, time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	blkg_stat_add(&cfqg->stats.unaccounted_time, unaccounted_time);
 #endif
 }
 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw)
 {
 	blkg_rwstat_add(&cfqg->stats.queued, rw, -1);
 }
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw)
 {
 	blkg_rwstat_add(&cfqg->stats.merged, rw, 1);
 }
 /*
  * Account a dispatched IO of @bytes bytes and direction @rw: sectors
  * (bytes >> 9), one serviced IO and the byte count.
  */
 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
 					      uint64_t bytes, int rw)
 {
 	struct cfqg_stats *st = &cfqg->stats;
 	blkg_stat_add(&st->sectors, bytes >> 9);
 	blkg_rwstat_add(&st->serviced, rw, 1);
 	blkg_rwstat_add(&st->service_bytes, rw, bytes);
 }
 /*
  * Account a completed IO of direction @rw: service time is now minus
  * @io_start_time, wait time is @io_start_time minus @start_time.  Each
  * sample is skipped unless its later stamp is strictly after the earlier.
  */
 static inline void cfqg_stats_update_completion(struct cfqg_group_placeholder_unused *unused_never_declared);
 static void cfqg_stats_reset(struct blkio_group *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 	struct cfqg_stats *stats = &cfqg->stats;
 	/* queued stats shouldn't be cleared */
 	blkg_rwstat_reset(&stats->service_bytes);
 	blkg_rwstat_reset(&stats->serviced);
 	blkg_rwstat_reset(&stats->merged);
 	blkg_rwstat_reset(&stats->service_time);
 	blkg_rwstat_reset(&stats->wait_time);
 	blkg_stat_reset(&stats->time);
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	blkg_stat_reset(&stats->unaccounted_time);
 	blkg_stat_reset(&stats->avg_queue_size_sum);
 	blkg_stat_reset(&stats->avg_queue_size_samples);
 	blkg_stat_reset(&stats->dequeue);
 	blkg_stat_reset(&stats->group_wait_time);
 	blkg_stat_reset(&stats->idle_time);
 	blkg_stat_reset(&stats->empty_time);
 #endif
 }
 #else	/* CONFIG_CFQ_GROUP_IOSCHED */
 /*
  * Replacements used when group scheduling is compiled out: the converters
  * return NULL, the refcount and statistics helpers do nothing,
  * cfq_log_cfqq() logs only the pid and cfq_log_cfqg() expands to an empty
  * statement.
  */
 static inline struct cfq_group *blkg_to_cfqg(struct blkio_group *blkg) { return NULL; }
 static inline struct blkio_group *cfqg_to_blkg(struct cfq_group *cfqg) { return NULL; }
 static inline void cfqg_get(struct cfq_group *cfqg) { }
 static inline void cfqg_put(struct cfq_group *cfqg) { }
 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...)		do {} while (0)
 static inline void cfqg_stats_update_io_add(struct cfq_group *cfqg,
 			struct cfq_group *curr_cfqg, int rw) { }
 static inline void cfqg_stats_update_timeslice_used(struct cfq_group *cfqg,
 			unsigned long time, unsigned long unaccounted_time) { }
 static inline void cfqg_stats_update_io_remove(struct cfq_group *cfqg, int rw) { }
 static inline void cfqg_stats_update_io_merged(struct cfq_group *cfqg, int rw) { }
 static inline void cfqg_stats_update_dispatch(struct cfq_group *cfqg,
 					      uint64_t bytes, int rw) { }
 static inline void cfqg_stats_update_completion(struct cfq_group *cfqg,
 			uint64_t start_time, uint64_t io_start_time, int rw) { }
 #endif	/* CONFIG_CFQ_GROUP_IOSCHED */
 #define cfq_log(cfqd, fmt, args...)	\
 	blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
 /*
  * Traverses through cfq group service trees.
  *
  * Visits service_trees[i][j] for every class i below IDLE_WORKLOAD and
  * every workload type j up to SYNC_WORKLOAD, then service_tree_idle once.
  * @i and @j are caller-provided integer loop variables and @st receives
  * the current tree.  The macro is a loop header: follow it with the loop
  * body.
  *
  * The definition must not end in a backslash, otherwise the source line
  * that follows the macro is spliced into it.
  */
 #define for_each_cfqg_st(cfqg, i, j, st) \
 	for (i = 0; i <= IDLE_WORKLOAD; i++) \
 		for (j = 0, st = i < IDLE_WORKLOAD ? \
 			&(cfqg)->service_trees[i][j] \
 			: &(cfqg)->service_tree_idle; \
 			(i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
 			(i == IDLE_WORKLOAD && j == 0); \
 			j++, st = i < IDLE_WORKLOAD ? \
 			&(cfqg)->service_trees[i][j] : NULL)
 /*
  * Is the mean think time in @ttime larger than the idle window?  The
  * window is cfq_group_idle when @group_idle is set, cfq_slice_idle
  * otherwise.  Returns false while there are too few samples.
  */
 static inline bool cfq_io_thinktime_big(struct cfq_data *cfqd,
 	struct cfq_ttime *ttime, bool group_idle)
 {
 	unsigned long window;
 	if (!sample_valid(ttime->ttime_samples))
 		return false;
 	window = group_idle ? cfqd->cfq_group_idle : cfqd->cfq_slice_idle;
 	return ttime->ttime_mean > window;
 }
 /*
  * True when fairness should be provided in terms of number of IOs rather
  * than time.
  */
 static inline bool iops_mode(struct cfq_data *cfqd)
 {
 	/*
 	 * If we are not idling on queues and it is a NCQ drive, parallel
 	 * execution of requests is on and measuring time is not possible
 	 * in most of the cases until and unless we drive shallower queue
 	 * depths and that becomes a performance bottleneck. In such cases
 	 * switch to start providing fairness in terms of number of IOs.
 	 */
 	return !cfqd->cfq_slice_idle && cfqd->hw_tag;
 }
 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
 {
 	if (cfq_class_idle(cfqq))
 		return IDLE_WORKLOAD;
 	if (cfq_class_rt(cfqq))
 		return RT_WORKLOAD;
 	return BE_WORKLOAD;
 }
 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
 {
 	if (!cfq_cfqq_sync(cfqq))
 		return ASYNC_WORKLOAD;
 	if (!cfq_cfqq_idle_window(cfqq))
 		return SYNC_NOIDLE_WORKLOAD;
 	return SYNC_WORKLOAD;
 }
 /*
  * Number of queues of priority class @wl on @cfqg's service trees: the
  * idle tree's count for IDLE_WORKLOAD, otherwise the sum over the three
  * workload types.  @cfqd is unused.
  */
 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
 					struct cfq_data *cfqd,
 					struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *row;
 	if (wl == IDLE_WORKLOAD)
 		return cfqg->service_tree_idle.count;
 	row = cfqg->service_trees[wl];
 	return row[ASYNC_WORKLOAD].count
 		+ row[SYNC_NOIDLE_WORKLOAD].count
 		+ row[SYNC_WORKLOAD].count;
 }
 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 					struct cfq_group *cfqg)
 {
 	return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
 		+ cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
 }
 /* Forward declarations; the definitions are not in this part of the file. */
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
 				       struct cfq_io_cq *cic, struct bio *bio,
 				       gfp_t gfp_mask);
 /* Convert an io_cq to the cfq_io_cq that embeds it. */
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
 	struct cfq_io_cq *cic;
 	/* cic->icq is the first member, %NULL will convert to %NULL */
 	cic = container_of(icq, struct cfq_io_cq, icq);
 	return cic;
 }
 /*
  * Look up the cfq_io_cq linking @ioc with @cfqd's queue.  Returns NULL
  * when @ioc is NULL.
  */
 static inline struct cfq_io_cq *cfq_cic_lookup(struct cfq_data *cfqd,
 					       struct io_context *ioc)
 {
 	if (!ioc)
 		return NULL;
 	return icq_to_cic(ioc_lookup_icq(ioc, cfqd->queue));
 }
 /* Return @cic's sync (@is_sync true) or async (@is_sync false) queue. */
 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_cq *cic, bool is_sync)
 {
 	struct cfq_queue **slot = &cic->cfqq[is_sync];
 	return *slot;
 }
 /* Store @cfqq as @cic's sync (@is_sync true) or async queue. */
 static inline void cic_set_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq,
 				bool is_sync)
 {
 	struct cfq_queue **slot = &cic->cfqq[is_sync];
 	*slot = cfqq;
 }
 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_cq *cic)
 {
 	return cic->icq.q->elevator->elevator_data;
 }
 /*
  * We regard a request as SYNC, if it's either a read or has the SYNC bit
  * set (in which case it could also be direct WRITE).
  */
 static inline bool cfq_bio_sync(struct bio *bio)
 {
 	return bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC);
 }
 /*
  * scheduler run of queue, if there are requests pending and no one in the
  * driver that will restart queueing
  */
 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
 {
 	/* nothing to kick when no queue is busy */
 	if (!cfqd->busy_queues)
 		return;
 	cfq_log(cfqd, "schedule dispatch");
 	kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
 }
 /*
  * Scale schedule slice based on io priority. Use the sync time slice only
  * if a queue is marked sync and has sync io queued. A sync queue with async
  * io only, should not get full sync slice length.
  *
  * The result is the base slice for @sync plus (4 - @prio) steps of
  * base/CFQ_SLICE_SCALE.  Warns when @prio is not below IOPRIO_BE_NR.
  */
 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
 				 unsigned short prio)
 {
 	const int base = cfqd->cfq_slice[sync];
 	const int step = base / CFQ_SLICE_SCALE;
 	WARN_ON(prio >= IOPRIO_BE_NR);
 	return base + step * (4 - prio);
 }
 /*
  * Priority-scaled slice for @cfqq: cfq_prio_slice() evaluated with the
  * queue's own sync flag and ioprio.
  */
 static inline int
 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 }
 /*
  * Convert the service amount @delta into a group virtual-time charge:
  * shift it left by CFQ_SERVICE_SHIFT, multiply by CFQ_WEIGHT_DEFAULT and
  * divide by the group's weight, so a larger weight yields a smaller charge.
  * The shift is performed in unsigned long before widening to u64.
  */
 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
 {
 	u64 charge = delta << CFQ_SERVICE_SHIFT;
 	charge *= CFQ_WEIGHT_DEFAULT;
 	do_div(charge, cfqg->weight);
 	return charge;
 }
 /*
  * Return whichever of the two virtual disk times is later. The ordering
  * is decided from the signed 64-bit difference, so it is taken modulo
  * 2^64; on a tie @min_vdisktime is returned.
  */
 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
 {
 	if ((s64)(vdisktime - min_vdisktime) > 0)
 		return vdisktime;
 	return min_vdisktime;
 }
 /*
  * Return whichever of the two virtual disk times is earlier. The ordering
  * is decided from the signed 64-bit difference, so it is taken modulo
  * 2^64; on a tie @min_vdisktime is returned.
  */
 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
 {
 	if ((s64)(vdisktime - min_vdisktime) < 0)
 		return vdisktime;
 	return min_vdisktime;
 }
 /*
  * Advance st->min_vdisktime to the vdisktime of the cached leftmost group,
  * never moving it backwards. Nothing happens when no leftmost node is
  * cached in st->left.
  */
 static void update_min_vdisktime(struct cfq_rb_root *st)
 {
 	if (!st->left)
 		return;
 	st->min_vdisktime = max_vdisktime(st->min_vdisktime,
 					  rb_entry_cfqg(st->left)->vdisktime);
 }
 /*
  * Update and return the averaged number of busy queues in @cfqg for the
  * RT class (@rt true) or the BE class (@rt false).
  *
  * The new average weights the larger of (old average, current busy count)
  * by cfq_hist_divisor - 1 and the smaller by 1, rounded to nearest. It
  * therefore follows sudden increases quickly and decreases slowly.
  */
 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
 					struct cfq_group *cfqg, bool rt)
 {
 	unsigned mult  = cfq_hist_divisor - 1;
 	unsigned round = cfq_hist_divisor / 2;
 	unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
 	unsigned prev = cfqg->busy_queues_avg[rt];
 	unsigned lo, hi;
 	if (prev < busy) {
 		lo = prev;
 		hi = busy;
 	} else {
 		lo = busy;
 		hi = prev;
 	}
 	cfqg->busy_queues_avg[rt] = (mult * hi + lo + round) /
 		cfq_hist_divisor;
 	return cfqg->busy_queues_avg[rt];
 }
 /*
  * Share of cfq_target_latency given to @cfqg: proportional to the group's
  * weight relative to the total weight recorded in the group service tree.
  * NOTE(review): divides by st->total_weight with no zero check; presumably
  * callers only pass a group that is currently on the tree - confirm.
  */
 static inline unsigned
 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	return cfq_target_latency * cfqg->weight / st->total_weight;
 }
 /*
  * Slice length for @cfqq: the priority-based slice, shrunk when
  * cfqd->cfq_latency is set and the expected latency of all interested
  * queues exceeds the group's slice.
  */
 static inline unsigned
 cfq_scaled_cfqq_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
 	unsigned iq, sync_slice, expect_latency, group_slice;
 	unsigned base_low_slice, low_slice;
 	if (!cfqd->cfq_latency)
 		return slice;
 	/*
 	 * interested queues: only those of the same priority class within
 	 * this cfq group are considered
 	 */
 	iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, cfq_class_rt(cfqq));
 	sync_slice = cfqd->cfq_slice[1];
 	expect_latency = sync_slice * iq;
 	group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
 	if (expect_latency <= group_slice)
 		return slice;
 	/* lower bound, scaled by IO priority and sync vs async */
 	base_low_slice = 2 * cfqd->cfq_slice_idle;
 	low_slice = min(slice, base_low_slice * slice / sync_slice);
 	/* shrink the slice so all iqs fit into the target latency */
 	return max(slice * group_slice / expect_latency, low_slice);
 }
 /*
  * Start a new slice for @cfqq now: record the scaled slice length as the
  * allocated slice and set slice_start/slice_end relative to jiffies.
  */
 static inline void
 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned len = cfq_scaled_cfqq_slice(cfqd, cfqq);
 	cfqq->slice_start = jiffies;
 	cfqq->slice_end = jiffies + len;
 	cfqq->allocated_slice = len;
 	cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
 }
 /*
  * True once the queue's slice has run out. ->slice_end is not valid until
  * the first request of the dispatch is activated and the slice time set,
  * so a queue still flagged slice_new is never reported as used.
  */
 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
 {
 	return !cfq_cfqq_slice_new(cfqq) &&
 	       !time_before(jiffies, cfqq->slice_end);
 }
 /*
  * Lifted from AS - choose which of rq1 and rq2 that is best served now.
  * We choose the request that is closest to the head right now. Distance
  * behind the head is penalized and only allowed to a certain extent.
  *
  * @cfqd: supplies the cfq_back_max and cfq_back_penalty tunables
  * @rq1, @rq2: candidates; if one is NULL (or both are the same request)
  *             the other is returned, which is NULL when both are NULL
  * @last: sector position that distances are measured from
  *
  * Before distance is considered, a sync request beats an async one, and
  * a REQ_PRIO request beats one without REQ_PRIO.
  */
 static struct request *
 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
 {
 	sector_t s1, s2, d1 = 0, d2 = 0;
 	unsigned long back_max;
 #define CFQ_RQ1_WRAP	0x01 /* request 1 wraps */
 #define CFQ_RQ2_WRAP	0x02 /* request 2 wraps */
 	unsigned wrap = 0; /* bit mask: requests behind the disk head? */
 	if (rq1 == NULL || rq1 == rq2)
 		return rq2;
 	if (rq2 == NULL)
 		return rq1;
 	/* exactly one of the two is sync: it wins */
 	if (rq_is_sync(rq1) != rq_is_sync(rq2))
 		return rq_is_sync(rq1) ? rq1 : rq2;
 	/* exactly one of the two carries REQ_PRIO: it wins */
 	if ((rq1->cmd_flags ^ rq2->cmd_flags) & REQ_PRIO)
 		return rq1->cmd_flags & REQ_PRIO ? rq1 : rq2;
 	s1 = blk_rq_pos(rq1);
 	s2 = blk_rq_pos(rq2);
 	/*
 	 * by definition, 1KiB is 2 sectors
 	 */
 	back_max = cfqd->cfq_back_max * 2;
 	/*
 	 * Strict one way elevator _except_ in the case where we allow
 	 * short backward seeks which are biased as twice the cost of a
 	 * similar forward seek.
 	 */
 	/*
 	 * A request more than back_max sectors behind @last is marked as
 	 * wrapped; its distance stays 0 and is not used for comparison.
 	 */
 	if (s1 >= last)
 		d1 = s1 - last;
 	else if (s1 + back_max >= last)
 		d1 = (last - s1) * cfqd->cfq_back_penalty;
 	else
 		wrap |= CFQ_RQ1_WRAP;
 	if (s2 >= last)
 		d2 = s2 - last;
 	else if (s2 + back_max >= last)
 		d2 = (last - s2) * cfqd->cfq_back_penalty;
 	else
 		wrap |= CFQ_RQ2_WRAP;
 	/* Found required data */
 	/*
 	 * By doing switch() on the bit mask "wrap" we avoid having to
 	 * check two variables for all permutations: --> faster!
 	 */
 	switch (wrap) {
 	case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
 		if (d1 < d2)
 			return rq1;
 		else if (d2 < d1)
 			return rq2;
 		else {
 			/* equal distance: prefer the higher start sector */
 			if (s1 >= s2)
 				return rq1;
 			else
 				return rq2;
 		}
 	case CFQ_RQ2_WRAP:
 		/* only rq2 is wrapped, so rq1 is the better choice */
 		return rq1;
 	case CFQ_RQ1_WRAP:
 		/* only rq1 is wrapped, so rq2 is the better choice */
 		return rq2;
 	case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
 	default:
 		/*
 		 * Since both rqs are wrapped,
 		 * start with the one that's further behind head
 		 * (--> only *one* back seek required),
 		 * since back seek takes more time than forward.
 		 */
 		if (s1 <= s2)
 			return rq1;
 		else
 			return rq2;
 	}
 }
 /*
  * rbtree add-on that caches the leftmost node.
  *
  * Return the first queue of @root, refreshing the cached leftmost pointer
  * from the tree when it is not set. Returns NULL for an empty service tree.
  */
 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
 {
 	struct rb_node *leftmost;
 	/* Service tree is empty */
 	if (!root->count)
 		return NULL;
 	leftmost = root->left;
 	if (!leftmost) {
 		leftmost = rb_first(&root->rb);
 		root->left = leftmost;
 	}
 	return leftmost ? rb_entry(leftmost, struct cfq_queue, rb_node) : NULL;
 }
 /*
  * Return the first group of the group service tree @root, refreshing the
  * cached leftmost pointer from the tree when it is not set. Returns NULL
  * when the tree has no nodes.
  */
 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
 {
 	struct rb_node *leftmost = root->left;
 	if (!leftmost) {
 		leftmost = rb_first(&root->rb);
 		root->left = leftmost;
 	}
 	return leftmost ? rb_entry_cfqg(leftmost) : NULL;
 }
 /*
  * Remove @n from @root and mark the node as cleared, so that a later
  * RB_EMPTY_NODE() test on it reports that it is off any tree.
  */
 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
 {
 	rb_erase(n, root);
 	RB_CLEAR_NODE(n);
 }
 /*
  * Remove @n from the cfq tree @root: drop the cached leftmost pointer if
  * it referred to @n, erase and clear the node, and decrement the count.
  */
 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
 {
 	/* the cache is recomputed lazily by the cfq_rb_first*() helpers */
 	if (root->left == n)
 		root->left = NULL;
 	rb_erase_init(n, &root->rb);
 	--root->count;
 }
 /*
  * Pick the request to serve after @last from @cfqq's sorted list: the
  * better (per cfq_choose_req(), measured from @last's position) of the
  * sorted-order neighbours of @last. When @last has no successor, the
  * first request in the list stands in for it, unless that is @last itself.
  *
  * It would be nice to take fifo expire time into account as well.
  */
 static struct request *
 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		  struct request *last)
 {
 	struct rb_node *rbnext = rb_next(&last->rb_node);
 	struct rb_node *rbprev = rb_prev(&last->rb_node);
 	struct request *next = NULL, *prev = NULL;
 	BUG_ON(RB_EMPTY_NODE(&last->rb_node));
 	if (rbprev)
 		prev = rb_entry_rq(rbprev);
 	if (!rbnext) {
 		/* wrap around to the start of the sorted list */
 		rbnext = rb_first(&cfqq->sort_list);
 		if (rbnext == &last->rb_node)
 			rbnext = NULL;
 	}
 	if (rbnext)
 		next = rb_entry_rq(rbnext);
 	return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
 }
 /*
  * Offset used when keying @cfqq into its service tree: the gap between
  * the slice of a sync, priority-0 queue and this queue's own slice,
  * multiplied by the number of other queues in the group.
  */
 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
 				      struct cfq_queue *cfqq)
 {
 	int best_slice = cfq_prio_slice(cfqd, 1, 0);
 	int own_slice = cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
 	/*
 	 * just an approximation, should be ok.
 	 */
 	return (cfqq->cfqg->nr_cfqq - 1) * (best_slice - own_slice);
 }
 /*
  * Sort key of @cfqg in group service tree @st: its vdisktime relative to
  * the tree's min_vdisktime, as a signed 64-bit value.
  */
 static inline s64
 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	return cfqg->vdisktime - st->min_vdisktime;
 }
 static void
 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	struct rb_node **node = &st->rb.rb_node;
 	struct rb_node *parent = NULL;
 	struct cfq_group *__cfqg;
 	s64 key = cfqg_key(st, cfqg);
 	int left = 1;
 	while (*node != NULL) {
 		parent = *node;
 		__cfqg = rb_entry_cfqg(parent);
 		if (key < cfqg_key(st, __cfqg))
 			node = &parent->rb_left;
 		else {
 			node = &parent->rb_right;
 			left = 0;
 		}
 	}
 	if (left)
 		st->left = &cfqg->rb_node;
 	rb_link_node(&cfqg->rb_node, parent, node);
 	rb_insert_color(&cfqg->rb_node, &st->rb);
 }
 /*
  * Apply a pending weight change (cfqg->new_weight, non-zero when one is
  * requested) to @cfqg. The group must be off the service tree.
  */
 static void
 cfq_update_group_weight(struct cfq_group *cfqg)
 {
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 	if (!cfqg->new_weight)
 		return;
 	cfqg->weight = cfqg->new_weight;
 	cfqg->new_weight = 0;
 }
 /*
  * Put @cfqg on the group service tree @st. The group must currently be
  * off the tree. Any pending weight change is applied first, so the weight
  * added to st->total_weight is the one the group is inserted with.
  */
 static void
 cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	BUG_ON(!RB_EMPTY_NODE(&cfqg->rb_node));
 	cfq_update_group_weight(cfqg);
 	__cfq_group_service_tree_add(st, cfqg);
 	st->total_weight += cfqg->weight;
 }
 static void
 cfq_group_notify_queue_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	struct cfq_group *__cfqg;
 	struct rb_node *n;
 	cfqg->nr_cfqq++;
 	if (!RB_EMPTY_NODE(&cfqg->rb_node))
 		return;
 	/*
 	 * Currently put the group at the end. Later implement something
 	 * so that groups get lesser vtime based on their weights, so that
 	 * if group does not loose all if it was not continuously backlogged.
 	 */
 	n = rb_last(&st->rb);
 	if (n) {
 		__cfqg = rb_entry_cfqg(n);
 		cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
 	} else
 		cfqg->vdisktime = st->min_vdisktime;
 	cfq_group_service_tree_add(st, cfqg);
 }
 /*
  * Take @cfqg off the group service tree @st. The group's weight is
  * subtracted from st->total_weight unconditionally; the node itself is
  * erased only if it is actually linked.
  */
 static void
 cfq_group_service_tree_del(struct cfq_rb_root *st, struct cfq_group *cfqg)
 {
 	st->total_weight -= cfqg->weight;
 	if (RB_EMPTY_NODE(&cfqg->rb_node))
 		return;
 	cfq_rb_erase(&cfqg->rb_node, st);
 }
 /*
  * Account one queue fewer in @cfqg. When that was the group's last queue,
  * remove the group from the group service tree, forget its saved workload
  * slice and update the dequeue statistic.
  */
 static void
 cfq_group_notify_queue_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	BUG_ON(cfqg->nr_cfqq < 1);
 	/* If there are other cfq queues under this group, don't delete it */
 	if (--cfqg->nr_cfqq)
 		return;
 	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
 	cfq_group_service_tree_del(&cfqd->grp_service_tree, cfqg);
 	cfqg->saved_workload_slice = 0;
 	cfqg_stats_update_dequeue(cfqg);
 }
 /*
  * Return how much slice time (in jiffies) @cfqq consumed, capped at its
  * allocated slice. Time beyond the allocation, and time between dispatch
  * start and slice start, is reported through @unaccounted_time.
  */
 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq,
 						unsigned int *unaccounted_time)
 {
 	unsigned int slice_used;
 	/*
 	 * Queue got expired before even a single request completed or
 	 * got expired immediately after first request completion.
 	 *
 	 * Also charge the seek time incurred to the group, otherwise
 	 * if there are multiple queues in the group, each can dispatch
 	 * a single request on seeky media and cause lots of seek time
 	 * and group will never know it.
 	 */
 	if (!cfqq->slice_start || cfqq->slice_start == jiffies)
 		return max_t(unsigned, (jiffies - cfqq->dispatch_start), 1);
 	slice_used = jiffies - cfqq->slice_start;
 	if (slice_used > cfqq->allocated_slice) {
 		/* overrun: charge only the allocation, report the rest */
 		*unaccounted_time = slice_used - cfqq->allocated_slice;
 		slice_used = cfqq->allocated_slice;
 	}
 	if (time_after(cfqq->slice_start, cfqq->dispatch_start))
 		*unaccounted_time += cfqq->slice_start -
 				cfqq->dispatch_start;
 	return slice_used;
 }
 /*
  * Charge the service that @cfqq just received to its group @cfqg and
  * re-sort the group in the group service tree.
  *
  * The charge is normally the slice time used. In iops mode it is the
  * number of requests dispatched in the slice instead, and for an async
  * queue in a group with nr_sync == 0 it is the whole allocated slice.
  * The remaining workload slice, workload type and priority are saved in
  * the group if the workload has not expired yet.
  */
 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
 				struct cfq_queue *cfqq)
 {
 	struct cfq_rb_root *st = &cfqd->grp_service_tree;
 	unsigned int used_sl, charge, unaccounted_sl = 0;
 	/*
 	 * queues in the group that are neither counted as busy async
 	 * queues nor sitting on the idle service tree
 	 */
 	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
 			- cfqg->service_tree_idle.count;
 	BUG_ON(nr_sync < 0);
 	used_sl = charge = cfq_cfqq_slice_usage(cfqq, &unaccounted_sl);
 	if (iops_mode(cfqd))
 		charge = cfqq->slice_dispatch;
 	else if (!cfq_cfqq_sync(cfqq) && !nr_sync)
 		charge = cfqq->allocated_slice;
 	/* Can't update vdisktime while group is on service tree */
 	cfq_group_service_tree_del(st, cfqg);
 	cfqg->vdisktime += cfq_scale_slice(charge, cfqg);
 	/* If a new weight was requested, update now, off tree */
 	cfq_group_service_tree_add(st, cfqg);
 	/* This group is being expired. Save the context */
 	if (time_after(cfqd->workload_expires, jiffies)) {
 		cfqg->saved_workload_slice = cfqd->workload_expires
 						- jiffies;
 		cfqg->saved_workload = cfqd->serving_type;
 		cfqg->saved_serving_prio = cfqd->serving_prio;
 	} else
 		cfqg->saved_workload_slice = 0;
 	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
 					st->min_vdisktime);
 	cfq_log_cfqq(cfqq->cfqd, cfqq,
 		     "sl_used=%u disp=%u charge=%u iops=%u sect=%lu",
 		     used_sl, cfqq->slice_dispatch, charge,
 		     iops_mode(cfqd), cfqq->nr_sectors);
 	/* statistics always record the time used, not the adjusted charge */
 	cfqg_stats_update_timeslice_used(cfqg, used_sl, unaccounted_sl);
 	cfqg_stats_set_start_empty_time(cfqg);
 }
 /**
  * cfq_init_cfqg_base - initialize base part of a cfq_group
  * @cfqg: cfq_group to initialize
  *
  * Initialize the base part which is used whether %CONFIG_CFQ_GROUP_IOSCHED
  * is enabled or not.
  *
  * Resets every service tree of the group to %CFQ_RB_ROOT, marks the
  * group's own rb_node as not linked, and stamps ttime.last_end_request
  * with the current jiffies.
  */
 static void cfq_init_cfqg_base(struct cfq_group *cfqg)
 {
 	struct cfq_rb_root *st;
 	int i, j;
 	/* the macro visits each service tree of the group in turn */
 	for_each_cfqg_st(cfqg, i, j, st)
 		*st = CFQ_RB_ROOT;
 	RB_CLEAR_NODE(&cfqg->rb_node);
 	cfqg->ttime.last_end_request = jiffies;
 }
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 /*
  * Initialize the cfq_group behind @blkg: set up the base part and take
  * the initial weight from the owning cgroup's cfq_weight.
  */
 static void cfq_init_blkio_group(struct blkio_group *blkg)
 {
 	struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 	cfq_init_cfqg_base(cfqg);
 	cfqg->weight = blkg->blkcg->cfq_weight;
 }
 /*
  * Find the cfq group for @blkcg on @cfqd's queue, creating it through
  * blkg_lookup_create() if needed. request_queue lock must be held.
  *
  * Returns the group, or NULL when blkg_lookup_create() reports an error.
  */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 						struct blkio_cgroup *blkcg)
 {
 	struct blkio_group *blkg;
 	/* avoid lookup for the common case where there's no blkio cgroup */
 	if (blkcg == &blkio_root_cgroup)
 		return cfqd->root_group;
 	blkg = blkg_lookup_create(blkcg, cfqd->queue, false);
 	if (IS_ERR(blkg))
 		return NULL;
 	return blkg_to_cfqg(blkg);
 }
 /*
  * Attach @cfqq to a group and take a group reference on the queue's
  * behalf. Sync queues go to @cfqg; async queues are redirected to the
  * root group.
  */
 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
 {
 	struct cfq_group *target = cfqg;
 	/* Currently, all async queues are mapped to root group */
 	if (!cfq_cfqq_sync(cfqq))
 		target = cfqq->cfqd->root_group;
 	cfqq->cfqg = target;
 	/* cfqq reference on cfqg */
 	cfqg_get(target);
 }
 /*
  * Print callback for "weight_device": emit the per-device weight of the
  * group in @pdata, or nothing (returning 0) when no per-device weight
  * is set.
  */
 static u64 cfqg_prfill_weight_device(struct seq_file *sf, void *pdata, int off)
 {
 	struct cfq_group *cfqg = pdata;
 	return cfqg->dev_weight ?
 		__blkg_prfill_u64(sf, pdata, cfqg->dev_weight) : 0;
 }
 /*
  * Read handler for "weight_device": print the per-device weights of all
  * groups of @cgrp's blkio cgroup, without a total. Always returns 0.
  */
 static int cfqg_print_weight_device(struct cgroup *cgrp, struct cftype *cft,
 				    struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_weight_device,
 			  &blkio_policy_cfq, 0, false);
 	return 0;
 }
 static int cfq_print_weight(struct cgroup *cgrp, struct cftype *cft,
 			    struct seq_file *sf)
 {
 	seq_printf(sf, "%u\n", cgroup_to_blkio_cgroup(cgrp)->cfq_weight);
 	return 0;
 }
 /*
  * Write handler for "weight_device": parse @buf with blkg_conf_prep() and
  * set the per-device weight of the addressed group.
  *
  * A value of 0 clears the per-device weight, making the group fall back
  * to the cgroup-wide weight. Any other value must lie within
  * [CFQ_WEIGHT_MIN, CFQ_WEIGHT_MAX]. The change is staged in ->new_weight.
  *
  * Returns 0 on success, the blkg_conf_prep() error, or -EINVAL for a
  * missing group or an out-of-range value.
  */
 static int cfqg_set_weight_device(struct cgroup *cgrp, struct cftype *cft,
 				  const char *buf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkg_conf_ctx ctx;
 	struct cfq_group *cfqg;
 	int ret = blkg_conf_prep(blkcg, &blkio_policy_cfq, buf, &ctx);
 	if (ret)
 		return ret;
 	cfqg = blkg_to_cfqg(ctx.blkg);
 	if (!cfqg) {
 		ret = -EINVAL;
 	} else if (ctx.v && (ctx.v < CFQ_WEIGHT_MIN ||
 			     ctx.v > CFQ_WEIGHT_MAX)) {
 		ret = -EINVAL;
 	} else {
 		cfqg->dev_weight = ctx.v;
 		cfqg->new_weight = cfqg->dev_weight ?: blkcg->cfq_weight;
 		ret = 0;
 	}
 	blkg_conf_finish(&ctx);
 	return ret;
 }
 /*
  * Write handler for "weight": set the cgroup-wide weight and stage it as
  * the new weight of every group that has no per-device override.
  *
  * Returns -EINVAL when @val is outside [CFQ_WEIGHT_MIN, CFQ_WEIGHT_MAX],
  * 0 otherwise. The update runs under blkcg->lock with interrupts off.
  */
 static int cfq_set_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	struct blkio_group *blkg;
 	struct hlist_node *pos;
 	if (val < CFQ_WEIGHT_MIN || val > CFQ_WEIGHT_MAX)
 		return -EINVAL;
 	spin_lock_irq(&blkcg->lock);
 	blkcg->cfq_weight = (unsigned int)val;
 	hlist_for_each_entry(blkg, pos, &blkcg->blkg_list, blkcg_node) {
 		struct cfq_group *cfqg = blkg_to_cfqg(blkg);
 		/* a per-device weight takes precedence over the default */
 		if (!cfqg || cfqg->dev_weight)
 			continue;
 		cfqg->new_weight = blkcg->cfq_weight;
 	}
 	spin_unlock_irq(&blkcg->lock);
 	return 0;
 }
 /*
  * Read handler for single-value statistics: print, for every group of
  * @cgrp's blkio cgroup, the stat selected by cft->private, without a
  * total. Always returns 0.
  */
 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			   struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_stat, &blkio_policy_cfq,
 			  cft->private, false);
 	return 0;
 }
 /*
  * Read handler for rwstat statistics: print, for every group of @cgrp's
  * blkio cgroup, the rwstat selected by cft->private, with the show-total
  * argument of blkcg_print_blkgs() set. Always returns 0.
  */
 static int cfqg_print_rwstat(struct cgroup *cgrp, struct cftype *cft,
 			     struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, blkg_prfill_rwstat, &blkio_policy_cfq,
 			  cft->private, true);
 	return 0;
 }
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 /*
  * Print callback for "avg_queue_size": emit avg_queue_size_sum divided by
  * avg_queue_size_samples for the group in @pdata, or 0 when there are no
  * samples. Always returns 0.
  */
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf, void *pdata, int off)
 {
 	struct cfq_group *cfqg = pdata;
 	u64 nr_samples = blkg_stat_read(&cfqg->stats.avg_queue_size_samples);
 	u64 avg = 0;
 	if (nr_samples != 0) {
 		avg = blkg_stat_read(&cfqg->stats.avg_queue_size_sum);
 		do_div(avg, nr_samples);
 	}
 	__blkg_prfill_u64(sf, pdata, avg);
 	return 0;
 }
 /*
  * print avg_queue_size
  *
  * Read handler: runs cfqg_prfill_avg_queue_size() over every group of
  * @cgrp's blkio cgroup, without a total. Always returns 0.
  */
 static int cfqg_print_avg_queue_size(struct cgroup *cgrp, struct cftype *cft,
 				     struct seq_file *sf)
 {
 	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgrp);
 	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_avg_queue_size,
 			  &blkio_policy_cfq, 0, false);
 	return 0;
 }
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 /*
  * cgroup control files provided by CFQ.
  *
  * For the statistics entries, .private holds the offset of the counter
  * inside struct cfq_group; the print handlers hand it on to
  * blkcg_print_blkgs(). Entries using cfqg_print_stat are single-value
  * stats, entries using cfqg_print_rwstat are rwstat stats.
  */
 static struct cftype cfq_blkcg_files[] = {
 	/* writable weight settings */
 	{
 		.name = "weight_device",
 		.read_seq_string = cfqg_print_weight_device,
 		.write_string = cfqg_set_weight_device,
 		.max_write_len = 256,
 	},
 	{
 		.name = "weight",
 		.read_seq_string = cfq_print_weight,
 		.write_u64 = cfq_set_weight,
 	},
 	/* read-only statistics */
 	{
 		.name = "time",
 		.private = offsetof(struct cfq_group, stats.time),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "sectors",
 		.private = offsetof(struct cfq_group, stats.sectors),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "io_service_bytes",
 		.private = offsetof(struct cfq_group, stats.service_bytes),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_serviced",
 		.private = offsetof(struct cfq_group, stats.serviced),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_service_time",
 		.private = offsetof(struct cfq_group, stats.service_time),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_wait_time",
 		.private = offsetof(struct cfq_group, stats.wait_time),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_merged",
 		.private = offsetof(struct cfq_group, stats.merged),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 	{
 		.name = "io_queued",
 		.private = offsetof(struct cfq_group, stats.queued),
 		.read_seq_string = cfqg_print_rwstat,
 	},
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 	/* extra statistics, only built with CONFIG_DEBUG_BLK_CGROUP */
 	{
 		.name = "avg_queue_size",
 		.read_seq_string = cfqg_print_avg_queue_size,
 	},
 	{
 		.name = "group_wait_time",
 		.private = offsetof(struct cfq_group, stats.group_wait_time),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "idle_time",
 		.private = offsetof(struct cfq_group, stats.idle_time),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "empty_time",
 		.private = offsetof(struct cfq_group, stats.empty_time),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "dequeue",
 		.private = offsetof(struct cfq_group, stats.dequeue),
 		.read_seq_string = cfqg_print_stat,
 	},
 	{
 		.name = "unaccounted_time",
 		.private = offsetof(struct cfq_group, stats.unaccounted_time),
 		.read_seq_string = cfqg_print_stat,
 	},
 #endif	/* CONFIG_DEBUG_BLK_CGROUP */
 	{ }	/* terminate */
 };
 #else /* GROUP_IOSCHED */
 /*
  * Variant for builds without CONFIG_CFQ_GROUP_IOSCHED: @blkcg is ignored
  * and the root group is always returned.
  */
 static struct cfq_group *cfq_lookup_create_cfqg(struct cfq_data *cfqd,
 						struct blkio_cgroup *blkcg)
 {
 	return cfqd->root_group;
 }
 /*
  * Variant for builds without CONFIG_CFQ_GROUP_IOSCHED: just record @cfqg
  * in the queue; no group reference is taken here.
  */
 static inline void
 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
 	cfqq->cfqg = cfqg;
 }
 #endif /* GROUP_IOSCHED */
 /*
  * The cfqd->service_trees holds all pending cfq_queue's that have
  * requests waiting to be processed. It is sorted in the order that
  * we will service the queues.
  *
  * Insert @cfqq into, or reposition it within, the service tree selected
  * by its group, priority class and workload type.
  *
  * @add_front: when true, key the queue HZ ahead of the current first
  *             entry (or of jiffies if the tree is empty). The group is
  *             not notified in this case.
  */
 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 				 bool add_front)
 {
 	struct rb_node **p, *parent;
 	struct cfq_queue *__cfqq;
 	unsigned long rb_key;
 	struct cfq_rb_root *service_tree;
 	int left;
 	int new_cfqq = 1;
 	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
 						cfqq_type(cfqq));
 	if (cfq_class_idle(cfqq)) {
 		/*
 		 * Idle class: key CFQ_IDLE_DELAY after the tree's last
 		 * queue, or after jiffies when there is no other last queue.
 		 */
 		rb_key = CFQ_IDLE_DELAY;
 		parent = rb_last(&service_tree->rb);
 		if (parent && parent != &cfqq->rb_node) {
 			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 			rb_key += __cfqq->rb_key;
 		} else
 			rb_key += jiffies;
 	} else if (!add_front) {
 		/*
 		 * Get our rb key offset. Subtract any residual slice
 		 * value carried from last service. A negative resid
 		 * count indicates slice overrun, and this should position
 		 * the next service time further away in the tree.
 		 */
 		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
 		rb_key -= cfqq->slice_resid;
 		cfqq->slice_resid = 0;
 	} else {
 		/* front insertion: HZ before the current first queue */
 		rb_key = -HZ;
 		__cfqq = cfq_rb_first(service_tree);
 		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
 	}
 	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		/* already on a service tree: this is a reposition */
 		new_cfqq = 0;
 		/*
 		 * same position, nothing more to do
 		 */
 		if (rb_key == cfqq->rb_key &&
 		    cfqq->service_tree == service_tree)
 			return;
 		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 		cfqq->service_tree = NULL;
 	}
 	left = 1;
 	parent = NULL;
 	cfqq->service_tree = service_tree;
 	p = &service_tree->rb.rb_node;
 	while (*p) {
 		struct rb_node **n;
 		parent = *p;
 		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
 		/*
 		 * sort by key, that represents service time.
 		 */
 		if (time_before(rb_key, __cfqq->rb_key))
 			n = &(*p)->rb_left;
 		else {
 			n = &(*p)->rb_right;
 			left = 0;
 		}
 		p = n;
 	}
 	/* never descended right, so the new node is the leftmost one */
 	if (left)
 		service_tree->left = &cfqq->rb_node;
 	cfqq->rb_key = rb_key;
 	rb_link_node(&cfqq->rb_node, parent, p);
 	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
 	service_tree->count++;
 	/* only a newly added, non-front queue is reported to the group */
 	if (add_front || !new_cfqq)
 		return;
 	cfq_group_notify_queue_add(cfqd, cfqq->cfqg);
 }
 /*
  * Search the priority tree @root for a queue whose next request starts
  * exactly at @sector.
  *
  * Returns the matching queue, or NULL if there is none. *@ret_parent is
  * set to the last node visited (NULL for an empty tree); if @rb_link is
  * non-NULL, *@rb_link is set to the link slot where the search stopped,
  * which is the insertion point when nothing matched.
  */
 static struct cfq_queue *
 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
 		     sector_t sector, struct rb_node **ret_parent,
 		     struct rb_node ***rb_link)
 {
 	struct rb_node **p = &root->rb_node;
 	struct rb_node *parent = NULL;
 	struct cfq_queue *cfqq = NULL;
 	while (*p) {
 		sector_t pos;
 		parent = *p;
 		cfqq = rb_entry(parent, struct cfq_queue, p_node);
 		pos = blk_rq_pos(cfqq->next_rq);
 		/*
 		 * Sort strictly based on sector.  Smallest to the left,
 		 * largest to the right.
 		 */
 		if (sector == pos)
 			break;
 		p = (sector > pos) ? &parent->rb_right : &parent->rb_left;
 		cfqq = NULL;
 	}
 	*ret_parent = parent;
 	if (rb_link)
 		*rb_link = p;
 	return cfqq;
 }
 /*
  * (Re)insert @cfqq into the priority tree for its original ioprio, keyed
  * by the start sector of its next request.
  *
  * The queue is first removed from any priority tree it is on. It stays
  * off the trees if it is idle class, has no next request, or another
  * queue already occupies the same sector.
  */
 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct rb_node **link, *parent;
 	struct rb_root *root;
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
 		cfqq->p_root = NULL;
 	}
 	if (cfq_class_idle(cfqq) || !cfqq->next_rq)
 		return;
 	root = &cfqd->prio_trees[cfqq->org_ioprio];
 	cfqq->p_root = root;
 	if (cfq_prio_tree_lookup(cfqd, root, blk_rq_pos(cfqq->next_rq),
 				 &parent, &link)) {
 		/* sector already taken by another queue: stay off the tree */
 		cfqq->p_root = NULL;
 		return;
 	}
 	rb_link_node(&cfqq->p_node, parent, link);
 	rb_insert_color(&cfqq->p_node, root);
 }
 /*
  * Update cfqq's position in the service tree and the priority tree.
  * Resorting requires the cfqq to be on the RR list already; otherwise
  * this is a no-op.
  */
 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	if (!cfq_cfqq_on_rr(cfqq))
 		return;
 	cfq_service_tree_add(cfqd, cfqq, 0);
 	cfq_prio_tree_add(cfqd, cfqq);
 }
 /*
  * add to busy list of queues for service, trying to be fair in ordering
  * the pending list according to last request service
  *
  * The queue must not already be on the RR list. Marks it on_rr, bumps
  * the busy counters (including the sync counter for a sync queue) and
  * places it in the service and priority trees.
  */
 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	cfq_mark_cfqq_on_rr(cfqq);
 	cfqd->busy_queues++;
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->busy_sync_queues++;
 	/* on_rr is set above, so this performs the actual tree insertion */
 	cfq_resort_rr_list(cfqd, cfqq);
 }
 /*
  * Called when the cfqq no longer has requests pending, remove it from
  * the service tree.
  *
  * The queue must be on the RR list. Also removes it from its priority
  * tree, tells the group it has one queue fewer, and drops the busy
  * counters.
  */
 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_clear_cfqq_on_rr(cfqq);
 	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
 		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
 		cfqq->service_tree = NULL;
 	}
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
 		cfqq->p_root = NULL;
 	}
 	cfq_group_notify_queue_del(cfqd, cfqq->cfqg);
 	BUG_ON(!cfqd->busy_queues);
 	cfqd->busy_queues--;
 	if (cfq_cfqq_sync(cfqq))
 		cfqd->busy_sync_queues--;
 }
 /*
  * rb tree support functions
  *
  * Remove @rq from its queue's sorted list and decrement the matching
  * queued[] counter.
  */
 static void cfq_del_rq_rb(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	const int sync = rq_is_sync(rq);
 	BUG_ON(!cfqq->queued[sync]);
 	cfqq->queued[sync]--;
 	elv_rb_del(&cfqq->sort_list, rq);
 	if (!cfq_cfqq_on_rr(cfqq) || !RB_EMPTY_ROOT(&cfqq->sort_list))
 		return;
 	/*
 	 * Queue will be deleted from service tree when we actually
 	 * expire it later. Right now just remove it from prio tree
 	 * as it is empty.
 	 */
 	if (cfqq->p_root) {
 		rb_erase(&cfqq->p_node, cfqq->p_root);
 		cfqq->p_root = NULL;
 	}
 }
 /*
  * Add @rq to its queue's sorted list, putting the queue on the RR list
  * if it is not there yet, and re-evaluate which request the queue should
  * serve next.
  */
 static void cfq_add_rq_rb(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct request *prev;
 	cfqq->queued[rq_is_sync(rq)]++;
 	elv_rb_add(&cfqq->sort_list, rq);
 	if (!cfq_cfqq_on_rr(cfqq))
 		cfq_add_cfqq_rr(cfqd, cfqq);
 	/*
 	 * check if this request is a better next-serve candidate
 	 */
 	prev = cfqq->next_rq;
 	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
 	/*
 	 * adjust priority tree position, if ->next_rq changes
 	 */
 	if (prev != cfqq->next_rq)
 		cfq_prio_tree_add(cfqd, cfqq);
 	/* rq itself was a candidate, so a next request must exist now */
 	BUG_ON(!cfqq->next_rq);
 }
 /*
  * Re-sort @rq within @cfqq's sorted list by removing and re-adding it.
  * The group io statistics are updated with a matching remove and add.
  */
 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
 {
 	elv_rb_del(&cfqq->sort_list, rq);
 	/* cfq_add_rq_rb() below increments this counter again */
 	cfqq->queued[rq_is_sync(rq)]--;
 	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	cfq_add_rq_rb(rq);
 	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqq->cfqd->serving_group,
 				 rq->cmd_flags);
 }
 /*
  * Look for a front-merge candidate for @bio: a request in the current
  * task's queue (sync or async, matching @bio) that starts at the sector
  * right after the end of @bio.
  *
  * Returns NULL when the task has no cic or no such queue, otherwise the
  * result of elv_rb_find().
  */
 static struct request *
 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
 {
 	struct cfq_io_cq *cic = cfq_cic_lookup(cfqd, current->io_context);
 	struct cfq_queue *cfqq;
 	sector_t bio_end;
 	if (!cic)
 		return NULL;
 	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 	if (!cfqq)
 		return NULL;
 	bio_end = bio->bi_sector + bio_sectors(bio);
 	return elv_rb_find(&cfqq->sort_list, bio_end);
 }
 /*
  * Account @rq as being in the driver and record the sector just past its
  * end as the last known position.
  */
 static void cfq_activate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	cfqd->rq_in_driver++;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
 						cfqd->rq_in_driver);
 	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
 }
 /*
  * Undo the in-driver accounting of cfq_activate_request() for @rq.
  * Warns if the in-driver count is already zero.
  */
 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	WARN_ON(!cfqd->rq_in_driver);
 	cfqd->rq_in_driver--;
 	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
 						cfqd->rq_in_driver);
 }
 /*
  * Remove @rq from its cfq queue: pick a new next request if @rq was the
  * next one, unlink it from the fifo list and the sorted list, and update
  * the queued-request, statistics and REQ_PRIO counters.
  */
 static void cfq_remove_request(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	/* must happen while rq is still linked in the sorted list */
 	if (cfqq->next_rq == rq)
 		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
 	list_del_init(&rq->queuelist);
 	cfq_del_rq_rb(rq);
 	cfqq->cfqd->rq_queued--;
 	cfqg_stats_update_io_remove(RQ_CFQG(rq), rq->cmd_flags);
 	if (rq->cmd_flags & REQ_PRIO) {
 		WARN_ON(!cfqq->prio_pending);
 		cfqq->prio_pending--;
 	}
 }
 /*
  * Decide whether @bio can be front-merged into a queued request.
  *
  * Returns ELEVATOR_FRONT_MERGE with *@req set to the target request when
  * a candidate exists and elv_rq_merge_ok() accepts it; otherwise returns
  * ELEVATOR_NO_MERGE and leaves *@req untouched.
  */
 static int cfq_merge(struct request_queue *q, struct request **req,
 		     struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct request *candidate = cfq_find_rq_fmerge(cfqd, bio);
 	if (!candidate || !elv_rq_merge_ok(candidate, bio))
 		return ELEVATOR_NO_MERGE;
 	*req = candidate;
 	return ELEVATOR_FRONT_MERGE;
 }
 /*
  * Called after a bio was merged into @req. Only a front merge needs work
  * here: the request is re-sorted within its queue.
  */
 static void cfq_merged_request(struct request_queue *q, struct request *req,
 			       int type)
 {
 	if (type != ELEVATOR_FRONT_MERGE)
 		return;
 	cfq_reposition_rq_rb(RQ_CFQQ(req), req);
 }
 /*
  * Record in the group statistics of @req that @bio was merged into it.
  */
 static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
 	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
 }
 /*
  * Called when @next has been merged into @rq: @rq inherits @next's fifo
  * position if @next is older, @next is removed from the scheduler, and
  * @next's queue is taken off the RR list if this left it empty and it is
  * not the active queue.
  */
 static void
 cfq_merged_requests(struct request_queue *q, struct request *rq,
 		    struct request *next)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	/*
 	 * reposition in fifo if next is older than rq
 	 */
 	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
 	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
 		list_move(&rq->queuelist, &next->queuelist);
 		rq_set_fifo_time(rq, rq_fifo_time(next));
 	}
 	/* next is about to go away; rq takes its place as next request */
 	if (cfqq->next_rq == next)
 		cfqq->next_rq = rq;
 	cfq_remove_request(next);
 	cfqg_stats_update_io_merged(RQ_CFQG(rq), next->cmd_flags);
 	/* from here on cfqq refers to next's queue, not rq's */
 	cfqq = RQ_CFQQ(next);
 	/*
 	 * all requests of this queue are merged to other queues, delete it
 	 * from the service tree. If it's the active_queue,
 	 * cfq_dispatch_requests() will choose to expire it or do idle
 	 */
 	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list) &&
 	    cfqq != cfqd->active_queue)
 		cfq_del_cfqq_rr(cfqd, cfqq);
 }
 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
 			   struct bio *bio)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 	/*
 	 * Disallow merge of a sync bio into an async request.
 	 */
 	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
 		return false;
 	/*
 	 * Lookup the cfqq that this bio will be queued with and allow
 	 * merge only if rq is queued there.
 	 */
 	cic = cfq_cic_lookup(cfqd, current->io_context);
 	if (!cic)
 		return false;
 	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
 	return cfqq == RQ_CFQQ(rq);
 }
 /*
  * Cancel the idle slice timer and update the idle-time statistic of
  * @cfqq's group.
  */
 static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	del_timer(&cfqd->idle_slice_timer);
 	cfqg_stats_update_idle_time(cfqq->cfqg);
 }
 /*
  * Make @cfqq the active queue of @cfqd. @cfqq may be NULL, which simply
  * clears the active queue.
  *
  * For a non-NULL queue the per-slice state is reset: the slice bounds
  * and dispatch counters are zeroed, dispatch_start is stamped with
  * jiffies, the per-slice flags are cleared, the queue is flagged
  * slice_new and the idle timer is cancelled.
  */
 static void __cfq_set_active_queue(struct cfq_data *cfqd,
 				   struct cfq_queue *cfqq)
 {
 	if (cfqq) {
 		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
 				cfqd->serving_prio, cfqd->serving_type);
 		cfqg_stats_update_avg_queue_size(cfqq->cfqg);
 		/* the real slice is set up later; slice_new marks that */
 		cfqq->slice_start = 0;
 		cfqq->dispatch_start = jiffies;
 		cfqq->allocated_slice = 0;
 		cfqq->slice_end = 0;
 		cfqq->slice_dispatch = 0;
 		cfqq->nr_sectors = 0;
 		cfq_clear_cfqq_wait_request(cfqq);
 		cfq_clear_cfqq_must_dispatch(cfqq);
 		cfq_clear_cfqq_must_alloc_slice(cfqq);
 		cfq_clear_cfqq_fifo_expire(cfqq);
 		cfq_mark_cfqq_slice_new(cfqq);
 		cfq_del_timer(cfqd, cfqq);
 	}
 	cfqd->active_queue = cfqq;
 }
 /*
  * current cfqq expired its slice (or was too idle), select new one
  *
  * @timed_out: when true, the unused part of the slice is stored in
  *             cfqq->slice_resid before the group is charged.
  *
  * Charges the group for the service received, removes the queue from the
  * RR list if it has no requests left (otherwise re-sorts it), and clears
  * the active queue and active cic.
  */
 static void
 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		    bool timed_out)
 {
 	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
 	if (cfq_cfqq_wait_request(cfqq))
 		cfq_del_timer(cfqd, cfqq);
 	cfq_clear_cfqq_wait_request(cfqq);
 	cfq_clear_cfqq_wait_busy(cfqq);
 	/*
 	 * If this cfqq is shared between multiple processes, check to
 	 * make sure that those processes are still issuing I/Os within
 	 * the mean seek distance.  If not, it may be time to break the
 	 * queues apart again.
 	 */
 	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
 		cfq_mark_cfqq_split_coop(cfqq);
 	/*
 	 * store what was left of this slice, if the queue idled/timed out
 	 */
 	if (timed_out) {
 		/* a slice that never started leaves its full length over */
 		if (cfq_cfqq_slice_new(cfqq))
 			cfqq->slice_resid = cfq_scaled_cfqq_slice(cfqd, cfqq);
 		else
 			cfqq->slice_resid = cfqq->slice_end - jiffies;
 		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
 	}
 	cfq_group_served(cfqd, cfqq->cfqg, cfqq);
 	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
 		cfq_del_cfqq_rr(cfqd, cfqq);
 	/* no-op if the queue was just taken off the RR list above */
 	cfq_resort_rr_list(cfqd, cfqq);
 	if (cfqq == cfqd->active_queue)
 		cfqd->active_queue = NULL;
 	if (cfqd->active_cic) {
 		put_io_context(cfqd->active_cic->icq.ioc);
 		cfqd->active_cic = NULL;
 	}
 }
 /*
  * Expire the currently active queue, if there is one.  @timed_out is
  * passed through to __cfq_slice_expired().
  */
 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
 {
 	struct cfq_queue *active = cfqd->active_queue;

 	if (!active)
 		return;

 	__cfq_slice_expired(cfqd, active, timed_out);
 }
 /*
  * Get next queue for service. Unless we have a queue preemption,
  * we'll simply select the first cfqq in the service tree.
  *
  * The tree looked at is the one matching cfqd's current serving group,
  * priority and workload type.  Returns NULL when no requests are queued,
  * when there is no such tree, or when the tree is empty.
  */
 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *st;

 	st = service_tree_for(cfqd->serving_group, cfqd->serving_prio,
 			      cfqd->serving_type);

 	/* There is nothing to dispatch */
 	if (!cfqd->rq_queued || !st || RB_EMPTY_ROOT(&st->rb))
 		return NULL;

 	return cfq_rb_first(st);
 }
 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
 {
 	struct cfq_group *cfqg;
 	struct cfq_queue *cfqq;
 	int i, j;
 	struct cfq_rb_root *st;
 	if (!cfqd->rq_queued)
 		return NULL;
 	cfqg = cfq_get_next_cfqg(cfqd);
 	if (!cfqg)
 		return NULL;
 	for_each_cfqg_st(cfqg, i, j, st)
 		if ((cfqq = cfq_rb_first(st)) != NULL)
 			return cfqq;
 	return NULL;
 }
 /*
  * Get and set a new active queue for service.
  *
  * When the caller does not supply @cfqq, the next queue is taken from
  * cfq_get_next_queue().  Returns the queue that was installed, which may
  * be NULL.
  */
 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
 					      struct cfq_queue *cfqq)
 {
 	struct cfq_queue *chosen = cfqq;

 	if (!chosen)
 		chosen = cfq_get_next_queue(cfqd);

 	__cfq_set_active_queue(cfqd, chosen);

 	return chosen;
 }
 /*
  * Absolute distance between the start position of @rq and
  * cfqd->last_position.
  */
 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
 					  struct request *rq)
 {
 	const sector_t pos = blk_rq_pos(rq);
 	const sector_t last = cfqd->last_position;

 	return (pos >= last) ? pos - last : last - pos;
 }
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			       struct request *rq)
 {
 	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
 }
 /*
  * Search the priority tree selected by cur_cfqq->org_ioprio for a queue
  * whose next request lies close to cfqd->last_position.  Returns that
  * queue, or NULL if the tree is empty or nothing is close enough.
  */
 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
 				    struct cfq_queue *cur_cfqq)
 {
 	struct rb_root *tree = &cfqd->prio_trees[cur_cfqq->org_ioprio];
 	const sector_t last_pos = cfqd->last_position;
 	struct rb_node *parent, *neighbour;
 	struct cfq_queue *cand;

 	if (RB_EMPTY_ROOT(tree))
 		return NULL;

 	/*
 	 * A queue whose request starts exactly at the end of the last
 	 * request wins outright.
 	 */
 	cand = cfq_prio_tree_lookup(cfqd, tree, last_pos, &parent, NULL);
 	if (cand)
 		return cand;

 	/*
 	 * No exact match: the parent of the NULL leaf the lookup ended on
 	 * holds the closest sector.
 	 */
 	cand = rb_entry(parent, struct cfq_queue, p_node);
 	if (cfq_rq_close(cfqd, cur_cfqq, cand->next_rq))
 		return cand;

 	/* Last chance: the tree neighbour on the side facing last_pos. */
 	neighbour = (blk_rq_pos(cand->next_rq) < last_pos) ?
 			rb_next(&cand->p_node) : rb_prev(&cand->p_node);
 	if (!neighbour)
 		return NULL;

 	cand = rb_entry(neighbour, struct cfq_queue, p_node);

 	return cfq_rq_close(cfqd, cur_cfqq, cand->next_rq) ? cand : NULL;
 }
 /*
  * cfqd - obvious
  * cur_cfqq - passed in so that we don't decide that the current queue is
  * 	      closely cooperating with itself.
  *
  * So, basically we're assuming that cur_cfqq has dispatched at least
  * one request, and that cfqd->last_position reflects a position on the disk
  * associated with the I/O issued by cur_cfqq.  I'm not sure this is a valid
  * assumption.
  *
  * Returns a queue that cur_cfqq could be grouped with, or NULL.
  */
 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
 					      struct cfq_queue *cur_cfqq)
 {
 	struct cfq_queue *peer;

 	/* Only a sync, non-seeky queue outside the idle class qualifies. */
 	if (cfq_class_idle(cur_cfqq) || !cfq_cfqq_sync(cur_cfqq) ||
 	    CFQQ_SEEKY(cur_cfqq))
 		return NULL;

 	/*
 	 * Don't search priority tree if it's the only queue in the group.
 	 */
 	if (cur_cfqq->cfqg->nr_cfqq == 1)
 		return NULL;

 	/*
 	 * We should notice if some of the queues are cooperating, eg
 	 * working closely on the same area of the disk. In that case,
 	 * we can group them together and don't waste time idling.
 	 */
 	peer = cfqq_close(cfqd, cur_cfqq);
 	if (!peer)
 		return NULL;

 	/* If new queue belongs to different cfq_group, don't choose it */
 	if (peer->cfqg != cur_cfqq->cfqg)
 		return NULL;

 	/* It only makes sense to merge sync, non-seeky queues. */
 	if (!cfq_cfqq_sync(peer) || CFQQ_SEEKY(peer))
 		return NULL;

 	/*
 	 * Do not merge queues of different priority classes
 	 */
 	if (cfq_class_rt(peer) != cfq_class_rt(cur_cfqq))
 		return NULL;

 	return peer;
 }
 /*
  * Determine whether we should enforce idle window for this queue.
  *
  * @cfqq must be linked to a non-empty service tree; anything else trips
  * the BUG_ON()s below.
  */
 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct cfq_rb_root *st = cfqq->service_tree;

 	BUG_ON(!st);
 	BUG_ON(!st->count);

 	/* Idling switched off, or an idle class queue: never idle. */
 	if (!cfqd->cfq_slice_idle || cfqq_prio(cfqq) == IDLE_WORKLOAD)
 		return false;

 	/*
 	 * Queues marked with the idle window flag do idle, except on a
 	 * non-rotational device that has hw_tag set.
 	 */
 	if (cfq_cfqq_idle_window(cfqq)) {
 		if (!blk_queue_nonrot(cfqd->queue) || !cfqd->hw_tag)
 			return true;
 	}

 	/*
 	 * Otherwise, we do only if they are the last ones
 	 * in their service tree.
 	 */
 	if (st->count == 1 && cfq_cfqq_sync(cfqq)) {
 		if (!cfq_io_thinktime_big(cfqd, &st->ttime, false))
 			return true;
 	}

 	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", st->count);

 	return false;
 }
 /*
  * Arm cfqd->idle_slice_timer for the active queue.
  *
  * The timeout is cfqd->cfq_slice_idle when cfq_should_idle() agrees to
  * queue idling; otherwise, if cfqd->cfq_group_idle is non-zero, group
  * idling is attempted with that value instead.  The function returns
  * without arming the timer as soon as any of the checks below rules
  * idling out.  When the timer is armed, the queue is flagged wait_request.
  */
 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
 {
 	struct cfq_queue *cfqq = cfqd->active_queue;
 	struct cfq_io_cq *cic;
 	/* group_idle != 0 means "we are idling on behalf of the group" */
 	unsigned long sl, group_idle = 0;
 	/*
 	 * SSD device without seek penalty, disable idling. But only do so
 	 * for devices that support queuing, otherwise we still have a problem
 	 * with sync vs async workloads.
 	 */
 	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
 		return;
 	/* warn if the queue still has sorted requests or is on a new slice */
 	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
 	WARN_ON(cfq_cfqq_slice_new(cfqq));
 	/*
 	 * idle is disabled, either manually or by past process history
 	 */
 	if (!cfq_should_idle(cfqd, cfqq)) {
 		/* no queue idling. Check for group idling */
 		if (cfqd->cfq_group_idle)
 			group_idle = cfqd->cfq_group_idle;
 		else
 			return;
 	}
 	/*
 	 * still active requests from this queue, don't idle
 	 */
 	if (cfqq->dispatched)
 		return;
 	/*
 	 * task has exited, don't wait
 	 */
 	cic = cfqd->active_cic;
 	if (!cic || !atomic_read(&cic->icq.ioc->active_ref))
 		return;
 	/*
 	 * If our average think time is larger than the remaining time
 	 * slice, then don't idle. This avoids overrunning the allotted
 	 * time slice.
 	 */
 	if (sample_valid(cic->ttime.ttime_samples) &&
 	    (cfqq->slice_end - jiffies < cic->ttime.ttime_mean)) {
 		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%lu",
 			     cic->ttime.ttime_mean);
 		return;
 	}
 	/* There are other queues in the group, don't do group idle */
 	if (group_idle && cfqq->cfqg->nr_cfqq > 1)
 		return;
 	cfq_mark_cfqq_wait_request(cfqq);
 	/* pick the timeout that matches the kind of idling chosen above */
 	if (group_idle)
 		sl = cfqd->cfq_group_idle;
 	else
 		sl = cfqd->cfq_slice_idle;
 	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
 	cfqg_stats_set_start_idle_time(cfqq->cfqg);
 	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu group_idle: %d", sl,
 			group_idle ? 1 : 0);
 }
 /*
  * Move request from internal lists to the request queue dispatch list.
  *
  * Besides handing @rq to elv_dispatch_sort(), this updates the owning
  * queue's next_rq, the per-queue and per-group dispatched counters, the
  * per-direction in-flight counter, the queue's nr_sectors, and the group
  * dispatch statistics.
  */
 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
 	/* choose the follow-up request before rq is removed below */
 	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
 	cfq_remove_request(rq);
 	cfqq->dispatched++;
 	(RQ_CFQG(rq))->dispatched++;
 	elv_dispatch_sort(q, rq);
 	/* rq_in_flight[] is indexed by whether the queue is sync */
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
 	cfqq->nr_sectors += blk_rq_sectors(rq);
 	cfqg_stats_update_dispatch(cfqq->cfqg, blk_rq_bytes(rq), rq->cmd_flags);
 }
 /*
  * return expired entry, or NULL to just start from scratch in rbtree
  *
  * The fifo is only looked at while the queue's fifo_expire flag is clear;
  * the flag is set on the first call, so later calls return NULL until
  * something clears it again.
  */
 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
 {
 	struct request *head, *expired;

 	if (cfq_cfqq_fifo_expire(cfqq))
 		return NULL;

 	cfq_mark_cfqq_fifo_expire(cfqq);

 	if (list_empty(&cfqq->fifo))
 		return NULL;

 	/* only the first fifo entry is checked against its deadline */
 	head = rq_entry_fifo(cfqq->fifo.next);
 	expired = time_before(jiffies, rq_fifo_time(head)) ? NULL : head;

 	cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", expired);

 	return expired;
 }
 /*
  * Dispatch budget derived from the queue's ioprio:
  * 2 * cfq_slice_async_rq * (IOPRIO_BE_NR - ioprio).
  * Warns if ioprio is not below IOPRIO_BE_NR.
  */
 static inline int
 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	const int per_level = cfqd->cfq_slice_async_rq;
 	int levels;

 	WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);

 	levels = IOPRIO_BE_NR - cfqq->ioprio;

 	return 2 * per_level * levels;
 }
 /*
  * Number of references on @cfqq that are not accounted for by allocated
  * READ/WRITE requests.  A negative result is a BUG.
  *
  * Must be called with the queue_lock held.
  */
 static int cfqq_process_refs(struct cfq_queue *cfqq)
 {
 	const int rq_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
 	const int refs = cfqq->ref - rq_refs;

 	BUG_ON(refs < 0);

 	return refs;
 }
 /*
  * Schedule a merge between @cfqq and the queue at the end of @new_cfqq's
  * ->new_cfqq chain.  The queue with fewer process references is pointed
  * at the other one, and the target's ref is raised by the number of
  * process references being redirected.  Nothing is done if either side
  * has no process references or if the chain leads back to @cfqq.
  */
 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
 {
 	struct cfq_queue *link;
 	int own_refs, target_refs;

 	/*
 	 * If there are no process references on the new_cfqq, then it is
 	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
 	 * chain may have dropped their last reference (not just their
 	 * last process reference).
 	 */
 	if (!cfqq_process_refs(new_cfqq))
 		return;

 	/* Avoid a circular list and skip interim queue merges */
 	for (link = new_cfqq->new_cfqq; link; link = new_cfqq->new_cfqq) {
 		if (link == cfqq)
 			return;
 		new_cfqq = link;
 	}

 	own_refs = cfqq_process_refs(cfqq);
 	target_refs = cfqq_process_refs(new_cfqq);

 	/*
 	 * If the process for the cfqq has gone away, there is no
 	 * sense in merging the queues.
 	 */
 	if (!own_refs || !target_refs)
 		return;

 	/*
 	 * Merge in the direction of the lesser amount of work.
 	 */
 	if (own_refs > target_refs) {
 		new_cfqq->new_cfqq = cfqq;
 		cfqq->ref += target_refs;
 	} else {
 		cfqq->new_cfqq = new_cfqq;
 		new_cfqq->ref += own_refs;
 	}
 }
 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
 				struct cfq_group *cfqg, enum wl_prio_t prio)
 {
 	struct cfq_queue *queue;
 	int i;
 	bool key_valid = false;
 	unsigned long lowest_key = 0;
 	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
 	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
 		/* select the one with lowest rb_key */
 		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
 		if (queue &&
 		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
 			lowest_key = queue->rb_key;
 			cur_best = i;
 			key_valid = true;
 		}
 	}
 	return cur_best;
 }
 /*
  * Choose which priority class and workload type to serve for @cfqg and
  * compute when that workload expires.  Updates cfqd->serving_prio,
  * cfqd->serving_type and cfqd->workload_expires.
  *
  * If the priority class is unchanged, the current workload still has
  * queues, and it has not expired yet, the current choice is kept as is.
  */
 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
 {
 	unsigned slice;
 	unsigned count;
 	struct cfq_rb_root *st;
 	unsigned group_slice;
 	enum wl_prio_t original_prio = cfqd->serving_prio;
 	/* Choose next priority. RT > BE > IDLE */
 	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
 		cfqd->serving_prio = RT_WORKLOAD;
 	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
 		cfqd->serving_prio = BE_WORKLOAD;
 	else {
 		/* idle class: no workload type is chosen, expiry is one jiffy away */
 		cfqd->serving_prio = IDLE_WORKLOAD;
 		cfqd->workload_expires = jiffies + 1;
 		return;
 	}
 	/* a change of priority class always forces a fresh workload choice */
 	if (original_prio != cfqd->serving_prio)
 		goto new_workload;
 	/*
 	 * For RT and BE, we have to choose also the type
 	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
 	 * expiration time
 	 */
 	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 	/*
 	 * check workload expiration, and that we still have other queues ready
 	 */
 	if (count && !time_after(jiffies, cfqd->workload_expires))
 		return;
 new_workload:
 	/* otherwise select new workload type */
 	cfqd->serving_type =
 		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
 	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
 	count = st->count;
 	/*
 	 * the workload slice is computed as a fraction of target latency
 	 * proportional to the number of queues in that workload, over
 	 * all the queues in the same priority class
 	 */
 	group_slice = cfq_group_slice(cfqd, cfqg);
 	slice = group_slice * count /
 		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
 		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
 	if (cfqd->serving_type == ASYNC_WORKLOAD) {
 		unsigned int tmp;
 		/*
 		 * Async queues are currently system wide. Just taking
 		 * proportion of queues within the same group will lead to a
 		 * higher async ratio system wide, as generally the root group
 		 * is going to have higher weight. A more accurate thing would
 		 * be to calculate the system wide async/sync ratio.
 		 */
 		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
 		tmp = tmp/cfqd->busy_queues;
 		slice = min_t(unsigned, slice, tmp);
 		/* async workload slice is scaled down according to
 		 * the sync/async slice ratio. */
 		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
 	} else
 		/* sync workload slice is at least 2 * cfq_slice_idle */
 		slice = max(slice, 2 * cfqd->cfq_slice_idle);
 	/* never go below CFQ_MIN_TT, whatever the type */
 	slice = max_t(unsigned, slice, CFQ_MIN_TT);
 	cfq_log(cfqd, "workload slice:%d", slice);
 	cfqd->workload_expires = jiffies + slice;
 }
 /*
  * Return the first group of cfqd's group service tree, or NULL when the
  * tree is empty.  update_min_vdisktime() is called on the tree after the
  * lookup.
  */
 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
 {
 	struct cfq_rb_root *grp_tree = &cfqd->grp_service_tree;
 	struct cfq_group *first;

 	if (RB_EMPTY_ROOT(&grp_tree->rb))
 		return NULL;

 	first = cfq_rb_first_group(grp_tree);
 	update_min_vdisktime(grp_tree);

 	return first;
 }
 /*
  * Make the group returned by cfq_get_next_cfqg() the serving group,
  * restore its saved workload state if it has any, and let
  * choose_service_tree() settle priority class and workload type.
  *
  * NOTE(review): cfq_get_next_cfqg() returns NULL for an empty group tree
  * and the result is dereferenced here unconditionally; presumably callers
  * only get here with at least one busy group -- confirm.
  */
 static void cfq_choose_cfqg(struct cfq_data *cfqd)
 {
 	struct cfq_group *next = cfq_get_next_cfqg(cfqd);

 	cfqd->serving_group = next;

 	if (!next->saved_workload_slice) {
 		/* nothing saved: place the expiry in the past */
 		cfqd->workload_expires = jiffies - 1;
 	} else {
 		/* Restore the workload type data */
 		cfqd->workload_expires = jiffies + next->saved_workload_slice;
 		cfqd->serving_type = next->saved_workload;
 		cfqd->serving_prio = next->saved_serving_prio;
 	}

 	choose_service_tree(cfqd, next);
 }
 /*
  * Select a queue for service. If we have a current active queue,
  * check whether to continue servicing it, or retrieve and set a new one.
  *
  * Returns the queue to dispatch from.  NULL is returned when no requests
  * are queued, when the decision is to keep waiting on the current queue
  * or group without dispatching, or when no new queue could be set.
  */
 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
 {
 	/* new_cfqq, when set, is a close cooperator to switch to directly */
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 	cfqq = cfqd->active_queue;
 	if (!cfqq)
 		goto new_queue;
 	if (!cfqd->rq_queued)
 		return NULL;
 	/*
 	 * We were waiting for group to get backlogged. Expire the queue
 	 */
 	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto expire;
 	/*
 	 * The active queue has run out of time, expire it and select new.
 	 */
 	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
 		/*
 		 * If slice had not expired at the completion of last request
 		 * we might not have turned on wait_busy flag. Don't expire
 		 * the queue yet. Allow the group to get backlogged.
 		 *
 		 * The very fact that we have used the slice, that means we
 		 * have been idling all along on this queue and it should be
 		 * ok to wait for this request to complete.
 		 */
 		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
 		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
 			cfqq = NULL;
 			goto keep_queue;
 		} else
 			goto check_group_idle;
 	}
 	/*
 	 * The active queue has requests and isn't expired, allow it to
 	 * dispatch.
 	 */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		goto keep_queue;
 	/*
 	 * If another queue has a request waiting within our mean seek
 	 * distance, let it run.  The expire code will check for close
 	 * cooperators and put the close queue at the front of the service
 	 * tree.  If possible, merge the expiring queue with the new cfqq.
 	 */
 	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
 	if (new_cfqq) {
 		if (!cfqq->new_cfqq)
 			cfq_setup_merge(cfqq, new_cfqq);
 		goto expire;
 	}
 	/*
 	 * No requests pending. If the active queue still has requests in
 	 * flight or is idling for a new request, allow either of these
 	 * conditions to happen (or time out) before selecting a new queue.
 	 */
 	if (timer_pending(&cfqd->idle_slice_timer)) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
 	/*
 	 * This is a deep seek queue, but the device is much faster than
 	 * the queue can deliver, don't idle
 	 */
 	if (CFQQ_SEEKY(cfqq) && cfq_cfqq_idle_window(cfqq) &&
 	    (cfq_cfqq_slice_new(cfqq) ||
 	    (cfqq->slice_end - jiffies > jiffies - cfqq->slice_start))) {
 		cfq_clear_cfqq_deep(cfqq);
 		cfq_clear_cfqq_idle_window(cfqq);
 	}
 	if (cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
 	/*
 	 * If group idle is enabled and there are requests dispatched from
 	 * this group, wait for requests to complete.
 	 */
 check_group_idle:
 	if (cfqd->cfq_group_idle && cfqq->cfqg->nr_cfqq == 1 &&
 	    cfqq->cfqg->dispatched &&
 	    !cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true)) {
 		cfqq = NULL;
 		goto keep_queue;
 	}
 expire:
 	/* the slice is ended without recording a timed-out residual */
 	cfq_slice_expired(cfqd, 0);
 new_queue:
 	/*
 	 * Current queue expired. Check if we have to switch to a new
 	 * service tree
 	 */
 	if (!new_cfqq)
 		cfq_choose_cfqg(cfqd);
 	/* with new_cfqq == NULL the next queue comes from the service tree */
 	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
 keep_queue:
 	return cfqq;
 }
 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
 {
 	int dispatched = 0;
 	while (cfqq->next_rq) {
 		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
 		dispatched++;
 	}
 	BUG_ON(!list_empty(&cfqq->fifo));
 	/* By default cfqq is not expired if it is empty. Do it explicitly */
 	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
 	return dispatched;
 }
 /*
  * Drain our current requests. Used for barriers and when switching
  * io schedulers on-the-fly.
  *
  * Returns the total number of requests dispatched.  No busy queue may be
  * left once the loop is done, otherwise BUG.
  */
 static int cfq_forced_dispatch(struct cfq_data *cfqd)
 {
 	int total = 0;

 	/* Expire the timeslice of the current active queue first */
 	cfq_slice_expired(cfqd, 0);

 	for (;;) {
 		struct cfq_queue *next = cfq_get_next_queue_forced(cfqd);

 		if (next == NULL)
 			break;

 		__cfq_set_active_queue(cfqd, next);
 		total += __cfq_forced_dispatch_cfqq(next);
 	}

 	BUG_ON(cfqd->busy_queues);
 	cfq_log(cfqd, "forced_dispatch=%d", total);

 	return total;
 }
 /*
  * Estimate whether @cfqq is about to use up its slice: true when, allowing
  * cfq_slice_idle jiffies per dispatched request, the projected time lies
  * after slice_end.  Also true for a queue still flagged slice_new.
  */
 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
 	struct cfq_queue *cfqq)
 {
 	unsigned long projected;

 	/* the queue hasn't finished any request, can't estimate */
 	if (cfq_cfqq_slice_new(cfqq))
 		return true;

 	projected = jiffies + cfqd->cfq_slice_idle * cfqq->dispatched;

 	return time_after(projected, cfqq->slice_end);
 }
 /*
  * Decide whether @cfqq is allowed to dispatch one more request right now.
  *
  * Returns true when the number of requests the queue already has
  * dispatched is below the limit computed here, false otherwise (including
  * the early-outs for pending async/sync I/O at the top).
  */
 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	unsigned int max_dispatch;
 	/*
 	 * Drain async requests before we start sync IO
 	 */
 	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
 		return false;
 	/*
 	 * If this is an async queue and we have sync IO in flight, let it wait
 	 */
 	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
 		return false;
 	/* baseline limit: half of cfq_quantum, but at least 1 */
 	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
 	if (cfq_class_idle(cfqq))
 		max_dispatch = 1;
 	/*
 	 * Does this cfqq already have too much IO in flight?
 	 */
 	if (cfqq->dispatched >= max_dispatch) {
 		bool promote_sync = false;
 		/*
 		 * idle queue must always only have a single IO in flight
 		 */
 		if (cfq_class_idle(cfqq))
 			return false;
 		/*
 		 * If there is only one sync queue
 		 * we can ignore async queue here and give the sync
 		 * queue no dispatch limit. The reason is a sync queue can
 		 * preempt async queue, limiting the sync queue doesn't make
 		 * sense. This is useful for aiostress test.
 		 */
 		if (cfq_cfqq_sync(cfqq) && cfqd->busy_sync_queues == 1)
 			promote_sync = true;
 		/*
 		 * We have other queues, don't allow more IO from this one
 		 */
 		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq) &&
 				!promote_sync)
 			return false;
 		/*
 		 * Sole queue user, no limit
 		 * (max_dispatch is unsigned, so -1 becomes the largest value)
 		 */
 		if (cfqd->busy_queues == 1 || promote_sync)
 			max_dispatch = -1;
 		else
 			/*
 			 * Normally we start throttling cfqq when cfq_quantum/2
 			 * requests have been dispatched. But we can drive
 			 * deeper queue depths at the beginning of slice
 			 * subjected to upper limit of cfq_quantum.
 			 */
 			max_dispatch = cfqd->cfq_quantum;
 	}
 	/*
 	 * Async queues must wait a bit before being allowed dispatch.
 	 * We also ramp up the dispatch depth gradually for async IO,
 	 * based on the last sync IO we serviced
 	 */
 	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
 		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
 		unsigned int depth;
 		/* one unit of depth per cfq_slice[1] elapsed since last_delayed_sync */
 		depth = last_sync / cfqd->cfq_slice[1];
 		/* a queue with nothing dispatched is always allowed one request */
 		if (!depth && !cfqq->dispatched)
 			depth = 1;
 		if (depth < max_dispatch)
 			max_dispatch = depth;
 	}
 	/*
 	 * If we're below the current max, allow a dispatch
 	 */
 	return cfqq->dispatched < max_dispatch;
 }
 /*
  * Dispatch a request from cfqq, moving them to the request queue
  * dispatch list.
  */
 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct request *rq;
 	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
 	if (!cfq_may_dispatch(cfqd, cfqq))
 		return false;
 	/*
 	 * follow expired path, else get first next available
 	 */
 	rq = cfq_check_fifo(cfqq);
 	if (!rq)
 		rq = cfqq->next_rq;
 	/*
 	 * insert request into driver dispatch list
 	 */
 	cfq_dispatch_insert(cfqd->queue, rq);
 	if (!cfqd->active_cic) {
 		struct cfq_io_cq *cic = RQ_CIC(rq);
 		atomic_long_inc(&cic->icq.ioc->refcount);
 		cfqd->active_cic = cic;
 	}
 	return true;
 }
 /*
  * Find the cfqq that we need to service and move a request from that to the
  * dispatch list
  *
  * Returns 0 when there are no busy queues, no queue was selected, or the
  * selected queue was not allowed to dispatch; 1 when one request was
  * dispatched.  With @force set, the result of cfq_forced_dispatch() is
  * returned instead.
  */
 static int cfq_dispatch_requests(struct request_queue *q, int force)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq;
 	if (!cfqd->busy_queues)
 		return 0;
 	if (unlikely(force))
 		return cfq_forced_dispatch(cfqd);
 	cfqq = cfq_select_queue(cfqd);
 	if (!cfqq)
 		return 0;
 	/*
 	 * Dispatch a request from this cfqq, if it is allowed
 	 */
 	if (!cfq_dispatch_request(cfqd, cfqq))
 		return 0;
 	/* account the dispatch against the current slice */
 	cfqq->slice_dispatch++;
 	cfq_clear_cfqq_must_dispatch(cfqq);
 	/*
 	 * expire an async queue immediately if it has used up its slice. idle
 	 * queue always expire after 1 dispatch round.
 	 * Neither applies when this is the only busy queue.
 	 */
 	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
 	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
 	    cfq_class_idle(cfqq))) {
 		cfqq->slice_end = jiffies + 1;
 		cfq_slice_expired(cfqd, 0);
 	}
 	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
 	return 1;
 }
 /*
  * task holds one reference to the queue, dropped when task exits. each rq
  * in-flight on this queue also holds a reference, dropped when rq is freed.
  *
  * Each cfq queue took a reference on the parent group. Drop it now.
  * queue lock must be held here.
  *
  * Drops one reference on @cfqq.  When that was the last one, the queue is
  * expired if it is still the active queue, freed back to cfq_pool, and
  * the reference on its group is put.
  */
 static void cfq_put_queue(struct cfq_queue *cfqq)
 {
 	struct cfq_data *cfqd = cfqq->cfqd;
 	struct cfq_group *cfqg;
 	BUG_ON(cfqq->ref <= 0);
 	cfqq->ref--;
 	if (cfqq->ref)
 		return;
 	cfq_log_cfqq(cfqd, cfqq, "put_queue");
 	/* a queue losing its last reference must hold no requests */
 	BUG_ON(rb_first(&cfqq->sort_list));
 	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
 	/* remember the group now: cfqq is freed before the group is put */
 	cfqg = cfqq->cfqg;
 	if (unlikely(cfqd->active_queue == cfqq)) {
 		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 	BUG_ON(cfq_cfqq_on_rr(cfqq));
 	kmem_cache_free(cfq_pool, cfqq);
 	cfqg_put(cfqg);
 }
 /*
  * Drop the references @cfqq holds on the queues it was scheduled to merge
  * with, by walking its ->new_cfqq chain.
  */
 static void cfq_put_cooperator(struct cfq_queue *cfqq)
 {
 	struct cfq_queue *cur, *following;

 	/*
 	 * If this queue was scheduled to merge with another queue, be
 	 * sure to drop the reference taken on that queue (and others in
 	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
 	 */
 	for (cur = cfqq->new_cfqq; cur; cur = following) {
 		if (cur == cfqq) {
 			WARN(1, "cfqq->new_cfqq loop detected\n");
 			break;
 		}

 		/* read the successor first: the put below may free cur */
 		following = cur->new_cfqq;
 		cfq_put_queue(cur);
 	}
 }
 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	if (unlikely(cfqq == cfqd->active_queue)) {
 		__cfq_slice_expired(cfqd, cfqq, 0);
 		cfq_schedule_dispatch(cfqd);
 	}
 	cfq_put_cooperator(cfqq);
 	cfq_put_queue(cfqq);
 }
 static void cfq_init_icq(struct io_cq *icq)
 {
 	struct cfq_io_cq *cic = icq_to_cic(icq);
 	cic->ttime.last_end_request = jiffies;
 }
 /*
  * Detach @icq from its queues: the async queue first, then the sync one.
  * Each queue that is present goes through cfq_exit_cfqq() and its slot in
  * the cic is cleared.
  */
 static void cfq_exit_icq(struct io_cq *icq)
 {
 	struct cfq_io_cq *cic = icq_to_cic(icq);
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *queue;

 	queue = cic->cfqq[BLK_RW_ASYNC];
 	if (queue) {
 		cfq_exit_cfqq(cfqd, queue);
 		cic->cfqq[BLK_RW_ASYNC] = NULL;
 	}

 	queue = cic->cfqq[BLK_RW_SYNC];
 	if (queue) {
 		cfq_exit_cfqq(cfqd, queue);
 		cic->cfqq[BLK_RW_SYNC] = NULL;
 	}
 }
 /*
  * Refresh @cfqq's ioprio and ioprio_class from cic->ioprio.  Does nothing
  * unless the queue is flagged prio_changed; the flag is cleared on the
  * way out.
  */
 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct cfq_io_cq *cic)
 {
 	int prio_class;

 	if (!cfq_cfqq_prio_changed(cfqq))
 		return;

 	prio_class = IOPRIO_PRIO_CLASS(cic->ioprio);

 	switch (prio_class) {
 	case IOPRIO_CLASS_RT:
 	case IOPRIO_CLASS_BE:
 		/* explicit RT/BE: level from the cic, class as requested */
 		cfqq->ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
 		cfqq->ioprio_class = prio_class;
 		break;
 	case IOPRIO_CLASS_IDLE:
 		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
 		cfqq->ioprio = 7;
 		cfq_clear_cfqq_idle_window(cfqq);
 		break;
 	default:
 		printk(KERN_ERR "cfq: bad prio %x\n", prio_class);
 		/* fall through: an unknown class is treated like "none" */
 	case IOPRIO_CLASS_NONE:
 		/*
 		 * no prio set, inherit CPU scheduling settings
 		 */
 		cfqq->ioprio = task_nice_ioprio(current);
 		cfqq->ioprio_class = task_nice_ioclass(current);
 		break;
 	}

 	/*
 	 * keep track of original prio settings in case we have to temporarily
 	 * elevate the priority of this queue
 	 */
 	cfqq->org_ioprio = cfqq->ioprio;
 	cfq_clear_cfqq_prio_changed(cfqq);
 }
 /*
  * Compare the io_context's current ioprio with the value cached in
  * cic->ioprio and, if they differ, react: the async queue (if any) is
  * replaced by one obtained from cfq_get_queue(), the sync queue (if any)
  * is flagged prio_changed, and the new ioprio is cached in the cic.
  */
 static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
 	int ioprio = cic->icq.ioc->ioprio;
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *cfqq;
 	/*
 	 * Check whether ioprio has changed.  The condition may trigger
 	 * spuriously on a newly created cic but there's no harm.
 	 */
 	if (unlikely(!cfqd) || likely(cic->ioprio == ioprio))
 		return;
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
 		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
 					 GFP_ATOMIC);
 		/* only swap, and drop the old reference, if a replacement exists */
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
 		}
 	}
 	/* the sync queue is kept; it is only flagged as prio_changed here */
 	cfqq = cic->cfqq[BLK_RW_SYNC];
 	if (cfqq)
 		cfq_mark_cfqq_prio_changed(cfqq);
 	cic->ioprio = ioprio;
 }
 /*
  * Set up the basic fields of @cfqq: back pointer, pid, zero refcount,
  * empty fifo and cleared rb nodes.  The queue is flagged prio_changed.
  * A sync queue is additionally flagged sync and, unless it is in the idle
  * class, idle_window.
  */
 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			  pid_t pid, bool is_sync)
 {
 	cfqq->cfqd = cfqd;
 	cfqq->pid = pid;
 	cfqq->ref = 0;

 	INIT_LIST_HEAD(&cfqq->fifo);
 	RB_CLEAR_NODE(&cfqq->rb_node);
 	RB_CLEAR_NODE(&cfqq->p_node);

 	cfq_mark_cfqq_prio_changed(cfqq);

 	if (!is_sync)
 		return;

 	if (!cfq_class_idle(cfqq))
 		cfq_mark_cfqq_idle_window(cfqq);
 	cfq_mark_cfqq_sync(cfqq);
 }
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 /*
  * Compare the id of the blkio cgroup @bio belongs to with the id cached
  * in cic->blkcg_id.  On a change, the cic lets go of its sync queue and
  * the new id is cached.
  */
 static void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio)
 {
 	struct cfq_data *cfqd = cic_to_cfqd(cic);
 	struct cfq_queue *old_sync;
 	uint64_t cur_id;

 	/* the cgroup is only looked at inside the RCU read-side section */
 	rcu_read_lock();
 	cur_id = bio_blkio_cgroup(bio)->id;
 	rcu_read_unlock();

 	/*
 	 * Check whether blkcg has changed.  The condition may trigger
 	 * spuriously on a newly created cic but there's no harm.
 	 */
 	if (unlikely(!cfqd) || likely(cic->blkcg_id == cur_id))
 		return;

 	old_sync = cic_to_cfqq(cic, 1);
 	if (old_sync) {
 		/*
 		 * Drop reference to sync queue. A new sync queue will be
 		 * assigned in new group upon arrival of a fresh request.
 		 */
 		cfq_log_cfqq(cfqd, old_sync, "changed cgroup");
 		cic_set_cfqq(cic, NULL, 1);
 		cfq_put_queue(old_sync);
 	}

 	cic->blkcg_id = cur_id;
 }
 #else
 /* Without group scheduling there is no cgroup change to track. */
 static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) { }
 #endif  /* CONFIG_CFQ_GROUP_IOSCHED */
 /*
  * Return the queue @cic has for the given sync-ness, allocating and
  * initialising a new one when there is none yet or when the current one
  * is the OOM fallback queue.
  *
  * If @gfp_mask contains __GFP_WAIT, the RCU read-side section and the
  * queue lock are dropped around the allocation and the whole lookup is
  * retried afterwards.  When no memory can be obtained, &cfqd->oom_cfqq is
  * returned instead of a fresh queue.
  *
  * The queue lock is held on entry and on return.
  */
 static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 		     struct bio *bio, gfp_t gfp_mask)
 {
 	struct blkio_cgroup *blkcg;
 	struct cfq_queue *cfqq, *new_cfqq = NULL;
 	struct cfq_group *cfqg;
 retry:
 	rcu_read_lock();
 	blkcg = bio_blkio_cgroup(bio);
 	cfqg = cfq_lookup_create_cfqg(cfqd, blkcg);
 	cfqq = cic_to_cfqq(cic, is_sync);
 	/*
 	 * Always try a new alloc if we fell back to the OOM cfqq
 	 * originally, since it should just be a temporary situation.
 	 */
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
 		cfqq = NULL;
 		if (new_cfqq) {
 			/* second pass: use the queue allocated before the retry */
 			cfqq = new_cfqq;
 			new_cfqq = NULL;
 		} else if (gfp_mask & __GFP_WAIT) {
 			/*
 			 * Leave the RCU section and drop the queue lock
 			 * around the allocation, then redo the lookup.
 			 */
 			rcu_read_unlock();
 			spin_unlock_irq(cfqd->queue->queue_lock);
 			new_cfqq = kmem_cache_alloc_node(cfq_pool,
 					gfp_mask | __GFP_ZERO,
 					cfqd->queue->node);
 			spin_lock_irq(cfqd->queue->queue_lock);
 			if (new_cfqq)
 				goto retry;
 			/*
 			 * Allocation failed.  The RCU read-side section was
 			 * already left above, so hand back the fallback queue
 			 * right here instead of falling through to the
 			 * rcu_read_unlock() at the end of the function, which
 			 * would unbalance the RCU nesting.
 			 */
 			return &cfqd->oom_cfqq;
 		} else {
 			cfqq = kmem_cache_alloc_node(cfq_pool,
 					gfp_mask | __GFP_ZERO,
 					cfqd->queue->node);
 		}
 		if (cfqq) {
 			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
 			cfq_init_prio_data(cfqq, cic);
 			cfq_link_cfqq_cfqg(cfqq, cfqg);
 			cfq_log_cfqq(cfqd, cfqq, "alloced");
 		} else
 			cfqq = &cfqd->oom_cfqq;
 	}
 	/* a queue showed up while the lock was dropped: the spare is unused */
 	if (new_cfqq)
 		kmem_cache_free(cfq_pool, new_cfqq);
 	rcu_read_unlock();
 	return cfqq;
 }
 /*
  * Return the address of the slot in @cfqd that holds the shared async
  * queue for the given class and level: async_cfqq[0][] for RT,
  * async_cfqq[1][] for BE (and for NONE, at level IOPRIO_NORM), and
  * async_idle_cfqq for IDLE.  Any other class is a BUG.
  */
 static struct cfq_queue **
 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 {
 	if (ioprio_class == IOPRIO_CLASS_RT)
 		return &cfqd->async_cfqq[0][ioprio];

 	if (ioprio_class == IOPRIO_CLASS_IDLE)
 		return &cfqd->async_idle_cfqq;

 	/* only NONE and BE are left as valid classes */
 	if (ioprio_class == IOPRIO_CLASS_NONE)
 		ioprio = IOPRIO_NORM;
 	else if (ioprio_class != IOPRIO_CLASS_BE)
 		BUG();

 	return &cfqd->async_cfqq[1][ioprio];
 }
 /*
  * Get a queue for @cic and take a reference on it for the caller.
  *
  * Async requests use the queue stored in the slot returned by
  * cfq_async_queue_prio(); if that slot is empty, or for sync requests,
  * the queue comes from cfq_find_alloc_queue().  A queue newly placed in
  * an async slot gets one additional reference for the slot itself.
  */
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
 	      struct bio *bio, gfp_t gfp_mask)
 {
 	struct cfq_queue **slot = NULL;
 	struct cfq_queue *queue = NULL;

 	if (!is_sync) {
 		slot = cfq_async_queue_prio(cfqd,
 					    IOPRIO_PRIO_CLASS(cic->ioprio),
 					    IOPRIO_PRIO_DATA(cic->ioprio));
 		queue = *slot;
 	}

 	if (queue == NULL)
 		queue = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);

 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
 	if (!is_sync && *slot == NULL) {
 		queue->ref++;
 		*slot = queue;
 	}

 	/* the caller's own reference */
 	queue->ref++;

 	return queue;
 }
 /*
  * Fold the time elapsed since ttime->last_end_request into the decaying
  * think-time statistics.  The elapsed time is capped at 2 * @slice_idle
  * before it is used.  ttime_mean is recomputed from the updated total and
  * sample weight.
  */
 static void
 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
 {
 	const unsigned long cap = 2UL * slice_idle;
 	unsigned long think = jiffies - ttime->last_end_request;

 	if (think > cap)
 		think = cap;

 	/* order matters: the mean is derived from the two updated values */
 	ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
 	ttime->ttime_total = (7*ttime->ttime_total + 256*think) / 8;
 	ttime->ttime_mean = (ttime->ttime_total + 128) / ttime->ttime_samples;
 }
 /*
  * Update the think-time statistics affected by a request on @cfqq: for a
  * sync queue those of the cic and of the queue's service tree (both with
  * cfq_slice_idle), and, with group scheduling built in, those of the
  * queue's group (with cfq_group_idle).
  */
 static void
 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 			struct cfq_io_cq *cic)
 {
 	if (cfq_cfqq_sync(cfqq)) {
 		struct cfq_ttime *st_ttime = &cfqq->service_tree->ttime;

 		__cfq_update_io_thinktime(&cic->ttime, cfqd->cfq_slice_idle);
 		__cfq_update_io_thinktime(st_ttime, cfqd->cfq_slice_idle);
 	}
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	__cfq_update_io_thinktime(&cfqq->cfqg->ttime, cfqd->cfq_group_idle);
 #endif
 }
 /*
  * Shift one more sample into cfqq->seek_history.  On a non-rotational
  * device the sample is set when @rq is shorter than CFQQ_SECT_THR_NONROT;
  * otherwise it is set when the distance from the queue's last request
  * position exceeds CFQQ_SEEK_THR.  A last position of 0 counts as
  * distance 0.
  */
 static void
 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct request *rq)
 {
 	const sector_t pos = blk_rq_pos(rq);
 	const sector_t prev = cfqq->last_request_pos;
 	const sector_t n_sec = blk_rq_sectors(rq);
 	sector_t sdist = 0;
 	int seeky;

 	if (prev)
 		sdist = (prev < pos) ? pos - prev : prev - pos;

 	if (blk_queue_nonrot(cfqd->queue))
 		seeky = (n_sec < CFQQ_SECT_THR_NONROT);
 	else
 		seeky = (sdist > CFQQ_SEEK_THR);

 	cfqq->seek_history <<= 1;
 	cfqq->seek_history |= seeky;
 }
 /*
  * Disable idle window if the process thinks too long or seeks so much that
  * it doesn't matter
  *
  * Re-evaluates the idle_window flag of a sync, non-idle-class @cfqq and
  * updates it (with a log message) only when the outcome differs from the
  * current flag.  May also flag the queue as "deep".
  */
 static void
 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		       struct cfq_io_cq *cic)
 {
 	int old_idle, enable_idle;
 	/*
 	 * Don't idle for async or idle io prio class
 	 */
 	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
 		return;
 	/* start from the current setting; it stays if no rule below applies */
 	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
 	/* four or more queued requests mark the queue as deep */
 	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
 		cfq_mark_cfqq_deep(cfqq);
 	/* the next request asks for no idling */
 	if (cfqq->next_rq && (cfqq->next_rq->cmd_flags & REQ_NOIDLE))
 		enable_idle = 0;
 	/*
 	 * no active reference on the io_context, idling switched off, or a
 	 * seeky queue that is not deep
 	 */
 	else if (!atomic_read(&cic->icq.ioc->active_ref) ||
 		 !cfqd->cfq_slice_idle ||
 		 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
 		enable_idle = 0;
 	/* with valid samples, idle only if the mean think time fits the idle period */
 	else if (sample_valid(cic->ttime.ttime_samples)) {
 		if (cic->ttime.ttime_mean > cfqd->cfq_slice_idle)
 			enable_idle = 0;
 		else
 			enable_idle = 1;
 	}
 	if (old_idle != enable_idle) {
 		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
 		if (enable_idle)
 			cfq_mark_cfqq_idle_window(cfqq);
 		else
 			cfq_clear_cfqq_idle_window(cfqq);
 	}
 }
 /*
  * Check if new_cfqq should preempt the currently active queue. Return 0 for
  * no or if we aren't sure, a 1 will cause a preempt.
  *
  * @rq is the request that was just added to @new_cfqq.  The checks below
  * are evaluated strictly in order; the first one that matches decides.
  */
 static bool
 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
 		   struct request *rq)
 {
 	struct cfq_queue *cfqq;
 	cfqq = cfqd->active_queue;
 	/* nothing is active, so there is nothing to preempt */
 	if (!cfqq)
 		return false;
 	/* an idle-class queue never preempts ... */
 	if (cfq_class_idle(new_cfqq))
 		return false;
 	/* ... and an active idle-class queue is always preempted */
 	if (cfq_class_idle(cfqq))
 		return true;
 	/*
 	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
 	 */
 	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
 		return false;
 	/*
 	 * if the new request is sync, but the currently running queue is
 	 * not, let the sync request have priority.
 	 */
 	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
 		return true;
 	/* past this point queues from different groups never preempt */
 	if (new_cfqq->cfqg != cfqq->cfqg)
 		return false;
 	/* the active queue has already used up its slice */
 	if (cfq_slice_used(cfqq))
 		return true;
 	/* Allow preemption only if we are idling on sync-noidle tree */
 	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
 	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
 	    new_cfqq->service_tree->count == 2 &&
 	    RB_EMPTY_ROOT(&cfqq->sort_list))
 		return true;
 	/*
 	 * So both queues are sync. Let the new request get disk time if
 	 * it's a metadata request and the current queue is doing regular IO.
 	 */
 	if ((rq->cmd_flags & REQ_PRIO) && !cfqq->prio_pending)
 		return true;
 	/*
 	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
 	 */
 	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
 		return true;
 	/* An idle queue should not be idle now for some reason */
 	if (RB_EMPTY_ROOT(&cfqq->sort_list) && !cfq_should_idle(cfqd, cfqq))
 		return true;
 	/* the closeness test below only applies while waiting on a request */
 	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
 		return false;
 	/*
 	 * if this request is as-good as one we would expect from the
 	 * current cfqq, let it preempt
 	 */
 	if (cfq_rq_close(cfqd, cfqq, rq))
 		return true;
 	return false;
 }
 /*
  * cfqq preempts the active queue. if we allowed preempt with no slice left,
  * let it have half of its nominal slice.
  *
  * Expires the currently active queue and re-inserts @cfqq at the front of
  * its service tree with a fresh slice.  @cfqq must already be on the
  * round-robin list (enforced with BUG_ON).
  */
 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	/* sample the workload type before the active queue is expired */
 	enum wl_type_t old_type = cfqq_type(cfqd->active_queue);
 	cfq_log_cfqq(cfqd, cfqq, "preempt");
 	cfq_slice_expired(cfqd, 1);
 	/*
 	 * workload type is changed, don't save slice, otherwise preempt
 	 * doesn't happen
 	 */
 	if (old_type != cfqq_type(cfqq))
 		cfqq->cfqg->saved_workload_slice = 0;
 	/*
 	 * Put the new queue at the front of the current list,
 	 * so we know that it will be selected next.
 	 */
 	BUG_ON(!cfq_cfqq_on_rr(cfqq));
 	cfq_service_tree_add(cfqd, cfqq, 1);
 	/* start over with a new, not yet computed slice */
 	cfqq->slice_end = 0;
 	cfq_mark_cfqq_slice_new(cfqq);
 }
 /*
  * Called when a new fs request (rq) is added (to cfqq). Check if there's
  * something we should do about it
  *
  * Updates the queued counters and the think-time, seek and idle-window
  * statistics, records where @rq ends as the queue's last position, and then
  * either kicks dispatching (when @cfqq is the active queue and was being
  * waited on) or preempts the active queue if cfq_should_preempt() says so.
  */
 static void
 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
 		struct request *rq)
 {
 	struct cfq_io_cq *cic = RQ_CIC(rq);
 	cfqd->rq_queued++;
 	if (rq->cmd_flags & REQ_PRIO)
 		cfqq->prio_pending++;
 	cfq_update_io_thinktime(cfqd, cfqq, cic);
 	cfq_update_io_seektime(cfqd, cfqq, rq);
 	cfq_update_idle_window(cfqd, cfqq, cic);
 	/* the seek statistics above used the previous position; now advance it */
 	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
 	if (cfqq == cfqd->active_queue) {
 		/*
 		 * Remember that we saw a request from this process, but
 		 * don't start queuing just yet. Otherwise we risk seeing lots
 		 * of tiny requests, because we disrupt the normal plugging
 		 * and merging. If the request is already larger than a single
 		 * page, let it rip immediately. For that case we assume that
 		 * merging is already done. Ditto for a busy system that
 		 * has other work pending, don't risk delaying until the
 		 * idle timer unplug to continue working.
 		 */
 		if (cfq_cfqq_wait_request(cfqq)) {
 			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
 			    cfqd->busy_queues > 1) {
 				cfq_del_timer(cfqd, cfqq);
 				cfq_clear_cfqq_wait_request(cfqq);
 				__blk_run_queue(cfqd->queue);
 			} else {
 				cfqg_stats_update_idle_time(cfqq->cfqg);
 				cfq_mark_cfqq_must_dispatch(cfqq);
 			}
 		}
 	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
 		/*
 		 * not the active queue - expire current slice if it is
 		 * idle and has expired its mean thinktime or this new queue
 		 * has some old slice time left and is of higher priority or
 		 * this new queue is RT and the current one is BE
 		 */
 		cfq_preempt_queue(cfqd, cfqq);
 		__blk_run_queue(cfqd->queue);
 	}
 }
 /*
  * Elevator add-request hook (installed as .elevator_add_req_fn in
  * iosched_cfq).  Stamps @rq with its FIFO expiry time, appends it to the
  * owning queue's FIFO list and sort tree, updates the group statistics and
  * finally lets cfq_rq_enqueued() decide whether to dispatch or preempt.
  */
 static void cfq_insert_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	cfq_log_cfqq(cfqd, cfqq, "insert_request");
 	cfq_init_prio_data(cfqq, RQ_CIC(rq));
 	/* cfq_fifo_expire[] is indexed by sync-ness: [0] async, [1] sync */
 	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
 	list_add_tail(&rq->queuelist, &cfqq->fifo);
 	cfq_add_rq_rb(rq);
 	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
 				 rq->cmd_flags);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 /*
  * Update hw_tag based on peak queue depth over 50 samples under
  * sufficient load.
  *
  * Once hw_tag has been set to 1 it is never re-evaluated here; otherwise a
  * verdict is reached only after more than 50 qualifying samples.
  */
 static void cfq_update_hw_tag(struct cfq_data *cfqd)
 {
 	struct cfq_queue *active = cfqd->active_queue;
 	/* remember the deepest driver queue observed so far */
 	if (cfqd->hw_tag_est_depth < cfqd->rq_in_driver)
 		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
 	if (cfqd->hw_tag == 1)
 		return;
 	/* too little load for this sample to say anything */
 	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
 	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
 		return;
 	/*
 	 * If active queue hasn't enough requests and can idle, cfq might not
 	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
 	 * case
 	 */
 	if (active && cfq_cfqq_idle_window(active)) {
 		if (active->dispatched + active->queued[0] +
 		    active->queued[1] < CFQ_HW_QUEUE_MIN &&
 		    cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
 			return;
 	}
 	if (cfqd->hw_tag_samples++ < 50)
 		return;
 	cfqd->hw_tag = (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) ? 1 : 0;
 }
 /*
  * Decide whether the active queue @cfqq, instead of being expired, should
  * be kept around waiting for its next request ("wait busy").  Returns true
  * to wait.  Only considered when @cfqq has nothing queued, is the sole
  * queue in its group, and the group's think time is not big.
  */
 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
 {
 	struct cfq_io_cq *cic = cfqd->active_cic;
 	/* If the queue already has requests, don't wait */
 	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 		return false;
 	/* If there are other queues in the group, don't wait */
 	if (cfqq->cfqg->nr_cfqq > 1)
 		return false;
 	/* the only queue in the group, but think time is big */
 	if (cfq_io_thinktime_big(cfqd, &cfqq->cfqg->ttime, true))
 		return false;
 	if (cfq_slice_used(cfqq))
 		return true;
 	/* if slice left is less than think time, wait busy */
 	if (cic && sample_valid(cic->ttime.ttime_samples)
 	    && (cfqq->slice_end - jiffies < cic->ttime.ttime_mean))
 		return true;
 	/*
 	 * If think time is less than a jiffy then ttime_mean=0 and above
 	 * will not be true. It might happen that slice has not expired yet
 	 * but will expire soon (4-5 ns) during select_queue(). To cover the
 	 * case where think time is less than a jiffy, mark the queue wait
 	 * busy if only 1 jiffy is left in the slice.
 	 */
 	if (cfqq->slice_end - jiffies == 1)
 		return true;
 	return false;
 }
 /*
  * Elevator completion hook (installed as .elevator_completed_req_fn in
  * iosched_cfq).  Accounts for the finished request @rq: updates the hw_tag
  * estimate, the in-driver/dispatched counters and the completion
  * statistics, records the completion time for think-time tracking, and, if
  * @rq belonged to the active queue, decides whether to extend the slice
  * (wait busy), expire the queue, or arm the idle timer.  Schedules a
  * dispatch when the driver has no requests left.
  */
 static void cfq_completed_request(struct request_queue *q, struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	struct cfq_data *cfqd = cfqq->cfqd;
 	const int sync = rq_is_sync(rq);
 	unsigned long now;
 	now = jiffies;
 	cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d",
 		     !!(rq->cmd_flags & REQ_NOIDLE));
 	/* sampled before rq_in_driver is decremented below */
 	cfq_update_hw_tag(cfqd);
 	WARN_ON(!cfqd->rq_in_driver);
 	WARN_ON(!cfqq->dispatched);
 	cfqd->rq_in_driver--;
 	cfqq->dispatched--;
 	(RQ_CFQG(rq))->dispatched--;
 	cfqg_stats_update_completion(cfqq->cfqg, rq_start_time_ns(rq),
 				     rq_io_start_time_ns(rq), rq->cmd_flags);
 	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
 	if (sync) {
 		struct cfq_rb_root *service_tree;
 		RQ_CIC(rq)->ttime.last_end_request = now;
 		/*
 		 * use the tree the queue currently sits on if it is on the
 		 * rr list, otherwise look the tree up from group/prio/type
 		 */
 		if (cfq_cfqq_on_rr(cfqq))
 			service_tree = cfqq->service_tree;
 		else
 			service_tree = service_tree_for(cfqq->cfqg,
 				cfqq_prio(cfqq), cfqq_type(cfqq));
 		service_tree->ttime.last_end_request = now;
 		/* the request took at least the sync fifo expiry to complete */
 		if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
 			cfqd->last_delayed_sync = now;
 	}
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	cfqq->cfqg->ttime.last_end_request = now;
 #endif
 	/*
 	 * If this is the active queue, check if it needs to be expired,
 	 * or if we want to idle in case it has no pending requests.
 	 */
 	if (cfqd->active_queue == cfqq) {
 		const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
 		if (cfq_cfqq_slice_new(cfqq)) {
 			cfq_set_prio_slice(cfqd, cfqq);
 			cfq_clear_cfqq_slice_new(cfqq);
 		}
 		/*
 		 * Should we wait for next request to come in before we expire
 		 * the queue.
 		 */
 		if (cfq_should_wait_busy(cfqd, cfqq)) {
 			unsigned long extend_sl = cfqd->cfq_slice_idle;
 			/* slice idling disabled: fall back to the group idle time */
 			if (!cfqd->cfq_slice_idle)
 				extend_sl = cfqd->cfq_group_idle;
 			cfqq->slice_end = jiffies + extend_sl;
 			cfq_mark_cfqq_wait_busy(cfqq);
 			cfq_log_cfqq(cfqd, cfqq, "will busy wait");
 		}
 		/*
 		 * Idling is not enabled on:
 		 * - expired queues
 		 * - idle-priority queues
 		 * - async queues
 		 * - queues with still some requests queued
 		 * - when there is a close cooperator
 		 */
 		if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
 			cfq_slice_expired(cfqd, 1);
 		else if (sync && cfqq_empty &&
 			 !cfq_close_cooperator(cfqd, cfqq)) {
 			cfq_arm_slice_timer(cfqd);
 		}
 	}
 	if (!cfqd->rq_in_driver)
 		cfq_schedule_dispatch(cfqd);
 }
 /*
  * Returns ELV_MQUEUE_MUST the first time it is asked about a queue that is
  * being waited on (and flags the queue must_alloc_slice so the next call
  * does not); ELV_MQUEUE_MAY in every other case.
  */
 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
 {
 	if (!cfq_cfqq_wait_request(cfqq) || cfq_cfqq_must_alloc_slice(cfqq))
 		return ELV_MQUEUE_MAY;
 	cfq_mark_cfqq_must_alloc_slice(cfqq);
 	return ELV_MQUEUE_MUST;
 }
 /*
  * Elevator may-queue hook (installed as .elevator_may_queue_fn in
  * iosched_cfq).  Looks up, without creating, the current task's queue for
  * direction @rw and returns its verdict; ELV_MQUEUE_MAY when the task has
  * no cic or no queue yet.
  */
 static int cfq_may_queue(struct request_queue *q, int rw)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic;
 	struct cfq_queue *cfqq;
 	/*
 	 * don't force setup of a queue from here, as a call to may_queue
 	 * does not necessarily imply that a request actually will be queued.
 	 * so just lookup a possibly existing queue, or return 'may queue'
 	 * if that fails
 	 */
 	cic = cfq_cic_lookup(cfqd, current->io_context);
 	if (!cic)
 		return ELV_MQUEUE_MAY;
 	cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
 	if (!cfqq)
 		return ELV_MQUEUE_MAY;
 	cfq_init_prio_data(cfqq, cic);
 	return __cfq_may_queue(cfqq);
 }
 /*
  * queue lock held here
  *
  * Releases what cfq_set_request() attached to @rq: the per-direction
  * allocation count, the group reference and the queue reference.  The
  * private pointers are cleared before the queue reference is dropped.
  * Does nothing if @rq has no cfq queue attached.
  */
 static void cfq_put_request(struct request *rq)
 {
 	struct cfq_queue *cfqq = RQ_CFQQ(rq);
 	int dir;
 	if (!cfqq)
 		return;
 	dir = rq_data_dir(rq);
 	BUG_ON(!cfqq->allocated[dir]);
 	cfqq->allocated[dir]--;
 	/* Put down rq reference on cfqg */
 	cfqg_put(RQ_CFQG(rq));
 	rq->elv.priv[0] = NULL;
 	rq->elv.priv[1] = NULL;
 	cfq_put_queue(cfqq);
 }
 /*
  * Carry out a merge scheduled on @cfqq: point the sync slot of @cic at
  * cfqq->new_cfqq, mark that queue as cooperating, and drop one reference
  * on @cfqq.  Returns the queue now installed in @cic's sync slot.
  */
 static struct cfq_queue *
 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_cq *cic,
 		struct cfq_queue *cfqq)
 {
 	struct cfq_queue *target = cfqq->new_cfqq;
 	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", target);
 	cic_set_cfqq(cic, target, 1);
 	cfq_mark_cfqq_coop(target);
 	/* @cfqq must not be touched after this put */
 	cfq_put_queue(cfqq);
 	return cic_to_cfqq(cic, 1);
 }
 /*
  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
  * was the last process referring to said cfqq.
  *
  * In the NULL case the sync slot of @cic is cleared and both the cooperator
  * link and one reference on @cfqq are dropped.  Otherwise @cfqq is taken
  * over by the current process and its coop/split_coop flags are cleared.
  */
 static struct cfq_queue *
 split_cfqq(struct cfq_io_cq *cic, struct cfq_queue *cfqq)
 {
 	if (cfqq_process_refs(cfqq) != 1) {
 		/* still shared: detach this context and let the caller allocate */
 		cic_set_cfqq(cic, NULL, 1);
 		cfq_put_cooperator(cfqq);
 		cfq_put_queue(cfqq);
 		return NULL;
 	}
 	cfqq->pid = current->pid;
 	cfq_clear_cfqq_coop(cfqq);
 	cfq_clear_cfqq_split_coop(cfqq);
 	return cfqq;
 }
 /*
  * Allocate cfq data structures associated with this request.
  *
  * Elevator set-request hook (installed as .elevator_set_req_fn in
  * iosched_cfq).  Takes q->queue_lock itself.  Finds or creates the cfq
  * queue for the request's io context and sync-ness, performs any pending
  * split or merge of cooperating queues, then takes a queue and a group
  * reference and stores both pointers in rq->elv.priv[].  Always returns 0.
  */
 static int
 cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 		gfp_t gfp_mask)
 {
 	struct cfq_data *cfqd = q->elevator->elevator_data;
 	struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
 	const int rw = rq_data_dir(rq);
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
 	might_sleep_if(gfp_mask & __GFP_WAIT);
 	spin_lock_irq(q->queue_lock);
 	check_ioprio_changed(cic, bio);
 	check_blkcg_changed(cic, bio);
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	/* no queue yet, or only the OOM fallback: try to get a real one */
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
 		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
 		 * If the queue was seeky for too long, break it apart.
 		 */
 		if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
 			cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
 			cfqq = split_cfqq(cic, cfqq);
 			/* split_cfqq() cleared the cic slot: redo the lookup */
 			if (!cfqq)
 				goto new_queue;
 		}
 		/*
 		 * Check to see if this queue is scheduled to merge with
 		 * another, closely cooperating queue.  The merging of
 		 * queues happens here as it must be done in process context.
 		 * The reference on new_cfqq was taken in merge_cfqqs.
 		 */
 		if (cfqq->new_cfqq)
 			cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
 	}
 	/* these are undone again in cfq_put_request() */
 	cfqq->allocated[rw]++;
 	cfqq->ref++;
 	cfqg_get(cfqq->cfqg);
 	rq->elv.priv[0] = cfqq;
 	rq->elv.priv[1] = cfqq->cfqg;
 	spin_unlock_irq(q->queue_lock);
 	return 0;
 }
 /*
  * Work handler for cfqd->unplug_work (see INIT_WORK in cfq_init_queue()):
  * runs the request queue with the queue lock held.
  */
 static void cfq_kick_queue(struct work_struct *work)
 {
 	struct cfq_data *cfqd;
 	struct request_queue *q;
 	cfqd = container_of(work, struct cfq_data, unplug_work);
 	q = cfqd->queue;
 	spin_lock_irq(q->queue_lock);
 	__blk_run_queue(q);
 	spin_unlock_irq(q->queue_lock);
 }
 /*
  * Timer running if the active_queue is currently idling inside its time slice
  *
  * @data is the struct cfq_data pointer stored in idle_slice_timer.data by
  * cfq_init_queue().  Runs under the queue lock.  Depending on the state of
  * the active queue this either expires the slice and schedules a dispatch,
  * only schedules a dispatch, or does nothing at all.
  */
 static void cfq_idle_slice_timer(unsigned long data)
 {
 	struct cfq_data *cfqd = (struct cfq_data *) data;
 	struct cfq_queue *cfqq;
 	unsigned long flags;
 	/* stays 1 only when there is no active queue */
 	int timed_out = 1;
 	cfq_log(cfqd, "idle timer fired");
 	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
 	cfqq = cfqd->active_queue;
 	if (cfqq) {
 		timed_out = 0;
 		/*
 		 * We saw a request before the queue expired, let it through
 		 */
 		if (cfq_cfqq_must_dispatch(cfqq))
 			goto out_kick;
 		/*
 		 * expired
 		 */
 		if (cfq_slice_used(cfqq))
 			goto expire;
 		/*
 		 * only expire and reinvoke request handler, if there are
 		 * other queues with pending requests
 		 */
 		if (!cfqd->busy_queues)
 			goto out_cont;
 		/*
 		 * not expired and it has a request pending, let it dispatch
 		 */
 		if (!RB_EMPTY_ROOT(&cfqq->sort_list))
 			goto out_kick;
 		/*
 		 * Queue depth flag is reset only when the idle didn't succeed
 		 */
 		cfq_clear_cfqq_deep(cfqq);
 	}
 expire:
 	cfq_slice_expired(cfqd, timed_out);
 out_kick:
 	cfq_schedule_dispatch(cfqd);
 out_cont:
 	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
 }
 /*
  * Stop the deferred activity owned by @cfqd: synchronously delete the idle
  * slice timer, then synchronously cancel the unplug work item.
  */
 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
 {
 	del_timer_sync(&cfqd->idle_slice_timer);
 	cancel_work_sync(&cfqd->unplug_work);
 }
 /*
  * Drop one reference on every async queue cached in @cfqd: both rows of
  * async_cfqq[][] for each of the IOPRIO_BE_NR priority levels, plus the
  * single async idle-class queue.  Empty slots are skipped.
  */
 static void cfq_put_async_queues(struct cfq_data *cfqd)
 {
 	int prio, row;
 	for (prio = 0; prio < IOPRIO_BE_NR; prio++) {
 		for (row = 0; row < 2; row++) {
 			struct cfq_queue *cfqq = cfqd->async_cfqq[row][prio];
 			if (cfqq)
 				cfq_put_queue(cfqq);
 		}
 	}
 	if (cfqd->async_idle_cfqq)
 		cfq_put_queue(cfqd->async_idle_cfqq);
 }
 /*
  * Elevator exit hook (installed as .elevator_exit_fn in iosched_cfq).
  * Stops the timer and work item, expires the active queue and drops the
  * cached async queues under the queue lock, stops the timer and work item
  * a second time, and finally frees the scheduler data.
  */
 static void cfq_exit_queue(struct elevator_queue *e)
 {
 	struct cfq_data *cfqd = e->elevator_data;
 	struct request_queue *q = cfqd->queue;
 	cfq_shutdown_timer_wq(cfqd);
 	spin_lock_irq(q->queue_lock);
 	if (cfqd->active_queue)
 		__cfq_slice_expired(cfqd, cfqd->active_queue, 0);
 	cfq_put_async_queues(cfqd);
 	spin_unlock_irq(q->queue_lock);
 	/*
 	 * NOTE(review): presumably repeated because the expiry above may
 	 * re-arm the timer or re-queue the work - confirm.
 	 */
 	cfq_shutdown_timer_wq(cfqd);
 	/* without group scheduling the root group was kzalloc'ed by cfq_init_queue() */
 #ifndef CONFIG_CFQ_GROUP_IOSCHED
 	kfree(cfqd->root_group);
 #endif
 	update_root_blkg_pd(q, &blkio_policy_cfq);
 	kfree(cfqd);
 }
 /*
  * Elevator init hook (installed as .elevator_init_fn in iosched_cfq).
  * Allocates and zeroes the per-queue scheduler data on the queue's node,
  * sets up the root group (via blkcg when CONFIG_CFQ_GROUP_IOSCHED is set,
  * otherwise with a private allocation), initialises the fallback
  * oom_cfqq, the idle timer and the unplug work, and loads the default
  * tunables.  Returns 0 on success or -ENOMEM if the scheduler data or the
  * root group cannot be obtained.
  */
 static int cfq_init_queue(struct request_queue *q)
 {
 	struct cfq_data *cfqd;
 	struct blkio_group *blkg __maybe_unused;
 	int i;
 	cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
 	if (!cfqd)
 		return -ENOMEM;
 	cfqd->queue = q;
 	q->elevator->elevator_data = cfqd;
 	/* Init root service tree */
 	cfqd->grp_service_tree = CFQ_RB_ROOT;
 	/* Init root group and prefer root group over other groups by default */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	rcu_read_lock();
 	spin_lock_irq(q->queue_lock);
 	blkg = blkg_lookup_create(&blkio_root_cgroup, q, true);
-	if (!IS_ERR(blkg))
+	if (!IS_ERR(blkg)) {
+		q->root_blkg = blkg;
 		cfqd->root_group = blkg_to_cfqg(blkg);
+	}
 	spin_unlock_irq(q->queue_lock);
 	rcu_read_unlock();
 #else
 	cfqd->root_group = kzalloc_node(sizeof(*cfqd->root_group),
 					GFP_KERNEL, cfqd->queue->node);
 	if (cfqd->root_group)
 		cfq_init_cfqg_base(cfqd->root_group);
 #endif
 	/* cfqd was zeroed on allocation, so this is NULL if either path failed */
 	if (!cfqd->root_group) {
 		kfree(cfqd);
 		return -ENOMEM;
 	}
 	cfqd->root_group->weight = 2 * CFQ_WEIGHT_DEFAULT;
 	/*
 	 * Not strictly needed (since RB_ROOT just clears the node and we
 	 * zeroed cfqd on alloc), but better be safe in case someone decides
 	 * to add magic to the rb code
 	 */
 	for (i = 0; i < CFQ_PRIO_LISTS; i++)
 		cfqd->prio_trees[i] = RB_ROOT;
 	/*
 	 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
 	 * Grab a permanent reference to it, so that the normal code flow
 	 * will not attempt to free it.  oom_cfqq is linked to root_group
 	 * but shouldn't hold a reference as it'll never be unlinked.  Lose
 	 * the reference from linking right away.
 	 */
 	cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
 	cfqd->oom_cfqq.ref++;
 	spin_lock_irq(q->queue_lock);
 	cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, cfqd->root_group);
 	cfqg_put(cfqd->root_group);
 	spin_unlock_irq(q->queue_lock);
 	init_timer(&cfqd->idle_slice_timer);
 	cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
 	cfqd->idle_slice_timer.data = (unsigned long) cfqd;
 	INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
 	/* per-queue tunables start out as copies of the file-level defaults */
 	cfqd->cfq_quantum = cfq_quantum;
 	cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
 	cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
 	cfqd->cfq_back_max = cfq_back_max;
 	cfqd->cfq_back_penalty = cfq_back_penalty;
 	cfqd->cfq_slice[0] = cfq_slice_async;
 	cfqd->cfq_slice[1] = cfq_slice_sync;
 	cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
 	cfqd->cfq_slice_idle = cfq_slice_idle;
 	cfqd->cfq_group_idle = cfq_group_idle;
 	cfqd->cfq_latency = 1;
 	/* neither 0 nor 1: cfq_update_hw_tag() has not reached a verdict yet */
 	cfqd->hw_tag = -1;
 	/*
 	 * we optimistically start assuming sync ops weren't delayed in last
 	 * second, in order to have larger depth for async operations.
 	 */
 	cfqd->last_delayed_sync = jiffies - HZ;
 	return 0;
 }
 /*
  * sysfs parts below -->
  */
 /*
  * Format @var as an unsigned decimal number followed by a newline into
  * @page.  Returns the number of characters written, as reported by
  * sprintf().
  */
 static ssize_t
 cfq_var_show(unsigned int var, char *page)
 {
 	/* @var is unsigned: "%u" keeps values above INT_MAX from printing negative */
 	return sprintf(page, "%u\n", var);
 }
 /*
  * Parse a base-10 number from the start of @page into *@var.  The position
  * where parsing stopped is not examined, and @count is always returned, so
  * the whole write is reported as consumed.
  */
 static ssize_t
 cfq_var_store(unsigned int *var, const char *page, size_t count)
 {
 	char *end;
 	*var = simple_strtoul(page, &end, 10);
 	return count;
 }
 /*
  * SHOW_FUNCTION(__FUNC, __VAR, __CONV) generates a sysfs "show" handler
  * named __FUNC that prints the tunable __VAR (an expression that may refer
  * to the local 'cfqd') through cfq_var_show().  When __CONV is non-zero the
  * value is converted from jiffies to milliseconds first.
  */
 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV)				\
 static ssize_t __FUNC(struct elevator_queue *e, char *page)		\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data = __VAR;					\
 	if (__CONV)							\
 		__data = jiffies_to_msecs(__data);			\
 	return cfq_var_show(__data, (page));				\
 }
 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
 SHOW_FUNCTION(cfq_group_idle_show, cfqd->cfq_group_idle, 1);
 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
 #undef SHOW_FUNCTION
 /*
  * STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) generates a sysfs "store"
  * handler named __FUNC.  The written value is parsed with cfq_var_store(),
  * clamped to [MIN, MAX], and stored through __PTR (an expression that may
  * refer to the local 'cfqd').  When __CONV is non-zero the value is taken
  * to be in milliseconds and converted to jiffies before being stored.  The
  * handler returns whatever cfq_var_store() returned.
  */
 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV)			\
 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)	\
 {									\
 	struct cfq_data *cfqd = e->elevator_data;			\
 	unsigned int __data;						\
 	int ret = cfq_var_store(&__data, (page), count);		\
 	if (__data < (MIN))						\
 		__data = (MIN);						\
 	else if (__data > (MAX))					\
 		__data = (MAX);						\
 	if (__CONV)							\
 		*(__PTR) = msecs_to_jiffies(__data);			\
 	else								\
 		*(__PTR) = __data;					\
 	return ret;							\
 }
 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
 		UINT_MAX, 1);
 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
 		UINT_MAX, 1);
 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_group_idle_store, &cfqd->cfq_group_idle, 0, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
 		UINT_MAX, 0);
 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
 #undef STORE_FUNCTION
 /*
  * CFQ_ATTR(name) builds one attribute entry called 'name', readable by
  * everyone and writable by the owner, wired to the cfq_<name>_show and
  * cfq_<name>_store handlers generated above.
  */
 #define CFQ_ATTR(name) \
 	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
 /* Tunables exported by the scheduler; the list ends with __ATTR_NULL. */
 static struct elv_fs_entry cfq_attrs[] = {
 	CFQ_ATTR(quantum),
 	CFQ_ATTR(fifo_expire_sync),
 	CFQ_ATTR(fifo_expire_async),
 	CFQ_ATTR(back_seek_max),
 	CFQ_ATTR(back_seek_penalty),
 	CFQ_ATTR(slice_sync),
 	CFQ_ATTR(slice_async),
 	CFQ_ATTR(slice_async_rq),
 	CFQ_ATTR(slice_idle),
 	CFQ_ATTR(group_idle),
 	CFQ_ATTR(low_latency),
 	__ATTR_NULL
 };
 /*
  * Elevator description registered by cfq_init(): the callback table, the
  * size and alignment of the per-io-context data (struct cfq_io_cq), the
  * attribute list and the scheduler name "cfq".
  */
 static struct elevator_type iosched_cfq = {
 	.ops = {
 		.elevator_merge_fn = 		cfq_merge,
 		.elevator_merged_fn =		cfq_merged_request,
 		.elevator_merge_req_fn =	cfq_merged_requests,
 		.elevator_allow_merge_fn =	cfq_allow_merge,
 		.elevator_bio_merged_fn =	cfq_bio_merged,
 		.elevator_dispatch_fn =		cfq_dispatch_requests,
 		.elevator_add_req_fn =		cfq_insert_request,
 		.elevator_activate_req_fn =	cfq_activate_request,
 		.elevator_deactivate_req_fn =	cfq_deactivate_request,
 		.elevator_completed_req_fn =	cfq_completed_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
 		.elevator_latter_req_fn =	elv_rb_latter_request,
 		.elevator_init_icq_fn =		cfq_init_icq,
 		.elevator_exit_icq_fn =		cfq_exit_icq,
 		.elevator_set_req_fn =		cfq_set_request,
 		.elevator_put_req_fn =		cfq_put_request,
 		.elevator_may_queue_fn =	cfq_may_queue,
 		.elevator_init_fn =		cfq_init_queue,
 		.elevator_exit_fn =		cfq_exit_queue,
 	},
 	.icq_size	=	sizeof(struct cfq_io_cq),
 	.icq_align	=	__alignof__(struct cfq_io_cq),
 	.elevator_attrs =	cfq_attrs,
 	.elevator_name	=	"cfq",
 	.elevator_owner =	THIS_MODULE,
 };
 /*
  * blkio policy description used when group scheduling is configured:
  * the group init and stats-reset callbacks, the size of the per-group
  * policy data (struct cfq_group) and the cgroup file table.
  */
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 static struct blkio_policy_type blkio_policy_cfq = {
 	.ops = {
 		.blkio_init_group_fn =		cfq_init_blkio_group,
 		.blkio_reset_group_stats_fn =	cfqg_stats_reset,
 	},
 	.pdata_size = sizeof(struct cfq_group),
 	.cftypes = cfq_blkcg_files,
 };
 #endif
 /*
  * Module init: fix up tunable defaults that rounded down to zero, register
  * the blkio policy, create the cfq_queue slab cache and register the
  * elevator.  Returns 0 on success or a negative errno; on failure
  * everything set up so far is undone again.
  */
 static int __init cfq_init(void)
 {
 	int ret;
 	/*
 	 * could be 0 on HZ < 1000 setups
 	 */
 	if (!cfq_slice_async)
 		cfq_slice_async = 1;
 	if (!cfq_slice_idle)
 		cfq_slice_idle = 1;
 #ifdef CONFIG_CFQ_GROUP_IOSCHED
 	if (!cfq_group_idle)
 		cfq_group_idle = 1;
 #else
 	/* no group scheduling, so never idle on a group */
 	cfq_group_idle = 0;
 #endif
 	ret = blkio_policy_register(&blkio_policy_cfq);
 	if (ret)
 		return ret;
 	cfq_pool = KMEM_CACHE(cfq_queue, 0);
 	if (!cfq_pool) {
 		/*
 		 * ret still holds 0 from the successful registration above;
 		 * without this the failure would be reported as success.
 		 */
 		ret = -ENOMEM;
 		goto err_pol_unreg;
 	}
 	ret = elv_register(&iosched_cfq);
 	if (ret)
 		goto err_free_pool;
 	return 0;
 err_free_pool:
 	kmem_cache_destroy(cfq_pool);
 err_pol_unreg:
 	blkio_policy_unregister(&blkio_policy_cfq);
 	return ret;
 }
 /*
  * Module exit: unregister the blkio policy and the elevator, then destroy
  * the cfq_queue slab cache.
  */
 static void __exit cfq_exit(void)
 {
 	blkio_policy_unregister(&blkio_policy_cfq);
 	elv_unregister(&iosched_cfq);
 	kmem_cache_destroy(cfq_pool);
 }
 /* Module entry/exit points and module metadata. */
 module_init(cfq_init);
 module_exit(cfq_exit);
 MODULE_AUTHOR("Jens Axboe");
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");

 #ifndef _LINUX_BLKDEV_H
 #define _LINUX_BLKDEV_H
 #ifdef CONFIG_BLOCK
 #include <linux/sched.h>
 #include <linux/major.h>
 #include <linux/genhd.h>
 #include <linux/list.h>
 #include <linux/timer.h>
 #include <linux/workqueue.h>
 #include <linux/pagemap.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
 #include <linux/mempool.h>
 #include <linux/bio.h>
 #include <linux/stringify.h>
 #include <linux/gfp.h>
 #include <linux/bsg.h>
 #include <linux/smp.h>
 #include <asm/scatterlist.h>
 /* Forward declarations of structure types referred to below. */
 struct module;
 struct scsi_ioctl_command;
 struct request_queue;
 struct elevator_queue;
 struct request_pm_state;
 struct blk_trace;
 struct request;
 struct sg_io_hdr;
 struct bsg_job;
+struct blkio_group;
 #define BLKDEV_MIN_RQ	4
 #define BLKDEV_MAX_RQ	128	/* Default maximum */
 /*
  * Maximum number of blkcg policies allowed to be registered concurrently.
  * Defined here to simplify include dependency.
  */
 #define BLKCG_MAX_POLS		2
 struct request;
 /* type of the completion callback stored in struct request's end_io field */
 typedef void (rq_end_io_fn)(struct request *, int);
 /*
  * Per-queue request allocation state (embedded in struct request_queue as
  * the 'rq' member): counters, wait queues and the mempool requests are
  * allocated from.
  */
 struct request_list {
 	/*
 	 * count[], starved[], and wait[] are indexed by
 	 * BLK_RW_SYNC/BLK_RW_ASYNC
 	 */
 	int count[2];
 	int starved[2];
 	int elvpriv;
 	mempool_t *rq_pool;
 	wait_queue_head_t wait[2];
 };
 /*
  * request command types
  *
  * Stored in struct request's cmd_type field.  Numbering starts at 1, so a
  * zeroed request does not carry a valid type.
  */
 enum rq_cmd_type_bits {
 	REQ_TYPE_FS		= 1,	/* fs request */
 	REQ_TYPE_BLOCK_PC,		/* scsi command */
 	REQ_TYPE_SENSE,			/* sense request */
 	REQ_TYPE_PM_SUSPEND,		/* suspend request */
 	REQ_TYPE_PM_RESUME,		/* resume request */
 	REQ_TYPE_PM_SHUTDOWN,		/* shutdown request */
 	REQ_TYPE_SPECIAL,		/* driver defined type */
 	/*
 	 * for ATA/ATAPI devices. this really doesn't belong here, ide should
 	 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
 	 * private REQ_LB opcodes to differentiate what type of request this is
 	 */
 	REQ_TYPE_ATA_TASKFILE,
 	REQ_TYPE_ATA_PC,
 };
 /* size in bytes of the command buffer (__cmd[]) embedded in struct request */
 #define BLK_MAX_CDB	16
 /*
  * try to put the fields that are referenced together in the same cacheline.
  * if you modify this structure, be sure to check block/blk-core.c:blk_rq_init()
  * as well!
  */
 struct request {
 	struct list_head queuelist;
 	struct call_single_data csd;
 	struct request_queue *q;
 	unsigned int cmd_flags;
 	enum rq_cmd_type_bits cmd_type;
 	unsigned long atomic_flags;
 	int cpu;
 	/* the following two fields are internal, NEVER access directly */
 	unsigned int __data_len;	/* total data len */
 	sector_t __sector;		/* sector cursor */
 	struct bio *bio;
 	struct bio *biotail;
 	struct hlist_node hash;	/* merge hash */
 	/*
 	 * The rb_node is only used inside the io scheduler, requests
 	 * are pruned when moved to the dispatch queue. So let the
 	 * completion_data share space with the rb_node.
 	 */
 	union {
 		struct rb_node rb_node;	/* sort/lookup */
 		void *completion_data;
 	};
 	/*
 	 * Three pointers are available for the IO schedulers, if they need
 	 * more they have to dynamically allocate it.  Flush requests are
 	 * never put on the IO scheduler. So let the flush fields share
 	 * space with the elevator data.
 	 */
 	union {
 		struct {
 			struct io_cq		*icq;
 			void			*priv[2];
 		} elv;
 		struct {
 			unsigned int		seq;
 			struct list_head	list;
 			rq_end_io_fn		*saved_end_io;
 		} flush;
 	};
 	struct gendisk *rq_disk;
 	struct hd_struct *part;
 	unsigned long start_time;
 #ifdef CONFIG_BLK_CGROUP
 	unsigned long long start_time_ns;
 	unsigned long long io_start_time_ns;    /* when passed to hardware */
 #endif
 	/* Number of scatter-gather DMA addr+len pairs after
 	 * physical address coalescing is performed.
 	 */
 	unsigned short nr_phys_segments;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 	unsigned short nr_integrity_segments;
 #endif
 	unsigned short ioprio;
 	int ref_count;
 	void *special;		/* opaque pointer available for LLD use */
 	char *buffer;		/* kaddr of the current segment if available */
 	int tag;
 	int errors;
 	/*
 	 * when request is used as a packet command carrier
 	 */
 	unsigned char __cmd[BLK_MAX_CDB];
 	unsigned char *cmd;
 	unsigned short cmd_len;
 	unsigned int extra_len;	/* length of alignment and padding */
 	unsigned int sense_len;
 	unsigned int resid_len;	/* residual count */
 	void *sense;
 	unsigned long deadline;
 	struct list_head timeout_list;
 	unsigned int timeout;
 	int retries;
 	/*
 	 * completion callback.
 	 */
 	rq_end_io_fn *end_io;
 	void *end_io_data;
 	/* for bidi */
 	struct request *next_rq;
 };
 /* Return the I/O priority value stored in @req's ioprio field. */
 static inline unsigned short req_get_ioprio(struct request *req)
 {
 	const unsigned short prio = req->ioprio;
 	return prio;
 }
 /*
  * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
  * requests. Some step values could eventually be made generic.
  */
 struct request_pm_state
 {
 	/* PM state machine step value, currently driver specific */
 	int	pm_step;
 	/* requested PM state value (S1, S2, S3, S4, ...) */
 	u32	pm_state;
 	void*	data;		/* for driver use */
 };
 #include <linux/elevator.h>
 /*
  * Function types for the per-queue callbacks; struct request_queue holds
  * pointers of these types (request_fn, make_request_fn, prep_rq_fn, ...).
  */
 typedef void (request_fn_proc) (struct request_queue *q);
 typedef void (make_request_fn) (struct request_queue *q, struct bio *bio);
 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
 struct bio_vec;
 /* argument block handed to a merge_bvec_fn callback */
 struct bvec_merge_data {
 	struct block_device *bi_bdev;
 	sector_t bi_sector;
 	unsigned bi_size;
 	unsigned long bi_rw;
 };
 typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
 			     struct bio_vec *);
 typedef void (softirq_done_fn)(struct request *);
 typedef int (dma_drain_needed_fn)(struct request *);
 typedef int (lld_busy_fn) (struct request_queue *q);
 typedef int (bsg_job_fn) (struct bsg_job *);
 /* return values of a request timeout handler (rq_timed_out_fn) */
 enum blk_eh_timer_return {
 	BLK_EH_NOT_HANDLED,
 	BLK_EH_HANDLED,
 	BLK_EH_RESET_TIMER,
 };
 typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
 enum blk_queue_state {
 	Queue_down,
 	Queue_up,
 };
 /*
  * Tag map state for a request queue (struct request_queue refers to one
  * through its queue_tags pointer).  Reference counted because a map can be
  * shared.
  */
 struct blk_queue_tag {
 	struct request **tag_index;	/* map of busy tags */
 	unsigned long *tag_map;		/* bit map of free/busy tags */
 	int busy;			/* current depth */
 	int max_depth;			/* what we will send to device */
 	int real_max_depth;		/* what the array can hold */
 	atomic_t refcnt;		/* map can be shared */
 };
 /*
  * BLK_SCSI_CMD_PER_LONG is BLK_SCSI_MAX_CMDS divided by the number of bits
  * in a long, i.e. how many longs a bitmap with one bit per command needs.
  */
 #define BLK_SCSI_MAX_CMDS	(256)
 #define BLK_SCSI_CMD_PER_LONG	(BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
 /*
  * Limits and geometry hints of a request queue (embedded in struct
  * request_queue as the 'limits' member).
  */
 struct queue_limits {
 	unsigned long		bounce_pfn;
 	unsigned long		seg_boundary_mask;
 	unsigned int		max_hw_sectors;
 	unsigned int		max_sectors;
 	unsigned int		max_segment_size;
 	unsigned int		physical_block_size;
 	unsigned int		alignment_offset;
 	unsigned int		io_min;
 	unsigned int		io_opt;
 	unsigned int		max_discard_sectors;
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 	unsigned short		logical_block_size;
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
 	unsigned char		misaligned;
 	unsigned char		discard_misaligned;
 	unsigned char		cluster;
 	unsigned char		discard_zeroes_data;
 };
 /*
  * Request queue state: the list of queued requests, the attached elevator,
  * the callback pointers, queue flags, locking, limits and the optional
  * cgroup/tracing/bsg/throttling members selected by the CONFIG_* options.
  */
 struct request_queue {
 	/*
 	 * Together with queue_head for cacheline sharing
 	 */
 	struct list_head	queue_head;
 	struct request		*last_merge;
 	struct elevator_queue	*elevator;
 	/*
 	 * the queue request freelist, one for reads and one for writes
 	 */
 	struct request_list	rq;
 	request_fn_proc		*request_fn;
 	make_request_fn		*make_request_fn;
 	prep_rq_fn		*prep_rq_fn;
 	unprep_rq_fn		*unprep_rq_fn;
 	merge_bvec_fn		*merge_bvec_fn;
 	softirq_done_fn		*softirq_done_fn;
 	rq_timed_out_fn		*rq_timed_out_fn;
 	dma_drain_needed_fn	*dma_drain_needed;
 	lld_busy_fn		*lld_busy_fn;
 	/*
 	 * Dispatch queue sorting
 	 */
 	sector_t		end_sector;
 	struct request		*boundary_rq;
 	/*
 	 * Delayed queue handling
 	 */
 	struct delayed_work	delay_work;
 	struct backing_dev_info	backing_dev_info;
 	/*
 	 * The queue owner gets to use this for whatever they like.
 	 * ll_rw_blk doesn't touch it.
 	 */
 	void			*queuedata;
 	/*
 	 * various queue flags, see QUEUE_* below
 	 */
 	unsigned long		queue_flags;
 	/*
 	 * ida allocated id for this queue.  Used to index queues from
 	 * ioctx.
 	 */
 	int			id;
 	/*
 	 * queue needs bounce pages for pages above this limit
 	 */
 	gfp_t			bounce_gfp;
 	/*
 	 * protects queue structures from reentrancy. ->__queue_lock should
 	 * _never_ be used directly, it is queue private. always use
 	 * ->queue_lock.
 	 */
 	spinlock_t		__queue_lock;
 	spinlock_t		*queue_lock;
 	/*
 	 * queue kobject
 	 */
 	struct kobject kobj;
 	/*
 	 * queue settings
 	 */
 	unsigned long		nr_requests;	/* Max # of requests */
 	unsigned int		nr_congestion_on;
 	unsigned int		nr_congestion_off;
 	unsigned int		nr_batching;
 	unsigned int		dma_drain_size;
 	void			*dma_drain_buffer;
 	unsigned int		dma_pad_mask;
 	unsigned int		dma_alignment;
 	struct blk_queue_tag	*queue_tags;
 	struct list_head	tag_busy_list;
 	unsigned int		nr_sorted;
 	/* the two counters are summed by queue_in_flight() */
 	unsigned int		in_flight[2];
 	unsigned int		rq_timeout;
 	struct timer_list	timeout;
 	struct list_head	timeout_list;
 	struct list_head	icq_list;
 #ifdef CONFIG_BLK_CGROUP
+	struct blkio_group	*root_blkg;
 	struct list_head	blkg_list;
 #endif
 	/* read through the queue_*() accessor helpers */
 	struct queue_limits	limits;
 	/*
 	 * sg stuff
 	 */
 	unsigned int		sg_timeout;
 	unsigned int		sg_reserved_size;
 	int			node;
 #ifdef CONFIG_BLK_DEV_IO_TRACE
 	struct blk_trace	*blk_trace;
 #endif
 	/*
 	 * for flush operations
 	 */
 	unsigned int		flush_flags;
 	unsigned int		flush_not_queueable:1;
 	unsigned int		flush_queue_delayed:1;
 	unsigned int		flush_pending_idx:1;
 	unsigned int		flush_running_idx:1;
 	unsigned long		flush_pending_since;
 	struct list_head	flush_queue[2];
 	struct list_head	flush_data_in_flight;
 	struct request		flush_rq;
 	struct mutex		sysfs_lock;
 	int			bypass_depth;
 #if defined(CONFIG_BLK_DEV_BSG)
 	bsg_job_fn		*bsg_job_fn;
 	int			bsg_job_size;
 	struct bsg_class_device bsg_dev;
 #endif
 #ifdef CONFIG_BLK_CGROUP
 	struct list_head	all_q_node;
 #endif
 #ifdef CONFIG_BLK_DEV_THROTTLING
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
 };
 /*
  * Bit numbers within request_queue->queue_flags.  They are used with
  * test_bit()/__set_bit()/__clear_bit() by the helpers below.
  */
 #define QUEUE_FLAG_QUEUED	1	/* uses generic tag queueing */
 #define QUEUE_FLAG_STOPPED	2	/* queue is stopped */
 #define	QUEUE_FLAG_SYNCFULL	3	/* read queue has been filled */
 #define QUEUE_FLAG_ASYNCFULL	4	/* write queue has been filled */
 #define QUEUE_FLAG_DEAD		5	/* queue being torn down */
 #define QUEUE_FLAG_BYPASS	6	/* act as dumb FIFO queue */
 #define QUEUE_FLAG_BIDI		7	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES     8	/* disable merge attempts */
 #define QUEUE_FLAG_SAME_COMP	9	/* complete on same CPU-group */
 #define QUEUE_FLAG_FAIL_IO     10	/* fake timeout */
 #define QUEUE_FLAG_STACKABLE   11	/* supports request stacking */
 #define QUEUE_FLAG_NONROT      12	/* non-rotational device (SSD) */
 /* alias: shares bit 12 with QUEUE_FLAG_NONROT */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
 #define QUEUE_FLAG_IO_STAT     13	/* do IO stats */
 #define QUEUE_FLAG_DISCARD     14	/* supports DISCARD */
 #define QUEUE_FLAG_NOXMERGES   15	/* No extended merges */
 #define QUEUE_FLAG_ADD_RANDOM  16	/* Contributes to random pool */
 #define QUEUE_FLAG_SECDISCARD  17	/* supports SECDISCARD */
 #define QUEUE_FLAG_SAME_FORCE  18	/* force complete on same CPU */
 /* Flag mask (not a bit number): IO_STAT, STACKABLE, SAME_COMP and ADD_RANDOM */
 #define QUEUE_FLAG_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
 				 (1 << QUEUE_FLAG_STACKABLE)	|	\
 				 (1 << QUEUE_FLAG_SAME_COMP)	|	\
 				 (1 << QUEUE_FLAG_ADD_RANDOM))
 /*
  * Report whether q->queue_lock is set and currently locked.  Builds without
  * CONFIG_SMP always report 1.
  */
 static inline int queue_is_locked(struct request_queue *q)
 {
 #ifndef CONFIG_SMP
 	return 1;
 #else
 	spinlock_t *lock = q->queue_lock;
 	if (!lock)
 		return 0;
 	return spin_is_locked(lock) ? 1 : 0;
 #endif
 }
 /* Set @flag in q->queue_flags without checking that the queue lock is held. */
 static inline void queue_flag_set_unlocked(unsigned int flag,
 					   struct request_queue *q)
 {
 	unsigned long *flags = &q->queue_flags;
 	__set_bit(flag, flags);
 }
 /*
  * Clear @flag in q->queue_flags.  Returns 1 if the flag was set beforehand
  * and 0 otherwise.  Warns once when queue_is_locked() reports the queue as
  * unlocked.
  */
 static inline int queue_flag_test_and_clear(unsigned int flag,
 					    struct request_queue *q)
 {
 	WARN_ON_ONCE(!queue_is_locked(q));
 	if (!test_bit(flag, &q->queue_flags))
 		return 0;
 	__clear_bit(flag, &q->queue_flags);
 	return 1;
 }
 /*
  * Set @flag in q->queue_flags.  Returns 1 if the flag was already set and 0
  * if this call set it.  Warns once when queue_is_locked() reports the queue
  * as unlocked.
  */
 static inline int queue_flag_test_and_set(unsigned int flag,
 					  struct request_queue *q)
 {
 	WARN_ON_ONCE(!queue_is_locked(q));
 	if (test_bit(flag, &q->queue_flags))
 		return 1;
 	__set_bit(flag, &q->queue_flags);
 	return 0;
 }
 /*
  * Set @flag in q->queue_flags; warns once when queue_is_locked() reports
  * the queue as unlocked.
  */
 static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
 {
 	unsigned long *flags = &q->queue_flags;
 	WARN_ON_ONCE(!queue_is_locked(q));
 	__set_bit(flag, flags);
 }
 /* Clear @flag in q->queue_flags without checking that the queue lock is held. */
 static inline void queue_flag_clear_unlocked(unsigned int flag,
 					     struct request_queue *q)
 {
 	unsigned long *flags = &q->queue_flags;
 	__clear_bit(flag, flags);
 }
 /* Sum of both q->in_flight[] counters. */
 static inline int queue_in_flight(struct request_queue *q)
 {
 	unsigned int total = q->in_flight[0];
 	total += q->in_flight[1];
 	return total;
 }
 /*
  * Clear @flag in q->queue_flags; warns once when queue_is_locked() reports
  * the queue as unlocked.
  */
 static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 {
 	unsigned long *flags = &q->queue_flags;
 	WARN_ON_ONCE(!queue_is_locked(q));
 	__clear_bit(flag, flags);
 }
 /*
  * Predicates for individual QUEUE_FLAG_* bits; each one expands to
  * test_bit() on (q)->queue_flags.
  */
 #define blk_queue_tagged(q)	test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
 #define blk_queue_stopped(q)	test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
 #define blk_queue_dead(q)	test_bit(QUEUE_FLAG_DEAD, &(q)->queue_flags)
 #define blk_queue_bypass(q)	test_bit(QUEUE_FLAG_BYPASS, &(q)->queue_flags)
 #define blk_queue_nomerges(q)	test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
 #define blk_queue_noxmerges(q)	\
 	test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
 #define blk_queue_nonrot(q)	test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 /* true only when both the DISCARD and the SECDISCARD bits are set */
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
 /* non-zero when any of the three REQ_FAILFAST_* bits is set in cmd_flags */
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
 			     REQ_FAILFAST_DRIVER))
 /* REQ_STARTED is set and the request is REQ_TYPE_FS or carries REQ_DISCARD */
 #define blk_account_rq(rq) \
 	(((rq)->cmd_flags & REQ_STARTED) && \
 	 ((rq)->cmd_type == REQ_TYPE_FS || \
 	  ((rq)->cmd_flags & REQ_DISCARD)))
 /* cmd_type is one of the two REQ_TYPE_PM_* values */
 #define blk_pm_request(rq)	\
 	((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
 	 (rq)->cmd_type == REQ_TYPE_PM_RESUME)
 /* -1 in rq->cpu means no valid CPU is recorded */
 #define blk_rq_cpu_valid(rq)	((rq)->cpu != -1)
 #define blk_bidi_rq(rq)		((rq)->next_rq != NULL)
 /* rq->queuelist of dequeued request must be list_empty() */
 #define blk_queued_rq(rq)	(!list_empty(&(rq)->queuelist))
 /* convert a queuelist list_head pointer back to its struct request */
 #define list_entry_rq(ptr)	list_entry((ptr), struct request, queuelist)
 /* bit 0 of cmd_flags */
 #define rq_data_dir(rq)		((rq)->cmd_flags & 1)
 static inline unsigned int blk_queue_cluster(struct request_queue *q)
 {
 	return q->limits.cluster;
 }
 /*
  * A request counts as sync when it is a read (REQ_WRITE clear) or a write
  * that carries REQ_SYNC.
  */
 static inline bool rw_is_sync(unsigned int rw_flags)
 {
 	if (!(rw_flags & REQ_WRITE))
 		return 1;
 	return (rw_flags & REQ_SYNC) != 0;
 }
 /* Apply rw_is_sync() to the request's cmd_flags. */
 static inline bool rq_is_sync(struct request *rq)
 {
 	unsigned int rw_flags = rq->cmd_flags;
 	return rw_is_sync(rw_flags);
 }
 /*
  * Test the "full" flag of the queue: QUEUE_FLAG_SYNCFULL when @sync is
  * non-zero, QUEUE_FLAG_ASYNCFULL otherwise.
  */
 static inline int blk_queue_full(struct request_queue *q, int sync)
 {
 	if (!sync)
 		return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
 	return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
 }
 /*
  * Set the "full" flag of the queue: QUEUE_FLAG_SYNCFULL when @sync is
  * non-zero, QUEUE_FLAG_ASYNCFULL otherwise.
  */
 static inline void blk_set_queue_full(struct request_queue *q, int sync)
 {
 	queue_flag_set(sync ? QUEUE_FLAG_SYNCFULL : QUEUE_FLAG_ASYNCFULL, q);
 }
 /*
  * Clear the "full" flag of the queue: QUEUE_FLAG_SYNCFULL when @sync is
  * non-zero, QUEUE_FLAG_ASYNCFULL otherwise.
  */
 static inline void blk_clear_queue_full(struct request_queue *q, int sync)
 {
 	queue_flag_clear(sync ? QUEUE_FLAG_SYNCFULL : QUEUE_FLAG_ASYNCFULL, q);
 }
 /*
  * A mergeable request must have none of the RQ_NOMERGE_FLAGS bits set
  * (REQ_NOMERGE, REQ_STARTED, REQ_SOFTBARRIER, REQ_FLUSH, REQ_FUA), so it may
  * not already be started by the driver, and it must either carry
  * REQ_DISCARD or be a REQ_TYPE_FS request.
  */
 #define RQ_NOMERGE_FLAGS	\
 	(REQ_NOMERGE | REQ_STARTED | REQ_SOFTBARRIER | REQ_FLUSH | REQ_FUA)
 #define rq_mergeable(rq)	\
 	(!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
 	 (((rq)->cmd_flags & REQ_DISCARD) || \
 	  (rq)->cmd_type == REQ_TYPE_FS))
 /*
  * q->prep_rq_fn return values (the callback pointer is the prep_rq_fn
  * member of struct request_queue)
  */
 #define BLKPREP_OK		0	/* serve it */
 #define BLKPREP_KILL		1	/* fatal error, kill */
 #define BLKPREP_DEFER		2	/* leave on queue */
 extern unsigned long blk_max_low_pfn, blk_max_pfn;
 /*
  * standard bounce addresses:
  *
  * BLK_BOUNCE_HIGH	: bounce all highmem pages
  * BLK_BOUNCE_ANY	: don't bounce anything
  * BLK_BOUNCE_ISA	: bounce pages above ISA DMA boundary
  *
  * Every expansion is fully parenthesized so that the macros behave as a
  * single operand when they are used inside a larger expression.
  */
 #if BITS_PER_LONG == 32
 #define BLK_BOUNCE_HIGH		((u64)blk_max_low_pfn << PAGE_SHIFT)
 #else
 #define BLK_BOUNCE_HIGH		(-1ULL)
 #endif
 #define BLK_BOUNCE_ANY		(-1ULL)
 #define BLK_BOUNCE_ISA		(DMA_BIT_MASK(24))
 /*
  * default timeout for SG_IO if none specified; both values are expressed
  * as multiples of HZ
  */
 #define BLK_DEFAULT_SG_TIMEOUT	(60 * HZ)
 #define BLK_MIN_SG_TIMEOUT	(7 * HZ)
 /*
  * Bounce support is only built with CONFIG_BOUNCE; without it both entry
  * points collapse to inline no-ops.
  */
 #ifdef CONFIG_BOUNCE
 extern int init_emergency_isa_pool(void);
 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
 #else
 /* stub: nothing to set up, always reports success (0) */
 static inline int init_emergency_isa_pool(void)
 {
 	return 0;
 }
 /* stub: leaves *bio untouched */
 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
 {
 }
 #endif /* CONFIG_BOUNCE */
 /*
  * Mapping description accepted by blk_rq_map_user() and
  * blk_rq_map_user_iov().
  */
 struct rq_map_data {
 	struct page **pages;
 	int page_order;
 	int nr_entries;
 	unsigned long offset;
 	int null_mapped;
 	int from_user;
 };
 /*
  * Cursor for rq_for_each_segment(): the bio currently being walked and the
  * index (i) handed to bio_for_each_segment().
  */
 struct req_iterator {
 	int i;
 	struct bio *bio;
 };
 /* This should not be used directly - use rq_for_each_segment */
 #define for_each_bio(_bio)		\
 	for (; _bio; _bio = _bio->bi_next)
 /*
  * Walk every bio chained off (rq)->bio; the loop is skipped when the
  * request has no bio.  The rq argument is parenthesized so that any pointer
  * expression (e.g. "base + 1") can be passed.
  */
 #define __rq_for_each_bio(_bio, rq)	\
 	if (((rq)->bio))		\
 		for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
 /*
  * Visit each segment (bvl) of each bio in _rq.  _iter supplies the .bio and
  * .i cursors, which are the two members of struct req_iterator.
  */
 #define rq_for_each_segment(bvl, _rq, _iter)			\
 	__rq_for_each_bio(_iter.bio, _rq)			\
 		bio_for_each_segment(bvl, _iter.bio, _iter.i)
 /* True when _iter is at the last vector (bi_vcnt-1) of a bio with no successor. */
 #define rq_iter_last(rq, _iter)					\
 		(_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
 /*
  * ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE must be defined, otherwise the build
  * stops with the #error below.  When it is non-zero
  * rq_flush_dcache_pages() is an external function; when it is zero the
  * call compiles to an empty inline.
  */
 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 # error	"You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
 #endif
 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
 extern void rq_flush_dcache_pages(struct request *rq);
 #else
 static inline void rq_flush_dcache_pages(struct request *rq)
 {
 }
 #endif
 /*
  * Declarations only: queue registration, request get/put/requeue, request
  * cloning and the SCSI ioctl helpers are implemented outside this header.
  */
 extern int blk_register_queue(struct gendisk *disk);
 extern void blk_unregister_queue(struct gendisk *disk);
 extern void generic_make_request(struct bio *bio);
 extern void blk_rq_init(struct request_queue *q, struct request *rq);
 extern void blk_put_request(struct request *);
 extern void __blk_put_request(struct request_queue *, struct request *);
 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
 extern struct request *blk_make_request(struct request_queue *, struct bio *,
 					gfp_t);
 extern void blk_requeue_request(struct request_queue *, struct request *);
 extern void blk_add_request_payload(struct request *rq, struct page *page,
 		unsigned int len);
 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
 extern int blk_lld_busy(struct request_queue *q);
 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
 			     struct bio_set *bs, gfp_t gfp_mask,
 			     int (*bio_ctr)(struct bio *, struct bio *, void *),
 			     void *data);
 extern void blk_rq_unprep_clone(struct request *rq);
 extern int blk_insert_cloned_request(struct request_queue *q,
 				     struct request *rq);
 extern void blk_delay_queue(struct request_queue *, unsigned long);
 extern void blk_recount_segments(struct request_queue *, struct bio *);
 extern int scsi_verify_blk_ioctl(struct block_device *, unsigned int);
 extern int scsi_cmd_blk_ioctl(struct block_device *, fmode_t,
 			      unsigned int, void __user *);
 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			  unsigned int, void __user *);
 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
 			 struct scsi_ioctl_command __user *);
 extern void blk_queue_bio(struct request_queue *q, struct bio *bio);
 /*
  * A queue has just exited congestion.  Clear the congested state of the
  * queue's backing_dev_info for the given direction (@sync).
  */
 static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
 {
 	struct backing_dev_info *bdi = &q->backing_dev_info;
 	clear_bdi_congested(bdi, sync);
 }
 /*
  * A queue has just entered congestion.  Flag that in the queue's
  * backing_dev_info for the given direction (@sync).
  */
 static inline void blk_set_queue_congested(struct request_queue *q, int sync)
 {
 	struct backing_dev_info *bdi = &q->backing_dev_info;
 	set_bdi_congested(bdi, sync);
 }
 /*
  * Declarations only: queue start/stop/run control, user/kernel buffer
  * mapping for requests, and request execution.
  */
 extern void blk_start_queue(struct request_queue *q);
 extern void blk_stop_queue(struct request_queue *q);
 extern void blk_sync_queue(struct request_queue *q);
 extern void __blk_stop_queue(struct request_queue *q);
 extern void __blk_run_queue(struct request_queue *q);
 extern void blk_run_queue(struct request_queue *);
 extern void blk_run_queue_async(struct request_queue *q);
 extern int blk_rq_map_user(struct request_queue *, struct request *,
 			   struct rq_map_data *, void __user *, unsigned long,
 			   gfp_t);
 extern int blk_rq_unmap_user(struct bio *);
 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
 			       struct rq_map_data *, struct sg_iovec *, int,
 			       unsigned int, gfp_t);
 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
 			  struct request *, int);
 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
 				  struct request *, int, rq_end_io_fn *);
 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
 {
 	return bdev->bd_disk->queue;
 }
 /*
  * blk_rq_pos()			: the current sector
  * blk_rq_bytes()		: bytes left in the entire request
  * blk_rq_cur_bytes()		: bytes left in the current segment
  * blk_rq_err_bytes()		: bytes left till the next error boundary
  * blk_rq_sectors()		: sectors left in the entire request
  * blk_rq_cur_sectors()		: sectors left in the current segment
  */
 /* Reads rq->__sector. */
 static inline sector_t blk_rq_pos(const struct request *rq)
 {
 	sector_t pos = rq->__sector;
 	return pos;
 }
 /* Reads rq->__data_len. */
 static inline unsigned int blk_rq_bytes(const struct request *rq)
 {
 	unsigned int len = rq->__data_len;
 	return len;
 }
 /* bio_cur_bytes() of the request's bio, or 0 when the request has no bio. */
 static inline int blk_rq_cur_bytes(const struct request *rq)
 {
 	if (!rq->bio)
 		return 0;
 	return bio_cur_bytes(rq->bio);
 }
 extern unsigned int blk_rq_err_bytes(const struct request *rq);
 /* blk_rq_bytes() converted to 512-byte units (shifted right by 9). */
 static inline unsigned int blk_rq_sectors(const struct request *rq)
 {
 	unsigned int bytes = blk_rq_bytes(rq);
 	return bytes >> 9;
 }
 /* blk_rq_cur_bytes() converted to 512-byte units (shifted right by 9). */
 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
 {
 	int bytes = blk_rq_cur_bytes(rq);
 	return bytes >> 9;
 }
 /*
  * Request issue related functions.
  */
 extern struct request *blk_peek_request(struct request_queue *q);
 extern void blk_start_request(struct request *rq);
 extern struct request *blk_fetch_request(struct request_queue *q);
 /*
  * Request completion related functions.
  *
  * blk_update_request() completes given number of bytes and updates
  * the request without completing it.
  *
  * blk_end_request() and friends.  __blk_end_request() must be called
  * with the request queue spinlock acquired.
  *
  * Several drivers define their own end_request and call
  * blk_end_request() for parts of the original function.
  * This prevents code duplication in drivers.
  *
  * Every blk_end_request*() variant declared here has a __-prefixed
  * counterpart with the same parameters.
  */
 extern bool blk_update_request(struct request *rq, int error,
 			       unsigned int nr_bytes);
 extern bool blk_end_request(struct request *rq, int error,
 			    unsigned int nr_bytes);
 extern void blk_end_request_all(struct request *rq, int error);
 extern bool blk_end_request_cur(struct request *rq, int error);
 extern bool blk_end_request_err(struct request *rq, int error);
 extern bool __blk_end_request(struct request *rq, int error,
 			      unsigned int nr_bytes);
 extern void __blk_end_request_all(struct request *rq, int error);
 extern bool __blk_end_request_cur(struct request *rq, int error);
 extern bool __blk_end_request_err(struct request *rq, int error);
 extern void blk_complete_request(struct request *);
 extern void __blk_complete_request(struct request *);
 extern void blk_abort_request(struct request *);
 extern void blk_abort_queue(struct request_queue *);
 extern void blk_unprep_request(struct request *);
 /*
  * Access functions for manipulating queue properties: queue
  * allocation/initialisation/cleanup, setters for limits and callbacks,
  * and the limit-stacking helpers.
  */
 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
 					spinlock_t *lock, int node_id);
 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
 extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
 						      request_fn_proc *, spinlock_t *);
 extern void blk_cleanup_queue(struct request_queue *);
 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
 extern void blk_queue_bounce_limit(struct request_queue *, u64);
 extern void blk_limits_max_hw_sectors(struct queue_limits *, unsigned int);
 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
 extern void blk_queue_max_discard_sectors(struct request_queue *q,
 		unsigned int max_discard_sectors);
 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
 extern void blk_set_default_limits(struct queue_limits *lim);
 extern void blk_set_stacking_limits(struct queue_limits *lim);
 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
 			    sector_t offset);
 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
 			    sector_t offset);
 extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
 			      sector_t offset);
 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
 extern int blk_queue_dma_drain(struct request_queue *q,
 			       dma_drain_needed_fn *dma_drain_needed,
 			       void *buf, unsigned int size);
 extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
 extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
 extern void blk_queue_dma_alignment(struct request_queue *, int);
 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
 extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
 extern void blk_dump_rq_flags(struct request *, char *);
 extern long nr_blockdev_pages(void);
 /* __must_check: the result of blk_get_queue() may not be ignored */
 bool __must_check blk_get_queue(struct request_queue *);
 struct request_queue *blk_alloc_queue(gfp_t);
 struct request_queue *blk_alloc_queue_node(gfp_t, int);
 extern void blk_put_queue(struct request_queue *);
 /*
  * blk_plug permits building a queue of related requests by holding the I/O
  * fragments for a short period. This allows merging of sequential requests
  * into single larger request. As the requests are moved from a per-task list to
  * the device's request_queue in a batch, this results in improved scalability
  * as the lock contention for request_queue lock is reduced.
  *
  * It is ok not to disable preemption when adding the request to the plug list
  * or when attempting a merge, because blk_schedule_flush_plug() will only flush
  * the plug list when the task sleeps by itself. For details, please see
  * schedule() where blk_schedule_flush_plug() is called.
  */
 struct blk_plug {
 	unsigned long magic; /* detect uninitialized use-cases */
 	struct list_head list; /* requests */
 	struct list_head cb_list; /* md requires an unplug callback */
 	unsigned int should_sort; /* list to be sorted before flushing? */
 };
 #define BLK_MAX_REQUEST_COUNT 16
 /*
  * List node carrying a callback that receives the node itself.
  * NOTE(review): presumably linked on blk_plug->cb_list - confirm against
  * the plug flushing code.
  */
 struct blk_plug_cb {
 	struct list_head list;
 	void (*callback)(struct blk_plug_cb *);
 };
 extern void blk_start_plug(struct blk_plug *);
 extern void blk_finish_plug(struct blk_plug *);
 extern void blk_flush_plug_list(struct blk_plug *, bool);
 /*
  * If @tsk has a plug, flush its list with blk_flush_plug_list(), passing
  * false as the second argument.
  */
 static inline void blk_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 	if (!plug)
 		return;
 	blk_flush_plug_list(plug, false);
 }
 /*
  * If @tsk has a plug, flush its list with blk_flush_plug_list(), passing
  * true as the second argument.
  */
 static inline void blk_schedule_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 	if (!plug)
 		return;
 	blk_flush_plug_list(plug, true);
 }
 /*
  * True when @tsk has a plug and either its request list or its callback
  * list is non-empty.
  */
 static inline bool blk_needs_flush_plug(struct task_struct *tsk)
 {
 	struct blk_plug *plug = tsk->plug;
 	if (!plug)
 		return 0;
 	if (!list_empty(&plug->list))
 		return 1;
 	return !list_empty(&plug->cb_list);
 }
 /*
  * tag stuff
  */
 /* non-zero when REQ_QUEUED is set in the request's cmd_flags */
 #define blk_rq_tagged(rq)		((rq)->cmd_flags & REQ_QUEUED)
 extern int blk_queue_start_tag(struct request_queue *, struct request *);
 extern struct request *blk_queue_find_tag(struct request_queue *, int);
 extern void blk_queue_end_tag(struct request_queue *, struct request *);
 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
 extern void blk_queue_free_tags(struct request_queue *);
 extern int blk_queue_resize_tags(struct request_queue *, int);
 extern void blk_queue_invalidate_tags(struct request_queue *);
 extern struct blk_queue_tag *blk_init_tags(int);
 extern void blk_free_tags(struct blk_queue_tag *);
 /*
  * Look up the request stored for @tag in the tag map @bqt.
  *
  * Returns NULL when @bqt is NULL or when @tag lies outside
  * [0, real_max_depth).  Negative tags are rejected explicitly so they can
  * never index in front of tag_index[].
  */
 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
 						int tag)
 {
 	if (unlikely(bqt == NULL || tag < 0 || tag >= bqt->real_max_depth))
 		return NULL;
 	return bqt->tag_index[tag];
 }
 /* flag bit; blkdev_issue_discard() takes an "unsigned long flags" argument */
 #define BLKDEV_DISCARD_SECURE  0x01    /* secure discard */
 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *);
 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 			sector_t nr_sects, gfp_t gfp_mask);
 /*
  * Discard @nr_blocks filesystem blocks starting at @block on sb->s_bdev.
  * Block numbers are converted to 512-byte sectors by shifting left by
  * (s_blocksize_bits - 9) before calling blkdev_issue_discard().
  */
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
 	int shift = sb->s_blocksize_bits - 9;
 	sector_t sector = block << shift;
 	sector_t nr_sects = nr_blocks << shift;
 	return blkdev_issue_discard(sb->s_bdev, sector, nr_sects,
 				    gfp_mask, flags);
 }
 /*
  * Zero out @nr_blocks filesystem blocks starting at @block on sb->s_bdev.
  * Block numbers are converted to 512-byte sectors by shifting left by
  * (s_blocksize_bits - 9) before calling blkdev_issue_zeroout().
  */
 static inline int sb_issue_zeroout(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask)
 {
 	int shift = sb->s_blocksize_bits - 9;
 	sector_t sector = block << shift;
 	sector_t nr_sects = nr_blocks << shift;
 	return blkdev_issue_zeroout(sb->s_bdev, sector, nr_sects, gfp_mask);
 }
 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
 /*
  * Default limit constants.
  * NOTE(review): presumably the defaults applied by
  * blk_set_default_limits() - confirm in the settings code.
  */
 enum blk_default_limits {
 	BLK_MAX_SEGMENTS	= 128,
 	BLK_SAFE_MAX_SECTORS	= 255,
 	BLK_DEF_MAX_SECTORS	= 1024,
 	BLK_MAX_SEGMENT_SIZE	= 65536,
 	BLK_SEG_BOUNDARY_MASK	= 0xFFFFFFFFUL,
 };
 /* same expansion as list_entry_rq(): queuelist list_head -> struct request */
 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
 static inline unsigned long queue_bounce_pfn(struct request_queue *q)
 {
 	return q->limits.bounce_pfn;
 }
 static inline unsigned long queue_segment_boundary(struct request_queue *q)
 {
 	return q->limits.seg_boundary_mask;
 }
 static inline unsigned int queue_max_sectors(struct request_queue *q)
 {
 	return q->limits.max_sectors;
 }
 static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
 {
 	return q->limits.max_hw_sectors;
 }
 static inline unsigned short queue_max_segments(struct request_queue *q)
 {
 	return q->limits.max_segments;
 }
 static inline unsigned int queue_max_segment_size(struct request_queue *q)
 {
 	return q->limits.max_segment_size;
 }
 /*
  * Logical block size of @q.  Falls back to 512 when @q is NULL or the limit
  * is zero.
  */
 static inline unsigned short queue_logical_block_size(struct request_queue *q)
 {
 	if (!q || !q->limits.logical_block_size)
 		return 512;
 	return q->limits.logical_block_size;
 }
 /* queue_logical_block_size() of the queue behind @bdev. */
 static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	return queue_logical_block_size(q);
 }
 static inline unsigned int queue_physical_block_size(struct request_queue *q)
 {
 	return q->limits.physical_block_size;
 }
 static inline unsigned int bdev_physical_block_size(struct block_device *bdev)
 {
 	return queue_physical_block_size(bdev_get_queue(bdev));
 }
 static inline unsigned int queue_io_min(struct request_queue *q)
 {
 	return q->limits.io_min;
 }
 static inline int bdev_io_min(struct block_device *bdev)
 {
 	return queue_io_min(bdev_get_queue(bdev));
 }
 static inline unsigned int queue_io_opt(struct request_queue *q)
 {
 	return q->limits.io_opt;
 }
 static inline int bdev_io_opt(struct block_device *bdev)
 {
 	return queue_io_opt(bdev_get_queue(bdev));
 }
 static inline int queue_alignment_offset(struct request_queue *q)
 {
 	if (q->limits.misaligned)
 		return -1;
 	return q->limits.alignment_offset;
 }
 /*
  * Alignment offset for a range starting at @sector.  The granularity is the
  * larger of physical_block_size and io_min; the byte address (sector << 9)
  * and the result are both reduced with (granularity - 1) as a mask.
  */
 static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
 {
 	unsigned int granularity = max(lim->physical_block_size, lim->io_min);
 	unsigned int mask = granularity - 1;
 	unsigned int alignment = (sector << 9) & mask;
 	return (granularity + lim->alignment_offset - alignment) & mask;
 }
 /*
  * Alignment offset of @bdev: -1 when the queue is flagged misaligned, the
  * partition's own offset when @bdev is not its bd_contains device, and the
  * queue-wide offset otherwise.
  */
 static inline int bdev_alignment_offset(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	if (q->limits.misaligned)
 		return -1;
 	if (bdev == bdev->bd_contains)
 		return q->limits.alignment_offset;
 	return bdev->bd_part->alignment_offset;
 }
 static inline int queue_discard_alignment(struct request_queue *q)
 {
 	if (q->limits.discard_misaligned)
 		return -1;
 	return q->limits.discard_alignment;
 }
 /*
  * Discard alignment for a range starting at @sector.  Returns 0 when
  * max_discard_sectors is zero; otherwise the byte address (sector << 9) and
  * the result are reduced with (discard_granularity - 1) as a mask.
  */
 static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector)
 {
 	unsigned int mask;
 	unsigned int alignment;
 	if (!lim->max_discard_sectors)
 		return 0;
 	mask = lim->discard_granularity - 1;
 	alignment = (sector << 9) & mask;
 	return (lim->discard_granularity + lim->discard_alignment - alignment) & mask;
 }
 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
 {
 	if (q->limits.max_discard_sectors && q->limits.discard_zeroes_data == 1)
 		return 1;
 	return 0;
 }
 /* queue_discard_zeroes_data() of the queue behind @bdev. */
 static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
 {
 	struct request_queue *q = bdev_get_queue(bdev);
 	return queue_discard_zeroes_data(q);
 }
 /* DMA alignment mask of @q, or 511 when @q is NULL. */
 static inline int queue_dma_alignment(struct request_queue *q)
 {
 	if (!q)
 		return 511;
 	return q->dma_alignment;
 }
 /*
  * Non-zero when neither @addr nor @len has any bit set in the combined
  * mask of the queue's DMA alignment and its dma_pad_mask.
  */
 static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
 				 unsigned int len)
 {
 	unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
 	/* a bit set in either value shows up in their OR */
 	return !((addr | len) & alignment);
 }
 /*
  * Number of bits for a block size, e.g. 9 for 512 and 12 for 4096.
  * Assumes size > 256; anything at or below that still yields 9.
  */
 static inline unsigned int blksize_bits(unsigned int size)
 {
 	unsigned int bits = 9;
 	for (size >>= 1; size > 256; size >>= 1)
 		bits++;
 	return bits;
 }
 /* Reads bdev->bd_block_size. */
 static inline unsigned int block_size(struct block_device *bdev)
 {
 	unsigned int size = bdev->bd_block_size;
 	return size;
 }
 /* True unless the queue's flush_not_queueable bit is set. */
 static inline bool queue_flush_queueable(struct request_queue *q)
 {
 	if (q->flush_not_queueable)
 		return 0;
 	return 1;
 }
 /*
  * Wrapper around a page pointer, passed to read_dev_sector() and released
  * with put_dev_sector().
  */
 typedef struct {struct page *v;} Sector;
 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
 /* Release the page held in @p with page_cache_release(). */
 static inline void put_dev_sector(Sector p)
 {
 	struct page *page = p.v;
 	page_cache_release(page);
 }
 /* forward declaration: only a pointer to the type is needed below */
 struct work_struct;
 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
 #ifdef CONFIG_BLK_CGROUP
 /*
  * This should not be using sched_clock(). A real patch is in progress
  * to fix this up, until that is in place we need to disable preemption
  * around sched_clock() in this function and set_io_start_time_ns().
  */
 /* Stamp req->start_time_ns with sched_clock(), preemption disabled. */
 static inline void set_start_time_ns(struct request *req)
 {
 	preempt_disable();
 	req->start_time_ns = sched_clock();
 	preempt_enable();
 }
 /* Stamp req->io_start_time_ns with sched_clock(), preemption disabled. */
 static inline void set_io_start_time_ns(struct request *req)
 {
 	preempt_disable();
 	req->io_start_time_ns = sched_clock();
 	preempt_enable();
 }
 /* Read back the value stored by set_start_time_ns(). */
 static inline uint64_t rq_start_time_ns(struct request *req)
 {
 	return req->start_time_ns;
 }
 /* Read back the value stored by set_io_start_time_ns(). */
 static inline uint64_t rq_io_start_time_ns(struct request *req)
 {
 	return req->io_start_time_ns;
 }
 #else
 /* Without CONFIG_BLK_CGROUP the setters do nothing and the getters return 0. */
 static inline void set_start_time_ns(struct request *req) {}
 static inline void set_io_start_time_ns(struct request *req) {}
 static inline uint64_t rq_start_time_ns(struct request *req)
 {
 	return 0;
 }
 static inline uint64_t rq_io_start_time_ns(struct request *req)
 {
 	return 0;
 }
 #endif
 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
 	MODULE_ALIAS("block-major-" __stringify(major) "-*")
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
 #define INTEGRITY_FLAG_READ	2	/* verify data integrity on read */
 #define INTEGRITY_FLAG_WRITE	4	/* generate data integrity on write */
 /* Argument block handed to the integrity_gen_fn / integrity_vrfy_fn callbacks. */
 struct blk_integrity_exchg {
 	void			*prot_buf;
 	void			*data_buf;
 	sector_t		sector;
 	unsigned int		data_size;
 	unsigned short		sector_size;
 	const char		*disk_name;
 };
 /* Callback types stored in struct blk_integrity. */
 typedef void (integrity_gen_fn) (struct blk_integrity_exchg *);
 typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *);
 typedef void (integrity_set_tag_fn) (void *, void *, unsigned int);
 typedef void (integrity_get_tag_fn) (void *, void *, unsigned int);
 /*
  * Integrity profile: the four callbacks plus flags and size fields.  A
  * gendisk refers to one through its ->integrity member (see
  * blk_get_integrity()).
  */
 struct blk_integrity {
 	integrity_gen_fn	*generate_fn;
 	integrity_vrfy_fn	*verify_fn;
 	integrity_set_tag_fn	*set_tag_fn;
 	integrity_get_tag_fn	*get_tag_fn;
 	unsigned short		flags;
 	unsigned short		tuple_size;
 	unsigned short		sector_size;
 	unsigned short		tag_size;
 	const char		*name;
 	struct kobject		kobj;
 };
 /*
  * Integrity entry points implemented outside this header; inline stubs
  * with the same names are provided when CONFIG_BLK_DEV_INTEGRITY is off.
  */
 extern bool blk_integrity_is_initialized(struct gendisk *);
 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
 extern void blk_integrity_unregister(struct gendisk *);
 extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
 extern int blk_rq_map_integrity_sg(struct request_queue *, struct bio *,
 				   struct scatterlist *);
 extern int blk_rq_count_integrity_sg(struct request_queue *, struct bio *);
 extern int blk_integrity_merge_rq(struct request_queue *, struct request *,
 				  struct request *);
 extern int blk_integrity_merge_bio(struct request_queue *, struct request *,
 				   struct bio *);
 static inline
 struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
 {
 	return bdev->bd_disk->integrity;
 }
 /* Reads disk->integrity. */
 static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
 {
 	struct blk_integrity *bi = disk->integrity;
 	return bi;
 }
 /* bio_integrity() of the request's bio, or 0 when the request has no bio. */
 static inline int blk_integrity_rq(struct request *rq)
 {
 	if (rq->bio != NULL)
 		return bio_integrity(rq->bio);
 	return 0;
 }
 static inline void blk_queue_max_integrity_segments(struct request_queue *q,
 						    unsigned int segs)
 {
 	q->limits.max_integrity_segments = segs;
 }
 /*
  * Read back the max_integrity_segments limit stored in queue @q.
  */
 static inline unsigned short queue_max_integrity_segments(struct request_queue *q)
 {
 	unsigned short nr_segs = q->limits.max_integrity_segments;

 	return nr_segs;
 }
 #else /* CONFIG_BLK_DEV_INTEGRITY */
 /*
  * Forward declarations: the stubs below only pass pointers to these types
  * around, so incomplete types are sufficient.
  */
 struct bio;
 struct block_device;
 struct gendisk;
 struct blk_integrity;
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @rq is ignored and
  * 0 is always returned.
  */
 static inline int
 blk_integrity_rq(struct request *rq)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: both arguments are
  * ignored and 0 is always returned.
  */
 static inline int
 blk_rq_count_integrity_sg(struct request_queue *q, struct bio *b)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: nothing is mapped, the
  * arguments are ignored and 0 is always returned.
  */
 static inline int
 blk_rq_map_integrity_sg(struct request_queue *q, struct bio *b,
 			struct scatterlist *s)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @b is ignored and no
  * integrity profile is ever reported.
  *
  * Returns NULL (spelled as a pointer constant rather than a plain 0, in line
  * with the blk_get_integrity() stub, so static checkers do not flag an
  * integer used as a null pointer).
  */
 static inline struct blk_integrity *bdev_get_integrity(struct block_device *b)
 {
 	return NULL;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @disk is ignored and
  * NULL is always returned.
  */
 static inline struct blk_integrity *
 blk_get_integrity(struct gendisk *disk)
 {
 	return NULL;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: the two disks are not
  * inspected and 0 is always returned.
  */
 static inline int
 blk_integrity_compare(struct gendisk *a, struct gendisk *b)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: nothing is registered,
  * the arguments are ignored and 0 is always returned.
  */
 static inline int
 blk_integrity_register(struct gendisk *d, struct blk_integrity *b)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: does nothing.
  */
 static inline void
 blk_integrity_unregister(struct gendisk *d)
 {
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @segs is discarded and
  * @q is left untouched.
  */
 static inline void
 blk_queue_max_integrity_segments(struct request_queue *q, unsigned int segs)
 {
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @q is ignored and the
  * reported limit is always 0.
  */
 static inline unsigned short
 queue_max_integrity_segments(struct request_queue *q)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: the requests are not
  * inspected and 0 is always returned.
  */
 static inline int
 blk_integrity_merge_rq(struct request_queue *rq, struct request *r1,
 		       struct request *r2)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: the request and bio are
  * not inspected and 0 is always returned.
  */
 static inline int
 blk_integrity_merge_bio(struct request_queue *rq, struct request *r,
 			struct bio *b)
 {
 	return 0;
 }
 /*
  * Stub for builds without CONFIG_BLK_DEV_INTEGRITY: @g is ignored and the
  * answer is always "not initialized".
  *
  * Returns false (a boolean constant rather than a plain 0, since the return
  * type is bool).
  */
 static inline bool blk_integrity_is_initialized(struct gendisk *g)
 {
 	return false;
 }
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 /*
  * Table of function pointers for a block device, plus the owning module.
  * NOTE(review): presumably filled in by block drivers and invoked by the
  * block layer; which callbacks may be left NULL is not visible here --
  * check the call sites.
  */
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	int (*release) (struct gendisk *, fmode_t);
 	int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
 	int (*direct_access) (struct block_device *, sector_t,
 						void **, unsigned long *);
 	unsigned int (*check_events) (struct gendisk *disk,
 				      unsigned int clearing);
 	/* ->media_changed() is DEPRECATED, use ->check_events() instead */
 	int (*media_changed) (struct gendisk *);
 	void (*unlock_native_capacity) (struct gendisk *);
 	int (*revalidate_disk) (struct gendisk *);
 	int (*getgeo)(struct block_device *, struct hd_geometry *);
 	/* this callback is with swap_lock and sometimes page table lock held */
 	void (*swap_slot_free_notify) (struct block_device *, unsigned long);
 	struct module *owner;
 };
 /*
  * Declared here, defined elsewhere.  The parameter list has the same types
  * as the ->ioctl callback in struct block_device_operations.
  * NOTE(review): presumably forwards to that callback -- confirm in the
  * definition.
  */
 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
 				 unsigned long);
 #else /* CONFIG_BLOCK */
 /*
  * stubs for when the block layer is configured out
  */
 /* Without the block layer this name is a compile-time constant 0. */
 #define buffer_heads_over_limit 0
 /*
  * Stub for builds without CONFIG_BLOCK: always reports 0 pages.
  */
 static inline long
 nr_blockdev_pages(void)
 {
 	return 0;
 }
 /*
  * Empty placeholder type for builds without CONFIG_BLOCK, so the plug stubs
  * below have a struct blk_plug to take a pointer to.
  */
 struct blk_plug {
 };
 /*
  * Stub for builds without CONFIG_BLOCK: @plug is ignored, nothing happens.
  */
 static inline void
 blk_start_plug(struct blk_plug *plug)
 {
 }
 /*
  * Stub for builds without CONFIG_BLOCK: @plug is ignored, nothing happens.
  */
 static inline void
 blk_finish_plug(struct blk_plug *plug)
 {
 }
 /*
  * Stub for builds without CONFIG_BLOCK: @task is ignored, nothing happens.
  */
 static inline void
 blk_flush_plug(struct task_struct *task)
 {
 }
 /*
  * Stub for builds without CONFIG_BLOCK: @task is ignored, nothing happens.
  */
 static inline void
 blk_schedule_flush_plug(struct task_struct *task)
 {
 }
 /*
  * Stub for builds without CONFIG_BLOCK: @tsk is ignored and the answer is
  * always false.
  */
 static inline bool
 blk_needs_flush_plug(struct task_struct *tsk)
 {
 	return false;
 }
 #endif /* CONFIG_BLOCK */
 #endif