Commit c10b61f0910466b4b99c266a7d76ac4390743fb5

Authored by Jeff Moyer
Committed by Jens Axboe
1 parent fbbf055692

cfq: Don't allow queue merges for queues that have no process references

Hi,

A user reported a kernel bug when running a particular program that did
the following:

created 32 threads
- each thread took a mutex, grabbed a global offset, added a buffer size
  to that offset, released the lock
- read from the given offset in the file
- created a new thread to do the same
- exited

The result is that cfq's close cooperator logic would trigger, as the
threads were issuing I/O within the mean seek distance of one another.
This workload managed to routinely trigger a use-after-free when
walking the chain of merge candidates for a particular cfqq
(cfqq->new_cfqq).  The logic used for setting up queue merges looks like
this:

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
	new_process_refs = cfqq_process_refs(new_cfqq);
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}
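
For context (not part of the patch): "process references" are the
queue's total reference count minus the references held by allocated
requests.  cfqq_process_refs() in this kernel looked roughly like the
following; this is quoted from memory, so treat it as a sketch rather
than the exact code:

static int cfqq_process_refs(struct cfq_queue *cfqq)
{
	int process_refs, io_refs;

	/* every allocated request holds a reference on the cfqq */
	io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
	process_refs = atomic_read(&cfqq->ref) - io_refs;
	BUG_ON(process_refs < 0);
	return process_refs;
}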

When a merge candidate is found, we add the process references for the
queue with fewer references to the queue with more.  The actual merging
of queues happens when a new request is issued for a given cfqq.  In the
case of the test program, it only does a single pread call to read in
1MB, so the actual merge never happens.
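
The deferred merge itself is performed by cfq_merge_cfqqs(), which runs
from the request setup path once the task issues another request and
its cfqq has ->new_cfqq set.  Roughly (again a sketch from memory, not
quoted from this tree):

static struct cfq_queue *
cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
		struct cfq_queue *cfqq)
{
	/* switch the task's cic over to the merge target ... */
	cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
	cic_set_cfqq(cic, cfqq->new_cfqq, 1);
	cfq_mark_cfqq_coop(cfqq->new_cfqq);
	/* ... and drop the old queue */
	cfq_put_queue(cfqq);
	return cic_to_cfqq(cic, 1);
}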

Normally, this is fine, as when the queue exits, we simply drop the
references we took on the other cfqqs in the merge chain:

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain).  See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}

However, there is a hole in this logic.  Consider the following (and
keep in mind that each I/O keeps a reference to the cfqq):

q1->new_cfqq = q2   // q2 now has 2 process references
q3->new_cfqq = q2   // q2 now has 3 process references

// the process associated with q2 exits
// q2 now has 2 process references

// q1 exits, drops its reference on q2
// q2 now has 1 process reference

// q3 exits, so has 0 process references, and hence drops its references
// to q2, which leaves q2 also with 0 process references

q4 comes along and wants to merge with q3

q3->new_cfqq still points at q2!  We follow that link and end up at an
already freed cfqq.

So, the fix is to not follow a merge chain if the top-most queue does
not have a process reference; otherwise, any queue in the chain could
already have been freed.  I also changed the logic to disallow merging
with a queue that does not have any process references.  Previously, we
did this check for one of the merge candidates, but not the other, which
doesn't really make sense.
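
Concretely, with both changes the patched cfq_setup_merge() ends up
looking roughly like this (reconstructed from the description above
rather than copied verbatim from the diff):

static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

	/*
	 * If there are no process references on the new_cfqq, then it is
	 * unsafe to follow the ->new_cfqq chain, as other cfqq's in the
	 * chain may have dropped their last reference (not just their
	 * last process reference).
	 */
	if (!cfqq_process_refs(new_cfqq))
		return;

	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}

The early return ensures we never walk new_cfqq->new_cfqq for a queue
whose owning processes have all gone away.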

Without the attached patch, my system would BUG within a couple of
seconds of running the reproducer program.  With the patch applied, my
system ran the program for over an hour without issues.

This addresses the following bugzilla:
    https://bugzilla.kernel.org/show_bug.cgi?id=16217

Thanks a ton to Phil Carns for providing the bug report and an excellent
reproducer.

[ Note for stable: this applies to 2.6.32/33/34 ].

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Reported-by: Phil Carns <carns@mcs.anl.gov>
Cc: stable@kernel.org
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 1 changed file (block/cfq-iosched.c) with 11 additions and 2 deletions:

1 /* 1 /*
2 * CFQ, or complete fairness queueing, disk scheduler. 2 * CFQ, or complete fairness queueing, disk scheduler.
3 * 3 *
4 * Based on ideas from a previously unfinished io 4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 * 6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */ 8 */
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/slab.h> 10 #include <linux/slab.h>
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/elevator.h> 12 #include <linux/elevator.h>
13 #include <linux/jiffies.h> 13 #include <linux/jiffies.h>
14 #include <linux/rbtree.h> 14 #include <linux/rbtree.h>
15 #include <linux/ioprio.h> 15 #include <linux/ioprio.h>
16 #include <linux/blktrace_api.h> 16 #include <linux/blktrace_api.h>
17 #include "blk-cgroup.h" 17 #include "blk-cgroup.h"
18 18
19 /* 19 /*
20 * tunables 20 * tunables
21 */ 21 */
22 /* max queue in one round of service */ 22 /* max queue in one round of service */
23 static const int cfq_quantum = 8; 23 static const int cfq_quantum = 8;
24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 24 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
25 /* maximum backwards seek, in KiB */ 25 /* maximum backwards seek, in KiB */
26 static const int cfq_back_max = 16 * 1024; 26 static const int cfq_back_max = 16 * 1024;
27 /* penalty of a backwards seek */ 27 /* penalty of a backwards seek */
28 static const int cfq_back_penalty = 2; 28 static const int cfq_back_penalty = 2;
29 static const int cfq_slice_sync = HZ / 10; 29 static const int cfq_slice_sync = HZ / 10;
30 static int cfq_slice_async = HZ / 25; 30 static int cfq_slice_async = HZ / 25;
31 static const int cfq_slice_async_rq = 2; 31 static const int cfq_slice_async_rq = 2;
32 static int cfq_slice_idle = HZ / 125; 32 static int cfq_slice_idle = HZ / 125;
33 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 33 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
34 static const int cfq_hist_divisor = 4; 34 static const int cfq_hist_divisor = 4;
35 35
36 /* 36 /*
37 * offset from end of service tree 37 * offset from end of service tree
38 */ 38 */
39 #define CFQ_IDLE_DELAY (HZ / 5) 39 #define CFQ_IDLE_DELAY (HZ / 5)
40 40
41 /* 41 /*
42 * below this threshold, we consider thinktime immediate 42 * below this threshold, we consider thinktime immediate
43 */ 43 */
44 #define CFQ_MIN_TT (2) 44 #define CFQ_MIN_TT (2)
45 45
46 #define CFQ_SLICE_SCALE (5) 46 #define CFQ_SLICE_SCALE (5)
47 #define CFQ_HW_QUEUE_MIN (5) 47 #define CFQ_HW_QUEUE_MIN (5)
48 #define CFQ_SERVICE_SHIFT 12 48 #define CFQ_SERVICE_SHIFT 12
49 49
50 #define CFQQ_SEEK_THR (sector_t)(8 * 100) 50 #define CFQQ_SEEK_THR (sector_t)(8 * 100)
51 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 51 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
52 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 52 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
53 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 53 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
54 54
55 #define RQ_CIC(rq) \ 55 #define RQ_CIC(rq) \
56 ((struct cfq_io_context *) (rq)->elevator_private) 56 ((struct cfq_io_context *) (rq)->elevator_private)
57 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 57 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
58 #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3) 58 #define RQ_CFQG(rq) (struct cfq_group *) ((rq)->elevator_private3)
59 59
60 static struct kmem_cache *cfq_pool; 60 static struct kmem_cache *cfq_pool;
61 static struct kmem_cache *cfq_ioc_pool; 61 static struct kmem_cache *cfq_ioc_pool;
62 62
63 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 63 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
64 static struct completion *ioc_gone; 64 static struct completion *ioc_gone;
65 static DEFINE_SPINLOCK(ioc_gone_lock); 65 static DEFINE_SPINLOCK(ioc_gone_lock);
66 66
67 static DEFINE_SPINLOCK(cic_index_lock); 67 static DEFINE_SPINLOCK(cic_index_lock);
68 static DEFINE_IDA(cic_index_ida); 68 static DEFINE_IDA(cic_index_ida);
69 69
70 #define CFQ_PRIO_LISTS IOPRIO_BE_NR 70 #define CFQ_PRIO_LISTS IOPRIO_BE_NR
71 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 71 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
72 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 72 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
73 73
74 #define sample_valid(samples) ((samples) > 80) 74 #define sample_valid(samples) ((samples) > 80)
75 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 75 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
76 76
77 /* 77 /*
78 * Most of our rbtree usage is for sorting with min extraction, so 78 * Most of our rbtree usage is for sorting with min extraction, so
79 * if we cache the leftmost node we don't have to walk down the tree 79 * if we cache the leftmost node we don't have to walk down the tree
80 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 80 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
81 * move this into the elevator for the rq sorting as well. 81 * move this into the elevator for the rq sorting as well.
82 */ 82 */
83 struct cfq_rb_root { 83 struct cfq_rb_root {
84 struct rb_root rb; 84 struct rb_root rb;
85 struct rb_node *left; 85 struct rb_node *left;
86 unsigned count; 86 unsigned count;
87 unsigned total_weight; 87 unsigned total_weight;
88 u64 min_vdisktime; 88 u64 min_vdisktime;
89 struct rb_node *active; 89 struct rb_node *active;
90 }; 90 };
91 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 91 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
92 .count = 0, .min_vdisktime = 0, } 92 .count = 0, .min_vdisktime = 0, }
93 93
94 /* 94 /*
95 * Per process-grouping structure 95 * Per process-grouping structure
96 */ 96 */
97 struct cfq_queue { 97 struct cfq_queue {
98 /* reference count */ 98 /* reference count */
99 atomic_t ref; 99 atomic_t ref;
100 /* various state flags, see below */ 100 /* various state flags, see below */
101 unsigned int flags; 101 unsigned int flags;
102 /* parent cfq_data */ 102 /* parent cfq_data */
103 struct cfq_data *cfqd; 103 struct cfq_data *cfqd;
104 /* service_tree member */ 104 /* service_tree member */
105 struct rb_node rb_node; 105 struct rb_node rb_node;
106 /* service_tree key */ 106 /* service_tree key */
107 unsigned long rb_key; 107 unsigned long rb_key;
108 /* prio tree member */ 108 /* prio tree member */
109 struct rb_node p_node; 109 struct rb_node p_node;
110 /* prio tree root we belong to, if any */ 110 /* prio tree root we belong to, if any */
111 struct rb_root *p_root; 111 struct rb_root *p_root;
112 /* sorted list of pending requests */ 112 /* sorted list of pending requests */
113 struct rb_root sort_list; 113 struct rb_root sort_list;
114 /* if fifo isn't expired, next request to serve */ 114 /* if fifo isn't expired, next request to serve */
115 struct request *next_rq; 115 struct request *next_rq;
116 /* requests queued in sort_list */ 116 /* requests queued in sort_list */
117 int queued[2]; 117 int queued[2];
118 /* currently allocated requests */ 118 /* currently allocated requests */
119 int allocated[2]; 119 int allocated[2];
120 /* fifo list of requests in sort_list */ 120 /* fifo list of requests in sort_list */
121 struct list_head fifo; 121 struct list_head fifo;
122 122
123 /* time when queue got scheduled in to dispatch first request. */ 123 /* time when queue got scheduled in to dispatch first request. */
124 unsigned long dispatch_start; 124 unsigned long dispatch_start;
125 unsigned int allocated_slice; 125 unsigned int allocated_slice;
126 unsigned int slice_dispatch; 126 unsigned int slice_dispatch;
127 /* time when first request from queue completed and slice started. */ 127 /* time when first request from queue completed and slice started. */
128 unsigned long slice_start; 128 unsigned long slice_start;
129 unsigned long slice_end; 129 unsigned long slice_end;
130 long slice_resid; 130 long slice_resid;
131 131
132 /* pending metadata requests */ 132 /* pending metadata requests */
133 int meta_pending; 133 int meta_pending;
134 /* number of requests that are on the dispatch list or inside driver */ 134 /* number of requests that are on the dispatch list or inside driver */
135 int dispatched; 135 int dispatched;
136 136
137 /* io prio of this group */ 137 /* io prio of this group */
138 unsigned short ioprio, org_ioprio; 138 unsigned short ioprio, org_ioprio;
139 unsigned short ioprio_class, org_ioprio_class; 139 unsigned short ioprio_class, org_ioprio_class;
140 140
141 pid_t pid; 141 pid_t pid;
142 142
143 u32 seek_history; 143 u32 seek_history;
144 sector_t last_request_pos; 144 sector_t last_request_pos;
145 145
146 struct cfq_rb_root *service_tree; 146 struct cfq_rb_root *service_tree;
147 struct cfq_queue *new_cfqq; 147 struct cfq_queue *new_cfqq;
148 struct cfq_group *cfqg; 148 struct cfq_group *cfqg;
149 struct cfq_group *orig_cfqg; 149 struct cfq_group *orig_cfqg;
150 }; 150 };
151 151
152 /* 152 /*
153 * First index in the service_trees. 153 * First index in the service_trees.
154 * IDLE is handled separately, so it has negative index 154 * IDLE is handled separately, so it has negative index
155 */ 155 */
156 enum wl_prio_t { 156 enum wl_prio_t {
157 BE_WORKLOAD = 0, 157 BE_WORKLOAD = 0,
158 RT_WORKLOAD = 1, 158 RT_WORKLOAD = 1,
159 IDLE_WORKLOAD = 2, 159 IDLE_WORKLOAD = 2,
160 }; 160 };
161 161
162 /* 162 /*
163 * Second index in the service_trees. 163 * Second index in the service_trees.
164 */ 164 */
165 enum wl_type_t { 165 enum wl_type_t {
166 ASYNC_WORKLOAD = 0, 166 ASYNC_WORKLOAD = 0,
167 SYNC_NOIDLE_WORKLOAD = 1, 167 SYNC_NOIDLE_WORKLOAD = 1,
168 SYNC_WORKLOAD = 2 168 SYNC_WORKLOAD = 2
169 }; 169 };
170 170
171 /* This is per cgroup per device grouping structure */ 171 /* This is per cgroup per device grouping structure */
172 struct cfq_group { 172 struct cfq_group {
173 /* group service_tree member */ 173 /* group service_tree member */
174 struct rb_node rb_node; 174 struct rb_node rb_node;
175 175
176 /* group service_tree key */ 176 /* group service_tree key */
177 u64 vdisktime; 177 u64 vdisktime;
178 unsigned int weight; 178 unsigned int weight;
179 bool on_st; 179 bool on_st;
180 180
181 /* number of cfqq currently on this group */ 181 /* number of cfqq currently on this group */
182 int nr_cfqq; 182 int nr_cfqq;
183 183
184 /* Per group busy queus average. Useful for workload slice calc. */ 184 /* Per group busy queus average. Useful for workload slice calc. */
185 unsigned int busy_queues_avg[2]; 185 unsigned int busy_queues_avg[2];
186 /* 186 /*
187 * rr lists of queues with requests, onle rr for each priority class. 187 * rr lists of queues with requests, onle rr for each priority class.
188 * Counts are embedded in the cfq_rb_root 188 * Counts are embedded in the cfq_rb_root
189 */ 189 */
190 struct cfq_rb_root service_trees[2][3]; 190 struct cfq_rb_root service_trees[2][3];
191 struct cfq_rb_root service_tree_idle; 191 struct cfq_rb_root service_tree_idle;
192 192
193 unsigned long saved_workload_slice; 193 unsigned long saved_workload_slice;
194 enum wl_type_t saved_workload; 194 enum wl_type_t saved_workload;
195 enum wl_prio_t saved_serving_prio; 195 enum wl_prio_t saved_serving_prio;
196 struct blkio_group blkg; 196 struct blkio_group blkg;
197 #ifdef CONFIG_CFQ_GROUP_IOSCHED 197 #ifdef CONFIG_CFQ_GROUP_IOSCHED
198 struct hlist_node cfqd_node; 198 struct hlist_node cfqd_node;
199 atomic_t ref; 199 atomic_t ref;
200 #endif 200 #endif
201 }; 201 };
202 202
203 /* 203 /*
204 * Per block device queue structure 204 * Per block device queue structure
205 */ 205 */
206 struct cfq_data { 206 struct cfq_data {
207 struct request_queue *queue; 207 struct request_queue *queue;
208 /* Root service tree for cfq_groups */ 208 /* Root service tree for cfq_groups */
209 struct cfq_rb_root grp_service_tree; 209 struct cfq_rb_root grp_service_tree;
210 struct cfq_group root_group; 210 struct cfq_group root_group;
211 211
212 /* 212 /*
213 * The priority currently being served 213 * The priority currently being served
214 */ 214 */
215 enum wl_prio_t serving_prio; 215 enum wl_prio_t serving_prio;
216 enum wl_type_t serving_type; 216 enum wl_type_t serving_type;
217 unsigned long workload_expires; 217 unsigned long workload_expires;
218 struct cfq_group *serving_group; 218 struct cfq_group *serving_group;
219 bool noidle_tree_requires_idle; 219 bool noidle_tree_requires_idle;
220 220
221 /* 221 /*
222 * Each priority tree is sorted by next_request position. These 222 * Each priority tree is sorted by next_request position. These
223 * trees are used when determining if two or more queues are 223 * trees are used when determining if two or more queues are
224 * interleaving requests (see cfq_close_cooperator). 224 * interleaving requests (see cfq_close_cooperator).
225 */ 225 */
226 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 226 struct rb_root prio_trees[CFQ_PRIO_LISTS];
227 227
228 unsigned int busy_queues; 228 unsigned int busy_queues;
229 229
230 int rq_in_driver; 230 int rq_in_driver;
231 int rq_in_flight[2]; 231 int rq_in_flight[2];
232 232
233 /* 233 /*
234 * queue-depth detection 234 * queue-depth detection
235 */ 235 */
236 int rq_queued; 236 int rq_queued;
237 int hw_tag; 237 int hw_tag;
238 /* 238 /*
239 * hw_tag can be 239 * hw_tag can be
240 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 240 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
241 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 241 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
242 * 0 => no NCQ 242 * 0 => no NCQ
243 */ 243 */
244 int hw_tag_est_depth; 244 int hw_tag_est_depth;
245 unsigned int hw_tag_samples; 245 unsigned int hw_tag_samples;
246 246
247 /* 247 /*
248 * idle window management 248 * idle window management
249 */ 249 */
250 struct timer_list idle_slice_timer; 250 struct timer_list idle_slice_timer;
251 struct work_struct unplug_work; 251 struct work_struct unplug_work;
252 252
253 struct cfq_queue *active_queue; 253 struct cfq_queue *active_queue;
254 struct cfq_io_context *active_cic; 254 struct cfq_io_context *active_cic;
255 255
256 /* 256 /*
257 * async queue for each priority case 257 * async queue for each priority case
258 */ 258 */
259 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 259 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
260 struct cfq_queue *async_idle_cfqq; 260 struct cfq_queue *async_idle_cfqq;
261 261
262 sector_t last_position; 262 sector_t last_position;
263 263
264 /* 264 /*
265 * tunables, see top of file 265 * tunables, see top of file
266 */ 266 */
267 unsigned int cfq_quantum; 267 unsigned int cfq_quantum;
268 unsigned int cfq_fifo_expire[2]; 268 unsigned int cfq_fifo_expire[2];
269 unsigned int cfq_back_penalty; 269 unsigned int cfq_back_penalty;
270 unsigned int cfq_back_max; 270 unsigned int cfq_back_max;
271 unsigned int cfq_slice[2]; 271 unsigned int cfq_slice[2];
272 unsigned int cfq_slice_async_rq; 272 unsigned int cfq_slice_async_rq;
273 unsigned int cfq_slice_idle; 273 unsigned int cfq_slice_idle;
274 unsigned int cfq_latency; 274 unsigned int cfq_latency;
275 unsigned int cfq_group_isolation; 275 unsigned int cfq_group_isolation;
276 276
277 unsigned int cic_index; 277 unsigned int cic_index;
278 struct list_head cic_list; 278 struct list_head cic_list;
279 279
280 /* 280 /*
281 * Fallback dummy cfqq for extreme OOM conditions 281 * Fallback dummy cfqq for extreme OOM conditions
282 */ 282 */
283 struct cfq_queue oom_cfqq; 283 struct cfq_queue oom_cfqq;
284 284
285 unsigned long last_delayed_sync; 285 unsigned long last_delayed_sync;
286 286
287 /* List of cfq groups being managed on this device*/ 287 /* List of cfq groups being managed on this device*/
288 struct hlist_head cfqg_list; 288 struct hlist_head cfqg_list;
289 struct rcu_head rcu; 289 struct rcu_head rcu;
290 }; 290 };
291 291
292 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 292 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
293 293
294 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 294 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
295 enum wl_prio_t prio, 295 enum wl_prio_t prio,
296 enum wl_type_t type) 296 enum wl_type_t type)
297 { 297 {
298 if (!cfqg) 298 if (!cfqg)
299 return NULL; 299 return NULL;
300 300
301 if (prio == IDLE_WORKLOAD) 301 if (prio == IDLE_WORKLOAD)
302 return &cfqg->service_tree_idle; 302 return &cfqg->service_tree_idle;
303 303
304 return &cfqg->service_trees[prio][type]; 304 return &cfqg->service_trees[prio][type];
305 } 305 }
306 306
307 enum cfqq_state_flags { 307 enum cfqq_state_flags {
308 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 308 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
309 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 309 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
310 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 310 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
311 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 311 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
312 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 312 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
313 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 313 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
314 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 314 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
315 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 315 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
316 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 316 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
317 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 317 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
318 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 318 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
319 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 319 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
320 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 320 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
321 }; 321 };
322 322
323 #define CFQ_CFQQ_FNS(name) \ 323 #define CFQ_CFQQ_FNS(name) \
324 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 324 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
325 { \ 325 { \
326 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 326 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
327 } \ 327 } \
328 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 328 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
329 { \ 329 { \
330 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 330 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
331 } \ 331 } \
332 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 332 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
333 { \ 333 { \
334 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 334 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
335 } 335 }
336 336
337 CFQ_CFQQ_FNS(on_rr); 337 CFQ_CFQQ_FNS(on_rr);
338 CFQ_CFQQ_FNS(wait_request); 338 CFQ_CFQQ_FNS(wait_request);
339 CFQ_CFQQ_FNS(must_dispatch); 339 CFQ_CFQQ_FNS(must_dispatch);
340 CFQ_CFQQ_FNS(must_alloc_slice); 340 CFQ_CFQQ_FNS(must_alloc_slice);
341 CFQ_CFQQ_FNS(fifo_expire); 341 CFQ_CFQQ_FNS(fifo_expire);
342 CFQ_CFQQ_FNS(idle_window); 342 CFQ_CFQQ_FNS(idle_window);
343 CFQ_CFQQ_FNS(prio_changed); 343 CFQ_CFQQ_FNS(prio_changed);
344 CFQ_CFQQ_FNS(slice_new); 344 CFQ_CFQQ_FNS(slice_new);
345 CFQ_CFQQ_FNS(sync); 345 CFQ_CFQQ_FNS(sync);
346 CFQ_CFQQ_FNS(coop); 346 CFQ_CFQQ_FNS(coop);
347 CFQ_CFQQ_FNS(split_coop); 347 CFQ_CFQQ_FNS(split_coop);
348 CFQ_CFQQ_FNS(deep); 348 CFQ_CFQQ_FNS(deep);
349 CFQ_CFQQ_FNS(wait_busy); 349 CFQ_CFQQ_FNS(wait_busy);
350 #undef CFQ_CFQQ_FNS 350 #undef CFQ_CFQQ_FNS
351 351
352 #ifdef CONFIG_CFQ_GROUP_IOSCHED 352 #ifdef CONFIG_CFQ_GROUP_IOSCHED
353 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 353 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
354 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 354 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
355 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 355 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
356 blkg_path(&(cfqq)->cfqg->blkg), ##args); 356 blkg_path(&(cfqq)->cfqg->blkg), ##args);
357 357
358 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 358 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
359 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 359 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
360 blkg_path(&(cfqg)->blkg), ##args); \ 360 blkg_path(&(cfqg)->blkg), ##args); \
361 361
362 #else 362 #else
363 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 363 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
364 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 364 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
365 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 365 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
366 #endif 366 #endif
367 #define cfq_log(cfqd, fmt, args...) \ 367 #define cfq_log(cfqd, fmt, args...) \
368 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 368 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
369 369
370 /* Traverses through cfq group service trees */ 370 /* Traverses through cfq group service trees */
371 #define for_each_cfqg_st(cfqg, i, j, st) \ 371 #define for_each_cfqg_st(cfqg, i, j, st) \
372 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 372 for (i = 0; i <= IDLE_WORKLOAD; i++) \
373 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 373 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
374 : &cfqg->service_tree_idle; \ 374 : &cfqg->service_tree_idle; \
375 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 375 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
376 (i == IDLE_WORKLOAD && j == 0); \ 376 (i == IDLE_WORKLOAD && j == 0); \
377 j++, st = i < IDLE_WORKLOAD ? \ 377 j++, st = i < IDLE_WORKLOAD ? \
378 &cfqg->service_trees[i][j]: NULL) \ 378 &cfqg->service_trees[i][j]: NULL) \
379 379
380 380
381 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 381 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
382 { 382 {
383 if (cfq_class_idle(cfqq)) 383 if (cfq_class_idle(cfqq))
384 return IDLE_WORKLOAD; 384 return IDLE_WORKLOAD;
385 if (cfq_class_rt(cfqq)) 385 if (cfq_class_rt(cfqq))
386 return RT_WORKLOAD; 386 return RT_WORKLOAD;
387 return BE_WORKLOAD; 387 return BE_WORKLOAD;
388 } 388 }
389 389
390 390
391 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 391 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
392 { 392 {
393 if (!cfq_cfqq_sync(cfqq)) 393 if (!cfq_cfqq_sync(cfqq))
394 return ASYNC_WORKLOAD; 394 return ASYNC_WORKLOAD;
395 if (!cfq_cfqq_idle_window(cfqq)) 395 if (!cfq_cfqq_idle_window(cfqq))
396 return SYNC_NOIDLE_WORKLOAD; 396 return SYNC_NOIDLE_WORKLOAD;
397 return SYNC_WORKLOAD; 397 return SYNC_WORKLOAD;
398 } 398 }
399 399
400 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 400 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
401 struct cfq_data *cfqd, 401 struct cfq_data *cfqd,
402 struct cfq_group *cfqg) 402 struct cfq_group *cfqg)
403 { 403 {
404 if (wl == IDLE_WORKLOAD) 404 if (wl == IDLE_WORKLOAD)
405 return cfqg->service_tree_idle.count; 405 return cfqg->service_tree_idle.count;
406 406
407 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 407 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
408 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 408 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
409 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 409 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
410 } 410 }
411 411
412 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 412 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
413 struct cfq_group *cfqg) 413 struct cfq_group *cfqg)
414 { 414 {
415 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 415 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
416 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 416 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
417 } 417 }
418 418
419 static void cfq_dispatch_insert(struct request_queue *, struct request *); 419 static void cfq_dispatch_insert(struct request_queue *, struct request *);
420 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 420 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
421 struct io_context *, gfp_t); 421 struct io_context *, gfp_t);
422 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 422 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
423 struct io_context *); 423 struct io_context *);
424 424
425 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 425 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
426 bool is_sync) 426 bool is_sync)
427 { 427 {
428 return cic->cfqq[is_sync]; 428 return cic->cfqq[is_sync];
429 } 429 }
430 430
431 static inline void cic_set_cfqq(struct cfq_io_context *cic, 431 static inline void cic_set_cfqq(struct cfq_io_context *cic,
432 struct cfq_queue *cfqq, bool is_sync) 432 struct cfq_queue *cfqq, bool is_sync)
433 { 433 {
434 cic->cfqq[is_sync] = cfqq; 434 cic->cfqq[is_sync] = cfqq;
435 } 435 }
436 436
437 #define CIC_DEAD_KEY 1ul 437 #define CIC_DEAD_KEY 1ul
438 #define CIC_DEAD_INDEX_SHIFT 1 438 #define CIC_DEAD_INDEX_SHIFT 1
439 439
440 static inline void *cfqd_dead_key(struct cfq_data *cfqd) 440 static inline void *cfqd_dead_key(struct cfq_data *cfqd)
441 { 441 {
442 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY); 442 return (void *)(cfqd->cic_index << CIC_DEAD_INDEX_SHIFT | CIC_DEAD_KEY);
443 } 443 }
444 444
445 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic) 445 static inline struct cfq_data *cic_to_cfqd(struct cfq_io_context *cic)
446 { 446 {
447 struct cfq_data *cfqd = cic->key; 447 struct cfq_data *cfqd = cic->key;
448 448
449 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY)) 449 if (unlikely((unsigned long) cfqd & CIC_DEAD_KEY))
450 return NULL; 450 return NULL;
451 451
452 return cfqd; 452 return cfqd;
453 } 453 }
454 454
455 /* 455 /*
456 * We regard a request as SYNC, if it's either a read or has the SYNC bit 456 * We regard a request as SYNC, if it's either a read or has the SYNC bit
457 * set (in which case it could also be direct WRITE). 457 * set (in which case it could also be direct WRITE).
458 */ 458 */
459 static inline bool cfq_bio_sync(struct bio *bio) 459 static inline bool cfq_bio_sync(struct bio *bio)
460 { 460 {
461 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); 461 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
462 } 462 }
463 463
464 /* 464 /*
465 * scheduler run of queue, if there are requests pending and no one in the 465 * scheduler run of queue, if there are requests pending and no one in the
466 * driver that will restart queueing 466 * driver that will restart queueing
467 */ 467 */
468 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 468 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
469 { 469 {
470 if (cfqd->busy_queues) { 470 if (cfqd->busy_queues) {
471 cfq_log(cfqd, "schedule dispatch"); 471 cfq_log(cfqd, "schedule dispatch");
472 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 472 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
473 } 473 }
474 } 474 }
475 475
476 static int cfq_queue_empty(struct request_queue *q) 476 static int cfq_queue_empty(struct request_queue *q)
477 { 477 {
478 struct cfq_data *cfqd = q->elevator->elevator_data; 478 struct cfq_data *cfqd = q->elevator->elevator_data;
479 479
480 return !cfqd->rq_queued; 480 return !cfqd->rq_queued;
481 } 481 }
482 482
483 /* 483 /*
484 * Scale schedule slice based on io priority. Use the sync time slice only 484 * Scale schedule slice based on io priority. Use the sync time slice only
485 * if a queue is marked sync and has sync io queued. A sync queue with async 485 * if a queue is marked sync and has sync io queued. A sync queue with async
486 * io only, should not get full sync slice length. 486 * io only, should not get full sync slice length.
487 */ 487 */
488 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 488 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
489 unsigned short prio) 489 unsigned short prio)
490 { 490 {
491 const int base_slice = cfqd->cfq_slice[sync]; 491 const int base_slice = cfqd->cfq_slice[sync];
492 492
493 WARN_ON(prio >= IOPRIO_BE_NR); 493 WARN_ON(prio >= IOPRIO_BE_NR);
494 494
495 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 495 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
496 } 496 }
497 497
498 static inline int 498 static inline int
499 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 499 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
500 { 500 {
501 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 501 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
502 } 502 }
503 503
504 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 504 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
505 { 505 {
506 u64 d = delta << CFQ_SERVICE_SHIFT; 506 u64 d = delta << CFQ_SERVICE_SHIFT;
507 507
508 d = d * BLKIO_WEIGHT_DEFAULT; 508 d = d * BLKIO_WEIGHT_DEFAULT;
509 do_div(d, cfqg->weight); 509 do_div(d, cfqg->weight);
510 return d; 510 return d;
511 } 511 }
512 512
513 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 513 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
514 { 514 {
515 s64 delta = (s64)(vdisktime - min_vdisktime); 515 s64 delta = (s64)(vdisktime - min_vdisktime);
516 if (delta > 0) 516 if (delta > 0)
517 min_vdisktime = vdisktime; 517 min_vdisktime = vdisktime;
518 518
519 return min_vdisktime; 519 return min_vdisktime;
520 } 520 }
521 521
522 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 522 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
523 { 523 {
524 s64 delta = (s64)(vdisktime - min_vdisktime); 524 s64 delta = (s64)(vdisktime - min_vdisktime);
525 if (delta < 0) 525 if (delta < 0)
526 min_vdisktime = vdisktime; 526 min_vdisktime = vdisktime;
527 527
528 return min_vdisktime; 528 return min_vdisktime;
529 } 529 }
530 530
531 static void update_min_vdisktime(struct cfq_rb_root *st) 531 static void update_min_vdisktime(struct cfq_rb_root *st)
532 { 532 {
533 u64 vdisktime = st->min_vdisktime; 533 u64 vdisktime = st->min_vdisktime;
534 struct cfq_group *cfqg; 534 struct cfq_group *cfqg;
535 535
536 if (st->active) { 536 if (st->active) {
537 cfqg = rb_entry_cfqg(st->active); 537 cfqg = rb_entry_cfqg(st->active);
538 vdisktime = cfqg->vdisktime; 538 vdisktime = cfqg->vdisktime;
539 } 539 }
540 540
541 if (st->left) { 541 if (st->left) {
542 cfqg = rb_entry_cfqg(st->left); 542 cfqg = rb_entry_cfqg(st->left);
543 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 543 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
544 } 544 }
545 545
546 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 546 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
547 } 547 }
548 548
549 /* 549 /*
550 * get averaged number of queues of RT/BE priority. 550 * get averaged number of queues of RT/BE priority.
551 * average is updated, with a formula that gives more weight to higher numbers, 551 * average is updated, with a formula that gives more weight to higher numbers,
552 * to quickly follows sudden increases and decrease slowly 552 * to quickly follows sudden increases and decrease slowly
553 */ 553 */
554 554
555 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 555 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
556 struct cfq_group *cfqg, bool rt) 556 struct cfq_group *cfqg, bool rt)
557 { 557 {
558 unsigned min_q, max_q; 558 unsigned min_q, max_q;
559 unsigned mult = cfq_hist_divisor - 1; 559 unsigned mult = cfq_hist_divisor - 1;
560 unsigned round = cfq_hist_divisor / 2; 560 unsigned round = cfq_hist_divisor / 2;
561 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 561 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
562 562
563 min_q = min(cfqg->busy_queues_avg[rt], busy); 563 min_q = min(cfqg->busy_queues_avg[rt], busy);
564 max_q = max(cfqg->busy_queues_avg[rt], busy); 564 max_q = max(cfqg->busy_queues_avg[rt], busy);
565 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 565 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
566 cfq_hist_divisor; 566 cfq_hist_divisor;
567 return cfqg->busy_queues_avg[rt]; 567 return cfqg->busy_queues_avg[rt];
568 } 568 }
569 569
570 static inline unsigned 570 static inline unsigned
571 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 571 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
572 { 572 {
573 struct cfq_rb_root *st = &cfqd->grp_service_tree; 573 struct cfq_rb_root *st = &cfqd->grp_service_tree;
574 574
575 return cfq_target_latency * cfqg->weight / st->total_weight; 575 return cfq_target_latency * cfqg->weight / st->total_weight;
576 } 576 }
577 577
578 static inline void 578 static inline void
579 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 579 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
580 { 580 {
581 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 581 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
582 if (cfqd->cfq_latency) { 582 if (cfqd->cfq_latency) {
583 /* 583 /*
584 * interested queues (we consider only the ones with the same 584 * interested queues (we consider only the ones with the same
585 * priority class in the cfq group) 585 * priority class in the cfq group)
586 */ 586 */
587 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 587 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
588 cfq_class_rt(cfqq)); 588 cfq_class_rt(cfqq));
589 unsigned sync_slice = cfqd->cfq_slice[1]; 589 unsigned sync_slice = cfqd->cfq_slice[1];
590 unsigned expect_latency = sync_slice * iq; 590 unsigned expect_latency = sync_slice * iq;
591 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 591 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
592 592
593 if (expect_latency > group_slice) { 593 if (expect_latency > group_slice) {
594 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 594 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
595 /* scale low_slice according to IO priority 595 /* scale low_slice according to IO priority
596 * and sync vs async */ 596 * and sync vs async */
597 unsigned low_slice = 597 unsigned low_slice =
598 min(slice, base_low_slice * slice / sync_slice); 598 min(slice, base_low_slice * slice / sync_slice);
599 /* the adapted slice value is scaled to fit all iqs 599 /* the adapted slice value is scaled to fit all iqs
600 * into the target latency */ 600 * into the target latency */
601 slice = max(slice * group_slice / expect_latency, 601 slice = max(slice * group_slice / expect_latency,
602 low_slice); 602 low_slice);
603 } 603 }
604 } 604 }
605 cfqq->slice_start = jiffies; 605 cfqq->slice_start = jiffies;
606 cfqq->slice_end = jiffies + slice; 606 cfqq->slice_end = jiffies + slice;
607 cfqq->allocated_slice = slice; 607 cfqq->allocated_slice = slice;
608 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 608 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
609 } 609 }
610 610
611 /* 611 /*
612 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 612 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
613 * isn't valid until the first request from the dispatch is activated 613 * isn't valid until the first request from the dispatch is activated
614 * and the slice time set. 614 * and the slice time set.
615 */ 615 */
616 static inline bool cfq_slice_used(struct cfq_queue *cfqq) 616 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
617 { 617 {
618 if (cfq_cfqq_slice_new(cfqq)) 618 if (cfq_cfqq_slice_new(cfqq))
619 return 0; 619 return 0;
620 if (time_before(jiffies, cfqq->slice_end)) 620 if (time_before(jiffies, cfqq->slice_end))
621 return 0; 621 return 0;
622 622
623 return 1; 623 return 1;
624 } 624 }
625 625
626 /* 626 /*
627 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 627 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
628 * We choose the request that is closest to the head right now. Distance 628 * We choose the request that is closest to the head right now. Distance
629 * behind the head is penalized and only allowed to a certain extent. 629 * behind the head is penalized and only allowed to a certain extent.
630 */ 630 */
631 static struct request * 631 static struct request *
632 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 632 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
633 { 633 {
634 sector_t s1, s2, d1 = 0, d2 = 0; 634 sector_t s1, s2, d1 = 0, d2 = 0;
635 unsigned long back_max; 635 unsigned long back_max;
636 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 636 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
637 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 637 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
638 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 638 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
639 639
640 if (rq1 == NULL || rq1 == rq2) 640 if (rq1 == NULL || rq1 == rq2)
641 return rq2; 641 return rq2;
642 if (rq2 == NULL) 642 if (rq2 == NULL)
643 return rq1; 643 return rq1;
644 644
645 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 645 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
646 return rq1; 646 return rq1;
647 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 647 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
648 return rq2; 648 return rq2;
649 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 649 if (rq_is_meta(rq1) && !rq_is_meta(rq2))
650 return rq1; 650 return rq1;
651 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 651 else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
652 return rq2; 652 return rq2;
653 653
654 s1 = blk_rq_pos(rq1); 654 s1 = blk_rq_pos(rq1);
655 s2 = blk_rq_pos(rq2); 655 s2 = blk_rq_pos(rq2);
656 656
657 /* 657 /*
658 * by definition, 1KiB is 2 sectors 658 * by definition, 1KiB is 2 sectors
659 */ 659 */
660 back_max = cfqd->cfq_back_max * 2; 660 back_max = cfqd->cfq_back_max * 2;
661 661
662 /* 662 /*
663 * Strict one way elevator _except_ in the case where we allow 663 * Strict one way elevator _except_ in the case where we allow
664 * short backward seeks which are biased as twice the cost of a 664 * short backward seeks which are biased as twice the cost of a
665 * similar forward seek. 665 * similar forward seek.
666 */ 666 */
667 if (s1 >= last) 667 if (s1 >= last)
668 d1 = s1 - last; 668 d1 = s1 - last;
669 else if (s1 + back_max >= last) 669 else if (s1 + back_max >= last)
670 d1 = (last - s1) * cfqd->cfq_back_penalty; 670 d1 = (last - s1) * cfqd->cfq_back_penalty;
671 else 671 else
672 wrap |= CFQ_RQ1_WRAP; 672 wrap |= CFQ_RQ1_WRAP;
673 673
674 if (s2 >= last) 674 if (s2 >= last)
675 d2 = s2 - last; 675 d2 = s2 - last;
676 else if (s2 + back_max >= last) 676 else if (s2 + back_max >= last)
677 d2 = (last - s2) * cfqd->cfq_back_penalty; 677 d2 = (last - s2) * cfqd->cfq_back_penalty;
678 else 678 else
679 wrap |= CFQ_RQ2_WRAP; 679 wrap |= CFQ_RQ2_WRAP;
680 680
681 /* Found required data */ 681 /* Found required data */
682 682
683 /* 683 /*
684 * By doing switch() on the bit mask "wrap" we avoid having to 684 * By doing switch() on the bit mask "wrap" we avoid having to
685 * check two variables for all permutations: --> faster! 685 * check two variables for all permutations: --> faster!
686 */ 686 */
687 switch (wrap) { 687 switch (wrap) {
688 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 688 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
689 if (d1 < d2) 689 if (d1 < d2)
690 return rq1; 690 return rq1;
691 else if (d2 < d1) 691 else if (d2 < d1)
692 return rq2; 692 return rq2;
693 else { 693 else {
694 if (s1 >= s2) 694 if (s1 >= s2)
695 return rq1; 695 return rq1;
696 else 696 else
697 return rq2; 697 return rq2;
698 } 698 }
699 699
700 case CFQ_RQ2_WRAP: 700 case CFQ_RQ2_WRAP:
701 return rq1; 701 return rq1;
702 case CFQ_RQ1_WRAP: 702 case CFQ_RQ1_WRAP:
703 return rq2; 703 return rq2;
704 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 704 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
705 default: 705 default:
706 /* 706 /*
707 * Since both rqs are wrapped, 707 * Since both rqs are wrapped,
708 * start with the one that's further behind head 708 * start with the one that's further behind head
709 * (--> only *one* back seek required), 709 * (--> only *one* back seek required),
710 * since back seek takes more time than forward. 710 * since back seek takes more time than forward.
711 */ 711 */
712 if (s1 <= s2) 712 if (s1 <= s2)
713 return rq1; 713 return rq1;
714 else 714 else
715 return rq2; 715 return rq2;
716 } 716 }
717 } 717 }
718 718
719 /* 719 /*
720 * The below is leftmost cache rbtree addon 720 * The below is leftmost cache rbtree addon
721 */ 721 */
722 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 722 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
723 { 723 {
724 /* Service tree is empty */ 724 /* Service tree is empty */
725 if (!root->count) 725 if (!root->count)
726 return NULL; 726 return NULL;
727 727
728 if (!root->left) 728 if (!root->left)
729 root->left = rb_first(&root->rb); 729 root->left = rb_first(&root->rb);
730 730
731 if (root->left) 731 if (root->left)
732 return rb_entry(root->left, struct cfq_queue, rb_node); 732 return rb_entry(root->left, struct cfq_queue, rb_node);
733 733
734 return NULL; 734 return NULL;
735 } 735 }
736 736
737 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 737 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
738 { 738 {
739 if (!root->left) 739 if (!root->left)
740 root->left = rb_first(&root->rb); 740 root->left = rb_first(&root->rb);
741 741
742 if (root->left) 742 if (root->left)
743 return rb_entry_cfqg(root->left); 743 return rb_entry_cfqg(root->left);
744 744
745 return NULL; 745 return NULL;
746 } 746 }
747 747
748 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 748 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
749 { 749 {
750 rb_erase(n, root); 750 rb_erase(n, root);
751 RB_CLEAR_NODE(n); 751 RB_CLEAR_NODE(n);
752 } 752 }
753 753
754 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 754 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
755 { 755 {
756 if (root->left == n) 756 if (root->left == n)
757 root->left = NULL; 757 root->left = NULL;
758 rb_erase_init(n, &root->rb); 758 rb_erase_init(n, &root->rb);
759 --root->count; 759 --root->count;
760 } 760 }
761 761
762 /* 762 /*
763 * would be nice to take fifo expire time into account as well 763 * would be nice to take fifo expire time into account as well
764 */ 764 */
765 static struct request * 765 static struct request *
766 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 766 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
767 struct request *last) 767 struct request *last)
768 { 768 {
769 struct rb_node *rbnext = rb_next(&last->rb_node); 769 struct rb_node *rbnext = rb_next(&last->rb_node);
770 struct rb_node *rbprev = rb_prev(&last->rb_node); 770 struct rb_node *rbprev = rb_prev(&last->rb_node);
771 struct request *next = NULL, *prev = NULL; 771 struct request *next = NULL, *prev = NULL;
772 772
773 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 773 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
774 774
775 if (rbprev) 775 if (rbprev)
776 prev = rb_entry_rq(rbprev); 776 prev = rb_entry_rq(rbprev);
777 777
778 if (rbnext) 778 if (rbnext)
779 next = rb_entry_rq(rbnext); 779 next = rb_entry_rq(rbnext);
780 else { 780 else {
781 rbnext = rb_first(&cfqq->sort_list); 781 rbnext = rb_first(&cfqq->sort_list);
782 if (rbnext && rbnext != &last->rb_node) 782 if (rbnext && rbnext != &last->rb_node)
783 next = rb_entry_rq(rbnext); 783 next = rb_entry_rq(rbnext);
784 } 784 }
785 785
786 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 786 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
787 } 787 }
788 788
789 static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 789 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
790 struct cfq_queue *cfqq) 790 struct cfq_queue *cfqq)
791 { 791 {
792 /* 792 /*
793 * just an approximation, should be ok. 793 * just an approximation, should be ok.
794 */ 794 */
795 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 795 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
796 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 796 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
797 } 797 }
798 798
799 static inline s64 799 static inline s64
800 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 800 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
801 { 801 {
802 return cfqg->vdisktime - st->min_vdisktime; 802 return cfqg->vdisktime - st->min_vdisktime;
803 } 803 }
804 804
805 static void 805 static void
806 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 806 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
807 { 807 {
808 struct rb_node **node = &st->rb.rb_node; 808 struct rb_node **node = &st->rb.rb_node;
809 struct rb_node *parent = NULL; 809 struct rb_node *parent = NULL;
810 struct cfq_group *__cfqg; 810 struct cfq_group *__cfqg;
811 s64 key = cfqg_key(st, cfqg); 811 s64 key = cfqg_key(st, cfqg);
812 int left = 1; 812 int left = 1;
813 813
814 while (*node != NULL) { 814 while (*node != NULL) {
815 parent = *node; 815 parent = *node;
816 __cfqg = rb_entry_cfqg(parent); 816 __cfqg = rb_entry_cfqg(parent);
817 817
818 if (key < cfqg_key(st, __cfqg)) 818 if (key < cfqg_key(st, __cfqg))
819 node = &parent->rb_left; 819 node = &parent->rb_left;
820 else { 820 else {
821 node = &parent->rb_right; 821 node = &parent->rb_right;
822 left = 0; 822 left = 0;
823 } 823 }
824 } 824 }
825 825
826 if (left) 826 if (left)
827 st->left = &cfqg->rb_node; 827 st->left = &cfqg->rb_node;
828 828
829 rb_link_node(&cfqg->rb_node, parent, node); 829 rb_link_node(&cfqg->rb_node, parent, node);
830 rb_insert_color(&cfqg->rb_node, &st->rb); 830 rb_insert_color(&cfqg->rb_node, &st->rb);
831 } 831 }
832 832
833 static void 833 static void
834 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 834 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
835 { 835 {
836 struct cfq_rb_root *st = &cfqd->grp_service_tree; 836 struct cfq_rb_root *st = &cfqd->grp_service_tree;
837 struct cfq_group *__cfqg; 837 struct cfq_group *__cfqg;
838 struct rb_node *n; 838 struct rb_node *n;
839 839
840 cfqg->nr_cfqq++; 840 cfqg->nr_cfqq++;
841 if (cfqg->on_st) 841 if (cfqg->on_st)
842 return; 842 return;
843 843
844 /* 844 /*
845 * Currently put the group at the end. Later implement something 845 * Currently put the group at the end. Later implement something
846 * so that groups get lesser vtime based on their weights, so that 846 * so that groups get lesser vtime based on their weights, so that
847 * if group does not loose all if it was not continously backlogged. 847 * if group does not loose all if it was not continously backlogged.
848 */ 848 */
849 n = rb_last(&st->rb); 849 n = rb_last(&st->rb);
850 if (n) { 850 if (n) {
851 __cfqg = rb_entry_cfqg(n); 851 __cfqg = rb_entry_cfqg(n);
852 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 852 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
853 } else 853 } else
854 cfqg->vdisktime = st->min_vdisktime; 854 cfqg->vdisktime = st->min_vdisktime;
855 855
856 __cfq_group_service_tree_add(st, cfqg); 856 __cfq_group_service_tree_add(st, cfqg);
857 cfqg->on_st = true; 857 cfqg->on_st = true;
858 st->total_weight += cfqg->weight; 858 st->total_weight += cfqg->weight;
859 } 859 }

static void
cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;

	if (st->active == &cfqg->rb_node)
		st->active = NULL;

	BUG_ON(cfqg->nr_cfqq < 1);
	cfqg->nr_cfqq--;

	/* If there are other cfq queues under this group, don't delete it */
	if (cfqg->nr_cfqq)
		return;

	cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
	cfqg->on_st = false;
	st->total_weight -= cfqg->weight;
	if (!RB_EMPTY_NODE(&cfqg->rb_node))
		cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->saved_workload_slice = 0;
	blkiocg_update_dequeue_stats(&cfqg->blkg, 1);
}

static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
{
	unsigned int slice_used;

	/*
	 * Queue got expired before even a single request completed or
	 * got expired immediately after first request completion.
	 */
	if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
		/*
		 * Also charge the seek time incurred to the group, otherwise
		 * if there are multiple queues in the group, each can dispatch
		 * a single request on seeky media and cause lots of seek time
		 * and group will never know it.
		 */
		slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
					1);
	} else {
		slice_used = jiffies - cfqq->slice_start;
		if (slice_used > cfqq->allocated_slice)
			slice_used = cfqq->allocated_slice;
	}

	cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u", slice_used);
	return slice_used;
}

static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
				struct cfq_queue *cfqq)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	unsigned int used_sl, charge_sl;
	int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
			- cfqg->service_tree_idle.count;

	BUG_ON(nr_sync < 0);
	used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);

	if (!cfq_cfqq_sync(cfqq) && !nr_sync)
		charge_sl = cfqq->allocated_slice;

	/* Can't update vdisktime while group is on service tree */
	cfq_rb_erase(&cfqg->rb_node, st);
	cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
	__cfq_group_service_tree_add(st, cfqg);

	/* This group is being expired. Save the context */
	if (time_after(cfqd->workload_expires, jiffies)) {
		cfqg->saved_workload_slice = cfqd->workload_expires
						- jiffies;
		cfqg->saved_workload = cfqd->serving_type;
		cfqg->saved_serving_prio = cfqd->serving_prio;
	} else
		cfqg->saved_workload_slice = 0;

	cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
			st->min_vdisktime);
	blkiocg_update_timeslice_used(&cfqg->blkg, used_sl);
	blkiocg_set_start_empty_time(&cfqg->blkg);
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
{
	if (blkg)
		return container_of(blkg, struct cfq_group, blkg);
	return NULL;
}

void
cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
{
	cfqg_of_blkg(blkg)->weight = weight;
}

static struct cfq_group *
cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
{
	struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
	struct cfq_group *cfqg = NULL;
	void *key = cfqd;
	int i, j;
	struct cfq_rb_root *st;
	struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
	unsigned int major, minor;

	cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
	if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
		sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
		cfqg->blkg.dev = MKDEV(major, minor);
		goto done;
	}
	if (cfqg || !create)
		goto done;

	cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
	if (!cfqg)
		goto done;

	for_each_cfqg_st(cfqg, i, j, st)
		*st = CFQ_RB_ROOT;
	RB_CLEAR_NODE(&cfqg->rb_node);

	/*
	 * Take the initial reference that will be released on destroy.
	 * This can be thought of as a joint reference by cgroup and
	 * elevator which will be dropped by either elevator exit
	 * or cgroup deletion path depending on who is exiting first.
	 */
	atomic_set(&cfqg->ref, 1);

	/* Add group onto cgroup list */
	sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
	blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
					MKDEV(major, minor));
	cfqg->weight = blkcg_get_weight(blkcg, cfqg->blkg.dev);

	/* Add group on cfqd list */
	hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);

done:
	return cfqg;
}

/*
 * Search for the cfq group current task belongs to. If create = 1, then also
 * create the cfq group if it does not exist. request_queue lock must be held.
 */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	struct cgroup *cgroup;
	struct cfq_group *cfqg = NULL;

	rcu_read_lock();
	cgroup = task_cgroup(current, blkio_subsys_id);
	cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
	if (!cfqg && create)
		cfqg = &cfqd->root_group;
	rcu_read_unlock();
	return cfqg;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	atomic_inc(&cfqg->ref);
	return cfqg;
}

static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
{
	/* Currently, all async queues are mapped to root group */
	if (!cfq_cfqq_sync(cfqq))
		cfqg = &cfqq->cfqd->root_group;

	cfqq->cfqg = cfqg;
	/* cfqq reference on cfqg */
	atomic_inc(&cfqq->cfqg->ref);
}

static void cfq_put_cfqg(struct cfq_group *cfqg)
{
	struct cfq_rb_root *st;
	int i, j;

	BUG_ON(atomic_read(&cfqg->ref) <= 0);
	if (!atomic_dec_and_test(&cfqg->ref))
		return;
	for_each_cfqg_st(cfqg, i, j, st)
		BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
	kfree(cfqg);
}

static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	/* Something wrong if we are trying to remove same group twice */
	BUG_ON(hlist_unhashed(&cfqg->cfqd_node));

	hlist_del_init(&cfqg->cfqd_node);

	/*
	 * Put the reference taken at the time of creation so that when all
	 * queues are gone, group can be destroyed.
	 */
	cfq_put_cfqg(cfqg);
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd)
{
	struct hlist_node *pos, *n;
	struct cfq_group *cfqg;

	hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
		/*
		 * If cgroup removal path got to blk_group first and removed
		 * it from cgroup list, then it will take care of destroying
		 * cfqg also.
		 */
		if (!blkiocg_del_blkio_group(&cfqg->blkg))
			cfq_destroy_cfqg(cfqd, cfqg);
	}
}

/*
 * Blk cgroup controller notification saying that blkio_group object is being
 * delinked as associated cgroup object is going away. That also means that
 * no new IO will come in this group. So get rid of this group as soon as
 * any pending IO in the group is finished.
 *
 * This function is called under rcu_read_lock(). key is the rcu protected
 * pointer. That means "key" is a valid cfq_data pointer as long as we are
 * under rcu read lock.
 *
 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
 * it should not be NULL as even if elevator was exiting, cgroup deletion
 * path got to it first.
 */
void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
{
	unsigned long flags;
	struct cfq_data *cfqd = key;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);
	cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

#else /* GROUP_IOSCHED */
static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
{
	return &cfqd->root_group;
}

static inline struct cfq_group *cfq_ref_get_cfqg(struct cfq_group *cfqg)
{
	return cfqg;
}

static inline void
cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
	cfqq->cfqg = cfqg;
}

static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}

#endif /* GROUP_IOSCHED */

/*
 * The cfqd->service_trees holds all pending cfq_queue's that have
 * requests waiting to be processed. It is sorted in the order that
 * we will service the queues.
 */
static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
				 bool add_front)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;
	unsigned long rb_key;
	struct cfq_rb_root *service_tree;
	int left;
	int new_cfqq = 1;
	int group_changed = 0;

#ifdef CONFIG_CFQ_GROUP_IOSCHED
	if (!cfqd->cfq_group_isolation
	    && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
	    && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
		/* Move this cfq to root group */
		cfq_log_cfqq(cfqd, cfqq, "moving to root group");
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfqq->orig_cfqg = cfqq->cfqg;
		cfqq->cfqg = &cfqd->root_group;
		atomic_inc(&cfqd->root_group.ref);
		group_changed = 1;
	} else if (!cfqd->cfq_group_isolation
		   && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
		/* cfqq is sequential now and needs to go to its original group */
		BUG_ON(cfqq->cfqg != &cfqd->root_group);
		if (!RB_EMPTY_NODE(&cfqq->rb_node))
			cfq_group_service_tree_del(cfqd, cfqq->cfqg);
		cfq_put_cfqg(cfqq->cfqg);
		cfqq->cfqg = cfqq->orig_cfqg;
		cfqq->orig_cfqg = NULL;
		group_changed = 1;
		cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
	}
#endif

	service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
						cfqq_type(cfqq));
	if (cfq_class_idle(cfqq)) {
		rb_key = CFQ_IDLE_DELAY;
		parent = rb_last(&service_tree->rb);
		if (parent && parent != &cfqq->rb_node) {
			__cfqq = rb_entry(parent, struct cfq_queue, rb_node);
			rb_key += __cfqq->rb_key;
		} else
			rb_key += jiffies;
	} else if (!add_front) {
		/*
		 * Get our rb key offset. Subtract any residual slice
		 * value carried from last service. A negative resid
		 * count indicates slice overrun, and this should position
		 * the next service time further away in the tree.
		 */
		rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
		rb_key -= cfqq->slice_resid;
		cfqq->slice_resid = 0;
	} else {
		rb_key = -HZ;
		__cfqq = cfq_rb_first(service_tree);
		rb_key += __cfqq ? __cfqq->rb_key : jiffies;
	}

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		new_cfqq = 0;
		/*
		 * same position, nothing more to do
		 */
		if (rb_key == cfqq->rb_key &&
		    cfqq->service_tree == service_tree)
			return;

		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}

	left = 1;
	parent = NULL;
	cfqq->service_tree = service_tree;
	p = &service_tree->rb.rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		__cfqq = rb_entry(parent, struct cfq_queue, rb_node);

		/*
		 * sort by key, that represents service time.
		 */
		if (time_before(rb_key, __cfqq->rb_key))
			n = &(*p)->rb_left;
		else {
			n = &(*p)->rb_right;
			left = 0;
		}

		p = n;
	}

	if (left)
		service_tree->left = &cfqq->rb_node;

	cfqq->rb_key = rb_key;
	rb_link_node(&cfqq->rb_node, parent, p);
	rb_insert_color(&cfqq->rb_node, &service_tree->rb);
	service_tree->count++;
	if ((add_front || !new_cfqq) && !group_changed)
		return;
	cfq_group_service_tree_add(cfqd, cfqq->cfqg);
}

static struct cfq_queue *
cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
		     sector_t sector, struct rb_node **ret_parent,
		     struct rb_node ***rb_link)
{
	struct rb_node **p, *parent;
	struct cfq_queue *cfqq = NULL;

	parent = NULL;
	p = &root->rb_node;
	while (*p) {
		struct rb_node **n;

		parent = *p;
		cfqq = rb_entry(parent, struct cfq_queue, p_node);

		/*
		 * Sort strictly based on sector. Smallest to the left,
		 * largest to the right.
		 */
		if (sector > blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_right;
		else if (sector < blk_rq_pos(cfqq->next_rq))
			n = &(*p)->rb_left;
		else
			break;
		p = n;
		cfqq = NULL;
	}

	*ret_parent = parent;
	if (rb_link)
		*rb_link = p;
	return cfqq;
}

static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct rb_node **p, *parent;
	struct cfq_queue *__cfqq;

	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	if (cfq_class_idle(cfqq))
		return;
	if (!cfqq->next_rq)
		return;

	cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
	__cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
				      blk_rq_pos(cfqq->next_rq), &parent, &p);
	if (!__cfqq) {
		rb_link_node(&cfqq->p_node, parent, p);
		rb_insert_color(&cfqq->p_node, cfqq->p_root);
	} else
		cfqq->p_root = NULL;
}

/*
 * Update cfqq's position in the service tree.
 */
static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	/*
	 * Resorting requires the cfqq to be on the RR list already.
	 */
	if (cfq_cfqq_on_rr(cfqq)) {
		cfq_service_tree_add(cfqd, cfqq, 0);
		cfq_prio_tree_add(cfqd, cfqq);
	}
}

/*
 * add to busy list of queues for service, trying to be fair in ordering
 * the pending list according to last request service
 */
static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
	BUG_ON(cfq_cfqq_on_rr(cfqq));
	cfq_mark_cfqq_on_rr(cfqq);
	cfqd->busy_queues++;

	cfq_resort_rr_list(cfqd, cfqq);
}

/*
 * Called when the cfqq no longer has requests pending, remove it from
 * the service tree.
 */
static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
	BUG_ON(!cfq_cfqq_on_rr(cfqq));
	cfq_clear_cfqq_on_rr(cfqq);

	if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
		cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
		cfqq->service_tree = NULL;
	}
	if (cfqq->p_root) {
		rb_erase(&cfqq->p_node, cfqq->p_root);
		cfqq->p_root = NULL;
	}

	cfq_group_service_tree_del(cfqd, cfqq->cfqg);
	BUG_ON(!cfqd->busy_queues);
	cfqd->busy_queues--;
}

/*
 * rb tree support functions
 */
static void cfq_del_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	const int sync = rq_is_sync(rq);

	BUG_ON(!cfqq->queued[sync]);
	cfqq->queued[sync]--;

	elv_rb_del(&cfqq->sort_list, rq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
		/*
		 * Queue will be deleted from service tree when we actually
		 * expire it later. Right now just remove it from prio tree
		 * as it is empty.
		 */
		if (cfqq->p_root) {
			rb_erase(&cfqq->p_node, cfqq->p_root);
			cfqq->p_root = NULL;
		}
	}
}

static void cfq_add_rq_rb(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	struct cfq_data *cfqd = cfqq->cfqd;
	struct request *__alias, *prev;

	cfqq->queued[rq_is_sync(rq)]++;

	/*
	 * looks a little odd, but the first insert might return an alias.
	 * if that happens, put the alias on the dispatch list
	 */
	while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
		cfq_dispatch_insert(cfqd->queue, __alias);

	if (!cfq_cfqq_on_rr(cfqq))
		cfq_add_cfqq_rr(cfqd, cfqq);

	/*
	 * check if this request is a better next-serve candidate
	 */
	prev = cfqq->next_rq;
	cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);

	/*
	 * adjust priority tree position, if ->next_rq changes
	 */
	if (prev != cfqq->next_rq)
		cfq_prio_tree_add(cfqd, cfqq);

	BUG_ON(!cfqq->next_rq);
}

static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
{
	elv_rb_del(&cfqq->sort_list, rq);
	cfqq->queued[rq_is_sync(rq)]--;
	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
					rq_is_sync(rq));
	cfq_add_rq_rb(rq);
	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
			&cfqq->cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
}

static struct request *
cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
{
	struct task_struct *tsk = current;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	cic = cfq_cic_lookup(cfqd, tsk->io_context);
	if (!cic)
		return NULL;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	if (cfqq) {
		sector_t sector = bio->bi_sector + bio_sectors(bio);

		return elv_rb_find(&cfqq->sort_list, sector);
	}

	return NULL;
}

static void cfq_activate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	cfqd->rq_in_driver++;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
						cfqd->rq_in_driver);

	cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
}

static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;

	WARN_ON(!cfqd->rq_in_driver);
	cfqd->rq_in_driver--;
	cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
						cfqd->rq_in_driver);
}

static void cfq_remove_request(struct request *rq)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	if (cfqq->next_rq == rq)
		cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);

	list_del_init(&rq->queuelist);
	cfq_del_rq_rb(rq);

	cfqq->cfqd->rq_queued--;
	blkiocg_update_io_remove_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(rq),
					rq_is_sync(rq));
	if (rq_is_meta(rq)) {
		WARN_ON(!cfqq->meta_pending);
		cfqq->meta_pending--;
	}
}

static int cfq_merge(struct request_queue *q, struct request **req,
		     struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct request *__rq;

	__rq = cfq_find_rq_fmerge(cfqd, bio);
	if (__rq && elv_rq_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_FRONT_MERGE;
	}

	return ELEVATOR_NO_MERGE;
}

static void cfq_merged_request(struct request_queue *q, struct request *req,
			       int type)
{
	if (type == ELEVATOR_FRONT_MERGE) {
		struct cfq_queue *cfqq = RQ_CFQQ(req);

		cfq_reposition_rq_rb(cfqq, req);
	}
}

static void cfq_bio_merged(struct request_queue *q, struct request *req,
			   struct bio *bio)
{
	blkiocg_update_io_merged_stats(&(RQ_CFQG(req))->blkg, bio_data_dir(bio),
					cfq_bio_sync(bio));
}

static void
cfq_merged_requests(struct request_queue *q, struct request *rq,
		    struct request *next)
{
	struct cfq_queue *cfqq = RQ_CFQQ(rq);
	/*
	 * reposition in fifo if next is older than rq
	 */
	if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
	    time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
		list_move(&rq->queuelist, &next->queuelist);
		rq_set_fifo_time(rq, rq_fifo_time(next));
	}

	if (cfqq->next_rq == next)
		cfqq->next_rq = rq;
	cfq_remove_request(next);
	blkiocg_update_io_merged_stats(&(RQ_CFQG(rq))->blkg, rq_data_dir(next),
					rq_is_sync(next));
}

static int cfq_allow_merge(struct request_queue *q, struct request *rq,
			   struct bio *bio)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_io_context *cic;
	struct cfq_queue *cfqq;

	/*
	 * Disallow merge of a sync bio into an async request.
	 */
	if (cfq_bio_sync(bio) && !rq_is_sync(rq))
		return false;

	/*
	 * Lookup the cfqq that this bio will be queued with. Allow
	 * merge only if rq is queued there.
	 */
	cic = cfq_cic_lookup(cfqd, current->io_context);
	if (!cic)
		return false;

	cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
	return cfqq == RQ_CFQQ(rq);
}

static inline void cfq_del_timer(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	del_timer(&cfqd->idle_slice_timer);
	blkiocg_update_idle_time_stats(&cfqq->cfqg->blkg);
}

static void __cfq_set_active_queue(struct cfq_data *cfqd,
				   struct cfq_queue *cfqq)
{
	if (cfqq) {
		cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
				cfqd->serving_prio, cfqd->serving_type);
		blkiocg_update_avg_queue_size_stats(&cfqq->cfqg->blkg);
		cfqq->slice_start = 0;
		cfqq->dispatch_start = jiffies;
		cfqq->allocated_slice = 0;
		cfqq->slice_end = 0;
		cfqq->slice_dispatch = 0;

		cfq_clear_cfqq_wait_request(cfqq);
		cfq_clear_cfqq_must_dispatch(cfqq);
		cfq_clear_cfqq_must_alloc_slice(cfqq);
		cfq_clear_cfqq_fifo_expire(cfqq);
		cfq_mark_cfqq_slice_new(cfqq);

		cfq_del_timer(cfqd, cfqq);
	}

	cfqd->active_queue = cfqq;
}

/*
 * current cfqq expired its slice (or was too idle), select new one
 */
static void
__cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		    bool timed_out)
{
	cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);

	if (cfq_cfqq_wait_request(cfqq))
		cfq_del_timer(cfqd, cfqq);

	cfq_clear_cfqq_wait_request(cfqq);
	cfq_clear_cfqq_wait_busy(cfqq);

	/*
	 * If this cfqq is shared between multiple processes, check to
	 * make sure that those processes are still issuing I/Os within
	 * the mean seek distance. If not, it may be time to break the
	 * queues apart again.
	 */
	if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
		cfq_mark_cfqq_split_coop(cfqq);

	/*
	 * store what was left of this slice, if the queue idled/timed out
	 */
	if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
		cfqq->slice_resid = cfqq->slice_end - jiffies;
		cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
	}

	cfq_group_served(cfqd, cfqq->cfqg, cfqq);

	if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
		cfq_del_cfqq_rr(cfqd, cfqq);

	cfq_resort_rr_list(cfqd, cfqq);

	if (cfqq == cfqd->active_queue)
		cfqd->active_queue = NULL;

	if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
		cfqd->grp_service_tree.active = NULL;

	if (cfqd->active_cic) {
		put_io_context(cfqd->active_cic->ioc);
		cfqd->active_cic = NULL;
	}
}

static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqq)
		__cfq_slice_expired(cfqd, cfqq, timed_out);
}

/*
 * Get next queue for service. Unless we have a queue preemption,
 * we'll simply select the first cfqq in the service tree.
 */
static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
{
	struct cfq_rb_root *service_tree =
		service_tree_for(cfqd->serving_group, cfqd->serving_prio,
					cfqd->serving_type);

	if (!cfqd->rq_queued)
		return NULL;

	/* There is nothing to dispatch */
	if (!service_tree)
		return NULL;
	if (RB_EMPTY_ROOT(&service_tree->rb))
		return NULL;
	return cfq_rb_first(service_tree);
}

static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg;
	struct cfq_queue *cfqq;
	int i, j;
	struct cfq_rb_root *st;

	if (!cfqd->rq_queued)
		return NULL;

	cfqg = cfq_get_next_cfqg(cfqd);
	if (!cfqg)
		return NULL;

	for_each_cfqg_st(cfqg, i, j, st)
		if ((cfqq = cfq_rb_first(st)) != NULL)
			return cfqq;
	return NULL;
}

/*
 * Get and set a new active queue for service.
 */
static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
					      struct cfq_queue *cfqq)
{
	if (!cfqq)
		cfqq = cfq_get_next_queue(cfqd);

	__cfq_set_active_queue(cfqd, cfqq);
	return cfqq;
}

static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
					  struct request *rq)
{
	if (blk_rq_pos(rq) >= cfqd->last_position)
		return blk_rq_pos(rq) - cfqd->last_position;
	else
		return cfqd->last_position - blk_rq_pos(rq);
}

static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			       struct request *rq)
{
	return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
}

static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
				    struct cfq_queue *cur_cfqq)
{
	struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
	struct rb_node *parent, *node;
	struct cfq_queue *__cfqq;
	sector_t sector = cfqd->last_position;

	if (RB_EMPTY_ROOT(root))
		return NULL;

	/*
	 * First, if we find a request starting at the end of the last
	 * request, choose it.
	 */
	__cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
	if (__cfqq)
		return __cfqq;

	/*
	 * If the exact sector wasn't found, the parent of the NULL leaf
	 * will contain the closest sector.
	 */
	__cfqq = rb_entry(parent, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	if (blk_rq_pos(__cfqq->next_rq) < sector)
		node = rb_next(&__cfqq->p_node);
	else
		node = rb_prev(&__cfqq->p_node);
	if (!node)
		return NULL;

	__cfqq = rb_entry(node, struct cfq_queue, p_node);
	if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
		return __cfqq;

	return NULL;
}

/*
 * cfqd - obvious
 * cur_cfqq - passed in so that we don't decide that the current queue is
 * closely cooperating with itself.
 *
 * So, basically we're assuming that cur_cfqq has dispatched at least
 * one request, and that cfqd->last_position reflects a position on the disk
 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
 * assumption.
 */
static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
					      struct cfq_queue *cur_cfqq)
{
	struct cfq_queue *cfqq;

	if (cfq_class_idle(cur_cfqq))
		return NULL;
	if (!cfq_cfqq_sync(cur_cfqq))
		return NULL;
	if (CFQQ_SEEKY(cur_cfqq))
		return NULL;

	/*
	 * Don't search priority tree if it's the only queue in the group.
	 */
	if (cur_cfqq->cfqg->nr_cfqq == 1)
		return NULL;

	/*
	 * We should notice if some of the queues are cooperating, eg
	 * working closely on the same area of the disk. In that case,
	 * we can group them together and don't waste time idling.
	 */
	cfqq = cfqq_close(cfqd, cur_cfqq);
	if (!cfqq)
		return NULL;

	/* If new queue belongs to different cfq_group, don't choose it */
	if (cur_cfqq->cfqg != cfqq->cfqg)
		return NULL;

	/*
	 * It only makes sense to merge sync queues.
	 */
	if (!cfq_cfqq_sync(cfqq))
		return NULL;
	if (CFQQ_SEEKY(cfqq))
		return NULL;

	/*
	 * Do not merge queues of different priority classes
	 */
	if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
		return NULL;

	return cfqq;
}

/*
 * Determine whether we should enforce idle window for this queue.
 */

static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	enum wl_prio_t prio = cfqq_prio(cfqq);
	struct cfq_rb_root *service_tree = cfqq->service_tree;

	BUG_ON(!service_tree);
	BUG_ON(!service_tree->count);

	/* We never do for idle class queues. */
	if (prio == IDLE_WORKLOAD)
		return false;

	/* We do for queues that were marked with idle window flag. */
	if (cfq_cfqq_idle_window(cfqq) &&
	   !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
		return true;

	/*
	 * Otherwise, we do only if they are the last ones
	 * in their service tree.
	 */
	if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
		return 1;
	cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
			service_tree->count);
	return 0;
}

static void cfq_arm_slice_timer(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;
	struct cfq_io_context *cic;
	unsigned long sl;

	/*
	 * SSD device without seek penalty, disable idling. But only do so
	 * for devices that support queuing, otherwise we still have a problem
	 * with sync vs async workloads.
	 */
	if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
		return;

	WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
	WARN_ON(cfq_cfqq_slice_new(cfqq));

	/*
	 * idle is disabled, either manually or by past process history
	 */
	if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
		return;

	/*
	 * still active requests from this queue, don't idle
	 */
	if (cfqq->dispatched)
		return;

	/*
	 * task has exited, don't wait
	 */
	cic = cfqd->active_cic;
	if (!cic || !atomic_read(&cic->ioc->nr_tasks))
		return;

	/*
	 * If our average think time is larger than the remaining time
	 * slice, then don't idle. This avoids overrunning the allotted
	 * time slice.
	 */
	if (sample_valid(cic->ttime_samples) &&
	    (cfqq->slice_end - jiffies < cic->ttime_mean)) {
		cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
				cic->ttime_mean);
		return;
	}

	cfq_mark_cfqq_wait_request(cfqq);

	sl = cfqd->cfq_slice_idle;

	mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
	blkiocg_update_set_idle_time_stats(&cfqq->cfqg->blkg);
	cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
}

/*
 * Move request from internal lists to the request queue dispatch list.
 */
static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");

	cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
	cfq_remove_request(rq);
	cfqq->dispatched++;
	elv_dispatch_sort(q, rq);

	cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
	blkiocg_update_dispatch_stats(&cfqq->cfqg->blkg, blk_rq_bytes(rq),
					rq_data_dir(rq), rq_is_sync(rq));
}

/*
 * return expired entry, or NULL to just start from scratch in rbtree
 */
static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
{
	struct request *rq = NULL;

	if (cfq_cfqq_fifo_expire(cfqq))
		return NULL;

	cfq_mark_cfqq_fifo_expire(cfqq);

	if (list_empty(&cfqq->fifo))
		return NULL;

	rq = rq_entry_fifo(cfqq->fifo.next);
1954 if (time_before(jiffies, rq_fifo_time(rq))) 1954 if (time_before(jiffies, rq_fifo_time(rq)))
1955 rq = NULL; 1955 rq = NULL;
1956 1956
1957 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 1957 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1958 return rq; 1958 return rq;
1959 } 1959 }
1960 1960
1961 static inline int 1961 static inline int
1962 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1962 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1963 { 1963 {
1964 const int base_rq = cfqd->cfq_slice_async_rq; 1964 const int base_rq = cfqd->cfq_slice_async_rq;
1965 1965
1966 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1966 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1967 1967
1968 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1968 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1969 } 1969 }
1970 1970
1971 /* 1971 /*
1972 * Must be called with the queue_lock held. 1972 * Must be called with the queue_lock held.
1973 */ 1973 */
1974 static int cfqq_process_refs(struct cfq_queue *cfqq) 1974 static int cfqq_process_refs(struct cfq_queue *cfqq)
1975 { 1975 {
1976 int process_refs, io_refs; 1976 int process_refs, io_refs;
1977 1977
1978 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1978 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1979 process_refs = atomic_read(&cfqq->ref) - io_refs; 1979 process_refs = atomic_read(&cfqq->ref) - io_refs;
1980 BUG_ON(process_refs < 0); 1980 BUG_ON(process_refs < 0);
1981 return process_refs; 1981 return process_refs;
1982 } 1982 }
1983 1983
static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
{
	int process_refs, new_process_refs;
	struct cfq_queue *__cfqq;

+	/*
+	 * If there are no process references on the new_cfqq, then it is
+	 * unsafe to follow the ->new_cfqq chain as other cfqq's in the
+	 * chain may have dropped their last reference (not just their
+	 * last process reference).
+	 */
+	if (!cfqq_process_refs(new_cfqq))
+		return;
+
	/* Avoid a circular list and skip interim queue merges */
	while ((__cfqq = new_cfqq->new_cfqq)) {
		if (__cfqq == cfqq)
			return;
		new_cfqq = __cfqq;
	}

	process_refs = cfqq_process_refs(cfqq);
+	new_process_refs = cfqq_process_refs(new_cfqq);
	/*
	 * If the process for the cfqq has gone away, there is no
	 * sense in merging the queues.
	 */
-	if (process_refs == 0)
+	if (process_refs == 0 || new_process_refs == 0)
		return;

	/*
	 * Merge in the direction of the lesser amount of work.
	 */
-	new_process_refs = cfqq_process_refs(new_cfqq);
	if (new_process_refs >= process_refs) {
		cfqq->new_cfqq = new_cfqq;
		atomic_add(process_refs, &new_cfqq->ref);
	} else {
		new_cfqq->new_cfqq = cfqq;
		atomic_add(new_process_refs, &cfqq->ref);
	}
}

static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
				struct cfq_group *cfqg, enum wl_prio_t prio)
{
	struct cfq_queue *queue;
	int i;
	bool key_valid = false;
	unsigned long lowest_key = 0;
	enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;

	for (i = 0; i <= SYNC_WORKLOAD; ++i) {
		/* select the one with lowest rb_key */
		queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
		if (queue &&
		    (!key_valid || time_before(queue->rb_key, lowest_key))) {
			lowest_key = queue->rb_key;
			cur_best = i;
			key_valid = true;
		}
	}

	return cur_best;
}

static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
{
	unsigned slice;
	unsigned count;
	struct cfq_rb_root *st;
	unsigned group_slice;

	if (!cfqg) {
		cfqd->serving_prio = IDLE_WORKLOAD;
		cfqd->workload_expires = jiffies + 1;
		return;
	}

	/* Choose next priority. RT > BE > IDLE */
	if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = RT_WORKLOAD;
	else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
		cfqd->serving_prio = BE_WORKLOAD;
	else {
		cfqd->serving_prio = IDLE_WORKLOAD;
		cfqd->workload_expires = jiffies + 1;
		return;
	}

	/*
	 * For RT and BE, we have to choose also the type
	 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
	 * expiration time
	 */
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * check workload expiration, and that we still have other queues ready
	 */
	if (count && !time_after(jiffies, cfqd->workload_expires))
		return;

	/* otherwise select new workload type */
	cfqd->serving_type =
		cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
	st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
	count = st->count;

	/*
	 * the workload slice is computed as a fraction of target latency
	 * proportional to the number of queues in that workload, over
	 * all the queues in the same priority class
	 */
	group_slice = cfq_group_slice(cfqd, cfqg);

	slice = group_slice * count /
		max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
		      cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));

	if (cfqd->serving_type == ASYNC_WORKLOAD) {
		unsigned int tmp;

		/*
		 * Async queues are currently system wide. Just taking
		 * proportion of queues with-in same group will lead to higher
		 * async ratio system wide as generally root group is going
		 * to have higher weight. A more accurate thing would be to
		 * calculate system wide asnc/sync ratio.
		 */
		tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
		tmp = tmp/cfqd->busy_queues;
		slice = min_t(unsigned, slice, tmp);

		/* async workload slice is scaled down according to
		 * the sync/async slice ratio. */
		slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
	} else
		/* sync workload slice is at least 2 * cfq_slice_idle */
		slice = max(slice, 2 * cfqd->cfq_slice_idle);

	slice = max_t(unsigned, slice, CFQ_MIN_TT);
	cfq_log(cfqd, "workload slice:%d", slice);
	cfqd->workload_expires = jiffies + slice;
	cfqd->noidle_tree_requires_idle = false;
}

static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
{
	struct cfq_rb_root *st = &cfqd->grp_service_tree;
	struct cfq_group *cfqg;

	if (RB_EMPTY_ROOT(&st->rb))
		return NULL;
	cfqg = cfq_rb_first_group(st);
	st->active = &cfqg->rb_node;
	update_min_vdisktime(st);
	return cfqg;
}

static void cfq_choose_cfqg(struct cfq_data *cfqd)
{
	struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);

	cfqd->serving_group = cfqg;

	/* Restore the workload type data */
	if (cfqg->saved_workload_slice) {
		cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
		cfqd->serving_type = cfqg->saved_workload;
		cfqd->serving_prio = cfqg->saved_serving_prio;
	} else
		cfqd->workload_expires = jiffies - 1;

	choose_service_tree(cfqd, cfqg);
}

/*
 * Select a queue for service. If we have a current active queue,
 * check whether to continue servicing it, or retrieve and set a new one.
 */
static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		goto new_queue;

	if (!cfqd->rq_queued)
		return NULL;

	/*
	 * We were waiting for group to get backlogged. Expire the queue
	 */
	if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
		goto expire;

	/*
	 * The active queue has run out of time, expire it and select new.
	 */
	if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
		/*
		 * If slice had not expired at the completion of last request
		 * we might not have turned on wait_busy flag. Don't expire
		 * the queue yet. Allow the group to get backlogged.
		 *
		 * The very fact that we have used the slice, that means we
		 * have been idling all along on this queue and it should be
		 * ok to wait for this request to complete.
		 */
		if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
		    && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
			cfqq = NULL;
			goto keep_queue;
		} else
			goto expire;
	}

	/*
	 * The active queue has requests and isn't expired, allow it to
	 * dispatch.
	 */
	if (!RB_EMPTY_ROOT(&cfqq->sort_list))
		goto keep_queue;

	/*
	 * If another queue has a request waiting within our mean seek
	 * distance, let it run. The expire code will check for close
	 * cooperators and put the close queue at the front of the service
	 * tree. If possible, merge the expiring queue with the new cfqq.
	 */
	new_cfqq = cfq_close_cooperator(cfqd, cfqq);
	if (new_cfqq) {
		if (!cfqq->new_cfqq)
			cfq_setup_merge(cfqq, new_cfqq);
		goto expire;
	}

	/*
	 * No requests pending. If the active queue still has requests in
	 * flight or is idling for a new request, allow either of these
	 * conditions to happen (or time out) before selecting a new queue.
	 */
	if (timer_pending(&cfqd->idle_slice_timer) ||
	    (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
		cfqq = NULL;
		goto keep_queue;
	}

expire:
	cfq_slice_expired(cfqd, 0);
new_queue:
	/*
	 * Current queue expired. Check if we have to switch to a new
	 * service tree
	 */
	if (!new_cfqq)
		cfq_choose_cfqg(cfqd);

	cfqq = cfq_set_active_queue(cfqd, new_cfqq);
keep_queue:
	return cfqq;
}

static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
{
	int dispatched = 0;

	while (cfqq->next_rq) {
		cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
		dispatched++;
	}

	BUG_ON(!list_empty(&cfqq->fifo));

	/* By default cfqq is not expired if it is empty. Do it explicitly */
	__cfq_slice_expired(cfqq->cfqd, cfqq, 0);
	return dispatched;
}

/*
 * Drain our current requests. Used for barriers and when switching
 * io schedulers on-the-fly.
 */
static int cfq_forced_dispatch(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq;
	int dispatched = 0;

	/* Expire the timeslice of the current active queue first */
	cfq_slice_expired(cfqd, 0);
	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
		__cfq_set_active_queue(cfqd, cfqq);
		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
	}

	BUG_ON(cfqd->busy_queues);

	cfq_log(cfqd, "forced_dispatch=%d", dispatched);
	return dispatched;
}

static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
					struct cfq_queue *cfqq)
{
	/* the queue hasn't finished any request, can't estimate */
	if (cfq_cfqq_slice_new(cfqq))
		return 1;
	if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
		cfqq->slice_end))
		return 1;

	return 0;
}

static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	unsigned int max_dispatch;

	/*
	 * Drain async requests before we start sync IO
	 */
	if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
		return false;

	/*
	 * If this is an async queue and we have sync IO in flight, let it wait
	 */
	if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
		return false;

	max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
	if (cfq_class_idle(cfqq))
		max_dispatch = 1;

	/*
	 * Does this cfqq already have too much IO in flight?
	 */
	if (cfqq->dispatched >= max_dispatch) {
		/*
		 * idle queue must always only have a single IO in flight
		 */
		if (cfq_class_idle(cfqq))
			return false;

		/*
		 * We have other queues, don't allow more IO from this one
		 */
		if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
			return false;

		/*
		 * Sole queue user, no limit
		 */
		if (cfqd->busy_queues == 1)
			max_dispatch = -1;
		else
			/*
			 * Normally we start throttling cfqq when cfq_quantum/2
			 * requests have been dispatched. But we can drive
			 * deeper queue depths at the beginning of slice
			 * subjected to upper limit of cfq_quantum.
			 * */
			max_dispatch = cfqd->cfq_quantum;
	}

	/*
	 * Async queues must wait a bit before being allowed dispatch.
	 * We also ramp up the dispatch depth gradually for async IO,
	 * based on the last sync IO we serviced
	 */
	if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
		unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
		unsigned int depth;

		depth = last_sync / cfqd->cfq_slice[1];
		if (!depth && !cfqq->dispatched)
			depth = 1;
		if (depth < max_dispatch)
			max_dispatch = depth;
	}

	/*
	 * If we're below the current max, allow a dispatch
	 */
	return cfqq->dispatched < max_dispatch;
}

/*
 * Dispatch a request from cfqq, moving them to the request queue
 * dispatch list.
 */
static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct request *rq;

	BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));

	if (!cfq_may_dispatch(cfqd, cfqq))
		return false;

	/*
	 * follow expired path, else get first next available
	 */
	rq = cfq_check_fifo(cfqq);
	if (!rq)
		rq = cfqq->next_rq;

	/*
	 * insert request into driver dispatch list
	 */
	cfq_dispatch_insert(cfqd->queue, rq);

	if (!cfqd->active_cic) {
		struct cfq_io_context *cic = RQ_CIC(rq);

		atomic_long_inc(&cic->ioc->refcount);
		cfqd->active_cic = cic;
	}

	return true;
}

/*
 * Find the cfqq that we need to service and move a request from that to the
 * dispatch list
 */
static int cfq_dispatch_requests(struct request_queue *q, int force)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq;

	if (!cfqd->busy_queues)
		return 0;

	if (unlikely(force))
		return cfq_forced_dispatch(cfqd);

	cfqq = cfq_select_queue(cfqd);
	if (!cfqq)
		return 0;

	/*
	 * Dispatch a request from this cfqq, if it is allowed
	 */
	if (!cfq_dispatch_request(cfqd, cfqq))
		return 0;

	cfqq->slice_dispatch++;
	cfq_clear_cfqq_must_dispatch(cfqq);

	/*
	 * expire an async queue immediately if it has used up its slice. idle
	 * queue always expire after 1 dispatch round.
	 */
	if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
	    cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
	    cfq_class_idle(cfqq))) {
		cfqq->slice_end = jiffies + 1;
		cfq_slice_expired(cfqd, 0);
	}

	cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
	return 1;
}

/*
 * task holds one reference to the queue, dropped when task exits. each rq
 * in-flight on this queue also holds a reference, dropped when rq is freed.
 *
 * Each cfq queue took a reference on the parent group. Drop it now.
 * queue lock must be held here.
 */
static void cfq_put_queue(struct cfq_queue *cfqq)
{
	struct cfq_data *cfqd = cfqq->cfqd;
	struct cfq_group *cfqg, *orig_cfqg;

	BUG_ON(atomic_read(&cfqq->ref) <= 0);

	if (!atomic_dec_and_test(&cfqq->ref))
		return;

	cfq_log_cfqq(cfqd, cfqq, "put_queue");
	BUG_ON(rb_first(&cfqq->sort_list));
	BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
	cfqg = cfqq->cfqg;
	orig_cfqg = cfqq->orig_cfqg;

	if (unlikely(cfqd->active_queue == cfqq)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	BUG_ON(cfq_cfqq_on_rr(cfqq));
	kmem_cache_free(cfq_pool, cfqq);
	cfq_put_cfqg(cfqg);
	if (orig_cfqg)
		cfq_put_cfqg(orig_cfqg);
}

/*
 * Must always be called with the rcu_read_lock() held
 */
static void
__call_for_each_cic(struct io_context *ioc,
		    void (*func)(struct io_context *, struct cfq_io_context *))
{
	struct cfq_io_context *cic;
	struct hlist_node *n;

	hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
		func(ioc, cic);
}

/*
 * Call func for each cic attached to this ioc.
 */
static void
call_for_each_cic(struct io_context *ioc,
		  void (*func)(struct io_context *, struct cfq_io_context *))
{
	rcu_read_lock();
	__call_for_each_cic(ioc, func);
	rcu_read_unlock();
}

static void cfq_cic_free_rcu(struct rcu_head *head)
{
	struct cfq_io_context *cic;

	cic = container_of(head, struct cfq_io_context, rcu_head);

	kmem_cache_free(cfq_ioc_pool, cic);
	elv_ioc_count_dec(cfq_ioc_count);

	if (ioc_gone) {
		/*
		 * CFQ scheduler is exiting, grab exit lock and check
		 * the pending io context count. If it hits zero,
		 * complete ioc_gone and set it back to NULL
		 */
		spin_lock(&ioc_gone_lock);
		if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
			complete(ioc_gone);
			ioc_gone = NULL;
		}
		spin_unlock(&ioc_gone_lock);
	}
}

static void cfq_cic_free(struct cfq_io_context *cic)
{
	call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
}

static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
{
	unsigned long flags;
	unsigned long dead_key = (unsigned long) cic->key;

	BUG_ON(!(dead_key & CIC_DEAD_KEY));

	spin_lock_irqsave(&ioc->lock, flags);
	radix_tree_delete(&ioc->radix_root, dead_key >> CIC_DEAD_INDEX_SHIFT);
	hlist_del_rcu(&cic->cic_list);
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
}

/*
 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
 * and ->trim() which is called with the task lock held
 */
static void cfq_free_io_context(struct io_context *ioc)
{
	/*
	 * ioc->refcount is zero here, or we are called from elv_unregister(),
	 * so no more cic's are allowed to be linked into this ioc. So it
	 * should be ok to iterate over the known list, we will see all cic's
	 * since no new ones are added.
	 */
	__call_for_each_cic(ioc, cic_free_func);
}

static void cfq_put_cooperator(struct cfq_queue *cfqq)
{
	struct cfq_queue *__cfqq, *next;

	/*
	 * If this queue was scheduled to merge with another queue, be
	 * sure to drop the reference taken on that queue (and others in
	 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
	 */
	__cfqq = cfqq->new_cfqq;
	while (__cfqq) {
		if (__cfqq == cfqq) {
			WARN(1, "cfqq->new_cfqq loop detected\n");
			break;
		}
		next = __cfqq->new_cfqq;
		cfq_put_queue(__cfqq);
		__cfqq = next;
	}
}

static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	if (unlikely(cfqq == cfqd->active_queue)) {
		__cfq_slice_expired(cfqd, cfqq, 0);
		cfq_schedule_dispatch(cfqd);
	}

	cfq_put_cooperator(cfqq);

	cfq_put_queue(cfqq);
}

static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
					 struct cfq_io_context *cic)
{
	struct io_context *ioc = cic->ioc;

	list_del_init(&cic->queue_list);

	/*
	 * Make sure dead mark is seen for dead queues
	 */
	smp_wmb();
	cic->key = cfqd_dead_key(cfqd);

	if (ioc->ioc_data == cic)
		rcu_assign_pointer(ioc->ioc_data, NULL);

	if (cic->cfqq[BLK_RW_ASYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
		cic->cfqq[BLK_RW_ASYNC] = NULL;
	}

	if (cic->cfqq[BLK_RW_SYNC]) {
		cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
		cic->cfqq[BLK_RW_SYNC] = NULL;
	}
}

static void cfq_exit_single_io_context(struct io_context *ioc,
				       struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);

	if (cfqd) {
		struct request_queue *q = cfqd->queue;
		unsigned long flags;

		spin_lock_irqsave(q->queue_lock, flags);

		/*
		 * Ensure we get a fresh copy of the ->key to prevent
		 * race between exiting task and queue
		 */
		smp_read_barrier_depends();
		if (cic->key == cfqd)
			__cfq_exit_single_io_context(cfqd, cic);

		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

/*
 * The process that ioc belongs to has exited, we need to clean up
 * and put the internal structures we have that belongs to that process.
 */
static void cfq_exit_io_context(struct io_context *ioc)
{
	call_for_each_cic(ioc, cfq_exit_single_io_context);
}

static struct cfq_io_context *
cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
{
	struct cfq_io_context *cic;

	cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
							cfqd->queue->node);
	if (cic) {
		cic->last_end_request = jiffies;
		INIT_LIST_HEAD(&cic->queue_list);
		INIT_HLIST_NODE(&cic->cic_list);
		cic->dtor = cfq_free_io_context;
		cic->exit = cfq_exit_io_context;
		elv_ioc_count_inc(cfq_ioc_count);
	}

	return cic;
}

static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
{
	struct task_struct *tsk = current;
	int ioprio_class;

	if (!cfq_cfqq_prio_changed(cfqq))
		return;

	ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
	switch (ioprio_class) {
	default:
		printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
	case IOPRIO_CLASS_NONE:
		/*
		 * no prio set, inherit CPU scheduling settings
		 */
		cfqq->ioprio = task_nice_ioprio(tsk);
		cfqq->ioprio_class = task_nice_ioclass(tsk);
		break;
	case IOPRIO_CLASS_RT:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_RT;
		break;
	case IOPRIO_CLASS_BE:
		cfqq->ioprio = task_ioprio(ioc);
		cfqq->ioprio_class = IOPRIO_CLASS_BE;
		break;
	case IOPRIO_CLASS_IDLE:
		cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
		cfqq->ioprio = 7;
		cfq_clear_cfqq_idle_window(cfqq);
		break;
	}

	/*
	 * keep track of original prio settings in case we have to temporarily
	 * elevate the priority of this queue
	 */
	cfqq->org_ioprio = cfqq->ioprio;
	cfqq->org_ioprio_class = cfqq->ioprio_class;
	cfq_clear_cfqq_prio_changed(cfqq);
}

static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
{
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	struct cfq_queue *cfqq;
	unsigned long flags;

	if (unlikely(!cfqd))
		return;

	spin_lock_irqsave(cfqd->queue->queue_lock, flags);

	cfqq = cic->cfqq[BLK_RW_ASYNC];
	if (cfqq) {
		struct cfq_queue *new_cfqq;
		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
						GFP_ATOMIC);
		if (new_cfqq) {
			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
			cfq_put_queue(cfqq);
		}
	}

	cfqq = cic->cfqq[BLK_RW_SYNC];
	if (cfqq)
		cfq_mark_cfqq_prio_changed(cfqq);

	spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
}

static void cfq_ioc_set_ioprio(struct io_context *ioc)
{
	call_for_each_cic(ioc, changed_ioprio);
	ioc->ioprio_changed = 0;
}

static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
			  pid_t pid, bool is_sync)
{
	RB_CLEAR_NODE(&cfqq->rb_node);
	RB_CLEAR_NODE(&cfqq->p_node);
	INIT_LIST_HEAD(&cfqq->fifo);

	atomic_set(&cfqq->ref, 0);
	cfqq->cfqd = cfqd;

	cfq_mark_cfqq_prio_changed(cfqq);

	if (is_sync) {
		if (!cfq_class_idle(cfqq))
			cfq_mark_cfqq_idle_window(cfqq);
		cfq_mark_cfqq_sync(cfqq);
	}
	cfqq->pid = pid;
}

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
{
	struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
	struct cfq_data *cfqd = cic_to_cfqd(cic);
	unsigned long flags;
	struct request_queue *q;

	if (unlikely(!cfqd))
		return;

	q = cfqd->queue;

	spin_lock_irqsave(q->queue_lock, flags);

	if (sync_cfqq) {
		/*
		 * Drop reference to sync queue. A new sync queue will be
		 * assigned in new group upon arrival of a fresh request.
		 */
		cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
		cic_set_cfqq(cic, NULL, 1);
		cfq_put_queue(sync_cfqq);
	}

	spin_unlock_irqrestore(q->queue_lock, flags);
}

static void cfq_ioc_set_cgroup(struct io_context *ioc)
{
	call_for_each_cic(ioc, changed_cgroup);
	ioc->cgroup_changed = 0;
}
#endif  /* CONFIG_CFQ_GROUP_IOSCHED */

static struct cfq_queue *
cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
		     struct io_context *ioc, gfp_t gfp_mask)
{
	struct cfq_queue *cfqq, *new_cfqq = NULL;
	struct cfq_io_context *cic;
	struct cfq_group *cfqg;

retry:
	cfqg = cfq_get_cfqg(cfqd, 1);
	cic = cfq_cic_lookup(cfqd, ioc);
	/* cic always exists here */
	cfqq = cic_to_cfqq(cic, is_sync);

	/*
	 * Always try a new alloc if we fell back to the OOM cfqq
	 * originally, since it should just be a temporary situation.
	 */
	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
		cfqq = NULL;
		if (new_cfqq) {
			cfqq = new_cfqq;
			new_cfqq = NULL;
		} else if (gfp_mask & __GFP_WAIT) {
			spin_unlock_irq(cfqd->queue->queue_lock);
			new_cfqq = kmem_cache_alloc_node(cfq_pool,
					gfp_mask | __GFP_ZERO,
					cfqd->queue->node);
			spin_lock_irq(cfqd->queue->queue_lock);
			if (new_cfqq)
				goto retry;
		} else {
			cfqq = kmem_cache_alloc_node(cfq_pool,
					gfp_mask | __GFP_ZERO,
					cfqd->queue->node);
		}

		if (cfqq) {
			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
			cfq_init_prio_data(cfqq, ioc);
			cfq_link_cfqq_cfqg(cfqq, cfqg);
			cfq_log_cfqq(cfqd, cfqq, "alloced");
		} else
			cfqq = &cfqd->oom_cfqq;
	}

	if (new_cfqq)
		kmem_cache_free(cfq_pool, new_cfqq);

	return cfqq;
}

static struct cfq_queue **
cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
{
	switch (ioprio_class) {
	case IOPRIO_CLASS_RT:
		return &cfqd->async_cfqq[0][ioprio];
	case IOPRIO_CLASS_BE:
		return &cfqd->async_cfqq[1][ioprio];
	case IOPRIO_CLASS_IDLE:
		return &cfqd->async_idle_cfqq;
	default:
		BUG();
	}
}

static struct cfq_queue *
cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
	      gfp_t gfp_mask)
{
	const int ioprio = task_ioprio(ioc);
	const int ioprio_class = task_ioprio_class(ioc);
	struct cfq_queue **async_cfqq = NULL;
	struct cfq_queue *cfqq = NULL;

	if (!is_sync) {
		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
		cfqq = *async_cfqq;
	}

	if (!cfqq)
		cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);

	/*
	 * pin the queue now that it's allocated, scheduler exit will prune it
	 */
	if (!is_sync && !(*async_cfqq)) {
		atomic_inc(&cfqq->ref);
		*async_cfqq = cfqq;
	}

	atomic_inc(&cfqq->ref);
	return cfqq;
}

/*
 * We drop cfq io contexts lazily, so we may find a dead one.
 */
static void
cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
		  struct cfq_io_context *cic)
{
	unsigned long flags;

	WARN_ON(!list_empty(&cic->queue_list));
	BUG_ON(cic->key != cfqd_dead_key(cfqd));

	spin_lock_irqsave(&ioc->lock, flags);

	BUG_ON(ioc->ioc_data == cic);

	radix_tree_delete(&ioc->radix_root, cfqd->cic_index);
	hlist_del_rcu(&cic->cic_list);
	spin_unlock_irqrestore(&ioc->lock, flags);

	cfq_cic_free(cic);
}

static struct cfq_io_context *
cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
{
	struct cfq_io_context *cic;
	unsigned long flags;

	if (unlikely(!ioc))
		return NULL;

	rcu_read_lock();

	/*
	 * we maintain a last-hit cache, to avoid browsing over the tree
	 */
	cic = rcu_dereference(ioc->ioc_data);
	if (cic && cic->key == cfqd) {
		rcu_read_unlock();
		return cic;
	}

	do {
		cic = radix_tree_lookup(&ioc->radix_root, cfqd->cic_index);
		rcu_read_unlock();
		if (!cic)
			break;
		if (unlikely(cic->key != cfqd)) {
			cfq_drop_dead_cic(cfqd, ioc, cic);
			rcu_read_lock();
			continue;
2955 } 2964 }
2956 2965
2957 spin_lock_irqsave(&ioc->lock, flags); 2966 spin_lock_irqsave(&ioc->lock, flags);
2958 rcu_assign_pointer(ioc->ioc_data, cic); 2967 rcu_assign_pointer(ioc->ioc_data, cic);
2959 spin_unlock_irqrestore(&ioc->lock, flags); 2968 spin_unlock_irqrestore(&ioc->lock, flags);
2960 break; 2969 break;
2961 } while (1); 2970 } while (1);
2962 2971
2963 return cic; 2972 return cic;
2964 } 2973 }
2965 2974
2966 /* 2975 /*
2967 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 2976 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2968 * the process specific cfq io context when entered from the block layer. 2977 * the process specific cfq io context when entered from the block layer.
2969 * Also adds the cic to a per-cfqd list, used when this queue is removed. 2978 * Also adds the cic to a per-cfqd list, used when this queue is removed.
2970 */ 2979 */
2971 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 2980 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
2972 struct cfq_io_context *cic, gfp_t gfp_mask) 2981 struct cfq_io_context *cic, gfp_t gfp_mask)
2973 { 2982 {
2974 unsigned long flags; 2983 unsigned long flags;
2975 int ret; 2984 int ret;
2976 2985
2977 ret = radix_tree_preload(gfp_mask); 2986 ret = radix_tree_preload(gfp_mask);
2978 if (!ret) { 2987 if (!ret) {
2979 cic->ioc = ioc; 2988 cic->ioc = ioc;
2980 cic->key = cfqd; 2989 cic->key = cfqd;
2981 2990
2982 spin_lock_irqsave(&ioc->lock, flags); 2991 spin_lock_irqsave(&ioc->lock, flags);
2983 ret = radix_tree_insert(&ioc->radix_root, 2992 ret = radix_tree_insert(&ioc->radix_root,
2984 cfqd->cic_index, cic); 2993 cfqd->cic_index, cic);
2985 if (!ret) 2994 if (!ret)
2986 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2995 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
2987 spin_unlock_irqrestore(&ioc->lock, flags); 2996 spin_unlock_irqrestore(&ioc->lock, flags);
2988 2997
2989 radix_tree_preload_end(); 2998 radix_tree_preload_end();
2990 2999
2991 if (!ret) { 3000 if (!ret) {
2992 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3001 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2993 list_add(&cic->queue_list, &cfqd->cic_list); 3002 list_add(&cic->queue_list, &cfqd->cic_list);
2994 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3003 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2995 } 3004 }
2996 } 3005 }
2997 3006
2998 if (ret) 3007 if (ret)
2999 printk(KERN_ERR "cfq: cic link failed!\n"); 3008 printk(KERN_ERR "cfq: cic link failed!\n");
3000 3009
3001 return ret; 3010 return ret;
3002 } 3011 }
3003 3012
3004 /* 3013 /*
3005 * Setup general io context and cfq io context. There can be several cfq 3014 * Setup general io context and cfq io context. There can be several cfq
3006 * io contexts per general io context, if this process is doing io to more 3015 * io contexts per general io context, if this process is doing io to more
3007 * than one device managed by cfq. 3016 * than one device managed by cfq.
3008 */ 3017 */
3009 static struct cfq_io_context * 3018 static struct cfq_io_context *
3010 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 3019 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
3011 { 3020 {
3012 struct io_context *ioc = NULL; 3021 struct io_context *ioc = NULL;
3013 struct cfq_io_context *cic; 3022 struct cfq_io_context *cic;
3014 3023
3015 might_sleep_if(gfp_mask & __GFP_WAIT); 3024 might_sleep_if(gfp_mask & __GFP_WAIT);
3016 3025
3017 ioc = get_io_context(gfp_mask, cfqd->queue->node); 3026 ioc = get_io_context(gfp_mask, cfqd->queue->node);
3018 if (!ioc) 3027 if (!ioc)
3019 return NULL; 3028 return NULL;
3020 3029
3021 cic = cfq_cic_lookup(cfqd, ioc); 3030 cic = cfq_cic_lookup(cfqd, ioc);
3022 if (cic) 3031 if (cic)
3023 goto out; 3032 goto out;
3024 3033
3025 cic = cfq_alloc_io_context(cfqd, gfp_mask); 3034 cic = cfq_alloc_io_context(cfqd, gfp_mask);
3026 if (cic == NULL) 3035 if (cic == NULL)
3027 goto err; 3036 goto err;
3028 3037
3029 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 3038 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
3030 goto err_free; 3039 goto err_free;
3031 3040
3032 out: 3041 out:
3033 smp_read_barrier_depends(); 3042 smp_read_barrier_depends();
3034 if (unlikely(ioc->ioprio_changed)) 3043 if (unlikely(ioc->ioprio_changed))
3035 cfq_ioc_set_ioprio(ioc); 3044 cfq_ioc_set_ioprio(ioc);
3036 3045
3037 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3046 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3038 if (unlikely(ioc->cgroup_changed)) 3047 if (unlikely(ioc->cgroup_changed))
3039 cfq_ioc_set_cgroup(ioc); 3048 cfq_ioc_set_cgroup(ioc);
3040 #endif 3049 #endif
3041 return cic; 3050 return cic;
3042 err_free: 3051 err_free:
3043 cfq_cic_free(cic); 3052 cfq_cic_free(cic);
3044 err: 3053 err:
3045 put_io_context(ioc); 3054 put_io_context(ioc);
3046 return NULL; 3055 return NULL;
3047 } 3056 }
3048 3057
3049 static void 3058 static void
3050 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 3059 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
3051 { 3060 {
3052 unsigned long elapsed = jiffies - cic->last_end_request; 3061 unsigned long elapsed = jiffies - cic->last_end_request;
3053 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 3062 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
3054 3063
3055 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 3064 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
3056 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 3065 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
3057 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 3066 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
3058 } 3067 }
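
For reference, the think-time tracking above is a fixed-point exponentially weighted moving average: both the sample count and the total are kept scaled by 256 and decayed by 7/8 on every completion, so ttime_mean tracks the recent per-request think time without floating point. The following user-space sketch (hypothetical names, plain unsigned long arithmetic, no kernel types) shows the same arithmetic in isolation:

	#include <stdio.h>

	/* Standalone model of the fixed-point EWMA used for think times. */
	struct ttime_model {
		unsigned long samples;	/* scaled by 256, decays by 7/8 */
		unsigned long total;	/* scaled by 256, decays by 7/8 */
		unsigned long mean;
	};

	static void ttime_update(struct ttime_model *t, unsigned long ttime)
	{
		t->samples = (7 * t->samples + 256) / 8;
		t->total   = (7 * t->total + 256 * ttime) / 8;
		t->mean    = (t->total + 128) / t->samples;	/* rounded division */
	}

	int main(void)
	{
		struct ttime_model t = { 0, 0, 0 };
		/* Feed a few think-time samples (in ticks) and watch the mean adapt. */
		unsigned long input[] = { 2, 2, 2, 10, 10, 10 };
		unsigned long i;

		for (i = 0; i < sizeof(input) / sizeof(input[0]); i++) {
			ttime_update(&t, input[i]);
			printf("sample=%lu mean=%lu\n", input[i], t.mean);
		}
		return 0;
	}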

static void
cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct request *rq)
{
	sector_t sdist = 0;
	sector_t n_sec = blk_rq_sectors(rq);
	if (cfqq->last_request_pos) {
		if (cfqq->last_request_pos < blk_rq_pos(rq))
			sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
		else
			sdist = cfqq->last_request_pos - blk_rq_pos(rq);
	}

	cfqq->seek_history <<= 1;
	if (blk_queue_nonrot(cfqd->queue))
		cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
	else
		cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
}
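
The seek_history field updated above is a shift register: each request shifts in one bit recording whether it looked "seeky" (a long seek distance on rotational storage, or a small request on non-rotational storage). The CFQQ_SEEKY() test used elsewhere in the file counts the set bits against a threshold; neither the threshold nor CFQQ_SEEK_THR appears in this hunk, so the sketch below uses made-up constants and a simple popcount test purely for illustration:

	#include <stdint.h>
	#include <stdio.h>

	#define SEEK_THR	1024U	/* assumed seek-distance threshold, in sectors */
	#define SEEKY_BITS	8	/* assumed bit-count threshold, not the kernel's value */

	/* Record one request in a 32-sample history of "was this request seeky?". */
	static uint32_t seek_history_update(uint32_t history, uint32_t seek_distance)
	{
		history <<= 1;
		history |= (seek_distance > SEEK_THR);
		return history;
	}

	/* Classify the queue as seeky if enough recent requests were seeky. */
	static int queue_is_seeky(uint32_t history)
	{
		return __builtin_popcount(history) > SEEKY_BITS;
	}

	int main(void)
	{
		uint32_t history = 0;
		uint32_t distances[] = { 8, 16, 5000, 9000, 12, 7000, 6500, 8000,
					 9500, 10000, 3, 11000 };
		unsigned long i;

		for (i = 0; i < sizeof(distances) / sizeof(distances[0]); i++)
			history = seek_history_update(history, distances[i]);

		printf("history=0x%08x seeky=%d\n", history, queue_is_seeky(history));
		return 0;
	}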

/*
 * Disable idle window if the process thinks too long or seeks so much that
 * it doesn't matter
 */
static void
cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		       struct cfq_io_context *cic)
{
	int old_idle, enable_idle;

	/*
	 * Don't idle for async or idle io prio class
	 */
	if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
		return;

	enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);

	if (cfqq->queued[0] + cfqq->queued[1] >= 4)
		cfq_mark_cfqq_deep(cfqq);

	if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
	    (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
		enable_idle = 0;
	else if (sample_valid(cic->ttime_samples)) {
		if (cic->ttime_mean > cfqd->cfq_slice_idle)
			enable_idle = 0;
		else
			enable_idle = 1;
	}

	if (old_idle != enable_idle) {
		cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
		if (enable_idle)
			cfq_mark_cfqq_idle_window(cfqq);
		else
			cfq_clear_cfqq_idle_window(cfqq);
	}
}

/*
 * Check if new_cfqq should preempt the currently active queue. Return 0 for
 * no or if we aren't sure, a 1 will cause a preempt.
 */
static bool
cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
		   struct request *rq)
{
	struct cfq_queue *cfqq;

	cfqq = cfqd->active_queue;
	if (!cfqq)
		return false;

	if (cfq_class_idle(new_cfqq))
		return false;

	if (cfq_class_idle(cfqq))
		return true;

	/*
	 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
	 */
	if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
		return false;

	/*
	 * if the new request is sync, but the currently running queue is
	 * not, let the sync request have priority.
	 */
	if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
		return true;

	if (new_cfqq->cfqg != cfqq->cfqg)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* Allow preemption only if we are idling on sync-noidle tree */
	if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
	    cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
	    new_cfqq->service_tree->count == 2 &&
	    RB_EMPTY_ROOT(&cfqq->sort_list))
		return true;

	/*
	 * So both queues are sync. Let the new request get disk time if
	 * it's a metadata request and the current queue is doing regular IO.
	 */
	if (rq_is_meta(rq) && !cfqq->meta_pending)
		return true;

	/*
	 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
	 */
	if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
		return true;

	if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
		return false;

	/*
	 * if this request is as-good as one we would expect from the
	 * current cfqq, let it preempt
	 */
	if (cfq_rq_close(cfqd, cfqq, rq))
		return true;

	return false;
}

/*
 * cfqq preempts the active queue. if we allowed preempt with no slice left,
 * let it have half of its nominal slice.
 */
static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	cfq_log_cfqq(cfqd, cfqq, "preempt");
	cfq_slice_expired(cfqd, 1);

	/*
	 * Put the new queue at the front of the of the current list,
	 * so we know that it will be selected next.
	 */
	BUG_ON(!cfq_cfqq_on_rr(cfqq));

	cfq_service_tree_add(cfqd, cfqq, 1);

	cfqq->slice_end = 0;
	cfq_mark_cfqq_slice_new(cfqq);
}

/*
 * Called when a new fs request (rq) is added (to cfqq). Check if there's
 * something we should do about it
 */
static void
cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
		struct request *rq)
{
	struct cfq_io_context *cic = RQ_CIC(rq);

	cfqd->rq_queued++;
	if (rq_is_meta(rq))
		cfqq->meta_pending++;

	cfq_update_io_thinktime(cfqd, cic);
	cfq_update_io_seektime(cfqd, cfqq, rq);
	cfq_update_idle_window(cfqd, cfqq, cic);

	cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);

	if (cfqq == cfqd->active_queue) {
		/*
		 * Remember that we saw a request from this process, but
		 * don't start queuing just yet. Otherwise we risk seeing lots
		 * of tiny requests, because we disrupt the normal plugging
		 * and merging. If the request is already larger than a single
		 * page, let it rip immediately. For that case we assume that
		 * merging is already done. Ditto for a busy system that
		 * has other work pending, don't risk delaying until the
		 * idle timer unplug to continue working.
		 */
		if (cfq_cfqq_wait_request(cfqq)) {
			if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
			    cfqd->busy_queues > 1) {
				cfq_del_timer(cfqd, cfqq);
				cfq_clear_cfqq_wait_request(cfqq);
				__blk_run_queue(cfqd->queue);
			} else {
				blkiocg_update_idle_time_stats(
						&cfqq->cfqg->blkg);
				cfq_mark_cfqq_must_dispatch(cfqq);
			}
		}
	} else if (cfq_should_preempt(cfqd, cfqq, rq)) {
		/*
		 * not the active queue - expire current slice if it is
		 * idle and has expired it's mean thinktime or this new queue
		 * has some old slice time left and is of higher priority or
		 * this new queue is RT and the current one is BE
		 */
		cfq_preempt_queue(cfqd, cfqq);
		__blk_run_queue(cfqd->queue);
	}
}

static void cfq_insert_request(struct request_queue *q, struct request *rq)
{
	struct cfq_data *cfqd = q->elevator->elevator_data;
	struct cfq_queue *cfqq = RQ_CFQQ(rq);

	cfq_log_cfqq(cfqd, cfqq, "insert_request");
	cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);

	rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
	list_add_tail(&rq->queuelist, &cfqq->fifo);
	cfq_add_rq_rb(rq);
	blkiocg_update_io_add_stats(&(RQ_CFQG(rq))->blkg,
			&cfqd->serving_group->blkg, rq_data_dir(rq),
			rq_is_sync(rq));
	cfq_rq_enqueued(cfqd, cfqq, rq);
}

/*
 * Update hw_tag based on peak queue depth over 50 samples under
 * sufficient load.
 */
static void cfq_update_hw_tag(struct cfq_data *cfqd)
{
	struct cfq_queue *cfqq = cfqd->active_queue;

	if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
		cfqd->hw_tag_est_depth = cfqd->rq_in_driver;

	if (cfqd->hw_tag == 1)
		return;

	if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
	    cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
		return;

	/*
	 * If active queue hasn't enough requests and can idle, cfq might not
	 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
	 * case
	 */
	if (cfqq && cfq_cfqq_idle_window(cfqq) &&
	    cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
	    CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
		return;

	if (cfqd->hw_tag_samples++ < 50)
		return;

	if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
		cfqd->hw_tag = 1;
	else
		cfqd->hw_tag = 0;
}
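
The hw_tag estimation above is a one-shot heuristic: track the peak number of requests observed in the driver, and after roughly 50 samples taken under sufficient load decide whether the device appears to queue multiple commands. A stripped-down, user-space model of the same idea (hypothetical names and thresholds, and without the idle-window exception) could look like this:

	#include <stdio.h>

	#define HW_QUEUE_MIN	5	/* same spirit as CFQ_HW_QUEUE_MIN; value assumed */
	#define HW_TAG_SAMPLES	50

	struct hw_tag_est {
		int hw_tag;		/* -1 unknown, 0 no queueing, 1 queueing */
		int est_depth;		/* peak requests observed in flight */
		int samples;
	};

	/* Feed one observation of how many requests are currently in the driver. */
	static void hw_tag_sample(struct hw_tag_est *e, int in_driver)
	{
		if (in_driver > e->est_depth)
			e->est_depth = in_driver;

		if (e->hw_tag == 1)
			return;			/* already decided: device queues */

		if (in_driver <= HW_QUEUE_MIN)
			return;			/* not enough load to judge */

		if (e->samples++ < HW_TAG_SAMPLES)
			return;			/* keep collecting */

		e->hw_tag = (e->est_depth >= HW_QUEUE_MIN);
	}

	int main(void)
	{
		struct hw_tag_est e = { -1, 0, 0 };
		int i;

		for (i = 0; i < 100; i++)
			hw_tag_sample(&e, 8);	/* pretend 8 requests stay in flight */

		printf("hw_tag=%d est_depth=%d\n", e.hw_tag, e.est_depth);
		return 0;
	}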

static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
{
	struct cfq_io_context *cic = cfqd->active_cic;

	/* If there are other queues in the group, don't wait */
	if (cfqq->cfqg->nr_cfqq > 1)
		return false;

	if (cfq_slice_used(cfqq))
		return true;

	/* if slice left is less than think time, wait busy */
	if (cic && sample_valid(cic->ttime_samples)
	    && (cfqq->slice_end - jiffies < cic->ttime_mean))
		return true;

	/*
	 * If think times is less than a jiffy than ttime_mean=0 and above
	 * will not be true. It might happen that slice has not expired yet
	 * but will expire soon (4-5 ns) during select_queue(). To cover the
	 * case where think time is less than a jiffy, mark the queue wait
	 * busy if only 1 jiffy is left in the slice.
	 */
	if (cfqq->slice_end - jiffies == 1)
		return true;

	return false;
}
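
As the comment notes, a think time below one jiffy rounds ttime_mean down to zero, so the "slice left is less than think time" test can never fire; the extra one-jiffy check closes that gap. A small user-space model of just those two paths (hypothetical names, tick values passed in explicitly, group and slice-used checks omitted) illustrates the decision:

	#include <stdio.h>

	/*
	 * Wait busy if the remaining slice is smaller than the mean think time,
	 * or if exactly one tick remains and the mean rounded down to zero.
	 */
	static int should_wait_busy(unsigned long now, unsigned long slice_end,
				    unsigned long ttime_mean)
	{
		if (slice_end - now < ttime_mean)
			return 1;
		if (slice_end - now == 1)	/* covers ttime_mean == 0 */
			return 1;
		return 0;
	}

	int main(void)
	{
		/* 3 ticks left, mean think time 5 ticks: wait busy. */
		printf("%d\n", should_wait_busy(100, 103, 5));
		/* 1 tick left, sub-tick think time (mean rounds to 0): still wait. */
		printf("%d\n", should_wait_busy(100, 101, 0));
		/* 4 ticks left, mean 2 ticks: no need to wait. */
		printf("%d\n", should_wait_busy(100, 104, 2));
		return 0;
	}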
3350 3359
3351 static void cfq_completed_request(struct request_queue *q, struct request *rq) 3360 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3352 { 3361 {
3353 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3362 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3354 struct cfq_data *cfqd = cfqq->cfqd; 3363 struct cfq_data *cfqd = cfqq->cfqd;
3355 const int sync = rq_is_sync(rq); 3364 const int sync = rq_is_sync(rq);
3356 unsigned long now; 3365 unsigned long now;
3357 3366
3358 now = jiffies; 3367 now = jiffies;
3359 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3368 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
3360 3369
3361 cfq_update_hw_tag(cfqd); 3370 cfq_update_hw_tag(cfqd);
3362 3371
3363 WARN_ON(!cfqd->rq_in_driver); 3372 WARN_ON(!cfqd->rq_in_driver);
3364 WARN_ON(!cfqq->dispatched); 3373 WARN_ON(!cfqq->dispatched);
3365 cfqd->rq_in_driver--; 3374 cfqd->rq_in_driver--;
3366 cfqq->dispatched--; 3375 cfqq->dispatched--;
3367 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq), 3376 blkiocg_update_completion_stats(&cfqq->cfqg->blkg, rq_start_time_ns(rq),
3368 rq_io_start_time_ns(rq), rq_data_dir(rq), 3377 rq_io_start_time_ns(rq), rq_data_dir(rq),
3369 rq_is_sync(rq)); 3378 rq_is_sync(rq));
3370 3379
3371 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3380 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3372 3381
3373 if (sync) { 3382 if (sync) {
3374 RQ_CIC(rq)->last_end_request = now; 3383 RQ_CIC(rq)->last_end_request = now;
3375 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3384 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3376 cfqd->last_delayed_sync = now; 3385 cfqd->last_delayed_sync = now;
3377 } 3386 }
3378 3387
3379 /* 3388 /*
3380 * If this is the active queue, check if it needs to be expired, 3389 * If this is the active queue, check if it needs to be expired,
3381 * or if we want to idle in case it has no pending requests. 3390 * or if we want to idle in case it has no pending requests.
3382 */ 3391 */
3383 if (cfqd->active_queue == cfqq) { 3392 if (cfqd->active_queue == cfqq) {
3384 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3393 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3385 3394
3386 if (cfq_cfqq_slice_new(cfqq)) { 3395 if (cfq_cfqq_slice_new(cfqq)) {
3387 cfq_set_prio_slice(cfqd, cfqq); 3396 cfq_set_prio_slice(cfqd, cfqq);
3388 cfq_clear_cfqq_slice_new(cfqq); 3397 cfq_clear_cfqq_slice_new(cfqq);
3389 } 3398 }
3390 3399
3391 /* 3400 /*
3392 * Should we wait for next request to come in before we expire 3401 * Should we wait for next request to come in before we expire
3393 * the queue. 3402 * the queue.
3394 */ 3403 */
3395 if (cfq_should_wait_busy(cfqd, cfqq)) { 3404 if (cfq_should_wait_busy(cfqd, cfqq)) {
3396 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3405 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3397 cfq_mark_cfqq_wait_busy(cfqq); 3406 cfq_mark_cfqq_wait_busy(cfqq);
3398 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3407 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3399 } 3408 }
3400 3409
3401 /* 3410 /*
3402 * Idling is not enabled on: 3411 * Idling is not enabled on:
3403 * - expired queues 3412 * - expired queues
3404 * - idle-priority queues 3413 * - idle-priority queues
3405 * - async queues 3414 * - async queues
3406 * - queues with still some requests queued 3415 * - queues with still some requests queued
3407 * - when there is a close cooperator 3416 * - when there is a close cooperator
3408 */ 3417 */
3409 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3418 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3410 cfq_slice_expired(cfqd, 1); 3419 cfq_slice_expired(cfqd, 1);
3411 else if (sync && cfqq_empty && 3420 else if (sync && cfqq_empty &&
3412 !cfq_close_cooperator(cfqd, cfqq)) { 3421 !cfq_close_cooperator(cfqd, cfqq)) {
3413 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 3422 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3414 /* 3423 /*
3415 * Idling is enabled for SYNC_WORKLOAD. 3424 * Idling is enabled for SYNC_WORKLOAD.
3416 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3425 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3417 * only if we processed at least one !rq_noidle request 3426 * only if we processed at least one !rq_noidle request
3418 */ 3427 */
3419 if (cfqd->serving_type == SYNC_WORKLOAD 3428 if (cfqd->serving_type == SYNC_WORKLOAD
3420 || cfqd->noidle_tree_requires_idle 3429 || cfqd->noidle_tree_requires_idle
3421 || cfqq->cfqg->nr_cfqq == 1) 3430 || cfqq->cfqg->nr_cfqq == 1)
3422 cfq_arm_slice_timer(cfqd); 3431 cfq_arm_slice_timer(cfqd);
3423 } 3432 }
3424 } 3433 }
3425 3434
3426 if (!cfqd->rq_in_driver) 3435 if (!cfqd->rq_in_driver)
3427 cfq_schedule_dispatch(cfqd); 3436 cfq_schedule_dispatch(cfqd);
3428 } 3437 }
3429 3438
3430 /* 3439 /*
3431 * we temporarily boost lower priority queues if they are holding fs exclusive 3440 * we temporarily boost lower priority queues if they are holding fs exclusive
3432 * resources. they are boosted to normal prio (CLASS_BE/4) 3441 * resources. they are boosted to normal prio (CLASS_BE/4)
3433 */ 3442 */
3434 static void cfq_prio_boost(struct cfq_queue *cfqq) 3443 static void cfq_prio_boost(struct cfq_queue *cfqq)
3435 { 3444 {
3436 if (has_fs_excl()) { 3445 if (has_fs_excl()) {
3437 /* 3446 /*
3438 * boost idle prio on transactions that would lock out other 3447 * boost idle prio on transactions that would lock out other
3439 * users of the filesystem 3448 * users of the filesystem
3440 */ 3449 */
3441 if (cfq_class_idle(cfqq)) 3450 if (cfq_class_idle(cfqq))
3442 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3451 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3443 if (cfqq->ioprio > IOPRIO_NORM) 3452 if (cfqq->ioprio > IOPRIO_NORM)
3444 cfqq->ioprio = IOPRIO_NORM; 3453 cfqq->ioprio = IOPRIO_NORM;
3445 } else { 3454 } else {
3446 /* 3455 /*
3447 * unboost the queue (if needed) 3456 * unboost the queue (if needed)
3448 */ 3457 */
3449 cfqq->ioprio_class = cfqq->org_ioprio_class; 3458 cfqq->ioprio_class = cfqq->org_ioprio_class;
3450 cfqq->ioprio = cfqq->org_ioprio; 3459 cfqq->ioprio = cfqq->org_ioprio;
3451 } 3460 }
3452 } 3461 }
3453 3462
3454 static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3463 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3455 { 3464 {
3456 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3465 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3457 cfq_mark_cfqq_must_alloc_slice(cfqq); 3466 cfq_mark_cfqq_must_alloc_slice(cfqq);
3458 return ELV_MQUEUE_MUST; 3467 return ELV_MQUEUE_MUST;
3459 } 3468 }
3460 3469
3461 return ELV_MQUEUE_MAY; 3470 return ELV_MQUEUE_MAY;
3462 } 3471 }
3463 3472
3464 static int cfq_may_queue(struct request_queue *q, int rw) 3473 static int cfq_may_queue(struct request_queue *q, int rw)
3465 { 3474 {
3466 struct cfq_data *cfqd = q->elevator->elevator_data; 3475 struct cfq_data *cfqd = q->elevator->elevator_data;
3467 struct task_struct *tsk = current; 3476 struct task_struct *tsk = current;
3468 struct cfq_io_context *cic; 3477 struct cfq_io_context *cic;
3469 struct cfq_queue *cfqq; 3478 struct cfq_queue *cfqq;
3470 3479
3471 /* 3480 /*
3472 * don't force setup of a queue from here, as a call to may_queue 3481 * don't force setup of a queue from here, as a call to may_queue
3473 * does not necessarily imply that a request actually will be queued. 3482 * does not necessarily imply that a request actually will be queued.
3474 * so just lookup a possibly existing queue, or return 'may queue' 3483 * so just lookup a possibly existing queue, or return 'may queue'
3475 * if that fails 3484 * if that fails
3476 */ 3485 */
3477 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3486 cic = cfq_cic_lookup(cfqd, tsk->io_context);
3478 if (!cic) 3487 if (!cic)
3479 return ELV_MQUEUE_MAY; 3488 return ELV_MQUEUE_MAY;
3480 3489
3481 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3490 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3482 if (cfqq) { 3491 if (cfqq) {
3483 cfq_init_prio_data(cfqq, cic->ioc); 3492 cfq_init_prio_data(cfqq, cic->ioc);
3484 cfq_prio_boost(cfqq); 3493 cfq_prio_boost(cfqq);
3485 3494
3486 return __cfq_may_queue(cfqq); 3495 return __cfq_may_queue(cfqq);
3487 } 3496 }
3488 3497
3489 return ELV_MQUEUE_MAY; 3498 return ELV_MQUEUE_MAY;
3490 } 3499 }
3491 3500
3492 /* 3501 /*
3493 * queue lock held here 3502 * queue lock held here
3494 */ 3503 */
3495 static void cfq_put_request(struct request *rq) 3504 static void cfq_put_request(struct request *rq)
3496 { 3505 {
3497 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3506 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3498 3507
3499 if (cfqq) { 3508 if (cfqq) {
3500 const int rw = rq_data_dir(rq); 3509 const int rw = rq_data_dir(rq);
3501 3510
3502 BUG_ON(!cfqq->allocated[rw]); 3511 BUG_ON(!cfqq->allocated[rw]);
3503 cfqq->allocated[rw]--; 3512 cfqq->allocated[rw]--;
3504 3513
3505 put_io_context(RQ_CIC(rq)->ioc); 3514 put_io_context(RQ_CIC(rq)->ioc);
3506 3515
3507 rq->elevator_private = NULL; 3516 rq->elevator_private = NULL;
3508 rq->elevator_private2 = NULL; 3517 rq->elevator_private2 = NULL;
3509 3518
3510 /* Put down rq reference on cfqg */ 3519 /* Put down rq reference on cfqg */
3511 cfq_put_cfqg(RQ_CFQG(rq)); 3520 cfq_put_cfqg(RQ_CFQG(rq));
3512 rq->elevator_private3 = NULL; 3521 rq->elevator_private3 = NULL;
3513 3522
3514 cfq_put_queue(cfqq); 3523 cfq_put_queue(cfqq);
3515 } 3524 }
3516 } 3525 }
3517 3526
3518 static struct cfq_queue * 3527 static struct cfq_queue *
3519 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3528 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3520 struct cfq_queue *cfqq) 3529 struct cfq_queue *cfqq)
3521 { 3530 {
3522 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3531 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3523 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3532 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3524 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3533 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3525 cfq_put_queue(cfqq); 3534 cfq_put_queue(cfqq);
3526 return cic_to_cfqq(cic, 1); 3535 return cic_to_cfqq(cic, 1);
3527 } 3536 }
3528 3537
3529 /* 3538 /*
3530 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3539 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3531 * was the last process referring to said cfqq. 3540 * was the last process referring to said cfqq.
3532 */ 3541 */
3533 static struct cfq_queue * 3542 static struct cfq_queue *
3534 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3543 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3535 { 3544 {
3536 if (cfqq_process_refs(cfqq) == 1) { 3545 if (cfqq_process_refs(cfqq) == 1) {
3537 cfqq->pid = current->pid; 3546 cfqq->pid = current->pid;
3538 cfq_clear_cfqq_coop(cfqq); 3547 cfq_clear_cfqq_coop(cfqq);
3539 cfq_clear_cfqq_split_coop(cfqq); 3548 cfq_clear_cfqq_split_coop(cfqq);
3540 return cfqq; 3549 return cfqq;
3541 } 3550 }
3542 3551
3543 cic_set_cfqq(cic, NULL, 1); 3552 cic_set_cfqq(cic, NULL, 1);
3544 3553
3545 cfq_put_cooperator(cfqq); 3554 cfq_put_cooperator(cfqq);
3546 3555
3547 cfq_put_queue(cfqq); 3556 cfq_put_queue(cfqq);
3548 return NULL; 3557 return NULL;
3549 } 3558 }
3550 /* 3559 /*
3551 * Allocate cfq data structures associated with this request. 3560 * Allocate cfq data structures associated with this request.
3552 */ 3561 */
3553 static int 3562 static int
3554 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3563 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3555 { 3564 {
3556 struct cfq_data *cfqd = q->elevator->elevator_data; 3565 struct cfq_data *cfqd = q->elevator->elevator_data;
3557 struct cfq_io_context *cic; 3566 struct cfq_io_context *cic;
3558 const int rw = rq_data_dir(rq); 3567 const int rw = rq_data_dir(rq);
3559 const bool is_sync = rq_is_sync(rq); 3568 const bool is_sync = rq_is_sync(rq);
3560 struct cfq_queue *cfqq; 3569 struct cfq_queue *cfqq;
3561 unsigned long flags; 3570 unsigned long flags;
3562 3571
3563 might_sleep_if(gfp_mask & __GFP_WAIT); 3572 might_sleep_if(gfp_mask & __GFP_WAIT);
3564 3573
3565 cic = cfq_get_io_context(cfqd, gfp_mask); 3574 cic = cfq_get_io_context(cfqd, gfp_mask);
3566 3575
3567 spin_lock_irqsave(q->queue_lock, flags); 3576 spin_lock_irqsave(q->queue_lock, flags);
3568 3577
3569 if (!cic) 3578 if (!cic)
3570 goto queue_fail; 3579 goto queue_fail;
3571 3580
3572 new_queue: 3581 new_queue:
3573 cfqq = cic_to_cfqq(cic, is_sync); 3582 cfqq = cic_to_cfqq(cic, is_sync);
3574 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3583 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3575 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3584 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3576 cic_set_cfqq(cic, cfqq, is_sync); 3585 cic_set_cfqq(cic, cfqq, is_sync);
3577 } else { 3586 } else {
3578 /* 3587 /*
3579 * If the queue was seeky for too long, break it apart. 3588 * If the queue was seeky for too long, break it apart.
3580 */ 3589 */
3581 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3590 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3582 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3591 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3583 cfqq = split_cfqq(cic, cfqq); 3592 cfqq = split_cfqq(cic, cfqq);
3584 if (!cfqq) 3593 if (!cfqq)
3585 goto new_queue; 3594 goto new_queue;
3586 } 3595 }
3587 3596
3588 /* 3597 /*
3589 * Check to see if this queue is scheduled to merge with 3598 * Check to see if this queue is scheduled to merge with
3590 * another, closely cooperating queue. The merging of 3599 * another, closely cooperating queue. The merging of
3591 * queues happens here as it must be done in process context. 3600 * queues happens here as it must be done in process context.
3592 * The reference on new_cfqq was taken in merge_cfqqs. 3601 * The reference on new_cfqq was taken in merge_cfqqs.
3593 */ 3602 */
3594 if (cfqq->new_cfqq) 3603 if (cfqq->new_cfqq)
3595 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3604 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3596 } 3605 }
3597 3606
3598 cfqq->allocated[rw]++; 3607 cfqq->allocated[rw]++;
3599 atomic_inc(&cfqq->ref); 3608 atomic_inc(&cfqq->ref);
3600 3609
3601 spin_unlock_irqrestore(q->queue_lock, flags); 3610 spin_unlock_irqrestore(q->queue_lock, flags);
3602 3611
3603 rq->elevator_private = cic; 3612 rq->elevator_private = cic;
3604 rq->elevator_private2 = cfqq; 3613 rq->elevator_private2 = cfqq;
3605 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg); 3614 rq->elevator_private3 = cfq_ref_get_cfqg(cfqq->cfqg);
3606 return 0; 3615 return 0;
3607 3616
3608 queue_fail: 3617 queue_fail:
3609 if (cic) 3618 if (cic)
3610 put_io_context(cic->ioc); 3619 put_io_context(cic->ioc);
3611 3620
3612 cfq_schedule_dispatch(cfqd); 3621 cfq_schedule_dispatch(cfqd);
3613 spin_unlock_irqrestore(q->queue_lock, flags); 3622 spin_unlock_irqrestore(q->queue_lock, flags);
3614 cfq_log(cfqd, "set_request fail"); 3623 cfq_log(cfqd, "set_request fail");
3615 return 1; 3624 return 1;
3616 } 3625 }
3617 3626
3618 static void cfq_kick_queue(struct work_struct *work) 3627 static void cfq_kick_queue(struct work_struct *work)
3619 { 3628 {
3620 struct cfq_data *cfqd = 3629 struct cfq_data *cfqd =
3621 container_of(work, struct cfq_data, unplug_work); 3630 container_of(work, struct cfq_data, unplug_work);
3622 struct request_queue *q = cfqd->queue; 3631 struct request_queue *q = cfqd->queue;
3623 3632
3624 spin_lock_irq(q->queue_lock); 3633 spin_lock_irq(q->queue_lock);
3625 __blk_run_queue(cfqd->queue); 3634 __blk_run_queue(cfqd->queue);
3626 spin_unlock_irq(q->queue_lock); 3635 spin_unlock_irq(q->queue_lock);
3627 } 3636 }
3628 3637
3629 /* 3638 /*
3630 * Timer running if the active_queue is currently idling inside its time slice 3639 * Timer running if the active_queue is currently idling inside its time slice
3631 */ 3640 */
3632 static void cfq_idle_slice_timer(unsigned long data) 3641 static void cfq_idle_slice_timer(unsigned long data)
3633 { 3642 {
3634 struct cfq_data *cfqd = (struct cfq_data *) data; 3643 struct cfq_data *cfqd = (struct cfq_data *) data;
3635 struct cfq_queue *cfqq; 3644 struct cfq_queue *cfqq;
3636 unsigned long flags; 3645 unsigned long flags;
3637 int timed_out = 1; 3646 int timed_out = 1;
3638 3647
3639 cfq_log(cfqd, "idle timer fired"); 3648 cfq_log(cfqd, "idle timer fired");
3640 3649
3641 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3650 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3642 3651
3643 cfqq = cfqd->active_queue; 3652 cfqq = cfqd->active_queue;
3644 if (cfqq) { 3653 if (cfqq) {
3645 timed_out = 0; 3654 timed_out = 0;
3646 3655
3647 /* 3656 /*
3648 * We saw a request before the queue expired, let it through 3657 * We saw a request before the queue expired, let it through
3649 */ 3658 */
3650 if (cfq_cfqq_must_dispatch(cfqq)) 3659 if (cfq_cfqq_must_dispatch(cfqq))
3651 goto out_kick; 3660 goto out_kick;
3652 3661
3653 /* 3662 /*
3654 * expired 3663 * expired
3655 */ 3664 */
3656 if (cfq_slice_used(cfqq)) 3665 if (cfq_slice_used(cfqq))
3657 goto expire; 3666 goto expire;
3658 3667
3659 /* 3668 /*
3660 * only expire and reinvoke request handler, if there are 3669 * only expire and reinvoke request handler, if there are
3661 * other queues with pending requests 3670 * other queues with pending requests
3662 */ 3671 */
3663 if (!cfqd->busy_queues) 3672 if (!cfqd->busy_queues)
3664 goto out_cont; 3673 goto out_cont;
3665 3674
3666 /* 3675 /*
3667 * not expired and it has a request pending, let it dispatch 3676 * not expired and it has a request pending, let it dispatch
3668 */ 3677 */
3669 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3678 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3670 goto out_kick; 3679 goto out_kick;
3671 3680
3672 /* 3681 /*
3673 * Queue depth flag is reset only when the idle didn't succeed 3682 * Queue depth flag is reset only when the idle didn't succeed
3674 */ 3683 */
3675 cfq_clear_cfqq_deep(cfqq); 3684 cfq_clear_cfqq_deep(cfqq);
3676 } 3685 }
3677 expire: 3686 expire:
3678 cfq_slice_expired(cfqd, timed_out); 3687 cfq_slice_expired(cfqd, timed_out);
3679 out_kick: 3688 out_kick:
3680 cfq_schedule_dispatch(cfqd); 3689 cfq_schedule_dispatch(cfqd);
3681 out_cont: 3690 out_cont:
3682 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3691 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3683 } 3692 }
3684 3693
3685 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3694 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3686 { 3695 {
3687 del_timer_sync(&cfqd->idle_slice_timer); 3696 del_timer_sync(&cfqd->idle_slice_timer);
3688 cancel_work_sync(&cfqd->unplug_work); 3697 cancel_work_sync(&cfqd->unplug_work);
3689 } 3698 }
3690 3699
3691 static void cfq_put_async_queues(struct cfq_data *cfqd) 3700 static void cfq_put_async_queues(struct cfq_data *cfqd)
3692 { 3701 {
3693 int i; 3702 int i;
3694 3703
3695 for (i = 0; i < IOPRIO_BE_NR; i++) { 3704 for (i = 0; i < IOPRIO_BE_NR; i++) {
3696 if (cfqd->async_cfqq[0][i]) 3705 if (cfqd->async_cfqq[0][i])
3697 cfq_put_queue(cfqd->async_cfqq[0][i]); 3706 cfq_put_queue(cfqd->async_cfqq[0][i]);
3698 if (cfqd->async_cfqq[1][i]) 3707 if (cfqd->async_cfqq[1][i])
3699 cfq_put_queue(cfqd->async_cfqq[1][i]); 3708 cfq_put_queue(cfqd->async_cfqq[1][i]);
3700 } 3709 }
3701 3710
3702 if (cfqd->async_idle_cfqq) 3711 if (cfqd->async_idle_cfqq)
3703 cfq_put_queue(cfqd->async_idle_cfqq); 3712 cfq_put_queue(cfqd->async_idle_cfqq);
3704 } 3713 }
3705 3714
3706 static void cfq_cfqd_free(struct rcu_head *head) 3715 static void cfq_cfqd_free(struct rcu_head *head)
3707 { 3716 {
3708 kfree(container_of(head, struct cfq_data, rcu)); 3717 kfree(container_of(head, struct cfq_data, rcu));
3709 } 3718 }
3710 3719
3711 static void cfq_exit_queue(struct elevator_queue *e) 3720 static void cfq_exit_queue(struct elevator_queue *e)
3712 { 3721 {
3713 struct cfq_data *cfqd = e->elevator_data; 3722 struct cfq_data *cfqd = e->elevator_data;
3714 struct request_queue *q = cfqd->queue; 3723 struct request_queue *q = cfqd->queue;
3715 3724
3716 cfq_shutdown_timer_wq(cfqd); 3725 cfq_shutdown_timer_wq(cfqd);
3717 3726
3718 spin_lock_irq(q->queue_lock); 3727 spin_lock_irq(q->queue_lock);
3719 3728
3720 if (cfqd->active_queue) 3729 if (cfqd->active_queue)
3721 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3730 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3722 3731
3723 while (!list_empty(&cfqd->cic_list)) { 3732 while (!list_empty(&cfqd->cic_list)) {
3724 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3733 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3725 struct cfq_io_context, 3734 struct cfq_io_context,
3726 queue_list); 3735 queue_list);
3727 3736
3728 __cfq_exit_single_io_context(cfqd, cic); 3737 __cfq_exit_single_io_context(cfqd, cic);
3729 } 3738 }
3730 3739
3731 cfq_put_async_queues(cfqd); 3740 cfq_put_async_queues(cfqd);
3732 cfq_release_cfq_groups(cfqd); 3741 cfq_release_cfq_groups(cfqd);
3733 blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3742 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3734 3743
3735 spin_unlock_irq(q->queue_lock); 3744 spin_unlock_irq(q->queue_lock);
3736 3745
3737 cfq_shutdown_timer_wq(cfqd); 3746 cfq_shutdown_timer_wq(cfqd);
3738 3747
3739 spin_lock(&cic_index_lock); 3748 spin_lock(&cic_index_lock);
3740 ida_remove(&cic_index_ida, cfqd->cic_index); 3749 ida_remove(&cic_index_ida, cfqd->cic_index);
3741 spin_unlock(&cic_index_lock); 3750 spin_unlock(&cic_index_lock);
3742 3751
3743 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3752 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3744 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3753 call_rcu(&cfqd->rcu, cfq_cfqd_free);
3745 } 3754 }
3746 3755
3747 static int cfq_alloc_cic_index(void) 3756 static int cfq_alloc_cic_index(void)
3748 { 3757 {
3749 int index, error; 3758 int index, error;
3750 3759
3751 do { 3760 do {
3752 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL)) 3761 if (!ida_pre_get(&cic_index_ida, GFP_KERNEL))
3753 return -ENOMEM; 3762 return -ENOMEM;
3754 3763
3755 spin_lock(&cic_index_lock); 3764 spin_lock(&cic_index_lock);
3756 error = ida_get_new(&cic_index_ida, &index); 3765 error = ida_get_new(&cic_index_ida, &index);
3757 spin_unlock(&cic_index_lock); 3766 spin_unlock(&cic_index_lock);
3758 if (error && error != -EAGAIN) 3767 if (error && error != -EAGAIN)
3759 return error; 3768 return error;
3760 } while (error); 3769 } while (error);
3761 3770
3762 return index; 3771 return index;
3763 } 3772 }
3764 3773
3765 static void *cfq_init_queue(struct request_queue *q) 3774 static void *cfq_init_queue(struct request_queue *q)
3766 { 3775 {
3767 struct cfq_data *cfqd; 3776 struct cfq_data *cfqd;
3768 int i, j; 3777 int i, j;
3769 struct cfq_group *cfqg; 3778 struct cfq_group *cfqg;
3770 struct cfq_rb_root *st; 3779 struct cfq_rb_root *st;
3771 3780
3772 i = cfq_alloc_cic_index(); 3781 i = cfq_alloc_cic_index();
3773 if (i < 0) 3782 if (i < 0)
3774 return NULL; 3783 return NULL;
3775 3784
3776 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3785 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3777 if (!cfqd) 3786 if (!cfqd)
3778 return NULL; 3787 return NULL;
3779 3788
3780 cfqd->cic_index = i; 3789 cfqd->cic_index = i;
3781 3790
3782 /* Init root service tree */ 3791 /* Init root service tree */
3783 cfqd->grp_service_tree = CFQ_RB_ROOT; 3792 cfqd->grp_service_tree = CFQ_RB_ROOT;
3784 3793
3785 /* Init root group */ 3794 /* Init root group */
3786 cfqg = &cfqd->root_group; 3795 cfqg = &cfqd->root_group;
3787 for_each_cfqg_st(cfqg, i, j, st) 3796 for_each_cfqg_st(cfqg, i, j, st)
3788 *st = CFQ_RB_ROOT; 3797 *st = CFQ_RB_ROOT;
3789 RB_CLEAR_NODE(&cfqg->rb_node); 3798 RB_CLEAR_NODE(&cfqg->rb_node);
3790 3799
3791 /* Give preference to root group over other groups */ 3800 /* Give preference to root group over other groups */
3792 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3801 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3793 3802
3794 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3803 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3795 /* 3804 /*
3796 * Take a reference to root group which we never drop. This is just 3805 * Take a reference to root group which we never drop. This is just
3797 * to make sure that cfq_put_cfqg() does not try to kfree root group 3806 * to make sure that cfq_put_cfqg() does not try to kfree root group
3798 */ 3807 */
3799 atomic_set(&cfqg->ref, 1); 3808 atomic_set(&cfqg->ref, 1);
3800 rcu_read_lock(); 3809 rcu_read_lock();
3801 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 3810 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3802 0); 3811 0);
3803 rcu_read_unlock(); 3812 rcu_read_unlock();
3804 #endif 3813 #endif
3805 /* 3814 /*
3806 * Not strictly needed (since RB_ROOT just clears the node and we 3815 * Not strictly needed (since RB_ROOT just clears the node and we
3807 * zeroed cfqd on alloc), but better be safe in case someone decides 3816 * zeroed cfqd on alloc), but better be safe in case someone decides
3808 * to add magic to the rb code 3817 * to add magic to the rb code
3809 */ 3818 */
3810 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3819 for (i = 0; i < CFQ_PRIO_LISTS; i++)
3811 cfqd->prio_trees[i] = RB_ROOT; 3820 cfqd->prio_trees[i] = RB_ROOT;
3812 3821
3813 /* 3822 /*
3814 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3823 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3815 * Grab a permanent reference to it, so that the normal code flow 3824 * Grab a permanent reference to it, so that the normal code flow
3816 * will not attempt to free it. 3825 * will not attempt to free it.
3817 */ 3826 */
3818 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3827 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3819 atomic_inc(&cfqd->oom_cfqq.ref); 3828 atomic_inc(&cfqd->oom_cfqq.ref);
3820 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3829 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3821 3830
3822 INIT_LIST_HEAD(&cfqd->cic_list); 3831 INIT_LIST_HEAD(&cfqd->cic_list);
3823 3832
3824 cfqd->queue = q; 3833 cfqd->queue = q;
3825 3834
3826 init_timer(&cfqd->idle_slice_timer); 3835 init_timer(&cfqd->idle_slice_timer);
3827 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3836 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3828 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3837 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3829 3838
3830 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3839 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3831 3840
3832 cfqd->cfq_quantum = cfq_quantum; 3841 cfqd->cfq_quantum = cfq_quantum;
3833 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3842 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3834 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3843 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3835 cfqd->cfq_back_max = cfq_back_max; 3844 cfqd->cfq_back_max = cfq_back_max;
3836 cfqd->cfq_back_penalty = cfq_back_penalty; 3845 cfqd->cfq_back_penalty = cfq_back_penalty;
3837 cfqd->cfq_slice[0] = cfq_slice_async; 3846 cfqd->cfq_slice[0] = cfq_slice_async;
3838 cfqd->cfq_slice[1] = cfq_slice_sync; 3847 cfqd->cfq_slice[1] = cfq_slice_sync;
3839 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3848 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3840 cfqd->cfq_slice_idle = cfq_slice_idle; 3849 cfqd->cfq_slice_idle = cfq_slice_idle;
3841 cfqd->cfq_latency = 1; 3850 cfqd->cfq_latency = 1;
3842 cfqd->cfq_group_isolation = 0; 3851 cfqd->cfq_group_isolation = 0;
3843 cfqd->hw_tag = -1; 3852 cfqd->hw_tag = -1;
3844 /* 3853 /*
3845 * we optimistically start assuming sync ops weren't delayed in last 3854 * we optimistically start assuming sync ops weren't delayed in last
3846 * second, in order to have larger depth for async operations. 3855 * second, in order to have larger depth for async operations.
3847 */ 3856 */
3848 cfqd->last_delayed_sync = jiffies - HZ; 3857 cfqd->last_delayed_sync = jiffies - HZ;
3849 return cfqd; 3858 return cfqd;
3850 } 3859 }
3851 3860
3852 static void cfq_slab_kill(void) 3861 static void cfq_slab_kill(void)
3853 { 3862 {
3854 /* 3863 /*
3855 * Caller already ensured that pending RCU callbacks are completed, 3864 * Caller already ensured that pending RCU callbacks are completed,
3856 * so we should have no busy allocations at this point. 3865 * so we should have no busy allocations at this point.
3857 */ 3866 */
3858 if (cfq_pool) 3867 if (cfq_pool)
3859 kmem_cache_destroy(cfq_pool); 3868 kmem_cache_destroy(cfq_pool);
3860 if (cfq_ioc_pool) 3869 if (cfq_ioc_pool)
3861 kmem_cache_destroy(cfq_ioc_pool); 3870 kmem_cache_destroy(cfq_ioc_pool);
3862 } 3871 }
3863 3872
3864 static int __init cfq_slab_setup(void) 3873 static int __init cfq_slab_setup(void)
3865 { 3874 {
3866 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3875 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3867 if (!cfq_pool) 3876 if (!cfq_pool)
3868 goto fail; 3877 goto fail;
3869 3878
3870 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3879 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3871 if (!cfq_ioc_pool) 3880 if (!cfq_ioc_pool)
3872 goto fail; 3881 goto fail;
3873 3882
3874 return 0; 3883 return 0;
3875 fail: 3884 fail:
3876 cfq_slab_kill(); 3885 cfq_slab_kill();
3877 return -ENOMEM; 3886 return -ENOMEM;
3878 } 3887 }

/*
 * sysfs parts below -->
 */
static ssize_t
cfq_var_show(unsigned int var, char *page)
{
	return sprintf(page, "%d\n", var);
}

static ssize_t
cfq_var_store(unsigned int *var, const char *page, size_t count)
{
	char *p = (char *) page;

	*var = simple_strtoul(p, &p, 10);
	return count;
}

#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
	struct cfq_data *cfqd = e->elevator_data; \
	unsigned int __data = __VAR; \
	if (__CONV) \
		__data = jiffies_to_msecs(__data); \
	return cfq_var_show(__data, (page)); \
}
SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
#undef SHOW_FUNCTION
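
Each SHOW_FUNCTION() instantiation above defines one sysfs read handler for a
tunable.  As an illustration, SHOW_FUNCTION(cfq_slice_idle_show,
cfqd->cfq_slice_idle, 1) expands to roughly:

	static ssize_t cfq_slice_idle_show(struct elevator_queue *e, char *page)
	{
		struct cfq_data *cfqd = e->elevator_data;
		unsigned int __data = cfqd->cfq_slice_idle;
		if (1)	/* __CONV: value is kept in jiffies, reported in msecs */
			__data = jiffies_to_msecs(__data);
		return cfq_var_show(__data, (page));
	}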

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
	struct cfq_data *cfqd = e->elevator_data; \
	unsigned int __data; \
	int ret = cfq_var_store(&__data, (page), count); \
	if (__data < (MIN)) \
		__data = (MIN); \
	else if (__data > (MAX)) \
		__data = (MAX); \
	if (__CONV) \
		*(__PTR) = msecs_to_jiffies(__data); \
	else \
		*(__PTR) = __data; \
	return ret; \
}
STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
		UINT_MAX, 1);
STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
		UINT_MAX, 0);
STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
#undef STORE_FUNCTION
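
Correspondingly, each STORE_FUNCTION() instantiation defines the write handler:
the parsed value is clamped to [MIN, MAX] and, when __CONV is set, converted
from milliseconds to jiffies before being stored.  For example,
STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1)
expands to roughly:

	static ssize_t cfq_slice_sync_store(struct elevator_queue *e, const char *page, size_t count)
	{
		struct cfq_data *cfqd = e->elevator_data;
		unsigned int __data;
		int ret = cfq_var_store(&__data, (page), count);
		if (__data < (1))
			__data = (1);
		else if (__data > (UINT_MAX))
			__data = (UINT_MAX);
		if (1)	/* __CONV: user writes msecs, value stored in jiffies */
			*(&cfqd->cfq_slice[1]) = msecs_to_jiffies(__data);
		else
			*(&cfqd->cfq_slice[1]) = __data;
		return ret;
	}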

#define CFQ_ATTR(name) \
	__ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)

static struct elv_fs_entry cfq_attrs[] = {
	CFQ_ATTR(quantum),
	CFQ_ATTR(fifo_expire_sync),
	CFQ_ATTR(fifo_expire_async),
	CFQ_ATTR(back_seek_max),
	CFQ_ATTR(back_seek_penalty),
	CFQ_ATTR(slice_sync),
	CFQ_ATTR(slice_async),
	CFQ_ATTR(slice_async_rq),
	CFQ_ATTR(slice_idle),
	CFQ_ATTR(low_latency),
	CFQ_ATTR(group_isolation),
	__ATTR_NULL
};
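
Each CFQ_ATTR(name) entry ties a tunable to the show/store pair generated above,
and the resulting files are exposed in the elevator's sysfs directory (typically
/sys/block/<dev>/queue/iosched/<name>).  Going through the generic __ATTR()
initializer, CFQ_ATTR(quantum) expands to roughly the following elv_fs_entry
initializer (a sketch, not part of this patch):

	{
		.attr	= { .name = "quantum", .mode = S_IRUGO | S_IWUSR },
		.show	= cfq_quantum_show,
		.store	= cfq_quantum_store,
	},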

static struct elevator_type iosched_cfq = {
	.ops = {
		.elevator_merge_fn = cfq_merge,
		.elevator_merged_fn = cfq_merged_request,
		.elevator_merge_req_fn = cfq_merged_requests,
		.elevator_allow_merge_fn = cfq_allow_merge,
		.elevator_bio_merged_fn = cfq_bio_merged,
		.elevator_dispatch_fn = cfq_dispatch_requests,
		.elevator_add_req_fn = cfq_insert_request,
		.elevator_activate_req_fn = cfq_activate_request,
		.elevator_deactivate_req_fn = cfq_deactivate_request,
		.elevator_queue_empty_fn = cfq_queue_empty,
		.elevator_completed_req_fn = cfq_completed_request,
		.elevator_former_req_fn = elv_rb_former_request,
		.elevator_latter_req_fn = elv_rb_latter_request,
		.elevator_set_req_fn = cfq_set_request,
		.elevator_put_req_fn = cfq_put_request,
		.elevator_may_queue_fn = cfq_may_queue,
		.elevator_init_fn = cfq_init_queue,
		.elevator_exit_fn = cfq_exit_queue,
		.trim = cfq_free_io_context,
	},
	.elevator_attrs = cfq_attrs,
	.elevator_name = "cfq",
	.elevator_owner = THIS_MODULE,
};

#ifdef CONFIG_CFQ_GROUP_IOSCHED
static struct blkio_policy_type blkio_policy_cfq = {
	.ops = {
		.blkio_unlink_group_fn = cfq_unlink_blkio_group,
		.blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
	},
};
#else
static struct blkio_policy_type blkio_policy_cfq;
#endif
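
When CONFIG_CFQ_GROUP_IOSCHED is not set, blkio_policy_cfq is left as an empty,
zero-filled definition, presumably so that the blkio_policy_register() and
blkio_policy_unregister() calls in cfq_init() and cfq_exit() below compile
unchanged in either configuration.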

static int __init cfq_init(void)
{
	/*
	 * could be 0 on HZ < 1000 setups
	 */
	if (!cfq_slice_async)
		cfq_slice_async = 1;
	if (!cfq_slice_idle)
		cfq_slice_idle = 1;

	if (cfq_slab_setup())
		return -ENOMEM;

	elv_register(&iosched_cfq);
	blkio_policy_register(&blkio_policy_cfq);

	return 0;
}

static void __exit cfq_exit(void)
{
	DECLARE_COMPLETION_ONSTACK(all_gone);
	blkio_policy_unregister(&blkio_policy_cfq);
	elv_unregister(&iosched_cfq);
	ioc_gone = &all_gone;
	/* ioc_gone's update must be visible before reading ioc_count */
	smp_wmb();

	/*
	 * this also protects us from entering cfq_slab_kill() with
	 * pending RCU callbacks
	 */
	if (elv_ioc_count_read(cfq_ioc_count))
		wait_for_completion(&all_gone);
	ida_destroy(&cic_index_ida);
	cfq_slab_kill();
}
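
Note that cfq_exit() tears down in the reverse order of cfq_init(): the blkio
policy and the elevator are unregistered first, and only if cfq_io_context
objects are still outstanding (their frees go through RCU) does it wait on the
all_gone completion before destroying the slab caches in cfq_slab_kill().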

module_init(cfq_init);
module_exit(cfq_exit);

MODULE_AUTHOR("Jens Axboe");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");