Commit 3440c49f5c5ecb4f29b0544aa87da71888404f8f

Authored by Divyesh Shah
Committed by Jens Axboe
1 parent 02246c4117

cfq-iosched: Fix the incorrect timeslice accounting with forced_dispatch

When CFQ dispatches requests forcefully, due to a barrier or a change of iosched,
it runs through all cfqqs, dispatching their requests, and then expires each queue.
However, it does not activate a cfqq before flushing its IOs, so stale values are
used when computing slice_used.
This patch fixes that by activating the queue before flushing requests from
each queue.

This matters mostly for barrier requests, because when the iosched is changing it
really doesn't matter if the accounting is briefly incorrect, since we're going to
tear down all the structures anyway.

We also now expire the current timeslice before moving on with the dispatch,
to accurately account the slice used for that cfqq.
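
In code, the reworked forced-dispatch path looks roughly like the sketch below. This is only an illustration of the description above, assuming the existing cfq helpers cfq_slice_expired(), cfq_get_next_queue_forced(), __cfq_set_active_queue() and __cfq_forced_dispatch_cfqq(); the actual hunk may differ.

	static int cfq_forced_dispatch(struct cfq_data *cfqd)
	{
		struct cfq_queue *cfqq;
		int dispatched = 0;

		/* Expire the current timeslice first, so its usage is accounted now */
		cfq_slice_expired(cfqd, 0);

		while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
			/*
			 * Activate the queue before flushing its requests, so the
			 * later slice_used computation sees fresh, not stale, values.
			 */
			__cfq_set_active_queue(cfqd, cfqq);
			dispatched += __cfq_forced_dispatch_cfqq(cfqq);
		}

		BUG_ON(cfqd->busy_queues);

		cfq_log(cfqd, "forced_dispatch=%d", dispatched);
		return dispatched;
	}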

Signed-off-by: Divyesh Shah <dpshah@google.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 1 changed file with 5 additions and 2 deletions

1 /* 1 /*
2 * CFQ, or complete fairness queueing, disk scheduler. 2 * CFQ, or complete fairness queueing, disk scheduler.
3 * 3 *
4 * Based on ideas from a previously unfinished io 4 * Based on ideas from a previously unfinished io
5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli. 5 * scheduler (round robin per-process disk scheduling) and Andrea Arcangeli.
6 * 6 *
7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> 7 * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
8 */ 8 */
9 #include <linux/module.h> 9 #include <linux/module.h>
10 #include <linux/blkdev.h> 10 #include <linux/blkdev.h>
11 #include <linux/elevator.h> 11 #include <linux/elevator.h>
12 #include <linux/jiffies.h> 12 #include <linux/jiffies.h>
13 #include <linux/rbtree.h> 13 #include <linux/rbtree.h>
14 #include <linux/ioprio.h> 14 #include <linux/ioprio.h>
15 #include <linux/blktrace_api.h> 15 #include <linux/blktrace_api.h>
16 #include "blk-cgroup.h" 16 #include "blk-cgroup.h"
17 17
18 /* 18 /*
19 * tunables 19 * tunables
20 */ 20 */
21 /* max queue in one round of service */ 21 /* max queue in one round of service */
22 static const int cfq_quantum = 8; 22 static const int cfq_quantum = 8;
23 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; 23 static const int cfq_fifo_expire[2] = { HZ / 4, HZ / 8 };
24 /* maximum backwards seek, in KiB */ 24 /* maximum backwards seek, in KiB */
25 static const int cfq_back_max = 16 * 1024; 25 static const int cfq_back_max = 16 * 1024;
26 /* penalty of a backwards seek */ 26 /* penalty of a backwards seek */
27 static const int cfq_back_penalty = 2; 27 static const int cfq_back_penalty = 2;
28 static const int cfq_slice_sync = HZ / 10; 28 static const int cfq_slice_sync = HZ / 10;
29 static int cfq_slice_async = HZ / 25; 29 static int cfq_slice_async = HZ / 25;
30 static const int cfq_slice_async_rq = 2; 30 static const int cfq_slice_async_rq = 2;
31 static int cfq_slice_idle = HZ / 125; 31 static int cfq_slice_idle = HZ / 125;
32 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */ 32 static const int cfq_target_latency = HZ * 3/10; /* 300 ms */
33 static const int cfq_hist_divisor = 4; 33 static const int cfq_hist_divisor = 4;
34 34
35 /* 35 /*
36 * offset from end of service tree 36 * offset from end of service tree
37 */ 37 */
38 #define CFQ_IDLE_DELAY (HZ / 5) 38 #define CFQ_IDLE_DELAY (HZ / 5)
39 39
40 /* 40 /*
41 * below this threshold, we consider thinktime immediate 41 * below this threshold, we consider thinktime immediate
42 */ 42 */
43 #define CFQ_MIN_TT (2) 43 #define CFQ_MIN_TT (2)
44 44
45 #define CFQ_SLICE_SCALE (5) 45 #define CFQ_SLICE_SCALE (5)
46 #define CFQ_HW_QUEUE_MIN (5) 46 #define CFQ_HW_QUEUE_MIN (5)
47 #define CFQ_SERVICE_SHIFT 12 47 #define CFQ_SERVICE_SHIFT 12
48 48
49 #define CFQQ_SEEK_THR (sector_t)(8 * 100) 49 #define CFQQ_SEEK_THR (sector_t)(8 * 100)
50 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024) 50 #define CFQQ_CLOSE_THR (sector_t)(8 * 1024)
51 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32) 51 #define CFQQ_SECT_THR_NONROT (sector_t)(2 * 32)
52 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8) 52 #define CFQQ_SEEKY(cfqq) (hweight32(cfqq->seek_history) > 32/8)
53 53
54 #define RQ_CIC(rq) \ 54 #define RQ_CIC(rq) \
55 ((struct cfq_io_context *) (rq)->elevator_private) 55 ((struct cfq_io_context *) (rq)->elevator_private)
56 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) 56 #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2)
57 57
58 static struct kmem_cache *cfq_pool; 58 static struct kmem_cache *cfq_pool;
59 static struct kmem_cache *cfq_ioc_pool; 59 static struct kmem_cache *cfq_ioc_pool;
60 60
61 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count); 61 static DEFINE_PER_CPU(unsigned long, cfq_ioc_count);
62 static struct completion *ioc_gone; 62 static struct completion *ioc_gone;
63 static DEFINE_SPINLOCK(ioc_gone_lock); 63 static DEFINE_SPINLOCK(ioc_gone_lock);
64 64
65 #define CFQ_PRIO_LISTS IOPRIO_BE_NR 65 #define CFQ_PRIO_LISTS IOPRIO_BE_NR
66 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE) 66 #define cfq_class_idle(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
67 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT) 67 #define cfq_class_rt(cfqq) ((cfqq)->ioprio_class == IOPRIO_CLASS_RT)
68 68
69 #define sample_valid(samples) ((samples) > 80) 69 #define sample_valid(samples) ((samples) > 80)
70 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node) 70 #define rb_entry_cfqg(node) rb_entry((node), struct cfq_group, rb_node)
71 71
72 /* 72 /*
73 * Most of our rbtree usage is for sorting with min extraction, so 73 * Most of our rbtree usage is for sorting with min extraction, so
74 * if we cache the leftmost node we don't have to walk down the tree 74 * if we cache the leftmost node we don't have to walk down the tree
75 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should 75 * to find it. Idea borrowed from Ingo Molnars CFS scheduler. We should
76 * move this into the elevator for the rq sorting as well. 76 * move this into the elevator for the rq sorting as well.
77 */ 77 */
78 struct cfq_rb_root { 78 struct cfq_rb_root {
79 struct rb_root rb; 79 struct rb_root rb;
80 struct rb_node *left; 80 struct rb_node *left;
81 unsigned count; 81 unsigned count;
82 unsigned total_weight; 82 unsigned total_weight;
83 u64 min_vdisktime; 83 u64 min_vdisktime;
84 struct rb_node *active; 84 struct rb_node *active;
85 }; 85 };
86 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \ 86 #define CFQ_RB_ROOT (struct cfq_rb_root) { .rb = RB_ROOT, .left = NULL, \
87 .count = 0, .min_vdisktime = 0, } 87 .count = 0, .min_vdisktime = 0, }
88 88
89 /* 89 /*
90 * Per process-grouping structure 90 * Per process-grouping structure
91 */ 91 */
92 struct cfq_queue { 92 struct cfq_queue {
93 /* reference count */ 93 /* reference count */
94 atomic_t ref; 94 atomic_t ref;
95 /* various state flags, see below */ 95 /* various state flags, see below */
96 unsigned int flags; 96 unsigned int flags;
97 /* parent cfq_data */ 97 /* parent cfq_data */
98 struct cfq_data *cfqd; 98 struct cfq_data *cfqd;
99 /* service_tree member */ 99 /* service_tree member */
100 struct rb_node rb_node; 100 struct rb_node rb_node;
101 /* service_tree key */ 101 /* service_tree key */
102 unsigned long rb_key; 102 unsigned long rb_key;
103 /* prio tree member */ 103 /* prio tree member */
104 struct rb_node p_node; 104 struct rb_node p_node;
105 /* prio tree root we belong to, if any */ 105 /* prio tree root we belong to, if any */
106 struct rb_root *p_root; 106 struct rb_root *p_root;
107 /* sorted list of pending requests */ 107 /* sorted list of pending requests */
108 struct rb_root sort_list; 108 struct rb_root sort_list;
109 /* if fifo isn't expired, next request to serve */ 109 /* if fifo isn't expired, next request to serve */
110 struct request *next_rq; 110 struct request *next_rq;
111 /* requests queued in sort_list */ 111 /* requests queued in sort_list */
112 int queued[2]; 112 int queued[2];
113 /* currently allocated requests */ 113 /* currently allocated requests */
114 int allocated[2]; 114 int allocated[2];
115 /* fifo list of requests in sort_list */ 115 /* fifo list of requests in sort_list */
116 struct list_head fifo; 116 struct list_head fifo;
117 117
118 /* time when queue got scheduled in to dispatch first request. */ 118 /* time when queue got scheduled in to dispatch first request. */
119 unsigned long dispatch_start; 119 unsigned long dispatch_start;
120 unsigned int allocated_slice; 120 unsigned int allocated_slice;
121 unsigned int slice_dispatch; 121 unsigned int slice_dispatch;
122 /* time when first request from queue completed and slice started. */ 122 /* time when first request from queue completed and slice started. */
123 unsigned long slice_start; 123 unsigned long slice_start;
124 unsigned long slice_end; 124 unsigned long slice_end;
125 long slice_resid; 125 long slice_resid;
126 126
127 /* pending metadata requests */ 127 /* pending metadata requests */
128 int meta_pending; 128 int meta_pending;
129 /* number of requests that are on the dispatch list or inside driver */ 129 /* number of requests that are on the dispatch list or inside driver */
130 int dispatched; 130 int dispatched;
131 131
132 /* io prio of this group */ 132 /* io prio of this group */
133 unsigned short ioprio, org_ioprio; 133 unsigned short ioprio, org_ioprio;
134 unsigned short ioprio_class, org_ioprio_class; 134 unsigned short ioprio_class, org_ioprio_class;
135 135
136 pid_t pid; 136 pid_t pid;
137 137
138 u32 seek_history; 138 u32 seek_history;
139 sector_t last_request_pos; 139 sector_t last_request_pos;
140 140
141 struct cfq_rb_root *service_tree; 141 struct cfq_rb_root *service_tree;
142 struct cfq_queue *new_cfqq; 142 struct cfq_queue *new_cfqq;
143 struct cfq_group *cfqg; 143 struct cfq_group *cfqg;
144 struct cfq_group *orig_cfqg; 144 struct cfq_group *orig_cfqg;
145 /* Sectors dispatched in current dispatch round */ 145 /* Sectors dispatched in current dispatch round */
146 unsigned long nr_sectors; 146 unsigned long nr_sectors;
147 }; 147 };
148 148
149 /* 149 /*
150 * First index in the service_trees. 150 * First index in the service_trees.
151 * IDLE is handled separately, so it has negative index 151 * IDLE is handled separately, so it has negative index
152 */ 152 */
153 enum wl_prio_t { 153 enum wl_prio_t {
154 BE_WORKLOAD = 0, 154 BE_WORKLOAD = 0,
155 RT_WORKLOAD = 1, 155 RT_WORKLOAD = 1,
156 IDLE_WORKLOAD = 2, 156 IDLE_WORKLOAD = 2,
157 }; 157 };
158 158
159 /* 159 /*
160 * Second index in the service_trees. 160 * Second index in the service_trees.
161 */ 161 */
162 enum wl_type_t { 162 enum wl_type_t {
163 ASYNC_WORKLOAD = 0, 163 ASYNC_WORKLOAD = 0,
164 SYNC_NOIDLE_WORKLOAD = 1, 164 SYNC_NOIDLE_WORKLOAD = 1,
165 SYNC_WORKLOAD = 2 165 SYNC_WORKLOAD = 2
166 }; 166 };
167 167
168 /* This is per cgroup per device grouping structure */ 168 /* This is per cgroup per device grouping structure */
169 struct cfq_group { 169 struct cfq_group {
170 /* group service_tree member */ 170 /* group service_tree member */
171 struct rb_node rb_node; 171 struct rb_node rb_node;
172 172
173 /* group service_tree key */ 173 /* group service_tree key */
174 u64 vdisktime; 174 u64 vdisktime;
175 unsigned int weight; 175 unsigned int weight;
176 bool on_st; 176 bool on_st;
177 177
178 /* number of cfqq currently on this group */ 178 /* number of cfqq currently on this group */
179 int nr_cfqq; 179 int nr_cfqq;
180 180
181 /* Per group busy queus average. Useful for workload slice calc. */ 181 /* Per group busy queus average. Useful for workload slice calc. */
182 unsigned int busy_queues_avg[2]; 182 unsigned int busy_queues_avg[2];
183 /* 183 /*
184 * rr lists of queues with requests, onle rr for each priority class. 184 * rr lists of queues with requests, onle rr for each priority class.
185 * Counts are embedded in the cfq_rb_root 185 * Counts are embedded in the cfq_rb_root
186 */ 186 */
187 struct cfq_rb_root service_trees[2][3]; 187 struct cfq_rb_root service_trees[2][3];
188 struct cfq_rb_root service_tree_idle; 188 struct cfq_rb_root service_tree_idle;
189 189
190 unsigned long saved_workload_slice; 190 unsigned long saved_workload_slice;
191 enum wl_type_t saved_workload; 191 enum wl_type_t saved_workload;
192 enum wl_prio_t saved_serving_prio; 192 enum wl_prio_t saved_serving_prio;
193 struct blkio_group blkg; 193 struct blkio_group blkg;
194 #ifdef CONFIG_CFQ_GROUP_IOSCHED 194 #ifdef CONFIG_CFQ_GROUP_IOSCHED
195 struct hlist_node cfqd_node; 195 struct hlist_node cfqd_node;
196 atomic_t ref; 196 atomic_t ref;
197 #endif 197 #endif
198 }; 198 };
199 199
200 /* 200 /*
201 * Per block device queue structure 201 * Per block device queue structure
202 */ 202 */
203 struct cfq_data { 203 struct cfq_data {
204 struct request_queue *queue; 204 struct request_queue *queue;
205 /* Root service tree for cfq_groups */ 205 /* Root service tree for cfq_groups */
206 struct cfq_rb_root grp_service_tree; 206 struct cfq_rb_root grp_service_tree;
207 struct cfq_group root_group; 207 struct cfq_group root_group;
208 208
209 /* 209 /*
210 * The priority currently being served 210 * The priority currently being served
211 */ 211 */
212 enum wl_prio_t serving_prio; 212 enum wl_prio_t serving_prio;
213 enum wl_type_t serving_type; 213 enum wl_type_t serving_type;
214 unsigned long workload_expires; 214 unsigned long workload_expires;
215 struct cfq_group *serving_group; 215 struct cfq_group *serving_group;
216 bool noidle_tree_requires_idle; 216 bool noidle_tree_requires_idle;
217 217
218 /* 218 /*
219 * Each priority tree is sorted by next_request position. These 219 * Each priority tree is sorted by next_request position. These
220 * trees are used when determining if two or more queues are 220 * trees are used when determining if two or more queues are
221 * interleaving requests (see cfq_close_cooperator). 221 * interleaving requests (see cfq_close_cooperator).
222 */ 222 */
223 struct rb_root prio_trees[CFQ_PRIO_LISTS]; 223 struct rb_root prio_trees[CFQ_PRIO_LISTS];
224 224
225 unsigned int busy_queues; 225 unsigned int busy_queues;
226 226
227 int rq_in_driver; 227 int rq_in_driver;
228 int rq_in_flight[2]; 228 int rq_in_flight[2];
229 229
230 /* 230 /*
231 * queue-depth detection 231 * queue-depth detection
232 */ 232 */
233 int rq_queued; 233 int rq_queued;
234 int hw_tag; 234 int hw_tag;
235 /* 235 /*
236 * hw_tag can be 236 * hw_tag can be
237 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection) 237 * -1 => indeterminate, (cfq will behave as if NCQ is present, to allow better detection)
238 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth) 238 * 1 => NCQ is present (hw_tag_est_depth is the estimated max depth)
239 * 0 => no NCQ 239 * 0 => no NCQ
240 */ 240 */
241 int hw_tag_est_depth; 241 int hw_tag_est_depth;
242 unsigned int hw_tag_samples; 242 unsigned int hw_tag_samples;
243 243
244 /* 244 /*
245 * idle window management 245 * idle window management
246 */ 246 */
247 struct timer_list idle_slice_timer; 247 struct timer_list idle_slice_timer;
248 struct work_struct unplug_work; 248 struct work_struct unplug_work;
249 249
250 struct cfq_queue *active_queue; 250 struct cfq_queue *active_queue;
251 struct cfq_io_context *active_cic; 251 struct cfq_io_context *active_cic;
252 252
253 /* 253 /*
254 * async queue for each priority case 254 * async queue for each priority case
255 */ 255 */
256 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR]; 256 struct cfq_queue *async_cfqq[2][IOPRIO_BE_NR];
257 struct cfq_queue *async_idle_cfqq; 257 struct cfq_queue *async_idle_cfqq;
258 258
259 sector_t last_position; 259 sector_t last_position;
260 260
261 /* 261 /*
262 * tunables, see top of file 262 * tunables, see top of file
263 */ 263 */
264 unsigned int cfq_quantum; 264 unsigned int cfq_quantum;
265 unsigned int cfq_fifo_expire[2]; 265 unsigned int cfq_fifo_expire[2];
266 unsigned int cfq_back_penalty; 266 unsigned int cfq_back_penalty;
267 unsigned int cfq_back_max; 267 unsigned int cfq_back_max;
268 unsigned int cfq_slice[2]; 268 unsigned int cfq_slice[2];
269 unsigned int cfq_slice_async_rq; 269 unsigned int cfq_slice_async_rq;
270 unsigned int cfq_slice_idle; 270 unsigned int cfq_slice_idle;
271 unsigned int cfq_latency; 271 unsigned int cfq_latency;
272 unsigned int cfq_group_isolation; 272 unsigned int cfq_group_isolation;
273 273
274 struct list_head cic_list; 274 struct list_head cic_list;
275 275
276 /* 276 /*
277 * Fallback dummy cfqq for extreme OOM conditions 277 * Fallback dummy cfqq for extreme OOM conditions
278 */ 278 */
279 struct cfq_queue oom_cfqq; 279 struct cfq_queue oom_cfqq;
280 280
281 unsigned long last_delayed_sync; 281 unsigned long last_delayed_sync;
282 282
283 /* List of cfq groups being managed on this device*/ 283 /* List of cfq groups being managed on this device*/
284 struct hlist_head cfqg_list; 284 struct hlist_head cfqg_list;
285 struct rcu_head rcu; 285 struct rcu_head rcu;
286 }; 286 };
287 287
288 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd); 288 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd);
289 289
290 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg, 290 static struct cfq_rb_root *service_tree_for(struct cfq_group *cfqg,
291 enum wl_prio_t prio, 291 enum wl_prio_t prio,
292 enum wl_type_t type) 292 enum wl_type_t type)
293 { 293 {
294 if (!cfqg) 294 if (!cfqg)
295 return NULL; 295 return NULL;
296 296
297 if (prio == IDLE_WORKLOAD) 297 if (prio == IDLE_WORKLOAD)
298 return &cfqg->service_tree_idle; 298 return &cfqg->service_tree_idle;
299 299
300 return &cfqg->service_trees[prio][type]; 300 return &cfqg->service_trees[prio][type];
301 } 301 }
302 302
303 enum cfqq_state_flags { 303 enum cfqq_state_flags {
304 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */ 304 CFQ_CFQQ_FLAG_on_rr = 0, /* on round-robin busy list */
305 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */ 305 CFQ_CFQQ_FLAG_wait_request, /* waiting for a request */
306 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */ 306 CFQ_CFQQ_FLAG_must_dispatch, /* must be allowed a dispatch */
307 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */ 307 CFQ_CFQQ_FLAG_must_alloc_slice, /* per-slice must_alloc flag */
308 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ 308 CFQ_CFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */
309 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */ 309 CFQ_CFQQ_FLAG_idle_window, /* slice idling enabled */
310 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */ 310 CFQ_CFQQ_FLAG_prio_changed, /* task priority has changed */
311 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ 311 CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */
312 CFQ_CFQQ_FLAG_sync, /* synchronous queue */ 312 CFQ_CFQQ_FLAG_sync, /* synchronous queue */
313 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ 313 CFQ_CFQQ_FLAG_coop, /* cfqq is shared */
314 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ 314 CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */
315 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ 315 CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */
316 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ 316 CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */
317 }; 317 };
318 318
319 #define CFQ_CFQQ_FNS(name) \ 319 #define CFQ_CFQQ_FNS(name) \
320 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \ 320 static inline void cfq_mark_cfqq_##name(struct cfq_queue *cfqq) \
321 { \ 321 { \
322 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \ 322 (cfqq)->flags |= (1 << CFQ_CFQQ_FLAG_##name); \
323 } \ 323 } \
324 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \ 324 static inline void cfq_clear_cfqq_##name(struct cfq_queue *cfqq) \
325 { \ 325 { \
326 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \ 326 (cfqq)->flags &= ~(1 << CFQ_CFQQ_FLAG_##name); \
327 } \ 327 } \
328 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \ 328 static inline int cfq_cfqq_##name(const struct cfq_queue *cfqq) \
329 { \ 329 { \
330 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \ 330 return ((cfqq)->flags & (1 << CFQ_CFQQ_FLAG_##name)) != 0; \
331 } 331 }
332 332
333 CFQ_CFQQ_FNS(on_rr); 333 CFQ_CFQQ_FNS(on_rr);
334 CFQ_CFQQ_FNS(wait_request); 334 CFQ_CFQQ_FNS(wait_request);
335 CFQ_CFQQ_FNS(must_dispatch); 335 CFQ_CFQQ_FNS(must_dispatch);
336 CFQ_CFQQ_FNS(must_alloc_slice); 336 CFQ_CFQQ_FNS(must_alloc_slice);
337 CFQ_CFQQ_FNS(fifo_expire); 337 CFQ_CFQQ_FNS(fifo_expire);
338 CFQ_CFQQ_FNS(idle_window); 338 CFQ_CFQQ_FNS(idle_window);
339 CFQ_CFQQ_FNS(prio_changed); 339 CFQ_CFQQ_FNS(prio_changed);
340 CFQ_CFQQ_FNS(slice_new); 340 CFQ_CFQQ_FNS(slice_new);
341 CFQ_CFQQ_FNS(sync); 341 CFQ_CFQQ_FNS(sync);
342 CFQ_CFQQ_FNS(coop); 342 CFQ_CFQQ_FNS(coop);
343 CFQ_CFQQ_FNS(split_coop); 343 CFQ_CFQQ_FNS(split_coop);
344 CFQ_CFQQ_FNS(deep); 344 CFQ_CFQQ_FNS(deep);
345 CFQ_CFQQ_FNS(wait_busy); 345 CFQ_CFQQ_FNS(wait_busy);
346 #undef CFQ_CFQQ_FNS 346 #undef CFQ_CFQQ_FNS
347 347
348 #ifdef CONFIG_DEBUG_CFQ_IOSCHED 348 #ifdef CONFIG_DEBUG_CFQ_IOSCHED
349 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 349 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \ 350 blk_add_trace_msg((cfqd)->queue, "cfq%d%c %s " fmt, (cfqq)->pid, \
351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \ 351 cfq_cfqq_sync((cfqq)) ? 'S' : 'A', \
352 blkg_path(&(cfqq)->cfqg->blkg), ##args); 352 blkg_path(&(cfqq)->cfqg->blkg), ##args);
353 353
354 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \ 354 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) \
355 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \ 355 blk_add_trace_msg((cfqd)->queue, "%s " fmt, \
356 blkg_path(&(cfqg)->blkg), ##args); \ 356 blkg_path(&(cfqg)->blkg), ##args); \
357 357
358 #else 358 #else
359 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \ 359 #define cfq_log_cfqq(cfqd, cfqq, fmt, args...) \
360 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args) 360 blk_add_trace_msg((cfqd)->queue, "cfq%d " fmt, (cfqq)->pid, ##args)
361 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0); 361 #define cfq_log_cfqg(cfqd, cfqg, fmt, args...) do {} while (0);
362 #endif 362 #endif
363 #define cfq_log(cfqd, fmt, args...) \ 363 #define cfq_log(cfqd, fmt, args...) \
364 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args) 364 blk_add_trace_msg((cfqd)->queue, "cfq " fmt, ##args)
365 365
366 /* Traverses through cfq group service trees */ 366 /* Traverses through cfq group service trees */
367 #define for_each_cfqg_st(cfqg, i, j, st) \ 367 #define for_each_cfqg_st(cfqg, i, j, st) \
368 for (i = 0; i <= IDLE_WORKLOAD; i++) \ 368 for (i = 0; i <= IDLE_WORKLOAD; i++) \
369 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\ 369 for (j = 0, st = i < IDLE_WORKLOAD ? &cfqg->service_trees[i][j]\
370 : &cfqg->service_tree_idle; \ 370 : &cfqg->service_tree_idle; \
371 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \ 371 (i < IDLE_WORKLOAD && j <= SYNC_WORKLOAD) || \
372 (i == IDLE_WORKLOAD && j == 0); \ 372 (i == IDLE_WORKLOAD && j == 0); \
373 j++, st = i < IDLE_WORKLOAD ? \ 373 j++, st = i < IDLE_WORKLOAD ? \
374 &cfqg->service_trees[i][j]: NULL) \ 374 &cfqg->service_trees[i][j]: NULL) \
375 375
376 376
377 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq) 377 static inline enum wl_prio_t cfqq_prio(struct cfq_queue *cfqq)
378 { 378 {
379 if (cfq_class_idle(cfqq)) 379 if (cfq_class_idle(cfqq))
380 return IDLE_WORKLOAD; 380 return IDLE_WORKLOAD;
381 if (cfq_class_rt(cfqq)) 381 if (cfq_class_rt(cfqq))
382 return RT_WORKLOAD; 382 return RT_WORKLOAD;
383 return BE_WORKLOAD; 383 return BE_WORKLOAD;
384 } 384 }
385 385
386 386
387 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq) 387 static enum wl_type_t cfqq_type(struct cfq_queue *cfqq)
388 { 388 {
389 if (!cfq_cfqq_sync(cfqq)) 389 if (!cfq_cfqq_sync(cfqq))
390 return ASYNC_WORKLOAD; 390 return ASYNC_WORKLOAD;
391 if (!cfq_cfqq_idle_window(cfqq)) 391 if (!cfq_cfqq_idle_window(cfqq))
392 return SYNC_NOIDLE_WORKLOAD; 392 return SYNC_NOIDLE_WORKLOAD;
393 return SYNC_WORKLOAD; 393 return SYNC_WORKLOAD;
394 } 394 }
395 395
396 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl, 396 static inline int cfq_group_busy_queues_wl(enum wl_prio_t wl,
397 struct cfq_data *cfqd, 397 struct cfq_data *cfqd,
398 struct cfq_group *cfqg) 398 struct cfq_group *cfqg)
399 { 399 {
400 if (wl == IDLE_WORKLOAD) 400 if (wl == IDLE_WORKLOAD)
401 return cfqg->service_tree_idle.count; 401 return cfqg->service_tree_idle.count;
402 402
403 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count 403 return cfqg->service_trees[wl][ASYNC_WORKLOAD].count
404 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count 404 + cfqg->service_trees[wl][SYNC_NOIDLE_WORKLOAD].count
405 + cfqg->service_trees[wl][SYNC_WORKLOAD].count; 405 + cfqg->service_trees[wl][SYNC_WORKLOAD].count;
406 } 406 }
407 407
408 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd, 408 static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
409 struct cfq_group *cfqg) 409 struct cfq_group *cfqg)
410 { 410 {
411 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count 411 return cfqg->service_trees[RT_WORKLOAD][ASYNC_WORKLOAD].count
412 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count; 412 + cfqg->service_trees[BE_WORKLOAD][ASYNC_WORKLOAD].count;
413 } 413 }
414 414
415 static void cfq_dispatch_insert(struct request_queue *, struct request *); 415 static void cfq_dispatch_insert(struct request_queue *, struct request *);
416 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool, 416 static struct cfq_queue *cfq_get_queue(struct cfq_data *, bool,
417 struct io_context *, gfp_t); 417 struct io_context *, gfp_t);
418 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *, 418 static struct cfq_io_context *cfq_cic_lookup(struct cfq_data *,
419 struct io_context *); 419 struct io_context *);
420 420
421 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic, 421 static inline struct cfq_queue *cic_to_cfqq(struct cfq_io_context *cic,
422 bool is_sync) 422 bool is_sync)
423 { 423 {
424 return cic->cfqq[is_sync]; 424 return cic->cfqq[is_sync];
425 } 425 }
426 426
427 static inline void cic_set_cfqq(struct cfq_io_context *cic, 427 static inline void cic_set_cfqq(struct cfq_io_context *cic,
428 struct cfq_queue *cfqq, bool is_sync) 428 struct cfq_queue *cfqq, bool is_sync)
429 { 429 {
430 cic->cfqq[is_sync] = cfqq; 430 cic->cfqq[is_sync] = cfqq;
431 } 431 }
432 432
433 /* 433 /*
434 * We regard a request as SYNC, if it's either a read or has the SYNC bit 434 * We regard a request as SYNC, if it's either a read or has the SYNC bit
435 * set (in which case it could also be direct WRITE). 435 * set (in which case it could also be direct WRITE).
436 */ 436 */
437 static inline bool cfq_bio_sync(struct bio *bio) 437 static inline bool cfq_bio_sync(struct bio *bio)
438 { 438 {
439 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO); 439 return bio_data_dir(bio) == READ || bio_rw_flagged(bio, BIO_RW_SYNCIO);
440 } 440 }
441 441
442 /* 442 /*
443 * scheduler run of queue, if there are requests pending and no one in the 443 * scheduler run of queue, if there are requests pending and no one in the
444 * driver that will restart queueing 444 * driver that will restart queueing
445 */ 445 */
446 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd) 446 static inline void cfq_schedule_dispatch(struct cfq_data *cfqd)
447 { 447 {
448 if (cfqd->busy_queues) { 448 if (cfqd->busy_queues) {
449 cfq_log(cfqd, "schedule dispatch"); 449 cfq_log(cfqd, "schedule dispatch");
450 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work); 450 kblockd_schedule_work(cfqd->queue, &cfqd->unplug_work);
451 } 451 }
452 } 452 }
453 453
454 static int cfq_queue_empty(struct request_queue *q) 454 static int cfq_queue_empty(struct request_queue *q)
455 { 455 {
456 struct cfq_data *cfqd = q->elevator->elevator_data; 456 struct cfq_data *cfqd = q->elevator->elevator_data;
457 457
458 return !cfqd->rq_queued; 458 return !cfqd->rq_queued;
459 } 459 }
460 460
461 /* 461 /*
462 * Scale schedule slice based on io priority. Use the sync time slice only 462 * Scale schedule slice based on io priority. Use the sync time slice only
463 * if a queue is marked sync and has sync io queued. A sync queue with async 463 * if a queue is marked sync and has sync io queued. A sync queue with async
464 * io only, should not get full sync slice length. 464 * io only, should not get full sync slice length.
465 */ 465 */
466 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync, 466 static inline int cfq_prio_slice(struct cfq_data *cfqd, bool sync,
467 unsigned short prio) 467 unsigned short prio)
468 { 468 {
469 const int base_slice = cfqd->cfq_slice[sync]; 469 const int base_slice = cfqd->cfq_slice[sync];
470 470
471 WARN_ON(prio >= IOPRIO_BE_NR); 471 WARN_ON(prio >= IOPRIO_BE_NR);
472 472
473 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio)); 473 return base_slice + (base_slice/CFQ_SLICE_SCALE * (4 - prio));
474 } 474 }
475 475
476 static inline int 476 static inline int
477 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 477 cfq_prio_to_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
478 { 478 {
479 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio); 479 return cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio);
480 } 480 }
481 481
482 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg) 482 static inline u64 cfq_scale_slice(unsigned long delta, struct cfq_group *cfqg)
483 { 483 {
484 u64 d = delta << CFQ_SERVICE_SHIFT; 484 u64 d = delta << CFQ_SERVICE_SHIFT;
485 485
486 d = d * BLKIO_WEIGHT_DEFAULT; 486 d = d * BLKIO_WEIGHT_DEFAULT;
487 do_div(d, cfqg->weight); 487 do_div(d, cfqg->weight);
488 return d; 488 return d;
489 } 489 }
490 490
491 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime) 491 static inline u64 max_vdisktime(u64 min_vdisktime, u64 vdisktime)
492 { 492 {
493 s64 delta = (s64)(vdisktime - min_vdisktime); 493 s64 delta = (s64)(vdisktime - min_vdisktime);
494 if (delta > 0) 494 if (delta > 0)
495 min_vdisktime = vdisktime; 495 min_vdisktime = vdisktime;
496 496
497 return min_vdisktime; 497 return min_vdisktime;
498 } 498 }
499 499
500 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime) 500 static inline u64 min_vdisktime(u64 min_vdisktime, u64 vdisktime)
501 { 501 {
502 s64 delta = (s64)(vdisktime - min_vdisktime); 502 s64 delta = (s64)(vdisktime - min_vdisktime);
503 if (delta < 0) 503 if (delta < 0)
504 min_vdisktime = vdisktime; 504 min_vdisktime = vdisktime;
505 505
506 return min_vdisktime; 506 return min_vdisktime;
507 } 507 }
508 508
509 static void update_min_vdisktime(struct cfq_rb_root *st) 509 static void update_min_vdisktime(struct cfq_rb_root *st)
510 { 510 {
511 u64 vdisktime = st->min_vdisktime; 511 u64 vdisktime = st->min_vdisktime;
512 struct cfq_group *cfqg; 512 struct cfq_group *cfqg;
513 513
514 if (st->active) { 514 if (st->active) {
515 cfqg = rb_entry_cfqg(st->active); 515 cfqg = rb_entry_cfqg(st->active);
516 vdisktime = cfqg->vdisktime; 516 vdisktime = cfqg->vdisktime;
517 } 517 }
518 518
519 if (st->left) { 519 if (st->left) {
520 cfqg = rb_entry_cfqg(st->left); 520 cfqg = rb_entry_cfqg(st->left);
521 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime); 521 vdisktime = min_vdisktime(vdisktime, cfqg->vdisktime);
522 } 522 }
523 523
524 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime); 524 st->min_vdisktime = max_vdisktime(st->min_vdisktime, vdisktime);
525 } 525 }
526 526
527 /* 527 /*
528 * get averaged number of queues of RT/BE priority. 528 * get averaged number of queues of RT/BE priority.
529 * average is updated, with a formula that gives more weight to higher numbers, 529 * average is updated, with a formula that gives more weight to higher numbers,
530 * to quickly follows sudden increases and decrease slowly 530 * to quickly follows sudden increases and decrease slowly
531 */ 531 */
532 532
533 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd, 533 static inline unsigned cfq_group_get_avg_queues(struct cfq_data *cfqd,
534 struct cfq_group *cfqg, bool rt) 534 struct cfq_group *cfqg, bool rt)
535 { 535 {
536 unsigned min_q, max_q; 536 unsigned min_q, max_q;
537 unsigned mult = cfq_hist_divisor - 1; 537 unsigned mult = cfq_hist_divisor - 1;
538 unsigned round = cfq_hist_divisor / 2; 538 unsigned round = cfq_hist_divisor / 2;
539 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg); 539 unsigned busy = cfq_group_busy_queues_wl(rt, cfqd, cfqg);
540 540
541 min_q = min(cfqg->busy_queues_avg[rt], busy); 541 min_q = min(cfqg->busy_queues_avg[rt], busy);
542 max_q = max(cfqg->busy_queues_avg[rt], busy); 542 max_q = max(cfqg->busy_queues_avg[rt], busy);
543 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) / 543 cfqg->busy_queues_avg[rt] = (mult * max_q + min_q + round) /
544 cfq_hist_divisor; 544 cfq_hist_divisor;
545 return cfqg->busy_queues_avg[rt]; 545 return cfqg->busy_queues_avg[rt];
546 } 546 }
547 547
548 static inline unsigned 548 static inline unsigned
549 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg) 549 cfq_group_slice(struct cfq_data *cfqd, struct cfq_group *cfqg)
550 { 550 {
551 struct cfq_rb_root *st = &cfqd->grp_service_tree; 551 struct cfq_rb_root *st = &cfqd->grp_service_tree;
552 552
553 return cfq_target_latency * cfqg->weight / st->total_weight; 553 return cfq_target_latency * cfqg->weight / st->total_weight;
554 } 554 }
555 555
556 static inline void 556 static inline void
557 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq) 557 cfq_set_prio_slice(struct cfq_data *cfqd, struct cfq_queue *cfqq)
558 { 558 {
559 unsigned slice = cfq_prio_to_slice(cfqd, cfqq); 559 unsigned slice = cfq_prio_to_slice(cfqd, cfqq);
560 if (cfqd->cfq_latency) { 560 if (cfqd->cfq_latency) {
561 /* 561 /*
562 * interested queues (we consider only the ones with the same 562 * interested queues (we consider only the ones with the same
563 * priority class in the cfq group) 563 * priority class in the cfq group)
564 */ 564 */
565 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg, 565 unsigned iq = cfq_group_get_avg_queues(cfqd, cfqq->cfqg,
566 cfq_class_rt(cfqq)); 566 cfq_class_rt(cfqq));
567 unsigned sync_slice = cfqd->cfq_slice[1]; 567 unsigned sync_slice = cfqd->cfq_slice[1];
568 unsigned expect_latency = sync_slice * iq; 568 unsigned expect_latency = sync_slice * iq;
569 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg); 569 unsigned group_slice = cfq_group_slice(cfqd, cfqq->cfqg);
570 570
571 if (expect_latency > group_slice) { 571 if (expect_latency > group_slice) {
572 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle; 572 unsigned base_low_slice = 2 * cfqd->cfq_slice_idle;
573 /* scale low_slice according to IO priority 573 /* scale low_slice according to IO priority
574 * and sync vs async */ 574 * and sync vs async */
575 unsigned low_slice = 575 unsigned low_slice =
576 min(slice, base_low_slice * slice / sync_slice); 576 min(slice, base_low_slice * slice / sync_slice);
577 /* the adapted slice value is scaled to fit all iqs 577 /* the adapted slice value is scaled to fit all iqs
578 * into the target latency */ 578 * into the target latency */
579 slice = max(slice * group_slice / expect_latency, 579 slice = max(slice * group_slice / expect_latency,
580 low_slice); 580 low_slice);
581 } 581 }
582 } 582 }
583 cfqq->slice_start = jiffies; 583 cfqq->slice_start = jiffies;
584 cfqq->slice_end = jiffies + slice; 584 cfqq->slice_end = jiffies + slice;
585 cfqq->allocated_slice = slice; 585 cfqq->allocated_slice = slice;
586 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies); 586 cfq_log_cfqq(cfqd, cfqq, "set_slice=%lu", cfqq->slice_end - jiffies);
587 } 587 }
588 588
589 /* 589 /*
590 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end 590 * We need to wrap this check in cfq_cfqq_slice_new(), since ->slice_end
591 * isn't valid until the first request from the dispatch is activated 591 * isn't valid until the first request from the dispatch is activated
592 * and the slice time set. 592 * and the slice time set.
593 */ 593 */
594 static inline bool cfq_slice_used(struct cfq_queue *cfqq) 594 static inline bool cfq_slice_used(struct cfq_queue *cfqq)
595 { 595 {
596 if (cfq_cfqq_slice_new(cfqq)) 596 if (cfq_cfqq_slice_new(cfqq))
597 return 0; 597 return 0;
598 if (time_before(jiffies, cfqq->slice_end)) 598 if (time_before(jiffies, cfqq->slice_end))
599 return 0; 599 return 0;
600 600
601 return 1; 601 return 1;
602 } 602 }
603 603
604 /* 604 /*
605 * Lifted from AS - choose which of rq1 and rq2 that is best served now. 605 * Lifted from AS - choose which of rq1 and rq2 that is best served now.
606 * We choose the request that is closest to the head right now. Distance 606 * We choose the request that is closest to the head right now. Distance
607 * behind the head is penalized and only allowed to a certain extent. 607 * behind the head is penalized and only allowed to a certain extent.
608 */ 608 */
609 static struct request * 609 static struct request *
610 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last) 610 cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2, sector_t last)
611 { 611 {
612 sector_t s1, s2, d1 = 0, d2 = 0; 612 sector_t s1, s2, d1 = 0, d2 = 0;
613 unsigned long back_max; 613 unsigned long back_max;
614 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */ 614 #define CFQ_RQ1_WRAP 0x01 /* request 1 wraps */
615 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */ 615 #define CFQ_RQ2_WRAP 0x02 /* request 2 wraps */
616 unsigned wrap = 0; /* bit mask: requests behind the disk head? */ 616 unsigned wrap = 0; /* bit mask: requests behind the disk head? */
617 617
618 if (rq1 == NULL || rq1 == rq2) 618 if (rq1 == NULL || rq1 == rq2)
619 return rq2; 619 return rq2;
620 if (rq2 == NULL) 620 if (rq2 == NULL)
621 return rq1; 621 return rq1;
622 622
623 if (rq_is_sync(rq1) && !rq_is_sync(rq2)) 623 if (rq_is_sync(rq1) && !rq_is_sync(rq2))
624 return rq1; 624 return rq1;
625 else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) 625 else if (rq_is_sync(rq2) && !rq_is_sync(rq1))
626 return rq2; 626 return rq2;
627 if (rq_is_meta(rq1) && !rq_is_meta(rq2)) 627 if (rq_is_meta(rq1) && !rq_is_meta(rq2))
628 return rq1; 628 return rq1;
629 else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) 629 else if (rq_is_meta(rq2) && !rq_is_meta(rq1))
630 return rq2; 630 return rq2;
631 631
632 s1 = blk_rq_pos(rq1); 632 s1 = blk_rq_pos(rq1);
633 s2 = blk_rq_pos(rq2); 633 s2 = blk_rq_pos(rq2);
634 634
635 /* 635 /*
636 * by definition, 1KiB is 2 sectors 636 * by definition, 1KiB is 2 sectors
637 */ 637 */
638 back_max = cfqd->cfq_back_max * 2; 638 back_max = cfqd->cfq_back_max * 2;
639 639
640 /* 640 /*
641 * Strict one way elevator _except_ in the case where we allow 641 * Strict one way elevator _except_ in the case where we allow
642 * short backward seeks which are biased as twice the cost of a 642 * short backward seeks which are biased as twice the cost of a
643 * similar forward seek. 643 * similar forward seek.
644 */ 644 */
645 if (s1 >= last) 645 if (s1 >= last)
646 d1 = s1 - last; 646 d1 = s1 - last;
647 else if (s1 + back_max >= last) 647 else if (s1 + back_max >= last)
648 d1 = (last - s1) * cfqd->cfq_back_penalty; 648 d1 = (last - s1) * cfqd->cfq_back_penalty;
649 else 649 else
650 wrap |= CFQ_RQ1_WRAP; 650 wrap |= CFQ_RQ1_WRAP;
651 651
652 if (s2 >= last) 652 if (s2 >= last)
653 d2 = s2 - last; 653 d2 = s2 - last;
654 else if (s2 + back_max >= last) 654 else if (s2 + back_max >= last)
655 d2 = (last - s2) * cfqd->cfq_back_penalty; 655 d2 = (last - s2) * cfqd->cfq_back_penalty;
656 else 656 else
657 wrap |= CFQ_RQ2_WRAP; 657 wrap |= CFQ_RQ2_WRAP;
658 658
659 /* Found required data */ 659 /* Found required data */
660 660
661 /* 661 /*
662 * By doing switch() on the bit mask "wrap" we avoid having to 662 * By doing switch() on the bit mask "wrap" we avoid having to
663 * check two variables for all permutations: --> faster! 663 * check two variables for all permutations: --> faster!
664 */ 664 */
665 switch (wrap) { 665 switch (wrap) {
666 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ 666 case 0: /* common case for CFQ: rq1 and rq2 not wrapped */
667 if (d1 < d2) 667 if (d1 < d2)
668 return rq1; 668 return rq1;
669 else if (d2 < d1) 669 else if (d2 < d1)
670 return rq2; 670 return rq2;
671 else { 671 else {
672 if (s1 >= s2) 672 if (s1 >= s2)
673 return rq1; 673 return rq1;
674 else 674 else
675 return rq2; 675 return rq2;
676 } 676 }
677 677
678 case CFQ_RQ2_WRAP: 678 case CFQ_RQ2_WRAP:
679 return rq1; 679 return rq1;
680 case CFQ_RQ1_WRAP: 680 case CFQ_RQ1_WRAP:
681 return rq2; 681 return rq2;
682 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */ 682 case (CFQ_RQ1_WRAP|CFQ_RQ2_WRAP): /* both rqs wrapped */
683 default: 683 default:
684 /* 684 /*
685 * Since both rqs are wrapped, 685 * Since both rqs are wrapped,
686 * start with the one that's further behind head 686 * start with the one that's further behind head
687 * (--> only *one* back seek required), 687 * (--> only *one* back seek required),
688 * since back seek takes more time than forward. 688 * since back seek takes more time than forward.
689 */ 689 */
690 if (s1 <= s2) 690 if (s1 <= s2)
691 return rq1; 691 return rq1;
692 else 692 else
693 return rq2; 693 return rq2;
694 } 694 }
695 } 695 }
696 696
697 /* 697 /*
698 * The below is leftmost cache rbtree addon 698 * The below is leftmost cache rbtree addon
699 */ 699 */
700 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root) 700 static struct cfq_queue *cfq_rb_first(struct cfq_rb_root *root)
701 { 701 {
702 /* Service tree is empty */ 702 /* Service tree is empty */
703 if (!root->count) 703 if (!root->count)
704 return NULL; 704 return NULL;
705 705
706 if (!root->left) 706 if (!root->left)
707 root->left = rb_first(&root->rb); 707 root->left = rb_first(&root->rb);
708 708
709 if (root->left) 709 if (root->left)
710 return rb_entry(root->left, struct cfq_queue, rb_node); 710 return rb_entry(root->left, struct cfq_queue, rb_node);
711 711
712 return NULL; 712 return NULL;
713 } 713 }
714 714
715 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root) 715 static struct cfq_group *cfq_rb_first_group(struct cfq_rb_root *root)
716 { 716 {
717 if (!root->left) 717 if (!root->left)
718 root->left = rb_first(&root->rb); 718 root->left = rb_first(&root->rb);
719 719
720 if (root->left) 720 if (root->left)
721 return rb_entry_cfqg(root->left); 721 return rb_entry_cfqg(root->left);
722 722
723 return NULL; 723 return NULL;
724 } 724 }
725 725
726 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 726 static void rb_erase_init(struct rb_node *n, struct rb_root *root)
727 { 727 {
728 rb_erase(n, root); 728 rb_erase(n, root);
729 RB_CLEAR_NODE(n); 729 RB_CLEAR_NODE(n);
730 } 730 }
731 731
732 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root) 732 static void cfq_rb_erase(struct rb_node *n, struct cfq_rb_root *root)
733 { 733 {
734 if (root->left == n) 734 if (root->left == n)
735 root->left = NULL; 735 root->left = NULL;
736 rb_erase_init(n, &root->rb); 736 rb_erase_init(n, &root->rb);
737 --root->count; 737 --root->count;
738 } 738 }
739 739
740 /* 740 /*
741 * would be nice to take fifo expire time into account as well 741 * would be nice to take fifo expire time into account as well
742 */ 742 */
743 static struct request * 743 static struct request *
744 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 744 cfq_find_next_rq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
745 struct request *last) 745 struct request *last)
746 { 746 {
747 struct rb_node *rbnext = rb_next(&last->rb_node); 747 struct rb_node *rbnext = rb_next(&last->rb_node);
748 struct rb_node *rbprev = rb_prev(&last->rb_node); 748 struct rb_node *rbprev = rb_prev(&last->rb_node);
749 struct request *next = NULL, *prev = NULL; 749 struct request *next = NULL, *prev = NULL;
750 750
751 BUG_ON(RB_EMPTY_NODE(&last->rb_node)); 751 BUG_ON(RB_EMPTY_NODE(&last->rb_node));
752 752
753 if (rbprev) 753 if (rbprev)
754 prev = rb_entry_rq(rbprev); 754 prev = rb_entry_rq(rbprev);
755 755
756 if (rbnext) 756 if (rbnext)
757 next = rb_entry_rq(rbnext); 757 next = rb_entry_rq(rbnext);
758 else { 758 else {
759 rbnext = rb_first(&cfqq->sort_list); 759 rbnext = rb_first(&cfqq->sort_list);
760 if (rbnext && rbnext != &last->rb_node) 760 if (rbnext && rbnext != &last->rb_node)
761 next = rb_entry_rq(rbnext); 761 next = rb_entry_rq(rbnext);
762 } 762 }
763 763
764 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last)); 764 return cfq_choose_req(cfqd, next, prev, blk_rq_pos(last));
765 } 765 }
766 766
767 static unsigned long cfq_slice_offset(struct cfq_data *cfqd, 767 static unsigned long cfq_slice_offset(struct cfq_data *cfqd,
768 struct cfq_queue *cfqq) 768 struct cfq_queue *cfqq)
769 { 769 {
770 /* 770 /*
771 * just an approximation, should be ok. 771 * just an approximation, should be ok.
772 */ 772 */
773 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) - 773 return (cfqq->cfqg->nr_cfqq - 1) * (cfq_prio_slice(cfqd, 1, 0) -
774 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio)); 774 cfq_prio_slice(cfqd, cfq_cfqq_sync(cfqq), cfqq->ioprio));
775 } 775 }
776 776
777 static inline s64 777 static inline s64
778 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg) 778 cfqg_key(struct cfq_rb_root *st, struct cfq_group *cfqg)
779 { 779 {
780 return cfqg->vdisktime - st->min_vdisktime; 780 return cfqg->vdisktime - st->min_vdisktime;
781 } 781 }
782 782
783 static void 783 static void
784 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg) 784 __cfq_group_service_tree_add(struct cfq_rb_root *st, struct cfq_group *cfqg)
785 { 785 {
786 struct rb_node **node = &st->rb.rb_node; 786 struct rb_node **node = &st->rb.rb_node;
787 struct rb_node *parent = NULL; 787 struct rb_node *parent = NULL;
788 struct cfq_group *__cfqg; 788 struct cfq_group *__cfqg;
789 s64 key = cfqg_key(st, cfqg); 789 s64 key = cfqg_key(st, cfqg);
790 int left = 1; 790 int left = 1;
791 791
792 while (*node != NULL) { 792 while (*node != NULL) {
793 parent = *node; 793 parent = *node;
794 __cfqg = rb_entry_cfqg(parent); 794 __cfqg = rb_entry_cfqg(parent);
795 795
796 if (key < cfqg_key(st, __cfqg)) 796 if (key < cfqg_key(st, __cfqg))
797 node = &parent->rb_left; 797 node = &parent->rb_left;
798 else { 798 else {
799 node = &parent->rb_right; 799 node = &parent->rb_right;
800 left = 0; 800 left = 0;
801 } 801 }
802 } 802 }
803 803
804 if (left) 804 if (left)
805 st->left = &cfqg->rb_node; 805 st->left = &cfqg->rb_node;
806 806
807 rb_link_node(&cfqg->rb_node, parent, node); 807 rb_link_node(&cfqg->rb_node, parent, node);
808 rb_insert_color(&cfqg->rb_node, &st->rb); 808 rb_insert_color(&cfqg->rb_node, &st->rb);
809 } 809 }
810 810
811 static void 811 static void
812 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg) 812 cfq_group_service_tree_add(struct cfq_data *cfqd, struct cfq_group *cfqg)
813 { 813 {
814 struct cfq_rb_root *st = &cfqd->grp_service_tree; 814 struct cfq_rb_root *st = &cfqd->grp_service_tree;
815 struct cfq_group *__cfqg; 815 struct cfq_group *__cfqg;
816 struct rb_node *n; 816 struct rb_node *n;
817 817
818 cfqg->nr_cfqq++; 818 cfqg->nr_cfqq++;
819 if (cfqg->on_st) 819 if (cfqg->on_st)
820 return; 820 return;
821 821
822 /* 822 /*
823 * Currently put the group at the end. Later implement something 823 * Currently put the group at the end. Later implement something
824 * so that groups get lesser vtime based on their weights, so that 824 * so that groups get lesser vtime based on their weights, so that
825 * if group does not loose all if it was not continously backlogged. 825 * if group does not loose all if it was not continously backlogged.
826 */ 826 */
827 n = rb_last(&st->rb); 827 n = rb_last(&st->rb);
828 if (n) { 828 if (n) {
829 __cfqg = rb_entry_cfqg(n); 829 __cfqg = rb_entry_cfqg(n);
830 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY; 830 cfqg->vdisktime = __cfqg->vdisktime + CFQ_IDLE_DELAY;
831 } else 831 } else
832 cfqg->vdisktime = st->min_vdisktime; 832 cfqg->vdisktime = st->min_vdisktime;
833 833
834 __cfq_group_service_tree_add(st, cfqg); 834 __cfq_group_service_tree_add(st, cfqg);
835 cfqg->on_st = true; 835 cfqg->on_st = true;
836 st->total_weight += cfqg->weight; 836 st->total_weight += cfqg->weight;
837 } 837 }
838 838
839 static void 839 static void
840 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg) 840 cfq_group_service_tree_del(struct cfq_data *cfqd, struct cfq_group *cfqg)
841 { 841 {
842 struct cfq_rb_root *st = &cfqd->grp_service_tree; 842 struct cfq_rb_root *st = &cfqd->grp_service_tree;
843 843
844 if (st->active == &cfqg->rb_node) 844 if (st->active == &cfqg->rb_node)
845 st->active = NULL; 845 st->active = NULL;
846 846
847 BUG_ON(cfqg->nr_cfqq < 1); 847 BUG_ON(cfqg->nr_cfqq < 1);
848 cfqg->nr_cfqq--; 848 cfqg->nr_cfqq--;
849 849
850 /* If there are other cfq queues under this group, don't delete it */ 850 /* If there are other cfq queues under this group, don't delete it */
851 if (cfqg->nr_cfqq) 851 if (cfqg->nr_cfqq)
852 return; 852 return;
853 853
854 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group"); 854 cfq_log_cfqg(cfqd, cfqg, "del_from_rr group");
855 cfqg->on_st = false; 855 cfqg->on_st = false;
856 st->total_weight -= cfqg->weight; 856 st->total_weight -= cfqg->weight;
857 if (!RB_EMPTY_NODE(&cfqg->rb_node)) 857 if (!RB_EMPTY_NODE(&cfqg->rb_node))
858 cfq_rb_erase(&cfqg->rb_node, st); 858 cfq_rb_erase(&cfqg->rb_node, st);
859 cfqg->saved_workload_slice = 0; 859 cfqg->saved_workload_slice = 0;
860 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1); 860 blkiocg_update_blkio_group_dequeue_stats(&cfqg->blkg, 1);
861 } 861 }
862 862
863 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq) 863 static inline unsigned int cfq_cfqq_slice_usage(struct cfq_queue *cfqq)
864 { 864 {
865 unsigned int slice_used; 865 unsigned int slice_used;
866 866
867 /* 867 /*
868 * Queue got expired before even a single request completed or 868 * Queue got expired before even a single request completed or
869 * got expired immediately after first request completion. 869 * got expired immediately after first request completion.
870 */ 870 */
871 if (!cfqq->slice_start || cfqq->slice_start == jiffies) { 871 if (!cfqq->slice_start || cfqq->slice_start == jiffies) {
872 /* 872 /*
873 * Also charge the seek time incurred to the group, otherwise 873 * Also charge the seek time incurred to the group, otherwise
874 * if there are mutiple queues in the group, each can dispatch 874 * if there are mutiple queues in the group, each can dispatch
875 * a single request on seeky media and cause lots of seek time 875 * a single request on seeky media and cause lots of seek time
876 * and group will never know it. 876 * and group will never know it.
877 */ 877 */
878 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start), 878 slice_used = max_t(unsigned, (jiffies - cfqq->dispatch_start),
879 1); 879 1);
880 } else { 880 } else {
881 slice_used = jiffies - cfqq->slice_start; 881 slice_used = jiffies - cfqq->slice_start;
882 if (slice_used > cfqq->allocated_slice) 882 if (slice_used > cfqq->allocated_slice)
883 slice_used = cfqq->allocated_slice; 883 slice_used = cfqq->allocated_slice;
884 } 884 }
885 885
886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used, 886 cfq_log_cfqq(cfqq->cfqd, cfqq, "sl_used=%u sect=%lu", slice_used,
887 cfqq->nr_sectors); 887 cfqq->nr_sectors);
888 return slice_used; 888 return slice_used;
889 } 889 }
890 890
891 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg, 891 static void cfq_group_served(struct cfq_data *cfqd, struct cfq_group *cfqg,
892 struct cfq_queue *cfqq) 892 struct cfq_queue *cfqq)
893 { 893 {
894 struct cfq_rb_root *st = &cfqd->grp_service_tree; 894 struct cfq_rb_root *st = &cfqd->grp_service_tree;
895 unsigned int used_sl, charge_sl; 895 unsigned int used_sl, charge_sl;
896 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg) 896 int nr_sync = cfqg->nr_cfqq - cfqg_busy_async_queues(cfqd, cfqg)
897 - cfqg->service_tree_idle.count; 897 - cfqg->service_tree_idle.count;
898 898
899 BUG_ON(nr_sync < 0); 899 BUG_ON(nr_sync < 0);
900 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq); 900 used_sl = charge_sl = cfq_cfqq_slice_usage(cfqq);
901 901
902 if (!cfq_cfqq_sync(cfqq) && !nr_sync) 902 if (!cfq_cfqq_sync(cfqq) && !nr_sync)
903 charge_sl = cfqq->allocated_slice; 903 charge_sl = cfqq->allocated_slice;
904 904
905 /* Can't update vdisktime while group is on service tree */ 905 /* Can't update vdisktime while group is on service tree */
906 cfq_rb_erase(&cfqg->rb_node, st); 906 cfq_rb_erase(&cfqg->rb_node, st);
907 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg); 907 cfqg->vdisktime += cfq_scale_slice(charge_sl, cfqg);
908 __cfq_group_service_tree_add(st, cfqg); 908 __cfq_group_service_tree_add(st, cfqg);
909 909
910 /* This group is being expired. Save the context */ 910 /* This group is being expired. Save the context */
911 if (time_after(cfqd->workload_expires, jiffies)) { 911 if (time_after(cfqd->workload_expires, jiffies)) {
912 cfqg->saved_workload_slice = cfqd->workload_expires 912 cfqg->saved_workload_slice = cfqd->workload_expires
913 - jiffies; 913 - jiffies;
914 cfqg->saved_workload = cfqd->serving_type; 914 cfqg->saved_workload = cfqd->serving_type;
915 cfqg->saved_serving_prio = cfqd->serving_prio; 915 cfqg->saved_serving_prio = cfqd->serving_prio;
916 } else 916 } else
917 cfqg->saved_workload_slice = 0; 917 cfqg->saved_workload_slice = 0;
918 918
919 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime, 919 cfq_log_cfqg(cfqd, cfqg, "served: vt=%llu min_vt=%llu", cfqg->vdisktime,
920 st->min_vdisktime); 920 st->min_vdisktime);
921 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl, 921 blkiocg_update_blkio_group_stats(&cfqg->blkg, used_sl,
922 cfqq->nr_sectors); 922 cfqq->nr_sectors);
923 } 923 }
924 924
925 #ifdef CONFIG_CFQ_GROUP_IOSCHED 925 #ifdef CONFIG_CFQ_GROUP_IOSCHED
926 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg) 926 static inline struct cfq_group *cfqg_of_blkg(struct blkio_group *blkg)
927 { 927 {
928 if (blkg) 928 if (blkg)
929 return container_of(blkg, struct cfq_group, blkg); 929 return container_of(blkg, struct cfq_group, blkg);
930 return NULL; 930 return NULL;
931 } 931 }
932 932
933 void 933 void
934 cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight) 934 cfq_update_blkio_group_weight(struct blkio_group *blkg, unsigned int weight)
935 { 935 {
936 cfqg_of_blkg(blkg)->weight = weight; 936 cfqg_of_blkg(blkg)->weight = weight;
937 } 937 }
938 938
939 static struct cfq_group * 939 static struct cfq_group *
940 cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create) 940 cfq_find_alloc_cfqg(struct cfq_data *cfqd, struct cgroup *cgroup, int create)
941 { 941 {
942 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup); 942 struct blkio_cgroup *blkcg = cgroup_to_blkio_cgroup(cgroup);
943 struct cfq_group *cfqg = NULL; 943 struct cfq_group *cfqg = NULL;
944 void *key = cfqd; 944 void *key = cfqd;
945 int i, j; 945 int i, j;
946 struct cfq_rb_root *st; 946 struct cfq_rb_root *st;
947 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info; 947 struct backing_dev_info *bdi = &cfqd->queue->backing_dev_info;
948 unsigned int major, minor; 948 unsigned int major, minor;
949 949
950 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key)); 950 cfqg = cfqg_of_blkg(blkiocg_lookup_group(blkcg, key));
951 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) { 951 if (cfqg && !cfqg->blkg.dev && bdi->dev && dev_name(bdi->dev)) {
952 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 952 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
953 cfqg->blkg.dev = MKDEV(major, minor); 953 cfqg->blkg.dev = MKDEV(major, minor);
954 goto done; 954 goto done;
955 } 955 }
956 if (cfqg || !create) 956 if (cfqg || !create)
957 goto done; 957 goto done;
958 958
959 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node); 959 cfqg = kzalloc_node(sizeof(*cfqg), GFP_ATOMIC, cfqd->queue->node);
960 if (!cfqg) 960 if (!cfqg)
961 goto done; 961 goto done;
962 962
963 cfqg->weight = blkcg->weight; 963 cfqg->weight = blkcg->weight;
964 for_each_cfqg_st(cfqg, i, j, st) 964 for_each_cfqg_st(cfqg, i, j, st)
965 *st = CFQ_RB_ROOT; 965 *st = CFQ_RB_ROOT;
966 RB_CLEAR_NODE(&cfqg->rb_node); 966 RB_CLEAR_NODE(&cfqg->rb_node);
967 967
968 /* 968 /*
969 * Take the initial reference that will be released on destroy 969 * Take the initial reference that will be released on destroy
970 * This can be thought of as a joint reference by cgroup and 970 * This can be thought of as a joint reference by cgroup and
971 * elevator which will be dropped by either elevator exit 971 * elevator which will be dropped by either elevator exit
972 * or cgroup deletion path depending on who is exiting first. 972 * or cgroup deletion path depending on who is exiting first.
973 */ 973 */
974 atomic_set(&cfqg->ref, 1); 974 atomic_set(&cfqg->ref, 1);
975 975
976 /* Add group onto cgroup list */ 976 /* Add group onto cgroup list */
977 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor); 977 sscanf(dev_name(bdi->dev), "%u:%u", &major, &minor);
978 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd, 978 blkiocg_add_blkio_group(blkcg, &cfqg->blkg, (void *)cfqd,
979 MKDEV(major, minor)); 979 MKDEV(major, minor));
980 980
981 /* Add group on cfqd list */ 981 /* Add group on cfqd list */
982 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list); 982 hlist_add_head(&cfqg->cfqd_node, &cfqd->cfqg_list);
983 983
984 done: 984 done:
985 return cfqg; 985 return cfqg;
986 } 986 }
987 987
988 /* 988 /*
989 * Search for the cfq group current task belongs to. If create = 1, then also 989 * Search for the cfq group current task belongs to. If create = 1, then also
990 * create the cfq group if it does not exist. request_queue lock must be held. 990 * create the cfq group if it does not exist. request_queue lock must be held.
991 */ 991 */
992 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 992 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
993 { 993 {
994 struct cgroup *cgroup; 994 struct cgroup *cgroup;
995 struct cfq_group *cfqg = NULL; 995 struct cfq_group *cfqg = NULL;
996 996
997 rcu_read_lock(); 997 rcu_read_lock();
998 cgroup = task_cgroup(current, blkio_subsys_id); 998 cgroup = task_cgroup(current, blkio_subsys_id);
999 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create); 999 cfqg = cfq_find_alloc_cfqg(cfqd, cgroup, create);
1000 if (!cfqg && create) 1000 if (!cfqg && create)
1001 cfqg = &cfqd->root_group; 1001 cfqg = &cfqd->root_group;
1002 rcu_read_unlock(); 1002 rcu_read_unlock();
1003 return cfqg; 1003 return cfqg;
1004 } 1004 }
1005 1005
1006 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) 1006 static void cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg)
1007 { 1007 {
1008 /* Currently, all async queues are mapped to root group */ 1008 /* Currently, all async queues are mapped to root group */
1009 if (!cfq_cfqq_sync(cfqq)) 1009 if (!cfq_cfqq_sync(cfqq))
1010 cfqg = &cfqq->cfqd->root_group; 1010 cfqg = &cfqq->cfqd->root_group;
1011 1011
1012 cfqq->cfqg = cfqg; 1012 cfqq->cfqg = cfqg;
1013 /* cfqq reference on cfqg */ 1013 /* cfqq reference on cfqg */
1014 atomic_inc(&cfqq->cfqg->ref); 1014 atomic_inc(&cfqq->cfqg->ref);
1015 } 1015 }
1016 1016
1017 static void cfq_put_cfqg(struct cfq_group *cfqg) 1017 static void cfq_put_cfqg(struct cfq_group *cfqg)
1018 { 1018 {
1019 struct cfq_rb_root *st; 1019 struct cfq_rb_root *st;
1020 int i, j; 1020 int i, j;
1021 1021
1022 BUG_ON(atomic_read(&cfqg->ref) <= 0); 1022 BUG_ON(atomic_read(&cfqg->ref) <= 0);
1023 if (!atomic_dec_and_test(&cfqg->ref)) 1023 if (!atomic_dec_and_test(&cfqg->ref))
1024 return; 1024 return;
1025 for_each_cfqg_st(cfqg, i, j, st) 1025 for_each_cfqg_st(cfqg, i, j, st)
1026 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL); 1026 BUG_ON(!RB_EMPTY_ROOT(&st->rb) || st->active != NULL);
1027 kfree(cfqg); 1027 kfree(cfqg);
1028 } 1028 }
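Taken together, the functions above define the cfq_group reference lifecycle: one joint cgroup/elevator reference at creation, one more for each linked queue, and a free once the count drops to zero in cfq_put_cfqg(). A standalone sketch using C11 atomics in place of the kernel's atomic_t helpers (illustrative only):

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct group {
	atomic_int ref;
};

static struct group *group_alloc(void)
{
	struct group *g = calloc(1, sizeof(*g));

	atomic_init(&g->ref, 1);	/* joint cgroup/elevator reference */
	return g;
}

static void group_get(struct group *g)	/* e.g. a queue linking to the group */
{
	atomic_fetch_add(&g->ref, 1);
}

static void group_put(struct group *g)	/* counterpart of cfq_put_cfqg() */
{
	if (atomic_fetch_sub(&g->ref, 1) == 1) {
		printf("last reference dropped, freeing group\n");
		free(g);
	}
}

int main(void)
{
	struct group *g = group_alloc();	/* ref = 1 */

	group_get(g);				/* queue linked, ref = 2 */
	group_put(g);				/* queue gone, ref = 1 */
	group_put(g);				/* destroy path drops base ref */
	return 0;
}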
1029 1029
1030 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg) 1030 static void cfq_destroy_cfqg(struct cfq_data *cfqd, struct cfq_group *cfqg)
1031 { 1031 {
1032 /* Something wrong if we are trying to remove same group twice */ 1032 /* Something wrong if we are trying to remove same group twice */
1033 BUG_ON(hlist_unhashed(&cfqg->cfqd_node)); 1033 BUG_ON(hlist_unhashed(&cfqg->cfqd_node));
1034 1034
1035 hlist_del_init(&cfqg->cfqd_node); 1035 hlist_del_init(&cfqg->cfqd_node);
1036 1036
1037 /* 1037 /*
1038 * Put the reference taken at the time of creation so that when all 1038 * Put the reference taken at the time of creation so that when all
1039 * queues are gone, group can be destroyed. 1039 * queues are gone, group can be destroyed.
1040 */ 1040 */
1041 cfq_put_cfqg(cfqg); 1041 cfq_put_cfqg(cfqg);
1042 } 1042 }
1043 1043
1044 static void cfq_release_cfq_groups(struct cfq_data *cfqd) 1044 static void cfq_release_cfq_groups(struct cfq_data *cfqd)
1045 { 1045 {
1046 struct hlist_node *pos, *n; 1046 struct hlist_node *pos, *n;
1047 struct cfq_group *cfqg; 1047 struct cfq_group *cfqg;
1048 1048
1049 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) { 1049 hlist_for_each_entry_safe(cfqg, pos, n, &cfqd->cfqg_list, cfqd_node) {
1050 /* 1050 /*
1051 * If cgroup removal path got to blk_group first and removed 1051 * If cgroup removal path got to blk_group first and removed
1052 * it from cgroup list, then it will take care of destroying 1052 * it from cgroup list, then it will take care of destroying
1053 * cfqg also. 1053 * cfqg also.
1054 */ 1054 */
1055 if (!blkiocg_del_blkio_group(&cfqg->blkg)) 1055 if (!blkiocg_del_blkio_group(&cfqg->blkg))
1056 cfq_destroy_cfqg(cfqd, cfqg); 1056 cfq_destroy_cfqg(cfqd, cfqg);
1057 } 1057 }
1058 } 1058 }
1059 1059
1060 /* 1060 /*
1061 * Blk cgroup controller notification saying that blkio_group object is being 1061 * Blk cgroup controller notification saying that blkio_group object is being
1062 * delinked as the associated cgroup object is going away. That also means that 1062 * delinked as the associated cgroup object is going away. That also means that
1063 * no new IO will come in this group. So get rid of this group as soon as 1063 * no new IO will come in this group. So get rid of this group as soon as
1064 * any pending IO in the group is finished. 1064 * any pending IO in the group is finished.
1065 * 1065 *
1066 * This function is called under rcu_read_lock(). key is the rcu protected 1066 * This function is called under rcu_read_lock(). key is the rcu protected
1067 * pointer. That means "key" is a valid cfq_data pointer as long as we hold 1067 * pointer. That means "key" is a valid cfq_data pointer as long as we hold
1068 * the rcu read lock. 1068 * the rcu read lock.
1069 * 1069 *
1070 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means 1070 * "key" was fetched from blkio_group under blkio_cgroup->lock. That means
1071 * it should not be NULL as even if elevator was exiting, cgroup deletion 1071 * it should not be NULL as even if elevator was exiting, cgroup deletion
1072 * path got to it first. 1072 * path got to it first.
1073 */ 1073 */
1074 void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg) 1074 void cfq_unlink_blkio_group(void *key, struct blkio_group *blkg)
1075 { 1075 {
1076 unsigned long flags; 1076 unsigned long flags;
1077 struct cfq_data *cfqd = key; 1077 struct cfq_data *cfqd = key;
1078 1078
1079 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 1079 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
1080 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg)); 1080 cfq_destroy_cfqg(cfqd, cfqg_of_blkg(blkg));
1081 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 1081 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
1082 } 1082 }
1083 1083
1084 #else /* GROUP_IOSCHED */ 1084 #else /* GROUP_IOSCHED */
1085 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create) 1085 static struct cfq_group *cfq_get_cfqg(struct cfq_data *cfqd, int create)
1086 { 1086 {
1087 return &cfqd->root_group; 1087 return &cfqd->root_group;
1088 } 1088 }
1089 static inline void 1089 static inline void
1090 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) { 1090 cfq_link_cfqq_cfqg(struct cfq_queue *cfqq, struct cfq_group *cfqg) {
1091 cfqq->cfqg = cfqg; 1091 cfqq->cfqg = cfqg;
1092 } 1092 }
1093 1093
1094 static void cfq_release_cfq_groups(struct cfq_data *cfqd) {} 1094 static void cfq_release_cfq_groups(struct cfq_data *cfqd) {}
1095 static inline void cfq_put_cfqg(struct cfq_group *cfqg) {} 1095 static inline void cfq_put_cfqg(struct cfq_group *cfqg) {}
1096 1096
1097 #endif /* GROUP_IOSCHED */ 1097 #endif /* GROUP_IOSCHED */
1098 1098
1099 /* 1099 /*
1100 * The cfqd->service_trees hold all pending cfq_queues that have 1100 * The cfqd->service_trees hold all pending cfq_queues that have
1101 * requests waiting to be processed. They are sorted in the order in 1101 * requests waiting to be processed. They are sorted in the order in
1102 * which we will service the queues. 1102 * which we will service the queues.
1103 */ 1103 */
1104 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1104 static void cfq_service_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1105 bool add_front) 1105 bool add_front)
1106 { 1106 {
1107 struct rb_node **p, *parent; 1107 struct rb_node **p, *parent;
1108 struct cfq_queue *__cfqq; 1108 struct cfq_queue *__cfqq;
1109 unsigned long rb_key; 1109 unsigned long rb_key;
1110 struct cfq_rb_root *service_tree; 1110 struct cfq_rb_root *service_tree;
1111 int left; 1111 int left;
1112 int new_cfqq = 1; 1112 int new_cfqq = 1;
1113 int group_changed = 0; 1113 int group_changed = 0;
1114 1114
1115 #ifdef CONFIG_CFQ_GROUP_IOSCHED 1115 #ifdef CONFIG_CFQ_GROUP_IOSCHED
1116 if (!cfqd->cfq_group_isolation 1116 if (!cfqd->cfq_group_isolation
1117 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD 1117 && cfqq_type(cfqq) == SYNC_NOIDLE_WORKLOAD
1118 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) { 1118 && cfqq->cfqg && cfqq->cfqg != &cfqd->root_group) {
1119 /* Move this cfq to root group */ 1119 /* Move this cfq to root group */
1120 cfq_log_cfqq(cfqd, cfqq, "moving to root group"); 1120 cfq_log_cfqq(cfqd, cfqq, "moving to root group");
1121 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1121 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1122 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1122 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1123 cfqq->orig_cfqg = cfqq->cfqg; 1123 cfqq->orig_cfqg = cfqq->cfqg;
1124 cfqq->cfqg = &cfqd->root_group; 1124 cfqq->cfqg = &cfqd->root_group;
1125 atomic_inc(&cfqd->root_group.ref); 1125 atomic_inc(&cfqd->root_group.ref);
1126 group_changed = 1; 1126 group_changed = 1;
1127 } else if (!cfqd->cfq_group_isolation 1127 } else if (!cfqd->cfq_group_isolation
1128 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) { 1128 && cfqq_type(cfqq) == SYNC_WORKLOAD && cfqq->orig_cfqg) {
1129 /* cfqq is sequential now, needs to go to its original group */ 1129 /* cfqq is sequential now, needs to go to its original group */
1130 BUG_ON(cfqq->cfqg != &cfqd->root_group); 1130 BUG_ON(cfqq->cfqg != &cfqd->root_group);
1131 if (!RB_EMPTY_NODE(&cfqq->rb_node)) 1131 if (!RB_EMPTY_NODE(&cfqq->rb_node))
1132 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1132 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1133 cfq_put_cfqg(cfqq->cfqg); 1133 cfq_put_cfqg(cfqq->cfqg);
1134 cfqq->cfqg = cfqq->orig_cfqg; 1134 cfqq->cfqg = cfqq->orig_cfqg;
1135 cfqq->orig_cfqg = NULL; 1135 cfqq->orig_cfqg = NULL;
1136 group_changed = 1; 1136 group_changed = 1;
1137 cfq_log_cfqq(cfqd, cfqq, "moved to origin group"); 1137 cfq_log_cfqq(cfqd, cfqq, "moved to origin group");
1138 } 1138 }
1139 #endif 1139 #endif
1140 1140
1141 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq), 1141 service_tree = service_tree_for(cfqq->cfqg, cfqq_prio(cfqq),
1142 cfqq_type(cfqq)); 1142 cfqq_type(cfqq));
1143 if (cfq_class_idle(cfqq)) { 1143 if (cfq_class_idle(cfqq)) {
1144 rb_key = CFQ_IDLE_DELAY; 1144 rb_key = CFQ_IDLE_DELAY;
1145 parent = rb_last(&service_tree->rb); 1145 parent = rb_last(&service_tree->rb);
1146 if (parent && parent != &cfqq->rb_node) { 1146 if (parent && parent != &cfqq->rb_node) {
1147 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1147 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1148 rb_key += __cfqq->rb_key; 1148 rb_key += __cfqq->rb_key;
1149 } else 1149 } else
1150 rb_key += jiffies; 1150 rb_key += jiffies;
1151 } else if (!add_front) { 1151 } else if (!add_front) {
1152 /* 1152 /*
1153 * Get our rb key offset. Subtract any residual slice 1153 * Get our rb key offset. Subtract any residual slice
1154 * value carried from last service. A negative resid 1154 * value carried from last service. A negative resid
1155 * count indicates slice overrun, and this should position 1155 * count indicates slice overrun, and this should position
1156 * the next service time further away in the tree. 1156 * the next service time further away in the tree.
1157 */ 1157 */
1158 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies; 1158 rb_key = cfq_slice_offset(cfqd, cfqq) + jiffies;
1159 rb_key -= cfqq->slice_resid; 1159 rb_key -= cfqq->slice_resid;
1160 cfqq->slice_resid = 0; 1160 cfqq->slice_resid = 0;
1161 } else { 1161 } else {
1162 rb_key = -HZ; 1162 rb_key = -HZ;
1163 __cfqq = cfq_rb_first(service_tree); 1163 __cfqq = cfq_rb_first(service_tree);
1164 rb_key += __cfqq ? __cfqq->rb_key : jiffies; 1164 rb_key += __cfqq ? __cfqq->rb_key : jiffies;
1165 } 1165 }
1166 1166
1167 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1167 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1168 new_cfqq = 0; 1168 new_cfqq = 0;
1169 /* 1169 /*
1170 * same position, nothing more to do 1170 * same position, nothing more to do
1171 */ 1171 */
1172 if (rb_key == cfqq->rb_key && 1172 if (rb_key == cfqq->rb_key &&
1173 cfqq->service_tree == service_tree) 1173 cfqq->service_tree == service_tree)
1174 return; 1174 return;
1175 1175
1176 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1176 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1177 cfqq->service_tree = NULL; 1177 cfqq->service_tree = NULL;
1178 } 1178 }
1179 1179
1180 left = 1; 1180 left = 1;
1181 parent = NULL; 1181 parent = NULL;
1182 cfqq->service_tree = service_tree; 1182 cfqq->service_tree = service_tree;
1183 p = &service_tree->rb.rb_node; 1183 p = &service_tree->rb.rb_node;
1184 while (*p) { 1184 while (*p) {
1185 struct rb_node **n; 1185 struct rb_node **n;
1186 1186
1187 parent = *p; 1187 parent = *p;
1188 __cfqq = rb_entry(parent, struct cfq_queue, rb_node); 1188 __cfqq = rb_entry(parent, struct cfq_queue, rb_node);
1189 1189
1190 /* 1190 /*
1191 * sort by key, which represents service time. 1191 * sort by key, which represents service time.
1192 */ 1192 */
1193 if (time_before(rb_key, __cfqq->rb_key)) 1193 if (time_before(rb_key, __cfqq->rb_key))
1194 n = &(*p)->rb_left; 1194 n = &(*p)->rb_left;
1195 else { 1195 else {
1196 n = &(*p)->rb_right; 1196 n = &(*p)->rb_right;
1197 left = 0; 1197 left = 0;
1198 } 1198 }
1199 1199
1200 p = n; 1200 p = n;
1201 } 1201 }
1202 1202
1203 if (left) 1203 if (left)
1204 service_tree->left = &cfqq->rb_node; 1204 service_tree->left = &cfqq->rb_node;
1205 1205
1206 cfqq->rb_key = rb_key; 1206 cfqq->rb_key = rb_key;
1207 rb_link_node(&cfqq->rb_node, parent, p); 1207 rb_link_node(&cfqq->rb_node, parent, p);
1208 rb_insert_color(&cfqq->rb_node, &service_tree->rb); 1208 rb_insert_color(&cfqq->rb_node, &service_tree->rb);
1209 service_tree->count++; 1209 service_tree->count++;
1210 if ((add_front || !new_cfqq) && !group_changed) 1210 if ((add_front || !new_cfqq) && !group_changed)
1211 return; 1211 return;
1212 cfq_group_service_tree_add(cfqd, cfqq->cfqg); 1212 cfq_group_service_tree_add(cfqd, cfqq->cfqg);
1213 } 1213 }
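One detail of the service-tree key above deserves a worked example: unused slice (a positive slice_resid) pulls the key earlier, while an overrun (a negative resid) pushes it later. The arithmetic in isolation, with offset standing in for cfq_slice_offset() (defined elsewhere in this file) and made-up jiffies values:

#include <stdio.h>

static unsigned long rb_key_for(unsigned long now, unsigned long offset,
				long slice_resid)
{
	return offset + now - slice_resid;
}

int main(void)
{
	unsigned long now = 100000;

	/* left 20 jiffies of its slice unused: keyed earlier */
	printf("resid +20 -> key %lu\n", rb_key_for(now, 300, 20));
	/* overran its slice by 20 jiffies: keyed later */
	printf("resid -20 -> key %lu\n", rb_key_for(now, 300, -20));
	return 0;
}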
1214 1214
1215 static struct cfq_queue * 1215 static struct cfq_queue *
1216 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, 1216 cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root,
1217 sector_t sector, struct rb_node **ret_parent, 1217 sector_t sector, struct rb_node **ret_parent,
1218 struct rb_node ***rb_link) 1218 struct rb_node ***rb_link)
1219 { 1219 {
1220 struct rb_node **p, *parent; 1220 struct rb_node **p, *parent;
1221 struct cfq_queue *cfqq = NULL; 1221 struct cfq_queue *cfqq = NULL;
1222 1222
1223 parent = NULL; 1223 parent = NULL;
1224 p = &root->rb_node; 1224 p = &root->rb_node;
1225 while (*p) { 1225 while (*p) {
1226 struct rb_node **n; 1226 struct rb_node **n;
1227 1227
1228 parent = *p; 1228 parent = *p;
1229 cfqq = rb_entry(parent, struct cfq_queue, p_node); 1229 cfqq = rb_entry(parent, struct cfq_queue, p_node);
1230 1230
1231 /* 1231 /*
1232 * Sort strictly based on sector. Smallest to the left, 1232 * Sort strictly based on sector. Smallest to the left,
1233 * largest to the right. 1233 * largest to the right.
1234 */ 1234 */
1235 if (sector > blk_rq_pos(cfqq->next_rq)) 1235 if (sector > blk_rq_pos(cfqq->next_rq))
1236 n = &(*p)->rb_right; 1236 n = &(*p)->rb_right;
1237 else if (sector < blk_rq_pos(cfqq->next_rq)) 1237 else if (sector < blk_rq_pos(cfqq->next_rq))
1238 n = &(*p)->rb_left; 1238 n = &(*p)->rb_left;
1239 else 1239 else
1240 break; 1240 break;
1241 p = n; 1241 p = n;
1242 cfqq = NULL; 1242 cfqq = NULL;
1243 } 1243 }
1244 1244
1245 *ret_parent = parent; 1245 *ret_parent = parent;
1246 if (rb_link) 1246 if (rb_link)
1247 *rb_link = p; 1247 *rb_link = p;
1248 return cfqq; 1248 return cfqq;
1249 } 1249 }
1250 1250
1251 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1251 static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1252 { 1252 {
1253 struct rb_node **p, *parent; 1253 struct rb_node **p, *parent;
1254 struct cfq_queue *__cfqq; 1254 struct cfq_queue *__cfqq;
1255 1255
1256 if (cfqq->p_root) { 1256 if (cfqq->p_root) {
1257 rb_erase(&cfqq->p_node, cfqq->p_root); 1257 rb_erase(&cfqq->p_node, cfqq->p_root);
1258 cfqq->p_root = NULL; 1258 cfqq->p_root = NULL;
1259 } 1259 }
1260 1260
1261 if (cfq_class_idle(cfqq)) 1261 if (cfq_class_idle(cfqq))
1262 return; 1262 return;
1263 if (!cfqq->next_rq) 1263 if (!cfqq->next_rq)
1264 return; 1264 return;
1265 1265
1266 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; 1266 cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio];
1267 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, 1267 __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root,
1268 blk_rq_pos(cfqq->next_rq), &parent, &p); 1268 blk_rq_pos(cfqq->next_rq), &parent, &p);
1269 if (!__cfqq) { 1269 if (!__cfqq) {
1270 rb_link_node(&cfqq->p_node, parent, p); 1270 rb_link_node(&cfqq->p_node, parent, p);
1271 rb_insert_color(&cfqq->p_node, cfqq->p_root); 1271 rb_insert_color(&cfqq->p_node, cfqq->p_root);
1272 } else 1272 } else
1273 cfqq->p_root = NULL; 1273 cfqq->p_root = NULL;
1274 } 1274 }
1275 1275
1276 /* 1276 /*
1277 * Update cfqq's position in the service tree. 1277 * Update cfqq's position in the service tree.
1278 */ 1278 */
1279 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1279 static void cfq_resort_rr_list(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1280 { 1280 {
1281 /* 1281 /*
1282 * Resorting requires the cfqq to be on the RR list already. 1282 * Resorting requires the cfqq to be on the RR list already.
1283 */ 1283 */
1284 if (cfq_cfqq_on_rr(cfqq)) { 1284 if (cfq_cfqq_on_rr(cfqq)) {
1285 cfq_service_tree_add(cfqd, cfqq, 0); 1285 cfq_service_tree_add(cfqd, cfqq, 0);
1286 cfq_prio_tree_add(cfqd, cfqq); 1286 cfq_prio_tree_add(cfqd, cfqq);
1287 } 1287 }
1288 } 1288 }
1289 1289
1290 /* 1290 /*
1291 * add to busy list of queues for service, trying to be fair in ordering 1291 * add to busy list of queues for service, trying to be fair in ordering
1292 * the pending list according to last request service 1292 * the pending list according to last request service
1293 */ 1293 */
1294 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1294 static void cfq_add_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1295 { 1295 {
1296 cfq_log_cfqq(cfqd, cfqq, "add_to_rr"); 1296 cfq_log_cfqq(cfqd, cfqq, "add_to_rr");
1297 BUG_ON(cfq_cfqq_on_rr(cfqq)); 1297 BUG_ON(cfq_cfqq_on_rr(cfqq));
1298 cfq_mark_cfqq_on_rr(cfqq); 1298 cfq_mark_cfqq_on_rr(cfqq);
1299 cfqd->busy_queues++; 1299 cfqd->busy_queues++;
1300 1300
1301 cfq_resort_rr_list(cfqd, cfqq); 1301 cfq_resort_rr_list(cfqd, cfqq);
1302 } 1302 }
1303 1303
1304 /* 1304 /*
1305 * Called when the cfqq no longer has requests pending, remove it from 1305 * Called when the cfqq no longer has requests pending, remove it from
1306 * the service tree. 1306 * the service tree.
1307 */ 1307 */
1308 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1308 static void cfq_del_cfqq_rr(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1309 { 1309 {
1310 cfq_log_cfqq(cfqd, cfqq, "del_from_rr"); 1310 cfq_log_cfqq(cfqd, cfqq, "del_from_rr");
1311 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 1311 BUG_ON(!cfq_cfqq_on_rr(cfqq));
1312 cfq_clear_cfqq_on_rr(cfqq); 1312 cfq_clear_cfqq_on_rr(cfqq);
1313 1313
1314 if (!RB_EMPTY_NODE(&cfqq->rb_node)) { 1314 if (!RB_EMPTY_NODE(&cfqq->rb_node)) {
1315 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree); 1315 cfq_rb_erase(&cfqq->rb_node, cfqq->service_tree);
1316 cfqq->service_tree = NULL; 1316 cfqq->service_tree = NULL;
1317 } 1317 }
1318 if (cfqq->p_root) { 1318 if (cfqq->p_root) {
1319 rb_erase(&cfqq->p_node, cfqq->p_root); 1319 rb_erase(&cfqq->p_node, cfqq->p_root);
1320 cfqq->p_root = NULL; 1320 cfqq->p_root = NULL;
1321 } 1321 }
1322 1322
1323 cfq_group_service_tree_del(cfqd, cfqq->cfqg); 1323 cfq_group_service_tree_del(cfqd, cfqq->cfqg);
1324 BUG_ON(!cfqd->busy_queues); 1324 BUG_ON(!cfqd->busy_queues);
1325 cfqd->busy_queues--; 1325 cfqd->busy_queues--;
1326 } 1326 }
1327 1327
1328 /* 1328 /*
1329 * rb tree support functions 1329 * rb tree support functions
1330 */ 1330 */
1331 static void cfq_del_rq_rb(struct request *rq) 1331 static void cfq_del_rq_rb(struct request *rq)
1332 { 1332 {
1333 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1333 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1334 const int sync = rq_is_sync(rq); 1334 const int sync = rq_is_sync(rq);
1335 1335
1336 BUG_ON(!cfqq->queued[sync]); 1336 BUG_ON(!cfqq->queued[sync]);
1337 cfqq->queued[sync]--; 1337 cfqq->queued[sync]--;
1338 1338
1339 elv_rb_del(&cfqq->sort_list, rq); 1339 elv_rb_del(&cfqq->sort_list, rq);
1340 1340
1341 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) { 1341 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) {
1342 /* 1342 /*
1343 * Queue will be deleted from service tree when we actually 1343 * Queue will be deleted from service tree when we actually
1344 * expire it later. Right now just remove it from prio tree 1344 * expire it later. Right now just remove it from prio tree
1345 * as it is empty. 1345 * as it is empty.
1346 */ 1346 */
1347 if (cfqq->p_root) { 1347 if (cfqq->p_root) {
1348 rb_erase(&cfqq->p_node, cfqq->p_root); 1348 rb_erase(&cfqq->p_node, cfqq->p_root);
1349 cfqq->p_root = NULL; 1349 cfqq->p_root = NULL;
1350 } 1350 }
1351 } 1351 }
1352 } 1352 }
1353 1353
1354 static void cfq_add_rq_rb(struct request *rq) 1354 static void cfq_add_rq_rb(struct request *rq)
1355 { 1355 {
1356 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1356 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1357 struct cfq_data *cfqd = cfqq->cfqd; 1357 struct cfq_data *cfqd = cfqq->cfqd;
1358 struct request *__alias, *prev; 1358 struct request *__alias, *prev;
1359 1359
1360 cfqq->queued[rq_is_sync(rq)]++; 1360 cfqq->queued[rq_is_sync(rq)]++;
1361 1361
1362 /* 1362 /*
1363 * looks a little odd, but the first insert might return an alias. 1363 * looks a little odd, but the first insert might return an alias.
1364 * if that happens, put the alias on the dispatch list 1364 * if that happens, put the alias on the dispatch list
1365 */ 1365 */
1366 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL) 1366 while ((__alias = elv_rb_add(&cfqq->sort_list, rq)) != NULL)
1367 cfq_dispatch_insert(cfqd->queue, __alias); 1367 cfq_dispatch_insert(cfqd->queue, __alias);
1368 1368
1369 if (!cfq_cfqq_on_rr(cfqq)) 1369 if (!cfq_cfqq_on_rr(cfqq))
1370 cfq_add_cfqq_rr(cfqd, cfqq); 1370 cfq_add_cfqq_rr(cfqd, cfqq);
1371 1371
1372 /* 1372 /*
1373 * check if this request is a better next-serve candidate 1373 * check if this request is a better next-serve candidate
1374 */ 1374 */
1375 prev = cfqq->next_rq; 1375 prev = cfqq->next_rq;
1376 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position); 1376 cfqq->next_rq = cfq_choose_req(cfqd, cfqq->next_rq, rq, cfqd->last_position);
1377 1377
1378 /* 1378 /*
1379 * adjust priority tree position, if ->next_rq changes 1379 * adjust priority tree position, if ->next_rq changes
1380 */ 1380 */
1381 if (prev != cfqq->next_rq) 1381 if (prev != cfqq->next_rq)
1382 cfq_prio_tree_add(cfqd, cfqq); 1382 cfq_prio_tree_add(cfqd, cfqq);
1383 1383
1384 BUG_ON(!cfqq->next_rq); 1384 BUG_ON(!cfqq->next_rq);
1385 } 1385 }
1386 1386
1387 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq) 1387 static void cfq_reposition_rq_rb(struct cfq_queue *cfqq, struct request *rq)
1388 { 1388 {
1389 elv_rb_del(&cfqq->sort_list, rq); 1389 elv_rb_del(&cfqq->sort_list, rq);
1390 cfqq->queued[rq_is_sync(rq)]--; 1390 cfqq->queued[rq_is_sync(rq)]--;
1391 cfq_add_rq_rb(rq); 1391 cfq_add_rq_rb(rq);
1392 } 1392 }
1393 1393
1394 static struct request * 1394 static struct request *
1395 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio) 1395 cfq_find_rq_fmerge(struct cfq_data *cfqd, struct bio *bio)
1396 { 1396 {
1397 struct task_struct *tsk = current; 1397 struct task_struct *tsk = current;
1398 struct cfq_io_context *cic; 1398 struct cfq_io_context *cic;
1399 struct cfq_queue *cfqq; 1399 struct cfq_queue *cfqq;
1400 1400
1401 cic = cfq_cic_lookup(cfqd, tsk->io_context); 1401 cic = cfq_cic_lookup(cfqd, tsk->io_context);
1402 if (!cic) 1402 if (!cic)
1403 return NULL; 1403 return NULL;
1404 1404
1405 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1405 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1406 if (cfqq) { 1406 if (cfqq) {
1407 sector_t sector = bio->bi_sector + bio_sectors(bio); 1407 sector_t sector = bio->bi_sector + bio_sectors(bio);
1408 1408
1409 return elv_rb_find(&cfqq->sort_list, sector); 1409 return elv_rb_find(&cfqq->sort_list, sector);
1410 } 1410 }
1411 1411
1412 return NULL; 1412 return NULL;
1413 } 1413 }
1414 1414
1415 static void cfq_activate_request(struct request_queue *q, struct request *rq) 1415 static void cfq_activate_request(struct request_queue *q, struct request *rq)
1416 { 1416 {
1417 struct cfq_data *cfqd = q->elevator->elevator_data; 1417 struct cfq_data *cfqd = q->elevator->elevator_data;
1418 1418
1419 cfqd->rq_in_driver++; 1419 cfqd->rq_in_driver++;
1420 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", 1420 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d",
1421 cfqd->rq_in_driver); 1421 cfqd->rq_in_driver);
1422 1422
1423 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); 1423 cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq);
1424 } 1424 }
1425 1425
1426 static void cfq_deactivate_request(struct request_queue *q, struct request *rq) 1426 static void cfq_deactivate_request(struct request_queue *q, struct request *rq)
1427 { 1427 {
1428 struct cfq_data *cfqd = q->elevator->elevator_data; 1428 struct cfq_data *cfqd = q->elevator->elevator_data;
1429 1429
1430 WARN_ON(!cfqd->rq_in_driver); 1430 WARN_ON(!cfqd->rq_in_driver);
1431 cfqd->rq_in_driver--; 1431 cfqd->rq_in_driver--;
1432 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d", 1432 cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "deactivate rq, drv=%d",
1433 cfqd->rq_in_driver); 1433 cfqd->rq_in_driver);
1434 } 1434 }
1435 1435
1436 static void cfq_remove_request(struct request *rq) 1436 static void cfq_remove_request(struct request *rq)
1437 { 1437 {
1438 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1438 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1439 1439
1440 if (cfqq->next_rq == rq) 1440 if (cfqq->next_rq == rq)
1441 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq); 1441 cfqq->next_rq = cfq_find_next_rq(cfqq->cfqd, cfqq, rq);
1442 1442
1443 list_del_init(&rq->queuelist); 1443 list_del_init(&rq->queuelist);
1444 cfq_del_rq_rb(rq); 1444 cfq_del_rq_rb(rq);
1445 1445
1446 cfqq->cfqd->rq_queued--; 1446 cfqq->cfqd->rq_queued--;
1447 if (rq_is_meta(rq)) { 1447 if (rq_is_meta(rq)) {
1448 WARN_ON(!cfqq->meta_pending); 1448 WARN_ON(!cfqq->meta_pending);
1449 cfqq->meta_pending--; 1449 cfqq->meta_pending--;
1450 } 1450 }
1451 } 1451 }
1452 1452
1453 static int cfq_merge(struct request_queue *q, struct request **req, 1453 static int cfq_merge(struct request_queue *q, struct request **req,
1454 struct bio *bio) 1454 struct bio *bio)
1455 { 1455 {
1456 struct cfq_data *cfqd = q->elevator->elevator_data; 1456 struct cfq_data *cfqd = q->elevator->elevator_data;
1457 struct request *__rq; 1457 struct request *__rq;
1458 1458
1459 __rq = cfq_find_rq_fmerge(cfqd, bio); 1459 __rq = cfq_find_rq_fmerge(cfqd, bio);
1460 if (__rq && elv_rq_merge_ok(__rq, bio)) { 1460 if (__rq && elv_rq_merge_ok(__rq, bio)) {
1461 *req = __rq; 1461 *req = __rq;
1462 return ELEVATOR_FRONT_MERGE; 1462 return ELEVATOR_FRONT_MERGE;
1463 } 1463 }
1464 1464
1465 return ELEVATOR_NO_MERGE; 1465 return ELEVATOR_NO_MERGE;
1466 } 1466 }
1467 1467
1468 static void cfq_merged_request(struct request_queue *q, struct request *req, 1468 static void cfq_merged_request(struct request_queue *q, struct request *req,
1469 int type) 1469 int type)
1470 { 1470 {
1471 if (type == ELEVATOR_FRONT_MERGE) { 1471 if (type == ELEVATOR_FRONT_MERGE) {
1472 struct cfq_queue *cfqq = RQ_CFQQ(req); 1472 struct cfq_queue *cfqq = RQ_CFQQ(req);
1473 1473
1474 cfq_reposition_rq_rb(cfqq, req); 1474 cfq_reposition_rq_rb(cfqq, req);
1475 } 1475 }
1476 } 1476 }
1477 1477
1478 static void 1478 static void
1479 cfq_merged_requests(struct request_queue *q, struct request *rq, 1479 cfq_merged_requests(struct request_queue *q, struct request *rq,
1480 struct request *next) 1480 struct request *next)
1481 { 1481 {
1482 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1482 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1483 /* 1483 /*
1484 * reposition in fifo if next is older than rq 1484 * reposition in fifo if next is older than rq
1485 */ 1485 */
1486 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && 1486 if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) &&
1487 time_before(rq_fifo_time(next), rq_fifo_time(rq))) { 1487 time_before(rq_fifo_time(next), rq_fifo_time(rq))) {
1488 list_move(&rq->queuelist, &next->queuelist); 1488 list_move(&rq->queuelist, &next->queuelist);
1489 rq_set_fifo_time(rq, rq_fifo_time(next)); 1489 rq_set_fifo_time(rq, rq_fifo_time(next));
1490 } 1490 }
1491 1491
1492 if (cfqq->next_rq == next) 1492 if (cfqq->next_rq == next)
1493 cfqq->next_rq = rq; 1493 cfqq->next_rq = rq;
1494 cfq_remove_request(next); 1494 cfq_remove_request(next);
1495 } 1495 }
1496 1496
1497 static int cfq_allow_merge(struct request_queue *q, struct request *rq, 1497 static int cfq_allow_merge(struct request_queue *q, struct request *rq,
1498 struct bio *bio) 1498 struct bio *bio)
1499 { 1499 {
1500 struct cfq_data *cfqd = q->elevator->elevator_data; 1500 struct cfq_data *cfqd = q->elevator->elevator_data;
1501 struct cfq_io_context *cic; 1501 struct cfq_io_context *cic;
1502 struct cfq_queue *cfqq; 1502 struct cfq_queue *cfqq;
1503 1503
1504 /* 1504 /*
1505 * Disallow merge of a sync bio into an async request. 1505 * Disallow merge of a sync bio into an async request.
1506 */ 1506 */
1507 if (cfq_bio_sync(bio) && !rq_is_sync(rq)) 1507 if (cfq_bio_sync(bio) && !rq_is_sync(rq))
1508 return false; 1508 return false;
1509 1509
1510 /* 1510 /*
1511 * Lookup the cfqq that this bio will be queued with. Allow 1511 * Lookup the cfqq that this bio will be queued with. Allow
1512 * merge only if rq is queued there. 1512 * merge only if rq is queued there.
1513 */ 1513 */
1514 cic = cfq_cic_lookup(cfqd, current->io_context); 1514 cic = cfq_cic_lookup(cfqd, current->io_context);
1515 if (!cic) 1515 if (!cic)
1516 return false; 1516 return false;
1517 1517
1518 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio)); 1518 cfqq = cic_to_cfqq(cic, cfq_bio_sync(bio));
1519 return cfqq == RQ_CFQQ(rq); 1519 return cfqq == RQ_CFQQ(rq);
1520 } 1520 }
1521 1521
1522 static void __cfq_set_active_queue(struct cfq_data *cfqd, 1522 static void __cfq_set_active_queue(struct cfq_data *cfqd,
1523 struct cfq_queue *cfqq) 1523 struct cfq_queue *cfqq)
1524 { 1524 {
1525 if (cfqq) { 1525 if (cfqq) {
1526 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d", 1526 cfq_log_cfqq(cfqd, cfqq, "set_active wl_prio:%d wl_type:%d",
1527 cfqd->serving_prio, cfqd->serving_type); 1527 cfqd->serving_prio, cfqd->serving_type);
1528 cfqq->slice_start = 0; 1528 cfqq->slice_start = 0;
1529 cfqq->dispatch_start = jiffies; 1529 cfqq->dispatch_start = jiffies;
1530 cfqq->allocated_slice = 0; 1530 cfqq->allocated_slice = 0;
1531 cfqq->slice_end = 0; 1531 cfqq->slice_end = 0;
1532 cfqq->slice_dispatch = 0; 1532 cfqq->slice_dispatch = 0;
1533 cfqq->nr_sectors = 0; 1533 cfqq->nr_sectors = 0;
1534 1534
1535 cfq_clear_cfqq_wait_request(cfqq); 1535 cfq_clear_cfqq_wait_request(cfqq);
1536 cfq_clear_cfqq_must_dispatch(cfqq); 1536 cfq_clear_cfqq_must_dispatch(cfqq);
1537 cfq_clear_cfqq_must_alloc_slice(cfqq); 1537 cfq_clear_cfqq_must_alloc_slice(cfqq);
1538 cfq_clear_cfqq_fifo_expire(cfqq); 1538 cfq_clear_cfqq_fifo_expire(cfqq);
1539 cfq_mark_cfqq_slice_new(cfqq); 1539 cfq_mark_cfqq_slice_new(cfqq);
1540 1540
1541 del_timer(&cfqd->idle_slice_timer); 1541 del_timer(&cfqd->idle_slice_timer);
1542 } 1542 }
1543 1543
1544 cfqd->active_queue = cfqq; 1544 cfqd->active_queue = cfqq;
1545 } 1545 }
1546 1546
1547 /* 1547 /*
1548 * current cfqq expired its slice (or was too idle), select new one 1548 * current cfqq expired its slice (or was too idle), select new one
1549 */ 1549 */
1550 static void 1550 static void
1551 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1551 __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1552 bool timed_out) 1552 bool timed_out)
1553 { 1553 {
1554 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out); 1554 cfq_log_cfqq(cfqd, cfqq, "slice expired t=%d", timed_out);
1555 1555
1556 if (cfq_cfqq_wait_request(cfqq)) 1556 if (cfq_cfqq_wait_request(cfqq))
1557 del_timer(&cfqd->idle_slice_timer); 1557 del_timer(&cfqd->idle_slice_timer);
1558 1558
1559 cfq_clear_cfqq_wait_request(cfqq); 1559 cfq_clear_cfqq_wait_request(cfqq);
1560 cfq_clear_cfqq_wait_busy(cfqq); 1560 cfq_clear_cfqq_wait_busy(cfqq);
1561 1561
1562 /* 1562 /*
1563 * If this cfqq is shared between multiple processes, check to 1563 * If this cfqq is shared between multiple processes, check to
1564 * make sure that those processes are still issuing I/Os within 1564 * make sure that those processes are still issuing I/Os within
1565 * the mean seek distance. If not, it may be time to break the 1565 * the mean seek distance. If not, it may be time to break the
1566 * queues apart again. 1566 * queues apart again.
1567 */ 1567 */
1568 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) 1568 if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
1569 cfq_mark_cfqq_split_coop(cfqq); 1569 cfq_mark_cfqq_split_coop(cfqq);
1570 1570
1571 /* 1571 /*
1572 * store what was left of this slice, if the queue idled/timed out 1572 * store what was left of this slice, if the queue idled/timed out
1573 */ 1573 */
1574 if (timed_out && !cfq_cfqq_slice_new(cfqq)) { 1574 if (timed_out && !cfq_cfqq_slice_new(cfqq)) {
1575 cfqq->slice_resid = cfqq->slice_end - jiffies; 1575 cfqq->slice_resid = cfqq->slice_end - jiffies;
1576 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid); 1576 cfq_log_cfqq(cfqd, cfqq, "resid=%ld", cfqq->slice_resid);
1577 } 1577 }
1578 1578
1579 cfq_group_served(cfqd, cfqq->cfqg, cfqq); 1579 cfq_group_served(cfqd, cfqq->cfqg, cfqq);
1580 1580
1581 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list)) 1581 if (cfq_cfqq_on_rr(cfqq) && RB_EMPTY_ROOT(&cfqq->sort_list))
1582 cfq_del_cfqq_rr(cfqd, cfqq); 1582 cfq_del_cfqq_rr(cfqd, cfqq);
1583 1583
1584 cfq_resort_rr_list(cfqd, cfqq); 1584 cfq_resort_rr_list(cfqd, cfqq);
1585 1585
1586 if (cfqq == cfqd->active_queue) 1586 if (cfqq == cfqd->active_queue)
1587 cfqd->active_queue = NULL; 1587 cfqd->active_queue = NULL;
1588 1588
1589 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active) 1589 if (&cfqq->cfqg->rb_node == cfqd->grp_service_tree.active)
1590 cfqd->grp_service_tree.active = NULL; 1590 cfqd->grp_service_tree.active = NULL;
1591 1591
1592 if (cfqd->active_cic) { 1592 if (cfqd->active_cic) {
1593 put_io_context(cfqd->active_cic->ioc); 1593 put_io_context(cfqd->active_cic->ioc);
1594 cfqd->active_cic = NULL; 1594 cfqd->active_cic = NULL;
1595 } 1595 }
1596 } 1596 }
1597 1597
1598 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out) 1598 static inline void cfq_slice_expired(struct cfq_data *cfqd, bool timed_out)
1599 { 1599 {
1600 struct cfq_queue *cfqq = cfqd->active_queue; 1600 struct cfq_queue *cfqq = cfqd->active_queue;
1601 1601
1602 if (cfqq) 1602 if (cfqq)
1603 __cfq_slice_expired(cfqd, cfqq, timed_out); 1603 __cfq_slice_expired(cfqd, cfqq, timed_out);
1604 } 1604 }
1605 1605
1606 /* 1606 /*
1607 * Get next queue for service. Unless we have a queue preemption, 1607 * Get next queue for service. Unless we have a queue preemption,
1608 * we'll simply select the first cfqq in the service tree. 1608 * we'll simply select the first cfqq in the service tree.
1609 */ 1609 */
1610 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd) 1610 static struct cfq_queue *cfq_get_next_queue(struct cfq_data *cfqd)
1611 { 1611 {
1612 struct cfq_rb_root *service_tree = 1612 struct cfq_rb_root *service_tree =
1613 service_tree_for(cfqd->serving_group, cfqd->serving_prio, 1613 service_tree_for(cfqd->serving_group, cfqd->serving_prio,
1614 cfqd->serving_type); 1614 cfqd->serving_type);
1615 1615
1616 if (!cfqd->rq_queued) 1616 if (!cfqd->rq_queued)
1617 return NULL; 1617 return NULL;
1618 1618
1619 /* There is nothing to dispatch */ 1619 /* There is nothing to dispatch */
1620 if (!service_tree) 1620 if (!service_tree)
1621 return NULL; 1621 return NULL;
1622 if (RB_EMPTY_ROOT(&service_tree->rb)) 1622 if (RB_EMPTY_ROOT(&service_tree->rb))
1623 return NULL; 1623 return NULL;
1624 return cfq_rb_first(service_tree); 1624 return cfq_rb_first(service_tree);
1625 } 1625 }
1626 1626
1627 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd) 1627 static struct cfq_queue *cfq_get_next_queue_forced(struct cfq_data *cfqd)
1628 { 1628 {
1629 struct cfq_group *cfqg; 1629 struct cfq_group *cfqg;
1630 struct cfq_queue *cfqq; 1630 struct cfq_queue *cfqq;
1631 int i, j; 1631 int i, j;
1632 struct cfq_rb_root *st; 1632 struct cfq_rb_root *st;
1633 1633
1634 if (!cfqd->rq_queued) 1634 if (!cfqd->rq_queued)
1635 return NULL; 1635 return NULL;
1636 1636
1637 cfqg = cfq_get_next_cfqg(cfqd); 1637 cfqg = cfq_get_next_cfqg(cfqd);
1638 if (!cfqg) 1638 if (!cfqg)
1639 return NULL; 1639 return NULL;
1640 1640
1641 for_each_cfqg_st(cfqg, i, j, st) 1641 for_each_cfqg_st(cfqg, i, j, st)
1642 if ((cfqq = cfq_rb_first(st)) != NULL) 1642 if ((cfqq = cfq_rb_first(st)) != NULL)
1643 return cfqq; 1643 return cfqq;
1644 return NULL; 1644 return NULL;
1645 } 1645 }
1646 1646
1647 /* 1647 /*
1648 * Get and set a new active queue for service. 1648 * Get and set a new active queue for service.
1649 */ 1649 */
1650 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, 1650 static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd,
1651 struct cfq_queue *cfqq) 1651 struct cfq_queue *cfqq)
1652 { 1652 {
1653 if (!cfqq) 1653 if (!cfqq)
1654 cfqq = cfq_get_next_queue(cfqd); 1654 cfqq = cfq_get_next_queue(cfqd);
1655 1655
1656 __cfq_set_active_queue(cfqd, cfqq); 1656 __cfq_set_active_queue(cfqd, cfqq);
1657 return cfqq; 1657 return cfqq;
1658 } 1658 }
1659 1659
1660 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, 1660 static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
1661 struct request *rq) 1661 struct request *rq)
1662 { 1662 {
1663 if (blk_rq_pos(rq) >= cfqd->last_position) 1663 if (blk_rq_pos(rq) >= cfqd->last_position)
1664 return blk_rq_pos(rq) - cfqd->last_position; 1664 return blk_rq_pos(rq) - cfqd->last_position;
1665 else 1665 else
1666 return cfqd->last_position - blk_rq_pos(rq); 1666 return cfqd->last_position - blk_rq_pos(rq);
1667 } 1667 }
1668 1668
1669 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, 1669 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
1670 struct request *rq) 1670 struct request *rq)
1671 { 1671 {
1672 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR; 1672 return cfq_dist_from_last(cfqd, rq) <= CFQQ_CLOSE_THR;
1673 } 1673 }
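The two helpers above reduce to an absolute sector distance compared against a fixed threshold. Restated on its own, with CLOSE_THR as an assumed stand-in value for CFQQ_CLOSE_THR (defined elsewhere in this file):

#include <stdio.h>

#define CLOSE_THR 8192ULL	/* stand-in for CFQQ_CLOSE_THR */

static unsigned long long dist_from_last(unsigned long long last,
					 unsigned long long pos)
{
	return pos >= last ? pos - last : last - pos;
}

int main(void)
{
	unsigned long long last = 1000000;

	printf("4096 sectors away -> close? %d\n",
	       dist_from_last(last, last + 4096) <= CLOSE_THR);
	printf("1000000 sectors away -> close? %d\n",
	       dist_from_last(last, last + 1000000) <= CLOSE_THR);
	return 0;
}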
1674 1674
1675 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, 1675 static struct cfq_queue *cfqq_close(struct cfq_data *cfqd,
1676 struct cfq_queue *cur_cfqq) 1676 struct cfq_queue *cur_cfqq)
1677 { 1677 {
1678 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio]; 1678 struct rb_root *root = &cfqd->prio_trees[cur_cfqq->org_ioprio];
1679 struct rb_node *parent, *node; 1679 struct rb_node *parent, *node;
1680 struct cfq_queue *__cfqq; 1680 struct cfq_queue *__cfqq;
1681 sector_t sector = cfqd->last_position; 1681 sector_t sector = cfqd->last_position;
1682 1682
1683 if (RB_EMPTY_ROOT(root)) 1683 if (RB_EMPTY_ROOT(root))
1684 return NULL; 1684 return NULL;
1685 1685
1686 /* 1686 /*
1687 * First, if we find a request starting at the end of the last 1687 * First, if we find a request starting at the end of the last
1688 * request, choose it. 1688 * request, choose it.
1689 */ 1689 */
1690 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL); 1690 __cfqq = cfq_prio_tree_lookup(cfqd, root, sector, &parent, NULL);
1691 if (__cfqq) 1691 if (__cfqq)
1692 return __cfqq; 1692 return __cfqq;
1693 1693
1694 /* 1694 /*
1695 * If the exact sector wasn't found, the parent of the NULL leaf 1695 * If the exact sector wasn't found, the parent of the NULL leaf
1696 * will contain the closest sector. 1696 * will contain the closest sector.
1697 */ 1697 */
1698 __cfqq = rb_entry(parent, struct cfq_queue, p_node); 1698 __cfqq = rb_entry(parent, struct cfq_queue, p_node);
1699 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1699 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1700 return __cfqq; 1700 return __cfqq;
1701 1701
1702 if (blk_rq_pos(__cfqq->next_rq) < sector) 1702 if (blk_rq_pos(__cfqq->next_rq) < sector)
1703 node = rb_next(&__cfqq->p_node); 1703 node = rb_next(&__cfqq->p_node);
1704 else 1704 else
1705 node = rb_prev(&__cfqq->p_node); 1705 node = rb_prev(&__cfqq->p_node);
1706 if (!node) 1706 if (!node)
1707 return NULL; 1707 return NULL;
1708 1708
1709 __cfqq = rb_entry(node, struct cfq_queue, p_node); 1709 __cfqq = rb_entry(node, struct cfq_queue, p_node);
1710 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq)) 1710 if (cfq_rq_close(cfqd, cur_cfqq, __cfqq->next_rq))
1711 return __cfqq; 1711 return __cfqq;
1712 1712
1713 return NULL; 1713 return NULL;
1714 } 1714 }
1715 1715
1716 /* 1716 /*
1717 * cfqd - obvious 1717 * cfqd - obvious
1718 * cur_cfqq - passed in so that we don't decide that the current queue is 1718 * cur_cfqq - passed in so that we don't decide that the current queue is
1719 * closely cooperating with itself. 1719 * closely cooperating with itself.
1720 * 1720 *
1721 * So, basically we're assuming that cur_cfqq has dispatched at least 1721 * So, basically we're assuming that cur_cfqq has dispatched at least
1722 * one request, and that cfqd->last_position reflects a position on the disk 1722 * one request, and that cfqd->last_position reflects a position on the disk
1723 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid 1723 * associated with the I/O issued by cur_cfqq. I'm not sure this is a valid
1724 * assumption. 1724 * assumption.
1725 */ 1725 */
1726 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd, 1726 static struct cfq_queue *cfq_close_cooperator(struct cfq_data *cfqd,
1727 struct cfq_queue *cur_cfqq) 1727 struct cfq_queue *cur_cfqq)
1728 { 1728 {
1729 struct cfq_queue *cfqq; 1729 struct cfq_queue *cfqq;
1730 1730
1731 if (cfq_class_idle(cur_cfqq)) 1731 if (cfq_class_idle(cur_cfqq))
1732 return NULL; 1732 return NULL;
1733 if (!cfq_cfqq_sync(cur_cfqq)) 1733 if (!cfq_cfqq_sync(cur_cfqq))
1734 return NULL; 1734 return NULL;
1735 if (CFQQ_SEEKY(cur_cfqq)) 1735 if (CFQQ_SEEKY(cur_cfqq))
1736 return NULL; 1736 return NULL;
1737 1737
1738 /* 1738 /*
1739 * Don't search priority tree if it's the only queue in the group. 1739 * Don't search priority tree if it's the only queue in the group.
1740 */ 1740 */
1741 if (cur_cfqq->cfqg->nr_cfqq == 1) 1741 if (cur_cfqq->cfqg->nr_cfqq == 1)
1742 return NULL; 1742 return NULL;
1743 1743
1744 /* 1744 /*
1745 * We should notice if some of the queues are cooperating, eg 1745 * We should notice if some of the queues are cooperating, eg
1746 * working closely on the same area of the disk. In that case, 1746 * working closely on the same area of the disk. In that case,
1747 * we can group them together and don't waste time idling. 1747 * we can group them together and don't waste time idling.
1748 */ 1748 */
1749 cfqq = cfqq_close(cfqd, cur_cfqq); 1749 cfqq = cfqq_close(cfqd, cur_cfqq);
1750 if (!cfqq) 1750 if (!cfqq)
1751 return NULL; 1751 return NULL;
1752 1752
1753 /* If new queue belongs to different cfq_group, don't choose it */ 1753 /* If new queue belongs to different cfq_group, don't choose it */
1754 if (cur_cfqq->cfqg != cfqq->cfqg) 1754 if (cur_cfqq->cfqg != cfqq->cfqg)
1755 return NULL; 1755 return NULL;
1756 1756
1757 /* 1757 /*
1758 * It only makes sense to merge sync queues. 1758 * It only makes sense to merge sync queues.
1759 */ 1759 */
1760 if (!cfq_cfqq_sync(cfqq)) 1760 if (!cfq_cfqq_sync(cfqq))
1761 return NULL; 1761 return NULL;
1762 if (CFQQ_SEEKY(cfqq)) 1762 if (CFQQ_SEEKY(cfqq))
1763 return NULL; 1763 return NULL;
1764 1764
1765 /* 1765 /*
1766 * Do not merge queues of different priority classes 1766 * Do not merge queues of different priority classes
1767 */ 1767 */
1768 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq)) 1768 if (cfq_class_rt(cfqq) != cfq_class_rt(cur_cfqq))
1769 return NULL; 1769 return NULL;
1770 1770
1771 return cfqq; 1771 return cfqq;
1772 } 1772 }
1773 1773
1774 /* 1774 /*
1775 * Determine whether we should enforce idle window for this queue. 1775 * Determine whether we should enforce idle window for this queue.
1776 */ 1776 */
1777 1777
1778 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1778 static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1779 { 1779 {
1780 enum wl_prio_t prio = cfqq_prio(cfqq); 1780 enum wl_prio_t prio = cfqq_prio(cfqq);
1781 struct cfq_rb_root *service_tree = cfqq->service_tree; 1781 struct cfq_rb_root *service_tree = cfqq->service_tree;
1782 1782
1783 BUG_ON(!service_tree); 1783 BUG_ON(!service_tree);
1784 BUG_ON(!service_tree->count); 1784 BUG_ON(!service_tree->count);
1785 1785
1786 /* We never do so for idle class queues. */ 1786 /* We never do so for idle class queues. */
1787 if (prio == IDLE_WORKLOAD) 1787 if (prio == IDLE_WORKLOAD)
1788 return false; 1788 return false;
1789 1789
1790 /* We do for queues that were marked with idle window flag. */ 1790 /* We do for queues that were marked with idle window flag. */
1791 if (cfq_cfqq_idle_window(cfqq) && 1791 if (cfq_cfqq_idle_window(cfqq) &&
1792 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)) 1792 !(blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag))
1793 return true; 1793 return true;
1794 1794
1795 /* 1795 /*
1796 * Otherwise, we do so only if they are the last ones 1796 * Otherwise, we do so only if they are the last ones
1797 * in their service tree. 1797 * in their service tree.
1798 */ 1798 */
1799 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq)) 1799 if (service_tree->count == 1 && cfq_cfqq_sync(cfqq))
1800 return true; 1800 return true;
1801 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d", 1801 cfq_log_cfqq(cfqd, cfqq, "Not idling. st->count:%d",
1802 service_tree->count); 1802 service_tree->count);
1803 return false; 1803 return false;
1804 } 1804 }
1805 1805
1806 static void cfq_arm_slice_timer(struct cfq_data *cfqd) 1806 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
1807 { 1807 {
1808 struct cfq_queue *cfqq = cfqd->active_queue; 1808 struct cfq_queue *cfqq = cfqd->active_queue;
1809 struct cfq_io_context *cic; 1809 struct cfq_io_context *cic;
1810 unsigned long sl; 1810 unsigned long sl;
1811 1811
1812 /* 1812 /*
1813 * SSD device without seek penalty, disable idling. But only do so 1813 * SSD device without seek penalty, disable idling. But only do so
1814 * for devices that support queuing, otherwise we still have a problem 1814 * for devices that support queuing, otherwise we still have a problem
1815 * with sync vs async workloads. 1815 * with sync vs async workloads.
1816 */ 1816 */
1817 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag) 1817 if (blk_queue_nonrot(cfqd->queue) && cfqd->hw_tag)
1818 return; 1818 return;
1819 1819
1820 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list)); 1820 WARN_ON(!RB_EMPTY_ROOT(&cfqq->sort_list));
1821 WARN_ON(cfq_cfqq_slice_new(cfqq)); 1821 WARN_ON(cfq_cfqq_slice_new(cfqq));
1822 1822
1823 /* 1823 /*
1824 * idle is disabled, either manually or by past process history 1824 * idle is disabled, either manually or by past process history
1825 */ 1825 */
1826 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq)) 1826 if (!cfqd->cfq_slice_idle || !cfq_should_idle(cfqd, cfqq))
1827 return; 1827 return;
1828 1828
1829 /* 1829 /*
1830 * still active requests from this queue, don't idle 1830 * still active requests from this queue, don't idle
1831 */ 1831 */
1832 if (cfqq->dispatched) 1832 if (cfqq->dispatched)
1833 return; 1833 return;
1834 1834
1835 /* 1835 /*
1836 * task has exited, don't wait 1836 * task has exited, don't wait
1837 */ 1837 */
1838 cic = cfqd->active_cic; 1838 cic = cfqd->active_cic;
1839 if (!cic || !atomic_read(&cic->ioc->nr_tasks)) 1839 if (!cic || !atomic_read(&cic->ioc->nr_tasks))
1840 return; 1840 return;
1841 1841
1842 /* 1842 /*
1843 * If our average think time is larger than the remaining time 1843 * If our average think time is larger than the remaining time
1844 * slice, then don't idle. This avoids overrunning the allotted 1844 * slice, then don't idle. This avoids overrunning the allotted
1845 * time slice. 1845 * time slice.
1846 */ 1846 */
1847 if (sample_valid(cic->ttime_samples) && 1847 if (sample_valid(cic->ttime_samples) &&
1848 (cfqq->slice_end - jiffies < cic->ttime_mean)) { 1848 (cfqq->slice_end - jiffies < cic->ttime_mean)) {
1849 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d", 1849 cfq_log_cfqq(cfqd, cfqq, "Not idling. think_time:%d",
1850 cic->ttime_mean); 1850 cic->ttime_mean);
1851 return; 1851 return;
1852 } 1852 }
1853 1853
1854 cfq_mark_cfqq_wait_request(cfqq); 1854 cfq_mark_cfqq_wait_request(cfqq);
1855 1855
1856 sl = cfqd->cfq_slice_idle; 1856 sl = cfqd->cfq_slice_idle;
1857 1857
1858 mod_timer(&cfqd->idle_slice_timer, jiffies + sl); 1858 mod_timer(&cfqd->idle_slice_timer, jiffies + sl);
1859 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl); 1859 cfq_log_cfqq(cfqd, cfqq, "arm_idle: %lu", sl);
1860 } 1860 }
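The think-time comparison above captures the whole idling trade-off: arming the timer only pays off when the task usually returns with more I/O before its slice ends. A self-contained restatement with invented numbers:

#include <stdbool.h>
#include <stdio.h>

/* Same comparison as in cfq_arm_slice_timer(): do not idle when the
 * mean think time exceeds the time remaining in the slice. */
static bool worth_idling(unsigned long slice_end, unsigned long now,
			 unsigned long ttime_mean)
{
	return slice_end - now >= ttime_mean;
}

int main(void)
{
	/* 8 jiffies left, task usually thinks for 12: skip idling */
	printf("%d\n", worth_idling(1008, 1000, 12));
	/* 8 jiffies left, task usually thinks for 3: idling may pay off */
	printf("%d\n", worth_idling(1008, 1000, 3));
	return 0;
}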
1861 1861
1862 /* 1862 /*
1863 * Move request from internal lists to the request queue dispatch list. 1863 * Move request from internal lists to the request queue dispatch list.
1864 */ 1864 */
1865 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq) 1865 static void cfq_dispatch_insert(struct request_queue *q, struct request *rq)
1866 { 1866 {
1867 struct cfq_data *cfqd = q->elevator->elevator_data; 1867 struct cfq_data *cfqd = q->elevator->elevator_data;
1868 struct cfq_queue *cfqq = RQ_CFQQ(rq); 1868 struct cfq_queue *cfqq = RQ_CFQQ(rq);
1869 1869
1870 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert"); 1870 cfq_log_cfqq(cfqd, cfqq, "dispatch_insert");
1871 1871
1872 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq); 1872 cfqq->next_rq = cfq_find_next_rq(cfqd, cfqq, rq);
1873 cfq_remove_request(rq); 1873 cfq_remove_request(rq);
1874 cfqq->dispatched++; 1874 cfqq->dispatched++;
1875 elv_dispatch_sort(q, rq); 1875 elv_dispatch_sort(q, rq);
1876 1876
1877 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++; 1877 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]++;
1878 cfqq->nr_sectors += blk_rq_sectors(rq); 1878 cfqq->nr_sectors += blk_rq_sectors(rq);
1879 } 1879 }
1880 1880
1881 /* 1881 /*
1882 * return expired entry, or NULL to just start from scratch in rbtree 1882 * return expired entry, or NULL to just start from scratch in rbtree
1883 */ 1883 */
1884 static struct request *cfq_check_fifo(struct cfq_queue *cfqq) 1884 static struct request *cfq_check_fifo(struct cfq_queue *cfqq)
1885 { 1885 {
1886 struct request *rq = NULL; 1886 struct request *rq = NULL;
1887 1887
1888 if (cfq_cfqq_fifo_expire(cfqq)) 1888 if (cfq_cfqq_fifo_expire(cfqq))
1889 return NULL; 1889 return NULL;
1890 1890
1891 cfq_mark_cfqq_fifo_expire(cfqq); 1891 cfq_mark_cfqq_fifo_expire(cfqq);
1892 1892
1893 if (list_empty(&cfqq->fifo)) 1893 if (list_empty(&cfqq->fifo))
1894 return NULL; 1894 return NULL;
1895 1895
1896 rq = rq_entry_fifo(cfqq->fifo.next); 1896 rq = rq_entry_fifo(cfqq->fifo.next);
1897 if (time_before(jiffies, rq_fifo_time(rq))) 1897 if (time_before(jiffies, rq_fifo_time(rq)))
1898 rq = NULL; 1898 rq = NULL;
1899 1899
1900 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); 1900 cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq);
1901 return rq; 1901 return rq;
1902 } 1902 }
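The FIFO check above hands a request back only once its per-request deadline has passed; otherwise the caller falls back to rb-tree order. In isolation, with illustrative values:

#include <stdio.h>

/* Mirrors the time_before(jiffies, rq_fifo_time(rq)) test above. */
static int fifo_expired(unsigned long now, unsigned long deadline)
{
	return !(now < deadline);
}

int main(void)
{
	printf("now=100 deadline=125 -> %d\n", fifo_expired(100, 125)); /* 0 */
	printf("now=130 deadline=125 -> %d\n", fifo_expired(130, 125)); /* 1 */
	return 0;
}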
1903 1903
1904 static inline int 1904 static inline int
1905 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 1905 cfq_prio_to_maxrq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
1906 { 1906 {
1907 const int base_rq = cfqd->cfq_slice_async_rq; 1907 const int base_rq = cfqd->cfq_slice_async_rq;
1908 1908
1909 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR); 1909 WARN_ON(cfqq->ioprio >= IOPRIO_BE_NR);
1910 1910
1911 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio)); 1911 return 2 * (base_rq + base_rq * (CFQ_PRIO_LISTS - 1 - cfqq->ioprio));
1912 } 1912 }
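To make the depth formula above concrete: with the default cfq_slice_async_rq of 2, and assuming CFQ_PRIO_LISTS is 8 (IOPRIO_BE_NR), the allowed async depth runs from 4 requests at the lowest best-effort priority up to 32 at the highest. A quick worked check:

#include <stdio.h>

#define PRIO_LISTS 8	/* assumed value of CFQ_PRIO_LISTS (IOPRIO_BE_NR) */

static int max_rq(int base_rq, int ioprio)
{
	return 2 * (base_rq + base_rq * (PRIO_LISTS - 1 - ioprio));
}

int main(void)
{
	printf("ioprio 0 -> %d requests\n", max_rq(2, 0));	/* 32 */
	printf("ioprio 4 -> %d requests\n", max_rq(2, 4));	/* 16 */
	printf("ioprio 7 -> %d requests\n", max_rq(2, 7));	/*  4 */
	return 0;
}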
1913 1913
1914 /* 1914 /*
1915 * Must be called with the queue_lock held. 1915 * Must be called with the queue_lock held.
1916 */ 1916 */
1917 static int cfqq_process_refs(struct cfq_queue *cfqq) 1917 static int cfqq_process_refs(struct cfq_queue *cfqq)
1918 { 1918 {
1919 int process_refs, io_refs; 1919 int process_refs, io_refs;
1920 1920
1921 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE]; 1921 io_refs = cfqq->allocated[READ] + cfqq->allocated[WRITE];
1922 process_refs = atomic_read(&cfqq->ref) - io_refs; 1922 process_refs = atomic_read(&cfqq->ref) - io_refs;
1923 BUG_ON(process_refs < 0); 1923 BUG_ON(process_refs < 0);
1924 return process_refs; 1924 return process_refs;
1925 } 1925 }
1926 1926
1927 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq) 1927 static void cfq_setup_merge(struct cfq_queue *cfqq, struct cfq_queue *new_cfqq)
1928 { 1928 {
1929 int process_refs, new_process_refs; 1929 int process_refs, new_process_refs;
1930 struct cfq_queue *__cfqq; 1930 struct cfq_queue *__cfqq;
1931 1931
1932 /* Avoid a circular list and skip interim queue merges */ 1932 /* Avoid a circular list and skip interim queue merges */
1933 while ((__cfqq = new_cfqq->new_cfqq)) { 1933 while ((__cfqq = new_cfqq->new_cfqq)) {
1934 if (__cfqq == cfqq) 1934 if (__cfqq == cfqq)
1935 return; 1935 return;
1936 new_cfqq = __cfqq; 1936 new_cfqq = __cfqq;
1937 } 1937 }
1938 1938
1939 process_refs = cfqq_process_refs(cfqq); 1939 process_refs = cfqq_process_refs(cfqq);
1940 /* 1940 /*
1941 * If the process for the cfqq has gone away, there is no 1941 * If the process for the cfqq has gone away, there is no
1942 * sense in merging the queues. 1942 * sense in merging the queues.
1943 */ 1943 */
1944 if (process_refs == 0) 1944 if (process_refs == 0)
1945 return; 1945 return;
1946 1946
1947 /* 1947 /*
1948 * Merge in the direction of the lesser amount of work. 1948 * Merge in the direction of the lesser amount of work.
1949 */ 1949 */
1950 new_process_refs = cfqq_process_refs(new_cfqq); 1950 new_process_refs = cfqq_process_refs(new_cfqq);
1951 if (new_process_refs >= process_refs) { 1951 if (new_process_refs >= process_refs) {
1952 cfqq->new_cfqq = new_cfqq; 1952 cfqq->new_cfqq = new_cfqq;
1953 atomic_add(process_refs, &new_cfqq->ref); 1953 atomic_add(process_refs, &new_cfqq->ref);
1954 } else { 1954 } else {
1955 new_cfqq->new_cfqq = cfqq; 1955 new_cfqq->new_cfqq = cfqq;
1956 atomic_add(new_process_refs, &cfqq->ref); 1956 atomic_add(new_process_refs, &cfqq->ref);
1957 } 1957 }
1958 } 1958 }
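/*
 * In effect, the queue with fewer process references is pointed at the
 * other one through ->new_cfqq, and the target queue picks up that many
 * extra references so it cannot go away while the merge is pending. The
 * loop at the top walks to the end of an existing merge chain and bails
 * out if following it would create a cycle; the merge itself is
 * completed later (see cfq_merge_cfqqs, not shown in this hunk).
 */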
1959 1959
1960 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd, 1960 static enum wl_type_t cfq_choose_wl(struct cfq_data *cfqd,
1961 struct cfq_group *cfqg, enum wl_prio_t prio) 1961 struct cfq_group *cfqg, enum wl_prio_t prio)
1962 { 1962 {
1963 struct cfq_queue *queue; 1963 struct cfq_queue *queue;
1964 int i; 1964 int i;
1965 bool key_valid = false; 1965 bool key_valid = false;
1966 unsigned long lowest_key = 0; 1966 unsigned long lowest_key = 0;
1967 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD; 1967 enum wl_type_t cur_best = SYNC_NOIDLE_WORKLOAD;
1968 1968
1969 for (i = 0; i <= SYNC_WORKLOAD; ++i) { 1969 for (i = 0; i <= SYNC_WORKLOAD; ++i) {
1970 /* select the one with lowest rb_key */ 1970 /* select the one with lowest rb_key */
1971 queue = cfq_rb_first(service_tree_for(cfqg, prio, i)); 1971 queue = cfq_rb_first(service_tree_for(cfqg, prio, i));
1972 if (queue && 1972 if (queue &&
1973 (!key_valid || time_before(queue->rb_key, lowest_key))) { 1973 (!key_valid || time_before(queue->rb_key, lowest_key))) {
1974 lowest_key = queue->rb_key; 1974 lowest_key = queue->rb_key;
1975 cur_best = i; 1975 cur_best = i;
1976 key_valid = true; 1976 key_valid = true;
1977 } 1977 }
1978 } 1978 }
1979 1979
1980 return cur_best; 1980 return cur_best;
1981 } 1981 }
1982 1982
1983 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg) 1983 static void choose_service_tree(struct cfq_data *cfqd, struct cfq_group *cfqg)
1984 { 1984 {
1985 unsigned slice; 1985 unsigned slice;
1986 unsigned count; 1986 unsigned count;
1987 struct cfq_rb_root *st; 1987 struct cfq_rb_root *st;
1988 unsigned group_slice; 1988 unsigned group_slice;
1989 1989
1990 if (!cfqg) { 1990 if (!cfqg) {
1991 cfqd->serving_prio = IDLE_WORKLOAD; 1991 cfqd->serving_prio = IDLE_WORKLOAD;
1992 cfqd->workload_expires = jiffies + 1; 1992 cfqd->workload_expires = jiffies + 1;
1993 return; 1993 return;
1994 } 1994 }
1995 1995
1996 /* Choose next priority. RT > BE > IDLE */ 1996 /* Choose next priority. RT > BE > IDLE */
1997 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg)) 1997 if (cfq_group_busy_queues_wl(RT_WORKLOAD, cfqd, cfqg))
1998 cfqd->serving_prio = RT_WORKLOAD; 1998 cfqd->serving_prio = RT_WORKLOAD;
1999 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg)) 1999 else if (cfq_group_busy_queues_wl(BE_WORKLOAD, cfqd, cfqg))
2000 cfqd->serving_prio = BE_WORKLOAD; 2000 cfqd->serving_prio = BE_WORKLOAD;
2001 else { 2001 else {
2002 cfqd->serving_prio = IDLE_WORKLOAD; 2002 cfqd->serving_prio = IDLE_WORKLOAD;
2003 cfqd->workload_expires = jiffies + 1; 2003 cfqd->workload_expires = jiffies + 1;
2004 return; 2004 return;
2005 } 2005 }
2006 2006
2007 /* 2007 /*
2008 * For RT and BE, we have to choose also the type 2008 * For RT and BE, we have to choose also the type
2009 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload 2009 * (SYNC, SYNC_NOIDLE, ASYNC), and to compute a workload
2010 * expiration time 2010 * expiration time
2011 */ 2011 */
2012 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2012 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2013 count = st->count; 2013 count = st->count;
2014 2014
2015 /* 2015 /*
2016 * check workload expiration, and that we still have other queues ready 2016 * check workload expiration, and that we still have other queues ready
2017 */ 2017 */
2018 if (count && !time_after(jiffies, cfqd->workload_expires)) 2018 if (count && !time_after(jiffies, cfqd->workload_expires))
2019 return; 2019 return;
2020 2020
2021 /* otherwise select new workload type */ 2021 /* otherwise select new workload type */
2022 cfqd->serving_type = 2022 cfqd->serving_type =
2023 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio); 2023 cfq_choose_wl(cfqd, cfqg, cfqd->serving_prio);
2024 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type); 2024 st = service_tree_for(cfqg, cfqd->serving_prio, cfqd->serving_type);
2025 count = st->count; 2025 count = st->count;
2026 2026
2027 /* 2027 /*
2028 * the workload slice is computed as a fraction of target latency 2028 * the workload slice is computed as a fraction of target latency
2029 * proportional to the number of queues in that workload, over 2029 * proportional to the number of queues in that workload, over
2030 * all the queues in the same priority class 2030 * all the queues in the same priority class
2031 */ 2031 */
2032 group_slice = cfq_group_slice(cfqd, cfqg); 2032 group_slice = cfq_group_slice(cfqd, cfqg);
2033 2033
2034 slice = group_slice * count / 2034 slice = group_slice * count /
2035 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio], 2035 max_t(unsigned, cfqg->busy_queues_avg[cfqd->serving_prio],
2036 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg)); 2036 cfq_group_busy_queues_wl(cfqd->serving_prio, cfqd, cfqg));
2037 2037
2038 if (cfqd->serving_type == ASYNC_WORKLOAD) { 2038 if (cfqd->serving_type == ASYNC_WORKLOAD) {
2039 unsigned int tmp; 2039 unsigned int tmp;
2040 2040
2041 /* 2041 /*
2042 * Async queues are currently system wide. Just taking 2042 * Async queues are currently system wide. Just taking
2043 * proportion of queues with-in same group will lead to higher 2043 * proportion of queues with-in same group will lead to higher
2044 * async ratio system wide as generally root group is going 2044 * async ratio system wide as generally root group is going
2045 * to have higher weight. A more accurate thing would be to 2045 * to have higher weight. A more accurate thing would be to
2046 * calculate system wide async/sync ratio. 2046 * calculate system wide async/sync ratio.
2047 */ 2047 */
2048 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg); 2048 tmp = cfq_target_latency * cfqg_busy_async_queues(cfqd, cfqg);
2049 tmp = tmp/cfqd->busy_queues; 2049 tmp = tmp/cfqd->busy_queues;
2050 slice = min_t(unsigned, slice, tmp); 2050 slice = min_t(unsigned, slice, tmp);
2051 2051
2052 /* async workload slice is scaled down according to 2052 /* async workload slice is scaled down according to
2053 * the sync/async slice ratio. */ 2053 * the sync/async slice ratio. */
2054 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1]; 2054 slice = slice * cfqd->cfq_slice[0] / cfqd->cfq_slice[1];
2055 } else 2055 } else
2056 /* sync workload slice is at least 2 * cfq_slice_idle */ 2056 /* sync workload slice is at least 2 * cfq_slice_idle */
2057 slice = max(slice, 2 * cfqd->cfq_slice_idle); 2057 slice = max(slice, 2 * cfqd->cfq_slice_idle);
2058 2058
2059 slice = max_t(unsigned, slice, CFQ_MIN_TT); 2059 slice = max_t(unsigned, slice, CFQ_MIN_TT);
2060 cfq_log(cfqd, "workload slice:%d", slice); 2060 cfq_log(cfqd, "workload slice:%d", slice);
2061 cfqd->workload_expires = jiffies + slice; 2061 cfqd->workload_expires = jiffies + slice;
2062 cfqd->noidle_tree_requires_idle = false; 2062 cfqd->noidle_tree_requires_idle = false;
2063 } 2063 }
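/*
 * Rough worked example of the slice sizing above (illustrative numbers,
 * not taken from the source): with a 300ms group slice and a chosen sync
 * workload holding 3 of the 6 busy queues in that priority class, the
 * workload runs for about 300 * 3 / 6 = 150ms before a new workload type
 * is picked. An async workload is additionally scaled by the
 * cfq_slice[0]/cfq_slice[1] (async/sync) ratio and capped by the
 * system-wide share of async queues; sync workloads get at least
 * 2 * cfq_slice_idle, and every result is floored at CFQ_MIN_TT.
 */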
2064 2064
2065 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd) 2065 static struct cfq_group *cfq_get_next_cfqg(struct cfq_data *cfqd)
2066 { 2066 {
2067 struct cfq_rb_root *st = &cfqd->grp_service_tree; 2067 struct cfq_rb_root *st = &cfqd->grp_service_tree;
2068 struct cfq_group *cfqg; 2068 struct cfq_group *cfqg;
2069 2069
2070 if (RB_EMPTY_ROOT(&st->rb)) 2070 if (RB_EMPTY_ROOT(&st->rb))
2071 return NULL; 2071 return NULL;
2072 cfqg = cfq_rb_first_group(st); 2072 cfqg = cfq_rb_first_group(st);
2073 st->active = &cfqg->rb_node; 2073 st->active = &cfqg->rb_node;
2074 update_min_vdisktime(st); 2074 update_min_vdisktime(st);
2075 return cfqg; 2075 return cfqg;
2076 } 2076 }
2077 2077
2078 static void cfq_choose_cfqg(struct cfq_data *cfqd) 2078 static void cfq_choose_cfqg(struct cfq_data *cfqd)
2079 { 2079 {
2080 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd); 2080 struct cfq_group *cfqg = cfq_get_next_cfqg(cfqd);
2081 2081
2082 cfqd->serving_group = cfqg; 2082 cfqd->serving_group = cfqg;
2083 2083
2084 /* Restore the workload type data */ 2084 /* Restore the workload type data */
2085 if (cfqg->saved_workload_slice) { 2085 if (cfqg->saved_workload_slice) {
2086 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice; 2086 cfqd->workload_expires = jiffies + cfqg->saved_workload_slice;
2087 cfqd->serving_type = cfqg->saved_workload; 2087 cfqd->serving_type = cfqg->saved_workload;
2088 cfqd->serving_prio = cfqg->saved_serving_prio; 2088 cfqd->serving_prio = cfqg->saved_serving_prio;
2089 } else 2089 } else
2090 cfqd->workload_expires = jiffies - 1; 2090 cfqd->workload_expires = jiffies - 1;
2091 2091
2092 choose_service_tree(cfqd, cfqg); 2092 choose_service_tree(cfqd, cfqg);
2093 } 2093 }
2094 2094
2095 /* 2095 /*
2096 * Select a queue for service. If we have a current active queue, 2096 * Select a queue for service. If we have a current active queue,
2097 * check whether to continue servicing it, or retrieve and set a new one. 2097 * check whether to continue servicing it, or retrieve and set a new one.
2098 */ 2098 */
2099 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd) 2099 static struct cfq_queue *cfq_select_queue(struct cfq_data *cfqd)
2100 { 2100 {
2101 struct cfq_queue *cfqq, *new_cfqq = NULL; 2101 struct cfq_queue *cfqq, *new_cfqq = NULL;
2102 2102
2103 cfqq = cfqd->active_queue; 2103 cfqq = cfqd->active_queue;
2104 if (!cfqq) 2104 if (!cfqq)
2105 goto new_queue; 2105 goto new_queue;
2106 2106
2107 if (!cfqd->rq_queued) 2107 if (!cfqd->rq_queued)
2108 return NULL; 2108 return NULL;
2109 2109
2110 /* 2110 /*
2111 * We were waiting for group to get backlogged. Expire the queue 2111 * We were waiting for group to get backlogged. Expire the queue
2112 */ 2112 */
2113 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list)) 2113 if (cfq_cfqq_wait_busy(cfqq) && !RB_EMPTY_ROOT(&cfqq->sort_list))
2114 goto expire; 2114 goto expire;
2115 2115
2116 /* 2116 /*
2117 * The active queue has run out of time, expire it and select new. 2117 * The active queue has run out of time, expire it and select new.
2118 */ 2118 */
2119 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) { 2119 if (cfq_slice_used(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) {
2120 /* 2120 /*
2121 * If slice had not expired at the completion of last request 2121 * If slice had not expired at the completion of last request
2122 * we might not have turned on wait_busy flag. Don't expire 2122 * we might not have turned on wait_busy flag. Don't expire
2123 * the queue yet. Allow the group to get backlogged. 2123 * the queue yet. Allow the group to get backlogged.
2124 * 2124 *
2125 * The very fact that we have used the slice, that means we 2125 * The very fact that we have used the slice, that means we
2126 * have been idling all along on this queue and it should be 2126 * have been idling all along on this queue and it should be
2127 * ok to wait for this request to complete. 2127 * ok to wait for this request to complete.
2128 */ 2128 */
2129 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list) 2129 if (cfqq->cfqg->nr_cfqq == 1 && RB_EMPTY_ROOT(&cfqq->sort_list)
2130 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) { 2130 && cfqq->dispatched && cfq_should_idle(cfqd, cfqq)) {
2131 cfqq = NULL; 2131 cfqq = NULL;
2132 goto keep_queue; 2132 goto keep_queue;
2133 } else 2133 } else
2134 goto expire; 2134 goto expire;
2135 } 2135 }
2136 2136
2137 /* 2137 /*
2138 * The active queue has requests and isn't expired, allow it to 2138 * The active queue has requests and isn't expired, allow it to
2139 * dispatch. 2139 * dispatch.
2140 */ 2140 */
2141 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 2141 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
2142 goto keep_queue; 2142 goto keep_queue;
2143 2143
2144 /* 2144 /*
2145 * If another queue has a request waiting within our mean seek 2145 * If another queue has a request waiting within our mean seek
2146 * distance, let it run. The expire code will check for close 2146 * distance, let it run. The expire code will check for close
2147 * cooperators and put the close queue at the front of the service 2147 * cooperators and put the close queue at the front of the service
2148 * tree. If possible, merge the expiring queue with the new cfqq. 2148 * tree. If possible, merge the expiring queue with the new cfqq.
2149 */ 2149 */
2150 new_cfqq = cfq_close_cooperator(cfqd, cfqq); 2150 new_cfqq = cfq_close_cooperator(cfqd, cfqq);
2151 if (new_cfqq) { 2151 if (new_cfqq) {
2152 if (!cfqq->new_cfqq) 2152 if (!cfqq->new_cfqq)
2153 cfq_setup_merge(cfqq, new_cfqq); 2153 cfq_setup_merge(cfqq, new_cfqq);
2154 goto expire; 2154 goto expire;
2155 } 2155 }
2156 2156
2157 /* 2157 /*
2158 * No requests pending. If the active queue still has requests in 2158 * No requests pending. If the active queue still has requests in
2159 * flight or is idling for a new request, allow either of these 2159 * flight or is idling for a new request, allow either of these
2160 * conditions to happen (or time out) before selecting a new queue. 2160 * conditions to happen (or time out) before selecting a new queue.
2161 */ 2161 */
2162 if (timer_pending(&cfqd->idle_slice_timer) || 2162 if (timer_pending(&cfqd->idle_slice_timer) ||
2163 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) { 2163 (cfqq->dispatched && cfq_should_idle(cfqd, cfqq))) {
2164 cfqq = NULL; 2164 cfqq = NULL;
2165 goto keep_queue; 2165 goto keep_queue;
2166 } 2166 }
2167 2167
2168 expire: 2168 expire:
2169 cfq_slice_expired(cfqd, 0); 2169 cfq_slice_expired(cfqd, 0);
2170 new_queue: 2170 new_queue:
2171 /* 2171 /*
2172 * Current queue expired. Check if we have to switch to a new 2172 * Current queue expired. Check if we have to switch to a new
2173 * service tree 2173 * service tree
2174 */ 2174 */
2175 if (!new_cfqq) 2175 if (!new_cfqq)
2176 cfq_choose_cfqg(cfqd); 2176 cfq_choose_cfqg(cfqd);
2177 2177
2178 cfqq = cfq_set_active_queue(cfqd, new_cfqq); 2178 cfqq = cfq_set_active_queue(cfqd, new_cfqq);
2179 keep_queue: 2179 keep_queue:
2180 return cfqq; 2180 return cfqq;
2181 } 2181 }
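/*
 * Rough order of the checks above: bail out if nothing is queued at all;
 * expire a queue that was only waiting for its group to get backlogged
 * once a request shows up; expire a queue whose slice is used up, unless
 * it is the lone queue of its group and idling on it is worthwhile; keep
 * a queue that still has queued requests; otherwise prefer a close
 * cooperator (recording a merge), keep idling while a request may still
 * arrive, and only then expire and pick a new group and queue.
 */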
2182 2182
2183 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq) 2183 static int __cfq_forced_dispatch_cfqq(struct cfq_queue *cfqq)
2184 { 2184 {
2185 int dispatched = 0; 2185 int dispatched = 0;
2186 2186
2187 while (cfqq->next_rq) { 2187 while (cfqq->next_rq) {
2188 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq); 2188 cfq_dispatch_insert(cfqq->cfqd->queue, cfqq->next_rq);
2189 dispatched++; 2189 dispatched++;
2190 } 2190 }
2191 2191
2192 BUG_ON(!list_empty(&cfqq->fifo)); 2192 BUG_ON(!list_empty(&cfqq->fifo));
2193 2193
2194 /* By default cfqq is not expired if it is empty. Do it explicitly */ 2194 /* By default cfqq is not expired if it is empty. Do it explicitly */
2195 __cfq_slice_expired(cfqq->cfqd, cfqq, 0); 2195 __cfq_slice_expired(cfqq->cfqd, cfqq, 0);
2196 return dispatched; 2196 return dispatched;
2197 } 2197 }
2198 2198
2199 /* 2199 /*
2200 * Drain our current requests. Used for barriers and when switching 2200 * Drain our current requests. Used for barriers and when switching
2201 * io schedulers on-the-fly. 2201 * io schedulers on-the-fly.
2202 */ 2202 */
2203 static int cfq_forced_dispatch(struct cfq_data *cfqd) 2203 static int cfq_forced_dispatch(struct cfq_data *cfqd)
2204 { 2204 {
2205 struct cfq_queue *cfqq; 2205 struct cfq_queue *cfqq;
2206 int dispatched = 0; 2206 int dispatched = 0;
2207 2207
2208 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) 2208 /* Expire the timeslice of the current active queue first */
2209 cfq_slice_expired(cfqd, 0);
2210 while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
2211 __cfq_set_active_queue(cfqd, cfqq);
2209 dispatched += __cfq_forced_dispatch_cfqq(cfqq); 2212 dispatched += __cfq_forced_dispatch_cfqq(cfqq);
2213 }
2210 2214
2211 cfq_slice_expired(cfqd, 0);
2212 BUG_ON(cfqd->busy_queues); 2215 BUG_ON(cfqd->busy_queues);
2213 2216
2214 cfq_log(cfqd, "forced_dispatch=%d", dispatched); 2217 cfq_log(cfqd, "forced_dispatch=%d", dispatched);
2215 return dispatched; 2218 return dispatched;
2216 } 2219 }
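/*
 * This hunk is the change made by this commit: the active queue's
 * timeslice is now expired before the drain loop, and every queue
 * returned by cfq_get_next_queue_forced() is made the active queue via
 * __cfq_set_active_queue() before its requests are flushed. Since
 * __cfq_forced_dispatch_cfqq() ends by expiring the queue it drained,
 * activating it first lets that expiry account slice_used against fresh
 * activation state rather than whatever an earlier activation left
 * behind.
 */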
2217 2220
2218 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd, 2221 static inline bool cfq_slice_used_soon(struct cfq_data *cfqd,
2219 struct cfq_queue *cfqq) 2222 struct cfq_queue *cfqq)
2220 { 2223 {
2221 /* the queue hasn't finished any request, can't estimate */ 2224 /* the queue hasn't finished any request, can't estimate */
2222 if (cfq_cfqq_slice_new(cfqq)) 2225 if (cfq_cfqq_slice_new(cfqq))
2223 return 1; 2226 return 1;
2224 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched, 2227 if (time_after(jiffies + cfqd->cfq_slice_idle * cfqq->dispatched,
2225 cfqq->slice_end)) 2228 cfqq->slice_end))
2226 return 1; 2229 return 1;
2227 2230
2228 return 0; 2231 return 0;
2229 } 2232 }
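/*
 * In other words: the slice is considered about to run out if the
 * requests already dispatched, at an estimated cfq_slice_idle jiffies of
 * service each, would carry us past slice_end. A brand new slice has no
 * completed requests to base an estimate on, so it is reported as "used
 * soon" as well.
 */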
2230 2233
2231 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2234 static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2232 { 2235 {
2233 unsigned int max_dispatch; 2236 unsigned int max_dispatch;
2234 2237
2235 /* 2238 /*
2236 * Drain async requests before we start sync IO 2239 * Drain async requests before we start sync IO
2237 */ 2240 */
2238 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC]) 2241 if (cfq_should_idle(cfqd, cfqq) && cfqd->rq_in_flight[BLK_RW_ASYNC])
2239 return false; 2242 return false;
2240 2243
2241 /* 2244 /*
2242 * If this is an async queue and we have sync IO in flight, let it wait 2245 * If this is an async queue and we have sync IO in flight, let it wait
2243 */ 2246 */
2244 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq)) 2247 if (cfqd->rq_in_flight[BLK_RW_SYNC] && !cfq_cfqq_sync(cfqq))
2245 return false; 2248 return false;
2246 2249
2247 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1); 2250 max_dispatch = max_t(unsigned int, cfqd->cfq_quantum / 2, 1);
2248 if (cfq_class_idle(cfqq)) 2251 if (cfq_class_idle(cfqq))
2249 max_dispatch = 1; 2252 max_dispatch = 1;
2250 2253
2251 /* 2254 /*
2252 * Does this cfqq already have too much IO in flight? 2255 * Does this cfqq already have too much IO in flight?
2253 */ 2256 */
2254 if (cfqq->dispatched >= max_dispatch) { 2257 if (cfqq->dispatched >= max_dispatch) {
2255 /* 2258 /*
2256 * idle queue must always only have a single IO in flight 2259 * idle queue must always only have a single IO in flight
2257 */ 2260 */
2258 if (cfq_class_idle(cfqq)) 2261 if (cfq_class_idle(cfqq))
2259 return false; 2262 return false;
2260 2263
2261 /* 2264 /*
2262 * We have other queues, don't allow more IO from this one 2265 * We have other queues, don't allow more IO from this one
2263 */ 2266 */
2264 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq)) 2267 if (cfqd->busy_queues > 1 && cfq_slice_used_soon(cfqd, cfqq))
2265 return false; 2268 return false;
2266 2269
2267 /* 2270 /*
2268 * Sole queue user, no limit 2271 * Sole queue user, no limit
2269 */ 2272 */
2270 if (cfqd->busy_queues == 1) 2273 if (cfqd->busy_queues == 1)
2271 max_dispatch = -1; 2274 max_dispatch = -1;
2272 else 2275 else
2273 /* 2276 /*
2274 * Normally we start throttling cfqq when cfq_quantum/2 2277 * Normally we start throttling cfqq when cfq_quantum/2
2275 * requests have been dispatched. But we can drive 2278 * requests have been dispatched. But we can drive
2276 * deeper queue depths at the beginning of slice 2279 * deeper queue depths at the beginning of slice
2277 * subjected to upper limit of cfq_quantum. 2280 * subjected to upper limit of cfq_quantum.
2278 * */ 2281 * */
2279 max_dispatch = cfqd->cfq_quantum; 2282 max_dispatch = cfqd->cfq_quantum;
2280 } 2283 }
2281 2284
2282 /* 2285 /*
2283 * Async queues must wait a bit before being allowed dispatch. 2286 * Async queues must wait a bit before being allowed dispatch.
2284 * We also ramp up the dispatch depth gradually for async IO, 2287 * We also ramp up the dispatch depth gradually for async IO,
2285 * based on the last sync IO we serviced 2288 * based on the last sync IO we serviced
2286 */ 2289 */
2287 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) { 2290 if (!cfq_cfqq_sync(cfqq) && cfqd->cfq_latency) {
2288 unsigned long last_sync = jiffies - cfqd->last_delayed_sync; 2291 unsigned long last_sync = jiffies - cfqd->last_delayed_sync;
2289 unsigned int depth; 2292 unsigned int depth;
2290 2293
2291 depth = last_sync / cfqd->cfq_slice[1]; 2294 depth = last_sync / cfqd->cfq_slice[1];
2292 if (!depth && !cfqq->dispatched) 2295 if (!depth && !cfqq->dispatched)
2293 depth = 1; 2296 depth = 1;
2294 if (depth < max_dispatch) 2297 if (depth < max_dispatch)
2295 max_dispatch = depth; 2298 max_dispatch = depth;
2296 } 2299 }
2297 2300
2298 /* 2301 /*
2299 * If we're below the current max, allow a dispatch 2302 * If we're below the current max, allow a dispatch
2300 */ 2303 */
2301 return cfqq->dispatched < max_dispatch; 2304 return cfqq->dispatched < max_dispatch;
2302 } 2305 }
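/*
 * Net effect of the checks above: a queue normally keeps cfq_quantum/2
 * requests in flight (at least one, and exactly one for the idle class).
 * Once that is exceeded, dispatch stops if other queues are waiting and
 * this slice looks nearly used; a lone busy queue is effectively
 * unlimited, otherwise the cap may grow up to cfq_quantum. Async queues
 * are further throttled right after recent sync IO, with at least one
 * request allowed when nothing is in flight.
 */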
2303 2306
2304 /* 2307 /*
2305 * Dispatch a request from cfqq, moving them to the request queue 2308 * Dispatch a request from cfqq, moving them to the request queue
2306 * dispatch list. 2309 * dispatch list.
2307 */ 2310 */
2308 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2311 static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2309 { 2312 {
2310 struct request *rq; 2313 struct request *rq;
2311 2314
2312 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); 2315 BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list));
2313 2316
2314 if (!cfq_may_dispatch(cfqd, cfqq)) 2317 if (!cfq_may_dispatch(cfqd, cfqq))
2315 return false; 2318 return false;
2316 2319
2317 /* 2320 /*
2318 * follow expired path, else get first next available 2321 * follow expired path, else get first next available
2319 */ 2322 */
2320 rq = cfq_check_fifo(cfqq); 2323 rq = cfq_check_fifo(cfqq);
2321 if (!rq) 2324 if (!rq)
2322 rq = cfqq->next_rq; 2325 rq = cfqq->next_rq;
2323 2326
2324 /* 2327 /*
2325 * insert request into driver dispatch list 2328 * insert request into driver dispatch list
2326 */ 2329 */
2327 cfq_dispatch_insert(cfqd->queue, rq); 2330 cfq_dispatch_insert(cfqd->queue, rq);
2328 2331
2329 if (!cfqd->active_cic) { 2332 if (!cfqd->active_cic) {
2330 struct cfq_io_context *cic = RQ_CIC(rq); 2333 struct cfq_io_context *cic = RQ_CIC(rq);
2331 2334
2332 atomic_long_inc(&cic->ioc->refcount); 2335 atomic_long_inc(&cic->ioc->refcount);
2333 cfqd->active_cic = cic; 2336 cfqd->active_cic = cic;
2334 } 2337 }
2335 2338
2336 return true; 2339 return true;
2337 } 2340 }
2338 2341
2339 /* 2342 /*
2340 * Find the cfqq that we need to service and move a request from that to the 2343 * Find the cfqq that we need to service and move a request from that to the
2341 * dispatch list 2344 * dispatch list
2342 */ 2345 */
2343 static int cfq_dispatch_requests(struct request_queue *q, int force) 2346 static int cfq_dispatch_requests(struct request_queue *q, int force)
2344 { 2347 {
2345 struct cfq_data *cfqd = q->elevator->elevator_data; 2348 struct cfq_data *cfqd = q->elevator->elevator_data;
2346 struct cfq_queue *cfqq; 2349 struct cfq_queue *cfqq;
2347 2350
2348 if (!cfqd->busy_queues) 2351 if (!cfqd->busy_queues)
2349 return 0; 2352 return 0;
2350 2353
2351 if (unlikely(force)) 2354 if (unlikely(force))
2352 return cfq_forced_dispatch(cfqd); 2355 return cfq_forced_dispatch(cfqd);
2353 2356
2354 cfqq = cfq_select_queue(cfqd); 2357 cfqq = cfq_select_queue(cfqd);
2355 if (!cfqq) 2358 if (!cfqq)
2356 return 0; 2359 return 0;
2357 2360
2358 /* 2361 /*
2359 * Dispatch a request from this cfqq, if it is allowed 2362 * Dispatch a request from this cfqq, if it is allowed
2360 */ 2363 */
2361 if (!cfq_dispatch_request(cfqd, cfqq)) 2364 if (!cfq_dispatch_request(cfqd, cfqq))
2362 return 0; 2365 return 0;
2363 2366
2364 cfqq->slice_dispatch++; 2367 cfqq->slice_dispatch++;
2365 cfq_clear_cfqq_must_dispatch(cfqq); 2368 cfq_clear_cfqq_must_dispatch(cfqq);
2366 2369
2367 /* 2370 /*
2368 * expire an async queue immediately if it has used up its slice. idle 2371 * expire an async queue immediately if it has used up its slice. idle
2369 * queue always expire after 1 dispatch round. 2372 * queue always expire after 1 dispatch round.
2370 */ 2373 */
2371 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) && 2374 if (cfqd->busy_queues > 1 && ((!cfq_cfqq_sync(cfqq) &&
2372 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) || 2375 cfqq->slice_dispatch >= cfq_prio_to_maxrq(cfqd, cfqq)) ||
2373 cfq_class_idle(cfqq))) { 2376 cfq_class_idle(cfqq))) {
2374 cfqq->slice_end = jiffies + 1; 2377 cfqq->slice_end = jiffies + 1;
2375 cfq_slice_expired(cfqd, 0); 2378 cfq_slice_expired(cfqd, 0);
2376 } 2379 }
2377 2380
2378 cfq_log_cfqq(cfqd, cfqq, "dispatched a request"); 2381 cfq_log_cfqq(cfqd, cfqq, "dispatched a request");
2379 return 1; 2382 return 1;
2380 } 2383 }
2381 2384
2382 /* 2385 /*
2383 * task holds one reference to the queue, dropped when task exits. each rq 2386 * task holds one reference to the queue, dropped when task exits. each rq
2384 * in-flight on this queue also holds a reference, dropped when rq is freed. 2387 * in-flight on this queue also holds a reference, dropped when rq is freed.
2385 * 2388 *
2386 * Each cfq queue took a reference on the parent group. Drop it now. 2389 * Each cfq queue took a reference on the parent group. Drop it now.
2387 * queue lock must be held here. 2390 * queue lock must be held here.
2388 */ 2391 */
2389 static void cfq_put_queue(struct cfq_queue *cfqq) 2392 static void cfq_put_queue(struct cfq_queue *cfqq)
2390 { 2393 {
2391 struct cfq_data *cfqd = cfqq->cfqd; 2394 struct cfq_data *cfqd = cfqq->cfqd;
2392 struct cfq_group *cfqg, *orig_cfqg; 2395 struct cfq_group *cfqg, *orig_cfqg;
2393 2396
2394 BUG_ON(atomic_read(&cfqq->ref) <= 0); 2397 BUG_ON(atomic_read(&cfqq->ref) <= 0);
2395 2398
2396 if (!atomic_dec_and_test(&cfqq->ref)) 2399 if (!atomic_dec_and_test(&cfqq->ref))
2397 return; 2400 return;
2398 2401
2399 cfq_log_cfqq(cfqd, cfqq, "put_queue"); 2402 cfq_log_cfqq(cfqd, cfqq, "put_queue");
2400 BUG_ON(rb_first(&cfqq->sort_list)); 2403 BUG_ON(rb_first(&cfqq->sort_list));
2401 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]); 2404 BUG_ON(cfqq->allocated[READ] + cfqq->allocated[WRITE]);
2402 cfqg = cfqq->cfqg; 2405 cfqg = cfqq->cfqg;
2403 orig_cfqg = cfqq->orig_cfqg; 2406 orig_cfqg = cfqq->orig_cfqg;
2404 2407
2405 if (unlikely(cfqd->active_queue == cfqq)) { 2408 if (unlikely(cfqd->active_queue == cfqq)) {
2406 __cfq_slice_expired(cfqd, cfqq, 0); 2409 __cfq_slice_expired(cfqd, cfqq, 0);
2407 cfq_schedule_dispatch(cfqd); 2410 cfq_schedule_dispatch(cfqd);
2408 } 2411 }
2409 2412
2410 BUG_ON(cfq_cfqq_on_rr(cfqq)); 2413 BUG_ON(cfq_cfqq_on_rr(cfqq));
2411 kmem_cache_free(cfq_pool, cfqq); 2414 kmem_cache_free(cfq_pool, cfqq);
2412 cfq_put_cfqg(cfqg); 2415 cfq_put_cfqg(cfqg);
2413 if (orig_cfqg) 2416 if (orig_cfqg)
2414 cfq_put_cfqg(orig_cfqg); 2417 cfq_put_cfqg(orig_cfqg);
2415 } 2418 }
2416 2419
2417 /* 2420 /*
2418 * Must always be called with the rcu_read_lock() held 2421 * Must always be called with the rcu_read_lock() held
2419 */ 2422 */
2420 static void 2423 static void
2421 __call_for_each_cic(struct io_context *ioc, 2424 __call_for_each_cic(struct io_context *ioc,
2422 void (*func)(struct io_context *, struct cfq_io_context *)) 2425 void (*func)(struct io_context *, struct cfq_io_context *))
2423 { 2426 {
2424 struct cfq_io_context *cic; 2427 struct cfq_io_context *cic;
2425 struct hlist_node *n; 2428 struct hlist_node *n;
2426 2429
2427 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list) 2430 hlist_for_each_entry_rcu(cic, n, &ioc->cic_list, cic_list)
2428 func(ioc, cic); 2431 func(ioc, cic);
2429 } 2432 }
2430 2433
2431 /* 2434 /*
2432 * Call func for each cic attached to this ioc. 2435 * Call func for each cic attached to this ioc.
2433 */ 2436 */
2434 static void 2437 static void
2435 call_for_each_cic(struct io_context *ioc, 2438 call_for_each_cic(struct io_context *ioc,
2436 void (*func)(struct io_context *, struct cfq_io_context *)) 2439 void (*func)(struct io_context *, struct cfq_io_context *))
2437 { 2440 {
2438 rcu_read_lock(); 2441 rcu_read_lock();
2439 __call_for_each_cic(ioc, func); 2442 __call_for_each_cic(ioc, func);
2440 rcu_read_unlock(); 2443 rcu_read_unlock();
2441 } 2444 }
2442 2445
2443 static void cfq_cic_free_rcu(struct rcu_head *head) 2446 static void cfq_cic_free_rcu(struct rcu_head *head)
2444 { 2447 {
2445 struct cfq_io_context *cic; 2448 struct cfq_io_context *cic;
2446 2449
2447 cic = container_of(head, struct cfq_io_context, rcu_head); 2450 cic = container_of(head, struct cfq_io_context, rcu_head);
2448 2451
2449 kmem_cache_free(cfq_ioc_pool, cic); 2452 kmem_cache_free(cfq_ioc_pool, cic);
2450 elv_ioc_count_dec(cfq_ioc_count); 2453 elv_ioc_count_dec(cfq_ioc_count);
2451 2454
2452 if (ioc_gone) { 2455 if (ioc_gone) {
2453 /* 2456 /*
2454 * CFQ scheduler is exiting, grab exit lock and check 2457 * CFQ scheduler is exiting, grab exit lock and check
2455 * the pending io context count. If it hits zero, 2458 * the pending io context count. If it hits zero,
2456 * complete ioc_gone and set it back to NULL 2459 * complete ioc_gone and set it back to NULL
2457 */ 2460 */
2458 spin_lock(&ioc_gone_lock); 2461 spin_lock(&ioc_gone_lock);
2459 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) { 2462 if (ioc_gone && !elv_ioc_count_read(cfq_ioc_count)) {
2460 complete(ioc_gone); 2463 complete(ioc_gone);
2461 ioc_gone = NULL; 2464 ioc_gone = NULL;
2462 } 2465 }
2463 spin_unlock(&ioc_gone_lock); 2466 spin_unlock(&ioc_gone_lock);
2464 } 2467 }
2465 } 2468 }
2466 2469
2467 static void cfq_cic_free(struct cfq_io_context *cic) 2470 static void cfq_cic_free(struct cfq_io_context *cic)
2468 { 2471 {
2469 call_rcu(&cic->rcu_head, cfq_cic_free_rcu); 2472 call_rcu(&cic->rcu_head, cfq_cic_free_rcu);
2470 } 2473 }
2471 2474
2472 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic) 2475 static void cic_free_func(struct io_context *ioc, struct cfq_io_context *cic)
2473 { 2476 {
2474 unsigned long flags; 2477 unsigned long flags;
2475 2478
2476 BUG_ON(!cic->dead_key); 2479 BUG_ON(!cic->dead_key);
2477 2480
2478 spin_lock_irqsave(&ioc->lock, flags); 2481 spin_lock_irqsave(&ioc->lock, flags);
2479 radix_tree_delete(&ioc->radix_root, cic->dead_key); 2482 radix_tree_delete(&ioc->radix_root, cic->dead_key);
2480 hlist_del_rcu(&cic->cic_list); 2483 hlist_del_rcu(&cic->cic_list);
2481 spin_unlock_irqrestore(&ioc->lock, flags); 2484 spin_unlock_irqrestore(&ioc->lock, flags);
2482 2485
2483 cfq_cic_free(cic); 2486 cfq_cic_free(cic);
2484 } 2487 }
2485 2488
2486 /* 2489 /*
2487 * Must be called with rcu_read_lock() held or preemption otherwise disabled. 2490 * Must be called with rcu_read_lock() held or preemption otherwise disabled.
2488 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(), 2491 * Only two callers of this - ->dtor() which is called with the rcu_read_lock(),
2489 * and ->trim() which is called with the task lock held 2492 * and ->trim() which is called with the task lock held
2490 */ 2493 */
2491 static void cfq_free_io_context(struct io_context *ioc) 2494 static void cfq_free_io_context(struct io_context *ioc)
2492 { 2495 {
2493 /* 2496 /*
2494 * ioc->refcount is zero here, or we are called from elv_unregister(), 2497 * ioc->refcount is zero here, or we are called from elv_unregister(),
2495 * so no more cic's are allowed to be linked into this ioc. So it 2498 * so no more cic's are allowed to be linked into this ioc. So it
2496 * should be ok to iterate over the known list, we will see all cic's 2499 * should be ok to iterate over the known list, we will see all cic's
2497 * since no new ones are added. 2500 * since no new ones are added.
2498 */ 2501 */
2499 __call_for_each_cic(ioc, cic_free_func); 2502 __call_for_each_cic(ioc, cic_free_func);
2500 } 2503 }
2501 2504
2502 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq) 2505 static void cfq_exit_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq)
2503 { 2506 {
2504 struct cfq_queue *__cfqq, *next; 2507 struct cfq_queue *__cfqq, *next;
2505 2508
2506 if (unlikely(cfqq == cfqd->active_queue)) { 2509 if (unlikely(cfqq == cfqd->active_queue)) {
2507 __cfq_slice_expired(cfqd, cfqq, 0); 2510 __cfq_slice_expired(cfqd, cfqq, 0);
2508 cfq_schedule_dispatch(cfqd); 2511 cfq_schedule_dispatch(cfqd);
2509 } 2512 }
2510 2513
2511 /* 2514 /*
2512 * If this queue was scheduled to merge with another queue, be 2515 * If this queue was scheduled to merge with another queue, be
2513 * sure to drop the reference taken on that queue (and others in 2516 * sure to drop the reference taken on that queue (and others in
2514 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs. 2517 * the merge chain). See cfq_setup_merge and cfq_merge_cfqqs.
2515 */ 2518 */
2516 __cfqq = cfqq->new_cfqq; 2519 __cfqq = cfqq->new_cfqq;
2517 while (__cfqq) { 2520 while (__cfqq) {
2518 if (__cfqq == cfqq) { 2521 if (__cfqq == cfqq) {
2519 WARN(1, "cfqq->new_cfqq loop detected\n"); 2522 WARN(1, "cfqq->new_cfqq loop detected\n");
2520 break; 2523 break;
2521 } 2524 }
2522 next = __cfqq->new_cfqq; 2525 next = __cfqq->new_cfqq;
2523 cfq_put_queue(__cfqq); 2526 cfq_put_queue(__cfqq);
2524 __cfqq = next; 2527 __cfqq = next;
2525 } 2528 }
2526 2529
2527 cfq_put_queue(cfqq); 2530 cfq_put_queue(cfqq);
2528 } 2531 }
2529 2532
2530 static void __cfq_exit_single_io_context(struct cfq_data *cfqd, 2533 static void __cfq_exit_single_io_context(struct cfq_data *cfqd,
2531 struct cfq_io_context *cic) 2534 struct cfq_io_context *cic)
2532 { 2535 {
2533 struct io_context *ioc = cic->ioc; 2536 struct io_context *ioc = cic->ioc;
2534 2537
2535 list_del_init(&cic->queue_list); 2538 list_del_init(&cic->queue_list);
2536 2539
2537 /* 2540 /*
2538 * Make sure key == NULL is seen for dead queues 2541 * Make sure key == NULL is seen for dead queues
2539 */ 2542 */
2540 smp_wmb(); 2543 smp_wmb();
2541 cic->dead_key = (unsigned long) cic->key; 2544 cic->dead_key = (unsigned long) cic->key;
2542 cic->key = NULL; 2545 cic->key = NULL;
2543 2546
2544 if (ioc->ioc_data == cic) 2547 if (ioc->ioc_data == cic)
2545 rcu_assign_pointer(ioc->ioc_data, NULL); 2548 rcu_assign_pointer(ioc->ioc_data, NULL);
2546 2549
2547 if (cic->cfqq[BLK_RW_ASYNC]) { 2550 if (cic->cfqq[BLK_RW_ASYNC]) {
2548 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]); 2551 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_ASYNC]);
2549 cic->cfqq[BLK_RW_ASYNC] = NULL; 2552 cic->cfqq[BLK_RW_ASYNC] = NULL;
2550 } 2553 }
2551 2554
2552 if (cic->cfqq[BLK_RW_SYNC]) { 2555 if (cic->cfqq[BLK_RW_SYNC]) {
2553 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]); 2556 cfq_exit_cfqq(cfqd, cic->cfqq[BLK_RW_SYNC]);
2554 cic->cfqq[BLK_RW_SYNC] = NULL; 2557 cic->cfqq[BLK_RW_SYNC] = NULL;
2555 } 2558 }
2556 } 2559 }
2557 2560
2558 static void cfq_exit_single_io_context(struct io_context *ioc, 2561 static void cfq_exit_single_io_context(struct io_context *ioc,
2559 struct cfq_io_context *cic) 2562 struct cfq_io_context *cic)
2560 { 2563 {
2561 struct cfq_data *cfqd = cic->key; 2564 struct cfq_data *cfqd = cic->key;
2562 2565
2563 if (cfqd) { 2566 if (cfqd) {
2564 struct request_queue *q = cfqd->queue; 2567 struct request_queue *q = cfqd->queue;
2565 unsigned long flags; 2568 unsigned long flags;
2566 2569
2567 spin_lock_irqsave(q->queue_lock, flags); 2570 spin_lock_irqsave(q->queue_lock, flags);
2568 2571
2569 /* 2572 /*
2570 * Ensure we get a fresh copy of the ->key to prevent 2573 * Ensure we get a fresh copy of the ->key to prevent
2571 * race between exiting task and queue 2574 * race between exiting task and queue
2572 */ 2575 */
2573 smp_read_barrier_depends(); 2576 smp_read_barrier_depends();
2574 if (cic->key) 2577 if (cic->key)
2575 __cfq_exit_single_io_context(cfqd, cic); 2578 __cfq_exit_single_io_context(cfqd, cic);
2576 2579
2577 spin_unlock_irqrestore(q->queue_lock, flags); 2580 spin_unlock_irqrestore(q->queue_lock, flags);
2578 } 2581 }
2579 } 2582 }
2580 2583
2581 /* 2584 /*
2582 * The process that ioc belongs to has exited, we need to clean up 2585 * The process that ioc belongs to has exited, we need to clean up
2583 * and put the internal structures we have that belongs to that process. 2586 * and put the internal structures we have that belongs to that process.
2584 */ 2587 */
2585 static void cfq_exit_io_context(struct io_context *ioc) 2588 static void cfq_exit_io_context(struct io_context *ioc)
2586 { 2589 {
2587 call_for_each_cic(ioc, cfq_exit_single_io_context); 2590 call_for_each_cic(ioc, cfq_exit_single_io_context);
2588 } 2591 }
2589 2592
2590 static struct cfq_io_context * 2593 static struct cfq_io_context *
2591 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2594 cfq_alloc_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2592 { 2595 {
2593 struct cfq_io_context *cic; 2596 struct cfq_io_context *cic;
2594 2597
2595 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO, 2598 cic = kmem_cache_alloc_node(cfq_ioc_pool, gfp_mask | __GFP_ZERO,
2596 cfqd->queue->node); 2599 cfqd->queue->node);
2597 if (cic) { 2600 if (cic) {
2598 cic->last_end_request = jiffies; 2601 cic->last_end_request = jiffies;
2599 INIT_LIST_HEAD(&cic->queue_list); 2602 INIT_LIST_HEAD(&cic->queue_list);
2600 INIT_HLIST_NODE(&cic->cic_list); 2603 INIT_HLIST_NODE(&cic->cic_list);
2601 cic->dtor = cfq_free_io_context; 2604 cic->dtor = cfq_free_io_context;
2602 cic->exit = cfq_exit_io_context; 2605 cic->exit = cfq_exit_io_context;
2603 elv_ioc_count_inc(cfq_ioc_count); 2606 elv_ioc_count_inc(cfq_ioc_count);
2604 } 2607 }
2605 2608
2606 return cic; 2609 return cic;
2607 } 2610 }
2608 2611
2609 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc) 2612 static void cfq_init_prio_data(struct cfq_queue *cfqq, struct io_context *ioc)
2610 { 2613 {
2611 struct task_struct *tsk = current; 2614 struct task_struct *tsk = current;
2612 int ioprio_class; 2615 int ioprio_class;
2613 2616
2614 if (!cfq_cfqq_prio_changed(cfqq)) 2617 if (!cfq_cfqq_prio_changed(cfqq))
2615 return; 2618 return;
2616 2619
2617 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio); 2620 ioprio_class = IOPRIO_PRIO_CLASS(ioc->ioprio);
2618 switch (ioprio_class) { 2621 switch (ioprio_class) {
2619 default: 2622 default:
2620 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class); 2623 printk(KERN_ERR "cfq: bad prio %x\n", ioprio_class);
2621 case IOPRIO_CLASS_NONE: 2624 case IOPRIO_CLASS_NONE:
2622 /* 2625 /*
2623 * no prio set, inherit CPU scheduling settings 2626 * no prio set, inherit CPU scheduling settings
2624 */ 2627 */
2625 cfqq->ioprio = task_nice_ioprio(tsk); 2628 cfqq->ioprio = task_nice_ioprio(tsk);
2626 cfqq->ioprio_class = task_nice_ioclass(tsk); 2629 cfqq->ioprio_class = task_nice_ioclass(tsk);
2627 break; 2630 break;
2628 case IOPRIO_CLASS_RT: 2631 case IOPRIO_CLASS_RT:
2629 cfqq->ioprio = task_ioprio(ioc); 2632 cfqq->ioprio = task_ioprio(ioc);
2630 cfqq->ioprio_class = IOPRIO_CLASS_RT; 2633 cfqq->ioprio_class = IOPRIO_CLASS_RT;
2631 break; 2634 break;
2632 case IOPRIO_CLASS_BE: 2635 case IOPRIO_CLASS_BE:
2633 cfqq->ioprio = task_ioprio(ioc); 2636 cfqq->ioprio = task_ioprio(ioc);
2634 cfqq->ioprio_class = IOPRIO_CLASS_BE; 2637 cfqq->ioprio_class = IOPRIO_CLASS_BE;
2635 break; 2638 break;
2636 case IOPRIO_CLASS_IDLE: 2639 case IOPRIO_CLASS_IDLE:
2637 cfqq->ioprio_class = IOPRIO_CLASS_IDLE; 2640 cfqq->ioprio_class = IOPRIO_CLASS_IDLE;
2638 cfqq->ioprio = 7; 2641 cfqq->ioprio = 7;
2639 cfq_clear_cfqq_idle_window(cfqq); 2642 cfq_clear_cfqq_idle_window(cfqq);
2640 break; 2643 break;
2641 } 2644 }
2642 2645
2643 /* 2646 /*
2644 * keep track of original prio settings in case we have to temporarily 2647 * keep track of original prio settings in case we have to temporarily
2645 * elevate the priority of this queue 2648 * elevate the priority of this queue
2646 */ 2649 */
2647 cfqq->org_ioprio = cfqq->ioprio; 2650 cfqq->org_ioprio = cfqq->ioprio;
2648 cfqq->org_ioprio_class = cfqq->ioprio_class; 2651 cfqq->org_ioprio_class = cfqq->ioprio_class;
2649 cfq_clear_cfqq_prio_changed(cfqq); 2652 cfq_clear_cfqq_prio_changed(cfqq);
2650 } 2653 }
2651 2654
2652 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic) 2655 static void changed_ioprio(struct io_context *ioc, struct cfq_io_context *cic)
2653 { 2656 {
2654 struct cfq_data *cfqd = cic->key; 2657 struct cfq_data *cfqd = cic->key;
2655 struct cfq_queue *cfqq; 2658 struct cfq_queue *cfqq;
2656 unsigned long flags; 2659 unsigned long flags;
2657 2660
2658 if (unlikely(!cfqd)) 2661 if (unlikely(!cfqd))
2659 return; 2662 return;
2660 2663
2661 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2664 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2662 2665
2663 cfqq = cic->cfqq[BLK_RW_ASYNC]; 2666 cfqq = cic->cfqq[BLK_RW_ASYNC];
2664 if (cfqq) { 2667 if (cfqq) {
2665 struct cfq_queue *new_cfqq; 2668 struct cfq_queue *new_cfqq;
2666 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc, 2669 new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic->ioc,
2667 GFP_ATOMIC); 2670 GFP_ATOMIC);
2668 if (new_cfqq) { 2671 if (new_cfqq) {
2669 cic->cfqq[BLK_RW_ASYNC] = new_cfqq; 2672 cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
2670 cfq_put_queue(cfqq); 2673 cfq_put_queue(cfqq);
2671 } 2674 }
2672 } 2675 }
2673 2676
2674 cfqq = cic->cfqq[BLK_RW_SYNC]; 2677 cfqq = cic->cfqq[BLK_RW_SYNC];
2675 if (cfqq) 2678 if (cfqq)
2676 cfq_mark_cfqq_prio_changed(cfqq); 2679 cfq_mark_cfqq_prio_changed(cfqq);
2677 2680
2678 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2681 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2679 } 2682 }
2680 2683
2681 static void cfq_ioc_set_ioprio(struct io_context *ioc) 2684 static void cfq_ioc_set_ioprio(struct io_context *ioc)
2682 { 2685 {
2683 call_for_each_cic(ioc, changed_ioprio); 2686 call_for_each_cic(ioc, changed_ioprio);
2684 ioc->ioprio_changed = 0; 2687 ioc->ioprio_changed = 0;
2685 } 2688 }
2686 2689
2687 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq, 2690 static void cfq_init_cfqq(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2688 pid_t pid, bool is_sync) 2691 pid_t pid, bool is_sync)
2689 { 2692 {
2690 RB_CLEAR_NODE(&cfqq->rb_node); 2693 RB_CLEAR_NODE(&cfqq->rb_node);
2691 RB_CLEAR_NODE(&cfqq->p_node); 2694 RB_CLEAR_NODE(&cfqq->p_node);
2692 INIT_LIST_HEAD(&cfqq->fifo); 2695 INIT_LIST_HEAD(&cfqq->fifo);
2693 2696
2694 atomic_set(&cfqq->ref, 0); 2697 atomic_set(&cfqq->ref, 0);
2695 cfqq->cfqd = cfqd; 2698 cfqq->cfqd = cfqd;
2696 2699
2697 cfq_mark_cfqq_prio_changed(cfqq); 2700 cfq_mark_cfqq_prio_changed(cfqq);
2698 2701
2699 if (is_sync) { 2702 if (is_sync) {
2700 if (!cfq_class_idle(cfqq)) 2703 if (!cfq_class_idle(cfqq))
2701 cfq_mark_cfqq_idle_window(cfqq); 2704 cfq_mark_cfqq_idle_window(cfqq);
2702 cfq_mark_cfqq_sync(cfqq); 2705 cfq_mark_cfqq_sync(cfqq);
2703 } 2706 }
2704 cfqq->pid = pid; 2707 cfqq->pid = pid;
2705 } 2708 }
2706 2709
2707 #ifdef CONFIG_CFQ_GROUP_IOSCHED 2710 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2708 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic) 2711 static void changed_cgroup(struct io_context *ioc, struct cfq_io_context *cic)
2709 { 2712 {
2710 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1); 2713 struct cfq_queue *sync_cfqq = cic_to_cfqq(cic, 1);
2711 struct cfq_data *cfqd = cic->key; 2714 struct cfq_data *cfqd = cic->key;
2712 unsigned long flags; 2715 unsigned long flags;
2713 struct request_queue *q; 2716 struct request_queue *q;
2714 2717
2715 if (unlikely(!cfqd)) 2718 if (unlikely(!cfqd))
2716 return; 2719 return;
2717 2720
2718 q = cfqd->queue; 2721 q = cfqd->queue;
2719 2722
2720 spin_lock_irqsave(q->queue_lock, flags); 2723 spin_lock_irqsave(q->queue_lock, flags);
2721 2724
2722 if (sync_cfqq) { 2725 if (sync_cfqq) {
2723 /* 2726 /*
2724 * Drop reference to sync queue. A new sync queue will be 2727 * Drop reference to sync queue. A new sync queue will be
2725 * assigned in new group upon arrival of a fresh request. 2728 * assigned in new group upon arrival of a fresh request.
2726 */ 2729 */
2727 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup"); 2730 cfq_log_cfqq(cfqd, sync_cfqq, "changed cgroup");
2728 cic_set_cfqq(cic, NULL, 1); 2731 cic_set_cfqq(cic, NULL, 1);
2729 cfq_put_queue(sync_cfqq); 2732 cfq_put_queue(sync_cfqq);
2730 } 2733 }
2731 2734
2732 spin_unlock_irqrestore(q->queue_lock, flags); 2735 spin_unlock_irqrestore(q->queue_lock, flags);
2733 } 2736 }
2734 2737
2735 static void cfq_ioc_set_cgroup(struct io_context *ioc) 2738 static void cfq_ioc_set_cgroup(struct io_context *ioc)
2736 { 2739 {
2737 call_for_each_cic(ioc, changed_cgroup); 2740 call_for_each_cic(ioc, changed_cgroup);
2738 ioc->cgroup_changed = 0; 2741 ioc->cgroup_changed = 0;
2739 } 2742 }
2740 #endif /* CONFIG_CFQ_GROUP_IOSCHED */ 2743 #endif /* CONFIG_CFQ_GROUP_IOSCHED */
2741 2744
2742 static struct cfq_queue * 2745 static struct cfq_queue *
2743 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, 2746 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync,
2744 struct io_context *ioc, gfp_t gfp_mask) 2747 struct io_context *ioc, gfp_t gfp_mask)
2745 { 2748 {
2746 struct cfq_queue *cfqq, *new_cfqq = NULL; 2749 struct cfq_queue *cfqq, *new_cfqq = NULL;
2747 struct cfq_io_context *cic; 2750 struct cfq_io_context *cic;
2748 struct cfq_group *cfqg; 2751 struct cfq_group *cfqg;
2749 2752
2750 retry: 2753 retry:
2751 cfqg = cfq_get_cfqg(cfqd, 1); 2754 cfqg = cfq_get_cfqg(cfqd, 1);
2752 cic = cfq_cic_lookup(cfqd, ioc); 2755 cic = cfq_cic_lookup(cfqd, ioc);
2753 /* cic always exists here */ 2756 /* cic always exists here */
2754 cfqq = cic_to_cfqq(cic, is_sync); 2757 cfqq = cic_to_cfqq(cic, is_sync);
2755 2758
2756 /* 2759 /*
2757 * Always try a new alloc if we fell back to the OOM cfqq 2760 * Always try a new alloc if we fell back to the OOM cfqq
2758 * originally, since it should just be a temporary situation. 2761 * originally, since it should just be a temporary situation.
2759 */ 2762 */
2760 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 2763 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
2761 cfqq = NULL; 2764 cfqq = NULL;
2762 if (new_cfqq) { 2765 if (new_cfqq) {
2763 cfqq = new_cfqq; 2766 cfqq = new_cfqq;
2764 new_cfqq = NULL; 2767 new_cfqq = NULL;
2765 } else if (gfp_mask & __GFP_WAIT) { 2768 } else if (gfp_mask & __GFP_WAIT) {
2766 spin_unlock_irq(cfqd->queue->queue_lock); 2769 spin_unlock_irq(cfqd->queue->queue_lock);
2767 new_cfqq = kmem_cache_alloc_node(cfq_pool, 2770 new_cfqq = kmem_cache_alloc_node(cfq_pool,
2768 gfp_mask | __GFP_ZERO, 2771 gfp_mask | __GFP_ZERO,
2769 cfqd->queue->node); 2772 cfqd->queue->node);
2770 spin_lock_irq(cfqd->queue->queue_lock); 2773 spin_lock_irq(cfqd->queue->queue_lock);
2771 if (new_cfqq) 2774 if (new_cfqq)
2772 goto retry; 2775 goto retry;
2773 } else { 2776 } else {
2774 cfqq = kmem_cache_alloc_node(cfq_pool, 2777 cfqq = kmem_cache_alloc_node(cfq_pool,
2775 gfp_mask | __GFP_ZERO, 2778 gfp_mask | __GFP_ZERO,
2776 cfqd->queue->node); 2779 cfqd->queue->node);
2777 } 2780 }
2778 2781
2779 if (cfqq) { 2782 if (cfqq) {
2780 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync); 2783 cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
2781 cfq_init_prio_data(cfqq, ioc); 2784 cfq_init_prio_data(cfqq, ioc);
2782 cfq_link_cfqq_cfqg(cfqq, cfqg); 2785 cfq_link_cfqq_cfqg(cfqq, cfqg);
2783 cfq_log_cfqq(cfqd, cfqq, "alloced"); 2786 cfq_log_cfqq(cfqd, cfqq, "alloced");
2784 } else 2787 } else
2785 cfqq = &cfqd->oom_cfqq; 2788 cfqq = &cfqd->oom_cfqq;
2786 } 2789 }
2787 2790
2788 if (new_cfqq) 2791 if (new_cfqq)
2789 kmem_cache_free(cfq_pool, new_cfqq); 2792 kmem_cache_free(cfq_pool, new_cfqq);
2790 2793
2791 return cfqq; 2794 return cfqq;
2792 } 2795 }
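/*
 * Note on the retry dance above: when the caller may sleep (__GFP_WAIT),
 * the queue_lock is dropped around the slab allocation and the lookup is
 * redone from the retry label, since the cic/cfqq state can change while
 * the lock is released. If allocation fails, the embedded oom_cfqq is
 * used instead of failing the request, and a speculatively allocated
 * cfqq that turned out to be unneeded is freed at the end.
 */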
2793 2796
2794 static struct cfq_queue ** 2797 static struct cfq_queue **
2795 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio) 2798 cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
2796 { 2799 {
2797 switch (ioprio_class) { 2800 switch (ioprio_class) {
2798 case IOPRIO_CLASS_RT: 2801 case IOPRIO_CLASS_RT:
2799 return &cfqd->async_cfqq[0][ioprio]; 2802 return &cfqd->async_cfqq[0][ioprio];
2800 case IOPRIO_CLASS_BE: 2803 case IOPRIO_CLASS_BE:
2801 return &cfqd->async_cfqq[1][ioprio]; 2804 return &cfqd->async_cfqq[1][ioprio];
2802 case IOPRIO_CLASS_IDLE: 2805 case IOPRIO_CLASS_IDLE:
2803 return &cfqd->async_idle_cfqq; 2806 return &cfqd->async_idle_cfqq;
2804 default: 2807 default:
2805 BUG(); 2808 BUG();
2806 } 2809 }
2807 } 2810 }
2808 2811
2809 static struct cfq_queue * 2812 static struct cfq_queue *
2810 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc, 2813 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct io_context *ioc,
2811 gfp_t gfp_mask) 2814 gfp_t gfp_mask)
2812 { 2815 {
2813 const int ioprio = task_ioprio(ioc); 2816 const int ioprio = task_ioprio(ioc);
2814 const int ioprio_class = task_ioprio_class(ioc); 2817 const int ioprio_class = task_ioprio_class(ioc);
2815 struct cfq_queue **async_cfqq = NULL; 2818 struct cfq_queue **async_cfqq = NULL;
2816 struct cfq_queue *cfqq = NULL; 2819 struct cfq_queue *cfqq = NULL;
2817 2820
2818 if (!is_sync) { 2821 if (!is_sync) {
2819 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio); 2822 async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
2820 cfqq = *async_cfqq; 2823 cfqq = *async_cfqq;
2821 } 2824 }
2822 2825
2823 if (!cfqq) 2826 if (!cfqq)
2824 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask); 2827 cfqq = cfq_find_alloc_queue(cfqd, is_sync, ioc, gfp_mask);
2825 2828
2826 /* 2829 /*
2827 * pin the queue now that it's allocated, scheduler exit will prune it 2830 * pin the queue now that it's allocated, scheduler exit will prune it
2828 */ 2831 */
2829 if (!is_sync && !(*async_cfqq)) { 2832 if (!is_sync && !(*async_cfqq)) {
2830 atomic_inc(&cfqq->ref); 2833 atomic_inc(&cfqq->ref);
2831 *async_cfqq = cfqq; 2834 *async_cfqq = cfqq;
2832 } 2835 }
2833 2836
2834 atomic_inc(&cfqq->ref); 2837 atomic_inc(&cfqq->ref);
2835 return cfqq; 2838 return cfqq;
2836 } 2839 }
2837 2840
2838 /* 2841 /*
2839 * We drop cfq io contexts lazily, so we may find a dead one. 2842 * We drop cfq io contexts lazily, so we may find a dead one.
2840 */ 2843 */
2841 static void 2844 static void
2842 cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc, 2845 cfq_drop_dead_cic(struct cfq_data *cfqd, struct io_context *ioc,
2843 struct cfq_io_context *cic) 2846 struct cfq_io_context *cic)
2844 { 2847 {
2845 unsigned long flags; 2848 unsigned long flags;
2846 2849
2847 WARN_ON(!list_empty(&cic->queue_list)); 2850 WARN_ON(!list_empty(&cic->queue_list));
2848 2851
2849 spin_lock_irqsave(&ioc->lock, flags); 2852 spin_lock_irqsave(&ioc->lock, flags);
2850 2853
2851 BUG_ON(ioc->ioc_data == cic); 2854 BUG_ON(ioc->ioc_data == cic);
2852 2855
2853 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd); 2856 radix_tree_delete(&ioc->radix_root, (unsigned long) cfqd);
2854 hlist_del_rcu(&cic->cic_list); 2857 hlist_del_rcu(&cic->cic_list);
2855 spin_unlock_irqrestore(&ioc->lock, flags); 2858 spin_unlock_irqrestore(&ioc->lock, flags);
2856 2859
2857 cfq_cic_free(cic); 2860 cfq_cic_free(cic);
2858 } 2861 }
2859 2862
2860 static struct cfq_io_context * 2863 static struct cfq_io_context *
2861 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc) 2864 cfq_cic_lookup(struct cfq_data *cfqd, struct io_context *ioc)
2862 { 2865 {
2863 struct cfq_io_context *cic; 2866 struct cfq_io_context *cic;
2864 unsigned long flags; 2867 unsigned long flags;
2865 void *k; 2868 void *k;
2866 2869
2867 if (unlikely(!ioc)) 2870 if (unlikely(!ioc))
2868 return NULL; 2871 return NULL;
2869 2872
2870 rcu_read_lock(); 2873 rcu_read_lock();
2871 2874
2872 /* 2875 /*
2873 * we maintain a last-hit cache, to avoid browsing over the tree 2876 * we maintain a last-hit cache, to avoid browsing over the tree
2874 */ 2877 */
2875 cic = rcu_dereference(ioc->ioc_data); 2878 cic = rcu_dereference(ioc->ioc_data);
2876 if (cic && cic->key == cfqd) { 2879 if (cic && cic->key == cfqd) {
2877 rcu_read_unlock(); 2880 rcu_read_unlock();
2878 return cic; 2881 return cic;
2879 } 2882 }
2880 2883
2881 do { 2884 do {
2882 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd); 2885 cic = radix_tree_lookup(&ioc->radix_root, (unsigned long) cfqd);
2883 rcu_read_unlock(); 2886 rcu_read_unlock();
2884 if (!cic) 2887 if (!cic)
2885 break; 2888 break;
2886 /* ->key must be copied to avoid race with cfq_exit_queue() */ 2889 /* ->key must be copied to avoid race with cfq_exit_queue() */
2887 k = cic->key; 2890 k = cic->key;
2888 if (unlikely(!k)) { 2891 if (unlikely(!k)) {
2889 cfq_drop_dead_cic(cfqd, ioc, cic); 2892 cfq_drop_dead_cic(cfqd, ioc, cic);
2890 rcu_read_lock(); 2893 rcu_read_lock();
2891 continue; 2894 continue;
2892 } 2895 }
2893 2896
2894 spin_lock_irqsave(&ioc->lock, flags); 2897 spin_lock_irqsave(&ioc->lock, flags);
2895 rcu_assign_pointer(ioc->ioc_data, cic); 2898 rcu_assign_pointer(ioc->ioc_data, cic);
2896 spin_unlock_irqrestore(&ioc->lock, flags); 2899 spin_unlock_irqrestore(&ioc->lock, flags);
2897 break; 2900 break;
2898 } while (1); 2901 } while (1);
2899 2902
2900 return cic; 2903 return cic;
2901 } 2904 }
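/*
 * Lookup strategy above, roughly: try the one-entry per-ioc cache
 * (ioc->ioc_data) under rcu_read_lock() first; on a miss, fall back to
 * the radix tree keyed by the cfqd pointer, lazily dropping any cic
 * whose ->key was cleared by queue teardown, and promote a hit into the
 * one-entry cache for the next lookup.
 */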
2902 2905
2903 /* 2906 /*
2904 * Add cic into ioc, using cfqd as the search key. This enables us to lookup 2907 * Add cic into ioc, using cfqd as the search key. This enables us to lookup
2905 * the process specific cfq io context when entered from the block layer. 2908 * the process specific cfq io context when entered from the block layer.
2906 * Also adds the cic to a per-cfqd list, used when this queue is removed. 2909 * Also adds the cic to a per-cfqd list, used when this queue is removed.
2907 */ 2910 */
2908 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc, 2911 static int cfq_cic_link(struct cfq_data *cfqd, struct io_context *ioc,
2909 struct cfq_io_context *cic, gfp_t gfp_mask) 2912 struct cfq_io_context *cic, gfp_t gfp_mask)
2910 { 2913 {
2911 unsigned long flags; 2914 unsigned long flags;
2912 int ret; 2915 int ret;
2913 2916
2914 ret = radix_tree_preload(gfp_mask); 2917 ret = radix_tree_preload(gfp_mask);
2915 if (!ret) { 2918 if (!ret) {
2916 cic->ioc = ioc; 2919 cic->ioc = ioc;
2917 cic->key = cfqd; 2920 cic->key = cfqd;
2918 2921
2919 spin_lock_irqsave(&ioc->lock, flags); 2922 spin_lock_irqsave(&ioc->lock, flags);
2920 ret = radix_tree_insert(&ioc->radix_root, 2923 ret = radix_tree_insert(&ioc->radix_root,
2921 (unsigned long) cfqd, cic); 2924 (unsigned long) cfqd, cic);
2922 if (!ret) 2925 if (!ret)
2923 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list); 2926 hlist_add_head_rcu(&cic->cic_list, &ioc->cic_list);
2924 spin_unlock_irqrestore(&ioc->lock, flags); 2927 spin_unlock_irqrestore(&ioc->lock, flags);
2925 2928
2926 radix_tree_preload_end(); 2929 radix_tree_preload_end();
2927 2930
2928 if (!ret) { 2931 if (!ret) {
2929 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 2932 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
2930 list_add(&cic->queue_list, &cfqd->cic_list); 2933 list_add(&cic->queue_list, &cfqd->cic_list);
2931 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 2934 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
2932 } 2935 }
2933 } 2936 }
2934 2937
2935 if (ret) 2938 if (ret)
2936 printk(KERN_ERR "cfq: cic link failed!\n"); 2939 printk(KERN_ERR "cfq: cic link failed!\n");
2937 2940
2938 return ret; 2941 return ret;
2939 } 2942 }
2940 2943
2941 /* 2944 /*
2942 * Setup general io context and cfq io context. There can be several cfq 2945 * Setup general io context and cfq io context. There can be several cfq
2943 * io contexts per general io context, if this process is doing io to more 2946 * io contexts per general io context, if this process is doing io to more
2944 * than one device managed by cfq. 2947 * than one device managed by cfq.
2945 */ 2948 */
2946 static struct cfq_io_context * 2949 static struct cfq_io_context *
2947 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask) 2950 cfq_get_io_context(struct cfq_data *cfqd, gfp_t gfp_mask)
2948 { 2951 {
2949 struct io_context *ioc = NULL; 2952 struct io_context *ioc = NULL;
2950 struct cfq_io_context *cic; 2953 struct cfq_io_context *cic;
2951 2954
2952 might_sleep_if(gfp_mask & __GFP_WAIT); 2955 might_sleep_if(gfp_mask & __GFP_WAIT);
2953 2956
2954 ioc = get_io_context(gfp_mask, cfqd->queue->node); 2957 ioc = get_io_context(gfp_mask, cfqd->queue->node);
2955 if (!ioc) 2958 if (!ioc)
2956 return NULL; 2959 return NULL;
2957 2960
2958 cic = cfq_cic_lookup(cfqd, ioc); 2961 cic = cfq_cic_lookup(cfqd, ioc);
2959 if (cic) 2962 if (cic)
2960 goto out; 2963 goto out;
2961 2964
2962 cic = cfq_alloc_io_context(cfqd, gfp_mask); 2965 cic = cfq_alloc_io_context(cfqd, gfp_mask);
2963 if (cic == NULL) 2966 if (cic == NULL)
2964 goto err; 2967 goto err;
2965 2968
2966 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask)) 2969 if (cfq_cic_link(cfqd, ioc, cic, gfp_mask))
2967 goto err_free; 2970 goto err_free;
2968 2971
2969 out: 2972 out:
2970 smp_read_barrier_depends(); 2973 smp_read_barrier_depends();
2971 if (unlikely(ioc->ioprio_changed)) 2974 if (unlikely(ioc->ioprio_changed))
2972 cfq_ioc_set_ioprio(ioc); 2975 cfq_ioc_set_ioprio(ioc);
2973 2976
2974 #ifdef CONFIG_CFQ_GROUP_IOSCHED 2977 #ifdef CONFIG_CFQ_GROUP_IOSCHED
2975 if (unlikely(ioc->cgroup_changed)) 2978 if (unlikely(ioc->cgroup_changed))
2976 cfq_ioc_set_cgroup(ioc); 2979 cfq_ioc_set_cgroup(ioc);
2977 #endif 2980 #endif
2978 return cic; 2981 return cic;
2979 err_free: 2982 err_free:
2980 cfq_cic_free(cic); 2983 cfq_cic_free(cic);
2981 err: 2984 err:
2982 put_io_context(ioc); 2985 put_io_context(ioc);
2983 return NULL; 2986 return NULL;
2984 } 2987 }
2985 2988
2986 static void 2989 static void
2987 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic) 2990 cfq_update_io_thinktime(struct cfq_data *cfqd, struct cfq_io_context *cic)
2988 { 2991 {
2989 unsigned long elapsed = jiffies - cic->last_end_request; 2992 unsigned long elapsed = jiffies - cic->last_end_request;
2990 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle); 2993 unsigned long ttime = min(elapsed, 2UL * cfqd->cfq_slice_idle);
2991 2994
2992 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8; 2995 cic->ttime_samples = (7*cic->ttime_samples + 256) / 8;
2993 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8; 2996 cic->ttime_total = (7*cic->ttime_total + 256*ttime) / 8;
2994 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples; 2997 cic->ttime_mean = (cic->ttime_total + 128) / cic->ttime_samples;
2995 } 2998 }
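/*
 * A minimal user-space sketch (not kernel code) of the fixed-point EWMA
 * used in cfq_update_io_thinktime() above: the sample count and running
 * total both decay by 7/8 per update and are scaled by 256 so the mean
 * falls out of pure integer arithmetic. The sample values below are made
 * up purely for illustration.
 */
#include <stdio.h>

struct ttime_est {
	unsigned long samples;	/* scaled sample count, converges to 256 */
	unsigned long total;	/* scaled, decayed sum of think times */
	unsigned long mean;	/* estimated mean think time */
};

static void ttime_update(struct ttime_est *t, unsigned long ttime)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total = (7 * t->total + 256 * ttime) / 8;
	t->mean = (t->total + 128) / t->samples;
}

int main(void)
{
	struct ttime_est t = { 0, 0, 0 };
	unsigned long think[] = { 4, 4, 4, 40, 4 };	/* "jiffies" */
	unsigned int i;

	for (i = 0; i < sizeof(think) / sizeof(think[0]); i++) {
		ttime_update(&t, think[i]);
		printf("sample %u: mean ~= %lu\n", i, t.mean);
	}
	return 0;
}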
2996 2999
2997 static void 3000 static void
2998 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3001 cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
2999 struct request *rq) 3002 struct request *rq)
3000 { 3003 {
3001 sector_t sdist = 0; 3004 sector_t sdist = 0;
3002 sector_t n_sec = blk_rq_sectors(rq); 3005 sector_t n_sec = blk_rq_sectors(rq);
3003 if (cfqq->last_request_pos) { 3006 if (cfqq->last_request_pos) {
3004 if (cfqq->last_request_pos < blk_rq_pos(rq)) 3007 if (cfqq->last_request_pos < blk_rq_pos(rq))
3005 sdist = blk_rq_pos(rq) - cfqq->last_request_pos; 3008 sdist = blk_rq_pos(rq) - cfqq->last_request_pos;
3006 else 3009 else
3007 sdist = cfqq->last_request_pos - blk_rq_pos(rq); 3010 sdist = cfqq->last_request_pos - blk_rq_pos(rq);
3008 } 3011 }
3009 3012
3010 cfqq->seek_history <<= 1; 3013 cfqq->seek_history <<= 1;
3011 if (blk_queue_nonrot(cfqd->queue)) 3014 if (blk_queue_nonrot(cfqd->queue))
3012 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT); 3015 cfqq->seek_history |= (n_sec < CFQQ_SECT_THR_NONROT);
3013 else 3016 else
3014 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR); 3017 cfqq->seek_history |= (sdist > CFQQ_SEEK_THR);
3015 } 3018 }
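/*
 * A standalone sketch (not kernel code) of the seek_history window updated
 * in cfq_update_io_seektime() above: each request shifts the 32-bit window
 * left and ORs in one bit marking the request as seeky. The popcount test
 * and the 1/8 cutoff below only illustrate how a CFQQ_SEEKY()-style check
 * can consume the window; they are assumptions, not the in-tree definition.
 */
#include <stdio.h>

#define SEEK_THR_SECTORS	(8 * 1024)	/* illustrative threshold */

static unsigned int seek_history;

static void update_seek_history(unsigned long long last_pos,
				unsigned long long pos)
{
	unsigned long long sdist = pos > last_pos ? pos - last_pos
						  : last_pos - pos;

	seek_history <<= 1;
	seek_history |= (sdist > SEEK_THR_SECTORS);
}

static int queue_is_seeky(void)
{
	/* seeky if more than 1/8 of the last 32 requests were seeks */
	return __builtin_popcount(seek_history) > 32 / 8;
}

int main(void)
{
	unsigned long long last = 0, pos;
	int i;

	for (i = 0; i < 32; i++) {
		pos = last + (i % 2 ? 64 : 1000000);	/* alternate near/far */
		update_seek_history(last, pos);
		last = pos;
	}
	printf("seeky: %d\n", queue_is_seeky());
	return 0;
}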
3016 3019
3017 /* 3020 /*
3018 * Disable idle window if the process thinks too long or seeks so much that 3021 * Disable idle window if the process thinks too long or seeks so much that
3019 * it doesn't matter 3022 * it doesn't matter
3020 */ 3023 */
3021 static void 3024 static void
3022 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3025 cfq_update_idle_window(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3023 struct cfq_io_context *cic) 3026 struct cfq_io_context *cic)
3024 { 3027 {
3025 int old_idle, enable_idle; 3028 int old_idle, enable_idle;
3026 3029
3027 /* 3030 /*
3028 * Don't idle for async or idle io prio class 3031 * Don't idle for async or idle io prio class
3029 */ 3032 */
3030 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq)) 3033 if (!cfq_cfqq_sync(cfqq) || cfq_class_idle(cfqq))
3031 return; 3034 return;
3032 3035
3033 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq); 3036 enable_idle = old_idle = cfq_cfqq_idle_window(cfqq);
3034 3037
3035 if (cfqq->queued[0] + cfqq->queued[1] >= 4) 3038 if (cfqq->queued[0] + cfqq->queued[1] >= 4)
3036 cfq_mark_cfqq_deep(cfqq); 3039 cfq_mark_cfqq_deep(cfqq);
3037 3040
3038 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle || 3041 if (!atomic_read(&cic->ioc->nr_tasks) || !cfqd->cfq_slice_idle ||
3039 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq))) 3042 (!cfq_cfqq_deep(cfqq) && CFQQ_SEEKY(cfqq)))
3040 enable_idle = 0; 3043 enable_idle = 0;
3041 else if (sample_valid(cic->ttime_samples)) { 3044 else if (sample_valid(cic->ttime_samples)) {
3042 if (cic->ttime_mean > cfqd->cfq_slice_idle) 3045 if (cic->ttime_mean > cfqd->cfq_slice_idle)
3043 enable_idle = 0; 3046 enable_idle = 0;
3044 else 3047 else
3045 enable_idle = 1; 3048 enable_idle = 1;
3046 } 3049 }
3047 3050
3048 if (old_idle != enable_idle) { 3051 if (old_idle != enable_idle) {
3049 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle); 3052 cfq_log_cfqq(cfqd, cfqq, "idle=%d", enable_idle);
3050 if (enable_idle) 3053 if (enable_idle)
3051 cfq_mark_cfqq_idle_window(cfqq); 3054 cfq_mark_cfqq_idle_window(cfqq);
3052 else 3055 else
3053 cfq_clear_cfqq_idle_window(cfqq); 3056 cfq_clear_cfqq_idle_window(cfqq);
3054 } 3057 }
3055 } 3058 }
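/*
 * A simplified standalone sketch (not kernel code) of the decision made in
 * cfq_update_idle_window() above, with the marking/clearing bookkeeping
 * stripped out: idling is only kept when the queue's owner either drives a
 * deep queue or submits nearby I/O, and its mean think time fits inside
 * the configured idle slice.
 */
#include <stdbool.h>
#include <stdio.h>

static bool should_idle(bool seeky, bool deep, unsigned long ttime_mean,
			unsigned long slice_idle)
{
	if (!slice_idle || (seeky && !deep))
		return false;			/* idling would waste the disk */
	return ttime_mean <= slice_idle;	/* owner comes back quickly */
}

int main(void)
{
	printf("%d\n", should_idle(false, false, 2, 8));	/* 1: keep idling */
	printf("%d\n", should_idle(true, false, 2, 8));		/* 0: too seeky */
	printf("%d\n", should_idle(false, false, 20, 8));	/* 0: slow thinker */
	return 0;
}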
3056 3059
3057 /* 3060 /*
3058 * Check if new_cfqq should preempt the currently active queue. Return 0 for 3061 * Check if new_cfqq should preempt the currently active queue. Return 0 for
3059 * no (or if we aren't sure); a 1 will cause a preempt. 3062 * no (or if we aren't sure); a 1 will cause a preempt.
3060 */ 3063 */
3061 static bool 3064 static bool
3062 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, 3065 cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
3063 struct request *rq) 3066 struct request *rq)
3064 { 3067 {
3065 struct cfq_queue *cfqq; 3068 struct cfq_queue *cfqq;
3066 3069
3067 cfqq = cfqd->active_queue; 3070 cfqq = cfqd->active_queue;
3068 if (!cfqq) 3071 if (!cfqq)
3069 return false; 3072 return false;
3070 3073
3071 if (cfq_class_idle(new_cfqq)) 3074 if (cfq_class_idle(new_cfqq))
3072 return false; 3075 return false;
3073 3076
3074 if (cfq_class_idle(cfqq)) 3077 if (cfq_class_idle(cfqq))
3075 return true; 3078 return true;
3076 3079
3077 /* 3080 /*
3078 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. 3081 * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
3079 */ 3082 */
3080 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) 3083 if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
3081 return false; 3084 return false;
3082 3085
3083 /* 3086 /*
3084 * if the new request is sync, but the currently running queue is 3087 * if the new request is sync, but the currently running queue is
3085 * not, let the sync request have priority. 3088 * not, let the sync request have priority.
3086 */ 3089 */
3087 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) 3090 if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq))
3088 return true; 3091 return true;
3089 3092
3090 if (new_cfqq->cfqg != cfqq->cfqg) 3093 if (new_cfqq->cfqg != cfqq->cfqg)
3091 return false; 3094 return false;
3092 3095
3093 if (cfq_slice_used(cfqq)) 3096 if (cfq_slice_used(cfqq))
3094 return true; 3097 return true;
3095 3098
3096 /* Allow preemption only if we are idling on sync-noidle tree */ 3099 /* Allow preemption only if we are idling on sync-noidle tree */
3097 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD && 3100 if (cfqd->serving_type == SYNC_NOIDLE_WORKLOAD &&
3098 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD && 3101 cfqq_type(new_cfqq) == SYNC_NOIDLE_WORKLOAD &&
3099 new_cfqq->service_tree->count == 2 && 3102 new_cfqq->service_tree->count == 2 &&
3100 RB_EMPTY_ROOT(&cfqq->sort_list)) 3103 RB_EMPTY_ROOT(&cfqq->sort_list))
3101 return true; 3104 return true;
3102 3105
3103 /* 3106 /*
3104 * So both queues are sync. Let the new request get disk time if 3107 * So both queues are sync. Let the new request get disk time if
3105 * it's a metadata request and the current queue is doing regular IO. 3108 * it's a metadata request and the current queue is doing regular IO.
3106 */ 3109 */
3107 if (rq_is_meta(rq) && !cfqq->meta_pending) 3110 if (rq_is_meta(rq) && !cfqq->meta_pending)
3108 return true; 3111 return true;
3109 3112
3110 /* 3113 /*
3111 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice. 3114 * Allow an RT request to pre-empt an ongoing non-RT cfqq timeslice.
3112 */ 3115 */
3113 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq)) 3116 if (cfq_class_rt(new_cfqq) && !cfq_class_rt(cfqq))
3114 return true; 3117 return true;
3115 3118
3116 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq)) 3119 if (!cfqd->active_cic || !cfq_cfqq_wait_request(cfqq))
3117 return false; 3120 return false;
3118 3121
3119 /* 3122 /*
3120 * if this request is as-good as one we would expect from the 3123 * if this request is as-good as one we would expect from the
3121 * current cfqq, let it preempt 3124 * current cfqq, let it preempt
3122 */ 3125 */
3123 if (cfq_rq_close(cfqd, cfqq, rq)) 3126 if (cfq_rq_close(cfqd, cfqq, rq))
3124 return true; 3127 return true;
3125 3128
3126 return false; 3129 return false;
3127 } 3130 }
3128 3131
3129 /* 3132 /*
3130 * cfqq preempts the active queue. if we allowed preempt with no slice left, 3133 * cfqq preempts the active queue. if we allowed preempt with no slice left,
3131 * let it have half of its nominal slice. 3134 * let it have half of its nominal slice.
3132 */ 3135 */
3133 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3136 static void cfq_preempt_queue(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3134 { 3137 {
3135 cfq_log_cfqq(cfqd, cfqq, "preempt"); 3138 cfq_log_cfqq(cfqd, cfqq, "preempt");
3136 cfq_slice_expired(cfqd, 1); 3139 cfq_slice_expired(cfqd, 1);
3137 3140
3138 /* 3141 /*
3139 * Put the new queue at the front of the current list, 3142 * Put the new queue at the front of the current list,
3140 * so we know that it will be selected next. 3143 * so we know that it will be selected next.
3141 */ 3144 */
3142 BUG_ON(!cfq_cfqq_on_rr(cfqq)); 3145 BUG_ON(!cfq_cfqq_on_rr(cfqq));
3143 3146
3144 cfq_service_tree_add(cfqd, cfqq, 1); 3147 cfq_service_tree_add(cfqd, cfqq, 1);
3145 3148
3146 cfqq->slice_end = 0; 3149 cfqq->slice_end = 0;
3147 cfq_mark_cfqq_slice_new(cfqq); 3150 cfq_mark_cfqq_slice_new(cfqq);
3148 } 3151 }
3149 3152
3150 /* 3153 /*
3151 * Called when a new fs request (rq) is added (to cfqq). Check if there's 3154 * Called when a new fs request (rq) is added (to cfqq). Check if there's
3152 * something we should do about it 3155 * something we should do about it
3153 */ 3156 */
3154 static void 3157 static void
3155 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, 3158 cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq,
3156 struct request *rq) 3159 struct request *rq)
3157 { 3160 {
3158 struct cfq_io_context *cic = RQ_CIC(rq); 3161 struct cfq_io_context *cic = RQ_CIC(rq);
3159 3162
3160 cfqd->rq_queued++; 3163 cfqd->rq_queued++;
3161 if (rq_is_meta(rq)) 3164 if (rq_is_meta(rq))
3162 cfqq->meta_pending++; 3165 cfqq->meta_pending++;
3163 3166
3164 cfq_update_io_thinktime(cfqd, cic); 3167 cfq_update_io_thinktime(cfqd, cic);
3165 cfq_update_io_seektime(cfqd, cfqq, rq); 3168 cfq_update_io_seektime(cfqd, cfqq, rq);
3166 cfq_update_idle_window(cfqd, cfqq, cic); 3169 cfq_update_idle_window(cfqd, cfqq, cic);
3167 3170
3168 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); 3171 cfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
3169 3172
3170 if (cfqq == cfqd->active_queue) { 3173 if (cfqq == cfqd->active_queue) {
3171 /* 3174 /*
3172 * Remember that we saw a request from this process, but 3175 * Remember that we saw a request from this process, but
3173 * don't start queuing just yet. Otherwise we risk seeing lots 3176 * don't start queuing just yet. Otherwise we risk seeing lots
3174 * of tiny requests, because we disrupt the normal plugging 3177 * of tiny requests, because we disrupt the normal plugging
3175 * and merging. If the request is already larger than a single 3178 * and merging. If the request is already larger than a single
3176 * page, let it rip immediately. For that case we assume that 3179 * page, let it rip immediately. For that case we assume that
3177 * merging is already done. Ditto for a busy system that 3180 * merging is already done. Ditto for a busy system that
3178 * has other work pending, don't risk delaying until the 3181 * has other work pending, don't risk delaying until the
3179 * idle timer unplug to continue working. 3182 * idle timer unplug to continue working.
3180 */ 3183 */
3181 if (cfq_cfqq_wait_request(cfqq)) { 3184 if (cfq_cfqq_wait_request(cfqq)) {
3182 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || 3185 if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE ||
3183 cfqd->busy_queues > 1) { 3186 cfqd->busy_queues > 1) {
3184 del_timer(&cfqd->idle_slice_timer); 3187 del_timer(&cfqd->idle_slice_timer);
3185 cfq_clear_cfqq_wait_request(cfqq); 3188 cfq_clear_cfqq_wait_request(cfqq);
3186 __blk_run_queue(cfqd->queue); 3189 __blk_run_queue(cfqd->queue);
3187 } else 3190 } else
3188 cfq_mark_cfqq_must_dispatch(cfqq); 3191 cfq_mark_cfqq_must_dispatch(cfqq);
3189 } 3192 }
3190 } else if (cfq_should_preempt(cfqd, cfqq, rq)) { 3193 } else if (cfq_should_preempt(cfqd, cfqq, rq)) {
3191 /* 3194 /*
3192 * not the active queue - expire current slice if it is 3195 * not the active queue - expire current slice if it is
3193 * idle and has expired its mean thinktime or this new queue 3196 * idle and has expired its mean thinktime or this new queue
3194 * has some old slice time left and is of higher priority or 3197 * has some old slice time left and is of higher priority or
3195 * this new queue is RT and the current one is BE 3198 * this new queue is RT and the current one is BE
3196 */ 3199 */
3197 cfq_preempt_queue(cfqd, cfqq); 3200 cfq_preempt_queue(cfqd, cfqq);
3198 __blk_run_queue(cfqd->queue); 3201 __blk_run_queue(cfqd->queue);
3199 } 3202 }
3200 } 3203 }
3201 3204
3202 static void cfq_insert_request(struct request_queue *q, struct request *rq) 3205 static void cfq_insert_request(struct request_queue *q, struct request *rq)
3203 { 3206 {
3204 struct cfq_data *cfqd = q->elevator->elevator_data; 3207 struct cfq_data *cfqd = q->elevator->elevator_data;
3205 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3208 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3206 3209
3207 cfq_log_cfqq(cfqd, cfqq, "insert_request"); 3210 cfq_log_cfqq(cfqd, cfqq, "insert_request");
3208 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc); 3211 cfq_init_prio_data(cfqq, RQ_CIC(rq)->ioc);
3209 3212
3210 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]); 3213 rq_set_fifo_time(rq, jiffies + cfqd->cfq_fifo_expire[rq_is_sync(rq)]);
3211 list_add_tail(&rq->queuelist, &cfqq->fifo); 3214 list_add_tail(&rq->queuelist, &cfqq->fifo);
3212 cfq_add_rq_rb(rq); 3215 cfq_add_rq_rb(rq);
3213 3216
3214 cfq_rq_enqueued(cfqd, cfqq, rq); 3217 cfq_rq_enqueued(cfqd, cfqq, rq);
3215 } 3218 }
3216 3219
3217 /* 3220 /*
3218 * Update hw_tag based on peak queue depth over 50 samples under 3221 * Update hw_tag based on peak queue depth over 50 samples under
3219 * sufficient load. 3222 * sufficient load.
3220 */ 3223 */
3221 static void cfq_update_hw_tag(struct cfq_data *cfqd) 3224 static void cfq_update_hw_tag(struct cfq_data *cfqd)
3222 { 3225 {
3223 struct cfq_queue *cfqq = cfqd->active_queue; 3226 struct cfq_queue *cfqq = cfqd->active_queue;
3224 3227
3225 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth) 3228 if (cfqd->rq_in_driver > cfqd->hw_tag_est_depth)
3226 cfqd->hw_tag_est_depth = cfqd->rq_in_driver; 3229 cfqd->hw_tag_est_depth = cfqd->rq_in_driver;
3227 3230
3228 if (cfqd->hw_tag == 1) 3231 if (cfqd->hw_tag == 1)
3229 return; 3232 return;
3230 3233
3231 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN && 3234 if (cfqd->rq_queued <= CFQ_HW_QUEUE_MIN &&
3232 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN) 3235 cfqd->rq_in_driver <= CFQ_HW_QUEUE_MIN)
3233 return; 3236 return;
3234 3237
3235 /* 3238 /*
3236 * If the active queue doesn't have enough requests and can idle, cfq might not 3239 * If the active queue doesn't have enough requests and can idle, cfq might not
3237 * dispatch sufficient requests to hardware. Don't zero hw_tag in this 3240 * dispatch sufficient requests to hardware. Don't zero hw_tag in this
3238 * case 3241 * case
3239 */ 3242 */
3240 if (cfqq && cfq_cfqq_idle_window(cfqq) && 3243 if (cfqq && cfq_cfqq_idle_window(cfqq) &&
3241 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] < 3244 cfqq->dispatched + cfqq->queued[0] + cfqq->queued[1] <
3242 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN) 3245 CFQ_HW_QUEUE_MIN && cfqd->rq_in_driver < CFQ_HW_QUEUE_MIN)
3243 return; 3246 return;
3244 3247
3245 if (cfqd->hw_tag_samples++ < 50) 3248 if (cfqd->hw_tag_samples++ < 50)
3246 return; 3249 return;
3247 3250
3248 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN) 3251 if (cfqd->hw_tag_est_depth >= CFQ_HW_QUEUE_MIN)
3249 cfqd->hw_tag = 1; 3252 cfqd->hw_tag = 1;
3250 else 3253 else
3251 cfqd->hw_tag = 0; 3254 cfqd->hw_tag = 0;
3252 } 3255 }
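/*
 * A simplified standalone sketch (not kernel code) of the hw_tag estimate
 * maintained in cfq_update_hw_tag() above: track the peak number of
 * requests the driver holds, and only after 50 loaded samples decide
 * whether the device appears to queue commands internally. The
 * active-queue special case is dropped here, and HW_QUEUE_MIN is an
 * assumed stand-in for CFQ_HW_QUEUE_MIN.
 */
#include <stdio.h>

#define HW_QUEUE_MIN	5	/* assumed value, for illustration only */

struct hw_tag_est {
	int est_depth;	/* peak requests seen in the driver */
	int samples;	/* qualifying samples taken so far */
	int hw_tag;	/* -1 unknown, 0 no internal queueing, 1 queueing */
};

static void hw_tag_sample(struct hw_tag_est *e, int rq_in_driver)
{
	if (rq_in_driver > e->est_depth)
		e->est_depth = rq_in_driver;

	if (e->hw_tag == 1 || rq_in_driver <= HW_QUEUE_MIN)
		return;		/* already decided, or not enough load to judge */

	if (e->samples++ < 50)
		return;

	e->hw_tag = (e->est_depth >= HW_QUEUE_MIN);
}

int main(void)
{
	struct hw_tag_est e = { 0, 0, -1 };
	int i;

	for (i = 0; i < 60; i++)
		hw_tag_sample(&e, 16);	/* pretend 16 requests stay in flight */
	printf("hw_tag = %d (est_depth %d)\n", e.hw_tag, e.est_depth);
	return 0;
}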
3253 3256
3254 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq) 3257 static bool cfq_should_wait_busy(struct cfq_data *cfqd, struct cfq_queue *cfqq)
3255 { 3258 {
3256 struct cfq_io_context *cic = cfqd->active_cic; 3259 struct cfq_io_context *cic = cfqd->active_cic;
3257 3260
3258 /* If there are other queues in the group, don't wait */ 3261 /* If there are other queues in the group, don't wait */
3259 if (cfqq->cfqg->nr_cfqq > 1) 3262 if (cfqq->cfqg->nr_cfqq > 1)
3260 return false; 3263 return false;
3261 3264
3262 if (cfq_slice_used(cfqq)) 3265 if (cfq_slice_used(cfqq))
3263 return true; 3266 return true;
3264 3267
3265 /* if slice left is less than think time, wait busy */ 3268 /* if slice left is less than think time, wait busy */
3266 if (cic && sample_valid(cic->ttime_samples) 3269 if (cic && sample_valid(cic->ttime_samples)
3267 && (cfqq->slice_end - jiffies < cic->ttime_mean)) 3270 && (cfqq->slice_end - jiffies < cic->ttime_mean))
3268 return true; 3271 return true;
3269 3272
3270 /* 3273 /*
3271 * If the think time is less than a jiffy then ttime_mean=0 and the above 3274 * If the think time is less than a jiffy then ttime_mean=0 and the above
3272 * will not be true. It might happen that slice has not expired yet 3275 * will not be true. It might happen that slice has not expired yet
3273 * but will expire soon (4-5 ns) during select_queue(). To cover the 3276 * but will expire soon (4-5 ns) during select_queue(). To cover the
3274 * case where think time is less than a jiffy, mark the queue wait 3277 * case where think time is less than a jiffy, mark the queue wait
3275 * busy if only 1 jiffy is left in the slice. 3278 * busy if only 1 jiffy is left in the slice.
3276 */ 3279 */
3277 if (cfqq->slice_end - jiffies == 1) 3280 if (cfqq->slice_end - jiffies == 1)
3278 return true; 3281 return true;
3279 3282
3280 return false; 3283 return false;
3281 } 3284 }
3282 3285
3283 static void cfq_completed_request(struct request_queue *q, struct request *rq) 3286 static void cfq_completed_request(struct request_queue *q, struct request *rq)
3284 { 3287 {
3285 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3288 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3286 struct cfq_data *cfqd = cfqq->cfqd; 3289 struct cfq_data *cfqd = cfqq->cfqd;
3287 const int sync = rq_is_sync(rq); 3290 const int sync = rq_is_sync(rq);
3288 unsigned long now; 3291 unsigned long now;
3289 3292
3290 now = jiffies; 3293 now = jiffies;
3291 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq)); 3294 cfq_log_cfqq(cfqd, cfqq, "complete rqnoidle %d", !!rq_noidle(rq));
3292 3295
3293 cfq_update_hw_tag(cfqd); 3296 cfq_update_hw_tag(cfqd);
3294 3297
3295 WARN_ON(!cfqd->rq_in_driver); 3298 WARN_ON(!cfqd->rq_in_driver);
3296 WARN_ON(!cfqq->dispatched); 3299 WARN_ON(!cfqq->dispatched);
3297 cfqd->rq_in_driver--; 3300 cfqd->rq_in_driver--;
3298 cfqq->dispatched--; 3301 cfqq->dispatched--;
3299 3302
3300 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--; 3303 cfqd->rq_in_flight[cfq_cfqq_sync(cfqq)]--;
3301 3304
3302 if (sync) { 3305 if (sync) {
3303 RQ_CIC(rq)->last_end_request = now; 3306 RQ_CIC(rq)->last_end_request = now;
3304 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now)) 3307 if (!time_after(rq->start_time + cfqd->cfq_fifo_expire[1], now))
3305 cfqd->last_delayed_sync = now; 3308 cfqd->last_delayed_sync = now;
3306 } 3309 }
3307 3310
3308 /* 3311 /*
3309 * If this is the active queue, check if it needs to be expired, 3312 * If this is the active queue, check if it needs to be expired,
3310 * or if we want to idle in case it has no pending requests. 3313 * or if we want to idle in case it has no pending requests.
3311 */ 3314 */
3312 if (cfqd->active_queue == cfqq) { 3315 if (cfqd->active_queue == cfqq) {
3313 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list); 3316 const bool cfqq_empty = RB_EMPTY_ROOT(&cfqq->sort_list);
3314 3317
3315 if (cfq_cfqq_slice_new(cfqq)) { 3318 if (cfq_cfqq_slice_new(cfqq)) {
3316 cfq_set_prio_slice(cfqd, cfqq); 3319 cfq_set_prio_slice(cfqd, cfqq);
3317 cfq_clear_cfqq_slice_new(cfqq); 3320 cfq_clear_cfqq_slice_new(cfqq);
3318 } 3321 }
3319 3322
3320 /* 3323 /*
3321 * Should we wait for the next request to come in before we expire 3324 * Should we wait for the next request to come in before we expire
3322 * the queue? 3325 * the queue?
3323 */ 3326 */
3324 if (cfq_should_wait_busy(cfqd, cfqq)) { 3327 if (cfq_should_wait_busy(cfqd, cfqq)) {
3325 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle; 3328 cfqq->slice_end = jiffies + cfqd->cfq_slice_idle;
3326 cfq_mark_cfqq_wait_busy(cfqq); 3329 cfq_mark_cfqq_wait_busy(cfqq);
3327 cfq_log_cfqq(cfqd, cfqq, "will busy wait"); 3330 cfq_log_cfqq(cfqd, cfqq, "will busy wait");
3328 } 3331 }
3329 3332
3330 /* 3333 /*
3331 * Idling is not enabled on: 3334 * Idling is not enabled on:
3332 * - expired queues 3335 * - expired queues
3333 * - idle-priority queues 3336 * - idle-priority queues
3334 * - async queues 3337 * - async queues
3335 * - queues with still some requests queued 3338 * - queues with still some requests queued
3336 * - when there is a close cooperator 3339 * - when there is a close cooperator
3337 */ 3340 */
3338 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq)) 3341 if (cfq_slice_used(cfqq) || cfq_class_idle(cfqq))
3339 cfq_slice_expired(cfqd, 1); 3342 cfq_slice_expired(cfqd, 1);
3340 else if (sync && cfqq_empty && 3343 else if (sync && cfqq_empty &&
3341 !cfq_close_cooperator(cfqd, cfqq)) { 3344 !cfq_close_cooperator(cfqd, cfqq)) {
3342 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq); 3345 cfqd->noidle_tree_requires_idle |= !rq_noidle(rq);
3343 /* 3346 /*
3344 * Idling is enabled for SYNC_WORKLOAD. 3347 * Idling is enabled for SYNC_WORKLOAD.
3345 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree 3348 * SYNC_NOIDLE_WORKLOAD idles at the end of the tree
3346 * only if we processed at least one !rq_noidle request 3349 * only if we processed at least one !rq_noidle request
3347 */ 3350 */
3348 if (cfqd->serving_type == SYNC_WORKLOAD 3351 if (cfqd->serving_type == SYNC_WORKLOAD
3349 || cfqd->noidle_tree_requires_idle 3352 || cfqd->noidle_tree_requires_idle
3350 || cfqq->cfqg->nr_cfqq == 1) 3353 || cfqq->cfqg->nr_cfqq == 1)
3351 cfq_arm_slice_timer(cfqd); 3354 cfq_arm_slice_timer(cfqd);
3352 } 3355 }
3353 } 3356 }
3354 3357
3355 if (!cfqd->rq_in_driver) 3358 if (!cfqd->rq_in_driver)
3356 cfq_schedule_dispatch(cfqd); 3359 cfq_schedule_dispatch(cfqd);
3357 } 3360 }
3358 3361
3359 /* 3362 /*
3360 * we temporarily boost lower priority queues if they are holding fs exclusive 3363 * we temporarily boost lower priority queues if they are holding fs exclusive
3361 * resources. they are boosted to normal prio (CLASS_BE/4) 3364 * resources. they are boosted to normal prio (CLASS_BE/4)
3362 */ 3365 */
3363 static void cfq_prio_boost(struct cfq_queue *cfqq) 3366 static void cfq_prio_boost(struct cfq_queue *cfqq)
3364 { 3367 {
3365 if (has_fs_excl()) { 3368 if (has_fs_excl()) {
3366 /* 3369 /*
3367 * boost idle prio on transactions that would lock out other 3370 * boost idle prio on transactions that would lock out other
3368 * users of the filesystem 3371 * users of the filesystem
3369 */ 3372 */
3370 if (cfq_class_idle(cfqq)) 3373 if (cfq_class_idle(cfqq))
3371 cfqq->ioprio_class = IOPRIO_CLASS_BE; 3374 cfqq->ioprio_class = IOPRIO_CLASS_BE;
3372 if (cfqq->ioprio > IOPRIO_NORM) 3375 if (cfqq->ioprio > IOPRIO_NORM)
3373 cfqq->ioprio = IOPRIO_NORM; 3376 cfqq->ioprio = IOPRIO_NORM;
3374 } else { 3377 } else {
3375 /* 3378 /*
3376 * unboost the queue (if needed) 3379 * unboost the queue (if needed)
3377 */ 3380 */
3378 cfqq->ioprio_class = cfqq->org_ioprio_class; 3381 cfqq->ioprio_class = cfqq->org_ioprio_class;
3379 cfqq->ioprio = cfqq->org_ioprio; 3382 cfqq->ioprio = cfqq->org_ioprio;
3380 } 3383 }
3381 } 3384 }
3382 3385
3383 static inline int __cfq_may_queue(struct cfq_queue *cfqq) 3386 static inline int __cfq_may_queue(struct cfq_queue *cfqq)
3384 { 3387 {
3385 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) { 3388 if (cfq_cfqq_wait_request(cfqq) && !cfq_cfqq_must_alloc_slice(cfqq)) {
3386 cfq_mark_cfqq_must_alloc_slice(cfqq); 3389 cfq_mark_cfqq_must_alloc_slice(cfqq);
3387 return ELV_MQUEUE_MUST; 3390 return ELV_MQUEUE_MUST;
3388 } 3391 }
3389 3392
3390 return ELV_MQUEUE_MAY; 3393 return ELV_MQUEUE_MAY;
3391 } 3394 }
3392 3395
3393 static int cfq_may_queue(struct request_queue *q, int rw) 3396 static int cfq_may_queue(struct request_queue *q, int rw)
3394 { 3397 {
3395 struct cfq_data *cfqd = q->elevator->elevator_data; 3398 struct cfq_data *cfqd = q->elevator->elevator_data;
3396 struct task_struct *tsk = current; 3399 struct task_struct *tsk = current;
3397 struct cfq_io_context *cic; 3400 struct cfq_io_context *cic;
3398 struct cfq_queue *cfqq; 3401 struct cfq_queue *cfqq;
3399 3402
3400 /* 3403 /*
3401 * don't force setup of a queue from here, as a call to may_queue 3404 * don't force setup of a queue from here, as a call to may_queue
3402 * does not necessarily imply that a request actually will be queued. 3405 * does not necessarily imply that a request actually will be queued.
3403 * so just lookup a possibly existing queue, or return 'may queue' 3406 * so just lookup a possibly existing queue, or return 'may queue'
3404 * if that fails 3407 * if that fails
3405 */ 3408 */
3406 cic = cfq_cic_lookup(cfqd, tsk->io_context); 3409 cic = cfq_cic_lookup(cfqd, tsk->io_context);
3407 if (!cic) 3410 if (!cic)
3408 return ELV_MQUEUE_MAY; 3411 return ELV_MQUEUE_MAY;
3409 3412
3410 cfqq = cic_to_cfqq(cic, rw_is_sync(rw)); 3413 cfqq = cic_to_cfqq(cic, rw_is_sync(rw));
3411 if (cfqq) { 3414 if (cfqq) {
3412 cfq_init_prio_data(cfqq, cic->ioc); 3415 cfq_init_prio_data(cfqq, cic->ioc);
3413 cfq_prio_boost(cfqq); 3416 cfq_prio_boost(cfqq);
3414 3417
3415 return __cfq_may_queue(cfqq); 3418 return __cfq_may_queue(cfqq);
3416 } 3419 }
3417 3420
3418 return ELV_MQUEUE_MAY; 3421 return ELV_MQUEUE_MAY;
3419 } 3422 }
3420 3423
3421 /* 3424 /*
3422 * queue lock held here 3425 * queue lock held here
3423 */ 3426 */
3424 static void cfq_put_request(struct request *rq) 3427 static void cfq_put_request(struct request *rq)
3425 { 3428 {
3426 struct cfq_queue *cfqq = RQ_CFQQ(rq); 3429 struct cfq_queue *cfqq = RQ_CFQQ(rq);
3427 3430
3428 if (cfqq) { 3431 if (cfqq) {
3429 const int rw = rq_data_dir(rq); 3432 const int rw = rq_data_dir(rq);
3430 3433
3431 BUG_ON(!cfqq->allocated[rw]); 3434 BUG_ON(!cfqq->allocated[rw]);
3432 cfqq->allocated[rw]--; 3435 cfqq->allocated[rw]--;
3433 3436
3434 put_io_context(RQ_CIC(rq)->ioc); 3437 put_io_context(RQ_CIC(rq)->ioc);
3435 3438
3436 rq->elevator_private = NULL; 3439 rq->elevator_private = NULL;
3437 rq->elevator_private2 = NULL; 3440 rq->elevator_private2 = NULL;
3438 3441
3439 cfq_put_queue(cfqq); 3442 cfq_put_queue(cfqq);
3440 } 3443 }
3441 } 3444 }
3442 3445
3443 static struct cfq_queue * 3446 static struct cfq_queue *
3444 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, 3447 cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
3445 struct cfq_queue *cfqq) 3448 struct cfq_queue *cfqq)
3446 { 3449 {
3447 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq); 3450 cfq_log_cfqq(cfqd, cfqq, "merging with queue %p", cfqq->new_cfqq);
3448 cic_set_cfqq(cic, cfqq->new_cfqq, 1); 3451 cic_set_cfqq(cic, cfqq->new_cfqq, 1);
3449 cfq_mark_cfqq_coop(cfqq->new_cfqq); 3452 cfq_mark_cfqq_coop(cfqq->new_cfqq);
3450 cfq_put_queue(cfqq); 3453 cfq_put_queue(cfqq);
3451 return cic_to_cfqq(cic, 1); 3454 return cic_to_cfqq(cic, 1);
3452 } 3455 }
3453 3456
3454 /* 3457 /*
3455 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this 3458 * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
3456 * was the last process referring to said cfqq. 3459 * was the last process referring to said cfqq.
3457 */ 3460 */
3458 static struct cfq_queue * 3461 static struct cfq_queue *
3459 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) 3462 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
3460 { 3463 {
3461 if (cfqq_process_refs(cfqq) == 1) { 3464 if (cfqq_process_refs(cfqq) == 1) {
3462 cfqq->pid = current->pid; 3465 cfqq->pid = current->pid;
3463 cfq_clear_cfqq_coop(cfqq); 3466 cfq_clear_cfqq_coop(cfqq);
3464 cfq_clear_cfqq_split_coop(cfqq); 3467 cfq_clear_cfqq_split_coop(cfqq);
3465 return cfqq; 3468 return cfqq;
3466 } 3469 }
3467 3470
3468 cic_set_cfqq(cic, NULL, 1); 3471 cic_set_cfqq(cic, NULL, 1);
3469 cfq_put_queue(cfqq); 3472 cfq_put_queue(cfqq);
3470 return NULL; 3473 return NULL;
3471 } 3474 }
3472 /* 3475 /*
3473 * Allocate cfq data structures associated with this request. 3476 * Allocate cfq data structures associated with this request.
3474 */ 3477 */
3475 static int 3478 static int
3476 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 3479 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3477 { 3480 {
3478 struct cfq_data *cfqd = q->elevator->elevator_data; 3481 struct cfq_data *cfqd = q->elevator->elevator_data;
3479 struct cfq_io_context *cic; 3482 struct cfq_io_context *cic;
3480 const int rw = rq_data_dir(rq); 3483 const int rw = rq_data_dir(rq);
3481 const bool is_sync = rq_is_sync(rq); 3484 const bool is_sync = rq_is_sync(rq);
3482 struct cfq_queue *cfqq; 3485 struct cfq_queue *cfqq;
3483 unsigned long flags; 3486 unsigned long flags;
3484 3487
3485 might_sleep_if(gfp_mask & __GFP_WAIT); 3488 might_sleep_if(gfp_mask & __GFP_WAIT);
3486 3489
3487 cic = cfq_get_io_context(cfqd, gfp_mask); 3490 cic = cfq_get_io_context(cfqd, gfp_mask);
3488 3491
3489 spin_lock_irqsave(q->queue_lock, flags); 3492 spin_lock_irqsave(q->queue_lock, flags);
3490 3493
3491 if (!cic) 3494 if (!cic)
3492 goto queue_fail; 3495 goto queue_fail;
3493 3496
3494 new_queue: 3497 new_queue:
3495 cfqq = cic_to_cfqq(cic, is_sync); 3498 cfqq = cic_to_cfqq(cic, is_sync);
3496 if (!cfqq || cfqq == &cfqd->oom_cfqq) { 3499 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3497 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask); 3500 cfqq = cfq_get_queue(cfqd, is_sync, cic->ioc, gfp_mask);
3498 cic_set_cfqq(cic, cfqq, is_sync); 3501 cic_set_cfqq(cic, cfqq, is_sync);
3499 } else { 3502 } else {
3500 /* 3503 /*
3501 * If the queue was seeky for too long, break it apart. 3504 * If the queue was seeky for too long, break it apart.
3502 */ 3505 */
3503 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { 3506 if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
3504 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); 3507 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
3505 cfqq = split_cfqq(cic, cfqq); 3508 cfqq = split_cfqq(cic, cfqq);
3506 if (!cfqq) 3509 if (!cfqq)
3507 goto new_queue; 3510 goto new_queue;
3508 } 3511 }
3509 3512
3510 /* 3513 /*
3511 * Check to see if this queue is scheduled to merge with 3514 * Check to see if this queue is scheduled to merge with
3512 * another, closely cooperating queue. The merging of 3515 * another, closely cooperating queue. The merging of
3513 * queues happens here as it must be done in process context. 3516 * queues happens here as it must be done in process context.
3514 * The reference on new_cfqq was taken in merge_cfqqs. 3517 * The reference on new_cfqq was taken in merge_cfqqs.
3515 */ 3518 */
3516 if (cfqq->new_cfqq) 3519 if (cfqq->new_cfqq)
3517 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq); 3520 cfqq = cfq_merge_cfqqs(cfqd, cic, cfqq);
3518 } 3521 }
3519 3522
3520 cfqq->allocated[rw]++; 3523 cfqq->allocated[rw]++;
3521 atomic_inc(&cfqq->ref); 3524 atomic_inc(&cfqq->ref);
3522 3525
3523 spin_unlock_irqrestore(q->queue_lock, flags); 3526 spin_unlock_irqrestore(q->queue_lock, flags);
3524 3527
3525 rq->elevator_private = cic; 3528 rq->elevator_private = cic;
3526 rq->elevator_private2 = cfqq; 3529 rq->elevator_private2 = cfqq;
3527 return 0; 3530 return 0;
3528 3531
3529 queue_fail: 3532 queue_fail:
3530 if (cic) 3533 if (cic)
3531 put_io_context(cic->ioc); 3534 put_io_context(cic->ioc);
3532 3535
3533 cfq_schedule_dispatch(cfqd); 3536 cfq_schedule_dispatch(cfqd);
3534 spin_unlock_irqrestore(q->queue_lock, flags); 3537 spin_unlock_irqrestore(q->queue_lock, flags);
3535 cfq_log(cfqd, "set_request fail"); 3538 cfq_log(cfqd, "set_request fail");
3536 return 1; 3539 return 1;
3537 } 3540 }
3538 3541
3539 static void cfq_kick_queue(struct work_struct *work) 3542 static void cfq_kick_queue(struct work_struct *work)
3540 { 3543 {
3541 struct cfq_data *cfqd = 3544 struct cfq_data *cfqd =
3542 container_of(work, struct cfq_data, unplug_work); 3545 container_of(work, struct cfq_data, unplug_work);
3543 struct request_queue *q = cfqd->queue; 3546 struct request_queue *q = cfqd->queue;
3544 3547
3545 spin_lock_irq(q->queue_lock); 3548 spin_lock_irq(q->queue_lock);
3546 __blk_run_queue(cfqd->queue); 3549 __blk_run_queue(cfqd->queue);
3547 spin_unlock_irq(q->queue_lock); 3550 spin_unlock_irq(q->queue_lock);
3548 } 3551 }
3549 3552
3550 /* 3553 /*
3551 * Timer running if the active_queue is currently idling inside its time slice 3554 * Timer running if the active_queue is currently idling inside its time slice
3552 */ 3555 */
3553 static void cfq_idle_slice_timer(unsigned long data) 3556 static void cfq_idle_slice_timer(unsigned long data)
3554 { 3557 {
3555 struct cfq_data *cfqd = (struct cfq_data *) data; 3558 struct cfq_data *cfqd = (struct cfq_data *) data;
3556 struct cfq_queue *cfqq; 3559 struct cfq_queue *cfqq;
3557 unsigned long flags; 3560 unsigned long flags;
3558 int timed_out = 1; 3561 int timed_out = 1;
3559 3562
3560 cfq_log(cfqd, "idle timer fired"); 3563 cfq_log(cfqd, "idle timer fired");
3561 3564
3562 spin_lock_irqsave(cfqd->queue->queue_lock, flags); 3565 spin_lock_irqsave(cfqd->queue->queue_lock, flags);
3563 3566
3564 cfqq = cfqd->active_queue; 3567 cfqq = cfqd->active_queue;
3565 if (cfqq) { 3568 if (cfqq) {
3566 timed_out = 0; 3569 timed_out = 0;
3567 3570
3568 /* 3571 /*
3569 * We saw a request before the queue expired, let it through 3572 * We saw a request before the queue expired, let it through
3570 */ 3573 */
3571 if (cfq_cfqq_must_dispatch(cfqq)) 3574 if (cfq_cfqq_must_dispatch(cfqq))
3572 goto out_kick; 3575 goto out_kick;
3573 3576
3574 /* 3577 /*
3575 * expired 3578 * expired
3576 */ 3579 */
3577 if (cfq_slice_used(cfqq)) 3580 if (cfq_slice_used(cfqq))
3578 goto expire; 3581 goto expire;
3579 3582
3580 /* 3583 /*
3581 * only expire and reinvoke request handler, if there are 3584 * only expire and reinvoke request handler, if there are
3582 * other queues with pending requests 3585 * other queues with pending requests
3583 */ 3586 */
3584 if (!cfqd->busy_queues) 3587 if (!cfqd->busy_queues)
3585 goto out_cont; 3588 goto out_cont;
3586 3589
3587 /* 3590 /*
3588 * not expired and it has a request pending, let it dispatch 3591 * not expired and it has a request pending, let it dispatch
3589 */ 3592 */
3590 if (!RB_EMPTY_ROOT(&cfqq->sort_list)) 3593 if (!RB_EMPTY_ROOT(&cfqq->sort_list))
3591 goto out_kick; 3594 goto out_kick;
3592 3595
3593 /* 3596 /*
3594 * Queue depth flag is reset only when the idle didn't succeed 3597 * Queue depth flag is reset only when the idle didn't succeed
3595 */ 3598 */
3596 cfq_clear_cfqq_deep(cfqq); 3599 cfq_clear_cfqq_deep(cfqq);
3597 } 3600 }
3598 expire: 3601 expire:
3599 cfq_slice_expired(cfqd, timed_out); 3602 cfq_slice_expired(cfqd, timed_out);
3600 out_kick: 3603 out_kick:
3601 cfq_schedule_dispatch(cfqd); 3604 cfq_schedule_dispatch(cfqd);
3602 out_cont: 3605 out_cont:
3603 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags); 3606 spin_unlock_irqrestore(cfqd->queue->queue_lock, flags);
3604 } 3607 }
3605 3608
3606 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd) 3609 static void cfq_shutdown_timer_wq(struct cfq_data *cfqd)
3607 { 3610 {
3608 del_timer_sync(&cfqd->idle_slice_timer); 3611 del_timer_sync(&cfqd->idle_slice_timer);
3609 cancel_work_sync(&cfqd->unplug_work); 3612 cancel_work_sync(&cfqd->unplug_work);
3610 } 3613 }
3611 3614
3612 static void cfq_put_async_queues(struct cfq_data *cfqd) 3615 static void cfq_put_async_queues(struct cfq_data *cfqd)
3613 { 3616 {
3614 int i; 3617 int i;
3615 3618
3616 for (i = 0; i < IOPRIO_BE_NR; i++) { 3619 for (i = 0; i < IOPRIO_BE_NR; i++) {
3617 if (cfqd->async_cfqq[0][i]) 3620 if (cfqd->async_cfqq[0][i])
3618 cfq_put_queue(cfqd->async_cfqq[0][i]); 3621 cfq_put_queue(cfqd->async_cfqq[0][i]);
3619 if (cfqd->async_cfqq[1][i]) 3622 if (cfqd->async_cfqq[1][i])
3620 cfq_put_queue(cfqd->async_cfqq[1][i]); 3623 cfq_put_queue(cfqd->async_cfqq[1][i]);
3621 } 3624 }
3622 3625
3623 if (cfqd->async_idle_cfqq) 3626 if (cfqd->async_idle_cfqq)
3624 cfq_put_queue(cfqd->async_idle_cfqq); 3627 cfq_put_queue(cfqd->async_idle_cfqq);
3625 } 3628 }
3626 3629
3627 static void cfq_cfqd_free(struct rcu_head *head) 3630 static void cfq_cfqd_free(struct rcu_head *head)
3628 { 3631 {
3629 kfree(container_of(head, struct cfq_data, rcu)); 3632 kfree(container_of(head, struct cfq_data, rcu));
3630 } 3633 }
3631 3634
3632 static void cfq_exit_queue(struct elevator_queue *e) 3635 static void cfq_exit_queue(struct elevator_queue *e)
3633 { 3636 {
3634 struct cfq_data *cfqd = e->elevator_data; 3637 struct cfq_data *cfqd = e->elevator_data;
3635 struct request_queue *q = cfqd->queue; 3638 struct request_queue *q = cfqd->queue;
3636 3639
3637 cfq_shutdown_timer_wq(cfqd); 3640 cfq_shutdown_timer_wq(cfqd);
3638 3641
3639 spin_lock_irq(q->queue_lock); 3642 spin_lock_irq(q->queue_lock);
3640 3643
3641 if (cfqd->active_queue) 3644 if (cfqd->active_queue)
3642 __cfq_slice_expired(cfqd, cfqd->active_queue, 0); 3645 __cfq_slice_expired(cfqd, cfqd->active_queue, 0);
3643 3646
3644 while (!list_empty(&cfqd->cic_list)) { 3647 while (!list_empty(&cfqd->cic_list)) {
3645 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next, 3648 struct cfq_io_context *cic = list_entry(cfqd->cic_list.next,
3646 struct cfq_io_context, 3649 struct cfq_io_context,
3647 queue_list); 3650 queue_list);
3648 3651
3649 __cfq_exit_single_io_context(cfqd, cic); 3652 __cfq_exit_single_io_context(cfqd, cic);
3650 } 3653 }
3651 3654
3652 cfq_put_async_queues(cfqd); 3655 cfq_put_async_queues(cfqd);
3653 cfq_release_cfq_groups(cfqd); 3656 cfq_release_cfq_groups(cfqd);
3654 blkiocg_del_blkio_group(&cfqd->root_group.blkg); 3657 blkiocg_del_blkio_group(&cfqd->root_group.blkg);
3655 3658
3656 spin_unlock_irq(q->queue_lock); 3659 spin_unlock_irq(q->queue_lock);
3657 3660
3658 cfq_shutdown_timer_wq(cfqd); 3661 cfq_shutdown_timer_wq(cfqd);
3659 3662
3660 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */ 3663 /* Wait for cfqg->blkg->key accessors to exit their grace periods. */
3661 call_rcu(&cfqd->rcu, cfq_cfqd_free); 3664 call_rcu(&cfqd->rcu, cfq_cfqd_free);
3662 } 3665 }
3663 3666
3664 static void *cfq_init_queue(struct request_queue *q) 3667 static void *cfq_init_queue(struct request_queue *q)
3665 { 3668 {
3666 struct cfq_data *cfqd; 3669 struct cfq_data *cfqd;
3667 int i, j; 3670 int i, j;
3668 struct cfq_group *cfqg; 3671 struct cfq_group *cfqg;
3669 struct cfq_rb_root *st; 3672 struct cfq_rb_root *st;
3670 3673
3671 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node); 3674 cfqd = kmalloc_node(sizeof(*cfqd), GFP_KERNEL | __GFP_ZERO, q->node);
3672 if (!cfqd) 3675 if (!cfqd)
3673 return NULL; 3676 return NULL;
3674 3677
3675 /* Init root service tree */ 3678 /* Init root service tree */
3676 cfqd->grp_service_tree = CFQ_RB_ROOT; 3679 cfqd->grp_service_tree = CFQ_RB_ROOT;
3677 3680
3678 /* Init root group */ 3681 /* Init root group */
3679 cfqg = &cfqd->root_group; 3682 cfqg = &cfqd->root_group;
3680 for_each_cfqg_st(cfqg, i, j, st) 3683 for_each_cfqg_st(cfqg, i, j, st)
3681 *st = CFQ_RB_ROOT; 3684 *st = CFQ_RB_ROOT;
3682 RB_CLEAR_NODE(&cfqg->rb_node); 3685 RB_CLEAR_NODE(&cfqg->rb_node);
3683 3686
3684 /* Give preference to root group over other groups */ 3687 /* Give preference to root group over other groups */
3685 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT; 3688 cfqg->weight = 2*BLKIO_WEIGHT_DEFAULT;
3686 3689
3687 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3690 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3688 /* 3691 /*
3689 * Take a reference to root group which we never drop. This is just 3692 * Take a reference to root group which we never drop. This is just
3690 * to make sure that cfq_put_cfqg() does not try to kfree root group 3693 * to make sure that cfq_put_cfqg() does not try to kfree root group
3691 */ 3694 */
3692 atomic_set(&cfqg->ref, 1); 3695 atomic_set(&cfqg->ref, 1);
3693 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd, 3696 blkiocg_add_blkio_group(&blkio_root_cgroup, &cfqg->blkg, (void *)cfqd,
3694 0); 3697 0);
3695 #endif 3698 #endif
3696 /* 3699 /*
3697 * Not strictly needed (since RB_ROOT just clears the node and we 3700 * Not strictly needed (since RB_ROOT just clears the node and we
3698 * zeroed cfqd on alloc), but better be safe in case someone decides 3701 * zeroed cfqd on alloc), but better be safe in case someone decides
3699 * to add magic to the rb code 3702 * to add magic to the rb code
3700 */ 3703 */
3701 for (i = 0; i < CFQ_PRIO_LISTS; i++) 3704 for (i = 0; i < CFQ_PRIO_LISTS; i++)
3702 cfqd->prio_trees[i] = RB_ROOT; 3705 cfqd->prio_trees[i] = RB_ROOT;
3703 3706
3704 /* 3707 /*
3705 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues. 3708 * Our fallback cfqq if cfq_find_alloc_queue() runs into OOM issues.
3706 * Grab a permanent reference to it, so that the normal code flow 3709 * Grab a permanent reference to it, so that the normal code flow
3707 * will not attempt to free it. 3710 * will not attempt to free it.
3708 */ 3711 */
3709 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0); 3712 cfq_init_cfqq(cfqd, &cfqd->oom_cfqq, 1, 0);
3710 atomic_inc(&cfqd->oom_cfqq.ref); 3713 atomic_inc(&cfqd->oom_cfqq.ref);
3711 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group); 3714 cfq_link_cfqq_cfqg(&cfqd->oom_cfqq, &cfqd->root_group);
3712 3715
3713 INIT_LIST_HEAD(&cfqd->cic_list); 3716 INIT_LIST_HEAD(&cfqd->cic_list);
3714 3717
3715 cfqd->queue = q; 3718 cfqd->queue = q;
3716 3719
3717 init_timer(&cfqd->idle_slice_timer); 3720 init_timer(&cfqd->idle_slice_timer);
3718 cfqd->idle_slice_timer.function = cfq_idle_slice_timer; 3721 cfqd->idle_slice_timer.function = cfq_idle_slice_timer;
3719 cfqd->idle_slice_timer.data = (unsigned long) cfqd; 3722 cfqd->idle_slice_timer.data = (unsigned long) cfqd;
3720 3723
3721 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue); 3724 INIT_WORK(&cfqd->unplug_work, cfq_kick_queue);
3722 3725
3723 cfqd->cfq_quantum = cfq_quantum; 3726 cfqd->cfq_quantum = cfq_quantum;
3724 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0]; 3727 cfqd->cfq_fifo_expire[0] = cfq_fifo_expire[0];
3725 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1]; 3728 cfqd->cfq_fifo_expire[1] = cfq_fifo_expire[1];
3726 cfqd->cfq_back_max = cfq_back_max; 3729 cfqd->cfq_back_max = cfq_back_max;
3727 cfqd->cfq_back_penalty = cfq_back_penalty; 3730 cfqd->cfq_back_penalty = cfq_back_penalty;
3728 cfqd->cfq_slice[0] = cfq_slice_async; 3731 cfqd->cfq_slice[0] = cfq_slice_async;
3729 cfqd->cfq_slice[1] = cfq_slice_sync; 3732 cfqd->cfq_slice[1] = cfq_slice_sync;
3730 cfqd->cfq_slice_async_rq = cfq_slice_async_rq; 3733 cfqd->cfq_slice_async_rq = cfq_slice_async_rq;
3731 cfqd->cfq_slice_idle = cfq_slice_idle; 3734 cfqd->cfq_slice_idle = cfq_slice_idle;
3732 cfqd->cfq_latency = 1; 3735 cfqd->cfq_latency = 1;
3733 cfqd->cfq_group_isolation = 0; 3736 cfqd->cfq_group_isolation = 0;
3734 cfqd->hw_tag = -1; 3737 cfqd->hw_tag = -1;
3735 /* 3738 /*
3736 * we optimistically start assuming sync ops weren't delayed in the last 3739 * we optimistically start assuming sync ops weren't delayed in the last
3737 * second, in order to have larger depth for async operations. 3740 * second, in order to have larger depth for async operations.
3738 */ 3741 */
3739 cfqd->last_delayed_sync = jiffies - HZ; 3742 cfqd->last_delayed_sync = jiffies - HZ;
3740 INIT_RCU_HEAD(&cfqd->rcu); 3743 INIT_RCU_HEAD(&cfqd->rcu);
3741 return cfqd; 3744 return cfqd;
3742 } 3745 }
3743 3746
3744 static void cfq_slab_kill(void) 3747 static void cfq_slab_kill(void)
3745 { 3748 {
3746 /* 3749 /*
3747 * Caller already ensured that pending RCU callbacks are completed, 3750 * Caller already ensured that pending RCU callbacks are completed,
3748 * so we should have no busy allocations at this point. 3751 * so we should have no busy allocations at this point.
3749 */ 3752 */
3750 if (cfq_pool) 3753 if (cfq_pool)
3751 kmem_cache_destroy(cfq_pool); 3754 kmem_cache_destroy(cfq_pool);
3752 if (cfq_ioc_pool) 3755 if (cfq_ioc_pool)
3753 kmem_cache_destroy(cfq_ioc_pool); 3756 kmem_cache_destroy(cfq_ioc_pool);
3754 } 3757 }
3755 3758
3756 static int __init cfq_slab_setup(void) 3759 static int __init cfq_slab_setup(void)
3757 { 3760 {
3758 cfq_pool = KMEM_CACHE(cfq_queue, 0); 3761 cfq_pool = KMEM_CACHE(cfq_queue, 0);
3759 if (!cfq_pool) 3762 if (!cfq_pool)
3760 goto fail; 3763 goto fail;
3761 3764
3762 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0); 3765 cfq_ioc_pool = KMEM_CACHE(cfq_io_context, 0);
3763 if (!cfq_ioc_pool) 3766 if (!cfq_ioc_pool)
3764 goto fail; 3767 goto fail;
3765 3768
3766 return 0; 3769 return 0;
3767 fail: 3770 fail:
3768 cfq_slab_kill(); 3771 cfq_slab_kill();
3769 return -ENOMEM; 3772 return -ENOMEM;
3770 } 3773 }
3771 3774
3772 /* 3775 /*
3773 * sysfs parts below --> 3776 * sysfs parts below -->
3774 */ 3777 */
3775 static ssize_t 3778 static ssize_t
3776 cfq_var_show(unsigned int var, char *page) 3779 cfq_var_show(unsigned int var, char *page)
3777 { 3780 {
3778 return sprintf(page, "%d\n", var); 3781 return sprintf(page, "%d\n", var);
3779 } 3782 }
3780 3783
3781 static ssize_t 3784 static ssize_t
3782 cfq_var_store(unsigned int *var, const char *page, size_t count) 3785 cfq_var_store(unsigned int *var, const char *page, size_t count)
3783 { 3786 {
3784 char *p = (char *) page; 3787 char *p = (char *) page;
3785 3788
3786 *var = simple_strtoul(p, &p, 10); 3789 *var = simple_strtoul(p, &p, 10);
3787 return count; 3790 return count;
3788 } 3791 }
3789 3792
3790 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ 3793 #define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
3791 static ssize_t __FUNC(struct elevator_queue *e, char *page) \ 3794 static ssize_t __FUNC(struct elevator_queue *e, char *page) \
3792 { \ 3795 { \
3793 struct cfq_data *cfqd = e->elevator_data; \ 3796 struct cfq_data *cfqd = e->elevator_data; \
3794 unsigned int __data = __VAR; \ 3797 unsigned int __data = __VAR; \
3795 if (__CONV) \ 3798 if (__CONV) \
3796 __data = jiffies_to_msecs(__data); \ 3799 __data = jiffies_to_msecs(__data); \
3797 return cfq_var_show(__data, (page)); \ 3800 return cfq_var_show(__data, (page)); \
3798 } 3801 }
3799 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0); 3802 SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0);
3800 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1); 3803 SHOW_FUNCTION(cfq_fifo_expire_sync_show, cfqd->cfq_fifo_expire[1], 1);
3801 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1); 3804 SHOW_FUNCTION(cfq_fifo_expire_async_show, cfqd->cfq_fifo_expire[0], 1);
3802 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0); 3805 SHOW_FUNCTION(cfq_back_seek_max_show, cfqd->cfq_back_max, 0);
3803 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0); 3806 SHOW_FUNCTION(cfq_back_seek_penalty_show, cfqd->cfq_back_penalty, 0);
3804 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1); 3807 SHOW_FUNCTION(cfq_slice_idle_show, cfqd->cfq_slice_idle, 1);
3805 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1); 3808 SHOW_FUNCTION(cfq_slice_sync_show, cfqd->cfq_slice[1], 1);
3806 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1); 3809 SHOW_FUNCTION(cfq_slice_async_show, cfqd->cfq_slice[0], 1);
3807 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0); 3810 SHOW_FUNCTION(cfq_slice_async_rq_show, cfqd->cfq_slice_async_rq, 0);
3808 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0); 3811 SHOW_FUNCTION(cfq_low_latency_show, cfqd->cfq_latency, 0);
3809 SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0); 3812 SHOW_FUNCTION(cfq_group_isolation_show, cfqd->cfq_group_isolation, 0);
3810 #undef SHOW_FUNCTION 3813 #undef SHOW_FUNCTION
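/*
 * Each SHOW_FUNCTION() line above stamps out one sysfs "show" routine for
 * the elevator. As an illustration of the pattern (not a literal
 * preprocessor dump), SHOW_FUNCTION(cfq_quantum_show, cfqd->cfq_quantum, 0)
 * expands to roughly the following; with __CONV == 0 the jiffies-to-msecs
 * conversion is compiled away:
 */
static ssize_t cfq_quantum_show(struct elevator_queue *e, char *page)
{
	struct cfq_data *cfqd = e->elevator_data;
	unsigned int __data = cfqd->cfq_quantum;
	if (0)				/* __CONV == 0: value is not in jiffies */
		__data = jiffies_to_msecs(__data);
	return cfq_var_show(__data, (page));
}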
3811 3814
3812 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ 3815 #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
3813 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ 3816 static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
3814 { \ 3817 { \
3815 struct cfq_data *cfqd = e->elevator_data; \ 3818 struct cfq_data *cfqd = e->elevator_data; \
3816 unsigned int __data; \ 3819 unsigned int __data; \
3817 int ret = cfq_var_store(&__data, (page), count); \ 3820 int ret = cfq_var_store(&__data, (page), count); \
3818 if (__data < (MIN)) \ 3821 if (__data < (MIN)) \
3819 __data = (MIN); \ 3822 __data = (MIN); \
3820 else if (__data > (MAX)) \ 3823 else if (__data > (MAX)) \
3821 __data = (MAX); \ 3824 __data = (MAX); \
3822 if (__CONV) \ 3825 if (__CONV) \
3823 *(__PTR) = msecs_to_jiffies(__data); \ 3826 *(__PTR) = msecs_to_jiffies(__data); \
3824 else \ 3827 else \
3825 *(__PTR) = __data; \ 3828 *(__PTR) = __data; \
3826 return ret; \ 3829 return ret; \
3827 } 3830 }
3828 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0); 3831 STORE_FUNCTION(cfq_quantum_store, &cfqd->cfq_quantum, 1, UINT_MAX, 0);
3829 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1, 3832 STORE_FUNCTION(cfq_fifo_expire_sync_store, &cfqd->cfq_fifo_expire[1], 1,
3830 UINT_MAX, 1); 3833 UINT_MAX, 1);
3831 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1, 3834 STORE_FUNCTION(cfq_fifo_expire_async_store, &cfqd->cfq_fifo_expire[0], 1,
3832 UINT_MAX, 1); 3835 UINT_MAX, 1);
3833 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0); 3836 STORE_FUNCTION(cfq_back_seek_max_store, &cfqd->cfq_back_max, 0, UINT_MAX, 0);
3834 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1, 3837 STORE_FUNCTION(cfq_back_seek_penalty_store, &cfqd->cfq_back_penalty, 1,
3835 UINT_MAX, 0); 3838 UINT_MAX, 0);
3836 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1); 3839 STORE_FUNCTION(cfq_slice_idle_store, &cfqd->cfq_slice_idle, 0, UINT_MAX, 1);
3837 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1); 3840 STORE_FUNCTION(cfq_slice_sync_store, &cfqd->cfq_slice[1], 1, UINT_MAX, 1);
3838 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1); 3841 STORE_FUNCTION(cfq_slice_async_store, &cfqd->cfq_slice[0], 1, UINT_MAX, 1);
3839 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1, 3842 STORE_FUNCTION(cfq_slice_async_rq_store, &cfqd->cfq_slice_async_rq, 1,
3840 UINT_MAX, 0); 3843 UINT_MAX, 0);
3841 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0); 3844 STORE_FUNCTION(cfq_low_latency_store, &cfqd->cfq_latency, 0, 1, 0);
3842 STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0); 3845 STORE_FUNCTION(cfq_group_isolation_store, &cfqd->cfq_group_isolation, 0, 1, 0);
3843 #undef STORE_FUNCTION 3846 #undef STORE_FUNCTION
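/*
 * The STORE_FUNCTION() side is symmetrical: parse the written value, clamp
 * it to [MIN, MAX] and, when __CONV is set, treat the user's input as
 * milliseconds and store it as jiffies. As an illustration (again not a
 * literal preprocessor dump), STORE_FUNCTION(cfq_slice_idle_store,
 * &cfqd->cfq_slice_idle, 0, UINT_MAX, 1) expands to roughly:
 */
static ssize_t cfq_slice_idle_store(struct elevator_queue *e,
				    const char *page, size_t count)
{
	struct cfq_data *cfqd = e->elevator_data;
	unsigned int __data;
	int ret = cfq_var_store(&__data, (page), count);
	if (__data < (0))		/* never true for unsigned: MIN is 0 */
		__data = (0);
	else if (__data > (UINT_MAX))	/* never true either: MAX is UINT_MAX */
		__data = (UINT_MAX);
	if (1)				/* __CONV == 1: user writes msecs */
		*(&cfqd->cfq_slice_idle) = msecs_to_jiffies(__data);
	else
		*(&cfqd->cfq_slice_idle) = __data;
	return ret;
}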
3844 3847
3845 #define CFQ_ATTR(name) \ 3848 #define CFQ_ATTR(name) \
3846 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store) 3849 __ATTR(name, S_IRUGO|S_IWUSR, cfq_##name##_show, cfq_##name##_store)
3847 3850
3848 static struct elv_fs_entry cfq_attrs[] = { 3851 static struct elv_fs_entry cfq_attrs[] = {
3849 CFQ_ATTR(quantum), 3852 CFQ_ATTR(quantum),
3850 CFQ_ATTR(fifo_expire_sync), 3853 CFQ_ATTR(fifo_expire_sync),
3851 CFQ_ATTR(fifo_expire_async), 3854 CFQ_ATTR(fifo_expire_async),
3852 CFQ_ATTR(back_seek_max), 3855 CFQ_ATTR(back_seek_max),
3853 CFQ_ATTR(back_seek_penalty), 3856 CFQ_ATTR(back_seek_penalty),
3854 CFQ_ATTR(slice_sync), 3857 CFQ_ATTR(slice_sync),
3855 CFQ_ATTR(slice_async), 3858 CFQ_ATTR(slice_async),
3856 CFQ_ATTR(slice_async_rq), 3859 CFQ_ATTR(slice_async_rq),
3857 CFQ_ATTR(slice_idle), 3860 CFQ_ATTR(slice_idle),
3858 CFQ_ATTR(low_latency), 3861 CFQ_ATTR(low_latency),
3859 CFQ_ATTR(group_isolation), 3862 CFQ_ATTR(group_isolation),
3860 __ATTR_NULL 3863 __ATTR_NULL
3861 }; 3864 };
3862 3865
3863 static struct elevator_type iosched_cfq = { 3866 static struct elevator_type iosched_cfq = {
3864 .ops = { 3867 .ops = {
3865 .elevator_merge_fn = cfq_merge, 3868 .elevator_merge_fn = cfq_merge,
3866 .elevator_merged_fn = cfq_merged_request, 3869 .elevator_merged_fn = cfq_merged_request,
3867 .elevator_merge_req_fn = cfq_merged_requests, 3870 .elevator_merge_req_fn = cfq_merged_requests,
3868 .elevator_allow_merge_fn = cfq_allow_merge, 3871 .elevator_allow_merge_fn = cfq_allow_merge,
3869 .elevator_dispatch_fn = cfq_dispatch_requests, 3872 .elevator_dispatch_fn = cfq_dispatch_requests,
3870 .elevator_add_req_fn = cfq_insert_request, 3873 .elevator_add_req_fn = cfq_insert_request,
3871 .elevator_activate_req_fn = cfq_activate_request, 3874 .elevator_activate_req_fn = cfq_activate_request,
3872 .elevator_deactivate_req_fn = cfq_deactivate_request, 3875 .elevator_deactivate_req_fn = cfq_deactivate_request,
3873 .elevator_queue_empty_fn = cfq_queue_empty, 3876 .elevator_queue_empty_fn = cfq_queue_empty,
3874 .elevator_completed_req_fn = cfq_completed_request, 3877 .elevator_completed_req_fn = cfq_completed_request,
3875 .elevator_former_req_fn = elv_rb_former_request, 3878 .elevator_former_req_fn = elv_rb_former_request,
3876 .elevator_latter_req_fn = elv_rb_latter_request, 3879 .elevator_latter_req_fn = elv_rb_latter_request,
3877 .elevator_set_req_fn = cfq_set_request, 3880 .elevator_set_req_fn = cfq_set_request,
3878 .elevator_put_req_fn = cfq_put_request, 3881 .elevator_put_req_fn = cfq_put_request,
3879 .elevator_may_queue_fn = cfq_may_queue, 3882 .elevator_may_queue_fn = cfq_may_queue,
3880 .elevator_init_fn = cfq_init_queue, 3883 .elevator_init_fn = cfq_init_queue,
3881 .elevator_exit_fn = cfq_exit_queue, 3884 .elevator_exit_fn = cfq_exit_queue,
3882 .trim = cfq_free_io_context, 3885 .trim = cfq_free_io_context,
3883 }, 3886 },
3884 .elevator_attrs = cfq_attrs, 3887 .elevator_attrs = cfq_attrs,
3885 .elevator_name = "cfq", 3888 .elevator_name = "cfq",
3886 .elevator_owner = THIS_MODULE, 3889 .elevator_owner = THIS_MODULE,
3887 }; 3890 };
3888 3891
3889 #ifdef CONFIG_CFQ_GROUP_IOSCHED 3892 #ifdef CONFIG_CFQ_GROUP_IOSCHED
3890 static struct blkio_policy_type blkio_policy_cfq = { 3893 static struct blkio_policy_type blkio_policy_cfq = {
3891 .ops = { 3894 .ops = {
3892 .blkio_unlink_group_fn = cfq_unlink_blkio_group, 3895 .blkio_unlink_group_fn = cfq_unlink_blkio_group,
3893 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight, 3896 .blkio_update_group_weight_fn = cfq_update_blkio_group_weight,
3894 }, 3897 },
3895 }; 3898 };
3896 #else 3899 #else
3897 static struct blkio_policy_type blkio_policy_cfq; 3900 static struct blkio_policy_type blkio_policy_cfq;
3898 #endif 3901 #endif
3899 3902
3900 static int __init cfq_init(void) 3903 static int __init cfq_init(void)
3901 { 3904 {
3902 /* 3905 /*
3903 * could be 0 on HZ < 1000 setups 3906 * could be 0 on HZ < 1000 setups
3904 */ 3907 */
3905 if (!cfq_slice_async) 3908 if (!cfq_slice_async)
3906 cfq_slice_async = 1; 3909 cfq_slice_async = 1;
3907 if (!cfq_slice_idle) 3910 if (!cfq_slice_idle)
3908 cfq_slice_idle = 1; 3911 cfq_slice_idle = 1;
3909 3912
3910 if (cfq_slab_setup()) 3913 if (cfq_slab_setup())
3911 return -ENOMEM; 3914 return -ENOMEM;
3912 3915
3913 elv_register(&iosched_cfq); 3916 elv_register(&iosched_cfq);
3914 blkio_policy_register(&blkio_policy_cfq); 3917 blkio_policy_register(&blkio_policy_cfq);
3915 3918
3916 return 0; 3919 return 0;
3917 } 3920 }
3918 3921
3919 static void __exit cfq_exit(void) 3922 static void __exit cfq_exit(void)
3920 { 3923 {
3921 DECLARE_COMPLETION_ONSTACK(all_gone); 3924 DECLARE_COMPLETION_ONSTACK(all_gone);
3922 blkio_policy_unregister(&blkio_policy_cfq); 3925 blkio_policy_unregister(&blkio_policy_cfq);
3923 elv_unregister(&iosched_cfq); 3926 elv_unregister(&iosched_cfq);
3924 ioc_gone = &all_gone; 3927 ioc_gone = &all_gone;
3925 /* ioc_gone's update must be visible before reading ioc_count */ 3928 /* ioc_gone's update must be visible before reading ioc_count */
3926 smp_wmb(); 3929 smp_wmb();
3927 3930
3928 /* 3931 /*
3929 * this also protects us from entering cfq_slab_kill() with 3932 * this also protects us from entering cfq_slab_kill() with
3930 * pending RCU callbacks 3933 * pending RCU callbacks
3931 */ 3934 */
3932 if (elv_ioc_count_read(cfq_ioc_count)) 3935 if (elv_ioc_count_read(cfq_ioc_count))
3933 wait_for_completion(&all_gone); 3936 wait_for_completion(&all_gone);
3934 cfq_slab_kill(); 3937 cfq_slab_kill();
3935 } 3938 }
3936 3939
3937 module_init(cfq_init); 3940 module_init(cfq_init);
3938 module_exit(cfq_exit); 3941 module_exit(cfq_exit);
3939 3942
3940 MODULE_AUTHOR("Jens Axboe"); 3943 MODULE_AUTHOR("Jens Axboe");
3941 MODULE_LICENSE("GPL"); 3944 MODULE_LICENSE("GPL");
3942 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler"); 3945 MODULE_DESCRIPTION("Completely Fair Queueing IO scheduler");