Commit a0cd128542cd9c67f27458a08e989db486a293ce

Authored by Jens Axboe
1 parent 992c5ddaf1

block: add end_queued_request() and end_dequeued_request() helpers

We can use this helper in the elevator core for BLKPREP_KILL, and it'll
also be useful for the empty barrier patch.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 3 changed files with 77 additions and 14 deletions
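The helpers named in the title are added in ll_rw_blk.c, whose hunk falls outside the excerpt below. As a rough sketch of what they boil down to, here is the open-coded completion sequence this commit removes from elv_next_request(), wrapped up under the two new names; the bodies are reconstructed from that removed code, not copied from the actual ll_rw_blk.c change, and end_dequeued_request() is assumed to simply skip the dequeue step.

/*
 * Sketch only: reconstructed from the sequence removed from
 * elv_next_request() further down; not the actual ll_rw_blk.c hunk.
 */
void end_queued_request(struct request *rq, int uptodate)
{
	unsigned int nr_bytes = rq->hard_nr_sectors << 9;

	if (!nr_bytes)
		nr_bytes = rq->data_len;

	/* the request is still queued: take it off, then complete all of it */
	blkdev_dequeue_request(rq);
	end_that_request_chunk(rq, uptodate, nr_bytes);
	end_that_request_last(rq, uptodate);
}

void end_dequeued_request(struct request *rq, int uptodate)
{
	unsigned int nr_bytes = rq->hard_nr_sectors << 9;

	if (!nr_bytes)
		nr_bytes = rq->data_len;

	/* caller already dequeued the request; only the completion half remains */
	end_that_request_chunk(rq, uptodate, nr_bytes);
	end_that_request_last(rq, uptodate);
}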

1 /* 1 /*
2 * Block device elevator/IO-scheduler. 2 * Block device elevator/IO-scheduler.
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * 5 *
6 * 30042000 Jens Axboe <axboe@kernel.dk> : 6 * 30042000 Jens Axboe <axboe@kernel.dk> :
7 * 7 *
8 * Split the elevator a bit so that it is possible to choose a different 8 * Split the elevator a bit so that it is possible to choose a different
9 * one or even write a new "plug in". There are three pieces: 9 * one or even write a new "plug in". There are three pieces:
10 * - elevator_fn, inserts a new request in the queue list 10 * - elevator_fn, inserts a new request in the queue list
11 * - elevator_merge_fn, decides whether a new buffer can be merged with 11 * - elevator_merge_fn, decides whether a new buffer can be merged with
12 * an existing request 12 * an existing request
13 * - elevator_dequeue_fn, called when a request is taken off the active list 13 * - elevator_dequeue_fn, called when a request is taken off the active list
14 * 14 *
15 * 20082000 Dave Jones <davej@suse.de> : 15 * 20082000 Dave Jones <davej@suse.de> :
16 * Removed tests for max-bomb-segments, which was breaking elvtune 16 * Removed tests for max-bomb-segments, which was breaking elvtune
17 * when run without -bN 17 * when run without -bN
18 * 18 *
19 * Jens: 19 * Jens:
20 * - Rework again to work with bio instead of buffer_heads 20 * - Rework again to work with bio instead of buffer_heads
21 * - loose bi_dev comparisons, partition handling is right now 21 * - loose bi_dev comparisons, partition handling is right now
22 * - completely modularize elevator setup and teardown 22 * - completely modularize elevator setup and teardown
23 * 23 *
24 */ 24 */
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/blkdev.h> 27 #include <linux/blkdev.h>
28 #include <linux/elevator.h> 28 #include <linux/elevator.h>
29 #include <linux/bio.h> 29 #include <linux/bio.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/compiler.h> 33 #include <linux/compiler.h>
34 #include <linux/delay.h> 34 #include <linux/delay.h>
35 #include <linux/blktrace_api.h> 35 #include <linux/blktrace_api.h>
36 #include <linux/hash.h> 36 #include <linux/hash.h>
37 37
38 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
39 39
40 static DEFINE_SPINLOCK(elv_list_lock); 40 static DEFINE_SPINLOCK(elv_list_lock);
41 static LIST_HEAD(elv_list); 41 static LIST_HEAD(elv_list);
42 42
43 /* 43 /*
44 * Merge hash stuff. 44 * Merge hash stuff.
45 */ 45 */
46 static const int elv_hash_shift = 6; 46 static const int elv_hash_shift = 6;
47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3) 47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3)
48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) 48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift) 49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift)
50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) 50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
52 52
53 /* 53 /*
54 * Query io scheduler to see if the current process issuing bio may be 54 * Query io scheduler to see if the current process issuing bio may be
55 * merged with rq. 55 * merged with rq.
56 */ 56 */
57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) 57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
58 { 58 {
59 struct request_queue *q = rq->q; 59 struct request_queue *q = rq->q;
60 elevator_t *e = q->elevator; 60 elevator_t *e = q->elevator;
61 61
62 if (e->ops->elevator_allow_merge_fn) 62 if (e->ops->elevator_allow_merge_fn)
63 return e->ops->elevator_allow_merge_fn(q, rq, bio); 63 return e->ops->elevator_allow_merge_fn(q, rq, bio);
64 64
65 return 1; 65 return 1;
66 } 66 }
67 67
68 /* 68 /*
69 * can we safely merge with this request? 69 * can we safely merge with this request?
70 */ 70 */
71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) 71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
72 { 72 {
73 if (!rq_mergeable(rq)) 73 if (!rq_mergeable(rq))
74 return 0; 74 return 0;
75 75
76 /* 76 /*
77 * different data direction or already started, don't merge 77 * different data direction or already started, don't merge
78 */ 78 */
79 if (bio_data_dir(bio) != rq_data_dir(rq)) 79 if (bio_data_dir(bio) != rq_data_dir(rq))
80 return 0; 80 return 0;
81 81
82 /* 82 /*
83 * must be same device and not a special request 83 * must be same device and not a special request
84 */ 84 */
85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
86 return 0; 86 return 0;
87 87
88 if (!elv_iosched_allow_merge(rq, bio)) 88 if (!elv_iosched_allow_merge(rq, bio))
89 return 0; 89 return 0;
90 90
91 return 1; 91 return 1;
92 } 92 }
93 EXPORT_SYMBOL(elv_rq_merge_ok); 93 EXPORT_SYMBOL(elv_rq_merge_ok);
94 94
95 static inline int elv_try_merge(struct request *__rq, struct bio *bio) 95 static inline int elv_try_merge(struct request *__rq, struct bio *bio)
96 { 96 {
97 int ret = ELEVATOR_NO_MERGE; 97 int ret = ELEVATOR_NO_MERGE;
98 98
99 /* 99 /*
100 * we can merge and sequence is ok, check if it's possible 100 * we can merge and sequence is ok, check if it's possible
101 */ 101 */
102 if (elv_rq_merge_ok(__rq, bio)) { 102 if (elv_rq_merge_ok(__rq, bio)) {
103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector) 103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
104 ret = ELEVATOR_BACK_MERGE; 104 ret = ELEVATOR_BACK_MERGE;
105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector) 105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
106 ret = ELEVATOR_FRONT_MERGE; 106 ret = ELEVATOR_FRONT_MERGE;
107 } 107 }
108 108
109 return ret; 109 return ret;
110 } 110 }
111 111
112 static struct elevator_type *elevator_find(const char *name) 112 static struct elevator_type *elevator_find(const char *name)
113 { 113 {
114 struct elevator_type *e; 114 struct elevator_type *e;
115 115
116 list_for_each_entry(e, &elv_list, list) { 116 list_for_each_entry(e, &elv_list, list) {
117 if (!strcmp(e->elevator_name, name)) 117 if (!strcmp(e->elevator_name, name))
118 return e; 118 return e;
119 } 119 }
120 120
121 return NULL; 121 return NULL;
122 } 122 }
123 123
124 static void elevator_put(struct elevator_type *e) 124 static void elevator_put(struct elevator_type *e)
125 { 125 {
126 module_put(e->elevator_owner); 126 module_put(e->elevator_owner);
127 } 127 }
128 128
129 static struct elevator_type *elevator_get(const char *name) 129 static struct elevator_type *elevator_get(const char *name)
130 { 130 {
131 struct elevator_type *e; 131 struct elevator_type *e;
132 132
133 spin_lock(&elv_list_lock); 133 spin_lock(&elv_list_lock);
134 134
135 e = elevator_find(name); 135 e = elevator_find(name);
136 if (e && !try_module_get(e->elevator_owner)) 136 if (e && !try_module_get(e->elevator_owner))
137 e = NULL; 137 e = NULL;
138 138
139 spin_unlock(&elv_list_lock); 139 spin_unlock(&elv_list_lock);
140 140
141 return e; 141 return e;
142 } 142 }
143 143
144 static void *elevator_init_queue(struct request_queue *q, 144 static void *elevator_init_queue(struct request_queue *q,
145 struct elevator_queue *eq) 145 struct elevator_queue *eq)
146 { 146 {
147 return eq->ops->elevator_init_fn(q); 147 return eq->ops->elevator_init_fn(q);
148 } 148 }
149 149
150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, 150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
151 void *data) 151 void *data)
152 { 152 {
153 q->elevator = eq; 153 q->elevator = eq;
154 eq->elevator_data = data; 154 eq->elevator_data = data;
155 } 155 }
156 156
157 static char chosen_elevator[16]; 157 static char chosen_elevator[16];
158 158
159 static int __init elevator_setup(char *str) 159 static int __init elevator_setup(char *str)
160 { 160 {
161 /* 161 /*
162 * Be backwards-compatible with previous kernels, so users 162 * Be backwards-compatible with previous kernels, so users
163 * won't get the wrong elevator. 163 * won't get the wrong elevator.
164 */ 164 */
165 if (!strcmp(str, "as")) 165 if (!strcmp(str, "as"))
166 strcpy(chosen_elevator, "anticipatory"); 166 strcpy(chosen_elevator, "anticipatory");
167 else 167 else
168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); 168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
169 return 1; 169 return 1;
170 } 170 }
171 171
172 __setup("elevator=", elevator_setup); 172 __setup("elevator=", elevator_setup);
173 173
174 static struct kobj_type elv_ktype; 174 static struct kobj_type elv_ktype;
175 175
176 static elevator_t *elevator_alloc(struct request_queue *q, 176 static elevator_t *elevator_alloc(struct request_queue *q,
177 struct elevator_type *e) 177 struct elevator_type *e)
178 { 178 {
179 elevator_t *eq; 179 elevator_t *eq;
180 int i; 180 int i;
181 181
182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node); 182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
183 if (unlikely(!eq)) 183 if (unlikely(!eq))
184 goto err; 184 goto err;
185 185
186 eq->ops = &e->ops; 186 eq->ops = &e->ops;
187 eq->elevator_type = e; 187 eq->elevator_type = e;
188 kobject_init(&eq->kobj); 188 kobject_init(&eq->kobj);
189 kobject_set_name(&eq->kobj, "%s", "iosched"); 189 kobject_set_name(&eq->kobj, "%s", "iosched");
190 eq->kobj.ktype = &elv_ktype; 190 eq->kobj.ktype = &elv_ktype;
191 mutex_init(&eq->sysfs_lock); 191 mutex_init(&eq->sysfs_lock);
192 192
193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, 193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
194 GFP_KERNEL, q->node); 194 GFP_KERNEL, q->node);
195 if (!eq->hash) 195 if (!eq->hash)
196 goto err; 196 goto err;
197 197
198 for (i = 0; i < ELV_HASH_ENTRIES; i++) 198 for (i = 0; i < ELV_HASH_ENTRIES; i++)
199 INIT_HLIST_HEAD(&eq->hash[i]); 199 INIT_HLIST_HEAD(&eq->hash[i]);
200 200
201 return eq; 201 return eq;
202 err: 202 err:
203 kfree(eq); 203 kfree(eq);
204 elevator_put(e); 204 elevator_put(e);
205 return NULL; 205 return NULL;
206 } 206 }
207 207
208 static void elevator_release(struct kobject *kobj) 208 static void elevator_release(struct kobject *kobj)
209 { 209 {
210 elevator_t *e = container_of(kobj, elevator_t, kobj); 210 elevator_t *e = container_of(kobj, elevator_t, kobj);
211 211
212 elevator_put(e->elevator_type); 212 elevator_put(e->elevator_type);
213 kfree(e->hash); 213 kfree(e->hash);
214 kfree(e); 214 kfree(e);
215 } 215 }
216 216
217 int elevator_init(struct request_queue *q, char *name) 217 int elevator_init(struct request_queue *q, char *name)
218 { 218 {
219 struct elevator_type *e = NULL; 219 struct elevator_type *e = NULL;
220 struct elevator_queue *eq; 220 struct elevator_queue *eq;
221 int ret = 0; 221 int ret = 0;
222 void *data; 222 void *data;
223 223
224 INIT_LIST_HEAD(&q->queue_head); 224 INIT_LIST_HEAD(&q->queue_head);
225 q->last_merge = NULL; 225 q->last_merge = NULL;
226 q->end_sector = 0; 226 q->end_sector = 0;
227 q->boundary_rq = NULL; 227 q->boundary_rq = NULL;
228 228
229 if (name && !(e = elevator_get(name))) 229 if (name && !(e = elevator_get(name)))
230 return -EINVAL; 230 return -EINVAL;
231 231
232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator))) 232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator)))
233 printk("I/O scheduler %s not found\n", chosen_elevator); 233 printk("I/O scheduler %s not found\n", chosen_elevator);
234 234
235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) { 235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) {
236 printk("Default I/O scheduler not found, using no-op\n"); 236 printk("Default I/O scheduler not found, using no-op\n");
237 e = elevator_get("noop"); 237 e = elevator_get("noop");
238 } 238 }
239 239
240 eq = elevator_alloc(q, e); 240 eq = elevator_alloc(q, e);
241 if (!eq) 241 if (!eq)
242 return -ENOMEM; 242 return -ENOMEM;
243 243
244 data = elevator_init_queue(q, eq); 244 data = elevator_init_queue(q, eq);
245 if (!data) { 245 if (!data) {
246 kobject_put(&eq->kobj); 246 kobject_put(&eq->kobj);
247 return -ENOMEM; 247 return -ENOMEM;
248 } 248 }
249 249
250 elevator_attach(q, eq, data); 250 elevator_attach(q, eq, data);
251 return ret; 251 return ret;
252 } 252 }
253 253
254 EXPORT_SYMBOL(elevator_init); 254 EXPORT_SYMBOL(elevator_init);
255 255
256 void elevator_exit(elevator_t *e) 256 void elevator_exit(elevator_t *e)
257 { 257 {
258 mutex_lock(&e->sysfs_lock); 258 mutex_lock(&e->sysfs_lock);
259 if (e->ops->elevator_exit_fn) 259 if (e->ops->elevator_exit_fn)
260 e->ops->elevator_exit_fn(e); 260 e->ops->elevator_exit_fn(e);
261 e->ops = NULL; 261 e->ops = NULL;
262 mutex_unlock(&e->sysfs_lock); 262 mutex_unlock(&e->sysfs_lock);
263 263
264 kobject_put(&e->kobj); 264 kobject_put(&e->kobj);
265 } 265 }
266 266
267 EXPORT_SYMBOL(elevator_exit); 267 EXPORT_SYMBOL(elevator_exit);
268 268
269 static void elv_activate_rq(struct request_queue *q, struct request *rq) 269 static void elv_activate_rq(struct request_queue *q, struct request *rq)
270 { 270 {
271 elevator_t *e = q->elevator; 271 elevator_t *e = q->elevator;
272 272
273 if (e->ops->elevator_activate_req_fn) 273 if (e->ops->elevator_activate_req_fn)
274 e->ops->elevator_activate_req_fn(q, rq); 274 e->ops->elevator_activate_req_fn(q, rq);
275 } 275 }
276 276
277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq) 277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
278 { 278 {
279 elevator_t *e = q->elevator; 279 elevator_t *e = q->elevator;
280 280
281 if (e->ops->elevator_deactivate_req_fn) 281 if (e->ops->elevator_deactivate_req_fn)
282 e->ops->elevator_deactivate_req_fn(q, rq); 282 e->ops->elevator_deactivate_req_fn(q, rq);
283 } 283 }
284 284
285 static inline void __elv_rqhash_del(struct request *rq) 285 static inline void __elv_rqhash_del(struct request *rq)
286 { 286 {
287 hlist_del_init(&rq->hash); 287 hlist_del_init(&rq->hash);
288 } 288 }
289 289
290 static void elv_rqhash_del(struct request_queue *q, struct request *rq) 290 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
291 { 291 {
292 if (ELV_ON_HASH(rq)) 292 if (ELV_ON_HASH(rq))
293 __elv_rqhash_del(rq); 293 __elv_rqhash_del(rq);
294 } 294 }
295 295
296 static void elv_rqhash_add(struct request_queue *q, struct request *rq) 296 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
297 { 297 {
298 elevator_t *e = q->elevator; 298 elevator_t *e = q->elevator;
299 299
300 BUG_ON(ELV_ON_HASH(rq)); 300 BUG_ON(ELV_ON_HASH(rq));
301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); 301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
302 } 302 }
303 303
304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
305 { 305 {
306 __elv_rqhash_del(rq); 306 __elv_rqhash_del(rq);
307 elv_rqhash_add(q, rq); 307 elv_rqhash_add(q, rq);
308 } 308 }
309 309
310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) 310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
311 { 311 {
312 elevator_t *e = q->elevator; 312 elevator_t *e = q->elevator;
313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; 313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
314 struct hlist_node *entry, *next; 314 struct hlist_node *entry, *next;
315 struct request *rq; 315 struct request *rq;
316 316
317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { 317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
318 BUG_ON(!ELV_ON_HASH(rq)); 318 BUG_ON(!ELV_ON_HASH(rq));
319 319
320 if (unlikely(!rq_mergeable(rq))) { 320 if (unlikely(!rq_mergeable(rq))) {
321 __elv_rqhash_del(rq); 321 __elv_rqhash_del(rq);
322 continue; 322 continue;
323 } 323 }
324 324
325 if (rq_hash_key(rq) == offset) 325 if (rq_hash_key(rq) == offset)
326 return rq; 326 return rq;
327 } 327 }
328 328
329 return NULL; 329 return NULL;
330 } 330 }
331 331
332 /* 332 /*
333 * RB-tree support functions for inserting/lookup/removal of requests 333 * RB-tree support functions for inserting/lookup/removal of requests
334 * in a sorted RB tree. 334 * in a sorted RB tree.
335 */ 335 */
336 struct request *elv_rb_add(struct rb_root *root, struct request *rq) 336 struct request *elv_rb_add(struct rb_root *root, struct request *rq)
337 { 337 {
338 struct rb_node **p = &root->rb_node; 338 struct rb_node **p = &root->rb_node;
339 struct rb_node *parent = NULL; 339 struct rb_node *parent = NULL;
340 struct request *__rq; 340 struct request *__rq;
341 341
342 while (*p) { 342 while (*p) {
343 parent = *p; 343 parent = *p;
344 __rq = rb_entry(parent, struct request, rb_node); 344 __rq = rb_entry(parent, struct request, rb_node);
345 345
346 if (rq->sector < __rq->sector) 346 if (rq->sector < __rq->sector)
347 p = &(*p)->rb_left; 347 p = &(*p)->rb_left;
348 else if (rq->sector > __rq->sector) 348 else if (rq->sector > __rq->sector)
349 p = &(*p)->rb_right; 349 p = &(*p)->rb_right;
350 else 350 else
351 return __rq; 351 return __rq;
352 } 352 }
353 353
354 rb_link_node(&rq->rb_node, parent, p); 354 rb_link_node(&rq->rb_node, parent, p);
355 rb_insert_color(&rq->rb_node, root); 355 rb_insert_color(&rq->rb_node, root);
356 return NULL; 356 return NULL;
357 } 357 }
358 358
359 EXPORT_SYMBOL(elv_rb_add); 359 EXPORT_SYMBOL(elv_rb_add);
360 360
361 void elv_rb_del(struct rb_root *root, struct request *rq) 361 void elv_rb_del(struct rb_root *root, struct request *rq)
362 { 362 {
363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); 363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
364 rb_erase(&rq->rb_node, root); 364 rb_erase(&rq->rb_node, root);
365 RB_CLEAR_NODE(&rq->rb_node); 365 RB_CLEAR_NODE(&rq->rb_node);
366 } 366 }
367 367
368 EXPORT_SYMBOL(elv_rb_del); 368 EXPORT_SYMBOL(elv_rb_del);
369 369
370 struct request *elv_rb_find(struct rb_root *root, sector_t sector) 370 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
371 { 371 {
372 struct rb_node *n = root->rb_node; 372 struct rb_node *n = root->rb_node;
373 struct request *rq; 373 struct request *rq;
374 374
375 while (n) { 375 while (n) {
376 rq = rb_entry(n, struct request, rb_node); 376 rq = rb_entry(n, struct request, rb_node);
377 377
378 if (sector < rq->sector) 378 if (sector < rq->sector)
379 n = n->rb_left; 379 n = n->rb_left;
380 else if (sector > rq->sector) 380 else if (sector > rq->sector)
381 n = n->rb_right; 381 n = n->rb_right;
382 else 382 else
383 return rq; 383 return rq;
384 } 384 }
385 385
386 return NULL; 386 return NULL;
387 } 387 }
388 388
389 EXPORT_SYMBOL(elv_rb_find); 389 EXPORT_SYMBOL(elv_rb_find);
390 390
391 /* 391 /*
392 * Insert rq into dispatch queue of q. Queue lock must be held on 392 * Insert rq into dispatch queue of q. Queue lock must be held on
393 * entry. rq is sort insted into the dispatch queue. To be used by 393 * entry. rq is sort insted into the dispatch queue. To be used by
394 * specific elevators. 394 * specific elevators.
395 */ 395 */
396 void elv_dispatch_sort(struct request_queue *q, struct request *rq) 396 void elv_dispatch_sort(struct request_queue *q, struct request *rq)
397 { 397 {
398 sector_t boundary; 398 sector_t boundary;
399 struct list_head *entry; 399 struct list_head *entry;
400 400
401 if (q->last_merge == rq) 401 if (q->last_merge == rq)
402 q->last_merge = NULL; 402 q->last_merge = NULL;
403 403
404 elv_rqhash_del(q, rq); 404 elv_rqhash_del(q, rq);
405 405
406 q->nr_sorted--; 406 q->nr_sorted--;
407 407
408 boundary = q->end_sector; 408 boundary = q->end_sector;
409 409
410 list_for_each_prev(entry, &q->queue_head) { 410 list_for_each_prev(entry, &q->queue_head) {
411 struct request *pos = list_entry_rq(entry); 411 struct request *pos = list_entry_rq(entry);
412 412
413 if (rq_data_dir(rq) != rq_data_dir(pos)) 413 if (rq_data_dir(rq) != rq_data_dir(pos))
414 break; 414 break;
415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED)) 415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
416 break; 416 break;
417 if (rq->sector >= boundary) { 417 if (rq->sector >= boundary) {
418 if (pos->sector < boundary) 418 if (pos->sector < boundary)
419 continue; 419 continue;
420 } else { 420 } else {
421 if (pos->sector >= boundary) 421 if (pos->sector >= boundary)
422 break; 422 break;
423 } 423 }
424 if (rq->sector >= pos->sector) 424 if (rq->sector >= pos->sector)
425 break; 425 break;
426 } 426 }
427 427
428 list_add(&rq->queuelist, entry); 428 list_add(&rq->queuelist, entry);
429 } 429 }
430 430
431 EXPORT_SYMBOL(elv_dispatch_sort); 431 EXPORT_SYMBOL(elv_dispatch_sort);
432 432
433 /* 433 /*
434 * Insert rq into dispatch queue of q. Queue lock must be held on 434 * Insert rq into dispatch queue of q. Queue lock must be held on
435 * entry. rq is added to the back of the dispatch queue. To be used by 435 * entry. rq is added to the back of the dispatch queue. To be used by
436 * specific elevators. 436 * specific elevators.
437 */ 437 */
438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) 438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
439 { 439 {
440 if (q->last_merge == rq) 440 if (q->last_merge == rq)
441 q->last_merge = NULL; 441 q->last_merge = NULL;
442 442
443 elv_rqhash_del(q, rq); 443 elv_rqhash_del(q, rq);
444 444
445 q->nr_sorted--; 445 q->nr_sorted--;
446 446
447 q->end_sector = rq_end_sector(rq); 447 q->end_sector = rq_end_sector(rq);
448 q->boundary_rq = rq; 448 q->boundary_rq = rq;
449 list_add_tail(&rq->queuelist, &q->queue_head); 449 list_add_tail(&rq->queuelist, &q->queue_head);
450 } 450 }
451 451
452 EXPORT_SYMBOL(elv_dispatch_add_tail); 452 EXPORT_SYMBOL(elv_dispatch_add_tail);
453 453
454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) 454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
455 { 455 {
456 elevator_t *e = q->elevator; 456 elevator_t *e = q->elevator;
457 struct request *__rq; 457 struct request *__rq;
458 int ret; 458 int ret;
459 459
460 /* 460 /*
461 * First try one-hit cache. 461 * First try one-hit cache.
462 */ 462 */
463 if (q->last_merge) { 463 if (q->last_merge) {
464 ret = elv_try_merge(q->last_merge, bio); 464 ret = elv_try_merge(q->last_merge, bio);
465 if (ret != ELEVATOR_NO_MERGE) { 465 if (ret != ELEVATOR_NO_MERGE) {
466 *req = q->last_merge; 466 *req = q->last_merge;
467 return ret; 467 return ret;
468 } 468 }
469 } 469 }
470 470
471 /* 471 /*
472 * See if our hash lookup can find a potential backmerge. 472 * See if our hash lookup can find a potential backmerge.
473 */ 473 */
474 __rq = elv_rqhash_find(q, bio->bi_sector); 474 __rq = elv_rqhash_find(q, bio->bi_sector);
475 if (__rq && elv_rq_merge_ok(__rq, bio)) { 475 if (__rq && elv_rq_merge_ok(__rq, bio)) {
476 *req = __rq; 476 *req = __rq;
477 return ELEVATOR_BACK_MERGE; 477 return ELEVATOR_BACK_MERGE;
478 } 478 }
479 479
480 if (e->ops->elevator_merge_fn) 480 if (e->ops->elevator_merge_fn)
481 return e->ops->elevator_merge_fn(q, req, bio); 481 return e->ops->elevator_merge_fn(q, req, bio);
482 482
483 return ELEVATOR_NO_MERGE; 483 return ELEVATOR_NO_MERGE;
484 } 484 }
485 485
486 void elv_merged_request(struct request_queue *q, struct request *rq, int type) 486 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
487 { 487 {
488 elevator_t *e = q->elevator; 488 elevator_t *e = q->elevator;
489 489
490 if (e->ops->elevator_merged_fn) 490 if (e->ops->elevator_merged_fn)
491 e->ops->elevator_merged_fn(q, rq, type); 491 e->ops->elevator_merged_fn(q, rq, type);
492 492
493 if (type == ELEVATOR_BACK_MERGE) 493 if (type == ELEVATOR_BACK_MERGE)
494 elv_rqhash_reposition(q, rq); 494 elv_rqhash_reposition(q, rq);
495 495
496 q->last_merge = rq; 496 q->last_merge = rq;
497 } 497 }
498 498
499 void elv_merge_requests(struct request_queue *q, struct request *rq, 499 void elv_merge_requests(struct request_queue *q, struct request *rq,
500 struct request *next) 500 struct request *next)
501 { 501 {
502 elevator_t *e = q->elevator; 502 elevator_t *e = q->elevator;
503 503
504 if (e->ops->elevator_merge_req_fn) 504 if (e->ops->elevator_merge_req_fn)
505 e->ops->elevator_merge_req_fn(q, rq, next); 505 e->ops->elevator_merge_req_fn(q, rq, next);
506 506
507 elv_rqhash_reposition(q, rq); 507 elv_rqhash_reposition(q, rq);
508 elv_rqhash_del(q, next); 508 elv_rqhash_del(q, next);
509 509
510 q->nr_sorted--; 510 q->nr_sorted--;
511 q->last_merge = rq; 511 q->last_merge = rq;
512 } 512 }
513 513
514 void elv_requeue_request(struct request_queue *q, struct request *rq) 514 void elv_requeue_request(struct request_queue *q, struct request *rq)
515 { 515 {
516 /* 516 /*
517 * it already went through dequeue, we need to decrement the 517 * it already went through dequeue, we need to decrement the
518 * in_flight count again 518 * in_flight count again
519 */ 519 */
520 if (blk_account_rq(rq)) { 520 if (blk_account_rq(rq)) {
521 q->in_flight--; 521 q->in_flight--;
522 if (blk_sorted_rq(rq)) 522 if (blk_sorted_rq(rq))
523 elv_deactivate_rq(q, rq); 523 elv_deactivate_rq(q, rq);
524 } 524 }
525 525
526 rq->cmd_flags &= ~REQ_STARTED; 526 rq->cmd_flags &= ~REQ_STARTED;
527 527
528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
529 } 529 }
530 530
531 static void elv_drain_elevator(struct request_queue *q) 531 static void elv_drain_elevator(struct request_queue *q)
532 { 532 {
533 static int printed; 533 static int printed;
534 while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 534 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
535 ; 535 ;
536 if (q->nr_sorted == 0) 536 if (q->nr_sorted == 0)
537 return; 537 return;
538 if (printed++ < 10) { 538 if (printed++ < 10) {
539 printk(KERN_ERR "%s: forced dispatching is broken " 539 printk(KERN_ERR "%s: forced dispatching is broken "
540 "(nr_sorted=%u), please report this\n", 540 "(nr_sorted=%u), please report this\n",
541 q->elevator->elevator_type->elevator_name, q->nr_sorted); 541 q->elevator->elevator_type->elevator_name, q->nr_sorted);
542 } 542 }
543 } 543 }
544 544
545 void elv_insert(struct request_queue *q, struct request *rq, int where) 545 void elv_insert(struct request_queue *q, struct request *rq, int where)
546 { 546 {
547 struct list_head *pos; 547 struct list_head *pos;
548 unsigned ordseq; 548 unsigned ordseq;
549 int unplug_it = 1; 549 int unplug_it = 1;
550 550
551 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 551 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
552 552
553 rq->q = q; 553 rq->q = q;
554 554
555 switch (where) { 555 switch (where) {
556 case ELEVATOR_INSERT_FRONT: 556 case ELEVATOR_INSERT_FRONT:
557 rq->cmd_flags |= REQ_SOFTBARRIER; 557 rq->cmd_flags |= REQ_SOFTBARRIER;
558 558
559 list_add(&rq->queuelist, &q->queue_head); 559 list_add(&rq->queuelist, &q->queue_head);
560 break; 560 break;
561 561
562 case ELEVATOR_INSERT_BACK: 562 case ELEVATOR_INSERT_BACK:
563 rq->cmd_flags |= REQ_SOFTBARRIER; 563 rq->cmd_flags |= REQ_SOFTBARRIER;
564 elv_drain_elevator(q); 564 elv_drain_elevator(q);
565 list_add_tail(&rq->queuelist, &q->queue_head); 565 list_add_tail(&rq->queuelist, &q->queue_head);
566 /* 566 /*
567 * We kick the queue here for the following reasons. 567 * We kick the queue here for the following reasons.
568 * - The elevator might have returned NULL previously 568 * - The elevator might have returned NULL previously
569 * to delay requests and returned them now. As the 569 * to delay requests and returned them now. As the
570 * queue wasn't empty before this request, ll_rw_blk 570 * queue wasn't empty before this request, ll_rw_blk
571 * won't run the queue on return, resulting in hang. 571 * won't run the queue on return, resulting in hang.
572 * - Usually, back inserted requests won't be merged 572 * - Usually, back inserted requests won't be merged
573 * with anything. There's no point in delaying queue 573 * with anything. There's no point in delaying queue
574 * processing. 574 * processing.
575 */ 575 */
576 blk_remove_plug(q); 576 blk_remove_plug(q);
577 q->request_fn(q); 577 q->request_fn(q);
578 break; 578 break;
579 579
580 case ELEVATOR_INSERT_SORT: 580 case ELEVATOR_INSERT_SORT:
581 BUG_ON(!blk_fs_request(rq)); 581 BUG_ON(!blk_fs_request(rq));
582 rq->cmd_flags |= REQ_SORTED; 582 rq->cmd_flags |= REQ_SORTED;
583 q->nr_sorted++; 583 q->nr_sorted++;
584 if (rq_mergeable(rq)) { 584 if (rq_mergeable(rq)) {
585 elv_rqhash_add(q, rq); 585 elv_rqhash_add(q, rq);
586 if (!q->last_merge) 586 if (!q->last_merge)
587 q->last_merge = rq; 587 q->last_merge = rq;
588 } 588 }
589 589
590 /* 590 /*
591 * Some ioscheds (cfq) run q->request_fn directly, so 591 * Some ioscheds (cfq) run q->request_fn directly, so
592 * rq cannot be accessed after calling 592 * rq cannot be accessed after calling
593 * elevator_add_req_fn. 593 * elevator_add_req_fn.
594 */ 594 */
595 q->elevator->ops->elevator_add_req_fn(q, rq); 595 q->elevator->ops->elevator_add_req_fn(q, rq);
596 break; 596 break;
597 597
598 case ELEVATOR_INSERT_REQUEUE: 598 case ELEVATOR_INSERT_REQUEUE:
599 /* 599 /*
600 * If ordered flush isn't in progress, we do front 600 * If ordered flush isn't in progress, we do front
601 * insertion; otherwise, requests should be requeued 601 * insertion; otherwise, requests should be requeued
602 * in ordseq order. 602 * in ordseq order.
603 */ 603 */
604 rq->cmd_flags |= REQ_SOFTBARRIER; 604 rq->cmd_flags |= REQ_SOFTBARRIER;
605 605
606 /* 606 /*
607 * Most requeues happen because of a busy condition, 607 * Most requeues happen because of a busy condition,
608 * don't force unplug of the queue for that case. 608 * don't force unplug of the queue for that case.
609 */ 609 */
610 unplug_it = 0; 610 unplug_it = 0;
611 611
612 if (q->ordseq == 0) { 612 if (q->ordseq == 0) {
613 list_add(&rq->queuelist, &q->queue_head); 613 list_add(&rq->queuelist, &q->queue_head);
614 break; 614 break;
615 } 615 }
616 616
617 ordseq = blk_ordered_req_seq(rq); 617 ordseq = blk_ordered_req_seq(rq);
618 618
619 list_for_each(pos, &q->queue_head) { 619 list_for_each(pos, &q->queue_head) {
620 struct request *pos_rq = list_entry_rq(pos); 620 struct request *pos_rq = list_entry_rq(pos);
621 if (ordseq <= blk_ordered_req_seq(pos_rq)) 621 if (ordseq <= blk_ordered_req_seq(pos_rq))
622 break; 622 break;
623 } 623 }
624 624
625 list_add_tail(&rq->queuelist, pos); 625 list_add_tail(&rq->queuelist, pos);
626 break; 626 break;
627 627
628 default: 628 default:
629 printk(KERN_ERR "%s: bad insertion point %d\n", 629 printk(KERN_ERR "%s: bad insertion point %d\n",
630 __FUNCTION__, where); 630 __FUNCTION__, where);
631 BUG(); 631 BUG();
632 } 632 }
633 633
634 if (unplug_it && blk_queue_plugged(q)) { 634 if (unplug_it && blk_queue_plugged(q)) {
635 int nrq = q->rq.count[READ] + q->rq.count[WRITE] 635 int nrq = q->rq.count[READ] + q->rq.count[WRITE]
636 - q->in_flight; 636 - q->in_flight;
637 637
638 if (nrq >= q->unplug_thresh) 638 if (nrq >= q->unplug_thresh)
639 __generic_unplug_device(q); 639 __generic_unplug_device(q);
640 } 640 }
641 } 641 }
642 642
643 void __elv_add_request(struct request_queue *q, struct request *rq, int where, 643 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
644 int plug) 644 int plug)
645 { 645 {
646 if (q->ordcolor) 646 if (q->ordcolor)
647 rq->cmd_flags |= REQ_ORDERED_COLOR; 647 rq->cmd_flags |= REQ_ORDERED_COLOR;
648 648
649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
650 /* 650 /*
651 * toggle ordered color 651 * toggle ordered color
652 */ 652 */
653 if (blk_barrier_rq(rq)) 653 if (blk_barrier_rq(rq))
654 q->ordcolor ^= 1; 654 q->ordcolor ^= 1;
655 655
656 /* 656 /*
657 * barriers implicitly indicate back insertion 657 * barriers implicitly indicate back insertion
658 */ 658 */
659 if (where == ELEVATOR_INSERT_SORT) 659 if (where == ELEVATOR_INSERT_SORT)
660 where = ELEVATOR_INSERT_BACK; 660 where = ELEVATOR_INSERT_BACK;
661 661
662 /* 662 /*
663 * this request is scheduling boundary, update 663 * this request is scheduling boundary, update
664 * end_sector 664 * end_sector
665 */ 665 */
666 if (blk_fs_request(rq)) { 666 if (blk_fs_request(rq)) {
667 q->end_sector = rq_end_sector(rq); 667 q->end_sector = rq_end_sector(rq);
668 q->boundary_rq = rq; 668 q->boundary_rq = rq;
669 } 669 }
670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) 670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
671 where = ELEVATOR_INSERT_BACK; 671 where = ELEVATOR_INSERT_BACK;
672 672
673 if (plug) 673 if (plug)
674 blk_plug_device(q); 674 blk_plug_device(q);
675 675
676 elv_insert(q, rq, where); 676 elv_insert(q, rq, where);
677 } 677 }
678 678
679 EXPORT_SYMBOL(__elv_add_request); 679 EXPORT_SYMBOL(__elv_add_request);
680 680
681 void elv_add_request(struct request_queue *q, struct request *rq, int where, 681 void elv_add_request(struct request_queue *q, struct request *rq, int where,
682 int plug) 682 int plug)
683 { 683 {
684 unsigned long flags; 684 unsigned long flags;
685 685
686 spin_lock_irqsave(q->queue_lock, flags); 686 spin_lock_irqsave(q->queue_lock, flags);
687 __elv_add_request(q, rq, where, plug); 687 __elv_add_request(q, rq, where, plug);
688 spin_unlock_irqrestore(q->queue_lock, flags); 688 spin_unlock_irqrestore(q->queue_lock, flags);
689 } 689 }
690 690
691 EXPORT_SYMBOL(elv_add_request); 691 EXPORT_SYMBOL(elv_add_request);
692 692
693 static inline struct request *__elv_next_request(struct request_queue *q) 693 static inline struct request *__elv_next_request(struct request_queue *q)
694 { 694 {
695 struct request *rq; 695 struct request *rq;
696 696
697 while (1) { 697 while (1) {
698 while (!list_empty(&q->queue_head)) { 698 while (!list_empty(&q->queue_head)) {
699 rq = list_entry_rq(q->queue_head.next); 699 rq = list_entry_rq(q->queue_head.next);
700 if (blk_do_ordered(q, &rq)) 700 if (blk_do_ordered(q, &rq))
701 return rq; 701 return rq;
702 } 702 }
703 703
704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
705 return NULL; 705 return NULL;
706 } 706 }
707 } 707 }
708 708
709 struct request *elv_next_request(struct request_queue *q) 709 struct request *elv_next_request(struct request_queue *q)
710 { 710 {
711 struct request *rq; 711 struct request *rq;
712 int ret; 712 int ret;
713 713
714 while ((rq = __elv_next_request(q)) != NULL) { 714 while ((rq = __elv_next_request(q)) != NULL) {
715 if (!(rq->cmd_flags & REQ_STARTED)) { 715 if (!(rq->cmd_flags & REQ_STARTED)) {
716 /* 716 /*
717 * This is the first time the device driver 717 * This is the first time the device driver
718 * sees this request (possibly after 718 * sees this request (possibly after
719 * requeueing). Notify IO scheduler. 719 * requeueing). Notify IO scheduler.
720 */ 720 */
721 if (blk_sorted_rq(rq)) 721 if (blk_sorted_rq(rq))
722 elv_activate_rq(q, rq); 722 elv_activate_rq(q, rq);
723 723
724 /* 724 /*
725 * just mark as started even if we don't start 725 * just mark as started even if we don't start
726 * it, a request that has been delayed should 726 * it, a request that has been delayed should
727 * not be passed by new incoming requests 727 * not be passed by new incoming requests
728 */ 728 */
729 rq->cmd_flags |= REQ_STARTED; 729 rq->cmd_flags |= REQ_STARTED;
730 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 730 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
731 } 731 }
732 732
733 if (!q->boundary_rq || q->boundary_rq == rq) { 733 if (!q->boundary_rq || q->boundary_rq == rq) {
734 q->end_sector = rq_end_sector(rq); 734 q->end_sector = rq_end_sector(rq);
735 q->boundary_rq = NULL; 735 q->boundary_rq = NULL;
736 } 736 }
737 737
738 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn) 738 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn)
739 break; 739 break;
740 740
741 ret = q->prep_rq_fn(q, rq); 741 ret = q->prep_rq_fn(q, rq);
742 if (ret == BLKPREP_OK) { 742 if (ret == BLKPREP_OK) {
743 break; 743 break;
744 } else if (ret == BLKPREP_DEFER) { 744 } else if (ret == BLKPREP_DEFER) {
745 /* 745 /*
746 * the request may have been (partially) prepped. 746 * the request may have been (partially) prepped.
747 * we need to keep this request in the front to 747 * we need to keep this request in the front to
748 * avoid resource deadlock. REQ_STARTED will 748 * avoid resource deadlock. REQ_STARTED will
749 * prevent other fs requests from passing this one. 749 * prevent other fs requests from passing this one.
750 */ 750 */
751 rq = NULL; 751 rq = NULL;
752 break; 752 break;
753 } else if (ret == BLKPREP_KILL) { 753 } else if (ret == BLKPREP_KILL) {
754 int nr_bytes = rq->hard_nr_sectors << 9;
755
756 if (!nr_bytes)
757 nr_bytes = rq->data_len;
758
759 blkdev_dequeue_request(rq);
760 rq->cmd_flags |= REQ_QUIET; 754 rq->cmd_flags |= REQ_QUIET;
761 end_that_request_chunk(rq, 0, nr_bytes); 755 end_queued_request(rq, 0);
762 end_that_request_last(rq, 0);
763 } else { 756 } else {
764 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__, 757 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
765 ret); 758 ret);
766 break; 759 break;
767 } 760 }
768 } 761 }
769 762
770 return rq; 763 return rq;
771 } 764 }
772 765
773 EXPORT_SYMBOL(elv_next_request); 766 EXPORT_SYMBOL(elv_next_request);
774 767
775 void elv_dequeue_request(struct request_queue *q, struct request *rq) 768 void elv_dequeue_request(struct request_queue *q, struct request *rq)
776 { 769 {
777 BUG_ON(list_empty(&rq->queuelist)); 770 BUG_ON(list_empty(&rq->queuelist));
778 BUG_ON(ELV_ON_HASH(rq)); 771 BUG_ON(ELV_ON_HASH(rq));
779 772
780 list_del_init(&rq->queuelist); 773 list_del_init(&rq->queuelist);
781 774
782 /* 775 /*
783 * the time frame between a request being removed from the lists 776 * the time frame between a request being removed from the lists
784 * and to it is freed is accounted as io that is in progress at 777 * and to it is freed is accounted as io that is in progress at
785 * the driver side. 778 * the driver side.
786 */ 779 */
787 if (blk_account_rq(rq)) 780 if (blk_account_rq(rq))
788 q->in_flight++; 781 q->in_flight++;
789 } 782 }
790 783
791 EXPORT_SYMBOL(elv_dequeue_request); 784 EXPORT_SYMBOL(elv_dequeue_request);
792 785
793 int elv_queue_empty(struct request_queue *q) 786 int elv_queue_empty(struct request_queue *q)
794 { 787 {
795 elevator_t *e = q->elevator; 788 elevator_t *e = q->elevator;
796 789
797 if (!list_empty(&q->queue_head)) 790 if (!list_empty(&q->queue_head))
798 return 0; 791 return 0;
799 792
800 if (e->ops->elevator_queue_empty_fn) 793 if (e->ops->elevator_queue_empty_fn)
801 return e->ops->elevator_queue_empty_fn(q); 794 return e->ops->elevator_queue_empty_fn(q);
802 795
803 return 1; 796 return 1;
804 } 797 }
805 798
806 EXPORT_SYMBOL(elv_queue_empty); 799 EXPORT_SYMBOL(elv_queue_empty);
807 800
808 struct request *elv_latter_request(struct request_queue *q, struct request *rq) 801 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
809 { 802 {
810 elevator_t *e = q->elevator; 803 elevator_t *e = q->elevator;
811 804
812 if (e->ops->elevator_latter_req_fn) 805 if (e->ops->elevator_latter_req_fn)
813 return e->ops->elevator_latter_req_fn(q, rq); 806 return e->ops->elevator_latter_req_fn(q, rq);
814 return NULL; 807 return NULL;
815 } 808 }
816 809
817 struct request *elv_former_request(struct request_queue *q, struct request *rq) 810 struct request *elv_former_request(struct request_queue *q, struct request *rq)
818 { 811 {
819 elevator_t *e = q->elevator; 812 elevator_t *e = q->elevator;
820 813
821 if (e->ops->elevator_former_req_fn) 814 if (e->ops->elevator_former_req_fn)
822 return e->ops->elevator_former_req_fn(q, rq); 815 return e->ops->elevator_former_req_fn(q, rq);
823 return NULL; 816 return NULL;
824 } 817 }
825 818
826 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 819 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
827 { 820 {
828 elevator_t *e = q->elevator; 821 elevator_t *e = q->elevator;
829 822
830 if (e->ops->elevator_set_req_fn) 823 if (e->ops->elevator_set_req_fn)
831 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 824 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
832 825
833 rq->elevator_private = NULL; 826 rq->elevator_private = NULL;
834 return 0; 827 return 0;
835 } 828 }
836 829
837 void elv_put_request(struct request_queue *q, struct request *rq) 830 void elv_put_request(struct request_queue *q, struct request *rq)
838 { 831 {
839 elevator_t *e = q->elevator; 832 elevator_t *e = q->elevator;
840 833
841 if (e->ops->elevator_put_req_fn) 834 if (e->ops->elevator_put_req_fn)
842 e->ops->elevator_put_req_fn(rq); 835 e->ops->elevator_put_req_fn(rq);
843 } 836 }
844 837
845 int elv_may_queue(struct request_queue *q, int rw) 838 int elv_may_queue(struct request_queue *q, int rw)
846 { 839 {
847 elevator_t *e = q->elevator; 840 elevator_t *e = q->elevator;
848 841
849 if (e->ops->elevator_may_queue_fn) 842 if (e->ops->elevator_may_queue_fn)
850 return e->ops->elevator_may_queue_fn(q, rw); 843 return e->ops->elevator_may_queue_fn(q, rw);
851 844
852 return ELV_MQUEUE_MAY; 845 return ELV_MQUEUE_MAY;
853 } 846 }
854 847
855 void elv_completed_request(struct request_queue *q, struct request *rq) 848 void elv_completed_request(struct request_queue *q, struct request *rq)
856 { 849 {
857 elevator_t *e = q->elevator; 850 elevator_t *e = q->elevator;
858 851
859 /* 852 /*
860 * request is released from the driver, io must be done 853 * request is released from the driver, io must be done
861 */ 854 */
862 if (blk_account_rq(rq)) { 855 if (blk_account_rq(rq)) {
863 q->in_flight--; 856 q->in_flight--;
864 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) 857 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
865 e->ops->elevator_completed_req_fn(q, rq); 858 e->ops->elevator_completed_req_fn(q, rq);
866 } 859 }
867 860
868 /* 861 /*
869 * Check if the queue is waiting for fs requests to be 862 * Check if the queue is waiting for fs requests to be
870 * drained for flush sequence. 863 * drained for flush sequence.
871 */ 864 */
872 if (unlikely(q->ordseq)) { 865 if (unlikely(q->ordseq)) {
873 struct request *first_rq = list_entry_rq(q->queue_head.next); 866 struct request *first_rq = list_entry_rq(q->queue_head.next);
874 if (q->in_flight == 0 && 867 if (q->in_flight == 0 &&
875 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && 868 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
876 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) { 869 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
877 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); 870 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
878 q->request_fn(q); 871 q->request_fn(q);
879 } 872 }
880 } 873 }
881 } 874 }
882 875
883 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 876 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
884 877
885 static ssize_t 878 static ssize_t
886 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 879 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
887 { 880 {
888 elevator_t *e = container_of(kobj, elevator_t, kobj); 881 elevator_t *e = container_of(kobj, elevator_t, kobj);
889 struct elv_fs_entry *entry = to_elv(attr); 882 struct elv_fs_entry *entry = to_elv(attr);
890 ssize_t error; 883 ssize_t error;
891 884
892 if (!entry->show) 885 if (!entry->show)
893 return -EIO; 886 return -EIO;
894 887
895 mutex_lock(&e->sysfs_lock); 888 mutex_lock(&e->sysfs_lock);
896 error = e->ops ? entry->show(e, page) : -ENOENT; 889 error = e->ops ? entry->show(e, page) : -ENOENT;
897 mutex_unlock(&e->sysfs_lock); 890 mutex_unlock(&e->sysfs_lock);
898 return error; 891 return error;
899 } 892 }
900 893
901 static ssize_t 894 static ssize_t
902 elv_attr_store(struct kobject *kobj, struct attribute *attr, 895 elv_attr_store(struct kobject *kobj, struct attribute *attr,
903 const char *page, size_t length) 896 const char *page, size_t length)
904 { 897 {
905 elevator_t *e = container_of(kobj, elevator_t, kobj); 898 elevator_t *e = container_of(kobj, elevator_t, kobj);
906 struct elv_fs_entry *entry = to_elv(attr); 899 struct elv_fs_entry *entry = to_elv(attr);
907 ssize_t error; 900 ssize_t error;
908 901
909 if (!entry->store) 902 if (!entry->store)
910 return -EIO; 903 return -EIO;
911 904
912 mutex_lock(&e->sysfs_lock); 905 mutex_lock(&e->sysfs_lock);
913 error = e->ops ? entry->store(e, page, length) : -ENOENT; 906 error = e->ops ? entry->store(e, page, length) : -ENOENT;
914 mutex_unlock(&e->sysfs_lock); 907 mutex_unlock(&e->sysfs_lock);
915 return error; 908 return error;
916 } 909 }
917 910
918 static struct sysfs_ops elv_sysfs_ops = { 911 static struct sysfs_ops elv_sysfs_ops = {
919 .show = elv_attr_show, 912 .show = elv_attr_show,
920 .store = elv_attr_store, 913 .store = elv_attr_store,
921 }; 914 };
922 915
923 static struct kobj_type elv_ktype = { 916 static struct kobj_type elv_ktype = {
924 .sysfs_ops = &elv_sysfs_ops, 917 .sysfs_ops = &elv_sysfs_ops,
925 .release = elevator_release, 918 .release = elevator_release,
926 }; 919 };
927 920
928 int elv_register_queue(struct request_queue *q) 921 int elv_register_queue(struct request_queue *q)
929 { 922 {
930 elevator_t *e = q->elevator; 923 elevator_t *e = q->elevator;
931 int error; 924 int error;
932 925
933 e->kobj.parent = &q->kobj; 926 e->kobj.parent = &q->kobj;
934 927
935 error = kobject_add(&e->kobj); 928 error = kobject_add(&e->kobj);
936 if (!error) { 929 if (!error) {
937 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; 930 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
938 if (attr) { 931 if (attr) {
939 while (attr->attr.name) { 932 while (attr->attr.name) {
940 if (sysfs_create_file(&e->kobj, &attr->attr)) 933 if (sysfs_create_file(&e->kobj, &attr->attr))
941 break; 934 break;
942 attr++; 935 attr++;
943 } 936 }
944 } 937 }
945 kobject_uevent(&e->kobj, KOBJ_ADD); 938 kobject_uevent(&e->kobj, KOBJ_ADD);
946 } 939 }
947 return error; 940 return error;
948 } 941 }
949 942
950 static void __elv_unregister_queue(elevator_t *e) 943 static void __elv_unregister_queue(elevator_t *e)
951 { 944 {
952 kobject_uevent(&e->kobj, KOBJ_REMOVE); 945 kobject_uevent(&e->kobj, KOBJ_REMOVE);
953 kobject_del(&e->kobj); 946 kobject_del(&e->kobj);
954 } 947 }
955 948
956 void elv_unregister_queue(struct request_queue *q) 949 void elv_unregister_queue(struct request_queue *q)
957 { 950 {
958 if (q) 951 if (q)
959 __elv_unregister_queue(q->elevator); 952 __elv_unregister_queue(q->elevator);
960 } 953 }
961 954
962 int elv_register(struct elevator_type *e) 955 int elv_register(struct elevator_type *e)
963 { 956 {
964 char *def = ""; 957 char *def = "";
965 958
966 spin_lock(&elv_list_lock); 959 spin_lock(&elv_list_lock);
967 BUG_ON(elevator_find(e->elevator_name)); 960 BUG_ON(elevator_find(e->elevator_name));
968 list_add_tail(&e->list, &elv_list); 961 list_add_tail(&e->list, &elv_list);
969 spin_unlock(&elv_list_lock); 962 spin_unlock(&elv_list_lock);
970 963
971 if (!strcmp(e->elevator_name, chosen_elevator) || 964 if (!strcmp(e->elevator_name, chosen_elevator) ||
972 (!*chosen_elevator && 965 (!*chosen_elevator &&
973 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 966 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
974 def = " (default)"; 967 def = " (default)";
975 968
976 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); 969 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def);
977 return 0; 970 return 0;
978 } 971 }
979 EXPORT_SYMBOL_GPL(elv_register); 972 EXPORT_SYMBOL_GPL(elv_register);
980 973
981 void elv_unregister(struct elevator_type *e) 974 void elv_unregister(struct elevator_type *e)
982 { 975 {
983 struct task_struct *g, *p; 976 struct task_struct *g, *p;
984 977
985 /* 978 /*
986 * Iterate every thread in the process to remove the io contexts. 979 * Iterate every thread in the process to remove the io contexts.
987 */ 980 */
988 if (e->ops.trim) { 981 if (e->ops.trim) {
989 read_lock(&tasklist_lock); 982 read_lock(&tasklist_lock);
990 do_each_thread(g, p) { 983 do_each_thread(g, p) {
991 task_lock(p); 984 task_lock(p);
992 if (p->io_context) 985 if (p->io_context)
993 e->ops.trim(p->io_context); 986 e->ops.trim(p->io_context);
994 task_unlock(p); 987 task_unlock(p);
995 } while_each_thread(g, p); 988 } while_each_thread(g, p);
996 read_unlock(&tasklist_lock); 989 read_unlock(&tasklist_lock);
997 } 990 }
998 991
999 spin_lock(&elv_list_lock); 992 spin_lock(&elv_list_lock);
1000 list_del_init(&e->list); 993 list_del_init(&e->list);
1001 spin_unlock(&elv_list_lock); 994 spin_unlock(&elv_list_lock);
1002 } 995 }
1003 EXPORT_SYMBOL_GPL(elv_unregister); 996 EXPORT_SYMBOL_GPL(elv_unregister);
1004 997
1005 /* 998 /*
1006 * switch to new_e io scheduler. be careful not to introduce deadlocks - 999 * switch to new_e io scheduler. be careful not to introduce deadlocks -
1007 * we don't free the old io scheduler, before we have allocated what we 1000 * we don't free the old io scheduler, before we have allocated what we
1008 * need for the new one. this way we have a chance of going back to the old 1001 * need for the new one. this way we have a chance of going back to the old
1009 * one, if the new one fails init for some reason. 1002 * one, if the new one fails init for some reason.
1010 */ 1003 */
1011 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 1004 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1012 { 1005 {
1013 elevator_t *old_elevator, *e; 1006 elevator_t *old_elevator, *e;
1014 void *data; 1007 void *data;
1015 1008
1016 /* 1009 /*
1017 * Allocate new elevator 1010 * Allocate new elevator
1018 */ 1011 */
1019 e = elevator_alloc(q, new_e); 1012 e = elevator_alloc(q, new_e);
1020 if (!e) 1013 if (!e)
1021 return 0; 1014 return 0;
1022 1015
1023 data = elevator_init_queue(q, e); 1016 data = elevator_init_queue(q, e);
1024 if (!data) { 1017 if (!data) {
1025 kobject_put(&e->kobj); 1018 kobject_put(&e->kobj);
1026 return 0; 1019 return 0;
1027 } 1020 }
1028 1021
1029 /* 1022 /*
1030 * Turn on BYPASS and drain all requests w/ elevator private data 1023 * Turn on BYPASS and drain all requests w/ elevator private data
1031 */ 1024 */
1032 spin_lock_irq(q->queue_lock); 1025 spin_lock_irq(q->queue_lock);
1033 1026
1034 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1027 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1035 1028
1036 elv_drain_elevator(q); 1029 elv_drain_elevator(q);
1037 1030
1038 while (q->rq.elvpriv) { 1031 while (q->rq.elvpriv) {
1039 blk_remove_plug(q); 1032 blk_remove_plug(q);
1040 q->request_fn(q); 1033 q->request_fn(q);
1041 spin_unlock_irq(q->queue_lock); 1034 spin_unlock_irq(q->queue_lock);
1042 msleep(10); 1035 msleep(10);
1043 spin_lock_irq(q->queue_lock); 1036 spin_lock_irq(q->queue_lock);
1044 elv_drain_elevator(q); 1037 elv_drain_elevator(q);
1045 } 1038 }
1046 1039
1047 /* 1040 /*
1048 * Remember old elevator. 1041 * Remember old elevator.
1049 */ 1042 */
1050 old_elevator = q->elevator; 1043 old_elevator = q->elevator;
1051 1044
1052 /* 1045 /*
1053 * attach and start new elevator 1046 * attach and start new elevator
1054 */ 1047 */
1055 elevator_attach(q, e, data); 1048 elevator_attach(q, e, data);
1056 1049
1057 spin_unlock_irq(q->queue_lock); 1050 spin_unlock_irq(q->queue_lock);
1058 1051
1059 __elv_unregister_queue(old_elevator); 1052 __elv_unregister_queue(old_elevator);
1060 1053
1061 if (elv_register_queue(q)) 1054 if (elv_register_queue(q))
1062 goto fail_register; 1055 goto fail_register;
1063 1056
1064 /* 1057 /*
1065 * finally exit old elevator and turn off BYPASS. 1058 * finally exit old elevator and turn off BYPASS.
1066 */ 1059 */
1067 elevator_exit(old_elevator); 1060 elevator_exit(old_elevator);
1068 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1061 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1069 return 1; 1062 return 1;
1070 1063
1071 fail_register: 1064 fail_register:
1072 /* 1065 /*
1073 * switch failed, exit the new io scheduler and reattach the old 1066 * switch failed, exit the new io scheduler and reattach the old
1074 * one again (along with re-adding the sysfs dir) 1067 * one again (along with re-adding the sysfs dir)
1075 */ 1068 */
1076 elevator_exit(e); 1069 elevator_exit(e);
1077 q->elevator = old_elevator; 1070 q->elevator = old_elevator;
1078 elv_register_queue(q); 1071 elv_register_queue(q);
1079 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1072 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1080 return 0; 1073 return 0;
1081 } 1074 }
1082 1075
1083 ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1076 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1084 size_t count) 1077 size_t count)
1085 { 1078 {
1086 char elevator_name[ELV_NAME_MAX]; 1079 char elevator_name[ELV_NAME_MAX];
1087 size_t len; 1080 size_t len;
1088 struct elevator_type *e; 1081 struct elevator_type *e;
1089 1082
1090 elevator_name[sizeof(elevator_name) - 1] = '\0'; 1083 elevator_name[sizeof(elevator_name) - 1] = '\0';
1091 strncpy(elevator_name, name, sizeof(elevator_name) - 1); 1084 strncpy(elevator_name, name, sizeof(elevator_name) - 1);
1092 len = strlen(elevator_name); 1085 len = strlen(elevator_name);
1093 1086
1094 if (len && elevator_name[len - 1] == '\n') 1087 if (len && elevator_name[len - 1] == '\n')
1095 elevator_name[len - 1] = '\0'; 1088 elevator_name[len - 1] = '\0';
1096 1089
1097 e = elevator_get(elevator_name); 1090 e = elevator_get(elevator_name);
1098 if (!e) { 1091 if (!e) {
1099 printk(KERN_ERR "elevator: type %s not found\n", elevator_name); 1092 printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
1100 return -EINVAL; 1093 return -EINVAL;
1101 } 1094 }
1102 1095
1103 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1096 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1104 elevator_put(e); 1097 elevator_put(e);
1105 return count; 1098 return count;
1106 } 1099 }
1107 1100
1108 if (!elevator_switch(q, e)) 1101 if (!elevator_switch(q, e))
1109 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name); 1102 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
1110 return count; 1103 return count;
1111 } 1104 }
1112 1105
1113 ssize_t elv_iosched_show(struct request_queue *q, char *name) 1106 ssize_t elv_iosched_show(struct request_queue *q, char *name)
1114 { 1107 {
1115 elevator_t *e = q->elevator; 1108 elevator_t *e = q->elevator;
1116 struct elevator_type *elv = e->elevator_type; 1109 struct elevator_type *elv = e->elevator_type;
1117 struct elevator_type *__e; 1110 struct elevator_type *__e;
1118 int len = 0; 1111 int len = 0;
1119 1112
1120 spin_lock(&elv_list_lock); 1113 spin_lock(&elv_list_lock);
1121 list_for_each_entry(__e, &elv_list, list) { 1114 list_for_each_entry(__e, &elv_list, list) {
1122 if (!strcmp(elv->elevator_name, __e->elevator_name)) 1115 if (!strcmp(elv->elevator_name, __e->elevator_name))
1123 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1116 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1124 else 1117 else
1125 len += sprintf(name+len, "%s ", __e->elevator_name); 1118 len += sprintf(name+len, "%s ", __e->elevator_name);
1126 } 1119 }
1127 spin_unlock(&elv_list_lock); 1120 spin_unlock(&elv_list_lock);
1128 1121
1129 len += sprintf(len+name, "\n"); 1122 len += sprintf(len+name, "\n");
1130 return len; 1123 return len;
1131 } 1124 }
1132 1125
1133 struct request *elv_rb_former_request(struct request_queue *q, 1126 struct request *elv_rb_former_request(struct request_queue *q,
1134 struct request *rq) 1127 struct request *rq)
1135 { 1128 {
1136 struct rb_node *rbprev = rb_prev(&rq->rb_node); 1129 struct rb_node *rbprev = rb_prev(&rq->rb_node);
1137 1130
1138 if (rbprev) 1131 if (rbprev)
1139 return rb_entry_rq(rbprev); 1132 return rb_entry_rq(rbprev);
1140 1133
1141 return NULL; 1134 return NULL;
1142 } 1135 }
1143 1136
1144 EXPORT_SYMBOL(elv_rb_former_request); 1137 EXPORT_SYMBOL(elv_rb_former_request);
1145 1138
1146 struct request *elv_rb_latter_request(struct request_queue *q, 1139 struct request *elv_rb_latter_request(struct request_queue *q,
1147 struct request *rq) 1140 struct request *rq)
1148 { 1141 {
1149 struct rb_node *rbnext = rb_next(&rq->rb_node); 1142 struct rb_node *rbnext = rb_next(&rq->rb_node);
1150 1143
1151 if (rbnext) 1144 if (rbnext)
1152 return rb_entry_rq(rbnext); 1145 return rb_entry_rq(rbnext);
1153 1146
1154 return NULL; 1147 return NULL;
1155 } 1148 }
1156 1149
1157 EXPORT_SYMBOL(elv_rb_latter_request); 1150 EXPORT_SYMBOL(elv_rb_latter_request);
1158 1151
1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July 2000
7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
8 */ 8 */
9 9
10 /* 10 /*
11 * This handles all read/write requests to block devices 11 * This handles all read/write requests to block devices
12 */ 12 */
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/backing-dev.h> 15 #include <linux/backing-dev.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/kernel_stat.h> 20 #include <linux/kernel_stat.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/blktrace_api.h> 31 #include <linux/blktrace_api.h>
32 #include <linux/fault-inject.h> 32 #include <linux/fault-inject.h>
33 33
34 /* 34 /*
35 * for max sense size 35 * for max sense size
36 */ 36 */
37 #include <scsi/scsi_cmnd.h> 37 #include <scsi/scsi_cmnd.h>
38 38
39 static void blk_unplug_work(struct work_struct *work); 39 static void blk_unplug_work(struct work_struct *work);
40 static void blk_unplug_timeout(unsigned long data); 40 static void blk_unplug_timeout(unsigned long data);
41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
42 static void init_request_from_bio(struct request *req, struct bio *bio); 42 static void init_request_from_bio(struct request *req, struct bio *bio);
43 static int __make_request(struct request_queue *q, struct bio *bio); 43 static int __make_request(struct request_queue *q, struct bio *bio);
44 static struct io_context *current_io_context(gfp_t gfp_flags, int node); 44 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
45 static void blk_recalc_rq_segments(struct request *rq); 45 static void blk_recalc_rq_segments(struct request *rq);
46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
47 struct bio *bio); 47 struct bio *bio);
48 48
49 /* 49 /*
50 * For the allocated request tables 50 * For the allocated request tables
51 */ 51 */
52 static struct kmem_cache *request_cachep; 52 static struct kmem_cache *request_cachep;
53 53
54 /* 54 /*
55 * For queue allocation 55 * For queue allocation
56 */ 56 */
57 static struct kmem_cache *requestq_cachep; 57 static struct kmem_cache *requestq_cachep;
58 58
59 /* 59 /*
60 * For io context allocations 60 * For io context allocations
61 */ 61 */
62 static struct kmem_cache *iocontext_cachep; 62 static struct kmem_cache *iocontext_cachep;
63 63
64 /* 64 /*
65 * Controlling structure to kblockd 65 * Controlling structure to kblockd
66 */ 66 */
67 static struct workqueue_struct *kblockd_workqueue; 67 static struct workqueue_struct *kblockd_workqueue;
68 68
69 unsigned long blk_max_low_pfn, blk_max_pfn; 69 unsigned long blk_max_low_pfn, blk_max_pfn;
70 70
71 EXPORT_SYMBOL(blk_max_low_pfn); 71 EXPORT_SYMBOL(blk_max_low_pfn);
72 EXPORT_SYMBOL(blk_max_pfn); 72 EXPORT_SYMBOL(blk_max_pfn);
73 73
74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done); 74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
75 75
76 /* Amount of time in which a process may batch requests */ 76 /* Amount of time in which a process may batch requests */
77 #define BLK_BATCH_TIME (HZ/50UL) 77 #define BLK_BATCH_TIME (HZ/50UL)
78 78
79 /* Number of requests a "batching" process may submit */ 79 /* Number of requests a "batching" process may submit */
80 #define BLK_BATCH_REQ 32 80 #define BLK_BATCH_REQ 32
81 81
82 /* 82 /*
83 * Return the threshold (number of used requests) at which the queue is 83 * Return the threshold (number of used requests) at which the queue is
84 * considered to be congested. It includes a little hysteresis to keep the 84 * considered to be congested. It includes a little hysteresis to keep the
85 * context switch rate down. 85 * context switch rate down.
86 */ 86 */
87 static inline int queue_congestion_on_threshold(struct request_queue *q) 87 static inline int queue_congestion_on_threshold(struct request_queue *q)
88 { 88 {
89 return q->nr_congestion_on; 89 return q->nr_congestion_on;
90 } 90 }
91 91
92 /* 92 /*
93 * The threshold at which a queue is considered to be uncongested 93 * The threshold at which a queue is considered to be uncongested
94 */ 94 */
95 static inline int queue_congestion_off_threshold(struct request_queue *q) 95 static inline int queue_congestion_off_threshold(struct request_queue *q)
96 { 96 {
97 return q->nr_congestion_off; 97 return q->nr_congestion_off;
98 } 98 }
99 99
100 static void blk_queue_congestion_threshold(struct request_queue *q) 100 static void blk_queue_congestion_threshold(struct request_queue *q)
101 { 101 {
102 int nr; 102 int nr;
103 103
104 nr = q->nr_requests - (q->nr_requests / 8) + 1; 104 nr = q->nr_requests - (q->nr_requests / 8) + 1;
105 if (nr > q->nr_requests) 105 if (nr > q->nr_requests)
106 nr = q->nr_requests; 106 nr = q->nr_requests;
107 q->nr_congestion_on = nr; 107 q->nr_congestion_on = nr;
108 108
109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
110 if (nr < 1) 110 if (nr < 1)
111 nr = 1; 111 nr = 1;
112 q->nr_congestion_off = nr; 112 q->nr_congestion_off = nr;
113 } 113 }
114 114
115 /** 115 /**
116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
117 * @bdev: device 117 * @bdev: device
118 * 118 *
119 * Locates the passed device's request queue and returns the address of its 119 * Locates the passed device's request queue and returns the address of its
120 * backing_dev_info 120 * backing_dev_info
121 * 121 *
122 * Will return NULL if the request queue cannot be located. 122 * Will return NULL if the request queue cannot be located.
123 */ 123 */
124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
125 { 125 {
126 struct backing_dev_info *ret = NULL; 126 struct backing_dev_info *ret = NULL;
127 struct request_queue *q = bdev_get_queue(bdev); 127 struct request_queue *q = bdev_get_queue(bdev);
128 128
129 if (q) 129 if (q)
130 ret = &q->backing_dev_info; 130 ret = &q->backing_dev_info;
131 return ret; 131 return ret;
132 } 132 }
133 EXPORT_SYMBOL(blk_get_backing_dev_info); 133 EXPORT_SYMBOL(blk_get_backing_dev_info);
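
/*
 * Minimal usage sketch (hypothetical caller, names are illustrative):
 * look up the backing_dev_info of an already-opened block device and
 * test its write-congestion state before pushing more writeback at it.
 */
#include <linux/backing-dev.h>
#include <linux/blkdev.h>

static int example_bdev_write_congested(struct block_device *bdev)
{
	struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);

	if (!bdi)
		return 0;	/* no queue found - treat as not congested */

	return bdi_write_congested(bdi);
}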
134 134
135 /** 135 /**
136 * blk_queue_prep_rq - set a prepare_request function for queue 136 * blk_queue_prep_rq - set a prepare_request function for queue
137 * @q: queue 137 * @q: queue
138 * @pfn: prepare_request function 138 * @pfn: prepare_request function
139 * 139 *
140 * It's possible for a queue to register a prepare_request callback which 140 * It's possible for a queue to register a prepare_request callback which
141 * is invoked before the request is handed to the request_fn. The goal of 141 * is invoked before the request is handed to the request_fn. The goal of
142 * the function is to prepare a request for I/O, it can be used to build a 142 * the function is to prepare a request for I/O, it can be used to build a
143 * cdb from the request data for instance. 143 * cdb from the request data for instance.
144 * 144 *
145 */ 145 */
146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
147 { 147 {
148 q->prep_rq_fn = pfn; 148 q->prep_rq_fn = pfn;
149 } 149 }
150 150
151 EXPORT_SYMBOL(blk_queue_prep_rq); 151 EXPORT_SYMBOL(blk_queue_prep_rq);
152 152
153 /** 153 /**
154 * blk_queue_merge_bvec - set a merge_bvec function for queue 154 * blk_queue_merge_bvec - set a merge_bvec function for queue
155 * @q: queue 155 * @q: queue
156 * @mbfn: merge_bvec_fn 156 * @mbfn: merge_bvec_fn
157 * 157 *
158 * Usually queues have static limitations on the max sectors or segments that 158 * Usually queues have static limitations on the max sectors or segments that
159 * we can put in a request. Stacking drivers may have some settings that 159 * we can put in a request. Stacking drivers may have some settings that
160 * are dynamic, and thus we have to query the queue whether it is ok to 160 * are dynamic, and thus we have to query the queue whether it is ok to
161 * add a new bio_vec to a bio at a given offset or not. If the block device 161 * add a new bio_vec to a bio at a given offset or not. If the block device
162 * has such limitations, it needs to register a merge_bvec_fn to control 162 * has such limitations, it needs to register a merge_bvec_fn to control
163 * the size of bio's sent to it. Note that a block device *must* allow a 163 * the size of bio's sent to it. Note that a block device *must* allow a
164 * single page to be added to an empty bio. The block device driver may want 164 * single page to be added to an empty bio. The block device driver may want
165 * to use the bio_split() function to deal with these bio's. By default 165 * to use the bio_split() function to deal with these bio's. By default
166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
167 * honored. 167 * honored.
168 */ 168 */
169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
170 { 170 {
171 q->merge_bvec_fn = mbfn; 171 q->merge_bvec_fn = mbfn;
172 } 172 }
173 173
174 EXPORT_SYMBOL(blk_queue_merge_bvec); 174 EXPORT_SYMBOL(blk_queue_merge_bvec);
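
/*
 * Illustrative merge_bvec_fn sketch (not from this file); it assumes
 * the prototype used by stacking drivers of this vintage,
 * int (*)(struct request_queue *, struct bio *, struct bio_vec *),
 * and caps every bio at a 64k boundary while still accepting at least
 * one page into an empty bio, as required above. It would be
 * registered with blk_queue_merge_bvec(q, example_merge_bvec).
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

#define EXAMPLE_BOUNDARY_SECTORS 128	/* 64k in 512-byte sectors */

static int example_merge_bvec(struct request_queue *q, struct bio *bio,
			      struct bio_vec *bvec)
{
	unsigned int boundary = EXAMPLE_BOUNDARY_SECTORS -
		((unsigned int)bio->bi_sector & (EXAMPLE_BOUNDARY_SECTORS - 1));
	int max = (boundary << 9) - bio->bi_size;

	if (max < 0)
		max = 0;
	if (bio->bi_size == 0 && max < bvec->bv_len)
		return bvec->bv_len;	/* empty bio: always take one page */
	return max;
}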
175 175
176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
177 { 177 {
178 q->softirq_done_fn = fn; 178 q->softirq_done_fn = fn;
179 } 179 }
180 180
181 EXPORT_SYMBOL(blk_queue_softirq_done); 181 EXPORT_SYMBOL(blk_queue_softirq_done);
182 182
183 /** 183 /**
184 * blk_queue_make_request - define an alternate make_request function for a device 184 * blk_queue_make_request - define an alternate make_request function for a device
185 * @q: the request queue for the device to be affected 185 * @q: the request queue for the device to be affected
186 * @mfn: the alternate make_request function 186 * @mfn: the alternate make_request function
187 * 187 *
188 * Description: 188 * Description:
189 * The normal way for &struct bios to be passed to a device 189 * The normal way for &struct bios to be passed to a device
190 * driver is for them to be collected into requests on a request 190 * driver is for them to be collected into requests on a request
191 * queue, and then to allow the device driver to select requests 191 * queue, and then to allow the device driver to select requests
192 * off that queue when it is ready. This works well for many block 192 * off that queue when it is ready. This works well for many block
193 * devices. However some block devices (typically virtual devices 193 * devices. However some block devices (typically virtual devices
194 * such as md or lvm) do not benefit from the processing on the 194 * such as md or lvm) do not benefit from the processing on the
195 * request queue, and are served best by having the requests passed 195 * request queue, and are served best by having the requests passed
196 * directly to them. This can be achieved by providing a function 196 * directly to them. This can be achieved by providing a function
197 * to blk_queue_make_request(). 197 * to blk_queue_make_request().
198 * 198 *
199 * Caveat: 199 * Caveat:
200 * The driver that does this *must* be able to deal appropriately 200 * The driver that does this *must* be able to deal appropriately
201 * with buffers in "highmemory". This can be accomplished by either calling 201 * with buffers in "highmemory". This can be accomplished by either calling
202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
203 * blk_queue_bounce() to create a buffer in normal memory. 203 * blk_queue_bounce() to create a buffer in normal memory.
204 **/ 204 **/
205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) 205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
206 { 206 {
207 /* 207 /*
208 * set defaults 208 * set defaults
209 */ 209 */
210 q->nr_requests = BLKDEV_MAX_RQ; 210 q->nr_requests = BLKDEV_MAX_RQ;
211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
213 q->make_request_fn = mfn; 213 q->make_request_fn = mfn;
214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
215 q->backing_dev_info.state = 0; 215 q->backing_dev_info.state = 0;
216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
218 blk_queue_hardsect_size(q, 512); 218 blk_queue_hardsect_size(q, 512);
219 blk_queue_dma_alignment(q, 511); 219 blk_queue_dma_alignment(q, 511);
220 blk_queue_congestion_threshold(q); 220 blk_queue_congestion_threshold(q);
221 q->nr_batching = BLK_BATCH_REQ; 221 q->nr_batching = BLK_BATCH_REQ;
222 222
223 q->unplug_thresh = 4; /* hmm */ 223 q->unplug_thresh = 4; /* hmm */
224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
225 if (q->unplug_delay == 0) 225 if (q->unplug_delay == 0)
226 q->unplug_delay = 1; 226 q->unplug_delay = 1;
227 227
228 INIT_WORK(&q->unplug_work, blk_unplug_work); 228 INIT_WORK(&q->unplug_work, blk_unplug_work);
229 229
230 q->unplug_timer.function = blk_unplug_timeout; 230 q->unplug_timer.function = blk_unplug_timeout;
231 q->unplug_timer.data = (unsigned long)q; 231 q->unplug_timer.data = (unsigned long)q;
232 232
233 /* 233 /*
234 * by default assume old behaviour and bounce for any highmem page 234 * by default assume old behaviour and bounce for any highmem page
235 */ 235 */
236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
237 } 237 }
238 238
239 EXPORT_SYMBOL(blk_queue_make_request); 239 EXPORT_SYMBOL(blk_queue_make_request);
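
/*
 * Sketch of a bio-based (no request queue processing) driver setup in
 * the style of md/loop; the "example_" names are hypothetical. The
 * make_request function only demonstrates the calling convention: it
 * completes every bio successfully and returns 0 to signal that the
 * bio has been consumed.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/* a real driver would map and issue the I/O here */
	bio_endio(bio, 0);
	return 0;
}

static struct request_queue *example_alloc_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, example_make_request);
	return q;
}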
240 240
241 static void rq_init(struct request_queue *q, struct request *rq) 241 static void rq_init(struct request_queue *q, struct request *rq)
242 { 242 {
243 INIT_LIST_HEAD(&rq->queuelist); 243 INIT_LIST_HEAD(&rq->queuelist);
244 INIT_LIST_HEAD(&rq->donelist); 244 INIT_LIST_HEAD(&rq->donelist);
245 245
246 rq->errors = 0; 246 rq->errors = 0;
247 rq->bio = rq->biotail = NULL; 247 rq->bio = rq->biotail = NULL;
248 INIT_HLIST_NODE(&rq->hash); 248 INIT_HLIST_NODE(&rq->hash);
249 RB_CLEAR_NODE(&rq->rb_node); 249 RB_CLEAR_NODE(&rq->rb_node);
250 rq->ioprio = 0; 250 rq->ioprio = 0;
251 rq->buffer = NULL; 251 rq->buffer = NULL;
252 rq->ref_count = 1; 252 rq->ref_count = 1;
253 rq->q = q; 253 rq->q = q;
254 rq->special = NULL; 254 rq->special = NULL;
255 rq->data_len = 0; 255 rq->data_len = 0;
256 rq->data = NULL; 256 rq->data = NULL;
257 rq->nr_phys_segments = 0; 257 rq->nr_phys_segments = 0;
258 rq->sense = NULL; 258 rq->sense = NULL;
259 rq->end_io = NULL; 259 rq->end_io = NULL;
260 rq->end_io_data = NULL; 260 rq->end_io_data = NULL;
261 rq->completion_data = NULL; 261 rq->completion_data = NULL;
262 rq->next_rq = NULL; 262 rq->next_rq = NULL;
263 } 263 }
264 264
265 /** 265 /**
266 * blk_queue_ordered - does this queue support ordered writes 266 * blk_queue_ordered - does this queue support ordered writes
267 * @q: the request queue 267 * @q: the request queue
268 * @ordered: one of QUEUE_ORDERED_* 268 * @ordered: one of QUEUE_ORDERED_*
269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes 269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
270 * 270 *
271 * Description: 271 * Description:
272 * For journalled file systems, doing ordered writes on a commit 272 * For journalled file systems, doing ordered writes on a commit
273 * block instead of explicitly doing wait_on_buffer (which is bad 273 * block instead of explicitly doing wait_on_buffer (which is bad
274 * for performance) can be a big win. Block drivers supporting this 274 * for performance) can be a big win. Block drivers supporting this
275 * feature should call this function and indicate so. 275 * feature should call this function and indicate so.
276 * 276 *
277 **/ 277 **/
278 int blk_queue_ordered(struct request_queue *q, unsigned ordered, 278 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
279 prepare_flush_fn *prepare_flush_fn) 279 prepare_flush_fn *prepare_flush_fn)
280 { 280 {
281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && 281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
282 prepare_flush_fn == NULL) { 282 prepare_flush_fn == NULL) {
283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); 283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
284 return -EINVAL; 284 return -EINVAL;
285 } 285 }
286 286
287 if (ordered != QUEUE_ORDERED_NONE && 287 if (ordered != QUEUE_ORDERED_NONE &&
288 ordered != QUEUE_ORDERED_DRAIN && 288 ordered != QUEUE_ORDERED_DRAIN &&
289 ordered != QUEUE_ORDERED_DRAIN_FLUSH && 289 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
290 ordered != QUEUE_ORDERED_DRAIN_FUA && 290 ordered != QUEUE_ORDERED_DRAIN_FUA &&
291 ordered != QUEUE_ORDERED_TAG && 291 ordered != QUEUE_ORDERED_TAG &&
292 ordered != QUEUE_ORDERED_TAG_FLUSH && 292 ordered != QUEUE_ORDERED_TAG_FLUSH &&
293 ordered != QUEUE_ORDERED_TAG_FUA) { 293 ordered != QUEUE_ORDERED_TAG_FUA) {
294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); 294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
295 return -EINVAL; 295 return -EINVAL;
296 } 296 }
297 297
298 q->ordered = ordered; 298 q->ordered = ordered;
299 q->next_ordered = ordered; 299 q->next_ordered = ordered;
300 q->prepare_flush_fn = prepare_flush_fn; 300 q->prepare_flush_fn = prepare_flush_fn;
301 301
302 return 0; 302 return 0;
303 } 303 }
304 304
305 EXPORT_SYMBOL(blk_queue_ordered); 305 EXPORT_SYMBOL(blk_queue_ordered);
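
/*
 * Usage sketch for a device with a volatile write cache that supports
 * draining plus an explicit flush (QUEUE_ORDERED_DRAIN_FLUSH). The
 * names, the BLOCK_PC-style flush command and the 0x35 opcode
 * (SYNCHRONIZE CACHE on SCSI-like hardware) are illustrative
 * assumptions, not taken from this file.
 */
#include <linux/blkdev.h>
#include <linux/string.h>

static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;
	rq->cmd[0] = 0x35;		/* SYNCHRONIZE CACHE */
	rq->cmd_len = 10;
}

static int example_enable_barriers(struct request_queue *q)
{
	return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
				 example_prepare_flush);
}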
306 306
307 /** 307 /**
308 * blk_queue_issue_flush_fn - set function for issuing a flush 308 * blk_queue_issue_flush_fn - set function for issuing a flush
309 * @q: the request queue 309 * @q: the request queue
310 * @iff: the function to be called issuing the flush 310 * @iff: the function to be called issuing the flush
311 * 311 *
312 * Description: 312 * Description:
313 * If a driver supports issuing a flush command, the support is notified 313 * If a driver supports issuing a flush command, the support is notified
314 * to the block layer by defining it through this call. 314 * to the block layer by defining it through this call.
315 * 315 *
316 **/ 316 **/
317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) 317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff)
318 { 318 {
319 q->issue_flush_fn = iff; 319 q->issue_flush_fn = iff;
320 } 320 }
321 321
322 EXPORT_SYMBOL(blk_queue_issue_flush_fn); 322 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
323 323
324 /* 324 /*
325 * Cache flushing for ordered writes handling 325 * Cache flushing for ordered writes handling
326 */ 326 */
327 inline unsigned blk_ordered_cur_seq(struct request_queue *q) 327 inline unsigned blk_ordered_cur_seq(struct request_queue *q)
328 { 328 {
329 if (!q->ordseq) 329 if (!q->ordseq)
330 return 0; 330 return 0;
331 return 1 << ffz(q->ordseq); 331 return 1 << ffz(q->ordseq);
332 } 332 }
333 333
334 unsigned blk_ordered_req_seq(struct request *rq) 334 unsigned blk_ordered_req_seq(struct request *rq)
335 { 335 {
336 struct request_queue *q = rq->q; 336 struct request_queue *q = rq->q;
337 337
338 BUG_ON(q->ordseq == 0); 338 BUG_ON(q->ordseq == 0);
339 339
340 if (rq == &q->pre_flush_rq) 340 if (rq == &q->pre_flush_rq)
341 return QUEUE_ORDSEQ_PREFLUSH; 341 return QUEUE_ORDSEQ_PREFLUSH;
342 if (rq == &q->bar_rq) 342 if (rq == &q->bar_rq)
343 return QUEUE_ORDSEQ_BAR; 343 return QUEUE_ORDSEQ_BAR;
344 if (rq == &q->post_flush_rq) 344 if (rq == &q->post_flush_rq)
345 return QUEUE_ORDSEQ_POSTFLUSH; 345 return QUEUE_ORDSEQ_POSTFLUSH;
346 346
347 /* 347 /*
348 * !fs requests don't need to follow barrier ordering. Always 348 * !fs requests don't need to follow barrier ordering. Always
349 * put them at the front. This fixes the following deadlock. 349 * put them at the front. This fixes the following deadlock.
350 * 350 *
351 * http://thread.gmane.org/gmane.linux.kernel/537473 351 * http://thread.gmane.org/gmane.linux.kernel/537473
352 */ 352 */
353 if (!blk_fs_request(rq)) 353 if (!blk_fs_request(rq))
354 return QUEUE_ORDSEQ_DRAIN; 354 return QUEUE_ORDSEQ_DRAIN;
355 355
356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
358 return QUEUE_ORDSEQ_DRAIN; 358 return QUEUE_ORDSEQ_DRAIN;
359 else 359 else
360 return QUEUE_ORDSEQ_DONE; 360 return QUEUE_ORDSEQ_DONE;
361 } 361 }
362 362
363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
364 { 364 {
365 struct request *rq; 365 struct request *rq;
366 int uptodate; 366 int uptodate;
367 367
368 if (error && !q->orderr) 368 if (error && !q->orderr)
369 q->orderr = error; 369 q->orderr = error;
370 370
371 BUG_ON(q->ordseq & seq); 371 BUG_ON(q->ordseq & seq);
372 q->ordseq |= seq; 372 q->ordseq |= seq;
373 373
374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
375 return; 375 return;
376 376
377 /* 377 /*
378 * Okay, sequence complete. 378 * Okay, sequence complete.
379 */ 379 */
380 uptodate = 1; 380 uptodate = 1;
381 if (q->orderr) 381 if (q->orderr)
382 uptodate = q->orderr; 382 uptodate = q->orderr;
383 383
384 q->ordseq = 0; 384 q->ordseq = 0;
385 rq = q->orig_bar_rq; 385 rq = q->orig_bar_rq;
386 386
387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
388 end_that_request_last(rq, uptodate); 388 end_that_request_last(rq, uptodate);
389 } 389 }
390 390
391 static void pre_flush_end_io(struct request *rq, int error) 391 static void pre_flush_end_io(struct request *rq, int error)
392 { 392 {
393 elv_completed_request(rq->q, rq); 393 elv_completed_request(rq->q, rq);
394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
395 } 395 }
396 396
397 static void bar_end_io(struct request *rq, int error) 397 static void bar_end_io(struct request *rq, int error)
398 { 398 {
399 elv_completed_request(rq->q, rq); 399 elv_completed_request(rq->q, rq);
400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
401 } 401 }
402 402
403 static void post_flush_end_io(struct request *rq, int error) 403 static void post_flush_end_io(struct request *rq, int error)
404 { 404 {
405 elv_completed_request(rq->q, rq); 405 elv_completed_request(rq->q, rq);
406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
407 } 407 }
408 408
409 static void queue_flush(struct request_queue *q, unsigned which) 409 static void queue_flush(struct request_queue *q, unsigned which)
410 { 410 {
411 struct request *rq; 411 struct request *rq;
412 rq_end_io_fn *end_io; 412 rq_end_io_fn *end_io;
413 413
414 if (which == QUEUE_ORDERED_PREFLUSH) { 414 if (which == QUEUE_ORDERED_PREFLUSH) {
415 rq = &q->pre_flush_rq; 415 rq = &q->pre_flush_rq;
416 end_io = pre_flush_end_io; 416 end_io = pre_flush_end_io;
417 } else { 417 } else {
418 rq = &q->post_flush_rq; 418 rq = &q->post_flush_rq;
419 end_io = post_flush_end_io; 419 end_io = post_flush_end_io;
420 } 420 }
421 421
422 rq->cmd_flags = REQ_HARDBARRIER; 422 rq->cmd_flags = REQ_HARDBARRIER;
423 rq_init(q, rq); 423 rq_init(q, rq);
424 rq->elevator_private = NULL; 424 rq->elevator_private = NULL;
425 rq->elevator_private2 = NULL; 425 rq->elevator_private2 = NULL;
426 rq->rq_disk = q->bar_rq.rq_disk; 426 rq->rq_disk = q->bar_rq.rq_disk;
427 rq->end_io = end_io; 427 rq->end_io = end_io;
428 q->prepare_flush_fn(q, rq); 428 q->prepare_flush_fn(q, rq);
429 429
430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
431 } 431 }
432 432
433 static inline struct request *start_ordered(struct request_queue *q, 433 static inline struct request *start_ordered(struct request_queue *q,
434 struct request *rq) 434 struct request *rq)
435 { 435 {
436 q->orderr = 0; 436 q->orderr = 0;
437 q->ordered = q->next_ordered; 437 q->ordered = q->next_ordered;
438 q->ordseq |= QUEUE_ORDSEQ_STARTED; 438 q->ordseq |= QUEUE_ORDSEQ_STARTED;
439 439
440 /* 440 /*
441 * Prep proxy barrier request. 441 * Prep proxy barrier request.
442 */ 442 */
443 blkdev_dequeue_request(rq); 443 blkdev_dequeue_request(rq);
444 q->orig_bar_rq = rq; 444 q->orig_bar_rq = rq;
445 rq = &q->bar_rq; 445 rq = &q->bar_rq;
446 rq->cmd_flags = 0; 446 rq->cmd_flags = 0;
447 rq_init(q, rq); 447 rq_init(q, rq);
448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
449 rq->cmd_flags |= REQ_RW; 449 rq->cmd_flags |= REQ_RW;
450 if (q->ordered & QUEUE_ORDERED_FUA) 450 if (q->ordered & QUEUE_ORDERED_FUA)
451 rq->cmd_flags |= REQ_FUA; 451 rq->cmd_flags |= REQ_FUA;
452 rq->elevator_private = NULL; 452 rq->elevator_private = NULL;
453 rq->elevator_private2 = NULL; 453 rq->elevator_private2 = NULL;
454 init_request_from_bio(rq, q->orig_bar_rq->bio); 454 init_request_from_bio(rq, q->orig_bar_rq->bio);
455 rq->end_io = bar_end_io; 455 rq->end_io = bar_end_io;
456 456
457 /* 457 /*
458 * Queue ordered sequence. As we stack them at the head, we 458 * Queue ordered sequence. As we stack them at the head, we
459 * need to queue in reverse order. Note that we rely on that 459 * need to queue in reverse order. Note that we rely on that
460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
461 * request gets in between the ordered sequence. 461 * request gets in between the ordered sequence.
462 */ 462 */
463 if (q->ordered & QUEUE_ORDERED_POSTFLUSH) 463 if (q->ordered & QUEUE_ORDERED_POSTFLUSH)
464 queue_flush(q, QUEUE_ORDERED_POSTFLUSH); 464 queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
465 else 465 else
466 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; 466 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
467 467
468 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 468 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
469 469
470 if (q->ordered & QUEUE_ORDERED_PREFLUSH) { 470 if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
471 queue_flush(q, QUEUE_ORDERED_PREFLUSH); 471 queue_flush(q, QUEUE_ORDERED_PREFLUSH);
472 rq = &q->pre_flush_rq; 472 rq = &q->pre_flush_rq;
473 } else 473 } else
474 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; 474 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
475 475
476 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) 476 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
477 q->ordseq |= QUEUE_ORDSEQ_DRAIN; 477 q->ordseq |= QUEUE_ORDSEQ_DRAIN;
478 else 478 else
479 rq = NULL; 479 rq = NULL;
480 480
481 return rq; 481 return rq;
482 } 482 }
483 483
484 int blk_do_ordered(struct request_queue *q, struct request **rqp) 484 int blk_do_ordered(struct request_queue *q, struct request **rqp)
485 { 485 {
486 struct request *rq = *rqp; 486 struct request *rq = *rqp;
487 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 487 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
488 488
489 if (!q->ordseq) { 489 if (!q->ordseq) {
490 if (!is_barrier) 490 if (!is_barrier)
491 return 1; 491 return 1;
492 492
493 if (q->next_ordered != QUEUE_ORDERED_NONE) { 493 if (q->next_ordered != QUEUE_ORDERED_NONE) {
494 *rqp = start_ordered(q, rq); 494 *rqp = start_ordered(q, rq);
495 return 1; 495 return 1;
496 } else { 496 } else {
497 /* 497 /*
498 * This can happen when the queue switches to 498 * This can happen when the queue switches to
499 * ORDERED_NONE while this request is on it. 499 * ORDERED_NONE while this request is on it.
500 */ 500 */
501 blkdev_dequeue_request(rq); 501 blkdev_dequeue_request(rq);
502 end_that_request_first(rq, -EOPNOTSUPP, 502 end_that_request_first(rq, -EOPNOTSUPP,
503 rq->hard_nr_sectors); 503 rq->hard_nr_sectors);
504 end_that_request_last(rq, -EOPNOTSUPP); 504 end_that_request_last(rq, -EOPNOTSUPP);
505 *rqp = NULL; 505 *rqp = NULL;
506 return 0; 506 return 0;
507 } 507 }
508 } 508 }
509 509
510 /* 510 /*
511 * Ordered sequence in progress 511 * Ordered sequence in progress
512 */ 512 */
513 513
514 /* Special requests are not subject to ordering rules. */ 514 /* Special requests are not subject to ordering rules. */
515 if (!blk_fs_request(rq) && 515 if (!blk_fs_request(rq) &&
516 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 516 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
517 return 1; 517 return 1;
518 518
519 if (q->ordered & QUEUE_ORDERED_TAG) { 519 if (q->ordered & QUEUE_ORDERED_TAG) {
520 /* Ordered by tag. Blocking the next barrier is enough. */ 520 /* Ordered by tag. Blocking the next barrier is enough. */
521 if (is_barrier && rq != &q->bar_rq) 521 if (is_barrier && rq != &q->bar_rq)
522 *rqp = NULL; 522 *rqp = NULL;
523 } else { 523 } else {
524 /* Ordered by draining. Wait for turn. */ 524 /* Ordered by draining. Wait for turn. */
525 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 525 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
526 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 526 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
527 *rqp = NULL; 527 *rqp = NULL;
528 } 528 }
529 529
530 return 1; 530 return 1;
531 } 531 }
532 532
533 static void req_bio_endio(struct request *rq, struct bio *bio, 533 static void req_bio_endio(struct request *rq, struct bio *bio,
534 unsigned int nbytes, int error) 534 unsigned int nbytes, int error)
535 { 535 {
536 struct request_queue *q = rq->q; 536 struct request_queue *q = rq->q;
537 537
538 if (&q->bar_rq != rq) { 538 if (&q->bar_rq != rq) {
539 if (error) 539 if (error)
540 clear_bit(BIO_UPTODATE, &bio->bi_flags); 540 clear_bit(BIO_UPTODATE, &bio->bi_flags);
541 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 541 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
542 error = -EIO; 542 error = -EIO;
543 543
544 if (unlikely(nbytes > bio->bi_size)) { 544 if (unlikely(nbytes > bio->bi_size)) {
545 printk("%s: want %u bytes done, only %u left\n", 545 printk("%s: want %u bytes done, only %u left\n",
546 __FUNCTION__, nbytes, bio->bi_size); 546 __FUNCTION__, nbytes, bio->bi_size);
547 nbytes = bio->bi_size; 547 nbytes = bio->bi_size;
548 } 548 }
549 549
550 bio->bi_size -= nbytes; 550 bio->bi_size -= nbytes;
551 bio->bi_sector += (nbytes >> 9); 551 bio->bi_sector += (nbytes >> 9);
552 if (bio->bi_size == 0) 552 if (bio->bi_size == 0)
553 bio_endio(bio, error); 553 bio_endio(bio, error);
554 } else { 554 } else {
555 555
556 /* 556 /*
557 * Okay, this is the barrier request in progress, just 557 * Okay, this is the barrier request in progress, just
558 * record the error; 558 * record the error;
559 */ 559 */
560 if (error && !q->orderr) 560 if (error && !q->orderr)
561 q->orderr = error; 561 q->orderr = error;
562 } 562 }
563 } 563 }
564 564
565 /** 565 /**
566 * blk_queue_bounce_limit - set bounce buffer limit for queue 566 * blk_queue_bounce_limit - set bounce buffer limit for queue
567 * @q: the request queue for the device 567 * @q: the request queue for the device
568 * @dma_addr: bus address limit 568 * @dma_addr: bus address limit
569 * 569 *
570 * Description: 570 * Description:
571 * Different hardware can have different requirements as to what pages 571 * Different hardware can have different requirements as to what pages
572 * it can do I/O directly to. A low level driver can call 572 * it can do I/O directly to. A low level driver can call
573 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 573 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
574 * buffers for doing I/O to pages residing above @dma_addr. 574 * buffers for doing I/O to pages residing above @dma_addr.
575 **/ 575 **/
576 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 576 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
577 { 577 {
578 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 578 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
579 int dma = 0; 579 int dma = 0;
580 580
581 q->bounce_gfp = GFP_NOIO; 581 q->bounce_gfp = GFP_NOIO;
582 #if BITS_PER_LONG == 64 582 #if BITS_PER_LONG == 64
583 /* Assume anything <= 4GB can be handled by IOMMU. 583 /* Assume anything <= 4GB can be handled by IOMMU.
584 Actually some IOMMUs can handle everything, but I don't 584 Actually some IOMMUs can handle everything, but I don't
585 know of a way to test this here. */ 585 know of a way to test this here. */
586 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 586 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
587 dma = 1; 587 dma = 1;
588 q->bounce_pfn = max_low_pfn; 588 q->bounce_pfn = max_low_pfn;
589 #else 589 #else
590 if (bounce_pfn < blk_max_low_pfn) 590 if (bounce_pfn < blk_max_low_pfn)
591 dma = 1; 591 dma = 1;
592 q->bounce_pfn = bounce_pfn; 592 q->bounce_pfn = bounce_pfn;
593 #endif 593 #endif
594 if (dma) { 594 if (dma) {
595 init_emergency_isa_pool(); 595 init_emergency_isa_pool();
596 q->bounce_gfp = GFP_NOIO | GFP_DMA; 596 q->bounce_gfp = GFP_NOIO | GFP_DMA;
597 q->bounce_pfn = bounce_pfn; 597 q->bounce_pfn = bounce_pfn;
598 } 598 }
599 } 599 }
600 600
601 EXPORT_SYMBOL(blk_queue_bounce_limit); 601 EXPORT_SYMBOL(blk_queue_bounce_limit);
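
/*
 * Usage sketch: a hypothetical controller that can only DMA below 4GB
 * passes that limit as its bounce threshold; a device (or IOMMU) that
 * can reach all of memory would use BLK_BOUNCE_ANY instead.
 */
#include <linux/blkdev.h>

static void example_set_bounce(struct request_queue *q, int can_dma_highmem)
{
	if (can_dma_highmem)
		blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
	else
		blk_queue_bounce_limit(q, 0xffffffffULL);	/* 32-bit DMA only */
}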
602 602
603 /** 603 /**
604 * blk_queue_max_sectors - set max sectors for a request for this queue 604 * blk_queue_max_sectors - set max sectors for a request for this queue
605 * @q: the request queue for the device 605 * @q: the request queue for the device
606 * @max_sectors: max sectors in the usual 512b unit 606 * @max_sectors: max sectors in the usual 512b unit
607 * 607 *
608 * Description: 608 * Description:
609 * Enables a low level driver to set an upper limit on the size of 609 * Enables a low level driver to set an upper limit on the size of
610 * received requests. 610 * received requests.
611 **/ 611 **/
612 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 612 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
613 { 613 {
614 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 614 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
615 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 615 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
616 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 616 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
617 } 617 }
618 618
619 if (BLK_DEF_MAX_SECTORS > max_sectors) 619 if (BLK_DEF_MAX_SECTORS > max_sectors)
620 q->max_hw_sectors = q->max_sectors = max_sectors; 620 q->max_hw_sectors = q->max_sectors = max_sectors;
621 else { 621 else {
622 q->max_sectors = BLK_DEF_MAX_SECTORS; 622 q->max_sectors = BLK_DEF_MAX_SECTORS;
623 q->max_hw_sectors = max_sectors; 623 q->max_hw_sectors = max_sectors;
624 } 624 }
625 } 625 }
626 626
627 EXPORT_SYMBOL(blk_queue_max_sectors); 627 EXPORT_SYMBOL(blk_queue_max_sectors);
628 628
629 /** 629 /**
630 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 630 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
631 * @q: the request queue for the device 631 * @q: the request queue for the device
632 * @max_segments: max number of segments 632 * @max_segments: max number of segments
633 * 633 *
634 * Description: 634 * Description:
635 * Enables a low level driver to set an upper limit on the number of 635 * Enables a low level driver to set an upper limit on the number of
636 * physical data segments in a request. This would be the largest sized 636 * physical data segments in a request. This would be the largest sized
637 * scatter list the driver could handle. 637 * scatter list the driver could handle.
638 **/ 638 **/
639 void blk_queue_max_phys_segments(struct request_queue *q, 639 void blk_queue_max_phys_segments(struct request_queue *q,
640 unsigned short max_segments) 640 unsigned short max_segments)
641 { 641 {
642 if (!max_segments) { 642 if (!max_segments) {
643 max_segments = 1; 643 max_segments = 1;
644 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 644 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
645 } 645 }
646 646
647 q->max_phys_segments = max_segments; 647 q->max_phys_segments = max_segments;
648 } 648 }
649 649
650 EXPORT_SYMBOL(blk_queue_max_phys_segments); 650 EXPORT_SYMBOL(blk_queue_max_phys_segments);
651 651
652 /** 652 /**
653 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 653 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
654 * @q: the request queue for the device 654 * @q: the request queue for the device
655 * @max_segments: max number of segments 655 * @max_segments: max number of segments
656 * 656 *
657 * Description: 657 * Description:
658 * Enables a low level driver to set an upper limit on the number of 658 * Enables a low level driver to set an upper limit on the number of
659 * hw data segments in a request. This would be the largest number of 659 * hw data segments in a request. This would be the largest number of
660 * address/length pairs the host adapter can actually give at once 660 * address/length pairs the host adapter can actually give at once
661 * to the device. 661 * to the device.
662 **/ 662 **/
663 void blk_queue_max_hw_segments(struct request_queue *q, 663 void blk_queue_max_hw_segments(struct request_queue *q,
664 unsigned short max_segments) 664 unsigned short max_segments)
665 { 665 {
666 if (!max_segments) { 666 if (!max_segments) {
667 max_segments = 1; 667 max_segments = 1;
668 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 668 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
669 } 669 }
670 670
671 q->max_hw_segments = max_segments; 671 q->max_hw_segments = max_segments;
672 } 672 }
673 673
674 EXPORT_SYMBOL(blk_queue_max_hw_segments); 674 EXPORT_SYMBOL(blk_queue_max_hw_segments);
675 675
676 /** 676 /**
677 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 677 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
678 * @q: the request queue for the device 678 * @q: the request queue for the device
679 * @max_size: max size of segment in bytes 679 * @max_size: max size of segment in bytes
680 * 680 *
681 * Description: 681 * Description:
682 * Enables a low level driver to set an upper limit on the size of a 682 * Enables a low level driver to set an upper limit on the size of a
683 * coalesced segment 683 * coalesced segment
684 **/ 684 **/
685 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 685 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
686 { 686 {
687 if (max_size < PAGE_CACHE_SIZE) { 687 if (max_size < PAGE_CACHE_SIZE) {
688 max_size = PAGE_CACHE_SIZE; 688 max_size = PAGE_CACHE_SIZE;
689 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 689 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
690 } 690 }
691 691
692 q->max_segment_size = max_size; 692 q->max_segment_size = max_size;
693 } 693 }
694 694
695 EXPORT_SYMBOL(blk_queue_max_segment_size); 695 EXPORT_SYMBOL(blk_queue_max_segment_size);
696 696
697 /** 697 /**
698 * blk_queue_hardsect_size - set hardware sector size for the queue 698 * blk_queue_hardsect_size - set hardware sector size for the queue
699 * @q: the request queue for the device 699 * @q: the request queue for the device
700 * @size: the hardware sector size, in bytes 700 * @size: the hardware sector size, in bytes
701 * 701 *
702 * Description: 702 * Description:
703 * This should typically be set to the lowest possible sector size 703 * This should typically be set to the lowest possible sector size
704 * that the hardware can operate on (without resorting even to 704 * that the hardware can operate on (without resorting even to
705 * internal read-modify-write operations). Usually the default 705 * internal read-modify-write operations). Usually the default
706 * of 512 covers most hardware. 706 * of 512 covers most hardware.
707 **/ 707 **/
708 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) 708 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
709 { 709 {
710 q->hardsect_size = size; 710 q->hardsect_size = size;
711 } 711 }
712 712
713 EXPORT_SYMBOL(blk_queue_hardsect_size); 713 EXPORT_SYMBOL(blk_queue_hardsect_size);
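
/*
 * Combined usage sketch for the limit setters above, as a hypothetical
 * driver probe path might call them; the numbers stand in for the
 * hardware's real capabilities and are not recommendations.
 */
#include <linux/blkdev.h>

static void example_set_limits(struct request_queue *q)
{
	blk_queue_max_sectors(q, 256);		/* 128k per request */
	blk_queue_max_phys_segments(q, 32);
	blk_queue_max_hw_segments(q, 32);	/* S/G table entries */
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
}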
714 714
715 /* 715 /*
716 * Returns the minimum that is _not_ zero, unless both are zero. 716 * Returns the minimum that is _not_ zero, unless both are zero.
717 */ 717 */
718 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 718 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
719 719
720 /** 720 /**
721 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 721 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
722 * @t: the stacking driver (top) 722 * @t: the stacking driver (top)
723 * @b: the underlying device (bottom) 723 * @b: the underlying device (bottom)
724 **/ 724 **/
725 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 725 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
726 { 726 {
727 /* zero is "infinity" */ 727 /* zero is "infinity" */
728 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); 728 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
729 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); 729 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
730 730
731 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 731 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
732 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 732 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
733 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 733 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
734 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 734 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
735 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 735 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
736 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); 736 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
737 } 737 }
738 738
739 EXPORT_SYMBOL(blk_queue_stack_limits); 739 EXPORT_SYMBOL(blk_queue_stack_limits);
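
/*
 * Stacking-driver sketch (in the spirit of md/dm, names hypothetical):
 * when a component device is added, fold its queue limits into the
 * top-level queue so that requests built against the top queue stay
 * acceptable to the bottom one.
 */
#include <linux/blkdev.h>

static void example_add_component(struct request_queue *top,
				  struct block_device *component)
{
	struct request_queue *bottom = bdev_get_queue(component);

	if (bottom)
		blk_queue_stack_limits(top, bottom);
}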
740 740
741 /** 741 /**
742 * blk_queue_segment_boundary - set boundary rules for segment merging 742 * blk_queue_segment_boundary - set boundary rules for segment merging
743 * @q: the request queue for the device 743 * @q: the request queue for the device
744 * @mask: the memory boundary mask 744 * @mask: the memory boundary mask
745 **/ 745 **/
746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
747 { 747 {
748 if (mask < PAGE_CACHE_SIZE - 1) { 748 if (mask < PAGE_CACHE_SIZE - 1) {
749 mask = PAGE_CACHE_SIZE - 1; 749 mask = PAGE_CACHE_SIZE - 1;
750 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 750 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
751 } 751 }
752 752
753 q->seg_boundary_mask = mask; 753 q->seg_boundary_mask = mask;
754 } 754 }
755 755
756 EXPORT_SYMBOL(blk_queue_segment_boundary); 756 EXPORT_SYMBOL(blk_queue_segment_boundary);
757 757
758 /** 758 /**
759 * blk_queue_dma_alignment - set dma length and memory alignment 759 * blk_queue_dma_alignment - set dma length and memory alignment
760 * @q: the request queue for the device 760 * @q: the request queue for the device
761 * @mask: alignment mask 761 * @mask: alignment mask
762 * 762 *
763 * description: 763 * description:
764 * set required memory and length alignment for direct dma transactions. 764 * set required memory and length alignment for direct dma transactions.
765 * this is used when building direct io requests for the queue. 765 * this is used when building direct io requests for the queue.
766 * 766 *
767 **/ 767 **/
768 void blk_queue_dma_alignment(struct request_queue *q, int mask) 768 void blk_queue_dma_alignment(struct request_queue *q, int mask)
769 { 769 {
770 q->dma_alignment = mask; 770 q->dma_alignment = mask;
771 } 771 }
772 772
773 EXPORT_SYMBOL(blk_queue_dma_alignment); 773 EXPORT_SYMBOL(blk_queue_dma_alignment);
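
/*
 * Usage sketch: a controller requiring 512-byte aligned buffers and
 * lengths for direct I/O passes the corresponding mask (length - 1).
 */
#include <linux/blkdev.h>

static void example_set_dma_alignment(struct request_queue *q)
{
	blk_queue_dma_alignment(q, 512 - 1);
}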
774 774
775 /** 775 /**
776 * blk_queue_find_tag - find a request by its tag and queue 776 * blk_queue_find_tag - find a request by its tag and queue
777 * @q: The request queue for the device 777 * @q: The request queue for the device
778 * @tag: The tag of the request 778 * @tag: The tag of the request
779 * 779 *
780 * Notes: 780 * Notes:
781 * Should be used when a device returns a tag and you want to match 781 * Should be used when a device returns a tag and you want to match
782 * it with a request. 782 * it with a request.
783 * 783 *
784 * no locks need be held. 784 * no locks need be held.
785 **/ 785 **/
786 struct request *blk_queue_find_tag(struct request_queue *q, int tag) 786 struct request *blk_queue_find_tag(struct request_queue *q, int tag)
787 { 787 {
788 return blk_map_queue_find_tag(q->queue_tags, tag); 788 return blk_map_queue_find_tag(q->queue_tags, tag);
789 } 789 }
790 790
791 EXPORT_SYMBOL(blk_queue_find_tag); 791 EXPORT_SYMBOL(blk_queue_find_tag);
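
/*
 * Completion-side sketch for tagged queueing (hypothetical driver,
 * locking and error handling omitted): the interrupt handler maps a
 * hardware-reported tag back to its request and hands it to the block
 * layer's softirq completion, which assumes a softirq_done_fn has been
 * set on the queue.
 */
#include <linux/blkdev.h>

static void example_irq_complete_tag(struct request_queue *q, int tag)
{
	struct request *rq = blk_queue_find_tag(q, tag);

	if (rq)
		blk_complete_request(rq);
}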
792 792
793 /** 793 /**
794 * __blk_free_tags - release a given set of tag maintenance info 794 * __blk_free_tags - release a given set of tag maintenance info
795 * @bqt: the tag map to free 795 * @bqt: the tag map to free
796 * 796 *
797 * Tries to free the specified @bqt@. Returns true if it was 797 * Tries to free the specified @bqt@. Returns true if it was
798 * actually freed and false if there are still references using it 798 * actually freed and false if there are still references using it
799 */ 799 */
800 static int __blk_free_tags(struct blk_queue_tag *bqt) 800 static int __blk_free_tags(struct blk_queue_tag *bqt)
801 { 801 {
802 int retval; 802 int retval;
803 803
804 retval = atomic_dec_and_test(&bqt->refcnt); 804 retval = atomic_dec_and_test(&bqt->refcnt);
805 if (retval) { 805 if (retval) {
806 BUG_ON(bqt->busy); 806 BUG_ON(bqt->busy);
807 BUG_ON(!list_empty(&bqt->busy_list)); 807 BUG_ON(!list_empty(&bqt->busy_list));
808 808
809 kfree(bqt->tag_index); 809 kfree(bqt->tag_index);
810 bqt->tag_index = NULL; 810 bqt->tag_index = NULL;
811 811
812 kfree(bqt->tag_map); 812 kfree(bqt->tag_map);
813 bqt->tag_map = NULL; 813 bqt->tag_map = NULL;
814 814
815 kfree(bqt); 815 kfree(bqt);
816 816
817 } 817 }
818 818
819 return retval; 819 return retval;
820 } 820 }
821 821
822 /** 822 /**
823 * __blk_queue_free_tags - release tag maintenance info 823 * __blk_queue_free_tags - release tag maintenance info
824 * @q: the request queue for the device 824 * @q: the request queue for the device
825 * 825 *
826 * Notes: 826 * Notes:
827 * blk_cleanup_queue() will take care of calling this function, if tagging 827 * blk_cleanup_queue() will take care of calling this function, if tagging
828 * has been used. So there's no need to call this directly. 828 * has been used. So there's no need to call this directly.
829 **/ 829 **/
830 static void __blk_queue_free_tags(struct request_queue *q) 830 static void __blk_queue_free_tags(struct request_queue *q)
831 { 831 {
832 struct blk_queue_tag *bqt = q->queue_tags; 832 struct blk_queue_tag *bqt = q->queue_tags;
833 833
834 if (!bqt) 834 if (!bqt)
835 return; 835 return;
836 836
837 __blk_free_tags(bqt); 837 __blk_free_tags(bqt);
838 838
839 q->queue_tags = NULL; 839 q->queue_tags = NULL;
840 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 840 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
841 } 841 }
842 842
843 843
844 /** 844 /**
845 * blk_free_tags - release a given set of tag maintenance info 845 * blk_free_tags - release a given set of tag maintenance info
846 * @bqt: the tag map to free 846 * @bqt: the tag map to free
847 * 847 *
848 * For an externally managed @bqt@, frees the map. Callers of this 848 * For an externally managed @bqt@, frees the map. Callers of this
849 * function must guarantee to have released all the queues that 849 * function must guarantee to have released all the queues that
850 * might have been using this tag map. 850 * might have been using this tag map.
851 */ 851 */
852 void blk_free_tags(struct blk_queue_tag *bqt) 852 void blk_free_tags(struct blk_queue_tag *bqt)
853 { 853 {
854 if (unlikely(!__blk_free_tags(bqt))) 854 if (unlikely(!__blk_free_tags(bqt)))
855 BUG(); 855 BUG();
856 } 856 }
857 EXPORT_SYMBOL(blk_free_tags); 857 EXPORT_SYMBOL(blk_free_tags);
858 858
859 /** 859 /**
860 * blk_queue_free_tags - release tag maintenance info 860 * blk_queue_free_tags - release tag maintenance info
861 * @q: the request queue for the device 861 * @q: the request queue for the device
862 * 862 *
863 * Notes: 863 * Notes:
864 * This is used to disable tagged queuing on a device, yet leave 864 * This is used to disable tagged queuing on a device, yet leave
865 * the queue in function. 865 * the queue in function.
866 **/ 866 **/
867 void blk_queue_free_tags(struct request_queue *q) 867 void blk_queue_free_tags(struct request_queue *q)
868 { 868 {
869 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 869 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
870 } 870 }
871 871
872 EXPORT_SYMBOL(blk_queue_free_tags); 872 EXPORT_SYMBOL(blk_queue_free_tags);
873 873
874 static int 874 static int
875 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) 875 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
876 { 876 {
877 struct request **tag_index; 877 struct request **tag_index;
878 unsigned long *tag_map; 878 unsigned long *tag_map;
879 int nr_ulongs; 879 int nr_ulongs;
880 880
881 if (q && depth > q->nr_requests * 2) { 881 if (q && depth > q->nr_requests * 2) {
882 depth = q->nr_requests * 2; 882 depth = q->nr_requests * 2;
883 printk(KERN_ERR "%s: adjusted depth to %d\n", 883 printk(KERN_ERR "%s: adjusted depth to %d\n",
884 __FUNCTION__, depth); 884 __FUNCTION__, depth);
885 } 885 }
886 886
887 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); 887 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
888 if (!tag_index) 888 if (!tag_index)
889 goto fail; 889 goto fail;
890 890
891 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 891 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
892 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 892 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
893 if (!tag_map) 893 if (!tag_map)
894 goto fail; 894 goto fail;
895 895
896 tags->real_max_depth = depth; 896 tags->real_max_depth = depth;
897 tags->max_depth = depth; 897 tags->max_depth = depth;
898 tags->tag_index = tag_index; 898 tags->tag_index = tag_index;
899 tags->tag_map = tag_map; 899 tags->tag_map = tag_map;
900 900
901 return 0; 901 return 0;
902 fail: 902 fail:
903 kfree(tag_index); 903 kfree(tag_index);
904 return -ENOMEM; 904 return -ENOMEM;
905 } 905 }
906 906
907 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 907 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
908 int depth) 908 int depth)
909 { 909 {
910 struct blk_queue_tag *tags; 910 struct blk_queue_tag *tags;
911 911
912 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 912 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
913 if (!tags) 913 if (!tags)
914 goto fail; 914 goto fail;
915 915
916 if (init_tag_map(q, tags, depth)) 916 if (init_tag_map(q, tags, depth))
917 goto fail; 917 goto fail;
918 918
919 INIT_LIST_HEAD(&tags->busy_list); 919 INIT_LIST_HEAD(&tags->busy_list);
920 tags->busy = 0; 920 tags->busy = 0;
921 atomic_set(&tags->refcnt, 1); 921 atomic_set(&tags->refcnt, 1);
922 return tags; 922 return tags;
923 fail: 923 fail:
924 kfree(tags); 924 kfree(tags);
925 return NULL; 925 return NULL;
926 } 926 }
927 927
928 /** 928 /**
929 * blk_init_tags - initialize the tag info for an external tag map 929 * blk_init_tags - initialize the tag info for an external tag map
930 * @depth: the maximum queue depth supported 930 * @depth: the maximum queue depth supported
931 * @tags: the tag to use 931 * @tags: the tag to use
932 **/ 932 **/
933 struct blk_queue_tag *blk_init_tags(int depth) 933 struct blk_queue_tag *blk_init_tags(int depth)
934 { 934 {
935 return __blk_queue_init_tags(NULL, depth); 935 return __blk_queue_init_tags(NULL, depth);
936 } 936 }
937 EXPORT_SYMBOL(blk_init_tags); 937 EXPORT_SYMBOL(blk_init_tags);
938 938
939 /** 939 /**
940 * blk_queue_init_tags - initialize the queue tag info 940 * blk_queue_init_tags - initialize the queue tag info
941 * @q: the request queue for the device 941 * @q: the request queue for the device
942 * @depth: the maximum queue depth supported 942 * @depth: the maximum queue depth supported
943 * @tags: the tag to use 943 * @tags: the tag to use
944 **/ 944 **/
945 int blk_queue_init_tags(struct request_queue *q, int depth, 945 int blk_queue_init_tags(struct request_queue *q, int depth,
946 struct blk_queue_tag *tags) 946 struct blk_queue_tag *tags)
947 { 947 {
948 int rc; 948 int rc;
949 949
950 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 950 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
951 951
952 if (!tags && !q->queue_tags) { 952 if (!tags && !q->queue_tags) {
953 tags = __blk_queue_init_tags(q, depth); 953 tags = __blk_queue_init_tags(q, depth);
954 954
955 if (!tags) 955 if (!tags)
956 goto fail; 956 goto fail;
957 } else if (q->queue_tags) { 957 } else if (q->queue_tags) {
958 if ((rc = blk_queue_resize_tags(q, depth))) 958 if ((rc = blk_queue_resize_tags(q, depth)))
959 return rc; 959 return rc;
960 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 960 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
961 return 0; 961 return 0;
962 } else 962 } else
963 atomic_inc(&tags->refcnt); 963 atomic_inc(&tags->refcnt);
964 964
965 /* 965 /*
966 * assign it, all done 966 * assign it, all done
967 */ 967 */
968 q->queue_tags = tags; 968 q->queue_tags = tags;
969 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 969 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
970 return 0; 970 return 0;
971 fail: 971 fail:
972 kfree(tags); 972 kfree(tags);
973 return -ENOMEM; 973 return -ENOMEM;
974 } 974 }
975 975
976 EXPORT_SYMBOL(blk_queue_init_tags); 976 EXPORT_SYMBOL(blk_queue_init_tags);
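A minimal setup sketch for the tag-init helpers above, assuming a hypothetical driver (MYDRV_QUEUE_DEPTH, struct mydrv_host and its fields are illustrative, not real kernel symbols): passing NULL asks the block layer to allocate a private tag map, while a map shared between several queues is created once with blk_init_tags() and then passed to blk_queue_init_tags() for each queue.

#include <linux/blkdev.h>

/* one private tag map for a single queue */
static int mydrv_setup_tags(struct request_queue *q)
{
        return blk_queue_init_tags(q, MYDRV_QUEUE_DEPTH, NULL);
}

/* one tag map shared by all ports of the same controller */
static int mydrv_setup_shared_tags(struct mydrv_host *host)
{
        int i;

        host->tags = blk_init_tags(MYDRV_QUEUE_DEPTH);
        if (!host->tags)
                return -ENOMEM;

        for (i = 0; i < host->nr_ports; i++)
                if (blk_queue_init_tags(host->port[i].queue,
                                        MYDRV_QUEUE_DEPTH, host->tags))
                        return -ENOMEM;

        return 0;
}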
977 977
978 /** 978 /**
979 * blk_queue_resize_tags - change the queueing depth 979 * blk_queue_resize_tags - change the queueing depth
980 * @q: the request queue for the device 980 * @q: the request queue for the device
981 * @new_depth: the new max command queueing depth 981 * @new_depth: the new max command queueing depth
982 * 982 *
983 * Notes: 983 * Notes:
984 * Must be called with the queue lock held. 984 * Must be called with the queue lock held.
985 **/ 985 **/
986 int blk_queue_resize_tags(struct request_queue *q, int new_depth) 986 int blk_queue_resize_tags(struct request_queue *q, int new_depth)
987 { 987 {
988 struct blk_queue_tag *bqt = q->queue_tags; 988 struct blk_queue_tag *bqt = q->queue_tags;
989 struct request **tag_index; 989 struct request **tag_index;
990 unsigned long *tag_map; 990 unsigned long *tag_map;
991 int max_depth, nr_ulongs; 991 int max_depth, nr_ulongs;
992 992
993 if (!bqt) 993 if (!bqt)
994 return -ENXIO; 994 return -ENXIO;
995 995
996 /* 996 /*
997 * if we already have a large enough real_max_depth, just 997 * if we already have a large enough real_max_depth, just
998 * adjust max_depth. *NOTE* as requests with tag values 998 * adjust max_depth. *NOTE* as requests with tag values
999 * between new_depth and real_max_depth can be in flight, the 999 * between new_depth and real_max_depth can be in flight, the
1000 * tag map cannot be shrunk blindly here. 1000 * tag map cannot be shrunk blindly here.
1001 */ 1001 */
1002 if (new_depth <= bqt->real_max_depth) { 1002 if (new_depth <= bqt->real_max_depth) {
1003 bqt->max_depth = new_depth; 1003 bqt->max_depth = new_depth;
1004 return 0; 1004 return 0;
1005 } 1005 }
1006 1006
1007 /* 1007 /*
1008 * Currently cannot replace a shared tag map with a new 1008 * Currently cannot replace a shared tag map with a new
1009 * one, so error out if this is the case 1009 * one, so error out if this is the case
1010 */ 1010 */
1011 if (atomic_read(&bqt->refcnt) != 1) 1011 if (atomic_read(&bqt->refcnt) != 1)
1012 return -EBUSY; 1012 return -EBUSY;
1013 1013
1014 /* 1014 /*
1015 * save the old state info, so we can copy it back 1015 * save the old state info, so we can copy it back
1016 */ 1016 */
1017 tag_index = bqt->tag_index; 1017 tag_index = bqt->tag_index;
1018 tag_map = bqt->tag_map; 1018 tag_map = bqt->tag_map;
1019 max_depth = bqt->real_max_depth; 1019 max_depth = bqt->real_max_depth;
1020 1020
1021 if (init_tag_map(q, bqt, new_depth)) 1021 if (init_tag_map(q, bqt, new_depth))
1022 return -ENOMEM; 1022 return -ENOMEM;
1023 1023
1024 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 1024 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
1025 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 1025 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
1026 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 1026 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
1027 1027
1028 kfree(tag_index); 1028 kfree(tag_index);
1029 kfree(tag_map); 1029 kfree(tag_map);
1030 return 0; 1030 return 0;
1031 } 1031 }
1032 1032
1033 EXPORT_SYMBOL(blk_queue_resize_tags); 1033 EXPORT_SYMBOL(blk_queue_resize_tags);
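A usage sketch for resizing, e.g. after the controller reports that it supports a larger depth; new_depth and the mydrv prefix are hypothetical, and the queue lock is held as the note above requires.

spin_lock_irqsave(q->queue_lock, flags);
if (blk_queue_resize_tags(q, new_depth))
        printk(KERN_WARNING "mydrv: unable to resize tag map to %d\n",
               new_depth);
spin_unlock_irqrestore(q->queue_lock, flags);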
1034 1034
1035 /** 1035 /**
1036 * blk_queue_end_tag - end tag operations for a request 1036 * blk_queue_end_tag - end tag operations for a request
1037 * @q: the request queue for the device 1037 * @q: the request queue for the device
1038 * @rq: the request that has completed 1038 * @rq: the request that has completed
1039 * 1039 *
1040 * Description: 1040 * Description:
1041 * Typically called when end_that_request_first() returns 0, meaning 1041 * Typically called when end_that_request_first() returns 0, meaning
1042 * all transfers have been done for a request. It's important to call 1042 * all transfers have been done for a request. It's important to call
1043 * this function before end_that_request_last(), as that will put the 1043 * this function before end_that_request_last(), as that will put the
1044 * request back on the free list thus corrupting the internal tag list. 1044 * request back on the free list thus corrupting the internal tag list.
1045 * 1045 *
1046 * Notes: 1046 * Notes:
1047 * queue lock must be held. 1047 * queue lock must be held.
1048 **/ 1048 **/
1049 void blk_queue_end_tag(struct request_queue *q, struct request *rq) 1049 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
1050 { 1050 {
1051 struct blk_queue_tag *bqt = q->queue_tags; 1051 struct blk_queue_tag *bqt = q->queue_tags;
1052 int tag = rq->tag; 1052 int tag = rq->tag;
1053 1053
1054 BUG_ON(tag == -1); 1054 BUG_ON(tag == -1);
1055 1055
1056 if (unlikely(tag >= bqt->real_max_depth)) 1056 if (unlikely(tag >= bqt->real_max_depth))
1057 /* 1057 /*
1058 * This can happen after tag depth has been reduced. 1058 * This can happen after tag depth has been reduced.
1059 * FIXME: how about a warning or info message here? 1059 * FIXME: how about a warning or info message here?
1060 */ 1060 */
1061 return; 1061 return;
1062 1062
1063 list_del_init(&rq->queuelist); 1063 list_del_init(&rq->queuelist);
1064 rq->cmd_flags &= ~REQ_QUEUED; 1064 rq->cmd_flags &= ~REQ_QUEUED;
1065 rq->tag = -1; 1065 rq->tag = -1;
1066 1066
1067 if (unlikely(bqt->tag_index[tag] == NULL)) 1067 if (unlikely(bqt->tag_index[tag] == NULL))
1068 printk(KERN_ERR "%s: tag %d is missing\n", 1068 printk(KERN_ERR "%s: tag %d is missing\n",
1069 __FUNCTION__, tag); 1069 __FUNCTION__, tag);
1070 1070
1071 bqt->tag_index[tag] = NULL; 1071 bqt->tag_index[tag] = NULL;
1072 1072
1073 /* 1073 /*
1074 * We use test_and_clear_bit's memory ordering properties here. 1074 * We use test_and_clear_bit's memory ordering properties here.
1075 * The tag_map bit acts as a lock for tag_index[bit], so we need 1075 * The tag_map bit acts as a lock for tag_index[bit], so we need
1076 * a barrier before clearing the bit (precisely: release semantics). 1076 * a barrier before clearing the bit (precisely: release semantics).
1077 * Could use clear_bit_unlock when it is merged. 1077 * Could use clear_bit_unlock when it is merged.
1078 */ 1078 */
1079 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) { 1079 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) {
1080 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 1080 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
1081 __FUNCTION__, tag); 1081 __FUNCTION__, tag);
1082 return; 1082 return;
1083 } 1083 }
1084 1084
1085 bqt->busy--; 1085 bqt->busy--;
1086 } 1086 }
1087 1087
1088 EXPORT_SYMBOL(blk_queue_end_tag); 1088 EXPORT_SYMBOL(blk_queue_end_tag);
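A completion-path sketch that follows the ordering required above, releasing the tag before end_that_request_last() puts the request back on the free list; the mydrv name is hypothetical and the queue lock is assumed to be held.

static void mydrv_finish_request(struct request_queue *q,
                                 struct request *rq, int uptodate)
{
        /* complete all remaining sectors in one go */
        if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
                blk_queue_end_tag(q, rq);       /* release the tag first */
                end_that_request_last(rq, uptodate);
        }
}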
1089 1089
1090 /** 1090 /**
1091 * blk_queue_start_tag - find a free tag and assign it 1091 * blk_queue_start_tag - find a free tag and assign it
1092 * @q: the request queue for the device 1092 * @q: the request queue for the device
1093 * @rq: the block request that needs tagging 1093 * @rq: the block request that needs tagging
1094 * 1094 *
1095 * Description: 1095 * Description:
1096 * This can either be used as a stand-alone helper, or possibly be 1096 * This can either be used as a stand-alone helper, or possibly be
1097 * assigned as the queue &prep_rq_fn (in which case &struct request 1097 * assigned as the queue &prep_rq_fn (in which case &struct request
1098 * automagically gets a tag assigned). Note that this function 1098 * automagically gets a tag assigned). Note that this function
1099 * assumes that any type of request can be queued! If this is not 1099 * assumes that any type of request can be queued! If this is not
1100 * true for your device, you must check the request type before 1100 * true for your device, you must check the request type before
1101 * calling this function. The request will also be removed from 1101 * calling this function. The request will also be removed from
1102 * the request queue, so it's the driver's responsibility to re-add 1102 * the request queue, so it's the driver's responsibility to re-add
1103 * it if it should need to be restarted for some reason. 1103 * it if it should need to be restarted for some reason.
1104 * 1104 *
1105 * Notes: 1105 * Notes:
1106 * queue lock must be held. 1106 * queue lock must be held.
1107 **/ 1107 **/
1108 int blk_queue_start_tag(struct request_queue *q, struct request *rq) 1108 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
1109 { 1109 {
1110 struct blk_queue_tag *bqt = q->queue_tags; 1110 struct blk_queue_tag *bqt = q->queue_tags;
1111 int tag; 1111 int tag;
1112 1112
1113 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1113 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1114 printk(KERN_ERR 1114 printk(KERN_ERR
1115 "%s: request %p for device [%s] already tagged %d", 1115 "%s: request %p for device [%s] already tagged %d",
1116 __FUNCTION__, rq, 1116 __FUNCTION__, rq,
1117 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1117 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1118 BUG(); 1118 BUG();
1119 } 1119 }
1120 1120
1121 /* 1121 /*
1122 * Protect against shared tag maps, as we may not have exclusive 1122 * Protect against shared tag maps, as we may not have exclusive
1123 * access to the tag map. 1123 * access to the tag map.
1124 */ 1124 */
1125 do { 1125 do {
1126 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1126 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1127 if (tag >= bqt->max_depth) 1127 if (tag >= bqt->max_depth)
1128 return 1; 1128 return 1;
1129 1129
1130 } while (test_and_set_bit(tag, bqt->tag_map)); 1130 } while (test_and_set_bit(tag, bqt->tag_map));
1131 /* 1131 /*
1132 * We rely on test_and_set_bit providing lock memory ordering semantics 1132 * We rely on test_and_set_bit providing lock memory ordering semantics
1133 * (could use test_and_set_bit_lock when it is merged). 1133 * (could use test_and_set_bit_lock when it is merged).
1134 */ 1134 */
1135 1135
1136 rq->cmd_flags |= REQ_QUEUED; 1136 rq->cmd_flags |= REQ_QUEUED;
1137 rq->tag = tag; 1137 rq->tag = tag;
1138 bqt->tag_index[tag] = rq; 1138 bqt->tag_index[tag] = rq;
1139 blkdev_dequeue_request(rq); 1139 blkdev_dequeue_request(rq);
1140 list_add(&rq->queuelist, &bqt->busy_list); 1140 list_add(&rq->queuelist, &bqt->busy_list);
1141 bqt->busy++; 1141 bqt->busy++;
1142 return 0; 1142 return 0;
1143 } 1143 }
1144 1144
1145 EXPORT_SYMBOL(blk_queue_start_tag); 1145 EXPORT_SYMBOL(blk_queue_start_tag);
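A request_fn sketch using the helper as a stand-alone call (mydrv_issue() is hypothetical): requests are peeked with elv_next_request(), and the loop stops when no tag is free; blk_queue_start_tag() has already dequeued the request on success, so the driver must not dequeue it again.

static void mydrv_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = elv_next_request(q)) != NULL) {
                if (blk_queue_start_tag(q, rq))
                        break;          /* out of tags, wait for a completion */

                /* rq->tag is now valid and rq has been dequeued for us */
                mydrv_issue(rq);
        }
}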
1146 1146
1147 /** 1147 /**
1148 * blk_queue_invalidate_tags - invalidate all pending tags 1148 * blk_queue_invalidate_tags - invalidate all pending tags
1149 * @q: the request queue for the device 1149 * @q: the request queue for the device
1150 * 1150 *
1151 * Description: 1151 * Description:
1152 * Hardware conditions may dictate a need to stop all pending requests. 1152 * Hardware conditions may dictate a need to stop all pending requests.
1153 * In this case, we will safely clear the block side of the tag queue and 1153 * In this case, we will safely clear the block side of the tag queue and
1154 * re-add all requests to the request queue in the right order. 1154 * re-add all requests to the request queue in the right order.
1155 * 1155 *
1156 * Notes: 1156 * Notes:
1157 * queue lock must be held. 1157 * queue lock must be held.
1158 **/ 1158 **/
1159 void blk_queue_invalidate_tags(struct request_queue *q) 1159 void blk_queue_invalidate_tags(struct request_queue *q)
1160 { 1160 {
1161 struct blk_queue_tag *bqt = q->queue_tags; 1161 struct blk_queue_tag *bqt = q->queue_tags;
1162 struct list_head *tmp, *n; 1162 struct list_head *tmp, *n;
1163 struct request *rq; 1163 struct request *rq;
1164 1164
1165 list_for_each_safe(tmp, n, &bqt->busy_list) { 1165 list_for_each_safe(tmp, n, &bqt->busy_list) {
1166 rq = list_entry_rq(tmp); 1166 rq = list_entry_rq(tmp);
1167 1167
1168 if (rq->tag == -1) { 1168 if (rq->tag == -1) {
1169 printk(KERN_ERR 1169 printk(KERN_ERR
1170 "%s: bad tag found on list\n", __FUNCTION__); 1170 "%s: bad tag found on list\n", __FUNCTION__);
1171 list_del_init(&rq->queuelist); 1171 list_del_init(&rq->queuelist);
1172 rq->cmd_flags &= ~REQ_QUEUED; 1172 rq->cmd_flags &= ~REQ_QUEUED;
1173 } else 1173 } else
1174 blk_queue_end_tag(q, rq); 1174 blk_queue_end_tag(q, rq);
1175 1175
1176 rq->cmd_flags &= ~REQ_STARTED; 1176 rq->cmd_flags &= ~REQ_STARTED;
1177 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1177 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1178 } 1178 }
1179 } 1179 }
1180 1180
1181 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1181 EXPORT_SYMBOL(blk_queue_invalidate_tags);
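A sketch of the typical caller, e.g. an error handler running after a controller or bus reset; the queue lock is held as required above.

spin_lock_irqsave(q->queue_lock, flags);
blk_queue_invalidate_tags(q);   /* requeues every tagged request */
spin_unlock_irqrestore(q->queue_lock, flags);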
1182 1182
1183 void blk_dump_rq_flags(struct request *rq, char *msg) 1183 void blk_dump_rq_flags(struct request *rq, char *msg)
1184 { 1184 {
1185 int bit; 1185 int bit;
1186 1186
1187 printk("%s: dev %s: type=%x, flags=%x\n", msg, 1187 printk("%s: dev %s: type=%x, flags=%x\n", msg,
1188 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 1188 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
1189 rq->cmd_flags); 1189 rq->cmd_flags);
1190 1190
1191 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1191 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1192 rq->nr_sectors, 1192 rq->nr_sectors,
1193 rq->current_nr_sectors); 1193 rq->current_nr_sectors);
1194 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1194 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1195 1195
1196 if (blk_pc_request(rq)) { 1196 if (blk_pc_request(rq)) {
1197 printk("cdb: "); 1197 printk("cdb: ");
1198 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1198 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1199 printk("%02x ", rq->cmd[bit]); 1199 printk("%02x ", rq->cmd[bit]);
1200 printk("\n"); 1200 printk("\n");
1201 } 1201 }
1202 } 1202 }
1203 1203
1204 EXPORT_SYMBOL(blk_dump_rq_flags); 1204 EXPORT_SYMBOL(blk_dump_rq_flags);
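A one-line debugging sketch, typically placed on an error path; the message prefix is arbitrary.

blk_dump_rq_flags(rq, "mydrv: rejecting request");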
1205 1205
1206 void blk_recount_segments(struct request_queue *q, struct bio *bio) 1206 void blk_recount_segments(struct request_queue *q, struct bio *bio)
1207 { 1207 {
1208 struct request rq; 1208 struct request rq;
1209 struct bio *nxt = bio->bi_next; 1209 struct bio *nxt = bio->bi_next;
1210 rq.q = q; 1210 rq.q = q;
1211 rq.bio = rq.biotail = bio; 1211 rq.bio = rq.biotail = bio;
1212 bio->bi_next = NULL; 1212 bio->bi_next = NULL;
1213 blk_recalc_rq_segments(&rq); 1213 blk_recalc_rq_segments(&rq);
1214 bio->bi_next = nxt; 1214 bio->bi_next = nxt;
1215 bio->bi_phys_segments = rq.nr_phys_segments; 1215 bio->bi_phys_segments = rq.nr_phys_segments;
1216 bio->bi_hw_segments = rq.nr_hw_segments; 1216 bio->bi_hw_segments = rq.nr_hw_segments;
1217 bio->bi_flags |= (1 << BIO_SEG_VALID); 1217 bio->bi_flags |= (1 << BIO_SEG_VALID);
1218 } 1218 }
1219 EXPORT_SYMBOL(blk_recount_segments); 1219 EXPORT_SYMBOL(blk_recount_segments);
1220 1220
1221 static void blk_recalc_rq_segments(struct request *rq) 1221 static void blk_recalc_rq_segments(struct request *rq)
1222 { 1222 {
1223 int nr_phys_segs; 1223 int nr_phys_segs;
1224 int nr_hw_segs; 1224 int nr_hw_segs;
1225 unsigned int phys_size; 1225 unsigned int phys_size;
1226 unsigned int hw_size; 1226 unsigned int hw_size;
1227 struct bio_vec *bv, *bvprv = NULL; 1227 struct bio_vec *bv, *bvprv = NULL;
1228 int seg_size; 1228 int seg_size;
1229 int hw_seg_size; 1229 int hw_seg_size;
1230 int cluster; 1230 int cluster;
1231 struct req_iterator iter; 1231 struct req_iterator iter;
1232 int high, highprv = 1; 1232 int high, highprv = 1;
1233 struct request_queue *q = rq->q; 1233 struct request_queue *q = rq->q;
1234 1234
1235 if (!rq->bio) 1235 if (!rq->bio)
1236 return; 1236 return;
1237 1237
1238 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1238 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1239 hw_seg_size = seg_size = 0; 1239 hw_seg_size = seg_size = 0;
1240 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 1240 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
1241 rq_for_each_segment(bv, rq, iter) { 1241 rq_for_each_segment(bv, rq, iter) {
1242 /* 1242 /*
1243 * the trick here is making sure that a high page is never 1243 * the trick here is making sure that a high page is never
1244 * considered part of another segment, since that might 1244 * considered part of another segment, since that might
1245 * change with the bounce page. 1245 * change with the bounce page.
1246 */ 1246 */
1247 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 1247 high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
1248 if (high || highprv) 1248 if (high || highprv)
1249 goto new_hw_segment; 1249 goto new_hw_segment;
1250 if (cluster) { 1250 if (cluster) {
1251 if (seg_size + bv->bv_len > q->max_segment_size) 1251 if (seg_size + bv->bv_len > q->max_segment_size)
1252 goto new_segment; 1252 goto new_segment;
1253 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1253 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1254 goto new_segment; 1254 goto new_segment;
1255 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1255 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1256 goto new_segment; 1256 goto new_segment;
1257 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1257 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1258 goto new_hw_segment; 1258 goto new_hw_segment;
1259 1259
1260 seg_size += bv->bv_len; 1260 seg_size += bv->bv_len;
1261 hw_seg_size += bv->bv_len; 1261 hw_seg_size += bv->bv_len;
1262 bvprv = bv; 1262 bvprv = bv;
1263 continue; 1263 continue;
1264 } 1264 }
1265 new_segment: 1265 new_segment:
1266 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1266 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1267 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1267 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1268 hw_seg_size += bv->bv_len; 1268 hw_seg_size += bv->bv_len;
1269 else { 1269 else {
1270 new_hw_segment: 1270 new_hw_segment:
1271 if (nr_hw_segs == 1 && 1271 if (nr_hw_segs == 1 &&
1272 hw_seg_size > rq->bio->bi_hw_front_size) 1272 hw_seg_size > rq->bio->bi_hw_front_size)
1273 rq->bio->bi_hw_front_size = hw_seg_size; 1273 rq->bio->bi_hw_front_size = hw_seg_size;
1274 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1274 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1275 nr_hw_segs++; 1275 nr_hw_segs++;
1276 } 1276 }
1277 1277
1278 nr_phys_segs++; 1278 nr_phys_segs++;
1279 bvprv = bv; 1279 bvprv = bv;
1280 seg_size = bv->bv_len; 1280 seg_size = bv->bv_len;
1281 highprv = high; 1281 highprv = high;
1282 } 1282 }
1283 1283
1284 if (nr_hw_segs == 1 && 1284 if (nr_hw_segs == 1 &&
1285 hw_seg_size > rq->bio->bi_hw_front_size) 1285 hw_seg_size > rq->bio->bi_hw_front_size)
1286 rq->bio->bi_hw_front_size = hw_seg_size; 1286 rq->bio->bi_hw_front_size = hw_seg_size;
1287 if (hw_seg_size > rq->biotail->bi_hw_back_size) 1287 if (hw_seg_size > rq->biotail->bi_hw_back_size)
1288 rq->biotail->bi_hw_back_size = hw_seg_size; 1288 rq->biotail->bi_hw_back_size = hw_seg_size;
1289 rq->nr_phys_segments = nr_phys_segs; 1289 rq->nr_phys_segments = nr_phys_segs;
1290 rq->nr_hw_segments = nr_hw_segs; 1290 rq->nr_hw_segments = nr_hw_segs;
1291 } 1291 }
1292 1292
1293 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 1293 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
1294 struct bio *nxt) 1294 struct bio *nxt)
1295 { 1295 {
1296 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1296 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1297 return 0; 1297 return 0;
1298 1298
1299 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1299 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1300 return 0; 1300 return 0;
1301 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1301 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1302 return 0; 1302 return 0;
1303 1303
1304 /* 1304 /*
1305 * bio and nxt are contiguous in memory, check if the queue allows 1305 * bio and nxt are contiguous in memory, check if the queue allows
1306 * these two to be merged into one 1306 * these two to be merged into one
1307 */ 1307 */
1308 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1308 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1309 return 1; 1309 return 1;
1310 1310
1311 return 0; 1311 return 0;
1312 } 1312 }
1313 1313
1314 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, 1314 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
1315 struct bio *nxt) 1315 struct bio *nxt)
1316 { 1316 {
1317 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1317 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1318 blk_recount_segments(q, bio); 1318 blk_recount_segments(q, bio);
1319 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1319 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1320 blk_recount_segments(q, nxt); 1320 blk_recount_segments(q, nxt);
1321 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1321 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1322 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) 1322 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
1323 return 0; 1323 return 0;
1324 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) 1324 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
1325 return 0; 1325 return 0;
1326 1326
1327 return 1; 1327 return 1;
1328 } 1328 }
1329 1329
1330 /* 1330 /*
1331 * map a request to scatterlist, return number of sg entries setup. Caller 1331 * map a request to scatterlist, return number of sg entries setup. Caller
1332 * must make sure sg can hold rq->nr_phys_segments entries 1332 * must make sure sg can hold rq->nr_phys_segments entries
1333 */ 1333 */
1334 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 1334 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1335 struct scatterlist *sg) 1335 struct scatterlist *sg)
1336 { 1336 {
1337 struct bio_vec *bvec, *bvprv; 1337 struct bio_vec *bvec, *bvprv;
1338 struct req_iterator iter; 1338 struct req_iterator iter;
1339 int nsegs, cluster; 1339 int nsegs, cluster;
1340 1340
1341 nsegs = 0; 1341 nsegs = 0;
1342 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1342 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1343 1343
1344 /* 1344 /*
1345 * for each bio in rq 1345 * for each bio in rq
1346 */ 1346 */
1347 bvprv = NULL; 1347 bvprv = NULL;
1348 rq_for_each_segment(bvec, rq, iter) { 1348 rq_for_each_segment(bvec, rq, iter) {
1349 int nbytes = bvec->bv_len; 1349 int nbytes = bvec->bv_len;
1350 1350
1351 if (bvprv && cluster) { 1351 if (bvprv && cluster) {
1352 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1352 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1353 goto new_segment; 1353 goto new_segment;
1354 1354
1355 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1355 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1356 goto new_segment; 1356 goto new_segment;
1357 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1357 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1358 goto new_segment; 1358 goto new_segment;
1359 1359
1360 sg[nsegs - 1].length += nbytes; 1360 sg[nsegs - 1].length += nbytes;
1361 } else { 1361 } else {
1362 new_segment: 1362 new_segment:
1363 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1363 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1364 sg[nsegs].page = bvec->bv_page; 1364 sg[nsegs].page = bvec->bv_page;
1365 sg[nsegs].length = nbytes; 1365 sg[nsegs].length = nbytes;
1366 sg[nsegs].offset = bvec->bv_offset; 1366 sg[nsegs].offset = bvec->bv_offset;
1367 1367
1368 nsegs++; 1368 nsegs++;
1369 } 1369 }
1370 bvprv = bvec; 1370 bvprv = bvec;
1371 } /* segments in rq */ 1371 } /* segments in rq */
1372 1372
1373 return nsegs; 1373 return nsegs;
1374 } 1374 }
1375 1375
1376 EXPORT_SYMBOL(blk_rq_map_sg); 1376 EXPORT_SYMBOL(blk_rq_map_sg);
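A mapping sketch (MYDRV_MAX_SEGMENTS and hwdev are hypothetical): the scatterlist array is sized for the q->max_phys_segments limit the driver configured, so it can hold rq->nr_phys_segments entries, and the result is then handed to the DMA API.

struct scatterlist sg[MYDRV_MAX_SEGMENTS];
int nseg;

nseg = blk_rq_map_sg(q, rq, sg);
nseg = dma_map_sg(hwdev, sg, nseg, rq_data_dir(rq) == WRITE ?
                  DMA_TO_DEVICE : DMA_FROM_DEVICE);
/* program the nseg mapped entries into the controller here */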
1377 1377
1378 /* 1378 /*
1379 * the standard queue merge functions; they can be overridden with 1379 * the standard queue merge functions; they can be overridden with
1380 * device-specific ones if so desired 1380 * device-specific ones if so desired
1381 */ 1381 */
1382 1382
1383 static inline int ll_new_mergeable(struct request_queue *q, 1383 static inline int ll_new_mergeable(struct request_queue *q,
1384 struct request *req, 1384 struct request *req,
1385 struct bio *bio) 1385 struct bio *bio)
1386 { 1386 {
1387 int nr_phys_segs = bio_phys_segments(q, bio); 1387 int nr_phys_segs = bio_phys_segments(q, bio);
1388 1388
1389 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1389 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1390 req->cmd_flags |= REQ_NOMERGE; 1390 req->cmd_flags |= REQ_NOMERGE;
1391 if (req == q->last_merge) 1391 if (req == q->last_merge)
1392 q->last_merge = NULL; 1392 q->last_merge = NULL;
1393 return 0; 1393 return 0;
1394 } 1394 }
1395 1395
1396 /* 1396 /*
1397 * A hw segment is just getting larger, bump just the phys 1397 * A hw segment is just getting larger, bump just the phys
1398 * counter. 1398 * counter.
1399 */ 1399 */
1400 req->nr_phys_segments += nr_phys_segs; 1400 req->nr_phys_segments += nr_phys_segs;
1401 return 1; 1401 return 1;
1402 } 1402 }
1403 1403
1404 static inline int ll_new_hw_segment(struct request_queue *q, 1404 static inline int ll_new_hw_segment(struct request_queue *q,
1405 struct request *req, 1405 struct request *req,
1406 struct bio *bio) 1406 struct bio *bio)
1407 { 1407 {
1408 int nr_hw_segs = bio_hw_segments(q, bio); 1408 int nr_hw_segs = bio_hw_segments(q, bio);
1409 int nr_phys_segs = bio_phys_segments(q, bio); 1409 int nr_phys_segs = bio_phys_segments(q, bio);
1410 1410
1411 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1411 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1412 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1412 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1413 req->cmd_flags |= REQ_NOMERGE; 1413 req->cmd_flags |= REQ_NOMERGE;
1414 if (req == q->last_merge) 1414 if (req == q->last_merge)
1415 q->last_merge = NULL; 1415 q->last_merge = NULL;
1416 return 0; 1416 return 0;
1417 } 1417 }
1418 1418
1419 /* 1419 /*
1420 * This will form the start of a new hw segment. Bump both 1420 * This will form the start of a new hw segment. Bump both
1421 * counters. 1421 * counters.
1422 */ 1422 */
1423 req->nr_hw_segments += nr_hw_segs; 1423 req->nr_hw_segments += nr_hw_segs;
1424 req->nr_phys_segments += nr_phys_segs; 1424 req->nr_phys_segments += nr_phys_segs;
1425 return 1; 1425 return 1;
1426 } 1426 }
1427 1427
1428 static int ll_back_merge_fn(struct request_queue *q, struct request *req, 1428 static int ll_back_merge_fn(struct request_queue *q, struct request *req,
1429 struct bio *bio) 1429 struct bio *bio)
1430 { 1430 {
1431 unsigned short max_sectors; 1431 unsigned short max_sectors;
1432 int len; 1432 int len;
1433 1433
1434 if (unlikely(blk_pc_request(req))) 1434 if (unlikely(blk_pc_request(req)))
1435 max_sectors = q->max_hw_sectors; 1435 max_sectors = q->max_hw_sectors;
1436 else 1436 else
1437 max_sectors = q->max_sectors; 1437 max_sectors = q->max_sectors;
1438 1438
1439 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1439 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1440 req->cmd_flags |= REQ_NOMERGE; 1440 req->cmd_flags |= REQ_NOMERGE;
1441 if (req == q->last_merge) 1441 if (req == q->last_merge)
1442 q->last_merge = NULL; 1442 q->last_merge = NULL;
1443 return 0; 1443 return 0;
1444 } 1444 }
1445 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1445 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1446 blk_recount_segments(q, req->biotail); 1446 blk_recount_segments(q, req->biotail);
1447 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1447 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1448 blk_recount_segments(q, bio); 1448 blk_recount_segments(q, bio);
1449 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1449 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1450 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1450 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1451 !BIOVEC_VIRT_OVERSIZE(len)) { 1451 !BIOVEC_VIRT_OVERSIZE(len)) {
1452 int mergeable = ll_new_mergeable(q, req, bio); 1452 int mergeable = ll_new_mergeable(q, req, bio);
1453 1453
1454 if (mergeable) { 1454 if (mergeable) {
1455 if (req->nr_hw_segments == 1) 1455 if (req->nr_hw_segments == 1)
1456 req->bio->bi_hw_front_size = len; 1456 req->bio->bi_hw_front_size = len;
1457 if (bio->bi_hw_segments == 1) 1457 if (bio->bi_hw_segments == 1)
1458 bio->bi_hw_back_size = len; 1458 bio->bi_hw_back_size = len;
1459 } 1459 }
1460 return mergeable; 1460 return mergeable;
1461 } 1461 }
1462 1462
1463 return ll_new_hw_segment(q, req, bio); 1463 return ll_new_hw_segment(q, req, bio);
1464 } 1464 }
1465 1465
1466 static int ll_front_merge_fn(struct request_queue *q, struct request *req, 1466 static int ll_front_merge_fn(struct request_queue *q, struct request *req,
1467 struct bio *bio) 1467 struct bio *bio)
1468 { 1468 {
1469 unsigned short max_sectors; 1469 unsigned short max_sectors;
1470 int len; 1470 int len;
1471 1471
1472 if (unlikely(blk_pc_request(req))) 1472 if (unlikely(blk_pc_request(req)))
1473 max_sectors = q->max_hw_sectors; 1473 max_sectors = q->max_hw_sectors;
1474 else 1474 else
1475 max_sectors = q->max_sectors; 1475 max_sectors = q->max_sectors;
1476 1476
1477 1477
1478 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1478 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1479 req->cmd_flags |= REQ_NOMERGE; 1479 req->cmd_flags |= REQ_NOMERGE;
1480 if (req == q->last_merge) 1480 if (req == q->last_merge)
1481 q->last_merge = NULL; 1481 q->last_merge = NULL;
1482 return 0; 1482 return 0;
1483 } 1483 }
1484 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1484 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1485 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1485 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1486 blk_recount_segments(q, bio); 1486 blk_recount_segments(q, bio);
1487 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1487 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1488 blk_recount_segments(q, req->bio); 1488 blk_recount_segments(q, req->bio);
1489 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1489 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1490 !BIOVEC_VIRT_OVERSIZE(len)) { 1490 !BIOVEC_VIRT_OVERSIZE(len)) {
1491 int mergeable = ll_new_mergeable(q, req, bio); 1491 int mergeable = ll_new_mergeable(q, req, bio);
1492 1492
1493 if (mergeable) { 1493 if (mergeable) {
1494 if (bio->bi_hw_segments == 1) 1494 if (bio->bi_hw_segments == 1)
1495 bio->bi_hw_front_size = len; 1495 bio->bi_hw_front_size = len;
1496 if (req->nr_hw_segments == 1) 1496 if (req->nr_hw_segments == 1)
1497 req->biotail->bi_hw_back_size = len; 1497 req->biotail->bi_hw_back_size = len;
1498 } 1498 }
1499 return mergeable; 1499 return mergeable;
1500 } 1500 }
1501 1501
1502 return ll_new_hw_segment(q, req, bio); 1502 return ll_new_hw_segment(q, req, bio);
1503 } 1503 }
1504 1504
1505 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 1505 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
1506 struct request *next) 1506 struct request *next)
1507 { 1507 {
1508 int total_phys_segments; 1508 int total_phys_segments;
1509 int total_hw_segments; 1509 int total_hw_segments;
1510 1510
1511 /* 1511 /*
1512 * First check if either of the requests is a re-queued 1512 * First check if either of the requests is a re-queued
1513 * request. Can't merge them if so. 1513 * request. Can't merge them if so.
1514 */ 1514 */
1515 if (req->special || next->special) 1515 if (req->special || next->special)
1516 return 0; 1516 return 0;
1517 1517
1518 /* 1518 /*
1519 * Will it become too large? 1519 * Will it become too large?
1520 */ 1520 */
1521 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1521 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1522 return 0; 1522 return 0;
1523 1523
1524 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1524 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1525 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1525 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1526 total_phys_segments--; 1526 total_phys_segments--;
1527 1527
1528 if (total_phys_segments > q->max_phys_segments) 1528 if (total_phys_segments > q->max_phys_segments)
1529 return 0; 1529 return 0;
1530 1530
1531 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1531 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1532 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1532 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1533 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1533 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1534 /* 1534 /*
1535 * propagate the combined length to the end of the requests 1535 * propagate the combined length to the end of the requests
1536 */ 1536 */
1537 if (req->nr_hw_segments == 1) 1537 if (req->nr_hw_segments == 1)
1538 req->bio->bi_hw_front_size = len; 1538 req->bio->bi_hw_front_size = len;
1539 if (next->nr_hw_segments == 1) 1539 if (next->nr_hw_segments == 1)
1540 next->biotail->bi_hw_back_size = len; 1540 next->biotail->bi_hw_back_size = len;
1541 total_hw_segments--; 1541 total_hw_segments--;
1542 } 1542 }
1543 1543
1544 if (total_hw_segments > q->max_hw_segments) 1544 if (total_hw_segments > q->max_hw_segments)
1545 return 0; 1545 return 0;
1546 1546
1547 /* Merge is OK... */ 1547 /* Merge is OK... */
1548 req->nr_phys_segments = total_phys_segments; 1548 req->nr_phys_segments = total_phys_segments;
1549 req->nr_hw_segments = total_hw_segments; 1549 req->nr_hw_segments = total_hw_segments;
1550 return 1; 1550 return 1;
1551 } 1551 }
1552 1552
1553 /* 1553 /*
1554 * "plug" the device if there are no outstanding requests: this will 1554 * "plug" the device if there are no outstanding requests: this will
1555 * force the transfer to start only after we have put all the requests 1555 * force the transfer to start only after we have put all the requests
1556 * on the list. 1556 * on the list.
1557 * 1557 *
1558 * This is called with interrupts off and no requests on the queue and 1558 * This is called with interrupts off and no requests on the queue and
1559 * with the queue lock held. 1559 * with the queue lock held.
1560 */ 1560 */
1561 void blk_plug_device(struct request_queue *q) 1561 void blk_plug_device(struct request_queue *q)
1562 { 1562 {
1563 WARN_ON(!irqs_disabled()); 1563 WARN_ON(!irqs_disabled());
1564 1564
1565 /* 1565 /*
1566 * don't plug a stopped queue, it must be paired with blk_start_queue() 1566 * don't plug a stopped queue, it must be paired with blk_start_queue()
1567 * which will restart the queueing 1567 * which will restart the queueing
1568 */ 1568 */
1569 if (blk_queue_stopped(q)) 1569 if (blk_queue_stopped(q))
1570 return; 1570 return;
1571 1571
1572 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { 1572 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1573 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1573 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1574 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 1574 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1575 } 1575 }
1576 } 1576 }
1577 1577
1578 EXPORT_SYMBOL(blk_plug_device); 1578 EXPORT_SYMBOL(blk_plug_device);
1579 1579
1580 /* 1580 /*
1581 * remove the queue from the plugged list, if present. called with 1581 * remove the queue from the plugged list, if present. called with
1582 * queue lock held and interrupts disabled. 1582 * queue lock held and interrupts disabled.
1583 */ 1583 */
1584 int blk_remove_plug(struct request_queue *q) 1584 int blk_remove_plug(struct request_queue *q)
1585 { 1585 {
1586 WARN_ON(!irqs_disabled()); 1586 WARN_ON(!irqs_disabled());
1587 1587
1588 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1588 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1589 return 0; 1589 return 0;
1590 1590
1591 del_timer(&q->unplug_timer); 1591 del_timer(&q->unplug_timer);
1592 return 1; 1592 return 1;
1593 } 1593 }
1594 1594
1595 EXPORT_SYMBOL(blk_remove_plug); 1595 EXPORT_SYMBOL(blk_remove_plug);
1596 1596
1597 /* 1597 /*
1598 * remove the plug and let it rip.. 1598 * remove the plug and let it rip..
1599 */ 1599 */
1600 void __generic_unplug_device(struct request_queue *q) 1600 void __generic_unplug_device(struct request_queue *q)
1601 { 1601 {
1602 if (unlikely(blk_queue_stopped(q))) 1602 if (unlikely(blk_queue_stopped(q)))
1603 return; 1603 return;
1604 1604
1605 if (!blk_remove_plug(q)) 1605 if (!blk_remove_plug(q))
1606 return; 1606 return;
1607 1607
1608 q->request_fn(q); 1608 q->request_fn(q);
1609 } 1609 }
1610 EXPORT_SYMBOL(__generic_unplug_device); 1610 EXPORT_SYMBOL(__generic_unplug_device);
1611 1611
1612 /** 1612 /**
1613 * generic_unplug_device - fire a request queue 1613 * generic_unplug_device - fire a request queue
1614 * @q: The &struct request_queue in question 1614 * @q: The &struct request_queue in question
1615 * 1615 *
1616 * Description: 1616 * Description:
1617 * Linux uses plugging to build bigger request queues before letting 1617 * Linux uses plugging to build bigger request queues before letting
1618 * the device have at them. If a queue is plugged, the I/O scheduler 1618 * the device have at them. If a queue is plugged, the I/O scheduler
1619 * is still adding and merging requests on the queue. Once the queue 1619 * is still adding and merging requests on the queue. Once the queue
1620 * gets unplugged, the request_fn defined for the queue is invoked and 1620 * gets unplugged, the request_fn defined for the queue is invoked and
1621 * transfers started. 1621 * transfers started.
1622 **/ 1622 **/
1623 void generic_unplug_device(struct request_queue *q) 1623 void generic_unplug_device(struct request_queue *q)
1624 { 1624 {
1625 spin_lock_irq(q->queue_lock); 1625 spin_lock_irq(q->queue_lock);
1626 __generic_unplug_device(q); 1626 __generic_unplug_device(q);
1627 spin_unlock_irq(q->queue_lock); 1627 spin_unlock_irq(q->queue_lock);
1628 } 1628 }
1629 EXPORT_SYMBOL(generic_unplug_device); 1629 EXPORT_SYMBOL(generic_unplug_device);
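A sketch of a caller that has submitted I/O and wants it pushed out before sleeping, mirroring blk_backing_dev_unplug() above: the unplug hook is invoked without the queue lock held, since generic_unplug_device() takes the lock itself.

if (q->unplug_fn)
        q->unplug_fn(q);        /* normally generic_unplug_device() */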
1630 1630
1631 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1631 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1632 struct page *page) 1632 struct page *page)
1633 { 1633 {
1634 struct request_queue *q = bdi->unplug_io_data; 1634 struct request_queue *q = bdi->unplug_io_data;
1635 1635
1636 /* 1636 /*
1637 * devices don't necessarily have an ->unplug_fn defined 1637 * devices don't necessarily have an ->unplug_fn defined
1638 */ 1638 */
1639 if (q->unplug_fn) { 1639 if (q->unplug_fn) {
1640 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1640 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1641 q->rq.count[READ] + q->rq.count[WRITE]); 1641 q->rq.count[READ] + q->rq.count[WRITE]);
1642 1642
1643 q->unplug_fn(q); 1643 q->unplug_fn(q);
1644 } 1644 }
1645 } 1645 }
1646 1646
1647 static void blk_unplug_work(struct work_struct *work) 1647 static void blk_unplug_work(struct work_struct *work)
1648 { 1648 {
1649 struct request_queue *q = 1649 struct request_queue *q =
1650 container_of(work, struct request_queue, unplug_work); 1650 container_of(work, struct request_queue, unplug_work);
1651 1651
1652 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1652 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1653 q->rq.count[READ] + q->rq.count[WRITE]); 1653 q->rq.count[READ] + q->rq.count[WRITE]);
1654 1654
1655 q->unplug_fn(q); 1655 q->unplug_fn(q);
1656 } 1656 }
1657 1657
1658 static void blk_unplug_timeout(unsigned long data) 1658 static void blk_unplug_timeout(unsigned long data)
1659 { 1659 {
1660 struct request_queue *q = (struct request_queue *)data; 1660 struct request_queue *q = (struct request_queue *)data;
1661 1661
1662 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 1662 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1663 q->rq.count[READ] + q->rq.count[WRITE]); 1663 q->rq.count[READ] + q->rq.count[WRITE]);
1664 1664
1665 kblockd_schedule_work(&q->unplug_work); 1665 kblockd_schedule_work(&q->unplug_work);
1666 } 1666 }
1667 1667
1668 /** 1668 /**
1669 * blk_start_queue - restart a previously stopped queue 1669 * blk_start_queue - restart a previously stopped queue
1670 * @q: The &struct request_queue in question 1670 * @q: The &struct request_queue in question
1671 * 1671 *
1672 * Description: 1672 * Description:
1673 * blk_start_queue() will clear the stop flag on the queue, and call 1673 * blk_start_queue() will clear the stop flag on the queue, and call
1674 * the request_fn for the queue if it was in a stopped state when 1674 * the request_fn for the queue if it was in a stopped state when
1675 * entered. Also see blk_stop_queue(). Queue lock must be held. 1675 * entered. Also see blk_stop_queue(). Queue lock must be held.
1676 **/ 1676 **/
1677 void blk_start_queue(struct request_queue *q) 1677 void blk_start_queue(struct request_queue *q)
1678 { 1678 {
1679 WARN_ON(!irqs_disabled()); 1679 WARN_ON(!irqs_disabled());
1680 1680
1681 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1681 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1682 1682
1683 /* 1683 /*
1684 * one level of recursion is ok and is much faster than kicking 1684 * one level of recursion is ok and is much faster than kicking
1685 * the unplug handling 1685 * the unplug handling
1686 */ 1686 */
1687 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1687 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1688 q->request_fn(q); 1688 q->request_fn(q);
1689 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1689 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1690 } else { 1690 } else {
1691 blk_plug_device(q); 1691 blk_plug_device(q);
1692 kblockd_schedule_work(&q->unplug_work); 1692 kblockd_schedule_work(&q->unplug_work);
1693 } 1693 }
1694 } 1694 }
1695 1695
1696 EXPORT_SYMBOL(blk_start_queue); 1696 EXPORT_SYMBOL(blk_start_queue);
1697 1697
1698 /** 1698 /**
1699 * blk_stop_queue - stop a queue 1699 * blk_stop_queue - stop a queue
1700 * @q: The &struct request_queue in question 1700 * @q: The &struct request_queue in question
1701 * 1701 *
1702 * Description: 1702 * Description:
1703 * The Linux block layer assumes that a block driver will consume all 1703 * The Linux block layer assumes that a block driver will consume all
1704 * entries on the request queue when the request_fn strategy is called. 1704 * entries on the request queue when the request_fn strategy is called.
1705 * Often this will not happen, because of hardware limitations (queue 1705 * Often this will not happen, because of hardware limitations (queue
1706 * depth settings). If a device driver gets a 'queue full' response, 1706 * depth settings). If a device driver gets a 'queue full' response,
1707 * or if it simply chooses not to queue more I/O at one point, it can 1707 * or if it simply chooses not to queue more I/O at one point, it can
1708 * call this function to prevent the request_fn from being called until 1708 * call this function to prevent the request_fn from being called until
1709 * the driver has signalled it's ready to go again. This happens by calling 1709 * the driver has signalled it's ready to go again. This happens by calling
1710 * blk_start_queue() to restart queue operations. Queue lock must be held. 1710 * blk_start_queue() to restart queue operations. Queue lock must be held.
1711 **/ 1711 **/
1712 void blk_stop_queue(struct request_queue *q) 1712 void blk_stop_queue(struct request_queue *q)
1713 { 1713 {
1714 blk_remove_plug(q); 1714 blk_remove_plug(q);
1715 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1715 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1716 } 1716 }
1717 EXPORT_SYMBOL(blk_stop_queue); 1717 EXPORT_SYMBOL(blk_stop_queue);
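A sketch of the stop/start pairing described above, with hypothetical mydrv helpers: the request_fn (queue lock already held) stops the queue on a 'queue full' condition, and the completion interrupt restarts it once there is room again.

/* inside the request_fn, queue lock held */
if (mydrv_hw_queue_full(dev)) {
        blk_stop_queue(q);
        return;
}

/* from the completion interrupt, when the hardware has room again */
spin_lock_irqsave(q->queue_lock, flags);
blk_start_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);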
1718 1718
1719 /** 1719 /**
1720 * blk_sync_queue - cancel any pending callbacks on a queue 1720 * blk_sync_queue - cancel any pending callbacks on a queue
1721 * @q: the queue 1721 * @q: the queue
1722 * 1722 *
1723 * Description: 1723 * Description:
1724 * The block layer may perform asynchronous callback activity 1724 * The block layer may perform asynchronous callback activity
1725 * on a queue, such as calling the unplug function after a timeout. 1725 * on a queue, such as calling the unplug function after a timeout.
1726 * A block device may call blk_sync_queue to ensure that any 1726 * A block device may call blk_sync_queue to ensure that any
1727 * such activity is cancelled, thus allowing it to release resources 1727 * such activity is cancelled, thus allowing it to release resources
1728 * that the callbacks might use. The caller must already have made sure 1728 * that the callbacks might use. The caller must already have made sure
1729 * that its ->make_request_fn will not re-add plugging prior to calling 1729 * that its ->make_request_fn will not re-add plugging prior to calling
1730 * this function. 1730 * this function.
1731 * 1731 *
1732 */ 1732 */
1733 void blk_sync_queue(struct request_queue *q) 1733 void blk_sync_queue(struct request_queue *q)
1734 { 1734 {
1735 del_timer_sync(&q->unplug_timer); 1735 del_timer_sync(&q->unplug_timer);
1736 } 1736 }
1737 EXPORT_SYMBOL(blk_sync_queue); 1737 EXPORT_SYMBOL(blk_sync_queue);
1738 1738
1739 /** 1739 /**
1740 * blk_run_queue - run a single device queue 1740 * blk_run_queue - run a single device queue
1741 * @q: The queue to run 1741 * @q: The queue to run
1742 */ 1742 */
1743 void blk_run_queue(struct request_queue *q) 1743 void blk_run_queue(struct request_queue *q)
1744 { 1744 {
1745 unsigned long flags; 1745 unsigned long flags;
1746 1746
1747 spin_lock_irqsave(q->queue_lock, flags); 1747 spin_lock_irqsave(q->queue_lock, flags);
1748 blk_remove_plug(q); 1748 blk_remove_plug(q);
1749 1749
1750 /* 1750 /*
1751 * Only recurse once to avoid overrunning the stack, let the unplug 1751 * Only recurse once to avoid overrunning the stack, let the unplug
1752 * handling reinvoke the handler shortly if we already got there. 1752 * handling reinvoke the handler shortly if we already got there.
1753 */ 1753 */
1754 if (!elv_queue_empty(q)) { 1754 if (!elv_queue_empty(q)) {
1755 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1755 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1756 q->request_fn(q); 1756 q->request_fn(q);
1757 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1757 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1758 } else { 1758 } else {
1759 blk_plug_device(q); 1759 blk_plug_device(q);
1760 kblockd_schedule_work(&q->unplug_work); 1760 kblockd_schedule_work(&q->unplug_work);
1761 } 1761 }
1762 } 1762 }
1763 1763
1764 spin_unlock_irqrestore(q->queue_lock, flags); 1764 spin_unlock_irqrestore(q->queue_lock, flags);
1765 } 1765 }
1766 EXPORT_SYMBOL(blk_run_queue); 1766 EXPORT_SYMBOL(blk_run_queue);
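A sketch of a deferred kick, e.g. from a work item scheduled once a resource shortage has cleared; unlike blk_start_queue(), blk_run_queue() takes the queue lock itself, so the caller must not hold it. struct mydrv_dev and its fields are hypothetical.

static void mydrv_restart_work(struct work_struct *work)
{
        struct mydrv_dev *dev = container_of(work, struct mydrv_dev,
                                             restart_work);

        blk_run_queue(dev->queue);      /* unplug and run the request_fn */
}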
1767 1767
1768 /** 1768 /**
1769 * blk_cleanup_queue - release a &struct request_queue when it is no longer needed 1769 * blk_cleanup_queue - release a &struct request_queue when it is no longer needed
1770 * @kobj: the kobj belonging to the request queue to be released 1770 * @kobj: the kobj belonging to the request queue to be released
1771 * 1771 *
1772 * Description: 1772 * Description:
1773 * blk_cleanup_queue is the pair to blk_init_queue() or 1773 * blk_cleanup_queue is the pair to blk_init_queue() or
1774 * blk_queue_make_request(). It should be called when a request queue is 1774 * blk_queue_make_request(). It should be called when a request queue is
1775 * being released; typically when a block device is being de-registered. 1775 * being released; typically when a block device is being de-registered.
1776 * Currently, its primary task is to free all the &struct request 1776 * Currently, its primary task is to free all the &struct request
1777 * structures that were allocated to the queue and the queue itself. 1777 * structures that were allocated to the queue and the queue itself.
1778 * 1778 *
1779 * Caveat: 1779 * Caveat:
1780 * Hopefully the low level driver will have finished any 1780 * Hopefully the low level driver will have finished any
1781 * outstanding requests first... 1781 * outstanding requests first...
1782 **/ 1782 **/
1783 static void blk_release_queue(struct kobject *kobj) 1783 static void blk_release_queue(struct kobject *kobj)
1784 { 1784 {
1785 struct request_queue *q = 1785 struct request_queue *q =
1786 container_of(kobj, struct request_queue, kobj); 1786 container_of(kobj, struct request_queue, kobj);
1787 struct request_list *rl = &q->rq; 1787 struct request_list *rl = &q->rq;
1788 1788
1789 blk_sync_queue(q); 1789 blk_sync_queue(q);
1790 1790
1791 if (rl->rq_pool) 1791 if (rl->rq_pool)
1792 mempool_destroy(rl->rq_pool); 1792 mempool_destroy(rl->rq_pool);
1793 1793
1794 if (q->queue_tags) 1794 if (q->queue_tags)
1795 __blk_queue_free_tags(q); 1795 __blk_queue_free_tags(q);
1796 1796
1797 blk_trace_shutdown(q); 1797 blk_trace_shutdown(q);
1798 1798
1799 kmem_cache_free(requestq_cachep, q); 1799 kmem_cache_free(requestq_cachep, q);
1800 } 1800 }
1801 1801
1802 void blk_put_queue(struct request_queue *q) 1802 void blk_put_queue(struct request_queue *q)
1803 { 1803 {
1804 kobject_put(&q->kobj); 1804 kobject_put(&q->kobj);
1805 } 1805 }
1806 EXPORT_SYMBOL(blk_put_queue); 1806 EXPORT_SYMBOL(blk_put_queue);
1807 1807
1808 void blk_cleanup_queue(struct request_queue * q) 1808 void blk_cleanup_queue(struct request_queue * q)
1809 { 1809 {
1810 mutex_lock(&q->sysfs_lock); 1810 mutex_lock(&q->sysfs_lock);
1811 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); 1811 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1812 mutex_unlock(&q->sysfs_lock); 1812 mutex_unlock(&q->sysfs_lock);
1813 1813
1814 if (q->elevator) 1814 if (q->elevator)
1815 elevator_exit(q->elevator); 1815 elevator_exit(q->elevator);
1816 1816
1817 blk_put_queue(q); 1817 blk_put_queue(q);
1818 } 1818 }
1819 1819
1820 EXPORT_SYMBOL(blk_cleanup_queue); 1820 EXPORT_SYMBOL(blk_cleanup_queue);
1821 1821
1822 static int blk_init_free_list(struct request_queue *q) 1822 static int blk_init_free_list(struct request_queue *q)
1823 { 1823 {
1824 struct request_list *rl = &q->rq; 1824 struct request_list *rl = &q->rq;
1825 1825
1826 rl->count[READ] = rl->count[WRITE] = 0; 1826 rl->count[READ] = rl->count[WRITE] = 0;
1827 rl->starved[READ] = rl->starved[WRITE] = 0; 1827 rl->starved[READ] = rl->starved[WRITE] = 0;
1828 rl->elvpriv = 0; 1828 rl->elvpriv = 0;
1829 init_waitqueue_head(&rl->wait[READ]); 1829 init_waitqueue_head(&rl->wait[READ]);
1830 init_waitqueue_head(&rl->wait[WRITE]); 1830 init_waitqueue_head(&rl->wait[WRITE]);
1831 1831
1832 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1832 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1833 mempool_free_slab, request_cachep, q->node); 1833 mempool_free_slab, request_cachep, q->node);
1834 1834
1835 if (!rl->rq_pool) 1835 if (!rl->rq_pool)
1836 return -ENOMEM; 1836 return -ENOMEM;
1837 1837
1838 return 0; 1838 return 0;
1839 } 1839 }
1840 1840
1841 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 1841 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
1842 { 1842 {
1843 return blk_alloc_queue_node(gfp_mask, -1); 1843 return blk_alloc_queue_node(gfp_mask, -1);
1844 } 1844 }
1845 EXPORT_SYMBOL(blk_alloc_queue); 1845 EXPORT_SYMBOL(blk_alloc_queue);
1846 1846
1847 static struct kobj_type queue_ktype; 1847 static struct kobj_type queue_ktype;
1848 1848
1849 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1849 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1850 { 1850 {
1851 struct request_queue *q; 1851 struct request_queue *q;
1852 1852
1853 q = kmem_cache_alloc_node(requestq_cachep, 1853 q = kmem_cache_alloc_node(requestq_cachep,
1854 gfp_mask | __GFP_ZERO, node_id); 1854 gfp_mask | __GFP_ZERO, node_id);
1855 if (!q) 1855 if (!q)
1856 return NULL; 1856 return NULL;
1857 1857
1858 init_timer(&q->unplug_timer); 1858 init_timer(&q->unplug_timer);
1859 1859
1860 kobject_set_name(&q->kobj, "%s", "queue"); 1860 kobject_set_name(&q->kobj, "%s", "queue");
1861 q->kobj.ktype = &queue_ktype; 1861 q->kobj.ktype = &queue_ktype;
1862 kobject_init(&q->kobj); 1862 kobject_init(&q->kobj);
1863 1863
1864 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1864 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1865 q->backing_dev_info.unplug_io_data = q; 1865 q->backing_dev_info.unplug_io_data = q;
1866 1866
1867 mutex_init(&q->sysfs_lock); 1867 mutex_init(&q->sysfs_lock);
1868 1868
1869 return q; 1869 return q;
1870 } 1870 }
1871 EXPORT_SYMBOL(blk_alloc_queue_node); 1871 EXPORT_SYMBOL(blk_alloc_queue_node);
1872 1872
1873 /** 1873 /**
1874 * blk_init_queue - prepare a request queue for use with a block device 1874 * blk_init_queue - prepare a request queue for use with a block device
1875 * @rfn: The function to be called to process requests that have been 1875 * @rfn: The function to be called to process requests that have been
1876 * placed on the queue. 1876 * placed on the queue.
1877 * @lock: Request queue spin lock 1877 * @lock: Request queue spin lock
1878 * 1878 *
1879 * Description: 1879 * Description:
1880 * If a block device wishes to use the standard request handling procedures, 1880 * If a block device wishes to use the standard request handling procedures,
1881 * which sorts requests and coalesces adjacent requests, then it must 1881 * which sorts requests and coalesces adjacent requests, then it must
1882 * call blk_init_queue(). The function @rfn will be called when there 1882 * call blk_init_queue(). The function @rfn will be called when there
1883 * are requests on the queue that need to be processed. If the device 1883 * are requests on the queue that need to be processed. If the device
1884 * supports plugging, then @rfn may not be called immediately when requests 1884 * supports plugging, then @rfn may not be called immediately when requests
1885 * are available on the queue, but may be called at some time later instead. 1885 * are available on the queue, but may be called at some time later instead.
1886 * Plugged queues are generally unplugged when a buffer belonging to one 1886 * Plugged queues are generally unplugged when a buffer belonging to one
1887 * of the requests on the queue is needed, or due to memory pressure. 1887 * of the requests on the queue is needed, or due to memory pressure.
1888 * 1888 *
1889 * @rfn is not required, or even expected, to remove all requests off the 1889 * @rfn is not required, or even expected, to remove all requests off the
1890 * queue, but only as many as it can handle at a time. If it does leave 1890 * queue, but only as many as it can handle at a time. If it does leave
1891 * requests on the queue, it is responsible for arranging that the requests 1891 * requests on the queue, it is responsible for arranging that the requests
1892 * get dealt with eventually. 1892 * get dealt with eventually.
1893 * 1893 *
1894 * The queue spin lock must be held while manipulating the requests on the 1894 * The queue spin lock must be held while manipulating the requests on the
1895 * request queue; this lock will be taken also from interrupt context, so irq 1895 * request queue; this lock will be taken also from interrupt context, so irq
1896 * disabling is needed for it. 1896 * disabling is needed for it.
1897 * 1897 *
1898 * Function returns a pointer to the initialized request queue, or NULL if 1898 * Function returns a pointer to the initialized request queue, or NULL if
1899 * it didn't succeed. 1899 * it didn't succeed.
1900 * 1900 *
1901 * Note: 1901 * Note:
1902 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1902 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1903 * when the block device is deactivated (such as at module unload). 1903 * when the block device is deactivated (such as at module unload).
1904 **/ 1904 **/
1905 1905
1906 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1906 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1907 { 1907 {
1908 return blk_init_queue_node(rfn, lock, -1); 1908 return blk_init_queue_node(rfn, lock, -1);
1909 } 1909 }
1910 EXPORT_SYMBOL(blk_init_queue); 1910 EXPORT_SYMBOL(blk_init_queue);
1911 1911
1912 struct request_queue * 1912 struct request_queue *
1913 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1913 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1914 { 1914 {
1915 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1915 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1916 1916
1917 if (!q) 1917 if (!q)
1918 return NULL; 1918 return NULL;
1919 1919
1920 q->node = node_id; 1920 q->node = node_id;
1921 if (blk_init_free_list(q)) { 1921 if (blk_init_free_list(q)) {
1922 kmem_cache_free(requestq_cachep, q); 1922 kmem_cache_free(requestq_cachep, q);
1923 return NULL; 1923 return NULL;
1924 } 1924 }
1925 1925
1926 /* 1926 /*
1927 * if caller didn't supply a lock, they get per-queue locking with 1927 * if caller didn't supply a lock, they get per-queue locking with
1928 * our embedded lock 1928 * our embedded lock
1929 */ 1929 */
1930 if (!lock) { 1930 if (!lock) {
1931 spin_lock_init(&q->__queue_lock); 1931 spin_lock_init(&q->__queue_lock);
1932 lock = &q->__queue_lock; 1932 lock = &q->__queue_lock;
1933 } 1933 }
1934 1934
1935 q->request_fn = rfn; 1935 q->request_fn = rfn;
1936 q->prep_rq_fn = NULL; 1936 q->prep_rq_fn = NULL;
1937 q->unplug_fn = generic_unplug_device; 1937 q->unplug_fn = generic_unplug_device;
1938 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1938 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1939 q->queue_lock = lock; 1939 q->queue_lock = lock;
1940 1940
1941 blk_queue_segment_boundary(q, 0xffffffff); 1941 blk_queue_segment_boundary(q, 0xffffffff);
1942 1942
1943 blk_queue_make_request(q, __make_request); 1943 blk_queue_make_request(q, __make_request);
1944 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1944 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1945 1945
1946 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1946 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1947 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1947 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1948 1948
1949 q->sg_reserved_size = INT_MAX; 1949 q->sg_reserved_size = INT_MAX;
1950 1950
1951 /* 1951 /*
1952 * all done 1952 * all done
1953 */ 1953 */
1954 if (!elevator_init(q, NULL)) { 1954 if (!elevator_init(q, NULL)) {
1955 blk_queue_congestion_threshold(q); 1955 blk_queue_congestion_threshold(q);
1956 return q; 1956 return q;
1957 } 1957 }
1958 1958
1959 blk_put_queue(q); 1959 blk_put_queue(q);
1960 return NULL; 1960 return NULL;
1961 } 1961 }
1962 EXPORT_SYMBOL(blk_init_queue_node); 1962 EXPORT_SYMBOL(blk_init_queue_node);
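A rough illustration of the pairing described in the kernel-doc above: a driver using the standard request handling sets up its queue with blk_init_queue() and tears it down with blk_cleanup_queue(). The mydev_* names are hypothetical, not part of this patch; the request handler itself is sketched further down, next to blk_requeue_request().

/*
 * Minimal sketch, assuming a driver-supplied request handler and lock.
 */
static void mydev_request_fn(struct request_queue *q);	/* defined by the driver */
static DEFINE_SPINLOCK(mydev_lock);

static struct request_queue *mydev_setup_queue(void)
{
	struct request_queue *q;

	q = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!q)
		return NULL;		/* allocation or elevator init failed */

	/* paired with blk_cleanup_queue() when the device goes away */
	return q;
}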
1963 1963
1964 int blk_get_queue(struct request_queue *q) 1964 int blk_get_queue(struct request_queue *q)
1965 { 1965 {
1966 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1966 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1967 kobject_get(&q->kobj); 1967 kobject_get(&q->kobj);
1968 return 0; 1968 return 0;
1969 } 1969 }
1970 1970
1971 return 1; 1971 return 1;
1972 } 1972 }
1973 1973
1974 EXPORT_SYMBOL(blk_get_queue); 1974 EXPORT_SYMBOL(blk_get_queue);
1975 1975
1976 static inline void blk_free_request(struct request_queue *q, struct request *rq) 1976 static inline void blk_free_request(struct request_queue *q, struct request *rq)
1977 { 1977 {
1978 if (rq->cmd_flags & REQ_ELVPRIV) 1978 if (rq->cmd_flags & REQ_ELVPRIV)
1979 elv_put_request(q, rq); 1979 elv_put_request(q, rq);
1980 mempool_free(rq, q->rq.rq_pool); 1980 mempool_free(rq, q->rq.rq_pool);
1981 } 1981 }
1982 1982
1983 static struct request * 1983 static struct request *
1984 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 1984 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
1985 { 1985 {
1986 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1986 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1987 1987
1988 if (!rq) 1988 if (!rq)
1989 return NULL; 1989 return NULL;
1990 1990
1991 /* 1991 /*
1992 * first three bits are identical in rq->cmd_flags and bio->bi_rw, 1992 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
1993 * see bio.h and blkdev.h 1993 * see bio.h and blkdev.h
1994 */ 1994 */
1995 rq->cmd_flags = rw | REQ_ALLOCED; 1995 rq->cmd_flags = rw | REQ_ALLOCED;
1996 1996
1997 if (priv) { 1997 if (priv) {
1998 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 1998 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
1999 mempool_free(rq, q->rq.rq_pool); 1999 mempool_free(rq, q->rq.rq_pool);
2000 return NULL; 2000 return NULL;
2001 } 2001 }
2002 rq->cmd_flags |= REQ_ELVPRIV; 2002 rq->cmd_flags |= REQ_ELVPRIV;
2003 } 2003 }
2004 2004
2005 return rq; 2005 return rq;
2006 } 2006 }
2007 2007
2008 /* 2008 /*
2009 * ioc_batching returns true if the ioc is a valid batching request and 2009 * ioc_batching returns true if the ioc is a valid batching request and
2010 * should be given priority access to a request. 2010 * should be given priority access to a request.
2011 */ 2011 */
2012 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 2012 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
2013 { 2013 {
2014 if (!ioc) 2014 if (!ioc)
2015 return 0; 2015 return 0;
2016 2016
2017 /* 2017 /*
2018 * Make sure the process is able to allocate at least 1 request 2018 * Make sure the process is able to allocate at least 1 request
2019 * even if the batch times out, otherwise we could theoretically 2019 * even if the batch times out, otherwise we could theoretically
2020 * lose wakeups. 2020 * lose wakeups.
2021 */ 2021 */
2022 return ioc->nr_batch_requests == q->nr_batching || 2022 return ioc->nr_batch_requests == q->nr_batching ||
2023 (ioc->nr_batch_requests > 0 2023 (ioc->nr_batch_requests > 0
2024 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 2024 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
2025 } 2025 }
2026 2026
2027 /* 2027 /*
2028 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 2028 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
2029 * will cause the process to be a "batcher" on all queues in the system. This 2029 * will cause the process to be a "batcher" on all queues in the system. This
2030 * is the behaviour we want though - once it gets a wakeup it should be given 2030 * is the behaviour we want though - once it gets a wakeup it should be given
2031 * a nice run. 2031 * a nice run.
2032 */ 2032 */
2033 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 2033 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
2034 { 2034 {
2035 if (!ioc || ioc_batching(q, ioc)) 2035 if (!ioc || ioc_batching(q, ioc))
2036 return; 2036 return;
2037 2037
2038 ioc->nr_batch_requests = q->nr_batching; 2038 ioc->nr_batch_requests = q->nr_batching;
2039 ioc->last_waited = jiffies; 2039 ioc->last_waited = jiffies;
2040 } 2040 }
2041 2041
2042 static void __freed_request(struct request_queue *q, int rw) 2042 static void __freed_request(struct request_queue *q, int rw)
2043 { 2043 {
2044 struct request_list *rl = &q->rq; 2044 struct request_list *rl = &q->rq;
2045 2045
2046 if (rl->count[rw] < queue_congestion_off_threshold(q)) 2046 if (rl->count[rw] < queue_congestion_off_threshold(q))
2047 blk_clear_queue_congested(q, rw); 2047 blk_clear_queue_congested(q, rw);
2048 2048
2049 if (rl->count[rw] + 1 <= q->nr_requests) { 2049 if (rl->count[rw] + 1 <= q->nr_requests) {
2050 if (waitqueue_active(&rl->wait[rw])) 2050 if (waitqueue_active(&rl->wait[rw]))
2051 wake_up(&rl->wait[rw]); 2051 wake_up(&rl->wait[rw]);
2052 2052
2053 blk_clear_queue_full(q, rw); 2053 blk_clear_queue_full(q, rw);
2054 } 2054 }
2055 } 2055 }
2056 2056
2057 /* 2057 /*
2058 * A request has just been released. Account for it, update the full and 2058 * A request has just been released. Account for it, update the full and
2059 * congestion status, wake up any waiters. Called under q->queue_lock. 2059 * congestion status, wake up any waiters. Called under q->queue_lock.
2060 */ 2060 */
2061 static void freed_request(struct request_queue *q, int rw, int priv) 2061 static void freed_request(struct request_queue *q, int rw, int priv)
2062 { 2062 {
2063 struct request_list *rl = &q->rq; 2063 struct request_list *rl = &q->rq;
2064 2064
2065 rl->count[rw]--; 2065 rl->count[rw]--;
2066 if (priv) 2066 if (priv)
2067 rl->elvpriv--; 2067 rl->elvpriv--;
2068 2068
2069 __freed_request(q, rw); 2069 __freed_request(q, rw);
2070 2070
2071 if (unlikely(rl->starved[rw ^ 1])) 2071 if (unlikely(rl->starved[rw ^ 1]))
2072 __freed_request(q, rw ^ 1); 2072 __freed_request(q, rw ^ 1);
2073 } 2073 }
2074 2074
2075 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 2075 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
2076 /* 2076 /*
2077 * Get a free request, queue_lock must be held. 2077 * Get a free request, queue_lock must be held.
2078 * Returns NULL on failure, with queue_lock held. 2078 * Returns NULL on failure, with queue_lock held.
2079 * Returns !NULL on success, with queue_lock *not held*. 2079 * Returns !NULL on success, with queue_lock *not held*.
2080 */ 2080 */
2081 static struct request *get_request(struct request_queue *q, int rw_flags, 2081 static struct request *get_request(struct request_queue *q, int rw_flags,
2082 struct bio *bio, gfp_t gfp_mask) 2082 struct bio *bio, gfp_t gfp_mask)
2083 { 2083 {
2084 struct request *rq = NULL; 2084 struct request *rq = NULL;
2085 struct request_list *rl = &q->rq; 2085 struct request_list *rl = &q->rq;
2086 struct io_context *ioc = NULL; 2086 struct io_context *ioc = NULL;
2087 const int rw = rw_flags & 0x01; 2087 const int rw = rw_flags & 0x01;
2088 int may_queue, priv; 2088 int may_queue, priv;
2089 2089
2090 may_queue = elv_may_queue(q, rw_flags); 2090 may_queue = elv_may_queue(q, rw_flags);
2091 if (may_queue == ELV_MQUEUE_NO) 2091 if (may_queue == ELV_MQUEUE_NO)
2092 goto rq_starved; 2092 goto rq_starved;
2093 2093
2094 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 2094 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
2095 if (rl->count[rw]+1 >= q->nr_requests) { 2095 if (rl->count[rw]+1 >= q->nr_requests) {
2096 ioc = current_io_context(GFP_ATOMIC, q->node); 2096 ioc = current_io_context(GFP_ATOMIC, q->node);
2097 /* 2097 /*
2098 * The queue will fill after this allocation, so set 2098 * The queue will fill after this allocation, so set
2099 * it as full, and mark this process as "batching". 2099 * it as full, and mark this process as "batching".
2100 * This process will be allowed to complete a batch of 2100 * This process will be allowed to complete a batch of
2101 * requests, others will be blocked. 2101 * requests, others will be blocked.
2102 */ 2102 */
2103 if (!blk_queue_full(q, rw)) { 2103 if (!blk_queue_full(q, rw)) {
2104 ioc_set_batching(q, ioc); 2104 ioc_set_batching(q, ioc);
2105 blk_set_queue_full(q, rw); 2105 blk_set_queue_full(q, rw);
2106 } else { 2106 } else {
2107 if (may_queue != ELV_MQUEUE_MUST 2107 if (may_queue != ELV_MQUEUE_MUST
2108 && !ioc_batching(q, ioc)) { 2108 && !ioc_batching(q, ioc)) {
2109 /* 2109 /*
2110 * The queue is full and the allocating 2110 * The queue is full and the allocating
2111 * process is not a "batcher", and not 2111 * process is not a "batcher", and not
2112 * exempted by the IO scheduler 2112 * exempted by the IO scheduler
2113 */ 2113 */
2114 goto out; 2114 goto out;
2115 } 2115 }
2116 } 2116 }
2117 } 2117 }
2118 blk_set_queue_congested(q, rw); 2118 blk_set_queue_congested(q, rw);
2119 } 2119 }
2120 2120
2121 /* 2121 /*
2122 * Only allow batching queuers to allocate up to 50% over the defined 2122 * Only allow batching queuers to allocate up to 50% over the defined
2123 * limit of requests, otherwise we could have thousands of requests 2123 * limit of requests, otherwise we could have thousands of requests
2124 * allocated with any setting of ->nr_requests 2124 * allocated with any setting of ->nr_requests
2125 */ 2125 */
2126 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 2126 if (rl->count[rw] >= (3 * q->nr_requests / 2))
2127 goto out; 2127 goto out;
2128 2128
2129 rl->count[rw]++; 2129 rl->count[rw]++;
2130 rl->starved[rw] = 0; 2130 rl->starved[rw] = 0;
2131 2131
2132 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 2132 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
2133 if (priv) 2133 if (priv)
2134 rl->elvpriv++; 2134 rl->elvpriv++;
2135 2135
2136 spin_unlock_irq(q->queue_lock); 2136 spin_unlock_irq(q->queue_lock);
2137 2137
2138 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 2138 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
2139 if (unlikely(!rq)) { 2139 if (unlikely(!rq)) {
2140 /* 2140 /*
2141 * Allocation failed presumably due to memory. Undo anything 2141 * Allocation failed presumably due to memory. Undo anything
2142 * we might have messed up. 2142 * we might have messed up.
2143 * 2143 *
2144 * Allocating task should really be put onto the front of the 2144 * Allocating task should really be put onto the front of the
2145 * wait queue, but this is pretty rare. 2145 * wait queue, but this is pretty rare.
2146 */ 2146 */
2147 spin_lock_irq(q->queue_lock); 2147 spin_lock_irq(q->queue_lock);
2148 freed_request(q, rw, priv); 2148 freed_request(q, rw, priv);
2149 2149
2150 /* 2150 /*
2151 * in the very unlikely event that allocation failed and no 2151 * in the very unlikely event that allocation failed and no
2152 * requests for this direction were pending, mark us starved 2152 * requests for this direction were pending, mark us starved
2153 * so that freeing of a request in the other direction will 2153 * so that freeing of a request in the other direction will
2154 * notice us. Another possible fix would be to split the 2154 * notice us. Another possible fix would be to split the
2155 * rq mempool into READ and WRITE 2155 * rq mempool into READ and WRITE
2156 */ 2156 */
2157 rq_starved: 2157 rq_starved:
2158 if (unlikely(rl->count[rw] == 0)) 2158 if (unlikely(rl->count[rw] == 0))
2159 rl->starved[rw] = 1; 2159 rl->starved[rw] = 1;
2160 2160
2161 goto out; 2161 goto out;
2162 } 2162 }
2163 2163
2164 /* 2164 /*
2165 * ioc may be NULL here, and ioc_batching will be false. That's 2165 * ioc may be NULL here, and ioc_batching will be false. That's
2166 * OK, if the queue is under the request limit then requests need 2166 * OK, if the queue is under the request limit then requests need
2167 * not count toward the nr_batch_requests limit. There will always 2167 * not count toward the nr_batch_requests limit. There will always
2168 * be some limit enforced by BLK_BATCH_TIME. 2168 * be some limit enforced by BLK_BATCH_TIME.
2169 */ 2169 */
2170 if (ioc_batching(q, ioc)) 2170 if (ioc_batching(q, ioc))
2171 ioc->nr_batch_requests--; 2171 ioc->nr_batch_requests--;
2172 2172
2173 rq_init(q, rq); 2173 rq_init(q, rq);
2174 2174
2175 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); 2175 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2176 out: 2176 out:
2177 return rq; 2177 return rq;
2178 } 2178 }
2179 2179
2180 /* 2180 /*
2181 * No available requests for this queue, unplug the device and wait for some 2181 * No available requests for this queue, unplug the device and wait for some
2182 * requests to become available. 2182 * requests to become available.
2183 * 2183 *
2184 * Called with q->queue_lock held, and returns with it unlocked. 2184 * Called with q->queue_lock held, and returns with it unlocked.
2185 */ 2185 */
2186 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 2186 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
2187 struct bio *bio) 2187 struct bio *bio)
2188 { 2188 {
2189 const int rw = rw_flags & 0x01; 2189 const int rw = rw_flags & 0x01;
2190 struct request *rq; 2190 struct request *rq;
2191 2191
2192 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2192 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2193 while (!rq) { 2193 while (!rq) {
2194 DEFINE_WAIT(wait); 2194 DEFINE_WAIT(wait);
2195 struct request_list *rl = &q->rq; 2195 struct request_list *rl = &q->rq;
2196 2196
2197 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 2197 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2198 TASK_UNINTERRUPTIBLE); 2198 TASK_UNINTERRUPTIBLE);
2199 2199
2200 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2200 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2201 2201
2202 if (!rq) { 2202 if (!rq) {
2203 struct io_context *ioc; 2203 struct io_context *ioc;
2204 2204
2205 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); 2205 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2206 2206
2207 __generic_unplug_device(q); 2207 __generic_unplug_device(q);
2208 spin_unlock_irq(q->queue_lock); 2208 spin_unlock_irq(q->queue_lock);
2209 io_schedule(); 2209 io_schedule();
2210 2210
2211 /* 2211 /*
2212 * After sleeping, we become a "batching" process and 2212 * After sleeping, we become a "batching" process and
2213 * will be able to allocate at least one request, and 2213 * will be able to allocate at least one request, and
2214 * up to a big batch of them for a small period of time. 2214 * up to a big batch of them for a small period of time.
2215 * See ioc_batching, ioc_set_batching 2215 * See ioc_batching, ioc_set_batching
2216 */ 2216 */
2217 ioc = current_io_context(GFP_NOIO, q->node); 2217 ioc = current_io_context(GFP_NOIO, q->node);
2218 ioc_set_batching(q, ioc); 2218 ioc_set_batching(q, ioc);
2219 2219
2220 spin_lock_irq(q->queue_lock); 2220 spin_lock_irq(q->queue_lock);
2221 } 2221 }
2222 finish_wait(&rl->wait[rw], &wait); 2222 finish_wait(&rl->wait[rw], &wait);
2223 } 2223 }
2224 2224
2225 return rq; 2225 return rq;
2226 } 2226 }
2227 2227
2228 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 2228 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
2229 { 2229 {
2230 struct request *rq; 2230 struct request *rq;
2231 2231
2232 BUG_ON(rw != READ && rw != WRITE); 2232 BUG_ON(rw != READ && rw != WRITE);
2233 2233
2234 spin_lock_irq(q->queue_lock); 2234 spin_lock_irq(q->queue_lock);
2235 if (gfp_mask & __GFP_WAIT) { 2235 if (gfp_mask & __GFP_WAIT) {
2236 rq = get_request_wait(q, rw, NULL); 2236 rq = get_request_wait(q, rw, NULL);
2237 } else { 2237 } else {
2238 rq = get_request(q, rw, NULL, gfp_mask); 2238 rq = get_request(q, rw, NULL, gfp_mask);
2239 if (!rq) 2239 if (!rq)
2240 spin_unlock_irq(q->queue_lock); 2240 spin_unlock_irq(q->queue_lock);
2241 } 2241 }
2242 /* q->queue_lock is unlocked at this point */ 2242 /* q->queue_lock is unlocked at this point */
2243 2243
2244 return rq; 2244 return rq;
2245 } 2245 }
2246 EXPORT_SYMBOL(blk_get_request); 2246 EXPORT_SYMBOL(blk_get_request);
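A minimal sketch of the allocate/release pairing: with a __GFP_WAIT mask the call goes through get_request_wait() above and sleeps until a request is available, otherwise it may return NULL. mydev_alloc_cmd() is a hypothetical helper, not part of this patch.

static int mydev_alloc_cmd(struct request_queue *q)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/* ... fill in the request and hand it to blk_execute_rq() ... */

	blk_put_request(rq);
	return 0;
}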
2247 2247
2248 /** 2248 /**
2249 * blk_start_queueing - initiate dispatch of requests to device 2249 * blk_start_queueing - initiate dispatch of requests to device
2250 * @q: request queue to kick into gear 2250 * @q: request queue to kick into gear
2251 * 2251 *
2252 * This is basically a helper to remove the need to know whether a queue 2252 * This is basically a helper to remove the need to know whether a queue
2253 * is plugged or not if someone just wants to initiate dispatch of requests 2253 * is plugged or not if someone just wants to initiate dispatch of requests
2254 * for this queue. 2254 * for this queue.
2255 * 2255 *
2256 * The queue lock must be held with interrupts disabled. 2256 * The queue lock must be held with interrupts disabled.
2257 */ 2257 */
2258 void blk_start_queueing(struct request_queue *q) 2258 void blk_start_queueing(struct request_queue *q)
2259 { 2259 {
2260 if (!blk_queue_plugged(q)) 2260 if (!blk_queue_plugged(q))
2261 q->request_fn(q); 2261 q->request_fn(q);
2262 else 2262 else
2263 __generic_unplug_device(q); 2263 __generic_unplug_device(q);
2264 } 2264 }
2265 EXPORT_SYMBOL(blk_start_queueing); 2265 EXPORT_SYMBOL(blk_start_queueing);
2266 2266
2267 /** 2267 /**
2268 * blk_requeue_request - put a request back on queue 2268 * blk_requeue_request - put a request back on queue
2269 * @q: request queue where request should be inserted 2269 * @q: request queue where request should be inserted
2270 * @rq: request to be inserted 2270 * @rq: request to be inserted
2271 * 2271 *
2272 * Description: 2272 * Description:
2273 * Drivers often keep queueing requests until the hardware cannot accept 2273 * Drivers often keep queueing requests until the hardware cannot accept
2274 * more, when that condition happens we need to put the request back 2274 * more, when that condition happens we need to put the request back
2275 * on the queue. Must be called with queue lock held. 2275 * on the queue. Must be called with queue lock held.
2276 */ 2276 */
2277 void blk_requeue_request(struct request_queue *q, struct request *rq) 2277 void blk_requeue_request(struct request_queue *q, struct request *rq)
2278 { 2278 {
2279 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 2279 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2280 2280
2281 if (blk_rq_tagged(rq)) 2281 if (blk_rq_tagged(rq))
2282 blk_queue_end_tag(q, rq); 2282 blk_queue_end_tag(q, rq);
2283 2283
2284 elv_requeue_request(q, rq); 2284 elv_requeue_request(q, rq);
2285 } 2285 }
2286 2286
2287 EXPORT_SYMBOL(blk_requeue_request); 2287 EXPORT_SYMBOL(blk_requeue_request);
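The situation the kernel-doc describes typically looks like the request_fn fragment below. This is a sketch only: elv_next_request() and blkdev_dequeue_request() are the existing dequeue helpers, while mydev_send_to_hw() is a placeholder. request_fn is entered with the queue lock held, which is also what blk_requeue_request() expects.

static void mydev_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);
		if (mydev_send_to_hw(rq) == -EBUSY) {
			/* hardware can't take more right now: put it back */
			blk_requeue_request(q, rq);
			break;
		}
	}
}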
2288 2288
2289 /** 2289 /**
2290 * blk_insert_request - insert a special request in to a request queue 2290 * blk_insert_request - insert a special request in to a request queue
2291 * @q: request queue where request should be inserted 2291 * @q: request queue where request should be inserted
2292 * @rq: request to be inserted 2292 * @rq: request to be inserted
2293 * @at_head: insert request at head or tail of queue 2293 * @at_head: insert request at head or tail of queue
2294 * @data: private data 2294 * @data: private data
2295 * 2295 *
2296 * Description: 2296 * Description:
2297 * Many block devices need to execute commands asynchronously, so they don't 2297 * Many block devices need to execute commands asynchronously, so they don't
2298 * block the whole kernel from preemption during request execution. This is 2298 * block the whole kernel from preemption during request execution. This is
2299 * accomplished normally by inserting artificial requests tagged as 2299 * accomplished normally by inserting artificial requests tagged as
2300 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2300 * REQ_SPECIAL in to the corresponding request queue, and letting them be
2301 * scheduled for actual execution by the request queue. 2301 * scheduled for actual execution by the request queue.
2302 * 2302 *
2303 * We have the option of inserting at the head or the tail of the queue. 2303 * We have the option of inserting at the head or the tail of the queue.
2304 * Typically we use the tail for new ioctls and so forth. We use the head 2304 * Typically we use the tail for new ioctls and so forth. We use the head
2305 * of the queue for things like a QUEUE_FULL message from a device, or a 2305 * of the queue for things like a QUEUE_FULL message from a device, or a
2306 * host that is unable to accept a particular command. 2306 * host that is unable to accept a particular command.
2307 */ 2307 */
2308 void blk_insert_request(struct request_queue *q, struct request *rq, 2308 void blk_insert_request(struct request_queue *q, struct request *rq,
2309 int at_head, void *data) 2309 int at_head, void *data)
2310 { 2310 {
2311 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2311 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2312 unsigned long flags; 2312 unsigned long flags;
2313 2313
2314 /* 2314 /*
2315 * tell I/O scheduler that this isn't a regular read/write (ie it 2315 * tell I/O scheduler that this isn't a regular read/write (ie it
2316 * must not attempt merges on this) and that it acts as a soft 2316 * must not attempt merges on this) and that it acts as a soft
2317 * barrier 2317 * barrier
2318 */ 2318 */
2319 rq->cmd_type = REQ_TYPE_SPECIAL; 2319 rq->cmd_type = REQ_TYPE_SPECIAL;
2320 rq->cmd_flags |= REQ_SOFTBARRIER; 2320 rq->cmd_flags |= REQ_SOFTBARRIER;
2321 2321
2322 rq->special = data; 2322 rq->special = data;
2323 2323
2324 spin_lock_irqsave(q->queue_lock, flags); 2324 spin_lock_irqsave(q->queue_lock, flags);
2325 2325
2326 /* 2326 /*
2327 * If command is tagged, release the tag 2327 * If command is tagged, release the tag
2328 */ 2328 */
2329 if (blk_rq_tagged(rq)) 2329 if (blk_rq_tagged(rq))
2330 blk_queue_end_tag(q, rq); 2330 blk_queue_end_tag(q, rq);
2331 2331
2332 drive_stat_acct(rq, rq->nr_sectors, 1); 2332 drive_stat_acct(rq, rq->nr_sectors, 1);
2333 __elv_add_request(q, rq, where, 0); 2333 __elv_add_request(q, rq, where, 0);
2334 blk_start_queueing(q); 2334 blk_start_queueing(q);
2335 spin_unlock_irqrestore(q->queue_lock, flags); 2335 spin_unlock_irqrestore(q->queue_lock, flags);
2336 } 2336 }
2337 2337
2338 EXPORT_SYMBOL(blk_insert_request); 2338 EXPORT_SYMBOL(blk_insert_request);
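Sketch of the usage pattern described above, with cmd standing in for whatever per-command state the driver wants handed back via rq->special; the helper name is hypothetical.

static int mydev_issue_private_cmd(struct request_queue *q, void *cmd)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	/* head insertion, e.g. to clear a QUEUE_FULL condition quickly */
	blk_insert_request(q, rq, 1, cmd);
	return 0;
}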
2339 2339
2340 static int __blk_rq_unmap_user(struct bio *bio) 2340 static int __blk_rq_unmap_user(struct bio *bio)
2341 { 2341 {
2342 int ret = 0; 2342 int ret = 0;
2343 2343
2344 if (bio) { 2344 if (bio) {
2345 if (bio_flagged(bio, BIO_USER_MAPPED)) 2345 if (bio_flagged(bio, BIO_USER_MAPPED))
2346 bio_unmap_user(bio); 2346 bio_unmap_user(bio);
2347 else 2347 else
2348 ret = bio_uncopy_user(bio); 2348 ret = bio_uncopy_user(bio);
2349 } 2349 }
2350 2350
2351 return ret; 2351 return ret;
2352 } 2352 }
2353 2353
2354 int blk_rq_append_bio(struct request_queue *q, struct request *rq, 2354 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
2355 struct bio *bio) 2355 struct bio *bio)
2356 { 2356 {
2357 if (!rq->bio) 2357 if (!rq->bio)
2358 blk_rq_bio_prep(q, rq, bio); 2358 blk_rq_bio_prep(q, rq, bio);
2359 else if (!ll_back_merge_fn(q, rq, bio)) 2359 else if (!ll_back_merge_fn(q, rq, bio))
2360 return -EINVAL; 2360 return -EINVAL;
2361 else { 2361 else {
2362 rq->biotail->bi_next = bio; 2362 rq->biotail->bi_next = bio;
2363 rq->biotail = bio; 2363 rq->biotail = bio;
2364 2364
2365 rq->data_len += bio->bi_size; 2365 rq->data_len += bio->bi_size;
2366 } 2366 }
2367 return 0; 2367 return 0;
2368 } 2368 }
2369 EXPORT_SYMBOL(blk_rq_append_bio); 2369 EXPORT_SYMBOL(blk_rq_append_bio);
2370 2370
2371 static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 2371 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
2372 void __user *ubuf, unsigned int len) 2372 void __user *ubuf, unsigned int len)
2373 { 2373 {
2374 unsigned long uaddr; 2374 unsigned long uaddr;
2375 struct bio *bio, *orig_bio; 2375 struct bio *bio, *orig_bio;
2376 int reading, ret; 2376 int reading, ret;
2377 2377
2378 reading = rq_data_dir(rq) == READ; 2378 reading = rq_data_dir(rq) == READ;
2379 2379
2380 /* 2380 /*
2381 * if alignment requirement is satisfied, map in user pages for 2381 * if alignment requirement is satisfied, map in user pages for
2382 * direct dma. else, set up kernel bounce buffers 2382 * direct dma. else, set up kernel bounce buffers
2383 */ 2383 */
2384 uaddr = (unsigned long) ubuf; 2384 uaddr = (unsigned long) ubuf;
2385 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2385 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2386 bio = bio_map_user(q, NULL, uaddr, len, reading); 2386 bio = bio_map_user(q, NULL, uaddr, len, reading);
2387 else 2387 else
2388 bio = bio_copy_user(q, uaddr, len, reading); 2388 bio = bio_copy_user(q, uaddr, len, reading);
2389 2389
2390 if (IS_ERR(bio)) 2390 if (IS_ERR(bio))
2391 return PTR_ERR(bio); 2391 return PTR_ERR(bio);
2392 2392
2393 orig_bio = bio; 2393 orig_bio = bio;
2394 blk_queue_bounce(q, &bio); 2394 blk_queue_bounce(q, &bio);
2395 2395
2396 /* 2396 /*
2397 * We link the bounce buffer in and could have to traverse it 2397 * We link the bounce buffer in and could have to traverse it
2398 * later so we have to get a ref to prevent it from being freed 2398 * later so we have to get a ref to prevent it from being freed
2399 */ 2399 */
2400 bio_get(bio); 2400 bio_get(bio);
2401 2401
2402 ret = blk_rq_append_bio(q, rq, bio); 2402 ret = blk_rq_append_bio(q, rq, bio);
2403 if (!ret) 2403 if (!ret)
2404 return bio->bi_size; 2404 return bio->bi_size;
2405 2405
2406 /* if it was bounced we must call the end io function */ 2406 /* if it was bounced we must call the end io function */
2407 bio_endio(bio, 0); 2407 bio_endio(bio, 0);
2408 __blk_rq_unmap_user(orig_bio); 2408 __blk_rq_unmap_user(orig_bio);
2409 bio_put(bio); 2409 bio_put(bio);
2410 return ret; 2410 return ret;
2411 } 2411 }
2412 2412
2413 /** 2413 /**
2414 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2414 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2415 * @q: request queue where request should be inserted 2415 * @q: request queue where request should be inserted
2416 * @rq: request structure to fill 2416 * @rq: request structure to fill
2417 * @ubuf: the user buffer 2417 * @ubuf: the user buffer
2418 * @len: length of user data 2418 * @len: length of user data
2419 * 2419 *
2420 * Description: 2420 * Description:
2421 * Data will be mapped directly for zero copy io, if possible. Otherwise 2421 * Data will be mapped directly for zero copy io, if possible. Otherwise
2422 * a kernel bounce buffer is used. 2422 * a kernel bounce buffer is used.
2423 * 2423 *
2424 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2424 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2425 * still in process context. 2425 * still in process context.
2426 * 2426 *
2427 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2427 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2428 * before being submitted to the device, as pages mapped may be out of 2428 * before being submitted to the device, as pages mapped may be out of
2429 * reach. It's the caller's responsibility to make sure this happens. The 2429 * reach. It's the caller's responsibility to make sure this happens. The
2430 * original bio must be passed back in to blk_rq_unmap_user() for proper 2430 * original bio must be passed back in to blk_rq_unmap_user() for proper
2431 * unmapping. 2431 * unmapping.
2432 */ 2432 */
2433 int blk_rq_map_user(struct request_queue *q, struct request *rq, 2433 int blk_rq_map_user(struct request_queue *q, struct request *rq,
2434 void __user *ubuf, unsigned long len) 2434 void __user *ubuf, unsigned long len)
2435 { 2435 {
2436 unsigned long bytes_read = 0; 2436 unsigned long bytes_read = 0;
2437 struct bio *bio = NULL; 2437 struct bio *bio = NULL;
2438 int ret; 2438 int ret;
2439 2439
2440 if (len > (q->max_hw_sectors << 9)) 2440 if (len > (q->max_hw_sectors << 9))
2441 return -EINVAL; 2441 return -EINVAL;
2442 if (!len || !ubuf) 2442 if (!len || !ubuf)
2443 return -EINVAL; 2443 return -EINVAL;
2444 2444
2445 while (bytes_read != len) { 2445 while (bytes_read != len) {
2446 unsigned long map_len, end, start; 2446 unsigned long map_len, end, start;
2447 2447
2448 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 2448 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
2449 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 2449 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
2450 >> PAGE_SHIFT; 2450 >> PAGE_SHIFT;
2451 start = (unsigned long)ubuf >> PAGE_SHIFT; 2451 start = (unsigned long)ubuf >> PAGE_SHIFT;
2452 2452
2453 /* 2453 /*
2454 * A bad offset could cause us to require BIO_MAX_PAGES + 1 2454 * A bad offset could cause us to require BIO_MAX_PAGES + 1
2455 * pages. If this happens we just lower the requested 2455 * pages. If this happens we just lower the requested
2456 * mapping len by a page so that we can fit 2456 * mapping len by a page so that we can fit
2457 */ 2457 */
2458 if (end - start > BIO_MAX_PAGES) 2458 if (end - start > BIO_MAX_PAGES)
2459 map_len -= PAGE_SIZE; 2459 map_len -= PAGE_SIZE;
2460 2460
2461 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 2461 ret = __blk_rq_map_user(q, rq, ubuf, map_len);
2462 if (ret < 0) 2462 if (ret < 0)
2463 goto unmap_rq; 2463 goto unmap_rq;
2464 if (!bio) 2464 if (!bio)
2465 bio = rq->bio; 2465 bio = rq->bio;
2466 bytes_read += ret; 2466 bytes_read += ret;
2467 ubuf += ret; 2467 ubuf += ret;
2468 } 2468 }
2469 2469
2470 rq->buffer = rq->data = NULL; 2470 rq->buffer = rq->data = NULL;
2471 return 0; 2471 return 0;
2472 unmap_rq: 2472 unmap_rq:
2473 blk_rq_unmap_user(bio); 2473 blk_rq_unmap_user(bio);
2474 return ret; 2474 return ret;
2475 } 2475 }
2476 2476
2477 EXPORT_SYMBOL(blk_rq_map_user); 2477 EXPORT_SYMBOL(blk_rq_map_user);
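Putting the rules from the kernel-doc together, an SG_IO-style caller looks roughly like this (error handling trimmed, names hypothetical). Note that the original rq->bio is saved before execution and passed to blk_rq_unmap_user(), exactly as required above.

static int mydev_pc_io(struct request_queue *q, struct gendisk *disk,
		       void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (err)
		goto out;

	bio = rq->bio;			/* remember the original bio */
	err = blk_execute_rq(q, disk, rq, 0);
	blk_rq_unmap_user(bio);		/* unmap using the saved bio */
out:
	blk_put_request(rq);
	return err;
}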
2478 2478
2479 /** 2479 /**
2480 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2480 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2481 * @q: request queue where request should be inserted 2481 * @q: request queue where request should be inserted
2482 * @rq: request to map data to 2482 * @rq: request to map data to
2483 * @iov: pointer to the iovec 2483 * @iov: pointer to the iovec
2484 * @iov_count: number of elements in the iovec 2484 * @iov_count: number of elements in the iovec
2485 * @len: I/O byte count 2485 * @len: I/O byte count
2486 * 2486 *
2487 * Description: 2487 * Description:
2488 * Data will be mapped directly for zero copy io, if possible. Otherwise 2488 * Data will be mapped directly for zero copy io, if possible. Otherwise
2489 * a kernel bounce buffer is used. 2489 * a kernel bounce buffer is used.
2490 * 2490 *
2491 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2491 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2492 * still in process context. 2492 * still in process context.
2493 * 2493 *
2494 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2494 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2495 * before being submitted to the device, as pages mapped may be out of 2495 * before being submitted to the device, as pages mapped may be out of
2496 * reach. It's the caller's responsibility to make sure this happens. The 2496 * reach. It's the caller's responsibility to make sure this happens. The
2497 * original bio must be passed back in to blk_rq_unmap_user() for proper 2497 * original bio must be passed back in to blk_rq_unmap_user() for proper
2498 * unmapping. 2498 * unmapping.
2499 */ 2499 */
2500 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 2500 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
2501 struct sg_iovec *iov, int iov_count, unsigned int len) 2501 struct sg_iovec *iov, int iov_count, unsigned int len)
2502 { 2502 {
2503 struct bio *bio; 2503 struct bio *bio;
2504 2504
2505 if (!iov || iov_count <= 0) 2505 if (!iov || iov_count <= 0)
2506 return -EINVAL; 2506 return -EINVAL;
2507 2507
2508 /* we don't allow misaligned data like bio_map_user() does. If the 2508 /* we don't allow misaligned data like bio_map_user() does. If the
2509 * user is using sg, they're expected to know the alignment constraints 2509 * user is using sg, they're expected to know the alignment constraints
2510 * and respect them accordingly */ 2510 * and respect them accordingly */
2511 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2511 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2512 if (IS_ERR(bio)) 2512 if (IS_ERR(bio))
2513 return PTR_ERR(bio); 2513 return PTR_ERR(bio);
2514 2514
2515 if (bio->bi_size != len) { 2515 if (bio->bi_size != len) {
2516 bio_endio(bio, 0); 2516 bio_endio(bio, 0);
2517 bio_unmap_user(bio); 2517 bio_unmap_user(bio);
2518 return -EINVAL; 2518 return -EINVAL;
2519 } 2519 }
2520 2520
2521 bio_get(bio); 2521 bio_get(bio);
2522 blk_rq_bio_prep(q, rq, bio); 2522 blk_rq_bio_prep(q, rq, bio);
2523 rq->buffer = rq->data = NULL; 2523 rq->buffer = rq->data = NULL;
2524 return 0; 2524 return 0;
2525 } 2525 }
2526 2526
2527 EXPORT_SYMBOL(blk_rq_map_user_iov); 2527 EXPORT_SYMBOL(blk_rq_map_user_iov);
2528 2528
2529 /** 2529 /**
2530 * blk_rq_unmap_user - unmap a request with user data 2530 * blk_rq_unmap_user - unmap a request with user data
2531 * @bio: start of bio list 2531 * @bio: start of bio list
2532 * 2532 *
2533 * Description: 2533 * Description:
2534 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 2534 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must
2535 * supply the original rq->bio from the blk_rq_map_user() return, since 2535 * supply the original rq->bio from the blk_rq_map_user() return, since
2536 * the io completion may have changed rq->bio. 2536 * the io completion may have changed rq->bio.
2537 */ 2537 */
2538 int blk_rq_unmap_user(struct bio *bio) 2538 int blk_rq_unmap_user(struct bio *bio)
2539 { 2539 {
2540 struct bio *mapped_bio; 2540 struct bio *mapped_bio;
2541 int ret = 0, ret2; 2541 int ret = 0, ret2;
2542 2542
2543 while (bio) { 2543 while (bio) {
2544 mapped_bio = bio; 2544 mapped_bio = bio;
2545 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 2545 if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
2546 mapped_bio = bio->bi_private; 2546 mapped_bio = bio->bi_private;
2547 2547
2548 ret2 = __blk_rq_unmap_user(mapped_bio); 2548 ret2 = __blk_rq_unmap_user(mapped_bio);
2549 if (ret2 && !ret) 2549 if (ret2 && !ret)
2550 ret = ret2; 2550 ret = ret2;
2551 2551
2552 mapped_bio = bio; 2552 mapped_bio = bio;
2553 bio = bio->bi_next; 2553 bio = bio->bi_next;
2554 bio_put(mapped_bio); 2554 bio_put(mapped_bio);
2555 } 2555 }
2556 2556
2557 return ret; 2557 return ret;
2558 } 2558 }
2559 2559
2560 EXPORT_SYMBOL(blk_rq_unmap_user); 2560 EXPORT_SYMBOL(blk_rq_unmap_user);
2561 2561
2562 /** 2562 /**
2563 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2563 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2564 * @q: request queue where request should be inserted 2564 * @q: request queue where request should be inserted
2565 * @rq: request to fill 2565 * @rq: request to fill
2566 * @kbuf: the kernel buffer 2566 * @kbuf: the kernel buffer
2567 * @len: length of user data 2567 * @len: length of user data
2568 * @gfp_mask: memory allocation flags 2568 * @gfp_mask: memory allocation flags
2569 */ 2569 */
2570 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 2570 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
2571 unsigned int len, gfp_t gfp_mask) 2571 unsigned int len, gfp_t gfp_mask)
2572 { 2572 {
2573 struct bio *bio; 2573 struct bio *bio;
2574 2574
2575 if (len > (q->max_hw_sectors << 9)) 2575 if (len > (q->max_hw_sectors << 9))
2576 return -EINVAL; 2576 return -EINVAL;
2577 if (!len || !kbuf) 2577 if (!len || !kbuf)
2578 return -EINVAL; 2578 return -EINVAL;
2579 2579
2580 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2580 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2581 if (IS_ERR(bio)) 2581 if (IS_ERR(bio))
2582 return PTR_ERR(bio); 2582 return PTR_ERR(bio);
2583 2583
2584 if (rq_data_dir(rq) == WRITE) 2584 if (rq_data_dir(rq) == WRITE)
2585 bio->bi_rw |= (1 << BIO_RW); 2585 bio->bi_rw |= (1 << BIO_RW);
2586 2586
2587 blk_rq_bio_prep(q, rq, bio); 2587 blk_rq_bio_prep(q, rq, bio);
2588 blk_queue_bounce(q, &rq->bio); 2588 blk_queue_bounce(q, &rq->bio);
2589 rq->buffer = rq->data = NULL; 2589 rq->buffer = rq->data = NULL;
2590 return 0; 2590 return 0;
2591 } 2591 }
2592 2592
2593 EXPORT_SYMBOL(blk_rq_map_kern); 2593 EXPORT_SYMBOL(blk_rq_map_kern);
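For data already in kernel memory the pattern is simpler, since there is no unmap step. A sketch with a hypothetical helper name and error handling trimmed; kbuf/len come from the caller.

static int mydev_kern_cmd(struct request_queue *q, struct gendisk *disk,
			  void *kbuf, unsigned int len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	rq->cmd_type = REQ_TYPE_BLOCK_PC;

	err = blk_rq_map_kern(q, rq, kbuf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}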
2594 2594
2595 /** 2595 /**
2596 * blk_execute_rq_nowait - insert a request into queue for execution 2596 * blk_execute_rq_nowait - insert a request into queue for execution
2597 * @q: queue to insert the request in 2597 * @q: queue to insert the request in
2598 * @bd_disk: matching gendisk 2598 * @bd_disk: matching gendisk
2599 * @rq: request to insert 2599 * @rq: request to insert
2600 * @at_head: insert request at head or tail of queue 2600 * @at_head: insert request at head or tail of queue
2601 * @done: I/O completion handler 2601 * @done: I/O completion handler
2602 * 2602 *
2603 * Description: 2603 * Description:
2604 * Insert a fully prepared request at the back of the io scheduler queue 2604 * Insert a fully prepared request at the back of the io scheduler queue
2605 * for execution. Don't wait for completion. 2605 * for execution. Don't wait for completion.
2606 */ 2606 */
2607 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 2607 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
2608 struct request *rq, int at_head, 2608 struct request *rq, int at_head,
2609 rq_end_io_fn *done) 2609 rq_end_io_fn *done)
2610 { 2610 {
2611 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2611 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2612 2612
2613 rq->rq_disk = bd_disk; 2613 rq->rq_disk = bd_disk;
2614 rq->cmd_flags |= REQ_NOMERGE; 2614 rq->cmd_flags |= REQ_NOMERGE;
2615 rq->end_io = done; 2615 rq->end_io = done;
2616 WARN_ON(irqs_disabled()); 2616 WARN_ON(irqs_disabled());
2617 spin_lock_irq(q->queue_lock); 2617 spin_lock_irq(q->queue_lock);
2618 __elv_add_request(q, rq, where, 1); 2618 __elv_add_request(q, rq, where, 1);
2619 __generic_unplug_device(q); 2619 __generic_unplug_device(q);
2620 spin_unlock_irq(q->queue_lock); 2620 spin_unlock_irq(q->queue_lock);
2621 } 2621 }
2622 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2622 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
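An asynchronous caller supplies its own rq_end_io_fn; the done callback owns the final reference, mirroring what blk_end_sync_rq() below does for the synchronous case. Sketch only, with placeholder names.

static void mydev_cmd_done(struct request *rq, int error)
{
	/* inspect rq->errors or driver-private state, then drop the reference */
	__blk_put_request(rq->q, rq);
}

static void mydev_submit_async(struct request_queue *q,
			       struct gendisk *disk, struct request *rq)
{
	rq->end_io_data = NULL;		/* or driver-private completion data */
	blk_execute_rq_nowait(q, disk, rq, 0, mydev_cmd_done);
}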
2623 2623
2624 /** 2624 /**
2625 * blk_execute_rq - insert a request into queue for execution 2625 * blk_execute_rq - insert a request into queue for execution
2626 * @q: queue to insert the request in 2626 * @q: queue to insert the request in
2627 * @bd_disk: matching gendisk 2627 * @bd_disk: matching gendisk
2628 * @rq: request to insert 2628 * @rq: request to insert
2629 * @at_head: insert request at head or tail of queue 2629 * @at_head: insert request at head or tail of queue
2630 * 2630 *
2631 * Description: 2631 * Description:
2632 * Insert a fully prepared request at the back of the io scheduler queue 2632 * Insert a fully prepared request at the back of the io scheduler queue
2633 * for execution and wait for completion. 2633 * for execution and wait for completion.
2634 */ 2634 */
2635 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 2635 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
2636 struct request *rq, int at_head) 2636 struct request *rq, int at_head)
2637 { 2637 {
2638 DECLARE_COMPLETION_ONSTACK(wait); 2638 DECLARE_COMPLETION_ONSTACK(wait);
2639 char sense[SCSI_SENSE_BUFFERSIZE]; 2639 char sense[SCSI_SENSE_BUFFERSIZE];
2640 int err = 0; 2640 int err = 0;
2641 2641
2642 /* 2642 /*
2643 * we need an extra reference to the request, so we can look at 2643 * we need an extra reference to the request, so we can look at
2644 * it after io completion 2644 * it after io completion
2645 */ 2645 */
2646 rq->ref_count++; 2646 rq->ref_count++;
2647 2647
2648 if (!rq->sense) { 2648 if (!rq->sense) {
2649 memset(sense, 0, sizeof(sense)); 2649 memset(sense, 0, sizeof(sense));
2650 rq->sense = sense; 2650 rq->sense = sense;
2651 rq->sense_len = 0; 2651 rq->sense_len = 0;
2652 } 2652 }
2653 2653
2654 rq->end_io_data = &wait; 2654 rq->end_io_data = &wait;
2655 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2655 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2656 wait_for_completion(&wait); 2656 wait_for_completion(&wait);
2657 2657
2658 if (rq->errors) 2658 if (rq->errors)
2659 err = -EIO; 2659 err = -EIO;
2660 2660
2661 return err; 2661 return err;
2662 } 2662 }
2663 2663
2664 EXPORT_SYMBOL(blk_execute_rq); 2664 EXPORT_SYMBOL(blk_execute_rq);
2665 2665
2666 /** 2666 /**
2667 * blkdev_issue_flush - queue a flush 2667 * blkdev_issue_flush - queue a flush
2668 * @bdev: blockdev to issue flush for 2668 * @bdev: blockdev to issue flush for
2669 * @error_sector: error sector 2669 * @error_sector: error sector
2670 * 2670 *
2671 * Description: 2671 * Description:
2672 * Issue a flush for the block device in question. Caller can supply 2672 * Issue a flush for the block device in question. Caller can supply
2673 * room for storing the error offset in case of a flush error, if they 2673 * room for storing the error offset in case of a flush error, if they
2674 * wish to. Caller must run wait_for_completion() on its own. 2674 * wish to. Caller must run wait_for_completion() on its own.
2675 */ 2675 */
2676 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2676 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2677 { 2677 {
2678 struct request_queue *q; 2678 struct request_queue *q;
2679 2679
2680 if (bdev->bd_disk == NULL) 2680 if (bdev->bd_disk == NULL)
2681 return -ENXIO; 2681 return -ENXIO;
2682 2682
2683 q = bdev_get_queue(bdev); 2683 q = bdev_get_queue(bdev);
2684 if (!q) 2684 if (!q)
2685 return -ENXIO; 2685 return -ENXIO;
2686 if (!q->issue_flush_fn) 2686 if (!q->issue_flush_fn)
2687 return -EOPNOTSUPP; 2687 return -EOPNOTSUPP;
2688 2688
2689 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2689 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2690 } 2690 }
2691 2691
2692 EXPORT_SYMBOL(blkdev_issue_flush); 2692 EXPORT_SYMBOL(blkdev_issue_flush);
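A caller-side sketch of the flush interface (hypothetical wrapper); -EOPNOTSUPP just means the queue has no issue_flush_fn and is usually not treated as an error.

static int mydev_flush_cache(struct block_device *bdev)
{
	sector_t error_sector;
	int err;

	err = blkdev_issue_flush(bdev, &error_sector);
	if (err == -EOPNOTSUPP)
		err = 0;		/* device has no cache flush support */

	return err;
}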
2693 2693
2694 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2694 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2695 { 2695 {
2696 int rw = rq_data_dir(rq); 2696 int rw = rq_data_dir(rq);
2697 2697
2698 if (!blk_fs_request(rq) || !rq->rq_disk) 2698 if (!blk_fs_request(rq) || !rq->rq_disk)
2699 return; 2699 return;
2700 2700
2701 if (!new_io) { 2701 if (!new_io) {
2702 __disk_stat_inc(rq->rq_disk, merges[rw]); 2702 __disk_stat_inc(rq->rq_disk, merges[rw]);
2703 } else { 2703 } else {
2704 disk_round_stats(rq->rq_disk); 2704 disk_round_stats(rq->rq_disk);
2705 rq->rq_disk->in_flight++; 2705 rq->rq_disk->in_flight++;
2706 } 2706 }
2707 } 2707 }
2708 2708
2709 /* 2709 /*
2710 * add-request adds a request to the linked list. 2710 * add-request adds a request to the linked list.
2711 * queue lock is held and interrupts disabled, as we muck with the 2711 * queue lock is held and interrupts disabled, as we muck with the
2712 * request queue list. 2712 * request queue list.
2713 */ 2713 */
2714 static inline void add_request(struct request_queue * q, struct request * req) 2714 static inline void add_request(struct request_queue * q, struct request * req)
2715 { 2715 {
2716 drive_stat_acct(req, req->nr_sectors, 1); 2716 drive_stat_acct(req, req->nr_sectors, 1);
2717 2717
2718 /* 2718 /*
2719 * elevator indicated where it wants this request to be 2719 * elevator indicated where it wants this request to be
2720 * inserted at elevator_merge time 2720 * inserted at elevator_merge time
2721 */ 2721 */
2722 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2722 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2723 } 2723 }
2724 2724
2725 /* 2725 /*
2726 * disk_round_stats() - Round off the performance stats on a struct 2726 * disk_round_stats() - Round off the performance stats on a struct
2727 * disk_stats. 2727 * disk_stats.
2728 * 2728 *
2729 * The average IO queue length and utilisation statistics are maintained 2729 * The average IO queue length and utilisation statistics are maintained
2730 * by observing the current state of the queue length and the amount of 2730 * by observing the current state of the queue length and the amount of
2731 * time it has been in this state for. 2731 * time it has been in this state for.
2732 * 2732 *
2733 * Normally, that accounting is done on IO completion, but that can result 2733 * Normally, that accounting is done on IO completion, but that can result
2734 * in more than a second's worth of IO being accounted for within any one 2734 * in more than a second's worth of IO being accounted for within any one
2735 * second, leading to >100% utilisation. To deal with that, we call this 2735 * second, leading to >100% utilisation. To deal with that, we call this
2736 * function to do a round-off before returning the results when reading 2736 * function to do a round-off before returning the results when reading
2737 * /proc/diskstats. This accounts immediately for all queue usage up to 2737 * /proc/diskstats. This accounts immediately for all queue usage up to
2738 * the current jiffies and restarts the counters again. 2738 * the current jiffies and restarts the counters again.
2739 */ 2739 */
2740 void disk_round_stats(struct gendisk *disk) 2740 void disk_round_stats(struct gendisk *disk)
2741 { 2741 {
2742 unsigned long now = jiffies; 2742 unsigned long now = jiffies;
2743 2743
2744 if (now == disk->stamp) 2744 if (now == disk->stamp)
2745 return; 2745 return;
2746 2746
2747 if (disk->in_flight) { 2747 if (disk->in_flight) {
2748 __disk_stat_add(disk, time_in_queue, 2748 __disk_stat_add(disk, time_in_queue,
2749 disk->in_flight * (now - disk->stamp)); 2749 disk->in_flight * (now - disk->stamp));
2750 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2750 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2751 } 2751 }
2752 disk->stamp = now; 2752 disk->stamp = now;
2753 } 2753 }
2754 2754
2755 EXPORT_SYMBOL_GPL(disk_round_stats); 2755 EXPORT_SYMBOL_GPL(disk_round_stats);
2756 2756
2757 /* 2757 /*
2758 * queue lock must be held 2758 * queue lock must be held
2759 */ 2759 */
2760 void __blk_put_request(struct request_queue *q, struct request *req) 2760 void __blk_put_request(struct request_queue *q, struct request *req)
2761 { 2761 {
2762 if (unlikely(!q)) 2762 if (unlikely(!q))
2763 return; 2763 return;
2764 if (unlikely(--req->ref_count)) 2764 if (unlikely(--req->ref_count))
2765 return; 2765 return;
2766 2766
2767 elv_completed_request(q, req); 2767 elv_completed_request(q, req);
2768 2768
2769 /* 2769 /*
2770 * Request may not have originated from ll_rw_blk. If not, 2770 * Request may not have originated from ll_rw_blk. If not,
2771 * it didn't come out of our reserved rq pools 2771 * it didn't come out of our reserved rq pools
2772 */ 2772 */
2773 if (req->cmd_flags & REQ_ALLOCED) { 2773 if (req->cmd_flags & REQ_ALLOCED) {
2774 int rw = rq_data_dir(req); 2774 int rw = rq_data_dir(req);
2775 int priv = req->cmd_flags & REQ_ELVPRIV; 2775 int priv = req->cmd_flags & REQ_ELVPRIV;
2776 2776
2777 BUG_ON(!list_empty(&req->queuelist)); 2777 BUG_ON(!list_empty(&req->queuelist));
2778 BUG_ON(!hlist_unhashed(&req->hash)); 2778 BUG_ON(!hlist_unhashed(&req->hash));
2779 2779
2780 blk_free_request(q, req); 2780 blk_free_request(q, req);
2781 freed_request(q, rw, priv); 2781 freed_request(q, rw, priv);
2782 } 2782 }
2783 } 2783 }
2784 2784
2785 EXPORT_SYMBOL_GPL(__blk_put_request); 2785 EXPORT_SYMBOL_GPL(__blk_put_request);
2786 2786
2787 void blk_put_request(struct request *req) 2787 void blk_put_request(struct request *req)
2788 { 2788 {
2789 unsigned long flags; 2789 unsigned long flags;
2790 struct request_queue *q = req->q; 2790 struct request_queue *q = req->q;
2791 2791
2792 /* 2792 /*
2793 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2793 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2794 * following if (q) test. 2794 * following if (q) test.
2795 */ 2795 */
2796 if (q) { 2796 if (q) {
2797 spin_lock_irqsave(q->queue_lock, flags); 2797 spin_lock_irqsave(q->queue_lock, flags);
2798 __blk_put_request(q, req); 2798 __blk_put_request(q, req);
2799 spin_unlock_irqrestore(q->queue_lock, flags); 2799 spin_unlock_irqrestore(q->queue_lock, flags);
2800 } 2800 }
2801 } 2801 }
2802 2802
2803 EXPORT_SYMBOL(blk_put_request); 2803 EXPORT_SYMBOL(blk_put_request);
2804 2804
2805 /** 2805 /**
2806 * blk_end_sync_rq - executes a completion event on a request 2806 * blk_end_sync_rq - executes a completion event on a request
2807 * @rq: request to complete 2807 * @rq: request to complete
2808 * @error: end io status of the request 2808 * @error: end io status of the request
2809 */ 2809 */
2810 void blk_end_sync_rq(struct request *rq, int error) 2810 void blk_end_sync_rq(struct request *rq, int error)
2811 { 2811 {
2812 struct completion *waiting = rq->end_io_data; 2812 struct completion *waiting = rq->end_io_data;
2813 2813
2814 rq->end_io_data = NULL; 2814 rq->end_io_data = NULL;
2815 __blk_put_request(rq->q, rq); 2815 __blk_put_request(rq->q, rq);
2816 2816
2817 /* 2817 /*
2818 * complete last, if this is a stack request the process (and thus 2818 * complete last, if this is a stack request the process (and thus
2819 * the rq pointer) could be invalid right after this complete() 2819 * the rq pointer) could be invalid right after this complete()
2820 */ 2820 */
2821 complete(waiting); 2821 complete(waiting);
2822 } 2822 }
2823 EXPORT_SYMBOL(blk_end_sync_rq); 2823 EXPORT_SYMBOL(blk_end_sync_rq);
2824 2824
2825 /* 2825 /*
2826 * Has to be called with the request spinlock acquired 2826 * Has to be called with the request spinlock acquired
2827 */ 2827 */
2828 static int attempt_merge(struct request_queue *q, struct request *req, 2828 static int attempt_merge(struct request_queue *q, struct request *req,
2829 struct request *next) 2829 struct request *next)
2830 { 2830 {
2831 if (!rq_mergeable(req) || !rq_mergeable(next)) 2831 if (!rq_mergeable(req) || !rq_mergeable(next))
2832 return 0; 2832 return 0;
2833 2833
2834 /* 2834 /*
2835 * not contiguous 2835 * not contiguous
2836 */ 2836 */
2837 if (req->sector + req->nr_sectors != next->sector) 2837 if (req->sector + req->nr_sectors != next->sector)
2838 return 0; 2838 return 0;
2839 2839
2840 if (rq_data_dir(req) != rq_data_dir(next) 2840 if (rq_data_dir(req) != rq_data_dir(next)
2841 || req->rq_disk != next->rq_disk 2841 || req->rq_disk != next->rq_disk
2842 || next->special) 2842 || next->special)
2843 return 0; 2843 return 0;
2844 2844
2845 /* 2845 /*
2846 * If we are allowed to merge, then append bio list 2846 * If we are allowed to merge, then append bio list
2847 * from next to rq and release next. merge_requests_fn 2847 * from next to rq and release next. merge_requests_fn
2848 * will have updated segment counts, update sector 2848 * will have updated segment counts, update sector
2849 * counts here. 2849 * counts here.
2850 */ 2850 */
2851 if (!ll_merge_requests_fn(q, req, next)) 2851 if (!ll_merge_requests_fn(q, req, next))
2852 return 0; 2852 return 0;
2853 2853
2854 /* 2854 /*
2855 * At this point we have either done a back merge 2855 * At this point we have either done a back merge
2856 * or front merge. We need the smaller start_time of 2856 * or front merge. We need the smaller start_time of
2857 * the merged requests to be the current request 2857 * the merged requests to be the current request
2858 * for accounting purposes. 2858 * for accounting purposes.
2859 */ 2859 */
2860 if (time_after(req->start_time, next->start_time)) 2860 if (time_after(req->start_time, next->start_time))
2861 req->start_time = next->start_time; 2861 req->start_time = next->start_time;
2862 2862
2863 req->biotail->bi_next = next->bio; 2863 req->biotail->bi_next = next->bio;
2864 req->biotail = next->biotail; 2864 req->biotail = next->biotail;
2865 2865
2866 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2866 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2867 2867
2868 elv_merge_requests(q, req, next); 2868 elv_merge_requests(q, req, next);
2869 2869
2870 if (req->rq_disk) { 2870 if (req->rq_disk) {
2871 disk_round_stats(req->rq_disk); 2871 disk_round_stats(req->rq_disk);
2872 req->rq_disk->in_flight--; 2872 req->rq_disk->in_flight--;
2873 } 2873 }
2874 2874
2875 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2875 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2876 2876
2877 __blk_put_request(q, next); 2877 __blk_put_request(q, next);
2878 return 1; 2878 return 1;
2879 } 2879 }
2880 2880
2881 static inline int attempt_back_merge(struct request_queue *q, 2881 static inline int attempt_back_merge(struct request_queue *q,
2882 struct request *rq) 2882 struct request *rq)
2883 { 2883 {
2884 struct request *next = elv_latter_request(q, rq); 2884 struct request *next = elv_latter_request(q, rq);
2885 2885
2886 if (next) 2886 if (next)
2887 return attempt_merge(q, rq, next); 2887 return attempt_merge(q, rq, next);
2888 2888
2889 return 0; 2889 return 0;
2890 } 2890 }
2891 2891
2892 static inline int attempt_front_merge(struct request_queue *q, 2892 static inline int attempt_front_merge(struct request_queue *q,
2893 struct request *rq) 2893 struct request *rq)
2894 { 2894 {
2895 struct request *prev = elv_former_request(q, rq); 2895 struct request *prev = elv_former_request(q, rq);
2896 2896
2897 if (prev) 2897 if (prev)
2898 return attempt_merge(q, prev, rq); 2898 return attempt_merge(q, prev, rq);
2899 2899
2900 return 0; 2900 return 0;
2901 } 2901 }
2902 2902
2903 static void init_request_from_bio(struct request *req, struct bio *bio) 2903 static void init_request_from_bio(struct request *req, struct bio *bio)
2904 { 2904 {
2905 req->cmd_type = REQ_TYPE_FS; 2905 req->cmd_type = REQ_TYPE_FS;
2906 2906
2907 /* 2907 /*
2908 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2908 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2909 */ 2909 */
2910 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2910 if (bio_rw_ahead(bio) || bio_failfast(bio))
2911 req->cmd_flags |= REQ_FAILFAST; 2911 req->cmd_flags |= REQ_FAILFAST;
2912 2912
2913 /* 2913 /*
2914 * REQ_BARRIER implies no merging, but let's make it explicit 2914 * REQ_BARRIER implies no merging, but let's make it explicit
2915 */ 2915 */
2916 if (unlikely(bio_barrier(bio))) 2916 if (unlikely(bio_barrier(bio)))
2917 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2917 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2918 2918
2919 if (bio_sync(bio)) 2919 if (bio_sync(bio))
2920 req->cmd_flags |= REQ_RW_SYNC; 2920 req->cmd_flags |= REQ_RW_SYNC;
2921 if (bio_rw_meta(bio)) 2921 if (bio_rw_meta(bio))
2922 req->cmd_flags |= REQ_RW_META; 2922 req->cmd_flags |= REQ_RW_META;
2923 2923
2924 req->errors = 0; 2924 req->errors = 0;
2925 req->hard_sector = req->sector = bio->bi_sector; 2925 req->hard_sector = req->sector = bio->bi_sector;
2926 req->ioprio = bio_prio(bio); 2926 req->ioprio = bio_prio(bio);
2927 req->start_time = jiffies; 2927 req->start_time = jiffies;
2928 blk_rq_bio_prep(req->q, req, bio); 2928 blk_rq_bio_prep(req->q, req, bio);
2929 } 2929 }
2930 2930
2931 static int __make_request(struct request_queue *q, struct bio *bio) 2931 static int __make_request(struct request_queue *q, struct bio *bio)
2932 { 2932 {
2933 struct request *req; 2933 struct request *req;
2934 int el_ret, nr_sectors, barrier, err; 2934 int el_ret, nr_sectors, barrier, err;
2935 const unsigned short prio = bio_prio(bio); 2935 const unsigned short prio = bio_prio(bio);
2936 const int sync = bio_sync(bio); 2936 const int sync = bio_sync(bio);
2937 int rw_flags; 2937 int rw_flags;
2938 2938
2939 nr_sectors = bio_sectors(bio); 2939 nr_sectors = bio_sectors(bio);
2940 2940
2941 /* 2941 /*
2942 * low level driver can indicate that it wants pages above a 2942 * low level driver can indicate that it wants pages above a
2943 * certain limit bounced to low memory (ie for highmem, or even 2943 * certain limit bounced to low memory (ie for highmem, or even
2944 * ISA dma in theory) 2944 * ISA dma in theory)
2945 */ 2945 */
2946 blk_queue_bounce(q, &bio); 2946 blk_queue_bounce(q, &bio);
2947 2947
2948 barrier = bio_barrier(bio); 2948 barrier = bio_barrier(bio);
2949 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 2949 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2950 err = -EOPNOTSUPP; 2950 err = -EOPNOTSUPP;
2951 goto end_io; 2951 goto end_io;
2952 } 2952 }
2953 2953
2954 spin_lock_irq(q->queue_lock); 2954 spin_lock_irq(q->queue_lock);
2955 2955
2956 if (unlikely(barrier) || elv_queue_empty(q)) 2956 if (unlikely(barrier) || elv_queue_empty(q))
2957 goto get_rq; 2957 goto get_rq;
2958 2958
2959 el_ret = elv_merge(q, &req, bio); 2959 el_ret = elv_merge(q, &req, bio);
2960 switch (el_ret) { 2960 switch (el_ret) {
2961 case ELEVATOR_BACK_MERGE: 2961 case ELEVATOR_BACK_MERGE:
2962 BUG_ON(!rq_mergeable(req)); 2962 BUG_ON(!rq_mergeable(req));
2963 2963
2964 if (!ll_back_merge_fn(q, req, bio)) 2964 if (!ll_back_merge_fn(q, req, bio))
2965 break; 2965 break;
2966 2966
2967 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 2967 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
2968 2968
2969 req->biotail->bi_next = bio; 2969 req->biotail->bi_next = bio;
2970 req->biotail = bio; 2970 req->biotail = bio;
2971 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2971 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2972 req->ioprio = ioprio_best(req->ioprio, prio); 2972 req->ioprio = ioprio_best(req->ioprio, prio);
2973 drive_stat_acct(req, nr_sectors, 0); 2973 drive_stat_acct(req, nr_sectors, 0);
2974 if (!attempt_back_merge(q, req)) 2974 if (!attempt_back_merge(q, req))
2975 elv_merged_request(q, req, el_ret); 2975 elv_merged_request(q, req, el_ret);
2976 goto out; 2976 goto out;
2977 2977
2978 case ELEVATOR_FRONT_MERGE: 2978 case ELEVATOR_FRONT_MERGE:
2979 BUG_ON(!rq_mergeable(req)); 2979 BUG_ON(!rq_mergeable(req));
2980 2980
2981 if (!ll_front_merge_fn(q, req, bio)) 2981 if (!ll_front_merge_fn(q, req, bio))
2982 break; 2982 break;
2983 2983
2984 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 2984 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
2985 2985
2986 bio->bi_next = req->bio; 2986 bio->bi_next = req->bio;
2987 req->bio = bio; 2987 req->bio = bio;
2988 2988
2989 /* 2989 /*
2990 * may not be valid. if the low level driver said 2990 * may not be valid. if the low level driver said
2991 * it didn't need a bounce buffer then it better 2991 * it didn't need a bounce buffer then it better
2992 * not touch req->buffer either... 2992 * not touch req->buffer either...
2993 */ 2993 */
2994 req->buffer = bio_data(bio); 2994 req->buffer = bio_data(bio);
2995 req->current_nr_sectors = bio_cur_sectors(bio); 2995 req->current_nr_sectors = bio_cur_sectors(bio);
2996 req->hard_cur_sectors = req->current_nr_sectors; 2996 req->hard_cur_sectors = req->current_nr_sectors;
2997 req->sector = req->hard_sector = bio->bi_sector; 2997 req->sector = req->hard_sector = bio->bi_sector;
2998 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2998 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2999 req->ioprio = ioprio_best(req->ioprio, prio); 2999 req->ioprio = ioprio_best(req->ioprio, prio);
3000 drive_stat_acct(req, nr_sectors, 0); 3000 drive_stat_acct(req, nr_sectors, 0);
3001 if (!attempt_front_merge(q, req)) 3001 if (!attempt_front_merge(q, req))
3002 elv_merged_request(q, req, el_ret); 3002 elv_merged_request(q, req, el_ret);
3003 goto out; 3003 goto out;
3004 3004
3005 /* ELEVATOR_NO_MERGE: elevator says don't/can't merge. */ 3005 /* ELEVATOR_NO_MERGE: elevator says don't/can't merge. */
3006 default: 3006 default:
3007 ; 3007 ;
3008 } 3008 }
3009 3009
3010 get_rq: 3010 get_rq:
3011 /* 3011 /*
3012 * This sync check and mask will be re-done in init_request_from_bio(), 3012 * This sync check and mask will be re-done in init_request_from_bio(),
3013 * but we need to set it earlier to expose the sync flag to the 3013 * but we need to set it earlier to expose the sync flag to the
3014 * rq allocator and io schedulers. 3014 * rq allocator and io schedulers.
3015 */ 3015 */
3016 rw_flags = bio_data_dir(bio); 3016 rw_flags = bio_data_dir(bio);
3017 if (sync) 3017 if (sync)
3018 rw_flags |= REQ_RW_SYNC; 3018 rw_flags |= REQ_RW_SYNC;
3019 3019
3020 /* 3020 /*
3021 * Grab a free request. This may sleep but cannot fail. 3021 * Grab a free request. This may sleep but cannot fail.
3022 * Returns with the queue unlocked. 3022 * Returns with the queue unlocked.
3023 */ 3023 */
3024 req = get_request_wait(q, rw_flags, bio); 3024 req = get_request_wait(q, rw_flags, bio);
3025 3025
3026 /* 3026 /*
3027 * After dropping the lock and possibly sleeping here, our request 3027 * After dropping the lock and possibly sleeping here, our request
3028 * may now be mergeable after it had proven unmergeable (above). 3028 * may now be mergeable after it had proven unmergeable (above).
3029 * We don't worry about that case for efficiency. It won't happen 3029 * We don't worry about that case for efficiency. It won't happen
3030 * often, and the elevators are able to handle it. 3030 * often, and the elevators are able to handle it.
3031 */ 3031 */
3032 init_request_from_bio(req, bio); 3032 init_request_from_bio(req, bio);
3033 3033
3034 spin_lock_irq(q->queue_lock); 3034 spin_lock_irq(q->queue_lock);
3035 if (elv_queue_empty(q)) 3035 if (elv_queue_empty(q))
3036 blk_plug_device(q); 3036 blk_plug_device(q);
3037 add_request(q, req); 3037 add_request(q, req);
3038 out: 3038 out:
3039 if (sync) 3039 if (sync)
3040 __generic_unplug_device(q); 3040 __generic_unplug_device(q);
3041 3041
3042 spin_unlock_irq(q->queue_lock); 3042 spin_unlock_irq(q->queue_lock);
3043 return 0; 3043 return 0;
3044 3044
3045 end_io: 3045 end_io:
3046 bio_endio(bio, err); 3046 bio_endio(bio, err);
3047 return 0; 3047 return 0;
3048 } 3048 }
3049 3049
3050 /* 3050 /*
3051 * If bio->bi_dev is a partition, remap the location 3051 * If bio->bi_dev is a partition, remap the location
3052 */ 3052 */
3053 static inline void blk_partition_remap(struct bio *bio) 3053 static inline void blk_partition_remap(struct bio *bio)
3054 { 3054 {
3055 struct block_device *bdev = bio->bi_bdev; 3055 struct block_device *bdev = bio->bi_bdev;
3056 3056
3057 if (bdev != bdev->bd_contains) { 3057 if (bdev != bdev->bd_contains) {
3058 struct hd_struct *p = bdev->bd_part; 3058 struct hd_struct *p = bdev->bd_part;
3059 const int rw = bio_data_dir(bio); 3059 const int rw = bio_data_dir(bio);
3060 3060
3061 p->sectors[rw] += bio_sectors(bio); 3061 p->sectors[rw] += bio_sectors(bio);
3062 p->ios[rw]++; 3062 p->ios[rw]++;
3063 3063
3064 bio->bi_sector += p->start_sect; 3064 bio->bi_sector += p->start_sect;
3065 bio->bi_bdev = bdev->bd_contains; 3065 bio->bi_bdev = bdev->bd_contains;
3066 3066
3067 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, 3067 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
3068 bdev->bd_dev, bio->bi_sector, 3068 bdev->bd_dev, bio->bi_sector,
3069 bio->bi_sector - p->start_sect); 3069 bio->bi_sector - p->start_sect);
3070 } 3070 }
3071 } 3071 }
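The remap itself is one addition of the partition offset; a tiny stand-alone sketch (not from the patch, numbers invented) of the same arithmetic:

#include <stdio.h>

int main(void)
{
        unsigned long long start_sect = 1000;   /* p->start_sect: partition offset */
        unsigned long long bi_sector  = 50;     /* sector relative to the partition */

        bi_sector += start_sect;                /* what blk_partition_remap() does */
        printf("sector on whole disk: %llu\n", bi_sector);     /* 1050 */
        return 0;
}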
3072 3072
3073 static void handle_bad_sector(struct bio *bio) 3073 static void handle_bad_sector(struct bio *bio)
3074 { 3074 {
3075 char b[BDEVNAME_SIZE]; 3075 char b[BDEVNAME_SIZE];
3076 3076
3077 printk(KERN_INFO "attempt to access beyond end of device\n"); 3077 printk(KERN_INFO "attempt to access beyond end of device\n");
3078 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 3078 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
3079 bdevname(bio->bi_bdev, b), 3079 bdevname(bio->bi_bdev, b),
3080 bio->bi_rw, 3080 bio->bi_rw,
3081 (unsigned long long)bio->bi_sector + bio_sectors(bio), 3081 (unsigned long long)bio->bi_sector + bio_sectors(bio),
3082 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 3082 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
3083 3083
3084 set_bit(BIO_EOF, &bio->bi_flags); 3084 set_bit(BIO_EOF, &bio->bi_flags);
3085 } 3085 }
3086 3086
3087 #ifdef CONFIG_FAIL_MAKE_REQUEST 3087 #ifdef CONFIG_FAIL_MAKE_REQUEST
3088 3088
3089 static DECLARE_FAULT_ATTR(fail_make_request); 3089 static DECLARE_FAULT_ATTR(fail_make_request);
3090 3090
3091 static int __init setup_fail_make_request(char *str) 3091 static int __init setup_fail_make_request(char *str)
3092 { 3092 {
3093 return setup_fault_attr(&fail_make_request, str); 3093 return setup_fault_attr(&fail_make_request, str);
3094 } 3094 }
3095 __setup("fail_make_request=", setup_fail_make_request); 3095 __setup("fail_make_request=", setup_fail_make_request);
3096 3096
3097 static int should_fail_request(struct bio *bio) 3097 static int should_fail_request(struct bio *bio)
3098 { 3098 {
3099 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 3099 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
3100 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 3100 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
3101 return should_fail(&fail_make_request, bio->bi_size); 3101 return should_fail(&fail_make_request, bio->bi_size);
3102 3102
3103 return 0; 3103 return 0;
3104 } 3104 }
3105 3105
3106 static int __init fail_make_request_debugfs(void) 3106 static int __init fail_make_request_debugfs(void)
3107 { 3107 {
3108 return init_fault_attr_dentries(&fail_make_request, 3108 return init_fault_attr_dentries(&fail_make_request,
3109 "fail_make_request"); 3109 "fail_make_request");
3110 } 3110 }
3111 3111
3112 late_initcall(fail_make_request_debugfs); 3112 late_initcall(fail_make_request_debugfs);
3113 3113
3114 #else /* CONFIG_FAIL_MAKE_REQUEST */ 3114 #else /* CONFIG_FAIL_MAKE_REQUEST */
3115 3115
3116 static inline int should_fail_request(struct bio *bio) 3116 static inline int should_fail_request(struct bio *bio)
3117 { 3117 {
3118 return 0; 3118 return 0;
3119 } 3119 }
3120 3120
3121 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 3121 #endif /* CONFIG_FAIL_MAKE_REQUEST */
3122 3122
3123 /** 3123 /**
3124 * generic_make_request: hand a buffer to its device driver for I/O 3124 * generic_make_request: hand a buffer to its device driver for I/O
3125 * @bio: The bio describing the location in memory and on the device. 3125 * @bio: The bio describing the location in memory and on the device.
3126 * 3126 *
3127 * generic_make_request() is used to make I/O requests of block 3127 * generic_make_request() is used to make I/O requests of block
3128 * devices. It is passed a &struct bio, which describes the I/O that needs 3128 * devices. It is passed a &struct bio, which describes the I/O that needs
3129 * to be done. 3129 * to be done.
3130 * 3130 *
3131 * generic_make_request() does not return any status. The 3131 * generic_make_request() does not return any status. The
3132 * success/failure status of the request, along with notification of 3132 * success/failure status of the request, along with notification of
3133 * completion, is delivered asynchronously through the bio->bi_end_io 3133 * completion, is delivered asynchronously through the bio->bi_end_io
3134 * function described (one day) elsewhere. 3134 * function described (one day) elsewhere.
3135 * 3135 *
3136 * The caller of generic_make_request must make sure that bi_io_vec 3136 * The caller of generic_make_request must make sure that bi_io_vec
3137 * are set to describe the memory buffer, and that bi_dev and bi_sector are 3137 * are set to describe the memory buffer, and that bi_dev and bi_sector are
3138 * set to describe the device address, and the 3138 * set to describe the device address, and the
3139 * bi_end_io and optionally bi_private are set to describe how 3139 * bi_end_io and optionally bi_private are set to describe how
3140 * completion notification should be signaled. 3140 * completion notification should be signaled.
3141 * 3141 *
3142 * generic_make_request and the drivers it calls may use bi_next if this 3142 * generic_make_request and the drivers it calls may use bi_next if this
3143 * bio happens to be merged with someone else, and may change bi_dev and 3143 * bio happens to be merged with someone else, and may change bi_dev and
3144 * bi_sector for remaps as it sees fit. So the values of these fields 3144 * bi_sector for remaps as it sees fit. So the values of these fields
3145 * should NOT be depended on after the call to generic_make_request. 3145 * should NOT be depended on after the call to generic_make_request.
3146 */ 3146 */
3147 static inline void __generic_make_request(struct bio *bio) 3147 static inline void __generic_make_request(struct bio *bio)
3148 { 3148 {
3149 struct request_queue *q; 3149 struct request_queue *q;
3150 sector_t maxsector; 3150 sector_t maxsector;
3151 sector_t old_sector; 3151 sector_t old_sector;
3152 int ret, nr_sectors = bio_sectors(bio); 3152 int ret, nr_sectors = bio_sectors(bio);
3153 dev_t old_dev; 3153 dev_t old_dev;
3154 3154
3155 might_sleep(); 3155 might_sleep();
3156 /* Test device or partition size, when known. */ 3156 /* Test device or partition size, when known. */
3157 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3157 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3158 if (maxsector) { 3158 if (maxsector) {
3159 sector_t sector = bio->bi_sector; 3159 sector_t sector = bio->bi_sector;
3160 3160
3161 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 3161 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
3162 /* 3162 /*
3163 * This may well happen - the kernel calls bread() 3163 * This may well happen - the kernel calls bread()
3164 * without checking the size of the device, e.g., when 3164 * without checking the size of the device, e.g., when
3165 * mounting a device. 3165 * mounting a device.
3166 */ 3166 */
3167 handle_bad_sector(bio); 3167 handle_bad_sector(bio);
3168 goto end_io; 3168 goto end_io;
3169 } 3169 }
3170 } 3170 }
3171 3171
3172 /* 3172 /*
3173 * Resolve the mapping until finished. (drivers are 3173 * Resolve the mapping until finished. (drivers are
3174 * still free to implement/resolve their own stacking 3174 * still free to implement/resolve their own stacking
3175 * by explicitly returning 0) 3175 * by explicitly returning 0)
3176 * 3176 *
3177 * NOTE: we don't repeat the blk_size check for each new device. 3177 * NOTE: we don't repeat the blk_size check for each new device.
3178 * Stacking drivers are expected to know what they are doing. 3178 * Stacking drivers are expected to know what they are doing.
3179 */ 3179 */
3180 old_sector = -1; 3180 old_sector = -1;
3181 old_dev = 0; 3181 old_dev = 0;
3182 do { 3182 do {
3183 char b[BDEVNAME_SIZE]; 3183 char b[BDEVNAME_SIZE];
3184 3184
3185 q = bdev_get_queue(bio->bi_bdev); 3185 q = bdev_get_queue(bio->bi_bdev);
3186 if (!q) { 3186 if (!q) {
3187 printk(KERN_ERR 3187 printk(KERN_ERR
3188 "generic_make_request: Trying to access " 3188 "generic_make_request: Trying to access "
3189 "nonexistent block-device %s (%Lu)\n", 3189 "nonexistent block-device %s (%Lu)\n",
3190 bdevname(bio->bi_bdev, b), 3190 bdevname(bio->bi_bdev, b),
3191 (long long) bio->bi_sector); 3191 (long long) bio->bi_sector);
3192 end_io: 3192 end_io:
3193 bio_endio(bio, -EIO); 3193 bio_endio(bio, -EIO);
3194 break; 3194 break;
3195 } 3195 }
3196 3196
3197 if (unlikely(nr_sectors > q->max_hw_sectors)) { 3197 if (unlikely(nr_sectors > q->max_hw_sectors)) {
3198 printk("bio too big device %s (%u > %u)\n", 3198 printk("bio too big device %s (%u > %u)\n",
3199 bdevname(bio->bi_bdev, b), 3199 bdevname(bio->bi_bdev, b),
3200 bio_sectors(bio), 3200 bio_sectors(bio),
3201 q->max_hw_sectors); 3201 q->max_hw_sectors);
3202 goto end_io; 3202 goto end_io;
3203 } 3203 }
3204 3204
3205 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3205 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
3206 goto end_io; 3206 goto end_io;
3207 3207
3208 if (should_fail_request(bio)) 3208 if (should_fail_request(bio))
3209 goto end_io; 3209 goto end_io;
3210 3210
3211 /* 3211 /*
3212 * If this device has partitions, remap block n 3212 * If this device has partitions, remap block n
3213 * of partition p to block n+start(p) of the disk. 3213 * of partition p to block n+start(p) of the disk.
3214 */ 3214 */
3215 blk_partition_remap(bio); 3215 blk_partition_remap(bio);
3216 3216
3217 if (old_sector != -1) 3217 if (old_sector != -1)
3218 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 3218 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3219 old_sector); 3219 old_sector);
3220 3220
3221 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 3221 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3222 3222
3223 old_sector = bio->bi_sector; 3223 old_sector = bio->bi_sector;
3224 old_dev = bio->bi_bdev->bd_dev; 3224 old_dev = bio->bi_bdev->bd_dev;
3225 3225
3226 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3226 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3227 if (maxsector) { 3227 if (maxsector) {
3228 sector_t sector = bio->bi_sector; 3228 sector_t sector = bio->bi_sector;
3229 3229
3230 if (maxsector < nr_sectors || 3230 if (maxsector < nr_sectors ||
3231 maxsector - nr_sectors < sector) { 3231 maxsector - nr_sectors < sector) {
3232 /* 3232 /*
3233 * This may well happen - partitions are not 3233 * This may well happen - partitions are not
3234 * checked to make sure they are within the size 3234 * checked to make sure they are within the size
3235 * of the whole device. 3235 * of the whole device.
3236 */ 3236 */
3237 handle_bad_sector(bio); 3237 handle_bad_sector(bio);
3238 goto end_io; 3238 goto end_io;
3239 } 3239 }
3240 } 3240 }
3241 3241
3242 ret = q->make_request_fn(q, bio); 3242 ret = q->make_request_fn(q, bio);
3243 } while (ret); 3243 } while (ret);
3244 } 3244 }
3245 3245
3246 /* 3246 /*
3247 * We only want one ->make_request_fn to be active at a time, 3247 * We only want one ->make_request_fn to be active at a time,
3248 * else stack usage with stacked devices could be a problem. 3248 * else stack usage with stacked devices could be a problem.
3249 * So use current->bio_{list,tail} to keep a list of requests 3249 * So use current->bio_{list,tail} to keep a list of requests
3250 * submitted by a make_request_fn function. 3250 * submitted by a make_request_fn function.
3251 * current->bio_tail is also used as a flag to say if 3251 * current->bio_tail is also used as a flag to say if
3252 * generic_make_request is currently active in this task or not. 3252 * generic_make_request is currently active in this task or not.
3253 * If it is NULL, then no make_request is active. If it is non-NULL, 3253 * If it is NULL, then no make_request is active. If it is non-NULL,
3254 * then a make_request is active, and new requests should be added 3254 * then a make_request is active, and new requests should be added
3255 * at the tail 3255 * at the tail
3256 */ 3256 */
3257 void generic_make_request(struct bio *bio) 3257 void generic_make_request(struct bio *bio)
3258 { 3258 {
3259 if (current->bio_tail) { 3259 if (current->bio_tail) {
3260 /* make_request is active */ 3260 /* make_request is active */
3261 *(current->bio_tail) = bio; 3261 *(current->bio_tail) = bio;
3262 bio->bi_next = NULL; 3262 bio->bi_next = NULL;
3263 current->bio_tail = &bio->bi_next; 3263 current->bio_tail = &bio->bi_next;
3264 return; 3264 return;
3265 } 3265 }
3266 /* following loop may be a bit non-obvious, and so deserves some 3266 /* following loop may be a bit non-obvious, and so deserves some
3267 * explanation. 3267 * explanation.
3268 * Before entering the loop, bio->bi_next is NULL (as all callers 3268 * Before entering the loop, bio->bi_next is NULL (as all callers
3269 * ensure that) so we have a list with a single bio. 3269 * ensure that) so we have a list with a single bio.
3270 * We pretend that we have just taken it off a longer list, so 3270 * We pretend that we have just taken it off a longer list, so
3271 * we assign bio_list to the next (which is NULL) and bio_tail 3271 * we assign bio_list to the next (which is NULL) and bio_tail
3272 * to &bio_list, thus initialising the bio_list of new bios to be 3272 * to &bio_list, thus initialising the bio_list of new bios to be
3273 * added. __generic_make_request may indeed add some more bios 3273 * added. __generic_make_request may indeed add some more bios
3274 * through a recursive call to generic_make_request. If it 3274 * through a recursive call to generic_make_request. If it
3275 * did, we find a non-NULL value in bio_list and re-enter the loop 3275 * did, we find a non-NULL value in bio_list and re-enter the loop
3276 * from the top. In this case we really did just take the bio 3276 * from the top. In this case we really did just take the bio
3277 * off the top of the list (no pretending) and so fix up bio_list and 3277 * off the top of the list (no pretending) and so fix up bio_list and
3278 * bio_tail or bi_next, and call into __generic_make_request again. 3278 * bio_tail or bi_next, and call into __generic_make_request again.
3279 * 3279 *
3280 * The loop was structured like this to make only one call to 3280 * The loop was structured like this to make only one call to
3281 * __generic_make_request (which is important as it is large and 3281 * __generic_make_request (which is important as it is large and
3282 * inlined) and to keep the structure simple. 3282 * inlined) and to keep the structure simple.
3283 */ 3283 */
3284 BUG_ON(bio->bi_next); 3284 BUG_ON(bio->bi_next);
3285 do { 3285 do {
3286 current->bio_list = bio->bi_next; 3286 current->bio_list = bio->bi_next;
3287 if (bio->bi_next == NULL) 3287 if (bio->bi_next == NULL)
3288 current->bio_tail = &current->bio_list; 3288 current->bio_tail = &current->bio_list;
3289 else 3289 else
3290 bio->bi_next = NULL; 3290 bio->bi_next = NULL;
3291 __generic_make_request(bio); 3291 __generic_make_request(bio);
3292 bio = current->bio_list; 3292 bio = current->bio_list;
3293 } while (bio); 3293 } while (bio);
3294 current->bio_tail = NULL; /* deactivate */ 3294 current->bio_tail = NULL; /* deactivate */
3295 } 3295 }
3296 3296
3297 EXPORT_SYMBOL(generic_make_request); 3297 EXPORT_SYMBOL(generic_make_request);
3298 3298
3299 /** 3299 /**
3300 * submit_bio: submit a bio to the block device layer for I/O 3300 * submit_bio: submit a bio to the block device layer for I/O
3301 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3301 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
3302 * @bio: The &struct bio which describes the I/O 3302 * @bio: The &struct bio which describes the I/O
3303 * 3303 *
3304 * submit_bio() is very similar in purpose to generic_make_request(), and 3304 * submit_bio() is very similar in purpose to generic_make_request(), and
3305 * uses that function to do most of the work. Both are fairly rough 3305 * uses that function to do most of the work. Both are fairly rough
3306 * interfaces, @bio must be presetup and ready for I/O. 3306 * interfaces, @bio must be presetup and ready for I/O.
3307 * 3307 *
3308 */ 3308 */
3309 void submit_bio(int rw, struct bio *bio) 3309 void submit_bio(int rw, struct bio *bio)
3310 { 3310 {
3311 int count = bio_sectors(bio); 3311 int count = bio_sectors(bio);
3312 3312
3313 BIO_BUG_ON(!bio->bi_size); 3313 BIO_BUG_ON(!bio->bi_size);
3314 BIO_BUG_ON(!bio->bi_io_vec); 3314 BIO_BUG_ON(!bio->bi_io_vec);
3315 bio->bi_rw |= rw; 3315 bio->bi_rw |= rw;
3316 if (rw & WRITE) { 3316 if (rw & WRITE) {
3317 count_vm_events(PGPGOUT, count); 3317 count_vm_events(PGPGOUT, count);
3318 } else { 3318 } else {
3319 task_io_account_read(bio->bi_size); 3319 task_io_account_read(bio->bi_size);
3320 count_vm_events(PGPGIN, count); 3320 count_vm_events(PGPGIN, count);
3321 } 3321 }
3322 3322
3323 if (unlikely(block_dump)) { 3323 if (unlikely(block_dump)) {
3324 char b[BDEVNAME_SIZE]; 3324 char b[BDEVNAME_SIZE];
3325 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3325 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
3326 current->comm, current->pid, 3326 current->comm, current->pid,
3327 (rw & WRITE) ? "WRITE" : "READ", 3327 (rw & WRITE) ? "WRITE" : "READ",
3328 (unsigned long long)bio->bi_sector, 3328 (unsigned long long)bio->bi_sector,
3329 bdevname(bio->bi_bdev,b)); 3329 bdevname(bio->bi_bdev,b));
3330 } 3330 }
3331 3331
3332 generic_make_request(bio); 3332 generic_make_request(bio);
3333 } 3333 }
3334 3334
3335 EXPORT_SYMBOL(submit_bio); 3335 EXPORT_SYMBOL(submit_bio);
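To tie the kerneldoc above together, a minimal illustrative sketch (not part of this patch) of what a submit_bio()/generic_make_request() caller sets up: bi_bdev and bi_sector for the device address, bi_io_vec via bio_add_page(), and bi_end_io for asynchronous completion. The callback name, the target device and the page are invented for the example; the bi_end_io signature follows this tree, where bio_endio() takes the bio plus an error code.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>

static void my_end_io(struct bio *bio, int error)
{
        /* called asynchronously once the I/O completes (or fails) */
        bio_put(bio);
}

static int read_one_page(struct block_device *bdev, sector_t sector,
                         struct page *page)
{
        struct bio *bio = bio_alloc(GFP_KERNEL, 1);

        if (!bio)
                return -ENOMEM;

        bio->bi_bdev = bdev;            /* may be a partition; remapped later */
        bio->bi_sector = sector;        /* 512-byte sector on that bdev */
        bio->bi_end_io = my_end_io;     /* completion notification */
        bio_add_page(bio, page, PAGE_SIZE, 0);  /* fills bi_io_vec and bi_size */

        submit_bio(READ, bio);          /* hands off to generic_make_request() */
        return 0;
}

As the kerneldoc warns, bi_sector and bi_bdev must not be relied upon after submission, since remapping may have changed them.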
3336 3336
3337 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3337 static void blk_recalc_rq_sectors(struct request *rq, int nsect)
3338 { 3338 {
3339 if (blk_fs_request(rq)) { 3339 if (blk_fs_request(rq)) {
3340 rq->hard_sector += nsect; 3340 rq->hard_sector += nsect;
3341 rq->hard_nr_sectors -= nsect; 3341 rq->hard_nr_sectors -= nsect;
3342 3342
3343 /* 3343 /*
3344 * Move the I/O submission pointers ahead if required. 3344 * Move the I/O submission pointers ahead if required.
3345 */ 3345 */
3346 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3346 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
3347 (rq->sector <= rq->hard_sector)) { 3347 (rq->sector <= rq->hard_sector)) {
3348 rq->sector = rq->hard_sector; 3348 rq->sector = rq->hard_sector;
3349 rq->nr_sectors = rq->hard_nr_sectors; 3349 rq->nr_sectors = rq->hard_nr_sectors;
3350 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3350 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
3351 rq->current_nr_sectors = rq->hard_cur_sectors; 3351 rq->current_nr_sectors = rq->hard_cur_sectors;
3352 rq->buffer = bio_data(rq->bio); 3352 rq->buffer = bio_data(rq->bio);
3353 } 3353 }
3354 3354
3355 /* 3355 /*
3356 * if total number of sectors is less than the first segment 3356 * if total number of sectors is less than the first segment
3357 * size, something has gone terribly wrong 3357 * size, something has gone terribly wrong
3358 */ 3358 */
3359 if (rq->nr_sectors < rq->current_nr_sectors) { 3359 if (rq->nr_sectors < rq->current_nr_sectors) {
3360 printk("blk: request botched\n"); 3360 printk("blk: request botched\n");
3361 rq->nr_sectors = rq->current_nr_sectors; 3361 rq->nr_sectors = rq->current_nr_sectors;
3362 } 3362 }
3363 } 3363 }
3364 } 3364 }
3365 3365
3366 static int __end_that_request_first(struct request *req, int uptodate, 3366 static int __end_that_request_first(struct request *req, int uptodate,
3367 int nr_bytes) 3367 int nr_bytes)
3368 { 3368 {
3369 int total_bytes, bio_nbytes, error, next_idx = 0; 3369 int total_bytes, bio_nbytes, error, next_idx = 0;
3370 struct bio *bio; 3370 struct bio *bio;
3371 3371
3372 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 3372 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3373 3373
3374 /* 3374 /*
3375 * extend uptodate bool to allow < 0 value to be direct io error 3375 * extend uptodate bool to allow < 0 value to be direct io error
3376 */ 3376 */
3377 error = 0; 3377 error = 0;
3378 if (end_io_error(uptodate)) 3378 if (end_io_error(uptodate))
3379 error = !uptodate ? -EIO : uptodate; 3379 error = !uptodate ? -EIO : uptodate;
3380 3380
3381 /* 3381 /*
3382 * for a REQ_BLOCK_PC request, we want to carry any eventual 3382 * for a REQ_BLOCK_PC request, we want to carry any eventual
3383 * sense key with us all the way through 3383 * sense key with us all the way through
3384 */ 3384 */
3385 if (!blk_pc_request(req)) 3385 if (!blk_pc_request(req))
3386 req->errors = 0; 3386 req->errors = 0;
3387 3387
3388 if (!uptodate) { 3388 if (!uptodate) {
3389 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) 3389 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
3390 printk("end_request: I/O error, dev %s, sector %llu\n", 3390 printk("end_request: I/O error, dev %s, sector %llu\n",
3391 req->rq_disk ? req->rq_disk->disk_name : "?", 3391 req->rq_disk ? req->rq_disk->disk_name : "?",
3392 (unsigned long long)req->sector); 3392 (unsigned long long)req->sector);
3393 } 3393 }
3394 3394
3395 if (blk_fs_request(req) && req->rq_disk) { 3395 if (blk_fs_request(req) && req->rq_disk) {
3396 const int rw = rq_data_dir(req); 3396 const int rw = rq_data_dir(req);
3397 3397
3398 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3398 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3399 } 3399 }
3400 3400
3401 total_bytes = bio_nbytes = 0; 3401 total_bytes = bio_nbytes = 0;
3402 while ((bio = req->bio) != NULL) { 3402 while ((bio = req->bio) != NULL) {
3403 int nbytes; 3403 int nbytes;
3404 3404
3405 if (nr_bytes >= bio->bi_size) { 3405 if (nr_bytes >= bio->bi_size) {
3406 req->bio = bio->bi_next; 3406 req->bio = bio->bi_next;
3407 nbytes = bio->bi_size; 3407 nbytes = bio->bi_size;
3408 req_bio_endio(req, bio, nbytes, error); 3408 req_bio_endio(req, bio, nbytes, error);
3409 next_idx = 0; 3409 next_idx = 0;
3410 bio_nbytes = 0; 3410 bio_nbytes = 0;
3411 } else { 3411 } else {
3412 int idx = bio->bi_idx + next_idx; 3412 int idx = bio->bi_idx + next_idx;
3413 3413
3414 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3414 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3415 blk_dump_rq_flags(req, "__end_that"); 3415 blk_dump_rq_flags(req, "__end_that");
3416 printk("%s: bio idx %d >= vcnt %d\n", 3416 printk("%s: bio idx %d >= vcnt %d\n",
3417 __FUNCTION__, 3417 __FUNCTION__,
3418 bio->bi_idx, bio->bi_vcnt); 3418 bio->bi_idx, bio->bi_vcnt);
3419 break; 3419 break;
3420 } 3420 }
3421 3421
3422 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3422 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3423 BIO_BUG_ON(nbytes > bio->bi_size); 3423 BIO_BUG_ON(nbytes > bio->bi_size);
3424 3424
3425 /* 3425 /*
3426 * not a complete bvec done 3426 * not a complete bvec done
3427 */ 3427 */
3428 if (unlikely(nbytes > nr_bytes)) { 3428 if (unlikely(nbytes > nr_bytes)) {
3429 bio_nbytes += nr_bytes; 3429 bio_nbytes += nr_bytes;
3430 total_bytes += nr_bytes; 3430 total_bytes += nr_bytes;
3431 break; 3431 break;
3432 } 3432 }
3433 3433
3434 /* 3434 /*
3435 * advance to the next vector 3435 * advance to the next vector
3436 */ 3436 */
3437 next_idx++; 3437 next_idx++;
3438 bio_nbytes += nbytes; 3438 bio_nbytes += nbytes;
3439 } 3439 }
3440 3440
3441 total_bytes += nbytes; 3441 total_bytes += nbytes;
3442 nr_bytes -= nbytes; 3442 nr_bytes -= nbytes;
3443 3443
3444 if ((bio = req->bio)) { 3444 if ((bio = req->bio)) {
3445 /* 3445 /*
3446 * end more in this run, or just return 'not-done' 3446 * end more in this run, or just return 'not-done'
3447 */ 3447 */
3448 if (unlikely(nr_bytes <= 0)) 3448 if (unlikely(nr_bytes <= 0))
3449 break; 3449 break;
3450 } 3450 }
3451 } 3451 }
3452 3452
3453 /* 3453 /*
3454 * completely done 3454 * completely done
3455 */ 3455 */
3456 if (!req->bio) 3456 if (!req->bio)
3457 return 0; 3457 return 0;
3458 3458
3459 /* 3459 /*
3460 * if the request wasn't completed, update state 3460 * if the request wasn't completed, update state
3461 */ 3461 */
3462 if (bio_nbytes) { 3462 if (bio_nbytes) {
3463 req_bio_endio(req, bio, bio_nbytes, error); 3463 req_bio_endio(req, bio, bio_nbytes, error);
3464 bio->bi_idx += next_idx; 3464 bio->bi_idx += next_idx;
3465 bio_iovec(bio)->bv_offset += nr_bytes; 3465 bio_iovec(bio)->bv_offset += nr_bytes;
3466 bio_iovec(bio)->bv_len -= nr_bytes; 3466 bio_iovec(bio)->bv_len -= nr_bytes;
3467 } 3467 }
3468 3468
3469 blk_recalc_rq_sectors(req, total_bytes >> 9); 3469 blk_recalc_rq_sectors(req, total_bytes >> 9);
3470 blk_recalc_rq_segments(req); 3470 blk_recalc_rq_segments(req);
3471 return 1; 3471 return 1;
3472 } 3472 }
3473 3473
3474 /** 3474 /**
3475 * end_that_request_first - end I/O on a request 3475 * end_that_request_first - end I/O on a request
3476 * @req: the request being processed 3476 * @req: the request being processed
3477 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3477 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3478 * @nr_sectors: number of sectors to end I/O on 3478 * @nr_sectors: number of sectors to end I/O on
3479 * 3479 *
3480 * Description: 3480 * Description:
3481 * Ends I/O on a number of sectors attached to @req, and sets it up 3481 * Ends I/O on a number of sectors attached to @req, and sets it up
3482 * for the next range of segments (if any) in the cluster. 3482 * for the next range of segments (if any) in the cluster.
3483 * 3483 *
3484 * Return: 3484 * Return:
3485 * 0 - we are done with this request, call end_that_request_last() 3485 * 0 - we are done with this request, call end_that_request_last()
3486 * 1 - still buffers pending for this request 3486 * 1 - still buffers pending for this request
3487 **/ 3487 **/
3488 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3488 int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3489 { 3489 {
3490 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3490 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3491 } 3491 }
3492 3492
3493 EXPORT_SYMBOL(end_that_request_first); 3493 EXPORT_SYMBOL(end_that_request_first);
3494 3494
3495 /** 3495 /**
3496 * end_that_request_chunk - end I/O on a request 3496 * end_that_request_chunk - end I/O on a request
3497 * @req: the request being processed 3497 * @req: the request being processed
3498 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3498 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3499 * @nr_bytes: number of bytes to complete 3499 * @nr_bytes: number of bytes to complete
3500 * 3500 *
3501 * Description: 3501 * Description:
3502 * Ends I/O on a number of bytes attached to @req, and sets it up 3502 * Ends I/O on a number of bytes attached to @req, and sets it up
3503 * for the next range of segments (if any). Like end_that_request_first(), 3503 * for the next range of segments (if any). Like end_that_request_first(),
3504 * but deals with bytes instead of sectors. 3504 * but deals with bytes instead of sectors.
3505 * 3505 *
3506 * Return: 3506 * Return:
3507 * 0 - we are done with this request, call end_that_request_last() 3507 * 0 - we are done with this request, call end_that_request_last()
3508 * 1 - still buffers pending for this request 3508 * 1 - still buffers pending for this request
3509 **/ 3509 **/
3510 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3510 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3511 { 3511 {
3512 return __end_that_request_first(req, uptodate, nr_bytes); 3512 return __end_that_request_first(req, uptodate, nr_bytes);
3513 } 3513 }
3514 3514
3515 EXPORT_SYMBOL(end_that_request_chunk); 3515 EXPORT_SYMBOL(end_that_request_chunk);
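A hedged sketch (not from the patch) of how a driver typically pairs the two stages above for a partial transfer: end_that_request_chunk() for the data part, then, only when it returns 0, dequeue and end_that_request_last() under the queue lock, mirroring what __end_request() below does. bytes_done is an assumption, and the sketch assumes the driver left the request on the queue until now.

#include <linux/blkdev.h>

static void my_finish_some_bytes(struct request_queue *q, struct request *rq,
                                 int uptodate, unsigned int bytes_done)
{
        unsigned long flags;

        if (end_that_request_chunk(rq, uptodate, bytes_done))
                return;                 /* buffers still pending, rq lives on */

        /* everything transferred: final bookkeeping needs the queue lock */
        spin_lock_irqsave(q->queue_lock, flags);
        blkdev_dequeue_request(rq);     /* rq was left queued until now */
        add_disk_randomness(rq->rq_disk);
        end_that_request_last(rq, uptodate);
        spin_unlock_irqrestore(q->queue_lock, flags);
}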
3516 3516
3517 /* 3517 /*
3518 * splice the completion data to a local structure and complete the 3518 * splice the completion data to a local structure and complete the
3519 * queued requests from there 3519 * queued requests from there
3520 */ 3520 */
3521 static void blk_done_softirq(struct softirq_action *h) 3521 static void blk_done_softirq(struct softirq_action *h)
3522 { 3522 {
3523 struct list_head *cpu_list, local_list; 3523 struct list_head *cpu_list, local_list;
3524 3524
3525 local_irq_disable(); 3525 local_irq_disable();
3526 cpu_list = &__get_cpu_var(blk_cpu_done); 3526 cpu_list = &__get_cpu_var(blk_cpu_done);
3527 list_replace_init(cpu_list, &local_list); 3527 list_replace_init(cpu_list, &local_list);
3528 local_irq_enable(); 3528 local_irq_enable();
3529 3529
3530 while (!list_empty(&local_list)) { 3530 while (!list_empty(&local_list)) {
3531 struct request *rq = list_entry(local_list.next, struct request, donelist); 3531 struct request *rq = list_entry(local_list.next, struct request, donelist);
3532 3532
3533 list_del_init(&rq->donelist); 3533 list_del_init(&rq->donelist);
3534 rq->q->softirq_done_fn(rq); 3534 rq->q->softirq_done_fn(rq);
3535 } 3535 }
3536 } 3536 }
3537 3537
3538 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, 3538 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
3539 void *hcpu) 3539 void *hcpu)
3540 { 3540 {
3541 /* 3541 /*
3542 * If a CPU goes away, splice its entries to the current CPU 3542 * If a CPU goes away, splice its entries to the current CPU
3543 * and trigger a run of the softirq 3543 * and trigger a run of the softirq
3544 */ 3544 */
3545 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3545 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3546 int cpu = (unsigned long) hcpu; 3546 int cpu = (unsigned long) hcpu;
3547 3547
3548 local_irq_disable(); 3548 local_irq_disable();
3549 list_splice_init(&per_cpu(blk_cpu_done, cpu), 3549 list_splice_init(&per_cpu(blk_cpu_done, cpu),
3550 &__get_cpu_var(blk_cpu_done)); 3550 &__get_cpu_var(blk_cpu_done));
3551 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3551 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3552 local_irq_enable(); 3552 local_irq_enable();
3553 } 3553 }
3554 3554
3555 return NOTIFY_OK; 3555 return NOTIFY_OK;
3556 } 3556 }
3557 3557
3558 3558
3559 static struct notifier_block blk_cpu_notifier __cpuinitdata = { 3559 static struct notifier_block blk_cpu_notifier __cpuinitdata = {
3560 .notifier_call = blk_cpu_notify, 3560 .notifier_call = blk_cpu_notify,
3561 }; 3561 };
3562 3562
3563 /** 3563 /**
3564 * blk_complete_request - end I/O on a request 3564 * blk_complete_request - end I/O on a request
3565 * @req: the request being processed 3565 * @req: the request being processed
3566 * 3566 *
3567 * Description: 3567 * Description:
3568 * Ends all I/O on a request. It does not handle partial completions, 3568 * Ends all I/O on a request. It does not handle partial completions,
3569 * unless the driver actually implements this in its completion callback 3569 * unless the driver actually implements this in its completion callback
3570 * through requeueing. The actual completion happens out-of-order, 3570 * through requeueing. The actual completion happens out-of-order,
3571 * through a softirq handler. The user must have registered a completion 3571 * through a softirq handler. The user must have registered a completion
3572 * callback through blk_queue_softirq_done(). 3572 * callback through blk_queue_softirq_done().
3573 **/ 3573 **/
3574 3574
3575 void blk_complete_request(struct request *req) 3575 void blk_complete_request(struct request *req)
3576 { 3576 {
3577 struct list_head *cpu_list; 3577 struct list_head *cpu_list;
3578 unsigned long flags; 3578 unsigned long flags;
3579 3579
3580 BUG_ON(!req->q->softirq_done_fn); 3580 BUG_ON(!req->q->softirq_done_fn);
3581 3581
3582 local_irq_save(flags); 3582 local_irq_save(flags);
3583 3583
3584 cpu_list = &__get_cpu_var(blk_cpu_done); 3584 cpu_list = &__get_cpu_var(blk_cpu_done);
3585 list_add_tail(&req->donelist, cpu_list); 3585 list_add_tail(&req->donelist, cpu_list);
3586 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3586 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3587 3587
3588 local_irq_restore(flags); 3588 local_irq_restore(flags);
3589 } 3589 }
3590 3590
3591 EXPORT_SYMBOL(blk_complete_request); 3591 EXPORT_SYMBOL(blk_complete_request);
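A hedged sketch (not in this patch) of the usual pairing: the driver registers a softirq completion handler once at setup with blk_queue_softirq_done(), its hard-IRQ handler only calls blk_complete_request(), and the heavier work runs later in BLOCK_SOFTIRQ context. All my_* names and the per-device structure are invented; the request is assumed to be a filesystem request that the driver's request_fn already dequeued.

#include <linux/blkdev.h>
#include <linux/interrupt.h>

struct my_dev {
        struct request_queue *queue;
        struct request *current_rq;     /* request the hardware is working on */
};

static void my_softirq_done(struct request *rq)
{
        struct request_queue *q = rq->q;
        unsigned long flags;

        /* data part, safe without the queue lock; fs request: all sectors */
        end_that_request_chunk(rq, 1, rq->hard_nr_sectors << 9);

        /* final bookkeeping needs the queue lock (see end_that_request_last) */
        spin_lock_irqsave(q->queue_lock, flags);
        end_that_request_last(rq, 1);
        spin_unlock_irqrestore(q->queue_lock, flags);
}

static irqreturn_t my_isr(int irq, void *data)
{
        struct my_dev *dev = data;

        /* cheap: just queues the request for the BLOCK_SOFTIRQ handler */
        blk_complete_request(dev->current_rq);
        return IRQ_HANDLED;
}

/* at probe time: blk_queue_softirq_done(dev->queue, my_softirq_done); */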
3592 3592
3593 /* 3593 /*
3594 * queue lock must be held 3594 * queue lock must be held
3595 */ 3595 */
3596 void end_that_request_last(struct request *req, int uptodate) 3596 void end_that_request_last(struct request *req, int uptodate)
3597 { 3597 {
3598 struct gendisk *disk = req->rq_disk; 3598 struct gendisk *disk = req->rq_disk;
3599 int error; 3599 int error;
3600 3600
3601 /* 3601 /*
3602 * extend uptodate bool to allow < 0 value to be direct io error 3602 * extend uptodate bool to allow < 0 value to be direct io error
3603 */ 3603 */
3604 error = 0; 3604 error = 0;
3605 if (end_io_error(uptodate)) 3605 if (end_io_error(uptodate))
3606 error = !uptodate ? -EIO : uptodate; 3606 error = !uptodate ? -EIO : uptodate;
3607 3607
3608 if (unlikely(laptop_mode) && blk_fs_request(req)) 3608 if (unlikely(laptop_mode) && blk_fs_request(req))
3609 laptop_io_completion(); 3609 laptop_io_completion();
3610 3610
3611 /* 3611 /*
3612 * Account IO completion. bar_rq isn't accounted as a normal 3612 * Account IO completion. bar_rq isn't accounted as a normal
3613 * IO on queueing nor completion. Accounting the containing 3613 * IO on queueing nor completion. Accounting the containing
3614 * request is enough. 3614 * request is enough.
3615 */ 3615 */
3616 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 3616 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
3617 unsigned long duration = jiffies - req->start_time; 3617 unsigned long duration = jiffies - req->start_time;
3618 const int rw = rq_data_dir(req); 3618 const int rw = rq_data_dir(req);
3619 3619
3620 __disk_stat_inc(disk, ios[rw]); 3620 __disk_stat_inc(disk, ios[rw]);
3621 __disk_stat_add(disk, ticks[rw], duration); 3621 __disk_stat_add(disk, ticks[rw], duration);
3622 disk_round_stats(disk); 3622 disk_round_stats(disk);
3623 disk->in_flight--; 3623 disk->in_flight--;
3624 } 3624 }
3625 if (req->end_io) 3625 if (req->end_io)
3626 req->end_io(req, error); 3626 req->end_io(req, error);
3627 else 3627 else
3628 __blk_put_request(req->q, req); 3628 __blk_put_request(req->q, req);
3629 } 3629 }
3630 3630
3631 EXPORT_SYMBOL(end_that_request_last); 3631 EXPORT_SYMBOL(end_that_request_last);
3632 3632
3633 void end_request(struct request *req, int uptodate) 3633 static inline void __end_request(struct request *rq, int uptodate,
3634 unsigned int nr_bytes, int dequeue)
3634 { 3635 {
3635 if (!end_that_request_first(req, uptodate, req->hard_cur_sectors)) { 3636 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) {
3636 add_disk_randomness(req->rq_disk); 3637 if (dequeue)
3637 blkdev_dequeue_request(req); 3638 blkdev_dequeue_request(rq);
3638 end_that_request_last(req, uptodate); 3639 add_disk_randomness(rq->rq_disk);
3640 end_that_request_last(rq, uptodate);
3639 } 3641 }
3640 } 3642 }
3641 3643
3644 static unsigned int rq_byte_size(struct request *rq)
3645 {
3646 if (blk_fs_request(rq))
3647 return rq->hard_nr_sectors << 9;
3648
3649 return rq->data_len;
3650 }
3651
3652 /**
3653 * end_queued_request - end all I/O on a queued request
3654 * @rq: the request being processed
3655 * @uptodate: error value or 0/1 uptodate flag
3656 *
3657 * Description:
3658 * Ends all I/O on a request, and removes it from the block layer queues.
3659 * Not suitable for normal IO completion, unless the driver still has
3660 * the request attached to the block layer.
3661 *
3662 **/
3663 void end_queued_request(struct request *rq, int uptodate)
3664 {
3665 __end_request(rq, uptodate, rq_byte_size(rq), 1);
3666 }
3667 EXPORT_SYMBOL(end_queued_request);
3668
3669 /**
3670 * end_dequeued_request - end all I/O on a dequeued request
3671 * @rq: the request being processed
3672 * @uptodate: error value or 0/1 uptodate flag
3673 *
3674 * Description:
3675 * Ends all I/O on a request. The request must already have been
3676 * dequeued using blkdev_dequeue_request(), as is normally the case
3677 * for most drivers.
3678 *
3679 **/
3680 void end_dequeued_request(struct request *rq, int uptodate)
3681 {
3682 __end_request(rq, uptodate, rq_byte_size(rq), 0);
3683 }
3684 EXPORT_SYMBOL(end_dequeued_request);
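As an illustration of why the dequeued variant exists (a sketch under assumptions, not code from the patch): many drivers pull requests off the queue in their request_fn before starting the transfer, and for those the queue-oriented helpers are the wrong tool. my_xfer() is a hypothetical placeholder for the real data transfer.

#include <linux/blkdev.h>

static int my_xfer(struct request *rq);         /* hypothetical: does the I/O */

static void my_request_fn(struct request_queue *q)
{
        struct request *rq;

        /* request_fn is entered with q->queue_lock held, irqs off */
        while ((rq = elv_next_request(q)) != NULL) {
                int err;

                blkdev_dequeue_request(rq);     /* rq is now off the queue */

                spin_unlock_irq(q->queue_lock);
                err = my_xfer(rq);              /* actual transfer */
                spin_lock_irq(q->queue_lock);

                /* rq was dequeued above, so use the dequeued helper */
                end_dequeued_request(rq, err ? 0 : 1);
        }
}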
3685
3686
3687 /**
3688 * end_request - end I/O on the current segment of the request
3689 * @rq: the request being processed
3690 * @uptodate: error value or 0/1 uptodate flag
3691 *
3692 * Description:
3693 * Ends I/O on the current segment of a request. If that is the only
3694 * remaining segment, the request is also completed and freed.
3695 *
3696 * This is a remnant of how older block drivers handled IO completions.
3697 * Modern drivers typically end IO on the full request in one go, unless
3698 * they have a residual value to account for. For that case this function
3699 * isn't really useful, unless the residual just happens to be the
3700 * full current segment. In other words, don't use this function in new
3701 * code. Either use end_queued_request()/end_dequeued_request(), or
3702 * end_that_request_chunk() (along with end_that_request_last()) for
3703 * partial completions.
3704 *
3705 **/
3706 void end_request(struct request *req, int uptodate)
3707 {
3708 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1);
3709 }
3642 EXPORT_SYMBOL(end_request); 3710 EXPORT_SYMBOL(end_request);
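For contrast, a sketch (not from the patch) of the older segment-at-a-time style that end_request() is kept around for: the request stays on the queue, and each pass moves req->current_nr_sectors starting at req->sector through req->buffer. my_do_segment() is a hypothetical placeholder.

#include <linux/blkdev.h>

static void my_do_segment(char *buf, sector_t sector,
                          unsigned int nr_sectors, int write);  /* hypothetical */

static void my_old_style_request_fn(struct request_queue *q)
{
        struct request *req;

        while ((req = elv_next_request(q)) != NULL) {
                if (!blk_fs_request(req)) {
                        end_request(req, 0);    /* fail non-fs requests */
                        continue;
                }
                my_do_segment(req->buffer, req->sector,
                              req->current_nr_sectors, rq_data_dir(req));
                /* advances to the next segment, or dequeues and frees req */
                end_request(req, 1);
        }
}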
3643 3711
3644 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 3712 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3645 struct bio *bio) 3713 struct bio *bio)
3646 { 3714 {
3647 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 3715 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
3648 rq->cmd_flags |= (bio->bi_rw & 3); 3716 rq->cmd_flags |= (bio->bi_rw & 3);
3649 3717
3650 rq->nr_phys_segments = bio_phys_segments(q, bio); 3718 rq->nr_phys_segments = bio_phys_segments(q, bio);
3651 rq->nr_hw_segments = bio_hw_segments(q, bio); 3719 rq->nr_hw_segments = bio_hw_segments(q, bio);
3652 rq->current_nr_sectors = bio_cur_sectors(bio); 3720 rq->current_nr_sectors = bio_cur_sectors(bio);
3653 rq->hard_cur_sectors = rq->current_nr_sectors; 3721 rq->hard_cur_sectors = rq->current_nr_sectors;
3654 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3722 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3655 rq->buffer = bio_data(bio); 3723 rq->buffer = bio_data(bio);
3656 rq->data_len = bio->bi_size; 3724 rq->data_len = bio->bi_size;
3657 3725
3658 rq->bio = rq->biotail = bio; 3726 rq->bio = rq->biotail = bio;
3659 3727
3660 if (bio->bi_bdev) 3728 if (bio->bi_bdev)
3661 rq->rq_disk = bio->bi_bdev->bd_disk; 3729 rq->rq_disk = bio->bi_bdev->bd_disk;
3662 } 3730 }
3663 3731
3664 int kblockd_schedule_work(struct work_struct *work) 3732 int kblockd_schedule_work(struct work_struct *work)
3665 { 3733 {
3666 return queue_work(kblockd_workqueue, work); 3734 return queue_work(kblockd_workqueue, work);
3667 } 3735 }
3668 3736
3669 EXPORT_SYMBOL(kblockd_schedule_work); 3737 EXPORT_SYMBOL(kblockd_schedule_work);
3670 3738
3671 void kblockd_flush_work(struct work_struct *work) 3739 void kblockd_flush_work(struct work_struct *work)
3672 { 3740 {
3673 cancel_work_sync(work); 3741 cancel_work_sync(work);
3674 } 3742 }
3675 EXPORT_SYMBOL(kblockd_flush_work); 3743 EXPORT_SYMBOL(kblockd_flush_work);
3676 3744
3677 int __init blk_dev_init(void) 3745 int __init blk_dev_init(void)
3678 { 3746 {
3679 int i; 3747 int i;
3680 3748
3681 kblockd_workqueue = create_workqueue("kblockd"); 3749 kblockd_workqueue = create_workqueue("kblockd");
3682 if (!kblockd_workqueue) 3750 if (!kblockd_workqueue)
3683 panic("Failed to create kblockd\n"); 3751 panic("Failed to create kblockd\n");
3684 3752
3685 request_cachep = kmem_cache_create("blkdev_requests", 3753 request_cachep = kmem_cache_create("blkdev_requests",
3686 sizeof(struct request), 0, SLAB_PANIC, NULL); 3754 sizeof(struct request), 0, SLAB_PANIC, NULL);
3687 3755
3688 requestq_cachep = kmem_cache_create("blkdev_queue", 3756 requestq_cachep = kmem_cache_create("blkdev_queue",
3689 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 3757 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3690 3758
3691 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3759 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3692 sizeof(struct io_context), 0, SLAB_PANIC, NULL); 3760 sizeof(struct io_context), 0, SLAB_PANIC, NULL);
3693 3761
3694 for_each_possible_cpu(i) 3762 for_each_possible_cpu(i)
3695 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3763 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3696 3764
3697 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3765 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
3698 register_hotcpu_notifier(&blk_cpu_notifier); 3766 register_hotcpu_notifier(&blk_cpu_notifier);
3699 3767
3700 blk_max_low_pfn = max_low_pfn - 1; 3768 blk_max_low_pfn = max_low_pfn - 1;
3701 blk_max_pfn = max_pfn - 1; 3769 blk_max_pfn = max_pfn - 1;
3702 3770
3703 return 0; 3771 return 0;
3704 } 3772 }
3705 3773
3706 /* 3774 /*
3707 * IO Context helper functions 3775 * IO Context helper functions
3708 */ 3776 */
3709 void put_io_context(struct io_context *ioc) 3777 void put_io_context(struct io_context *ioc)
3710 { 3778 {
3711 if (ioc == NULL) 3779 if (ioc == NULL)
3712 return; 3780 return;
3713 3781
3714 BUG_ON(atomic_read(&ioc->refcount) == 0); 3782 BUG_ON(atomic_read(&ioc->refcount) == 0);
3715 3783
3716 if (atomic_dec_and_test(&ioc->refcount)) { 3784 if (atomic_dec_and_test(&ioc->refcount)) {
3717 struct cfq_io_context *cic; 3785 struct cfq_io_context *cic;
3718 3786
3719 rcu_read_lock(); 3787 rcu_read_lock();
3720 if (ioc->aic && ioc->aic->dtor) 3788 if (ioc->aic && ioc->aic->dtor)
3721 ioc->aic->dtor(ioc->aic); 3789 ioc->aic->dtor(ioc->aic);
3722 if (ioc->cic_root.rb_node != NULL) { 3790 if (ioc->cic_root.rb_node != NULL) {
3723 struct rb_node *n = rb_first(&ioc->cic_root); 3791 struct rb_node *n = rb_first(&ioc->cic_root);
3724 3792
3725 cic = rb_entry(n, struct cfq_io_context, rb_node); 3793 cic = rb_entry(n, struct cfq_io_context, rb_node);
3726 cic->dtor(ioc); 3794 cic->dtor(ioc);
3727 } 3795 }
3728 rcu_read_unlock(); 3796 rcu_read_unlock();
3729 3797
3730 kmem_cache_free(iocontext_cachep, ioc); 3798 kmem_cache_free(iocontext_cachep, ioc);
3731 } 3799 }
3732 } 3800 }
3733 EXPORT_SYMBOL(put_io_context); 3801 EXPORT_SYMBOL(put_io_context);
3734 3802
3735 /* Called by the exiting task */ 3803 /* Called by the exiting task */
3736 void exit_io_context(void) 3804 void exit_io_context(void)
3737 { 3805 {
3738 struct io_context *ioc; 3806 struct io_context *ioc;
3739 struct cfq_io_context *cic; 3807 struct cfq_io_context *cic;
3740 3808
3741 task_lock(current); 3809 task_lock(current);
3742 ioc = current->io_context; 3810 ioc = current->io_context;
3743 current->io_context = NULL; 3811 current->io_context = NULL;
3744 task_unlock(current); 3812 task_unlock(current);
3745 3813
3746 ioc->task = NULL; 3814 ioc->task = NULL;
3747 if (ioc->aic && ioc->aic->exit) 3815 if (ioc->aic && ioc->aic->exit)
3748 ioc->aic->exit(ioc->aic); 3816 ioc->aic->exit(ioc->aic);
3749 if (ioc->cic_root.rb_node != NULL) { 3817 if (ioc->cic_root.rb_node != NULL) {
3750 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); 3818 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3751 cic->exit(ioc); 3819 cic->exit(ioc);
3752 } 3820 }
3753 3821
3754 put_io_context(ioc); 3822 put_io_context(ioc);
3755 } 3823 }
3756 3824
3757 /* 3825 /*
3758 * If the current task has no IO context then create one and initialise it. 3826 * If the current task has no IO context then create one and initialise it.
3759 * Otherwise, return its existing IO context. 3827 * Otherwise, return its existing IO context.
3760 * 3828 *
3761 * This returned IO context doesn't have a specifically elevated refcount, 3829 * This returned IO context doesn't have a specifically elevated refcount,
3762 * but since the current task itself holds a reference, the context can be 3830 * but since the current task itself holds a reference, the context can be
3763 * used in general code, so long as it stays within `current` context. 3831 * used in general code, so long as it stays within `current` context.
3764 */ 3832 */
3765 static struct io_context *current_io_context(gfp_t gfp_flags, int node) 3833 static struct io_context *current_io_context(gfp_t gfp_flags, int node)
3766 { 3834 {
3767 struct task_struct *tsk = current; 3835 struct task_struct *tsk = current;
3768 struct io_context *ret; 3836 struct io_context *ret;
3769 3837
3770 ret = tsk->io_context; 3838 ret = tsk->io_context;
3771 if (likely(ret)) 3839 if (likely(ret))
3772 return ret; 3840 return ret;
3773 3841
3774 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 3842 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
3775 if (ret) { 3843 if (ret) {
3776 atomic_set(&ret->refcount, 1); 3844 atomic_set(&ret->refcount, 1);
3777 ret->task = current; 3845 ret->task = current;
3778 ret->ioprio_changed = 0; 3846 ret->ioprio_changed = 0;
3779 ret->last_waited = jiffies; /* doesn't matter... */ 3847 ret->last_waited = jiffies; /* doesn't matter... */
3780 ret->nr_batch_requests = 0; /* because this is 0 */ 3848 ret->nr_batch_requests = 0; /* because this is 0 */
3781 ret->aic = NULL; 3849 ret->aic = NULL;
3782 ret->cic_root.rb_node = NULL; 3850 ret->cic_root.rb_node = NULL;
3783 ret->ioc_data = NULL; 3851 ret->ioc_data = NULL;
3784 /* make sure set_task_ioprio() sees the settings above */ 3852 /* make sure set_task_ioprio() sees the settings above */
3785 smp_wmb(); 3853 smp_wmb();
3786 tsk->io_context = ret; 3854 tsk->io_context = ret;
3787 } 3855 }
3788 3856
3789 return ret; 3857 return ret;
3790 } 3858 }
3791 3859
3792 /* 3860 /*
3793 * If the current task has no IO context then create one and initialise it. 3861 * If the current task has no IO context then create one and initialise it.
3794 * If it does have a context, take a ref on it. 3862 * If it does have a context, take a ref on it.
3795 * 3863 *
3796 * This is always called in the context of the task which submitted the I/O. 3864 * This is always called in the context of the task which submitted the I/O.
3797 */ 3865 */
3798 struct io_context *get_io_context(gfp_t gfp_flags, int node) 3866 struct io_context *get_io_context(gfp_t gfp_flags, int node)
3799 { 3867 {
3800 struct io_context *ret; 3868 struct io_context *ret;
3801 ret = current_io_context(gfp_flags, node); 3869 ret = current_io_context(gfp_flags, node);
3802 if (likely(ret)) 3870 if (likely(ret))
3803 atomic_inc(&ret->refcount); 3871 atomic_inc(&ret->refcount);
3804 return ret; 3872 return ret;
3805 } 3873 }
3806 EXPORT_SYMBOL(get_io_context); 3874 EXPORT_SYMBOL(get_io_context);
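A short sketch (assumptions only, not from the patch) of the pairing implied above: get_io_context() takes its own reference and must be matched by put_io_context(), whereas a context obtained through current_io_context() is merely borrowed from the task.

#include <linux/blkdev.h>

static void my_peek_at_io_context(void)
{
        struct io_context *ioc = get_io_context(GFP_KERNEL, -1);

        if (!ioc)
                return;

        /* ... inspect ioc (e.g. ioc->nr_batch_requests) ... */

        put_io_context(ioc);    /* drop the reference get_io_context() took */
}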
3807 3875
3808 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3876 void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3809 { 3877 {
3810 struct io_context *src = *psrc; 3878 struct io_context *src = *psrc;
3811 struct io_context *dst = *pdst; 3879 struct io_context *dst = *pdst;
3812 3880
3813 if (src) { 3881 if (src) {
3814 BUG_ON(atomic_read(&src->refcount) == 0); 3882 BUG_ON(atomic_read(&src->refcount) == 0);
3815 atomic_inc(&src->refcount); 3883 atomic_inc(&src->refcount);
3816 put_io_context(dst); 3884 put_io_context(dst);
3817 *pdst = src; 3885 *pdst = src;
3818 } 3886 }
3819 } 3887 }
3820 EXPORT_SYMBOL(copy_io_context); 3888 EXPORT_SYMBOL(copy_io_context);
3821 3889
3822 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3890 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3823 { 3891 {
3824 struct io_context *temp; 3892 struct io_context *temp;
3825 temp = *ioc1; 3893 temp = *ioc1;
3826 *ioc1 = *ioc2; 3894 *ioc1 = *ioc2;
3827 *ioc2 = temp; 3895 *ioc2 = temp;
3828 } 3896 }
3829 EXPORT_SYMBOL(swap_io_context); 3897 EXPORT_SYMBOL(swap_io_context);
3830 3898
3831 /* 3899 /*
3832 * sysfs parts below 3900 * sysfs parts below
3833 */ 3901 */
3834 struct queue_sysfs_entry { 3902 struct queue_sysfs_entry {
3835 struct attribute attr; 3903 struct attribute attr;
3836 ssize_t (*show)(struct request_queue *, char *); 3904 ssize_t (*show)(struct request_queue *, char *);
3837 ssize_t (*store)(struct request_queue *, const char *, size_t); 3905 ssize_t (*store)(struct request_queue *, const char *, size_t);
3838 }; 3906 };
3839 3907
3840 static ssize_t 3908 static ssize_t
3841 queue_var_show(unsigned int var, char *page) 3909 queue_var_show(unsigned int var, char *page)
3842 { 3910 {
3843 return sprintf(page, "%d\n", var); 3911 return sprintf(page, "%d\n", var);
3844 } 3912 }
3845 3913
3846 static ssize_t 3914 static ssize_t
3847 queue_var_store(unsigned long *var, const char *page, size_t count) 3915 queue_var_store(unsigned long *var, const char *page, size_t count)
3848 { 3916 {
3849 char *p = (char *) page; 3917 char *p = (char *) page;
3850 3918
3851 *var = simple_strtoul(p, &p, 10); 3919 *var = simple_strtoul(p, &p, 10);
3852 return count; 3920 return count;
3853 } 3921 }
3854 3922
3855 static ssize_t queue_requests_show(struct request_queue *q, char *page) 3923 static ssize_t queue_requests_show(struct request_queue *q, char *page)
3856 { 3924 {
3857 return queue_var_show(q->nr_requests, (page)); 3925 return queue_var_show(q->nr_requests, (page));
3858 } 3926 }
3859 3927
3860 static ssize_t 3928 static ssize_t
3861 queue_requests_store(struct request_queue *q, const char *page, size_t count) 3929 queue_requests_store(struct request_queue *q, const char *page, size_t count)
3862 { 3930 {
3863 struct request_list *rl = &q->rq; 3931 struct request_list *rl = &q->rq;
3864 unsigned long nr; 3932 unsigned long nr;
3865 int ret = queue_var_store(&nr, page, count); 3933 int ret = queue_var_store(&nr, page, count);
3866 if (nr < BLKDEV_MIN_RQ) 3934 if (nr < BLKDEV_MIN_RQ)
3867 nr = BLKDEV_MIN_RQ; 3935 nr = BLKDEV_MIN_RQ;
3868 3936
3869 spin_lock_irq(q->queue_lock); 3937 spin_lock_irq(q->queue_lock);
3870 q->nr_requests = nr; 3938 q->nr_requests = nr;
3871 blk_queue_congestion_threshold(q); 3939 blk_queue_congestion_threshold(q);
3872 3940
3873 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3941 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3874 blk_set_queue_congested(q, READ); 3942 blk_set_queue_congested(q, READ);
3875 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3943 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3876 blk_clear_queue_congested(q, READ); 3944 blk_clear_queue_congested(q, READ);
3877 3945
3878 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3946 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3879 blk_set_queue_congested(q, WRITE); 3947 blk_set_queue_congested(q, WRITE);
3880 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3948 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3881 blk_clear_queue_congested(q, WRITE); 3949 blk_clear_queue_congested(q, WRITE);
3882 3950
3883 if (rl->count[READ] >= q->nr_requests) { 3951 if (rl->count[READ] >= q->nr_requests) {
3884 blk_set_queue_full(q, READ); 3952 blk_set_queue_full(q, READ);
3885 } else if (rl->count[READ]+1 <= q->nr_requests) { 3953 } else if (rl->count[READ]+1 <= q->nr_requests) {
3886 blk_clear_queue_full(q, READ); 3954 blk_clear_queue_full(q, READ);
3887 wake_up(&rl->wait[READ]); 3955 wake_up(&rl->wait[READ]);
3888 } 3956 }
3889 3957
3890 if (rl->count[WRITE] >= q->nr_requests) { 3958 if (rl->count[WRITE] >= q->nr_requests) {
3891 blk_set_queue_full(q, WRITE); 3959 blk_set_queue_full(q, WRITE);
3892 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3960 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3893 blk_clear_queue_full(q, WRITE); 3961 blk_clear_queue_full(q, WRITE);
3894 wake_up(&rl->wait[WRITE]); 3962 wake_up(&rl->wait[WRITE]);
3895 } 3963 }
3896 spin_unlock_irq(q->queue_lock); 3964 spin_unlock_irq(q->queue_lock);
3897 return ret; 3965 return ret;
3898 } 3966 }
3899 3967
3900 static ssize_t queue_ra_show(struct request_queue *q, char *page) 3968 static ssize_t queue_ra_show(struct request_queue *q, char *page)
3901 { 3969 {
3902 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3970 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3903 3971
3904 return queue_var_show(ra_kb, (page)); 3972 return queue_var_show(ra_kb, (page));
3905 } 3973 }
3906 3974
3907 static ssize_t 3975 static ssize_t
3908 queue_ra_store(struct request_queue *q, const char *page, size_t count) 3976 queue_ra_store(struct request_queue *q, const char *page, size_t count)
3909 { 3977 {
3910 unsigned long ra_kb; 3978 unsigned long ra_kb;
3911 ssize_t ret = queue_var_store(&ra_kb, page, count); 3979 ssize_t ret = queue_var_store(&ra_kb, page, count);
3912 3980
3913 spin_lock_irq(q->queue_lock); 3981 spin_lock_irq(q->queue_lock);
3914 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 3982 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3915 spin_unlock_irq(q->queue_lock); 3983 spin_unlock_irq(q->queue_lock);
3916 3984
3917 return ret; 3985 return ret;
3918 } 3986 }
3919 3987
3920 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 3988 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3921 { 3989 {
3922 int max_sectors_kb = q->max_sectors >> 1; 3990 int max_sectors_kb = q->max_sectors >> 1;
3923 3991
3924 return queue_var_show(max_sectors_kb, (page)); 3992 return queue_var_show(max_sectors_kb, (page));
3925 } 3993 }
3926 3994
3927 static ssize_t 3995 static ssize_t
3928 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 3996 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
3929 { 3997 {
3930 unsigned long max_sectors_kb, 3998 unsigned long max_sectors_kb,
3931 max_hw_sectors_kb = q->max_hw_sectors >> 1, 3999 max_hw_sectors_kb = q->max_hw_sectors >> 1,
3932 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 4000 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
3933 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 4001 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
3934 int ra_kb; 4002 int ra_kb;
3935 4003
3936 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 4004 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
3937 return -EINVAL; 4005 return -EINVAL;
3938 /* 4006 /*
3939 * Take the queue lock to update the readahead and max_sectors 4007 * Take the queue lock to update the readahead and max_sectors
3940 * values synchronously: 4008 * values synchronously:
3941 */ 4009 */
3942 spin_lock_irq(q->queue_lock); 4010 spin_lock_irq(q->queue_lock);
3943 /* 4011 /*
3944 * Trim readahead window as well, if necessary: 4012 * Trim readahead window as well, if necessary:
3945 */ 4013 */
3946 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 4014 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3947 if (ra_kb > max_sectors_kb) 4015 if (ra_kb > max_sectors_kb)
3948 q->backing_dev_info.ra_pages = 4016 q->backing_dev_info.ra_pages =
3949 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 4017 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
3950 4018
3951 q->max_sectors = max_sectors_kb << 1; 4019 q->max_sectors = max_sectors_kb << 1;
3952 spin_unlock_irq(q->queue_lock); 4020 spin_unlock_irq(q->queue_lock);
3953 4021
3954 return ret; 4022 return ret;
3955 } 4023 }
3956 4024
3957 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 4025 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
3958 { 4026 {
3959 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 4027 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
3960 4028
3961 return queue_var_show(max_hw_sectors_kb, (page)); 4029 return queue_var_show(max_hw_sectors_kb, (page));
3962 } 4030 }
3963 4031
3964 4032
3965 static struct queue_sysfs_entry queue_requests_entry = { 4033 static struct queue_sysfs_entry queue_requests_entry = {
3966 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 4034 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
3967 .show = queue_requests_show, 4035 .show = queue_requests_show,
3968 .store = queue_requests_store, 4036 .store = queue_requests_store,
3969 }; 4037 };
3970 4038
3971 static struct queue_sysfs_entry queue_ra_entry = { 4039 static struct queue_sysfs_entry queue_ra_entry = {
3972 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 4040 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
3973 .show = queue_ra_show, 4041 .show = queue_ra_show,
3974 .store = queue_ra_store, 4042 .store = queue_ra_store,
3975 }; 4043 };
3976 4044
3977 static struct queue_sysfs_entry queue_max_sectors_entry = { 4045 static struct queue_sysfs_entry queue_max_sectors_entry = {
3978 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 4046 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
3979 .show = queue_max_sectors_show, 4047 .show = queue_max_sectors_show,
3980 .store = queue_max_sectors_store, 4048 .store = queue_max_sectors_store,
3981 }; 4049 };
3982 4050
3983 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 4051 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
3984 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 4052 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
3985 .show = queue_max_hw_sectors_show, 4053 .show = queue_max_hw_sectors_show,
3986 }; 4054 };
3987 4055
3988 static struct queue_sysfs_entry queue_iosched_entry = { 4056 static struct queue_sysfs_entry queue_iosched_entry = {
3989 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 4057 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
3990 .show = elv_iosched_show, 4058 .show = elv_iosched_show,
3991 .store = elv_iosched_store, 4059 .store = elv_iosched_store,
3992 }; 4060 };
3993 4061
3994 static struct attribute *default_attrs[] = { 4062 static struct attribute *default_attrs[] = {
3995 &queue_requests_entry.attr, 4063 &queue_requests_entry.attr,
3996 &queue_ra_entry.attr, 4064 &queue_ra_entry.attr,
3997 &queue_max_hw_sectors_entry.attr, 4065 &queue_max_hw_sectors_entry.attr,
3998 &queue_max_sectors_entry.attr, 4066 &queue_max_sectors_entry.attr,
3999 &queue_iosched_entry.attr, 4067 &queue_iosched_entry.attr,
4000 NULL, 4068 NULL,
4001 }; 4069 };
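The default_attrs[] table above is the extension point for queue attributes. As an illustration only (not part of this commit), a read-only attribute exposing the hardware sector size would follow the same show/entry pattern and then be added to that table; the hw_sector_size name here is hypothetical:

static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page)
{
	return queue_var_show(q->hardsect_size, page);
}

static struct queue_sysfs_entry queue_hw_sector_size_entry = {
	.attr = {.name = "hw_sector_size", .mode = S_IRUGO },
	.show = queue_hw_sector_size_show,
};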
4002 4070
4003 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 4071 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
4004 4072
4005 static ssize_t 4073 static ssize_t
4006 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4074 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4007 { 4075 {
4008 struct queue_sysfs_entry *entry = to_queue(attr); 4076 struct queue_sysfs_entry *entry = to_queue(attr);
4009 struct request_queue *q = 4077 struct request_queue *q =
4010 container_of(kobj, struct request_queue, kobj); 4078 container_of(kobj, struct request_queue, kobj);
4011 ssize_t res; 4079 ssize_t res;
4012 4080
4013 if (!entry->show) 4081 if (!entry->show)
4014 return -EIO; 4082 return -EIO;
4015 mutex_lock(&q->sysfs_lock); 4083 mutex_lock(&q->sysfs_lock);
4016 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4084 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4017 mutex_unlock(&q->sysfs_lock); 4085 mutex_unlock(&q->sysfs_lock);
4018 return -ENOENT; 4086 return -ENOENT;
4019 } 4087 }
4020 res = entry->show(q, page); 4088 res = entry->show(q, page);
4021 mutex_unlock(&q->sysfs_lock); 4089 mutex_unlock(&q->sysfs_lock);
4022 return res; 4090 return res;
4023 } 4091 }
4024 4092
4025 static ssize_t 4093 static ssize_t
4026 queue_attr_store(struct kobject *kobj, struct attribute *attr, 4094 queue_attr_store(struct kobject *kobj, struct attribute *attr,
4027 const char *page, size_t length) 4095 const char *page, size_t length)
4028 { 4096 {
4029 struct queue_sysfs_entry *entry = to_queue(attr); 4097 struct queue_sysfs_entry *entry = to_queue(attr);
4030 struct request_queue *q = container_of(kobj, struct request_queue, kobj); 4098 struct request_queue *q = container_of(kobj, struct request_queue, kobj);
4031 4099
4032 ssize_t res; 4100 ssize_t res;
4033 4101
4034 if (!entry->store) 4102 if (!entry->store)
4035 return -EIO; 4103 return -EIO;
4036 mutex_lock(&q->sysfs_lock); 4104 mutex_lock(&q->sysfs_lock);
4037 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4105 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4038 mutex_unlock(&q->sysfs_lock); 4106 mutex_unlock(&q->sysfs_lock);
4039 return -ENOENT; 4107 return -ENOENT;
4040 } 4108 }
4041 res = entry->store(q, page, length); 4109 res = entry->store(q, page, length);
4042 mutex_unlock(&q->sysfs_lock); 4110 mutex_unlock(&q->sysfs_lock);
4043 return res; 4111 return res;
4044 } 4112 }
4045 4113
4046 static struct sysfs_ops queue_sysfs_ops = { 4114 static struct sysfs_ops queue_sysfs_ops = {
4047 .show = queue_attr_show, 4115 .show = queue_attr_show,
4048 .store = queue_attr_store, 4116 .store = queue_attr_store,
4049 }; 4117 };
4050 4118
4051 static struct kobj_type queue_ktype = { 4119 static struct kobj_type queue_ktype = {
4052 .sysfs_ops = &queue_sysfs_ops, 4120 .sysfs_ops = &queue_sysfs_ops,
4053 .default_attrs = default_attrs, 4121 .default_attrs = default_attrs,
4054 .release = blk_release_queue, 4122 .release = blk_release_queue,
4055 }; 4123 };
4056 4124
4057 int blk_register_queue(struct gendisk *disk) 4125 int blk_register_queue(struct gendisk *disk)
4058 { 4126 {
4059 int ret; 4127 int ret;
4060 4128
4061 struct request_queue *q = disk->queue; 4129 struct request_queue *q = disk->queue;
4062 4130
4063 if (!q || !q->request_fn) 4131 if (!q || !q->request_fn)
4064 return -ENXIO; 4132 return -ENXIO;
4065 4133
4066 q->kobj.parent = kobject_get(&disk->kobj); 4134 q->kobj.parent = kobject_get(&disk->kobj);
4067 4135
4068 ret = kobject_add(&q->kobj); 4136 ret = kobject_add(&q->kobj);
4069 if (ret < 0) 4137 if (ret < 0)
4070 return ret; 4138 return ret;
4071 4139
4072 kobject_uevent(&q->kobj, KOBJ_ADD); 4140 kobject_uevent(&q->kobj, KOBJ_ADD);
4073 4141
4074 ret = elv_register_queue(q); 4142 ret = elv_register_queue(q);
4075 if (ret) { 4143 if (ret) {
4076 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4144 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4077 kobject_del(&q->kobj); 4145 kobject_del(&q->kobj);
4078 return ret; 4146 return ret;
4079 } 4147 }
4080 4148
4081 return 0; 4149 return 0;
4082 } 4150 }
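blk_register_queue() is normally reached through add_disk() rather than called directly by drivers. A hedged sketch of the usual driver-side setup, with hypothetical example_* names:

static DEFINE_SPINLOCK(example_lock);

static void example_request_fn(struct request_queue *q)
{
	/* pull requests off the queue and hand them to the hardware */
}

static int example_attach_disk(struct gendisk *disk)
{
	struct request_queue *q;

	q = blk_init_queue(example_request_fn, &example_lock);
	if (!q)
		return -ENOMEM;

	disk->queue = q;
	add_disk(disk);		/* ends up registering the queue kobject shown above */
	return 0;
}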
4083 4151
4084 void blk_unregister_queue(struct gendisk *disk) 4152 void blk_unregister_queue(struct gendisk *disk)
4085 { 4153 {
4086 struct request_queue *q = disk->queue; 4154 struct request_queue *q = disk->queue;
4087 4155
4088 if (q && q->request_fn) { 4156 if (q && q->request_fn) {
4089 elv_unregister_queue(q); 4157 elv_unregister_queue(q);
4090 4158
4091 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4159 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4092 kobject_del(&q->kobj); 4160 kobject_del(&q->kobj);
4093 kobject_put(&disk->kobj); 4161 kobject_put(&disk->kobj);
4094 } 4162 }
4095 } 4163 }
4096 4164
include/linux/blkdev.h
1 #ifndef _LINUX_BLKDEV_H 1 #ifndef _LINUX_BLKDEV_H
2 #define _LINUX_BLKDEV_H 2 #define _LINUX_BLKDEV_H
3 3
4 #ifdef CONFIG_BLOCK 4 #ifdef CONFIG_BLOCK
5 5
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/major.h> 7 #include <linux/major.h>
8 #include <linux/genhd.h> 8 #include <linux/genhd.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/timer.h> 10 #include <linux/timer.h>
11 #include <linux/workqueue.h> 11 #include <linux/workqueue.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <linux/mempool.h> 15 #include <linux/mempool.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/stringify.h> 18 #include <linux/stringify.h>
19 #include <linux/bsg.h> 19 #include <linux/bsg.h>
20 20
21 #include <asm/scatterlist.h> 21 #include <asm/scatterlist.h>
22 22
23 struct scsi_ioctl_command; 23 struct scsi_ioctl_command;
24 24
25 struct request_queue; 25 struct request_queue;
26 typedef struct request_queue request_queue_t __deprecated; 26 typedef struct request_queue request_queue_t __deprecated;
27 struct elevator_queue; 27 struct elevator_queue;
28 typedef struct elevator_queue elevator_t; 28 typedef struct elevator_queue elevator_t;
29 struct request_pm_state; 29 struct request_pm_state;
30 struct blk_trace; 30 struct blk_trace;
31 struct request; 31 struct request;
32 struct sg_io_hdr; 32 struct sg_io_hdr;
33 33
34 #define BLKDEV_MIN_RQ 4 34 #define BLKDEV_MIN_RQ 4
35 #define BLKDEV_MAX_RQ 128 /* Default maximum */ 35 #define BLKDEV_MAX_RQ 128 /* Default maximum */
36 36
37 /* 37 /*
38 * This is the per-process anticipatory I/O scheduler state. 38 * This is the per-process anticipatory I/O scheduler state.
39 */ 39 */
40 struct as_io_context { 40 struct as_io_context {
41 spinlock_t lock; 41 spinlock_t lock;
42 42
43 void (*dtor)(struct as_io_context *aic); /* destructor */ 43 void (*dtor)(struct as_io_context *aic); /* destructor */
44 void (*exit)(struct as_io_context *aic); /* called on task exit */ 44 void (*exit)(struct as_io_context *aic); /* called on task exit */
45 45
46 unsigned long state; 46 unsigned long state;
47 atomic_t nr_queued; /* queued reads & sync writes */ 47 atomic_t nr_queued; /* queued reads & sync writes */
48 atomic_t nr_dispatched; /* number of requests gone to the drivers */ 48 atomic_t nr_dispatched; /* number of requests gone to the drivers */
49 49
50 /* IO History tracking */ 50 /* IO History tracking */
51 /* Thinktime */ 51 /* Thinktime */
52 unsigned long last_end_request; 52 unsigned long last_end_request;
53 unsigned long ttime_total; 53 unsigned long ttime_total;
54 unsigned long ttime_samples; 54 unsigned long ttime_samples;
55 unsigned long ttime_mean; 55 unsigned long ttime_mean;
56 /* Layout pattern */ 56 /* Layout pattern */
57 unsigned int seek_samples; 57 unsigned int seek_samples;
58 sector_t last_request_pos; 58 sector_t last_request_pos;
59 u64 seek_total; 59 u64 seek_total;
60 sector_t seek_mean; 60 sector_t seek_mean;
61 }; 61 };
62 62
63 struct cfq_queue; 63 struct cfq_queue;
64 struct cfq_io_context { 64 struct cfq_io_context {
65 struct rb_node rb_node; 65 struct rb_node rb_node;
66 void *key; 66 void *key;
67 67
68 struct cfq_queue *cfqq[2]; 68 struct cfq_queue *cfqq[2];
69 69
70 struct io_context *ioc; 70 struct io_context *ioc;
71 71
72 unsigned long last_end_request; 72 unsigned long last_end_request;
73 sector_t last_request_pos; 73 sector_t last_request_pos;
74 74
75 unsigned long ttime_total; 75 unsigned long ttime_total;
76 unsigned long ttime_samples; 76 unsigned long ttime_samples;
77 unsigned long ttime_mean; 77 unsigned long ttime_mean;
78 78
79 unsigned int seek_samples; 79 unsigned int seek_samples;
80 u64 seek_total; 80 u64 seek_total;
81 sector_t seek_mean; 81 sector_t seek_mean;
82 82
83 struct list_head queue_list; 83 struct list_head queue_list;
84 84
85 void (*dtor)(struct io_context *); /* destructor */ 85 void (*dtor)(struct io_context *); /* destructor */
86 void (*exit)(struct io_context *); /* called on task exit */ 86 void (*exit)(struct io_context *); /* called on task exit */
87 }; 87 };
88 88
89 /* 89 /*
90 * This is the per-process I/O subsystem state. It is refcounted and 90 * This is the per-process I/O subsystem state. It is refcounted and
91 * kmalloc'ed. Currently all fields are modified in process io context 91 * kmalloc'ed. Currently all fields are modified in process io context
92 * (apart from the atomic refcount), so require no locking. 92 * (apart from the atomic refcount), so require no locking.
93 */ 93 */
94 struct io_context { 94 struct io_context {
95 atomic_t refcount; 95 atomic_t refcount;
96 struct task_struct *task; 96 struct task_struct *task;
97 97
98 unsigned int ioprio_changed; 98 unsigned int ioprio_changed;
99 99
100 /* 100 /*
101 * For request batching 101 * For request batching
102 */ 102 */
103 unsigned long last_waited; /* Time last woken after wait for request */ 103 unsigned long last_waited; /* Time last woken after wait for request */
104 int nr_batch_requests; /* Number of requests left in the batch */ 104 int nr_batch_requests; /* Number of requests left in the batch */
105 105
106 struct as_io_context *aic; 106 struct as_io_context *aic;
107 struct rb_root cic_root; 107 struct rb_root cic_root;
108 void *ioc_data; 108 void *ioc_data;
109 }; 109 };
110 110
111 void put_io_context(struct io_context *ioc); 111 void put_io_context(struct io_context *ioc);
112 void exit_io_context(void); 112 void exit_io_context(void);
113 struct io_context *get_io_context(gfp_t gfp_flags, int node); 113 struct io_context *get_io_context(gfp_t gfp_flags, int node);
114 void copy_io_context(struct io_context **pdst, struct io_context **psrc); 114 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); 115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
116 116
117 struct request; 117 struct request;
118 typedef void (rq_end_io_fn)(struct request *, int); 118 typedef void (rq_end_io_fn)(struct request *, int);
119 119
120 struct request_list { 120 struct request_list {
121 int count[2]; 121 int count[2];
122 int starved[2]; 122 int starved[2];
123 int elvpriv; 123 int elvpriv;
124 mempool_t *rq_pool; 124 mempool_t *rq_pool;
125 wait_queue_head_t wait[2]; 125 wait_queue_head_t wait[2];
126 }; 126 };
127 127
128 /* 128 /*
129 * request command types 129 * request command types
130 */ 130 */
131 enum rq_cmd_type_bits { 131 enum rq_cmd_type_bits {
132 REQ_TYPE_FS = 1, /* fs request */ 132 REQ_TYPE_FS = 1, /* fs request */
133 REQ_TYPE_BLOCK_PC, /* scsi command */ 133 REQ_TYPE_BLOCK_PC, /* scsi command */
134 REQ_TYPE_SENSE, /* sense request */ 134 REQ_TYPE_SENSE, /* sense request */
135 REQ_TYPE_PM_SUSPEND, /* suspend request */ 135 REQ_TYPE_PM_SUSPEND, /* suspend request */
136 REQ_TYPE_PM_RESUME, /* resume request */ 136 REQ_TYPE_PM_RESUME, /* resume request */
137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ 137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */
138 REQ_TYPE_FLUSH, /* flush request */ 138 REQ_TYPE_FLUSH, /* flush request */
139 REQ_TYPE_SPECIAL, /* driver defined type */ 139 REQ_TYPE_SPECIAL, /* driver defined type */
140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ 140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */
141 /* 141 /*
142 * for ATA/ATAPI devices. this really doesn't belong here, ide should 142 * for ATA/ATAPI devices. this really doesn't belong here, ide should
143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver 143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
144 * private REQ_LB opcodes to differentiate what type of request this is 144 * private REQ_LB opcodes to differentiate what type of request this is
145 */ 145 */
146 REQ_TYPE_ATA_CMD, 146 REQ_TYPE_ATA_CMD,
147 REQ_TYPE_ATA_TASK, 147 REQ_TYPE_ATA_TASK,
148 REQ_TYPE_ATA_TASKFILE, 148 REQ_TYPE_ATA_TASKFILE,
149 REQ_TYPE_ATA_PC, 149 REQ_TYPE_ATA_PC,
150 }; 150 };
151 151
152 /* 152 /*
153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being 153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being
154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a 154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a
155 * SCSI cdb). 155 * SCSI cdb).
156 * 156 *
157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need, 157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need,
158 * typically to differentiate REQ_TYPE_SPECIAL requests. 158 * typically to differentiate REQ_TYPE_SPECIAL requests.
159 * 159 *
160 */ 160 */
161 enum { 161 enum {
162 /* 162 /*
163 * just examples for now 163 * just examples for now
164 */ 164 */
165 REQ_LB_OP_EJECT = 0x40, /* eject request */ 165 REQ_LB_OP_EJECT = 0x40, /* eject request */
166 REQ_LB_OP_FLUSH = 0x41, /* flush device */ 166 REQ_LB_OP_FLUSH = 0x41, /* flush device */
167 }; 167 };
168 168
169 /* 169 /*
170 * request type modified bits. first three bits match BIO_RW* bits, important 170 * request type modified bits. first three bits match BIO_RW* bits, important
171 */ 171 */
172 enum rq_flag_bits { 172 enum rq_flag_bits {
173 __REQ_RW, /* not set, read. set, write */ 173 __REQ_RW, /* not set, read. set, write */
174 __REQ_FAILFAST, /* no low level driver retries */ 174 __REQ_FAILFAST, /* no low level driver retries */
175 __REQ_SORTED, /* elevator knows about this request */ 175 __REQ_SORTED, /* elevator knows about this request */
176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ 176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
177 __REQ_HARDBARRIER, /* may not be passed by drive either */ 177 __REQ_HARDBARRIER, /* may not be passed by drive either */
178 __REQ_FUA, /* forced unit access */ 178 __REQ_FUA, /* forced unit access */
179 __REQ_NOMERGE, /* don't touch this for merging */ 179 __REQ_NOMERGE, /* don't touch this for merging */
180 __REQ_STARTED, /* drive already may have started this one */ 180 __REQ_STARTED, /* drive already may have started this one */
181 __REQ_DONTPREP, /* don't call prep for this one */ 181 __REQ_DONTPREP, /* don't call prep for this one */
182 __REQ_QUEUED, /* uses queueing */ 182 __REQ_QUEUED, /* uses queueing */
183 __REQ_ELVPRIV, /* elevator private data attached */ 183 __REQ_ELVPRIV, /* elevator private data attached */
184 __REQ_FAILED, /* set if the request failed */ 184 __REQ_FAILED, /* set if the request failed */
185 __REQ_QUIET, /* don't worry about errors */ 185 __REQ_QUIET, /* don't worry about errors */
186 __REQ_PREEMPT, /* set for "ide_preempt" requests */ 186 __REQ_PREEMPT, /* set for "ide_preempt" requests */
187 __REQ_ORDERED_COLOR, /* is before or after barrier */ 187 __REQ_ORDERED_COLOR, /* is before or after barrier */
188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ 188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */
189 __REQ_ALLOCED, /* request came from our alloc pool */ 189 __REQ_ALLOCED, /* request came from our alloc pool */
190 __REQ_RW_META, /* metadata io request */ 190 __REQ_RW_META, /* metadata io request */
191 __REQ_NR_BITS, /* stops here */ 191 __REQ_NR_BITS, /* stops here */
192 }; 192 };
193 193
194 #define REQ_RW (1 << __REQ_RW) 194 #define REQ_RW (1 << __REQ_RW)
195 #define REQ_FAILFAST (1 << __REQ_FAILFAST) 195 #define REQ_FAILFAST (1 << __REQ_FAILFAST)
196 #define REQ_SORTED (1 << __REQ_SORTED) 196 #define REQ_SORTED (1 << __REQ_SORTED)
197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) 198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER)
199 #define REQ_FUA (1 << __REQ_FUA) 199 #define REQ_FUA (1 << __REQ_FUA)
200 #define REQ_NOMERGE (1 << __REQ_NOMERGE) 200 #define REQ_NOMERGE (1 << __REQ_NOMERGE)
201 #define REQ_STARTED (1 << __REQ_STARTED) 201 #define REQ_STARTED (1 << __REQ_STARTED)
202 #define REQ_DONTPREP (1 << __REQ_DONTPREP) 202 #define REQ_DONTPREP (1 << __REQ_DONTPREP)
203 #define REQ_QUEUED (1 << __REQ_QUEUED) 203 #define REQ_QUEUED (1 << __REQ_QUEUED)
204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV) 204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV)
205 #define REQ_FAILED (1 << __REQ_FAILED) 205 #define REQ_FAILED (1 << __REQ_FAILED)
206 #define REQ_QUIET (1 << __REQ_QUIET) 206 #define REQ_QUIET (1 << __REQ_QUIET)
207 #define REQ_PREEMPT (1 << __REQ_PREEMPT) 207 #define REQ_PREEMPT (1 << __REQ_PREEMPT)
208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) 208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC) 209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
210 #define REQ_ALLOCED (1 << __REQ_ALLOCED) 210 #define REQ_ALLOCED (1 << __REQ_ALLOCED)
211 #define REQ_RW_META (1 << __REQ_RW_META) 211 #define REQ_RW_META (1 << __REQ_RW_META)
212 212
213 #define BLK_MAX_CDB 16 213 #define BLK_MAX_CDB 16
214 214
215 /* 215 /*
216 * try to put the fields that are referenced together in the same cacheline 216 * try to put the fields that are referenced together in the same cacheline
217 */ 217 */
218 struct request { 218 struct request {
219 struct list_head queuelist; 219 struct list_head queuelist;
220 struct list_head donelist; 220 struct list_head donelist;
221 221
222 struct request_queue *q; 222 struct request_queue *q;
223 223
224 unsigned int cmd_flags; 224 unsigned int cmd_flags;
225 enum rq_cmd_type_bits cmd_type; 225 enum rq_cmd_type_bits cmd_type;
226 226
227 /* Maintain bio traversal state for part by part I/O submission. 227 /* Maintain bio traversal state for part by part I/O submission.
228 * hard_* are block layer internals, no driver should touch them! 228 * hard_* are block layer internals, no driver should touch them!
229 */ 229 */
230 230
231 sector_t sector; /* next sector to submit */ 231 sector_t sector; /* next sector to submit */
232 sector_t hard_sector; /* next sector to complete */ 232 sector_t hard_sector; /* next sector to complete */
233 unsigned long nr_sectors; /* no. of sectors left to submit */ 233 unsigned long nr_sectors; /* no. of sectors left to submit */
234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */ 234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */
235 /* no. of sectors left to submit in the current segment */ 235 /* no. of sectors left to submit in the current segment */
236 unsigned int current_nr_sectors; 236 unsigned int current_nr_sectors;
237 237
238 /* no. of sectors left to complete in the current segment */ 238 /* no. of sectors left to complete in the current segment */
239 unsigned int hard_cur_sectors; 239 unsigned int hard_cur_sectors;
240 240
241 struct bio *bio; 241 struct bio *bio;
242 struct bio *biotail; 242 struct bio *biotail;
243 243
244 struct hlist_node hash; /* merge hash */ 244 struct hlist_node hash; /* merge hash */
245 /* 245 /*
246 * The rb_node is only used inside the io scheduler, requests 246 * The rb_node is only used inside the io scheduler, requests
247 * are pruned when moved to the dispatch queue. So let the 247 * are pruned when moved to the dispatch queue. So let the
248 * completion_data share space with the rb_node. 248 * completion_data share space with the rb_node.
249 */ 249 */
250 union { 250 union {
251 struct rb_node rb_node; /* sort/lookup */ 251 struct rb_node rb_node; /* sort/lookup */
252 void *completion_data; 252 void *completion_data;
253 }; 253 };
254 254
255 /* 255 /*
256 * two pointers are available for the IO schedulers, if they need 256 * two pointers are available for the IO schedulers, if they need
257 * more they have to dynamically allocate it. 257 * more they have to dynamically allocate it.
258 */ 258 */
259 void *elevator_private; 259 void *elevator_private;
260 void *elevator_private2; 260 void *elevator_private2;
261 261
262 struct gendisk *rq_disk; 262 struct gendisk *rq_disk;
263 unsigned long start_time; 263 unsigned long start_time;
264 264
265 /* Number of scatter-gather DMA addr+len pairs after 265 /* Number of scatter-gather DMA addr+len pairs after
266 * physical address coalescing is performed. 266 * physical address coalescing is performed.
267 */ 267 */
268 unsigned short nr_phys_segments; 268 unsigned short nr_phys_segments;
269 269
270 /* Number of scatter-gather addr+len pairs after 270 /* Number of scatter-gather addr+len pairs after
271 * physical and DMA remapping hardware coalescing is performed. 271 * physical and DMA remapping hardware coalescing is performed.
272 * This is the number of scatter-gather entries the driver 272 * This is the number of scatter-gather entries the driver
273 * will actually have to deal with after DMA mapping is done. 273 * will actually have to deal with after DMA mapping is done.
274 */ 274 */
275 unsigned short nr_hw_segments; 275 unsigned short nr_hw_segments;
276 276
277 unsigned short ioprio; 277 unsigned short ioprio;
278 278
279 void *special; 279 void *special;
280 char *buffer; 280 char *buffer;
281 281
282 int tag; 282 int tag;
283 int errors; 283 int errors;
284 284
285 int ref_count; 285 int ref_count;
286 286
287 /* 287 /*
288 * when request is used as a packet command carrier 288 * when request is used as a packet command carrier
289 */ 289 */
290 unsigned int cmd_len; 290 unsigned int cmd_len;
291 unsigned char cmd[BLK_MAX_CDB]; 291 unsigned char cmd[BLK_MAX_CDB];
292 292
293 unsigned int data_len; 293 unsigned int data_len;
294 unsigned int sense_len; 294 unsigned int sense_len;
295 void *data; 295 void *data;
296 void *sense; 296 void *sense;
297 297
298 unsigned int timeout; 298 unsigned int timeout;
299 int retries; 299 int retries;
300 300
301 /* 301 /*
302 * completion callback. 302 * completion callback.
303 */ 303 */
304 rq_end_io_fn *end_io; 304 rq_end_io_fn *end_io;
305 void *end_io_data; 305 void *end_io_data;
306 306
307 /* for bidi */ 307 /* for bidi */
308 struct request *next_rq; 308 struct request *next_rq;
309 }; 309 };
310 310
311 /* 311 /*
312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME 312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
313 * requests. Some step values could eventually be made generic. 313 * requests. Some step values could eventually be made generic.
314 */ 314 */
315 struct request_pm_state 315 struct request_pm_state
316 { 316 {
317 /* PM state machine step value, currently driver specific */ 317 /* PM state machine step value, currently driver specific */
318 int pm_step; 318 int pm_step;
319 /* requested PM state value (S1, S2, S3, S4, ...) */ 319 /* requested PM state value (S1, S2, S3, S4, ...) */
320 u32 pm_state; 320 u32 pm_state;
321 void* data; /* for driver use */ 321 void* data; /* for driver use */
322 }; 322 };
323 323
324 #include <linux/elevator.h> 324 #include <linux/elevator.h>
325 325
326 typedef void (request_fn_proc) (struct request_queue *q); 326 typedef void (request_fn_proc) (struct request_queue *q);
327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); 327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
328 typedef int (prep_rq_fn) (struct request_queue *, struct request *); 328 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
329 typedef void (unplug_fn) (struct request_queue *); 329 typedef void (unplug_fn) (struct request_queue *);
330 330
331 struct bio_vec; 331 struct bio_vec;
332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); 332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *); 333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *);
334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *); 334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
335 typedef void (softirq_done_fn)(struct request *); 335 typedef void (softirq_done_fn)(struct request *);
336 336
337 enum blk_queue_state { 337 enum blk_queue_state {
338 Queue_down, 338 Queue_down,
339 Queue_up, 339 Queue_up,
340 }; 340 };
341 341
342 struct blk_queue_tag { 342 struct blk_queue_tag {
343 struct request **tag_index; /* map of busy tags */ 343 struct request **tag_index; /* map of busy tags */
344 unsigned long *tag_map; /* bit map of free/busy tags */ 344 unsigned long *tag_map; /* bit map of free/busy tags */
345 struct list_head busy_list; /* fifo list of busy tags */ 345 struct list_head busy_list; /* fifo list of busy tags */
346 int busy; /* current depth */ 346 int busy; /* current depth */
347 int max_depth; /* what we will send to device */ 347 int max_depth; /* what we will send to device */
348 int real_max_depth; /* what the array can hold */ 348 int real_max_depth; /* what the array can hold */
349 atomic_t refcnt; /* map can be shared */ 349 atomic_t refcnt; /* map can be shared */
350 }; 350 };
351 351
352 struct request_queue 352 struct request_queue
353 { 353 {
354 /* 354 /*
355 * Together with queue_head for cacheline sharing 355 * Together with queue_head for cacheline sharing
356 */ 356 */
357 struct list_head queue_head; 357 struct list_head queue_head;
358 struct request *last_merge; 358 struct request *last_merge;
359 elevator_t *elevator; 359 elevator_t *elevator;
360 360
361 /* 361 /*
362 * the queue request freelist, one for reads and one for writes 362 * the queue request freelist, one for reads and one for writes
363 */ 363 */
364 struct request_list rq; 364 struct request_list rq;
365 365
366 request_fn_proc *request_fn; 366 request_fn_proc *request_fn;
367 make_request_fn *make_request_fn; 367 make_request_fn *make_request_fn;
368 prep_rq_fn *prep_rq_fn; 368 prep_rq_fn *prep_rq_fn;
369 unplug_fn *unplug_fn; 369 unplug_fn *unplug_fn;
370 merge_bvec_fn *merge_bvec_fn; 370 merge_bvec_fn *merge_bvec_fn;
371 issue_flush_fn *issue_flush_fn; 371 issue_flush_fn *issue_flush_fn;
372 prepare_flush_fn *prepare_flush_fn; 372 prepare_flush_fn *prepare_flush_fn;
373 softirq_done_fn *softirq_done_fn; 373 softirq_done_fn *softirq_done_fn;
374 374
375 /* 375 /*
376 * Dispatch queue sorting 376 * Dispatch queue sorting
377 */ 377 */
378 sector_t end_sector; 378 sector_t end_sector;
379 struct request *boundary_rq; 379 struct request *boundary_rq;
380 380
381 /* 381 /*
382 * Auto-unplugging state 382 * Auto-unplugging state
383 */ 383 */
384 struct timer_list unplug_timer; 384 struct timer_list unplug_timer;
385 int unplug_thresh; /* After this many requests */ 385 int unplug_thresh; /* After this many requests */
386 unsigned long unplug_delay; /* After this many jiffies */ 386 unsigned long unplug_delay; /* After this many jiffies */
387 struct work_struct unplug_work; 387 struct work_struct unplug_work;
388 388
389 struct backing_dev_info backing_dev_info; 389 struct backing_dev_info backing_dev_info;
390 390
391 /* 391 /*
392 * The queue owner gets to use this for whatever they like. 392 * The queue owner gets to use this for whatever they like.
393 * ll_rw_blk doesn't touch it. 393 * ll_rw_blk doesn't touch it.
394 */ 394 */
395 void *queuedata; 395 void *queuedata;
396 396
397 /* 397 /*
398 * queue needs bounce pages for pages above this limit 398 * queue needs bounce pages for pages above this limit
399 */ 399 */
400 unsigned long bounce_pfn; 400 unsigned long bounce_pfn;
401 gfp_t bounce_gfp; 401 gfp_t bounce_gfp;
402 402
403 /* 403 /*
404 * various queue flags, see QUEUE_* below 404 * various queue flags, see QUEUE_* below
405 */ 405 */
406 unsigned long queue_flags; 406 unsigned long queue_flags;
407 407
408 /* 408 /*
409 * protects queue structures from reentrancy. ->__queue_lock should 409 * protects queue structures from reentrancy. ->__queue_lock should
410 * _never_ be used directly, it is queue private. always use 410 * _never_ be used directly, it is queue private. always use
411 * ->queue_lock. 411 * ->queue_lock.
412 */ 412 */
413 spinlock_t __queue_lock; 413 spinlock_t __queue_lock;
414 spinlock_t *queue_lock; 414 spinlock_t *queue_lock;
415 415
416 /* 416 /*
417 * queue kobject 417 * queue kobject
418 */ 418 */
419 struct kobject kobj; 419 struct kobject kobj;
420 420
421 /* 421 /*
422 * queue settings 422 * queue settings
423 */ 423 */
424 unsigned long nr_requests; /* Max # of requests */ 424 unsigned long nr_requests; /* Max # of requests */
425 unsigned int nr_congestion_on; 425 unsigned int nr_congestion_on;
426 unsigned int nr_congestion_off; 426 unsigned int nr_congestion_off;
427 unsigned int nr_batching; 427 unsigned int nr_batching;
428 428
429 unsigned int max_sectors; 429 unsigned int max_sectors;
430 unsigned int max_hw_sectors; 430 unsigned int max_hw_sectors;
431 unsigned short max_phys_segments; 431 unsigned short max_phys_segments;
432 unsigned short max_hw_segments; 432 unsigned short max_hw_segments;
433 unsigned short hardsect_size; 433 unsigned short hardsect_size;
434 unsigned int max_segment_size; 434 unsigned int max_segment_size;
435 435
436 unsigned long seg_boundary_mask; 436 unsigned long seg_boundary_mask;
437 unsigned int dma_alignment; 437 unsigned int dma_alignment;
438 438
439 struct blk_queue_tag *queue_tags; 439 struct blk_queue_tag *queue_tags;
440 440
441 unsigned int nr_sorted; 441 unsigned int nr_sorted;
442 unsigned int in_flight; 442 unsigned int in_flight;
443 443
444 /* 444 /*
445 * sg stuff 445 * sg stuff
446 */ 446 */
447 unsigned int sg_timeout; 447 unsigned int sg_timeout;
448 unsigned int sg_reserved_size; 448 unsigned int sg_reserved_size;
449 int node; 449 int node;
450 #ifdef CONFIG_BLK_DEV_IO_TRACE 450 #ifdef CONFIG_BLK_DEV_IO_TRACE
451 struct blk_trace *blk_trace; 451 struct blk_trace *blk_trace;
452 #endif 452 #endif
453 /* 453 /*
454 * reserved for flush operations 454 * reserved for flush operations
455 */ 455 */
456 unsigned int ordered, next_ordered, ordseq; 456 unsigned int ordered, next_ordered, ordseq;
457 int orderr, ordcolor; 457 int orderr, ordcolor;
458 struct request pre_flush_rq, bar_rq, post_flush_rq; 458 struct request pre_flush_rq, bar_rq, post_flush_rq;
459 struct request *orig_bar_rq; 459 struct request *orig_bar_rq;
460 460
461 struct mutex sysfs_lock; 461 struct mutex sysfs_lock;
462 462
463 #if defined(CONFIG_BLK_DEV_BSG) 463 #if defined(CONFIG_BLK_DEV_BSG)
464 struct bsg_class_device bsg_dev; 464 struct bsg_class_device bsg_dev;
465 #endif 465 #endif
466 }; 466 };
467 467
468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ 470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */
471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */ 471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */
472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */ 472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */
473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ 473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */
474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ 474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ 475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ 476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ 477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
478 478
479 enum { 479 enum {
480 /* 480 /*
481 * Hardbarrier is supported with one of the following methods. 481 * Hardbarrier is supported with one of the following methods.
482 * 482 *
483 * NONE : hardbarrier unsupported 483 * NONE : hardbarrier unsupported
484 * DRAIN : ordering by draining is enough 484 * DRAIN : ordering by draining is enough
485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes 485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write 486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
487 * TAG : ordering by tag is enough 487 * TAG : ordering by tag is enough
488 * TAG_FLUSH : ordering by tag w/ pre and post flushes 488 * TAG_FLUSH : ordering by tag w/ pre and post flushes
489 * TAG_FUA : ordering by tag w/ pre flush and FUA write 489 * TAG_FUA : ordering by tag w/ pre flush and FUA write
490 */ 490 */
491 QUEUE_ORDERED_NONE = 0x00, 491 QUEUE_ORDERED_NONE = 0x00,
492 QUEUE_ORDERED_DRAIN = 0x01, 492 QUEUE_ORDERED_DRAIN = 0x01,
493 QUEUE_ORDERED_TAG = 0x02, 493 QUEUE_ORDERED_TAG = 0x02,
494 494
495 QUEUE_ORDERED_PREFLUSH = 0x10, 495 QUEUE_ORDERED_PREFLUSH = 0x10,
496 QUEUE_ORDERED_POSTFLUSH = 0x20, 496 QUEUE_ORDERED_POSTFLUSH = 0x20,
497 QUEUE_ORDERED_FUA = 0x40, 497 QUEUE_ORDERED_FUA = 0x40,
498 498
499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | 499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | 501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | 503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | 505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
507 507
508 /* 508 /*
509 * Ordered operation sequence 509 * Ordered operation sequence
510 */ 510 */
511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */ 511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */ 512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */ 513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */ 514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */ 515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
516 QUEUE_ORDSEQ_DONE = 0x20, 516 QUEUE_ORDSEQ_DONE = 0x20,
517 }; 517 };
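A driver fronting a device with a volatile write cache would typically select one of the flush-based modes at queue setup. A hedged sketch, assuming the three-argument blk_queue_ordered() of this kernel and using hypothetical example_* names:

static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
	/*
	 * Fill in rq so the driver's request_fn recognises it as a cache
	 * flush; real drivers encode this via cmd_type/cmd[] in their own way.
	 */
}

static void example_init_ordering(struct request_queue *q)
{
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, example_prepare_flush);
}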
518 518
519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
522 #define blk_queue_flushing(q) ((q)->ordseq) 522 #define blk_queue_flushing(q) ((q)->ordseq)
523 523
524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) 524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) 525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL) 526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL)
527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE) 527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE)
528 528
529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST) 529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST)
530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) 530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED)
531 531
532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq)) 532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
533 533
534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND) 534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME) 535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME)
536 #define blk_pm_request(rq) \ 536 #define blk_pm_request(rq) \
537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq)) 537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
538 538
539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) 539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) 540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) 541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) 542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
543 543
544 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 544 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
545 545
546 #define rq_data_dir(rq) ((rq)->cmd_flags & 1) 546 #define rq_data_dir(rq) ((rq)->cmd_flags & 1)
547 547
548 /* 548 /*
549 * We regard a request as sync, if it's a READ or a SYNC write. 549 * We regard a request as sync, if it's a READ or a SYNC write.
550 */ 550 */
551 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC) 551 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC)
552 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) 552 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META)
553 553
554 static inline int blk_queue_full(struct request_queue *q, int rw) 554 static inline int blk_queue_full(struct request_queue *q, int rw)
555 { 555 {
556 if (rw == READ) 556 if (rw == READ)
557 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 557 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
558 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 558 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
559 } 559 }
560 560
561 static inline void blk_set_queue_full(struct request_queue *q, int rw) 561 static inline void blk_set_queue_full(struct request_queue *q, int rw)
562 { 562 {
563 if (rw == READ) 563 if (rw == READ)
564 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 564 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
565 else 565 else
566 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 566 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
567 } 567 }
568 568
569 static inline void blk_clear_queue_full(struct request_queue *q, int rw) 569 static inline void blk_clear_queue_full(struct request_queue *q, int rw)
570 { 570 {
571 if (rw == READ) 571 if (rw == READ)
572 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 572 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
573 else 573 else
574 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 574 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
575 } 575 }
576 576
577 577
578 /* 578 /*
579 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may 579 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
580 * it already be started by driver. 580 * it already be started by driver.
581 */ 581 */
582 #define RQ_NOMERGE_FLAGS \ 582 #define RQ_NOMERGE_FLAGS \
583 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) 583 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
584 #define rq_mergeable(rq) \ 584 #define rq_mergeable(rq) \
585 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq))) 585 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
586 586
587 /* 587 /*
588 * q->prep_rq_fn return values 588 * q->prep_rq_fn return values
589 */ 589 */
590 #define BLKPREP_OK 0 /* serve it */ 590 #define BLKPREP_OK 0 /* serve it */
591 #define BLKPREP_KILL 1 /* fatal error, kill */ 591 #define BLKPREP_KILL 1 /* fatal error, kill */
592 #define BLKPREP_DEFER 2 /* leave on queue */ 592 #define BLKPREP_DEFER 2 /* leave on queue */
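These are the values a driver's prep_rq_fn hands back: BLKPREP_OK issues the request, BLKPREP_DEFER leaves it on the queue for a later retry, and BLKPREP_KILL makes the block layer fail and complete it. A hedged sketch of such a prep function, with hypothetical example_* names and fields:

struct example_dev {
	int resetting;			/* hypothetical per-device state */
};

static int example_prep_rq_fn(struct request_queue *q, struct request *rq)
{
	struct example_dev *dev = q->queuedata;

	if (!blk_fs_request(rq))
		return BLKPREP_KILL;	/* unsupported: fail and complete it */

	if (dev->resetting)
		return BLKPREP_DEFER;	/* leave it queued, try again later */

	rq->cmd_flags |= REQ_DONTPREP;	/* prepared once, skip prep next time */
	return BLKPREP_OK;
}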
593 593
594 extern unsigned long blk_max_low_pfn, blk_max_pfn; 594 extern unsigned long blk_max_low_pfn, blk_max_pfn;
595 595
596 /* 596 /*
597 * standard bounce addresses: 597 * standard bounce addresses:
598 * 598 *
599 * BLK_BOUNCE_HIGH : bounce all highmem pages 599 * BLK_BOUNCE_HIGH : bounce all highmem pages
600 * BLK_BOUNCE_ANY : don't bounce anything 600 * BLK_BOUNCE_ANY : don't bounce anything
601 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary 601 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary
602 */ 602 */
603 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) 603 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT)
604 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT) 604 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT)
605 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD) 605 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD)
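Drivers pass one of these limits to blk_queue_bounce_limit() during queue setup; for example, hardware that cannot DMA to highmem would use BLK_BOUNCE_HIGH. A small sketch (the example_set_bounce() wrapper is hypothetical):

static void example_set_bounce(struct request_queue *q)
{
	/* bounce any highmem pages through lowmem before the hardware sees them */
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}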
606 606
607 /* 607 /*
608 * default timeout for SG_IO if none specified 608 * default timeout for SG_IO if none specified
609 */ 609 */
610 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) 610 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
611 611
612 #ifdef CONFIG_BOUNCE 612 #ifdef CONFIG_BOUNCE
613 extern int init_emergency_isa_pool(void); 613 extern int init_emergency_isa_pool(void);
614 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); 614 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
615 #else 615 #else
616 static inline int init_emergency_isa_pool(void) 616 static inline int init_emergency_isa_pool(void)
617 { 617 {
618 return 0; 618 return 0;
619 } 619 }
620 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) 620 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
621 { 621 {
622 } 622 }
623 #endif /* CONFIG_MMU */ 623 #endif /* CONFIG_MMU */
624 624
625 struct req_iterator { 625 struct req_iterator {
626 int i; 626 int i;
627 struct bio *bio; 627 struct bio *bio;
628 }; 628 };
629 629
630 /* This should not be used directly - use rq_for_each_segment */ 630 /* This should not be used directly - use rq_for_each_segment */
631 #define __rq_for_each_bio(_bio, rq) \ 631 #define __rq_for_each_bio(_bio, rq) \
632 if ((rq->bio)) \ 632 if ((rq->bio)) \
633 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) 633 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
634 634
635 #define rq_for_each_segment(bvl, _rq, _iter) \ 635 #define rq_for_each_segment(bvl, _rq, _iter) \
636 __rq_for_each_bio(_iter.bio, _rq) \ 636 __rq_for_each_bio(_iter.bio, _rq) \
637 bio_for_each_segment(bvl, _iter.bio, _iter.i) 637 bio_for_each_segment(bvl, _iter.bio, _iter.i)
638 638
639 #define rq_iter_last(rq, _iter) \ 639 #define rq_iter_last(rq, _iter) \
640 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 640 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
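rq_for_each_segment() is the intended way to walk a request's data without touching its bios directly. A minimal sketch that totals the bytes in a request (the example_rq_bytes() helper is hypothetical):

static unsigned int example_rq_bytes(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec *bvec;
	unsigned int bytes = 0;

	rq_for_each_segment(bvec, rq, iter)
		bytes += bvec->bv_len;		/* one segment's worth of data */

	return bytes;
}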
641 641
642 extern int blk_register_queue(struct gendisk *disk); 642 extern int blk_register_queue(struct gendisk *disk);
643 extern void blk_unregister_queue(struct gendisk *disk); 643 extern void blk_unregister_queue(struct gendisk *disk);
644 extern void register_disk(struct gendisk *dev); 644 extern void register_disk(struct gendisk *dev);
645 extern void generic_make_request(struct bio *bio); 645 extern void generic_make_request(struct bio *bio);
646 extern void blk_put_request(struct request *); 646 extern void blk_put_request(struct request *);
647 extern void __blk_put_request(struct request_queue *, struct request *); 647 extern void __blk_put_request(struct request_queue *, struct request *);
648 extern void blk_end_sync_rq(struct request *rq, int error); 648 extern void blk_end_sync_rq(struct request *rq, int error);
649 extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 649 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
650 extern void blk_insert_request(struct request_queue *, struct request *, int, void *); 650 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
651 extern void blk_requeue_request(struct request_queue *, struct request *); 651 extern void blk_requeue_request(struct request_queue *, struct request *);
652 extern void blk_plug_device(struct request_queue *); 652 extern void blk_plug_device(struct request_queue *);
653 extern int blk_remove_plug(struct request_queue *); 653 extern int blk_remove_plug(struct request_queue *);
654 extern void blk_recount_segments(struct request_queue *, struct bio *); 654 extern void blk_recount_segments(struct request_queue *, struct bio *);
655 extern int scsi_cmd_ioctl(struct file *, struct request_queue *, 655 extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
656 struct gendisk *, unsigned int, void __user *); 656 struct gendisk *, unsigned int, void __user *);
657 extern int sg_scsi_ioctl(struct file *, struct request_queue *, 657 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
658 struct gendisk *, struct scsi_ioctl_command __user *); 658 struct gendisk *, struct scsi_ioctl_command __user *);
659 659
660 /* 660 /*
661 * Temporary export, until SCSI gets fixed up. 661 * Temporary export, until SCSI gets fixed up.
662 */ 662 */
663 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq, 663 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq,
664 struct bio *bio); 664 struct bio *bio);
665 665
666 /* 666 /*
667 * A queue has just exited congestion. Note this in the global counter of 667 * A queue has just exited congestion. Note this in the global counter of
668 * congested queues, and wake up anyone who was waiting for requests to be 668 * congested queues, and wake up anyone who was waiting for requests to be
669 * put back. 669 * put back.
670 */ 670 */
671 static inline void blk_clear_queue_congested(struct request_queue *q, int rw) 671 static inline void blk_clear_queue_congested(struct request_queue *q, int rw)
672 { 672 {
673 clear_bdi_congested(&q->backing_dev_info, rw); 673 clear_bdi_congested(&q->backing_dev_info, rw);
674 } 674 }
675 675
676 /* 676 /*
677 * A queue has just entered congestion. Flag that in the queue's VM-visible 677 * A queue has just entered congestion. Flag that in the queue's VM-visible
678 * state flags and increment the global counter of congested queues. 678 * state flags and increment the global counter of congested queues.
679 */ 679 */
680 static inline void blk_set_queue_congested(struct request_queue *q, int rw) 680 static inline void blk_set_queue_congested(struct request_queue *q, int rw)
681 { 681 {
682 set_bdi_congested(&q->backing_dev_info, rw); 682 set_bdi_congested(&q->backing_dev_info, rw);
683 } 683 }
684 684
685 extern void blk_start_queue(struct request_queue *q); 685 extern void blk_start_queue(struct request_queue *q);
686 extern void blk_stop_queue(struct request_queue *q); 686 extern void blk_stop_queue(struct request_queue *q);
687 extern void blk_sync_queue(struct request_queue *q); 687 extern void blk_sync_queue(struct request_queue *q);
688 extern void __blk_stop_queue(struct request_queue *q); 688 extern void __blk_stop_queue(struct request_queue *q);
689 extern void blk_run_queue(struct request_queue *); 689 extern void blk_run_queue(struct request_queue *);
690 extern void blk_start_queueing(struct request_queue *); 690 extern void blk_start_queueing(struct request_queue *);
691 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long); 691 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long);
692 extern int blk_rq_unmap_user(struct bio *); 692 extern int blk_rq_unmap_user(struct bio *);
693 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); 693 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
694 extern int blk_rq_map_user_iov(struct request_queue *, struct request *, 694 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
695 struct sg_iovec *, int, unsigned int); 695 struct sg_iovec *, int, unsigned int);
696 extern int blk_execute_rq(struct request_queue *, struct gendisk *, 696 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
697 struct request *, int); 697 struct request *, int);
698 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, 698 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
699 struct request *, int, rq_end_io_fn *); 699 struct request *, int, rq_end_io_fn *);
700 extern int blk_verify_command(unsigned char *, int); 700 extern int blk_verify_command(unsigned char *, int);
701 701
702 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 702 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
703 { 703 {
704 return bdev->bd_disk->queue; 704 return bdev->bd_disk->queue;
705 } 705 }
706 706
707 static inline void blk_run_backing_dev(struct backing_dev_info *bdi, 707 static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
708 struct page *page) 708 struct page *page)
709 { 709 {
710 if (bdi && bdi->unplug_io_fn) 710 if (bdi && bdi->unplug_io_fn)
711 bdi->unplug_io_fn(bdi, page); 711 bdi->unplug_io_fn(bdi, page);
712 } 712 }
713 713
714 static inline void blk_run_address_space(struct address_space *mapping) 714 static inline void blk_run_address_space(struct address_space *mapping)
715 { 715 {
716 if (mapping) 716 if (mapping)
717 blk_run_backing_dev(mapping->backing_dev_info, NULL); 717 blk_run_backing_dev(mapping->backing_dev_info, NULL);
718 } 718 }
719 719
720 /* 720 /*
721 * end_request() and friends. Must be called with the request queue spinlock 721 * end_request() and friends. Must be called with the request queue spinlock
722 * acquired. All functions called within end_request() _must_be_ atomic. 722 * acquired. All functions called within end_request() _must_be_ atomic.
723 * 723 *
724 * Several drivers define their own end_request and call 724 * Several drivers define their own end_request and call
725 * end_that_request_first() and end_that_request_last() 725 * end_that_request_first() and end_that_request_last()
726 * for parts of the original function. This prevents 726 * for parts of the original function. This prevents
727 * code duplication in drivers. 727 * code duplication in drivers.
728 */ 728 */
729 extern int end_that_request_first(struct request *, int, int); 729 extern int end_that_request_first(struct request *, int, int);
730 extern int end_that_request_chunk(struct request *, int, int); 730 extern int end_that_request_chunk(struct request *, int, int);
731 extern void end_that_request_last(struct request *, int); 731 extern void end_that_request_last(struct request *, int);
732 extern void end_request(struct request *req, int uptodate); 732 extern void end_request(struct request *, int);
733 extern void end_queued_request(struct request *, int);
734 extern void end_dequeued_request(struct request *, int);
733 extern void blk_complete_request(struct request *); 735 extern void blk_complete_request(struct request *);
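For illustration, a minimal driver-side completion path built from the primitives above might look like the sketch below (my_end_whole_request is a hypothetical name; the queue spinlock is assumed held, as the comment above requires). Judging by their names, the new end_queued_request()/end_dequeued_request() declarations distinguish whether the request is still on the queue when it is finished:

static void my_end_whole_request(struct request *rq, int uptodate)
{
	/* complete all sectors; returns 0 once no I/O remains pending */
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		/* still queued: take it off (helper defined just below) */
		blkdev_dequeue_request(rq);
		end_that_request_last(rq, uptodate);
	}
}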
734 736
735 /* 737 /*
736 * end_that_request_first/chunk() takes an uptodate argument. we account 738 * end_that_request_first/chunk() takes an uptodate argument. we account
737 * any value <= as an io error. 0 means -EIO for compatability reasons, 739 * any value <= as an io error. 0 means -EIO for compatability reasons,
738 * any other < 0 value is the direct error type. An uptodate value of 740 * any other < 0 value is the direct error type. An uptodate value of
739 * 1 indicates successful io completion 741 * 1 indicates successful io completion
740 */ 742 */
741 #define end_io_error(uptodate) (unlikely((uptodate) <= 0)) 743 #define end_io_error(uptodate) (unlikely((uptodate) <= 0))
742 744
743 static inline void blkdev_dequeue_request(struct request *req) 745 static inline void blkdev_dequeue_request(struct request *req)
744 { 746 {
745 elv_dequeue_request(req->q, req); 747 elv_dequeue_request(req->q, req);
746 } 748 }
747 749
748 /* 750 /*
749 * Access functions for manipulating queue properties 751 * Access functions for manipulating queue properties
750 */ 752 */
751 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, 753 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
752 spinlock_t *lock, int node_id); 754 spinlock_t *lock, int node_id);
753 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); 755 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
754 extern void blk_cleanup_queue(struct request_queue *); 756 extern void blk_cleanup_queue(struct request_queue *);
755 extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 757 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
756 extern void blk_queue_bounce_limit(struct request_queue *, u64); 758 extern void blk_queue_bounce_limit(struct request_queue *, u64);
757 extern void blk_queue_max_sectors(struct request_queue *, unsigned int); 759 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
758 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); 760 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
759 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); 761 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
760 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); 762 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
761 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); 763 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
762 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); 764 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
763 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 765 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
764 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); 766 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
765 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); 767 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
766 extern void blk_queue_dma_alignment(struct request_queue *, int); 768 extern void blk_queue_dma_alignment(struct request_queue *, int);
767 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); 769 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
768 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 770 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
769 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); 771 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
770 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *); 772 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *);
771 extern int blk_do_ordered(struct request_queue *, struct request **); 773 extern int blk_do_ordered(struct request_queue *, struct request **);
772 extern unsigned blk_ordered_cur_seq(struct request_queue *); 774 extern unsigned blk_ordered_cur_seq(struct request_queue *);
773 extern unsigned blk_ordered_req_seq(struct request *); 775 extern unsigned blk_ordered_req_seq(struct request *);
774 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); 776 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
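As a hedged example of how these accessors are typically used (mydev_request_fn and mydev_lock belong to an assumed driver, not to this file), an init path might create a queue and program its limits like so, using the constants defined further below:

static DEFINE_SPINLOCK(mydev_lock);

static struct request_queue *mydev_alloc_queue(void)
{
	struct request_queue *q;

	q = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!q)
		return NULL;

	/* limits the block layer enforces when building and merging requests */
	blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
	blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
	blk_queue_hardsect_size(q, 512);
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);

	return q;
}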
775 777
776 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 778 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
777 extern void blk_dump_rq_flags(struct request *, char *); 779 extern void blk_dump_rq_flags(struct request *, char *);
778 extern void generic_unplug_device(struct request_queue *); 780 extern void generic_unplug_device(struct request_queue *);
779 extern void __generic_unplug_device(struct request_queue *); 781 extern void __generic_unplug_device(struct request_queue *);
780 extern long nr_blockdev_pages(void); 782 extern long nr_blockdev_pages(void);
781 783
782 int blk_get_queue(struct request_queue *); 784 int blk_get_queue(struct request_queue *);
783 struct request_queue *blk_alloc_queue(gfp_t); 785 struct request_queue *blk_alloc_queue(gfp_t);
784 struct request_queue *blk_alloc_queue_node(gfp_t, int); 786 struct request_queue *blk_alloc_queue_node(gfp_t, int);
785 extern void blk_put_queue(struct request_queue *); 787 extern void blk_put_queue(struct request_queue *);
786 788
787 /* 789 /*
788 * tag stuff 790 * tag stuff
789 */ 791 */
790 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy) 792 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy)
791 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth) 793 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth)
792 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 794 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED)
793 extern int blk_queue_start_tag(struct request_queue *, struct request *); 795 extern int blk_queue_start_tag(struct request_queue *, struct request *);
794 extern struct request *blk_queue_find_tag(struct request_queue *, int); 796 extern struct request *blk_queue_find_tag(struct request_queue *, int);
795 extern void blk_queue_end_tag(struct request_queue *, struct request *); 797 extern void blk_queue_end_tag(struct request_queue *, struct request *);
796 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 798 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
797 extern void blk_queue_free_tags(struct request_queue *); 799 extern void blk_queue_free_tags(struct request_queue *);
798 extern int blk_queue_resize_tags(struct request_queue *, int); 800 extern int blk_queue_resize_tags(struct request_queue *, int);
799 extern void blk_queue_invalidate_tags(struct request_queue *); 801 extern void blk_queue_invalidate_tags(struct request_queue *);
800 extern struct blk_queue_tag *blk_init_tags(int); 802 extern struct blk_queue_tag *blk_init_tags(int);
801 extern void blk_free_tags(struct blk_queue_tag *); 803 extern void blk_free_tags(struct blk_queue_tag *);
802 804
803 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, 805 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
804 int tag) 806 int tag)
805 { 807 {
806 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 808 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
807 return NULL; 809 return NULL;
808 return bqt->tag_index[tag]; 810 return bqt->tag_index[tag];
809 } 811 }
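A rough sketch of tagged dispatch from a driver's request_fn using the tag helpers above (mydev_issue_to_hw is a placeholder for the driver's hardware submit; the block layer holds the queue lock when calling request_fn):

static void mydev_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/* returns non-zero when no tag is free; retry on completion */
		if (blk_queue_start_tag(q, rq))
			break;
		/* on success the request has been dequeued and tagged */
		mydev_issue_to_hw(rq);
	}
}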
810 812
811 extern int blkdev_issue_flush(struct block_device *, sector_t *); 813 extern int blkdev_issue_flush(struct block_device *, sector_t *);
812 814
813 #define MAX_PHYS_SEGMENTS 128 815 #define MAX_PHYS_SEGMENTS 128
814 #define MAX_HW_SEGMENTS 128 816 #define MAX_HW_SEGMENTS 128
815 #define SAFE_MAX_SECTORS 255 817 #define SAFE_MAX_SECTORS 255
816 #define BLK_DEF_MAX_SECTORS 1024 818 #define BLK_DEF_MAX_SECTORS 1024
817 819
818 #define MAX_SEGMENT_SIZE 65536 820 #define MAX_SEGMENT_SIZE 65536
819 821
820 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) 822 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
821 823
822 static inline int queue_hardsect_size(struct request_queue *q) 824 static inline int queue_hardsect_size(struct request_queue *q)
823 { 825 {
824 int retval = 512; 826 int retval = 512;
825 827
826 if (q && q->hardsect_size) 828 if (q && q->hardsect_size)
827 retval = q->hardsect_size; 829 retval = q->hardsect_size;
828 830
829 return retval; 831 return retval;
830 } 832 }
831 833
832 static inline int bdev_hardsect_size(struct block_device *bdev) 834 static inline int bdev_hardsect_size(struct block_device *bdev)
833 { 835 {
834 return queue_hardsect_size(bdev_get_queue(bdev)); 836 return queue_hardsect_size(bdev_get_queue(bdev));
835 } 837 }
836 838
837 static inline int queue_dma_alignment(struct request_queue *q) 839 static inline int queue_dma_alignment(struct request_queue *q)
838 { 840 {
839 int retval = 511; 841 int retval = 511;
840 842
841 if (q && q->dma_alignment) 843 if (q && q->dma_alignment)
842 retval = q->dma_alignment; 844 retval = q->dma_alignment;
843 845
844 return retval; 846 return retval;
845 } 847 }
846 848
847 /* assumes size > 256 */ 849 /* assumes size > 256 */
848 static inline unsigned int blksize_bits(unsigned int size) 850 static inline unsigned int blksize_bits(unsigned int size)
849 { 851 {
850 unsigned int bits = 8; 852 unsigned int bits = 8;
851 do { 853 do {
852 bits++; 854 bits++;
853 size >>= 1; 855 size >>= 1;
854 } while (size > 256); 856 } while (size > 256);
855 return bits; 857 return bits;
856 } 858 }
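As a worked example of the helper above: for power-of-two sizes it returns log2 of the block size, so blksize_bits(512) evaluates to 9 and blksize_bits(4096) to 12.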
857 859
858 static inline unsigned int block_size(struct block_device *bdev) 860 static inline unsigned int block_size(struct block_device *bdev)
859 { 861 {
860 return bdev->bd_block_size; 862 return bdev->bd_block_size;
861 } 863 }
862 864
863 typedef struct {struct page *v;} Sector; 865 typedef struct {struct page *v;} Sector;
864 866
865 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); 867 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
866 868
867 static inline void put_dev_sector(Sector p) 869 static inline void put_dev_sector(Sector p)
868 { 870 {
869 page_cache_release(p.v); 871 page_cache_release(p.v);
870 } 872 }
871 873
872 struct work_struct; 874 struct work_struct;
873 int kblockd_schedule_work(struct work_struct *work); 875 int kblockd_schedule_work(struct work_struct *work);
874 void kblockd_flush_work(struct work_struct *work); 876 void kblockd_flush_work(struct work_struct *work);
875 877
876 #define MODULE_ALIAS_BLOCKDEV(major,minor) \ 878 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
877 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 879 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
878 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 880 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
879 MODULE_ALIAS("block-major-" __stringify(major) "-*") 881 MODULE_ALIAS("block-major-" __stringify(major) "-*")
880 882
881 883
882 #else /* CONFIG_BLOCK */ 884 #else /* CONFIG_BLOCK */
883 /* 885 /*
884 * stubs for when the block layer is configured out 886 * stubs for when the block layer is configured out
885 */ 887 */
886 #define buffer_heads_over_limit 0 888 #define buffer_heads_over_limit 0
887 889
888 static inline long nr_blockdev_pages(void) 890 static inline long nr_blockdev_pages(void)
889 { 891 {
890 return 0; 892 return 0;
891 } 893 }
892 894
893 static inline void exit_io_context(void) 895 static inline void exit_io_context(void)
894 { 896 {
895 } 897 }
896 898
897 #endif /* CONFIG_BLOCK */ 899 #endif /* CONFIG_BLOCK */
898 900
899 #endif 901 #endif
900 902