Commit 9eb55b030c4b3227334ee4482402096cd1d1a6fe

Authored by Nikanth Karthikesan
Committed by Jens Axboe
1 parent c69d48540c

block: catch trying to use more bits than request->cmd_flags has

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

Showing 1 changed file with 3 additions and 0 deletions
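The three added lines land later in the file than the excerpt below reaches. As a hedged sketch of the technique the subject line describes (not a verbatim quote of the patch): a compile-time assertion that the number of defined request flag bits still fits in the cmd_flags field, along the lines of

	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
		     sizeof(((struct request *)0)->cmd_flags));

BUILD_BUG_ON() fails the build when its condition is true, so adding a flag bit beyond what cmd_flags can hold is caught at compile time instead of silently truncating at runtime.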

/*
 * Copyright (C) 1991, 1992 Linus Torvalds
 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
 *	- July2000
 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
 */

/*
 * This handles all read/write requests to block devices
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/completion.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/writeback.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/blktrace_api.h>
#include <linux/fault-inject.h>
#include <trace/block.h>

#include "blk.h"

DEFINE_TRACE(block_plug);
DEFINE_TRACE(block_unplug_io);
DEFINE_TRACE(block_unplug_timer);
DEFINE_TRACE(block_getrq);
DEFINE_TRACE(block_sleeprq);
DEFINE_TRACE(block_rq_requeue);
DEFINE_TRACE(block_bio_backmerge);
DEFINE_TRACE(block_bio_frontmerge);
DEFINE_TRACE(block_bio_queue);
DEFINE_TRACE(block_rq_complete);
DEFINE_TRACE(block_remap);	/* Also used in drivers/md/dm.c */
EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);

static int __make_request(struct request_queue *q, struct bio *bio);

/*
 * For the allocated request tables
 */
static struct kmem_cache *request_cachep;

/*
 * For queue allocation
 */
struct kmem_cache *blk_requestq_cachep;

/*
 * Controlling structure to kblockd
 */
static struct workqueue_struct *kblockd_workqueue;

static void drive_stat_acct(struct request *rq, int new_io)
{
	struct hd_struct *part;
	int rw = rq_data_dir(rq);
	int cpu;

	if (!blk_do_io_stat(rq))
		return;

	cpu = part_stat_lock();
	part = disk_map_sector_rcu(rq->rq_disk, rq->sector);

	if (!new_io)
		part_stat_inc(cpu, part, merges[rw]);
	else {
		part_round_stats(cpu, part);
		part_inc_in_flight(part);
	}

	part_stat_unlock();
}

void blk_queue_congestion_threshold(struct request_queue *q)
{
	int nr;

	nr = q->nr_requests - (q->nr_requests / 8) + 1;
	if (nr > q->nr_requests)
		nr = q->nr_requests;
	q->nr_congestion_on = nr;

	nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
	if (nr < 1)
		nr = 1;
	q->nr_congestion_off = nr;
}
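
/*
 * Editor's note (not part of the original file): a worked example of the
 * thresholds above, assuming the common default of nr_requests = 128:
 *
 *	nr_congestion_on  = 128 - 128/8 + 1          = 113
 *	nr_congestion_off = 128 - 128/8 - 128/16 - 1 = 103
 *
 * The queue is flagged congested once ~113 requests are allocated, and the
 * flag is only cleared again when the count drops below ~103. The gap
 * between the two thresholds provides hysteresis, so the congestion flag
 * does not flap on every single allocation and free.
 */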

/**
 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
 * @bdev: device
 *
 * Locates the passed device's request queue and returns the address of its
 * backing_dev_info
 *
 * Will return NULL if the request queue cannot be located.
 */
struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
{
	struct backing_dev_info *ret = NULL;
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		ret = &q->backing_dev_info;
	return ret;
}
EXPORT_SYMBOL(blk_get_backing_dev_info);

void blk_rq_init(struct request_queue *q, struct request *rq)
{
	memset(rq, 0, sizeof(*rq));

	INIT_LIST_HEAD(&rq->queuelist);
	INIT_LIST_HEAD(&rq->timeout_list);
	rq->cpu = -1;
	rq->q = q;
	rq->sector = rq->hard_sector = (sector_t) -1;
	INIT_HLIST_NODE(&rq->hash);
	RB_CLEAR_NODE(&rq->rb_node);
	rq->cmd = rq->__cmd;
	rq->cmd_len = BLK_MAX_CDB;
	rq->tag = -1;
	rq->ref_count = 1;
	rq->start_time = jiffies;
}
EXPORT_SYMBOL(blk_rq_init);

static void req_bio_endio(struct request *rq, struct bio *bio,
			  unsigned int nbytes, int error)
{
	struct request_queue *q = rq->q;

	if (&q->bar_rq != rq) {
		if (error)
			clear_bit(BIO_UPTODATE, &bio->bi_flags);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			error = -EIO;

		if (unlikely(nbytes > bio->bi_size)) {
			printk(KERN_ERR "%s: want %u bytes done, %u left\n",
			       __func__, nbytes, bio->bi_size);
			nbytes = bio->bi_size;
		}

		if (unlikely(rq->cmd_flags & REQ_QUIET))
			set_bit(BIO_QUIET, &bio->bi_flags);

		bio->bi_size -= nbytes;
		bio->bi_sector += (nbytes >> 9);

		if (bio_integrity(bio))
			bio_integrity_advance(bio, nbytes);

		if (bio->bi_size == 0)
			bio_endio(bio, error);
	} else {

		/*
		 * Okay, this is the barrier request in progress, just
		 * record the error.
		 */
		if (error && !q->orderr)
			q->orderr = error;
	}
}

void blk_dump_rq_flags(struct request *rq, char *msg)
{
	int bit;

	printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
		rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
		rq->cmd_flags);

	printk(KERN_INFO "  sector %llu, nr/cnr %lu/%u\n",
		(unsigned long long)rq->sector,
		rq->nr_sectors,
		rq->current_nr_sectors);
	printk(KERN_INFO "  bio %p, biotail %p, buffer %p, len %u\n",
		rq->bio, rq->biotail,
		rq->buffer, rq->data_len);

	if (blk_pc_request(rq)) {
		printk(KERN_INFO "  cdb: ");
		for (bit = 0; bit < BLK_MAX_CDB; bit++)
			printk("%02x ", rq->cmd[bit]);
		printk("\n");
	}
}
EXPORT_SYMBOL(blk_dump_rq_flags);

/*
 * "plug" the device if there are no outstanding requests: this will
 * force the transfer to start only after we have put all the requests
 * on the list.
 *
 * This is called with interrupts off and no requests on the queue and
 * with the queue lock held.
 */
void blk_plug_device(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	/*
	 * don't plug a stopped queue, it must be paired with blk_start_queue()
	 * which will restart the queueing
	 */
	if (blk_queue_stopped(q))
		return;

	if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
		mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
		trace_block_plug(q);
	}
}
EXPORT_SYMBOL(blk_plug_device);

/**
 * blk_plug_device_unlocked - plug a device without queue lock held
 * @q: The &struct request_queue to plug
 *
 * Description:
 *    Like @blk_plug_device(), but grabs the queue lock and disables
 *    interrupts.
 **/
void blk_plug_device_unlocked(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_plug_device(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_plug_device_unlocked);

/*
 * remove the queue from the plugged list, if present. called with
 * queue lock held and interrupts disabled.
 */
int blk_remove_plug(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
		return 0;

	del_timer(&q->unplug_timer);
	return 1;
}
EXPORT_SYMBOL(blk_remove_plug);

/*
 * remove the plug and let it rip..
 */
void __generic_unplug_device(struct request_queue *q)
{
	if (unlikely(blk_queue_stopped(q)))
		return;
	if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
		return;

	q->request_fn(q);
}

/**
 * generic_unplug_device - fire a request queue
 * @q: The &struct request_queue in question
 *
 * Description:
 *    Linux uses plugging to build bigger request queues before letting
 *    the device have at them. If a queue is plugged, the I/O scheduler
 *    is still adding and merging requests on the queue. Once the queue
 *    gets unplugged, the request_fn defined for the queue is invoked and
 *    transfers started.
 **/
void generic_unplug_device(struct request_queue *q)
{
	if (blk_queue_plugged(q)) {
		spin_lock_irq(q->queue_lock);
		__generic_unplug_device(q);
		spin_unlock_irq(q->queue_lock);
	}
}
EXPORT_SYMBOL(generic_unplug_device);

static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
				   struct page *page)
{
	struct request_queue *q = bdi->unplug_io_data;

	blk_unplug(q);
}

void blk_unplug_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, unplug_work);

	trace_block_unplug_io(q);
	q->unplug_fn(q);
}

void blk_unplug_timeout(unsigned long data)
{
	struct request_queue *q = (struct request_queue *)data;

	trace_block_unplug_timer(q);
	kblockd_schedule_work(q, &q->unplug_work);
}

void blk_unplug(struct request_queue *q)
{
	/*
	 * devices don't necessarily have an ->unplug_fn defined
	 */
	if (q->unplug_fn) {
		trace_block_unplug_io(q);
		q->unplug_fn(q);
	}
}
EXPORT_SYMBOL(blk_unplug);

/**
 * blk_start_queue - restart a previously stopped queue
 * @q: The &struct request_queue in question
 *
 * Description:
 *    blk_start_queue() will clear the stop flag on the queue, and call
 *    the request_fn for the queue if it was in a stopped state when
 *    entered. Also see blk_stop_queue(). Queue lock must be held.
 **/
void blk_start_queue(struct request_queue *q)
{
	WARN_ON(!irqs_disabled());

	queue_flag_clear(QUEUE_FLAG_STOPPED, q);
	__blk_run_queue(q);
}
EXPORT_SYMBOL(blk_start_queue);

/**
 * blk_stop_queue - stop a queue
 * @q: The &struct request_queue in question
 *
 * Description:
 *    The Linux block layer assumes that a block driver will consume all
 *    entries on the request queue when the request_fn strategy is called.
 *    Often this will not happen, because of hardware limitations (queue
 *    depth settings). If a device driver gets a 'queue full' response,
 *    or if it simply chooses not to queue more I/O at one point, it can
 *    call this function to prevent the request_fn from being called until
 *    the driver has signalled it's ready to go again. This happens by calling
 *    blk_start_queue() to restart queue operations. Queue lock must be held.
 **/
void blk_stop_queue(struct request_queue *q)
{
	blk_remove_plug(q);
	queue_flag_set(QUEUE_FLAG_STOPPED, q);
}
EXPORT_SYMBOL(blk_stop_queue);
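
/*
 * Editor's sketch (not part of the original file): the stop/start pattern
 * described above, as a hypothetical driver of this era might use it. All
 * mydev_* names are invented for illustration.
 */
#if 0	/* illustrative only */
static void mydev_request_fn(struct request_queue *q)
{
	struct mydev *dev = q->queuedata;
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (mydev_hw_queue_full(dev)) {
			/* hardware is full: stop until a completion IRQ */
			blk_stop_queue(q);	/* queue lock already held */
			return;
		}
		blkdev_dequeue_request(rq);
		mydev_start_io(dev, rq);
	}
}

/* completion interrupt: there is room in the hardware queue again */
static irqreturn_t mydev_irq(int irq, void *cookie)
{
	struct mydev *dev = cookie;
	unsigned long flags;

	spin_lock_irqsave(dev->queue->queue_lock, flags);
	blk_start_queue(dev->queue);	/* re-arms mydev_request_fn */
	spin_unlock_irqrestore(dev->queue->queue_lock, flags);
	return IRQ_HANDLED;
}
#endif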

/**
 * blk_sync_queue - cancel any pending callbacks on a queue
 * @q: the queue
 *
 * Description:
 *    The block layer may perform asynchronous callback activity
 *    on a queue, such as calling the unplug function after a timeout.
 *    A block device may call blk_sync_queue to ensure that any
 *    such activity is cancelled, thus allowing it to release resources
 *    that the callbacks might use. The caller must already have made sure
 *    that its ->make_request_fn will not re-add plugging prior to calling
 *    this function.
 *
 */
void blk_sync_queue(struct request_queue *q)
{
	del_timer_sync(&q->unplug_timer);
	del_timer_sync(&q->timeout);
	cancel_work_sync(&q->unplug_work);
}
EXPORT_SYMBOL(blk_sync_queue);

/**
 * __blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    See @blk_run_queue. This variant must be called with the queue lock
 *    held and interrupts disabled.
 *
 */
void __blk_run_queue(struct request_queue *q)
{
	blk_remove_plug(q);

	if (unlikely(blk_queue_stopped(q)))
		return;

	if (elv_queue_empty(q))
		return;

	/*
	 * Only recurse once to avoid overrunning the stack, let the unplug
	 * handling reinvoke the handler shortly if we already got there.
	 */
	if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
		q->request_fn(q);
		queue_flag_clear(QUEUE_FLAG_REENTER, q);
	} else {
		queue_flag_set(QUEUE_FLAG_PLUGGED, q);
		kblockd_schedule_work(q, &q->unplug_work);
	}
}
EXPORT_SYMBOL(__blk_run_queue);

/**
 * blk_run_queue - run a single device queue
 * @q: The queue to run
 *
 * Description:
 *    Invoke request handling on this queue, if it has pending work to do.
 *    May be used to restart queueing when a request has completed.
 */
void blk_run_queue(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_run_queue);

void blk_put_queue(struct request_queue *q)
{
	kobject_put(&q->kobj);
}

void blk_cleanup_queue(struct request_queue *q)
{
	/*
	 * We know we have process context here, so we can be a little
	 * cautious and ensure that pending block actions on this device
	 * are done before moving on. Going into this function, we should
	 * not have processes doing IO to this device.
	 */
	blk_sync_queue(q);

	mutex_lock(&q->sysfs_lock);
	queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
	mutex_unlock(&q->sysfs_lock);

	if (q->elevator)
		elevator_exit(q->elevator);

	blk_put_queue(q);
}
EXPORT_SYMBOL(blk_cleanup_queue);

static int blk_init_free_list(struct request_queue *q)
{
	struct request_list *rl = &q->rq;

	rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
	rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
	rl->elvpriv = 0;
	init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
	init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);

	rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
				mempool_free_slab, request_cachep, q->node);

	if (!rl->rq_pool)
		return -ENOMEM;

	return 0;
}

struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
{
	return blk_alloc_queue_node(gfp_mask, -1);
}
EXPORT_SYMBOL(blk_alloc_queue);

struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
{
	struct request_queue *q;
	int err;

	q = kmem_cache_alloc_node(blk_requestq_cachep,
				gfp_mask | __GFP_ZERO, node_id);
	if (!q)
		return NULL;

	q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
	q->backing_dev_info.unplug_io_data = q;
	err = bdi_init(&q->backing_dev_info);
	if (err) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	init_timer(&q->unplug_timer);
	setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
	INIT_LIST_HEAD(&q->timeout_list);
	INIT_WORK(&q->unplug_work, blk_unplug_work);

	kobject_init(&q->kobj, &blk_queue_ktype);

	mutex_init(&q->sysfs_lock);
	spin_lock_init(&q->__queue_lock);

	return q;
}
EXPORT_SYMBOL(blk_alloc_queue_node);

/**
 * blk_init_queue - prepare a request queue for use with a block device
 * @rfn: The function to be called to process requests that have been
 *       placed on the queue.
 * @lock: Request queue spin lock
 *
 * Description:
 *    If a block device wishes to use the standard request handling procedures,
 *    which sorts requests and coalesces adjacent requests, then it must
 *    call blk_init_queue(). The function @rfn will be called when there
 *    are requests on the queue that need to be processed. If the device
 *    supports plugging, then @rfn may not be called immediately when requests
 *    are available on the queue, but may be called at some time later instead.
 *    Plugged queues are generally unplugged when a buffer belonging to one
 *    of the requests on the queue is needed, or due to memory pressure.
 *
 *    @rfn is not required, or even expected, to remove all requests off the
 *    queue, but only as many as it can handle at a time. If it does leave
 *    requests on the queue, it is responsible for arranging that the requests
 *    get dealt with eventually.
 *
 *    The queue spin lock must be held while manipulating the requests on the
 *    request queue; this lock will be taken also from interrupt context, so irq
 *    disabling is needed for it.
 *
 *    Function returns a pointer to the initialized request queue, or %NULL if
 *    it didn't succeed.
 *
 * Note:
 *    blk_init_queue() must be paired with a blk_cleanup_queue() call
 *    when the block device is deactivated (such as at module unload).
 **/

struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
{
	return blk_init_queue_node(rfn, lock, -1);
}
EXPORT_SYMBOL(blk_init_queue);
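
/*
 * Editor's sketch (not part of the original file): the pairing the Note
 * above asks for, in a hypothetical driver's module init/exit paths. The
 * mydev_* names are invented.
 */
#if 0	/* illustrative only */
static struct request_queue *mydev_queue;
static DEFINE_SPINLOCK(mydev_lock);

static int __init mydev_init(void)
{
	mydev_queue = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!mydev_queue)
		return -ENOMEM;
	return 0;
}

static void __exit mydev_exit(void)
{
	blk_cleanup_queue(mydev_queue);	/* must pair with blk_init_queue() */
}
#endif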

struct request_queue *
blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
{
	struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);

	if (!q)
		return NULL;

	q->node = node_id;
	if (blk_init_free_list(q)) {
		kmem_cache_free(blk_requestq_cachep, q);
		return NULL;
	}

	/*
	 * if caller didn't supply a lock, they get per-queue locking with
	 * our embedded lock
	 */
	if (!lock)
		lock = &q->__queue_lock;

	q->request_fn = rfn;
	q->prep_rq_fn = NULL;
	q->unplug_fn = generic_unplug_device;
	q->queue_flags = QUEUE_FLAG_DEFAULT;
	q->queue_lock = lock;

	/*
	 * This also sets hw/phys segments, boundary and size
	 */
	blk_queue_make_request(q, __make_request);

	q->sg_reserved_size = INT_MAX;

	blk_set_cmd_filter_defaults(&q->cmd_filter);

	/*
	 * all done
	 */
	if (!elevator_init(q, NULL)) {
		blk_queue_congestion_threshold(q);
		return q;
	}

	blk_put_queue(q);
	return NULL;
}
EXPORT_SYMBOL(blk_init_queue_node);

int blk_get_queue(struct request_queue *q)
{
	if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
		kobject_get(&q->kobj);
		return 0;
	}

	return 1;
}

static inline void blk_free_request(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_flags & REQ_ELVPRIV)
		elv_put_request(q, rq);
	mempool_free(rq, q->rq.rq_pool);
}

static struct request *
blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
{
	struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);

	if (!rq)
		return NULL;

	blk_rq_init(q, rq);

	rq->cmd_flags = flags | REQ_ALLOCED;

	if (priv) {
		if (unlikely(elv_set_request(q, rq, gfp_mask))) {
			mempool_free(rq, q->rq.rq_pool);
			return NULL;
		}
		rq->cmd_flags |= REQ_ELVPRIV;
	}

	return rq;
}

/*
 * ioc_batching returns true if the ioc is a valid batching request and
 * should be given priority access to a request.
 */
static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc)
		return 0;

	/*
	 * Make sure the process is able to allocate at least 1 request
	 * even if the batch times out, otherwise we could theoretically
	 * lose wakeups.
	 */
	return ioc->nr_batch_requests == q->nr_batching ||
		(ioc->nr_batch_requests > 0
		&& time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
}

/*
 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
 * will cause the process to be a "batcher" on all queues in the system. This
 * is the behaviour we want though - once it gets a wakeup it should be given
 * a nice run.
 */
static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
{
	if (!ioc || ioc_batching(q, ioc))
		return;

	ioc->nr_batch_requests = q->nr_batching;
	ioc->last_waited = jiffies;
}

static void __freed_request(struct request_queue *q, int sync)
{
	struct request_list *rl = &q->rq;

	if (rl->count[sync] < queue_congestion_off_threshold(q))
		blk_clear_queue_congested(q, sync);

	if (rl->count[sync] + 1 <= q->nr_requests) {
		if (waitqueue_active(&rl->wait[sync]))
			wake_up(&rl->wait[sync]);

		blk_clear_queue_full(q, sync);
	}
}

/*
 * A request has just been released. Account for it, update the full and
 * congestion status, wake up any waiters. Called under q->queue_lock.
 */
static void freed_request(struct request_queue *q, int sync, int priv)
{
	struct request_list *rl = &q->rq;

	rl->count[sync]--;
	if (priv)
		rl->elvpriv--;

	__freed_request(q, sync);

	if (unlikely(rl->starved[sync ^ 1]))
		__freed_request(q, sync ^ 1);
}

/*
 * Get a free request, queue_lock must be held.
 * Returns NULL on failure, with queue_lock held.
 * Returns !NULL on success, with queue_lock *not held*.
 */
static struct request *get_request(struct request_queue *q, int rw_flags,
				   struct bio *bio, gfp_t gfp_mask)
{
	struct request *rq = NULL;
	struct request_list *rl = &q->rq;
	struct io_context *ioc = NULL;
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	int may_queue, priv;

	may_queue = elv_may_queue(q, rw_flags);
	if (may_queue == ELV_MQUEUE_NO)
		goto rq_starved;

	if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
		if (rl->count[is_sync]+1 >= q->nr_requests) {
			ioc = current_io_context(GFP_ATOMIC, q->node);
			/*
			 * The queue will fill after this allocation, so set
			 * it as full, and mark this process as "batching".
			 * This process will be allowed to complete a batch of
			 * requests, others will be blocked.
			 */
			if (!blk_queue_full(q, is_sync)) {
				ioc_set_batching(q, ioc);
				blk_set_queue_full(q, is_sync);
			} else {
				if (may_queue != ELV_MQUEUE_MUST
						&& !ioc_batching(q, ioc)) {
					/*
					 * The queue is full and the allocating
					 * process is not a "batcher", and not
					 * exempted by the IO scheduler
					 */
					goto out;
				}
			}
		}
		blk_set_queue_congested(q, is_sync);
	}

	/*
	 * Only allow batching queuers to allocate up to 50% over the defined
	 * limit of requests, otherwise we could have thousands of requests
	 * allocated with any setting of ->nr_requests
	 */
	if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
		goto out;

	rl->count[is_sync]++;
	rl->starved[is_sync] = 0;

	priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
	if (priv)
		rl->elvpriv++;

	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;
	spin_unlock_irq(q->queue_lock);

	rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
	if (unlikely(!rq)) {
		/*
		 * Allocation failed presumably due to memory. Undo anything
		 * we might have messed up.
		 *
		 * Allocating task should really be put onto the front of the
		 * wait queue, but this is pretty rare.
		 */
		spin_lock_irq(q->queue_lock);
		freed_request(q, is_sync, priv);

		/*
		 * in the very unlikely event that allocation failed and no
		 * requests for this direction was pending, mark us starved
		 * so that freeing of a request in the other direction will
		 * notice us. another possible fix would be to split the
		 * rq mempool into READ and WRITE
		 */
rq_starved:
		if (unlikely(rl->count[is_sync] == 0))
			rl->starved[is_sync] = 1;

		goto out;
	}

	/*
	 * ioc may be NULL here, and ioc_batching will be false. That's
	 * OK, if the queue is under the request limit then requests need
	 * not count toward the nr_batch_requests limit. There will always
	 * be some limit enforced by BLK_BATCH_TIME.
	 */
	if (ioc_batching(q, ioc))
		ioc->nr_batch_requests--;

	trace_block_getrq(q, bio, rw_flags & 1);
out:
	return rq;
}

/*
 * No available requests for this queue, unplug the device and wait for some
 * requests to become available.
 *
 * Called with q->queue_lock held, and returns with it unlocked.
 */
static struct request *get_request_wait(struct request_queue *q, int rw_flags,
					struct bio *bio)
{
	const bool is_sync = rw_is_sync(rw_flags) != 0;
	struct request *rq;

	rq = get_request(q, rw_flags, bio, GFP_NOIO);
	while (!rq) {
		DEFINE_WAIT(wait);
		struct io_context *ioc;
		struct request_list *rl = &q->rq;

		prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
				TASK_UNINTERRUPTIBLE);

		trace_block_sleeprq(q, bio, rw_flags & 1);

		__generic_unplug_device(q);
		spin_unlock_irq(q->queue_lock);
		io_schedule();

		/*
		 * After sleeping, we become a "batching" process and
		 * will be able to allocate at least one request, and
		 * up to a big batch of them for a small period time.
		 * See ioc_batching, ioc_set_batching
		 */
		ioc = current_io_context(GFP_NOIO, q->node);
		ioc_set_batching(q, ioc);

		spin_lock_irq(q->queue_lock);
		finish_wait(&rl->wait[is_sync], &wait);

		rq = get_request(q, rw_flags, bio, GFP_NOIO);
	}

	return rq;
}

struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
{
	struct request *rq;

	BUG_ON(rw != READ && rw != WRITE);

	spin_lock_irq(q->queue_lock);
	if (gfp_mask & __GFP_WAIT) {
		rq = get_request_wait(q, rw, NULL);
	} else {
		rq = get_request(q, rw, NULL, gfp_mask);
		if (!rq)
			spin_unlock_irq(q->queue_lock);
	}
	/* q->queue_lock is unlocked at this point */

	return rq;
}
EXPORT_SYMBOL(blk_get_request);
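
/*
 * Editor's sketch (not part of the original file): typical use of
 * blk_get_request(). With a gfp_mask containing __GFP_WAIT (e.g.
 * GFP_KERNEL) the call sleeps until a request is available; with
 * GFP_ATOMIC it may return NULL instead. mydev_send_cmd is invented.
 */
#if 0	/* illustrative only */
static int mydev_send_cmd(struct request_queue *q, struct gendisk *disk)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);	/* sleeps if needed */
	rq->cmd_type = REQ_TYPE_SPECIAL;

	/* hand it to the queue and wait for completion */
	err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}
#endif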

/**
 * blk_requeue_request - put a request back on queue
 * @q: request queue where request should be inserted
 * @rq: request to be inserted
 *
 * Description:
 *    Drivers often keep queueing requests until the hardware cannot accept
 *    more, when that condition happens we need to put the request back
 *    on the queue. Must be called with queue lock held.
 */
void blk_requeue_request(struct request_queue *q, struct request *rq)
{
	blk_delete_timer(rq);
	blk_clear_rq_complete(rq);
	trace_block_rq_requeue(q, rq);

	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	elv_requeue_request(q, rq);
}
EXPORT_SYMBOL(blk_requeue_request);
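
/*
 * Editor's sketch (not part of the original file): putting back a request
 * the hardware refused, as described above, then stopping the queue until
 * the device drains. The mydev_* names are invented.
 */
#if 0	/* illustrative only */
static void mydev_issue_one(struct request_queue *q)
{
	struct request *rq = elv_next_request(q);

	if (!rq)
		return;

	blkdev_dequeue_request(rq);
	if (mydev_issue(q->queuedata, rq) == -EBUSY) {
		/* device was full after all: give the request back */
		blk_requeue_request(q, rq);
		blk_stop_queue(q);
	}
}
#endif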

/**
 * blk_insert_request - insert a special request into a request queue
 * @q: request queue where request should be inserted
 * @rq: request to be inserted
 * @at_head: insert request at head or tail of queue
 * @data: private data
 *
 * Description:
 *    Many block devices need to execute commands asynchronously, so they don't
 *    block the whole kernel from preemption during request execution. This is
 *    accomplished normally by inserting artificial requests tagged as
 *    REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
 *    be scheduled for actual execution by the request queue.
 *
 *    We have the option of inserting at the head or the tail of the queue.
 *    Typically we use the tail for new ioctls and so forth. We use the head
 *    of the queue for things like a QUEUE_FULL message from a device, or a
 *    host that is unable to accept a particular command.
 */
void blk_insert_request(struct request_queue *q, struct request *rq,
			int at_head, void *data)
{
	int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
	unsigned long flags;

	/*
	 * tell I/O scheduler that this isn't a regular read/write (ie it
	 * must not attempt merges on this) and that it acts as a soft
	 * barrier
	 */
	rq->cmd_type = REQ_TYPE_SPECIAL;

	rq->special = data;

	spin_lock_irqsave(q->queue_lock, flags);

	/*
	 * If command is tagged, release the tag
	 */
	if (blk_rq_tagged(rq))
		blk_queue_end_tag(q, rq);

	drive_stat_acct(rq, 1);
	__elv_add_request(q, rq, where, 0);
	__blk_run_queue(q);
	spin_unlock_irqrestore(q->queue_lock, flags);
}
EXPORT_SYMBOL(blk_insert_request);
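
/*
 * Editor's sketch (not part of the original file): inserting a special
 * request at the head of the queue, e.g. to nudge a device after a
 * QUEUE_FULL condition as described above. mydev_kick_device is invented.
 */
#if 0	/* illustrative only */
static int mydev_kick_device(struct request_queue *q, void *data)
{
	struct request *rq;

	rq = blk_get_request(q, WRITE, GFP_ATOMIC);
	if (!rq)
		return -ENOMEM;	/* GFP_ATOMIC allocations may fail */

	/* at_head = 1: run before everything already queued */
	blk_insert_request(q, rq, 1, data);
	return 0;
}
#endif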
966
967 /*
968 * add-request adds a request to the linked list.
969 * queue lock is held and interrupts disabled, as we muck with the
970 * request queue list.
971 */
972 static inline void add_request(struct request_queue *q, struct request *req)
973 {
974 drive_stat_acct(req, 1);
975
976 /*
977 * elevator indicated where it wants this request to be
978 * inserted at elevator_merge time
979 */
980 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
981 }
982
983 static void part_round_stats_single(int cpu, struct hd_struct *part,
984 unsigned long now)
985 {
986 if (now == part->stamp)
987 return;
988
989 if (part->in_flight) {
990 __part_stat_add(cpu, part, time_in_queue,
991 part->in_flight * (now - part->stamp));
992 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
993 }
994 part->stamp = now;
995 }
996
997 /**
998 * part_round_stats() - Round off the performance stats on a struct disk_stats.
999 * @cpu: cpu number for stats access
1000 * @part: target partition
1001 *
1002 * The average IO queue length and utilisation statistics are maintained
1003 * by observing the current state of the queue length and the amount of
1004 * time it has been in this state for.
1005 *
1006 * Normally, that accounting is done on IO completion, but that can result
1007 * in more than a second's worth of IO being accounted for within any one
1008 * second, leading to >100% utilisation. To deal with that, we call this
1009 * function to do a round-off before returning the results when reading
1010 * /proc/diskstats. This accounts immediately for all queue usage up to
1011 * the current jiffies and restarts the counters again.
1012 */
1013 void part_round_stats(int cpu, struct hd_struct *part)
1014 {
1015 unsigned long now = jiffies;
1016
1017 if (part->partno)
1018 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1019 part_round_stats_single(cpu, part, now);
1020 }
1021 EXPORT_SYMBOL_GPL(part_round_stats);
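/*
 * Editor's note: a worked example of the >100% problem described above,
 * under assumed numbers.  Suppose two IOs are in flight for the same
 * 0.6s window and both complete within one second of wall clock.
 * Charging each request's full duration at completion time would add
 * 2 * 0.6s = 1.2s of busy time to that one second.  With the scheme in
 * part_round_stats_single() above, io_ticks instead gains
 * (now - part->stamp) just once while in_flight > 0, so utilisation
 * stays <= 100%; time_in_queue still gains
 * in_flight * (now - part->stamp) = 1.2s, which is the intended
 * queue-depth measure and may legitimately exceed wall time.
 */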
1022
1023 /*
1024 * queue lock must be held
1025 */
1026 void __blk_put_request(struct request_queue *q, struct request *req)
1027 {
1028 if (unlikely(!q))
1029 return;
1030 if (unlikely(--req->ref_count))
1031 return;
1032
1033 elv_completed_request(q, req);
1034
1035 /* this is a bio leak */
1036 WARN_ON(req->bio != NULL);
1037
1038 /*
1039 * Request may not have originated from ll_rw_blk. If not,
1040 * it didn't come out of our reserved rq pools
1041 */
1042 if (req->cmd_flags & REQ_ALLOCED) {
1043 int is_sync = rq_is_sync(req) != 0;
1044 int priv = req->cmd_flags & REQ_ELVPRIV;
1045
1046 BUG_ON(!list_empty(&req->queuelist));
1047 BUG_ON(!hlist_unhashed(&req->hash));
1048
1049 blk_free_request(q, req);
1050 freed_request(q, is_sync, priv);
1051 }
1052 }
1053 EXPORT_SYMBOL_GPL(__blk_put_request);
1054
1055 void blk_put_request(struct request *req)
1056 {
1057 unsigned long flags;
1058 struct request_queue *q = req->q;
1059
1060 spin_lock_irqsave(q->queue_lock, flags);
1061 __blk_put_request(q, req);
1062 spin_unlock_irqrestore(q->queue_lock, flags);
1063 }
1064 EXPORT_SYMBOL(blk_put_request);
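/*
 * Editor's note: a sketch of the usual allocate/execute/release pairing
 * around these helpers.  blk_execute_rq() issues the request and waits
 * for completion; my_send_internal_cmd() is a hypothetical caller, and
 * a real driver would also fill in the command payload.
 */
static int my_send_internal_cmd(struct request_queue *q,
				struct gendisk *disk)
{
	struct request *rq = blk_get_request(q, WRITE, __GFP_WAIT);
	int err;

	rq->cmd_type = REQ_TYPE_SPECIAL;
	err = blk_execute_rq(q, disk, rq, 0);
	blk_put_request(rq);		/* drop the last reference */
	return err;
}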
1065
1066 void init_request_from_bio(struct request *req, struct bio *bio)
1067 {
1068 req->cpu = bio->bi_comp_cpu;
1069 req->cmd_type = REQ_TYPE_FS;
1070
1071 /*
1072 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
1073 */
1074 if (bio_rw_ahead(bio))
1075 req->cmd_flags |= (REQ_FAILFAST_DEV | REQ_FAILFAST_TRANSPORT |
1076 REQ_FAILFAST_DRIVER);
1077 if (bio_failfast_dev(bio))
1078 req->cmd_flags |= REQ_FAILFAST_DEV;
1079 if (bio_failfast_transport(bio))
1080 req->cmd_flags |= REQ_FAILFAST_TRANSPORT;
1081 if (bio_failfast_driver(bio))
1082 req->cmd_flags |= REQ_FAILFAST_DRIVER;
1083
1084 if (unlikely(bio_discard(bio))) {
1085 req->cmd_flags |= REQ_DISCARD;
1086 if (bio_barrier(bio))
1087 req->cmd_flags |= REQ_SOFTBARRIER;
1088 req->q->prepare_discard_fn(req->q, req);
1089 } else if (unlikely(bio_barrier(bio)))
1090 req->cmd_flags |= REQ_HARDBARRIER;
1091
1092 if (bio_sync(bio))
1093 req->cmd_flags |= REQ_RW_SYNC;
1094 if (bio_rw_meta(bio))
1095 req->cmd_flags |= REQ_RW_META;
1096 if (bio_noidle(bio))
1097 req->cmd_flags |= REQ_NOIDLE;
1098
1099 req->errors = 0;
1100 req->hard_sector = req->sector = bio->bi_sector;
1101 req->ioprio = bio_prio(bio);
1102 blk_rq_bio_prep(req->q, req, bio);
1103 }
1104
1105 /*
1106 * Only disable plugging for non-rotational devices if the device also
1107 * does tagging; otherwise we do need the proper merging
1108 */
1109 static inline bool queue_should_plug(struct request_queue *q)
1110 {
1111 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1112 }
1113
1114 static int __make_request(struct request_queue *q, struct bio *bio)
1115 {
1116 struct request *req;
1117 int el_ret, nr_sectors;
1118 const unsigned short prio = bio_prio(bio);
1119 const int sync = bio_sync(bio);
1120 const int unplug = bio_unplug(bio);
1121 int rw_flags;
1122
1123 nr_sectors = bio_sectors(bio);
1124
1125 /*
1126 * low level driver can indicate that it wants pages above a
1127 * certain limit bounced to low memory (ie for highmem, or even
1128 * ISA dma in theory)
1129 */
1130 blk_queue_bounce(q, &bio);
1131
1132 spin_lock_irq(q->queue_lock);
1133
1134 if (unlikely(bio_barrier(bio)) || elv_queue_empty(q))
1135 goto get_rq;
1136
1137 el_ret = elv_merge(q, &req, bio);
1138 switch (el_ret) {
1139 case ELEVATOR_BACK_MERGE:
1140 BUG_ON(!rq_mergeable(req));
1141
1142 if (!ll_back_merge_fn(q, req, bio))
1143 break;
1144
1145 trace_block_bio_backmerge(q, bio);
1146
1147 req->biotail->bi_next = bio;
1148 req->biotail = bio;
1149 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1150 req->ioprio = ioprio_best(req->ioprio, prio);
1151 if (!blk_rq_cpu_valid(req))
1152 req->cpu = bio->bi_comp_cpu;
1153 drive_stat_acct(req, 0);
1154 if (!attempt_back_merge(q, req))
1155 elv_merged_request(q, req, el_ret);
1156 goto out;
1157
1158 case ELEVATOR_FRONT_MERGE:
1159 BUG_ON(!rq_mergeable(req));
1160
1161 if (!ll_front_merge_fn(q, req, bio))
1162 break;
1163
1164 trace_block_bio_frontmerge(q, bio);
1165
1166 bio->bi_next = req->bio;
1167 req->bio = bio;
1168
1169 /*
1170 * may not be valid. If the low level driver said
1171 * it didn't need a bounce buffer then it better
1172 * not touch req->buffer either...
1173 */
1174 req->buffer = bio_data(bio);
1175 req->current_nr_sectors = bio_cur_sectors(bio);
1176 req->hard_cur_sectors = req->current_nr_sectors;
1177 req->sector = req->hard_sector = bio->bi_sector;
1178 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
1179 req->ioprio = ioprio_best(req->ioprio, prio);
1180 if (!blk_rq_cpu_valid(req))
1181 req->cpu = bio->bi_comp_cpu;
1182 drive_stat_acct(req, 0);
1183 if (!attempt_front_merge(q, req))
1184 elv_merged_request(q, req, el_ret);
1185 goto out;
1186
1187 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1188 default:
1189 ;
1190 }
1191
1192 get_rq:
1193 /*
1194 * This sync check and mask will be re-done in init_request_from_bio(),
1195 * but we need to set it earlier to expose the sync flag to the
1196 * rq allocator and io schedulers.
1197 */
1198 rw_flags = bio_data_dir(bio);
1199 if (sync)
1200 rw_flags |= REQ_RW_SYNC;
1201
1202 /*
1203 * Grab a free request. This might sleep but cannot fail.
1204 * Returns with the queue unlocked.
1205 */
1206 req = get_request_wait(q, rw_flags, bio);
1207
1208 /*
1209 * After dropping the lock and possibly sleeping here, our request
1210 * may now be mergeable after it had proven unmergeable (above).
1211 * We don't worry about that case for efficiency. It won't happen
1212 * often, and the elevators are able to handle it.
1213 */
1214 init_request_from_bio(req, bio);
1215
1216 spin_lock_irq(q->queue_lock);
1217 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1218 bio_flagged(bio, BIO_CPU_AFFINE))
1219 req->cpu = blk_cpu_to_group(smp_processor_id());
1220 if (queue_should_plug(q) && elv_queue_empty(q))
1221 blk_plug_device(q);
1222 add_request(q, req);
1223 out:
1224 if (unplug || !queue_should_plug(q))
1225 __generic_unplug_device(q);
1226 spin_unlock_irq(q->queue_lock);
1227 return 0;
1228 }
1229
1230 /*
1231 * If bio->bi_bdev is a partition, remap the location
1232 */
1233 static inline void blk_partition_remap(struct bio *bio)
1234 {
1235 struct block_device *bdev = bio->bi_bdev;
1236
1237 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1238 struct hd_struct *p = bdev->bd_part;
1239
1240 bio->bi_sector += p->start_sect;
1241 bio->bi_bdev = bdev->bd_contains;
1242
1243 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1244 bdev->bd_dev, bio->bi_sector,
1245 bio->bi_sector - p->start_sect);
1246 }
1247 }
1248
1249 static void handle_bad_sector(struct bio *bio)
1250 {
1251 char b[BDEVNAME_SIZE];
1252
1253 printk(KERN_INFO "attempt to access beyond end of device\n");
1254 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1255 bdevname(bio->bi_bdev, b),
1256 bio->bi_rw,
1257 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1258 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
1259
1260 set_bit(BIO_EOF, &bio->bi_flags);
1261 }
1262
1263 #ifdef CONFIG_FAIL_MAKE_REQUEST
1264
1265 static DECLARE_FAULT_ATTR(fail_make_request);
1266
1267 static int __init setup_fail_make_request(char *str)
1268 {
1269 return setup_fault_attr(&fail_make_request, str);
1270 }
1271 __setup("fail_make_request=", setup_fail_make_request);
1272
1273 static int should_fail_request(struct bio *bio)
1274 {
1275 struct hd_struct *part = bio->bi_bdev->bd_part;
1276
1277 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1278 return should_fail(&fail_make_request, bio->bi_size);
1279
1280 return 0;
1281 }
1282
1283 static int __init fail_make_request_debugfs(void)
1284 {
1285 return init_fault_attr_dentries(&fail_make_request,
1286 "fail_make_request");
1287 }
1288
1289 late_initcall(fail_make_request_debugfs);
1290
1291 #else /* CONFIG_FAIL_MAKE_REQUEST */
1292
1293 static inline int should_fail_request(struct bio *bio)
1294 {
1295 return 0;
1296 }
1297
1298 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1299
1300 /*
1301 * Check whether this bio extends beyond the end of the device.
1302 */
1303 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1304 {
1305 sector_t maxsector;
1306
1307 if (!nr_sectors)
1308 return 0;
1309
1310 /* Test device or partition size, when known. */
1311 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1312 if (maxsector) {
1313 sector_t sector = bio->bi_sector;
1314
1315 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1316 /*
1317 * This may well happen - the kernel calls bread()
1318 * without checking the size of the device, e.g., when
1319 * mounting a device.
1320 */
1321 handle_bad_sector(bio);
1322 return 1;
1323 }
1324 }
1325
1326 return 0;
1327 }
1328
1329 /**
1330 * generic_make_request - hand a buffer to its device driver for I/O
1331 * @bio: The bio describing the location in memory and on the device.
1332 *
1333 * generic_make_request() is used to make I/O requests of block
1334 * devices. It is passed a &struct bio, which describes the I/O that needs
1335 * to be done.
1336 *
1337 * generic_make_request() does not return any status. The
1338 * success/failure status of the request, along with notification of
1339 * completion, is delivered asynchronously through the bio->bi_end_io
1340 * function described (one day) elsewhere.
1341 *
1342 * The caller of generic_make_request must make sure that bi_io_vec
1343 * is set to describe the memory buffer, that bi_bdev and bi_sector are
1344 * set to describe the device address, and that
1345 * bi_end_io and optionally bi_private are set to describe how
1346 * completion notification should be signaled.
1347 *
1348 * generic_make_request and the drivers it calls may use bi_next if this
1349 * bio happens to be merged with someone else, and may change bi_bdev and
1350 * bi_sector for remaps as it sees fit. So the values of these fields
1351 * should NOT be depended on after the call to generic_make_request.
1352 */
1353 static inline void __generic_make_request(struct bio *bio)
1354 {
1355 struct request_queue *q;
1356 sector_t old_sector;
1357 int ret, nr_sectors = bio_sectors(bio);
1358 dev_t old_dev;
1359 int err = -EIO;
1360
1361 might_sleep();
1362
1363 if (bio_check_eod(bio, nr_sectors))
1364 goto end_io;
1365
1366 /*
1367 * Resolve the mapping until finished. (drivers are
1368 * still free to implement/resolve their own stacking
1369 * by explicitly returning 0)
1370 *
1371 * NOTE: we don't repeat the blk_size check for each new device.
1372 * Stacking drivers are expected to know what they are doing.
1373 */
1374 old_sector = -1;
1375 old_dev = 0;
1376 do {
1377 char b[BDEVNAME_SIZE];
1378
1379 q = bdev_get_queue(bio->bi_bdev);
1380 if (unlikely(!q)) {
1381 printk(KERN_ERR
1382 "generic_make_request: Trying to access "
1383 "nonexistent block-device %s (%Lu)\n",
1384 bdevname(bio->bi_bdev, b),
1385 (long long) bio->bi_sector);
1386 goto end_io;
1387 }
1388
1389 if (unlikely(nr_sectors > q->max_hw_sectors)) {
1390 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1391 bdevname(bio->bi_bdev, b),
1392 bio_sectors(bio),
1393 q->max_hw_sectors);
1394 goto end_io;
1395 }
1396
1397 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1398 goto end_io;
1399
1400 if (should_fail_request(bio))
1401 goto end_io;
1402
1403 /*
1404 * If this device has partitions, remap block n
1405 * of partition p to block n+start(p) of the disk.
1406 */
1407 blk_partition_remap(bio);
1408
1409 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1410 goto end_io;
1411
1412 if (old_sector != -1)
1413 trace_block_remap(q, bio, old_dev, bio->bi_sector,
1414 old_sector);
1415
1416 trace_block_bio_queue(q, bio);
1417
1418 old_sector = bio->bi_sector;
1419 old_dev = bio->bi_bdev->bd_dev;
1420
1421 if (bio_check_eod(bio, nr_sectors))
1422 goto end_io;
1423
1424 if (bio_discard(bio) && !q->prepare_discard_fn) {
1425 err = -EOPNOTSUPP;
1426 goto end_io;
1427 }
1428 if (bio_barrier(bio) && bio_has_data(bio) &&
1429 (q->next_ordered == QUEUE_ORDERED_NONE)) {
1430 err = -EOPNOTSUPP;
1431 goto end_io;
1432 }
1433
1434 ret = q->make_request_fn(q, bio);
1435 } while (ret);
1436
1437 return;
1438
1439 end_io:
1440 bio_endio(bio, err);
1441 }
1442
1443 /*
1444 * We only want one ->make_request_fn to be active at a time,
1445 * else stack usage with stacked devices could be a problem.
1446 * So use current->bio_{list,tail} to keep a list of requests
1447 * submitted by a make_request_fn function.
1448 * current->bio_tail is also used as a flag to say if
1449 * generic_make_request is currently active in this task or not.
1450 * If it is NULL, then no make_request is active. If it is non-NULL,
1451 * then a make_request is active, and new requests should be added
1452 * at the tail
1453 */
1454 void generic_make_request(struct bio *bio)
1455 {
1456 if (current->bio_tail) {
1457 /* make_request is active */
1458 *(current->bio_tail) = bio;
1459 bio->bi_next = NULL;
1460 current->bio_tail = &bio->bi_next;
1461 return;
1462 }
1463 /* following loop may be a bit non-obvious, and so deserves some
1464 * explanation.
1465 * Before entering the loop, bio->bi_next is NULL (as all callers
1466 * ensure that) so we have a list with a single bio.
1467 * We pretend that we have just taken it off a longer list, so
1468 * we assign bio_list to the next (which is NULL) and bio_tail
1469 * to &bio_list, thus initialising the bio_list of new bios to be
1470 * added. __generic_make_request may indeed add some more bios
1471 * through a recursive call to generic_make_request. If it
1472 * did, we find a non-NULL value in bio_list and re-enter the loop
1473 * from the top. In this case we really did just take the bio
1474 * off the top of the list (no pretending) and so fixup bio_list and
1475 * bio_tail or bi_next, and call into __generic_make_request again.
1476 *
1477 * The loop was structured like this to make only one call to
1478 * __generic_make_request (which is important as it is large and
1479 * inlined) and to keep the structure simple.
1480 */
1481 BUG_ON(bio->bi_next);
1482 do {
1483 current->bio_list = bio->bi_next;
1484 if (bio->bi_next == NULL)
1485 current->bio_tail = &current->bio_list;
1486 else
1487 bio->bi_next = NULL;
1488 __generic_make_request(bio);
1489 bio = current->bio_list;
1490 } while (bio);
1491 current->bio_tail = NULL; /* deactivate */
1492 }
1493 EXPORT_SYMBOL(generic_make_request);
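/*
 * Editor's note: a sketch (not from this file) of why the bio_list
 * handling above matters.  A stacking driver's make_request_fn that
 * splits a bio resubmits the pieces with generic_make_request(); since
 * current->bio_tail is non-NULL at that point, the nested calls only
 * append to current->bio_list and return, so the stack stays flat no
 * matter how deep the device stack is.  my_split_to_lower_dev() is a
 * hypothetical helper assumed to clone the tail of the bio and remap
 * both pieces (bi_bdev, bi_sector) to the underlying device.
 */
static int my_make_request(struct request_queue *q, struct bio *bio)
{
	struct bio *rest = my_split_to_lower_dev(bio);	/* hypothetical */

	generic_make_request(bio);	/* queued on current->bio_list */
	if (rest)
		generic_make_request(rest);
	return 0;	/* mapping resolved; don't loop in the caller */
}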
1494
1495 /**
1496 * submit_bio - submit a bio to the block device layer for I/O
1497 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1498 * @bio: The &struct bio which describes the I/O
1499 *
1500 * submit_bio() is very similar in purpose to generic_make_request(), and
1501 * uses that function to do most of the work. Both are fairly rough
1502 * interfaces; @bio must be presetup and ready for I/O.
1503 *
1504 */
1505 void submit_bio(int rw, struct bio *bio)
1506 {
1507 int count = bio_sectors(bio);
1508
1509 bio->bi_rw |= rw;
1510
1511 /*
1512 * If it's a regular read/write or a barrier with data attached,
1513 * go through the normal accounting stuff before submission.
1514 */
1515 if (bio_has_data(bio)) {
1516 if (rw & WRITE) {
1517 count_vm_events(PGPGOUT, count);
1518 } else {
1519 task_io_account_read(bio->bi_size);
1520 count_vm_events(PGPGIN, count);
1521 }
1522
1523 if (unlikely(block_dump)) {
1524 char b[BDEVNAME_SIZE];
1525 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1526 current->comm, task_pid_nr(current),
1527 (rw & WRITE) ? "WRITE" : "READ",
1528 (unsigned long long)bio->bi_sector,
1529 bdevname(bio->bi_bdev, b));
1530 }
1531 }
1532
1533 generic_make_request(bio);
1534 }
1535 EXPORT_SYMBOL(submit_bio);
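/*
 * Editor's note: a minimal sketch of setting up and submitting a bio,
 * under the constraints the kernel-doc above lists (bi_bdev, bi_sector,
 * the io_vec and bi_end_io set before submission).  my_end_io() and
 * my_read_sector() are hypothetical; the synchronous wait is just for
 * the example.
 */
static void my_end_io(struct bio *bio, int error)
{
	struct completion *done = bio->bi_private;

	bio_put(bio);
	complete(done);
}

static void my_read_sector(struct block_device *bdev, sector_t sector,
			   struct page *page)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = my_end_io;
	bio->bi_private = &done;
	bio_add_page(bio, page, 512, 0);	/* one sector, one segment */

	submit_bio(READ, bio);
	wait_for_completion(&done);	/* status arrives via my_end_io() */
}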
1536
1537 /**
1538 * blk_rq_check_limits - Helper function to check a request for the queue limit
1539 * @q: the queue
1540 * @rq: the request being checked
1541 *
1542 * Description:
1543 * @rq may have been made based on weaker limitations of upper-level queues
1544 * in request stacking drivers, and it may violate the limitation of @q.
1545 * Since the block layer and the underlying device driver trust @rq
1546 * after it is inserted to @q, it should be checked against @q before
1547 * the insertion using this generic function.
1548 *
1549 * This function should also be useful for request stacking drivers
1550 * in some cases below, so export this function.
1551 * Request stacking drivers like request-based dm may change the queue
1552 * limits while requests are in the queue (e.g. dm's table swapping).
1553 * Such request stacking drivers should check those requests against
1554 * the new queue limits again when they dispatch those requests,
1555 * although such checks are also done against the old queue limits
1556 * when submitting requests.
1557 */
1558 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1559 {
1560 if (rq->nr_sectors > q->max_sectors ||
1561 rq->data_len > q->max_hw_sectors << 9) {
1562 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1563 return -EIO;
1564 }
1565
1566 /*
1567 * queue's settings related to segment counting like q->bounce_pfn
1568 * may differ from that of other stacking queues.
1569 * Recalculate it to check the request correctly on this queue's
1570 * limitation.
1571 */
1572 blk_recalc_rq_segments(rq);
1573 if (rq->nr_phys_segments > q->max_phys_segments ||
1574 rq->nr_phys_segments > q->max_hw_segments) {
1575 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1576 return -EIO;
1577 }
1578
1579 return 0;
1580 }
1581 EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1582
1583 /**
1584 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1585 * @q: the queue to submit the request
1586 * @rq: the request being queued
1587 */
1588 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1589 {
1590 unsigned long flags;
1591
1592 if (blk_rq_check_limits(q, rq))
1593 return -EIO;
1594
1595 #ifdef CONFIG_FAIL_MAKE_REQUEST
1596 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1597 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1598 return -EIO;
1599 #endif
1600
1601 spin_lock_irqsave(q->queue_lock, flags);
1602
1603 /*
1604 * The request being submitted must be dequeued before calling
1605 * this function because it will be linked to another request_queue
1606 */
1607 BUG_ON(blk_queued_rq(rq));
1608
1609 drive_stat_acct(rq, 1);
1610 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1611
1612 spin_unlock_irqrestore(q->queue_lock, flags);
1613
1614 return 0;
1615 }
1616 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
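/*
 * Editor's note: a sketch of the intended caller, a request stacking
 * driver (request-based dm is the in-tree example) dispatching a fully
 * set up, unqueued clone of an upper-level request to a lower device.
 * my_dispatch_clone() is hypothetical.
 */
static int my_dispatch_clone(struct request_queue *lower_q,
			     struct request *clone)
{
	/*
	 * blk_insert_cloned_request() re-checks the clone against
	 * lower_q's limits (via blk_rq_check_limits() above) and fails
	 * with -EIO instead of handing an oversized request to the
	 * lower driver.
	 */
	return blk_insert_cloned_request(lower_q, clone);
}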
1617
1618 /**
1619 * blkdev_dequeue_request - dequeue request and start timeout timer
1620 * @req: request to dequeue
1621 *
1622 * Dequeue @req and start the timeout timer on it. This hands off the
1623 * request to the driver.
1624 *
1625 * Block internal functions which don't want to start the timer should
1626 * call elv_dequeue_request().
1627 */
1628 void blkdev_dequeue_request(struct request *req)
1629 {
1630 elv_dequeue_request(req->q, req);
1631
1632 /*
1633 * We are now handing the request to the hardware, add the
1634 * timeout handler.
1635 */
1636 blk_add_timer(req);
1637 }
1638 EXPORT_SYMBOL(blkdev_dequeue_request);
1639
1640 static void blk_account_io_completion(struct request *req, unsigned int bytes)
1641 {
1642 if (blk_do_io_stat(req)) {
1643 const int rw = rq_data_dir(req);
1644 struct hd_struct *part;
1645 int cpu;
1646
1647 cpu = part_stat_lock();
1648 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1649 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1650 part_stat_unlock();
1651 }
1652 }
1653
1654 static void blk_account_io_done(struct request *req)
1655 {
1656 /*
1657 * Account IO completion. bar_rq isn't accounted as a normal
1658 * IO on queueing nor completion. Accounting the containing
1659 * request is enough.
1660 */
1661 if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
1662 unsigned long duration = jiffies - req->start_time;
1663 const int rw = rq_data_dir(req);
1664 struct hd_struct *part;
1665 int cpu;
1666
1667 cpu = part_stat_lock();
1668 part = disk_map_sector_rcu(req->rq_disk, req->sector);
1669
1670 part_stat_inc(cpu, part, ios[rw]);
1671 part_stat_add(cpu, part, ticks[rw], duration);
1672 part_round_stats(cpu, part);
1673 part_dec_in_flight(part);
1674
1675 part_stat_unlock();
1676 }
1677 }
1678
1679 /**
1680 * blk_rq_bytes - Returns bytes left to complete in the entire request
1681 * @rq: the request being processed
1682 **/
1683 unsigned int blk_rq_bytes(struct request *rq)
1684 {
1685 if (blk_fs_request(rq))
1686 return rq->hard_nr_sectors << 9;
1687
1688 return rq->data_len;
1689 }
1690 EXPORT_SYMBOL_GPL(blk_rq_bytes);
1691
1692 /**
1693 * blk_rq_cur_bytes - Returns bytes left to complete in the current segment
1694 * @rq: the request being processed
1695 **/
1696 unsigned int blk_rq_cur_bytes(struct request *rq)
1697 {
1698 if (blk_fs_request(rq))
1699 return rq->current_nr_sectors << 9;
1700
1701 if (rq->bio)
1702 return rq->bio->bi_size;
1703
1704 return rq->data_len;
1705 }
1706 EXPORT_SYMBOL_GPL(blk_rq_cur_bytes);
1707
1708 struct request *elv_next_request(struct request_queue *q)
1709 {
1710 struct request *rq;
1711 int ret;
1712
1713 while ((rq = __elv_next_request(q)) != NULL) {
1714 if (!(rq->cmd_flags & REQ_STARTED)) {
1715 /*
1716 * This is the first time the device driver
1717 * sees this request (possibly after
1718 * requeueing). Notify IO scheduler.
1719 */
1720 if (blk_sorted_rq(rq))
1721 elv_activate_rq(q, rq);
1722
1723 /*
1724 * just mark as started even if we don't start
1725 * it; a request that has been delayed should
1726 * not be passed by new incoming requests
1727 */
1728 rq->cmd_flags |= REQ_STARTED;
1729 trace_block_rq_issue(q, rq);
1730 }
1731
1732 if (!q->boundary_rq || q->boundary_rq == rq) {
1733 q->end_sector = rq_end_sector(rq);
1734 q->boundary_rq = NULL;
1735 }
1736
1737 if (rq->cmd_flags & REQ_DONTPREP)
1738 break;
1739
1740 if (q->dma_drain_size && rq->data_len) {
1741 /*
1742 * make sure space for the drain appears; we
1743 * know we can do this because max_hw_segments
1744 * has been adjusted to be one fewer than the
1745 * device can handle
1746 */
1747 rq->nr_phys_segments++;
1748 }
1749
1750 if (!q->prep_rq_fn)
1751 break;
1752
1753 ret = q->prep_rq_fn(q, rq);
1754 if (ret == BLKPREP_OK) {
1755 break;
1756 } else if (ret == BLKPREP_DEFER) {
1757 /*
1758 * the request may have been (partially) prepped.
1759 * we need to keep this request in the front to
1760 * avoid resource deadlock. REQ_STARTED will
1761 * prevent other fs requests from passing this one.
1762 */
1763 if (q->dma_drain_size && rq->data_len &&
1764 !(rq->cmd_flags & REQ_DONTPREP)) {
1765 /*
1766 * remove the space for the drain we added
1767 * so that we don't add it again
1768 */
1769 --rq->nr_phys_segments;
1770 }
1771
1772 rq = NULL;
1773 break;
1774 } else if (ret == BLKPREP_KILL) {
1775 rq->cmd_flags |= REQ_QUIET;
1776 __blk_end_request_all(rq, -EIO);
1777 } else {
1778 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
1779 break;
1780 }
1781 }
1782
1783 return rq;
1784 }
1785 EXPORT_SYMBOL(elv_next_request);
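/*
 * Editor's note: the classic request_fn shape that elv_next_request()
 * and blkdev_dequeue_request() are designed for, as a sketch.
 * my_request_fn(), my_device_busy() and my_start_io() are hypothetical
 * driver hooks; the block layer calls a request_fn with q->queue_lock
 * already held.
 */
static void my_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (my_device_busy(q))		/* hypothetical */
			break;			/* leave rq queued for later */

		/* hand off to hardware: dequeue and start the timeout timer */
		blkdev_dequeue_request(rq);
		my_start_io(rq);		/* hypothetical, completes async */
	}
}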
1786
1787 void elv_dequeue_request(struct request_queue *q, struct request *rq)
1788 {
1789 BUG_ON(list_empty(&rq->queuelist));
1790 BUG_ON(ELV_ON_HASH(rq));
1791
1792 list_del_init(&rq->queuelist);
1793
1794 /*
1795 * the time frame between a request being removed from the lists
1796 * and when it is freed is accounted as io that is in progress at
1797 * the driver side.
1798 */
1799 if (blk_account_rq(rq))
1800 q->in_flight++;
1801 }
1803 /** 1803 /**
1804 * blk_update_request - Special helper function for request stacking drivers 1804 * blk_update_request - Special helper function for request stacking drivers
1805 * @rq: the request being processed 1805 * @rq: the request being processed
1806 * @error: %0 for success, < %0 for error 1806 * @error: %0 for success, < %0 for error
1807 * @nr_bytes: number of bytes to complete @rq 1807 * @nr_bytes: number of bytes to complete @rq
1808 * 1808 *
1809 * Description: 1809 * Description:
1810 * Ends I/O on a number of bytes attached to @rq, but doesn't complete 1810 * Ends I/O on a number of bytes attached to @rq, but doesn't complete
1811 * the request structure even if @rq doesn't have leftover. 1811 * the request structure even if @rq doesn't have leftover.
1812 * If @rq has leftover, sets it up for the next range of segments. 1812 * If @rq has leftover, sets it up for the next range of segments.
1813 * 1813 *
1814 * This special helper function is only for request stacking drivers 1814 * This special helper function is only for request stacking drivers
1815 * (e.g. request-based dm) so that they can handle partial completion. 1815 * (e.g. request-based dm) so that they can handle partial completion.
1816 * Actual device drivers should use blk_end_request instead. 1816 * Actual device drivers should use blk_end_request instead.
1817 * 1817 *
1818 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 1818 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
1819 * %false return from this function. 1819 * %false return from this function.
1820 * 1820 *
1821 * Return: 1821 * Return:
1822 * %false - this request doesn't have any more data 1822 * %false - this request doesn't have any more data
1823 * %true - this request has more data 1823 * %true - this request has more data
1824 **/ 1824 **/
1825 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) 1825 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
1826 { 1826 {
1827 int total_bytes, bio_nbytes, next_idx = 0; 1827 int total_bytes, bio_nbytes, next_idx = 0;
1828 struct bio *bio; 1828 struct bio *bio;
1829 1829
1830 if (!req->bio) 1830 if (!req->bio)
1831 return false; 1831 return false;
1832 1832
1833 trace_block_rq_complete(req->q, req); 1833 trace_block_rq_complete(req->q, req);
1834 1834
1835 /* 1835 /*
1836 * For fs requests, rq is just carrier of independent bio's 1836 * For fs requests, rq is just carrier of independent bio's
1837 * and each partial completion should be handled separately. 1837 * and each partial completion should be handled separately.
1838 * Reset per-request error on each partial completion. 1838 * Reset per-request error on each partial completion.
1839 * 1839 *
1840 * TODO: tj: This is too subtle. It would be better to let 1840 * TODO: tj: This is too subtle. It would be better to let
1841 * low level drivers do what they see fit. 1841 * low level drivers do what they see fit.
1842 */ 1842 */
1843 if (blk_fs_request(req)) 1843 if (blk_fs_request(req))
1844 req->errors = 0; 1844 req->errors = 0;

	if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
		printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
				req->rq_disk ? req->rq_disk->disk_name : "?",
				(unsigned long long)req->sector);
	}

	blk_account_io_completion(req, nr_bytes);

	total_bytes = bio_nbytes = 0;
	while ((bio = req->bio) != NULL) {
		int nbytes;

		if (nr_bytes >= bio->bi_size) {
			req->bio = bio->bi_next;
			nbytes = bio->bi_size;
			req_bio_endio(req, bio, nbytes, error);
			next_idx = 0;
			bio_nbytes = 0;
		} else {
			int idx = bio->bi_idx + next_idx;

			if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
				blk_dump_rq_flags(req, "__end_that");
				printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
				       __func__, bio->bi_idx, bio->bi_vcnt);
				break;
			}

			nbytes = bio_iovec_idx(bio, idx)->bv_len;
			BIO_BUG_ON(nbytes > bio->bi_size);

			/*
			 * not a complete bvec done
			 */
			if (unlikely(nbytes > nr_bytes)) {
				bio_nbytes += nr_bytes;
				total_bytes += nr_bytes;
				break;
			}

			/*
			 * advance to the next vector
			 */
			next_idx++;
			bio_nbytes += nbytes;
		}

		total_bytes += nbytes;
		nr_bytes -= nbytes;

		bio = req->bio;
		if (bio) {
			/*
			 * end more in this run, or just return 'not-done'
			 */
			if (unlikely(nr_bytes <= 0))
				break;
		}
	}

	/*
	 * completely done
	 */
	if (!req->bio) {
		/*
		 * Reset counters so that the request stacking driver
		 * can find how many bytes remain in the request
		 * later.
		 */
		req->nr_sectors = req->hard_nr_sectors = 0;
		req->current_nr_sectors = req->hard_cur_sectors = 0;
		return false;
	}

	/*
	 * if the request wasn't completed, update state
	 */
	if (bio_nbytes) {
		req_bio_endio(req, bio, bio_nbytes, error);
		bio->bi_idx += next_idx;
		bio_iovec(bio)->bv_offset += nr_bytes;
		bio_iovec(bio)->bv_len -= nr_bytes;
	}

	blk_recalc_rq_sectors(req, total_bytes >> 9);
	blk_recalc_rq_segments(req);
	return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
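
A minimal sketch of how a request stacking driver might consume the exported blk_update_request() above: push a clone's partial progress back to the original request, and only finish the original once no bytes remain. The end_io_data linkage, the byte bookkeeping, and finish_original_request() are assumptions of this sketch, not part of this file.

/* Hedged sketch, not from this commit: propagating partial completion
 * from a cloned request back to the original in a stacking driver. */
static void my_clone_end_io(struct request *clone, int error,
			    unsigned int bytes)
{
	struct request *orig = clone->end_io_data;	/* assumed linkage */

	if (blk_update_request(orig, error, bytes))
		return;	/* true: orig still has buffers pending */

	/* false: all data done; the counters were reset above, and it
	 * is now the caller's job to actually finish the request */
	finish_original_request(orig, error);		/* hypothetical */
}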

static bool blk_update_bidi_request(struct request *rq, int error,
				    unsigned int nr_bytes,
				    unsigned int bidi_bytes)
{
	if (blk_update_request(rq, error, nr_bytes))
		return true;

	/* Bidi request must be completed as a whole */
	if (unlikely(blk_bidi_rq(rq)) &&
	    blk_update_request(rq->next_rq, error, bidi_bytes))
		return true;

	add_disk_randomness(rq->rq_disk);

	return false;
}

/*
 * queue lock must be held
 */
static void blk_finish_request(struct request *req, int error)
{
	if (blk_rq_tagged(req))
		blk_queue_end_tag(req->q, req);

	if (blk_queued_rq(req))
		elv_dequeue_request(req->q, req);

	if (unlikely(laptop_mode) && blk_fs_request(req))
		laptop_io_completion();

	blk_delete_timer(req);

	blk_account_io_done(req);

	if (req->end_io)
		req->end_io(req, error);
	else {
		if (blk_bidi_rq(req))
			__blk_put_request(req->next_rq->q, req->next_rq);

		__blk_put_request(req->q, req);
	}
}

/**
 * blk_end_bidi_request - Complete a bidi request
 * @rq: the request to complete
 * @error: %0 for success, < %0 for error
 * @nr_bytes: number of bytes to complete @rq
 * @bidi_bytes: number of bytes to complete @rq->next_rq
 *
 * Description:
 *     Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
 *     Drivers that support bidi can safely call this member for any
 *     type of request, bidi or uni. In the latter case @bidi_bytes is
 *     just ignored.
 *
 * Return:
 *     %false - we are done with this request
 *     %true  - still buffers pending for this request
 **/
bool blk_end_bidi_request(struct request *rq, int error,
			  unsigned int nr_bytes, unsigned int bidi_bytes)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
		return true;

	spin_lock_irqsave(q->queue_lock, flags);
	blk_finish_request(rq, error);
	spin_unlock_irqrestore(q->queue_lock, flags);

	return false;
}
EXPORT_SYMBOL_GPL(blk_end_bidi_request);
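
A hedged usage sketch of the contract the kernel-doc above spells out: for a uni-directional request @bidi_bytes is ignored, so a driver can unconditionally pass 0 and branch on the return value. The function name and how the driver obtains rq and bytes are illustrative.

/* Hedged sketch: completing a request from driver context without
 * holding the queue lock (blk_end_bidi_request() takes it itself). */
static void my_complete_rq(struct request *rq, int error,
			   unsigned int bytes)
{
	/* bidi_bytes == 0 is safe for uni-directional requests */
	if (blk_end_bidi_request(rq, error, bytes, 0))
		return;		/* buffers still pending */
	/* false: request fully finished and put */
}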

/**
 * __blk_end_bidi_request - Complete a bidi request with queue lock held
 * @rq: the request to complete
 * @error: %0 for success, < %0 for error
 * @nr_bytes: number of bytes to complete @rq
 * @bidi_bytes: number of bytes to complete @rq->next_rq
 *
 * Description:
 *     Identical to blk_end_bidi_request() except that queue lock is
 *     assumed to be locked on entry and remains so on return.
 *
 * Return:
 *     %false - we are done with this request
 *     %true  - still buffers pending for this request
 **/
bool __blk_end_bidi_request(struct request *rq, int error,
			    unsigned int nr_bytes, unsigned int bidi_bytes)
{
	if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
		return true;

	blk_finish_request(rq, error);

	return false;
}
EXPORT_SYMBOL_GPL(__blk_end_bidi_request);
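
The locked/unlocked pair above follows the usual kernel convention for __-prefixed variants. A hedged sketch contrasting the two entry points; everything except the two functions defined in this file is illustrative:

/* Hedged sketch: pick the variant by locking context. */
static void my_end_unlocked(struct request *rq, int error, unsigned int n)
{
	blk_end_bidi_request(rq, error, n, 0);	/* takes queue_lock itself */
}

static void my_end_locked(struct request *rq, int error, unsigned int n)
{
	/* caller must already hold rq->q->queue_lock, e.g. inside a
	 * driver's request_fn; taking it again would deadlock */
	__blk_end_bidi_request(rq, error, n, 0);
}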

void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
		     struct bio *bio)
{
	/* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw, and
	   we want BIO_RW_AHEAD (bit 1) to imply REQ_FAILFAST (bit 1). */
	rq->cmd_flags |= (bio->bi_rw & 3);

	if (bio_has_data(bio)) {
		rq->nr_phys_segments = bio_phys_segments(q, bio);
		rq->buffer = bio_data(bio);
	}
	rq->current_nr_sectors = bio_cur_sectors(bio);
	rq->hard_cur_sectors = rq->current_nr_sectors;
	rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
	rq->data_len = bio->bi_size;

	rq->bio = rq->biotail = bio;

	if (bio->bi_bdev)
		rq->rq_disk = bio->bi_bdev->bd_disk;
}
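
A worked illustration of the masking in the comment above, using only the raw bit positions it names (0 = R/W, 1 = read-ahead implying failfast). This helper is purely illustrative, not kernel code:

/* Hedged illustration: the mask 3 (0b11) forwards bits 0 and 1 of
 * bi_rw into cmd_flags and drops any higher, bio-only flag bits.
 * E.g. bi_rw == 0x23 (write + readahead + some high bit) yields 0x3. */
static inline unsigned int low_rw_bits(unsigned long bi_rw)
{
	return bi_rw & 3;
}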

/**
 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
 * @q : the queue of the device being checked
 *
 * Description:
 *    Check if underlying low-level drivers of a device are busy.
 *    If the drivers want to export their busy state, they must set their
 *    own exporting function using blk_queue_lld_busy() first.
 *
 *    Basically, this function is used only by request stacking drivers
 *    to stop dispatching requests to underlying devices when underlying
 *    devices are busy. This behavior allows more I/O merging on the queue
 *    of the request stacking driver and prevents I/O throughput regression
 *    on burst I/O load.
 *
 * Return:
 *    0 - Not busy (The request stacking driver should dispatch request)
 *    1 - Busy (The request stacking driver should stop dispatching request)
 */
int blk_lld_busy(struct request_queue *q)
{
	if (q->lld_busy_fn)
		return q->lld_busy_fn(q);

	return 0;
}
EXPORT_SYMBOL_GPL(blk_lld_busy);
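
A hedged sketch of the two halves the kernel-doc describes: the low-level driver exports its busy state with blk_queue_lld_busy(), and the request stacking driver polls blk_lld_busy() on the underlying queue before dispatching. The device structure, queuedata linkage, and busy heuristic are assumptions of this sketch.

/* Low-level driver side: export a busy predicate (hypothetical dev). */
static int my_lld_busy(struct request_queue *q)
{
	struct my_dev *dev = q->queuedata;	/* assumed linkage */

	return dev->inflight >= dev->depth;	/* 1 = busy, 0 = not */
}

/* at init time (illustrative): blk_queue_lld_busy(q, my_lld_busy); */

/* Stacking driver side: back off while the underlying device is busy. */
static int my_should_dispatch(struct request_queue *underlying_q)
{
	return !blk_lld_busy(underlying_q);
}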

int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
{
	return queue_work(kblockd_workqueue, work);
}
EXPORT_SYMBOL(kblockd_schedule_work);
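
kblockd_schedule_work() is a thin wrapper over queue_work() onto the kblockd workqueue created in blk_dev_init() below; note the @q argument is not consumed by this implementation. A hedged sketch of punting driver work to it, where the work item and handler are illustrative:

/* Hedged sketch: deferring work to kblockd process context. */
static void my_deferred(struct work_struct *work)
{
	/* runs later in kblockd, not in the caller's context */
}
static DECLARE_WORK(my_work, my_deferred);

/* somewhere with a queue in scope (illustrative):
 *	kblockd_schedule_work(q, &my_work);
 */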

int __init blk_dev_init(void)
{
+	BUILD_BUG_ON(__REQ_NR_BITS > 8 *
+			sizeof(((struct request *)0)->cmd_flags));
+
	kblockd_workqueue = create_workqueue("kblockd");
	if (!kblockd_workqueue)
		panic("Failed to create kblockd\n");

	request_cachep = kmem_cache_create("blkdev_requests",
			sizeof(struct request), 0, SLAB_PANIC, NULL);

	blk_requestq_cachep = kmem_cache_create("blkdev_queue",
			sizeof(struct request_queue), 0, SLAB_PANIC, NULL);

	return 0;
}
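
The three added lines (marked +) are the whole point of this commit: cmd_flags is a fixed-width field in struct request, and the enum of __REQ_* bits can silently outgrow it as flags accumulate. BUILD_BUG_ON() turns that overflow into a compile-time failure. A hedged, stand-alone illustration of the pattern; the struct and enum here are toy stand-ins, not the kernel's:

/* Toy stand-in showing what the added check catches. */
struct toy_request {
	unsigned short cmd_flags;	/* 16 bits available */
};

enum {
	__TOY_BIT_0,
	/* imagine more bits accumulating over time... */
	__TOY_NR_BITS = 17,		/* one bit too many */
};

void toy_init(void)
{
	/* fails to compile: 17 > 8 * sizeof(cmd_flags) == 16 */
	BUILD_BUG_ON(__TOY_NR_BITS > 8 *
			sizeof(((struct toy_request *)0)->cmd_flags));
}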