Commit 49cac01e1fa74174d72adb0e872504a7fefd7c01

Authored by Jens Axboe
1 parent a237c1c5bc

block: make unplug timer trace event correspond to the schedule() unplug

It's a pretty close match to what we had before: the timer triggering
meant that nobody unplugged the plug in due time, and in the new scheme
that corresponds very closely to the schedule() unplug. It's essentially
the difference between an explicit unplug (IO unplug) and an implicit
unplug (timer unplug, where we scheduled with pending IO queued).

Signed-off-by: Jens Axboe <jaxboe@fusionio.com>
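
For context, a minimal sketch (not part of this commit) of how the two unplug
cases arise with the on-stack plugging API. blk_start_plug()/blk_finish_plug()
and submit_bio() are the real 2.6.39-era interfaces; the wrapper function and
bio arguments are purely illustrative.

    #include <linux/blkdev.h>
    #include <linux/bio.h>

    /* Hypothetical helper: batch two bios under one plug. */
    static void submit_two_bios(struct bio *bio1, struct bio *bio2)
    {
    	struct blk_plug plug;

    	blk_start_plug(&plug);		/* IO is held on the per-task plug list */
    	submit_bio(WRITE, bio1);
    	submit_bio(WRITE, bio2);

    	/*
    	 * Explicit unplug: flushing the plug here is what the "IO unplug"
    	 * trace event corresponds to.
    	 */
    	blk_finish_plug(&plug);

    	/*
    	 * Implicit unplug: had the task blocked (schedule()) while the plug
    	 * still held requests, the scheduler hook would have flushed it
    	 * instead -- the case this commit maps onto the old "timer unplug"
    	 * trace event.
    	 */
    }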

Showing 3 changed files with 31 additions and 18 deletions

1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
7 * - July2000 7 * - July2000
8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
9 */ 9 */
10 10
11 /* 11 /*
12 * This handles all read/write requests to block devices 12 * This handles all read/write requests to block devices
13 */ 13 */
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/bio.h> 17 #include <linux/bio.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/kernel_stat.h> 21 #include <linux/kernel_stat.h>
22 #include <linux/string.h> 22 #include <linux/string.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/fault-inject.h> 29 #include <linux/fault-inject.h>
30 #include <linux/list_sort.h> 30 #include <linux/list_sort.h>
31 31
32 #define CREATE_TRACE_POINTS 32 #define CREATE_TRACE_POINTS
33 #include <trace/events/block.h> 33 #include <trace/events/block.h>
34 34
35 #include "blk.h" 35 #include "blk.h"
36 36
37 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); 37 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
38 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 38 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
40 40
41 static int __make_request(struct request_queue *q, struct bio *bio); 41 static int __make_request(struct request_queue *q, struct bio *bio);
42 42
43 /* 43 /*
44 * For the allocated request tables 44 * For the allocated request tables
45 */ 45 */
46 static struct kmem_cache *request_cachep; 46 static struct kmem_cache *request_cachep;
47 47
48 /* 48 /*
49 * For queue allocation 49 * For queue allocation
50 */ 50 */
51 struct kmem_cache *blk_requestq_cachep; 51 struct kmem_cache *blk_requestq_cachep;
52 52
53 /* 53 /*
54 * Controlling structure to kblockd 54 * Controlling structure to kblockd
55 */ 55 */
56 static struct workqueue_struct *kblockd_workqueue; 56 static struct workqueue_struct *kblockd_workqueue;
57 57
58 static void drive_stat_acct(struct request *rq, int new_io) 58 static void drive_stat_acct(struct request *rq, int new_io)
59 { 59 {
60 struct hd_struct *part; 60 struct hd_struct *part;
61 int rw = rq_data_dir(rq); 61 int rw = rq_data_dir(rq);
62 int cpu; 62 int cpu;
63 63
64 if (!blk_do_io_stat(rq)) 64 if (!blk_do_io_stat(rq))
65 return; 65 return;
66 66
67 cpu = part_stat_lock(); 67 cpu = part_stat_lock();
68 68
69 if (!new_io) { 69 if (!new_io) {
70 part = rq->part; 70 part = rq->part;
71 part_stat_inc(cpu, part, merges[rw]); 71 part_stat_inc(cpu, part, merges[rw]);
72 } else { 72 } else {
73 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 73 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
74 if (!hd_struct_try_get(part)) { 74 if (!hd_struct_try_get(part)) {
75 /* 75 /*
76 * The partition is already being removed, 76 * The partition is already being removed,
77 * the request will be accounted on the disk only 77 * the request will be accounted on the disk only
78 * 78 *
79 * We take a reference on disk->part0 although that 79 * We take a reference on disk->part0 although that
80 * partition will never be deleted, so we can treat 80 * partition will never be deleted, so we can treat
81 * it as any other partition. 81 * it as any other partition.
82 */ 82 */
83 part = &rq->rq_disk->part0; 83 part = &rq->rq_disk->part0;
84 hd_struct_get(part); 84 hd_struct_get(part);
85 } 85 }
86 part_round_stats(cpu, part); 86 part_round_stats(cpu, part);
87 part_inc_in_flight(part, rw); 87 part_inc_in_flight(part, rw);
88 rq->part = part; 88 rq->part = part;
89 } 89 }
90 90
91 part_stat_unlock(); 91 part_stat_unlock();
92 } 92 }
93 93
94 void blk_queue_congestion_threshold(struct request_queue *q) 94 void blk_queue_congestion_threshold(struct request_queue *q)
95 { 95 {
96 int nr; 96 int nr;
97 97
98 nr = q->nr_requests - (q->nr_requests / 8) + 1; 98 nr = q->nr_requests - (q->nr_requests / 8) + 1;
99 if (nr > q->nr_requests) 99 if (nr > q->nr_requests)
100 nr = q->nr_requests; 100 nr = q->nr_requests;
101 q->nr_congestion_on = nr; 101 q->nr_congestion_on = nr;
102 102
103 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 103 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
104 if (nr < 1) 104 if (nr < 1)
105 nr = 1; 105 nr = 1;
106 q->nr_congestion_off = nr; 106 q->nr_congestion_off = nr;
107 } 107 }
108 108
109 /** 109 /**
110 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 110 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
111 * @bdev: device 111 * @bdev: device
112 * 112 *
113 * Locates the passed device's request queue and returns the address of its 113 * Locates the passed device's request queue and returns the address of its
114 * backing_dev_info 114 * backing_dev_info
115 * 115 *
116 * Will return NULL if the request queue cannot be located. 116 * Will return NULL if the request queue cannot be located.
117 */ 117 */
118 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 118 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
119 { 119 {
120 struct backing_dev_info *ret = NULL; 120 struct backing_dev_info *ret = NULL;
121 struct request_queue *q = bdev_get_queue(bdev); 121 struct request_queue *q = bdev_get_queue(bdev);
122 122
123 if (q) 123 if (q)
124 ret = &q->backing_dev_info; 124 ret = &q->backing_dev_info;
125 return ret; 125 return ret;
126 } 126 }
127 EXPORT_SYMBOL(blk_get_backing_dev_info); 127 EXPORT_SYMBOL(blk_get_backing_dev_info);
128 128
129 void blk_rq_init(struct request_queue *q, struct request *rq) 129 void blk_rq_init(struct request_queue *q, struct request *rq)
130 { 130 {
131 memset(rq, 0, sizeof(*rq)); 131 memset(rq, 0, sizeof(*rq));
132 132
133 INIT_LIST_HEAD(&rq->queuelist); 133 INIT_LIST_HEAD(&rq->queuelist);
134 INIT_LIST_HEAD(&rq->timeout_list); 134 INIT_LIST_HEAD(&rq->timeout_list);
135 rq->cpu = -1; 135 rq->cpu = -1;
136 rq->q = q; 136 rq->q = q;
137 rq->__sector = (sector_t) -1; 137 rq->__sector = (sector_t) -1;
138 INIT_HLIST_NODE(&rq->hash); 138 INIT_HLIST_NODE(&rq->hash);
139 RB_CLEAR_NODE(&rq->rb_node); 139 RB_CLEAR_NODE(&rq->rb_node);
140 rq->cmd = rq->__cmd; 140 rq->cmd = rq->__cmd;
141 rq->cmd_len = BLK_MAX_CDB; 141 rq->cmd_len = BLK_MAX_CDB;
142 rq->tag = -1; 142 rq->tag = -1;
143 rq->ref_count = 1; 143 rq->ref_count = 1;
144 rq->start_time = jiffies; 144 rq->start_time = jiffies;
145 set_start_time_ns(rq); 145 set_start_time_ns(rq);
146 rq->part = NULL; 146 rq->part = NULL;
147 } 147 }
148 EXPORT_SYMBOL(blk_rq_init); 148 EXPORT_SYMBOL(blk_rq_init);
149 149
150 static void req_bio_endio(struct request *rq, struct bio *bio, 150 static void req_bio_endio(struct request *rq, struct bio *bio,
151 unsigned int nbytes, int error) 151 unsigned int nbytes, int error)
152 { 152 {
153 if (error) 153 if (error)
154 clear_bit(BIO_UPTODATE, &bio->bi_flags); 154 clear_bit(BIO_UPTODATE, &bio->bi_flags);
155 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 155 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
156 error = -EIO; 156 error = -EIO;
157 157
158 if (unlikely(nbytes > bio->bi_size)) { 158 if (unlikely(nbytes > bio->bi_size)) {
159 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 159 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
160 __func__, nbytes, bio->bi_size); 160 __func__, nbytes, bio->bi_size);
161 nbytes = bio->bi_size; 161 nbytes = bio->bi_size;
162 } 162 }
163 163
164 if (unlikely(rq->cmd_flags & REQ_QUIET)) 164 if (unlikely(rq->cmd_flags & REQ_QUIET))
165 set_bit(BIO_QUIET, &bio->bi_flags); 165 set_bit(BIO_QUIET, &bio->bi_flags);
166 166
167 bio->bi_size -= nbytes; 167 bio->bi_size -= nbytes;
168 bio->bi_sector += (nbytes >> 9); 168 bio->bi_sector += (nbytes >> 9);
169 169
170 if (bio_integrity(bio)) 170 if (bio_integrity(bio))
171 bio_integrity_advance(bio, nbytes); 171 bio_integrity_advance(bio, nbytes);
172 172
173 /* don't actually finish bio if it's part of flush sequence */ 173 /* don't actually finish bio if it's part of flush sequence */
174 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ)) 174 if (bio->bi_size == 0 && !(rq->cmd_flags & REQ_FLUSH_SEQ))
175 bio_endio(bio, error); 175 bio_endio(bio, error);
176 } 176 }
177 177
178 void blk_dump_rq_flags(struct request *rq, char *msg) 178 void blk_dump_rq_flags(struct request *rq, char *msg)
179 { 179 {
180 int bit; 180 int bit;
181 181
182 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 182 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
183 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 183 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
184 rq->cmd_flags); 184 rq->cmd_flags);
185 185
186 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 186 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
187 (unsigned long long)blk_rq_pos(rq), 187 (unsigned long long)blk_rq_pos(rq),
188 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 188 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
189 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 189 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n",
190 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 190 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
191 191
192 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 192 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
193 printk(KERN_INFO " cdb: "); 193 printk(KERN_INFO " cdb: ");
194 for (bit = 0; bit < BLK_MAX_CDB; bit++) 194 for (bit = 0; bit < BLK_MAX_CDB; bit++)
195 printk("%02x ", rq->cmd[bit]); 195 printk("%02x ", rq->cmd[bit]);
196 printk("\n"); 196 printk("\n");
197 } 197 }
198 } 198 }
199 EXPORT_SYMBOL(blk_dump_rq_flags); 199 EXPORT_SYMBOL(blk_dump_rq_flags);
200 200
201 static void blk_delay_work(struct work_struct *work) 201 static void blk_delay_work(struct work_struct *work)
202 { 202 {
203 struct request_queue *q; 203 struct request_queue *q;
204 204
205 q = container_of(work, struct request_queue, delay_work.work); 205 q = container_of(work, struct request_queue, delay_work.work);
206 spin_lock_irq(q->queue_lock); 206 spin_lock_irq(q->queue_lock);
207 __blk_run_queue(q, false); 207 __blk_run_queue(q, false);
208 spin_unlock_irq(q->queue_lock); 208 spin_unlock_irq(q->queue_lock);
209 } 209 }
210 210
211 /** 211 /**
212 * blk_delay_queue - restart queueing after defined interval 212 * blk_delay_queue - restart queueing after defined interval
213 * @q: The &struct request_queue in question 213 * @q: The &struct request_queue in question
214 * @msecs: Delay in msecs 214 * @msecs: Delay in msecs
215 * 215 *
216 * Description: 216 * Description:
217 * Sometimes queueing needs to be postponed for a little while, to allow 217 * Sometimes queueing needs to be postponed for a little while, to allow
218 * resources to come back. This function will make sure that queueing is 218 * resources to come back. This function will make sure that queueing is
219 * restarted around the specified time. 219 * restarted around the specified time.
220 */ 220 */
221 void blk_delay_queue(struct request_queue *q, unsigned long msecs) 221 void blk_delay_queue(struct request_queue *q, unsigned long msecs)
222 { 222 {
223 schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs)); 223 schedule_delayed_work(&q->delay_work, msecs_to_jiffies(msecs));
224 } 224 }
225 EXPORT_SYMBOL(blk_delay_queue); 225 EXPORT_SYMBOL(blk_delay_queue);
226 226
227 /** 227 /**
228 * blk_start_queue - restart a previously stopped queue 228 * blk_start_queue - restart a previously stopped queue
229 * @q: The &struct request_queue in question 229 * @q: The &struct request_queue in question
230 * 230 *
231 * Description: 231 * Description:
232 * blk_start_queue() will clear the stop flag on the queue, and call 232 * blk_start_queue() will clear the stop flag on the queue, and call
233 * the request_fn for the queue if it was in a stopped state when 233 * the request_fn for the queue if it was in a stopped state when
234 * entered. Also see blk_stop_queue(). Queue lock must be held. 234 * entered. Also see blk_stop_queue(). Queue lock must be held.
235 **/ 235 **/
236 void blk_start_queue(struct request_queue *q) 236 void blk_start_queue(struct request_queue *q)
237 { 237 {
238 WARN_ON(!irqs_disabled()); 238 WARN_ON(!irqs_disabled());
239 239
240 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 240 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
241 __blk_run_queue(q, false); 241 __blk_run_queue(q, false);
242 } 242 }
243 EXPORT_SYMBOL(blk_start_queue); 243 EXPORT_SYMBOL(blk_start_queue);
244 244
245 /** 245 /**
246 * blk_stop_queue - stop a queue 246 * blk_stop_queue - stop a queue
247 * @q: The &struct request_queue in question 247 * @q: The &struct request_queue in question
248 * 248 *
249 * Description: 249 * Description:
250 * The Linux block layer assumes that a block driver will consume all 250 * The Linux block layer assumes that a block driver will consume all
251 * entries on the request queue when the request_fn strategy is called. 251 * entries on the request queue when the request_fn strategy is called.
252 * Often this will not happen, because of hardware limitations (queue 252 * Often this will not happen, because of hardware limitations (queue
253 * depth settings). If a device driver gets a 'queue full' response, 253 * depth settings). If a device driver gets a 'queue full' response,
254 * or if it simply chooses not to queue more I/O at one point, it can 254 * or if it simply chooses not to queue more I/O at one point, it can
255 * call this function to prevent the request_fn from being called until 255 * call this function to prevent the request_fn from being called until
256 * the driver has signalled it's ready to go again. This happens by calling 256 * the driver has signalled it's ready to go again. This happens by calling
257 * blk_start_queue() to restart queue operations. Queue lock must be held. 257 * blk_start_queue() to restart queue operations. Queue lock must be held.
258 **/ 258 **/
259 void blk_stop_queue(struct request_queue *q) 259 void blk_stop_queue(struct request_queue *q)
260 { 260 {
261 __cancel_delayed_work(&q->delay_work); 261 __cancel_delayed_work(&q->delay_work);
262 queue_flag_set(QUEUE_FLAG_STOPPED, q); 262 queue_flag_set(QUEUE_FLAG_STOPPED, q);
263 } 263 }
264 EXPORT_SYMBOL(blk_stop_queue); 264 EXPORT_SYMBOL(blk_stop_queue);
265 265
266 /** 266 /**
267 * blk_sync_queue - cancel any pending callbacks on a queue 267 * blk_sync_queue - cancel any pending callbacks on a queue
268 * @q: the queue 268 * @q: the queue
269 * 269 *
270 * Description: 270 * Description:
271 * The block layer may perform asynchronous callback activity 271 * The block layer may perform asynchronous callback activity
272 * on a queue, such as calling the unplug function after a timeout. 272 * on a queue, such as calling the unplug function after a timeout.
273 * A block device may call blk_sync_queue to ensure that any 273 * A block device may call blk_sync_queue to ensure that any
274 * such activity is cancelled, thus allowing it to release resources 274 * such activity is cancelled, thus allowing it to release resources
275 * that the callbacks might use. The caller must already have made sure 275 * that the callbacks might use. The caller must already have made sure
276 * that its ->make_request_fn will not re-add plugging prior to calling 276 * that its ->make_request_fn will not re-add plugging prior to calling
277 * this function. 277 * this function.
278 * 278 *
279 * This function does not cancel any asynchronous activity arising 279 * This function does not cancel any asynchronous activity arising
280 * out of elevator or throttling code. That would require elevaotor_exit() 280 * out of elevator or throttling code. That would require elevaotor_exit()
281 * and blk_throtl_exit() to be called with queue lock initialized. 281 * and blk_throtl_exit() to be called with queue lock initialized.
282 * 282 *
283 */ 283 */
284 void blk_sync_queue(struct request_queue *q) 284 void blk_sync_queue(struct request_queue *q)
285 { 285 {
286 del_timer_sync(&q->timeout); 286 del_timer_sync(&q->timeout);
287 cancel_delayed_work_sync(&q->delay_work); 287 cancel_delayed_work_sync(&q->delay_work);
288 } 288 }
289 EXPORT_SYMBOL(blk_sync_queue); 289 EXPORT_SYMBOL(blk_sync_queue);
290 290
291 /** 291 /**
292 * __blk_run_queue - run a single device queue 292 * __blk_run_queue - run a single device queue
293 * @q: The queue to run 293 * @q: The queue to run
294 * @force_kblockd: Don't run @q->request_fn directly. Use kblockd. 294 * @force_kblockd: Don't run @q->request_fn directly. Use kblockd.
295 * 295 *
296 * Description: 296 * Description:
297 * See @blk_run_queue. This variant must be called with the queue lock 297 * See @blk_run_queue. This variant must be called with the queue lock
298 * held and interrupts disabled. 298 * held and interrupts disabled.
299 * 299 *
300 */ 300 */
301 void __blk_run_queue(struct request_queue *q, bool force_kblockd) 301 void __blk_run_queue(struct request_queue *q, bool force_kblockd)
302 { 302 {
303 if (unlikely(blk_queue_stopped(q))) 303 if (unlikely(blk_queue_stopped(q)))
304 return; 304 return;
305 305
306 /* 306 /*
307 * Only recurse once to avoid overrunning the stack, let the unplug 307 * Only recurse once to avoid overrunning the stack, let the unplug
308 * handling reinvoke the handler shortly if we already got there. 308 * handling reinvoke the handler shortly if we already got there.
309 */ 309 */
310 if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 310 if (!force_kblockd && !queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
311 q->request_fn(q); 311 q->request_fn(q);
312 queue_flag_clear(QUEUE_FLAG_REENTER, q); 312 queue_flag_clear(QUEUE_FLAG_REENTER, q);
313 } else 313 } else
314 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0); 314 queue_delayed_work(kblockd_workqueue, &q->delay_work, 0);
315 } 315 }
316 EXPORT_SYMBOL(__blk_run_queue); 316 EXPORT_SYMBOL(__blk_run_queue);
317 317
318 /** 318 /**
319 * blk_run_queue - run a single device queue 319 * blk_run_queue - run a single device queue
320 * @q: The queue to run 320 * @q: The queue to run
321 * 321 *
322 * Description: 322 * Description:
323 * Invoke request handling on this queue, if it has pending work to do. 323 * Invoke request handling on this queue, if it has pending work to do.
324 * May be used to restart queueing when a request has completed. 324 * May be used to restart queueing when a request has completed.
325 */ 325 */
326 void blk_run_queue(struct request_queue *q) 326 void blk_run_queue(struct request_queue *q)
327 { 327 {
328 unsigned long flags; 328 unsigned long flags;
329 329
330 spin_lock_irqsave(q->queue_lock, flags); 330 spin_lock_irqsave(q->queue_lock, flags);
331 __blk_run_queue(q, false); 331 __blk_run_queue(q, false);
332 spin_unlock_irqrestore(q->queue_lock, flags); 332 spin_unlock_irqrestore(q->queue_lock, flags);
333 } 333 }
334 EXPORT_SYMBOL(blk_run_queue); 334 EXPORT_SYMBOL(blk_run_queue);
335 335
336 void blk_put_queue(struct request_queue *q) 336 void blk_put_queue(struct request_queue *q)
337 { 337 {
338 kobject_put(&q->kobj); 338 kobject_put(&q->kobj);
339 } 339 }
340 340
341 /* 341 /*
342 * Note: If a driver supplied the queue lock, it should not zap that lock 342 * Note: If a driver supplied the queue lock, it should not zap that lock
343 * unexpectedly as some queue cleanup components like elevator_exit() and 343 * unexpectedly as some queue cleanup components like elevator_exit() and
344 * blk_throtl_exit() need queue lock. 344 * blk_throtl_exit() need queue lock.
345 */ 345 */
346 void blk_cleanup_queue(struct request_queue *q) 346 void blk_cleanup_queue(struct request_queue *q)
347 { 347 {
348 /* 348 /*
349 * We know we have process context here, so we can be a little 349 * We know we have process context here, so we can be a little
350 * cautious and ensure that pending block actions on this device 350 * cautious and ensure that pending block actions on this device
351 * are done before moving on. Going into this function, we should 351 * are done before moving on. Going into this function, we should
352 * not have processes doing IO to this device. 352 * not have processes doing IO to this device.
353 */ 353 */
354 blk_sync_queue(q); 354 blk_sync_queue(q);
355 355
356 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 356 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
357 mutex_lock(&q->sysfs_lock); 357 mutex_lock(&q->sysfs_lock);
358 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 358 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
359 mutex_unlock(&q->sysfs_lock); 359 mutex_unlock(&q->sysfs_lock);
360 360
361 if (q->elevator) 361 if (q->elevator)
362 elevator_exit(q->elevator); 362 elevator_exit(q->elevator);
363 363
364 blk_throtl_exit(q); 364 blk_throtl_exit(q);
365 365
366 blk_put_queue(q); 366 blk_put_queue(q);
367 } 367 }
368 EXPORT_SYMBOL(blk_cleanup_queue); 368 EXPORT_SYMBOL(blk_cleanup_queue);
369 369
370 static int blk_init_free_list(struct request_queue *q) 370 static int blk_init_free_list(struct request_queue *q)
371 { 371 {
372 struct request_list *rl = &q->rq; 372 struct request_list *rl = &q->rq;
373 373
374 if (unlikely(rl->rq_pool)) 374 if (unlikely(rl->rq_pool))
375 return 0; 375 return 0;
376 376
377 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 377 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
378 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 378 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
379 rl->elvpriv = 0; 379 rl->elvpriv = 0;
380 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 380 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
381 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 381 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
382 382
383 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 383 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
384 mempool_free_slab, request_cachep, q->node); 384 mempool_free_slab, request_cachep, q->node);
385 385
386 if (!rl->rq_pool) 386 if (!rl->rq_pool)
387 return -ENOMEM; 387 return -ENOMEM;
388 388
389 return 0; 389 return 0;
390 } 390 }
391 391
392 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 392 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
393 { 393 {
394 return blk_alloc_queue_node(gfp_mask, -1); 394 return blk_alloc_queue_node(gfp_mask, -1);
395 } 395 }
396 EXPORT_SYMBOL(blk_alloc_queue); 396 EXPORT_SYMBOL(blk_alloc_queue);
397 397
398 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 398 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
399 { 399 {
400 struct request_queue *q; 400 struct request_queue *q;
401 int err; 401 int err;
402 402
403 q = kmem_cache_alloc_node(blk_requestq_cachep, 403 q = kmem_cache_alloc_node(blk_requestq_cachep,
404 gfp_mask | __GFP_ZERO, node_id); 404 gfp_mask | __GFP_ZERO, node_id);
405 if (!q) 405 if (!q)
406 return NULL; 406 return NULL;
407 407
408 q->backing_dev_info.ra_pages = 408 q->backing_dev_info.ra_pages =
409 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 409 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
410 q->backing_dev_info.state = 0; 410 q->backing_dev_info.state = 0;
411 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 411 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
412 q->backing_dev_info.name = "block"; 412 q->backing_dev_info.name = "block";
413 413
414 err = bdi_init(&q->backing_dev_info); 414 err = bdi_init(&q->backing_dev_info);
415 if (err) { 415 if (err) {
416 kmem_cache_free(blk_requestq_cachep, q); 416 kmem_cache_free(blk_requestq_cachep, q);
417 return NULL; 417 return NULL;
418 } 418 }
419 419
420 if (blk_throtl_init(q)) { 420 if (blk_throtl_init(q)) {
421 kmem_cache_free(blk_requestq_cachep, q); 421 kmem_cache_free(blk_requestq_cachep, q);
422 return NULL; 422 return NULL;
423 } 423 }
424 424
425 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 425 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
426 laptop_mode_timer_fn, (unsigned long) q); 426 laptop_mode_timer_fn, (unsigned long) q);
427 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 427 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
428 INIT_LIST_HEAD(&q->timeout_list); 428 INIT_LIST_HEAD(&q->timeout_list);
429 INIT_LIST_HEAD(&q->flush_queue[0]); 429 INIT_LIST_HEAD(&q->flush_queue[0]);
430 INIT_LIST_HEAD(&q->flush_queue[1]); 430 INIT_LIST_HEAD(&q->flush_queue[1]);
431 INIT_LIST_HEAD(&q->flush_data_in_flight); 431 INIT_LIST_HEAD(&q->flush_data_in_flight);
432 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work); 432 INIT_DELAYED_WORK(&q->delay_work, blk_delay_work);
433 433
434 kobject_init(&q->kobj, &blk_queue_ktype); 434 kobject_init(&q->kobj, &blk_queue_ktype);
435 435
436 mutex_init(&q->sysfs_lock); 436 mutex_init(&q->sysfs_lock);
437 spin_lock_init(&q->__queue_lock); 437 spin_lock_init(&q->__queue_lock);
438 438
439 /* 439 /*
440 * By default initialize queue_lock to internal lock and driver can 440 * By default initialize queue_lock to internal lock and driver can
441 * override it later if need be. 441 * override it later if need be.
442 */ 442 */
443 q->queue_lock = &q->__queue_lock; 443 q->queue_lock = &q->__queue_lock;
444 444
445 return q; 445 return q;
446 } 446 }
447 EXPORT_SYMBOL(blk_alloc_queue_node); 447 EXPORT_SYMBOL(blk_alloc_queue_node);
448 448
449 /** 449 /**
450 * blk_init_queue - prepare a request queue for use with a block device 450 * blk_init_queue - prepare a request queue for use with a block device
451 * @rfn: The function to be called to process requests that have been 451 * @rfn: The function to be called to process requests that have been
452 * placed on the queue. 452 * placed on the queue.
453 * @lock: Request queue spin lock 453 * @lock: Request queue spin lock
454 * 454 *
455 * Description: 455 * Description:
456 * If a block device wishes to use the standard request handling procedures, 456 * If a block device wishes to use the standard request handling procedures,
457 * which sorts requests and coalesces adjacent requests, then it must 457 * which sorts requests and coalesces adjacent requests, then it must
458 * call blk_init_queue(). The function @rfn will be called when there 458 * call blk_init_queue(). The function @rfn will be called when there
459 * are requests on the queue that need to be processed. If the device 459 * are requests on the queue that need to be processed. If the device
460 * supports plugging, then @rfn may not be called immediately when requests 460 * supports plugging, then @rfn may not be called immediately when requests
461 * are available on the queue, but may be called at some time later instead. 461 * are available on the queue, but may be called at some time later instead.
462 * Plugged queues are generally unplugged when a buffer belonging to one 462 * Plugged queues are generally unplugged when a buffer belonging to one
463 * of the requests on the queue is needed, or due to memory pressure. 463 * of the requests on the queue is needed, or due to memory pressure.
464 * 464 *
465 * @rfn is not required, or even expected, to remove all requests off the 465 * @rfn is not required, or even expected, to remove all requests off the
466 * queue, but only as many as it can handle at a time. If it does leave 466 * queue, but only as many as it can handle at a time. If it does leave
467 * requests on the queue, it is responsible for arranging that the requests 467 * requests on the queue, it is responsible for arranging that the requests
468 * get dealt with eventually. 468 * get dealt with eventually.
469 * 469 *
470 * The queue spin lock must be held while manipulating the requests on the 470 * The queue spin lock must be held while manipulating the requests on the
471 * request queue; this lock will be taken also from interrupt context, so irq 471 * request queue; this lock will be taken also from interrupt context, so irq
472 * disabling is needed for it. 472 * disabling is needed for it.
473 * 473 *
474 * Function returns a pointer to the initialized request queue, or %NULL if 474 * Function returns a pointer to the initialized request queue, or %NULL if
475 * it didn't succeed. 475 * it didn't succeed.
476 * 476 *
477 * Note: 477 * Note:
478 * blk_init_queue() must be paired with a blk_cleanup_queue() call 478 * blk_init_queue() must be paired with a blk_cleanup_queue() call
479 * when the block device is deactivated (such as at module unload). 479 * when the block device is deactivated (such as at module unload).
480 **/ 480 **/
481 481
482 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 482 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
483 { 483 {
484 return blk_init_queue_node(rfn, lock, -1); 484 return blk_init_queue_node(rfn, lock, -1);
485 } 485 }
486 EXPORT_SYMBOL(blk_init_queue); 486 EXPORT_SYMBOL(blk_init_queue);
487 487
488 struct request_queue * 488 struct request_queue *
489 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 489 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
490 { 490 {
491 struct request_queue *uninit_q, *q; 491 struct request_queue *uninit_q, *q;
492 492
493 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); 493 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
494 if (!uninit_q) 494 if (!uninit_q)
495 return NULL; 495 return NULL;
496 496
497 q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); 497 q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
498 if (!q) 498 if (!q)
499 blk_cleanup_queue(uninit_q); 499 blk_cleanup_queue(uninit_q);
500 500
501 return q; 501 return q;
502 } 502 }
503 EXPORT_SYMBOL(blk_init_queue_node); 503 EXPORT_SYMBOL(blk_init_queue_node);
504 504
505 struct request_queue * 505 struct request_queue *
506 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 506 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
507 spinlock_t *lock) 507 spinlock_t *lock)
508 { 508 {
509 return blk_init_allocated_queue_node(q, rfn, lock, -1); 509 return blk_init_allocated_queue_node(q, rfn, lock, -1);
510 } 510 }
511 EXPORT_SYMBOL(blk_init_allocated_queue); 511 EXPORT_SYMBOL(blk_init_allocated_queue);
512 512
513 struct request_queue * 513 struct request_queue *
514 blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, 514 blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
515 spinlock_t *lock, int node_id) 515 spinlock_t *lock, int node_id)
516 { 516 {
517 if (!q) 517 if (!q)
518 return NULL; 518 return NULL;
519 519
520 q->node = node_id; 520 q->node = node_id;
521 if (blk_init_free_list(q)) 521 if (blk_init_free_list(q))
522 return NULL; 522 return NULL;
523 523
524 q->request_fn = rfn; 524 q->request_fn = rfn;
525 q->prep_rq_fn = NULL; 525 q->prep_rq_fn = NULL;
526 q->unprep_rq_fn = NULL; 526 q->unprep_rq_fn = NULL;
527 q->queue_flags = QUEUE_FLAG_DEFAULT; 527 q->queue_flags = QUEUE_FLAG_DEFAULT;
528 528
529 /* Override internal queue lock with supplied lock pointer */ 529 /* Override internal queue lock with supplied lock pointer */
530 if (lock) 530 if (lock)
531 q->queue_lock = lock; 531 q->queue_lock = lock;
532 532
533 /* 533 /*
534 * This also sets hw/phys segments, boundary and size 534 * This also sets hw/phys segments, boundary and size
535 */ 535 */
536 blk_queue_make_request(q, __make_request); 536 blk_queue_make_request(q, __make_request);
537 537
538 q->sg_reserved_size = INT_MAX; 538 q->sg_reserved_size = INT_MAX;
539 539
540 /* 540 /*
541 * all done 541 * all done
542 */ 542 */
543 if (!elevator_init(q, NULL)) { 543 if (!elevator_init(q, NULL)) {
544 blk_queue_congestion_threshold(q); 544 blk_queue_congestion_threshold(q);
545 return q; 545 return q;
546 } 546 }
547 547
548 return NULL; 548 return NULL;
549 } 549 }
550 EXPORT_SYMBOL(blk_init_allocated_queue_node); 550 EXPORT_SYMBOL(blk_init_allocated_queue_node);
551 551
552 int blk_get_queue(struct request_queue *q) 552 int blk_get_queue(struct request_queue *q)
553 { 553 {
554 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 554 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
555 kobject_get(&q->kobj); 555 kobject_get(&q->kobj);
556 return 0; 556 return 0;
557 } 557 }
558 558
559 return 1; 559 return 1;
560 } 560 }
561 561
562 static inline void blk_free_request(struct request_queue *q, struct request *rq) 562 static inline void blk_free_request(struct request_queue *q, struct request *rq)
563 { 563 {
564 BUG_ON(rq->cmd_flags & REQ_ON_PLUG); 564 BUG_ON(rq->cmd_flags & REQ_ON_PLUG);
565 565
566 if (rq->cmd_flags & REQ_ELVPRIV) 566 if (rq->cmd_flags & REQ_ELVPRIV)
567 elv_put_request(q, rq); 567 elv_put_request(q, rq);
568 mempool_free(rq, q->rq.rq_pool); 568 mempool_free(rq, q->rq.rq_pool);
569 } 569 }
570 570
571 static struct request * 571 static struct request *
572 blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) 572 blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
573 { 573 {
574 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 574 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
575 575
576 if (!rq) 576 if (!rq)
577 return NULL; 577 return NULL;
578 578
579 blk_rq_init(q, rq); 579 blk_rq_init(q, rq);
580 580
581 rq->cmd_flags = flags | REQ_ALLOCED; 581 rq->cmd_flags = flags | REQ_ALLOCED;
582 582
583 if (priv) { 583 if (priv) {
584 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 584 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
585 mempool_free(rq, q->rq.rq_pool); 585 mempool_free(rq, q->rq.rq_pool);
586 return NULL; 586 return NULL;
587 } 587 }
588 rq->cmd_flags |= REQ_ELVPRIV; 588 rq->cmd_flags |= REQ_ELVPRIV;
589 } 589 }
590 590
591 return rq; 591 return rq;
592 } 592 }
593 593
594 /* 594 /*
595 * ioc_batching returns true if the ioc is a valid batching request and 595 * ioc_batching returns true if the ioc is a valid batching request and
596 * should be given priority access to a request. 596 * should be given priority access to a request.
597 */ 597 */
598 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 598 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
599 { 599 {
600 if (!ioc) 600 if (!ioc)
601 return 0; 601 return 0;
602 602
603 /* 603 /*
604 * Make sure the process is able to allocate at least 1 request 604 * Make sure the process is able to allocate at least 1 request
605 * even if the batch times out, otherwise we could theoretically 605 * even if the batch times out, otherwise we could theoretically
606 * lose wakeups. 606 * lose wakeups.
607 */ 607 */
608 return ioc->nr_batch_requests == q->nr_batching || 608 return ioc->nr_batch_requests == q->nr_batching ||
609 (ioc->nr_batch_requests > 0 609 (ioc->nr_batch_requests > 0
610 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 610 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
611 } 611 }
612 612
613 /* 613 /*
614 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 614 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
615 * will cause the process to be a "batcher" on all queues in the system. This 615 * will cause the process to be a "batcher" on all queues in the system. This
616 * is the behaviour we want though - once it gets a wakeup it should be given 616 * is the behaviour we want though - once it gets a wakeup it should be given
617 * a nice run. 617 * a nice run.
618 */ 618 */
619 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 619 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
620 { 620 {
621 if (!ioc || ioc_batching(q, ioc)) 621 if (!ioc || ioc_batching(q, ioc))
622 return; 622 return;
623 623
624 ioc->nr_batch_requests = q->nr_batching; 624 ioc->nr_batch_requests = q->nr_batching;
625 ioc->last_waited = jiffies; 625 ioc->last_waited = jiffies;
626 } 626 }
627 627
628 static void __freed_request(struct request_queue *q, int sync) 628 static void __freed_request(struct request_queue *q, int sync)
629 { 629 {
630 struct request_list *rl = &q->rq; 630 struct request_list *rl = &q->rq;
631 631
632 if (rl->count[sync] < queue_congestion_off_threshold(q)) 632 if (rl->count[sync] < queue_congestion_off_threshold(q))
633 blk_clear_queue_congested(q, sync); 633 blk_clear_queue_congested(q, sync);
634 634
635 if (rl->count[sync] + 1 <= q->nr_requests) { 635 if (rl->count[sync] + 1 <= q->nr_requests) {
636 if (waitqueue_active(&rl->wait[sync])) 636 if (waitqueue_active(&rl->wait[sync]))
637 wake_up(&rl->wait[sync]); 637 wake_up(&rl->wait[sync]);
638 638
639 blk_clear_queue_full(q, sync); 639 blk_clear_queue_full(q, sync);
640 } 640 }
641 } 641 }
642 642
643 /* 643 /*
644 * A request has just been released. Account for it, update the full and 644 * A request has just been released. Account for it, update the full and
645 * congestion status, wake up any waiters. Called under q->queue_lock. 645 * congestion status, wake up any waiters. Called under q->queue_lock.
646 */ 646 */
647 static void freed_request(struct request_queue *q, int sync, int priv) 647 static void freed_request(struct request_queue *q, int sync, int priv)
648 { 648 {
649 struct request_list *rl = &q->rq; 649 struct request_list *rl = &q->rq;
650 650
651 rl->count[sync]--; 651 rl->count[sync]--;
652 if (priv) 652 if (priv)
653 rl->elvpriv--; 653 rl->elvpriv--;
654 654
655 __freed_request(q, sync); 655 __freed_request(q, sync);
656 656
657 if (unlikely(rl->starved[sync ^ 1])) 657 if (unlikely(rl->starved[sync ^ 1]))
658 __freed_request(q, sync ^ 1); 658 __freed_request(q, sync ^ 1);
659 } 659 }
660 660
661 /* 661 /*
662 * Determine if elevator data should be initialized when allocating the 662 * Determine if elevator data should be initialized when allocating the
663 * request associated with @bio. 663 * request associated with @bio.
664 */ 664 */
665 static bool blk_rq_should_init_elevator(struct bio *bio) 665 static bool blk_rq_should_init_elevator(struct bio *bio)
666 { 666 {
667 if (!bio) 667 if (!bio)
668 return true; 668 return true;
669 669
670 /* 670 /*
671 * Flush requests do not use the elevator so skip initialization. 671 * Flush requests do not use the elevator so skip initialization.
672 * This allows a request to share the flush and elevator data. 672 * This allows a request to share the flush and elevator data.
673 */ 673 */
674 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) 674 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA))
675 return false; 675 return false;
676 676
677 return true; 677 return true;
678 } 678 }
679 679
680 /* 680 /*
681 * Get a free request, queue_lock must be held. 681 * Get a free request, queue_lock must be held.
682 * Returns NULL on failure, with queue_lock held. 682 * Returns NULL on failure, with queue_lock held.
683 * Returns !NULL on success, with queue_lock *not held*. 683 * Returns !NULL on success, with queue_lock *not held*.
684 */ 684 */
685 static struct request *get_request(struct request_queue *q, int rw_flags, 685 static struct request *get_request(struct request_queue *q, int rw_flags,
686 struct bio *bio, gfp_t gfp_mask) 686 struct bio *bio, gfp_t gfp_mask)
687 { 687 {
688 struct request *rq = NULL; 688 struct request *rq = NULL;
689 struct request_list *rl = &q->rq; 689 struct request_list *rl = &q->rq;
690 struct io_context *ioc = NULL; 690 struct io_context *ioc = NULL;
691 const bool is_sync = rw_is_sync(rw_flags) != 0; 691 const bool is_sync = rw_is_sync(rw_flags) != 0;
692 int may_queue, priv = 0; 692 int may_queue, priv = 0;
693 693
694 may_queue = elv_may_queue(q, rw_flags); 694 may_queue = elv_may_queue(q, rw_flags);
695 if (may_queue == ELV_MQUEUE_NO) 695 if (may_queue == ELV_MQUEUE_NO)
696 goto rq_starved; 696 goto rq_starved;
697 697
698 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 698 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
699 if (rl->count[is_sync]+1 >= q->nr_requests) { 699 if (rl->count[is_sync]+1 >= q->nr_requests) {
700 ioc = current_io_context(GFP_ATOMIC, q->node); 700 ioc = current_io_context(GFP_ATOMIC, q->node);
701 /* 701 /*
702 * The queue will fill after this allocation, so set 702 * The queue will fill after this allocation, so set
703 * it as full, and mark this process as "batching". 703 * it as full, and mark this process as "batching".
704 * This process will be allowed to complete a batch of 704 * This process will be allowed to complete a batch of
705 * requests, others will be blocked. 705 * requests, others will be blocked.
706 */ 706 */
707 if (!blk_queue_full(q, is_sync)) { 707 if (!blk_queue_full(q, is_sync)) {
708 ioc_set_batching(q, ioc); 708 ioc_set_batching(q, ioc);
709 blk_set_queue_full(q, is_sync); 709 blk_set_queue_full(q, is_sync);
710 } else { 710 } else {
711 if (may_queue != ELV_MQUEUE_MUST 711 if (may_queue != ELV_MQUEUE_MUST
712 && !ioc_batching(q, ioc)) { 712 && !ioc_batching(q, ioc)) {
713 /* 713 /*
714 * The queue is full and the allocating 714 * The queue is full and the allocating
715 * process is not a "batcher", and not 715 * process is not a "batcher", and not
716 * exempted by the IO scheduler 716 * exempted by the IO scheduler
717 */ 717 */
718 goto out; 718 goto out;
719 } 719 }
720 } 720 }
721 } 721 }
722 blk_set_queue_congested(q, is_sync); 722 blk_set_queue_congested(q, is_sync);
723 } 723 }
724 724
725 /* 725 /*
726 * Only allow batching queuers to allocate up to 50% over the defined 726 * Only allow batching queuers to allocate up to 50% over the defined
727 * limit of requests, otherwise we could have thousands of requests 727 * limit of requests, otherwise we could have thousands of requests
728 * allocated with any setting of ->nr_requests 728 * allocated with any setting of ->nr_requests
729 */ 729 */
730 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 730 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
731 goto out; 731 goto out;
732 732
733 rl->count[is_sync]++; 733 rl->count[is_sync]++;
734 rl->starved[is_sync] = 0; 734 rl->starved[is_sync] = 0;
735 735
736 if (blk_rq_should_init_elevator(bio)) { 736 if (blk_rq_should_init_elevator(bio)) {
737 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 737 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
738 if (priv) 738 if (priv)
739 rl->elvpriv++; 739 rl->elvpriv++;
740 } 740 }
741 741
742 if (blk_queue_io_stat(q)) 742 if (blk_queue_io_stat(q))
743 rw_flags |= REQ_IO_STAT; 743 rw_flags |= REQ_IO_STAT;
744 spin_unlock_irq(q->queue_lock); 744 spin_unlock_irq(q->queue_lock);
745 745
746 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 746 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
747 if (unlikely(!rq)) { 747 if (unlikely(!rq)) {
748 /* 748 /*
749 * Allocation failed presumably due to memory. Undo anything 749 * Allocation failed presumably due to memory. Undo anything
750 * we might have messed up. 750 * we might have messed up.
751 * 751 *
752 * Allocating task should really be put onto the front of the 752 * Allocating task should really be put onto the front of the
753 * wait queue, but this is pretty rare. 753 * wait queue, but this is pretty rare.
754 */ 754 */
755 spin_lock_irq(q->queue_lock); 755 spin_lock_irq(q->queue_lock);
756 freed_request(q, is_sync, priv); 756 freed_request(q, is_sync, priv);
757 757
758 /* 758 /*
759 * in the very unlikely event that allocation failed and no 759 * in the very unlikely event that allocation failed and no
760 * requests for this direction was pending, mark us starved 760 * requests for this direction was pending, mark us starved
761 * so that freeing of a request in the other direction will 761 * so that freeing of a request in the other direction will
762 * notice us. another possible fix would be to split the 762 * notice us. another possible fix would be to split the
763 * rq mempool into READ and WRITE 763 * rq mempool into READ and WRITE
764 */ 764 */
765 rq_starved: 765 rq_starved:
766 if (unlikely(rl->count[is_sync] == 0)) 766 if (unlikely(rl->count[is_sync] == 0))
767 rl->starved[is_sync] = 1; 767 rl->starved[is_sync] = 1;
768 768
769 goto out; 769 goto out;
770 } 770 }
771 771
772 /* 772 /*
773 * ioc may be NULL here, and ioc_batching will be false. That's 773 * ioc may be NULL here, and ioc_batching will be false. That's
774 * OK, if the queue is under the request limit then requests need 774 * OK, if the queue is under the request limit then requests need
775 * not count toward the nr_batch_requests limit. There will always 775 * not count toward the nr_batch_requests limit. There will always
776 * be some limit enforced by BLK_BATCH_TIME. 776 * be some limit enforced by BLK_BATCH_TIME.
777 */ 777 */
778 if (ioc_batching(q, ioc)) 778 if (ioc_batching(q, ioc))
779 ioc->nr_batch_requests--; 779 ioc->nr_batch_requests--;
780 780
781 trace_block_getrq(q, bio, rw_flags & 1); 781 trace_block_getrq(q, bio, rw_flags & 1);
782 out: 782 out:
783 return rq; 783 return rq;
784 } 784 }
785 785
786 /* 786 /*
787 * No available requests for this queue, wait for some requests to become 787 * No available requests for this queue, wait for some requests to become
788 * available. 788 * available.
789 * 789 *
790 * Called with q->queue_lock held, and returns with it unlocked. 790 * Called with q->queue_lock held, and returns with it unlocked.
791 */ 791 */
792 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 792 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
793 struct bio *bio) 793 struct bio *bio)
794 { 794 {
795 const bool is_sync = rw_is_sync(rw_flags) != 0; 795 const bool is_sync = rw_is_sync(rw_flags) != 0;
796 struct request *rq; 796 struct request *rq;
797 797
798 rq = get_request(q, rw_flags, bio, GFP_NOIO); 798 rq = get_request(q, rw_flags, bio, GFP_NOIO);
799 while (!rq) { 799 while (!rq) {
800 DEFINE_WAIT(wait); 800 DEFINE_WAIT(wait);
801 struct io_context *ioc; 801 struct io_context *ioc;
802 struct request_list *rl = &q->rq; 802 struct request_list *rl = &q->rq;
803 803
804 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 804 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
805 TASK_UNINTERRUPTIBLE); 805 TASK_UNINTERRUPTIBLE);
806 806
807 trace_block_sleeprq(q, bio, rw_flags & 1); 807 trace_block_sleeprq(q, bio, rw_flags & 1);
808 808
809 spin_unlock_irq(q->queue_lock); 809 spin_unlock_irq(q->queue_lock);
810 io_schedule(); 810 io_schedule();
811 811
812 /* 812 /*
813 * After sleeping, we become a "batching" process and 813 * After sleeping, we become a "batching" process and
814 * will be able to allocate at least one request, and 814 * will be able to allocate at least one request, and
815 * up to a big batch of them for a small period time. 815 * up to a big batch of them for a small period time.
816 * See ioc_batching, ioc_set_batching 816 * See ioc_batching, ioc_set_batching
817 */ 817 */
818 ioc = current_io_context(GFP_NOIO, q->node); 818 ioc = current_io_context(GFP_NOIO, q->node);
819 ioc_set_batching(q, ioc); 819 ioc_set_batching(q, ioc);
820 820
821 spin_lock_irq(q->queue_lock); 821 spin_lock_irq(q->queue_lock);
822 finish_wait(&rl->wait[is_sync], &wait); 822 finish_wait(&rl->wait[is_sync], &wait);
823 823
824 rq = get_request(q, rw_flags, bio, GFP_NOIO); 824 rq = get_request(q, rw_flags, bio, GFP_NOIO);
825 }; 825 };
826 826
827 return rq; 827 return rq;
828 } 828 }
829 829
830 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 830 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
831 { 831 {
832 struct request *rq; 832 struct request *rq;
833 833
834 BUG_ON(rw != READ && rw != WRITE); 834 BUG_ON(rw != READ && rw != WRITE);
835 835
836 spin_lock_irq(q->queue_lock); 836 spin_lock_irq(q->queue_lock);
837 if (gfp_mask & __GFP_WAIT) { 837 if (gfp_mask & __GFP_WAIT) {
838 rq = get_request_wait(q, rw, NULL); 838 rq = get_request_wait(q, rw, NULL);
839 } else { 839 } else {
840 rq = get_request(q, rw, NULL, gfp_mask); 840 rq = get_request(q, rw, NULL, gfp_mask);
841 if (!rq) 841 if (!rq)
842 spin_unlock_irq(q->queue_lock); 842 spin_unlock_irq(q->queue_lock);
843 } 843 }
844 /* q->queue_lock is unlocked at this point */ 844 /* q->queue_lock is unlocked at this point */
845 845
846 return rq; 846 return rq;
847 } 847 }
848 EXPORT_SYMBOL(blk_get_request); 848 EXPORT_SYMBOL(blk_get_request);
849 849
850 /** 850 /**
851 * blk_make_request - given a bio, allocate a corresponding struct request. 851 * blk_make_request - given a bio, allocate a corresponding struct request.
852 * @q: target request queue 852 * @q: target request queue
853 * @bio: The bio describing the memory mappings that will be submitted for IO. 853 * @bio: The bio describing the memory mappings that will be submitted for IO.
854 * It may be a chained-bio properly constructed by block/bio layer. 854 * It may be a chained-bio properly constructed by block/bio layer.
855 * @gfp_mask: gfp flags to be used for memory allocation 855 * @gfp_mask: gfp flags to be used for memory allocation
856 * 856 *
857 * blk_make_request is the parallel of generic_make_request for BLOCK_PC 857 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
858 * type commands. Where the struct request needs to be farther initialized by 858 * type commands. Where the struct request needs to be farther initialized by
859 * the caller. It is passed a &struct bio, which describes the memory info of 859 * the caller. It is passed a &struct bio, which describes the memory info of
860 * the I/O transfer. 860 * the I/O transfer.
861 * 861 *
862 * The caller of blk_make_request must make sure that bi_io_vec 862 * The caller of blk_make_request must make sure that bi_io_vec
863 * are set to describe the memory buffers. That bio_data_dir() will return 863 * are set to describe the memory buffers. That bio_data_dir() will return
864 * the needed direction of the request. (And all bio's in the passed bio-chain 864 * the needed direction of the request. (And all bio's in the passed bio-chain
865 * are properly set accordingly) 865 * are properly set accordingly)
866 * 866 *
867 * If called under none-sleepable conditions, mapped bio buffers must not 867 * If called under none-sleepable conditions, mapped bio buffers must not
868 * need bouncing, by calling the appropriate masked or flagged allocator, 868 * need bouncing, by calling the appropriate masked or flagged allocator,
869 * suitable for the target device. Otherwise the call to blk_queue_bounce will 869 * suitable for the target device. Otherwise the call to blk_queue_bounce will
870 * BUG. 870 * BUG.
871 * 871 *
872 * WARNING: When allocating/cloning a bio-chain, careful consideration should be 872 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
873 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for 873 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
874 * anything but the first bio in the chain. Otherwise you risk waiting for IO 874 * anything but the first bio in the chain. Otherwise you risk waiting for IO
875 * completion of a bio that hasn't been submitted yet, thus resulting in a 875 * completion of a bio that hasn't been submitted yet, thus resulting in a
876 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead 876 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
877 * of bio_alloc(), as that avoids the mempool deadlock. 877 * of bio_alloc(), as that avoids the mempool deadlock.
878 * If possible a big IO should be split into smaller parts when allocation 878 * If possible a big IO should be split into smaller parts when allocation
879 * fails. Partial allocation should not be an error, or you risk a live-lock. 879 * fails. Partial allocation should not be an error, or you risk a live-lock.
880 */ 880 */
881 struct request *blk_make_request(struct request_queue *q, struct bio *bio, 881 struct request *blk_make_request(struct request_queue *q, struct bio *bio,
882 gfp_t gfp_mask) 882 gfp_t gfp_mask)
883 { 883 {
884 struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); 884 struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
885 885
886 if (unlikely(!rq)) 886 if (unlikely(!rq))
887 return ERR_PTR(-ENOMEM); 887 return ERR_PTR(-ENOMEM);
888 888
889 for_each_bio(bio) { 889 for_each_bio(bio) {
890 struct bio *bounce_bio = bio; 890 struct bio *bounce_bio = bio;
891 int ret; 891 int ret;
892 892
893 blk_queue_bounce(q, &bounce_bio); 893 blk_queue_bounce(q, &bounce_bio);
894 ret = blk_rq_append_bio(q, rq, bounce_bio); 894 ret = blk_rq_append_bio(q, rq, bounce_bio);
895 if (unlikely(ret)) { 895 if (unlikely(ret)) {
896 blk_put_request(rq); 896 blk_put_request(rq);
897 return ERR_PTR(ret); 897 return ERR_PTR(ret);
898 } 898 }
899 } 899 }
900 900
901 return rq; 901 return rq;
902 } 902 }
903 EXPORT_SYMBOL(blk_make_request); 903 EXPORT_SYMBOL(blk_make_request);
904 904
905 /** 905 /**
906 * blk_requeue_request - put a request back on queue 906 * blk_requeue_request - put a request back on queue
907 * @q: request queue where request should be inserted 907 * @q: request queue where request should be inserted
908 * @rq: request to be inserted 908 * @rq: request to be inserted
909 * 909 *
910 * Description: 910 * Description:
911 * Drivers often keep queueing requests until the hardware cannot accept 911 * Drivers often keep queueing requests until the hardware cannot accept
912 * more, when that condition happens we need to put the request back 912 * more, when that condition happens we need to put the request back
913 * on the queue. Must be called with queue lock held. 913 * on the queue. Must be called with queue lock held.
914 */ 914 */
915 void blk_requeue_request(struct request_queue *q, struct request *rq) 915 void blk_requeue_request(struct request_queue *q, struct request *rq)
916 { 916 {
917 blk_delete_timer(rq); 917 blk_delete_timer(rq);
918 blk_clear_rq_complete(rq); 918 blk_clear_rq_complete(rq);
919 trace_block_rq_requeue(q, rq); 919 trace_block_rq_requeue(q, rq);
920 920
921 if (blk_rq_tagged(rq)) 921 if (blk_rq_tagged(rq))
922 blk_queue_end_tag(q, rq); 922 blk_queue_end_tag(q, rq);
923 923
924 BUG_ON(blk_queued_rq(rq)); 924 BUG_ON(blk_queued_rq(rq));
925 925
926 elv_requeue_request(q, rq); 926 elv_requeue_request(q, rq);
927 } 927 }
928 EXPORT_SYMBOL(blk_requeue_request); 928 EXPORT_SYMBOL(blk_requeue_request);
929 929
930 static void add_acct_request(struct request_queue *q, struct request *rq, 930 static void add_acct_request(struct request_queue *q, struct request *rq,
931 int where) 931 int where)
932 { 932 {
933 drive_stat_acct(rq, 1); 933 drive_stat_acct(rq, 1);
934 __elv_add_request(q, rq, where); 934 __elv_add_request(q, rq, where);
935 } 935 }
936 936
937 /** 937 /**
938 * blk_insert_request - insert a special request into a request queue 938 * blk_insert_request - insert a special request into a request queue
939 * @q: request queue where request should be inserted 939 * @q: request queue where request should be inserted
940 * @rq: request to be inserted 940 * @rq: request to be inserted
941 * @at_head: insert request at head or tail of queue 941 * @at_head: insert request at head or tail of queue
942 * @data: private data 942 * @data: private data
943 * 943 *
944 * Description: 944 * Description:
945 * Many block devices need to execute commands asynchronously, so they don't 945 * Many block devices need to execute commands asynchronously, so they don't
946 * block the whole kernel from preemption during request execution. This is 946 * block the whole kernel from preemption during request execution. This is
947 * accomplished normally by inserting artificial requests tagged as 947 * accomplished normally by inserting artificial requests tagged as
948 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them 948 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
949 * be scheduled for actual execution by the request queue. 949 * be scheduled for actual execution by the request queue.
950 * 950 *
951 * We have the option of inserting at the head or the tail of the queue. 951 * We have the option of inserting at the head or the tail of the queue.
952 * Typically we use the tail for new ioctls and so forth. We use the head 952 * Typically we use the tail for new ioctls and so forth. We use the head
953 * of the queue for things like a QUEUE_FULL message from a device, or a 953 * of the queue for things like a QUEUE_FULL message from a device, or a
954 * host that is unable to accept a particular command. 954 * host that is unable to accept a particular command.
955 */ 955 */
956 void blk_insert_request(struct request_queue *q, struct request *rq, 956 void blk_insert_request(struct request_queue *q, struct request *rq,
957 int at_head, void *data) 957 int at_head, void *data)
958 { 958 {
959 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 959 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
960 unsigned long flags; 960 unsigned long flags;
961 961
962 /* 962 /*
963 * tell I/O scheduler that this isn't a regular read/write (ie it 963 * tell I/O scheduler that this isn't a regular read/write (ie it
964 * must not attempt merges on this) and that it acts as a soft 964 * must not attempt merges on this) and that it acts as a soft
965 * barrier 965 * barrier
966 */ 966 */
967 rq->cmd_type = REQ_TYPE_SPECIAL; 967 rq->cmd_type = REQ_TYPE_SPECIAL;
968 968
969 rq->special = data; 969 rq->special = data;
970 970
971 spin_lock_irqsave(q->queue_lock, flags); 971 spin_lock_irqsave(q->queue_lock, flags);
972 972
973 /* 973 /*
974 * If command is tagged, release the tag 974 * If command is tagged, release the tag
975 */ 975 */
976 if (blk_rq_tagged(rq)) 976 if (blk_rq_tagged(rq))
977 blk_queue_end_tag(q, rq); 977 blk_queue_end_tag(q, rq);
978 978
979 add_acct_request(q, rq, where); 979 add_acct_request(q, rq, where);
980 __blk_run_queue(q, false); 980 __blk_run_queue(q, false);
981 spin_unlock_irqrestore(q->queue_lock, flags); 981 spin_unlock_irqrestore(q->queue_lock, flags);
982 } 982 }
983 EXPORT_SYMBOL(blk_insert_request); 983 EXPORT_SYMBOL(blk_insert_request);
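/*
 * Illustrative sketch, not part of this file: a hypothetical driver queueing
 * a private, out-of-band command with blk_insert_request().  The request
 * carries only driver data in ->special; at_head = 1 makes it run before the
 * normal read/write stream, as the comment above describes.
 */
static int example_queue_special_command(struct request_queue *q, void *cmd)
{
	struct request *rq;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	blk_insert_request(q, rq, 1, cmd);
	return 0;
}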
984 984
985 static void part_round_stats_single(int cpu, struct hd_struct *part, 985 static void part_round_stats_single(int cpu, struct hd_struct *part,
986 unsigned long now) 986 unsigned long now)
987 { 987 {
988 if (now == part->stamp) 988 if (now == part->stamp)
989 return; 989 return;
990 990
991 if (part_in_flight(part)) { 991 if (part_in_flight(part)) {
992 __part_stat_add(cpu, part, time_in_queue, 992 __part_stat_add(cpu, part, time_in_queue,
993 part_in_flight(part) * (now - part->stamp)); 993 part_in_flight(part) * (now - part->stamp));
994 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 994 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
995 } 995 }
996 part->stamp = now; 996 part->stamp = now;
997 } 997 }
998 998
999 /** 999 /**
1000 * part_round_stats() - Round off the performance stats on a struct disk_stats. 1000 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1001 * @cpu: cpu number for stats access 1001 * @cpu: cpu number for stats access
1002 * @part: target partition 1002 * @part: target partition
1003 * 1003 *
1004 * The average IO queue length and utilisation statistics are maintained 1004 * The average IO queue length and utilisation statistics are maintained
1005 * by observing the current state of the queue length and the amount of 1005 * by observing the current state of the queue length and the amount of
1006 * time it has been in this state. 1006 * time it has been in this state.
1007 * 1007 *
1008 * Normally, that accounting is done on IO completion, but that can result 1008 * Normally, that accounting is done on IO completion, but that can result
1009 * in more than a second's worth of IO being accounted for within any one 1009 * in more than a second's worth of IO being accounted for within any one
1010 * second, leading to >100% utilisation. To deal with that, we call this 1010 * second, leading to >100% utilisation. To deal with that, we call this
1011 * function to do a round-off before returning the results when reading 1011 * function to do a round-off before returning the results when reading
1012 * /proc/diskstats. This accounts immediately for all queue usage up to 1012 * /proc/diskstats. This accounts immediately for all queue usage up to
1013 * the current jiffies and restarts the counters again. 1013 * the current jiffies and restarts the counters again.
1014 */ 1014 */
1015 void part_round_stats(int cpu, struct hd_struct *part) 1015 void part_round_stats(int cpu, struct hd_struct *part)
1016 { 1016 {
1017 unsigned long now = jiffies; 1017 unsigned long now = jiffies;
1018 1018
1019 if (part->partno) 1019 if (part->partno)
1020 part_round_stats_single(cpu, &part_to_disk(part)->part0, now); 1020 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1021 part_round_stats_single(cpu, part, now); 1021 part_round_stats_single(cpu, part, now);
1022 } 1022 }
1023 EXPORT_SYMBOL_GPL(part_round_stats); 1023 EXPORT_SYMBOL_GPL(part_round_stats);
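/*
 * Illustrative sketch, not part of this file: the calling convention for
 * part_round_stats().  The per-cpu stats must be accessed under
 * part_stat_lock(), exactly as the accounting helpers later in this file do.
 */
static void example_flush_part_stats(struct hd_struct *part)
{
	int cpu = part_stat_lock();

	part_round_stats(cpu, part);
	part_stat_unlock();
}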
1024 1024
1025 /* 1025 /*
1026 * queue lock must be held 1026 * queue lock must be held
1027 */ 1027 */
1028 void __blk_put_request(struct request_queue *q, struct request *req) 1028 void __blk_put_request(struct request_queue *q, struct request *req)
1029 { 1029 {
1030 if (unlikely(!q)) 1030 if (unlikely(!q))
1031 return; 1031 return;
1032 if (unlikely(--req->ref_count)) 1032 if (unlikely(--req->ref_count))
1033 return; 1033 return;
1034 1034
1035 elv_completed_request(q, req); 1035 elv_completed_request(q, req);
1036 1036
1037 /* this is a bio leak */ 1037 /* this is a bio leak */
1038 WARN_ON(req->bio != NULL); 1038 WARN_ON(req->bio != NULL);
1039 1039
1040 /* 1040 /*
1041 * Request may not have originated from ll_rw_blk. If not, 1041 * Request may not have originated from ll_rw_blk. If not,
1042 * it didn't come out of our reserved rq pools 1042 * it didn't come out of our reserved rq pools
1043 */ 1043 */
1044 if (req->cmd_flags & REQ_ALLOCED) { 1044 if (req->cmd_flags & REQ_ALLOCED) {
1045 int is_sync = rq_is_sync(req) != 0; 1045 int is_sync = rq_is_sync(req) != 0;
1046 int priv = req->cmd_flags & REQ_ELVPRIV; 1046 int priv = req->cmd_flags & REQ_ELVPRIV;
1047 1047
1048 BUG_ON(!list_empty(&req->queuelist)); 1048 BUG_ON(!list_empty(&req->queuelist));
1049 BUG_ON(!hlist_unhashed(&req->hash)); 1049 BUG_ON(!hlist_unhashed(&req->hash));
1050 1050
1051 blk_free_request(q, req); 1051 blk_free_request(q, req);
1052 freed_request(q, is_sync, priv); 1052 freed_request(q, is_sync, priv);
1053 } 1053 }
1054 } 1054 }
1055 EXPORT_SYMBOL_GPL(__blk_put_request); 1055 EXPORT_SYMBOL_GPL(__blk_put_request);
1056 1056
1057 void blk_put_request(struct request *req) 1057 void blk_put_request(struct request *req)
1058 { 1058 {
1059 unsigned long flags; 1059 unsigned long flags;
1060 struct request_queue *q = req->q; 1060 struct request_queue *q = req->q;
1061 1061
1062 spin_lock_irqsave(q->queue_lock, flags); 1062 spin_lock_irqsave(q->queue_lock, flags);
1063 __blk_put_request(q, req); 1063 __blk_put_request(q, req);
1064 spin_unlock_irqrestore(q->queue_lock, flags); 1064 spin_unlock_irqrestore(q->queue_lock, flags);
1065 } 1065 }
1066 EXPORT_SYMBOL(blk_put_request); 1066 EXPORT_SYMBOL(blk_put_request);
1067 1067
1068 /** 1068 /**
1069 * blk_add_request_payload - add a payload to a request 1069 * blk_add_request_payload - add a payload to a request
1070 * @rq: request to update 1070 * @rq: request to update
1071 * @page: page backing the payload 1071 * @page: page backing the payload
1072 * @len: length of the payload. 1072 * @len: length of the payload.
1073 * 1073 *
1074 * This allows a block driver to later add a payload to an already 1074 * This allows a block driver to later add a payload to an already
1075 * submitted request. The driver needs to take care of freeing the payload 1075 * submitted request. The driver needs to take care of freeing the payload
1076 * itself. 1076 * itself.
1077 * 1077 *
1078 * Note that this is a quite horrible hack and nothing but handling of 1078 * Note that this is a quite horrible hack and nothing but handling of
1079 * discard requests should ever use it. 1079 * discard requests should ever use it.
1080 */ 1080 */
1081 void blk_add_request_payload(struct request *rq, struct page *page, 1081 void blk_add_request_payload(struct request *rq, struct page *page,
1082 unsigned int len) 1082 unsigned int len)
1083 { 1083 {
1084 struct bio *bio = rq->bio; 1084 struct bio *bio = rq->bio;
1085 1085
1086 bio->bi_io_vec->bv_page = page; 1086 bio->bi_io_vec->bv_page = page;
1087 bio->bi_io_vec->bv_offset = 0; 1087 bio->bi_io_vec->bv_offset = 0;
1088 bio->bi_io_vec->bv_len = len; 1088 bio->bi_io_vec->bv_len = len;
1089 1089
1090 bio->bi_size = len; 1090 bio->bi_size = len;
1091 bio->bi_vcnt = 1; 1091 bio->bi_vcnt = 1;
1092 bio->bi_phys_segments = 1; 1092 bio->bi_phys_segments = 1;
1093 1093
1094 rq->__data_len = rq->resid_len = len; 1094 rq->__data_len = rq->resid_len = len;
1095 rq->nr_phys_segments = 1; 1095 rq->nr_phys_segments = 1;
1096 rq->buffer = bio_data(bio); 1096 rq->buffer = bio_data(bio);
1097 } 1097 }
1098 EXPORT_SYMBOL_GPL(blk_add_request_payload); 1098 EXPORT_SYMBOL_GPL(blk_add_request_payload);
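/*
 * Illustrative sketch, not part of this file: how a hypothetical driver's
 * ->prep_rq_fn might attach a zeroed page as the payload of a discard
 * request, the only intended user of this interface.  The 512-byte length
 * and the use of ->special to remember the page are assumptions; the driver
 * must free the page itself when the request completes.
 */
static int example_prep_discard(struct request_queue *q, struct request *rq)
{
	struct page *page;

	if (!(rq->cmd_flags & REQ_DISCARD))
		return BLKPREP_OK;

	page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
	if (!page)
		return BLKPREP_DEFER;

	blk_add_request_payload(rq, page, 512);
	rq->special = page;
	return BLKPREP_OK;
}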
1099 1099
1100 static bool bio_attempt_back_merge(struct request_queue *q, struct request *req, 1100 static bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
1101 struct bio *bio) 1101 struct bio *bio)
1102 { 1102 {
1103 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1103 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1104 1104
1105 /* 1105 /*
1106 * Debug stuff, kill later 1106 * Debug stuff, kill later
1107 */ 1107 */
1108 if (!rq_mergeable(req)) { 1108 if (!rq_mergeable(req)) {
1109 blk_dump_rq_flags(req, "back"); 1109 blk_dump_rq_flags(req, "back");
1110 return false; 1110 return false;
1111 } 1111 }
1112 1112
1113 if (!ll_back_merge_fn(q, req, bio)) 1113 if (!ll_back_merge_fn(q, req, bio))
1114 return false; 1114 return false;
1115 1115
1116 trace_block_bio_backmerge(q, bio); 1116 trace_block_bio_backmerge(q, bio);
1117 1117
1118 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 1118 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1119 blk_rq_set_mixed_merge(req); 1119 blk_rq_set_mixed_merge(req);
1120 1120
1121 req->biotail->bi_next = bio; 1121 req->biotail->bi_next = bio;
1122 req->biotail = bio; 1122 req->biotail = bio;
1123 req->__data_len += bio->bi_size; 1123 req->__data_len += bio->bi_size;
1124 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1124 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1125 1125
1126 drive_stat_acct(req, 0); 1126 drive_stat_acct(req, 0);
1127 return true; 1127 return true;
1128 } 1128 }
1129 1129
1130 static bool bio_attempt_front_merge(struct request_queue *q, 1130 static bool bio_attempt_front_merge(struct request_queue *q,
1131 struct request *req, struct bio *bio) 1131 struct request *req, struct bio *bio)
1132 { 1132 {
1133 const int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1133 const int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1134 sector_t sector; 1134 sector_t sector;
1135 1135
1136 /* 1136 /*
1137 * Debug stuff, kill later 1137 * Debug stuff, kill later
1138 */ 1138 */
1139 if (!rq_mergeable(req)) { 1139 if (!rq_mergeable(req)) {
1140 blk_dump_rq_flags(req, "front"); 1140 blk_dump_rq_flags(req, "front");
1141 return false; 1141 return false;
1142 } 1142 }
1143 1143
1144 if (!ll_front_merge_fn(q, req, bio)) 1144 if (!ll_front_merge_fn(q, req, bio))
1145 return false; 1145 return false;
1146 1146
1147 trace_block_bio_frontmerge(q, bio); 1147 trace_block_bio_frontmerge(q, bio);
1148 1148
1149 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 1149 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1150 blk_rq_set_mixed_merge(req); 1150 blk_rq_set_mixed_merge(req);
1151 1151
1152 sector = bio->bi_sector; 1152 sector = bio->bi_sector;
1153 1153
1154 bio->bi_next = req->bio; 1154 bio->bi_next = req->bio;
1155 req->bio = bio; 1155 req->bio = bio;
1156 1156
1157 /* 1157 /*
1158 * may not be valid. If the low level driver said 1158 * may not be valid. If the low level driver said
1159 * it didn't need a bounce buffer then it better 1159 * it didn't need a bounce buffer then it better
1160 * not touch req->buffer either... 1160 * not touch req->buffer either...
1161 */ 1161 */
1162 req->buffer = bio_data(bio); 1162 req->buffer = bio_data(bio);
1163 req->__sector = bio->bi_sector; 1163 req->__sector = bio->bi_sector;
1164 req->__data_len += bio->bi_size; 1164 req->__data_len += bio->bi_size;
1165 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio)); 1165 req->ioprio = ioprio_best(req->ioprio, bio_prio(bio));
1166 1166
1167 drive_stat_acct(req, 0); 1167 drive_stat_acct(req, 0);
1168 return true; 1168 return true;
1169 } 1169 }
1170 1170
1171 /* 1171 /*
1172 * Attempts to merge with the plugged list in the current process. Returns 1172 * Attempts to merge with the plugged list in the current process. Returns
1173 * true if merge was successful, otherwise false. 1173 * true if merge was successful, otherwise false.
1174 */ 1174 */
1175 static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q, 1175 static bool attempt_plug_merge(struct task_struct *tsk, struct request_queue *q,
1176 struct bio *bio) 1176 struct bio *bio)
1177 { 1177 {
1178 struct blk_plug *plug; 1178 struct blk_plug *plug;
1179 struct request *rq; 1179 struct request *rq;
1180 bool ret = false; 1180 bool ret = false;
1181 1181
1182 plug = tsk->plug; 1182 plug = tsk->plug;
1183 if (!plug) 1183 if (!plug)
1184 goto out; 1184 goto out;
1185 1185
1186 list_for_each_entry_reverse(rq, &plug->list, queuelist) { 1186 list_for_each_entry_reverse(rq, &plug->list, queuelist) {
1187 int el_ret; 1187 int el_ret;
1188 1188
1189 if (rq->q != q) 1189 if (rq->q != q)
1190 continue; 1190 continue;
1191 1191
1192 el_ret = elv_try_merge(rq, bio); 1192 el_ret = elv_try_merge(rq, bio);
1193 if (el_ret == ELEVATOR_BACK_MERGE) { 1193 if (el_ret == ELEVATOR_BACK_MERGE) {
1194 ret = bio_attempt_back_merge(q, rq, bio); 1194 ret = bio_attempt_back_merge(q, rq, bio);
1195 if (ret) 1195 if (ret)
1196 break; 1196 break;
1197 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1197 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1198 ret = bio_attempt_front_merge(q, rq, bio); 1198 ret = bio_attempt_front_merge(q, rq, bio);
1199 if (ret) 1199 if (ret)
1200 break; 1200 break;
1201 } 1201 }
1202 } 1202 }
1203 out: 1203 out:
1204 return ret; 1204 return ret;
1205 } 1205 }
1206 1206
1207 void init_request_from_bio(struct request *req, struct bio *bio) 1207 void init_request_from_bio(struct request *req, struct bio *bio)
1208 { 1208 {
1209 req->cpu = bio->bi_comp_cpu; 1209 req->cpu = bio->bi_comp_cpu;
1210 req->cmd_type = REQ_TYPE_FS; 1210 req->cmd_type = REQ_TYPE_FS;
1211 1211
1212 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; 1212 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
1213 if (bio->bi_rw & REQ_RAHEAD) 1213 if (bio->bi_rw & REQ_RAHEAD)
1214 req->cmd_flags |= REQ_FAILFAST_MASK; 1214 req->cmd_flags |= REQ_FAILFAST_MASK;
1215 1215
1216 req->errors = 0; 1216 req->errors = 0;
1217 req->__sector = bio->bi_sector; 1217 req->__sector = bio->bi_sector;
1218 req->ioprio = bio_prio(bio); 1218 req->ioprio = bio_prio(bio);
1219 blk_rq_bio_prep(req->q, req, bio); 1219 blk_rq_bio_prep(req->q, req, bio);
1220 } 1220 }
1221 1221
1222 static int __make_request(struct request_queue *q, struct bio *bio) 1222 static int __make_request(struct request_queue *q, struct bio *bio)
1223 { 1223 {
1224 const bool sync = !!(bio->bi_rw & REQ_SYNC); 1224 const bool sync = !!(bio->bi_rw & REQ_SYNC);
1225 struct blk_plug *plug; 1225 struct blk_plug *plug;
1226 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT; 1226 int el_ret, rw_flags, where = ELEVATOR_INSERT_SORT;
1227 struct request *req; 1227 struct request *req;
1228 1228
1229 /* 1229 /*
1230 * low level driver can indicate that it wants pages above a 1230 * low level driver can indicate that it wants pages above a
1231 * certain limit bounced to low memory (ie for highmem, or even 1231 * certain limit bounced to low memory (ie for highmem, or even
1232 * ISA dma in theory) 1232 * ISA dma in theory)
1233 */ 1233 */
1234 blk_queue_bounce(q, &bio); 1234 blk_queue_bounce(q, &bio);
1235 1235
1236 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { 1236 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1237 spin_lock_irq(q->queue_lock); 1237 spin_lock_irq(q->queue_lock);
1238 where = ELEVATOR_INSERT_FLUSH; 1238 where = ELEVATOR_INSERT_FLUSH;
1239 goto get_rq; 1239 goto get_rq;
1240 } 1240 }
1241 1241
1242 /* 1242 /*
1243 * Check if we can merge with the plugged list before grabbing 1243 * Check if we can merge with the plugged list before grabbing
1244 * any locks. 1244 * any locks.
1245 */ 1245 */
1246 if (attempt_plug_merge(current, q, bio)) 1246 if (attempt_plug_merge(current, q, bio))
1247 goto out; 1247 goto out;
1248 1248
1249 spin_lock_irq(q->queue_lock); 1249 spin_lock_irq(q->queue_lock);
1250 1250
1251 el_ret = elv_merge(q, &req, bio); 1251 el_ret = elv_merge(q, &req, bio);
1252 if (el_ret == ELEVATOR_BACK_MERGE) { 1252 if (el_ret == ELEVATOR_BACK_MERGE) {
1253 BUG_ON(req->cmd_flags & REQ_ON_PLUG); 1253 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1254 if (bio_attempt_back_merge(q, req, bio)) { 1254 if (bio_attempt_back_merge(q, req, bio)) {
1255 if (!attempt_back_merge(q, req)) 1255 if (!attempt_back_merge(q, req))
1256 elv_merged_request(q, req, el_ret); 1256 elv_merged_request(q, req, el_ret);
1257 goto out_unlock; 1257 goto out_unlock;
1258 } 1258 }
1259 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 1259 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
1260 BUG_ON(req->cmd_flags & REQ_ON_PLUG); 1260 BUG_ON(req->cmd_flags & REQ_ON_PLUG);
1261 if (bio_attempt_front_merge(q, req, bio)) { 1261 if (bio_attempt_front_merge(q, req, bio)) {
1262 if (!attempt_front_merge(q, req)) 1262 if (!attempt_front_merge(q, req))
1263 elv_merged_request(q, req, el_ret); 1263 elv_merged_request(q, req, el_ret);
1264 goto out_unlock; 1264 goto out_unlock;
1265 } 1265 }
1266 } 1266 }
1267 1267
1268 get_rq: 1268 get_rq:
1269 /* 1269 /*
1270 * This sync check and mask will be re-done in init_request_from_bio(), 1270 * This sync check and mask will be re-done in init_request_from_bio(),
1271 * but we need to set it earlier to expose the sync flag to the 1271 * but we need to set it earlier to expose the sync flag to the
1272 * rq allocator and io schedulers. 1272 * rq allocator and io schedulers.
1273 */ 1273 */
1274 rw_flags = bio_data_dir(bio); 1274 rw_flags = bio_data_dir(bio);
1275 if (sync) 1275 if (sync)
1276 rw_flags |= REQ_SYNC; 1276 rw_flags |= REQ_SYNC;
1277 1277
1278 /* 1278 /*
1279 * Grab a free request. This might sleep but cannot fail. 1279 * Grab a free request. This might sleep but cannot fail.
1280 * Returns with the queue unlocked. 1280 * Returns with the queue unlocked.
1281 */ 1281 */
1282 req = get_request_wait(q, rw_flags, bio); 1282 req = get_request_wait(q, rw_flags, bio);
1283 1283
1284 /* 1284 /*
1285 * After dropping the lock and possibly sleeping here, our request 1285 * After dropping the lock and possibly sleeping here, our request
1286 * may now be mergeable after it had proven unmergeable (above). 1286 * may now be mergeable after it had proven unmergeable (above).
1287 * We don't worry about that case for efficiency. It won't happen 1287 * We don't worry about that case for efficiency. It won't happen
1288 * often, and the elevators are able to handle it. 1288 * often, and the elevators are able to handle it.
1289 */ 1289 */
1290 init_request_from_bio(req, bio); 1290 init_request_from_bio(req, bio);
1291 1291
1292 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1292 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1293 bio_flagged(bio, BIO_CPU_AFFINE)) { 1293 bio_flagged(bio, BIO_CPU_AFFINE)) {
1294 req->cpu = blk_cpu_to_group(get_cpu()); 1294 req->cpu = blk_cpu_to_group(get_cpu());
1295 put_cpu(); 1295 put_cpu();
1296 } 1296 }
1297 1297
1298 plug = current->plug; 1298 plug = current->plug;
1299 if (plug) { 1299 if (plug) {
1300 /* 1300 /*
1301 * If this is the first request added after a plug, fire 1301 * If this is the first request added after a plug, fire
1302 * off a plug trace. If others have been added before, check 1302 * off a plug trace. If others have been added before, check
1303 * if we have multiple devices in this plug. If so, make a 1303 * if we have multiple devices in this plug. If so, make a
1304 * note to sort the list before dispatch (a plug usage sketch follows this function). 1304 * note to sort the list before dispatch (a plug usage sketch follows this function).
1305 */ 1305 */
1306 if (list_empty(&plug->list)) 1306 if (list_empty(&plug->list))
1307 trace_block_plug(q); 1307 trace_block_plug(q);
1308 else if (!plug->should_sort) { 1308 else if (!plug->should_sort) {
1309 struct request *__rq; 1309 struct request *__rq;
1310 1310
1311 __rq = list_entry_rq(plug->list.prev); 1311 __rq = list_entry_rq(plug->list.prev);
1312 if (__rq->q != q) 1312 if (__rq->q != q)
1313 plug->should_sort = 1; 1313 plug->should_sort = 1;
1314 } 1314 }
1315 /* 1315 /*
1316 * Debug flag, kill later 1316 * Debug flag, kill later
1317 */ 1317 */
1318 req->cmd_flags |= REQ_ON_PLUG; 1318 req->cmd_flags |= REQ_ON_PLUG;
1319 list_add_tail(&req->queuelist, &plug->list); 1319 list_add_tail(&req->queuelist, &plug->list);
1320 drive_stat_acct(req, 1); 1320 drive_stat_acct(req, 1);
1321 } else { 1321 } else {
1322 spin_lock_irq(q->queue_lock); 1322 spin_lock_irq(q->queue_lock);
1323 add_acct_request(q, req, where); 1323 add_acct_request(q, req, where);
1324 __blk_run_queue(q, false); 1324 __blk_run_queue(q, false);
1325 out_unlock: 1325 out_unlock:
1326 spin_unlock_irq(q->queue_lock); 1326 spin_unlock_irq(q->queue_lock);
1327 } 1327 }
1328 out: 1328 out:
1329 return 0; 1329 return 0;
1330 } 1330 }
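/*
 * Illustrative sketch, not part of this file: the plug that __make_request()
 * checks via current->plug is set up by the submitter with an on-stack
 * struct blk_plug.  Requests issued inside the plugged section are held on
 * the plug list and are flushed at blk_finish_plug(), or implicitly when the
 * task schedules with IO still pending.
 */
static void example_plugged_submission(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(WRITE, bios[i]);
	blk_finish_plug(&plug);
}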
1331 1331
1332 /* 1332 /*
1333 * If bio->bi_bdev is a partition, remap the location 1333 * If bio->bi_bdev is a partition, remap the location
1334 */ 1334 */
1335 static inline void blk_partition_remap(struct bio *bio) 1335 static inline void blk_partition_remap(struct bio *bio)
1336 { 1336 {
1337 struct block_device *bdev = bio->bi_bdev; 1337 struct block_device *bdev = bio->bi_bdev;
1338 1338
1339 if (bio_sectors(bio) && bdev != bdev->bd_contains) { 1339 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1340 struct hd_struct *p = bdev->bd_part; 1340 struct hd_struct *p = bdev->bd_part;
1341 1341
1342 bio->bi_sector += p->start_sect; 1342 bio->bi_sector += p->start_sect;
1343 bio->bi_bdev = bdev->bd_contains; 1343 bio->bi_bdev = bdev->bd_contains;
1344 1344
1345 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio, 1345 trace_block_bio_remap(bdev_get_queue(bio->bi_bdev), bio,
1346 bdev->bd_dev, 1346 bdev->bd_dev,
1347 bio->bi_sector - p->start_sect); 1347 bio->bi_sector - p->start_sect);
1348 } 1348 }
1349 } 1349 }
1350 1350
1351 static void handle_bad_sector(struct bio *bio) 1351 static void handle_bad_sector(struct bio *bio)
1352 { 1352 {
1353 char b[BDEVNAME_SIZE]; 1353 char b[BDEVNAME_SIZE];
1354 1354
1355 printk(KERN_INFO "attempt to access beyond end of device\n"); 1355 printk(KERN_INFO "attempt to access beyond end of device\n");
1356 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 1356 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1357 bdevname(bio->bi_bdev, b), 1357 bdevname(bio->bi_bdev, b),
1358 bio->bi_rw, 1358 bio->bi_rw,
1359 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1359 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1360 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9)); 1360 (long long)(i_size_read(bio->bi_bdev->bd_inode) >> 9));
1361 1361
1362 set_bit(BIO_EOF, &bio->bi_flags); 1362 set_bit(BIO_EOF, &bio->bi_flags);
1363 } 1363 }
1364 1364
1365 #ifdef CONFIG_FAIL_MAKE_REQUEST 1365 #ifdef CONFIG_FAIL_MAKE_REQUEST
1366 1366
1367 static DECLARE_FAULT_ATTR(fail_make_request); 1367 static DECLARE_FAULT_ATTR(fail_make_request);
1368 1368
1369 static int __init setup_fail_make_request(char *str) 1369 static int __init setup_fail_make_request(char *str)
1370 { 1370 {
1371 return setup_fault_attr(&fail_make_request, str); 1371 return setup_fault_attr(&fail_make_request, str);
1372 } 1372 }
1373 __setup("fail_make_request=", setup_fail_make_request); 1373 __setup("fail_make_request=", setup_fail_make_request);
1374 1374
1375 static int should_fail_request(struct bio *bio) 1375 static int should_fail_request(struct bio *bio)
1376 { 1376 {
1377 struct hd_struct *part = bio->bi_bdev->bd_part; 1377 struct hd_struct *part = bio->bi_bdev->bd_part;
1378 1378
1379 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) 1379 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1380 return should_fail(&fail_make_request, bio->bi_size); 1380 return should_fail(&fail_make_request, bio->bi_size);
1381 1381
1382 return 0; 1382 return 0;
1383 } 1383 }
1384 1384
1385 static int __init fail_make_request_debugfs(void) 1385 static int __init fail_make_request_debugfs(void)
1386 { 1386 {
1387 return init_fault_attr_dentries(&fail_make_request, 1387 return init_fault_attr_dentries(&fail_make_request,
1388 "fail_make_request"); 1388 "fail_make_request");
1389 } 1389 }
1390 1390
1391 late_initcall(fail_make_request_debugfs); 1391 late_initcall(fail_make_request_debugfs);
1392 1392
1393 #else /* CONFIG_FAIL_MAKE_REQUEST */ 1393 #else /* CONFIG_FAIL_MAKE_REQUEST */
1394 1394
1395 static inline int should_fail_request(struct bio *bio) 1395 static inline int should_fail_request(struct bio *bio)
1396 { 1396 {
1397 return 0; 1397 return 0;
1398 } 1398 }
1399 1399
1400 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 1400 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1401 1401
1402 /* 1402 /*
1403 * Check whether this bio extends beyond the end of the device. 1403 * Check whether this bio extends beyond the end of the device.
1404 */ 1404 */
1405 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 1405 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1406 { 1406 {
1407 sector_t maxsector; 1407 sector_t maxsector;
1408 1408
1409 if (!nr_sectors) 1409 if (!nr_sectors)
1410 return 0; 1410 return 0;
1411 1411
1412 /* Test device or partition size, when known. */ 1412 /* Test device or partition size, when known. */
1413 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; 1413 maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
1414 if (maxsector) { 1414 if (maxsector) {
1415 sector_t sector = bio->bi_sector; 1415 sector_t sector = bio->bi_sector;
1416 1416
1417 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 1417 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1418 /* 1418 /*
1419 * This may well happen - the kernel calls bread() 1419 * This may well happen - the kernel calls bread()
1420 * without checking the size of the device, e.g., when 1420 * without checking the size of the device, e.g., when
1421 * mounting a device. 1421 * mounting a device.
1422 */ 1422 */
1423 handle_bad_sector(bio); 1423 handle_bad_sector(bio);
1424 return 1; 1424 return 1;
1425 } 1425 }
1426 } 1426 }
1427 1427
1428 return 0; 1428 return 0;
1429 } 1429 }
1430 1430
1431 /** 1431 /**
1432 * generic_make_request - hand a buffer to its device driver for I/O 1432 * generic_make_request - hand a buffer to its device driver for I/O
1433 * @bio: The bio describing the location in memory and on the device. 1433 * @bio: The bio describing the location in memory and on the device.
1434 * 1434 *
1435 * generic_make_request() is used to make I/O requests of block 1435 * generic_make_request() is used to make I/O requests of block
1436 * devices. It is passed a &struct bio, which describes the I/O that needs 1436 * devices. It is passed a &struct bio, which describes the I/O that needs
1437 * to be done. 1437 * to be done.
1438 * 1438 *
1439 * generic_make_request() does not return any status. The 1439 * generic_make_request() does not return any status. The
1440 * success/failure status of the request, along with notification of 1440 * success/failure status of the request, along with notification of
1441 * completion, is delivered asynchronously through the bio->bi_end_io 1441 * completion, is delivered asynchronously through the bio->bi_end_io
1442 * function described (one day) elsewhere. 1442 * function described (one day) elsewhere.
1443 * 1443 *
1444 * The caller of generic_make_request must make sure that bi_io_vec 1444 * The caller of generic_make_request must make sure that bi_io_vec
1445 * are set to describe the memory buffer, and that bi_bdev and bi_sector are 1445 * are set to describe the memory buffer, and that bi_bdev and bi_sector are
1446 * set to describe the device address, and the 1446 * set to describe the device address, and the
1447 * bi_end_io and optionally bi_private are set to describe how 1447 * bi_end_io and optionally bi_private are set to describe how
1448 * completion notification should be signaled. 1448 * completion notification should be signaled.
1449 * 1449 *
1450 * generic_make_request and the drivers it calls may use bi_next if this 1450 * generic_make_request and the drivers it calls may use bi_next if this
1451 * bio happens to be merged with someone else, and may change bi_bdev and 1451 * bio happens to be merged with someone else, and may change bi_bdev and
1452 * bi_sector for remaps as it sees fit. So the values of these fields 1452 * bi_sector for remaps as it sees fit. So the values of these fields
1453 * should NOT be depended on after the call to generic_make_request. 1453 * should NOT be depended on after the call to generic_make_request.
1454 */ 1454 */
1455 static inline void __generic_make_request(struct bio *bio) 1455 static inline void __generic_make_request(struct bio *bio)
1456 { 1456 {
1457 struct request_queue *q; 1457 struct request_queue *q;
1458 sector_t old_sector; 1458 sector_t old_sector;
1459 int ret, nr_sectors = bio_sectors(bio); 1459 int ret, nr_sectors = bio_sectors(bio);
1460 dev_t old_dev; 1460 dev_t old_dev;
1461 int err = -EIO; 1461 int err = -EIO;
1462 1462
1463 might_sleep(); 1463 might_sleep();
1464 1464
1465 if (bio_check_eod(bio, nr_sectors)) 1465 if (bio_check_eod(bio, nr_sectors))
1466 goto end_io; 1466 goto end_io;
1467 1467
1468 /* 1468 /*
1469 * Resolve the mapping until finished. (drivers are 1469 * Resolve the mapping until finished. (drivers are
1470 * still free to implement/resolve their own stacking 1470 * still free to implement/resolve their own stacking
1471 * by explicitly returning 0) 1471 * by explicitly returning 0)
1472 * 1472 *
1473 * NOTE: we don't repeat the blk_size check for each new device. 1473 * NOTE: we don't repeat the blk_size check for each new device.
1474 * Stacking drivers are expected to know what they are doing. 1474 * Stacking drivers are expected to know what they are doing.
1475 */ 1475 */
1476 old_sector = -1; 1476 old_sector = -1;
1477 old_dev = 0; 1477 old_dev = 0;
1478 do { 1478 do {
1479 char b[BDEVNAME_SIZE]; 1479 char b[BDEVNAME_SIZE];
1480 1480
1481 q = bdev_get_queue(bio->bi_bdev); 1481 q = bdev_get_queue(bio->bi_bdev);
1482 if (unlikely(!q)) { 1482 if (unlikely(!q)) {
1483 printk(KERN_ERR 1483 printk(KERN_ERR
1484 "generic_make_request: Trying to access " 1484 "generic_make_request: Trying to access "
1485 "nonexistent block-device %s (%Lu)\n", 1485 "nonexistent block-device %s (%Lu)\n",
1486 bdevname(bio->bi_bdev, b), 1486 bdevname(bio->bi_bdev, b),
1487 (long long) bio->bi_sector); 1487 (long long) bio->bi_sector);
1488 goto end_io; 1488 goto end_io;
1489 } 1489 }
1490 1490
1491 if (unlikely(!(bio->bi_rw & REQ_DISCARD) && 1491 if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
1492 nr_sectors > queue_max_hw_sectors(q))) { 1492 nr_sectors > queue_max_hw_sectors(q))) {
1493 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1493 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1494 bdevname(bio->bi_bdev, b), 1494 bdevname(bio->bi_bdev, b),
1495 bio_sectors(bio), 1495 bio_sectors(bio),
1496 queue_max_hw_sectors(q)); 1496 queue_max_hw_sectors(q));
1497 goto end_io; 1497 goto end_io;
1498 } 1498 }
1499 1499
1500 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 1500 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1501 goto end_io; 1501 goto end_io;
1502 1502
1503 if (should_fail_request(bio)) 1503 if (should_fail_request(bio))
1504 goto end_io; 1504 goto end_io;
1505 1505
1506 /* 1506 /*
1507 * If this device has partitions, remap block n 1507 * If this device has partitions, remap block n
1508 * of partition p to block n+start(p) of the disk. 1508 * of partition p to block n+start(p) of the disk.
1509 */ 1509 */
1510 blk_partition_remap(bio); 1510 blk_partition_remap(bio);
1511 1511
1512 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1512 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1513 goto end_io; 1513 goto end_io;
1514 1514
1515 if (old_sector != -1) 1515 if (old_sector != -1)
1516 trace_block_bio_remap(q, bio, old_dev, old_sector); 1516 trace_block_bio_remap(q, bio, old_dev, old_sector);
1517 1517
1518 old_sector = bio->bi_sector; 1518 old_sector = bio->bi_sector;
1519 old_dev = bio->bi_bdev->bd_dev; 1519 old_dev = bio->bi_bdev->bd_dev;
1520 1520
1521 if (bio_check_eod(bio, nr_sectors)) 1521 if (bio_check_eod(bio, nr_sectors))
1522 goto end_io; 1522 goto end_io;
1523 1523
1524 /* 1524 /*
1525 * Filter flush bios early so that make_request based 1525 * Filter flush bios early so that make_request based
1526 * drivers without flush support don't have to worry 1526 * drivers without flush support don't have to worry
1527 * about them. 1527 * about them.
1528 */ 1528 */
1529 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) { 1529 if ((bio->bi_rw & (REQ_FLUSH | REQ_FUA)) && !q->flush_flags) {
1530 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA); 1530 bio->bi_rw &= ~(REQ_FLUSH | REQ_FUA);
1531 if (!nr_sectors) { 1531 if (!nr_sectors) {
1532 err = 0; 1532 err = 0;
1533 goto end_io; 1533 goto end_io;
1534 } 1534 }
1535 } 1535 }
1536 1536
1537 if ((bio->bi_rw & REQ_DISCARD) && 1537 if ((bio->bi_rw & REQ_DISCARD) &&
1538 (!blk_queue_discard(q) || 1538 (!blk_queue_discard(q) ||
1539 ((bio->bi_rw & REQ_SECURE) && 1539 ((bio->bi_rw & REQ_SECURE) &&
1540 !blk_queue_secdiscard(q)))) { 1540 !blk_queue_secdiscard(q)))) {
1541 err = -EOPNOTSUPP; 1541 err = -EOPNOTSUPP;
1542 goto end_io; 1542 goto end_io;
1543 } 1543 }
1544 1544
1545 blk_throtl_bio(q, &bio); 1545 blk_throtl_bio(q, &bio);
1546 1546
1547 /* 1547 /*
1548 * If bio is NULL, it has been throttled and will be submitted 1548 * If bio is NULL, it has been throttled and will be submitted
1549 * later. 1549 * later.
1550 */ 1550 */
1551 if (!bio) 1551 if (!bio)
1552 break; 1552 break;
1553 1553
1554 trace_block_bio_queue(q, bio); 1554 trace_block_bio_queue(q, bio);
1555 1555
1556 ret = q->make_request_fn(q, bio); 1556 ret = q->make_request_fn(q, bio);
1557 } while (ret); 1557 } while (ret);
1558 1558
1559 return; 1559 return;
1560 1560
1561 end_io: 1561 end_io:
1562 bio_endio(bio, err); 1562 bio_endio(bio, err);
1563 } 1563 }
1564 1564
1565 /* 1565 /*
1566 * We only want one ->make_request_fn to be active at a time, 1566 * We only want one ->make_request_fn to be active at a time,
1567 * else stack usage with stacked devices could be a problem. 1567 * else stack usage with stacked devices could be a problem.
1568 * So use current->bio_list to keep a list of requests 1568 * So use current->bio_list to keep a list of requests
1569 * submitted by a make_request_fn function. 1569 * submitted by a make_request_fn function.
1570 * current->bio_list is also used as a flag to say if 1570 * current->bio_list is also used as a flag to say if
1571 * generic_make_request is currently active in this task or not. 1571 * generic_make_request is currently active in this task or not.
1572 * If it is NULL, then no make_request is active. If it is non-NULL, 1572 * If it is NULL, then no make_request is active. If it is non-NULL,
1573 * then a make_request is active, and new requests should be added 1573 * then a make_request is active, and new requests should be added
1574 * at the tail 1574 * at the tail
1575 */ 1575 */
1576 void generic_make_request(struct bio *bio) 1576 void generic_make_request(struct bio *bio)
1577 { 1577 {
1578 struct bio_list bio_list_on_stack; 1578 struct bio_list bio_list_on_stack;
1579 1579
1580 if (current->bio_list) { 1580 if (current->bio_list) {
1581 /* make_request is active */ 1581 /* make_request is active */
1582 bio_list_add(current->bio_list, bio); 1582 bio_list_add(current->bio_list, bio);
1583 return; 1583 return;
1584 } 1584 }
1585 /* following loop may be a bit non-obvious, and so deserves some 1585 /* following loop may be a bit non-obvious, and so deserves some
1586 * explanation. 1586 * explanation.
1587 * Before entering the loop, bio->bi_next is NULL (as all callers 1587 * Before entering the loop, bio->bi_next is NULL (as all callers
1588 * ensure that) so we have a list with a single bio. 1588 * ensure that) so we have a list with a single bio.
1589 * We pretend that we have just taken it off a longer list, so 1589 * We pretend that we have just taken it off a longer list, so
1590 * we assign bio_list to a pointer to the bio_list_on_stack, 1590 * we assign bio_list to a pointer to the bio_list_on_stack,
1591 * thus initialising the bio_list of new bios to be 1591 * thus initialising the bio_list of new bios to be
1592 * added. __generic_make_request may indeed add some more bios 1592 * added. __generic_make_request may indeed add some more bios
1593 * through a recursive call to generic_make_request. If it 1593 * through a recursive call to generic_make_request. If it
1594 * did, we find a non-NULL value in bio_list and re-enter the loop 1594 * did, we find a non-NULL value in bio_list and re-enter the loop
1595 * from the top. In this case we really did just take the bio 1595 * from the top. In this case we really did just take the bio
1596 * off the top of the list (no pretending) and so remove it from 1596 * off the top of the list (no pretending) and so remove it from
1597 * bio_list, and call into __generic_make_request again. 1597 * bio_list, and call into __generic_make_request again.
1598 * 1598 *
1599 * The loop was structured like this to make only one call to 1599 * The loop was structured like this to make only one call to
1600 * __generic_make_request (which is important as it is large and 1600 * __generic_make_request (which is important as it is large and
1601 * inlined) and to keep the structure simple. 1601 * inlined) and to keep the structure simple.
1602 */ 1602 */
1603 BUG_ON(bio->bi_next); 1603 BUG_ON(bio->bi_next);
1604 bio_list_init(&bio_list_on_stack); 1604 bio_list_init(&bio_list_on_stack);
1605 current->bio_list = &bio_list_on_stack; 1605 current->bio_list = &bio_list_on_stack;
1606 do { 1606 do {
1607 __generic_make_request(bio); 1607 __generic_make_request(bio);
1608 bio = bio_list_pop(current->bio_list); 1608 bio = bio_list_pop(current->bio_list);
1609 } while (bio); 1609 } while (bio);
1610 current->bio_list = NULL; /* deactivate */ 1610 current->bio_list = NULL; /* deactivate */
1611 } 1611 }
1612 EXPORT_SYMBOL(generic_make_request); 1612 EXPORT_SYMBOL(generic_make_request);
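/*
 * Illustrative sketch, not part of this file: the minimum a caller has to
 * set up before handing a bio to generic_make_request(), per the comment
 * above.  The target device, sector, page and the completion handler are
 * placeholders supplied by the hypothetical caller.
 */
static void example_end_io(struct bio *bio, int error)
{
	/* completion handling would go here; drop our reference */
	bio_put(bio);
}

static void example_read_one_page(struct block_device *bdev, sector_t sector,
				  struct page *page)
{
	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	if (!bio)
		return;

	bio->bi_bdev = bdev;
	bio->bi_sector = sector;
	bio->bi_end_io = example_end_io;
	bio_add_page(bio, page, PAGE_SIZE, 0);

	/* bi_rw defaults to a READ; a write would set WRITE in bi_rw */
	generic_make_request(bio);
}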
1613 1613
1614 /** 1614 /**
1615 * submit_bio - submit a bio to the block device layer for I/O 1615 * submit_bio - submit a bio to the block device layer for I/O
1616 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 1616 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1617 * @bio: The &struct bio which describes the I/O 1617 * @bio: The &struct bio which describes the I/O
1618 * 1618 *
1619 * submit_bio() is very similar in purpose to generic_make_request(), and 1619 * submit_bio() is very similar in purpose to generic_make_request(), and
1620 * uses that function to do most of the work. Both are fairly rough 1620 * uses that function to do most of the work. Both are fairly rough
1621 * interfaces; @bio must be set up and ready for I/O. 1621 * interfaces; @bio must be set up and ready for I/O.
1622 * 1622 *
1623 */ 1623 */
1624 void submit_bio(int rw, struct bio *bio) 1624 void submit_bio(int rw, struct bio *bio)
1625 { 1625 {
1626 int count = bio_sectors(bio); 1626 int count = bio_sectors(bio);
1627 1627
1628 bio->bi_rw |= rw; 1628 bio->bi_rw |= rw;
1629 1629
1630 /* 1630 /*
1631 * If it's a regular read/write or a barrier with data attached, 1631 * If it's a regular read/write or a barrier with data attached,
1632 * go through the normal accounting stuff before submission. 1632 * go through the normal accounting stuff before submission.
1633 */ 1633 */
1634 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { 1634 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
1635 if (rw & WRITE) { 1635 if (rw & WRITE) {
1636 count_vm_events(PGPGOUT, count); 1636 count_vm_events(PGPGOUT, count);
1637 } else { 1637 } else {
1638 task_io_account_read(bio->bi_size); 1638 task_io_account_read(bio->bi_size);
1639 count_vm_events(PGPGIN, count); 1639 count_vm_events(PGPGIN, count);
1640 } 1640 }
1641 1641
1642 if (unlikely(block_dump)) { 1642 if (unlikely(block_dump)) {
1643 char b[BDEVNAME_SIZE]; 1643 char b[BDEVNAME_SIZE];
1644 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", 1644 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
1645 current->comm, task_pid_nr(current), 1645 current->comm, task_pid_nr(current),
1646 (rw & WRITE) ? "WRITE" : "READ", 1646 (rw & WRITE) ? "WRITE" : "READ",
1647 (unsigned long long)bio->bi_sector, 1647 (unsigned long long)bio->bi_sector,
1648 bdevname(bio->bi_bdev, b), 1648 bdevname(bio->bi_bdev, b),
1649 count); 1649 count);
1650 } 1650 }
1651 } 1651 }
1652 1652
1653 generic_make_request(bio); 1653 generic_make_request(bio);
1654 } 1654 }
1655 EXPORT_SYMBOL(submit_bio); 1655 EXPORT_SYMBOL(submit_bio);
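/*
 * Illustrative sketch, not part of this file: synchronous submission built on
 * submit_bio().  The caller parks on an on-stack completion which the
 * hypothetical end_io handler fires; BIO_UPTODATE is checked afterwards to
 * turn the asynchronous status into a return value.
 */
static void example_sync_end_io(struct bio *bio, int error)
{
	complete(bio->bi_private);
}

static int example_submit_and_wait(int rw, struct bio *bio)
{
	DECLARE_COMPLETION_ONSTACK(done);

	bio->bi_private = &done;
	bio->bi_end_io = example_sync_end_io;
	submit_bio(rw, bio);
	wait_for_completion(&done);

	return test_bit(BIO_UPTODATE, &bio->bi_flags) ? 0 : -EIO;
}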
1656 1656
1657 /** 1657 /**
1658 * blk_rq_check_limits - Helper function to check a request for the queue limit 1658 * blk_rq_check_limits - Helper function to check a request for the queue limit
1659 * @q: the queue 1659 * @q: the queue
1660 * @rq: the request being checked 1660 * @rq: the request being checked
1661 * 1661 *
1662 * Description: 1662 * Description:
1663 * @rq may have been made based on weaker limitations of upper-level queues 1663 * @rq may have been made based on weaker limitations of upper-level queues
1664 * in request stacking drivers, and it may violate the limitation of @q. 1664 * in request stacking drivers, and it may violate the limitation of @q.
1665 * Since the block layer and the underlying device driver trust @rq 1665 * Since the block layer and the underlying device driver trust @rq
1666 * after it is inserted to @q, it should be checked against @q before 1666 * after it is inserted to @q, it should be checked against @q before
1667 * the insertion using this generic function. 1667 * the insertion using this generic function.
1668 * 1668 *
1669 * This function should also be useful for request stacking drivers 1669 * This function should also be useful for request stacking drivers
1670 * in some cases below, so export this function. 1670 * in some cases below, so export this function.
1671 * Request stacking drivers like request-based dm may change the queue 1671 * Request stacking drivers like request-based dm may change the queue
1672 * limits while requests are in the queue (e.g. dm's table swapping). 1672 * limits while requests are in the queue (e.g. dm's table swapping).
1673 * Such request stacking drivers should check those requests against 1673 * Such request stacking drivers should check those requests against
1674 * the new queue limits again when they dispatch those requests, 1674 * the new queue limits again when they dispatch those requests,
1675 * although such checks are also done against the old queue limits 1675 * although such checks are also done against the old queue limits
1676 * when submitting requests. 1676 * when submitting requests.
1677 */ 1677 */
1678 int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1678 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1679 { 1679 {
1680 if (rq->cmd_flags & REQ_DISCARD) 1680 if (rq->cmd_flags & REQ_DISCARD)
1681 return 0; 1681 return 0;
1682 1682
1683 if (blk_rq_sectors(rq) > queue_max_sectors(q) || 1683 if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
1684 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { 1684 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
1685 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1685 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1686 return -EIO; 1686 return -EIO;
1687 } 1687 }
1688 1688
1689 /* 1689 /*
1690 * queue's settings related to segment counting like q->bounce_pfn 1690 * queue's settings related to segment counting like q->bounce_pfn
1691 * may differ from that of other stacking queues. 1691 * may differ from that of other stacking queues.
1692 * Recalculate it to check the request correctly on this queue's 1692 * Recalculate it to check the request correctly on this queue's
1693 * limitation. 1693 * limitation.
1694 */ 1694 */
1695 blk_recalc_rq_segments(rq); 1695 blk_recalc_rq_segments(rq);
1696 if (rq->nr_phys_segments > queue_max_segments(q)) { 1696 if (rq->nr_phys_segments > queue_max_segments(q)) {
1697 printk(KERN_ERR "%s: over max segments limit.\n", __func__); 1697 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1698 return -EIO; 1698 return -EIO;
1699 } 1699 }
1700 1700
1701 return 0; 1701 return 0;
1702 } 1702 }
1703 EXPORT_SYMBOL_GPL(blk_rq_check_limits); 1703 EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1704 1704
1705 /** 1705 /**
1706 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 1706 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1707 * @q: the queue to submit the request 1707 * @q: the queue to submit the request
1708 * @rq: the request being queued 1708 * @rq: the request being queued
1709 */ 1709 */
1710 int blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1710 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1711 { 1711 {
1712 unsigned long flags; 1712 unsigned long flags;
1713 1713
1714 if (blk_rq_check_limits(q, rq)) 1714 if (blk_rq_check_limits(q, rq))
1715 return -EIO; 1715 return -EIO;
1716 1716
1717 #ifdef CONFIG_FAIL_MAKE_REQUEST 1717 #ifdef CONFIG_FAIL_MAKE_REQUEST
1718 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && 1718 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1719 should_fail(&fail_make_request, blk_rq_bytes(rq))) 1719 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1720 return -EIO; 1720 return -EIO;
1721 #endif 1721 #endif
1722 1722
1723 spin_lock_irqsave(q->queue_lock, flags); 1723 spin_lock_irqsave(q->queue_lock, flags);
1724 1724
1725 /* 1725 /*
1726 * The request being submitted must be dequeued before calling this function 1726 * The request being submitted must be dequeued before calling this function
1727 * because it will be linked to another request_queue. 1727 * because it will be linked to another request_queue.
1728 */ 1728 */
1729 BUG_ON(blk_queued_rq(rq)); 1729 BUG_ON(blk_queued_rq(rq));
1730 1730
1731 add_acct_request(q, rq, ELEVATOR_INSERT_BACK); 1731 add_acct_request(q, rq, ELEVATOR_INSERT_BACK);
1732 spin_unlock_irqrestore(q->queue_lock, flags); 1732 spin_unlock_irqrestore(q->queue_lock, flags);
1733 1733
1734 return 0; 1734 return 0;
1735 } 1735 }
1736 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1736 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
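/*
 * Illustrative sketch, not part of this file: how a request-based stacking
 * driver would hand an already cloned request to the queue of the underlying
 * device.  Building the clone is driver specific and not shown here.
 */
static int example_dispatch_clone(struct request_queue *lower_q,
				  struct request *clone)
{
	/*
	 * blk_insert_cloned_request() runs blk_rq_check_limits() itself, so a
	 * clone built against a stacked queue's weaker limits is rejected
	 * with -EIO instead of confusing the lower-level driver.
	 */
	return blk_insert_cloned_request(lower_q, clone);
}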
1737 1737
1738 /** 1738 /**
1739 * blk_rq_err_bytes - determine number of bytes till the next failure boundary 1739 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1740 * @rq: request to examine 1740 * @rq: request to examine
1741 * 1741 *
1742 * Description: 1742 * Description:
1743 * A request could be a merge of IOs which require different failure 1743 * A request could be a merge of IOs which require different failure
1744 * handling. This function determines the number of bytes which 1744 * handling. This function determines the number of bytes which
1745 * can be failed from the beginning of the request without 1745 * can be failed from the beginning of the request without
1746 * crossing into an area which needs to be retried further. 1746 * crossing into an area which needs to be retried further.
1747 * 1747 *
1748 * Return: 1748 * Return:
1749 * The number of bytes to fail. 1749 * The number of bytes to fail.
1750 * 1750 *
1751 * Context: 1751 * Context:
1752 * queue_lock must be held. 1752 * queue_lock must be held.
1753 */ 1753 */
1754 unsigned int blk_rq_err_bytes(const struct request *rq) 1754 unsigned int blk_rq_err_bytes(const struct request *rq)
1755 { 1755 {
1756 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; 1756 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
1757 unsigned int bytes = 0; 1757 unsigned int bytes = 0;
1758 struct bio *bio; 1758 struct bio *bio;
1759 1759
1760 if (!(rq->cmd_flags & REQ_MIXED_MERGE)) 1760 if (!(rq->cmd_flags & REQ_MIXED_MERGE))
1761 return blk_rq_bytes(rq); 1761 return blk_rq_bytes(rq);
1762 1762
1763 /* 1763 /*
1764 * Currently the only 'mixing' which can happen is between 1764 * Currently the only 'mixing' which can happen is between
1765 * different failfast types. We can safely fail portions 1765 * different failfast types. We can safely fail portions
1766 * which have all the failfast bits that the first one has - 1766 * which have all the failfast bits that the first one has -
1767 * the ones which are at least as eager to fail as the first 1767 * the ones which are at least as eager to fail as the first
1768 * one. 1768 * one.
1769 */ 1769 */
1770 for (bio = rq->bio; bio; bio = bio->bi_next) { 1770 for (bio = rq->bio; bio; bio = bio->bi_next) {
1771 if ((bio->bi_rw & ff) != ff) 1771 if ((bio->bi_rw & ff) != ff)
1772 break; 1772 break;
1773 bytes += bio->bi_size; 1773 bytes += bio->bi_size;
1774 } 1774 }
1775 1775
1776 /* this could lead to infinite loop */ 1776 /* this could lead to infinite loop */
1777 BUG_ON(blk_rq_bytes(rq) && !bytes); 1777 BUG_ON(blk_rq_bytes(rq) && !bytes);
1778 return bytes; 1778 return bytes;
1779 } 1779 }
1780 EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 1780 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
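/*
 * Illustrative sketch, not part of this file: failing only the leading part
 * of a mixed-merge request that shares the first bio's failfast policy, and
 * leaving the rest to be retried.  blk_end_request() takes the queue lock
 * itself, so this would be called without it held.
 */
static bool example_fail_failfast_part(struct request *rq)
{
	/* returns true while part of the request remains to be retried */
	return blk_end_request(rq, -EIO, blk_rq_err_bytes(rq));
}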
1781 1781
1782 static void blk_account_io_completion(struct request *req, unsigned int bytes) 1782 static void blk_account_io_completion(struct request *req, unsigned int bytes)
1783 { 1783 {
1784 if (blk_do_io_stat(req)) { 1784 if (blk_do_io_stat(req)) {
1785 const int rw = rq_data_dir(req); 1785 const int rw = rq_data_dir(req);
1786 struct hd_struct *part; 1786 struct hd_struct *part;
1787 int cpu; 1787 int cpu;
1788 1788
1789 cpu = part_stat_lock(); 1789 cpu = part_stat_lock();
1790 part = req->part; 1790 part = req->part;
1791 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1791 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1792 part_stat_unlock(); 1792 part_stat_unlock();
1793 } 1793 }
1794 } 1794 }
1795 1795
1796 static void blk_account_io_done(struct request *req) 1796 static void blk_account_io_done(struct request *req)
1797 { 1797 {
1798 /* 1798 /*
1799 * Account IO completion. flush_rq isn't accounted as a 1799 * Account IO completion. flush_rq isn't accounted as a
1800 * normal IO on queueing nor completion. Accounting the 1800 * normal IO on queueing nor completion. Accounting the
1801 * containing request is enough. 1801 * containing request is enough.
1802 */ 1802 */
1803 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) { 1803 if (blk_do_io_stat(req) && !(req->cmd_flags & REQ_FLUSH_SEQ)) {
1804 unsigned long duration = jiffies - req->start_time; 1804 unsigned long duration = jiffies - req->start_time;
1805 const int rw = rq_data_dir(req); 1805 const int rw = rq_data_dir(req);
1806 struct hd_struct *part; 1806 struct hd_struct *part;
1807 int cpu; 1807 int cpu;
1808 1808
1809 cpu = part_stat_lock(); 1809 cpu = part_stat_lock();
1810 part = req->part; 1810 part = req->part;
1811 1811
1812 part_stat_inc(cpu, part, ios[rw]); 1812 part_stat_inc(cpu, part, ios[rw]);
1813 part_stat_add(cpu, part, ticks[rw], duration); 1813 part_stat_add(cpu, part, ticks[rw], duration);
1814 part_round_stats(cpu, part); 1814 part_round_stats(cpu, part);
1815 part_dec_in_flight(part, rw); 1815 part_dec_in_flight(part, rw);
1816 1816
1817 hd_struct_put(part); 1817 hd_struct_put(part);
1818 part_stat_unlock(); 1818 part_stat_unlock();
1819 } 1819 }
1820 } 1820 }
1821 1821
1822 /** 1822 /**
1823 * blk_peek_request - peek at the top of a request queue 1823 * blk_peek_request - peek at the top of a request queue
1824 * @q: request queue to peek at 1824 * @q: request queue to peek at
1825 * 1825 *
1826 * Description: 1826 * Description:
1827 * Return the request at the top of @q. The returned request 1827 * Return the request at the top of @q. The returned request
1828 * should be started using blk_start_request() before LLD starts 1828 * should be started using blk_start_request() before LLD starts
1829 * processing it. 1829 * processing it.
1830 * 1830 *
1831 * Return: 1831 * Return:
1832 * Pointer to the request at the top of @q if available. Null 1832 * Pointer to the request at the top of @q if available. Null
1833 * otherwise. 1833 * otherwise.
1834 * 1834 *
1835 * Context: 1835 * Context:
1836 * queue_lock must be held. 1836 * queue_lock must be held.
1837 */ 1837 */
1838 struct request *blk_peek_request(struct request_queue *q) 1838 struct request *blk_peek_request(struct request_queue *q)
1839 { 1839 {
1840 struct request *rq; 1840 struct request *rq;
1841 int ret; 1841 int ret;
1842 1842
1843 while ((rq = __elv_next_request(q)) != NULL) { 1843 while ((rq = __elv_next_request(q)) != NULL) {
1844 if (!(rq->cmd_flags & REQ_STARTED)) { 1844 if (!(rq->cmd_flags & REQ_STARTED)) {
1845 /* 1845 /*
1846 * This is the first time the device driver 1846 * This is the first time the device driver
1847 * sees this request (possibly after 1847 * sees this request (possibly after
1848 * requeueing). Notify IO scheduler. 1848 * requeueing). Notify IO scheduler.
1849 */ 1849 */
1850 if (rq->cmd_flags & REQ_SORTED) 1850 if (rq->cmd_flags & REQ_SORTED)
1851 elv_activate_rq(q, rq); 1851 elv_activate_rq(q, rq);
1852 1852
1853 /* 1853 /*
1854 * just mark as started even if we don't start 1854 * just mark as started even if we don't start
1855 * it; a request that has been delayed should 1855 * it; a request that has been delayed should
1856 * not be passed by new incoming requests 1856 * not be passed by new incoming requests
1857 */ 1857 */
1858 rq->cmd_flags |= REQ_STARTED; 1858 rq->cmd_flags |= REQ_STARTED;
1859 trace_block_rq_issue(q, rq); 1859 trace_block_rq_issue(q, rq);
1860 } 1860 }
1861 1861
1862 if (!q->boundary_rq || q->boundary_rq == rq) { 1862 if (!q->boundary_rq || q->boundary_rq == rq) {
1863 q->end_sector = rq_end_sector(rq); 1863 q->end_sector = rq_end_sector(rq);
1864 q->boundary_rq = NULL; 1864 q->boundary_rq = NULL;
1865 } 1865 }
1866 1866
1867 if (rq->cmd_flags & REQ_DONTPREP) 1867 if (rq->cmd_flags & REQ_DONTPREP)
1868 break; 1868 break;
1869 1869
1870 if (q->dma_drain_size && blk_rq_bytes(rq)) { 1870 if (q->dma_drain_size && blk_rq_bytes(rq)) {
1871 /* 1871 /*
1872 * make sure space for the drain appears. We 1872 * make sure space for the drain appears. We
1873 * know we can do this because max_hw_segments 1873 * know we can do this because max_hw_segments
1874 * has been adjusted to be one fewer than the 1874 * has been adjusted to be one fewer than the
1875 * device can handle 1875 * device can handle
1876 */ 1876 */
1877 rq->nr_phys_segments++; 1877 rq->nr_phys_segments++;
1878 } 1878 }
1879 1879
1880 if (!q->prep_rq_fn) 1880 if (!q->prep_rq_fn)
1881 break; 1881 break;
1882 1882
1883 ret = q->prep_rq_fn(q, rq); 1883 ret = q->prep_rq_fn(q, rq);
1884 if (ret == BLKPREP_OK) { 1884 if (ret == BLKPREP_OK) {
1885 break; 1885 break;
1886 } else if (ret == BLKPREP_DEFER) { 1886 } else if (ret == BLKPREP_DEFER) {
1887 /* 1887 /*
1888 * the request may have been (partially) prepped. 1888 * the request may have been (partially) prepped.
1889 * we need to keep this request in the front to 1889 * we need to keep this request in the front to
1890 * avoid resource deadlock. REQ_STARTED will 1890 * avoid resource deadlock. REQ_STARTED will
1891 * prevent other fs requests from passing this one. 1891 * prevent other fs requests from passing this one.
1892 */ 1892 */
1893 if (q->dma_drain_size && blk_rq_bytes(rq) && 1893 if (q->dma_drain_size && blk_rq_bytes(rq) &&
1894 !(rq->cmd_flags & REQ_DONTPREP)) { 1894 !(rq->cmd_flags & REQ_DONTPREP)) {
1895 /* 1895 /*
1896 * remove the space for the drain we added 1896 * remove the space for the drain we added
1897 * so that we don't add it again 1897 * so that we don't add it again
1898 */ 1898 */
1899 --rq->nr_phys_segments; 1899 --rq->nr_phys_segments;
1900 } 1900 }
1901 1901
1902 rq = NULL; 1902 rq = NULL;
1903 break; 1903 break;
1904 } else if (ret == BLKPREP_KILL) { 1904 } else if (ret == BLKPREP_KILL) {
1905 rq->cmd_flags |= REQ_QUIET; 1905 rq->cmd_flags |= REQ_QUIET;
1906 /* 1906 /*
1907 * Mark this request as started so we don't trigger 1907 * Mark this request as started so we don't trigger
1908 * any debug logic in the end I/O path. 1908 * any debug logic in the end I/O path.
1909 */ 1909 */
1910 blk_start_request(rq); 1910 blk_start_request(rq);
1911 __blk_end_request_all(rq, -EIO); 1911 __blk_end_request_all(rq, -EIO);
1912 } else { 1912 } else {
1913 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); 1913 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
1914 break; 1914 break;
1915 } 1915 }
1916 } 1916 }
1917 1917
1918 return rq; 1918 return rq;
1919 } 1919 }
1920 EXPORT_SYMBOL(blk_peek_request); 1920 EXPORT_SYMBOL(blk_peek_request);
1921 1921
1922 void blk_dequeue_request(struct request *rq) 1922 void blk_dequeue_request(struct request *rq)
1923 { 1923 {
1924 struct request_queue *q = rq->q; 1924 struct request_queue *q = rq->q;
1925 1925
1926 BUG_ON(list_empty(&rq->queuelist)); 1926 BUG_ON(list_empty(&rq->queuelist));
1927 BUG_ON(ELV_ON_HASH(rq)); 1927 BUG_ON(ELV_ON_HASH(rq));
1928 1928
1929 list_del_init(&rq->queuelist); 1929 list_del_init(&rq->queuelist);
1930 1930
1931 /* 1931 /*
1932 * the time frame between a request being removed from the lists 1932 * the time frame between a request being removed from the lists
1933 * and when it is freed is accounted as io that is in progress at 1933 * and when it is freed is accounted as io that is in progress at
1934 * the driver side. 1934 * the driver side.
1935 */ 1935 */
1936 if (blk_account_rq(rq)) { 1936 if (blk_account_rq(rq)) {
1937 q->in_flight[rq_is_sync(rq)]++; 1937 q->in_flight[rq_is_sync(rq)]++;
1938 set_io_start_time_ns(rq); 1938 set_io_start_time_ns(rq);
1939 } 1939 }
1940 } 1940 }
1941 1941
1942 /** 1942 /**
1943 * blk_start_request - start request processing on the driver 1943 * blk_start_request - start request processing on the driver
1944 * @req: request to dequeue 1944 * @req: request to dequeue
1945 * 1945 *
1946 * Description: 1946 * Description:
1947 * Dequeue @req and start timeout timer on it. This hands off the 1947 * Dequeue @req and start timeout timer on it. This hands off the
1948 * request to the driver. 1948 * request to the driver.
1949 * 1949 *
1950 * Block internal functions which don't want to start timer should 1950 * Block internal functions which don't want to start timer should
1951 * call blk_dequeue_request(). 1951 * call blk_dequeue_request().
1952 * 1952 *
1953 * Context: 1953 * Context:
1954 * queue_lock must be held. 1954 * queue_lock must be held.
1955 */ 1955 */
1956 void blk_start_request(struct request *req) 1956 void blk_start_request(struct request *req)
1957 { 1957 {
1958 blk_dequeue_request(req); 1958 blk_dequeue_request(req);
1959 1959
1960 /* 1960 /*
1961 * We are now handing the request to the hardware, initialize 1961 * We are now handing the request to the hardware, initialize
1962 * resid_len to full count and add the timeout handler. 1962 * resid_len to full count and add the timeout handler.
1963 */ 1963 */
1964 req->resid_len = blk_rq_bytes(req); 1964 req->resid_len = blk_rq_bytes(req);
1965 if (unlikely(blk_bidi_rq(req))) 1965 if (unlikely(blk_bidi_rq(req)))
1966 req->next_rq->resid_len = blk_rq_bytes(req->next_rq); 1966 req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
1967 1967
1968 blk_add_timer(req); 1968 blk_add_timer(req);
1969 } 1969 }
1970 EXPORT_SYMBOL(blk_start_request); 1970 EXPORT_SYMBOL(blk_start_request);
1971 1971
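Taken together, blk_peek_request() and blk_start_request() give a driver the peek-then-commit pattern the kernel-doc above describes: look at the head of the queue, and only dequeue the request (and arm its timeout) once the hardware can actually take it. A minimal sketch of a ->request_fn built on that split; struct my_dev, my_dev_busy() and my_dev_issue() are illustrative assumptions, and the queue lock is already held when the block layer invokes ->request_fn, as both helpers require.

static void my_request_fn(struct request_queue *q)
{
	struct my_dev *dev = q->queuedata;	/* assumed driver-private data */
	struct request *rq;

	while ((rq = blk_peek_request(q)) != NULL) {
		if (my_dev_busy(dev))
			break;			/* leave rq queued, try again later */

		blk_start_request(rq);		/* dequeue + start the timeout timer */
		my_dev_issue(dev, rq);		/* hand the request to the hardware */
	}
}
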
1972 /** 1972 /**
1973 * blk_fetch_request - fetch a request from a request queue 1973 * blk_fetch_request - fetch a request from a request queue
1974 * @q: request queue to fetch a request from 1974 * @q: request queue to fetch a request from
1975 * 1975 *
1976 * Description: 1976 * Description:
1977 * Return the request at the top of @q. The request is started on 1977 * Return the request at the top of @q. The request is started on
1978 * return and LLD can start processing it immediately. 1978 * return and LLD can start processing it immediately.
1979 * 1979 *
1980 * Return: 1980 * Return:
1981 * Pointer to the request at the top of @q if available. Null 1981 * Pointer to the request at the top of @q if available. Null
1982 * otherwise. 1982 * otherwise.
1983 * 1983 *
1984 * Context: 1984 * Context:
1985 * queue_lock must be held. 1985 * queue_lock must be held.
1986 */ 1986 */
1987 struct request *blk_fetch_request(struct request_queue *q) 1987 struct request *blk_fetch_request(struct request_queue *q)
1988 { 1988 {
1989 struct request *rq; 1989 struct request *rq;
1990 1990
1991 rq = blk_peek_request(q); 1991 rq = blk_peek_request(q);
1992 if (rq) 1992 if (rq)
1993 blk_start_request(rq); 1993 blk_start_request(rq);
1994 return rq; 1994 return rq;
1995 } 1995 }
1996 EXPORT_SYMBOL(blk_fetch_request); 1996 EXPORT_SYMBOL(blk_fetch_request);
1997 1997
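blk_fetch_request() is simply the peek + start combination above, which is all a driver needs if it can always accept the next request. A sketch of the simplest synchronous ->request_fn built on it; my_do_transfer() (returning 0 or a negative errno) is an assumption, and the __ variant of the completion helper is used because the queue lock is still held here.

static void my_simple_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_fetch_request(q)) != NULL) {
		int err = my_do_transfer(q->queuedata, rq);	/* assumed helper */

		__blk_end_request_all(rq, err);	/* locked variant: queue lock held */
	}
}
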
1998 /** 1998 /**
1999 * blk_update_request - Special helper function for request stacking drivers 1999 * blk_update_request - Special helper function for request stacking drivers
2000 * @req: the request being processed 2000 * @req: the request being processed
2001 * @error: %0 for success, < %0 for error 2001 * @error: %0 for success, < %0 for error
2002 * @nr_bytes: number of bytes to complete @req 2002 * @nr_bytes: number of bytes to complete @req
2003 * 2003 *
2004 * Description: 2004 * Description:
2005 * Ends I/O on a number of bytes attached to @req, but doesn't complete 2005 * Ends I/O on a number of bytes attached to @req, but doesn't complete
2006 * the request structure even if @req doesn't have leftover. 2006 * the request structure even if @req doesn't have leftover.
2007 * If @req has leftover, sets it up for the next range of segments. 2007 * If @req has leftover, sets it up for the next range of segments.
2008 * 2008 *
2009 * This special helper function is only for request stacking drivers 2009 * This special helper function is only for request stacking drivers
2010 * (e.g. request-based dm) so that they can handle partial completion. 2010 * (e.g. request-based dm) so that they can handle partial completion.
2011 * Actual device drivers should use blk_end_request instead. 2011 * Actual device drivers should use blk_end_request instead.
2012 * 2012 *
2013 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 2013 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
2014 * %false return from this function. 2014 * %false return from this function.
2015 * 2015 *
2016 * Return: 2016 * Return:
2017 * %false - this request doesn't have any more data 2017 * %false - this request doesn't have any more data
2018 * %true - this request has more data 2018 * %true - this request has more data
2019 **/ 2019 **/
2020 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) 2020 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
2021 { 2021 {
2022 int total_bytes, bio_nbytes, next_idx = 0; 2022 int total_bytes, bio_nbytes, next_idx = 0;
2023 struct bio *bio; 2023 struct bio *bio;
2024 2024
2025 if (!req->bio) 2025 if (!req->bio)
2026 return false; 2026 return false;
2027 2027
2028 trace_block_rq_complete(req->q, req); 2028 trace_block_rq_complete(req->q, req);
2029 2029
2030 /* 2030 /*
2031 * For fs requests, rq is just carrier of independent bio's 2031 * For fs requests, rq is just carrier of independent bio's
2032 * and each partial completion should be handled separately. 2032 * and each partial completion should be handled separately.
2033 * Reset per-request error on each partial completion. 2033 * Reset per-request error on each partial completion.
2034 * 2034 *
2035 * TODO: tj: This is too subtle. It would be better to let 2035 * TODO: tj: This is too subtle. It would be better to let
2036 * low level drivers do what they see fit. 2036 * low level drivers do what they see fit.
2037 */ 2037 */
2038 if (req->cmd_type == REQ_TYPE_FS) 2038 if (req->cmd_type == REQ_TYPE_FS)
2039 req->errors = 0; 2039 req->errors = 0;
2040 2040
2041 if (error && req->cmd_type == REQ_TYPE_FS && 2041 if (error && req->cmd_type == REQ_TYPE_FS &&
2042 !(req->cmd_flags & REQ_QUIET)) { 2042 !(req->cmd_flags & REQ_QUIET)) {
2043 char *error_type; 2043 char *error_type;
2044 2044
2045 switch (error) { 2045 switch (error) {
2046 case -ENOLINK: 2046 case -ENOLINK:
2047 error_type = "recoverable transport"; 2047 error_type = "recoverable transport";
2048 break; 2048 break;
2049 case -EREMOTEIO: 2049 case -EREMOTEIO:
2050 error_type = "critical target"; 2050 error_type = "critical target";
2051 break; 2051 break;
2052 case -EBADE: 2052 case -EBADE:
2053 error_type = "critical nexus"; 2053 error_type = "critical nexus";
2054 break; 2054 break;
2055 case -EIO: 2055 case -EIO:
2056 default: 2056 default:
2057 error_type = "I/O"; 2057 error_type = "I/O";
2058 break; 2058 break;
2059 } 2059 }
2060 printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n", 2060 printk(KERN_ERR "end_request: %s error, dev %s, sector %llu\n",
2061 error_type, req->rq_disk ? req->rq_disk->disk_name : "?", 2061 error_type, req->rq_disk ? req->rq_disk->disk_name : "?",
2062 (unsigned long long)blk_rq_pos(req)); 2062 (unsigned long long)blk_rq_pos(req));
2063 } 2063 }
2064 2064
2065 blk_account_io_completion(req, nr_bytes); 2065 blk_account_io_completion(req, nr_bytes);
2066 2066
2067 total_bytes = bio_nbytes = 0; 2067 total_bytes = bio_nbytes = 0;
2068 while ((bio = req->bio) != NULL) { 2068 while ((bio = req->bio) != NULL) {
2069 int nbytes; 2069 int nbytes;
2070 2070
2071 if (nr_bytes >= bio->bi_size) { 2071 if (nr_bytes >= bio->bi_size) {
2072 req->bio = bio->bi_next; 2072 req->bio = bio->bi_next;
2073 nbytes = bio->bi_size; 2073 nbytes = bio->bi_size;
2074 req_bio_endio(req, bio, nbytes, error); 2074 req_bio_endio(req, bio, nbytes, error);
2075 next_idx = 0; 2075 next_idx = 0;
2076 bio_nbytes = 0; 2076 bio_nbytes = 0;
2077 } else { 2077 } else {
2078 int idx = bio->bi_idx + next_idx; 2078 int idx = bio->bi_idx + next_idx;
2079 2079
2080 if (unlikely(idx >= bio->bi_vcnt)) { 2080 if (unlikely(idx >= bio->bi_vcnt)) {
2081 blk_dump_rq_flags(req, "__end_that"); 2081 blk_dump_rq_flags(req, "__end_that");
2082 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", 2082 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
2083 __func__, idx, bio->bi_vcnt); 2083 __func__, idx, bio->bi_vcnt);
2084 break; 2084 break;
2085 } 2085 }
2086 2086
2087 nbytes = bio_iovec_idx(bio, idx)->bv_len; 2087 nbytes = bio_iovec_idx(bio, idx)->bv_len;
2088 BIO_BUG_ON(nbytes > bio->bi_size); 2088 BIO_BUG_ON(nbytes > bio->bi_size);
2089 2089
2090 /* 2090 /*
2091 * not a complete bvec done 2091 * not a complete bvec done
2092 */ 2092 */
2093 if (unlikely(nbytes > nr_bytes)) { 2093 if (unlikely(nbytes > nr_bytes)) {
2094 bio_nbytes += nr_bytes; 2094 bio_nbytes += nr_bytes;
2095 total_bytes += nr_bytes; 2095 total_bytes += nr_bytes;
2096 break; 2096 break;
2097 } 2097 }
2098 2098
2099 /* 2099 /*
2100 * advance to the next vector 2100 * advance to the next vector
2101 */ 2101 */
2102 next_idx++; 2102 next_idx++;
2103 bio_nbytes += nbytes; 2103 bio_nbytes += nbytes;
2104 } 2104 }
2105 2105
2106 total_bytes += nbytes; 2106 total_bytes += nbytes;
2107 nr_bytes -= nbytes; 2107 nr_bytes -= nbytes;
2108 2108
2109 bio = req->bio; 2109 bio = req->bio;
2110 if (bio) { 2110 if (bio) {
2111 /* 2111 /*
2112 * end more in this run, or just return 'not-done' 2112 * end more in this run, or just return 'not-done'
2113 */ 2113 */
2114 if (unlikely(nr_bytes <= 0)) 2114 if (unlikely(nr_bytes <= 0))
2115 break; 2115 break;
2116 } 2116 }
2117 } 2117 }
2118 2118
2119 /* 2119 /*
2120 * completely done 2120 * completely done
2121 */ 2121 */
2122 if (!req->bio) { 2122 if (!req->bio) {
2123 /* 2123 /*
2124 * Reset counters so that the request stacking driver 2124 * Reset counters so that the request stacking driver
2125 * can find how many bytes remain in the request 2125 * can find how many bytes remain in the request
2126 * later. 2126 * later.
2127 */ 2127 */
2128 req->__data_len = 0; 2128 req->__data_len = 0;
2129 return false; 2129 return false;
2130 } 2130 }
2131 2131
2132 /* 2132 /*
2133 * if the request wasn't completed, update state 2133 * if the request wasn't completed, update state
2134 */ 2134 */
2135 if (bio_nbytes) { 2135 if (bio_nbytes) {
2136 req_bio_endio(req, bio, bio_nbytes, error); 2136 req_bio_endio(req, bio, bio_nbytes, error);
2137 bio->bi_idx += next_idx; 2137 bio->bi_idx += next_idx;
2138 bio_iovec(bio)->bv_offset += nr_bytes; 2138 bio_iovec(bio)->bv_offset += nr_bytes;
2139 bio_iovec(bio)->bv_len -= nr_bytes; 2139 bio_iovec(bio)->bv_len -= nr_bytes;
2140 } 2140 }
2141 2141
2142 req->__data_len -= total_bytes; 2142 req->__data_len -= total_bytes;
2143 req->buffer = bio_data(req->bio); 2143 req->buffer = bio_data(req->bio);
2144 2144
2145 /* update sector only for requests with clear definition of sector */ 2145 /* update sector only for requests with clear definition of sector */
2146 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) 2146 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
2147 req->__sector += total_bytes >> 9; 2147 req->__sector += total_bytes >> 9;
2148 2148
2149 /* mixed attributes always follow the first bio */ 2149 /* mixed attributes always follow the first bio */
2150 if (req->cmd_flags & REQ_MIXED_MERGE) { 2150 if (req->cmd_flags & REQ_MIXED_MERGE) {
2151 req->cmd_flags &= ~REQ_FAILFAST_MASK; 2151 req->cmd_flags &= ~REQ_FAILFAST_MASK;
2152 req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; 2152 req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
2153 } 2153 }
2154 2154
2155 /* 2155 /*
2156 * If total number of sectors is less than the first segment 2156 * If total number of sectors is less than the first segment
2157 * size, something has gone terribly wrong. 2157 * size, something has gone terribly wrong.
2158 */ 2158 */
2159 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 2159 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
2160 blk_dump_rq_flags(req, "request botched"); 2160 blk_dump_rq_flags(req, "request botched");
2161 req->__data_len = blk_rq_cur_bytes(req); 2161 req->__data_len = blk_rq_cur_bytes(req);
2162 } 2162 }
2163 2163
2164 /* recalculate the number of segments */ 2164 /* recalculate the number of segments */
2165 blk_recalc_rq_segments(req); 2165 blk_recalc_rq_segments(req);
2166 2166
2167 return true; 2167 return true;
2168 } 2168 }
2169 EXPORT_SYMBOL_GPL(blk_update_request); 2169 EXPORT_SYMBOL_GPL(blk_update_request);
2170 2170
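For a request-stacking driver the important part of the contract is the return value: %true means the request still has bytes outstanding, %false means everything has been accounted for but the request itself still has to be finished explicitly. A rough sketch of that pattern, not dm's actual completion path; where the byte count comes from is left as an assumption.

static void my_stack_complete_bytes(struct request *rq, int error,
				    unsigned int done)
{
	if (blk_update_request(rq, error, done))
		return;			/* partial completion, more to come */

	/* all bytes accounted for; now really finish the request */
	blk_end_request_all(rq, error);
}
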
2171 static bool blk_update_bidi_request(struct request *rq, int error, 2171 static bool blk_update_bidi_request(struct request *rq, int error,
2172 unsigned int nr_bytes, 2172 unsigned int nr_bytes,
2173 unsigned int bidi_bytes) 2173 unsigned int bidi_bytes)
2174 { 2174 {
2175 if (blk_update_request(rq, error, nr_bytes)) 2175 if (blk_update_request(rq, error, nr_bytes))
2176 return true; 2176 return true;
2177 2177
2178 /* Bidi request must be completed as a whole */ 2178 /* Bidi request must be completed as a whole */
2179 if (unlikely(blk_bidi_rq(rq)) && 2179 if (unlikely(blk_bidi_rq(rq)) &&
2180 blk_update_request(rq->next_rq, error, bidi_bytes)) 2180 blk_update_request(rq->next_rq, error, bidi_bytes))
2181 return true; 2181 return true;
2182 2182
2183 if (blk_queue_add_random(rq->q)) 2183 if (blk_queue_add_random(rq->q))
2184 add_disk_randomness(rq->rq_disk); 2184 add_disk_randomness(rq->rq_disk);
2185 2185
2186 return false; 2186 return false;
2187 } 2187 }
2188 2188
2189 /** 2189 /**
2190 * blk_unprep_request - unprepare a request 2190 * blk_unprep_request - unprepare a request
2191 * @req: the request 2191 * @req: the request
2192 * 2192 *
2193 * This function makes a request ready for complete resubmission (or 2193 * This function makes a request ready for complete resubmission (or
2194 * completion). It happens only after all error handling is complete, 2194 * completion). It happens only after all error handling is complete,
2195 * so represents the appropriate moment to deallocate any resources 2195 * so represents the appropriate moment to deallocate any resources
2196 * that were allocated to the request in the prep_rq_fn. The queue 2196 * that were allocated to the request in the prep_rq_fn. The queue
2197 * lock is held when calling this. 2197 * lock is held when calling this.
2198 */ 2198 */
2199 void blk_unprep_request(struct request *req) 2199 void blk_unprep_request(struct request *req)
2200 { 2200 {
2201 struct request_queue *q = req->q; 2201 struct request_queue *q = req->q;
2202 2202
2203 req->cmd_flags &= ~REQ_DONTPREP; 2203 req->cmd_flags &= ~REQ_DONTPREP;
2204 if (q->unprep_rq_fn) 2204 if (q->unprep_rq_fn)
2205 q->unprep_rq_fn(q, req); 2205 q->unprep_rq_fn(q, req);
2206 } 2206 }
2207 EXPORT_SYMBOL_GPL(blk_unprep_request); 2207 EXPORT_SYMBOL_GPL(blk_unprep_request);
2208 2208
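The prep/unprep hooks are symmetric: whatever ->prep_rq_fn allocates for a request is released in ->unprep_rq_fn once error handling is done with it. A sketch under that assumption; my_alloc_cmd()/my_free_cmd() are made up, and the hooks would be registered at init time with blk_queue_prep_rq() and blk_queue_unprep_rq().

static int my_prep_rq(struct request_queue *q, struct request *rq)
{
	rq->special = my_alloc_cmd(rq);		/* assumed per-request resource */
	if (!rq->special)
		return BLKPREP_DEFER;		/* retry later, rq stays at the head */

	rq->cmd_flags |= REQ_DONTPREP;		/* don't prep again after a requeue */
	return BLKPREP_OK;
}

static void my_unprep_rq(struct request_queue *q, struct request *rq)
{
	my_free_cmd(rq->special);
	rq->special = NULL;
}
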
2209 /* 2209 /*
2210 * queue lock must be held 2210 * queue lock must be held
2211 */ 2211 */
2212 static void blk_finish_request(struct request *req, int error) 2212 static void blk_finish_request(struct request *req, int error)
2213 { 2213 {
2214 if (blk_rq_tagged(req)) 2214 if (blk_rq_tagged(req))
2215 blk_queue_end_tag(req->q, req); 2215 blk_queue_end_tag(req->q, req);
2216 2216
2217 BUG_ON(blk_queued_rq(req)); 2217 BUG_ON(blk_queued_rq(req));
2218 2218
2219 if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) 2219 if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
2220 laptop_io_completion(&req->q->backing_dev_info); 2220 laptop_io_completion(&req->q->backing_dev_info);
2221 2221
2222 blk_delete_timer(req); 2222 blk_delete_timer(req);
2223 2223
2224 if (req->cmd_flags & REQ_DONTPREP) 2224 if (req->cmd_flags & REQ_DONTPREP)
2225 blk_unprep_request(req); 2225 blk_unprep_request(req);
2226 2226
2227 2227
2228 blk_account_io_done(req); 2228 blk_account_io_done(req);
2229 2229
2230 if (req->end_io) 2230 if (req->end_io)
2231 req->end_io(req, error); 2231 req->end_io(req, error);
2232 else { 2232 else {
2233 if (blk_bidi_rq(req)) 2233 if (blk_bidi_rq(req))
2234 __blk_put_request(req->next_rq->q, req->next_rq); 2234 __blk_put_request(req->next_rq->q, req->next_rq);
2235 2235
2236 __blk_put_request(req->q, req); 2236 __blk_put_request(req->q, req);
2237 } 2237 }
2238 } 2238 }
2239 2239
2240 /** 2240 /**
2241 * blk_end_bidi_request - Complete a bidi request 2241 * blk_end_bidi_request - Complete a bidi request
2242 * @rq: the request to complete 2242 * @rq: the request to complete
2243 * @error: %0 for success, < %0 for error 2243 * @error: %0 for success, < %0 for error
2244 * @nr_bytes: number of bytes to complete @rq 2244 * @nr_bytes: number of bytes to complete @rq
2245 * @bidi_bytes: number of bytes to complete @rq->next_rq 2245 * @bidi_bytes: number of bytes to complete @rq->next_rq
2246 * 2246 *
2247 * Description: 2247 * Description:
2248 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 2248 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
2249 * Drivers that support bidi can safely call this function for any 2249 * Drivers that support bidi can safely call this function for any
2250 * type of request, bidi or uni. In the latter case @bidi_bytes is 2250 * type of request, bidi or uni. In the latter case @bidi_bytes is
2251 * just ignored. 2251 * just ignored.
2252 * 2252 *
2253 * Return: 2253 * Return:
2254 * %false - we are done with this request 2254 * %false - we are done with this request
2255 * %true - still buffers pending for this request 2255 * %true - still buffers pending for this request
2256 **/ 2256 **/
2257 static bool blk_end_bidi_request(struct request *rq, int error, 2257 static bool blk_end_bidi_request(struct request *rq, int error,
2258 unsigned int nr_bytes, unsigned int bidi_bytes) 2258 unsigned int nr_bytes, unsigned int bidi_bytes)
2259 { 2259 {
2260 struct request_queue *q = rq->q; 2260 struct request_queue *q = rq->q;
2261 unsigned long flags; 2261 unsigned long flags;
2262 2262
2263 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2263 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2264 return true; 2264 return true;
2265 2265
2266 spin_lock_irqsave(q->queue_lock, flags); 2266 spin_lock_irqsave(q->queue_lock, flags);
2267 blk_finish_request(rq, error); 2267 blk_finish_request(rq, error);
2268 spin_unlock_irqrestore(q->queue_lock, flags); 2268 spin_unlock_irqrestore(q->queue_lock, flags);
2269 2269
2270 return false; 2270 return false;
2271 } 2271 }
2272 2272
2273 /** 2273 /**
2274 * __blk_end_bidi_request - Complete a bidi request with queue lock held 2274 * __blk_end_bidi_request - Complete a bidi request with queue lock held
2275 * @rq: the request to complete 2275 * @rq: the request to complete
2276 * @error: %0 for success, < %0 for error 2276 * @error: %0 for success, < %0 for error
2277 * @nr_bytes: number of bytes to complete @rq 2277 * @nr_bytes: number of bytes to complete @rq
2278 * @bidi_bytes: number of bytes to complete @rq->next_rq 2278 * @bidi_bytes: number of bytes to complete @rq->next_rq
2279 * 2279 *
2280 * Description: 2280 * Description:
2281 * Identical to blk_end_bidi_request() except that queue lock is 2281 * Identical to blk_end_bidi_request() except that queue lock is
2282 * assumed to be locked on entry and remains so on return. 2282 * assumed to be locked on entry and remains so on return.
2283 * 2283 *
2284 * Return: 2284 * Return:
2285 * %false - we are done with this request 2285 * %false - we are done with this request
2286 * %true - still buffers pending for this request 2286 * %true - still buffers pending for this request
2287 **/ 2287 **/
2288 static bool __blk_end_bidi_request(struct request *rq, int error, 2288 static bool __blk_end_bidi_request(struct request *rq, int error,
2289 unsigned int nr_bytes, unsigned int bidi_bytes) 2289 unsigned int nr_bytes, unsigned int bidi_bytes)
2290 { 2290 {
2291 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2291 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2292 return true; 2292 return true;
2293 2293
2294 blk_finish_request(rq, error); 2294 blk_finish_request(rq, error);
2295 2295
2296 return false; 2296 return false;
2297 } 2297 }
2298 2298
2299 /** 2299 /**
2300 * blk_end_request - Helper function for drivers to complete the request. 2300 * blk_end_request - Helper function for drivers to complete the request.
2301 * @rq: the request being processed 2301 * @rq: the request being processed
2302 * @error: %0 for success, < %0 for error 2302 * @error: %0 for success, < %0 for error
2303 * @nr_bytes: number of bytes to complete 2303 * @nr_bytes: number of bytes to complete
2304 * 2304 *
2305 * Description: 2305 * Description:
2306 * Ends I/O on a number of bytes attached to @rq. 2306 * Ends I/O on a number of bytes attached to @rq.
2307 * If @rq has leftover, sets it up for the next range of segments. 2307 * If @rq has leftover, sets it up for the next range of segments.
2308 * 2308 *
2309 * Return: 2309 * Return:
2310 * %false - we are done with this request 2310 * %false - we are done with this request
2311 * %true - still buffers pending for this request 2311 * %true - still buffers pending for this request
2312 **/ 2312 **/
2313 bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 2313 bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2314 { 2314 {
2315 return blk_end_bidi_request(rq, error, nr_bytes, 0); 2315 return blk_end_bidi_request(rq, error, nr_bytes, 0);
2316 } 2316 }
2317 EXPORT_SYMBOL(blk_end_request); 2317 EXPORT_SYMBOL(blk_end_request);
2318 2318
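blk_end_request() is the unlocked variant a driver typically calls from its completion (e.g. interrupt) path; it takes the queue lock itself once the request is finished. A sketch in which the hardware reports how many bytes completed; my_hw_bytes_done() and my_dev_issue() are assumptions carried over from the earlier sketches.

static void my_complete_irq(struct my_dev *dev, struct request *rq, int error)
{
	unsigned int bytes = my_hw_bytes_done(dev);	/* assumed HW status read */

	if (blk_end_request(rq, error, bytes))
		my_dev_issue(dev, rq);	/* not done yet: program the next chunk */
}
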
2319 /** 2319 /**
2320 * blk_end_request_all - Helper function for drivers to finish the request. 2320 * blk_end_request_all - Helper function for drivers to finish the request.
2321 * @rq: the request to finish 2321 * @rq: the request to finish
2322 * @error: %0 for success, < %0 for error 2322 * @error: %0 for success, < %0 for error
2323 * 2323 *
2324 * Description: 2324 * Description:
2325 * Completely finish @rq. 2325 * Completely finish @rq.
2326 */ 2326 */
2327 void blk_end_request_all(struct request *rq, int error) 2327 void blk_end_request_all(struct request *rq, int error)
2328 { 2328 {
2329 bool pending; 2329 bool pending;
2330 unsigned int bidi_bytes = 0; 2330 unsigned int bidi_bytes = 0;
2331 2331
2332 if (unlikely(blk_bidi_rq(rq))) 2332 if (unlikely(blk_bidi_rq(rq)))
2333 bidi_bytes = blk_rq_bytes(rq->next_rq); 2333 bidi_bytes = blk_rq_bytes(rq->next_rq);
2334 2334
2335 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2335 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2336 BUG_ON(pending); 2336 BUG_ON(pending);
2337 } 2337 }
2338 EXPORT_SYMBOL(blk_end_request_all); 2338 EXPORT_SYMBOL(blk_end_request_all);
2339 2339
2340 /** 2340 /**
2341 * blk_end_request_cur - Helper function to finish the current request chunk. 2341 * blk_end_request_cur - Helper function to finish the current request chunk.
2342 * @rq: the request to finish the current chunk for 2342 * @rq: the request to finish the current chunk for
2343 * @error: %0 for success, < %0 for error 2343 * @error: %0 for success, < %0 for error
2344 * 2344 *
2345 * Description: 2345 * Description:
2346 * Complete the current consecutively mapped chunk from @rq. 2346 * Complete the current consecutively mapped chunk from @rq.
2347 * 2347 *
2348 * Return: 2348 * Return:
2349 * %false - we are done with this request 2349 * %false - we are done with this request
2350 * %true - still buffers pending for this request 2350 * %true - still buffers pending for this request
2351 */ 2351 */
2352 bool blk_end_request_cur(struct request *rq, int error) 2352 bool blk_end_request_cur(struct request *rq, int error)
2353 { 2353 {
2354 return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2354 return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2355 } 2355 }
2356 EXPORT_SYMBOL(blk_end_request_cur); 2356 EXPORT_SYMBOL(blk_end_request_cur);
2357 2357
2358 /** 2358 /**
2359 * blk_end_request_err - Finish a request till the next failure boundary. 2359 * blk_end_request_err - Finish a request till the next failure boundary.
2360 * @rq: the request to finish till the next failure boundary for 2360 * @rq: the request to finish till the next failure boundary for
2361 * @error: must be negative errno 2361 * @error: must be negative errno
2362 * 2362 *
2363 * Description: 2363 * Description:
2364 * Complete @rq till the next failure boundary. 2364 * Complete @rq till the next failure boundary.
2365 * 2365 *
2366 * Return: 2366 * Return:
2367 * %false - we are done with this request 2367 * %false - we are done with this request
2368 * %true - still buffers pending for this request 2368 * %true - still buffers pending for this request
2369 */ 2369 */
2370 bool blk_end_request_err(struct request *rq, int error) 2370 bool blk_end_request_err(struct request *rq, int error)
2371 { 2371 {
2372 WARN_ON(error >= 0); 2372 WARN_ON(error >= 0);
2373 return blk_end_request(rq, error, blk_rq_err_bytes(rq)); 2373 return blk_end_request(rq, error, blk_rq_err_bytes(rq));
2374 } 2374 }
2375 EXPORT_SYMBOL_GPL(blk_end_request_err); 2375 EXPORT_SYMBOL_GPL(blk_end_request_err);
2376 2376
2377 /** 2377 /**
2378 * __blk_end_request - Helper function for drivers to complete the request. 2378 * __blk_end_request - Helper function for drivers to complete the request.
2379 * @rq: the request being processed 2379 * @rq: the request being processed
2380 * @error: %0 for success, < %0 for error 2380 * @error: %0 for success, < %0 for error
2381 * @nr_bytes: number of bytes to complete 2381 * @nr_bytes: number of bytes to complete
2382 * 2382 *
2383 * Description: 2383 * Description:
2384 * Must be called with queue lock held unlike blk_end_request(). 2384 * Must be called with queue lock held unlike blk_end_request().
2385 * 2385 *
2386 * Return: 2386 * Return:
2387 * %false - we are done with this request 2387 * %false - we are done with this request
2388 * %true - still buffers pending for this request 2388 * %true - still buffers pending for this request
2389 **/ 2389 **/
2390 bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 2390 bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2391 { 2391 {
2392 return __blk_end_bidi_request(rq, error, nr_bytes, 0); 2392 return __blk_end_bidi_request(rq, error, nr_bytes, 0);
2393 } 2393 }
2394 EXPORT_SYMBOL(__blk_end_request); 2394 EXPORT_SYMBOL(__blk_end_request);
2395 2395
2396 /** 2396 /**
2397 * __blk_end_request_all - Helper function for drivers to finish the request. 2397 * __blk_end_request_all - Helper function for drivers to finish the request.
2398 * @rq: the request to finish 2398 * @rq: the request to finish
2399 * @error: %0 for success, < %0 for error 2399 * @error: %0 for success, < %0 for error
2400 * 2400 *
2401 * Description: 2401 * Description:
2402 * Completely finish @rq. Must be called with queue lock held. 2402 * Completely finish @rq. Must be called with queue lock held.
2403 */ 2403 */
2404 void __blk_end_request_all(struct request *rq, int error) 2404 void __blk_end_request_all(struct request *rq, int error)
2405 { 2405 {
2406 bool pending; 2406 bool pending;
2407 unsigned int bidi_bytes = 0; 2407 unsigned int bidi_bytes = 0;
2408 2408
2409 if (unlikely(blk_bidi_rq(rq))) 2409 if (unlikely(blk_bidi_rq(rq)))
2410 bidi_bytes = blk_rq_bytes(rq->next_rq); 2410 bidi_bytes = blk_rq_bytes(rq->next_rq);
2411 2411
2412 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2412 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2413 BUG_ON(pending); 2413 BUG_ON(pending);
2414 } 2414 }
2415 EXPORT_SYMBOL(__blk_end_request_all); 2415 EXPORT_SYMBOL(__blk_end_request_all);
2416 2416
2417 /** 2417 /**
2418 * __blk_end_request_cur - Helper function to finish the current request chunk. 2418 * __blk_end_request_cur - Helper function to finish the current request chunk.
2419 * @rq: the request to finish the current chunk for 2419 * @rq: the request to finish the current chunk for
2420 * @error: %0 for success, < %0 for error 2420 * @error: %0 for success, < %0 for error
2421 * 2421 *
2422 * Description: 2422 * Description:
2423 * Complete the current consecutively mapped chunk from @rq. Must 2423 * Complete the current consecutively mapped chunk from @rq. Must
2424 * be called with queue lock held. 2424 * be called with queue lock held.
2425 * 2425 *
2426 * Return: 2426 * Return:
2427 * %false - we are done with this request 2427 * %false - we are done with this request
2428 * %true - still buffers pending for this request 2428 * %true - still buffers pending for this request
2429 */ 2429 */
2430 bool __blk_end_request_cur(struct request *rq, int error) 2430 bool __blk_end_request_cur(struct request *rq, int error)
2431 { 2431 {
2432 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2432 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2433 } 2433 }
2434 EXPORT_SYMBOL(__blk_end_request_cur); 2434 EXPORT_SYMBOL(__blk_end_request_cur);
2435 2435
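The _cur helpers fit the classic chunk-at-a-time teaching drivers: transfer the current contiguous piece (blk_rq_cur_bytes() worth of data starting at blk_rq_pos()), complete just that much, and stay on the same request while the helper returns %true. A sketch with the queue lock held, hence the __ variant; my_transfer_chunk() is an assumption.

static void my_chunked_request_fn(struct request_queue *q)
{
	struct request *rq = blk_fetch_request(q);

	while (rq) {
		int err = my_transfer_chunk(q->queuedata, blk_rq_pos(rq),
					    rq->buffer, blk_rq_cur_bytes(rq));

		if (!__blk_end_request_cur(rq, err))
			rq = blk_fetch_request(q);	/* done, move to the next one */
	}
}
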
2436 /** 2436 /**
2437 * __blk_end_request_err - Finish a request till the next failure boundary. 2437 * __blk_end_request_err - Finish a request till the next failure boundary.
2438 * @rq: the request to finish till the next failure boundary for 2438 * @rq: the request to finish till the next failure boundary for
2439 * @error: must be negative errno 2439 * @error: must be negative errno
2440 * 2440 *
2441 * Description: 2441 * Description:
2442 * Complete @rq till the next failure boundary. Must be called 2442 * Complete @rq till the next failure boundary. Must be called
2443 * with queue lock held. 2443 * with queue lock held.
2444 * 2444 *
2445 * Return: 2445 * Return:
2446 * %false - we are done with this request 2446 * %false - we are done with this request
2447 * %true - still buffers pending for this request 2447 * %true - still buffers pending for this request
2448 */ 2448 */
2449 bool __blk_end_request_err(struct request *rq, int error) 2449 bool __blk_end_request_err(struct request *rq, int error)
2450 { 2450 {
2451 WARN_ON(error >= 0); 2451 WARN_ON(error >= 0);
2452 return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); 2452 return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
2453 } 2453 }
2454 EXPORT_SYMBOL_GPL(__blk_end_request_err); 2454 EXPORT_SYMBOL_GPL(__blk_end_request_err);
2455 2455
2456 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2456 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2457 struct bio *bio) 2457 struct bio *bio)
2458 { 2458 {
2459 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2459 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2460 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2460 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2461 2461
2462 if (bio_has_data(bio)) { 2462 if (bio_has_data(bio)) {
2463 rq->nr_phys_segments = bio_phys_segments(q, bio); 2463 rq->nr_phys_segments = bio_phys_segments(q, bio);
2464 rq->buffer = bio_data(bio); 2464 rq->buffer = bio_data(bio);
2465 } 2465 }
2466 rq->__data_len = bio->bi_size; 2466 rq->__data_len = bio->bi_size;
2467 rq->bio = rq->biotail = bio; 2467 rq->bio = rq->biotail = bio;
2468 2468
2469 if (bio->bi_bdev) 2469 if (bio->bi_bdev)
2470 rq->rq_disk = bio->bi_bdev->bd_disk; 2470 rq->rq_disk = bio->bi_bdev->bd_disk;
2471 } 2471 }
2472 2472
2473 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 2473 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2474 /** 2474 /**
2475 * rq_flush_dcache_pages - Helper function to flush all pages in a request 2475 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2476 * @rq: the request to be flushed 2476 * @rq: the request to be flushed
2477 * 2477 *
2478 * Description: 2478 * Description:
2479 * Flush all pages in @rq. 2479 * Flush all pages in @rq.
2480 */ 2480 */
2481 void rq_flush_dcache_pages(struct request *rq) 2481 void rq_flush_dcache_pages(struct request *rq)
2482 { 2482 {
2483 struct req_iterator iter; 2483 struct req_iterator iter;
2484 struct bio_vec *bvec; 2484 struct bio_vec *bvec;
2485 2485
2486 rq_for_each_segment(bvec, rq, iter) 2486 rq_for_each_segment(bvec, rq, iter)
2487 flush_dcache_page(bvec->bv_page); 2487 flush_dcache_page(bvec->bv_page);
2488 } 2488 }
2489 EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); 2489 EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2490 #endif 2490 #endif
2491 2491
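On architectures that define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE, a driver that fills a read request by CPU copy should flush the request's pages before completing it, which is exactly what the helper above wraps. A small sketch; my_copy_to_request() is an assumption.

static void my_finish_read(struct request *rq)
{
	my_copy_to_request(rq);		/* CPU writes data into the request's pages */
	rq_flush_dcache_pages(rq);	/* keep the D-cache coherent with user mappings */
	blk_end_request_all(rq, 0);
}
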
2492 /** 2492 /**
2493 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2493 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2494 * @q : the queue of the device being checked 2494 * @q : the queue of the device being checked
2495 * 2495 *
2496 * Description: 2496 * Description:
2497 * Check if underlying low-level drivers of a device are busy. 2497 * Check if underlying low-level drivers of a device are busy.
2498 * If the drivers want to export their busy state, they must set own 2498 * If the drivers want to export their busy state, they must set own
2499 * exporting function using blk_queue_lld_busy() first. 2499 * exporting function using blk_queue_lld_busy() first.
2500 * 2500 *
2501 * Basically, this function is used only by request stacking drivers 2501 * Basically, this function is used only by request stacking drivers
2502 * to stop dispatching requests to underlying devices when underlying 2502 * to stop dispatching requests to underlying devices when underlying
2503 * devices are busy. This behavior helps more I/O merging on the queue 2503 * devices are busy. This behavior helps more I/O merging on the queue
2504 * of the request stacking driver and prevents I/O throughput regression 2504 * of the request stacking driver and prevents I/O throughput regression
2505 * on burst I/O load. 2505 * on burst I/O load.
2506 * 2506 *
2507 * Return: 2507 * Return:
2508 * 0 - Not busy (The request stacking driver should dispatch request) 2508 * 0 - Not busy (The request stacking driver should dispatch request)
2509 * 1 - Busy (The request stacking driver should stop dispatching request) 2509 * 1 - Busy (The request stacking driver should stop dispatching request)
2510 */ 2510 */
2511 int blk_lld_busy(struct request_queue *q) 2511 int blk_lld_busy(struct request_queue *q)
2512 { 2512 {
2513 if (q->lld_busy_fn) 2513 if (q->lld_busy_fn)
2514 return q->lld_busy_fn(q); 2514 return q->lld_busy_fn(q);
2515 2515
2516 return 0; 2516 return 0;
2517 } 2517 }
2518 EXPORT_SYMBOL_GPL(blk_lld_busy); 2518 EXPORT_SYMBOL_GPL(blk_lld_busy);
2519 2519
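Both halves of that contract are small. The low-level driver publishes a callback (registered with blk_queue_lld_busy()) reporting whether its hardware queue is saturated, and the stacking driver polls it before dispatching. A sketch under those assumptions; struct my_dev and my_hw_queue_full() are made up.

static int my_lld_busy(struct request_queue *q)
{
	struct my_dev *dev = q->queuedata;

	return my_hw_queue_full(dev);		/* non-zero means busy */
}

/* low-level driver init: blk_queue_lld_busy(q, my_lld_busy); */

static bool my_stack_may_dispatch(struct request_queue *underlying_q)
{
	return !blk_lld_busy(underlying_q);	/* hold requests back while busy */
}
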
2520 /** 2520 /**
2521 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request 2521 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2522 * @rq: the clone request to be cleaned up 2522 * @rq: the clone request to be cleaned up
2523 * 2523 *
2524 * Description: 2524 * Description:
2525 * Free all bios in @rq for a cloned request. 2525 * Free all bios in @rq for a cloned request.
2526 */ 2526 */
2527 void blk_rq_unprep_clone(struct request *rq) 2527 void blk_rq_unprep_clone(struct request *rq)
2528 { 2528 {
2529 struct bio *bio; 2529 struct bio *bio;
2530 2530
2531 while ((bio = rq->bio) != NULL) { 2531 while ((bio = rq->bio) != NULL) {
2532 rq->bio = bio->bi_next; 2532 rq->bio = bio->bi_next;
2533 2533
2534 bio_put(bio); 2534 bio_put(bio);
2535 } 2535 }
2536 } 2536 }
2537 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); 2537 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2538 2538
2539 /* 2539 /*
2540 * Copy attributes of the original request to the clone request. 2540 * Copy attributes of the original request to the clone request.
2541 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2541 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
2542 */ 2542 */
2543 static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2543 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2544 { 2544 {
2545 dst->cpu = src->cpu; 2545 dst->cpu = src->cpu;
2546 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE; 2546 dst->cmd_flags = (src->cmd_flags & REQ_CLONE_MASK) | REQ_NOMERGE;
2547 dst->cmd_type = src->cmd_type; 2547 dst->cmd_type = src->cmd_type;
2548 dst->__sector = blk_rq_pos(src); 2548 dst->__sector = blk_rq_pos(src);
2549 dst->__data_len = blk_rq_bytes(src); 2549 dst->__data_len = blk_rq_bytes(src);
2550 dst->nr_phys_segments = src->nr_phys_segments; 2550 dst->nr_phys_segments = src->nr_phys_segments;
2551 dst->ioprio = src->ioprio; 2551 dst->ioprio = src->ioprio;
2552 dst->extra_len = src->extra_len; 2552 dst->extra_len = src->extra_len;
2553 } 2553 }
2554 2554
2555 /** 2555 /**
2556 * blk_rq_prep_clone - Helper function to setup clone request 2556 * blk_rq_prep_clone - Helper function to setup clone request
2557 * @rq: the request to be setup 2557 * @rq: the request to be setup
2558 * @rq_src: original request to be cloned 2558 * @rq_src: original request to be cloned
2559 * @bs: bio_set that bios for clone are allocated from 2559 * @bs: bio_set that bios for clone are allocated from
2560 * @gfp_mask: memory allocation mask for bio 2560 * @gfp_mask: memory allocation mask for bio
2561 * @bio_ctr: setup function to be called for each clone bio. 2561 * @bio_ctr: setup function to be called for each clone bio.
2562 * Returns %0 for success, non %0 for failure. 2562 * Returns %0 for success, non %0 for failure.
2563 * @data: private data to be passed to @bio_ctr 2563 * @data: private data to be passed to @bio_ctr
2564 * 2564 *
2565 * Description: 2565 * Description:
2566 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2566 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2567 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2567 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
2568 * are not copied, and copying such parts is the caller's responsibility. 2568 * are not copied, and copying such parts is the caller's responsibility.
2569 * Also, pages which the original bios are pointing to are not copied 2569 * Also, pages which the original bios are pointing to are not copied
2570 * and the cloned bios just point to the same pages. 2570 * and the cloned bios just point to the same pages.
2571 * So cloned bios must be completed before original bios, which means 2571 * So cloned bios must be completed before original bios, which means
2572 * the caller must complete @rq before @rq_src. 2572 * the caller must complete @rq before @rq_src.
2573 */ 2573 */
2574 int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 2574 int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2575 struct bio_set *bs, gfp_t gfp_mask, 2575 struct bio_set *bs, gfp_t gfp_mask,
2576 int (*bio_ctr)(struct bio *, struct bio *, void *), 2576 int (*bio_ctr)(struct bio *, struct bio *, void *),
2577 void *data) 2577 void *data)
2578 { 2578 {
2579 struct bio *bio, *bio_src; 2579 struct bio *bio, *bio_src;
2580 2580
2581 if (!bs) 2581 if (!bs)
2582 bs = fs_bio_set; 2582 bs = fs_bio_set;
2583 2583
2584 blk_rq_init(NULL, rq); 2584 blk_rq_init(NULL, rq);
2585 2585
2586 __rq_for_each_bio(bio_src, rq_src) { 2586 __rq_for_each_bio(bio_src, rq_src) {
2587 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); 2587 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
2588 if (!bio) 2588 if (!bio)
2589 goto free_and_out; 2589 goto free_and_out;
2590 2590
2591 __bio_clone(bio, bio_src); 2591 __bio_clone(bio, bio_src);
2592 2592
2593 if (bio_integrity(bio_src) && 2593 if (bio_integrity(bio_src) &&
2594 bio_integrity_clone(bio, bio_src, gfp_mask, bs)) 2594 bio_integrity_clone(bio, bio_src, gfp_mask, bs))
2595 goto free_and_out; 2595 goto free_and_out;
2596 2596
2597 if (bio_ctr && bio_ctr(bio, bio_src, data)) 2597 if (bio_ctr && bio_ctr(bio, bio_src, data))
2598 goto free_and_out; 2598 goto free_and_out;
2599 2599
2600 if (rq->bio) { 2600 if (rq->bio) {
2601 rq->biotail->bi_next = bio; 2601 rq->biotail->bi_next = bio;
2602 rq->biotail = bio; 2602 rq->biotail = bio;
2603 } else 2603 } else
2604 rq->bio = rq->biotail = bio; 2604 rq->bio = rq->biotail = bio;
2605 } 2605 }
2606 2606
2607 __blk_rq_prep_clone(rq, rq_src); 2607 __blk_rq_prep_clone(rq, rq_src);
2608 2608
2609 return 0; 2609 return 0;
2610 2610
2611 free_and_out: 2611 free_and_out:
2612 if (bio) 2612 if (bio)
2613 bio_free(bio, bs); 2613 bio_free(bio, bs);
2614 blk_rq_unprep_clone(rq); 2614 blk_rq_unprep_clone(rq);
2615 2615
2616 return -ENOMEM; 2616 return -ENOMEM;
2617 } 2617 }
2618 EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2618 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
2619 2619
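Seen from a stacking driver, the pair above is allocate-clone / issue / free-clone. A minimal sketch; passing a NULL bio_set (meaning fs_bio_set, as the code shows) and GFP_ATOMIC are assumptions about the caller's context, as is stashing the original request in end_io_data.

static int my_stack_setup_clone(struct request *clone, struct request *orig)
{
	int ret;

	ret = blk_rq_prep_clone(clone, orig, NULL, GFP_ATOMIC, NULL, NULL);
	if (ret)
		return ret;			/* -ENOMEM: bios could not be cloned */

	clone->end_io_data = orig;		/* assumed back-pointer for completion */
	return 0;
}

static void my_stack_free_clone(struct request *clone)
{
	blk_rq_unprep_clone(clone);		/* drop the cloned bios */
}
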
2620 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2620 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2621 { 2621 {
2622 return queue_work(kblockd_workqueue, work); 2622 return queue_work(kblockd_workqueue, work);
2623 } 2623 }
2624 EXPORT_SYMBOL(kblockd_schedule_work); 2624 EXPORT_SYMBOL(kblockd_schedule_work);
2625 2625
2626 int kblockd_schedule_delayed_work(struct request_queue *q, 2626 int kblockd_schedule_delayed_work(struct request_queue *q,
2627 struct delayed_work *dwork, unsigned long delay) 2627 struct delayed_work *dwork, unsigned long delay)
2628 { 2628 {
2629 return queue_delayed_work(kblockd_workqueue, dwork, delay); 2629 return queue_delayed_work(kblockd_workqueue, dwork, delay);
2630 } 2630 }
2631 EXPORT_SYMBOL(kblockd_schedule_delayed_work); 2631 EXPORT_SYMBOL(kblockd_schedule_delayed_work);
2632 2632
2633 #define PLUG_MAGIC 0x91827364 2633 #define PLUG_MAGIC 0x91827364
2634 2634
2635 void blk_start_plug(struct blk_plug *plug) 2635 void blk_start_plug(struct blk_plug *plug)
2636 { 2636 {
2637 struct task_struct *tsk = current; 2637 struct task_struct *tsk = current;
2638 2638
2639 plug->magic = PLUG_MAGIC; 2639 plug->magic = PLUG_MAGIC;
2640 INIT_LIST_HEAD(&plug->list); 2640 INIT_LIST_HEAD(&plug->list);
2641 plug->should_sort = 0; 2641 plug->should_sort = 0;
2642 2642
2643 /* 2643 /*
2644 * If this is a nested plug, don't actually assign it. It will be 2644 * If this is a nested plug, don't actually assign it. It will be
2645 * flushed on its own. 2645 * flushed on its own.
2646 */ 2646 */
2647 if (!tsk->plug) { 2647 if (!tsk->plug) {
2648 /* 2648 /*
2649 * Store ordering should not be needed here, since a potential 2649 * Store ordering should not be needed here, since a potential
2650 * preempt will imply a full memory barrier 2650 * preempt will imply a full memory barrier
2651 */ 2651 */
2652 tsk->plug = plug; 2652 tsk->plug = plug;
2653 } 2653 }
2654 } 2654 }
2655 EXPORT_SYMBOL(blk_start_plug); 2655 EXPORT_SYMBOL(blk_start_plug);
2656 2656
2657 static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) 2657 static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
2658 { 2658 {
2659 struct request *rqa = container_of(a, struct request, queuelist); 2659 struct request *rqa = container_of(a, struct request, queuelist);
2660 struct request *rqb = container_of(b, struct request, queuelist); 2660 struct request *rqb = container_of(b, struct request, queuelist);
2661 2661
2662 return !(rqa->q <= rqb->q); 2662 return !(rqa->q <= rqb->q);
2663 } 2663 }
2664 2664
2665 /*
2666 * If 'from_schedule' is true, then postpone the dispatch of requests
2667 * until a safe kblockd context. We do this to avoid accidental big
2668 * additional stack usage in driver dispatch, in places where the original
2669 * plugger did not intend it.
2670 */
2665 static void queue_unplugged(struct request_queue *q, unsigned int depth, 2671 static void queue_unplugged(struct request_queue *q, unsigned int depth,
2666 bool force_kblockd) 2672 bool from_schedule)
2667 { 2673 {
2668 trace_block_unplug_io(q, depth); 2674 trace_block_unplug(q, depth, !from_schedule);
2669 __blk_run_queue(q, force_kblockd); 2675 __blk_run_queue(q, from_schedule);
2670 2676
2671 if (q->unplugged_fn) 2677 if (q->unplugged_fn)
2672 q->unplugged_fn(q); 2678 q->unplugged_fn(q);
2673 } 2679 }
2674 2680
2675 void blk_flush_plug_list(struct blk_plug *plug, bool force_kblockd) 2681 void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
2676 { 2682 {
2677 struct request_queue *q; 2683 struct request_queue *q;
2678 unsigned long flags; 2684 unsigned long flags;
2679 struct request *rq; 2685 struct request *rq;
2680 LIST_HEAD(list); 2686 LIST_HEAD(list);
2681 unsigned int depth; 2687 unsigned int depth;
2682 2688
2683 BUG_ON(plug->magic != PLUG_MAGIC); 2689 BUG_ON(plug->magic != PLUG_MAGIC);
2684 2690
2685 if (list_empty(&plug->list)) 2691 if (list_empty(&plug->list))
2686 return; 2692 return;
2687 2693
2688 list_splice_init(&plug->list, &list); 2694 list_splice_init(&plug->list, &list);
2689 2695
2690 if (plug->should_sort) { 2696 if (plug->should_sort) {
2691 list_sort(NULL, &list, plug_rq_cmp); 2697 list_sort(NULL, &list, plug_rq_cmp);
2692 plug->should_sort = 0; 2698 plug->should_sort = 0;
2693 } 2699 }
2694 2700
2695 q = NULL; 2701 q = NULL;
2696 depth = 0; 2702 depth = 0;
2697 2703
2698 /* 2704 /*
2699 * Save and disable interrupts here, to avoid doing it for every 2705 * Save and disable interrupts here, to avoid doing it for every
2700 * queue lock we have to take. 2706 * queue lock we have to take.
2701 */ 2707 */
2702 local_irq_save(flags); 2708 local_irq_save(flags);
2703 while (!list_empty(&list)) { 2709 while (!list_empty(&list)) {
2704 rq = list_entry_rq(list.next); 2710 rq = list_entry_rq(list.next);
2705 list_del_init(&rq->queuelist); 2711 list_del_init(&rq->queuelist);
2706 BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG)); 2712 BUG_ON(!(rq->cmd_flags & REQ_ON_PLUG));
2707 BUG_ON(!rq->q); 2713 BUG_ON(!rq->q);
2708 if (rq->q != q) { 2714 if (rq->q != q) {
2709 if (q) { 2715 if (q) {
2710 queue_unplugged(q, depth, force_kblockd); 2716 queue_unplugged(q, depth, from_schedule);
2711 spin_unlock(q->queue_lock); 2717 spin_unlock(q->queue_lock);
2712 } 2718 }
2713 q = rq->q; 2719 q = rq->q;
2714 depth = 0; 2720 depth = 0;
2715 spin_lock(q->queue_lock); 2721 spin_lock(q->queue_lock);
2716 } 2722 }
2717 rq->cmd_flags &= ~REQ_ON_PLUG; 2723 rq->cmd_flags &= ~REQ_ON_PLUG;
2718 2724
2719 /* 2725 /*
2720 * rq is already accounted, so use raw insert 2726 * rq is already accounted, so use raw insert
2721 */ 2727 */
2722 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) 2728 if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA))
2723 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH); 2729 __elv_add_request(q, rq, ELEVATOR_INSERT_FLUSH);
2724 else 2730 else
2725 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE); 2731 __elv_add_request(q, rq, ELEVATOR_INSERT_SORT_MERGE);
2726 2732
2727 depth++; 2733 depth++;
2728 } 2734 }
2729 2735
2730 if (q) { 2736 if (q) {
2731 queue_unplugged(q, depth, force_kblockd); 2737 queue_unplugged(q, depth, from_schedule);
2732 spin_unlock(q->queue_lock); 2738 spin_unlock(q->queue_lock);
2733 } 2739 }
2734 2740
2735 local_irq_restore(flags); 2741 local_irq_restore(flags);
2736 } 2742 }
2737 EXPORT_SYMBOL(blk_flush_plug_list); 2743 EXPORT_SYMBOL(blk_flush_plug_list);
2738 2744
2739 void blk_finish_plug(struct blk_plug *plug) 2745 void blk_finish_plug(struct blk_plug *plug)
2740 { 2746 {
2741 blk_flush_plug_list(plug, false); 2747 blk_flush_plug_list(plug, false);
2742 2748
2743 if (plug == current->plug) 2749 if (plug == current->plug)
2744 current->plug = NULL; 2750 current->plug = NULL;
2745 } 2751 }
2746 EXPORT_SYMBOL(blk_finish_plug); 2752 EXPORT_SYMBOL(blk_finish_plug);
2747 2753
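The submit side is symmetric: open a plug, queue a run of IO, close the plug. Closing it is the explicit (IO) unplug; if the task blocks and schedules while the plug is still set, the list is flushed from the schedule() path instead, which is the implicit unplug the renamed trace event now reports. A sketch; the array of prepared read bios is an assumption.

static void my_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);	/* queued on the task's plug list */
	blk_finish_plug(&plug);			/* explicit unplug, from_schedule == false */
}
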
2748 int __init blk_dev_init(void) 2754 int __init blk_dev_init(void)
2749 { 2755 {
2750 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2756 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
2751 sizeof(((struct request *)0)->cmd_flags)); 2757 sizeof(((struct request *)0)->cmd_flags));
2752 2758
2753 /* used for unplugging and affects IO latency/throughput - HIGHPRI */ 2759 /* used for unplugging and affects IO latency/throughput - HIGHPRI */
2754 kblockd_workqueue = alloc_workqueue("kblockd", 2760 kblockd_workqueue = alloc_workqueue("kblockd",
2755 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); 2761 WQ_MEM_RECLAIM | WQ_HIGHPRI, 0);
2756 if (!kblockd_workqueue) 2762 if (!kblockd_workqueue)
2757 panic("Failed to create kblockd\n"); 2763 panic("Failed to create kblockd\n");
2758 2764
2759 request_cachep = kmem_cache_create("blkdev_requests", 2765 request_cachep = kmem_cache_create("blkdev_requests",
2760 sizeof(struct request), 0, SLAB_PANIC, NULL); 2766 sizeof(struct request), 0, SLAB_PANIC, NULL);
2761 2767
2762 blk_requestq_cachep = kmem_cache_create("blkdev_queue", 2768 blk_requestq_cachep = kmem_cache_create("blkdev_queue",
2763 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 2769 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
2764 2770
2765 return 0; 2771 return 0;
2766 } 2772 }
2767 2773
include/trace/events/block.h
1 #undef TRACE_SYSTEM 1 #undef TRACE_SYSTEM
2 #define TRACE_SYSTEM block 2 #define TRACE_SYSTEM block
3 3
4 #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ) 4 #if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
5 #define _TRACE_BLOCK_H 5 #define _TRACE_BLOCK_H
6 6
7 #include <linux/blktrace_api.h> 7 #include <linux/blktrace_api.h>
8 #include <linux/blkdev.h> 8 #include <linux/blkdev.h>
9 #include <linux/tracepoint.h> 9 #include <linux/tracepoint.h>
10 10
11 DECLARE_EVENT_CLASS(block_rq_with_error, 11 DECLARE_EVENT_CLASS(block_rq_with_error,
12 12
13 TP_PROTO(struct request_queue *q, struct request *rq), 13 TP_PROTO(struct request_queue *q, struct request *rq),
14 14
15 TP_ARGS(q, rq), 15 TP_ARGS(q, rq),
16 16
17 TP_STRUCT__entry( 17 TP_STRUCT__entry(
18 __field( dev_t, dev ) 18 __field( dev_t, dev )
19 __field( sector_t, sector ) 19 __field( sector_t, sector )
20 __field( unsigned int, nr_sector ) 20 __field( unsigned int, nr_sector )
21 __field( int, errors ) 21 __field( int, errors )
22 __array( char, rwbs, 6 ) 22 __array( char, rwbs, 6 )
23 __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) 23 __dynamic_array( char, cmd, blk_cmd_buf_len(rq) )
24 ), 24 ),
25 25
26 TP_fast_assign( 26 TP_fast_assign(
27 __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; 27 __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
28 __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 28 __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
29 0 : blk_rq_pos(rq); 29 0 : blk_rq_pos(rq);
30 __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 30 __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
31 0 : blk_rq_sectors(rq); 31 0 : blk_rq_sectors(rq);
32 __entry->errors = rq->errors; 32 __entry->errors = rq->errors;
33 33
34 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); 34 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
35 blk_dump_cmd(__get_str(cmd), rq); 35 blk_dump_cmd(__get_str(cmd), rq);
36 ), 36 ),
37 37
38 TP_printk("%d,%d %s (%s) %llu + %u [%d]", 38 TP_printk("%d,%d %s (%s) %llu + %u [%d]",
39 MAJOR(__entry->dev), MINOR(__entry->dev), 39 MAJOR(__entry->dev), MINOR(__entry->dev),
40 __entry->rwbs, __get_str(cmd), 40 __entry->rwbs, __get_str(cmd),
41 (unsigned long long)__entry->sector, 41 (unsigned long long)__entry->sector,
42 __entry->nr_sector, __entry->errors) 42 __entry->nr_sector, __entry->errors)
43 ); 43 );
44 44
45 /** 45 /**
46 * block_rq_abort - abort block operation request 46 * block_rq_abort - abort block operation request
47 * @q: queue containing the block operation request 47 * @q: queue containing the block operation request
48 * @rq: block IO operation request 48 * @rq: block IO operation request
49 * 49 *
50 * Called immediately after pending block IO operation request @rq in 50 * Called immediately after pending block IO operation request @rq in
51 * queue @q is aborted. The fields in the operation request @rq 51 * queue @q is aborted. The fields in the operation request @rq
52 * can be examined to determine which device and sectors the pending 52 * can be examined to determine which device and sectors the pending
53 * operation would access. 53 * operation would access.
54 */ 54 */
55 DEFINE_EVENT(block_rq_with_error, block_rq_abort, 55 DEFINE_EVENT(block_rq_with_error, block_rq_abort,
56 56
57 TP_PROTO(struct request_queue *q, struct request *rq), 57 TP_PROTO(struct request_queue *q, struct request *rq),
58 58
59 TP_ARGS(q, rq) 59 TP_ARGS(q, rq)
60 ); 60 );
61 61
62 /** 62 /**
63 * block_rq_requeue - place block IO request back on a queue 63 * block_rq_requeue - place block IO request back on a queue
64 * @q: queue holding operation 64 * @q: queue holding operation
65 * @rq: block IO operation request 65 * @rq: block IO operation request
66 * 66 *
67 * The block operation request @rq is being placed back into queue 67 * The block operation request @rq is being placed back into queue
68 * @q. For some reason the request was not completed and needs to be 68 * @q. For some reason the request was not completed and needs to be
69 * put back in the queue. 69 * put back in the queue.
70 */ 70 */
71 DEFINE_EVENT(block_rq_with_error, block_rq_requeue, 71 DEFINE_EVENT(block_rq_with_error, block_rq_requeue,
72 72
73 TP_PROTO(struct request_queue *q, struct request *rq), 73 TP_PROTO(struct request_queue *q, struct request *rq),
74 74
75 TP_ARGS(q, rq) 75 TP_ARGS(q, rq)
76 ); 76 );
77 77
78 /** 78 /**
79 * block_rq_complete - block IO operation completed by device driver 79 * block_rq_complete - block IO operation completed by device driver
80 * @q: queue containing the block operation request 80 * @q: queue containing the block operation request
81 * @rq: block operations request 81 * @rq: block operations request
82 * 82 *
83 * The block_rq_complete tracepoint event indicates that some portion 83 * The block_rq_complete tracepoint event indicates that some portion
 84 * of the operation request has been completed by the device driver. If 84 * of the operation request has been completed by the device driver. If
85 * the @rq->bio is %NULL, then there is absolutely no additional work to 85 * the @rq->bio is %NULL, then there is absolutely no additional work to
86 * do for the request. If @rq->bio is non-NULL then there is 86 * do for the request. If @rq->bio is non-NULL then there is
87 * additional work required to complete the request. 87 * additional work required to complete the request.
88 */ 88 */
89 DEFINE_EVENT(block_rq_with_error, block_rq_complete, 89 DEFINE_EVENT(block_rq_with_error, block_rq_complete,
90 90
91 TP_PROTO(struct request_queue *q, struct request *rq), 91 TP_PROTO(struct request_queue *q, struct request *rq),
92 92
93 TP_ARGS(q, rq) 93 TP_ARGS(q, rq)
94 ); 94 );
95 95
96 DECLARE_EVENT_CLASS(block_rq, 96 DECLARE_EVENT_CLASS(block_rq,
97 97
98 TP_PROTO(struct request_queue *q, struct request *rq), 98 TP_PROTO(struct request_queue *q, struct request *rq),
99 99
100 TP_ARGS(q, rq), 100 TP_ARGS(q, rq),
101 101
102 TP_STRUCT__entry( 102 TP_STRUCT__entry(
103 __field( dev_t, dev ) 103 __field( dev_t, dev )
104 __field( sector_t, sector ) 104 __field( sector_t, sector )
105 __field( unsigned int, nr_sector ) 105 __field( unsigned int, nr_sector )
106 __field( unsigned int, bytes ) 106 __field( unsigned int, bytes )
107 __array( char, rwbs, 6 ) 107 __array( char, rwbs, 6 )
108 __array( char, comm, TASK_COMM_LEN ) 108 __array( char, comm, TASK_COMM_LEN )
109 __dynamic_array( char, cmd, blk_cmd_buf_len(rq) ) 109 __dynamic_array( char, cmd, blk_cmd_buf_len(rq) )
110 ), 110 ),
111 111
112 TP_fast_assign( 112 TP_fast_assign(
113 __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0; 113 __entry->dev = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
114 __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 114 __entry->sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
115 0 : blk_rq_pos(rq); 115 0 : blk_rq_pos(rq);
116 __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 116 __entry->nr_sector = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
117 0 : blk_rq_sectors(rq); 117 0 : blk_rq_sectors(rq);
118 __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ? 118 __entry->bytes = (rq->cmd_type == REQ_TYPE_BLOCK_PC) ?
119 blk_rq_bytes(rq) : 0; 119 blk_rq_bytes(rq) : 0;
120 120
121 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); 121 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
122 blk_dump_cmd(__get_str(cmd), rq); 122 blk_dump_cmd(__get_str(cmd), rq);
123 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 123 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
124 ), 124 ),
125 125
126 TP_printk("%d,%d %s %u (%s) %llu + %u [%s]", 126 TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
127 MAJOR(__entry->dev), MINOR(__entry->dev), 127 MAJOR(__entry->dev), MINOR(__entry->dev),
128 __entry->rwbs, __entry->bytes, __get_str(cmd), 128 __entry->rwbs, __entry->bytes, __get_str(cmd),
129 (unsigned long long)__entry->sector, 129 (unsigned long long)__entry->sector,
130 __entry->nr_sector, __entry->comm) 130 __entry->nr_sector, __entry->comm)
131 ); 131 );
132 132
133 /** 133 /**
134 * block_rq_insert - insert block operation request into queue 134 * block_rq_insert - insert block operation request into queue
135 * @q: target queue 135 * @q: target queue
136 * @rq: block IO operation request 136 * @rq: block IO operation request
137 * 137 *
138 * Called immediately before block operation request @rq is inserted 138 * Called immediately before block operation request @rq is inserted
139 * into queue @q. The fields in the operation request @rq struct can 139 * into queue @q. The fields in the operation request @rq struct can
140 * be examined to determine which device and sectors the pending 140 * be examined to determine which device and sectors the pending
141 * operation would access. 141 * operation would access.
142 */ 142 */
143 DEFINE_EVENT(block_rq, block_rq_insert, 143 DEFINE_EVENT(block_rq, block_rq_insert,
144 144
145 TP_PROTO(struct request_queue *q, struct request *rq), 145 TP_PROTO(struct request_queue *q, struct request *rq),
146 146
147 TP_ARGS(q, rq) 147 TP_ARGS(q, rq)
148 ); 148 );
149 149
150 /** 150 /**
151 * block_rq_issue - issue pending block IO request operation to device driver 151 * block_rq_issue - issue pending block IO request operation to device driver
152 * @q: queue holding operation 152 * @q: queue holding operation
153 * @rq: block IO operation operation request 153 * @rq: block IO operation operation request
154 * 154 *
155 * Called when block operation request @rq from queue @q is sent to a 155 * Called when block operation request @rq from queue @q is sent to a
156 * device driver for processing. 156 * device driver for processing.
157 */ 157 */
158 DEFINE_EVENT(block_rq, block_rq_issue, 158 DEFINE_EVENT(block_rq, block_rq_issue,
159 159
160 TP_PROTO(struct request_queue *q, struct request *rq), 160 TP_PROTO(struct request_queue *q, struct request *rq),
161 161
162 TP_ARGS(q, rq) 162 TP_ARGS(q, rq)
163 ); 163 );
164 164
165 /** 165 /**
166 * block_bio_bounce - used bounce buffer when processing block operation 166 * block_bio_bounce - used bounce buffer when processing block operation
167 * @q: queue holding the block operation 167 * @q: queue holding the block operation
168 * @bio: block operation 168 * @bio: block operation
169 * 169 *
170 * A bounce buffer was used to handle the block operation @bio in @q. 170 * A bounce buffer was used to handle the block operation @bio in @q.
171 * This occurs when hardware limitations prevent a direct transfer of 171 * This occurs when hardware limitations prevent a direct transfer of
172 * data between the @bio data memory area and the IO device. Use of a 172 * data between the @bio data memory area and the IO device. Use of a
173 * bounce buffer requires extra copying of data and decreases 173 * bounce buffer requires extra copying of data and decreases
174 * performance. 174 * performance.
175 */ 175 */
176 TRACE_EVENT(block_bio_bounce, 176 TRACE_EVENT(block_bio_bounce,
177 177
178 TP_PROTO(struct request_queue *q, struct bio *bio), 178 TP_PROTO(struct request_queue *q, struct bio *bio),
179 179
180 TP_ARGS(q, bio), 180 TP_ARGS(q, bio),
181 181
182 TP_STRUCT__entry( 182 TP_STRUCT__entry(
183 __field( dev_t, dev ) 183 __field( dev_t, dev )
184 __field( sector_t, sector ) 184 __field( sector_t, sector )
185 __field( unsigned int, nr_sector ) 185 __field( unsigned int, nr_sector )
186 __array( char, rwbs, 6 ) 186 __array( char, rwbs, 6 )
187 __array( char, comm, TASK_COMM_LEN ) 187 __array( char, comm, TASK_COMM_LEN )
188 ), 188 ),
189 189
190 TP_fast_assign( 190 TP_fast_assign(
191 __entry->dev = bio->bi_bdev ? 191 __entry->dev = bio->bi_bdev ?
192 bio->bi_bdev->bd_dev : 0; 192 bio->bi_bdev->bd_dev : 0;
193 __entry->sector = bio->bi_sector; 193 __entry->sector = bio->bi_sector;
194 __entry->nr_sector = bio->bi_size >> 9; 194 __entry->nr_sector = bio->bi_size >> 9;
195 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 195 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
196 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 196 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
197 ), 197 ),
198 198
199 TP_printk("%d,%d %s %llu + %u [%s]", 199 TP_printk("%d,%d %s %llu + %u [%s]",
200 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 200 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
201 (unsigned long long)__entry->sector, 201 (unsigned long long)__entry->sector,
202 __entry->nr_sector, __entry->comm) 202 __entry->nr_sector, __entry->comm)
203 ); 203 );
204 204
205 /** 205 /**
206 * block_bio_complete - completed all work on the block operation 206 * block_bio_complete - completed all work on the block operation
207 * @q: queue holding the block operation 207 * @q: queue holding the block operation
208 * @bio: block operation completed 208 * @bio: block operation completed
209 * @error: io error value 209 * @error: io error value
210 * 210 *
211 * This tracepoint indicates there is no further work to do on this 211 * This tracepoint indicates there is no further work to do on this
212 * block IO operation @bio. 212 * block IO operation @bio.
213 */ 213 */
214 TRACE_EVENT(block_bio_complete, 214 TRACE_EVENT(block_bio_complete,
215 215
216 TP_PROTO(struct request_queue *q, struct bio *bio, int error), 216 TP_PROTO(struct request_queue *q, struct bio *bio, int error),
217 217
218 TP_ARGS(q, bio, error), 218 TP_ARGS(q, bio, error),
219 219
220 TP_STRUCT__entry( 220 TP_STRUCT__entry(
221 __field( dev_t, dev ) 221 __field( dev_t, dev )
222 __field( sector_t, sector ) 222 __field( sector_t, sector )
223 __field( unsigned, nr_sector ) 223 __field( unsigned, nr_sector )
224 __field( int, error ) 224 __field( int, error )
225 __array( char, rwbs, 6 ) 225 __array( char, rwbs, 6 )
226 ), 226 ),
227 227
228 TP_fast_assign( 228 TP_fast_assign(
229 __entry->dev = bio->bi_bdev->bd_dev; 229 __entry->dev = bio->bi_bdev->bd_dev;
230 __entry->sector = bio->bi_sector; 230 __entry->sector = bio->bi_sector;
231 __entry->nr_sector = bio->bi_size >> 9; 231 __entry->nr_sector = bio->bi_size >> 9;
232 __entry->error = error; 232 __entry->error = error;
233 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 233 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
234 ), 234 ),
235 235
236 TP_printk("%d,%d %s %llu + %u [%d]", 236 TP_printk("%d,%d %s %llu + %u [%d]",
237 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 237 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
238 (unsigned long long)__entry->sector, 238 (unsigned long long)__entry->sector,
239 __entry->nr_sector, __entry->error) 239 __entry->nr_sector, __entry->error)
240 ); 240 );
241 241
242 DECLARE_EVENT_CLASS(block_bio, 242 DECLARE_EVENT_CLASS(block_bio,
243 243
244 TP_PROTO(struct request_queue *q, struct bio *bio), 244 TP_PROTO(struct request_queue *q, struct bio *bio),
245 245
246 TP_ARGS(q, bio), 246 TP_ARGS(q, bio),
247 247
248 TP_STRUCT__entry( 248 TP_STRUCT__entry(
249 __field( dev_t, dev ) 249 __field( dev_t, dev )
250 __field( sector_t, sector ) 250 __field( sector_t, sector )
251 __field( unsigned int, nr_sector ) 251 __field( unsigned int, nr_sector )
252 __array( char, rwbs, 6 ) 252 __array( char, rwbs, 6 )
253 __array( char, comm, TASK_COMM_LEN ) 253 __array( char, comm, TASK_COMM_LEN )
254 ), 254 ),
255 255
256 TP_fast_assign( 256 TP_fast_assign(
257 __entry->dev = bio->bi_bdev->bd_dev; 257 __entry->dev = bio->bi_bdev->bd_dev;
258 __entry->sector = bio->bi_sector; 258 __entry->sector = bio->bi_sector;
259 __entry->nr_sector = bio->bi_size >> 9; 259 __entry->nr_sector = bio->bi_size >> 9;
260 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 260 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
261 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 261 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
262 ), 262 ),
263 263
264 TP_printk("%d,%d %s %llu + %u [%s]", 264 TP_printk("%d,%d %s %llu + %u [%s]",
265 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 265 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
266 (unsigned long long)__entry->sector, 266 (unsigned long long)__entry->sector,
267 __entry->nr_sector, __entry->comm) 267 __entry->nr_sector, __entry->comm)
268 ); 268 );
269 269
270 /** 270 /**
271 * block_bio_backmerge - merging block operation to the end of an existing operation 271 * block_bio_backmerge - merging block operation to the end of an existing operation
272 * @q: queue holding operation 272 * @q: queue holding operation
273 * @bio: new block operation to merge 273 * @bio: new block operation to merge
274 * 274 *
275 * Merging block request @bio to the end of an existing block request 275 * Merging block request @bio to the end of an existing block request
276 * in queue @q. 276 * in queue @q.
277 */ 277 */
278 DEFINE_EVENT(block_bio, block_bio_backmerge, 278 DEFINE_EVENT(block_bio, block_bio_backmerge,
279 279
280 TP_PROTO(struct request_queue *q, struct bio *bio), 280 TP_PROTO(struct request_queue *q, struct bio *bio),
281 281
282 TP_ARGS(q, bio) 282 TP_ARGS(q, bio)
283 ); 283 );
284 284
285 /** 285 /**
286 * block_bio_frontmerge - merging block operation to the beginning of an existing operation 286 * block_bio_frontmerge - merging block operation to the beginning of an existing operation
287 * @q: queue holding operation 287 * @q: queue holding operation
288 * @bio: new block operation to merge 288 * @bio: new block operation to merge
289 * 289 *
290 * Merging block IO operation @bio to the beginning of an existing block 290 * Merging block IO operation @bio to the beginning of an existing block
291 * operation in queue @q. 291 * operation in queue @q.
292 */ 292 */
293 DEFINE_EVENT(block_bio, block_bio_frontmerge, 293 DEFINE_EVENT(block_bio, block_bio_frontmerge,
294 294
295 TP_PROTO(struct request_queue *q, struct bio *bio), 295 TP_PROTO(struct request_queue *q, struct bio *bio),
296 296
297 TP_ARGS(q, bio) 297 TP_ARGS(q, bio)
298 ); 298 );
299 299
300 /** 300 /**
301 * block_bio_queue - putting new block IO operation in queue 301 * block_bio_queue - putting new block IO operation in queue
302 * @q: queue holding operation 302 * @q: queue holding operation
303 * @bio: new block operation 303 * @bio: new block operation
304 * 304 *
305 * About to place the block IO operation @bio into queue @q. 305 * About to place the block IO operation @bio into queue @q.
306 */ 306 */
307 DEFINE_EVENT(block_bio, block_bio_queue, 307 DEFINE_EVENT(block_bio, block_bio_queue,
308 308
309 TP_PROTO(struct request_queue *q, struct bio *bio), 309 TP_PROTO(struct request_queue *q, struct bio *bio),
310 310
311 TP_ARGS(q, bio) 311 TP_ARGS(q, bio)
312 ); 312 );
313 313
314 DECLARE_EVENT_CLASS(block_get_rq, 314 DECLARE_EVENT_CLASS(block_get_rq,
315 315
316 TP_PROTO(struct request_queue *q, struct bio *bio, int rw), 316 TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
317 317
318 TP_ARGS(q, bio, rw), 318 TP_ARGS(q, bio, rw),
319 319
320 TP_STRUCT__entry( 320 TP_STRUCT__entry(
321 __field( dev_t, dev ) 321 __field( dev_t, dev )
322 __field( sector_t, sector ) 322 __field( sector_t, sector )
323 __field( unsigned int, nr_sector ) 323 __field( unsigned int, nr_sector )
324 __array( char, rwbs, 6 ) 324 __array( char, rwbs, 6 )
325 __array( char, comm, TASK_COMM_LEN ) 325 __array( char, comm, TASK_COMM_LEN )
326 ), 326 ),
327 327
328 TP_fast_assign( 328 TP_fast_assign(
329 __entry->dev = bio ? bio->bi_bdev->bd_dev : 0; 329 __entry->dev = bio ? bio->bi_bdev->bd_dev : 0;
330 __entry->sector = bio ? bio->bi_sector : 0; 330 __entry->sector = bio ? bio->bi_sector : 0;
331 __entry->nr_sector = bio ? bio->bi_size >> 9 : 0; 331 __entry->nr_sector = bio ? bio->bi_size >> 9 : 0;
332 blk_fill_rwbs(__entry->rwbs, 332 blk_fill_rwbs(__entry->rwbs,
333 bio ? bio->bi_rw : 0, __entry->nr_sector); 333 bio ? bio->bi_rw : 0, __entry->nr_sector);
334 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 334 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
335 ), 335 ),
336 336
337 TP_printk("%d,%d %s %llu + %u [%s]", 337 TP_printk("%d,%d %s %llu + %u [%s]",
338 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 338 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
339 (unsigned long long)__entry->sector, 339 (unsigned long long)__entry->sector,
340 __entry->nr_sector, __entry->comm) 340 __entry->nr_sector, __entry->comm)
341 ); 341 );
342 342
343 /** 343 /**
344 * block_getrq - get a free request entry in queue for block IO operations 344 * block_getrq - get a free request entry in queue for block IO operations
345 * @q: queue for operations 345 * @q: queue for operations
346 * @bio: pending block IO operation 346 * @bio: pending block IO operation
347 * @rw: low bit indicates a read (%0) or a write (%1) 347 * @rw: low bit indicates a read (%0) or a write (%1)
348 * 348 *
349 * A request struct for queue @q has been allocated to handle the 349 * A request struct for queue @q has been allocated to handle the
350 * block IO operation @bio. 350 * block IO operation @bio.
351 */ 351 */
352 DEFINE_EVENT(block_get_rq, block_getrq, 352 DEFINE_EVENT(block_get_rq, block_getrq,
353 353
354 TP_PROTO(struct request_queue *q, struct bio *bio, int rw), 354 TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
355 355
356 TP_ARGS(q, bio, rw) 356 TP_ARGS(q, bio, rw)
357 ); 357 );
358 358
359 /** 359 /**
360 * block_sleeprq - waiting to get a free request entry in queue for block IO operation 360 * block_sleeprq - waiting to get a free request entry in queue for block IO operation
361 * @q: queue for operation 361 * @q: queue for operation
362 * @bio: pending block IO operation 362 * @bio: pending block IO operation
363 * @rw: low bit indicates a read (%0) or a write (%1) 363 * @rw: low bit indicates a read (%0) or a write (%1)
364 * 364 *
365 * In the case where a request struct cannot be provided for queue @q 365 * In the case where a request struct cannot be provided for queue @q
 366 * the process needs to wait for a request struct to become 366 * the process needs to wait for a request struct to become
 367 * available. This tracepoint event is generated each time the 367 * available. This tracepoint event is generated each time the
 368 * process goes to sleep waiting for a request struct to become available. 368 * process goes to sleep waiting for a request struct to become available.
369 */ 369 */
370 DEFINE_EVENT(block_get_rq, block_sleeprq, 370 DEFINE_EVENT(block_get_rq, block_sleeprq,
371 371
372 TP_PROTO(struct request_queue *q, struct bio *bio, int rw), 372 TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
373 373
374 TP_ARGS(q, bio, rw) 374 TP_ARGS(q, bio, rw)
375 ); 375 );
376 376
377 /** 377 /**
 378 * block_plug - keep operation requests in request queue 378 * block_plug - keep operation requests in request queue
379 * @q: request queue to plug 379 * @q: request queue to plug
380 * 380 *
381 * Plug the request queue @q. Do not allow block operation requests 381 * Plug the request queue @q. Do not allow block operation requests
382 * to be sent to the device driver. Instead, accumulate requests in 382 * to be sent to the device driver. Instead, accumulate requests in
383 * the queue to improve throughput performance of the block device. 383 * the queue to improve throughput performance of the block device.
384 */ 384 */
385 TRACE_EVENT(block_plug, 385 TRACE_EVENT(block_plug,
386 386
387 TP_PROTO(struct request_queue *q), 387 TP_PROTO(struct request_queue *q),
388 388
389 TP_ARGS(q), 389 TP_ARGS(q),
390 390
391 TP_STRUCT__entry( 391 TP_STRUCT__entry(
392 __array( char, comm, TASK_COMM_LEN ) 392 __array( char, comm, TASK_COMM_LEN )
393 ), 393 ),
394 394
395 TP_fast_assign( 395 TP_fast_assign(
396 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 396 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
397 ), 397 ),
398 398
399 TP_printk("[%s]", __entry->comm) 399 TP_printk("[%s]", __entry->comm)
400 ); 400 );
401 401
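The block_plug/block_unplug pair above corresponds to the on-stack plugging that this commit's accounting is built around. A minimal, illustrative submitter sketch, assuming the blk_start_plug()/blk_finish_plug() interface from the plugging rework (the caller-side hunks are not part of the excerpt shown here):

/*
 * Sketch only: batch a few bios under one plug. The unplug is explicit
 * when blk_finish_plug() runs; if the task schedules with IO still
 * pending, the flush happens implicitly and block_unplug reports it as
 * an implicit unplug.
 */
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/fs.h>

static void submit_read_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(READ, bios[i]);
	blk_finish_plug(&plug);		/* explicit unplug */
}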
402 DECLARE_EVENT_CLASS(block_unplug, 402 DECLARE_EVENT_CLASS(block_unplug,
403 403
404 TP_PROTO(struct request_queue *q, unsigned int depth), 404 TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),
405 405
406 TP_ARGS(q, depth), 406 TP_ARGS(q, depth, explicit),
407 407
408 TP_STRUCT__entry( 408 TP_STRUCT__entry(
409 __field( int, nr_rq ) 409 __field( int, nr_rq )
410 __array( char, comm, TASK_COMM_LEN ) 410 __array( char, comm, TASK_COMM_LEN )
411 ), 411 ),
412 412
413 TP_fast_assign( 413 TP_fast_assign(
414 __entry->nr_rq = depth; 414 __entry->nr_rq = depth;
415 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 415 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
416 ), 416 ),
417 417
418 TP_printk("[%s] %d", __entry->comm, __entry->nr_rq) 418 TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
419 ); 419 );
420 420
421 /** 421 /**
 421 * block_unplug_io - release of operation requests in request queue 422 * block_unplug - release of operation requests in request queue
423 * @q: request queue to unplug 423 * @q: request queue to unplug
424 * @depth: number of requests just added to the queue 424 * @depth: number of requests just added to the queue
425 * @explicit: whether this was an explicit unplug, or one from schedule()
425 * 426 *
426 * Unplug request queue @q because device driver is scheduled to work 427 * Unplug request queue @q because device driver is scheduled to work
427 * on elements in the request queue. 428 * on elements in the request queue.
428 */ 429 */
429 DEFINE_EVENT(block_unplug, block_unplug_io, 430 DEFINE_EVENT(block_unplug, block_unplug,
430 431
431 TP_PROTO(struct request_queue *q, unsigned int depth), 432 TP_PROTO(struct request_queue *q, unsigned int depth, bool explicit),
432 433
433 TP_ARGS(q, depth) 434 TP_ARGS(q, depth, explicit)
434 ); 435 );
435 436
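Because the tracepoint prototype gained an argument, any attached probe must take it as well. A hypothetical out-of-tree consumer (the module scaffolding and names are illustrative; the in-tree blktrace probes are brought to the same signature elsewhere in this series):

/*
 * Illustrative probe: report whether each unplug was explicit
 * (blk_finish_plug) or implicit (task scheduled with IO pending).
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <trace/events/block.h>

static void demo_unplug_probe(void *ignore, struct request_queue *q,
			      unsigned int depth, bool explicit)
{
	pr_info("block unplug: depth=%u (%s)\n", depth,
		explicit ? "explicit" : "schedule");
}

static int __init demo_init(void)
{
	return register_trace_block_unplug(demo_unplug_probe, NULL);
}

static void __exit demo_exit(void)
{
	unregister_trace_block_unplug(demo_unplug_probe, NULL);
	tracepoint_synchronize_unregister();
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");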
436 /** 437 /**
437 * block_split - split a single bio struct into two bio structs 438 * block_split - split a single bio struct into two bio structs
438 * @q: queue containing the bio 439 * @q: queue containing the bio
439 * @bio: block operation being split 440 * @bio: block operation being split
440 * @new_sector: The starting sector for the new bio 441 * @new_sector: The starting sector for the new bio
441 * 442 *
442 * The bio request @bio in request queue @q needs to be split into two 443 * The bio request @bio in request queue @q needs to be split into two
443 * bio requests. The newly created @bio request starts at 444 * bio requests. The newly created @bio request starts at
444 * @new_sector. This split may be required due to hardware limitation 445 * @new_sector. This split may be required due to hardware limitation
445 * such as operation crossing device boundaries in a RAID system. 446 * such as operation crossing device boundaries in a RAID system.
446 */ 447 */
447 TRACE_EVENT(block_split, 448 TRACE_EVENT(block_split,
448 449
449 TP_PROTO(struct request_queue *q, struct bio *bio, 450 TP_PROTO(struct request_queue *q, struct bio *bio,
450 unsigned int new_sector), 451 unsigned int new_sector),
451 452
452 TP_ARGS(q, bio, new_sector), 453 TP_ARGS(q, bio, new_sector),
453 454
454 TP_STRUCT__entry( 455 TP_STRUCT__entry(
455 __field( dev_t, dev ) 456 __field( dev_t, dev )
456 __field( sector_t, sector ) 457 __field( sector_t, sector )
457 __field( sector_t, new_sector ) 458 __field( sector_t, new_sector )
458 __array( char, rwbs, 6 ) 459 __array( char, rwbs, 6 )
459 __array( char, comm, TASK_COMM_LEN ) 460 __array( char, comm, TASK_COMM_LEN )
460 ), 461 ),
461 462
462 TP_fast_assign( 463 TP_fast_assign(
463 __entry->dev = bio->bi_bdev->bd_dev; 464 __entry->dev = bio->bi_bdev->bd_dev;
464 __entry->sector = bio->bi_sector; 465 __entry->sector = bio->bi_sector;
465 __entry->new_sector = new_sector; 466 __entry->new_sector = new_sector;
466 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 467 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
467 memcpy(__entry->comm, current->comm, TASK_COMM_LEN); 468 memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
468 ), 469 ),
469 470
470 TP_printk("%d,%d %s %llu / %llu [%s]", 471 TP_printk("%d,%d %s %llu / %llu [%s]",
471 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 472 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
472 (unsigned long long)__entry->sector, 473 (unsigned long long)__entry->sector,
473 (unsigned long long)__entry->new_sector, 474 (unsigned long long)__entry->new_sector,
474 __entry->comm) 475 __entry->comm)
475 ); 476 );
476 477
477 /** 478 /**
478 * block_bio_remap - map request for a logical device to the raw device 479 * block_bio_remap - map request for a logical device to the raw device
479 * @q: queue holding the operation 480 * @q: queue holding the operation
480 * @bio: revised operation 481 * @bio: revised operation
481 * @dev: device for the operation 482 * @dev: device for the operation
482 * @from: original sector for the operation 483 * @from: original sector for the operation
483 * 484 *
484 * An operation for a logical device has been mapped to the 485 * An operation for a logical device has been mapped to the
485 * raw block device. 486 * raw block device.
486 */ 487 */
487 TRACE_EVENT(block_bio_remap, 488 TRACE_EVENT(block_bio_remap,
488 489
489 TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, 490 TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
490 sector_t from), 491 sector_t from),
491 492
492 TP_ARGS(q, bio, dev, from), 493 TP_ARGS(q, bio, dev, from),
493 494
494 TP_STRUCT__entry( 495 TP_STRUCT__entry(
495 __field( dev_t, dev ) 496 __field( dev_t, dev )
496 __field( sector_t, sector ) 497 __field( sector_t, sector )
497 __field( unsigned int, nr_sector ) 498 __field( unsigned int, nr_sector )
498 __field( dev_t, old_dev ) 499 __field( dev_t, old_dev )
499 __field( sector_t, old_sector ) 500 __field( sector_t, old_sector )
500 __array( char, rwbs, 6 ) 501 __array( char, rwbs, 6 )
501 ), 502 ),
502 503
503 TP_fast_assign( 504 TP_fast_assign(
504 __entry->dev = bio->bi_bdev->bd_dev; 505 __entry->dev = bio->bi_bdev->bd_dev;
505 __entry->sector = bio->bi_sector; 506 __entry->sector = bio->bi_sector;
506 __entry->nr_sector = bio->bi_size >> 9; 507 __entry->nr_sector = bio->bi_size >> 9;
507 __entry->old_dev = dev; 508 __entry->old_dev = dev;
508 __entry->old_sector = from; 509 __entry->old_sector = from;
509 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size); 510 blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
510 ), 511 ),
511 512
512 TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", 513 TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
513 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 514 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
514 (unsigned long long)__entry->sector, 515 (unsigned long long)__entry->sector,
515 __entry->nr_sector, 516 __entry->nr_sector,
516 MAJOR(__entry->old_dev), MINOR(__entry->old_dev), 517 MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
517 (unsigned long long)__entry->old_sector) 518 (unsigned long long)__entry->old_sector)
518 ); 519 );
519 520
520 /** 521 /**
521 * block_rq_remap - map request for a block operation request 522 * block_rq_remap - map request for a block operation request
522 * @q: queue holding the operation 523 * @q: queue holding the operation
523 * @rq: block IO operation request 524 * @rq: block IO operation request
524 * @dev: device for the operation 525 * @dev: device for the operation
525 * @from: original sector for the operation 526 * @from: original sector for the operation
526 * 527 *
527 * The block operation request @rq in @q has been remapped. The block 528 * The block operation request @rq in @q has been remapped. The block
 528 * operation request @rq holds the current information and @from holds 529 * operation request @rq holds the current information and @from holds
529 * the original sector. 530 * the original sector.
530 */ 531 */
531 TRACE_EVENT(block_rq_remap, 532 TRACE_EVENT(block_rq_remap,
532 533
533 TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, 534 TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev,
534 sector_t from), 535 sector_t from),
535 536
536 TP_ARGS(q, rq, dev, from), 537 TP_ARGS(q, rq, dev, from),
537 538
538 TP_STRUCT__entry( 539 TP_STRUCT__entry(
539 __field( dev_t, dev ) 540 __field( dev_t, dev )
540 __field( sector_t, sector ) 541 __field( sector_t, sector )
541 __field( unsigned int, nr_sector ) 542 __field( unsigned int, nr_sector )
542 __field( dev_t, old_dev ) 543 __field( dev_t, old_dev )
543 __field( sector_t, old_sector ) 544 __field( sector_t, old_sector )
544 __array( char, rwbs, 6 ) 545 __array( char, rwbs, 6 )
545 ), 546 ),
546 547
547 TP_fast_assign( 548 TP_fast_assign(
548 __entry->dev = disk_devt(rq->rq_disk); 549 __entry->dev = disk_devt(rq->rq_disk);
549 __entry->sector = blk_rq_pos(rq); 550 __entry->sector = blk_rq_pos(rq);
550 __entry->nr_sector = blk_rq_sectors(rq); 551 __entry->nr_sector = blk_rq_sectors(rq);
551 __entry->old_dev = dev; 552 __entry->old_dev = dev;
552 __entry->old_sector = from; 553 __entry->old_sector = from;
553 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq)); 554 blk_fill_rwbs(__entry->rwbs, rq->cmd_flags, blk_rq_bytes(rq));
554 ), 555 ),
555 556
556 TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", 557 TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
557 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, 558 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
558 (unsigned long long)__entry->sector, 559 (unsigned long long)__entry->sector,
559 __entry->nr_sector, 560 __entry->nr_sector,
560 MAJOR(__entry->old_dev), MINOR(__entry->old_dev), 561 MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
561 (unsigned long long)__entry->old_sector) 562 (unsigned long long)__entry->old_sector)
562 ); 563 );
563 564
564 #endif /* _TRACE_BLOCK_H */ 565 #endif /* _TRACE_BLOCK_H */
565 566
566 /* This part must be outside protection */ 567 /* This part must be outside protection */
567 #include <trace/define_trace.h> 568 #include <trace/define_trace.h>
568 569
569 570
kernel/trace/blktrace.c
1 /* 1 /*
2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk> 2 * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 * GNU General Public License for more details. 11 * GNU General Public License for more details.
12 * 12 *
13 * You should have received a copy of the GNU General Public License 13 * You should have received a copy of the GNU General Public License
14 * along with this program; if not, write to the Free Software 14 * along with this program; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA 15 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
16 * 16 *
17 */ 17 */
18 #include <linux/kernel.h> 18 #include <linux/kernel.h>
19 #include <linux/blkdev.h> 19 #include <linux/blkdev.h>
20 #include <linux/blktrace_api.h> 20 #include <linux/blktrace_api.h>
21 #include <linux/percpu.h> 21 #include <linux/percpu.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/mutex.h> 23 #include <linux/mutex.h>
24 #include <linux/slab.h> 24 #include <linux/slab.h>
25 #include <linux/debugfs.h> 25 #include <linux/debugfs.h>
26 #include <linux/time.h> 26 #include <linux/time.h>
27 #include <linux/uaccess.h> 27 #include <linux/uaccess.h>
28 28
29 #include <trace/events/block.h> 29 #include <trace/events/block.h>
30 30
31 #include "trace_output.h" 31 #include "trace_output.h"
32 32
33 #ifdef CONFIG_BLK_DEV_IO_TRACE 33 #ifdef CONFIG_BLK_DEV_IO_TRACE
34 34
35 static unsigned int blktrace_seq __read_mostly = 1; 35 static unsigned int blktrace_seq __read_mostly = 1;
36 36
37 static struct trace_array *blk_tr; 37 static struct trace_array *blk_tr;
38 static bool blk_tracer_enabled __read_mostly; 38 static bool blk_tracer_enabled __read_mostly;
39 39
 40 /* Select an alternative, minimalistic output instead of the original one */ 40 /* Select an alternative, minimalistic output instead of the original one */
41 #define TRACE_BLK_OPT_CLASSIC 0x1 41 #define TRACE_BLK_OPT_CLASSIC 0x1
42 42
43 static struct tracer_opt blk_tracer_opts[] = { 43 static struct tracer_opt blk_tracer_opts[] = {
44 /* Default disable the minimalistic output */ 44 /* Default disable the minimalistic output */
45 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) }, 45 { TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
46 { } 46 { }
47 }; 47 };
48 48
49 static struct tracer_flags blk_tracer_flags = { 49 static struct tracer_flags blk_tracer_flags = {
50 .val = 0, 50 .val = 0,
51 .opts = blk_tracer_opts, 51 .opts = blk_tracer_opts,
52 }; 52 };
53 53
54 /* Global reference count of probes */ 54 /* Global reference count of probes */
55 static atomic_t blk_probes_ref = ATOMIC_INIT(0); 55 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
56 56
57 static void blk_register_tracepoints(void); 57 static void blk_register_tracepoints(void);
58 static void blk_unregister_tracepoints(void); 58 static void blk_unregister_tracepoints(void);
59 59
60 /* 60 /*
61 * Send out a notify message. 61 * Send out a notify message.
62 */ 62 */
63 static void trace_note(struct blk_trace *bt, pid_t pid, int action, 63 static void trace_note(struct blk_trace *bt, pid_t pid, int action,
64 const void *data, size_t len) 64 const void *data, size_t len)
65 { 65 {
66 struct blk_io_trace *t; 66 struct blk_io_trace *t;
67 struct ring_buffer_event *event = NULL; 67 struct ring_buffer_event *event = NULL;
68 struct ring_buffer *buffer = NULL; 68 struct ring_buffer *buffer = NULL;
69 int pc = 0; 69 int pc = 0;
70 int cpu = smp_processor_id(); 70 int cpu = smp_processor_id();
71 bool blk_tracer = blk_tracer_enabled; 71 bool blk_tracer = blk_tracer_enabled;
72 72
73 if (blk_tracer) { 73 if (blk_tracer) {
74 buffer = blk_tr->buffer; 74 buffer = blk_tr->buffer;
75 pc = preempt_count(); 75 pc = preempt_count();
76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 76 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
77 sizeof(*t) + len, 77 sizeof(*t) + len,
78 0, pc); 78 0, pc);
79 if (!event) 79 if (!event)
80 return; 80 return;
81 t = ring_buffer_event_data(event); 81 t = ring_buffer_event_data(event);
82 goto record_it; 82 goto record_it;
83 } 83 }
84 84
85 if (!bt->rchan) 85 if (!bt->rchan)
86 return; 86 return;
87 87
88 t = relay_reserve(bt->rchan, sizeof(*t) + len); 88 t = relay_reserve(bt->rchan, sizeof(*t) + len);
89 if (t) { 89 if (t) {
90 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; 90 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
91 t->time = ktime_to_ns(ktime_get()); 91 t->time = ktime_to_ns(ktime_get());
92 record_it: 92 record_it:
93 t->device = bt->dev; 93 t->device = bt->dev;
94 t->action = action; 94 t->action = action;
95 t->pid = pid; 95 t->pid = pid;
96 t->cpu = cpu; 96 t->cpu = cpu;
97 t->pdu_len = len; 97 t->pdu_len = len;
98 memcpy((void *) t + sizeof(*t), data, len); 98 memcpy((void *) t + sizeof(*t), data, len);
99 99
100 if (blk_tracer) 100 if (blk_tracer)
101 trace_buffer_unlock_commit(buffer, event, 0, pc); 101 trace_buffer_unlock_commit(buffer, event, 0, pc);
102 } 102 }
103 } 103 }
104 104
105 /* 105 /*
106 * Send out a notify for this process, if we haven't done so since a trace 106 * Send out a notify for this process, if we haven't done so since a trace
107 * started 107 * started
108 */ 108 */
109 static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk) 109 static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
110 { 110 {
111 tsk->btrace_seq = blktrace_seq; 111 tsk->btrace_seq = blktrace_seq;
112 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm)); 112 trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
113 } 113 }
114 114
115 static void trace_note_time(struct blk_trace *bt) 115 static void trace_note_time(struct blk_trace *bt)
116 { 116 {
117 struct timespec now; 117 struct timespec now;
118 unsigned long flags; 118 unsigned long flags;
119 u32 words[2]; 119 u32 words[2];
120 120
121 getnstimeofday(&now); 121 getnstimeofday(&now);
122 words[0] = now.tv_sec; 122 words[0] = now.tv_sec;
123 words[1] = now.tv_nsec; 123 words[1] = now.tv_nsec;
124 124
125 local_irq_save(flags); 125 local_irq_save(flags);
126 trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words)); 126 trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
127 local_irq_restore(flags); 127 local_irq_restore(flags);
128 } 128 }
129 129
130 void __trace_note_message(struct blk_trace *bt, const char *fmt, ...) 130 void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
131 { 131 {
132 int n; 132 int n;
133 va_list args; 133 va_list args;
134 unsigned long flags; 134 unsigned long flags;
135 char *buf; 135 char *buf;
136 136
137 if (unlikely(bt->trace_state != Blktrace_running && 137 if (unlikely(bt->trace_state != Blktrace_running &&
138 !blk_tracer_enabled)) 138 !blk_tracer_enabled))
139 return; 139 return;
140 140
141 /* 141 /*
142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note 142 * If the BLK_TC_NOTIFY action mask isn't set, don't send any note
143 * message to the trace. 143 * message to the trace.
144 */ 144 */
145 if (!(bt->act_mask & BLK_TC_NOTIFY)) 145 if (!(bt->act_mask & BLK_TC_NOTIFY))
146 return; 146 return;
147 147
148 local_irq_save(flags); 148 local_irq_save(flags);
149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id()); 149 buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
150 va_start(args, fmt); 150 va_start(args, fmt);
151 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args); 151 n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
152 va_end(args); 152 va_end(args);
153 153
154 trace_note(bt, 0, BLK_TN_MESSAGE, buf, n); 154 trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
155 local_irq_restore(flags); 155 local_irq_restore(flags);
156 } 156 }
157 EXPORT_SYMBOL_GPL(__trace_note_message); 157 EXPORT_SYMBOL_GPL(__trace_note_message);
158 158
159 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, 159 static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
160 pid_t pid) 160 pid_t pid)
161 { 161 {
162 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) 162 if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
163 return 1; 163 return 1;
164 if (sector && (sector < bt->start_lba || sector > bt->end_lba)) 164 if (sector && (sector < bt->start_lba || sector > bt->end_lba))
165 return 1; 165 return 1;
166 if (bt->pid && pid != bt->pid) 166 if (bt->pid && pid != bt->pid)
167 return 1; 167 return 1;
168 168
169 return 0; 169 return 0;
170 } 170 }
171 171
172 /* 172 /*
173 * Data direction bit lookup 173 * Data direction bit lookup
174 */ 174 */
175 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ), 175 static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
176 BLK_TC_ACT(BLK_TC_WRITE) }; 176 BLK_TC_ACT(BLK_TC_WRITE) };
177 177
178 #define BLK_TC_RAHEAD BLK_TC_AHEAD 178 #define BLK_TC_RAHEAD BLK_TC_AHEAD
179 179
180 /* The ilog2() calls fall out because they're constant */ 180 /* The ilog2() calls fall out because they're constant */
181 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \ 181 #define MASK_TC_BIT(rw, __name) ((rw & REQ_ ## __name) << \
182 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name)) 182 (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - __REQ_ ## __name))
183 183
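Expanded once, the MASK_TC_BIT() trick is easier to see. If the request flag REQ_<name> lives at bit a (so __REQ_<name> == a) and the matching category bit BLK_TC_<name> is bit b of the field that starts at BLK_TC_SHIFT, the macro is

	(rw & (1 << a)) << (b + BLK_TC_SHIFT - a)

which masks out the single flag bit and shifts it so it lands on bit (b + BLK_TC_SHIFT) of the action word, matching how ddir_act[] uses BLK_TC_ACT(). Every operand is a compile-time constant, which is why the ilog2() calls cost nothing at run time (a and b here are symbolic placeholders, not the real header values).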
184 /* 184 /*
185 * The worker for the various blk_add_trace*() types. Fills out a 185 * The worker for the various blk_add_trace*() types. Fills out a
186 * blk_io_trace structure and places it in a per-cpu subbuffer. 186 * blk_io_trace structure and places it in a per-cpu subbuffer.
187 */ 187 */
188 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, 188 static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
189 int rw, u32 what, int error, int pdu_len, void *pdu_data) 189 int rw, u32 what, int error, int pdu_len, void *pdu_data)
190 { 190 {
191 struct task_struct *tsk = current; 191 struct task_struct *tsk = current;
192 struct ring_buffer_event *event = NULL; 192 struct ring_buffer_event *event = NULL;
193 struct ring_buffer *buffer = NULL; 193 struct ring_buffer *buffer = NULL;
194 struct blk_io_trace *t; 194 struct blk_io_trace *t;
195 unsigned long flags = 0; 195 unsigned long flags = 0;
196 unsigned long *sequence; 196 unsigned long *sequence;
197 pid_t pid; 197 pid_t pid;
198 int cpu, pc = 0; 198 int cpu, pc = 0;
199 bool blk_tracer = blk_tracer_enabled; 199 bool blk_tracer = blk_tracer_enabled;
200 200
201 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer)) 201 if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
202 return; 202 return;
203 203
204 what |= ddir_act[rw & WRITE]; 204 what |= ddir_act[rw & WRITE];
205 what |= MASK_TC_BIT(rw, SYNC); 205 what |= MASK_TC_BIT(rw, SYNC);
206 what |= MASK_TC_BIT(rw, RAHEAD); 206 what |= MASK_TC_BIT(rw, RAHEAD);
207 what |= MASK_TC_BIT(rw, META); 207 what |= MASK_TC_BIT(rw, META);
208 what |= MASK_TC_BIT(rw, DISCARD); 208 what |= MASK_TC_BIT(rw, DISCARD);
209 209
210 pid = tsk->pid; 210 pid = tsk->pid;
211 if (act_log_check(bt, what, sector, pid)) 211 if (act_log_check(bt, what, sector, pid))
212 return; 212 return;
213 cpu = raw_smp_processor_id(); 213 cpu = raw_smp_processor_id();
214 214
215 if (blk_tracer) { 215 if (blk_tracer) {
216 tracing_record_cmdline(current); 216 tracing_record_cmdline(current);
217 217
218 buffer = blk_tr->buffer; 218 buffer = blk_tr->buffer;
219 pc = preempt_count(); 219 pc = preempt_count();
220 event = trace_buffer_lock_reserve(buffer, TRACE_BLK, 220 event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
221 sizeof(*t) + pdu_len, 221 sizeof(*t) + pdu_len,
222 0, pc); 222 0, pc);
223 if (!event) 223 if (!event)
224 return; 224 return;
225 t = ring_buffer_event_data(event); 225 t = ring_buffer_event_data(event);
226 goto record_it; 226 goto record_it;
227 } 227 }
228 228
229 /* 229 /*
230 * A word about the locking here - we disable interrupts to reserve 230 * A word about the locking here - we disable interrupts to reserve
231 * some space in the relay per-cpu buffer, to prevent an irq 231 * some space in the relay per-cpu buffer, to prevent an irq
232 * from coming in and stepping on our toes. 232 * from coming in and stepping on our toes.
233 */ 233 */
234 local_irq_save(flags); 234 local_irq_save(flags);
235 235
236 if (unlikely(tsk->btrace_seq != blktrace_seq)) 236 if (unlikely(tsk->btrace_seq != blktrace_seq))
237 trace_note_tsk(bt, tsk); 237 trace_note_tsk(bt, tsk);
238 238
239 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len); 239 t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
240 if (t) { 240 if (t) {
241 sequence = per_cpu_ptr(bt->sequence, cpu); 241 sequence = per_cpu_ptr(bt->sequence, cpu);
242 242
243 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION; 243 t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
244 t->sequence = ++(*sequence); 244 t->sequence = ++(*sequence);
245 t->time = ktime_to_ns(ktime_get()); 245 t->time = ktime_to_ns(ktime_get());
246 record_it: 246 record_it:
247 /* 247 /*
248 * These two are not needed in ftrace as they are in the 248 * These two are not needed in ftrace as they are in the
249 * generic trace_entry, filled by tracing_generic_entry_update, 249 * generic trace_entry, filled by tracing_generic_entry_update,
250 * but for the trace_event->bin() synthesizer benefit we do it 250 * but for the trace_event->bin() synthesizer benefit we do it
251 * here too. 251 * here too.
252 */ 252 */
253 t->cpu = cpu; 253 t->cpu = cpu;
254 t->pid = pid; 254 t->pid = pid;
255 255
256 t->sector = sector; 256 t->sector = sector;
257 t->bytes = bytes; 257 t->bytes = bytes;
258 t->action = what; 258 t->action = what;
259 t->device = bt->dev; 259 t->device = bt->dev;
260 t->error = error; 260 t->error = error;
261 t->pdu_len = pdu_len; 261 t->pdu_len = pdu_len;
262 262
263 if (pdu_len) 263 if (pdu_len)
264 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len); 264 memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
265 265
266 if (blk_tracer) { 266 if (blk_tracer) {
267 trace_buffer_unlock_commit(buffer, event, 0, pc); 267 trace_buffer_unlock_commit(buffer, event, 0, pc);
268 return; 268 return;
269 } 269 }
270 } 270 }
271 271
272 local_irq_restore(flags); 272 local_irq_restore(flags);
273 } 273 }
274 274
275 static struct dentry *blk_tree_root; 275 static struct dentry *blk_tree_root;
276 static DEFINE_MUTEX(blk_tree_mutex); 276 static DEFINE_MUTEX(blk_tree_mutex);
277 277
278 static void blk_trace_free(struct blk_trace *bt) 278 static void blk_trace_free(struct blk_trace *bt)
279 { 279 {
280 debugfs_remove(bt->msg_file); 280 debugfs_remove(bt->msg_file);
281 debugfs_remove(bt->dropped_file); 281 debugfs_remove(bt->dropped_file);
282 relay_close(bt->rchan); 282 relay_close(bt->rchan);
283 debugfs_remove(bt->dir); 283 debugfs_remove(bt->dir);
284 free_percpu(bt->sequence); 284 free_percpu(bt->sequence);
285 free_percpu(bt->msg_data); 285 free_percpu(bt->msg_data);
286 kfree(bt); 286 kfree(bt);
287 } 287 }
288 288
289 static void blk_trace_cleanup(struct blk_trace *bt) 289 static void blk_trace_cleanup(struct blk_trace *bt)
290 { 290 {
291 blk_trace_free(bt); 291 blk_trace_free(bt);
292 if (atomic_dec_and_test(&blk_probes_ref)) 292 if (atomic_dec_and_test(&blk_probes_ref))
293 blk_unregister_tracepoints(); 293 blk_unregister_tracepoints();
294 } 294 }
295 295
296 int blk_trace_remove(struct request_queue *q) 296 int blk_trace_remove(struct request_queue *q)
297 { 297 {
298 struct blk_trace *bt; 298 struct blk_trace *bt;
299 299
300 bt = xchg(&q->blk_trace, NULL); 300 bt = xchg(&q->blk_trace, NULL);
301 if (!bt) 301 if (!bt)
302 return -EINVAL; 302 return -EINVAL;
303 303
304 if (bt->trace_state != Blktrace_running) 304 if (bt->trace_state != Blktrace_running)
305 blk_trace_cleanup(bt); 305 blk_trace_cleanup(bt);
306 306
307 return 0; 307 return 0;
308 } 308 }
309 EXPORT_SYMBOL_GPL(blk_trace_remove); 309 EXPORT_SYMBOL_GPL(blk_trace_remove);
310 310
311 static int blk_dropped_open(struct inode *inode, struct file *filp) 311 static int blk_dropped_open(struct inode *inode, struct file *filp)
312 { 312 {
313 filp->private_data = inode->i_private; 313 filp->private_data = inode->i_private;
314 314
315 return 0; 315 return 0;
316 } 316 }
317 317
318 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer, 318 static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
319 size_t count, loff_t *ppos) 319 size_t count, loff_t *ppos)
320 { 320 {
321 struct blk_trace *bt = filp->private_data; 321 struct blk_trace *bt = filp->private_data;
322 char buf[16]; 322 char buf[16];
323 323
324 snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped)); 324 snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
325 325
326 return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf)); 326 return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
327 } 327 }
328 328
329 static const struct file_operations blk_dropped_fops = { 329 static const struct file_operations blk_dropped_fops = {
330 .owner = THIS_MODULE, 330 .owner = THIS_MODULE,
331 .open = blk_dropped_open, 331 .open = blk_dropped_open,
332 .read = blk_dropped_read, 332 .read = blk_dropped_read,
333 .llseek = default_llseek, 333 .llseek = default_llseek,
334 }; 334 };
335 335
336 static int blk_msg_open(struct inode *inode, struct file *filp) 336 static int blk_msg_open(struct inode *inode, struct file *filp)
337 { 337 {
338 filp->private_data = inode->i_private; 338 filp->private_data = inode->i_private;
339 339
340 return 0; 340 return 0;
341 } 341 }
342 342
343 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, 343 static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
344 size_t count, loff_t *ppos) 344 size_t count, loff_t *ppos)
345 { 345 {
346 char *msg; 346 char *msg;
347 struct blk_trace *bt; 347 struct blk_trace *bt;
348 348
349 if (count >= BLK_TN_MAX_MSG) 349 if (count >= BLK_TN_MAX_MSG)
350 return -EINVAL; 350 return -EINVAL;
351 351
352 msg = kmalloc(count + 1, GFP_KERNEL); 352 msg = kmalloc(count + 1, GFP_KERNEL);
353 if (msg == NULL) 353 if (msg == NULL)
354 return -ENOMEM; 354 return -ENOMEM;
355 355
356 if (copy_from_user(msg, buffer, count)) { 356 if (copy_from_user(msg, buffer, count)) {
357 kfree(msg); 357 kfree(msg);
358 return -EFAULT; 358 return -EFAULT;
359 } 359 }
360 360
361 msg[count] = '\0'; 361 msg[count] = '\0';
362 bt = filp->private_data; 362 bt = filp->private_data;
363 __trace_note_message(bt, "%s", msg); 363 __trace_note_message(bt, "%s", msg);
364 kfree(msg); 364 kfree(msg);
365 365
366 return count; 366 return count;
367 } 367 }
368 368
369 static const struct file_operations blk_msg_fops = { 369 static const struct file_operations blk_msg_fops = {
370 .owner = THIS_MODULE, 370 .owner = THIS_MODULE,
371 .open = blk_msg_open, 371 .open = blk_msg_open,
372 .write = blk_msg_write, 372 .write = blk_msg_write,
373 .llseek = noop_llseek, 373 .llseek = noop_llseek,
374 }; 374 };
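The msg file lets user space inject free-form markers into a running trace through __trace_note_message(). A hypothetical user-space helper, assuming debugfs is mounted at /sys/kernel/debug and a trace was set up for a device named sda (both the mount point and the name are assumptions; the directory layout comes from do_blk_trace_setup() below):

/*
 * Hypothetical helper: write a marker into an active blktrace session.
 * The message must be shorter than BLK_TN_MAX_MSG or blk_msg_write()
 * rejects it with -EINVAL.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int blktrace_mark(const char *msg)
{
	ssize_t ret;
	int fd = open("/sys/kernel/debug/block/sda/msg", O_WRONLY);

	if (fd < 0)
		return -1;
	ret = write(fd, msg, strlen(msg));
	close(fd);
	return ret < 0 ? -1 : 0;
}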
375 375
376 /* 376 /*
377 * Keep track of how many times we encountered a full subbuffer, to aid 377 * Keep track of how many times we encountered a full subbuffer, to aid
378 * the user space app in telling how many lost events there were. 378 * the user space app in telling how many lost events there were.
379 */ 379 */
380 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf, 380 static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
381 void *prev_subbuf, size_t prev_padding) 381 void *prev_subbuf, size_t prev_padding)
382 { 382 {
383 struct blk_trace *bt; 383 struct blk_trace *bt;
384 384
385 if (!relay_buf_full(buf)) 385 if (!relay_buf_full(buf))
386 return 1; 386 return 1;
387 387
388 bt = buf->chan->private_data; 388 bt = buf->chan->private_data;
389 atomic_inc(&bt->dropped); 389 atomic_inc(&bt->dropped);
390 return 0; 390 return 0;
391 } 391 }
392 392
393 static int blk_remove_buf_file_callback(struct dentry *dentry) 393 static int blk_remove_buf_file_callback(struct dentry *dentry)
394 { 394 {
395 debugfs_remove(dentry); 395 debugfs_remove(dentry);
396 396
397 return 0; 397 return 0;
398 } 398 }
399 399
400 static struct dentry *blk_create_buf_file_callback(const char *filename, 400 static struct dentry *blk_create_buf_file_callback(const char *filename,
401 struct dentry *parent, 401 struct dentry *parent,
402 int mode, 402 int mode,
403 struct rchan_buf *buf, 403 struct rchan_buf *buf,
404 int *is_global) 404 int *is_global)
405 { 405 {
406 return debugfs_create_file(filename, mode, parent, buf, 406 return debugfs_create_file(filename, mode, parent, buf,
407 &relay_file_operations); 407 &relay_file_operations);
408 } 408 }
409 409
410 static struct rchan_callbacks blk_relay_callbacks = { 410 static struct rchan_callbacks blk_relay_callbacks = {
411 .subbuf_start = blk_subbuf_start_callback, 411 .subbuf_start = blk_subbuf_start_callback,
412 .create_buf_file = blk_create_buf_file_callback, 412 .create_buf_file = blk_create_buf_file_callback,
413 .remove_buf_file = blk_remove_buf_file_callback, 413 .remove_buf_file = blk_remove_buf_file_callback,
414 }; 414 };
415 415
416 static void blk_trace_setup_lba(struct blk_trace *bt, 416 static void blk_trace_setup_lba(struct blk_trace *bt,
417 struct block_device *bdev) 417 struct block_device *bdev)
418 { 418 {
419 struct hd_struct *part = NULL; 419 struct hd_struct *part = NULL;
420 420
421 if (bdev) 421 if (bdev)
422 part = bdev->bd_part; 422 part = bdev->bd_part;
423 423
424 if (part) { 424 if (part) {
425 bt->start_lba = part->start_sect; 425 bt->start_lba = part->start_sect;
426 bt->end_lba = part->start_sect + part->nr_sects; 426 bt->end_lba = part->start_sect + part->nr_sects;
427 } else { 427 } else {
428 bt->start_lba = 0; 428 bt->start_lba = 0;
429 bt->end_lba = -1ULL; 429 bt->end_lba = -1ULL;
430 } 430 }
431 } 431 }
432 432
433 /* 433 /*
434 * Setup everything required to start tracing 434 * Setup everything required to start tracing
435 */ 435 */
436 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 436 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
437 struct block_device *bdev, 437 struct block_device *bdev,
438 struct blk_user_trace_setup *buts) 438 struct blk_user_trace_setup *buts)
439 { 439 {
440 struct blk_trace *old_bt, *bt = NULL; 440 struct blk_trace *old_bt, *bt = NULL;
441 struct dentry *dir = NULL; 441 struct dentry *dir = NULL;
442 int ret, i; 442 int ret, i;
443 443
444 if (!buts->buf_size || !buts->buf_nr) 444 if (!buts->buf_size || !buts->buf_nr)
445 return -EINVAL; 445 return -EINVAL;
446 446
447 strncpy(buts->name, name, BLKTRACE_BDEV_SIZE); 447 strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
448 buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0'; 448 buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
449 449
450 /* 450 /*
451 * some device names have larger paths - convert the slashes 451 * some device names have larger paths - convert the slashes
452 * to underscores for this to work as expected 452 * to underscores for this to work as expected
453 */ 453 */
454 for (i = 0; i < strlen(buts->name); i++) 454 for (i = 0; i < strlen(buts->name); i++)
455 if (buts->name[i] == '/') 455 if (buts->name[i] == '/')
456 buts->name[i] = '_'; 456 buts->name[i] = '_';
457 457
458 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 458 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
459 if (!bt) 459 if (!bt)
460 return -ENOMEM; 460 return -ENOMEM;
461 461
462 ret = -ENOMEM; 462 ret = -ENOMEM;
463 bt->sequence = alloc_percpu(unsigned long); 463 bt->sequence = alloc_percpu(unsigned long);
464 if (!bt->sequence) 464 if (!bt->sequence)
465 goto err; 465 goto err;
466 466
467 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); 467 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
468 if (!bt->msg_data) 468 if (!bt->msg_data)
469 goto err; 469 goto err;
470 470
471 ret = -ENOENT; 471 ret = -ENOENT;
472 472
473 mutex_lock(&blk_tree_mutex); 473 mutex_lock(&blk_tree_mutex);
474 if (!blk_tree_root) { 474 if (!blk_tree_root) {
475 blk_tree_root = debugfs_create_dir("block", NULL); 475 blk_tree_root = debugfs_create_dir("block", NULL);
476 if (!blk_tree_root) { 476 if (!blk_tree_root) {
477 mutex_unlock(&blk_tree_mutex); 477 mutex_unlock(&blk_tree_mutex);
478 goto err; 478 goto err;
479 } 479 }
480 } 480 }
481 mutex_unlock(&blk_tree_mutex); 481 mutex_unlock(&blk_tree_mutex);
482 482
483 dir = debugfs_create_dir(buts->name, blk_tree_root); 483 dir = debugfs_create_dir(buts->name, blk_tree_root);
484 484
485 if (!dir) 485 if (!dir)
486 goto err; 486 goto err;
487 487
488 bt->dir = dir; 488 bt->dir = dir;
489 bt->dev = dev; 489 bt->dev = dev;
490 atomic_set(&bt->dropped, 0); 490 atomic_set(&bt->dropped, 0);
491 491
492 ret = -EIO; 492 ret = -EIO;
493 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt, 493 bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
494 &blk_dropped_fops); 494 &blk_dropped_fops);
495 if (!bt->dropped_file) 495 if (!bt->dropped_file)
496 goto err; 496 goto err;
497 497
498 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops); 498 bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
499 if (!bt->msg_file) 499 if (!bt->msg_file)
500 goto err; 500 goto err;
501 501
502 bt->rchan = relay_open("trace", dir, buts->buf_size, 502 bt->rchan = relay_open("trace", dir, buts->buf_size,
503 buts->buf_nr, &blk_relay_callbacks, bt); 503 buts->buf_nr, &blk_relay_callbacks, bt);
504 if (!bt->rchan) 504 if (!bt->rchan)
505 goto err; 505 goto err;
506 506
507 bt->act_mask = buts->act_mask; 507 bt->act_mask = buts->act_mask;
508 if (!bt->act_mask) 508 if (!bt->act_mask)
509 bt->act_mask = (u16) -1; 509 bt->act_mask = (u16) -1;
510 510
511 blk_trace_setup_lba(bt, bdev); 511 blk_trace_setup_lba(bt, bdev);
512 512
513 /* overwrite with user settings */ 513 /* overwrite with user settings */
514 if (buts->start_lba) 514 if (buts->start_lba)
515 bt->start_lba = buts->start_lba; 515 bt->start_lba = buts->start_lba;
516 if (buts->end_lba) 516 if (buts->end_lba)
517 bt->end_lba = buts->end_lba; 517 bt->end_lba = buts->end_lba;
518 518
519 bt->pid = buts->pid; 519 bt->pid = buts->pid;
520 bt->trace_state = Blktrace_setup; 520 bt->trace_state = Blktrace_setup;
521 521
522 ret = -EBUSY; 522 ret = -EBUSY;
523 old_bt = xchg(&q->blk_trace, bt); 523 old_bt = xchg(&q->blk_trace, bt);
524 if (old_bt) { 524 if (old_bt) {
525 (void) xchg(&q->blk_trace, old_bt); 525 (void) xchg(&q->blk_trace, old_bt);
526 goto err; 526 goto err;
527 } 527 }
528 528
529 if (atomic_inc_return(&blk_probes_ref) == 1) 529 if (atomic_inc_return(&blk_probes_ref) == 1)
530 blk_register_tracepoints(); 530 blk_register_tracepoints();
531 531
532 return 0; 532 return 0;
533 err: 533 err:
534 blk_trace_free(bt); 534 blk_trace_free(bt);
535 return ret; 535 return ret;
536 } 536 }
537 537
538 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, 538 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
539 struct block_device *bdev, 539 struct block_device *bdev,
540 char __user *arg) 540 char __user *arg)
541 { 541 {
542 struct blk_user_trace_setup buts; 542 struct blk_user_trace_setup buts;
543 int ret; 543 int ret;
544 544
545 ret = copy_from_user(&buts, arg, sizeof(buts)); 545 ret = copy_from_user(&buts, arg, sizeof(buts));
546 if (ret) 546 if (ret)
547 return -EFAULT; 547 return -EFAULT;
548 548
549 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 549 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
550 if (ret) 550 if (ret)
551 return ret; 551 return ret;
552 552
553 if (copy_to_user(arg, &buts, sizeof(buts))) { 553 if (copy_to_user(arg, &buts, sizeof(buts))) {
554 blk_trace_remove(q); 554 blk_trace_remove(q);
555 return -EFAULT; 555 return -EFAULT;
556 } 556 }
557 return 0; 557 return 0;
558 } 558 }
559 EXPORT_SYMBOL_GPL(blk_trace_setup); 559 EXPORT_SYMBOL_GPL(blk_trace_setup);
560 560
561 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) 561 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
562 static int compat_blk_trace_setup(struct request_queue *q, char *name, 562 static int compat_blk_trace_setup(struct request_queue *q, char *name,
563 dev_t dev, struct block_device *bdev, 563 dev_t dev, struct block_device *bdev,
564 char __user *arg) 564 char __user *arg)
565 { 565 {
566 struct blk_user_trace_setup buts; 566 struct blk_user_trace_setup buts;
567 struct compat_blk_user_trace_setup cbuts; 567 struct compat_blk_user_trace_setup cbuts;
568 int ret; 568 int ret;
569 569
570 if (copy_from_user(&cbuts, arg, sizeof(cbuts))) 570 if (copy_from_user(&cbuts, arg, sizeof(cbuts)))
571 return -EFAULT; 571 return -EFAULT;
572 572
573 buts = (struct blk_user_trace_setup) { 573 buts = (struct blk_user_trace_setup) {
574 .act_mask = cbuts.act_mask, 574 .act_mask = cbuts.act_mask,
575 .buf_size = cbuts.buf_size, 575 .buf_size = cbuts.buf_size,
576 .buf_nr = cbuts.buf_nr, 576 .buf_nr = cbuts.buf_nr,
577 .start_lba = cbuts.start_lba, 577 .start_lba = cbuts.start_lba,
578 .end_lba = cbuts.end_lba, 578 .end_lba = cbuts.end_lba,
579 .pid = cbuts.pid, 579 .pid = cbuts.pid,
580 }; 580 };
581 memcpy(&buts.name, &cbuts.name, 32); 581 memcpy(&buts.name, &cbuts.name, 32);
582 582
583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts); 583 ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
584 if (ret) 584 if (ret)
585 return ret; 585 return ret;
586 586
587 if (copy_to_user(arg, &buts.name, 32)) { 587 if (copy_to_user(arg, &buts.name, 32)) {
588 blk_trace_remove(q); 588 blk_trace_remove(q);
589 return -EFAULT; 589 return -EFAULT;
590 } 590 }
591 591
592 return 0; 592 return 0;
593 } 593 }
594 #endif 594 #endif
595 595
596 int blk_trace_startstop(struct request_queue *q, int start) 596 int blk_trace_startstop(struct request_queue *q, int start)
597 { 597 {
598 int ret; 598 int ret;
599 struct blk_trace *bt = q->blk_trace; 599 struct blk_trace *bt = q->blk_trace;
600 600
601 if (bt == NULL) 601 if (bt == NULL)
602 return -EINVAL; 602 return -EINVAL;
603 603
604 /* 604 /*
605 * For starting a trace, we can transition from a setup or stopped 605 * For starting a trace, we can transition from a setup or stopped
606 * trace. For stopping a trace, the state must be running 606 * trace. For stopping a trace, the state must be running
607 */ 607 */
608 ret = -EINVAL; 608 ret = -EINVAL;
609 if (start) { 609 if (start) {
610 if (bt->trace_state == Blktrace_setup || 610 if (bt->trace_state == Blktrace_setup ||
611 bt->trace_state == Blktrace_stopped) { 611 bt->trace_state == Blktrace_stopped) {
612 blktrace_seq++; 612 blktrace_seq++;
613 smp_mb(); 613 smp_mb();
614 bt->trace_state = Blktrace_running; 614 bt->trace_state = Blktrace_running;
615 615
616 trace_note_time(bt); 616 trace_note_time(bt);
617 ret = 0; 617 ret = 0;
618 } 618 }
619 } else { 619 } else {
620 if (bt->trace_state == Blktrace_running) { 620 if (bt->trace_state == Blktrace_running) {
621 bt->trace_state = Blktrace_stopped; 621 bt->trace_state = Blktrace_stopped;
622 relay_flush(bt->rchan); 622 relay_flush(bt->rchan);
623 ret = 0; 623 ret = 0;
624 } 624 }
625 } 625 }
626 626
627 return ret; 627 return ret;
628 } 628 }
629 EXPORT_SYMBOL_GPL(blk_trace_startstop); 629 EXPORT_SYMBOL_GPL(blk_trace_startstop);
630 630
631 /** 631 /**
632 * blk_trace_ioctl: - handle the ioctls associated with tracing 632 * blk_trace_ioctl: - handle the ioctls associated with tracing
633 * @bdev: the block device 633 * @bdev: the block device
634 * @cmd: the ioctl cmd 634 * @cmd: the ioctl cmd
635 * @arg: the argument data, if any 635 * @arg: the argument data, if any
636 * 636 *
637 **/ 637 **/
638 int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) 638 int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
639 { 639 {
640 struct request_queue *q; 640 struct request_queue *q;
641 int ret, start = 0; 641 int ret, start = 0;
642 char b[BDEVNAME_SIZE]; 642 char b[BDEVNAME_SIZE];
643 643
644 q = bdev_get_queue(bdev); 644 q = bdev_get_queue(bdev);
645 if (!q) 645 if (!q)
646 return -ENXIO; 646 return -ENXIO;
647 647
648 mutex_lock(&bdev->bd_mutex); 648 mutex_lock(&bdev->bd_mutex);
649 649
650 switch (cmd) { 650 switch (cmd) {
651 case BLKTRACESETUP: 651 case BLKTRACESETUP:
652 bdevname(bdev, b); 652 bdevname(bdev, b);
653 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 653 ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
654 break; 654 break;
655 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64) 655 #if defined(CONFIG_COMPAT) && defined(CONFIG_X86_64)
656 case BLKTRACESETUP32: 656 case BLKTRACESETUP32:
657 bdevname(bdev, b); 657 bdevname(bdev, b);
658 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); 658 ret = compat_blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
659 break; 659 break;
660 #endif 660 #endif
661 case BLKTRACESTART: 661 case BLKTRACESTART:
662 start = 1; 662 start = 1;
663 case BLKTRACESTOP: 663 case BLKTRACESTOP:
664 ret = blk_trace_startstop(q, start); 664 ret = blk_trace_startstop(q, start);
665 break; 665 break;
666 case BLKTRACETEARDOWN: 666 case BLKTRACETEARDOWN:
667 ret = blk_trace_remove(q); 667 ret = blk_trace_remove(q);
668 break; 668 break;
669 default: 669 default:
670 ret = -ENOTTY; 670 ret = -ENOTTY;
671 break; 671 break;
672 } 672 }
673 673
674 mutex_unlock(&bdev->bd_mutex); 674 mutex_unlock(&bdev->bd_mutex);
675 return ret; 675 return ret;
676 } 676 }
677 677
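/*
 * Editorial note (not part of blktrace.c): the ioctls handled above are
 * normally issued from userspace by the blktrace(8) utility. The sketch
 * below is illustrative only; it assumes the UAPI definitions of
 * BLKTRACESETUP/BLKTRACESTART and struct blk_user_trace_setup exported via
 * <linux/fs.h> and <linux/blktrace_api.h>, and a device path such as
 * /dev/sda supplied by the caller.
 */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/fs.h>
#include <linux/blktrace_api.h>

static int start_blktrace(const char *devpath)
{
	struct blk_user_trace_setup buts;
	int fd, ret;

	fd = open(devpath, O_RDONLY);
	if (fd < 0)
		return -1;

	memset(&buts, 0, sizeof(buts));
	buts.buf_size = 512 * 1024;	/* bytes per relay sub-buffer */
	buts.buf_nr   = 4;		/* number of sub-buffers */
	buts.act_mask = 0;		/* 0 is widened to (u16)-1: trace everything */

	/* BLKTRACESETUP creates the relay "trace" files under debugfs block/<name>/ */
	ret = ioctl(fd, BLKTRACESETUP, &buts);
	if (!ret)
		/* Blktrace_setup -> Blktrace_running */
		ret = ioctl(fd, BLKTRACESTART);

	close(fd);
	return ret;
}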
678 /** 678 /**
679 * blk_trace_shutdown: - stop and cleanup trace structures 679 * blk_trace_shutdown: - stop and cleanup trace structures
680 * @q: the request queue associated with the device 680 * @q: the request queue associated with the device
681 * 681 *
682 **/ 682 **/
683 void blk_trace_shutdown(struct request_queue *q) 683 void blk_trace_shutdown(struct request_queue *q)
684 { 684 {
685 if (q->blk_trace) { 685 if (q->blk_trace) {
686 blk_trace_startstop(q, 0); 686 blk_trace_startstop(q, 0);
687 blk_trace_remove(q); 687 blk_trace_remove(q);
688 } 688 }
689 } 689 }
690 690
691 /* 691 /*
692 * blktrace probes 692 * blktrace probes
693 */ 693 */
694 694
695 /** 695 /**
696 * blk_add_trace_rq - Add a trace for a request oriented action 696 * blk_add_trace_rq - Add a trace for a request oriented action
697 * @q: queue the io is for 697 * @q: queue the io is for
698 * @rq: the source request 698 * @rq: the source request
699 * @what: the action 699 * @what: the action
700 * 700 *
701 * Description: 701 * Description:
702 * Records an action against a request. Will log the bio offset + size. 702 * Records an action against a request. Will log the bio offset + size.
703 * 703 *
704 **/ 704 **/
705 static void blk_add_trace_rq(struct request_queue *q, struct request *rq, 705 static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
706 u32 what) 706 u32 what)
707 { 707 {
708 struct blk_trace *bt = q->blk_trace; 708 struct blk_trace *bt = q->blk_trace;
709 709
710 if (likely(!bt)) 710 if (likely(!bt))
711 return; 711 return;
712 712
713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 713 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
714 what |= BLK_TC_ACT(BLK_TC_PC); 714 what |= BLK_TC_ACT(BLK_TC_PC);
715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags, 715 __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
716 what, rq->errors, rq->cmd_len, rq->cmd); 716 what, rq->errors, rq->cmd_len, rq->cmd);
717 } else { 717 } else {
718 what |= BLK_TC_ACT(BLK_TC_FS); 718 what |= BLK_TC_ACT(BLK_TC_FS);
719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 719 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
720 rq->cmd_flags, what, rq->errors, 0, NULL); 720 rq->cmd_flags, what, rq->errors, 0, NULL);
721 } 721 }
722 } 722 }
723 723
724 static void blk_add_trace_rq_abort(void *ignore, 724 static void blk_add_trace_rq_abort(void *ignore,
725 struct request_queue *q, struct request *rq) 725 struct request_queue *q, struct request *rq)
726 { 726 {
727 blk_add_trace_rq(q, rq, BLK_TA_ABORT); 727 blk_add_trace_rq(q, rq, BLK_TA_ABORT);
728 } 728 }
729 729
730 static void blk_add_trace_rq_insert(void *ignore, 730 static void blk_add_trace_rq_insert(void *ignore,
731 struct request_queue *q, struct request *rq) 731 struct request_queue *q, struct request *rq)
732 { 732 {
733 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 733 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
734 } 734 }
735 735
736 static void blk_add_trace_rq_issue(void *ignore, 736 static void blk_add_trace_rq_issue(void *ignore,
737 struct request_queue *q, struct request *rq) 737 struct request_queue *q, struct request *rq)
738 { 738 {
739 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 739 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
740 } 740 }
741 741
742 static void blk_add_trace_rq_requeue(void *ignore, 742 static void blk_add_trace_rq_requeue(void *ignore,
743 struct request_queue *q, 743 struct request_queue *q,
744 struct request *rq) 744 struct request *rq)
745 { 745 {
746 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 746 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
747 } 747 }
748 748
749 static void blk_add_trace_rq_complete(void *ignore, 749 static void blk_add_trace_rq_complete(void *ignore,
750 struct request_queue *q, 750 struct request_queue *q,
751 struct request *rq) 751 struct request *rq)
752 { 752 {
753 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); 753 blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
754 } 754 }
755 755
756 /** 756 /**
757 * blk_add_trace_bio - Add a trace for a bio oriented action 757 * blk_add_trace_bio - Add a trace for a bio oriented action
758 * @q: queue the io is for 758 * @q: queue the io is for
759 * @bio: the source bio 759 * @bio: the source bio
760 * @what: the action 760 * @what: the action
761 * @error: error, if any 761 * @error: error, if any
762 * 762 *
763 * Description: 763 * Description:
764 * Records an action against a bio. Will log the bio offset + size. 764 * Records an action against a bio. Will log the bio offset + size.
765 * 765 *
766 **/ 766 **/
767 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, 767 static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
768 u32 what, int error) 768 u32 what, int error)
769 { 769 {
770 struct blk_trace *bt = q->blk_trace; 770 struct blk_trace *bt = q->blk_trace;
771 771
772 if (likely(!bt)) 772 if (likely(!bt))
773 return; 773 return;
774 774
775 if (!error && !bio_flagged(bio, BIO_UPTODATE)) 775 if (!error && !bio_flagged(bio, BIO_UPTODATE))
776 error = EIO; 776 error = EIO;
777 777
778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what, 778 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
779 error, 0, NULL); 779 error, 0, NULL);
780 } 780 }
781 781
782 static void blk_add_trace_bio_bounce(void *ignore, 782 static void blk_add_trace_bio_bounce(void *ignore,
783 struct request_queue *q, struct bio *bio) 783 struct request_queue *q, struct bio *bio)
784 { 784 {
785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); 785 blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
786 } 786 }
787 787
788 static void blk_add_trace_bio_complete(void *ignore, 788 static void blk_add_trace_bio_complete(void *ignore,
789 struct request_queue *q, struct bio *bio, 789 struct request_queue *q, struct bio *bio,
790 int error) 790 int error)
791 { 791 {
792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); 792 blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
793 } 793 }
794 794
795 static void blk_add_trace_bio_backmerge(void *ignore, 795 static void blk_add_trace_bio_backmerge(void *ignore,
796 struct request_queue *q, 796 struct request_queue *q,
797 struct bio *bio) 797 struct bio *bio)
798 { 798 {
799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0); 799 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE, 0);
800 } 800 }
801 801
802 static void blk_add_trace_bio_frontmerge(void *ignore, 802 static void blk_add_trace_bio_frontmerge(void *ignore,
803 struct request_queue *q, 803 struct request_queue *q,
804 struct bio *bio) 804 struct bio *bio)
805 { 805 {
806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0); 806 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE, 0);
807 } 807 }
808 808
809 static void blk_add_trace_bio_queue(void *ignore, 809 static void blk_add_trace_bio_queue(void *ignore,
810 struct request_queue *q, struct bio *bio) 810 struct request_queue *q, struct bio *bio)
811 { 811 {
812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0); 812 blk_add_trace_bio(q, bio, BLK_TA_QUEUE, 0);
813 } 813 }
814 814
815 static void blk_add_trace_getrq(void *ignore, 815 static void blk_add_trace_getrq(void *ignore,
816 struct request_queue *q, 816 struct request_queue *q,
817 struct bio *bio, int rw) 817 struct bio *bio, int rw)
818 { 818 {
819 if (bio) 819 if (bio)
820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); 820 blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0);
821 else { 821 else {
822 struct blk_trace *bt = q->blk_trace; 822 struct blk_trace *bt = q->blk_trace;
823 823
824 if (bt) 824 if (bt)
825 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL); 825 __blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
826 } 826 }
827 } 827 }
828 828
829 829
830 static void blk_add_trace_sleeprq(void *ignore, 830 static void blk_add_trace_sleeprq(void *ignore,
831 struct request_queue *q, 831 struct request_queue *q,
832 struct bio *bio, int rw) 832 struct bio *bio, int rw)
833 { 833 {
834 if (bio) 834 if (bio)
835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); 835 blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0);
836 else { 836 else {
837 struct blk_trace *bt = q->blk_trace; 837 struct blk_trace *bt = q->blk_trace;
838 838
839 if (bt) 839 if (bt)
840 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ, 840 __blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
841 0, 0, NULL); 841 0, 0, NULL);
842 } 842 }
843 } 843 }
844 844
845 static void blk_add_trace_plug(void *ignore, struct request_queue *q) 845 static void blk_add_trace_plug(void *ignore, struct request_queue *q)
846 { 846 {
847 struct blk_trace *bt = q->blk_trace; 847 struct blk_trace *bt = q->blk_trace;
848 848
849 if (bt) 849 if (bt)
850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); 850 __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
851 } 851 }
852 852
853 static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q, 853 static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
854 unsigned int depth) 854 unsigned int depth, bool explicit)
855 { 855 {
856 struct blk_trace *bt = q->blk_trace; 856 struct blk_trace *bt = q->blk_trace;
857 857
858 if (bt) { 858 if (bt) {
859 __be64 rpdu = cpu_to_be64(depth); 859 __be64 rpdu = cpu_to_be64(depth);
860 u32 what;
860 861
861 __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0, 862 if (explicit)
862 sizeof(rpdu), &rpdu); 863 what = BLK_TA_UNPLUG_IO;
864 else
865 what = BLK_TA_UNPLUG_TIMER;
866
867 __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
863 } 868 }
864 } 869 }
865 870
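/*
 * Editorial note (not part of blktrace.c): with this commit the separate
 * block_unplug_io/block_unplug_timer probes collapse into the single
 * block_unplug probe above, which carries an "explicit" flag instead.
 * An explicit unplug is logged as BLK_TA_UNPLUG_IO ("U"/"unplug_io"); an
 * implicit one, the schedule()-time flush of pending IO described in the
 * commit message, is logged as BLK_TA_UNPLUG_TIMER ("UT"/"unplug_timer").
 * The call site lives in block/blk-core.c (one of the other files in this
 * commit, not shown here); a plausible sketch of that invocation is
 *
 *	trace_block_unplug(q, depth, !from_schedule);
 *
 * where "from_schedule" is an assumed parameter name. In both cases the
 * queue depth travels as a big-endian PDU and is printed by blk_log_unplug().
 */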
866 static void blk_add_trace_split(void *ignore, 871 static void blk_add_trace_split(void *ignore,
867 struct request_queue *q, struct bio *bio, 872 struct request_queue *q, struct bio *bio,
868 unsigned int pdu) 873 unsigned int pdu)
869 { 874 {
870 struct blk_trace *bt = q->blk_trace; 875 struct blk_trace *bt = q->blk_trace;
871 876
872 if (bt) { 877 if (bt) {
873 __be64 rpdu = cpu_to_be64(pdu); 878 __be64 rpdu = cpu_to_be64(pdu);
874 879
875 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, 880 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
876 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE), 881 BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
877 sizeof(rpdu), &rpdu); 882 sizeof(rpdu), &rpdu);
878 } 883 }
879 } 884 }
880 885
881 /** 886 /**
882 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation 887 * blk_add_trace_bio_remap - Add a trace for a bio-remap operation
883 * @ignore: trace callback data parameter (not used) 888 * @ignore: trace callback data parameter (not used)
884 * @q: queue the io is for 889 * @q: queue the io is for
885 * @bio: the source bio 890 * @bio: the source bio
886 * @dev: target device 891 * @dev: target device
887 * @from: source sector 892 * @from: source sector
888 * 893 *
889 * Description: 894 * Description:
890 * Device mapper or raid target sometimes need to split a bio because 895 * Device mapper or raid target sometimes need to split a bio because
891 * it spans a stripe (or similar). Add a trace for that action. 896 * it spans a stripe (or similar). Add a trace for that action.
892 * 897 *
893 **/ 898 **/
894 static void blk_add_trace_bio_remap(void *ignore, 899 static void blk_add_trace_bio_remap(void *ignore,
895 struct request_queue *q, struct bio *bio, 900 struct request_queue *q, struct bio *bio,
896 dev_t dev, sector_t from) 901 dev_t dev, sector_t from)
897 { 902 {
898 struct blk_trace *bt = q->blk_trace; 903 struct blk_trace *bt = q->blk_trace;
899 struct blk_io_trace_remap r; 904 struct blk_io_trace_remap r;
900 905
901 if (likely(!bt)) 906 if (likely(!bt))
902 return; 907 return;
903 908
904 r.device_from = cpu_to_be32(dev); 909 r.device_from = cpu_to_be32(dev);
905 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); 910 r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev);
906 r.sector_from = cpu_to_be64(from); 911 r.sector_from = cpu_to_be64(from);
907 912
908 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, 913 __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
909 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), 914 BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
910 sizeof(r), &r); 915 sizeof(r), &r);
911 } 916 }
912 917
913 /** 918 /**
914 * blk_add_trace_rq_remap - Add a trace for a request-remap operation 919 * blk_add_trace_rq_remap - Add a trace for a request-remap operation
915 * @ignore: trace callback data parameter (not used) 920 * @ignore: trace callback data parameter (not used)
916 * @q: queue the io is for 921 * @q: queue the io is for
917 * @rq: the source request 922 * @rq: the source request
918 * @dev: target device 923 * @dev: target device
919 * @from: source sector 924 * @from: source sector
920 * 925 *
921 * Description: 926 * Description:
922 * Device mapper remaps request to other devices. 927 * Device mapper remaps request to other devices.
923 * Add a trace for that action. 928 * Add a trace for that action.
924 * 929 *
925 **/ 930 **/
926 static void blk_add_trace_rq_remap(void *ignore, 931 static void blk_add_trace_rq_remap(void *ignore,
927 struct request_queue *q, 932 struct request_queue *q,
928 struct request *rq, dev_t dev, 933 struct request *rq, dev_t dev,
929 sector_t from) 934 sector_t from)
930 { 935 {
931 struct blk_trace *bt = q->blk_trace; 936 struct blk_trace *bt = q->blk_trace;
932 struct blk_io_trace_remap r; 937 struct blk_io_trace_remap r;
933 938
934 if (likely(!bt)) 939 if (likely(!bt))
935 return; 940 return;
936 941
937 r.device_from = cpu_to_be32(dev); 942 r.device_from = cpu_to_be32(dev);
938 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); 943 r.device_to = cpu_to_be32(disk_devt(rq->rq_disk));
939 r.sector_from = cpu_to_be64(from); 944 r.sector_from = cpu_to_be64(from);
940 945
941 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 946 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
942 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors, 947 rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
943 sizeof(r), &r); 948 sizeof(r), &r);
944 } 949 }
945 950
946 /** 951 /**
947 * blk_add_driver_data - Add binary message with driver-specific data 952 * blk_add_driver_data - Add binary message with driver-specific data
948 * @q: queue the io is for 953 * @q: queue the io is for
949 * @rq: io request 954 * @rq: io request
950 * @data: driver-specific data 955 * @data: driver-specific data
951 * @len: length of driver-specific data 956 * @len: length of driver-specific data
952 * 957 *
953 * Description: 958 * Description:
954 * Some drivers might want to write driver-specific data per request. 959 * Some drivers might want to write driver-specific data per request.
955 * 960 *
956 **/ 961 **/
957 void blk_add_driver_data(struct request_queue *q, 962 void blk_add_driver_data(struct request_queue *q,
958 struct request *rq, 963 struct request *rq,
959 void *data, size_t len) 964 void *data, size_t len)
960 { 965 {
961 struct blk_trace *bt = q->blk_trace; 966 struct blk_trace *bt = q->blk_trace;
962 967
963 if (likely(!bt)) 968 if (likely(!bt))
964 return; 969 return;
965 970
966 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) 971 if (rq->cmd_type == REQ_TYPE_BLOCK_PC)
967 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, 972 __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0,
968 BLK_TA_DRV_DATA, rq->errors, len, data); 973 BLK_TA_DRV_DATA, rq->errors, len, data);
969 else 974 else
970 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, 975 __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0,
971 BLK_TA_DRV_DATA, rq->errors, len, data); 976 BLK_TA_DRV_DATA, rq->errors, len, data);
972 } 977 }
973 EXPORT_SYMBOL_GPL(blk_add_driver_data); 978 EXPORT_SYMBOL_GPL(blk_add_driver_data);
974 979
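/*
 * Editorial note (not part of blktrace.c): an illustrative use of the
 * exported helper above. A low-level driver that wants its own per-request
 * details in the trace stream can emit them as a BLK_TA_DRV_DATA payload;
 * the struct and function names below are hypothetical driver-side names,
 * only blk_add_driver_data() itself comes from this file.
 */
#include <linux/blkdev.h>
#include <linux/blktrace_api.h>

struct mydrv_trace_info {		/* hypothetical payload layout */
	u32 tag;
	u32 hw_queue;
};

static void mydrv_trace_request(struct request *rq, u32 tag, u32 hw_queue)
{
	struct mydrv_trace_info info = { .tag = tag, .hw_queue = hw_queue };

	/* emitted as a BLK_TA_DRV_DATA event against rq's queue */
	blk_add_driver_data(rq->q, rq, &info, sizeof(info));
}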
975 static void blk_register_tracepoints(void) 980 static void blk_register_tracepoints(void)
976 { 981 {
977 int ret; 982 int ret;
978 983
979 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); 984 ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
980 WARN_ON(ret); 985 WARN_ON(ret);
981 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 986 ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
982 WARN_ON(ret); 987 WARN_ON(ret);
983 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 988 ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
984 WARN_ON(ret); 989 WARN_ON(ret);
985 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); 990 ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
986 WARN_ON(ret); 991 WARN_ON(ret);
987 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); 992 ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
988 WARN_ON(ret); 993 WARN_ON(ret);
989 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); 994 ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
990 WARN_ON(ret); 995 WARN_ON(ret);
991 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); 996 ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
992 WARN_ON(ret); 997 WARN_ON(ret);
993 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); 998 ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
994 WARN_ON(ret); 999 WARN_ON(ret);
995 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); 1000 ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
996 WARN_ON(ret); 1001 WARN_ON(ret);
997 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); 1002 ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
998 WARN_ON(ret); 1003 WARN_ON(ret);
999 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); 1004 ret = register_trace_block_getrq(blk_add_trace_getrq, NULL);
1000 WARN_ON(ret); 1005 WARN_ON(ret);
1001 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1006 ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1002 WARN_ON(ret); 1007 WARN_ON(ret);
1003 ret = register_trace_block_plug(blk_add_trace_plug, NULL); 1008 ret = register_trace_block_plug(blk_add_trace_plug, NULL);
1004 WARN_ON(ret); 1009 WARN_ON(ret);
1005 ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1010 ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
1006 WARN_ON(ret); 1011 WARN_ON(ret);
1007 ret = register_trace_block_split(blk_add_trace_split, NULL); 1012 ret = register_trace_block_split(blk_add_trace_split, NULL);
1008 WARN_ON(ret); 1013 WARN_ON(ret);
1009 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1014 ret = register_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1010 WARN_ON(ret); 1015 WARN_ON(ret);
1011 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1016 ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1012 WARN_ON(ret); 1017 WARN_ON(ret);
1013 } 1018 }
1014 1019
1015 static void blk_unregister_tracepoints(void) 1020 static void blk_unregister_tracepoints(void)
1016 { 1021 {
1017 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); 1022 unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
1018 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL); 1023 unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
1019 unregister_trace_block_split(blk_add_trace_split, NULL); 1024 unregister_trace_block_split(blk_add_trace_split, NULL);
1020 unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); 1025 unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
1021 unregister_trace_block_plug(blk_add_trace_plug, NULL); 1026 unregister_trace_block_plug(blk_add_trace_plug, NULL);
1022 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); 1027 unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
1023 unregister_trace_block_getrq(blk_add_trace_getrq, NULL); 1028 unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
1024 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); 1029 unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL);
1025 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); 1030 unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL);
1026 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); 1031 unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL);
1027 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); 1032 unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL);
1028 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); 1033 unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL);
1029 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); 1034 unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL);
1030 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); 1035 unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL);
1031 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); 1036 unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL);
1032 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); 1037 unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL);
1033 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); 1038 unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL);
1034 1039
1035 tracepoint_synchronize_unregister(); 1040 tracepoint_synchronize_unregister();
1036 } 1041 }
1037 1042
1038 /* 1043 /*
1039 * struct blk_io_tracer formatting routines 1044 * struct blk_io_tracer formatting routines
1040 */ 1045 */
1041 1046
1042 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t) 1047 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
1043 { 1048 {
1044 int i = 0; 1049 int i = 0;
1045 int tc = t->action >> BLK_TC_SHIFT; 1050 int tc = t->action >> BLK_TC_SHIFT;
1046 1051
1047 if (t->action == BLK_TN_MESSAGE) { 1052 if (t->action == BLK_TN_MESSAGE) {
1048 rwbs[i++] = 'N'; 1053 rwbs[i++] = 'N';
1049 goto out; 1054 goto out;
1050 } 1055 }
1051 1056
1052 if (tc & BLK_TC_DISCARD) 1057 if (tc & BLK_TC_DISCARD)
1053 rwbs[i++] = 'D'; 1058 rwbs[i++] = 'D';
1054 else if (tc & BLK_TC_WRITE) 1059 else if (tc & BLK_TC_WRITE)
1055 rwbs[i++] = 'W'; 1060 rwbs[i++] = 'W';
1056 else if (t->bytes) 1061 else if (t->bytes)
1057 rwbs[i++] = 'R'; 1062 rwbs[i++] = 'R';
1058 else 1063 else
1059 rwbs[i++] = 'N'; 1064 rwbs[i++] = 'N';
1060 1065
1061 if (tc & BLK_TC_AHEAD) 1066 if (tc & BLK_TC_AHEAD)
1062 rwbs[i++] = 'A'; 1067 rwbs[i++] = 'A';
1063 if (tc & BLK_TC_BARRIER) 1068 if (tc & BLK_TC_BARRIER)
1064 rwbs[i++] = 'B'; 1069 rwbs[i++] = 'B';
1065 if (tc & BLK_TC_SYNC) 1070 if (tc & BLK_TC_SYNC)
1066 rwbs[i++] = 'S'; 1071 rwbs[i++] = 'S';
1067 if (tc & BLK_TC_META) 1072 if (tc & BLK_TC_META)
1068 rwbs[i++] = 'M'; 1073 rwbs[i++] = 'M';
1069 out: 1074 out:
1070 rwbs[i] = '\0'; 1075 rwbs[i] = '\0';
1071 } 1076 }
1072 1077
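/*
 * Editorial note (not part of blktrace.c): fill_rwbs() condenses the trace
 * category bits into the short flag column of the output, e.g. "WS" for a
 * synchronous write, "RA" for a read-ahead, "D" for a discard and "N" for a
 * message or an action that moved no data.
 */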
1073 static inline 1078 static inline
1074 const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent) 1079 const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
1075 { 1080 {
1076 return (const struct blk_io_trace *)ent; 1081 return (const struct blk_io_trace *)ent;
1077 } 1082 }
1078 1083
1079 static inline const void *pdu_start(const struct trace_entry *ent) 1084 static inline const void *pdu_start(const struct trace_entry *ent)
1080 { 1085 {
1081 return te_blk_io_trace(ent) + 1; 1086 return te_blk_io_trace(ent) + 1;
1082 } 1087 }
1083 1088
1084 static inline u32 t_action(const struct trace_entry *ent) 1089 static inline u32 t_action(const struct trace_entry *ent)
1085 { 1090 {
1086 return te_blk_io_trace(ent)->action; 1091 return te_blk_io_trace(ent)->action;
1087 } 1092 }
1088 1093
1089 static inline u32 t_bytes(const struct trace_entry *ent) 1094 static inline u32 t_bytes(const struct trace_entry *ent)
1090 { 1095 {
1091 return te_blk_io_trace(ent)->bytes; 1096 return te_blk_io_trace(ent)->bytes;
1092 } 1097 }
1093 1098
1094 static inline u32 t_sec(const struct trace_entry *ent) 1099 static inline u32 t_sec(const struct trace_entry *ent)
1095 { 1100 {
1096 return te_blk_io_trace(ent)->bytes >> 9; 1101 return te_blk_io_trace(ent)->bytes >> 9;
1097 } 1102 }
1098 1103
1099 static inline unsigned long long t_sector(const struct trace_entry *ent) 1104 static inline unsigned long long t_sector(const struct trace_entry *ent)
1100 { 1105 {
1101 return te_blk_io_trace(ent)->sector; 1106 return te_blk_io_trace(ent)->sector;
1102 } 1107 }
1103 1108
1104 static inline __u16 t_error(const struct trace_entry *ent) 1109 static inline __u16 t_error(const struct trace_entry *ent)
1105 { 1110 {
1106 return te_blk_io_trace(ent)->error; 1111 return te_blk_io_trace(ent)->error;
1107 } 1112 }
1108 1113
1109 static __u64 get_pdu_int(const struct trace_entry *ent) 1114 static __u64 get_pdu_int(const struct trace_entry *ent)
1110 { 1115 {
1111 const __u64 *val = pdu_start(ent); 1116 const __u64 *val = pdu_start(ent);
1112 return be64_to_cpu(*val); 1117 return be64_to_cpu(*val);
1113 } 1118 }
1114 1119
1115 static void get_pdu_remap(const struct trace_entry *ent, 1120 static void get_pdu_remap(const struct trace_entry *ent,
1116 struct blk_io_trace_remap *r) 1121 struct blk_io_trace_remap *r)
1117 { 1122 {
1118 const struct blk_io_trace_remap *__r = pdu_start(ent); 1123 const struct blk_io_trace_remap *__r = pdu_start(ent);
1119 __u64 sector_from = __r->sector_from; 1124 __u64 sector_from = __r->sector_from;
1120 1125
1121 r->device_from = be32_to_cpu(__r->device_from); 1126 r->device_from = be32_to_cpu(__r->device_from);
1122 r->device_to = be32_to_cpu(__r->device_to); 1127 r->device_to = be32_to_cpu(__r->device_to);
1123 r->sector_from = be64_to_cpu(sector_from); 1128 r->sector_from = be64_to_cpu(sector_from);
1124 } 1129 }
1125 1130
1126 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); 1131 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
1127 1132
1128 static int blk_log_action_classic(struct trace_iterator *iter, const char *act) 1133 static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
1129 { 1134 {
1130 char rwbs[6]; 1135 char rwbs[6];
1131 unsigned long long ts = iter->ts; 1136 unsigned long long ts = iter->ts;
1132 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC); 1137 unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
1133 unsigned secs = (unsigned long)ts; 1138 unsigned secs = (unsigned long)ts;
1134 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1139 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1135 1140
1136 fill_rwbs(rwbs, t); 1141 fill_rwbs(rwbs, t);
1137 1142
1138 return trace_seq_printf(&iter->seq, 1143 return trace_seq_printf(&iter->seq,
1139 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ", 1144 "%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
1140 MAJOR(t->device), MINOR(t->device), iter->cpu, 1145 MAJOR(t->device), MINOR(t->device), iter->cpu,
1141 secs, nsec_rem, iter->ent->pid, act, rwbs); 1146 secs, nsec_rem, iter->ent->pid, act, rwbs);
1142 } 1147 }
1143 1148
1144 static int blk_log_action(struct trace_iterator *iter, const char *act) 1149 static int blk_log_action(struct trace_iterator *iter, const char *act)
1145 { 1150 {
1146 char rwbs[6]; 1151 char rwbs[6];
1147 const struct blk_io_trace *t = te_blk_io_trace(iter->ent); 1152 const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
1148 1153
1149 fill_rwbs(rwbs, t); 1154 fill_rwbs(rwbs, t);
1150 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ", 1155 return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
1151 MAJOR(t->device), MINOR(t->device), act, rwbs); 1156 MAJOR(t->device), MINOR(t->device), act, rwbs);
1152 } 1157 }
1153 1158
1154 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) 1159 static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
1155 { 1160 {
1156 const unsigned char *pdu_buf; 1161 const unsigned char *pdu_buf;
1157 int pdu_len; 1162 int pdu_len;
1158 int i, end, ret; 1163 int i, end, ret;
1159 1164
1160 pdu_buf = pdu_start(ent); 1165 pdu_buf = pdu_start(ent);
1161 pdu_len = te_blk_io_trace(ent)->pdu_len; 1166 pdu_len = te_blk_io_trace(ent)->pdu_len;
1162 1167
1163 if (!pdu_len) 1168 if (!pdu_len)
1164 return 1; 1169 return 1;
1165 1170
1166 /* find the last zero that needs to be printed */ 1171 /* find the last zero that needs to be printed */
1167 for (end = pdu_len - 1; end >= 0; end--) 1172 for (end = pdu_len - 1; end >= 0; end--)
1168 if (pdu_buf[end]) 1173 if (pdu_buf[end])
1169 break; 1174 break;
1170 end++; 1175 end++;
1171 1176
1172 if (!trace_seq_putc(s, '(')) 1177 if (!trace_seq_putc(s, '('))
1173 return 0; 1178 return 0;
1174 1179
1175 for (i = 0; i < pdu_len; i++) { 1180 for (i = 0; i < pdu_len; i++) {
1176 1181
1177 ret = trace_seq_printf(s, "%s%02x", 1182 ret = trace_seq_printf(s, "%s%02x",
1178 i == 0 ? "" : " ", pdu_buf[i]); 1183 i == 0 ? "" : " ", pdu_buf[i]);
1179 if (!ret) 1184 if (!ret)
1180 return ret; 1185 return ret;
1181 1186
1182 /* 1187 /*
1183 * stop when the rest is just zeroes and indicate so 1188 * stop when the rest is just zeroes and indicate so
1184 * with a ".." appended 1189 * with a ".." appended
1185 */ 1190 */
1186 if (i == end && end != pdu_len - 1) 1191 if (i == end && end != pdu_len - 1)
1187 return trace_seq_puts(s, " ..) "); 1192 return trace_seq_puts(s, " ..) ");
1188 } 1193 }
1189 1194
1190 return trace_seq_puts(s, ") "); 1195 return trace_seq_puts(s, ") ");
1191 } 1196 }
1192 1197
1193 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) 1198 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
1194 { 1199 {
1195 char cmd[TASK_COMM_LEN]; 1200 char cmd[TASK_COMM_LEN];
1196 1201
1197 trace_find_cmdline(ent->pid, cmd); 1202 trace_find_cmdline(ent->pid, cmd);
1198 1203
1199 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1204 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1200 int ret; 1205 int ret;
1201 1206
1202 ret = trace_seq_printf(s, "%u ", t_bytes(ent)); 1207 ret = trace_seq_printf(s, "%u ", t_bytes(ent));
1203 if (!ret) 1208 if (!ret)
1204 return 0; 1209 return 0;
1205 ret = blk_log_dump_pdu(s, ent); 1210 ret = blk_log_dump_pdu(s, ent);
1206 if (!ret) 1211 if (!ret)
1207 return 0; 1212 return 0;
1208 return trace_seq_printf(s, "[%s]\n", cmd); 1213 return trace_seq_printf(s, "[%s]\n", cmd);
1209 } else { 1214 } else {
1210 if (t_sec(ent)) 1215 if (t_sec(ent))
1211 return trace_seq_printf(s, "%llu + %u [%s]\n", 1216 return trace_seq_printf(s, "%llu + %u [%s]\n",
1212 t_sector(ent), t_sec(ent), cmd); 1217 t_sector(ent), t_sec(ent), cmd);
1213 return trace_seq_printf(s, "[%s]\n", cmd); 1218 return trace_seq_printf(s, "[%s]\n", cmd);
1214 } 1219 }
1215 } 1220 }
1216 1221
1217 static int blk_log_with_error(struct trace_seq *s, 1222 static int blk_log_with_error(struct trace_seq *s,
1218 const struct trace_entry *ent) 1223 const struct trace_entry *ent)
1219 { 1224 {
1220 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { 1225 if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
1221 int ret; 1226 int ret;
1222 1227
1223 ret = blk_log_dump_pdu(s, ent); 1228 ret = blk_log_dump_pdu(s, ent);
1224 if (ret) 1229 if (ret)
1225 return trace_seq_printf(s, "[%d]\n", t_error(ent)); 1230 return trace_seq_printf(s, "[%d]\n", t_error(ent));
1226 return 0; 1231 return 0;
1227 } else { 1232 } else {
1228 if (t_sec(ent)) 1233 if (t_sec(ent))
1229 return trace_seq_printf(s, "%llu + %u [%d]\n", 1234 return trace_seq_printf(s, "%llu + %u [%d]\n",
1230 t_sector(ent), 1235 t_sector(ent),
1231 t_sec(ent), t_error(ent)); 1236 t_sec(ent), t_error(ent));
1232 return trace_seq_printf(s, "%llu [%d]\n", 1237 return trace_seq_printf(s, "%llu [%d]\n",
1233 t_sector(ent), t_error(ent)); 1238 t_sector(ent), t_error(ent));
1234 } 1239 }
1235 } 1240 }
1236 1241
1237 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) 1242 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
1238 { 1243 {
1239 struct blk_io_trace_remap r = { .device_from = 0, }; 1244 struct blk_io_trace_remap r = { .device_from = 0, };
1240 1245
1241 get_pdu_remap(ent, &r); 1246 get_pdu_remap(ent, &r);
1242 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", 1247 return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
1243 t_sector(ent), t_sec(ent), 1248 t_sector(ent), t_sec(ent),
1244 MAJOR(r.device_from), MINOR(r.device_from), 1249 MAJOR(r.device_from), MINOR(r.device_from),
1245 (unsigned long long)r.sector_from); 1250 (unsigned long long)r.sector_from);
1246 } 1251 }
1247 1252
1248 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) 1253 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
1249 { 1254 {
1250 char cmd[TASK_COMM_LEN]; 1255 char cmd[TASK_COMM_LEN];
1251 1256
1252 trace_find_cmdline(ent->pid, cmd); 1257 trace_find_cmdline(ent->pid, cmd);
1253 1258
1254 return trace_seq_printf(s, "[%s]\n", cmd); 1259 return trace_seq_printf(s, "[%s]\n", cmd);
1255 } 1260 }
1256 1261
1257 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent) 1262 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
1258 { 1263 {
1259 char cmd[TASK_COMM_LEN]; 1264 char cmd[TASK_COMM_LEN];
1260 1265
1261 trace_find_cmdline(ent->pid, cmd); 1266 trace_find_cmdline(ent->pid, cmd);
1262 1267
1263 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent)); 1268 return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
1264 } 1269 }
1265 1270
1266 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent) 1271 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
1267 { 1272 {
1268 char cmd[TASK_COMM_LEN]; 1273 char cmd[TASK_COMM_LEN];
1269 1274
1270 trace_find_cmdline(ent->pid, cmd); 1275 trace_find_cmdline(ent->pid, cmd);
1271 1276
1272 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent), 1277 return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
1273 get_pdu_int(ent), cmd); 1278 get_pdu_int(ent), cmd);
1274 } 1279 }
1275 1280
1276 static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent) 1281 static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
1277 { 1282 {
1278 int ret; 1283 int ret;
1279 const struct blk_io_trace *t = te_blk_io_trace(ent); 1284 const struct blk_io_trace *t = te_blk_io_trace(ent);
1280 1285
1281 ret = trace_seq_putmem(s, t + 1, t->pdu_len); 1286 ret = trace_seq_putmem(s, t + 1, t->pdu_len);
1282 if (ret) 1287 if (ret)
1283 return trace_seq_putc(s, '\n'); 1288 return trace_seq_putc(s, '\n');
1284 return ret; 1289 return ret;
1285 } 1290 }
1286 1291
1287 /* 1292 /*
1288 * struct tracer operations 1293 * struct tracer operations
1289 */ 1294 */
1290 1295
1291 static void blk_tracer_print_header(struct seq_file *m) 1296 static void blk_tracer_print_header(struct seq_file *m)
1292 { 1297 {
1293 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) 1298 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1294 return; 1299 return;
1295 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n" 1300 seq_puts(m, "# DEV CPU TIMESTAMP PID ACT FLG\n"
1296 "# | | | | | |\n"); 1301 "# | | | | | |\n");
1297 } 1302 }
1298 1303
1299 static void blk_tracer_start(struct trace_array *tr) 1304 static void blk_tracer_start(struct trace_array *tr)
1300 { 1305 {
1301 blk_tracer_enabled = true; 1306 blk_tracer_enabled = true;
1302 } 1307 }
1303 1308
1304 static int blk_tracer_init(struct trace_array *tr) 1309 static int blk_tracer_init(struct trace_array *tr)
1305 { 1310 {
1306 blk_tr = tr; 1311 blk_tr = tr;
1307 blk_tracer_start(tr); 1312 blk_tracer_start(tr);
1308 return 0; 1313 return 0;
1309 } 1314 }
1310 1315
1311 static void blk_tracer_stop(struct trace_array *tr) 1316 static void blk_tracer_stop(struct trace_array *tr)
1312 { 1317 {
1313 blk_tracer_enabled = false; 1318 blk_tracer_enabled = false;
1314 } 1319 }
1315 1320
1316 static void blk_tracer_reset(struct trace_array *tr) 1321 static void blk_tracer_reset(struct trace_array *tr)
1317 { 1322 {
1318 blk_tracer_stop(tr); 1323 blk_tracer_stop(tr);
1319 } 1324 }
1320 1325
1321 static const struct { 1326 static const struct {
1322 const char *act[2]; 1327 const char *act[2];
1323 int (*print)(struct trace_seq *s, const struct trace_entry *ent); 1328 int (*print)(struct trace_seq *s, const struct trace_entry *ent);
1324 } what2act[] = { 1329 } what2act[] = {
1325 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic }, 1330 [__BLK_TA_QUEUE] = {{ "Q", "queue" }, blk_log_generic },
1326 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic }, 1331 [__BLK_TA_BACKMERGE] = {{ "M", "backmerge" }, blk_log_generic },
1327 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic }, 1332 [__BLK_TA_FRONTMERGE] = {{ "F", "frontmerge" }, blk_log_generic },
1328 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic }, 1333 [__BLK_TA_GETRQ] = {{ "G", "getrq" }, blk_log_generic },
1329 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic }, 1334 [__BLK_TA_SLEEPRQ] = {{ "S", "sleeprq" }, blk_log_generic },
1330 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error }, 1335 [__BLK_TA_REQUEUE] = {{ "R", "requeue" }, blk_log_with_error },
1331 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic }, 1336 [__BLK_TA_ISSUE] = {{ "D", "issue" }, blk_log_generic },
1332 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error }, 1337 [__BLK_TA_COMPLETE] = {{ "C", "complete" }, blk_log_with_error },
1333 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug }, 1338 [__BLK_TA_PLUG] = {{ "P", "plug" }, blk_log_plug },
1334 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug }, 1339 [__BLK_TA_UNPLUG_IO] = {{ "U", "unplug_io" }, blk_log_unplug },
1340 [__BLK_TA_UNPLUG_TIMER] = {{ "UT", "unplug_timer" }, blk_log_unplug },
1335 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic }, 1341 [__BLK_TA_INSERT] = {{ "I", "insert" }, blk_log_generic },
1336 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split }, 1342 [__BLK_TA_SPLIT] = {{ "X", "split" }, blk_log_split },
1337 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic }, 1343 [__BLK_TA_BOUNCE] = {{ "B", "bounce" }, blk_log_generic },
1338 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap }, 1344 [__BLK_TA_REMAP] = {{ "A", "remap" }, blk_log_remap },
1339 }; 1345 };
1340 1346
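/*
 * Editorial note (not part of blktrace.c): what2act[] is indexed by the low
 * action bits, t->action & ((1 << BLK_TC_SHIFT) - 1), in print_one_line()
 * below; act[0] is the terse code used by the classic format and act[1] the
 * verbose name printed with TRACE_ITER_VERBOSE. The restored
 * __BLK_TA_UNPLUG_TIMER entry keeps implicit (schedule-time) unplug events
 * printable now that blk_add_trace_unplug() can emit them again.
 */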
1341 static enum print_line_t print_one_line(struct trace_iterator *iter, 1347 static enum print_line_t print_one_line(struct trace_iterator *iter,
1342 bool classic) 1348 bool classic)
1343 { 1349 {
1344 struct trace_seq *s = &iter->seq; 1350 struct trace_seq *s = &iter->seq;
1345 const struct blk_io_trace *t; 1351 const struct blk_io_trace *t;
1346 u16 what; 1352 u16 what;
1347 int ret; 1353 int ret;
1348 bool long_act; 1354 bool long_act;
1349 blk_log_action_t *log_action; 1355 blk_log_action_t *log_action;
1350 1356
1351 t = te_blk_io_trace(iter->ent); 1357 t = te_blk_io_trace(iter->ent);
1352 what = t->action & ((1 << BLK_TC_SHIFT) - 1); 1358 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
1353 long_act = !!(trace_flags & TRACE_ITER_VERBOSE); 1359 long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
1354 log_action = classic ? &blk_log_action_classic : &blk_log_action; 1360 log_action = classic ? &blk_log_action_classic : &blk_log_action;
1355 1361
1356 if (t->action == BLK_TN_MESSAGE) { 1362 if (t->action == BLK_TN_MESSAGE) {
1357 ret = log_action(iter, long_act ? "message" : "m"); 1363 ret = log_action(iter, long_act ? "message" : "m");
1358 if (ret) 1364 if (ret)
1359 ret = blk_log_msg(s, iter->ent); 1365 ret = blk_log_msg(s, iter->ent);
1360 goto out; 1366 goto out;
1361 } 1367 }
1362 1368
1363 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) 1369 if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
1364 ret = trace_seq_printf(s, "Unknown action %x\n", what); 1370 ret = trace_seq_printf(s, "Unknown action %x\n", what);
1365 else { 1371 else {
1366 ret = log_action(iter, what2act[what].act[long_act]); 1372 ret = log_action(iter, what2act[what].act[long_act]);
1367 if (ret) 1373 if (ret)
1368 ret = what2act[what].print(s, iter->ent); 1374 ret = what2act[what].print(s, iter->ent);
1369 } 1375 }
1370 out: 1376 out:
1371 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1377 return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1372 } 1378 }
1373 1379
1374 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, 1380 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
1375 int flags, struct trace_event *event) 1381 int flags, struct trace_event *event)
1376 { 1382 {
1377 return print_one_line(iter, false); 1383 return print_one_line(iter, false);
1378 } 1384 }
1379 1385
1380 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) 1386 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
1381 { 1387 {
1382 struct trace_seq *s = &iter->seq; 1388 struct trace_seq *s = &iter->seq;
1383 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent; 1389 struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
1384 const int offset = offsetof(struct blk_io_trace, sector); 1390 const int offset = offsetof(struct blk_io_trace, sector);
1385 struct blk_io_trace old = { 1391 struct blk_io_trace old = {
1386 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION, 1392 .magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
1387 .time = iter->ts, 1393 .time = iter->ts,
1388 }; 1394 };
1389 1395
1390 if (!trace_seq_putmem(s, &old, offset)) 1396 if (!trace_seq_putmem(s, &old, offset))
1391 return 0; 1397 return 0;
1392 return trace_seq_putmem(s, &t->sector, 1398 return trace_seq_putmem(s, &t->sector,
1393 sizeof(old) - offset + t->pdu_len); 1399 sizeof(old) - offset + t->pdu_len);
1394 } 1400 }
1395 1401
1396 static enum print_line_t 1402 static enum print_line_t
1397 blk_trace_event_print_binary(struct trace_iterator *iter, int flags, 1403 blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
1398 struct trace_event *event) 1404 struct trace_event *event)
1399 { 1405 {
1400 return blk_trace_synthesize_old_trace(iter) ? 1406 return blk_trace_synthesize_old_trace(iter) ?
1401 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; 1407 TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
1402 } 1408 }
1403 1409
1404 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) 1410 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
1405 { 1411 {
1406 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC)) 1412 if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
1407 return TRACE_TYPE_UNHANDLED; 1413 return TRACE_TYPE_UNHANDLED;
1408 1414
1409 return print_one_line(iter, true); 1415 return print_one_line(iter, true);
1410 } 1416 }
1411 1417
1412 static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) 1418 static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
1413 { 1419 {
1414 /* don't output context-info for blk_classic output */ 1420 /* don't output context-info for blk_classic output */
1415 if (bit == TRACE_BLK_OPT_CLASSIC) { 1421 if (bit == TRACE_BLK_OPT_CLASSIC) {
1416 if (set) 1422 if (set)
1417 trace_flags &= ~TRACE_ITER_CONTEXT_INFO; 1423 trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
1418 else 1424 else
1419 trace_flags |= TRACE_ITER_CONTEXT_INFO; 1425 trace_flags |= TRACE_ITER_CONTEXT_INFO;
1420 } 1426 }
1421 return 0; 1427 return 0;
1422 } 1428 }
1423 1429
1424 static struct tracer blk_tracer __read_mostly = { 1430 static struct tracer blk_tracer __read_mostly = {
1425 .name = "blk", 1431 .name = "blk",
1426 .init = blk_tracer_init, 1432 .init = blk_tracer_init,
1427 .reset = blk_tracer_reset, 1433 .reset = blk_tracer_reset,
1428 .start = blk_tracer_start, 1434 .start = blk_tracer_start,
1429 .stop = blk_tracer_stop, 1435 .stop = blk_tracer_stop,
1430 .print_header = blk_tracer_print_header, 1436 .print_header = blk_tracer_print_header,
1431 .print_line = blk_tracer_print_line, 1437 .print_line = blk_tracer_print_line,
1432 .flags = &blk_tracer_flags, 1438 .flags = &blk_tracer_flags,
1433 .set_flag = blk_tracer_set_flag, 1439 .set_flag = blk_tracer_set_flag,
1434 }; 1440 };
1435 1441
1436 static struct trace_event_functions trace_blk_event_funcs = { 1442 static struct trace_event_functions trace_blk_event_funcs = {
1437 .trace = blk_trace_event_print, 1443 .trace = blk_trace_event_print,
1438 .binary = blk_trace_event_print_binary, 1444 .binary = blk_trace_event_print_binary,
1439 }; 1445 };
1440 1446
1441 static struct trace_event trace_blk_event = { 1447 static struct trace_event trace_blk_event = {
1442 .type = TRACE_BLK, 1448 .type = TRACE_BLK,
1443 .funcs = &trace_blk_event_funcs, 1449 .funcs = &trace_blk_event_funcs,
1444 }; 1450 };
1445 1451
1446 static int __init init_blk_tracer(void) 1452 static int __init init_blk_tracer(void)
1447 { 1453 {
1448 if (!register_ftrace_event(&trace_blk_event)) { 1454 if (!register_ftrace_event(&trace_blk_event)) {
1449 pr_warning("Warning: could not register block events\n"); 1455 pr_warning("Warning: could not register block events\n");
1450 return 1; 1456 return 1;
1451 } 1457 }
1452 1458
1453 if (register_tracer(&blk_tracer) != 0) { 1459 if (register_tracer(&blk_tracer) != 0) {
1454 pr_warning("Warning: could not register the block tracer\n"); 1460 pr_warning("Warning: could not register the block tracer\n");
1455 unregister_ftrace_event(&trace_blk_event); 1461 unregister_ftrace_event(&trace_blk_event);
1456 return 1; 1462 return 1;
1457 } 1463 }
1458 1464
1459 return 0; 1465 return 0;
1460 } 1466 }
1461 1467
1462 device_initcall(init_blk_tracer); 1468 device_initcall(init_blk_tracer);
1463 1469
1464 static int blk_trace_remove_queue(struct request_queue *q) 1470 static int blk_trace_remove_queue(struct request_queue *q)
1465 { 1471 {
1466 struct blk_trace *bt; 1472 struct blk_trace *bt;
1467 1473
1468 bt = xchg(&q->blk_trace, NULL); 1474 bt = xchg(&q->blk_trace, NULL);
1469 if (bt == NULL) 1475 if (bt == NULL)
1470 return -EINVAL; 1476 return -EINVAL;
1471 1477
1472 if (atomic_dec_and_test(&blk_probes_ref)) 1478 if (atomic_dec_and_test(&blk_probes_ref))
1473 blk_unregister_tracepoints(); 1479 blk_unregister_tracepoints();
1474 1480
1475 blk_trace_free(bt); 1481 blk_trace_free(bt);
1476 return 0; 1482 return 0;
1477 } 1483 }
1478 1484
1479 /* 1485 /*
1480 * Setup everything required to start tracing 1486 * Setup everything required to start tracing
1481 */ 1487 */
1482 static int blk_trace_setup_queue(struct request_queue *q, 1488 static int blk_trace_setup_queue(struct request_queue *q,
1483 struct block_device *bdev) 1489 struct block_device *bdev)
1484 { 1490 {
1485 struct blk_trace *old_bt, *bt = NULL; 1491 struct blk_trace *old_bt, *bt = NULL;
1486 int ret = -ENOMEM; 1492 int ret = -ENOMEM;
1487 1493
1488 bt = kzalloc(sizeof(*bt), GFP_KERNEL); 1494 bt = kzalloc(sizeof(*bt), GFP_KERNEL);
1489 if (!bt) 1495 if (!bt)
1490 return -ENOMEM; 1496 return -ENOMEM;
1491 1497
1492 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char)); 1498 bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
1493 if (!bt->msg_data) 1499 if (!bt->msg_data)
1494 goto free_bt; 1500 goto free_bt;
1495 1501
1496 bt->dev = bdev->bd_dev; 1502 bt->dev = bdev->bd_dev;
1497 bt->act_mask = (u16)-1; 1503 bt->act_mask = (u16)-1;
1498 1504
1499 blk_trace_setup_lba(bt, bdev); 1505 blk_trace_setup_lba(bt, bdev);
1500 1506
1501 old_bt = xchg(&q->blk_trace, bt); 1507 old_bt = xchg(&q->blk_trace, bt);
1502 if (old_bt != NULL) { 1508 if (old_bt != NULL) {
1503 (void)xchg(&q->blk_trace, old_bt); 1509 (void)xchg(&q->blk_trace, old_bt);
1504 ret = -EBUSY; 1510 ret = -EBUSY;
1505 goto free_bt; 1511 goto free_bt;
1506 } 1512 }
1507 1513
1508 if (atomic_inc_return(&blk_probes_ref) == 1) 1514 if (atomic_inc_return(&blk_probes_ref) == 1)
1509 blk_register_tracepoints(); 1515 blk_register_tracepoints();
1510 return 0; 1516 return 0;
1511 1517
1512 free_bt: 1518 free_bt:
1513 blk_trace_free(bt); 1519 blk_trace_free(bt);
1514 return ret; 1520 return ret;
1515 } 1521 }
1516 1522
1517 /* 1523 /*
1518 * sysfs interface to enable and configure tracing 1524 * sysfs interface to enable and configure tracing
1519 */ 1525 */
1520 1526
1521 static ssize_t sysfs_blk_trace_attr_show(struct device *dev, 1527 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1522 struct device_attribute *attr, 1528 struct device_attribute *attr,
1523 char *buf); 1529 char *buf);
1524 static ssize_t sysfs_blk_trace_attr_store(struct device *dev, 1530 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1525 struct device_attribute *attr, 1531 struct device_attribute *attr,
1526 const char *buf, size_t count); 1532 const char *buf, size_t count);
1527 #define BLK_TRACE_DEVICE_ATTR(_name) \ 1533 #define BLK_TRACE_DEVICE_ATTR(_name) \
1528 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \ 1534 DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
1529 sysfs_blk_trace_attr_show, \ 1535 sysfs_blk_trace_attr_show, \
1530 sysfs_blk_trace_attr_store) 1536 sysfs_blk_trace_attr_store)
1531 1537
1532 static BLK_TRACE_DEVICE_ATTR(enable); 1538 static BLK_TRACE_DEVICE_ATTR(enable);
1533 static BLK_TRACE_DEVICE_ATTR(act_mask); 1539 static BLK_TRACE_DEVICE_ATTR(act_mask);
1534 static BLK_TRACE_DEVICE_ATTR(pid); 1540 static BLK_TRACE_DEVICE_ATTR(pid);
1535 static BLK_TRACE_DEVICE_ATTR(start_lba); 1541 static BLK_TRACE_DEVICE_ATTR(start_lba);
1536 static BLK_TRACE_DEVICE_ATTR(end_lba); 1542 static BLK_TRACE_DEVICE_ATTR(end_lba);
1537 1543
1538 static struct attribute *blk_trace_attrs[] = { 1544 static struct attribute *blk_trace_attrs[] = {
1539 &dev_attr_enable.attr, 1545 &dev_attr_enable.attr,
1540 &dev_attr_act_mask.attr, 1546 &dev_attr_act_mask.attr,
1541 &dev_attr_pid.attr, 1547 &dev_attr_pid.attr,
1542 &dev_attr_start_lba.attr, 1548 &dev_attr_start_lba.attr,
1543 &dev_attr_end_lba.attr, 1549 &dev_attr_end_lba.attr,
1544 NULL 1550 NULL
1545 }; 1551 };
1546 1552
1547 struct attribute_group blk_trace_attr_group = { 1553 struct attribute_group blk_trace_attr_group = {
1548 .name = "trace", 1554 .name = "trace",
1549 .attrs = blk_trace_attrs, 1555 .attrs = blk_trace_attrs,
1550 }; 1556 };
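
Because the attribute group is named "trace", registering it via blk_trace_init_sysfs() (further down) makes the five files appear under the block device's sysfs directory, e.g. /sys/block/<disk>/trace/. A hedged usage sketch follows, assuming a placeholder disk "sda", root privileges and CONFIG_BLK_DEV_IO_TRACE=y.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int write_attr(const char *path, const char *val)
{
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
                perror(path);
                return -1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}

int main(void)
{
        /* Writing "1" reaches sysfs_blk_trace_attr_store() and, from there,
         * blk_trace_setup_queue(); writing "0" tears the tracer back down. */
        return write_attr("/sys/block/sda/trace/enable", "1");
}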
1551 1557
1552 static const struct { 1558 static const struct {
1553 int mask; 1559 int mask;
1554 const char *str; 1560 const char *str;
1555 } mask_maps[] = { 1561 } mask_maps[] = {
1556 { BLK_TC_READ, "read" }, 1562 { BLK_TC_READ, "read" },
1557 { BLK_TC_WRITE, "write" }, 1563 { BLK_TC_WRITE, "write" },
1558 { BLK_TC_BARRIER, "barrier" }, 1564 { BLK_TC_BARRIER, "barrier" },
1559 { BLK_TC_SYNC, "sync" }, 1565 { BLK_TC_SYNC, "sync" },
1560 { BLK_TC_QUEUE, "queue" }, 1566 { BLK_TC_QUEUE, "queue" },
1561 { BLK_TC_REQUEUE, "requeue" }, 1567 { BLK_TC_REQUEUE, "requeue" },
1562 { BLK_TC_ISSUE, "issue" }, 1568 { BLK_TC_ISSUE, "issue" },
1563 { BLK_TC_COMPLETE, "complete" }, 1569 { BLK_TC_COMPLETE, "complete" },
1564 { BLK_TC_FS, "fs" }, 1570 { BLK_TC_FS, "fs" },
1565 { BLK_TC_PC, "pc" }, 1571 { BLK_TC_PC, "pc" },
1566 { BLK_TC_AHEAD, "ahead" }, 1572 { BLK_TC_AHEAD, "ahead" },
1567 { BLK_TC_META, "meta" }, 1573 { BLK_TC_META, "meta" },
1568 { BLK_TC_DISCARD, "discard" }, 1574 { BLK_TC_DISCARD, "discard" },
1569 { BLK_TC_DRV_DATA, "drv_data" }, 1575 { BLK_TC_DRV_DATA, "drv_data" },
1570 }; 1576 };
1571 1577
1572 static int blk_trace_str2mask(const char *str) 1578 static int blk_trace_str2mask(const char *str)
1573 { 1579 {
1574 int i; 1580 int i;
1575 int mask = 0; 1581 int mask = 0;
1576 char *buf, *s, *token; 1582 char *buf, *s, *token;
1577 1583
1578 buf = kstrdup(str, GFP_KERNEL); 1584 buf = kstrdup(str, GFP_KERNEL);
1579 if (buf == NULL) 1585 if (buf == NULL)
1580 return -ENOMEM; 1586 return -ENOMEM;
1581 s = strstrip(buf); 1587 s = strstrip(buf);
1582 1588
1583 while (1) { 1589 while (1) {
1584 token = strsep(&s, ","); 1590 token = strsep(&s, ",");
1585 if (token == NULL) 1591 if (token == NULL)
1586 break; 1592 break;
1587 1593
1588 if (*token == '\0') 1594 if (*token == '\0')
1589 continue; 1595 continue;
1590 1596
1591 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { 1597 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1592 if (strcasecmp(token, mask_maps[i].str) == 0) { 1598 if (strcasecmp(token, mask_maps[i].str) == 0) {
1593 mask |= mask_maps[i].mask; 1599 mask |= mask_maps[i].mask;
1594 break; 1600 break;
1595 } 1601 }
1596 } 1602 }
1597 if (i == ARRAY_SIZE(mask_maps)) { 1603 if (i == ARRAY_SIZE(mask_maps)) {
1598 mask = -EINVAL; 1604 mask = -EINVAL;
1599 break; 1605 break;
1600 } 1606 }
1601 } 1607 }
1602 kfree(buf); 1608 kfree(buf);
1603 1609
1604 return mask; 1610 return mask;
1605 } 1611 }
1606 1612
1607 static ssize_t blk_trace_mask2str(char *buf, int mask) 1613 static ssize_t blk_trace_mask2str(char *buf, int mask)
1608 { 1614 {
1609 int i; 1615 int i;
1610 char *p = buf; 1616 char *p = buf;
1611 1617
1612 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) { 1618 for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
1613 if (mask & mask_maps[i].mask) { 1619 if (mask & mask_maps[i].mask) {
1614 p += sprintf(p, "%s%s", 1620 p += sprintf(p, "%s%s",
1615 (p == buf) ? "" : ",", mask_maps[i].str); 1621 (p == buf) ? "" : ",", mask_maps[i].str);
1616 } 1622 }
1617 } 1623 }
1618 *p++ = '\n'; 1624 *p++ = '\n';
1619 1625
1620 return p - buf; 1626 return p - buf;
1621 } 1627 }
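
blk_trace_str2mask() and blk_trace_mask2str() are inverses over the mask_maps[] table: the first strsep()s a comma-separated, case-insensitive list of category names into an OR of BLK_TC_* bits (unknown names yield -EINVAL), the second prints the set bits back as a comma-separated list. The following is a small userspace round-trip mirror with a trimmed-down table; the bit values are arbitrary stand-ins, not the real BLK_TC_* constants, and the kernel version additionally strstrip()s the whole input and terminates the output with '\n' rather than '\0'.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>

static const struct {
        int mask;
        const char *str;
} maps[] = {
        { 1 << 0, "read"  },
        { 1 << 1, "write" },
        { 1 << 2, "sync"  },
};
#define NMAPS ((int)(sizeof(maps) / sizeof(maps[0])))

static int str2mask(const char *str)
{
        char *buf = strdup(str), *s, *token;
        int i, mask = 0;

        if (buf == NULL)
                return -1;
        s = buf;

        while ((token = strsep(&s, ",")) != NULL) {
                if (*token == '\0')             /* tolerate empty items */
                        continue;
                for (i = 0; i < NMAPS; i++) {
                        if (strcasecmp(token, maps[i].str) == 0) {
                                mask |= maps[i].mask;
                                break;
                        }
                }
                if (i == NMAPS) {               /* unknown category name */
                        mask = -1;
                        break;
                }
        }
        free(buf);
        return mask;
}

static void mask2str(char *p, int mask)
{
        char *start = p;
        int i;

        for (i = 0; i < NMAPS; i++) {
                if (mask & maps[i].mask)
                        p += sprintf(p, "%s%s", p == start ? "" : ",",
                                     maps[i].str);
        }
        *p = '\0';                              /* kernel appends '\n' here */
}

int main(void)
{
        char out[64];
        int mask = str2mask("Read,SYNC");       /* case-insensitive, as above */

        mask2str(out, mask);
        printf("mask=%d -> \"%s\"\n", mask, out);   /* mask=5 -> "read,sync" */
        return 0;
}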
1622 1628
1623 static struct request_queue *blk_trace_get_queue(struct block_device *bdev) 1629 static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
1624 { 1630 {
1625 if (bdev->bd_disk == NULL) 1631 if (bdev->bd_disk == NULL)
1626 return NULL; 1632 return NULL;
1627 1633
1628 return bdev_get_queue(bdev); 1634 return bdev_get_queue(bdev);
1629 } 1635 }
1630 1636
1631 static ssize_t sysfs_blk_trace_attr_show(struct device *dev, 1637 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
1632 struct device_attribute *attr, 1638 struct device_attribute *attr,
1633 char *buf) 1639 char *buf)
1634 { 1640 {
1635 struct hd_struct *p = dev_to_part(dev); 1641 struct hd_struct *p = dev_to_part(dev);
1636 struct request_queue *q; 1642 struct request_queue *q;
1637 struct block_device *bdev; 1643 struct block_device *bdev;
1638 ssize_t ret = -ENXIO; 1644 ssize_t ret = -ENXIO;
1639 1645
1640 bdev = bdget(part_devt(p)); 1646 bdev = bdget(part_devt(p));
1641 if (bdev == NULL) 1647 if (bdev == NULL)
1642 goto out; 1648 goto out;
1643 1649
1644 q = blk_trace_get_queue(bdev); 1650 q = blk_trace_get_queue(bdev);
1645 if (q == NULL) 1651 if (q == NULL)
1646 goto out_bdput; 1652 goto out_bdput;
1647 1653
1648 mutex_lock(&bdev->bd_mutex); 1654 mutex_lock(&bdev->bd_mutex);
1649 1655
1650 if (attr == &dev_attr_enable) { 1656 if (attr == &dev_attr_enable) {
1651 ret = sprintf(buf, "%u\n", !!q->blk_trace); 1657 ret = sprintf(buf, "%u\n", !!q->blk_trace);
1652 goto out_unlock_bdev; 1658 goto out_unlock_bdev;
1653 } 1659 }
1654 1660
1655 if (q->blk_trace == NULL) 1661 if (q->blk_trace == NULL)
1656 ret = sprintf(buf, "disabled\n"); 1662 ret = sprintf(buf, "disabled\n");
1657 else if (attr == &dev_attr_act_mask) 1663 else if (attr == &dev_attr_act_mask)
1658 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); 1664 ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
1659 else if (attr == &dev_attr_pid) 1665 else if (attr == &dev_attr_pid)
1660 ret = sprintf(buf, "%u\n", q->blk_trace->pid); 1666 ret = sprintf(buf, "%u\n", q->blk_trace->pid);
1661 else if (attr == &dev_attr_start_lba) 1667 else if (attr == &dev_attr_start_lba)
1662 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); 1668 ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
1663 else if (attr == &dev_attr_end_lba) 1669 else if (attr == &dev_attr_end_lba)
1664 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); 1670 ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
1665 1671
1666 out_unlock_bdev: 1672 out_unlock_bdev:
1667 mutex_unlock(&bdev->bd_mutex); 1673 mutex_unlock(&bdev->bd_mutex);
1668 out_bdput: 1674 out_bdput:
1669 bdput(bdev); 1675 bdput(bdev);
1670 out: 1676 out:
1671 return ret; 1677 return ret;
1672 } 1678 }
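
Read-side companion sketch: the show() handler prints "1"/"0" for enable, "disabled" when no tracer is installed, and otherwise the per-attribute value (act_mask is rendered through blk_trace_mask2str()). A minimal reader for the act_mask file of the same placeholder disk:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        char buf[256];
        ssize_t n;
        int fd = open("/sys/block/sda/trace/act_mask", O_RDONLY);

        if (fd < 0) {
                perror("open");
                return 1;
        }
        n = read(fd, buf, sizeof(buf) - 1);
        if (n > 0) {
                buf[n] = '\0';
                /* Either "disabled\n" or a list such as "read,write,sync\n". */
                fputs(buf, stdout);
        }
        close(fd);
        return 0;
}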
1673 1679
1674 static ssize_t sysfs_blk_trace_attr_store(struct device *dev, 1680 static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
1675 struct device_attribute *attr, 1681 struct device_attribute *attr,
1676 const char *buf, size_t count) 1682 const char *buf, size_t count)
1677 { 1683 {
1678 struct block_device *bdev; 1684 struct block_device *bdev;
1679 struct request_queue *q; 1685 struct request_queue *q;
1680 struct hd_struct *p; 1686 struct hd_struct *p;
1681 u64 value; 1687 u64 value;
1682 ssize_t ret = -EINVAL; 1688 ssize_t ret = -EINVAL;
1683 1689
1684 if (count == 0) 1690 if (count == 0)
1685 goto out; 1691 goto out;
1686 1692
1687 if (attr == &dev_attr_act_mask) { 1693 if (attr == &dev_attr_act_mask) {
1688 if (sscanf(buf, "%llx", &value) != 1) { 1694 if (sscanf(buf, "%llx", &value) != 1) {
1689 /* Assume it is a list of trace category names */ 1695 /* Assume it is a list of trace category names */
1690 ret = blk_trace_str2mask(buf); 1696 ret = blk_trace_str2mask(buf);
1691 if (ret < 0) 1697 if (ret < 0)
1692 goto out; 1698 goto out;
1693 value = ret; 1699 value = ret;
1694 } 1700 }
1695 } else if (sscanf(buf, "%llu", &value) != 1) 1701 } else if (sscanf(buf, "%llu", &value) != 1)
1696 goto out; 1702 goto out;
1697 1703
1698 ret = -ENXIO; 1704 ret = -ENXIO;
1699 1705
1700 p = dev_to_part(dev); 1706 p = dev_to_part(dev);
1701 bdev = bdget(part_devt(p)); 1707 bdev = bdget(part_devt(p));
1702 if (bdev == NULL) 1708 if (bdev == NULL)
1703 goto out; 1709 goto out;
1704 1710
1705 q = blk_trace_get_queue(bdev); 1711 q = blk_trace_get_queue(bdev);
1706 if (q == NULL) 1712 if (q == NULL)
1707 goto out_bdput; 1713 goto out_bdput;
1708 1714
1709 mutex_lock(&bdev->bd_mutex); 1715 mutex_lock(&bdev->bd_mutex);
1710 1716
1711 if (attr == &dev_attr_enable) { 1717 if (attr == &dev_attr_enable) {
1712 if (value) 1718 if (value)
1713 ret = blk_trace_setup_queue(q, bdev); 1719 ret = blk_trace_setup_queue(q, bdev);
1714 else 1720 else
1715 ret = blk_trace_remove_queue(q); 1721 ret = blk_trace_remove_queue(q);
1716 goto out_unlock_bdev; 1722 goto out_unlock_bdev;
1717 } 1723 }
1718 1724
1719 ret = 0; 1725 ret = 0;
1720 if (q->blk_trace == NULL) 1726 if (q->blk_trace == NULL)
1721 ret = blk_trace_setup_queue(q, bdev); 1727 ret = blk_trace_setup_queue(q, bdev);
1722 1728
1723 if (ret == 0) { 1729 if (ret == 0) {
1724 if (attr == &dev_attr_act_mask) 1730 if (attr == &dev_attr_act_mask)
1725 q->blk_trace->act_mask = value; 1731 q->blk_trace->act_mask = value;
1726 else if (attr == &dev_attr_pid) 1732 else if (attr == &dev_attr_pid)
1727 q->blk_trace->pid = value; 1733 q->blk_trace->pid = value;
1728 else if (attr == &dev_attr_start_lba) 1734 else if (attr == &dev_attr_start_lba)
1729 q->blk_trace->start_lba = value; 1735 q->blk_trace->start_lba = value;
1730 else if (attr == &dev_attr_end_lba) 1736 else if (attr == &dev_attr_end_lba)
1731 q->blk_trace->end_lba = value; 1737 q->blk_trace->end_lba = value;
1732 } 1738 }
1733 1739
1734 out_unlock_bdev: 1740 out_unlock_bdev:
1735 mutex_unlock(&bdev->bd_mutex); 1741 mutex_unlock(&bdev->bd_mutex);
1736 out_bdput: 1742 out_bdput:
1737 bdput(bdev); 1743 bdput(bdev);
1738 out: 1744 out:
1739 return ret ? ret : count; 1745 return ret ? ret : count;
1740 } 1746 }
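
Write-side sketch: for act_mask the store() handler first tries a hex number ("%llx") and falls back to blk_trace_str2mask() for a name list, and writing any attribute other than enable implicitly sets up a tracer if none exists yet. "sda" is again only a placeholder device.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static int store_attr(const char *attr, const char *val)
{
        char path[128];
        int fd;

        snprintf(path, sizeof(path), "/sys/block/sda/trace/%s", attr);
        fd = open(path, O_WRONLY);
        if (fd < 0) {
                perror(path);
                return -1;
        }
        if (write(fd, val, strlen(val)) < 0)
                perror("write");
        close(fd);
        return 0;
}

int main(void)
{
        /* Name list: the failed hex parse falls through to blk_trace_str2mask(). */
        store_attr("act_mask", "read,write,sync");
        /* Plain number: parsed with "%llu" in the handler above. */
        store_attr("pid", "0");
        return 0;
}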
1741 1747
1742 int blk_trace_init_sysfs(struct device *dev) 1748 int blk_trace_init_sysfs(struct device *dev)
1743 { 1749 {
1744 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); 1750 return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
1745 } 1751 }
1746 1752
1747 void blk_trace_remove_sysfs(struct device *dev) 1753 void blk_trace_remove_sysfs(struct device *dev)
1748 { 1754 {
1749 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group); 1755 sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
1750 } 1756 }
1751 1757
1752 #endif /* CONFIG_BLK_DEV_IO_TRACE */ 1758 #endif /* CONFIG_BLK_DEV_IO_TRACE */
1753 1759
1754 #ifdef CONFIG_EVENT_TRACING 1760 #ifdef CONFIG_EVENT_TRACING
1755 1761
1756 void blk_dump_cmd(char *buf, struct request *rq) 1762 void blk_dump_cmd(char *buf, struct request *rq)
1757 { 1763 {
1758 int i, end; 1764 int i, end;
1759 int len = rq->cmd_len; 1765 int len = rq->cmd_len;
1760 unsigned char *cmd = rq->cmd; 1766 unsigned char *cmd = rq->cmd;
1761 1767
1762 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) { 1768 if (rq->cmd_type != REQ_TYPE_BLOCK_PC) {
1763 buf[0] = '\0'; 1769 buf[0] = '\0';
1764 return; 1770 return;
1765 } 1771 }
1766 1772
1767 for (end = len - 1; end >= 0; end--) 1773 for (end = len - 1; end >= 0; end--)
1768 if (cmd[end]) 1774 if (cmd[end])
1769 break; 1775 break;
1770 end++; 1776 end++;
1771 1777
1772 for (i = 0; i < len; i++) { 1778 for (i = 0; i < len; i++) {
1773 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]); 1779 buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
1774 if (i == end && end != len - 1) { 1780 if (i == end && end != len - 1) {
1775 sprintf(buf, " .."); 1781 sprintf(buf, " ..");
1776 break; 1782 break;
1777 } 1783 }
1778 } 1784 }
1779 } 1785 }
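
blk_dump_cmd() hex-dumps the raw command bytes of a BLOCK_PC request and collapses all but the first of the trailing zero bytes into " ..". A hedged userspace mirror of the same formatting, fed an arbitrary zero-padded demo buffer rather than a real request:

#include <stdio.h>

static void dump_cmd(char *buf, const unsigned char *cmd, int len)
{
        int i, end;

        /* Find the last non-zero byte; zeros after the next one are elided. */
        for (end = len - 1; end >= 0; end--)
                if (cmd[end])
                        break;
        end++;

        for (i = 0; i < len; i++) {
                buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
                if (i == end && end != len - 1) {
                        sprintf(buf, " ..");
                        break;
                }
        }
}

int main(void)
{
        /* Arbitrary demo bytes, zero-padded to 10 entries. */
        unsigned char cmd[10] = { 0x12, 0x00, 0x00, 0x00, 0x24, 0x00 };
        char line[64];

        dump_cmd(line, cmd, (int)sizeof(cmd));
        printf("%s\n", line);   /* "12 00 00 00 24 00 .." */
        return 0;
}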
1780 1786
1781 void blk_fill_rwbs(char *rwbs, u32 rw, int bytes) 1787 void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
1782 { 1788 {
1783 int i = 0; 1789 int i = 0;
1784 1790
1785 if (rw & WRITE) 1791 if (rw & WRITE)
1786 rwbs[i++] = 'W'; 1792 rwbs[i++] = 'W';
1787 else if (rw & REQ_DISCARD) 1793 else if (rw & REQ_DISCARD)
1788 rwbs[i++] = 'D'; 1794 rwbs[i++] = 'D';
1789 else if (bytes) 1795 else if (bytes)
1790 rwbs[i++] = 'R'; 1796 rwbs[i++] = 'R';
1791 else 1797 else
1792 rwbs[i++] = 'N'; 1798 rwbs[i++] = 'N';
1793 1799
1794 if (rw & REQ_RAHEAD) 1800 if (rw & REQ_RAHEAD)
1795 rwbs[i++] = 'A'; 1801 rwbs[i++] = 'A';
1796 if (rw & REQ_SYNC) 1802 if (rw & REQ_SYNC)
1797 rwbs[i++] = 'S'; 1803 rwbs[i++] = 'S';
1798 if (rw & REQ_META) 1804 if (rw & REQ_META)
1799 rwbs[i++] = 'M'; 1805 rwbs[i++] = 'M';
1800 if (rw & REQ_SECURE) 1806 if (rw & REQ_SECURE)
1801 rwbs[i++] = 'E'; 1807 rwbs[i++] = 'E';
1802 1808
1803 rwbs[i] = '\0'; 1809 rwbs[i] = '\0';
1804 } 1810 }
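
blk_fill_rwbs() condenses a request's direction and flags into the short "rwbs" string that the block trace events print (e.g. "WS" for a synchronous write). Below is a userspace mirror of the same logic; the *_FLAG constants are arbitrary stand-in bits, not the real WRITE/REQ_* values.

#include <stdio.h>

#define WRITE_FLAG    (1u << 0)
#define DISCARD_FLAG  (1u << 1)
#define RAHEAD_FLAG   (1u << 2)
#define SYNC_FLAG     (1u << 3)
#define META_FLAG     (1u << 4)
#define SECURE_FLAG   (1u << 5)

static void fill_rwbs(char *rwbs, unsigned int rw, int bytes)
{
        int i = 0;

        /* Primary action: write, discard, read (if any payload), or none. */
        if (rw & WRITE_FLAG)
                rwbs[i++] = 'W';
        else if (rw & DISCARD_FLAG)
                rwbs[i++] = 'D';
        else if (bytes)
                rwbs[i++] = 'R';
        else
                rwbs[i++] = 'N';

        /* Modifiers, appended in the same order as the kernel function. */
        if (rw & RAHEAD_FLAG)
                rwbs[i++] = 'A';
        if (rw & SYNC_FLAG)
                rwbs[i++] = 'S';
        if (rw & META_FLAG)
                rwbs[i++] = 'M';
        if (rw & SECURE_FLAG)
                rwbs[i++] = 'E';

        rwbs[i] = '\0';
}

int main(void)
{
        char rwbs[8];

        fill_rwbs(rwbs, WRITE_FLAG | SYNC_FLAG, 4096);
        printf("%s\n", rwbs);   /* "WS" */
        return 0;
}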
1805 1811
1806 #endif /* CONFIG_EVENT_TRACING */ 1812 #endif /* CONFIG_EVENT_TRACING */
1807 1813
1808 1814