Commit ed8b752bccf2560e305e25125721d2f0ac759e88

Authored by Mike Snitzer
Committed by Alasdair G Kergon
1 parent 772ae5f54d

dm table: set flush capability based on underlying devices

DM has always advertised both REQ_FLUSH and REQ_FUA flush capabilities
regardless of whether or not a given DM device's underlying devices
also advertised a need for them.

Block's flush-merge changes from 2.6.39 have proven to be more costly
for DM devices.  Performance regressions have been reported even when
DM's underlying devices do not advertise that they have a write cache.

Fix the performance regressions by configuring a DM device's flushing
capabilities based on the flushing capabilities of its underlying devices.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
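
The hunks that make up the 43 additions fall later in dm-table.c than the context
reproduced below, so here is a rough sketch only of the shape of the change the
message describes: iterate each target's underlying devices and advertise
REQ_FLUSH/REQ_FUA on the DM queue only if at least one of those devices can honour
it. The helper names (device_flush_capable, dm_table_supports_flush,
dm_table_set_flush) and their placement are assumptions for illustration, not
necessarily the exact code of this commit; the sketch leans on iterate_devices,
dm_table_get_num_targets() and dm_table_get_target() from this file, plus
blk_queue_flush() and q->flush_flags from the block layer of this era.

/*
 * Hedged sketch only: helper names and placement are illustrative.
 *
 * Per-device callback for ti->type->iterate_devices(): returns non-zero
 * if the underlying queue advertises the requested flush flag(s).
 */
static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
                                sector_t start, sector_t len, void *data)
{
        unsigned flush = *(unsigned *)data;
        struct request_queue *q = bdev_get_queue(dev->bdev);

        return q && (q->flush_flags & flush);
}

/*
 * Ask every target whether at least one of its underlying devices
 * supports the given flush capability (REQ_FLUSH and/or REQ_FUA).
 */
static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
{
        struct dm_target *ti;
        unsigned i = 0;

        while (i < dm_table_get_num_targets(t)) {
                ti = dm_table_get_target(t, i++);

                /* Targets that never issue flushes don't count. */
                if (!ti->num_flush_requests)
                        continue;

                if (ti->type->iterate_devices &&
                    ti->type->iterate_devices(ti, device_flush_capable, &flush))
                        return true;
        }

        return false;
}

/*
 * Assumed to be called from dm_table_set_restrictions(): advertise only
 * what the table's devices can honour, instead of the previous
 * unconditional REQ_FLUSH | REQ_FUA.
 */
static void dm_table_set_flush(struct dm_table *t, struct request_queue *q)
{
        unsigned flush = 0;

        if (dm_table_supports_flush(t, REQ_FLUSH)) {
                flush |= REQ_FLUSH;
                if (dm_table_supports_flush(t, REQ_FUA))
                        flush |= REQ_FUA;
        }

        blk_queue_flush(q, flush);
}

Probing each capability separately keeps REQ_FUA from being advertised on tables
whose devices only support plain flushes.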

Showing 2 changed files with 43 additions and 1 deletion

drivers/md/dm-table.c
/*
 * Copyright (C) 2001 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>

#define DM_MSG_PREFIX "table"

#define MAX_DEPTH 16
#define NODE_SIZE L1_CACHE_BYTES
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

/*
 * The table has always exactly one reference from either mapped_device->map
 * or hash_cell->new_map. This reference is not counted in table->holders.
 * A pair of dm_create_table/dm_destroy_table functions is used for table
 * creation/destruction.
 *
 * Temporary references from the other code increase table->holders. A pair
 * of dm_table_get/dm_table_put functions is used to manipulate it.
 *
 * When the table is about to be destroyed, we wait for table->holders to
 * drop to zero.
 */

struct dm_table {
        struct mapped_device *md;
        atomic_t holders;
        unsigned type;

        /* btree table */
        unsigned int depth;
        unsigned int counts[MAX_DEPTH]; /* in nodes */
        sector_t *index[MAX_DEPTH];

        unsigned int num_targets;
        unsigned int num_allocated;
        sector_t *highs;
        struct dm_target *targets;

        unsigned integrity_supported:1;

        /*
         * Indicates the rw permissions for the new logical
         * device. This should be a combination of FMODE_READ
         * and FMODE_WRITE.
         */
        fmode_t mode;

        /* a list of devices used by this table */
        struct list_head devices;

        /* events get handed up using this callback */
        void (*event_fn)(void *);
        void *event_context;

        struct dm_md_mempools *mempools;

        struct list_head target_callbacks;
};

/*
 * Similar to ceiling(log_size(n))
 */
static unsigned int int_log(unsigned int n, unsigned int base)
{
        int result = 0;

        while (n > 1) {
                n = dm_div_up(n, base);
                result++;
        }

        return result;
}

/*
 * Calculate the index of the child node of the n'th node k'th key.
 */
static inline unsigned int get_child(unsigned int n, unsigned int k)
{
        return (n * CHILDREN_PER_NODE) + k;
}

/*
 * Return the n'th node of level l from table t.
 */
static inline sector_t *get_node(struct dm_table *t,
                                 unsigned int l, unsigned int n)
{
        return t->index[l] + (n * KEYS_PER_NODE);
}

/*
 * Return the highest key that you could lookup from the n'th
 * node on level l of the btree.
 */
static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
{
        for (; l < t->depth - 1; l++)
                n = get_child(n, CHILDREN_PER_NODE - 1);

        if (n >= t->counts[l])
                return (sector_t) - 1;

        return get_node(t, l, n)[KEYS_PER_NODE - 1];
}

/*
 * Fills in a level of the btree based on the highs of the level
 * below it.
 */
static int setup_btree_index(unsigned int l, struct dm_table *t)
{
        unsigned int n, k;
        sector_t *node;

        for (n = 0U; n < t->counts[l]; n++) {
                node = get_node(t, l, n);

                for (k = 0U; k < KEYS_PER_NODE; k++)
                        node[k] = high(t, l + 1, get_child(n, k));
        }

        return 0;
}

void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
{
        unsigned long size;
        void *addr;

        /*
         * Check that we're not going to overflow.
         */
        if (nmemb > (ULONG_MAX / elem_size))
                return NULL;

        size = nmemb * elem_size;
        addr = vzalloc(size);

        return addr;
}
EXPORT_SYMBOL(dm_vcalloc);

/*
 * highs, and targets are managed as dynamic arrays during a
 * table load.
 */
static int alloc_targets(struct dm_table *t, unsigned int num)
{
        sector_t *n_highs;
        struct dm_target *n_targets;
        int n = t->num_targets;

        /*
         * Allocate both the target array and offset array at once.
         * Append an empty entry to catch sectors beyond the end of
         * the device.
         */
        n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
                                          sizeof(sector_t));
        if (!n_highs)
                return -ENOMEM;

        n_targets = (struct dm_target *) (n_highs + num);

        if (n) {
                memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
                memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
        }

        memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
        vfree(t->highs);

        t->num_allocated = num;
        t->highs = n_highs;
        t->targets = n_targets;

        return 0;
}

int dm_table_create(struct dm_table **result, fmode_t mode,
                    unsigned num_targets, struct mapped_device *md)
{
        struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

        if (!t)
                return -ENOMEM;

        INIT_LIST_HEAD(&t->devices);
        INIT_LIST_HEAD(&t->target_callbacks);
        atomic_set(&t->holders, 0);

        if (!num_targets)
                num_targets = KEYS_PER_NODE;

        num_targets = dm_round_up(num_targets, KEYS_PER_NODE);

        if (alloc_targets(t, num_targets)) {
                kfree(t);
                t = NULL;
                return -ENOMEM;
        }

        t->mode = mode;
        t->md = md;
        *result = t;
        return 0;
}

static void free_devices(struct list_head *devices)
{
        struct list_head *tmp, *next;

        list_for_each_safe(tmp, next, devices) {
                struct dm_dev_internal *dd =
                    list_entry(tmp, struct dm_dev_internal, list);
                DMWARN("dm_table_destroy: dm_put_device call missing for %s",
                       dd->dm_dev.name);
                kfree(dd);
        }
}

void dm_table_destroy(struct dm_table *t)
{
        unsigned int i;

        if (!t)
                return;

        while (atomic_read(&t->holders))
                msleep(1);
        smp_mb();

        /* free the indexes */
        if (t->depth >= 2)
                vfree(t->index[t->depth - 2]);

        /* free the targets */
        for (i = 0; i < t->num_targets; i++) {
                struct dm_target *tgt = t->targets + i;

                if (tgt->type->dtr)
                        tgt->type->dtr(tgt);

                dm_put_target_type(tgt->type);
        }

        vfree(t->highs);

        /* free the device list */
        if (t->devices.next != &t->devices)
                free_devices(&t->devices);

        dm_free_md_mempools(t->mempools);

        kfree(t);
}

void dm_table_get(struct dm_table *t)
{
        atomic_inc(&t->holders);
}
EXPORT_SYMBOL(dm_table_get);

void dm_table_put(struct dm_table *t)
{
        if (!t)
                return;

        smp_mb__before_atomic_dec();
        atomic_dec(&t->holders);
}
EXPORT_SYMBOL(dm_table_put);

/*
 * Checks to see if we need to extend highs or targets.
 */
static inline int check_space(struct dm_table *t)
{
        if (t->num_targets >= t->num_allocated)
                return alloc_targets(t, t->num_allocated * 2);

        return 0;
}

/*
 * See if we've already got a device in the list.
 */
static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
{
        struct dm_dev_internal *dd;

        list_for_each_entry (dd, l, list)
                if (dd->dm_dev.bdev->bd_dev == dev)
                        return dd;

        return NULL;
}

/*
 * Open a device so we can use it as a map destination.
 */
static int open_dev(struct dm_dev_internal *d, dev_t dev,
                    struct mapped_device *md)
{
        static char *_claim_ptr = "I belong to device-mapper";
        struct block_device *bdev;

        int r;

        BUG_ON(d->dm_dev.bdev);

        bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        r = bd_link_disk_holder(bdev, dm_disk(md));
        if (r) {
                blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
                return r;
        }

        d->dm_dev.bdev = bdev;
        return 0;
}

/*
 * Close a device that we've been using.
 */
static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
{
        if (!d->dm_dev.bdev)
                return;

        bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md));
        blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
        d->dm_dev.bdev = NULL;
}

/*
 * If possible, this checks an area of a destination device is invalid.
 */
static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
                                  sector_t start, sector_t len, void *data)
{
        struct request_queue *q;
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
        sector_t dev_size =
                i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
        unsigned short logical_block_size_sectors =
                limits->logical_block_size >> SECTOR_SHIFT;
        char b[BDEVNAME_SIZE];

        /*
         * Some devices exist without request functions,
         * such as loop devices not yet bound to backing files.
         * Forbid the use of such devices.
         */
        q = bdev_get_queue(bdev);
        if (!q || !q->make_request_fn) {
                DMWARN("%s: %s is not yet initialised: "
                       "start=%llu, len=%llu, dev_size=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       (unsigned long long)start,
                       (unsigned long long)len,
                       (unsigned long long)dev_size);
                return 1;
        }

        if (!dev_size)
                return 0;

        if ((start >= dev_size) || (start + len > dev_size)) {
                DMWARN("%s: %s too small for target: "
                       "start=%llu, len=%llu, dev_size=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       (unsigned long long)start,
                       (unsigned long long)len,
                       (unsigned long long)dev_size);
                return 1;
        }

        if (logical_block_size_sectors <= 1)
                return 0;

        if (start & (logical_block_size_sectors - 1)) {
                DMWARN("%s: start=%llu not aligned to h/w "
                       "logical block size %u of %s",
                       dm_device_name(ti->table->md),
                       (unsigned long long)start,
                       limits->logical_block_size, bdevname(bdev, b));
                return 1;
        }

        if (len & (logical_block_size_sectors - 1)) {
                DMWARN("%s: len=%llu not aligned to h/w "
                       "logical block size %u of %s",
                       dm_device_name(ti->table->md),
                       (unsigned long long)len,
                       limits->logical_block_size, bdevname(bdev, b));
                return 1;
        }

        return 0;
}

/*
 * This upgrades the mode on an already open dm_dev, being
 * careful to leave things as they were if we fail to reopen the
 * device and not to touch the existing bdev field in case
 * it is accessed concurrently inside dm_table_any_congested().
 */
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
                        struct mapped_device *md)
{
        int r;
        struct dm_dev_internal dd_new, dd_old;

        dd_new = dd_old = *dd;

        dd_new.dm_dev.mode |= new_mode;
        dd_new.dm_dev.bdev = NULL;

        r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
        if (r)
                return r;

        dd->dm_dev.mode |= new_mode;
        close_dev(&dd_old, md);

        return 0;
}

/*
 * Add a device to the list, or just increment the usage count if
 * it's already present.
 */
int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
                  struct dm_dev **result)
{
        int r;
        dev_t uninitialized_var(dev);
        struct dm_dev_internal *dd;
        unsigned int major, minor;
        struct dm_table *t = ti->table;

        BUG_ON(!t);

        if (sscanf(path, "%u:%u", &major, &minor) == 2) {
                /* Extract the major/minor numbers */
                dev = MKDEV(major, minor);
                if (MAJOR(dev) != major || MINOR(dev) != minor)
                        return -EOVERFLOW;
        } else {
                /* convert the path to a device */
                struct block_device *bdev = lookup_bdev(path);

                if (IS_ERR(bdev))
                        return PTR_ERR(bdev);
                dev = bdev->bd_dev;
                bdput(bdev);
        }

        dd = find_device(&t->devices, dev);
        if (!dd) {
                dd = kmalloc(sizeof(*dd), GFP_KERNEL);
                if (!dd)
                        return -ENOMEM;

                dd->dm_dev.mode = mode;
                dd->dm_dev.bdev = NULL;

                if ((r = open_dev(dd, dev, t->md))) {
                        kfree(dd);
                        return r;
                }

                format_dev_t(dd->dm_dev.name, dev);

                atomic_set(&dd->count, 0);
                list_add(&dd->list, &t->devices);

        } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) {
                r = upgrade_mode(dd, mode, t->md);
                if (r)
                        return r;
        }
        atomic_inc(&dd->count);

        *result = &dd->dm_dev;
        return 0;
}
EXPORT_SYMBOL(dm_get_device);

int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
                         sector_t start, sector_t len, void *data)
{
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
        struct request_queue *q = bdev_get_queue(bdev);
        char b[BDEVNAME_SIZE];

        if (unlikely(!q)) {
                DMWARN("%s: Cannot set limits for nonexistent device %s",
                       dm_device_name(ti->table->md), bdevname(bdev, b));
                return 0;
        }

        if (bdev_stack_limits(limits, bdev, start) < 0)
                DMWARN("%s: adding target device %s caused an alignment inconsistency: "
                       "physical_block_size=%u, logical_block_size=%u, "
                       "alignment_offset=%u, start=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       q->limits.physical_block_size,
                       q->limits.logical_block_size,
                       q->limits.alignment_offset,
                       (unsigned long long) start << SECTOR_SHIFT);

        /*
         * Check if merge fn is supported.
         * If not we'll force DM to use PAGE_SIZE or
         * smaller I/O, just to be safe.
         */
        if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
                blk_limits_max_hw_sectors(limits,
                                          (unsigned int) (PAGE_SIZE >> 9));
        return 0;
}
EXPORT_SYMBOL_GPL(dm_set_device_limits);

/*
 * Decrement a device's use count and remove it if necessary.
 */
void dm_put_device(struct dm_target *ti, struct dm_dev *d)
{
        struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal,
                                                  dm_dev);

        if (atomic_dec_and_test(&dd->count)) {
                close_dev(dd, ti->table->md);
                list_del(&dd->list);
                kfree(dd);
        }
}
EXPORT_SYMBOL(dm_put_device);

/*
 * Checks to see if the target joins onto the end of the table.
 */
static int adjoin(struct dm_table *table, struct dm_target *ti)
{
        struct dm_target *prev;

        if (!table->num_targets)
                return !ti->begin;

        prev = &table->targets[table->num_targets - 1];
        return (ti->begin == (prev->begin + prev->len));
}

/*
 * Used to dynamically allocate the arg array.
 */
static char **realloc_argv(unsigned *array_size, char **old_argv)
{
        char **argv;
        unsigned new_size;

        new_size = *array_size ? *array_size * 2 : 64;
        argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
        if (argv) {
                memcpy(argv, old_argv, *array_size * sizeof(*argv));
                *array_size = new_size;
        }

        kfree(old_argv);
        return argv;
}

/*
 * Destructively splits up the argument list to pass to ctr.
 */
int dm_split_args(int *argc, char ***argvp, char *input)
{
        char *start, *end = input, *out, **argv = NULL;
        unsigned array_size = 0;

        *argc = 0;

        if (!input) {
                *argvp = NULL;
                return 0;
        }

        argv = realloc_argv(&array_size, argv);
        if (!argv)
                return -ENOMEM;

        while (1) {
                /* Skip whitespace */
                start = skip_spaces(end);

                if (!*start)
                        break;  /* success, we hit the end */

                /* 'out' is used to remove any back-quotes */
                end = out = start;
                while (*end) {
                        /* Everything apart from '\0' can be quoted */
                        if (*end == '\\' && *(end + 1)) {
                                *out++ = *(end + 1);
                                end += 2;
                                continue;
                        }

                        if (isspace(*end))
                                break;  /* end of token */

                        *out++ = *end++;
                }

                /* have we already filled the array ? */
                if ((*argc + 1) > array_size) {
                        argv = realloc_argv(&array_size, argv);
                        if (!argv)
                                return -ENOMEM;
                }

                /* we know this is whitespace */
                if (*end)
                        end++;

                /* terminate the string and put it in the array */
                *out = '\0';
                argv[*argc] = start;
                (*argc)++;
        }

        *argvp = argv;
        return 0;
}

/*
 * Impose necessary and sufficient conditions on a devices's table such
 * that any incoming bio which respects its logical_block_size can be
 * processed successfully. If it falls across the boundary between
 * two or more targets, the size of each piece it gets split into must
 * be compatible with the logical_block_size of the target processing it.
 */
static int validate_hardware_logical_block_alignment(struct dm_table *table,
                                                     struct queue_limits *limits)
{
        /*
         * This function uses arithmetic modulo the logical_block_size
         * (in units of 512-byte sectors).
         */
        unsigned short device_logical_block_size_sects =
                limits->logical_block_size >> SECTOR_SHIFT;

        /*
         * Offset of the start of the next table entry, mod logical_block_size.
         */
        unsigned short next_target_start = 0;

        /*
         * Given an aligned bio that extends beyond the end of a
         * target, how many sectors must the next target handle?
         */
        unsigned short remaining = 0;

        struct dm_target *uninitialized_var(ti);
        struct queue_limits ti_limits;
        unsigned i = 0;

        /*
         * Check each entry in the table in turn.
         */
        while (i < dm_table_get_num_targets(table)) {
                ti = dm_table_get_target(table, i++);

                blk_set_default_limits(&ti_limits);

                /* combine all target devices' limits */
                if (ti->type->iterate_devices)
                        ti->type->iterate_devices(ti, dm_set_device_limits,
                                                  &ti_limits);

                /*
                 * If the remaining sectors fall entirely within this
                 * table entry are they compatible with its logical_block_size?
                 */
                if (remaining < ti->len &&
                    remaining & ((ti_limits.logical_block_size >>
                                  SECTOR_SHIFT) - 1))
                        break;  /* Error */

                next_target_start =
                    (unsigned short) ((next_target_start + ti->len) &
                                      (device_logical_block_size_sects - 1));
                remaining = next_target_start ?
                    device_logical_block_size_sects - next_target_start : 0;
        }

        if (remaining) {
                DMWARN("%s: table line %u (start sect %llu len %llu) "
                       "not aligned to h/w logical block size %u",
                       dm_device_name(table->md), i,
                       (unsigned long long) ti->begin,
                       (unsigned long long) ti->len,
                       limits->logical_block_size);
                return -EINVAL;
        }

        return 0;
}

int dm_table_add_target(struct dm_table *t, const char *type,
                        sector_t start, sector_t len, char *params)
{
        int r = -EINVAL, argc;
        char **argv;
        struct dm_target *tgt;

        if ((r = check_space(t)))
                return r;

        tgt = t->targets + t->num_targets;
        memset(tgt, 0, sizeof(*tgt));

        if (!len) {
                DMERR("%s: zero-length target", dm_device_name(t->md));
                return -EINVAL;
        }

        tgt->type = dm_get_target_type(type);
        if (!tgt->type) {
                DMERR("%s: %s: unknown target type", dm_device_name(t->md),
                      type);
                return -EINVAL;
        }

        tgt->table = t;
        tgt->begin = start;
        tgt->len = len;
        tgt->error = "Unknown error";

        /*
         * Does this target adjoin the previous one ?
         */
        if (!adjoin(t, tgt)) {
                tgt->error = "Gap in table";
                r = -EINVAL;
                goto bad;
        }

        r = dm_split_args(&argc, &argv, params);
        if (r) {
                tgt->error = "couldn't split parameters (insufficient memory)";
                goto bad;
        }

        r = tgt->type->ctr(tgt, argc, argv);
        kfree(argv);
        if (r)
                goto bad;

        t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;

        if (!tgt->num_discard_requests && tgt->discards_supported)
                DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
                       dm_device_name(t->md), type);

        return 0;

 bad:
        DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
        dm_put_target_type(tgt->type);
        return r;
}

/*
 * Target argument parsing helpers.
 */
static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
                             unsigned *value, char **error, unsigned grouped)
{
        const char *arg_str = dm_shift_arg(arg_set);

        if (!arg_str ||
            (sscanf(arg_str, "%u", value) != 1) ||
            (*value < arg->min) ||
            (*value > arg->max) ||
            (grouped && arg_set->argc < *value)) {
                *error = arg->error;
                return -EINVAL;
        }

        return 0;
}

int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
                unsigned *value, char **error)
{
        return validate_next_arg(arg, arg_set, value, error, 0);
}
EXPORT_SYMBOL(dm_read_arg);

int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
                      unsigned *value, char **error)
{
        return validate_next_arg(arg, arg_set, value, error, 1);
}
EXPORT_SYMBOL(dm_read_arg_group);

const char *dm_shift_arg(struct dm_arg_set *as)
{
        char *r;

        if (as->argc) {
                as->argc--;
                r = *as->argv;
                as->argv++;
                return r;
        }

        return NULL;
}
EXPORT_SYMBOL(dm_shift_arg);

void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
{
        BUG_ON(as->argc < num_args);
        as->argc -= num_args;
        as->argv += num_args;
}
EXPORT_SYMBOL(dm_consume_args);

static int dm_table_set_type(struct dm_table *t)
{
        unsigned i;
        unsigned bio_based = 0, request_based = 0;
        struct dm_target *tgt;
        struct dm_dev_internal *dd;
        struct list_head *devices;

        for (i = 0; i < t->num_targets; i++) {
                tgt = t->targets + i;
                if (dm_target_request_based(tgt))
                        request_based = 1;
                else
                        bio_based = 1;

                if (bio_based && request_based) {
                        DMWARN("Inconsistent table: different target types"
                               " can't be mixed up");
                        return -EINVAL;
                }
        }

        if (bio_based) {
                /* We must use this table as bio-based */
                t->type = DM_TYPE_BIO_BASED;
                return 0;
        }

        BUG_ON(!request_based); /* No targets in this table */

        /* Non-request-stackable devices can't be used for request-based dm */
        devices = dm_table_get_devices(t);
        list_for_each_entry(dd, devices, list) {
                if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
                        DMWARN("table load rejected: including"
                               " non-request-stackable devices");
                        return -EINVAL;
                }
        }

        /*
         * Request-based dm supports only tables that have a single target now.
         * To support multiple targets, request splitting support is needed,
         * and that needs lots of changes in the block-layer.
         * (e.g. request completion process for partial completion.)
         */
        if (t->num_targets > 1) {
                DMWARN("Request-based dm doesn't support multiple targets yet");
                return -EINVAL;
        }

        t->type = DM_TYPE_REQUEST_BASED;

        return 0;
}

unsigned dm_table_get_type(struct dm_table *t)
{
        return t->type;
}

bool dm_table_request_based(struct dm_table *t)
{
        return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
}

int dm_table_alloc_md_mempools(struct dm_table *t)
{
        unsigned type = dm_table_get_type(t);

        if (unlikely(type == DM_TYPE_NONE)) {
                DMWARN("no table type is set, can't allocate mempools");
                return -EINVAL;
        }

        t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
        if (!t->mempools)
                return -ENOMEM;

        return 0;
}

void dm_table_free_md_mempools(struct dm_table *t)
{
        dm_free_md_mempools(t->mempools);
        t->mempools = NULL;
}

struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
{
        return t->mempools;
}

static int setup_indexes(struct dm_table *t)
{
        int i;
        unsigned int total = 0;
        sector_t *indexes;

        /* allocate the space for *all* the indexes */
        for (i = t->depth - 2; i >= 0; i--) {
                t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
                total += t->counts[i];
        }

        indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
        if (!indexes)
                return -ENOMEM;

        /* set up internal nodes, bottom-up */
        for (i = t->depth - 2; i >= 0; i--) {
                t->index[i] = indexes;
                indexes += (KEYS_PER_NODE * t->counts[i]);
                setup_btree_index(i, t);
        }

        return 0;
}

/*
 * Builds the btree to index the map.
 */
static int dm_table_build_index(struct dm_table *t)
{
        int r = 0;
        unsigned int leaf_nodes;

        /* how many indexes will the btree have ? */
        leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
        t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);

        /* leaf layer has already been set up */
        t->counts[t->depth - 1] = leaf_nodes;
        t->index[t->depth - 1] = t->highs;

        if (t->depth >= 2)
                r = setup_indexes(t);

        return r;
}

/*
 * Get a disk whose integrity profile reflects the table's profile.
 * If %match_all is true, all devices' profiles must match.
 * If %match_all is false, all devices must at least have an
 * allocated integrity profile; but uninitialized is ok.
 * Returns NULL if integrity support was inconsistent or unavailable.
 */
static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
                                                    bool match_all)
{
        struct list_head *devices = dm_table_get_devices(t);
        struct dm_dev_internal *dd = NULL;
        struct gendisk *prev_disk = NULL, *template_disk = NULL;

        list_for_each_entry(dd, devices, list) {
                template_disk = dd->dm_dev.bdev->bd_disk;
                if (!blk_get_integrity(template_disk))
                        goto no_integrity;
                if (!match_all && !blk_integrity_is_initialized(template_disk))
                        continue; /* skip uninitialized profiles */
                else if (prev_disk &&
                         blk_integrity_compare(prev_disk, template_disk) < 0)
                        goto no_integrity;
                prev_disk = template_disk;
        }

        return template_disk;

no_integrity:
        if (prev_disk)
                DMWARN("%s: integrity not set: %s and %s profile mismatch",
                       dm_device_name(t->md),
                       prev_disk->disk_name,
                       template_disk->disk_name);
        return NULL;
}

/*
 * Register the mapped device for blk_integrity support if
 * the underlying devices have an integrity profile. But all devices
 * may not have matching profiles (checking all devices isn't reliable
 * during table load because this table may use other DM device(s) which
 * must be resumed before they will have an initialized integity profile).
 * Stacked DM devices force a 2 stage integrity profile validation:
 * 1 - during load, validate all initialized integrity profiles match
 * 2 - during resume, validate all integrity profiles match
 */
static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
{
        struct gendisk *template_disk = NULL;

        template_disk = dm_table_get_integrity_disk(t, false);
        if (!template_disk)
                return 0;

        if (!blk_integrity_is_initialized(dm_disk(md))) {
                t->integrity_supported = 1;
                return blk_integrity_register(dm_disk(md), NULL);
        }

        /*
         * If DM device already has an initalized integrity
         * profile the new profile should not conflict.
         */
        if (blk_integrity_is_initialized(template_disk) &&
            blk_integrity_compare(dm_disk(md), template_disk) < 0) {
                DMWARN("%s: conflict with existing integrity profile: "
                       "%s profile mismatch",
                       dm_device_name(t->md),
                       template_disk->disk_name);
                return 1;
        }

        /* Preserve existing initialized integrity profile */
        t->integrity_supported = 1;
        return 0;
}

/*
 * Prepares the table for use by building the indices,
 * setting the type, and allocating mempools.
 */
int dm_table_complete(struct dm_table *t)
{
        int r;

        r = dm_table_set_type(t);
        if (r) {
                DMERR("unable to set table type");
                return r;
        }

        r = dm_table_build_index(t);
        if (r) {
                DMERR("unable to build btrees");
                return r;
        }

        r = dm_table_prealloc_integrity(t, t->md);
        if (r) {
                DMERR("could not register integrity profile.");
                return r;
        }

        r = dm_table_alloc_md_mempools(t);
        if (r)
                DMERR("unable to allocate mempools");

        return r;
}

static DEFINE_MUTEX(_event_lock);
void dm_table_event_callback(struct dm_table *t,
                             void (*fn)(void *), void *context)
{
        mutex_lock(&_event_lock);
        t->event_fn = fn;
        t->event_context = context;
        mutex_unlock(&_event_lock);
}

void dm_table_event(struct dm_table *t)
{
        /*
         * You can no longer call dm_table_event() from interrupt
         * context, use a bottom half instead.
         */
        BUG_ON(in_interrupt());

        mutex_lock(&_event_lock);
        if (t->event_fn)
                t->event_fn(t->event_context);
        mutex_unlock(&_event_lock);
}
EXPORT_SYMBOL(dm_table_event);

sector_t dm_table_get_size(struct dm_table *t)
{
        return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
EXPORT_SYMBOL(dm_table_get_size);

struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
{
        if (index >= t->num_targets)
                return NULL;

        return t->targets + index;
}

/*
 * Search the btree for the correct target.
 *
 * Caller should check returned pointer with dm_target_is_valid()
 * to trap I/O beyond end of device.
 */
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
{
        unsigned int l, n = 0, k = 0;
1157 sector_t *node; 1157 sector_t *node;
1158 1158
1159 for (l = 0; l < t->depth; l++) { 1159 for (l = 0; l < t->depth; l++) {
1160 n = get_child(n, k); 1160 n = get_child(n, k);
1161 node = get_node(t, l, n); 1161 node = get_node(t, l, n);
1162 1162
1163 for (k = 0; k < KEYS_PER_NODE; k++) 1163 for (k = 0; k < KEYS_PER_NODE; k++)
1164 if (node[k] >= sector) 1164 if (node[k] >= sector)
1165 break; 1165 break;
1166 } 1166 }
1167 1167
1168 return &t->targets[(KEYS_PER_NODE * n) + k]; 1168 return &t->targets[(KEYS_PER_NODE * n) + k];
1169 } 1169 }
1170 1170
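dm_table_find_target() above descends one b-tree node per level and stops at the first key that is >= the requested sector; the keys are the targets' last ("high") sectors. The sketch below models the equivalent flat lookup that the tree accelerates; the array and helper name are illustrative, not kernel code.

#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Target 0 starts at sector 0 and targets are contiguous, so target i covers
 * (highs[i-1], highs[i]].  Return the index of the first target whose last
 * sector is >= the lookup sector, or n if the sector is past the end of the
 * device (the case dm_target_is_valid() is meant to trap).
 */
static unsigned find_target_index(const sector_t *highs, unsigned n, sector_t sector)
{
	unsigned i;

	for (i = 0; i < n; i++)
		if (highs[i] >= sector)
			break;
	return i;
}

int main(void)
{
	sector_t highs[] = { 99, 199, 1023 };	/* three targets ending at these sectors */

	printf("%u\n", find_target_index(highs, 3, 150));	/* 1 */
	printf("%u\n", find_target_index(highs, 3, 2048));	/* 3: beyond the end of the device */
	return 0;
}
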
1171 /* 1171 /*
1172 * Establish the new table's queue_limits and validate them. 1172 * Establish the new table's queue_limits and validate them.
1173 */ 1173 */
1174 int dm_calculate_queue_limits(struct dm_table *table, 1174 int dm_calculate_queue_limits(struct dm_table *table,
1175 struct queue_limits *limits) 1175 struct queue_limits *limits)
1176 { 1176 {
1177 struct dm_target *uninitialized_var(ti); 1177 struct dm_target *uninitialized_var(ti);
1178 struct queue_limits ti_limits; 1178 struct queue_limits ti_limits;
1179 unsigned i = 0; 1179 unsigned i = 0;
1180 1180
1181 blk_set_default_limits(limits); 1181 blk_set_default_limits(limits);
1182 1182
1183 while (i < dm_table_get_num_targets(table)) { 1183 while (i < dm_table_get_num_targets(table)) {
1184 blk_set_default_limits(&ti_limits); 1184 blk_set_default_limits(&ti_limits);
1185 1185
1186 ti = dm_table_get_target(table, i++); 1186 ti = dm_table_get_target(table, i++);
1187 1187
1188 if (!ti->type->iterate_devices) 1188 if (!ti->type->iterate_devices)
1189 goto combine_limits; 1189 goto combine_limits;
1190 1190
1191 /* 1191 /*
1192 * Combine queue limits of all the devices this target uses. 1192 * Combine queue limits of all the devices this target uses.
1193 */ 1193 */
1194 ti->type->iterate_devices(ti, dm_set_device_limits, 1194 ti->type->iterate_devices(ti, dm_set_device_limits,
1195 &ti_limits); 1195 &ti_limits);
1196 1196
1197 /* Set I/O hints portion of queue limits */ 1197 /* Set I/O hints portion of queue limits */
1198 if (ti->type->io_hints) 1198 if (ti->type->io_hints)
1199 ti->type->io_hints(ti, &ti_limits); 1199 ti->type->io_hints(ti, &ti_limits);
1200 1200
1201 /* 1201 /*
1202 * Check each device area is consistent with the target's 1202 * Check each device area is consistent with the target's
1203 * overall queue limits. 1203 * overall queue limits.
1204 */ 1204 */
1205 if (ti->type->iterate_devices(ti, device_area_is_invalid, 1205 if (ti->type->iterate_devices(ti, device_area_is_invalid,
1206 &ti_limits)) 1206 &ti_limits))
1207 return -EINVAL; 1207 return -EINVAL;
1208 1208
1209 combine_limits: 1209 combine_limits:
1210 /* 1210 /*
1211 * Merge this target's queue limits into the overall limits 1211 * Merge this target's queue limits into the overall limits
1212 * for the table. 1212 * for the table.
1213 */ 1213 */
1214 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1214 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1215 DMWARN("%s: adding target device " 1215 DMWARN("%s: adding target device "
1216 "(start sect %llu len %llu) " 1216 "(start sect %llu len %llu) "
1217 "caused an alignment inconsistency", 1217 "caused an alignment inconsistency",
1218 dm_device_name(table->md), 1218 dm_device_name(table->md),
1219 (unsigned long long) ti->begin, 1219 (unsigned long long) ti->begin,
1220 (unsigned long long) ti->len); 1220 (unsigned long long) ti->len);
1221 } 1221 }
1222 1222
1223 return validate_hardware_logical_block_alignment(table, limits); 1223 return validate_hardware_logical_block_alignment(table, limits);
1224 } 1224 }
1225 1225
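dm_calculate_queue_limits() above starts each target from default limits, lets the target's iterate_devices and io_hints callbacks fill them in, and then folds the result into the table-wide limits with blk_stack_limits(), which broadly keeps the most restrictive combination and flags alignment problems. Below is a rough user-space sketch of that folding with just two toy fields; the real struct queue_limits carries many more and the real helper also tracks alignment.

#include <stdio.h>

/* Toy subset of queue limits; field names are illustrative. */
struct toy_limits {
	unsigned max_sectors;		/* upper bound on I/O size: stacking takes the minimum */
	unsigned logical_block_size;	/* smallest addressable unit: stacking takes the maximum */
};

static void toy_stack_limits(struct toy_limits *t, const struct toy_limits *b)
{
	if (b->max_sectors < t->max_sectors)
		t->max_sectors = b->max_sectors;
	if (b->logical_block_size > t->logical_block_size)
		t->logical_block_size = b->logical_block_size;
}

int main(void)
{
	struct toy_limits table = { .max_sectors = ~0u, .logical_block_size = 512 };
	struct toy_limits dev_a = { .max_sectors = 1024, .logical_block_size = 512 };
	struct toy_limits dev_b = { .max_sectors = 2048, .logical_block_size = 4096 };

	toy_stack_limits(&table, &dev_a);
	toy_stack_limits(&table, &dev_b);
	printf("max_sectors=%u logical_block_size=%u\n",
	       table.max_sectors, table.logical_block_size);	/* 1024, 4096 */
	return 0;
}
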
1226 /* 1226 /*
1227 * Set the integrity profile for this device if all devices used have 1227 * Set the integrity profile for this device if all devices used have
1228 * matching profiles. We're quite deep in the resume path but still 1228 * matching profiles. We're quite deep in the resume path but still
1229 * don't know if all devices (particularly DM devices this device 1229 * don't know if all devices (particularly DM devices this device
1230 * may be stacked on) have matching profiles. Even if the profiles 1230 * may be stacked on) have matching profiles. Even if the profiles
1231 * don't match we have no way to fail (to resume) at this point. 1231 * don't match we have no way to fail (to resume) at this point.
1232 */ 1232 */
1233 static void dm_table_set_integrity(struct dm_table *t) 1233 static void dm_table_set_integrity(struct dm_table *t)
1234 { 1234 {
1235 struct gendisk *template_disk = NULL; 1235 struct gendisk *template_disk = NULL;
1236 1236
1237 if (!blk_get_integrity(dm_disk(t->md))) 1237 if (!blk_get_integrity(dm_disk(t->md)))
1238 return; 1238 return;
1239 1239
1240 template_disk = dm_table_get_integrity_disk(t, true); 1240 template_disk = dm_table_get_integrity_disk(t, true);
1241 if (!template_disk && 1241 if (!template_disk &&
1242 blk_integrity_is_initialized(dm_disk(t->md))) { 1242 blk_integrity_is_initialized(dm_disk(t->md))) {
1243 DMWARN("%s: device no longer has a valid integrity profile", 1243 DMWARN("%s: device no longer has a valid integrity profile",
1244 dm_device_name(t->md)); 1244 dm_device_name(t->md));
1245 return; 1245 return;
1246 } 1246 }
1247 blk_integrity_register(dm_disk(t->md), 1247 blk_integrity_register(dm_disk(t->md),
1248 blk_get_integrity(template_disk)); 1248 blk_get_integrity(template_disk));
1249 } 1249 }
1250 1250
1251 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1252 sector_t start, sector_t len, void *data)
1253 {
1254 unsigned flush = (*(unsigned *)data);
1255 struct request_queue *q = bdev_get_queue(dev->bdev);
1256
1257 return q && (q->flush_flags & flush);
1258 }
1259
1260 static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1261 {
1262 struct dm_target *ti;
1263 unsigned i = 0;
1264
1265 /*
1266 * Require at least one underlying device to support flushes.
1267 * t->devices includes internal dm devices such as mirror logs
1268 * so we need to use iterate_devices here, which targets
1269 * supporting flushes must provide.
1270 */
1271 while (i < dm_table_get_num_targets(t)) {
1272 ti = dm_table_get_target(t, i++);
1273
1274 if (!ti->num_flush_requests)
1275 continue;
1276
1277 if (ti->type->iterate_devices &&
1278 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1279 return 1;
1280 }
1281
1282 return 0;
1283 }
1284
1251 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1285 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1252 struct queue_limits *limits) 1286 struct queue_limits *limits)
1253 { 1287 {
1288 unsigned flush = 0;
1289
1254 /* 1290 /*
1255 * Copy table's limits to the DM device's request_queue 1291 * Copy table's limits to the DM device's request_queue
1256 */ 1292 */
1257 q->limits = *limits; 1293 q->limits = *limits;
1258 1294
1259 if (!dm_table_supports_discards(t)) 1295 if (!dm_table_supports_discards(t))
1260 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 1296 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1261 else 1297 else
1262 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1298 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1299
1300 if (dm_table_supports_flush(t, REQ_FLUSH)) {
1301 flush |= REQ_FLUSH;
1302 if (dm_table_supports_flush(t, REQ_FUA))
1303 flush |= REQ_FUA;
1304 }
1305 blk_queue_flush(q, flush);
1263 1306
1264 dm_table_set_integrity(t); 1307 dm_table_set_integrity(t);
1265 1308
1266 /* 1309 /*
1267 * QUEUE_FLAG_STACKABLE must be set after all queue settings are 1310 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1268 * visible to other CPUs because, once the flag is set, incoming bios 1311 * visible to other CPUs because, once the flag is set, incoming bios
1269 * are processed by request-based dm, which refers to the queue 1312 * are processed by request-based dm, which refers to the queue
1270 * settings. 1313 * settings.
1271 * Until the flag is set, bios are passed to bio-based dm and queued to 1314 * Until the flag is set, bios are passed to bio-based dm and queued to
1272 * md->deferred where queue settings are not needed yet. 1315 * md->deferred where queue settings are not needed yet.
1273 * Those bios are passed to request-based dm at the resume time. 1316 * Those bios are passed to request-based dm at the resume time.
1274 */ 1317 */
1275 smp_mb(); 1318 smp_mb();
1276 if (dm_table_request_based(t)) 1319 if (dm_table_request_based(t))
1277 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); 1320 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
1278 } 1321 }
1279 1322
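The new dm_table_supports_flush()/device_flush_capable() pair above asks each target to iterate its underlying devices and reports a capability as soon as one queue advertises it, and dm_table_set_restrictions() only offers REQ_FUA when REQ_FLUSH is already supported. Here is a small user-space model of that decision; the TOY_* flag values merely stand in for the block-layer bits held in q->flush_flags.

#include <stdio.h>
#include <stdbool.h>

#define TOY_FLUSH (1u << 0)	/* illustrative stand-in for REQ_FLUSH */
#define TOY_FUA   (1u << 1)	/* illustrative stand-in for REQ_FUA */

/* "At least one underlying device advertises this capability." */
static bool any_device_supports(const unsigned *dev_flags, unsigned n, unsigned cap)
{
	for (unsigned i = 0; i < n; i++)
		if (dev_flags[i] & cap)
			return true;
	return false;
}

static unsigned table_flush_flags(const unsigned *dev_flags, unsigned n)
{
	unsigned flush = 0;

	if (any_device_supports(dev_flags, n, TOY_FLUSH)) {
		flush |= TOY_FLUSH;
		if (any_device_supports(dev_flags, n, TOY_FUA))
			flush |= TOY_FUA;
	}
	return flush;
}

int main(void)
{
	unsigned no_flush[] = { 0, 0 };			/* neither queue advertises FLUSH */
	unsigned mixed[]    = { TOY_FLUSH, TOY_FLUSH | TOY_FUA };

	printf("%#x\n", table_flush_flags(no_flush, 2));	/* 0 */
	printf("%#x\n", table_flush_flags(mixed, 2));		/* FLUSH | FUA */
	return 0;
}

dm_table_supports_discards() at the end of this file applies the same iterate_devices pattern to the discard capability.
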
1280 unsigned int dm_table_get_num_targets(struct dm_table *t) 1323 unsigned int dm_table_get_num_targets(struct dm_table *t)
1281 { 1324 {
1282 return t->num_targets; 1325 return t->num_targets;
1283 } 1326 }
1284 1327
1285 struct list_head *dm_table_get_devices(struct dm_table *t) 1328 struct list_head *dm_table_get_devices(struct dm_table *t)
1286 { 1329 {
1287 return &t->devices; 1330 return &t->devices;
1288 } 1331 }
1289 1332
1290 fmode_t dm_table_get_mode(struct dm_table *t) 1333 fmode_t dm_table_get_mode(struct dm_table *t)
1291 { 1334 {
1292 return t->mode; 1335 return t->mode;
1293 } 1336 }
1294 EXPORT_SYMBOL(dm_table_get_mode); 1337 EXPORT_SYMBOL(dm_table_get_mode);
1295 1338
1296 static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1339 static void suspend_targets(struct dm_table *t, unsigned postsuspend)
1297 { 1340 {
1298 int i = t->num_targets; 1341 int i = t->num_targets;
1299 struct dm_target *ti = t->targets; 1342 struct dm_target *ti = t->targets;
1300 1343
1301 while (i--) { 1344 while (i--) {
1302 if (postsuspend) { 1345 if (postsuspend) {
1303 if (ti->type->postsuspend) 1346 if (ti->type->postsuspend)
1304 ti->type->postsuspend(ti); 1347 ti->type->postsuspend(ti);
1305 } else if (ti->type->presuspend) 1348 } else if (ti->type->presuspend)
1306 ti->type->presuspend(ti); 1349 ti->type->presuspend(ti);
1307 1350
1308 ti++; 1351 ti++;
1309 } 1352 }
1310 } 1353 }
1311 1354
1312 void dm_table_presuspend_targets(struct dm_table *t) 1355 void dm_table_presuspend_targets(struct dm_table *t)
1313 { 1356 {
1314 if (!t) 1357 if (!t)
1315 return; 1358 return;
1316 1359
1317 suspend_targets(t, 0); 1360 suspend_targets(t, 0);
1318 } 1361 }
1319 1362
1320 void dm_table_postsuspend_targets(struct dm_table *t) 1363 void dm_table_postsuspend_targets(struct dm_table *t)
1321 { 1364 {
1322 if (!t) 1365 if (!t)
1323 return; 1366 return;
1324 1367
1325 suspend_targets(t, 1); 1368 suspend_targets(t, 1);
1326 } 1369 }
1327 1370
1328 int dm_table_resume_targets(struct dm_table *t) 1371 int dm_table_resume_targets(struct dm_table *t)
1329 { 1372 {
1330 int i, r = 0; 1373 int i, r = 0;
1331 1374
1332 for (i = 0; i < t->num_targets; i++) { 1375 for (i = 0; i < t->num_targets; i++) {
1333 struct dm_target *ti = t->targets + i; 1376 struct dm_target *ti = t->targets + i;
1334 1377
1335 if (!ti->type->preresume) 1378 if (!ti->type->preresume)
1336 continue; 1379 continue;
1337 1380
1338 r = ti->type->preresume(ti); 1381 r = ti->type->preresume(ti);
1339 if (r) 1382 if (r)
1340 return r; 1383 return r;
1341 } 1384 }
1342 1385
1343 for (i = 0; i < t->num_targets; i++) { 1386 for (i = 0; i < t->num_targets; i++) {
1344 struct dm_target *ti = t->targets + i; 1387 struct dm_target *ti = t->targets + i;
1345 1388
1346 if (ti->type->resume) 1389 if (ti->type->resume)
1347 ti->type->resume(ti); 1390 ti->type->resume(ti);
1348 } 1391 }
1349 1392
1350 return 0; 1393 return 0;
1351 } 1394 }
1352 1395
1353 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) 1396 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1354 { 1397 {
1355 list_add(&cb->list, &t->target_callbacks); 1398 list_add(&cb->list, &t->target_callbacks);
1356 } 1399 }
1357 EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); 1400 EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1358 1401
1359 int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1402 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1360 { 1403 {
1361 struct dm_dev_internal *dd; 1404 struct dm_dev_internal *dd;
1362 struct list_head *devices = dm_table_get_devices(t); 1405 struct list_head *devices = dm_table_get_devices(t);
1363 struct dm_target_callbacks *cb; 1406 struct dm_target_callbacks *cb;
1364 int r = 0; 1407 int r = 0;
1365 1408
1366 list_for_each_entry(dd, devices, list) { 1409 list_for_each_entry(dd, devices, list) {
1367 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1410 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
1368 char b[BDEVNAME_SIZE]; 1411 char b[BDEVNAME_SIZE];
1369 1412
1370 if (likely(q)) 1413 if (likely(q))
1371 r |= bdi_congested(&q->backing_dev_info, bdi_bits); 1414 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1372 else 1415 else
1373 DMWARN_LIMIT("%s: any_congested: nonexistent device %s", 1416 DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
1374 dm_device_name(t->md), 1417 dm_device_name(t->md),
1375 bdevname(dd->dm_dev.bdev, b)); 1418 bdevname(dd->dm_dev.bdev, b));
1376 } 1419 }
1377 1420
1378 list_for_each_entry(cb, &t->target_callbacks, list) 1421 list_for_each_entry(cb, &t->target_callbacks, list)
1379 if (cb->congested_fn) 1422 if (cb->congested_fn)
1380 r |= cb->congested_fn(cb, bdi_bits); 1423 r |= cb->congested_fn(cb, bdi_bits);
1381 1424
1382 return r; 1425 return r;
1383 } 1426 }
1384 1427
1385 int dm_table_any_busy_target(struct dm_table *t) 1428 int dm_table_any_busy_target(struct dm_table *t)
1386 { 1429 {
1387 unsigned i; 1430 unsigned i;
1388 struct dm_target *ti; 1431 struct dm_target *ti;
1389 1432
1390 for (i = 0; i < t->num_targets; i++) { 1433 for (i = 0; i < t->num_targets; i++) {
1391 ti = t->targets + i; 1434 ti = t->targets + i;
1392 if (ti->type->busy && ti->type->busy(ti)) 1435 if (ti->type->busy && ti->type->busy(ti))
1393 return 1; 1436 return 1;
1394 } 1437 }
1395 1438
1396 return 0; 1439 return 0;
1397 } 1440 }
1398 1441
1399 struct mapped_device *dm_table_get_md(struct dm_table *t) 1442 struct mapped_device *dm_table_get_md(struct dm_table *t)
1400 { 1443 {
1401 return t->md; 1444 return t->md;
1402 } 1445 }
1403 EXPORT_SYMBOL(dm_table_get_md); 1446 EXPORT_SYMBOL(dm_table_get_md);
1404 1447
1405 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1448 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1406 sector_t start, sector_t len, void *data) 1449 sector_t start, sector_t len, void *data)
1407 { 1450 {
1408 struct request_queue *q = bdev_get_queue(dev->bdev); 1451 struct request_queue *q = bdev_get_queue(dev->bdev);
1409 1452
1410 return q && blk_queue_discard(q); 1453 return q && blk_queue_discard(q);
1411 } 1454 }
1412 1455
1413 bool dm_table_supports_discards(struct dm_table *t) 1456 bool dm_table_supports_discards(struct dm_table *t)
1414 { 1457 {
1415 struct dm_target *ti; 1458 struct dm_target *ti;
1416 unsigned i = 0; 1459 unsigned i = 0;
1417 1460
1418 /* 1461 /*
1419 * Unless a target used by the table has set discards_supported, 1462 * Unless a target used by the table has set discards_supported,
1420 * require at least one underlying device to support discards. 1463 * require at least one underlying device to support discards.
1421 * t->devices includes internal dm devices such as mirror logs 1464 * t->devices includes internal dm devices such as mirror logs
1422 * so we need to use iterate_devices here, which targets 1465 * so we need to use iterate_devices here, which targets
1423 * supporting discard selectively must provide. 1466 * supporting discard selectively must provide.
1424 */ 1467 */
1425 while (i < dm_table_get_num_targets(t)) { 1468 while (i < dm_table_get_num_targets(t)) {
1426 ti = dm_table_get_target(t, i++); 1469 ti = dm_table_get_target(t, i++);
1427 1470
1428 if (!ti->num_discard_requests) 1471 if (!ti->num_discard_requests)
1429 continue; 1472 continue;
1430 1473
1431 if (ti->discards_supported) 1474 if (ti->discards_supported)
1432 return 1; 1475 return 1;
1433 1476
1434 if (ti->type->iterate_devices && 1477 if (ti->type->iterate_devices &&
1435 ti->type->iterate_devices(ti, device_discard_capable, NULL)) 1478 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1436 return 1; 1479 return 1;
1437 } 1480 }
1438 1481
1439 return 0; 1482 return 0;
1440 } 1483 }
1441 1484
1 /* 1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include "dm.h" 8 #include "dm.h"
9 #include "dm-uevent.h" 9 #include "dm-uevent.h"
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/mutex.h> 13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h> 14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h> 15 #include <linux/blkpg.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/buffer_head.h> 17 #include <linux/buffer_head.h>
18 #include <linux/mempool.h> 18 #include <linux/mempool.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/idr.h> 20 #include <linux/idr.h>
21 #include <linux/hdreg.h> 21 #include <linux/hdreg.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 23
24 #include <trace/events/block.h> 24 #include <trace/events/block.h>
25 25
26 #define DM_MSG_PREFIX "core" 26 #define DM_MSG_PREFIX "core"
27 27
28 /* 28 /*
29 * Cookies are numeric values sent with CHANGE and REMOVE 29 * Cookies are numeric values sent with CHANGE and REMOVE
30 * uevents while resuming, removing or renaming the device. 30 * uevents while resuming, removing or renaming the device.
31 */ 31 */
32 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33 #define DM_COOKIE_LENGTH 24 33 #define DM_COOKIE_LENGTH 24
34 34
35 static const char *_name = DM_NAME; 35 static const char *_name = DM_NAME;
36 36
37 static unsigned int major = 0; 37 static unsigned int major = 0;
38 static unsigned int _major = 0; 38 static unsigned int _major = 0;
39 39
40 static DEFINE_IDR(_minor_idr); 40 static DEFINE_IDR(_minor_idr);
41 41
42 static DEFINE_SPINLOCK(_minor_lock); 42 static DEFINE_SPINLOCK(_minor_lock);
43 /* 43 /*
44 * For bio-based dm. 44 * For bio-based dm.
45 * One of these is allocated per bio. 45 * One of these is allocated per bio.
46 */ 46 */
47 struct dm_io { 47 struct dm_io {
48 struct mapped_device *md; 48 struct mapped_device *md;
49 int error; 49 int error;
50 atomic_t io_count; 50 atomic_t io_count;
51 struct bio *bio; 51 struct bio *bio;
52 unsigned long start_time; 52 unsigned long start_time;
53 spinlock_t endio_lock; 53 spinlock_t endio_lock;
54 }; 54 };
55 55
56 /* 56 /*
57 * For bio-based dm. 57 * For bio-based dm.
58 * One of these is allocated per target within a bio. Hopefully 58 * One of these is allocated per target within a bio. Hopefully
59 * this will be simplified out one day. 59 * this will be simplified out one day.
60 */ 60 */
61 struct dm_target_io { 61 struct dm_target_io {
62 struct dm_io *io; 62 struct dm_io *io;
63 struct dm_target *ti; 63 struct dm_target *ti;
64 union map_info info; 64 union map_info info;
65 }; 65 };
66 66
67 /* 67 /*
68 * For request-based dm. 68 * For request-based dm.
69 * One of these is allocated per request. 69 * One of these is allocated per request.
70 */ 70 */
71 struct dm_rq_target_io { 71 struct dm_rq_target_io {
72 struct mapped_device *md; 72 struct mapped_device *md;
73 struct dm_target *ti; 73 struct dm_target *ti;
74 struct request *orig, clone; 74 struct request *orig, clone;
75 int error; 75 int error;
76 union map_info info; 76 union map_info info;
77 }; 77 };
78 78
79 /* 79 /*
80 * For request-based dm. 80 * For request-based dm.
81 * One of these is allocated per bio. 81 * One of these is allocated per bio.
82 */ 82 */
83 struct dm_rq_clone_bio_info { 83 struct dm_rq_clone_bio_info {
84 struct bio *orig; 84 struct bio *orig;
85 struct dm_rq_target_io *tio; 85 struct dm_rq_target_io *tio;
86 }; 86 };
87 87
88 union map_info *dm_get_mapinfo(struct bio *bio) 88 union map_info *dm_get_mapinfo(struct bio *bio)
89 { 89 {
90 if (bio && bio->bi_private) 90 if (bio && bio->bi_private)
91 return &((struct dm_target_io *)bio->bi_private)->info; 91 return &((struct dm_target_io *)bio->bi_private)->info;
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 union map_info *dm_get_rq_mapinfo(struct request *rq) 95 union map_info *dm_get_rq_mapinfo(struct request *rq)
96 { 96 {
97 if (rq && rq->end_io_data) 97 if (rq && rq->end_io_data)
98 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 98 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
99 return NULL; 99 return NULL;
100 } 100 }
101 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 101 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
102 102
103 #define MINOR_ALLOCED ((void *)-1) 103 #define MINOR_ALLOCED ((void *)-1)
104 104
105 /* 105 /*
106 * Bits for the md->flags field. 106 * Bits for the md->flags field.
107 */ 107 */
108 #define DMF_BLOCK_IO_FOR_SUSPEND 0 108 #define DMF_BLOCK_IO_FOR_SUSPEND 0
109 #define DMF_SUSPENDED 1 109 #define DMF_SUSPENDED 1
110 #define DMF_FROZEN 2 110 #define DMF_FROZEN 2
111 #define DMF_FREEING 3 111 #define DMF_FREEING 3
112 #define DMF_DELETING 4 112 #define DMF_DELETING 4
113 #define DMF_NOFLUSH_SUSPENDING 5 113 #define DMF_NOFLUSH_SUSPENDING 5
114 #define DMF_MERGE_IS_OPTIONAL 6 114 #define DMF_MERGE_IS_OPTIONAL 6
115 115
116 /* 116 /*
117 * Work processed by per-device workqueue. 117 * Work processed by per-device workqueue.
118 */ 118 */
119 struct mapped_device { 119 struct mapped_device {
120 struct rw_semaphore io_lock; 120 struct rw_semaphore io_lock;
121 struct mutex suspend_lock; 121 struct mutex suspend_lock;
122 rwlock_t map_lock; 122 rwlock_t map_lock;
123 atomic_t holders; 123 atomic_t holders;
124 atomic_t open_count; 124 atomic_t open_count;
125 125
126 unsigned long flags; 126 unsigned long flags;
127 127
128 struct request_queue *queue; 128 struct request_queue *queue;
129 unsigned type; 129 unsigned type;
130 /* Protect queue and type against concurrent access. */ 130 /* Protect queue and type against concurrent access. */
131 struct mutex type_lock; 131 struct mutex type_lock;
132 132
133 struct gendisk *disk; 133 struct gendisk *disk;
134 char name[16]; 134 char name[16];
135 135
136 void *interface_ptr; 136 void *interface_ptr;
137 137
138 /* 138 /*
139 * A list of ios that arrived while we were suspended. 139 * A list of ios that arrived while we were suspended.
140 */ 140 */
141 atomic_t pending[2]; 141 atomic_t pending[2];
142 wait_queue_head_t wait; 142 wait_queue_head_t wait;
143 struct work_struct work; 143 struct work_struct work;
144 struct bio_list deferred; 144 struct bio_list deferred;
145 spinlock_t deferred_lock; 145 spinlock_t deferred_lock;
146 146
147 /* 147 /*
148 * Processing queue (flush) 148 * Processing queue (flush)
149 */ 149 */
150 struct workqueue_struct *wq; 150 struct workqueue_struct *wq;
151 151
152 /* 152 /*
153 * The current mapping. 153 * The current mapping.
154 */ 154 */
155 struct dm_table *map; 155 struct dm_table *map;
156 156
157 /* 157 /*
158 * io objects are allocated from here. 158 * io objects are allocated from here.
159 */ 159 */
160 mempool_t *io_pool; 160 mempool_t *io_pool;
161 mempool_t *tio_pool; 161 mempool_t *tio_pool;
162 162
163 struct bio_set *bs; 163 struct bio_set *bs;
164 164
165 /* 165 /*
166 * Event handling. 166 * Event handling.
167 */ 167 */
168 atomic_t event_nr; 168 atomic_t event_nr;
169 wait_queue_head_t eventq; 169 wait_queue_head_t eventq;
170 atomic_t uevent_seq; 170 atomic_t uevent_seq;
171 struct list_head uevent_list; 171 struct list_head uevent_list;
172 spinlock_t uevent_lock; /* Protect access to uevent_list */ 172 spinlock_t uevent_lock; /* Protect access to uevent_list */
173 173
174 /* 174 /*
175 * freeze/thaw support requires holding onto a super block 175 * freeze/thaw support requires holding onto a super block
176 */ 176 */
177 struct super_block *frozen_sb; 177 struct super_block *frozen_sb;
178 struct block_device *bdev; 178 struct block_device *bdev;
179 179
180 /* forced geometry settings */ 180 /* forced geometry settings */
181 struct hd_geometry geometry; 181 struct hd_geometry geometry;
182 182
183 /* For saving the address of __make_request for request based dm */ 183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn; 184 make_request_fn *saved_make_request_fn;
185 185
186 /* sysfs handle */ 186 /* sysfs handle */
187 struct kobject kobj; 187 struct kobject kobj;
188 188
189 /* zero-length flush that will be cloned and submitted to targets */ 189 /* zero-length flush that will be cloned and submitted to targets */
190 struct bio flush_bio; 190 struct bio flush_bio;
191 }; 191 };
192 192
193 /* 193 /*
194 * For mempools pre-allocation at the table loading time. 194 * For mempools pre-allocation at the table loading time.
195 */ 195 */
196 struct dm_md_mempools { 196 struct dm_md_mempools {
197 mempool_t *io_pool; 197 mempool_t *io_pool;
198 mempool_t *tio_pool; 198 mempool_t *tio_pool;
199 struct bio_set *bs; 199 struct bio_set *bs;
200 }; 200 };
201 201
202 #define MIN_IOS 256 202 #define MIN_IOS 256
203 static struct kmem_cache *_io_cache; 203 static struct kmem_cache *_io_cache;
204 static struct kmem_cache *_tio_cache; 204 static struct kmem_cache *_tio_cache;
205 static struct kmem_cache *_rq_tio_cache; 205 static struct kmem_cache *_rq_tio_cache;
206 static struct kmem_cache *_rq_bio_info_cache; 206 static struct kmem_cache *_rq_bio_info_cache;
207 207
208 static int __init local_init(void) 208 static int __init local_init(void)
209 { 209 {
210 int r = -ENOMEM; 210 int r = -ENOMEM;
211 211
212 /* allocate a slab for the dm_ios */ 212 /* allocate a slab for the dm_ios */
213 _io_cache = KMEM_CACHE(dm_io, 0); 213 _io_cache = KMEM_CACHE(dm_io, 0);
214 if (!_io_cache) 214 if (!_io_cache)
215 return r; 215 return r;
216 216
217 /* allocate a slab for the target ios */ 217 /* allocate a slab for the target ios */
218 _tio_cache = KMEM_CACHE(dm_target_io, 0); 218 _tio_cache = KMEM_CACHE(dm_target_io, 0);
219 if (!_tio_cache) 219 if (!_tio_cache)
220 goto out_free_io_cache; 220 goto out_free_io_cache;
221 221
222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
223 if (!_rq_tio_cache) 223 if (!_rq_tio_cache)
224 goto out_free_tio_cache; 224 goto out_free_tio_cache;
225 225
226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
227 if (!_rq_bio_info_cache) 227 if (!_rq_bio_info_cache)
228 goto out_free_rq_tio_cache; 228 goto out_free_rq_tio_cache;
229 229
230 r = dm_uevent_init(); 230 r = dm_uevent_init();
231 if (r) 231 if (r)
232 goto out_free_rq_bio_info_cache; 232 goto out_free_rq_bio_info_cache;
233 233
234 _major = major; 234 _major = major;
235 r = register_blkdev(_major, _name); 235 r = register_blkdev(_major, _name);
236 if (r < 0) 236 if (r < 0)
237 goto out_uevent_exit; 237 goto out_uevent_exit;
238 238
239 if (!_major) 239 if (!_major)
240 _major = r; 240 _major = r;
241 241
242 return 0; 242 return 0;
243 243
244 out_uevent_exit: 244 out_uevent_exit:
245 dm_uevent_exit(); 245 dm_uevent_exit();
246 out_free_rq_bio_info_cache: 246 out_free_rq_bio_info_cache:
247 kmem_cache_destroy(_rq_bio_info_cache); 247 kmem_cache_destroy(_rq_bio_info_cache);
248 out_free_rq_tio_cache: 248 out_free_rq_tio_cache:
249 kmem_cache_destroy(_rq_tio_cache); 249 kmem_cache_destroy(_rq_tio_cache);
250 out_free_tio_cache: 250 out_free_tio_cache:
251 kmem_cache_destroy(_tio_cache); 251 kmem_cache_destroy(_tio_cache);
252 out_free_io_cache: 252 out_free_io_cache:
253 kmem_cache_destroy(_io_cache); 253 kmem_cache_destroy(_io_cache);
254 254
255 return r; 255 return r;
256 } 256 }
257 257
258 static void local_exit(void) 258 static void local_exit(void)
259 { 259 {
260 kmem_cache_destroy(_rq_bio_info_cache); 260 kmem_cache_destroy(_rq_bio_info_cache);
261 kmem_cache_destroy(_rq_tio_cache); 261 kmem_cache_destroy(_rq_tio_cache);
262 kmem_cache_destroy(_tio_cache); 262 kmem_cache_destroy(_tio_cache);
263 kmem_cache_destroy(_io_cache); 263 kmem_cache_destroy(_io_cache);
264 unregister_blkdev(_major, _name); 264 unregister_blkdev(_major, _name);
265 dm_uevent_exit(); 265 dm_uevent_exit();
266 266
267 _major = 0; 267 _major = 0;
268 268
269 DMINFO("cleaned up"); 269 DMINFO("cleaned up");
270 } 270 }
271 271
272 static int (*_inits[])(void) __initdata = { 272 static int (*_inits[])(void) __initdata = {
273 local_init, 273 local_init,
274 dm_target_init, 274 dm_target_init,
275 dm_linear_init, 275 dm_linear_init,
276 dm_stripe_init, 276 dm_stripe_init,
277 dm_io_init, 277 dm_io_init,
278 dm_kcopyd_init, 278 dm_kcopyd_init,
279 dm_interface_init, 279 dm_interface_init,
280 }; 280 };
281 281
282 static void (*_exits[])(void) = { 282 static void (*_exits[])(void) = {
283 local_exit, 283 local_exit,
284 dm_target_exit, 284 dm_target_exit,
285 dm_linear_exit, 285 dm_linear_exit,
286 dm_stripe_exit, 286 dm_stripe_exit,
287 dm_io_exit, 287 dm_io_exit,
288 dm_kcopyd_exit, 288 dm_kcopyd_exit,
289 dm_interface_exit, 289 dm_interface_exit,
290 }; 290 };
291 291
292 static int __init dm_init(void) 292 static int __init dm_init(void)
293 { 293 {
294 const int count = ARRAY_SIZE(_inits); 294 const int count = ARRAY_SIZE(_inits);
295 295
296 int r, i; 296 int r, i;
297 297
298 for (i = 0; i < count; i++) { 298 for (i = 0; i < count; i++) {
299 r = _inits[i](); 299 r = _inits[i]();
300 if (r) 300 if (r)
301 goto bad; 301 goto bad;
302 } 302 }
303 303
304 return 0; 304 return 0;
305 305
306 bad: 306 bad:
307 while (i--) 307 while (i--)
308 _exits[i](); 308 _exits[i]();
309 309
310 return r; 310 return r;
311 } 311 }
312 312
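dm_init() above runs an array of init functions and, on the first failure, unwinds by calling only the already-completed entries' exit functions in reverse order; dm_exit() simply runs the whole exit array. A self-contained sketch of that pattern with made-up subsystems:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static int init_a(void) { puts("init a"); return 0; }
static int init_b(void) { puts("init b"); return -1; }	/* simulate a failure */
static void exit_a(void) { puts("exit a"); }
static void exit_b(void) { puts("exit b"); }

static int (*inits[])(void) = { init_a, init_b };
static void (*exits[])(void) = { exit_a, exit_b };

int main(void)
{
	int i, r;

	for (i = 0; i < (int)ARRAY_SIZE(inits); i++) {
		r = inits[i]();
		if (r)
			goto bad;
	}
	return 0;

bad:
	while (i--)		/* unwind only what was initialized before the failure */
		exits[i]();
	return 1;
}

Running it prints "init a", "init b", then only "exit a": exit_b is skipped because init_b never succeeded.
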
313 static void __exit dm_exit(void) 313 static void __exit dm_exit(void)
314 { 314 {
315 int i = ARRAY_SIZE(_exits); 315 int i = ARRAY_SIZE(_exits);
316 316
317 while (i--) 317 while (i--)
318 _exits[i](); 318 _exits[i]();
319 319
320 /* 320 /*
321 * Should be empty by this point. 321 * Should be empty by this point.
322 */ 322 */
323 idr_remove_all(&_minor_idr); 323 idr_remove_all(&_minor_idr);
324 idr_destroy(&_minor_idr); 324 idr_destroy(&_minor_idr);
325 } 325 }
326 326
327 /* 327 /*
328 * Block device functions 328 * Block device functions
329 */ 329 */
330 int dm_deleting_md(struct mapped_device *md) 330 int dm_deleting_md(struct mapped_device *md)
331 { 331 {
332 return test_bit(DMF_DELETING, &md->flags); 332 return test_bit(DMF_DELETING, &md->flags);
333 } 333 }
334 334
335 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 335 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
336 { 336 {
337 struct mapped_device *md; 337 struct mapped_device *md;
338 338
339 spin_lock(&_minor_lock); 339 spin_lock(&_minor_lock);
340 340
341 md = bdev->bd_disk->private_data; 341 md = bdev->bd_disk->private_data;
342 if (!md) 342 if (!md)
343 goto out; 343 goto out;
344 344
345 if (test_bit(DMF_FREEING, &md->flags) || 345 if (test_bit(DMF_FREEING, &md->flags) ||
346 dm_deleting_md(md)) { 346 dm_deleting_md(md)) {
347 md = NULL; 347 md = NULL;
348 goto out; 348 goto out;
349 } 349 }
350 350
351 dm_get(md); 351 dm_get(md);
352 atomic_inc(&md->open_count); 352 atomic_inc(&md->open_count);
353 353
354 out: 354 out:
355 spin_unlock(&_minor_lock); 355 spin_unlock(&_minor_lock);
356 356
357 return md ? 0 : -ENXIO; 357 return md ? 0 : -ENXIO;
358 } 358 }
359 359
360 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 360 static int dm_blk_close(struct gendisk *disk, fmode_t mode)
361 { 361 {
362 struct mapped_device *md = disk->private_data; 362 struct mapped_device *md = disk->private_data;
363 363
364 spin_lock(&_minor_lock); 364 spin_lock(&_minor_lock);
365 365
366 atomic_dec(&md->open_count); 366 atomic_dec(&md->open_count);
367 dm_put(md); 367 dm_put(md);
368 368
369 spin_unlock(&_minor_lock); 369 spin_unlock(&_minor_lock);
370 370
371 return 0; 371 return 0;
372 } 372 }
373 373
374 int dm_open_count(struct mapped_device *md) 374 int dm_open_count(struct mapped_device *md)
375 { 375 {
376 return atomic_read(&md->open_count); 376 return atomic_read(&md->open_count);
377 } 377 }
378 378
379 /* 379 /*
380 * Guarantees nothing is using the device before it's deleted. 380 * Guarantees nothing is using the device before it's deleted.
381 */ 381 */
382 int dm_lock_for_deletion(struct mapped_device *md) 382 int dm_lock_for_deletion(struct mapped_device *md)
383 { 383 {
384 int r = 0; 384 int r = 0;
385 385
386 spin_lock(&_minor_lock); 386 spin_lock(&_minor_lock);
387 387
388 if (dm_open_count(md)) 388 if (dm_open_count(md))
389 r = -EBUSY; 389 r = -EBUSY;
390 else 390 else
391 set_bit(DMF_DELETING, &md->flags); 391 set_bit(DMF_DELETING, &md->flags);
392 392
393 spin_unlock(&_minor_lock); 393 spin_unlock(&_minor_lock);
394 394
395 return r; 395 return r;
396 } 396 }
397 397
398 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 398 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
399 { 399 {
400 struct mapped_device *md = bdev->bd_disk->private_data; 400 struct mapped_device *md = bdev->bd_disk->private_data;
401 401
402 return dm_get_geometry(md, geo); 402 return dm_get_geometry(md, geo);
403 } 403 }
404 404
405 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 405 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
406 unsigned int cmd, unsigned long arg) 406 unsigned int cmd, unsigned long arg)
407 { 407 {
408 struct mapped_device *md = bdev->bd_disk->private_data; 408 struct mapped_device *md = bdev->bd_disk->private_data;
409 struct dm_table *map = dm_get_live_table(md); 409 struct dm_table *map = dm_get_live_table(md);
410 struct dm_target *tgt; 410 struct dm_target *tgt;
411 int r = -ENOTTY; 411 int r = -ENOTTY;
412 412
413 if (!map || !dm_table_get_size(map)) 413 if (!map || !dm_table_get_size(map))
414 goto out; 414 goto out;
415 415
416 /* We only support devices that have a single target */ 416 /* We only support devices that have a single target */
417 if (dm_table_get_num_targets(map) != 1) 417 if (dm_table_get_num_targets(map) != 1)
418 goto out; 418 goto out;
419 419
420 tgt = dm_table_get_target(map, 0); 420 tgt = dm_table_get_target(map, 0);
421 421
422 if (dm_suspended_md(md)) { 422 if (dm_suspended_md(md)) {
423 r = -EAGAIN; 423 r = -EAGAIN;
424 goto out; 424 goto out;
425 } 425 }
426 426
427 if (tgt->type->ioctl) 427 if (tgt->type->ioctl)
428 r = tgt->type->ioctl(tgt, cmd, arg); 428 r = tgt->type->ioctl(tgt, cmd, arg);
429 429
430 out: 430 out:
431 dm_table_put(map); 431 dm_table_put(map);
432 432
433 return r; 433 return r;
434 } 434 }
435 435
436 static struct dm_io *alloc_io(struct mapped_device *md) 436 static struct dm_io *alloc_io(struct mapped_device *md)
437 { 437 {
438 return mempool_alloc(md->io_pool, GFP_NOIO); 438 return mempool_alloc(md->io_pool, GFP_NOIO);
439 } 439 }
440 440
441 static void free_io(struct mapped_device *md, struct dm_io *io) 441 static void free_io(struct mapped_device *md, struct dm_io *io)
442 { 442 {
443 mempool_free(io, md->io_pool); 443 mempool_free(io, md->io_pool);
444 } 444 }
445 445
446 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 446 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
447 { 447 {
448 mempool_free(tio, md->tio_pool); 448 mempool_free(tio, md->tio_pool);
449 } 449 }
450 450
451 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 451 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
452 gfp_t gfp_mask) 452 gfp_t gfp_mask)
453 { 453 {
454 return mempool_alloc(md->tio_pool, gfp_mask); 454 return mempool_alloc(md->tio_pool, gfp_mask);
455 } 455 }
456 456
457 static void free_rq_tio(struct dm_rq_target_io *tio) 457 static void free_rq_tio(struct dm_rq_target_io *tio)
458 { 458 {
459 mempool_free(tio, tio->md->tio_pool); 459 mempool_free(tio, tio->md->tio_pool);
460 } 460 }
461 461
462 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 462 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
463 { 463 {
464 return mempool_alloc(md->io_pool, GFP_ATOMIC); 464 return mempool_alloc(md->io_pool, GFP_ATOMIC);
465 } 465 }
466 466
467 static void free_bio_info(struct dm_rq_clone_bio_info *info) 467 static void free_bio_info(struct dm_rq_clone_bio_info *info)
468 { 468 {
469 mempool_free(info, info->tio->md->io_pool); 469 mempool_free(info, info->tio->md->io_pool);
470 } 470 }
471 471
472 static int md_in_flight(struct mapped_device *md) 472 static int md_in_flight(struct mapped_device *md)
473 { 473 {
474 return atomic_read(&md->pending[READ]) + 474 return atomic_read(&md->pending[READ]) +
475 atomic_read(&md->pending[WRITE]); 475 atomic_read(&md->pending[WRITE]);
476 } 476 }
477 477
478 static void start_io_acct(struct dm_io *io) 478 static void start_io_acct(struct dm_io *io)
479 { 479 {
480 struct mapped_device *md = io->md; 480 struct mapped_device *md = io->md;
481 int cpu; 481 int cpu;
482 int rw = bio_data_dir(io->bio); 482 int rw = bio_data_dir(io->bio);
483 483
484 io->start_time = jiffies; 484 io->start_time = jiffies;
485 485
486 cpu = part_stat_lock(); 486 cpu = part_stat_lock();
487 part_round_stats(cpu, &dm_disk(md)->part0); 487 part_round_stats(cpu, &dm_disk(md)->part0);
488 part_stat_unlock(); 488 part_stat_unlock();
489 atomic_set(&dm_disk(md)->part0.in_flight[rw], 489 atomic_set(&dm_disk(md)->part0.in_flight[rw],
490 atomic_inc_return(&md->pending[rw])); 490 atomic_inc_return(&md->pending[rw]));
491 } 491 }
492 492
493 static void end_io_acct(struct dm_io *io) 493 static void end_io_acct(struct dm_io *io)
494 { 494 {
495 struct mapped_device *md = io->md; 495 struct mapped_device *md = io->md;
496 struct bio *bio = io->bio; 496 struct bio *bio = io->bio;
497 unsigned long duration = jiffies - io->start_time; 497 unsigned long duration = jiffies - io->start_time;
498 int pending, cpu; 498 int pending, cpu;
499 int rw = bio_data_dir(bio); 499 int rw = bio_data_dir(bio);
500 500
501 cpu = part_stat_lock(); 501 cpu = part_stat_lock();
502 part_round_stats(cpu, &dm_disk(md)->part0); 502 part_round_stats(cpu, &dm_disk(md)->part0);
503 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 503 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
504 part_stat_unlock(); 504 part_stat_unlock();
505 505
506 /* 506 /*
507 * After this is decremented the bio must not be touched if it is 507 * After this is decremented the bio must not be touched if it is
508 * a flush. 508 * a flush.
509 */ 509 */
510 pending = atomic_dec_return(&md->pending[rw]); 510 pending = atomic_dec_return(&md->pending[rw]);
511 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 511 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
512 pending += atomic_read(&md->pending[rw^0x1]); 512 pending += atomic_read(&md->pending[rw^0x1]);
513 513
514 /* nudge anyone waiting on suspend queue */ 514 /* nudge anyone waiting on suspend queue */
515 if (!pending) 515 if (!pending)
516 wake_up(&md->wait); 516 wake_up(&md->wait);
517 } 517 }
518 518
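start_io_acct() and end_io_acct() above keep a per-direction count of bios in flight and wake the suspend waiter once both directions have drained. The sketch below models only that bookkeeping with plain counters; the real code uses atomics and per-CPU disk statistics, so this is not safe for concurrent use.

#include <stdio.h>
#include <stdbool.h>

enum { TOY_READ = 0, TOY_WRITE = 1 };

static int pending[2];		/* bios in flight per direction */

static void toy_start_io(int rw)
{
	pending[rw]++;
}

/* Returns true when the last outstanding bio finished and a suspend waiter may proceed. */
static bool toy_end_io(int rw)
{
	pending[rw]--;
	return pending[TOY_READ] + pending[TOY_WRITE] == 0;
}

int main(void)
{
	toy_start_io(TOY_WRITE);
	toy_start_io(TOY_READ);

	printf("%d\n", toy_end_io(TOY_WRITE));	/* 0: a read is still in flight */
	printf("%d\n", toy_end_io(TOY_READ));	/* 1: wake anyone waiting on suspend */
	return 0;
}
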
519 /* 519 /*
520 * Add the bio to the list of deferred io. 520 * Add the bio to the list of deferred io.
521 */ 521 */
522 static void queue_io(struct mapped_device *md, struct bio *bio) 522 static void queue_io(struct mapped_device *md, struct bio *bio)
523 { 523 {
524 unsigned long flags; 524 unsigned long flags;
525 525
526 spin_lock_irqsave(&md->deferred_lock, flags); 526 spin_lock_irqsave(&md->deferred_lock, flags);
527 bio_list_add(&md->deferred, bio); 527 bio_list_add(&md->deferred, bio);
528 spin_unlock_irqrestore(&md->deferred_lock, flags); 528 spin_unlock_irqrestore(&md->deferred_lock, flags);
529 queue_work(md->wq, &md->work); 529 queue_work(md->wq, &md->work);
530 } 530 }
531 531
532 /* 532 /*
533 * Everyone (including functions in this file), should use this 533 * Everyone (including functions in this file), should use this
534 * function to access the md->map field, and make sure they call 534 * function to access the md->map field, and make sure they call
535 * dm_table_put() when finished. 535 * dm_table_put() when finished.
536 */ 536 */
537 struct dm_table *dm_get_live_table(struct mapped_device *md) 537 struct dm_table *dm_get_live_table(struct mapped_device *md)
538 { 538 {
539 struct dm_table *t; 539 struct dm_table *t;
540 unsigned long flags; 540 unsigned long flags;
541 541
542 read_lock_irqsave(&md->map_lock, flags); 542 read_lock_irqsave(&md->map_lock, flags);
543 t = md->map; 543 t = md->map;
544 if (t) 544 if (t)
545 dm_table_get(t); 545 dm_table_get(t);
546 read_unlock_irqrestore(&md->map_lock, flags); 546 read_unlock_irqrestore(&md->map_lock, flags);
547 547
548 return t; 548 return t;
549 } 549 }
550 550
551 /* 551 /*
552 * Get the geometry associated with a dm device 552 * Get the geometry associated with a dm device
553 */ 553 */
554 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 554 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
555 { 555 {
556 *geo = md->geometry; 556 *geo = md->geometry;
557 557
558 return 0; 558 return 0;
559 } 559 }
560 560
561 /* 561 /*
562 * Set the geometry of a device. 562 * Set the geometry of a device.
563 */ 563 */
564 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 564 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
565 { 565 {
566 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 566 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
567 567
568 if (geo->start > sz) { 568 if (geo->start > sz) {
569 DMWARN("Start sector is beyond the geometry limits."); 569 DMWARN("Start sector is beyond the geometry limits.");
570 return -EINVAL; 570 return -EINVAL;
571 } 571 }
572 572
573 md->geometry = *geo; 573 md->geometry = *geo;
574 574
575 return 0; 575 return 0;
576 } 576 }
577 577
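dm_set_geometry() above derives the capacity implied by the geometry (cylinders * heads * sectors) and rejects a start sector beyond it. A tiny user-space version of the same check; the toy_geometry struct is illustrative, the kernel code uses struct hd_geometry.

#include <stdio.h>

typedef unsigned long long sector_t;

struct toy_geometry {
	unsigned cylinders, heads, sectors;
	sector_t start;
};

static int check_geometry(const struct toy_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		fprintf(stderr, "start sector is beyond the geometry limits\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct toy_geometry ok  = { .cylinders = 1024, .heads = 255, .sectors = 63, .start = 0 };
	struct toy_geometry bad = { .cylinders = 1, .heads = 1, .sectors = 1, .start = 100 };

	printf("%d %d\n", check_geometry(&ok), check_geometry(&bad));	/* 0 -1 */
	return 0;
}
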
578 /*----------------------------------------------------------------- 578 /*-----------------------------------------------------------------
579 * CRUD START: 579 * CRUD START:
580 * A more elegant soln is in the works that uses the queue 580 * A more elegant soln is in the works that uses the queue
581 * merge fn, unfortunately there are a couple of changes to 581 * merge fn, unfortunately there are a couple of changes to
582 * the block layer that I want to make for this. So in the 582 * the block layer that I want to make for this. So in the
583 * interests of getting something for people to use I give 583 * interests of getting something for people to use I give
584 * you this clearly demarcated crap. 584 * you this clearly demarcated crap.
585 *---------------------------------------------------------------*/ 585 *---------------------------------------------------------------*/
586 586
587 static int __noflush_suspending(struct mapped_device *md) 587 static int __noflush_suspending(struct mapped_device *md)
588 { 588 {
589 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 589 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
590 } 590 }
591 591
592 /* 592 /*
593 * Decrements the number of outstanding ios that a bio has been 593 * Decrements the number of outstanding ios that a bio has been
594 * cloned into, completing the original io if necessary. 594 * cloned into, completing the original io if necessary.
595 */ 595 */
596 static void dec_pending(struct dm_io *io, int error) 596 static void dec_pending(struct dm_io *io, int error)
597 { 597 {
598 unsigned long flags; 598 unsigned long flags;
599 int io_error; 599 int io_error;
600 struct bio *bio; 600 struct bio *bio;
601 struct mapped_device *md = io->md; 601 struct mapped_device *md = io->md;
602 602
603 /* Push-back supersedes any I/O errors */ 603 /* Push-back supersedes any I/O errors */
604 if (unlikely(error)) { 604 if (unlikely(error)) {
605 spin_lock_irqsave(&io->endio_lock, flags); 605 spin_lock_irqsave(&io->endio_lock, flags);
606 if (!(io->error > 0 && __noflush_suspending(md))) 606 if (!(io->error > 0 && __noflush_suspending(md)))
607 io->error = error; 607 io->error = error;
608 spin_unlock_irqrestore(&io->endio_lock, flags); 608 spin_unlock_irqrestore(&io->endio_lock, flags);
609 } 609 }
610 610
611 if (atomic_dec_and_test(&io->io_count)) { 611 if (atomic_dec_and_test(&io->io_count)) {
612 if (io->error == DM_ENDIO_REQUEUE) { 612 if (io->error == DM_ENDIO_REQUEUE) {
613 /* 613 /*
614 * Target requested pushing back the I/O. 614 * Target requested pushing back the I/O.
615 */ 615 */
616 spin_lock_irqsave(&md->deferred_lock, flags); 616 spin_lock_irqsave(&md->deferred_lock, flags);
617 if (__noflush_suspending(md)) 617 if (__noflush_suspending(md))
618 bio_list_add_head(&md->deferred, io->bio); 618 bio_list_add_head(&md->deferred, io->bio);
619 else 619 else
620 /* noflush suspend was interrupted. */ 620 /* noflush suspend was interrupted. */
621 io->error = -EIO; 621 io->error = -EIO;
622 spin_unlock_irqrestore(&md->deferred_lock, flags); 622 spin_unlock_irqrestore(&md->deferred_lock, flags);
623 } 623 }
624 624
625 io_error = io->error; 625 io_error = io->error;
626 bio = io->bio; 626 bio = io->bio;
627 end_io_acct(io); 627 end_io_acct(io);
628 free_io(md, io); 628 free_io(md, io);
629 629
630 if (io_error == DM_ENDIO_REQUEUE) 630 if (io_error == DM_ENDIO_REQUEUE)
631 return; 631 return;
632 632
633 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 633 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
634 /* 634 /*
635 * Preflush done for flush with data, reissue 635 * Preflush done for flush with data, reissue
636 * without REQ_FLUSH. 636 * without REQ_FLUSH.
637 */ 637 */
638 bio->bi_rw &= ~REQ_FLUSH; 638 bio->bi_rw &= ~REQ_FLUSH;
639 queue_io(md, bio); 639 queue_io(md, bio);
640 } else { 640 } else {
641 /* done with normal IO or empty flush */ 641 /* done with normal IO or empty flush */
642 trace_block_bio_complete(md->queue, bio, io_error); 642 trace_block_bio_complete(md->queue, bio, io_error);
643 bio_endio(bio, io_error); 643 bio_endio(bio, io_error);
644 } 644 }
645 } 645 }
646 } 646 }
647 647
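dec_pending() above handles a flush that carries data in two passes: the bio first completes its preflush phase with REQ_FLUSH still set, then is requeued with REQ_FLUSH cleared so only the data portion is reissued; an empty flush or plain I/O completes immediately. A small model of that decision with a toy bio and flag (not the kernel bio API):

#include <stdio.h>
#include <stdbool.h>

#define TOY_REQ_FLUSH (1u << 0)	/* illustrative stand-in for REQ_FLUSH */

struct toy_bio {
	unsigned rw;		/* request flags */
	unsigned size;		/* payload size in bytes; 0 means an empty flush */
};

/* Returns true when the bio must be requeued for its data pass. */
static bool needs_data_pass(struct toy_bio *bio)
{
	if ((bio->rw & TOY_REQ_FLUSH) && bio->size) {
		bio->rw &= ~TOY_REQ_FLUSH;	/* preflush done, reissue for the data */
		return true;
	}
	return false;				/* normal I/O or empty flush: complete it */
}

int main(void)
{
	struct toy_bio flush_with_data = { .rw = TOY_REQ_FLUSH, .size = 4096 };

	while (needs_data_pass(&flush_with_data))
		puts("requeue without FLUSH");
	puts("complete");
	return 0;
}
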
648 static void clone_endio(struct bio *bio, int error) 648 static void clone_endio(struct bio *bio, int error)
649 { 649 {
650 int r = 0; 650 int r = 0;
651 struct dm_target_io *tio = bio->bi_private; 651 struct dm_target_io *tio = bio->bi_private;
652 struct dm_io *io = tio->io; 652 struct dm_io *io = tio->io;
653 struct mapped_device *md = tio->io->md; 653 struct mapped_device *md = tio->io->md;
654 dm_endio_fn endio = tio->ti->type->end_io; 654 dm_endio_fn endio = tio->ti->type->end_io;
655 655
656 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 656 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
657 error = -EIO; 657 error = -EIO;
658 658
659 if (endio) { 659 if (endio) {
660 r = endio(tio->ti, bio, error, &tio->info); 660 r = endio(tio->ti, bio, error, &tio->info);
661 if (r < 0 || r == DM_ENDIO_REQUEUE) 661 if (r < 0 || r == DM_ENDIO_REQUEUE)
662 /* 662 /*
663 * error and requeue request are handled 663 * error and requeue request are handled
664 * in dec_pending(). 664 * in dec_pending().
665 */ 665 */
666 error = r; 666 error = r;
667 else if (r == DM_ENDIO_INCOMPLETE) 667 else if (r == DM_ENDIO_INCOMPLETE)
668 /* The target will handle the io */ 668 /* The target will handle the io */
669 return; 669 return;
670 else if (r) { 670 else if (r) {
671 DMWARN("unimplemented target endio return value: %d", r); 671 DMWARN("unimplemented target endio return value: %d", r);
672 BUG(); 672 BUG();
673 } 673 }
674 } 674 }
675 675
676 /* 676 /*
677 * Store md for cleanup instead of tio which is about to get freed. 677 * Store md for cleanup instead of tio which is about to get freed.
678 */ 678 */
679 bio->bi_private = md->bs; 679 bio->bi_private = md->bs;
680 680
681 free_tio(md, tio); 681 free_tio(md, tio);
682 bio_put(bio); 682 bio_put(bio);
683 dec_pending(io, error); 683 dec_pending(io, error);
684 } 684 }
685 685
686 /* 686 /*
687 * Partial completion handling for request-based dm 687 * Partial completion handling for request-based dm
688 */ 688 */
689 static void end_clone_bio(struct bio *clone, int error) 689 static void end_clone_bio(struct bio *clone, int error)
690 { 690 {
691 struct dm_rq_clone_bio_info *info = clone->bi_private; 691 struct dm_rq_clone_bio_info *info = clone->bi_private;
692 struct dm_rq_target_io *tio = info->tio; 692 struct dm_rq_target_io *tio = info->tio;
693 struct bio *bio = info->orig; 693 struct bio *bio = info->orig;
694 unsigned int nr_bytes = info->orig->bi_size; 694 unsigned int nr_bytes = info->orig->bi_size;
695 695
696 bio_put(clone); 696 bio_put(clone);
697 697
698 if (tio->error) 698 if (tio->error)
699 /* 699 /*
700 * An error has already been detected on the request. 700 * An error has already been detected on the request.
701 * Once error occurred, just let clone->end_io() handle 701 * Once error occurred, just let clone->end_io() handle
702 * the remainder. 702 * the remainder.
703 */ 703 */
704 return; 704 return;
705 else if (error) { 705 else if (error) {
706 /* 706 /*
707 * Don't report the error to the upper layer yet. 707 * Don't report the error to the upper layer yet.
708 * The error handling decision is made by the target driver, 708 * The error handling decision is made by the target driver,
709 * when the request is completed. 709 * when the request is completed.
710 */ 710 */
711 tio->error = error; 711 tio->error = error;
712 return; 712 return;
713 } 713 }
714 714
715 /* 715 /*
716 * I/O for the bio successfully completed. 716 * I/O for the bio successfully completed.
717 * Report the data completion to the upper layer. 717 * Report the data completion to the upper layer.
718 */ 718 */
719 719
720 /* 720 /*
721 * bios are processed from the head of the list. 721 * bios are processed from the head of the list.
722 * So the completing bio should always be rq->bio. 722 * So the completing bio should always be rq->bio.
723 * If it's not, something is going wrong. 723 * If it's not, something is going wrong.
724 */ 724 */
725 if (tio->orig->bio != bio) 725 if (tio->orig->bio != bio)
726 DMERR("bio completion is going in the middle of the request"); 726 DMERR("bio completion is going in the middle of the request");
727 727
728 /* 728 /*
729 * Update the original request. 729 * Update the original request.
730 * Do not use blk_end_request() here, because it may complete 730 * Do not use blk_end_request() here, because it may complete
731 * the original request before the clone, and break the ordering. 731 * the original request before the clone, and break the ordering.
732 */ 732 */
733 blk_update_request(tio->orig, 0, nr_bytes); 733 blk_update_request(tio->orig, 0, nr_bytes);
734 } 734 }
735 735
736 /* 736 /*
737 * Don't touch any member of the md after calling this function because 737 * Don't touch any member of the md after calling this function because
738 * the md may be freed in dm_put() at the end of this function. 738 * the md may be freed in dm_put() at the end of this function.
739 * Or do dm_get() before calling this function and dm_put() later. 739 * Or do dm_get() before calling this function and dm_put() later.
740 */ 740 */
741 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 741 static void rq_completed(struct mapped_device *md, int rw, int run_queue)
742 { 742 {
743 atomic_dec(&md->pending[rw]); 743 atomic_dec(&md->pending[rw]);
744 744
745 /* nudge anyone waiting on suspend queue */ 745 /* nudge anyone waiting on suspend queue */
746 if (!md_in_flight(md)) 746 if (!md_in_flight(md))
747 wake_up(&md->wait); 747 wake_up(&md->wait);
748 748
749 if (run_queue) 749 if (run_queue)
750 blk_run_queue(md->queue); 750 blk_run_queue(md->queue);
751 751
752 /* 752 /*
753 * dm_put() must be at the end of this function. See the comment above 753 * dm_put() must be at the end of this function. See the comment above
754 */ 754 */
755 dm_put(md); 755 dm_put(md);
756 } 756 }
757 757
758 static void free_rq_clone(struct request *clone) 758 static void free_rq_clone(struct request *clone)
759 { 759 {
760 struct dm_rq_target_io *tio = clone->end_io_data; 760 struct dm_rq_target_io *tio = clone->end_io_data;
761 761
762 blk_rq_unprep_clone(clone); 762 blk_rq_unprep_clone(clone);
763 free_rq_tio(tio); 763 free_rq_tio(tio);
764 } 764 }
765 765
766 /* 766 /*
767 * Complete the clone and the original request. 767 * Complete the clone and the original request.
768 * Must be called without queue lock. 768 * Must be called without queue lock.
769 */ 769 */
770 static void dm_end_request(struct request *clone, int error) 770 static void dm_end_request(struct request *clone, int error)
771 { 771 {
772 int rw = rq_data_dir(clone); 772 int rw = rq_data_dir(clone);
773 struct dm_rq_target_io *tio = clone->end_io_data; 773 struct dm_rq_target_io *tio = clone->end_io_data;
774 struct mapped_device *md = tio->md; 774 struct mapped_device *md = tio->md;
775 struct request *rq = tio->orig; 775 struct request *rq = tio->orig;
776 776
777 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 777 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
778 rq->errors = clone->errors; 778 rq->errors = clone->errors;
779 rq->resid_len = clone->resid_len; 779 rq->resid_len = clone->resid_len;
780 780
781 if (rq->sense) 781 if (rq->sense)
782 /* 782 /*
783 * We are using the sense buffer of the original 783 * We are using the sense buffer of the original
784 * request. 784 * request.
785 * So setting the length of the sense data is enough. 785 * So setting the length of the sense data is enough.
786 */ 786 */
787 rq->sense_len = clone->sense_len; 787 rq->sense_len = clone->sense_len;
788 } 788 }
789 789
790 free_rq_clone(clone); 790 free_rq_clone(clone);
791 blk_end_request_all(rq, error); 791 blk_end_request_all(rq, error);
792 rq_completed(md, rw, true); 792 rq_completed(md, rw, true);
793 } 793 }
794 794
795 static void dm_unprep_request(struct request *rq) 795 static void dm_unprep_request(struct request *rq)
796 { 796 {
797 struct request *clone = rq->special; 797 struct request *clone = rq->special;
798 798
799 rq->special = NULL; 799 rq->special = NULL;
800 rq->cmd_flags &= ~REQ_DONTPREP; 800 rq->cmd_flags &= ~REQ_DONTPREP;
801 801
802 free_rq_clone(clone); 802 free_rq_clone(clone);
803 } 803 }
804 804
805 /* 805 /*
806 * Requeue the original request of a clone. 806 * Requeue the original request of a clone.
807 */ 807 */
808 void dm_requeue_unmapped_request(struct request *clone) 808 void dm_requeue_unmapped_request(struct request *clone)
809 { 809 {
810 int rw = rq_data_dir(clone); 810 int rw = rq_data_dir(clone);
811 struct dm_rq_target_io *tio = clone->end_io_data; 811 struct dm_rq_target_io *tio = clone->end_io_data;
812 struct mapped_device *md = tio->md; 812 struct mapped_device *md = tio->md;
813 struct request *rq = tio->orig; 813 struct request *rq = tio->orig;
814 struct request_queue *q = rq->q; 814 struct request_queue *q = rq->q;
815 unsigned long flags; 815 unsigned long flags;
816 816
817 dm_unprep_request(rq); 817 dm_unprep_request(rq);
818 818
819 spin_lock_irqsave(q->queue_lock, flags); 819 spin_lock_irqsave(q->queue_lock, flags);
820 blk_requeue_request(q, rq); 820 blk_requeue_request(q, rq);
821 spin_unlock_irqrestore(q->queue_lock, flags); 821 spin_unlock_irqrestore(q->queue_lock, flags);
822 822
823 rq_completed(md, rw, 0); 823 rq_completed(md, rw, 0);
824 } 824 }
825 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 825 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
826 826
827 static void __stop_queue(struct request_queue *q) 827 static void __stop_queue(struct request_queue *q)
828 { 828 {
829 blk_stop_queue(q); 829 blk_stop_queue(q);
830 } 830 }
831 831
832 static void stop_queue(struct request_queue *q) 832 static void stop_queue(struct request_queue *q)
833 { 833 {
834 unsigned long flags; 834 unsigned long flags;
835 835
836 spin_lock_irqsave(q->queue_lock, flags); 836 spin_lock_irqsave(q->queue_lock, flags);
837 __stop_queue(q); 837 __stop_queue(q);
838 spin_unlock_irqrestore(q->queue_lock, flags); 838 spin_unlock_irqrestore(q->queue_lock, flags);
839 } 839 }
840 840
841 static void __start_queue(struct request_queue *q) 841 static void __start_queue(struct request_queue *q)
842 { 842 {
843 if (blk_queue_stopped(q)) 843 if (blk_queue_stopped(q))
844 blk_start_queue(q); 844 blk_start_queue(q);
845 } 845 }
846 846
847 static void start_queue(struct request_queue *q) 847 static void start_queue(struct request_queue *q)
848 { 848 {
849 unsigned long flags; 849 unsigned long flags;
850 850
851 spin_lock_irqsave(q->queue_lock, flags); 851 spin_lock_irqsave(q->queue_lock, flags);
852 __start_queue(q); 852 __start_queue(q);
853 spin_unlock_irqrestore(q->queue_lock, flags); 853 spin_unlock_irqrestore(q->queue_lock, flags);
854 } 854 }
855 855
856 static void dm_done(struct request *clone, int error, bool mapped) 856 static void dm_done(struct request *clone, int error, bool mapped)
857 { 857 {
858 int r = error; 858 int r = error;
859 struct dm_rq_target_io *tio = clone->end_io_data; 859 struct dm_rq_target_io *tio = clone->end_io_data;
860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
861 861
862 if (mapped && rq_end_io) 862 if (mapped && rq_end_io)
863 r = rq_end_io(tio->ti, clone, error, &tio->info); 863 r = rq_end_io(tio->ti, clone, error, &tio->info);
864 864
865 if (r <= 0) 865 if (r <= 0)
866 /* The target wants to complete the I/O */ 866 /* The target wants to complete the I/O */
867 dm_end_request(clone, r); 867 dm_end_request(clone, r);
868 else if (r == DM_ENDIO_INCOMPLETE) 868 else if (r == DM_ENDIO_INCOMPLETE)
869 /* The target will handle the I/O */ 869 /* The target will handle the I/O */
870 return; 870 return;
871 else if (r == DM_ENDIO_REQUEUE) 871 else if (r == DM_ENDIO_REQUEUE)
872 /* The target wants to requeue the I/O */ 872 /* The target wants to requeue the I/O */
873 dm_requeue_unmapped_request(clone); 873 dm_requeue_unmapped_request(clone);
874 else { 874 else {
875 DMWARN("unimplemented target endio return value: %d", r); 875 DMWARN("unimplemented target endio return value: %d", r);
876 BUG(); 876 BUG();
877 } 877 }
878 } 878 }
879 879
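For illustration only, a request-based target's rq_end_io hook could look like the hypothetical fragment below (the name example_rq_end_io and the -EBUSY policy are invented for this sketch, not taken from any real target): returning the error unchanged (<= 0) lets dm_done() finish the I/O via dm_end_request(), while DM_ENDIO_REQUEUE asks dm to requeue the original request.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical end_io hook for a request-based target: requeue transient
 * -EBUSY failures, let everything else complete with its error code. */
static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	if (error == -EBUSY)
		return DM_ENDIO_REQUEUE;	/* dm_done() -> dm_requeue_unmapped_request() */

	return error;				/* <= 0: dm_done() -> dm_end_request() */
}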
880 /* 880 /*
881 * Request completion handler for request-based dm 881 * Request completion handler for request-based dm
882 */ 882 */
883 static void dm_softirq_done(struct request *rq) 883 static void dm_softirq_done(struct request *rq)
884 { 884 {
885 bool mapped = true; 885 bool mapped = true;
886 struct request *clone = rq->completion_data; 886 struct request *clone = rq->completion_data;
887 struct dm_rq_target_io *tio = clone->end_io_data; 887 struct dm_rq_target_io *tio = clone->end_io_data;
888 888
889 if (rq->cmd_flags & REQ_FAILED) 889 if (rq->cmd_flags & REQ_FAILED)
890 mapped = false; 890 mapped = false;
891 891
892 dm_done(clone, tio->error, mapped); 892 dm_done(clone, tio->error, mapped);
893 } 893 }
894 894
895 /* 895 /*
896 * Complete the clone and the original request with the error status 896 * Complete the clone and the original request with the error status
897 * through softirq context. 897 * through softirq context.
898 */ 898 */
899 static void dm_complete_request(struct request *clone, int error) 899 static void dm_complete_request(struct request *clone, int error)
900 { 900 {
901 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
902 struct request *rq = tio->orig; 902 struct request *rq = tio->orig;
903 903
904 tio->error = error; 904 tio->error = error;
905 rq->completion_data = clone; 905 rq->completion_data = clone;
906 blk_complete_request(rq); 906 blk_complete_request(rq);
907 } 907 }
908 908
909 /* 909 /*
910 * Complete the not-mapped clone and the original request with the error status 910 * Complete the not-mapped clone and the original request with the error status
911 * through softirq context. 911 * through softirq context.
912 * Target's rq_end_io() function isn't called. 912 * Target's rq_end_io() function isn't called.
913 * This may be used when the target's map_rq() function fails. 913 * This may be used when the target's map_rq() function fails.
914 */ 914 */
915 void dm_kill_unmapped_request(struct request *clone, int error) 915 void dm_kill_unmapped_request(struct request *clone, int error)
916 { 916 {
917 struct dm_rq_target_io *tio = clone->end_io_data; 917 struct dm_rq_target_io *tio = clone->end_io_data;
918 struct request *rq = tio->orig; 918 struct request *rq = tio->orig;
919 919
920 rq->cmd_flags |= REQ_FAILED; 920 rq->cmd_flags |= REQ_FAILED;
921 dm_complete_request(clone, error); 921 dm_complete_request(clone, error);
922 } 922 }
923 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 923 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
924 924
925 /* 925 /*
926 * Called with the queue lock held 926 * Called with the queue lock held
927 */ 927 */
928 static void end_clone_request(struct request *clone, int error) 928 static void end_clone_request(struct request *clone, int error)
929 { 929 {
930 /* 930 /*
931 * This just cleans up the bookkeeping of the queue in which 931 * This just cleans up the bookkeeping of the queue in which
932 * the clone was dispatched. 932 * the clone was dispatched.
933 * The clone is *NOT* actually freed here because it was allocated from 933 * The clone is *NOT* actually freed here because it was allocated from
934 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 934 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
935 */ 935 */
936 __blk_put_request(clone->q, clone); 936 __blk_put_request(clone->q, clone);
937 937
938 /* 938 /*
939 * Actual request completion is done in a softirq context which doesn't 939 * Actual request completion is done in a softirq context which doesn't
940 * hold the queue lock. Otherwise, deadlock could occur because: 940 * hold the queue lock. Otherwise, deadlock could occur because:
941 * - another request may be submitted by the upper level 941 * - another request may be submitted by the upper level
942 * stacking driver during the completion 942 * stacking driver during the completion
943 * - that submission, which requires the queue lock, may be 943 * - that submission, which requires the queue lock, may be
944 * made against this very queue 944 * made against this very queue
945 */ 945 */
946 dm_complete_request(clone, error); 946 dm_complete_request(clone, error);
947 } 947 }
948 948
949 /* 949 /*
950 * Return maximum size of I/O possible at the supplied sector up to the current 950 * Return maximum size of I/O possible at the supplied sector up to the current
951 * target boundary. 951 * target boundary.
952 */ 952 */
953 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 953 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
954 { 954 {
955 sector_t target_offset = dm_target_offset(ti, sector); 955 sector_t target_offset = dm_target_offset(ti, sector);
956 956
957 return ti->len - target_offset; 957 return ti->len - target_offset;
958 } 958 }
959 959
960 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 960 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
961 { 961 {
962 sector_t len = max_io_len_target_boundary(sector, ti); 962 sector_t len = max_io_len_target_boundary(sector, ti);
963 963
964 /* 964 /*
965 * Does the target need to split even further? 965 * Does the target need to split even further?
966 */ 966 */
967 if (ti->split_io) { 967 if (ti->split_io) {
968 sector_t boundary; 968 sector_t boundary;
969 sector_t offset = dm_target_offset(ti, sector); 969 sector_t offset = dm_target_offset(ti, sector);
970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
971 - offset; 971 - offset;
972 if (len > boundary) 972 if (len > boundary)
973 len = boundary; 973 len = boundary;
974 } 974 }
975 975
976 return len; 976 return len;
977 } 977 }
978 978
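The split_io rounding in max_io_len() above is easier to follow with concrete numbers. The snippet below is a small standalone sketch (plain userspace C, with made-up values; not part of this commit) of the same arithmetic: an I/O starting at target offset 5 with split_io = 8 gets clipped to 3 sectors so it ends exactly on the next 8-sector boundary.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Mirrors the boundary arithmetic in max_io_len(): round the offset up to
 * the next multiple of split_io (a power of two) and return the distance. */
static sector_t sectors_to_boundary(sector_t offset, sector_t split_io)
{
	return ((offset + split_io) & ~(split_io - 1)) - offset;
}

int main(void)
{
	sector_t offset = 5, split_io = 8, len = 100;
	sector_t boundary = sectors_to_boundary(offset, split_io);

	if (len > boundary)
		len = boundary;

	/* Prints 3: the 100-sector I/O at offset 5 is clipped so it ends
	 * exactly on the 8-sector boundary. */
	printf("%llu\n", (unsigned long long)len);
	return 0;
}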
979 static void __map_bio(struct dm_target *ti, struct bio *clone, 979 static void __map_bio(struct dm_target *ti, struct bio *clone,
980 struct dm_target_io *tio) 980 struct dm_target_io *tio)
981 { 981 {
982 int r; 982 int r;
983 sector_t sector; 983 sector_t sector;
984 struct mapped_device *md; 984 struct mapped_device *md;
985 985
986 clone->bi_end_io = clone_endio; 986 clone->bi_end_io = clone_endio;
987 clone->bi_private = tio; 987 clone->bi_private = tio;
988 988
989 /* 989 /*
990 * Map the clone. If r == 0 we don't need to do 990 * Map the clone. If r == 0 we don't need to do
991 * anything, the target has assumed ownership of 991 * anything, the target has assumed ownership of
992 * this io. 992 * this io.
993 */ 993 */
994 atomic_inc(&tio->io->io_count); 994 atomic_inc(&tio->io->io_count);
995 sector = clone->bi_sector; 995 sector = clone->bi_sector;
996 r = ti->type->map(ti, clone, &tio->info); 996 r = ti->type->map(ti, clone, &tio->info);
997 if (r == DM_MAPIO_REMAPPED) { 997 if (r == DM_MAPIO_REMAPPED) {
998 /* the bio has been remapped so dispatch it */ 998 /* the bio has been remapped so dispatch it */
999 999
1000 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1000 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1001 tio->io->bio->bi_bdev->bd_dev, sector); 1001 tio->io->bio->bi_bdev->bd_dev, sector);
1002 1002
1003 generic_make_request(clone); 1003 generic_make_request(clone);
1004 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1004 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1005 /* error the io and bail out, or requeue it if needed */ 1005 /* error the io and bail out, or requeue it if needed */
1006 md = tio->io->md; 1006 md = tio->io->md;
1007 dec_pending(tio->io, r); 1007 dec_pending(tio->io, r);
1008 /* 1008 /*
1009 * Store bio_set for cleanup. 1009 * Store bio_set for cleanup.
1010 */ 1010 */
1011 clone->bi_private = md->bs; 1011 clone->bi_private = md->bs;
1012 bio_put(clone); 1012 bio_put(clone);
1013 free_tio(md, tio); 1013 free_tio(md, tio);
1014 } else if (r) { 1014 } else if (r) {
1015 DMWARN("unimplemented target map return value: %d", r); 1015 DMWARN("unimplemented target map return value: %d", r);
1016 BUG(); 1016 BUG();
1017 } 1017 }
1018 } 1018 }
1019 1019
1020 struct clone_info { 1020 struct clone_info {
1021 struct mapped_device *md; 1021 struct mapped_device *md;
1022 struct dm_table *map; 1022 struct dm_table *map;
1023 struct bio *bio; 1023 struct bio *bio;
1024 struct dm_io *io; 1024 struct dm_io *io;
1025 sector_t sector; 1025 sector_t sector;
1026 sector_t sector_count; 1026 sector_t sector_count;
1027 unsigned short idx; 1027 unsigned short idx;
1028 }; 1028 };
1029 1029
1030 static void dm_bio_destructor(struct bio *bio) 1030 static void dm_bio_destructor(struct bio *bio)
1031 { 1031 {
1032 struct bio_set *bs = bio->bi_private; 1032 struct bio_set *bs = bio->bi_private;
1033 1033
1034 bio_free(bio, bs); 1034 bio_free(bio, bs);
1035 } 1035 }
1036 1036
1037 /* 1037 /*
1038 * Creates a little bio that just does part of a bvec. 1038 * Creates a little bio that just does part of a bvec.
1039 */ 1039 */
1040 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1040 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1041 unsigned short idx, unsigned int offset, 1041 unsigned short idx, unsigned int offset,
1042 unsigned int len, struct bio_set *bs) 1042 unsigned int len, struct bio_set *bs)
1043 { 1043 {
1044 struct bio *clone; 1044 struct bio *clone;
1045 struct bio_vec *bv = bio->bi_io_vec + idx; 1045 struct bio_vec *bv = bio->bi_io_vec + idx;
1046 1046
1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1048 clone->bi_destructor = dm_bio_destructor; 1048 clone->bi_destructor = dm_bio_destructor;
1049 *clone->bi_io_vec = *bv; 1049 *clone->bi_io_vec = *bv;
1050 1050
1051 clone->bi_sector = sector; 1051 clone->bi_sector = sector;
1052 clone->bi_bdev = bio->bi_bdev; 1052 clone->bi_bdev = bio->bi_bdev;
1053 clone->bi_rw = bio->bi_rw; 1053 clone->bi_rw = bio->bi_rw;
1054 clone->bi_vcnt = 1; 1054 clone->bi_vcnt = 1;
1055 clone->bi_size = to_bytes(len); 1055 clone->bi_size = to_bytes(len);
1056 clone->bi_io_vec->bv_offset = offset; 1056 clone->bi_io_vec->bv_offset = offset;
1057 clone->bi_io_vec->bv_len = clone->bi_size; 1057 clone->bi_io_vec->bv_len = clone->bi_size;
1058 clone->bi_flags |= 1 << BIO_CLONED; 1058 clone->bi_flags |= 1 << BIO_CLONED;
1059 1059
1060 if (bio_integrity(bio)) { 1060 if (bio_integrity(bio)) {
1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1062 bio_integrity_trim(clone, 1062 bio_integrity_trim(clone,
1063 bio_sector_offset(bio, idx, offset), len); 1063 bio_sector_offset(bio, idx, offset), len);
1064 } 1064 }
1065 1065
1066 return clone; 1066 return clone;
1067 } 1067 }
1068 1068
1069 /* 1069 /*
1070 * Creates a bio that consists of a range of complete bvecs. 1070 * Creates a bio that consists of a range of complete bvecs.
1071 */ 1071 */
1072 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1072 static struct bio *clone_bio(struct bio *bio, sector_t sector,
1073 unsigned short idx, unsigned short bv_count, 1073 unsigned short idx, unsigned short bv_count,
1074 unsigned int len, struct bio_set *bs) 1074 unsigned int len, struct bio_set *bs)
1075 { 1075 {
1076 struct bio *clone; 1076 struct bio *clone;
1077 1077
1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1079 __bio_clone(clone, bio); 1079 __bio_clone(clone, bio);
1080 clone->bi_destructor = dm_bio_destructor; 1080 clone->bi_destructor = dm_bio_destructor;
1081 clone->bi_sector = sector; 1081 clone->bi_sector = sector;
1082 clone->bi_idx = idx; 1082 clone->bi_idx = idx;
1083 clone->bi_vcnt = idx + bv_count; 1083 clone->bi_vcnt = idx + bv_count;
1084 clone->bi_size = to_bytes(len); 1084 clone->bi_size = to_bytes(len);
1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1086 1086
1087 if (bio_integrity(bio)) { 1087 if (bio_integrity(bio)) {
1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1089 1089
1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1091 bio_integrity_trim(clone, 1091 bio_integrity_trim(clone,
1092 bio_sector_offset(bio, idx, 0), len); 1092 bio_sector_offset(bio, idx, 0), len);
1093 } 1093 }
1094 1094
1095 return clone; 1095 return clone;
1096 } 1096 }
1097 1097
1098 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1098 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1099 struct dm_target *ti) 1099 struct dm_target *ti)
1100 { 1100 {
1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1102 1102
1103 tio->io = ci->io; 1103 tio->io = ci->io;
1104 tio->ti = ti; 1104 tio->ti = ti;
1105 memset(&tio->info, 0, sizeof(tio->info)); 1105 memset(&tio->info, 0, sizeof(tio->info));
1106 1106
1107 return tio; 1107 return tio;
1108 } 1108 }
1109 1109
1110 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1110 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1111 unsigned request_nr, sector_t len) 1111 unsigned request_nr, sector_t len)
1112 { 1112 {
1113 struct dm_target_io *tio = alloc_tio(ci, ti); 1113 struct dm_target_io *tio = alloc_tio(ci, ti);
1114 struct bio *clone; 1114 struct bio *clone;
1115 1115
1116 tio->info.target_request_nr = request_nr; 1116 tio->info.target_request_nr = request_nr;
1117 1117
1118 /* 1118 /*
1119 * Discard requests require the bio's inline iovecs be initialized. 1119 * Discard requests require the bio's inline iovecs be initialized.
1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1121 * and discard, so no need for concern about wasted bvec allocations. 1121 * and discard, so no need for concern about wasted bvec allocations.
1122 */ 1122 */
1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1124 __bio_clone(clone, ci->bio); 1124 __bio_clone(clone, ci->bio);
1125 clone->bi_destructor = dm_bio_destructor; 1125 clone->bi_destructor = dm_bio_destructor;
1126 if (len) { 1126 if (len) {
1127 clone->bi_sector = ci->sector; 1127 clone->bi_sector = ci->sector;
1128 clone->bi_size = to_bytes(len); 1128 clone->bi_size = to_bytes(len);
1129 } 1129 }
1130 1130
1131 __map_bio(ti, clone, tio); 1131 __map_bio(ti, clone, tio);
1132 } 1132 }
1133 1133
1134 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1134 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1135 unsigned num_requests, sector_t len) 1135 unsigned num_requests, sector_t len)
1136 { 1136 {
1137 unsigned request_nr; 1137 unsigned request_nr;
1138 1138
1139 for (request_nr = 0; request_nr < num_requests; request_nr++) 1139 for (request_nr = 0; request_nr < num_requests; request_nr++)
1140 __issue_target_request(ci, ti, request_nr, len); 1140 __issue_target_request(ci, ti, request_nr, len);
1141 } 1141 }
1142 1142
1143 static int __clone_and_map_empty_flush(struct clone_info *ci) 1143 static int __clone_and_map_empty_flush(struct clone_info *ci)
1144 { 1144 {
1145 unsigned target_nr = 0; 1145 unsigned target_nr = 0;
1146 struct dm_target *ti; 1146 struct dm_target *ti;
1147 1147
1148 BUG_ON(bio_has_data(ci->bio)); 1148 BUG_ON(bio_has_data(ci->bio));
1149 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1149 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1150 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1150 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1151 1151
1152 return 0; 1152 return 0;
1153 } 1153 }
1154 1154
1155 /* 1155 /*
1156 * Perform all io with a single clone. 1156 * Perform all io with a single clone.
1157 */ 1157 */
1158 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1158 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1159 { 1159 {
1160 struct bio *clone, *bio = ci->bio; 1160 struct bio *clone, *bio = ci->bio;
1161 struct dm_target_io *tio; 1161 struct dm_target_io *tio;
1162 1162
1163 tio = alloc_tio(ci, ti); 1163 tio = alloc_tio(ci, ti);
1164 clone = clone_bio(bio, ci->sector, ci->idx, 1164 clone = clone_bio(bio, ci->sector, ci->idx,
1165 bio->bi_vcnt - ci->idx, ci->sector_count, 1165 bio->bi_vcnt - ci->idx, ci->sector_count,
1166 ci->md->bs); 1166 ci->md->bs);
1167 __map_bio(ti, clone, tio); 1167 __map_bio(ti, clone, tio);
1168 ci->sector_count = 0; 1168 ci->sector_count = 0;
1169 } 1169 }
1170 1170
1171 static int __clone_and_map_discard(struct clone_info *ci) 1171 static int __clone_and_map_discard(struct clone_info *ci)
1172 { 1172 {
1173 struct dm_target *ti; 1173 struct dm_target *ti;
1174 sector_t len; 1174 sector_t len;
1175 1175
1176 do { 1176 do {
1177 ti = dm_table_find_target(ci->map, ci->sector); 1177 ti = dm_table_find_target(ci->map, ci->sector);
1178 if (!dm_target_is_valid(ti)) 1178 if (!dm_target_is_valid(ti))
1179 return -EIO; 1179 return -EIO;
1180 1180
1181 /* 1181 /*
1182 * Even though the device advertised discard support, 1182 * Even though the device advertised discard support,
1183 * that does not mean every target supports it, and 1183 * that does not mean every target supports it, and
1184 * reconfiguration might also have changed that since the 1184 * reconfiguration might also have changed that since the
1185 * check was performed. 1185 * check was performed.
1186 */ 1186 */
1187 if (!ti->num_discard_requests) 1187 if (!ti->num_discard_requests)
1188 return -EOPNOTSUPP; 1188 return -EOPNOTSUPP;
1189 1189
1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1191 1191
1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1193 1193
1194 ci->sector += len; 1194 ci->sector += len;
1195 } while (ci->sector_count -= len); 1195 } while (ci->sector_count -= len);
1196 1196
1197 return 0; 1197 return 0;
1198 } 1198 }
1199 1199
1200 static int __clone_and_map(struct clone_info *ci) 1200 static int __clone_and_map(struct clone_info *ci)
1201 { 1201 {
1202 struct bio *clone, *bio = ci->bio; 1202 struct bio *clone, *bio = ci->bio;
1203 struct dm_target *ti; 1203 struct dm_target *ti;
1204 sector_t len = 0, max; 1204 sector_t len = 0, max;
1205 struct dm_target_io *tio; 1205 struct dm_target_io *tio;
1206 1206
1207 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1207 if (unlikely(bio->bi_rw & REQ_DISCARD))
1208 return __clone_and_map_discard(ci); 1208 return __clone_and_map_discard(ci);
1209 1209
1210 ti = dm_table_find_target(ci->map, ci->sector); 1210 ti = dm_table_find_target(ci->map, ci->sector);
1211 if (!dm_target_is_valid(ti)) 1211 if (!dm_target_is_valid(ti))
1212 return -EIO; 1212 return -EIO;
1213 1213
1214 max = max_io_len(ci->sector, ti); 1214 max = max_io_len(ci->sector, ti);
1215 1215
1216 if (ci->sector_count <= max) { 1216 if (ci->sector_count <= max) {
1217 /* 1217 /*
1218 * Optimise for the simple case where we can do all of 1218 * Optimise for the simple case where we can do all of
1219 * the remaining io with a single clone. 1219 * the remaining io with a single clone.
1220 */ 1220 */
1221 __clone_and_map_simple(ci, ti); 1221 __clone_and_map_simple(ci, ti);
1222 1222
1223 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1223 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1224 /* 1224 /*
1225 * There are some bvecs that don't span targets. 1225 * There are some bvecs that don't span targets.
1226 * Do as many of these as possible. 1226 * Do as many of these as possible.
1227 */ 1227 */
1228 int i; 1228 int i;
1229 sector_t remaining = max; 1229 sector_t remaining = max;
1230 sector_t bv_len; 1230 sector_t bv_len;
1231 1231
1232 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1232 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1233 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1233 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1234 1234
1235 if (bv_len > remaining) 1235 if (bv_len > remaining)
1236 break; 1236 break;
1237 1237
1238 remaining -= bv_len; 1238 remaining -= bv_len;
1239 len += bv_len; 1239 len += bv_len;
1240 } 1240 }
1241 1241
1242 tio = alloc_tio(ci, ti); 1242 tio = alloc_tio(ci, ti);
1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1244 ci->md->bs); 1244 ci->md->bs);
1245 __map_bio(ti, clone, tio); 1245 __map_bio(ti, clone, tio);
1246 1246
1247 ci->sector += len; 1247 ci->sector += len;
1248 ci->sector_count -= len; 1248 ci->sector_count -= len;
1249 ci->idx = i; 1249 ci->idx = i;
1250 1250
1251 } else { 1251 } else {
1252 /* 1252 /*
1253 * Handle a bvec that must be split between two or more targets. 1253 * Handle a bvec that must be split between two or more targets.
1254 */ 1254 */
1255 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1255 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1256 sector_t remaining = to_sector(bv->bv_len); 1256 sector_t remaining = to_sector(bv->bv_len);
1257 unsigned int offset = 0; 1257 unsigned int offset = 0;
1258 1258
1259 do { 1259 do {
1260 if (offset) { 1260 if (offset) {
1261 ti = dm_table_find_target(ci->map, ci->sector); 1261 ti = dm_table_find_target(ci->map, ci->sector);
1262 if (!dm_target_is_valid(ti)) 1262 if (!dm_target_is_valid(ti))
1263 return -EIO; 1263 return -EIO;
1264 1264
1265 max = max_io_len(ci->sector, ti); 1265 max = max_io_len(ci->sector, ti);
1266 } 1266 }
1267 1267
1268 len = min(remaining, max); 1268 len = min(remaining, max);
1269 1269
1270 tio = alloc_tio(ci, ti); 1270 tio = alloc_tio(ci, ti);
1271 clone = split_bvec(bio, ci->sector, ci->idx, 1271 clone = split_bvec(bio, ci->sector, ci->idx,
1272 bv->bv_offset + offset, len, 1272 bv->bv_offset + offset, len,
1273 ci->md->bs); 1273 ci->md->bs);
1274 1274
1275 __map_bio(ti, clone, tio); 1275 __map_bio(ti, clone, tio);
1276 1276
1277 ci->sector += len; 1277 ci->sector += len;
1278 ci->sector_count -= len; 1278 ci->sector_count -= len;
1279 offset += to_bytes(len); 1279 offset += to_bytes(len);
1280 } while (remaining -= len); 1280 } while (remaining -= len);
1281 1281
1282 ci->idx++; 1282 ci->idx++;
1283 } 1283 }
1284 1284
1285 return 0; 1285 return 0;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * Split the bio into several clones and submit it to targets. 1289 * Split the bio into several clones and submit it to targets.
1290 */ 1290 */
1291 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1291 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1292 { 1292 {
1293 struct clone_info ci; 1293 struct clone_info ci;
1294 int error = 0; 1294 int error = 0;
1295 1295
1296 ci.map = dm_get_live_table(md); 1296 ci.map = dm_get_live_table(md);
1297 if (unlikely(!ci.map)) { 1297 if (unlikely(!ci.map)) {
1298 bio_io_error(bio); 1298 bio_io_error(bio);
1299 return; 1299 return;
1300 } 1300 }
1301 1301
1302 ci.md = md; 1302 ci.md = md;
1303 ci.io = alloc_io(md); 1303 ci.io = alloc_io(md);
1304 ci.io->error = 0; 1304 ci.io->error = 0;
1305 atomic_set(&ci.io->io_count, 1); 1305 atomic_set(&ci.io->io_count, 1);
1306 ci.io->bio = bio; 1306 ci.io->bio = bio;
1307 ci.io->md = md; 1307 ci.io->md = md;
1308 spin_lock_init(&ci.io->endio_lock); 1308 spin_lock_init(&ci.io->endio_lock);
1309 ci.sector = bio->bi_sector; 1309 ci.sector = bio->bi_sector;
1310 ci.idx = bio->bi_idx; 1310 ci.idx = bio->bi_idx;
1311 1311
1312 start_io_acct(ci.io); 1312 start_io_acct(ci.io);
1313 if (bio->bi_rw & REQ_FLUSH) { 1313 if (bio->bi_rw & REQ_FLUSH) {
1314 ci.bio = &ci.md->flush_bio; 1314 ci.bio = &ci.md->flush_bio;
1315 ci.sector_count = 0; 1315 ci.sector_count = 0;
1316 error = __clone_and_map_empty_flush(&ci); 1316 error = __clone_and_map_empty_flush(&ci);
1317 /* dec_pending submits any data associated with flush */ 1317 /* dec_pending submits any data associated with flush */
1318 } else { 1318 } else {
1319 ci.bio = bio; 1319 ci.bio = bio;
1320 ci.sector_count = bio_sectors(bio); 1320 ci.sector_count = bio_sectors(bio);
1321 while (ci.sector_count && !error) 1321 while (ci.sector_count && !error)
1322 error = __clone_and_map(&ci); 1322 error = __clone_and_map(&ci);
1323 } 1323 }
1324 1324
1325 /* drop the extra reference count */ 1325 /* drop the extra reference count */
1326 dec_pending(ci.io, error); 1326 dec_pending(ci.io, error);
1327 dm_table_put(ci.map); 1327 dm_table_put(ci.map);
1328 } 1328 }
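For context on the empty-flush path above: how many flush clones a target receives is driven by ti->num_flush_requests, which each target sets in its constructor. The fragment below is a minimal, hypothetical constructor (names are illustrative, not from this commit) showing where those counters come from.

#include <linux/device-mapper.h>

/* Hypothetical constructor fragment for a bio-based target that wants one
 * empty flush clone per REQ_FLUSH bio and one clone per discard. */
static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	ti->num_flush_requests = 1;	/* consumed by __clone_and_map_empty_flush() */
	ti->num_discard_requests = 1;	/* consumed by __clone_and_map_discard() */

	return 0;
}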
1329 /*----------------------------------------------------------------- 1329 /*-----------------------------------------------------------------
1330 * CRUD END 1330 * CRUD END
1331 *---------------------------------------------------------------*/ 1331 *---------------------------------------------------------------*/
1332 1332
1333 static int dm_merge_bvec(struct request_queue *q, 1333 static int dm_merge_bvec(struct request_queue *q,
1334 struct bvec_merge_data *bvm, 1334 struct bvec_merge_data *bvm,
1335 struct bio_vec *biovec) 1335 struct bio_vec *biovec)
1336 { 1336 {
1337 struct mapped_device *md = q->queuedata; 1337 struct mapped_device *md = q->queuedata;
1338 struct dm_table *map = dm_get_live_table(md); 1338 struct dm_table *map = dm_get_live_table(md);
1339 struct dm_target *ti; 1339 struct dm_target *ti;
1340 sector_t max_sectors; 1340 sector_t max_sectors;
1341 int max_size = 0; 1341 int max_size = 0;
1342 1342
1343 if (unlikely(!map)) 1343 if (unlikely(!map))
1344 goto out; 1344 goto out;
1345 1345
1346 ti = dm_table_find_target(map, bvm->bi_sector); 1346 ti = dm_table_find_target(map, bvm->bi_sector);
1347 if (!dm_target_is_valid(ti)) 1347 if (!dm_target_is_valid(ti))
1348 goto out_table; 1348 goto out_table;
1349 1349
1350 /* 1350 /*
1351 * Find maximum amount of I/O that won't need splitting 1351 * Find maximum amount of I/O that won't need splitting
1352 */ 1352 */
1353 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1353 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1354 (sector_t) BIO_MAX_SECTORS); 1354 (sector_t) BIO_MAX_SECTORS);
1355 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1355 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1356 if (max_size < 0) 1356 if (max_size < 0)
1357 max_size = 0; 1357 max_size = 0;
1358 1358
1359 /* 1359 /*
1360 * merge_bvec_fn() returns number of bytes 1360 * merge_bvec_fn() returns number of bytes
1361 * it can accept at this offset 1361 * it can accept at this offset
1362 * max is precomputed maximal io size 1362 * max is precomputed maximal io size
1363 */ 1363 */
1364 if (max_size && ti->type->merge) 1364 if (max_size && ti->type->merge)
1365 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1365 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1366 /* 1366 /*
1367 * If the target doesn't support merge method and some of the devices 1367 * If the target doesn't support merge method and some of the devices
1368 * provided their merge_bvec method (we know this by looking at 1368 * provided their merge_bvec method (we know this by looking at
1369 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1369 * queue_max_hw_sectors), then we can't allow bios with multiple vector
1370 * entries. So always set max_size to 0, and the code below allows 1370 * entries. So always set max_size to 0, and the code below allows
1371 * just one page. 1371 * just one page.
1372 */ 1372 */
1373 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1373 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1374 1374
1375 max_size = 0; 1375 max_size = 0;
1376 1376
1377 out_table: 1377 out_table:
1378 dm_table_put(map); 1378 dm_table_put(map);
1379 1379
1380 out: 1380 out:
1381 /* 1381 /*
1382 * Always allow an entire first page 1382 * Always allow an entire first page
1383 */ 1383 */
1384 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1384 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1385 max_size = biovec->bv_len; 1385 max_size = biovec->bv_len;
1386 1386
1387 return max_size; 1387 return max_size;
1388 } 1388 }
1389 1389
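The ti->type->merge hook consulted in dm_merge_bvec() usually just forwards the query to the underlying device, much as dm-linear does. The following is a simplified sketch under the assumption of a single underlying device kept in a private per-target structure; the struct and function names are illustrative.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical per-target context: one underlying device plus its offset. */
struct example_ctx {
	struct dm_dev *dev;
	sector_t start;
};

/* Forward the bvec-merge query to the underlying queue (as dm-linear does)
 * so dm_merge_bvec() can honour the lower device's limits. */
static int example_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			 struct bio_vec *biovec, int max_size)
{
	struct example_ctx *ec = ti->private;
	struct request_queue *q = bdev_get_queue(ec->dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = ec->dev->bdev;
	bvm->bi_sector = ec->start + dm_target_offset(ti, bvm->bi_sector);

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}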
1390 /* 1390 /*
1391 * The request function that just remaps the bio built up by 1391 * The request function that just remaps the bio built up by
1392 * dm_merge_bvec. 1392 * dm_merge_bvec.
1393 */ 1393 */
1394 static int _dm_request(struct request_queue *q, struct bio *bio) 1394 static int _dm_request(struct request_queue *q, struct bio *bio)
1395 { 1395 {
1396 int rw = bio_data_dir(bio); 1396 int rw = bio_data_dir(bio);
1397 struct mapped_device *md = q->queuedata; 1397 struct mapped_device *md = q->queuedata;
1398 int cpu; 1398 int cpu;
1399 1399
1400 down_read(&md->io_lock); 1400 down_read(&md->io_lock);
1401 1401
1402 cpu = part_stat_lock(); 1402 cpu = part_stat_lock();
1403 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1403 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1404 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1404 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1405 part_stat_unlock(); 1405 part_stat_unlock();
1406 1406
1407 /* if we're suspended, we have to queue this io for later */ 1407 /* if we're suspended, we have to queue this io for later */
1408 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1408 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1409 up_read(&md->io_lock); 1409 up_read(&md->io_lock);
1410 1410
1411 if (bio_rw(bio) != READA) 1411 if (bio_rw(bio) != READA)
1412 queue_io(md, bio); 1412 queue_io(md, bio);
1413 else 1413 else
1414 bio_io_error(bio); 1414 bio_io_error(bio);
1415 return 0; 1415 return 0;
1416 } 1416 }
1417 1417
1418 __split_and_process_bio(md, bio); 1418 __split_and_process_bio(md, bio);
1419 up_read(&md->io_lock); 1419 up_read(&md->io_lock);
1420 return 0; 1420 return 0;
1421 } 1421 }
1422 1422
1423 static int dm_make_request(struct request_queue *q, struct bio *bio) 1423 static int dm_make_request(struct request_queue *q, struct bio *bio)
1424 { 1424 {
1425 struct mapped_device *md = q->queuedata; 1425 struct mapped_device *md = q->queuedata;
1426 1426
1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1428 } 1428 }
1429 1429
1430 static int dm_request_based(struct mapped_device *md) 1430 static int dm_request_based(struct mapped_device *md)
1431 { 1431 {
1432 return blk_queue_stackable(md->queue); 1432 return blk_queue_stackable(md->queue);
1433 } 1433 }
1434 1434
1435 static int dm_request(struct request_queue *q, struct bio *bio) 1435 static int dm_request(struct request_queue *q, struct bio *bio)
1436 { 1436 {
1437 struct mapped_device *md = q->queuedata; 1437 struct mapped_device *md = q->queuedata;
1438 1438
1439 if (dm_request_based(md)) 1439 if (dm_request_based(md))
1440 return dm_make_request(q, bio); 1440 return dm_make_request(q, bio);
1441 1441
1442 return _dm_request(q, bio); 1442 return _dm_request(q, bio);
1443 } 1443 }
1444 1444
1445 void dm_dispatch_request(struct request *rq) 1445 void dm_dispatch_request(struct request *rq)
1446 { 1446 {
1447 int r; 1447 int r;
1448 1448
1449 if (blk_queue_io_stat(rq->q)) 1449 if (blk_queue_io_stat(rq->q))
1450 rq->cmd_flags |= REQ_IO_STAT; 1450 rq->cmd_flags |= REQ_IO_STAT;
1451 1451
1452 rq->start_time = jiffies; 1452 rq->start_time = jiffies;
1453 r = blk_insert_cloned_request(rq->q, rq); 1453 r = blk_insert_cloned_request(rq->q, rq);
1454 if (r) 1454 if (r)
1455 dm_complete_request(rq, r); 1455 dm_complete_request(rq, r);
1456 } 1456 }
1457 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1457 EXPORT_SYMBOL_GPL(dm_dispatch_request);
1458 1458
1459 static void dm_rq_bio_destructor(struct bio *bio) 1459 static void dm_rq_bio_destructor(struct bio *bio)
1460 { 1460 {
1461 struct dm_rq_clone_bio_info *info = bio->bi_private; 1461 struct dm_rq_clone_bio_info *info = bio->bi_private;
1462 struct mapped_device *md = info->tio->md; 1462 struct mapped_device *md = info->tio->md;
1463 1463
1464 free_bio_info(info); 1464 free_bio_info(info);
1465 bio_free(bio, md->bs); 1465 bio_free(bio, md->bs);
1466 } 1466 }
1467 1467
1468 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1468 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1469 void *data) 1469 void *data)
1470 { 1470 {
1471 struct dm_rq_target_io *tio = data; 1471 struct dm_rq_target_io *tio = data;
1472 struct mapped_device *md = tio->md; 1472 struct mapped_device *md = tio->md;
1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1474 1474
1475 if (!info) 1475 if (!info)
1476 return -ENOMEM; 1476 return -ENOMEM;
1477 1477
1478 info->orig = bio_orig; 1478 info->orig = bio_orig;
1479 info->tio = tio; 1479 info->tio = tio;
1480 bio->bi_end_io = end_clone_bio; 1480 bio->bi_end_io = end_clone_bio;
1481 bio->bi_private = info; 1481 bio->bi_private = info;
1482 bio->bi_destructor = dm_rq_bio_destructor; 1482 bio->bi_destructor = dm_rq_bio_destructor;
1483 1483
1484 return 0; 1484 return 0;
1485 } 1485 }
1486 1486
1487 static int setup_clone(struct request *clone, struct request *rq, 1487 static int setup_clone(struct request *clone, struct request *rq,
1488 struct dm_rq_target_io *tio) 1488 struct dm_rq_target_io *tio)
1489 { 1489 {
1490 int r; 1490 int r;
1491 1491
1492 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1492 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1493 dm_rq_bio_constructor, tio); 1493 dm_rq_bio_constructor, tio);
1494 if (r) 1494 if (r)
1495 return r; 1495 return r;
1496 1496
1497 clone->cmd = rq->cmd; 1497 clone->cmd = rq->cmd;
1498 clone->cmd_len = rq->cmd_len; 1498 clone->cmd_len = rq->cmd_len;
1499 clone->sense = rq->sense; 1499 clone->sense = rq->sense;
1500 clone->buffer = rq->buffer; 1500 clone->buffer = rq->buffer;
1501 clone->end_io = end_clone_request; 1501 clone->end_io = end_clone_request;
1502 clone->end_io_data = tio; 1502 clone->end_io_data = tio;
1503 1503
1504 return 0; 1504 return 0;
1505 } 1505 }
1506 1506
1507 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1507 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1508 gfp_t gfp_mask) 1508 gfp_t gfp_mask)
1509 { 1509 {
1510 struct request *clone; 1510 struct request *clone;
1511 struct dm_rq_target_io *tio; 1511 struct dm_rq_target_io *tio;
1512 1512
1513 tio = alloc_rq_tio(md, gfp_mask); 1513 tio = alloc_rq_tio(md, gfp_mask);
1514 if (!tio) 1514 if (!tio)
1515 return NULL; 1515 return NULL;
1516 1516
1517 tio->md = md; 1517 tio->md = md;
1518 tio->ti = NULL; 1518 tio->ti = NULL;
1519 tio->orig = rq; 1519 tio->orig = rq;
1520 tio->error = 0; 1520 tio->error = 0;
1521 memset(&tio->info, 0, sizeof(tio->info)); 1521 memset(&tio->info, 0, sizeof(tio->info));
1522 1522
1523 clone = &tio->clone; 1523 clone = &tio->clone;
1524 if (setup_clone(clone, rq, tio)) { 1524 if (setup_clone(clone, rq, tio)) {
1525 /* -ENOMEM */ 1525 /* -ENOMEM */
1526 free_rq_tio(tio); 1526 free_rq_tio(tio);
1527 return NULL; 1527 return NULL;
1528 } 1528 }
1529 1529
1530 return clone; 1530 return clone;
1531 } 1531 }
1532 1532
1533 /* 1533 /*
1534 * Called with the queue lock held. 1534 * Called with the queue lock held.
1535 */ 1535 */
1536 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1536 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1537 { 1537 {
1538 struct mapped_device *md = q->queuedata; 1538 struct mapped_device *md = q->queuedata;
1539 struct request *clone; 1539 struct request *clone;
1540 1540
1541 if (unlikely(rq->special)) { 1541 if (unlikely(rq->special)) {
1542 DMWARN("Already has something in rq->special."); 1542 DMWARN("Already has something in rq->special.");
1543 return BLKPREP_KILL; 1543 return BLKPREP_KILL;
1544 } 1544 }
1545 1545
1546 clone = clone_rq(rq, md, GFP_ATOMIC); 1546 clone = clone_rq(rq, md, GFP_ATOMIC);
1547 if (!clone) 1547 if (!clone)
1548 return BLKPREP_DEFER; 1548 return BLKPREP_DEFER;
1549 1549
1550 rq->special = clone; 1550 rq->special = clone;
1551 rq->cmd_flags |= REQ_DONTPREP; 1551 rq->cmd_flags |= REQ_DONTPREP;
1552 1552
1553 return BLKPREP_OK; 1553 return BLKPREP_OK;
1554 } 1554 }
1555 1555
1556 /* 1556 /*
1557 * Returns: 1557 * Returns:
1558 * 0 : the request has been processed (not requeued) 1558 * 0 : the request has been processed (not requeued)
1559 * !0 : the request has been requeued 1559 * !0 : the request has been requeued
1560 */ 1560 */
1561 static int map_request(struct dm_target *ti, struct request *clone, 1561 static int map_request(struct dm_target *ti, struct request *clone,
1562 struct mapped_device *md) 1562 struct mapped_device *md)
1563 { 1563 {
1564 int r, requeued = 0; 1564 int r, requeued = 0;
1565 struct dm_rq_target_io *tio = clone->end_io_data; 1565 struct dm_rq_target_io *tio = clone->end_io_data;
1566 1566
1567 /* 1567 /*
1568 * Hold the md reference here for the in-flight I/O. 1568 * Hold the md reference here for the in-flight I/O.
1569 * We can't rely on the reference count held by the device opener, 1569 * We can't rely on the reference count held by the device opener,
1570 * because the device may be closed during the request completion 1570 * because the device may be closed during the request completion
1571 * once all bios have completed. 1571 * once all bios have completed.
1572 * See the comment in rq_completed() too. 1572 * See the comment in rq_completed() too.
1573 */ 1573 */
1574 dm_get(md); 1574 dm_get(md);
1575 1575
1576 tio->ti = ti; 1576 tio->ti = ti;
1577 r = ti->type->map_rq(ti, clone, &tio->info); 1577 r = ti->type->map_rq(ti, clone, &tio->info);
1578 switch (r) { 1578 switch (r) {
1579 case DM_MAPIO_SUBMITTED: 1579 case DM_MAPIO_SUBMITTED:
1580 /* The target has taken the I/O to submit by itself later */ 1580 /* The target has taken the I/O to submit by itself later */
1581 break; 1581 break;
1582 case DM_MAPIO_REMAPPED: 1582 case DM_MAPIO_REMAPPED:
1583 /* The target has remapped the I/O so dispatch it */ 1583 /* The target has remapped the I/O so dispatch it */
1584 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1584 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1585 blk_rq_pos(tio->orig)); 1585 blk_rq_pos(tio->orig));
1586 dm_dispatch_request(clone); 1586 dm_dispatch_request(clone);
1587 break; 1587 break;
1588 case DM_MAPIO_REQUEUE: 1588 case DM_MAPIO_REQUEUE:
1589 /* The target wants to requeue the I/O */ 1589 /* The target wants to requeue the I/O */
1590 dm_requeue_unmapped_request(clone); 1590 dm_requeue_unmapped_request(clone);
1591 requeued = 1; 1591 requeued = 1;
1592 break; 1592 break;
1593 default: 1593 default:
1594 if (r > 0) { 1594 if (r > 0) {
1595 DMWARN("unimplemented target map return value: %d", r); 1595 DMWARN("unimplemented target map return value: %d", r);
1596 BUG(); 1596 BUG();
1597 } 1597 }
1598 1598
1599 /* The target wants to complete the I/O */ 1599 /* The target wants to complete the I/O */
1600 dm_kill_unmapped_request(clone, r); 1600 dm_kill_unmapped_request(clone, r);
1601 break; 1601 break;
1602 } 1602 }
1603 1603
1604 return requeued; 1604 return requeued;
1605 } 1605 }
1606 1606
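To make the DM_MAPIO_* cases above concrete, a request-based target's map_rq hook might look like the hypothetical fragment below, which simply points the clone at one underlying device so dm_dispatch_request() can send it down; the assumption that ti->private holds a struct dm_dev pointer is made only for this sketch.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical request-based map function: redirect the clone to a single
 * underlying device (assumed to be stored in ti->private by the ctr) and
 * let dm_request_fn() dispatch it via dm_dispatch_request(). */
static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	struct dm_dev *dev = ti->private;

	clone->q = bdev_get_queue(dev->bdev);
	clone->rq_disk = dev->bdev->bd_disk;

	return DM_MAPIO_REMAPPED;
}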
1607 /* 1607 /*
1608 * q->request_fn for request-based dm. 1608 * q->request_fn for request-based dm.
1609 * Called with the queue lock held. 1609 * Called with the queue lock held.
1610 */ 1610 */
1611 static void dm_request_fn(struct request_queue *q) 1611 static void dm_request_fn(struct request_queue *q)
1612 { 1612 {
1613 struct mapped_device *md = q->queuedata; 1613 struct mapped_device *md = q->queuedata;
1614 struct dm_table *map = dm_get_live_table(md); 1614 struct dm_table *map = dm_get_live_table(md);
1615 struct dm_target *ti; 1615 struct dm_target *ti;
1616 struct request *rq, *clone; 1616 struct request *rq, *clone;
1617 sector_t pos; 1617 sector_t pos;
1618 1618
1619 /* 1619 /*
1620 * For suspend, check blk_queue_stopped() and increment 1620 * For suspend, check blk_queue_stopped() and increment
1621 * ->pending while holding a single queue_lock, so that the 1621 * ->pending while holding a single queue_lock, so that the
1622 * number of in-flight I/Os does not increase after the queue is 1622 * number of in-flight I/Os does not increase after the queue is
1623 * stopped in dm_suspend(). 1623 * stopped in dm_suspend().
1624 */ 1624 */
1625 while (!blk_queue_stopped(q)) { 1625 while (!blk_queue_stopped(q)) {
1626 rq = blk_peek_request(q); 1626 rq = blk_peek_request(q);
1627 if (!rq) 1627 if (!rq)
1628 goto delay_and_out; 1628 goto delay_and_out;
1629 1629
1630 /* always use block 0 to find the target for flushes for now */ 1630 /* always use block 0 to find the target for flushes for now */
1631 pos = 0; 1631 pos = 0;
1632 if (!(rq->cmd_flags & REQ_FLUSH)) 1632 if (!(rq->cmd_flags & REQ_FLUSH))
1633 pos = blk_rq_pos(rq); 1633 pos = blk_rq_pos(rq);
1634 1634
1635 ti = dm_table_find_target(map, pos); 1635 ti = dm_table_find_target(map, pos);
1636 BUG_ON(!dm_target_is_valid(ti)); 1636 BUG_ON(!dm_target_is_valid(ti));
1637 1637
1638 if (ti->type->busy && ti->type->busy(ti)) 1638 if (ti->type->busy && ti->type->busy(ti))
1639 goto delay_and_out; 1639 goto delay_and_out;
1640 1640
1641 blk_start_request(rq); 1641 blk_start_request(rq);
1642 clone = rq->special; 1642 clone = rq->special;
1643 atomic_inc(&md->pending[rq_data_dir(clone)]); 1643 atomic_inc(&md->pending[rq_data_dir(clone)]);
1644 1644
1645 spin_unlock(q->queue_lock); 1645 spin_unlock(q->queue_lock);
1646 if (map_request(ti, clone, md)) 1646 if (map_request(ti, clone, md))
1647 goto requeued; 1647 goto requeued;
1648 1648
1649 BUG_ON(!irqs_disabled()); 1649 BUG_ON(!irqs_disabled());
1650 spin_lock(q->queue_lock); 1650 spin_lock(q->queue_lock);
1651 } 1651 }
1652 1652
1653 goto out; 1653 goto out;
1654 1654
1655 requeued: 1655 requeued:
1656 BUG_ON(!irqs_disabled()); 1656 BUG_ON(!irqs_disabled());
1657 spin_lock(q->queue_lock); 1657 spin_lock(q->queue_lock);
1658 1658
1659 delay_and_out: 1659 delay_and_out:
1660 blk_delay_queue(q, HZ / 10); 1660 blk_delay_queue(q, HZ / 10);
1661 out: 1661 out:
1662 dm_table_put(map); 1662 dm_table_put(map);
1663 1663
1664 return; 1664 return;
1665 } 1665 }
1666 1666
1667 int dm_underlying_device_busy(struct request_queue *q) 1667 int dm_underlying_device_busy(struct request_queue *q)
1668 { 1668 {
1669 return blk_lld_busy(q); 1669 return blk_lld_busy(q);
1670 } 1670 }
1671 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1671 EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1672 1672
1673 static int dm_lld_busy(struct request_queue *q) 1673 static int dm_lld_busy(struct request_queue *q)
1674 { 1674 {
1675 int r; 1675 int r;
1676 struct mapped_device *md = q->queuedata; 1676 struct mapped_device *md = q->queuedata;
1677 struct dm_table *map = dm_get_live_table(md); 1677 struct dm_table *map = dm_get_live_table(md);
1678 1678
1679 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1679 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1680 r = 1; 1680 r = 1;
1681 else 1681 else
1682 r = dm_table_any_busy_target(map); 1682 r = dm_table_any_busy_target(map);
1683 1683
1684 dm_table_put(map); 1684 dm_table_put(map);
1685 1685
1686 return r; 1686 return r;
1687 } 1687 }
1688 1688
1689 static int dm_any_congested(void *congested_data, int bdi_bits) 1689 static int dm_any_congested(void *congested_data, int bdi_bits)
1690 { 1690 {
1691 int r = bdi_bits; 1691 int r = bdi_bits;
1692 struct mapped_device *md = congested_data; 1692 struct mapped_device *md = congested_data;
1693 struct dm_table *map; 1693 struct dm_table *map;
1694 1694
1695 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1695 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1696 map = dm_get_live_table(md); 1696 map = dm_get_live_table(md);
1697 if (map) { 1697 if (map) {
1698 /* 1698 /*
1699 * Request-based dm only cares about its own queue when 1699 * Request-based dm only cares about its own queue when
1700 * queried for the congestion status of the request_queue 1700 * queried for the congestion status of the request_queue
1701 */ 1701 */
1702 if (dm_request_based(md)) 1702 if (dm_request_based(md))
1703 r = md->queue->backing_dev_info.state & 1703 r = md->queue->backing_dev_info.state &
1704 bdi_bits; 1704 bdi_bits;
1705 else 1705 else
1706 r = dm_table_any_congested(map, bdi_bits); 1706 r = dm_table_any_congested(map, bdi_bits);
1707 1707
1708 dm_table_put(map); 1708 dm_table_put(map);
1709 } 1709 }
1710 } 1710 }
1711 1711
1712 return r; 1712 return r;
1713 } 1713 }
1714 1714
1715 /*----------------------------------------------------------------- 1715 /*-----------------------------------------------------------------
1716 * An IDR is used to keep track of allocated minor numbers. 1716 * An IDR is used to keep track of allocated minor numbers.
1717 *---------------------------------------------------------------*/ 1717 *---------------------------------------------------------------*/
1718 static void free_minor(int minor) 1718 static void free_minor(int minor)
1719 { 1719 {
1720 spin_lock(&_minor_lock); 1720 spin_lock(&_minor_lock);
1721 idr_remove(&_minor_idr, minor); 1721 idr_remove(&_minor_idr, minor);
1722 spin_unlock(&_minor_lock); 1722 spin_unlock(&_minor_lock);
1723 } 1723 }
1724 1724
1725 /* 1725 /*
1726 * See if the device with a specific minor # is free. 1726 * See if the device with a specific minor # is free.
1727 */ 1727 */
1728 static int specific_minor(int minor) 1728 static int specific_minor(int minor)
1729 { 1729 {
1730 int r, m; 1730 int r, m;
1731 1731
1732 if (minor >= (1 << MINORBITS)) 1732 if (minor >= (1 << MINORBITS))
1733 return -EINVAL; 1733 return -EINVAL;
1734 1734
1735 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1735 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1736 if (!r) 1736 if (!r)
1737 return -ENOMEM; 1737 return -ENOMEM;
1738 1738
1739 spin_lock(&_minor_lock); 1739 spin_lock(&_minor_lock);
1740 1740
1741 if (idr_find(&_minor_idr, minor)) { 1741 if (idr_find(&_minor_idr, minor)) {
1742 r = -EBUSY; 1742 r = -EBUSY;
1743 goto out; 1743 goto out;
1744 } 1744 }
1745 1745
1746 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1746 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1747 if (r) 1747 if (r)
1748 goto out; 1748 goto out;
1749 1749
1750 if (m != minor) { 1750 if (m != minor) {
1751 idr_remove(&_minor_idr, m); 1751 idr_remove(&_minor_idr, m);
1752 r = -EBUSY; 1752 r = -EBUSY;
1753 goto out; 1753 goto out;
1754 } 1754 }
1755 1755
1756 out: 1756 out:
1757 spin_unlock(&_minor_lock); 1757 spin_unlock(&_minor_lock);
1758 return r; 1758 return r;
1759 } 1759 }
1760 1760
1761 static int next_free_minor(int *minor) 1761 static int next_free_minor(int *minor)
1762 { 1762 {
1763 int r, m; 1763 int r, m;
1764 1764
1765 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1765 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1766 if (!r) 1766 if (!r)
1767 return -ENOMEM; 1767 return -ENOMEM;
1768 1768
1769 spin_lock(&_minor_lock); 1769 spin_lock(&_minor_lock);
1770 1770
1771 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1771 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1772 if (r) 1772 if (r)
1773 goto out; 1773 goto out;
1774 1774
1775 if (m >= (1 << MINORBITS)) { 1775 if (m >= (1 << MINORBITS)) {
1776 idr_remove(&_minor_idr, m); 1776 idr_remove(&_minor_idr, m);
1777 r = -ENOSPC; 1777 r = -ENOSPC;
1778 goto out; 1778 goto out;
1779 } 1779 }
1780 1780
1781 *minor = m; 1781 *minor = m;
1782 1782
1783 out: 1783 out:
1784 spin_unlock(&_minor_lock); 1784 spin_unlock(&_minor_lock);
1785 return r; 1785 return r;
1786 } 1786 }
1787 1787
1788 static const struct block_device_operations dm_blk_dops; 1788 static const struct block_device_operations dm_blk_dops;
1789 1789
1790 static void dm_wq_work(struct work_struct *work); 1790 static void dm_wq_work(struct work_struct *work);
1791 1791
1792 static void dm_init_md_queue(struct mapped_device *md) 1792 static void dm_init_md_queue(struct mapped_device *md)
1793 { 1793 {
1794 /* 1794 /*
1795 * Request-based dm devices cannot be stacked on top of bio-based dm 1795 * Request-based dm devices cannot be stacked on top of bio-based dm
1796 * devices. The type of this dm device has not been decided yet. 1796 * devices. The type of this dm device has not been decided yet.
1797 * The type is decided at the first table loading time. 1797 * The type is decided at the first table loading time.
1798 * To prevent problematic device stacking, clear the queue flag 1798 * To prevent problematic device stacking, clear the queue flag
1799 * for request stacking support until then. 1799 * for request stacking support until then.
1800 * 1800 *
1801 * This queue is new, so no concurrency on the queue_flags. 1801 * This queue is new, so no concurrency on the queue_flags.
1802 */ 1802 */
1803 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1803 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1804 1804
1805 md->queue->queuedata = md; 1805 md->queue->queuedata = md;
1806 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1806 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1807 md->queue->backing_dev_info.congested_data = md; 1807 md->queue->backing_dev_info.congested_data = md;
1808 blk_queue_make_request(md->queue, dm_request); 1808 blk_queue_make_request(md->queue, dm_request);
1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1811 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1812 } 1811 }
1813 1812
1814 /* 1813 /*
1815 * Allocate and initialise a blank device with a given minor. 1814 * Allocate and initialise a blank device with a given minor.
1816 */ 1815 */
1817 static struct mapped_device *alloc_dev(int minor) 1816 static struct mapped_device *alloc_dev(int minor)
1818 { 1817 {
1819 int r; 1818 int r;
1820 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1819 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1821 void *old_md; 1820 void *old_md;
1822 1821
1823 if (!md) { 1822 if (!md) {
1824 DMWARN("unable to allocate device, out of memory."); 1823 DMWARN("unable to allocate device, out of memory.");
1825 return NULL; 1824 return NULL;
1826 } 1825 }
1827 1826
1828 if (!try_module_get(THIS_MODULE)) 1827 if (!try_module_get(THIS_MODULE))
1829 goto bad_module_get; 1828 goto bad_module_get;
1830 1829
1831 /* get a minor number for the dev */ 1830 /* get a minor number for the dev */
1832 if (minor == DM_ANY_MINOR) 1831 if (minor == DM_ANY_MINOR)
1833 r = next_free_minor(&minor); 1832 r = next_free_minor(&minor);
1834 else 1833 else
1835 r = specific_minor(minor); 1834 r = specific_minor(minor);
1836 if (r < 0) 1835 if (r < 0)
1837 goto bad_minor; 1836 goto bad_minor;
1838 1837
1839 md->type = DM_TYPE_NONE; 1838 md->type = DM_TYPE_NONE;
1840 init_rwsem(&md->io_lock); 1839 init_rwsem(&md->io_lock);
1841 mutex_init(&md->suspend_lock); 1840 mutex_init(&md->suspend_lock);
1842 mutex_init(&md->type_lock); 1841 mutex_init(&md->type_lock);
1843 spin_lock_init(&md->deferred_lock); 1842 spin_lock_init(&md->deferred_lock);
1844 rwlock_init(&md->map_lock); 1843 rwlock_init(&md->map_lock);
1845 atomic_set(&md->holders, 1); 1844 atomic_set(&md->holders, 1);
1846 atomic_set(&md->open_count, 0); 1845 atomic_set(&md->open_count, 0);
1847 atomic_set(&md->event_nr, 0); 1846 atomic_set(&md->event_nr, 0);
1848 atomic_set(&md->uevent_seq, 0); 1847 atomic_set(&md->uevent_seq, 0);
1849 INIT_LIST_HEAD(&md->uevent_list); 1848 INIT_LIST_HEAD(&md->uevent_list);
1850 spin_lock_init(&md->uevent_lock); 1849 spin_lock_init(&md->uevent_lock);
1851 1850
1852 md->queue = blk_alloc_queue(GFP_KERNEL); 1851 md->queue = blk_alloc_queue(GFP_KERNEL);
1853 if (!md->queue) 1852 if (!md->queue)
1854 goto bad_queue; 1853 goto bad_queue;
1855 1854
1856 dm_init_md_queue(md); 1855 dm_init_md_queue(md);
1857 1856
1858 md->disk = alloc_disk(1); 1857 md->disk = alloc_disk(1);
1859 if (!md->disk) 1858 if (!md->disk)
1860 goto bad_disk; 1859 goto bad_disk;
1861 1860
1862 atomic_set(&md->pending[0], 0); 1861 atomic_set(&md->pending[0], 0);
1863 atomic_set(&md->pending[1], 0); 1862 atomic_set(&md->pending[1], 0);
1864 init_waitqueue_head(&md->wait); 1863 init_waitqueue_head(&md->wait);
1865 INIT_WORK(&md->work, dm_wq_work); 1864 INIT_WORK(&md->work, dm_wq_work);
1866 init_waitqueue_head(&md->eventq); 1865 init_waitqueue_head(&md->eventq);
1867 1866
1868 md->disk->major = _major; 1867 md->disk->major = _major;
1869 md->disk->first_minor = minor; 1868 md->disk->first_minor = minor;
1870 md->disk->fops = &dm_blk_dops; 1869 md->disk->fops = &dm_blk_dops;
1871 md->disk->queue = md->queue; 1870 md->disk->queue = md->queue;
1872 md->disk->private_data = md; 1871 md->disk->private_data = md;
1873 sprintf(md->disk->disk_name, "dm-%d", minor); 1872 sprintf(md->disk->disk_name, "dm-%d", minor);
1874 add_disk(md->disk); 1873 add_disk(md->disk);
1875 format_dev_t(md->name, MKDEV(_major, minor)); 1874 format_dev_t(md->name, MKDEV(_major, minor));
1876 1875
1877 md->wq = alloc_workqueue("kdmflush", 1876 md->wq = alloc_workqueue("kdmflush",
1878 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1877 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1879 if (!md->wq) 1878 if (!md->wq)
1880 goto bad_thread; 1879 goto bad_thread;
1881 1880
1882 md->bdev = bdget_disk(md->disk, 0); 1881 md->bdev = bdget_disk(md->disk, 0);
1883 if (!md->bdev) 1882 if (!md->bdev)
1884 goto bad_bdev; 1883 goto bad_bdev;
1885 1884
1886 bio_init(&md->flush_bio); 1885 bio_init(&md->flush_bio);
1887 md->flush_bio.bi_bdev = md->bdev; 1886 md->flush_bio.bi_bdev = md->bdev;
1888 md->flush_bio.bi_rw = WRITE_FLUSH; 1887 md->flush_bio.bi_rw = WRITE_FLUSH;
1889 1888
1890 /* Populate the mapping, nobody knows we exist yet */ 1889 /* Populate the mapping, nobody knows we exist yet */
1891 spin_lock(&_minor_lock); 1890 spin_lock(&_minor_lock);
1892 old_md = idr_replace(&_minor_idr, md, minor); 1891 old_md = idr_replace(&_minor_idr, md, minor);
1893 spin_unlock(&_minor_lock); 1892 spin_unlock(&_minor_lock);
1894 1893
1895 BUG_ON(old_md != MINOR_ALLOCED); 1894 BUG_ON(old_md != MINOR_ALLOCED);
1896 1895
1897 return md; 1896 return md;
1898 1897
1899 bad_bdev: 1898 bad_bdev:
1900 destroy_workqueue(md->wq); 1899 destroy_workqueue(md->wq);
1901 bad_thread: 1900 bad_thread:
1902 del_gendisk(md->disk); 1901 del_gendisk(md->disk);
1903 put_disk(md->disk); 1902 put_disk(md->disk);
1904 bad_disk: 1903 bad_disk:
1905 blk_cleanup_queue(md->queue); 1904 blk_cleanup_queue(md->queue);
1906 bad_queue: 1905 bad_queue:
1907 free_minor(minor); 1906 free_minor(minor);
1908 bad_minor: 1907 bad_minor:
1909 module_put(THIS_MODULE); 1908 module_put(THIS_MODULE);
1910 bad_module_get: 1909 bad_module_get:
1911 kfree(md); 1910 kfree(md);
1912 return NULL; 1911 return NULL;
1913 } 1912 }
1914 1913
1915 static void unlock_fs(struct mapped_device *md); 1914 static void unlock_fs(struct mapped_device *md);
1916 1915
1917 static void free_dev(struct mapped_device *md) 1916 static void free_dev(struct mapped_device *md)
1918 { 1917 {
1919 int minor = MINOR(disk_devt(md->disk)); 1918 int minor = MINOR(disk_devt(md->disk));
1920 1919
1921 unlock_fs(md); 1920 unlock_fs(md);
1922 bdput(md->bdev); 1921 bdput(md->bdev);
1923 destroy_workqueue(md->wq); 1922 destroy_workqueue(md->wq);
1924 if (md->tio_pool) 1923 if (md->tio_pool)
1925 mempool_destroy(md->tio_pool); 1924 mempool_destroy(md->tio_pool);
1926 if (md->io_pool) 1925 if (md->io_pool)
1927 mempool_destroy(md->io_pool); 1926 mempool_destroy(md->io_pool);
1928 if (md->bs) 1927 if (md->bs)
1929 bioset_free(md->bs); 1928 bioset_free(md->bs);
1930 blk_integrity_unregister(md->disk); 1929 blk_integrity_unregister(md->disk);
1931 del_gendisk(md->disk); 1930 del_gendisk(md->disk);
1932 free_minor(minor); 1931 free_minor(minor);
1933 1932
1934 spin_lock(&_minor_lock); 1933 spin_lock(&_minor_lock);
1935 md->disk->private_data = NULL; 1934 md->disk->private_data = NULL;
1936 spin_unlock(&_minor_lock); 1935 spin_unlock(&_minor_lock);
1937 1936
1938 put_disk(md->disk); 1937 put_disk(md->disk);
1939 blk_cleanup_queue(md->queue); 1938 blk_cleanup_queue(md->queue);
1940 module_put(THIS_MODULE); 1939 module_put(THIS_MODULE);
1941 kfree(md); 1940 kfree(md);
1942 } 1941 }
1943 1942
1944 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1943 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1945 { 1944 {
1946 struct dm_md_mempools *p; 1945 struct dm_md_mempools *p;
1947 1946
1948 if (md->io_pool && md->tio_pool && md->bs) 1947 if (md->io_pool && md->tio_pool && md->bs)
1949 /* the md already has necessary mempools */ 1948 /* the md already has necessary mempools */
1950 goto out; 1949 goto out;
1951 1950
1952 p = dm_table_get_md_mempools(t); 1951 p = dm_table_get_md_mempools(t);
1953 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1952 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1954 1953
1955 md->io_pool = p->io_pool; 1954 md->io_pool = p->io_pool;
1956 p->io_pool = NULL; 1955 p->io_pool = NULL;
1957 md->tio_pool = p->tio_pool; 1956 md->tio_pool = p->tio_pool;
1958 p->tio_pool = NULL; 1957 p->tio_pool = NULL;
1959 md->bs = p->bs; 1958 md->bs = p->bs;
1960 p->bs = NULL; 1959 p->bs = NULL;
1961 1960
1962 out: 1961 out:
1963 /* mempool bind completed, the table no longer needs any mempools */ 1962 /* mempool bind completed, the table no longer needs any mempools */
1964 dm_table_free_md_mempools(t); 1963 dm_table_free_md_mempools(t);
1965 } 1964 }
1966 1965
1967 /* 1966 /*
1968 * Bind a table to the device. 1967 * Bind a table to the device.
1969 */ 1968 */
1970 static void event_callback(void *context) 1969 static void event_callback(void *context)
1971 { 1970 {
1972 unsigned long flags; 1971 unsigned long flags;
1973 LIST_HEAD(uevents); 1972 LIST_HEAD(uevents);
1974 struct mapped_device *md = (struct mapped_device *) context; 1973 struct mapped_device *md = (struct mapped_device *) context;
1975 1974
1976 spin_lock_irqsave(&md->uevent_lock, flags); 1975 spin_lock_irqsave(&md->uevent_lock, flags);
1977 list_splice_init(&md->uevent_list, &uevents); 1976 list_splice_init(&md->uevent_list, &uevents);
1978 spin_unlock_irqrestore(&md->uevent_lock, flags); 1977 spin_unlock_irqrestore(&md->uevent_lock, flags);
1979 1978
1980 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1979 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1981 1980
1982 atomic_inc(&md->event_nr); 1981 atomic_inc(&md->event_nr);
1983 wake_up(&md->eventq); 1982 wake_up(&md->eventq);
1984 } 1983 }
1985 1984
1986 /* 1985 /*
1987 * Protected by md->suspend_lock obtained by dm_swap_table(). 1986 * Protected by md->suspend_lock obtained by dm_swap_table().
1988 */ 1987 */
1989 static void __set_size(struct mapped_device *md, sector_t size) 1988 static void __set_size(struct mapped_device *md, sector_t size)
1990 { 1989 {
1991 set_capacity(md->disk, size); 1990 set_capacity(md->disk, size);
1992 1991
1993 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1992 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1994 } 1993 }
1995 1994
1996 /* 1995 /*
1997 * Return 1 if the queue has a compulsory merge_bvec_fn function. 1996 * Return 1 if the queue has a compulsory merge_bvec_fn function.
1998 * 1997 *
1999 * If this function returns 0, then the device is either a non-dm 1998 * If this function returns 0, then the device is either a non-dm
2000 * device without a merge_bvec_fn, or it is a dm device that is 1999 * device without a merge_bvec_fn, or it is a dm device that is
2001 * able to split any bios it receives that are too big. 2000 * able to split any bios it receives that are too big.
2002 */ 2001 */
2003 int dm_queue_merge_is_compulsory(struct request_queue *q) 2002 int dm_queue_merge_is_compulsory(struct request_queue *q)
2004 { 2003 {
2005 struct mapped_device *dev_md; 2004 struct mapped_device *dev_md;
2006 2005
2007 if (!q->merge_bvec_fn) 2006 if (!q->merge_bvec_fn)
2008 return 0; 2007 return 0;
2009 2008
2010 if (q->make_request_fn == dm_request) { 2009 if (q->make_request_fn == dm_request) {
2011 dev_md = q->queuedata; 2010 dev_md = q->queuedata;
2012 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2011 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2013 return 0; 2012 return 0;
2014 } 2013 }
2015 2014
2016 return 1; 2015 return 1;
2017 } 2016 }
2018 2017
2019 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2018 static int dm_device_merge_is_compulsory(struct dm_target *ti,
2020 struct dm_dev *dev, sector_t start, 2019 struct dm_dev *dev, sector_t start,
2021 sector_t len, void *data) 2020 sector_t len, void *data)
2022 { 2021 {
2023 struct block_device *bdev = dev->bdev; 2022 struct block_device *bdev = dev->bdev;
2024 struct request_queue *q = bdev_get_queue(bdev); 2023 struct request_queue *q = bdev_get_queue(bdev);
2025 2024
2026 return dm_queue_merge_is_compulsory(q); 2025 return dm_queue_merge_is_compulsory(q);
2027 } 2026 }
2028 2027
2029 /* 2028 /*
2030 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2029 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2031 * on the properties of the underlying devices. 2030 * on the properties of the underlying devices.
2032 */ 2031 */
2033 static int dm_table_merge_is_optional(struct dm_table *table) 2032 static int dm_table_merge_is_optional(struct dm_table *table)
2034 { 2033 {
2035 unsigned i = 0; 2034 unsigned i = 0;
2036 struct dm_target *ti; 2035 struct dm_target *ti;
2037 2036
2038 while (i < dm_table_get_num_targets(table)) { 2037 while (i < dm_table_get_num_targets(table)) {
2039 ti = dm_table_get_target(table, i++); 2038 ti = dm_table_get_target(table, i++);
2040 2039
2041 if (ti->type->iterate_devices && 2040 if (ti->type->iterate_devices &&
2042 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2041 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2043 return 0; 2042 return 0;
2044 } 2043 }
2045 2044
2046 return 1; 2045 return 1;
2047 } 2046 }
2048 2047
2049 /* 2048 /*
2050 * Returns old map, which caller must destroy. 2049 * Returns old map, which caller must destroy.
2051 */ 2050 */
2052 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2051 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2053 struct queue_limits *limits) 2052 struct queue_limits *limits)
2054 { 2053 {
2055 struct dm_table *old_map; 2054 struct dm_table *old_map;
2056 struct request_queue *q = md->queue; 2055 struct request_queue *q = md->queue;
2057 sector_t size; 2056 sector_t size;
2058 unsigned long flags; 2057 unsigned long flags;
2059 int merge_is_optional; 2058 int merge_is_optional;
2060 2059
2061 size = dm_table_get_size(t); 2060 size = dm_table_get_size(t);
2062 2061
2063 /* 2062 /*
2064 * Wipe any geometry if the size of the table changed. 2063 * Wipe any geometry if the size of the table changed.
2065 */ 2064 */
2066 if (size != get_capacity(md->disk)) 2065 if (size != get_capacity(md->disk))
2067 memset(&md->geometry, 0, sizeof(md->geometry)); 2066 memset(&md->geometry, 0, sizeof(md->geometry));
2068 2067
2069 __set_size(md, size); 2068 __set_size(md, size);
2070 2069
2071 dm_table_event_callback(t, event_callback, md); 2070 dm_table_event_callback(t, event_callback, md);
2072 2071
2073 /* 2072 /*
2074 * If the old table type wasn't request-based, the queue hasn't been 2073 * If the old table type wasn't request-based, the queue hasn't been
2075 * stopped yet during suspension, so stop it now to prevent I/O from 2074 * stopped yet during suspension, so stop it now to prevent I/O from
2076 * being mapped before resume. 2075 * being mapped before resume.
2077 * This must be done before setting the queue restrictions, because 2076 * This must be done before setting the queue restrictions, because
2078 * request-based dm may start running immediately after they are set. 2077 * request-based dm may start running immediately after they are set.
2079 */ 2078 */
2080 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2079 if (dm_table_request_based(t) && !blk_queue_stopped(q))
2081 stop_queue(q); 2080 stop_queue(q);
2082 2081
2083 __bind_mempools(md, t); 2082 __bind_mempools(md, t);
2084 2083
2085 merge_is_optional = dm_table_merge_is_optional(t); 2084 merge_is_optional = dm_table_merge_is_optional(t);
2086 2085
2087 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2088 old_map = md->map; 2087 old_map = md->map;
2089 md->map = t; 2088 md->map = t;
2090 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2091 if (merge_is_optional) 2090 if (merge_is_optional)
2092 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2093 else 2092 else
2094 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2093 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2095 write_unlock_irqrestore(&md->map_lock, flags); 2094 write_unlock_irqrestore(&md->map_lock, flags);
2096 2095
2097 return old_map; 2096 return old_map;
2098 } 2097 }
2099 2098
2100 /* 2099 /*
2101 * Returns unbound table for the caller to free. 2100 * Returns unbound table for the caller to free.
2102 */ 2101 */
2103 static struct dm_table *__unbind(struct mapped_device *md) 2102 static struct dm_table *__unbind(struct mapped_device *md)
2104 { 2103 {
2105 struct dm_table *map = md->map; 2104 struct dm_table *map = md->map;
2106 unsigned long flags; 2105 unsigned long flags;
2107 2106
2108 if (!map) 2107 if (!map)
2109 return NULL; 2108 return NULL;
2110 2109
2111 dm_table_event_callback(map, NULL, NULL); 2110 dm_table_event_callback(map, NULL, NULL);
2112 write_lock_irqsave(&md->map_lock, flags); 2111 write_lock_irqsave(&md->map_lock, flags);
2113 md->map = NULL; 2112 md->map = NULL;
2114 write_unlock_irqrestore(&md->map_lock, flags); 2113 write_unlock_irqrestore(&md->map_lock, flags);
2115 2114
2116 return map; 2115 return map;
2117 } 2116 }
2118 2117
2119 /* 2118 /*
2120 * Constructor for a new device. 2119 * Constructor for a new device.
2121 */ 2120 */
2122 int dm_create(int minor, struct mapped_device **result) 2121 int dm_create(int minor, struct mapped_device **result)
2123 { 2122 {
2124 struct mapped_device *md; 2123 struct mapped_device *md;
2125 2124
2126 md = alloc_dev(minor); 2125 md = alloc_dev(minor);
2127 if (!md) 2126 if (!md)
2128 return -ENXIO; 2127 return -ENXIO;
2129 2128
2130 dm_sysfs_init(md); 2129 dm_sysfs_init(md);
2131 2130
2132 *result = md; 2131 *result = md;
2133 return 0; 2132 return 0;
2134 } 2133 }
2135 2134
2136 /* 2135 /*
2137 * Functions to manage md->type. 2136 * Functions to manage md->type.
2138 * All are required to hold md->type_lock. 2137 * All are required to hold md->type_lock.
2139 */ 2138 */
2140 void dm_lock_md_type(struct mapped_device *md) 2139 void dm_lock_md_type(struct mapped_device *md)
2141 { 2140 {
2142 mutex_lock(&md->type_lock); 2141 mutex_lock(&md->type_lock);
2143 } 2142 }
2144 2143
2145 void dm_unlock_md_type(struct mapped_device *md) 2144 void dm_unlock_md_type(struct mapped_device *md)
2146 { 2145 {
2147 mutex_unlock(&md->type_lock); 2146 mutex_unlock(&md->type_lock);
2148 } 2147 }
2149 2148
2150 void dm_set_md_type(struct mapped_device *md, unsigned type) 2149 void dm_set_md_type(struct mapped_device *md, unsigned type)
2151 { 2150 {
2152 md->type = type; 2151 md->type = type;
2153 } 2152 }
2154 2153
2155 unsigned dm_get_md_type(struct mapped_device *md) 2154 unsigned dm_get_md_type(struct mapped_device *md)
2156 { 2155 {
2157 return md->type; 2156 return md->type;
2158 } 2157 }
2159 2158
2160 /* 2159 /*
2161 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2160 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2162 */ 2161 */
2163 static int dm_init_request_based_queue(struct mapped_device *md) 2162 static int dm_init_request_based_queue(struct mapped_device *md)
2164 { 2163 {
2165 struct request_queue *q = NULL; 2164 struct request_queue *q = NULL;
2166 2165
2167 if (md->queue->elevator) 2166 if (md->queue->elevator)
2168 return 1; 2167 return 1;
2169 2168
2170 /* Fully initialize the queue */ 2169 /* Fully initialize the queue */
2171 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2170 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2172 if (!q) 2171 if (!q)
2173 return 0; 2172 return 0;
2174 2173
2175 md->queue = q; 2174 md->queue = q;
2176 md->saved_make_request_fn = md->queue->make_request_fn; 2175 md->saved_make_request_fn = md->queue->make_request_fn;
2177 dm_init_md_queue(md); 2176 dm_init_md_queue(md);
2178 blk_queue_softirq_done(md->queue, dm_softirq_done); 2177 blk_queue_softirq_done(md->queue, dm_softirq_done);
2179 blk_queue_prep_rq(md->queue, dm_prep_fn); 2178 blk_queue_prep_rq(md->queue, dm_prep_fn);
2180 blk_queue_lld_busy(md->queue, dm_lld_busy); 2179 blk_queue_lld_busy(md->queue, dm_lld_busy);
2181 2180
2182 elv_register_queue(md->queue); 2181 elv_register_queue(md->queue);
2183 2182
2184 return 1; 2183 return 1;
2185 } 2184 }
2186 2185
2187 /* 2186 /*
2188 * Setup the DM device's queue based on md's type 2187 * Setup the DM device's queue based on md's type
2189 */ 2188 */
2190 int dm_setup_md_queue(struct mapped_device *md) 2189 int dm_setup_md_queue(struct mapped_device *md)
2191 { 2190 {
2192 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2191 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2193 !dm_init_request_based_queue(md)) { 2192 !dm_init_request_based_queue(md)) {
2194 DMWARN("Cannot initialize queue for request-based mapped device"); 2193 DMWARN("Cannot initialize queue for request-based mapped device");
2195 return -EINVAL; 2194 return -EINVAL;
2196 } 2195 }
2197 2196
2198 return 0; 2197 return 0;
2199 } 2198 }
2200 2199
2201 static struct mapped_device *dm_find_md(dev_t dev) 2200 static struct mapped_device *dm_find_md(dev_t dev)
2202 { 2201 {
2203 struct mapped_device *md; 2202 struct mapped_device *md;
2204 unsigned minor = MINOR(dev); 2203 unsigned minor = MINOR(dev);
2205 2204
2206 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2205 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2207 return NULL; 2206 return NULL;
2208 2207
2209 spin_lock(&_minor_lock); 2208 spin_lock(&_minor_lock);
2210 2209
2211 md = idr_find(&_minor_idr, minor); 2210 md = idr_find(&_minor_idr, minor);
2212 if (md && (md == MINOR_ALLOCED || 2211 if (md && (md == MINOR_ALLOCED ||
2213 (MINOR(disk_devt(dm_disk(md))) != minor) || 2212 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2214 dm_deleting_md(md) || 2213 dm_deleting_md(md) ||
2215 test_bit(DMF_FREEING, &md->flags))) { 2214 test_bit(DMF_FREEING, &md->flags))) {
2216 md = NULL; 2215 md = NULL;
2217 goto out; 2216 goto out;
2218 } 2217 }
2219 2218
2220 out: 2219 out:
2221 spin_unlock(&_minor_lock); 2220 spin_unlock(&_minor_lock);
2222 2221
2223 return md; 2222 return md;
2224 } 2223 }
2225 2224
2226 struct mapped_device *dm_get_md(dev_t dev) 2225 struct mapped_device *dm_get_md(dev_t dev)
2227 { 2226 {
2228 struct mapped_device *md = dm_find_md(dev); 2227 struct mapped_device *md = dm_find_md(dev);
2229 2228
2230 if (md) 2229 if (md)
2231 dm_get(md); 2230 dm_get(md);
2232 2231
2233 return md; 2232 return md;
2234 } 2233 }
2235 2234
2236 void *dm_get_mdptr(struct mapped_device *md) 2235 void *dm_get_mdptr(struct mapped_device *md)
2237 { 2236 {
2238 return md->interface_ptr; 2237 return md->interface_ptr;
2239 } 2238 }
2240 2239
2241 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2240 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2242 { 2241 {
2243 md->interface_ptr = ptr; 2242 md->interface_ptr = ptr;
2244 } 2243 }
2245 2244
2246 void dm_get(struct mapped_device *md) 2245 void dm_get(struct mapped_device *md)
2247 { 2246 {
2248 atomic_inc(&md->holders); 2247 atomic_inc(&md->holders);
2249 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2248 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2250 } 2249 }
2251 2250
2252 const char *dm_device_name(struct mapped_device *md) 2251 const char *dm_device_name(struct mapped_device *md)
2253 { 2252 {
2254 return md->name; 2253 return md->name;
2255 } 2254 }
2256 EXPORT_SYMBOL_GPL(dm_device_name); 2255 EXPORT_SYMBOL_GPL(dm_device_name);
2257 2256
2258 static void __dm_destroy(struct mapped_device *md, bool wait) 2257 static void __dm_destroy(struct mapped_device *md, bool wait)
2259 { 2258 {
2260 struct dm_table *map; 2259 struct dm_table *map;
2261 2260
2262 might_sleep(); 2261 might_sleep();
2263 2262
2264 spin_lock(&_minor_lock); 2263 spin_lock(&_minor_lock);
2265 map = dm_get_live_table(md); 2264 map = dm_get_live_table(md);
2266 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2265 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2267 set_bit(DMF_FREEING, &md->flags); 2266 set_bit(DMF_FREEING, &md->flags);
2268 spin_unlock(&_minor_lock); 2267 spin_unlock(&_minor_lock);
2269 2268
2270 if (!dm_suspended_md(md)) { 2269 if (!dm_suspended_md(md)) {
2271 dm_table_presuspend_targets(map); 2270 dm_table_presuspend_targets(map);
2272 dm_table_postsuspend_targets(map); 2271 dm_table_postsuspend_targets(map);
2273 } 2272 }
2274 2273
2275 /* 2274 /*
2276 * Rare, but there may still be in-flight I/O requests that need to 2275 * Rare, but there may still be in-flight I/O requests that need to
2277 * complete. Wait for all references to disappear. 2276 * complete. Wait for all references to disappear.
2278 * No one should increment the reference count of the mapped_device 2277 * No one should increment the reference count of the mapped_device
2279 * after its state becomes DMF_FREEING. 2278 * after its state becomes DMF_FREEING.
2280 */ 2279 */
2281 if (wait) 2280 if (wait)
2282 while (atomic_read(&md->holders)) 2281 while (atomic_read(&md->holders))
2283 msleep(1); 2282 msleep(1);
2284 else if (atomic_read(&md->holders)) 2283 else if (atomic_read(&md->holders))
2285 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2284 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2286 dm_device_name(md), atomic_read(&md->holders)); 2285 dm_device_name(md), atomic_read(&md->holders));
2287 2286
2288 dm_sysfs_exit(md); 2287 dm_sysfs_exit(md);
2289 dm_table_put(map); 2288 dm_table_put(map);
2290 dm_table_destroy(__unbind(md)); 2289 dm_table_destroy(__unbind(md));
2291 free_dev(md); 2290 free_dev(md);
2292 } 2291 }
2293 2292
2294 void dm_destroy(struct mapped_device *md) 2293 void dm_destroy(struct mapped_device *md)
2295 { 2294 {
2296 __dm_destroy(md, true); 2295 __dm_destroy(md, true);
2297 } 2296 }
2298 2297
2299 void dm_destroy_immediate(struct mapped_device *md) 2298 void dm_destroy_immediate(struct mapped_device *md)
2300 { 2299 {
2301 __dm_destroy(md, false); 2300 __dm_destroy(md, false);
2302 } 2301 }
2303 2302
2304 void dm_put(struct mapped_device *md) 2303 void dm_put(struct mapped_device *md)
2305 { 2304 {
2306 atomic_dec(&md->holders); 2305 atomic_dec(&md->holders);
2307 } 2306 }
2308 EXPORT_SYMBOL_GPL(dm_put); 2307 EXPORT_SYMBOL_GPL(dm_put);
2309 2308
2310 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2309 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2311 { 2310 {
2312 int r = 0; 2311 int r = 0;
2313 DECLARE_WAITQUEUE(wait, current); 2312 DECLARE_WAITQUEUE(wait, current);
2314 2313
2315 add_wait_queue(&md->wait, &wait); 2314 add_wait_queue(&md->wait, &wait);
2316 2315
2317 while (1) { 2316 while (1) {
2318 set_current_state(interruptible); 2317 set_current_state(interruptible);
2319 2318
2320 smp_mb(); 2319 smp_mb();
2321 if (!md_in_flight(md)) 2320 if (!md_in_flight(md))
2322 break; 2321 break;
2323 2322
2324 if (interruptible == TASK_INTERRUPTIBLE && 2323 if (interruptible == TASK_INTERRUPTIBLE &&
2325 signal_pending(current)) { 2324 signal_pending(current)) {
2326 r = -EINTR; 2325 r = -EINTR;
2327 break; 2326 break;
2328 } 2327 }
2329 2328
2330 io_schedule(); 2329 io_schedule();
2331 } 2330 }
2332 set_current_state(TASK_RUNNING); 2331 set_current_state(TASK_RUNNING);
2333 2332
2334 remove_wait_queue(&md->wait, &wait); 2333 remove_wait_queue(&md->wait, &wait);
2335 2334
2336 return r; 2335 return r;
2337 } 2336 }
2338 2337
2339 /* 2338 /*
2340 * Process the deferred bios 2339 * Process the deferred bios
2341 */ 2340 */
2342 static void dm_wq_work(struct work_struct *work) 2341 static void dm_wq_work(struct work_struct *work)
2343 { 2342 {
2344 struct mapped_device *md = container_of(work, struct mapped_device, 2343 struct mapped_device *md = container_of(work, struct mapped_device,
2345 work); 2344 work);
2346 struct bio *c; 2345 struct bio *c;
2347 2346
2348 down_read(&md->io_lock); 2347 down_read(&md->io_lock);
2349 2348
2350 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2349 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2351 spin_lock_irq(&md->deferred_lock); 2350 spin_lock_irq(&md->deferred_lock);
2352 c = bio_list_pop(&md->deferred); 2351 c = bio_list_pop(&md->deferred);
2353 spin_unlock_irq(&md->deferred_lock); 2352 spin_unlock_irq(&md->deferred_lock);
2354 2353
2355 if (!c) 2354 if (!c)
2356 break; 2355 break;
2357 2356
2358 up_read(&md->io_lock); 2357 up_read(&md->io_lock);
2359 2358
2360 if (dm_request_based(md)) 2359 if (dm_request_based(md))
2361 generic_make_request(c); 2360 generic_make_request(c);
2362 else 2361 else
2363 __split_and_process_bio(md, c); 2362 __split_and_process_bio(md, c);
2364 2363
2365 down_read(&md->io_lock); 2364 down_read(&md->io_lock);
2366 } 2365 }
2367 2366
2368 up_read(&md->io_lock); 2367 up_read(&md->io_lock);
2369 } 2368 }
2370 2369
2371 static void dm_queue_flush(struct mapped_device *md) 2370 static void dm_queue_flush(struct mapped_device *md)
2372 { 2371 {
2373 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2372 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2374 smp_mb__after_clear_bit(); 2373 smp_mb__after_clear_bit();
2375 queue_work(md->wq, &md->work); 2374 queue_work(md->wq, &md->work);
2376 } 2375 }
2377 2376
2378 /* 2377 /*
2379 * Swap in a new table, returning the old one for the caller to destroy. 2378 * Swap in a new table, returning the old one for the caller to destroy.
2380 */ 2379 */
2381 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2380 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2382 { 2381 {
2383 struct dm_table *map = ERR_PTR(-EINVAL); 2382 struct dm_table *map = ERR_PTR(-EINVAL);
2384 struct queue_limits limits; 2383 struct queue_limits limits;
2385 int r; 2384 int r;
2386 2385
2387 mutex_lock(&md->suspend_lock); 2386 mutex_lock(&md->suspend_lock);
2388 2387
2389 /* device must be suspended */ 2388 /* device must be suspended */
2390 if (!dm_suspended_md(md)) 2389 if (!dm_suspended_md(md))
2391 goto out; 2390 goto out;
2392 2391
2393 r = dm_calculate_queue_limits(table, &limits); 2392 r = dm_calculate_queue_limits(table, &limits);
2394 if (r) { 2393 if (r) {
2395 map = ERR_PTR(r); 2394 map = ERR_PTR(r);
2396 goto out; 2395 goto out;
2397 } 2396 }
2398 2397
2399 map = __bind(md, table, &limits); 2398 map = __bind(md, table, &limits);
2400 2399
2401 out: 2400 out:
2402 mutex_unlock(&md->suspend_lock); 2401 mutex_unlock(&md->suspend_lock);
2403 return map; 2402 return map;
2404 } 2403 }
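
dm_swap_table() requires the device to already be suspended and hands back the previous map, which the caller must destroy. As a rough illustration (the real sequence lives in the ioctl layer, and activate_new_table() is a made-up name), the exported primitives compose roughly as follows, with 'new_map' assumed to be a fully constructed table:

	/*
	 * Illustrative only: suspend, swap in the new table, free the old
	 * one, then resume.  Error handling is simplified.
	 */
	static int activate_new_table(struct mapped_device *md,
				      struct dm_table *new_map)
	{
		struct dm_table *old_map;
		int r;

		if (!dm_suspended_md(md)) {
			r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
			if (r)
				return r;
		}

		old_map = dm_swap_table(md, new_map);	/* previous map or ERR_PTR */
		if (IS_ERR(old_map))
			return PTR_ERR(old_map);

		if (old_map)
			dm_table_destroy(old_map);	/* caller owns the old map */

		return dm_resume(md);
	}
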
2405 2404
2406 /* 2405 /*
2407 * Functions to lock and unlock any filesystem running on the 2406 * Functions to lock and unlock any filesystem running on the
2408 * device. 2407 * device.
2409 */ 2408 */
2410 static int lock_fs(struct mapped_device *md) 2409 static int lock_fs(struct mapped_device *md)
2411 { 2410 {
2412 int r; 2411 int r;
2413 2412
2414 WARN_ON(md->frozen_sb); 2413 WARN_ON(md->frozen_sb);
2415 2414
2416 md->frozen_sb = freeze_bdev(md->bdev); 2415 md->frozen_sb = freeze_bdev(md->bdev);
2417 if (IS_ERR(md->frozen_sb)) { 2416 if (IS_ERR(md->frozen_sb)) {
2418 r = PTR_ERR(md->frozen_sb); 2417 r = PTR_ERR(md->frozen_sb);
2419 md->frozen_sb = NULL; 2418 md->frozen_sb = NULL;
2420 return r; 2419 return r;
2421 } 2420 }
2422 2421
2423 set_bit(DMF_FROZEN, &md->flags); 2422 set_bit(DMF_FROZEN, &md->flags);
2424 2423
2425 return 0; 2424 return 0;
2426 } 2425 }
2427 2426
2428 static void unlock_fs(struct mapped_device *md) 2427 static void unlock_fs(struct mapped_device *md)
2429 { 2428 {
2430 if (!test_bit(DMF_FROZEN, &md->flags)) 2429 if (!test_bit(DMF_FROZEN, &md->flags))
2431 return; 2430 return;
2432 2431
2433 thaw_bdev(md->bdev, md->frozen_sb); 2432 thaw_bdev(md->bdev, md->frozen_sb);
2434 md->frozen_sb = NULL; 2433 md->frozen_sb = NULL;
2435 clear_bit(DMF_FROZEN, &md->flags); 2434 clear_bit(DMF_FROZEN, &md->flags);
2436 } 2435 }
2437 2436
2438 /* 2437 /*
2439 * We need to be able to change a mapping table under a mounted 2438 * We need to be able to change a mapping table under a mounted
2440 * filesystem. For example we might want to move some data in 2439 * filesystem. For example we might want to move some data in
2441 * the background. Before the table can be swapped with 2440 * the background. Before the table can be swapped with
2442 * dm_bind_table, dm_suspend must be called to flush any in-flight 2441 * dm_bind_table, dm_suspend must be called to flush any in-flight
2443 * bios and ensure that any further I/O gets deferred. 2442 * bios and ensure that any further I/O gets deferred.
2444 */ 2443 */
2445 /* 2444 /*
2446 * Suspend mechanism in request-based dm. 2445 * Suspend mechanism in request-based dm.
2447 * 2446 *
2448 * 1. Flush all I/Os by lock_fs() if needed. 2447 * 1. Flush all I/Os by lock_fs() if needed.
2449 * 2. Stop dispatching any I/O by stopping the request_queue. 2448 * 2. Stop dispatching any I/O by stopping the request_queue.
2450 * 3. Wait for all in-flight I/Os to be completed or requeued. 2449 * 3. Wait for all in-flight I/Os to be completed or requeued.
2451 * 2450 *
2452 * To abort suspend, start the request_queue. 2451 * To abort suspend, start the request_queue.
2453 */ 2452 */
2454 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2453 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2455 { 2454 {
2456 struct dm_table *map = NULL; 2455 struct dm_table *map = NULL;
2457 int r = 0; 2456 int r = 0;
2458 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2457 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2459 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 2458 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2460 2459
2461 mutex_lock(&md->suspend_lock); 2460 mutex_lock(&md->suspend_lock);
2462 2461
2463 if (dm_suspended_md(md)) { 2462 if (dm_suspended_md(md)) {
2464 r = -EINVAL; 2463 r = -EINVAL;
2465 goto out_unlock; 2464 goto out_unlock;
2466 } 2465 }
2467 2466
2468 map = dm_get_live_table(md); 2467 map = dm_get_live_table(md);
2469 2468
2470 /* 2469 /*
2471 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2470 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2472 * This flag is cleared before dm_suspend returns. 2471 * This flag is cleared before dm_suspend returns.
2473 */ 2472 */
2474 if (noflush) 2473 if (noflush)
2475 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2474 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2476 2475
2477 /* This does not get reverted if there's an error later. */ 2476 /* This does not get reverted if there's an error later. */
2478 dm_table_presuspend_targets(map); 2477 dm_table_presuspend_targets(map);
2479 2478
2480 /* 2479 /*
2481 * Flush I/O to the device. 2480 * Flush I/O to the device.
2482 * Any I/O submitted after lock_fs() may not be flushed. 2481 * Any I/O submitted after lock_fs() may not be flushed.
2483 * noflush takes precedence over do_lockfs. 2482 * noflush takes precedence over do_lockfs.
2484 * (lock_fs() flushes I/Os and waits for them to complete.) 2483 * (lock_fs() flushes I/Os and waits for them to complete.)
2485 */ 2484 */
2486 if (!noflush && do_lockfs) { 2485 if (!noflush && do_lockfs) {
2487 r = lock_fs(md); 2486 r = lock_fs(md);
2488 if (r) 2487 if (r)
2489 goto out; 2488 goto out;
2490 } 2489 }
2491 2490
2492 /* 2491 /*
2493 * Here we must make sure that no processes are submitting requests 2492 * Here we must make sure that no processes are submitting requests
2494 * to target drivers i.e. no one may be executing 2493 * to target drivers i.e. no one may be executing
2495 * __split_and_process_bio. This is called from dm_request and 2494 * __split_and_process_bio. This is called from dm_request and
2496 * dm_wq_work. 2495 * dm_wq_work.
2497 * 2496 *
2498 * To get all processes out of __split_and_process_bio in dm_request, 2497 * To get all processes out of __split_and_process_bio in dm_request,
2499 * we take the write lock. To prevent any process from reentering 2498 * we take the write lock. To prevent any process from reentering
2500 * __split_and_process_bio from dm_request and quiesce the thread 2499 * __split_and_process_bio from dm_request and quiesce the thread
2501 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2500 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2502 * flush_workqueue(md->wq). 2501 * flush_workqueue(md->wq).
2503 */ 2502 */
2504 down_write(&md->io_lock); 2503 down_write(&md->io_lock);
2505 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2504 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2506 up_write(&md->io_lock); 2505 up_write(&md->io_lock);
2507 2506
2508 /* 2507 /*
2509 * Stop md->queue before flushing md->wq in case request-based 2508 * Stop md->queue before flushing md->wq in case request-based
2510 * dm defers requests to md->wq from md->queue. 2509 * dm defers requests to md->wq from md->queue.
2511 */ 2510 */
2512 if (dm_request_based(md)) 2511 if (dm_request_based(md))
2513 stop_queue(md->queue); 2512 stop_queue(md->queue);
2514 2513
2515 flush_workqueue(md->wq); 2514 flush_workqueue(md->wq);
2516 2515
2517 /* 2516 /*
2518 * At this point no more requests are entering target request routines. 2517 * At this point no more requests are entering target request routines.
2519 * We call dm_wait_for_completion to wait for all existing requests 2518 * We call dm_wait_for_completion to wait for all existing requests
2520 * to finish. 2519 * to finish.
2521 */ 2520 */
2522 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2521 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2523 2522
2524 down_write(&md->io_lock); 2523 down_write(&md->io_lock);
2525 if (noflush) 2524 if (noflush)
2526 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2525 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2527 up_write(&md->io_lock); 2526 up_write(&md->io_lock);
2528 2527
2529 /* were we interrupted? */ 2528 /* were we interrupted? */
2530 if (r < 0) { 2529 if (r < 0) {
2531 dm_queue_flush(md); 2530 dm_queue_flush(md);
2532 2531
2533 if (dm_request_based(md)) 2532 if (dm_request_based(md))
2534 start_queue(md->queue); 2533 start_queue(md->queue);
2535 2534
2536 unlock_fs(md); 2535 unlock_fs(md);
2537 goto out; /* pushback list is already flushed, so skip flush */ 2536 goto out; /* pushback list is already flushed, so skip flush */
2538 } 2537 }
2539 2538
2540 /* 2539 /*
2541 * If dm_wait_for_completion returned 0, the device is completely 2540 * If dm_wait_for_completion returned 0, the device is completely
2542 * quiescent now. There is no request-processing activity. All new 2541 * quiescent now. There is no request-processing activity. All new
2543 * requests are being added to md->deferred list. 2542 * requests are being added to md->deferred list.
2544 */ 2543 */
2545 2544
2546 set_bit(DMF_SUSPENDED, &md->flags); 2545 set_bit(DMF_SUSPENDED, &md->flags);
2547 2546
2548 dm_table_postsuspend_targets(map); 2547 dm_table_postsuspend_targets(map);
2549 2548
2550 out: 2549 out:
2551 dm_table_put(map); 2550 dm_table_put(map);
2552 2551
2553 out_unlock: 2552 out_unlock:
2554 mutex_unlock(&md->suspend_lock); 2553 mutex_unlock(&md->suspend_lock);
2555 return r; 2554 return r;
2556 } 2555 }
2557 2556
2558 int dm_resume(struct mapped_device *md) 2557 int dm_resume(struct mapped_device *md)
2559 { 2558 {
2560 int r = -EINVAL; 2559 int r = -EINVAL;
2561 struct dm_table *map = NULL; 2560 struct dm_table *map = NULL;
2562 2561
2563 mutex_lock(&md->suspend_lock); 2562 mutex_lock(&md->suspend_lock);
2564 if (!dm_suspended_md(md)) 2563 if (!dm_suspended_md(md))
2565 goto out; 2564 goto out;
2566 2565
2567 map = dm_get_live_table(md); 2566 map = dm_get_live_table(md);
2568 if (!map || !dm_table_get_size(map)) 2567 if (!map || !dm_table_get_size(map))
2569 goto out; 2568 goto out;
2570 2569
2571 r = dm_table_resume_targets(map); 2570 r = dm_table_resume_targets(map);
2572 if (r) 2571 if (r)
2573 goto out; 2572 goto out;
2574 2573
2575 dm_queue_flush(md); 2574 dm_queue_flush(md);
2576 2575
2577 /* 2576 /*
2578 * Flushing deferred I/Os must be done after targets are resumed 2577 * Flushing deferred I/Os must be done after targets are resumed
2579 * so that mapping of targets can work correctly. 2578 * so that mapping of targets can work correctly.
2580 * Request-based dm queues the deferred I/Os in its request_queue. 2579 * Request-based dm queues the deferred I/Os in its request_queue.
2581 */ 2580 */
2582 if (dm_request_based(md)) 2581 if (dm_request_based(md))
2583 start_queue(md->queue); 2582 start_queue(md->queue);
2584 2583
2585 unlock_fs(md); 2584 unlock_fs(md);
2586 2585
2587 clear_bit(DMF_SUSPENDED, &md->flags); 2586 clear_bit(DMF_SUSPENDED, &md->flags);
2588 2587
2589 r = 0; 2588 r = 0;
2590 out: 2589 out:
2591 dm_table_put(map); 2590 dm_table_put(map);
2592 mutex_unlock(&md->suspend_lock); 2591 mutex_unlock(&md->suspend_lock);
2593 2592
2594 return r; 2593 return r;
2595 } 2594 }
2596 2595
2597 /*----------------------------------------------------------------- 2596 /*-----------------------------------------------------------------
2598 * Event notification. 2597 * Event notification.
2599 *---------------------------------------------------------------*/ 2598 *---------------------------------------------------------------*/
2600 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2599 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2601 unsigned cookie) 2600 unsigned cookie)
2602 { 2601 {
2603 char udev_cookie[DM_COOKIE_LENGTH]; 2602 char udev_cookie[DM_COOKIE_LENGTH];
2604 char *envp[] = { udev_cookie, NULL }; 2603 char *envp[] = { udev_cookie, NULL };
2605 2604
2606 if (!cookie) 2605 if (!cookie)
2607 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2606 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2608 else { 2607 else {
2609 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2608 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2610 DM_COOKIE_ENV_VAR_NAME, cookie); 2609 DM_COOKIE_ENV_VAR_NAME, cookie);
2611 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2610 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2612 action, envp); 2611 action, envp);
2613 } 2612 }
2614 } 2613 }
2615 2614
2616 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2615 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2617 { 2616 {
2618 return atomic_add_return(1, &md->uevent_seq); 2617 return atomic_add_return(1, &md->uevent_seq);
2619 } 2618 }
2620 2619
2621 uint32_t dm_get_event_nr(struct mapped_device *md) 2620 uint32_t dm_get_event_nr(struct mapped_device *md)
2622 { 2621 {
2623 return atomic_read(&md->event_nr); 2622 return atomic_read(&md->event_nr);
2624 } 2623 }
2625 2624
2626 int dm_wait_event(struct mapped_device *md, int event_nr) 2625 int dm_wait_event(struct mapped_device *md, int event_nr)
2627 { 2626 {
2628 return wait_event_interruptible(md->eventq, 2627 return wait_event_interruptible(md->eventq,
2629 (event_nr != atomic_read(&md->event_nr))); 2628 (event_nr != atomic_read(&md->event_nr)));
2630 } 2629 }
2631 2630
2632 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2631 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2633 { 2632 {
2634 unsigned long flags; 2633 unsigned long flags;
2635 2634
2636 spin_lock_irqsave(&md->uevent_lock, flags); 2635 spin_lock_irqsave(&md->uevent_lock, flags);
2637 list_add(elist, &md->uevent_list); 2636 list_add(elist, &md->uevent_list);
2638 spin_unlock_irqrestore(&md->uevent_lock, flags); 2637 spin_unlock_irqrestore(&md->uevent_lock, flags);
2639 } 2638 }
2640 2639
2641 /* 2640 /*
2642 * The gendisk is only valid as long as you have a reference 2641 * The gendisk is only valid as long as you have a reference
2643 * count on 'md'. 2642 * count on 'md'.
2644 */ 2643 */
2645 struct gendisk *dm_disk(struct mapped_device *md) 2644 struct gendisk *dm_disk(struct mapped_device *md)
2646 { 2645 {
2647 return md->disk; 2646 return md->disk;
2648 } 2647 }
2649 2648
2650 struct kobject *dm_kobject(struct mapped_device *md) 2649 struct kobject *dm_kobject(struct mapped_device *md)
2651 { 2650 {
2652 return &md->kobj; 2651 return &md->kobj;
2653 } 2652 }
2654 2653
2655 /* 2654 /*
2656 * struct mapped_device should not be exported outside of dm.c 2655 * struct mapped_device should not be exported outside of dm.c
2657 * so use this check to verify that kobj is part of the md structure. 2656 * so use this check to verify that kobj is part of the md structure.
2658 */ 2657 */
2659 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2658 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2660 { 2659 {
2661 struct mapped_device *md; 2660 struct mapped_device *md;
2662 2661
2663 md = container_of(kobj, struct mapped_device, kobj); 2662 md = container_of(kobj, struct mapped_device, kobj);
2664 if (&md->kobj != kobj) 2663 if (&md->kobj != kobj)
2665 return NULL; 2664 return NULL;
2666 2665
2667 if (test_bit(DMF_FREEING, &md->flags) || 2666 if (test_bit(DMF_FREEING, &md->flags) ||
2668 dm_deleting_md(md)) 2667 dm_deleting_md(md))
2669 return NULL; 2668 return NULL;
2670 2669
2671 dm_get(md); 2670 dm_get(md);
2672 return md; 2671 return md;
2673 } 2672 }
2674 2673
2675 int dm_suspended_md(struct mapped_device *md) 2674 int dm_suspended_md(struct mapped_device *md)
2676 { 2675 {
2677 return test_bit(DMF_SUSPENDED, &md->flags); 2676 return test_bit(DMF_SUSPENDED, &md->flags);
2678 } 2677 }
2679 2678
2680 int dm_suspended(struct dm_target *ti) 2679 int dm_suspended(struct dm_target *ti)
2681 { 2680 {
2682 return dm_suspended_md(dm_table_get_md(ti->table)); 2681 return dm_suspended_md(dm_table_get_md(ti->table));
2683 } 2682 }
2684 EXPORT_SYMBOL_GPL(dm_suspended); 2683 EXPORT_SYMBOL_GPL(dm_suspended);
2685 2684
2686 int dm_noflush_suspending(struct dm_target *ti) 2685 int dm_noflush_suspending(struct dm_target *ti)
2687 { 2686 {
2688 return __noflush_suspending(dm_table_get_md(ti->table)); 2687 return __noflush_suspending(dm_table_get_md(ti->table));
2689 } 2688 }
2690 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2689 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2691 2690
2692 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) 2691 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2693 { 2692 {
2694 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2693 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2695 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; 2694 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2696 2695
2697 if (!pools) 2696 if (!pools)
2698 return NULL; 2697 return NULL;
2699 2698
2700 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2699 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2701 mempool_create_slab_pool(MIN_IOS, _io_cache) : 2700 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2702 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); 2701 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2703 if (!pools->io_pool) 2702 if (!pools->io_pool)
2704 goto free_pools_and_out; 2703 goto free_pools_and_out;
2705 2704
2706 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 2705 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2707 mempool_create_slab_pool(MIN_IOS, _tio_cache) : 2706 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2708 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); 2707 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2709 if (!pools->tio_pool) 2708 if (!pools->tio_pool)
2710 goto free_io_pool_and_out; 2709 goto free_io_pool_and_out;
2711 2710
2712 pools->bs = bioset_create(pool_size, 0); 2711 pools->bs = bioset_create(pool_size, 0);
2713 if (!pools->bs) 2712 if (!pools->bs)
2714 goto free_tio_pool_and_out; 2713 goto free_tio_pool_and_out;
2715 2714
2716 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 2715 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2717 goto free_bioset_and_out; 2716 goto free_bioset_and_out;
2718 2717
2719 return pools; 2718 return pools;
2720 2719
2721 free_bioset_and_out: 2720 free_bioset_and_out:
2722 bioset_free(pools->bs); 2721 bioset_free(pools->bs);
2723 2722
2724 free_tio_pool_and_out: 2723 free_tio_pool_and_out:
2725 mempool_destroy(pools->tio_pool); 2724 mempool_destroy(pools->tio_pool);
2726 2725
2727 free_io_pool_and_out: 2726 free_io_pool_and_out:
2728 mempool_destroy(pools->io_pool); 2727 mempool_destroy(pools->io_pool);
2729 2728
2730 free_pools_and_out: 2729 free_pools_and_out:
2731 kfree(pools); 2730 kfree(pools);
2732 2731
2733 return NULL; 2732 return NULL;
2734 } 2733 }
2735 2734
2736 void dm_free_md_mempools(struct dm_md_mempools *pools) 2735 void dm_free_md_mempools(struct dm_md_mempools *pools)
2737 { 2736 {
2738 if (!pools) 2737 if (!pools)
2739 return; 2738 return;
2740 2739
2741 if (pools->io_pool) 2740 if (pools->io_pool)
2742 mempool_destroy(pools->io_pool); 2741 mempool_destroy(pools->io_pool);
2743 2742
2744 if (pools->tio_pool) 2743 if (pools->tio_pool)
2745 mempool_destroy(pools->tio_pool); 2744 mempool_destroy(pools->tio_pool);
2746 2745
2747 if (pools->bs) 2746 if (pools->bs)
2748 bioset_free(pools->bs); 2747 bioset_free(pools->bs);
2749 2748
2750 kfree(pools); 2749 kfree(pools);
2751 } 2750 }
2752 2751
2753 static const struct block_device_operations dm_blk_dops = { 2752 static const struct block_device_operations dm_blk_dops = {
2754 .open = dm_blk_open, 2753 .open = dm_blk_open,
2755 .release = dm_blk_close, 2754 .release = dm_blk_close,
2756 .ioctl = dm_blk_ioctl, 2755 .ioctl = dm_blk_ioctl,
2757 .getgeo = dm_blk_getgeo, 2756 .getgeo = dm_blk_getgeo,
2758 .owner = THIS_MODULE 2757 .owner = THIS_MODULE
2759 }; 2758 };
2760 2759
2761 EXPORT_SYMBOL(dm_get_mapinfo); 2760 EXPORT_SYMBOL(dm_get_mapinfo);
2762 2761
2763 /* 2762 /*
2764 * module hooks 2763 * module hooks
2765 */ 2764 */
2766 module_init(dm_init); 2765 module_init(dm_init);
2767 module_exit(dm_exit); 2766 module_exit(dm_exit);
2768 2767
2769 module_param(major, uint, 0); 2768 module_param(major, uint, 0);
2770 MODULE_PARM_DESC(major, "The major number of the device mapper"); 2769 MODULE_PARM_DESC(major, "The major number of the device mapper");
2771 MODULE_DESCRIPTION(DM_NAME " driver"); 2770 MODULE_DESCRIPTION(DM_NAME " driver");
2772 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2771 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2773 MODULE_LICENSE("GPL"); 2772 MODULE_LICENSE("GPL");
2774 2773