Commit 2a7eaea02b99b6e267b1e89c79acc6e9a51cee3b

Authored by Joe Thornber
Committed by Mike Snitzer
1 parent 766a78882d

dm thin: don't allow messages to be sent to a pool target in READ_ONLY or FAIL mode

You can't modify the metadata in these modes.  It's better to fail these
messages immediately than let the block-manager deny write locks on
metadata blocks.  Otherwise these failed metadata changes will trigger
'needs_check' to get set in the metadata superblock -- requiring repair
using the thin_check utility.

Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Cc: stable@vger.kernel.org
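
The changed hunk itself falls outside the truncated listing below. Purely to illustrate the behaviour the message describes, a guard of roughly the following shape at the top of the pool target's message handler would reject messages while the pool is degraded; the handler name, error text and return code here are assumptions for illustration, not the verbatim patch. It relies on enum pool_mode being ordered by increasing degradation, so a single ">= PM_READ_ONLY" comparison covers both READ_ONLY and FAIL:

static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
{
	struct pool_c *pt = ti->private;
	struct pool *pool = pt->pool;

	/*
	 * Illustrative sketch, not the verbatim hunk: refuse to parse any
	 * message once the metadata may no longer be modified, instead of
	 * letting the block-manager deny the write locks later on.
	 */
	if (get_pool_mode(pool) >= PM_READ_ONLY) {
		DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
		      dm_device_name(pool->pool_md));
		return -EOPNOTSUPP;
	}

	/* ... existing message parsing (create_thin, delete, etc.) continues here ... */
	return -EINVAL;
}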

Showing 1 changed file with 6 additions and 0 deletions

drivers/md/dm-thin.c
/*
 * Copyright (C) 2011-2012 Red Hat UK.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "dm-bio-prison.h"
#include "dm.h"

#include <linux/device-mapper.h>
#include <linux/dm-io.h>
#include <linux/dm-kcopyd.h>
#include <linux/log2.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/rbtree.h>

#define DM_MSG_PREFIX "thin"

/*
 * Tunable constants
 */
#define ENDIO_HOOK_POOL_SIZE 1024
#define MAPPING_POOL_SIZE 1024
#define COMMIT_PERIOD HZ
#define NO_SPACE_TIMEOUT_SECS 60

static unsigned no_space_timeout_secs = NO_SPACE_TIMEOUT_SECS;

DECLARE_DM_KCOPYD_THROTTLE_WITH_MODULE_PARM(snapshot_copy_throttle,
		"A percentage of time allocated for copy on write");

/*
 * The block size of the device holding pool data must be
 * between 64KB and 1GB.
 */
#define DATA_DEV_BLOCK_SIZE_MIN_SECTORS (64 * 1024 >> SECTOR_SHIFT)
#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)

/*
 * Device id is restricted to 24 bits.
 */
#define MAX_DEV_ID ((1 << 24) - 1)

/*
 * How do we handle breaking sharing of data blocks?
 * =================================================
 *
 * We use a standard copy-on-write btree to store the mappings for the
 * devices (note I'm talking about copy-on-write of the metadata here, not
 * the data). When you take an internal snapshot you clone the root node
 * of the origin btree. After this there is no concept of an origin or a
 * snapshot. They are just two device trees that happen to point to the
 * same data blocks.
 *
 * When we get a write in we decide if it's to a shared data block using
 * some timestamp magic. If it is, we have to break sharing.
 *
 * Let's say we write to a shared block in what was the origin. The
 * steps are:
 *
 * i) plug io further to this physical block. (see bio_prison code).
 *
 * ii) quiesce any read io to that shared data block. Obviously
 * including all devices that share this block. (see dm_deferred_set code)
 *
 * iii) copy the data block to a newly allocate block. This step can be
 * missed out if the io covers the block. (schedule_copy).
 *
 * iv) insert the new mapping into the origin's btree
 * (process_prepared_mapping). This act of inserting breaks some
 * sharing of btree nodes between the two devices. Breaking sharing only
 * effects the btree of that specific device. Btrees for the other
 * devices that share the block never change. The btree for the origin
 * device as it was after the last commit is untouched, ie. we're using
 * persistent data structures in the functional programming sense.
 *
 * v) unplug io to this physical block, including the io that triggered
 * the breaking of sharing.
 *
 * Steps (ii) and (iii) occur in parallel.
 *
 * The metadata _doesn't_ need to be committed before the io continues. We
 * get away with this because the io is always written to a _new_ block.
 * If there's a crash, then:
 *
 * - The origin mapping will point to the old origin block (the shared
 * one). This will contain the data as it was before the io that triggered
 * the breaking of sharing came in.
 *
 * - The snap mapping still points to the old block. As it would after
 * the commit.
 *
 * The downside of this scheme is the timestamp magic isn't perfect, and
 * will continue to think that data block in the snapshot device is shared
 * even after the write to the origin has broken sharing. I suspect data
 * blocks will typically be shared by many different devices, so we're
 * breaking sharing n + 1 times, rather than n, where n is the number of
 * devices that reference this data block. At the moment I think the
 * benefits far, far outweigh the disadvantages.
 */

/*----------------------------------------------------------------*/

/*
 * Key building.
 */
static void build_data_key(struct dm_thin_device *td,
			   dm_block_t b, struct dm_cell_key *key)
{
	key->virtual = 0;
	key->dev = dm_thin_dev_id(td);
	key->block_begin = b;
	key->block_end = b + 1ULL;
}

static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
			      struct dm_cell_key *key)
{
	key->virtual = 1;
	key->dev = dm_thin_dev_id(td);
	key->block_begin = b;
	key->block_end = b + 1ULL;
}

/*----------------------------------------------------------------*/

#define THROTTLE_THRESHOLD (1 * HZ)

struct throttle {
	struct rw_semaphore lock;
	unsigned long threshold;
	bool throttle_applied;
};

static void throttle_init(struct throttle *t)
{
	init_rwsem(&t->lock);
	t->throttle_applied = false;
}

static void throttle_work_start(struct throttle *t)
{
	t->threshold = jiffies + THROTTLE_THRESHOLD;
}

static void throttle_work_update(struct throttle *t)
{
	if (!t->throttle_applied && jiffies > t->threshold) {
		down_write(&t->lock);
		t->throttle_applied = true;
	}
}

static void throttle_work_complete(struct throttle *t)
{
	if (t->throttle_applied) {
		t->throttle_applied = false;
		up_write(&t->lock);
	}
}

static void throttle_lock(struct throttle *t)
{
	down_read(&t->lock);
}

static void throttle_unlock(struct throttle *t)
{
	up_read(&t->lock);
}

/*----------------------------------------------------------------*/

/*
 * A pool device ties together a metadata device and a data device. It
 * also provides the interface for creating and destroying internal
 * devices.
 */
struct dm_thin_new_mapping;

/*
 * The pool runs in 4 modes. Ordered in degraded order for comparisons.
 */
enum pool_mode {
	PM_WRITE,		/* metadata may be changed */
	PM_OUT_OF_DATA_SPACE,	/* metadata may be changed, though data may not be allocated */
	PM_READ_ONLY,		/* metadata may not be changed */
	PM_FAIL,		/* all I/O fails */
};

struct pool_features {
	enum pool_mode mode;

	bool zero_new_blocks:1;
	bool discard_enabled:1;
	bool discard_passdown:1;
	bool error_if_no_space:1;
};

struct thin_c;
typedef void (*process_bio_fn)(struct thin_c *tc, struct bio *bio);
typedef void (*process_cell_fn)(struct thin_c *tc, struct dm_bio_prison_cell *cell);
typedef void (*process_mapping_fn)(struct dm_thin_new_mapping *m);

#define CELL_SORT_ARRAY_SIZE 8192

struct pool {
	struct list_head list;
	struct dm_target *ti;	/* Only set if a pool target is bound */

	struct mapped_device *pool_md;
	struct block_device *md_dev;
	struct dm_pool_metadata *pmd;

	dm_block_t low_water_blocks;
	uint32_t sectors_per_block;
	int sectors_per_block_shift;

	struct pool_features pf;
	bool low_water_triggered:1;	/* A dm event has been sent */
	bool suspended:1;

	struct dm_bio_prison *prison;
	struct dm_kcopyd_client *copier;

	struct workqueue_struct *wq;
	struct throttle throttle;
	struct work_struct worker;
	struct delayed_work waker;
	struct delayed_work no_space_timeout;

	unsigned long last_commit_jiffies;
	unsigned ref_count;

	spinlock_t lock;
	struct bio_list deferred_flush_bios;
	struct list_head prepared_mappings;
	struct list_head prepared_discards;
	struct list_head active_thins;

	struct dm_deferred_set *shared_read_ds;
	struct dm_deferred_set *all_io_ds;

	struct dm_thin_new_mapping *next_mapping;
	mempool_t *mapping_pool;

	process_bio_fn process_bio;
	process_bio_fn process_discard;

	process_cell_fn process_cell;
	process_cell_fn process_discard_cell;

	process_mapping_fn process_prepared_mapping;
	process_mapping_fn process_prepared_discard;

	struct dm_bio_prison_cell *cell_sort_array[CELL_SORT_ARRAY_SIZE];
};

static enum pool_mode get_pool_mode(struct pool *pool);
static void metadata_operation_failed(struct pool *pool, const char *op, int r);

/*
 * Target context for a pool.
 */
struct pool_c {
	struct dm_target *ti;
	struct pool *pool;
	struct dm_dev *data_dev;
	struct dm_dev *metadata_dev;
	struct dm_target_callbacks callbacks;

	dm_block_t low_water_blocks;
	struct pool_features requested_pf; /* Features requested during table load */
	struct pool_features adjusted_pf;  /* Features used after adjusting for constituent devices */
};

/*
 * Target context for a thin.
 */
struct thin_c {
	struct list_head list;
	struct dm_dev *pool_dev;
	struct dm_dev *origin_dev;
	sector_t origin_size;
	dm_thin_id dev_id;

	struct pool *pool;
	struct dm_thin_device *td;
	struct mapped_device *thin_md;

	bool requeue_mode:1;
	spinlock_t lock;
	struct list_head deferred_cells;
	struct bio_list deferred_bio_list;
	struct bio_list retry_on_resume_list;
	struct rb_root sort_bio_list; /* sorted list of deferred bios */

	/*
	 * Ensures the thin is not destroyed until the worker has finished
	 * iterating the active_thins list.
	 */
	atomic_t refcount;
	struct completion can_destroy;
};

/*----------------------------------------------------------------*/

/*
 * wake_worker() is used when new work is queued and when pool_resume is
 * ready to continue deferred IO processing.
 */
static void wake_worker(struct pool *pool)
{
	queue_work(pool->wq, &pool->worker);
}

/*----------------------------------------------------------------*/

static int bio_detain(struct pool *pool, struct dm_cell_key *key, struct bio *bio,
		      struct dm_bio_prison_cell **cell_result)
{
	int r;
	struct dm_bio_prison_cell *cell_prealloc;

	/*
	 * Allocate a cell from the prison's mempool.
	 * This might block but it can't fail.
	 */
	cell_prealloc = dm_bio_prison_alloc_cell(pool->prison, GFP_NOIO);

	r = dm_bio_detain(pool->prison, key, bio, cell_prealloc, cell_result);
	if (r)
		/*
		 * We reused an old cell; we can get rid of
		 * the new one.
		 */
		dm_bio_prison_free_cell(pool->prison, cell_prealloc);

	return r;
}

static void cell_release(struct pool *pool,
			 struct dm_bio_prison_cell *cell,
			 struct bio_list *bios)
{
	dm_cell_release(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_visit_release(struct pool *pool,
			       void (*fn)(void *, struct dm_bio_prison_cell *),
			       void *context,
			       struct dm_bio_prison_cell *cell)
{
	dm_cell_visit_release(pool->prison, fn, context, cell);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_release_no_holder(struct pool *pool,
				   struct dm_bio_prison_cell *cell,
				   struct bio_list *bios)
{
	dm_cell_release_no_holder(pool->prison, cell, bios);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error_with_code(struct pool *pool,
				 struct dm_bio_prison_cell *cell, int error_code)
{
	dm_cell_error(pool->prison, cell, error_code);
	dm_bio_prison_free_cell(pool->prison, cell);
}

static void cell_error(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, -EIO);
}

static void cell_success(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, 0);
}

static void cell_requeue(struct pool *pool, struct dm_bio_prison_cell *cell)
{
	cell_error_with_code(pool, cell, DM_ENDIO_REQUEUE);
}

/*----------------------------------------------------------------*/

/*
 * A global list of pools that uses a struct mapped_device as a key.
 */
static struct dm_thin_pool_table {
	struct mutex mutex;
	struct list_head pools;
} dm_thin_pool_table;

static void pool_table_init(void)
{
	mutex_init(&dm_thin_pool_table.mutex);
	INIT_LIST_HEAD(&dm_thin_pool_table.pools);
}

static void __pool_table_insert(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_add(&pool->list, &dm_thin_pool_table.pools);
}

static void __pool_table_remove(struct pool *pool)
{
	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
	list_del(&pool->list);
}

static struct pool *__pool_table_lookup(struct mapped_device *md)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->pool_md == md) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev)
{
	struct pool *pool = NULL, *tmp;

	BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));

	list_for_each_entry(tmp, &dm_thin_pool_table.pools, list) {
		if (tmp->md_dev == md_dev) {
			pool = tmp;
			break;
		}
	}

	return pool;
}

/*----------------------------------------------------------------*/

struct dm_thin_endio_hook {
	struct thin_c *tc;
	struct dm_deferred_entry *shared_read_entry;
	struct dm_deferred_entry *all_io_entry;
	struct dm_thin_new_mapping *overwrite_mapping;
	struct rb_node rb_node;
};

static void __merge_bio_list(struct bio_list *bios, struct bio_list *master)
{
	bio_list_merge(bios, master);
	bio_list_init(master);
}

static void error_bio_list(struct bio_list *bios, int error)
{
	struct bio *bio;

	while ((bio = bio_list_pop(bios)))
		bio_endio(bio, error);
}

static void error_thin_bio_list(struct thin_c *tc, struct bio_list *master, int error)
{
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	__merge_bio_list(&bios, master);
	spin_unlock_irqrestore(&tc->lock, flags);

	error_bio_list(&bios, error);
}

static void requeue_deferred_cells(struct thin_c *tc)
{
	struct pool *pool = tc->pool;
	unsigned long flags;
	struct list_head cells;
	struct dm_bio_prison_cell *cell, *tmp;

	INIT_LIST_HEAD(&cells);

	spin_lock_irqsave(&tc->lock, flags);
	list_splice_init(&tc->deferred_cells, &cells);
	spin_unlock_irqrestore(&tc->lock, flags);

	list_for_each_entry_safe(cell, tmp, &cells, user_list)
		cell_requeue(pool, cell);
}

static void requeue_io(struct thin_c *tc)
{
	struct bio_list bios;
	unsigned long flags;

	bio_list_init(&bios);

	spin_lock_irqsave(&tc->lock, flags);
	__merge_bio_list(&bios, &tc->deferred_bio_list);
	__merge_bio_list(&bios, &tc->retry_on_resume_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	error_bio_list(&bios, DM_ENDIO_REQUEUE);
	requeue_deferred_cells(tc);
}

static void error_retry_list(struct pool *pool)
{
	struct thin_c *tc;

	rcu_read_lock();
	list_for_each_entry_rcu(tc, &pool->active_thins, list)
		error_thin_bio_list(tc, &tc->retry_on_resume_list, -EIO);
	rcu_read_unlock();
}

/*
 * This section of code contains the logic for processing a thin device's IO.
 * Much of the code depends on pool object resources (lists, workqueues, etc)
 * but most is exclusively called from the thin target rather than the thin-pool
 * target.
 */

static bool block_size_is_power_of_two(struct pool *pool)
{
	return pool->sectors_per_block_shift >= 0;
}

static dm_block_t get_bio_block(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	sector_t block_nr = bio->bi_iter.bi_sector;

	if (block_size_is_power_of_two(pool))
		block_nr >>= pool->sectors_per_block_shift;
	else
		(void) sector_div(block_nr, pool->sectors_per_block);

	return block_nr;
}

static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
{
	struct pool *pool = tc->pool;
	sector_t bi_sector = bio->bi_iter.bi_sector;

	bio->bi_bdev = tc->pool_dev->bdev;
	if (block_size_is_power_of_two(pool))
		bio->bi_iter.bi_sector =
			(block << pool->sectors_per_block_shift) |
			(bi_sector & (pool->sectors_per_block - 1));
	else
		bio->bi_iter.bi_sector = (block * pool->sectors_per_block) +
				 sector_div(bi_sector, pool->sectors_per_block);
}

static void remap_to_origin(struct thin_c *tc, struct bio *bio)
{
	bio->bi_bdev = tc->origin_dev->bdev;
}

static int bio_triggers_commit(struct thin_c *tc, struct bio *bio)
{
	return (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
		dm_thin_changed_this_transaction(tc->td);
}

static void inc_all_io_entry(struct pool *pool, struct bio *bio)
{
	struct dm_thin_endio_hook *h;

	if (bio->bi_rw & REQ_DISCARD)
		return;

	h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	h->all_io_entry = dm_deferred_entry_inc(pool->all_io_ds);
}

static void issue(struct thin_c *tc, struct bio *bio)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	if (!bio_triggers_commit(tc, bio)) {
		generic_make_request(bio);
		return;
	}

	/*
	 * Complete bio with an error if earlier I/O caused changes to
	 * the metadata that can't be committed e.g, due to I/O errors
	 * on the metadata device.
	 */
	if (dm_thin_aborted_changes(tc->td)) {
		bio_io_error(bio);
		return;
	}

	/*
	 * Batch together any bios that trigger commits and then issue a
	 * single commit for them in process_deferred_bios().
	 */
	spin_lock_irqsave(&pool->lock, flags);
	bio_list_add(&pool->deferred_flush_bios, bio);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
{
	remap_to_origin(tc, bio);
	issue(tc, bio);
}

static void remap_and_issue(struct thin_c *tc, struct bio *bio,
			    dm_block_t block)
{
	remap(tc, bio, block);
	issue(tc, bio);
}

/*----------------------------------------------------------------*/

/*
 * Bio endio functions.
 */
struct dm_thin_new_mapping {
	struct list_head list;

	bool pass_discard:1;
	bool definitely_not_shared:1;

	/*
	 * Track quiescing, copying and zeroing preparation actions. When this
	 * counter hits zero the block is prepared and can be inserted into the
	 * btree.
	 */
	atomic_t prepare_actions;

	int err;
	struct thin_c *tc;
	dm_block_t virt_block;
	dm_block_t data_block;
	struct dm_bio_prison_cell *cell, *cell2;

	/*
	 * If the bio covers the whole area of a block then we can avoid
	 * zeroing or copying. Instead this bio is hooked. The bio will
	 * still be in the cell, so care has to be taken to avoid issuing
	 * the bio twice.
	 */
	struct bio *bio;
	bio_end_io_t *saved_bi_end_io;
};

static void __complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	struct pool *pool = m->tc->pool;

	if (atomic_dec_and_test(&m->prepare_actions)) {
		list_add_tail(&m->list, &pool->prepared_mappings);
		wake_worker(pool);
	}
}

static void complete_mapping_preparation(struct dm_thin_new_mapping *m)
{
	unsigned long flags;
	struct pool *pool = m->tc->pool;

	spin_lock_irqsave(&pool->lock, flags);
	__complete_mapping_preparation(m);
	spin_unlock_irqrestore(&pool->lock, flags);
}

static void copy_complete(int read_err, unsigned long write_err, void *context)
{
	struct dm_thin_new_mapping *m = context;

	m->err = read_err || write_err ? -EIO : 0;
	complete_mapping_preparation(m);
}

static void overwrite_endio(struct bio *bio, int err)
{
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
	struct dm_thin_new_mapping *m = h->overwrite_mapping;

	m->err = err;
	complete_mapping_preparation(m);
}

/*----------------------------------------------------------------*/

/*
 * Workqueue.
 */

/*
 * Prepared mapping jobs.
 */

/*
 * This sends the bios in the cell, except the original holder, back
 * to the deferred_bios list.
 */
static void cell_defer_no_holder(struct thin_c *tc, struct dm_bio_prison_cell *cell)
{
	struct pool *pool = tc->pool;
	unsigned long flags;

	spin_lock_irqsave(&tc->lock, flags);
	cell_release_no_holder(pool, cell, &tc->deferred_bio_list);
	spin_unlock_irqrestore(&tc->lock, flags);

	wake_worker(pool);
}

static void thin_defer_bio(struct thin_c *tc, struct bio *bio);

struct remap_info {
	struct thin_c *tc;
	struct bio_list defer_bios;
	struct bio_list issue_bios;
};

static void __inc_remap_and_issue_cell(void *context,
				       struct dm_bio_prison_cell *cell)
{
	struct remap_info *info = context;
	struct bio *bio;

	while ((bio = bio_list_pop(&cell->bios))) {
		if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))
			bio_list_add(&info->defer_bios, bio);
		else {
			inc_all_io_entry(info->tc->pool, bio);

			/*
			 * We can't issue the bios with the bio prison lock
			 * held, so we add them to a list to issue on
			 * return from this function.
			 */
			bio_list_add(&info->issue_bios, bio);
		}
	}
}

static void inc_remap_and_issue_cell(struct thin_c *tc,
				     struct dm_bio_prison_cell *cell,
				     dm_block_t block)
{
	struct bio *bio;
	struct remap_info info;

	info.tc = tc;
	bio_list_init(&info.defer_bios);
	bio_list_init(&info.issue_bios);

	/*
	 * We have to be careful to inc any bios we're about to issue
	 * before the cell is released, and avoid a race with new bios
	 * being added to the cell.
	 */
	cell_visit_release(tc->pool, __inc_remap_and_issue_cell,
			   &info, cell);

	while ((bio = bio_list_pop(&info.defer_bios)))
		thin_defer_bio(tc, bio);

	while ((bio = bio_list_pop(&info.issue_bios)))
		remap_and_issue(info.tc, bio, block);
}

static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
{
	if (m->bio) {
		m->bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&m->bio->bi_remaining);
	}
	cell_error(m->tc->pool, m->cell);
	list_del(&m->list);
	mempool_free(m, m->tc->pool->mapping_pool);
}

static void process_prepared_mapping(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;
	struct pool *pool = tc->pool;
	struct bio *bio;
	int r;

	bio = m->bio;
	if (bio) {
		bio->bi_end_io = m->saved_bi_end_io;
		atomic_inc(&bio->bi_remaining);
	}

	if (m->err) {
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Commit the prepared block into the mapping btree.
	 * Any I/O for this block arriving after this point will get
	 * remapped to it directly.
	 */
	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
	if (r) {
		metadata_operation_failed(pool, "dm_thin_insert_block", r);
		cell_error(pool, m->cell);
		goto out;
	}

	/*
	 * Release any bios held while the block was being provisioned.
	 * If we are processing a write bio that completely covers the block,
	 * we already processed it so can ignore it now when processing
	 * the bios in the cell.
	 */
	if (bio) {
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
		bio_endio(bio, 0);
	} else {
		inc_all_io_entry(tc->pool, m->cell->holder);
		remap_and_issue(tc, m->cell->holder, m->data_block);
		inc_remap_and_issue_cell(tc, m->cell, m->data_block);
	}

out:
	list_del(&m->list);
	mempool_free(m, pool->mapping_pool);
}

static void process_prepared_discard_fail(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	bio_io_error(m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);
	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard_passdown(struct dm_thin_new_mapping *m)
{
	struct thin_c *tc = m->tc;

	inc_all_io_entry(tc->pool, m->bio);
	cell_defer_no_holder(tc, m->cell);
	cell_defer_no_holder(tc, m->cell2);

	if (m->pass_discard)
		if (m->definitely_not_shared)
			remap_and_issue(tc, m->bio, m->data_block);
		else {
			bool used = false;
			if (dm_pool_block_is_used(tc->pool->pmd, m->data_block, &used) || used)
				bio_endio(m->bio, 0);
			else
				remap_and_issue(tc, m->bio, m->data_block);
		}
	else
		bio_endio(m->bio, 0);

	mempool_free(m, tc->pool->mapping_pool);
}

static void process_prepared_discard(struct dm_thin_new_mapping *m)
{
	int r;
	struct thin_c *tc = m->tc;

	r = dm_thin_remove_block(tc->td, m->virt_block);
	if (r)
		DMERR_LIMIT("dm_thin_remove_block() failed");

	process_prepared_discard_passdown(m);
}

static void process_prepared(struct pool *pool, struct list_head *head,
			     process_mapping_fn *fn)
{
	unsigned long flags;
	struct list_head maps;
	struct dm_thin_new_mapping *m, *tmp;

	INIT_LIST_HEAD(&maps);
	spin_lock_irqsave(&pool->lock, flags);
	list_splice_init(head, &maps);
	spin_unlock_irqrestore(&pool->lock, flags);

	list_for_each_entry_safe(m, tmp, &maps, list)
		(*fn)(m);
}

/*
 * Deferred bio jobs.
 */
static int io_overlaps_block(struct pool *pool, struct bio *bio)
{
	return bio->bi_iter.bi_size ==
		(pool->sectors_per_block << SECTOR_SHIFT);
}

static int io_overwrites_block(struct pool *pool, struct bio *bio)
{
	return (bio_data_dir(bio) == WRITE) &&
		io_overlaps_block(pool, bio);
}

static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
			       bio_end_io_t *fn)
{
	*save = bio->bi_end_io;
	bio->bi_end_io = fn;
}

static int ensure_next_mapping(struct pool *pool)
{
	if (pool->next_mapping)
		return 0;

	pool->next_mapping = mempool_alloc(pool->mapping_pool, GFP_ATOMIC);

	return pool->next_mapping ? 0 : -ENOMEM;
}

static struct dm_thin_new_mapping *get_next_mapping(struct pool *pool)
{
	struct dm_thin_new_mapping *m = pool->next_mapping;

	BUG_ON(!pool->next_mapping);

	memset(m, 0, sizeof(struct dm_thin_new_mapping));
	INIT_LIST_HEAD(&m->list);
	m->bio = NULL;

	pool->next_mapping = NULL;

	return m;
}

static void ll_zero(struct thin_c *tc, struct dm_thin_new_mapping *m,
		    sector_t begin, sector_t end)
{
	int r;
	struct dm_io_region to;

	to.bdev = tc->pool_dev->bdev;
	to.sector = begin;
	to.count = end - begin;

	r = dm_kcopyd_zero(tc->pool->copier, 1, &to, 0, copy_complete, m);
	if (r < 0) {
		DMERR_LIMIT("dm_kcopyd_zero() failed");
		copy_complete(1, 1, m);
	}
}

static void remap_and_issue_overwrite(struct thin_c *tc, struct bio *bio,
				      dm_block_t data_block,
				      struct dm_thin_new_mapping *m)
{
	struct pool *pool = tc->pool;
	struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));

	h->overwrite_mapping = m;
	m->bio = bio;
	save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
	inc_all_io_entry(pool, bio);
	remap_and_issue(tc, bio, data_block);
}

/*
 * A partial copy also needs to zero the uncopied region.
 */
static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
			  struct dm_dev *origin, dm_block_t data_origin,
			  dm_block_t data_dest,
			  struct dm_bio_prison_cell *cell, struct bio *bio,
			  sector_t len)
{
	int r;
	struct pool *pool = tc->pool;
	struct dm_thin_new_mapping *m = get_next_mapping(pool);

	m->tc = tc;
	m->virt_block = virt_block;
	m->data_block = data_dest;
	m->cell = cell;

	/*
	 * quiesce action + copy action + an extra reference held for the
	 * duration of this function (we may need to inc later for a
	 * partial zero).
	 */
	atomic_set(&m->prepare_actions, 3);

	if (!dm_deferred_set_add_work(pool->shared_read_ds, &m->list))
		complete_mapping_preparation(m); /* already quiesced */

	/*
	 * IO to pool_dev remaps to the pool target's data_dev.
	 *
	 * If the whole block of data is being overwritten, we can issue the
	 * bio immediately. Otherwise we use kcopyd to clone the data first.
	 */
	if (io_overwrites_block(pool, bio))
		remap_and_issue_overwrite(tc, bio, data_dest, m);
	else {
		struct dm_io_region from, to;

		from.bdev = origin->bdev;
		from.sector = data_origin * pool->sectors_per_block;
		from.count = len;

		to.bdev = tc->pool_dev->bdev;
		to.sector = data_dest * pool->sectors_per_block;
		to.count = len;

		r = dm_kcopyd_copy(pool->copier, &from, 1, &to,
				   0, copy_complete, m);
		if (r < 0) {
			DMERR_LIMIT("dm_kcopyd_copy() failed");
			copy_complete(1, 1, m);

			/*
			 * We allow the zero to be issued, to simplify the
			 * error path. Otherwise we'd need to start
			 * worrying about decrementing the prepare_actions
			 * counter.
			 */
		}

		/*
		 * Do we need to zero a tail region?
		 */
		if (len < pool->sectors_per_block && pool->pf.zero_new_blocks) {
1059 atomic_inc(&m->prepare_actions); 1059 atomic_inc(&m->prepare_actions);
1060 ll_zero(tc, m, 1060 ll_zero(tc, m,
1061 data_dest * pool->sectors_per_block + len, 1061 data_dest * pool->sectors_per_block + len,
1062 (data_dest + 1) * pool->sectors_per_block); 1062 (data_dest + 1) * pool->sectors_per_block);
1063 } 1063 }
1064 } 1064 }
1065 1065
1066 complete_mapping_preparation(m); /* drop our ref */ 1066 complete_mapping_preparation(m); /* drop our ref */
1067 } 1067 }
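The prepare_actions count of 3 set above is a countdown completion: the quiesce step, the kcopyd copy, an optional tail zero and the function's own reference each drop it, and whichever drop reaches zero finishes the preparation. A minimal standalone sketch of that pattern using C11 atomics; the names (mapping_sketch, complete_action, mapping_prepared) are invented for illustration and are not part of dm-thin.c:

#include <stdatomic.h>
#include <stdio.h>

struct mapping_sketch {
        atomic_int prepare_actions;
};

static void mapping_prepared(struct mapping_sketch *m)
{
        /* in dm-thin this is where the prepared mapping would be handed on */
        printf("all prepare actions finished\n");
}

static void complete_action(struct mapping_sketch *m)
{
        /* the last decrement is the one that finishes the preparation */
        if (atomic_fetch_sub(&m->prepare_actions, 1) == 1)
                mapping_prepared(m);
}

int main(void)
{
        struct mapping_sketch m;

        /* quiesce + copy + the caller's own reference, as in schedule_copy() */
        atomic_init(&m.prepare_actions, 3);

        complete_action(&m);        /* quiesce finished */
        complete_action(&m);        /* copy finished */
        complete_action(&m);        /* caller drops its reference */
        return 0;
}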
1068 1068
1069 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, 1069 static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1070 dm_block_t data_origin, dm_block_t data_dest, 1070 dm_block_t data_origin, dm_block_t data_dest,
1071 struct dm_bio_prison_cell *cell, struct bio *bio) 1071 struct dm_bio_prison_cell *cell, struct bio *bio)
1072 { 1072 {
1073 schedule_copy(tc, virt_block, tc->pool_dev, 1073 schedule_copy(tc, virt_block, tc->pool_dev,
1074 data_origin, data_dest, cell, bio, 1074 data_origin, data_dest, cell, bio,
1075 tc->pool->sectors_per_block); 1075 tc->pool->sectors_per_block);
1076 } 1076 }
1077 1077
1078 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1078 static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
1079 dm_block_t data_block, struct dm_bio_prison_cell *cell, 1079 dm_block_t data_block, struct dm_bio_prison_cell *cell,
1080 struct bio *bio) 1080 struct bio *bio)
1081 { 1081 {
1082 struct pool *pool = tc->pool; 1082 struct pool *pool = tc->pool;
1083 struct dm_thin_new_mapping *m = get_next_mapping(pool); 1083 struct dm_thin_new_mapping *m = get_next_mapping(pool);
1084 1084
1085 atomic_set(&m->prepare_actions, 1); /* no need to quiesce */ 1085 atomic_set(&m->prepare_actions, 1); /* no need to quiesce */
1086 m->tc = tc; 1086 m->tc = tc;
1087 m->virt_block = virt_block; 1087 m->virt_block = virt_block;
1088 m->data_block = data_block; 1088 m->data_block = data_block;
1089 m->cell = cell; 1089 m->cell = cell;
1090 1090
1091 /* 1091 /*
1092 * If the whole block of data is being overwritten or we are not 1092 * If the whole block of data is being overwritten or we are not
1093 * zeroing pre-existing data, we can issue the bio immediately. 1093 * zeroing pre-existing data, we can issue the bio immediately.
1094 * Otherwise we use kcopyd to zero the data first. 1094 * Otherwise we use kcopyd to zero the data first.
1095 */ 1095 */
1096 if (!pool->pf.zero_new_blocks) 1096 if (!pool->pf.zero_new_blocks)
1097 process_prepared_mapping(m); 1097 process_prepared_mapping(m);
1098 1098
1099 else if (io_overwrites_block(pool, bio)) 1099 else if (io_overwrites_block(pool, bio))
1100 remap_and_issue_overwrite(tc, bio, data_block, m); 1100 remap_and_issue_overwrite(tc, bio, data_block, m);
1101 1101
1102 else 1102 else
1103 ll_zero(tc, m, 1103 ll_zero(tc, m,
1104 data_block * pool->sectors_per_block, 1104 data_block * pool->sectors_per_block,
1105 (data_block + 1) * pool->sectors_per_block); 1105 (data_block + 1) * pool->sectors_per_block);
1106 } 1106 }
1107 1107
1108 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, 1108 static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1109 dm_block_t data_dest, 1109 dm_block_t data_dest,
1110 struct dm_bio_prison_cell *cell, struct bio *bio) 1110 struct dm_bio_prison_cell *cell, struct bio *bio)
1111 { 1111 {
1112 struct pool *pool = tc->pool; 1112 struct pool *pool = tc->pool;
1113 sector_t virt_block_begin = virt_block * pool->sectors_per_block; 1113 sector_t virt_block_begin = virt_block * pool->sectors_per_block;
1114 sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block; 1114 sector_t virt_block_end = (virt_block + 1) * pool->sectors_per_block;
1115 1115
1116 if (virt_block_end <= tc->origin_size) 1116 if (virt_block_end <= tc->origin_size)
1117 schedule_copy(tc, virt_block, tc->origin_dev, 1117 schedule_copy(tc, virt_block, tc->origin_dev,
1118 virt_block, data_dest, cell, bio, 1118 virt_block, data_dest, cell, bio,
1119 pool->sectors_per_block); 1119 pool->sectors_per_block);
1120 1120
1121 else if (virt_block_begin < tc->origin_size) 1121 else if (virt_block_begin < tc->origin_size)
1122 schedule_copy(tc, virt_block, tc->origin_dev, 1122 schedule_copy(tc, virt_block, tc->origin_dev,
1123 virt_block, data_dest, cell, bio, 1123 virt_block, data_dest, cell, bio,
1124 tc->origin_size - virt_block_begin); 1124 tc->origin_size - virt_block_begin);
1125 1125
1126 else 1126 else
1127 schedule_zero(tc, virt_block, data_dest, cell, bio); 1127 schedule_zero(tc, virt_block, data_dest, cell, bio);
1128 } 1128 }
1129 1129
1130 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode); 1130 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
1131 1131
1132 static void check_for_space(struct pool *pool) 1132 static void check_for_space(struct pool *pool)
1133 { 1133 {
1134 int r; 1134 int r;
1135 dm_block_t nr_free; 1135 dm_block_t nr_free;
1136 1136
1137 if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE) 1137 if (get_pool_mode(pool) != PM_OUT_OF_DATA_SPACE)
1138 return; 1138 return;
1139 1139
1140 r = dm_pool_get_free_block_count(pool->pmd, &nr_free); 1140 r = dm_pool_get_free_block_count(pool->pmd, &nr_free);
1141 if (r) 1141 if (r)
1142 return; 1142 return;
1143 1143
1144 if (nr_free) 1144 if (nr_free)
1145 set_pool_mode(pool, PM_WRITE); 1145 set_pool_mode(pool, PM_WRITE);
1146 } 1146 }
1147 1147
1148 /* 1148 /*
1149 * A non-zero return indicates read_only or fail_io mode. 1149 * A non-zero return indicates read_only or fail_io mode.
1150 * Many callers don't care about the return value. 1150 * Many callers don't care about the return value.
1151 */ 1151 */
1152 static int commit(struct pool *pool) 1152 static int commit(struct pool *pool)
1153 { 1153 {
1154 int r; 1154 int r;
1155 1155
1156 if (get_pool_mode(pool) >= PM_READ_ONLY) 1156 if (get_pool_mode(pool) >= PM_READ_ONLY)
1157 return -EINVAL; 1157 return -EINVAL;
1158 1158
1159 r = dm_pool_commit_metadata(pool->pmd); 1159 r = dm_pool_commit_metadata(pool->pmd);
1160 if (r) 1160 if (r)
1161 metadata_operation_failed(pool, "dm_pool_commit_metadata", r); 1161 metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
1162 else 1162 else
1163 check_for_space(pool); 1163 check_for_space(pool);
1164 1164
1165 return r; 1165 return r;
1166 } 1166 }
1167 1167
1168 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks) 1168 static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
1169 { 1169 {
1170 unsigned long flags; 1170 unsigned long flags;
1171 1171
1172 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) { 1172 if (free_blocks <= pool->low_water_blocks && !pool->low_water_triggered) {
1173 DMWARN("%s: reached low water mark for data device: sending event.", 1173 DMWARN("%s: reached low water mark for data device: sending event.",
1174 dm_device_name(pool->pool_md)); 1174 dm_device_name(pool->pool_md));
1175 spin_lock_irqsave(&pool->lock, flags); 1175 spin_lock_irqsave(&pool->lock, flags);
1176 pool->low_water_triggered = true; 1176 pool->low_water_triggered = true;
1177 spin_unlock_irqrestore(&pool->lock, flags); 1177 spin_unlock_irqrestore(&pool->lock, flags);
1178 dm_table_event(pool->ti->table); 1178 dm_table_event(pool->ti->table);
1179 } 1179 }
1180 } 1180 }
1181 1181
1182 static int alloc_data_block(struct thin_c *tc, dm_block_t *result) 1182 static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1183 { 1183 {
1184 int r; 1184 int r;
1185 dm_block_t free_blocks; 1185 dm_block_t free_blocks;
1186 struct pool *pool = tc->pool; 1186 struct pool *pool = tc->pool;
1187 1187
1188 if (WARN_ON(get_pool_mode(pool) != PM_WRITE)) 1188 if (WARN_ON(get_pool_mode(pool) != PM_WRITE))
1189 return -EINVAL; 1189 return -EINVAL;
1190 1190
1191 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1191 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1192 if (r) { 1192 if (r) {
1193 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); 1193 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1194 return r; 1194 return r;
1195 } 1195 }
1196 1196
1197 check_low_water_mark(pool, free_blocks); 1197 check_low_water_mark(pool, free_blocks);
1198 1198
1199 if (!free_blocks) { 1199 if (!free_blocks) {
1200 /* 1200 /*
1201 * Try to commit to see if that will free up some 1201 * Try to commit to see if that will free up some
1202 * more space. 1202 * more space.
1203 */ 1203 */
1204 r = commit(pool); 1204 r = commit(pool);
1205 if (r) 1205 if (r)
1206 return r; 1206 return r;
1207 1207
1208 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks); 1208 r = dm_pool_get_free_block_count(pool->pmd, &free_blocks);
1209 if (r) { 1209 if (r) {
1210 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r); 1210 metadata_operation_failed(pool, "dm_pool_get_free_block_count", r);
1211 return r; 1211 return r;
1212 } 1212 }
1213 1213
1214 if (!free_blocks) { 1214 if (!free_blocks) {
1215 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE); 1215 set_pool_mode(pool, PM_OUT_OF_DATA_SPACE);
1216 return -ENOSPC; 1216 return -ENOSPC;
1217 } 1217 }
1218 } 1218 }
1219 1219
1220 r = dm_pool_alloc_data_block(pool->pmd, result); 1220 r = dm_pool_alloc_data_block(pool->pmd, result);
1221 if (r) { 1221 if (r) {
1222 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r); 1222 metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
1223 return r; 1223 return r;
1224 } 1224 }
1225 1225
1226 return 0; 1226 return 0;
1227 } 1227 }
1228 1228
1229 /* 1229 /*
1230 * If we have run out of space, queue bios until the device is 1230 * If we have run out of space, queue bios until the device is
1231 * resumed, presumably after having been reloaded with more space. 1231 * resumed, presumably after having been reloaded with more space.
1232 */ 1232 */
1233 static void retry_on_resume(struct bio *bio) 1233 static void retry_on_resume(struct bio *bio)
1234 { 1234 {
1235 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1235 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1236 struct thin_c *tc = h->tc; 1236 struct thin_c *tc = h->tc;
1237 unsigned long flags; 1237 unsigned long flags;
1238 1238
1239 spin_lock_irqsave(&tc->lock, flags); 1239 spin_lock_irqsave(&tc->lock, flags);
1240 bio_list_add(&tc->retry_on_resume_list, bio); 1240 bio_list_add(&tc->retry_on_resume_list, bio);
1241 spin_unlock_irqrestore(&tc->lock, flags); 1241 spin_unlock_irqrestore(&tc->lock, flags);
1242 } 1242 }
1243 1243
1244 static int should_error_unserviceable_bio(struct pool *pool) 1244 static int should_error_unserviceable_bio(struct pool *pool)
1245 { 1245 {
1246 enum pool_mode m = get_pool_mode(pool); 1246 enum pool_mode m = get_pool_mode(pool);
1247 1247
1248 switch (m) { 1248 switch (m) {
1249 case PM_WRITE: 1249 case PM_WRITE:
1250 /* Shouldn't get here */ 1250 /* Shouldn't get here */
1251 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode"); 1251 DMERR_LIMIT("bio unserviceable, yet pool is in PM_WRITE mode");
1252 return -EIO; 1252 return -EIO;
1253 1253
1254 case PM_OUT_OF_DATA_SPACE: 1254 case PM_OUT_OF_DATA_SPACE:
1255 return pool->pf.error_if_no_space ? -ENOSPC : 0; 1255 return pool->pf.error_if_no_space ? -ENOSPC : 0;
1256 1256
1257 case PM_READ_ONLY: 1257 case PM_READ_ONLY:
1258 case PM_FAIL: 1258 case PM_FAIL:
1259 return -EIO; 1259 return -EIO;
1260 default: 1260 default:
1261 /* Shouldn't get here */ 1261 /* Shouldn't get here */
1262 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode"); 1262 DMERR_LIMIT("bio unserviceable, yet pool has an unknown mode");
1263 return -EIO; 1263 return -EIO;
1264 } 1264 }
1265 } 1265 }
1266 1266
1267 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio) 1267 static void handle_unserviceable_bio(struct pool *pool, struct bio *bio)
1268 { 1268 {
1269 int error = should_error_unserviceable_bio(pool); 1269 int error = should_error_unserviceable_bio(pool);
1270 1270
1271 if (error) 1271 if (error)
1272 bio_endio(bio, error); 1272 bio_endio(bio, error);
1273 else 1273 else
1274 retry_on_resume(bio); 1274 retry_on_resume(bio);
1275 } 1275 }
1276 1276
1277 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell) 1277 static void retry_bios_on_resume(struct pool *pool, struct dm_bio_prison_cell *cell)
1278 { 1278 {
1279 struct bio *bio; 1279 struct bio *bio;
1280 struct bio_list bios; 1280 struct bio_list bios;
1281 int error; 1281 int error;
1282 1282
1283 error = should_error_unserviceable_bio(pool); 1283 error = should_error_unserviceable_bio(pool);
1284 if (error) { 1284 if (error) {
1285 cell_error_with_code(pool, cell, error); 1285 cell_error_with_code(pool, cell, error);
1286 return; 1286 return;
1287 } 1287 }
1288 1288
1289 bio_list_init(&bios); 1289 bio_list_init(&bios);
1290 cell_release(pool, cell, &bios); 1290 cell_release(pool, cell, &bios);
1291 1291
1292 while ((bio = bio_list_pop(&bios))) 1292 while ((bio = bio_list_pop(&bios)))
1293 retry_on_resume(bio); 1293 retry_on_resume(bio);
1294 } 1294 }
1295 1295
1296 static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1296 static void process_discard_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1297 { 1297 {
1298 int r; 1298 int r;
1299 struct bio *bio = cell->holder; 1299 struct bio *bio = cell->holder;
1300 struct pool *pool = tc->pool; 1300 struct pool *pool = tc->pool;
1301 struct dm_bio_prison_cell *cell2; 1301 struct dm_bio_prison_cell *cell2;
1302 struct dm_cell_key key2; 1302 struct dm_cell_key key2;
1303 dm_block_t block = get_bio_block(tc, bio); 1303 dm_block_t block = get_bio_block(tc, bio);
1304 struct dm_thin_lookup_result lookup_result; 1304 struct dm_thin_lookup_result lookup_result;
1305 struct dm_thin_new_mapping *m; 1305 struct dm_thin_new_mapping *m;
1306 1306
1307 if (tc->requeue_mode) { 1307 if (tc->requeue_mode) {
1308 cell_requeue(pool, cell); 1308 cell_requeue(pool, cell);
1309 return; 1309 return;
1310 } 1310 }
1311 1311
1312 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1312 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1313 switch (r) { 1313 switch (r) {
1314 case 0: 1314 case 0:
1315 /* 1315 /*
1316 * Check nobody is fiddling with this pool block. This can 1316 * Check nobody is fiddling with this pool block. This can
1317 * happen if someone's in the process of breaking sharing 1317 * happen if someone's in the process of breaking sharing
1318 * on this block. 1318 * on this block.
1319 */ 1319 */
1320 build_data_key(tc->td, lookup_result.block, &key2); 1320 build_data_key(tc->td, lookup_result.block, &key2);
1321 if (bio_detain(tc->pool, &key2, bio, &cell2)) { 1321 if (bio_detain(tc->pool, &key2, bio, &cell2)) {
1322 cell_defer_no_holder(tc, cell); 1322 cell_defer_no_holder(tc, cell);
1323 break; 1323 break;
1324 } 1324 }
1325 1325
1326 if (io_overlaps_block(pool, bio)) { 1326 if (io_overlaps_block(pool, bio)) {
1327 /* 1327 /*
1328 * IO may still be going to the destination block. We must 1328 * IO may still be going to the destination block. We must
1329 * quiesce before we can do the removal. 1329 * quiesce before we can do the removal.
1330 */ 1330 */
1331 m = get_next_mapping(pool); 1331 m = get_next_mapping(pool);
1332 m->tc = tc; 1332 m->tc = tc;
1333 m->pass_discard = pool->pf.discard_passdown; 1333 m->pass_discard = pool->pf.discard_passdown;
1334 m->definitely_not_shared = !lookup_result.shared; 1334 m->definitely_not_shared = !lookup_result.shared;
1335 m->virt_block = block; 1335 m->virt_block = block;
1336 m->data_block = lookup_result.block; 1336 m->data_block = lookup_result.block;
1337 m->cell = cell; 1337 m->cell = cell;
1338 m->cell2 = cell2; 1338 m->cell2 = cell2;
1339 m->bio = bio; 1339 m->bio = bio;
1340 1340
1341 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list)) 1341 if (!dm_deferred_set_add_work(pool->all_io_ds, &m->list))
1342 pool->process_prepared_discard(m); 1342 pool->process_prepared_discard(m);
1343 1343
1344 } else { 1344 } else {
1345 inc_all_io_entry(pool, bio); 1345 inc_all_io_entry(pool, bio);
1346 cell_defer_no_holder(tc, cell); 1346 cell_defer_no_holder(tc, cell);
1347 cell_defer_no_holder(tc, cell2); 1347 cell_defer_no_holder(tc, cell2);
1348 1348
1349 /* 1349 /*
1350 * The DM core makes sure that the discard doesn't span 1350 * The DM core makes sure that the discard doesn't span
1351 * a block boundary. So we submit the discard of a 1351 * a block boundary. So we submit the discard of a
1352 * partial block appropriately. 1352 * partial block appropriately.
1353 */ 1353 */
1354 if ((!lookup_result.shared) && pool->pf.discard_passdown) 1354 if ((!lookup_result.shared) && pool->pf.discard_passdown)
1355 remap_and_issue(tc, bio, lookup_result.block); 1355 remap_and_issue(tc, bio, lookup_result.block);
1356 else 1356 else
1357 bio_endio(bio, 0); 1357 bio_endio(bio, 0);
1358 } 1358 }
1359 break; 1359 break;
1360 1360
1361 case -ENODATA: 1361 case -ENODATA:
1362 /* 1362 /*
1363 * It isn't provisioned, just forget it. 1363 * It isn't provisioned, just forget it.
1364 */ 1364 */
1365 cell_defer_no_holder(tc, cell); 1365 cell_defer_no_holder(tc, cell);
1366 bio_endio(bio, 0); 1366 bio_endio(bio, 0);
1367 break; 1367 break;
1368 1368
1369 default: 1369 default:
1370 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1370 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1371 __func__, r); 1371 __func__, r);
1372 cell_defer_no_holder(tc, cell); 1372 cell_defer_no_holder(tc, cell);
1373 bio_io_error(bio); 1373 bio_io_error(bio);
1374 break; 1374 break;
1375 } 1375 }
1376 } 1376 }
1377 1377
1378 static void process_discard_bio(struct thin_c *tc, struct bio *bio) 1378 static void process_discard_bio(struct thin_c *tc, struct bio *bio)
1379 { 1379 {
1380 struct dm_bio_prison_cell *cell; 1380 struct dm_bio_prison_cell *cell;
1381 struct dm_cell_key key; 1381 struct dm_cell_key key;
1382 dm_block_t block = get_bio_block(tc, bio); 1382 dm_block_t block = get_bio_block(tc, bio);
1383 1383
1384 build_virtual_key(tc->td, block, &key); 1384 build_virtual_key(tc->td, block, &key);
1385 if (bio_detain(tc->pool, &key, bio, &cell)) 1385 if (bio_detain(tc->pool, &key, bio, &cell))
1386 return; 1386 return;
1387 1387
1388 process_discard_cell(tc, cell); 1388 process_discard_cell(tc, cell);
1389 } 1389 }
1390 1390
1391 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1391 static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1392 struct dm_cell_key *key, 1392 struct dm_cell_key *key,
1393 struct dm_thin_lookup_result *lookup_result, 1393 struct dm_thin_lookup_result *lookup_result,
1394 struct dm_bio_prison_cell *cell) 1394 struct dm_bio_prison_cell *cell)
1395 { 1395 {
1396 int r; 1396 int r;
1397 dm_block_t data_block; 1397 dm_block_t data_block;
1398 struct pool *pool = tc->pool; 1398 struct pool *pool = tc->pool;
1399 1399
1400 r = alloc_data_block(tc, &data_block); 1400 r = alloc_data_block(tc, &data_block);
1401 switch (r) { 1401 switch (r) {
1402 case 0: 1402 case 0:
1403 schedule_internal_copy(tc, block, lookup_result->block, 1403 schedule_internal_copy(tc, block, lookup_result->block,
1404 data_block, cell, bio); 1404 data_block, cell, bio);
1405 break; 1405 break;
1406 1406
1407 case -ENOSPC: 1407 case -ENOSPC:
1408 retry_bios_on_resume(pool, cell); 1408 retry_bios_on_resume(pool, cell);
1409 break; 1409 break;
1410 1410
1411 default: 1411 default:
1412 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1412 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1413 __func__, r); 1413 __func__, r);
1414 cell_error(pool, cell); 1414 cell_error(pool, cell);
1415 break; 1415 break;
1416 } 1416 }
1417 } 1417 }
1418 1418
1419 static void __remap_and_issue_shared_cell(void *context, 1419 static void __remap_and_issue_shared_cell(void *context,
1420 struct dm_bio_prison_cell *cell) 1420 struct dm_bio_prison_cell *cell)
1421 { 1421 {
1422 struct remap_info *info = context; 1422 struct remap_info *info = context;
1423 struct bio *bio; 1423 struct bio *bio;
1424 1424
1425 while ((bio = bio_list_pop(&cell->bios))) { 1425 while ((bio = bio_list_pop(&cell->bios))) {
1426 if ((bio_data_dir(bio) == WRITE) || 1426 if ((bio_data_dir(bio) == WRITE) ||
1427 (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA))) 1427 (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)))
1428 bio_list_add(&info->defer_bios, bio); 1428 bio_list_add(&info->defer_bios, bio);
1429 else { 1429 else {
1430 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1430 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1431 1431
1432 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds); 1432 h->shared_read_entry = dm_deferred_entry_inc(info->tc->pool->shared_read_ds);
1433 inc_all_io_entry(info->tc->pool, bio); 1433 inc_all_io_entry(info->tc->pool, bio);
1434 bio_list_add(&info->issue_bios, bio); 1434 bio_list_add(&info->issue_bios, bio);
1435 } 1435 }
1436 } 1436 }
1437 } 1437 }
1438 1438
1439 static void remap_and_issue_shared_cell(struct thin_c *tc, 1439 static void remap_and_issue_shared_cell(struct thin_c *tc,
1440 struct dm_bio_prison_cell *cell, 1440 struct dm_bio_prison_cell *cell,
1441 dm_block_t block) 1441 dm_block_t block)
1442 { 1442 {
1443 struct bio *bio; 1443 struct bio *bio;
1444 struct remap_info info; 1444 struct remap_info info;
1445 1445
1446 info.tc = tc; 1446 info.tc = tc;
1447 bio_list_init(&info.defer_bios); 1447 bio_list_init(&info.defer_bios);
1448 bio_list_init(&info.issue_bios); 1448 bio_list_init(&info.issue_bios);
1449 1449
1450 cell_visit_release(tc->pool, __remap_and_issue_shared_cell, 1450 cell_visit_release(tc->pool, __remap_and_issue_shared_cell,
1451 &info, cell); 1451 &info, cell);
1452 1452
1453 while ((bio = bio_list_pop(&info.defer_bios))) 1453 while ((bio = bio_list_pop(&info.defer_bios)))
1454 thin_defer_bio(tc, bio); 1454 thin_defer_bio(tc, bio);
1455 1455
1456 while ((bio = bio_list_pop(&info.issue_bios))) 1456 while ((bio = bio_list_pop(&info.issue_bios)))
1457 remap_and_issue(tc, bio, block); 1457 remap_and_issue(tc, bio, block);
1458 } 1458 }
1459 1459
1460 static void process_shared_bio(struct thin_c *tc, struct bio *bio, 1460 static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1461 dm_block_t block, 1461 dm_block_t block,
1462 struct dm_thin_lookup_result *lookup_result, 1462 struct dm_thin_lookup_result *lookup_result,
1463 struct dm_bio_prison_cell *virt_cell) 1463 struct dm_bio_prison_cell *virt_cell)
1464 { 1464 {
1465 struct dm_bio_prison_cell *data_cell; 1465 struct dm_bio_prison_cell *data_cell;
1466 struct pool *pool = tc->pool; 1466 struct pool *pool = tc->pool;
1467 struct dm_cell_key key; 1467 struct dm_cell_key key;
1468 1468
1469 /* 1469 /*
1470 * If cell is already occupied, then sharing is already in the process 1470 * If cell is already occupied, then sharing is already in the process
1471 * of being broken so we have nothing further to do here. 1471 * of being broken so we have nothing further to do here.
1472 */ 1472 */
1473 build_data_key(tc->td, lookup_result->block, &key); 1473 build_data_key(tc->td, lookup_result->block, &key);
1474 if (bio_detain(pool, &key, bio, &data_cell)) { 1474 if (bio_detain(pool, &key, bio, &data_cell)) {
1475 cell_defer_no_holder(tc, virt_cell); 1475 cell_defer_no_holder(tc, virt_cell);
1476 return; 1476 return;
1477 } 1477 }
1478 1478
1479 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) { 1479 if (bio_data_dir(bio) == WRITE && bio->bi_iter.bi_size) {
1480 break_sharing(tc, bio, block, &key, lookup_result, data_cell); 1480 break_sharing(tc, bio, block, &key, lookup_result, data_cell);
1481 cell_defer_no_holder(tc, virt_cell); 1481 cell_defer_no_holder(tc, virt_cell);
1482 } else { 1482 } else {
1483 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1483 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1484 1484
1485 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds); 1485 h->shared_read_entry = dm_deferred_entry_inc(pool->shared_read_ds);
1486 inc_all_io_entry(pool, bio); 1486 inc_all_io_entry(pool, bio);
1487 remap_and_issue(tc, bio, lookup_result->block); 1487 remap_and_issue(tc, bio, lookup_result->block);
1488 1488
1489 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block); 1489 remap_and_issue_shared_cell(tc, data_cell, lookup_result->block);
1490 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block); 1490 remap_and_issue_shared_cell(tc, virt_cell, lookup_result->block);
1491 } 1491 }
1492 } 1492 }
1493 1493
1494 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block, 1494 static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block,
1495 struct dm_bio_prison_cell *cell) 1495 struct dm_bio_prison_cell *cell)
1496 { 1496 {
1497 int r; 1497 int r;
1498 dm_block_t data_block; 1498 dm_block_t data_block;
1499 struct pool *pool = tc->pool; 1499 struct pool *pool = tc->pool;
1500 1500
1501 /* 1501 /*
1502 * Remap empty bios (flushes) immediately, without provisioning. 1502 * Remap empty bios (flushes) immediately, without provisioning.
1503 */ 1503 */
1504 if (!bio->bi_iter.bi_size) { 1504 if (!bio->bi_iter.bi_size) {
1505 inc_all_io_entry(pool, bio); 1505 inc_all_io_entry(pool, bio);
1506 cell_defer_no_holder(tc, cell); 1506 cell_defer_no_holder(tc, cell);
1507 1507
1508 remap_and_issue(tc, bio, 0); 1508 remap_and_issue(tc, bio, 0);
1509 return; 1509 return;
1510 } 1510 }
1511 1511
1512 /* 1512 /*
1513 * Fill read bios with zeroes and complete them immediately. 1513 * Fill read bios with zeroes and complete them immediately.
1514 */ 1514 */
1515 if (bio_data_dir(bio) == READ) { 1515 if (bio_data_dir(bio) == READ) {
1516 zero_fill_bio(bio); 1516 zero_fill_bio(bio);
1517 cell_defer_no_holder(tc, cell); 1517 cell_defer_no_holder(tc, cell);
1518 bio_endio(bio, 0); 1518 bio_endio(bio, 0);
1519 return; 1519 return;
1520 } 1520 }
1521 1521
1522 r = alloc_data_block(tc, &data_block); 1522 r = alloc_data_block(tc, &data_block);
1523 switch (r) { 1523 switch (r) {
1524 case 0: 1524 case 0:
1525 if (tc->origin_dev) 1525 if (tc->origin_dev)
1526 schedule_external_copy(tc, block, data_block, cell, bio); 1526 schedule_external_copy(tc, block, data_block, cell, bio);
1527 else 1527 else
1528 schedule_zero(tc, block, data_block, cell, bio); 1528 schedule_zero(tc, block, data_block, cell, bio);
1529 break; 1529 break;
1530 1530
1531 case -ENOSPC: 1531 case -ENOSPC:
1532 retry_bios_on_resume(pool, cell); 1532 retry_bios_on_resume(pool, cell);
1533 break; 1533 break;
1534 1534
1535 default: 1535 default:
1536 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d", 1536 DMERR_LIMIT("%s: alloc_data_block() failed: error = %d",
1537 __func__, r); 1537 __func__, r);
1538 cell_error(pool, cell); 1538 cell_error(pool, cell);
1539 break; 1539 break;
1540 } 1540 }
1541 } 1541 }
1542 1542
1543 static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1543 static void process_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1544 { 1544 {
1545 int r; 1545 int r;
1546 struct pool *pool = tc->pool; 1546 struct pool *pool = tc->pool;
1547 struct bio *bio = cell->holder; 1547 struct bio *bio = cell->holder;
1548 dm_block_t block = get_bio_block(tc, bio); 1548 dm_block_t block = get_bio_block(tc, bio);
1549 struct dm_thin_lookup_result lookup_result; 1549 struct dm_thin_lookup_result lookup_result;
1550 1550
1551 if (tc->requeue_mode) { 1551 if (tc->requeue_mode) {
1552 cell_requeue(pool, cell); 1552 cell_requeue(pool, cell);
1553 return; 1553 return;
1554 } 1554 }
1555 1555
1556 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1556 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1557 switch (r) { 1557 switch (r) {
1558 case 0: 1558 case 0:
1559 if (lookup_result.shared) 1559 if (lookup_result.shared)
1560 process_shared_bio(tc, bio, block, &lookup_result, cell); 1560 process_shared_bio(tc, bio, block, &lookup_result, cell);
1561 else { 1561 else {
1562 inc_all_io_entry(pool, bio); 1562 inc_all_io_entry(pool, bio);
1563 remap_and_issue(tc, bio, lookup_result.block); 1563 remap_and_issue(tc, bio, lookup_result.block);
1564 inc_remap_and_issue_cell(tc, cell, lookup_result.block); 1564 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1565 } 1565 }
1566 break; 1566 break;
1567 1567
1568 case -ENODATA: 1568 case -ENODATA:
1569 if (bio_data_dir(bio) == READ && tc->origin_dev) { 1569 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1570 inc_all_io_entry(pool, bio); 1570 inc_all_io_entry(pool, bio);
1571 cell_defer_no_holder(tc, cell); 1571 cell_defer_no_holder(tc, cell);
1572 1572
1573 if (bio_end_sector(bio) <= tc->origin_size) 1573 if (bio_end_sector(bio) <= tc->origin_size)
1574 remap_to_origin_and_issue(tc, bio); 1574 remap_to_origin_and_issue(tc, bio);
1575 1575
1576 else if (bio->bi_iter.bi_sector < tc->origin_size) { 1576 else if (bio->bi_iter.bi_sector < tc->origin_size) {
1577 zero_fill_bio(bio); 1577 zero_fill_bio(bio);
1578 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT; 1578 bio->bi_iter.bi_size = (tc->origin_size - bio->bi_iter.bi_sector) << SECTOR_SHIFT;
1579 remap_to_origin_and_issue(tc, bio); 1579 remap_to_origin_and_issue(tc, bio);
1580 1580
1581 } else { 1581 } else {
1582 zero_fill_bio(bio); 1582 zero_fill_bio(bio);
1583 bio_endio(bio, 0); 1583 bio_endio(bio, 0);
1584 } 1584 }
1585 } else 1585 } else
1586 provision_block(tc, bio, block, cell); 1586 provision_block(tc, bio, block, cell);
1587 break; 1587 break;
1588 1588
1589 default: 1589 default:
1590 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1590 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1591 __func__, r); 1591 __func__, r);
1592 cell_defer_no_holder(tc, cell); 1592 cell_defer_no_holder(tc, cell);
1593 bio_io_error(bio); 1593 bio_io_error(bio);
1594 break; 1594 break;
1595 } 1595 }
1596 } 1596 }
1597 1597
1598 static void process_bio(struct thin_c *tc, struct bio *bio) 1598 static void process_bio(struct thin_c *tc, struct bio *bio)
1599 { 1599 {
1600 struct pool *pool = tc->pool; 1600 struct pool *pool = tc->pool;
1601 dm_block_t block = get_bio_block(tc, bio); 1601 dm_block_t block = get_bio_block(tc, bio);
1602 struct dm_bio_prison_cell *cell; 1602 struct dm_bio_prison_cell *cell;
1603 struct dm_cell_key key; 1603 struct dm_cell_key key;
1604 1604
1605 /* 1605 /*
1606 * If cell is already occupied, then the block is already 1606 * If cell is already occupied, then the block is already
1607 * being provisioned so we have nothing further to do here. 1607 * being provisioned so we have nothing further to do here.
1608 */ 1608 */
1609 build_virtual_key(tc->td, block, &key); 1609 build_virtual_key(tc->td, block, &key);
1610 if (bio_detain(pool, &key, bio, &cell)) 1610 if (bio_detain(pool, &key, bio, &cell))
1611 return; 1611 return;
1612 1612
1613 process_cell(tc, cell); 1613 process_cell(tc, cell);
1614 } 1614 }
1615 1615
1616 static void __process_bio_read_only(struct thin_c *tc, struct bio *bio, 1616 static void __process_bio_read_only(struct thin_c *tc, struct bio *bio,
1617 struct dm_bio_prison_cell *cell) 1617 struct dm_bio_prison_cell *cell)
1618 { 1618 {
1619 int r; 1619 int r;
1620 int rw = bio_data_dir(bio); 1620 int rw = bio_data_dir(bio);
1621 dm_block_t block = get_bio_block(tc, bio); 1621 dm_block_t block = get_bio_block(tc, bio);
1622 struct dm_thin_lookup_result lookup_result; 1622 struct dm_thin_lookup_result lookup_result;
1623 1623
1624 r = dm_thin_find_block(tc->td, block, 1, &lookup_result); 1624 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1625 switch (r) { 1625 switch (r) {
1626 case 0: 1626 case 0:
1627 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) { 1627 if (lookup_result.shared && (rw == WRITE) && bio->bi_iter.bi_size) {
1628 handle_unserviceable_bio(tc->pool, bio); 1628 handle_unserviceable_bio(tc->pool, bio);
1629 if (cell) 1629 if (cell)
1630 cell_defer_no_holder(tc, cell); 1630 cell_defer_no_holder(tc, cell);
1631 } else { 1631 } else {
1632 inc_all_io_entry(tc->pool, bio); 1632 inc_all_io_entry(tc->pool, bio);
1633 remap_and_issue(tc, bio, lookup_result.block); 1633 remap_and_issue(tc, bio, lookup_result.block);
1634 if (cell) 1634 if (cell)
1635 inc_remap_and_issue_cell(tc, cell, lookup_result.block); 1635 inc_remap_and_issue_cell(tc, cell, lookup_result.block);
1636 } 1636 }
1637 break; 1637 break;
1638 1638
1639 case -ENODATA: 1639 case -ENODATA:
1640 if (cell) 1640 if (cell)
1641 cell_defer_no_holder(tc, cell); 1641 cell_defer_no_holder(tc, cell);
1642 if (rw != READ) { 1642 if (rw != READ) {
1643 handle_unserviceable_bio(tc->pool, bio); 1643 handle_unserviceable_bio(tc->pool, bio);
1644 break; 1644 break;
1645 } 1645 }
1646 1646
1647 if (tc->origin_dev) { 1647 if (tc->origin_dev) {
1648 inc_all_io_entry(tc->pool, bio); 1648 inc_all_io_entry(tc->pool, bio);
1649 remap_to_origin_and_issue(tc, bio); 1649 remap_to_origin_and_issue(tc, bio);
1650 break; 1650 break;
1651 } 1651 }
1652 1652
1653 zero_fill_bio(bio); 1653 zero_fill_bio(bio);
1654 bio_endio(bio, 0); 1654 bio_endio(bio, 0);
1655 break; 1655 break;
1656 1656
1657 default: 1657 default:
1658 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d", 1658 DMERR_LIMIT("%s: dm_thin_find_block() failed: error = %d",
1659 __func__, r); 1659 __func__, r);
1660 if (cell) 1660 if (cell)
1661 cell_defer_no_holder(tc, cell); 1661 cell_defer_no_holder(tc, cell);
1662 bio_io_error(bio); 1662 bio_io_error(bio);
1663 break; 1663 break;
1664 } 1664 }
1665 } 1665 }
1666 1666
1667 static void process_bio_read_only(struct thin_c *tc, struct bio *bio) 1667 static void process_bio_read_only(struct thin_c *tc, struct bio *bio)
1668 { 1668 {
1669 __process_bio_read_only(tc, bio, NULL); 1669 __process_bio_read_only(tc, bio, NULL);
1670 } 1670 }
1671 1671
1672 static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1672 static void process_cell_read_only(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1673 { 1673 {
1674 __process_bio_read_only(tc, cell->holder, cell); 1674 __process_bio_read_only(tc, cell->holder, cell);
1675 } 1675 }
1676 1676
1677 static void process_bio_success(struct thin_c *tc, struct bio *bio) 1677 static void process_bio_success(struct thin_c *tc, struct bio *bio)
1678 { 1678 {
1679 bio_endio(bio, 0); 1679 bio_endio(bio, 0);
1680 } 1680 }
1681 1681
1682 static void process_bio_fail(struct thin_c *tc, struct bio *bio) 1682 static void process_bio_fail(struct thin_c *tc, struct bio *bio)
1683 { 1683 {
1684 bio_io_error(bio); 1684 bio_io_error(bio);
1685 } 1685 }
1686 1686
1687 static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1687 static void process_cell_success(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1688 { 1688 {
1689 cell_success(tc->pool, cell); 1689 cell_success(tc->pool, cell);
1690 } 1690 }
1691 1691
1692 static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell) 1692 static void process_cell_fail(struct thin_c *tc, struct dm_bio_prison_cell *cell)
1693 { 1693 {
1694 cell_error(tc->pool, cell); 1694 cell_error(tc->pool, cell);
1695 } 1695 }
1696 1696
1697 /* 1697 /*
1698 * FIXME: should we also commit due to size of transaction, measured in 1698 * FIXME: should we also commit due to size of transaction, measured in
1699 * metadata blocks? 1699 * metadata blocks?
1700 */ 1700 */
1701 static int need_commit_due_to_time(struct pool *pool) 1701 static int need_commit_due_to_time(struct pool *pool)
1702 { 1702 {
1703 return jiffies < pool->last_commit_jiffies || 1703 return jiffies < pool->last_commit_jiffies ||
1704 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; 1704 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1705 } 1705 }
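A small standalone sketch of the test in need_commit_due_to_time() above, which asks for a commit either when the jiffies counter appears to have wrapped or when a full COMMIT_PERIOD has elapsed; the values and the COMMIT_PERIOD_SKETCH constant are made up for illustration:

#include <stdbool.h>
#include <stdio.h>

#define COMMIT_PERIOD_SKETCH 250UL        /* pretend one second at HZ == 250 */

static bool need_commit(unsigned long now, unsigned long last_commit)
{
        /* same shape as need_commit_due_to_time(): wrap, or period elapsed */
        return now < last_commit || now > last_commit + COMMIT_PERIOD_SKETCH;
}

int main(void)
{
        printf("%d\n", need_commit(100UL, 50UL));              /* 0: inside the period */
        printf("%d\n", need_commit(500UL, 50UL));              /* 1: period has elapsed */
        printf("%d\n", need_commit(10UL, (unsigned long)-5));  /* 1: counter wrapped */
        return 0;
}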
1706 1706
1707 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node) 1707 #define thin_pbd(node) rb_entry((node), struct dm_thin_endio_hook, rb_node)
1708 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook)) 1708 #define thin_bio(pbd) dm_bio_from_per_bio_data((pbd), sizeof(struct dm_thin_endio_hook))
1709 1709
1710 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio) 1710 static void __thin_bio_rb_add(struct thin_c *tc, struct bio *bio)
1711 { 1711 {
1712 struct rb_node **rbp, *parent; 1712 struct rb_node **rbp, *parent;
1713 struct dm_thin_endio_hook *pbd; 1713 struct dm_thin_endio_hook *pbd;
1714 sector_t bi_sector = bio->bi_iter.bi_sector; 1714 sector_t bi_sector = bio->bi_iter.bi_sector;
1715 1715
1716 rbp = &tc->sort_bio_list.rb_node; 1716 rbp = &tc->sort_bio_list.rb_node;
1717 parent = NULL; 1717 parent = NULL;
1718 while (*rbp) { 1718 while (*rbp) {
1719 parent = *rbp; 1719 parent = *rbp;
1720 pbd = thin_pbd(parent); 1720 pbd = thin_pbd(parent);
1721 1721
1722 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector) 1722 if (bi_sector < thin_bio(pbd)->bi_iter.bi_sector)
1723 rbp = &(*rbp)->rb_left; 1723 rbp = &(*rbp)->rb_left;
1724 else 1724 else
1725 rbp = &(*rbp)->rb_right; 1725 rbp = &(*rbp)->rb_right;
1726 } 1726 }
1727 1727
1728 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 1728 pbd = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
1729 rb_link_node(&pbd->rb_node, parent, rbp); 1729 rb_link_node(&pbd->rb_node, parent, rbp);
1730 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list); 1730 rb_insert_color(&pbd->rb_node, &tc->sort_bio_list);
1731 } 1731 }
1732 1732
1733 static void __extract_sorted_bios(struct thin_c *tc) 1733 static void __extract_sorted_bios(struct thin_c *tc)
1734 { 1734 {
1735 struct rb_node *node; 1735 struct rb_node *node;
1736 struct dm_thin_endio_hook *pbd; 1736 struct dm_thin_endio_hook *pbd;
1737 struct bio *bio; 1737 struct bio *bio;
1738 1738
1739 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) { 1739 for (node = rb_first(&tc->sort_bio_list); node; node = rb_next(node)) {
1740 pbd = thin_pbd(node); 1740 pbd = thin_pbd(node);
1741 bio = thin_bio(pbd); 1741 bio = thin_bio(pbd);
1742 1742
1743 bio_list_add(&tc->deferred_bio_list, bio); 1743 bio_list_add(&tc->deferred_bio_list, bio);
1744 rb_erase(&pbd->rb_node, &tc->sort_bio_list); 1744 rb_erase(&pbd->rb_node, &tc->sort_bio_list);
1745 } 1745 }
1746 1746
1747 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list)); 1747 WARN_ON(!RB_EMPTY_ROOT(&tc->sort_bio_list));
1748 } 1748 }
1749 1749
1750 static void __sort_thin_deferred_bios(struct thin_c *tc) 1750 static void __sort_thin_deferred_bios(struct thin_c *tc)
1751 { 1751 {
1752 struct bio *bio; 1752 struct bio *bio;
1753 struct bio_list bios; 1753 struct bio_list bios;
1754 1754
1755 bio_list_init(&bios); 1755 bio_list_init(&bios);
1756 bio_list_merge(&bios, &tc->deferred_bio_list); 1756 bio_list_merge(&bios, &tc->deferred_bio_list);
1757 bio_list_init(&tc->deferred_bio_list); 1757 bio_list_init(&tc->deferred_bio_list);
1758 1758
1759 /* Sort deferred_bio_list using rb-tree */ 1759 /* Sort deferred_bio_list using rb-tree */
1760 while ((bio = bio_list_pop(&bios))) 1760 while ((bio = bio_list_pop(&bios)))
1761 __thin_bio_rb_add(tc, bio); 1761 __thin_bio_rb_add(tc, bio);
1762 1762
1763 /* 1763 /*
1764 * Transfer the sorted bios in sort_bio_list back to 1764 * Transfer the sorted bios in sort_bio_list back to
1765 * deferred_bio_list to allow lockless submission of 1765 * deferred_bio_list to allow lockless submission of
1766 * all bios. 1766 * all bios.
1767 */ 1767 */
1768 __extract_sorted_bios(tc); 1768 __extract_sorted_bios(tc);
1769 } 1769 }
1770 1770
1771 static void process_thin_deferred_bios(struct thin_c *tc) 1771 static void process_thin_deferred_bios(struct thin_c *tc)
1772 { 1772 {
1773 struct pool *pool = tc->pool; 1773 struct pool *pool = tc->pool;
1774 unsigned long flags; 1774 unsigned long flags;
1775 struct bio *bio; 1775 struct bio *bio;
1776 struct bio_list bios; 1776 struct bio_list bios;
1777 struct blk_plug plug; 1777 struct blk_plug plug;
1778 unsigned count = 0; 1778 unsigned count = 0;
1779 1779
1780 if (tc->requeue_mode) { 1780 if (tc->requeue_mode) {
1781 error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE); 1781 error_thin_bio_list(tc, &tc->deferred_bio_list, DM_ENDIO_REQUEUE);
1782 return; 1782 return;
1783 } 1783 }
1784 1784
1785 bio_list_init(&bios); 1785 bio_list_init(&bios);
1786 1786
1787 spin_lock_irqsave(&tc->lock, flags); 1787 spin_lock_irqsave(&tc->lock, flags);
1788 1788
1789 if (bio_list_empty(&tc->deferred_bio_list)) { 1789 if (bio_list_empty(&tc->deferred_bio_list)) {
1790 spin_unlock_irqrestore(&tc->lock, flags); 1790 spin_unlock_irqrestore(&tc->lock, flags);
1791 return; 1791 return;
1792 } 1792 }
1793 1793
1794 __sort_thin_deferred_bios(tc); 1794 __sort_thin_deferred_bios(tc);
1795 1795
1796 bio_list_merge(&bios, &tc->deferred_bio_list); 1796 bio_list_merge(&bios, &tc->deferred_bio_list);
1797 bio_list_init(&tc->deferred_bio_list); 1797 bio_list_init(&tc->deferred_bio_list);
1798 1798
1799 spin_unlock_irqrestore(&tc->lock, flags); 1799 spin_unlock_irqrestore(&tc->lock, flags);
1800 1800
1801 blk_start_plug(&plug); 1801 blk_start_plug(&plug);
1802 while ((bio = bio_list_pop(&bios))) { 1802 while ((bio = bio_list_pop(&bios))) {
1803 /* 1803 /*
1804 * If we've got no free new_mapping structs, and processing 1804 * If we've got no free new_mapping structs, and processing
1805 * this bio might require one, we pause until there are some 1805 * this bio might require one, we pause until there are some
1806 * prepared mappings to process. 1806 * prepared mappings to process.
1807 */ 1807 */
1808 if (ensure_next_mapping(pool)) { 1808 if (ensure_next_mapping(pool)) {
1809 spin_lock_irqsave(&tc->lock, flags); 1809 spin_lock_irqsave(&tc->lock, flags);
1810 bio_list_add(&tc->deferred_bio_list, bio); 1810 bio_list_add(&tc->deferred_bio_list, bio);
1811 bio_list_merge(&tc->deferred_bio_list, &bios); 1811 bio_list_merge(&tc->deferred_bio_list, &bios);
1812 spin_unlock_irqrestore(&tc->lock, flags); 1812 spin_unlock_irqrestore(&tc->lock, flags);
1813 break; 1813 break;
1814 } 1814 }
1815 1815
1816 if (bio->bi_rw & REQ_DISCARD) 1816 if (bio->bi_rw & REQ_DISCARD)
1817 pool->process_discard(tc, bio); 1817 pool->process_discard(tc, bio);
1818 else 1818 else
1819 pool->process_bio(tc, bio); 1819 pool->process_bio(tc, bio);
1820 1820
1821 if ((count++ & 127) == 0) { 1821 if ((count++ & 127) == 0) {
1822 throttle_work_update(&pool->throttle); 1822 throttle_work_update(&pool->throttle);
1823 dm_pool_issue_prefetches(pool->pmd); 1823 dm_pool_issue_prefetches(pool->pmd);
1824 } 1824 }
1825 } 1825 }
1826 blk_finish_plug(&plug); 1826 blk_finish_plug(&plug);
1827 } 1827 }
1828 1828
1829 static int cmp_cells(const void *lhs, const void *rhs) 1829 static int cmp_cells(const void *lhs, const void *rhs)
1830 { 1830 {
1831 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs); 1831 struct dm_bio_prison_cell *lhs_cell = *((struct dm_bio_prison_cell **) lhs);
1832 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs); 1832 struct dm_bio_prison_cell *rhs_cell = *((struct dm_bio_prison_cell **) rhs);
1833 1833
1834 BUG_ON(!lhs_cell->holder); 1834 BUG_ON(!lhs_cell->holder);
1835 BUG_ON(!rhs_cell->holder); 1835 BUG_ON(!rhs_cell->holder);
1836 1836
1837 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector) 1837 if (lhs_cell->holder->bi_iter.bi_sector < rhs_cell->holder->bi_iter.bi_sector)
1838 return -1; 1838 return -1;
1839 1839
1840 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector) 1840 if (lhs_cell->holder->bi_iter.bi_sector > rhs_cell->holder->bi_iter.bi_sector)
1841 return 1; 1841 return 1;
1842 1842
1843 return 0; 1843 return 0;
1844 } 1844 }
1845 1845
1846 static unsigned sort_cells(struct pool *pool, struct list_head *cells) 1846 static unsigned sort_cells(struct pool *pool, struct list_head *cells)
1847 { 1847 {
1848 unsigned count = 0; 1848 unsigned count = 0;
1849 struct dm_bio_prison_cell *cell, *tmp; 1849 struct dm_bio_prison_cell *cell, *tmp;
1850 1850
1851 list_for_each_entry_safe(cell, tmp, cells, user_list) { 1851 list_for_each_entry_safe(cell, tmp, cells, user_list) {
1852 if (count >= CELL_SORT_ARRAY_SIZE) 1852 if (count >= CELL_SORT_ARRAY_SIZE)
1853 break; 1853 break;
1854 1854
1855 pool->cell_sort_array[count++] = cell; 1855 pool->cell_sort_array[count++] = cell;
1856 list_del(&cell->user_list); 1856 list_del(&cell->user_list);
1857 } 1857 }
1858 1858
1859 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL); 1859 sort(pool->cell_sort_array, count, sizeof(cell), cmp_cells, NULL);
1860 1860
1861 return count; 1861 return count;
1862 } 1862 }
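sort_cells() orders an array of cell pointers by each holder bio's start sector, which is why the element size handed to sort() is sizeof(cell), i.e. the size of a pointer, and why cmp_cells() dereferences twice. A rough userspace analogue with qsort(); the cell_sketch type and its field are invented for illustration:

#include <stdio.h>
#include <stdlib.h>

struct cell_sketch {
        unsigned long long bi_sector;        /* stand-in for holder->bi_iter.bi_sector */
};

static int cmp_cells_sketch(const void *lhs, const void *rhs)
{
        /* qsort hands us pointers to the array elements, which are themselves pointers */
        const struct cell_sketch *l = *(const struct cell_sketch * const *)lhs;
        const struct cell_sketch *r = *(const struct cell_sketch * const *)rhs;

        if (l->bi_sector < r->bi_sector)
                return -1;
        if (l->bi_sector > r->bi_sector)
                return 1;
        return 0;
}

int main(void)
{
        struct cell_sketch a = { 2048 }, b = { 0 }, c = { 512 };
        struct cell_sketch *array[] = { &a, &b, &c };
        unsigned i;

        /* element size is the size of a pointer, just like sizeof(cell) above */
        qsort(array, 3, sizeof(array[0]), cmp_cells_sketch);

        for (i = 0; i < 3; i++)
                printf("%llu\n", array[i]->bi_sector);
        return 0;
}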
1863 1863
1864 static void process_thin_deferred_cells(struct thin_c *tc) 1864 static void process_thin_deferred_cells(struct thin_c *tc)
1865 { 1865 {
1866 struct pool *pool = tc->pool; 1866 struct pool *pool = tc->pool;
1867 unsigned long flags; 1867 unsigned long flags;
1868 struct list_head cells; 1868 struct list_head cells;
1869 struct dm_bio_prison_cell *cell; 1869 struct dm_bio_prison_cell *cell;
1870 unsigned i, j, count; 1870 unsigned i, j, count;
1871 1871
1872 INIT_LIST_HEAD(&cells); 1872 INIT_LIST_HEAD(&cells);
1873 1873
1874 spin_lock_irqsave(&tc->lock, flags); 1874 spin_lock_irqsave(&tc->lock, flags);
1875 list_splice_init(&tc->deferred_cells, &cells); 1875 list_splice_init(&tc->deferred_cells, &cells);
1876 spin_unlock_irqrestore(&tc->lock, flags); 1876 spin_unlock_irqrestore(&tc->lock, flags);
1877 1877
1878 if (list_empty(&cells)) 1878 if (list_empty(&cells))
1879 return; 1879 return;
1880 1880
1881 do { 1881 do {
1882 count = sort_cells(tc->pool, &cells); 1882 count = sort_cells(tc->pool, &cells);
1883 1883
1884 for (i = 0; i < count; i++) { 1884 for (i = 0; i < count; i++) {
1885 cell = pool->cell_sort_array[i]; 1885 cell = pool->cell_sort_array[i];
1886 BUG_ON(!cell->holder); 1886 BUG_ON(!cell->holder);
1887 1887
1888 /* 1888 /*
1889 * If we've got no free new_mapping structs, and processing 1889 * If we've got no free new_mapping structs, and processing
1890 * this bio might require one, we pause until there are some 1890 * this bio might require one, we pause until there are some
1891 * prepared mappings to process. 1891 * prepared mappings to process.
1892 */ 1892 */
1893 if (ensure_next_mapping(pool)) { 1893 if (ensure_next_mapping(pool)) {
1894 for (j = i; j < count; j++) 1894 for (j = i; j < count; j++)
1895 list_add(&pool->cell_sort_array[j]->user_list, &cells); 1895 list_add(&pool->cell_sort_array[j]->user_list, &cells);
1896 1896
1897 spin_lock_irqsave(&tc->lock, flags); 1897 spin_lock_irqsave(&tc->lock, flags);
1898 list_splice(&cells, &tc->deferred_cells); 1898 list_splice(&cells, &tc->deferred_cells);
1899 spin_unlock_irqrestore(&tc->lock, flags); 1899 spin_unlock_irqrestore(&tc->lock, flags);
1900 return; 1900 return;
1901 } 1901 }
1902 1902
1903 if (cell->holder->bi_rw & REQ_DISCARD) 1903 if (cell->holder->bi_rw & REQ_DISCARD)
1904 pool->process_discard_cell(tc, cell); 1904 pool->process_discard_cell(tc, cell);
1905 else 1905 else
1906 pool->process_cell(tc, cell); 1906 pool->process_cell(tc, cell);
1907 } 1907 }
1908 } while (!list_empty(&cells)); 1908 } while (!list_empty(&cells));
1909 } 1909 }
1910 1910
1911 static void thin_get(struct thin_c *tc); 1911 static void thin_get(struct thin_c *tc);
1912 static void thin_put(struct thin_c *tc); 1912 static void thin_put(struct thin_c *tc);
1913 1913
1914 /* 1914 /*
1915 * We can't hold rcu_read_lock() around code that can block. So we 1915 * We can't hold rcu_read_lock() around code that can block. So we
1916 * find a thin with the rcu lock held; bump a refcount; then drop 1916 * find a thin with the rcu lock held; bump a refcount; then drop
1917 * the lock. 1917 * the lock.
1918 */ 1918 */
1919 static struct thin_c *get_first_thin(struct pool *pool) 1919 static struct thin_c *get_first_thin(struct pool *pool)
1920 { 1920 {
1921 struct thin_c *tc = NULL; 1921 struct thin_c *tc = NULL;
1922 1922
1923 rcu_read_lock(); 1923 rcu_read_lock();
1924 if (!list_empty(&pool->active_thins)) { 1924 if (!list_empty(&pool->active_thins)) {
1925 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list); 1925 tc = list_entry_rcu(pool->active_thins.next, struct thin_c, list);
1926 thin_get(tc); 1926 thin_get(tc);
1927 } 1927 }
1928 rcu_read_unlock(); 1928 rcu_read_unlock();
1929 1929
1930 return tc; 1930 return tc;
1931 } 1931 }
1932 1932
1933 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc) 1933 static struct thin_c *get_next_thin(struct pool *pool, struct thin_c *tc)
1934 { 1934 {
1935 struct thin_c *old_tc = tc; 1935 struct thin_c *old_tc = tc;
1936 1936
1937 rcu_read_lock(); 1937 rcu_read_lock();
1938 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) { 1938 list_for_each_entry_continue_rcu(tc, &pool->active_thins, list) {
1939 thin_get(tc); 1939 thin_get(tc);
1940 thin_put(old_tc); 1940 thin_put(old_tc);
1941 rcu_read_unlock(); 1941 rcu_read_unlock();
1942 return tc; 1942 return tc;
1943 } 1943 }
1944 thin_put(old_tc); 1944 thin_put(old_tc);
1945 rcu_read_unlock(); 1945 rcu_read_unlock();
1946 1946
1947 return NULL; 1947 return NULL;
1948 } 1948 }
1949 1949
1950 static void process_deferred_bios(struct pool *pool) 1950 static void process_deferred_bios(struct pool *pool)
1951 { 1951 {
1952 unsigned long flags; 1952 unsigned long flags;
1953 struct bio *bio; 1953 struct bio *bio;
1954 struct bio_list bios; 1954 struct bio_list bios;
1955 struct thin_c *tc; 1955 struct thin_c *tc;
1956 1956
1957 tc = get_first_thin(pool); 1957 tc = get_first_thin(pool);
1958 while (tc) { 1958 while (tc) {
1959 process_thin_deferred_cells(tc); 1959 process_thin_deferred_cells(tc);
1960 process_thin_deferred_bios(tc); 1960 process_thin_deferred_bios(tc);
1961 tc = get_next_thin(pool, tc); 1961 tc = get_next_thin(pool, tc);
1962 } 1962 }
1963 1963
1964 /* 1964 /*
1965 * If there are any deferred flush bios, we must commit 1965 * If there are any deferred flush bios, we must commit
1966 * the metadata before issuing them. 1966 * the metadata before issuing them.
1967 */ 1967 */
1968 bio_list_init(&bios); 1968 bio_list_init(&bios);
1969 spin_lock_irqsave(&pool->lock, flags); 1969 spin_lock_irqsave(&pool->lock, flags);
1970 bio_list_merge(&bios, &pool->deferred_flush_bios); 1970 bio_list_merge(&bios, &pool->deferred_flush_bios);
1971 bio_list_init(&pool->deferred_flush_bios); 1971 bio_list_init(&pool->deferred_flush_bios);
1972 spin_unlock_irqrestore(&pool->lock, flags); 1972 spin_unlock_irqrestore(&pool->lock, flags);
1973 1973
1974 if (bio_list_empty(&bios) && 1974 if (bio_list_empty(&bios) &&
1975 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool))) 1975 !(dm_pool_changed_this_transaction(pool->pmd) && need_commit_due_to_time(pool)))
1976 return; 1976 return;
1977 1977
1978 if (commit(pool)) { 1978 if (commit(pool)) {
1979 while ((bio = bio_list_pop(&bios))) 1979 while ((bio = bio_list_pop(&bios)))
1980 bio_io_error(bio); 1980 bio_io_error(bio);
1981 return; 1981 return;
1982 } 1982 }
1983 pool->last_commit_jiffies = jiffies; 1983 pool->last_commit_jiffies = jiffies;
1984 1984
1985 while ((bio = bio_list_pop(&bios))) 1985 while ((bio = bio_list_pop(&bios)))
1986 generic_make_request(bio); 1986 generic_make_request(bio);
1987 } 1987 }
1988 1988
1989 static void do_worker(struct work_struct *ws) 1989 static void do_worker(struct work_struct *ws)
1990 { 1990 {
1991 struct pool *pool = container_of(ws, struct pool, worker); 1991 struct pool *pool = container_of(ws, struct pool, worker);
1992 1992
1993 throttle_work_start(&pool->throttle); 1993 throttle_work_start(&pool->throttle);
1994 dm_pool_issue_prefetches(pool->pmd); 1994 dm_pool_issue_prefetches(pool->pmd);
1995 throttle_work_update(&pool->throttle); 1995 throttle_work_update(&pool->throttle);
1996 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping); 1996 process_prepared(pool, &pool->prepared_mappings, &pool->process_prepared_mapping);
1997 throttle_work_update(&pool->throttle); 1997 throttle_work_update(&pool->throttle);
1998 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard); 1998 process_prepared(pool, &pool->prepared_discards, &pool->process_prepared_discard);
1999 throttle_work_update(&pool->throttle); 1999 throttle_work_update(&pool->throttle);
2000 process_deferred_bios(pool); 2000 process_deferred_bios(pool);
2001 throttle_work_complete(&pool->throttle); 2001 throttle_work_complete(&pool->throttle);
2002 } 2002 }
2003 2003
2004 /* 2004 /*
2005 * We want to commit periodically so that not too much 2005 * We want to commit periodically so that not too much
2006 * unwritten data builds up. 2006 * unwritten data builds up.
2007 */ 2007 */
2008 static void do_waker(struct work_struct *ws) 2008 static void do_waker(struct work_struct *ws)
2009 { 2009 {
2010 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); 2010 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
2011 wake_worker(pool); 2011 wake_worker(pool);
2012 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); 2012 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
2013 } 2013 }
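do_waker() is the usual self-rearming delayed-work pattern: the work function wakes the worker and immediately re-queues itself COMMIT_PERIOD (HZ, i.e. one second) later, which is what produces the periodic commit described in the comment. A rough userspace analogue, assuming a plain timer thread and a hypothetical wake_worker() stub in place of the workqueue machinery, might look like this:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define COMMIT_PERIOD_SECS 1            /* stands in for COMMIT_PERIOD (HZ) */

static void wake_worker(void)           /* hypothetical stand-in for wake_worker(pool) */
{
        printf("worker woken\n");
}

/* The timer thread plays the role of the self-rearming delayed work item. */
static void *waker(void *arg)
{
        (void)arg;
        for (int i = 0; i < 3; i++) {   /* bounded here only so the example terminates */
                wake_worker();
                sleep(COMMIT_PERIOD_SECS);
        }
        return NULL;
}

int main(void)
{
        pthread_t t;
        pthread_create(&t, NULL, waker, NULL);
        pthread_join(t, NULL);
        return 0;
}

In the kernel the waker keeps re-arming itself until the delayed work is cancelled.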
2014 2014
2015 /* 2015 /*
2016 * We're holding onto IO to allow userland time to react. After the 2016 * We're holding onto IO to allow userland time to react. After the
2017 * timeout either the pool will have been resized (and thus back in 2017 * timeout either the pool will have been resized (and thus back in
2018 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO. 2018 * PM_WRITE mode), or we degrade to PM_READ_ONLY and start erroring IO.
2019 */ 2019 */
2020 static void do_no_space_timeout(struct work_struct *ws) 2020 static void do_no_space_timeout(struct work_struct *ws)
2021 { 2021 {
2022 struct pool *pool = container_of(to_delayed_work(ws), struct pool, 2022 struct pool *pool = container_of(to_delayed_work(ws), struct pool,
2023 no_space_timeout); 2023 no_space_timeout);
2024 2024
2025 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space) 2025 if (get_pool_mode(pool) == PM_OUT_OF_DATA_SPACE && !pool->pf.error_if_no_space)
2026 set_pool_mode(pool, PM_READ_ONLY); 2026 set_pool_mode(pool, PM_READ_ONLY);
2027 } 2027 }
2028 2028
2029 /*----------------------------------------------------------------*/ 2029 /*----------------------------------------------------------------*/
2030 2030
2031 struct pool_work { 2031 struct pool_work {
2032 struct work_struct worker; 2032 struct work_struct worker;
2033 struct completion complete; 2033 struct completion complete;
2034 }; 2034 };
2035 2035
2036 static struct pool_work *to_pool_work(struct work_struct *ws) 2036 static struct pool_work *to_pool_work(struct work_struct *ws)
2037 { 2037 {
2038 return container_of(ws, struct pool_work, worker); 2038 return container_of(ws, struct pool_work, worker);
2039 } 2039 }
2040 2040
2041 static void pool_work_complete(struct pool_work *pw) 2041 static void pool_work_complete(struct pool_work *pw)
2042 { 2042 {
2043 complete(&pw->complete); 2043 complete(&pw->complete);
2044 } 2044 }
2045 2045
2046 static void pool_work_wait(struct pool_work *pw, struct pool *pool, 2046 static void pool_work_wait(struct pool_work *pw, struct pool *pool,
2047 void (*fn)(struct work_struct *)) 2047 void (*fn)(struct work_struct *))
2048 { 2048 {
2049 INIT_WORK_ONSTACK(&pw->worker, fn); 2049 INIT_WORK_ONSTACK(&pw->worker, fn);
2050 init_completion(&pw->complete); 2050 init_completion(&pw->complete);
2051 queue_work(pool->wq, &pw->worker); 2051 queue_work(pool->wq, &pw->worker);
2052 wait_for_completion(&pw->complete); 2052 wait_for_completion(&pw->complete);
2053 } 2053 }
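pool_work_wait() packages a function as an on-stack work item, queues it on the pool's workqueue and blocks on a completion until that function calls pool_work_complete(); noflush_work() below builds on it so do_noflush_start/stop run in the worker's context while the caller waits. Here is a rough userspace analogue of the same synchronous hand-off, assuming pthreads: a condition variable stands in for the completion and a freshly created thread stands in for the single-threaded workqueue (all names are illustrative only).

#include <pthread.h>
#include <stdio.h>

/* Userspace analogue of struct pool_work: a function to run plus a completion. */
struct pool_work {
        void (*fn)(struct pool_work *pw);
        pthread_mutex_t mtx;
        pthread_cond_t cond;
        int done;
};

static void pool_work_complete(struct pool_work *pw)   /* complete(&pw->complete) */
{
        pthread_mutex_lock(&pw->mtx);
        pw->done = 1;
        pthread_cond_signal(&pw->cond);
        pthread_mutex_unlock(&pw->mtx);
}

static void *worker_thread(void *arg)   /* plays the role of the pool workqueue */
{
        struct pool_work *pw = arg;
        pw->fn(pw);
        return NULL;
}

/* Queue the work and block until the work function signals completion. */
static void pool_work_wait(struct pool_work *pw, void (*fn)(struct pool_work *))
{
        pthread_t t;

        pw->fn = fn;
        pw->done = 0;
        pthread_mutex_init(&pw->mtx, NULL);
        pthread_cond_init(&pw->cond, NULL);

        pthread_create(&t, NULL, worker_thread, pw);

        pthread_mutex_lock(&pw->mtx);
        while (!pw->done)               /* wait_for_completion() */
                pthread_cond_wait(&pw->cond, &pw->mtx);
        pthread_mutex_unlock(&pw->mtx);

        pthread_join(t, NULL);
}

static void say_hello(struct pool_work *pw)
{
        printf("running on the worker\n");
        pool_work_complete(pw);         /* the work must signal its own completion */
}

int main(void)
{
        struct pool_work pw;            /* lives on the caller's stack, as in the kernel code */
        pool_work_wait(&pw, say_hello);
        return 0;
}

Because the work item lives on the caller's stack, the work function must signal completion before the caller returns, exactly as the kernel version does. (Link with -pthread.)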
2054 2054
2055 /*----------------------------------------------------------------*/ 2055 /*----------------------------------------------------------------*/
2056 2056
2057 struct noflush_work { 2057 struct noflush_work {
2058 struct pool_work pw; 2058 struct pool_work pw;
2059 struct thin_c *tc; 2059 struct thin_c *tc;
2060 }; 2060 };
2061 2061
2062 static struct noflush_work *to_noflush(struct work_struct *ws) 2062 static struct noflush_work *to_noflush(struct work_struct *ws)
2063 { 2063 {
2064 return container_of(to_pool_work(ws), struct noflush_work, pw); 2064 return container_of(to_pool_work(ws), struct noflush_work, pw);
2065 } 2065 }
2066 2066
2067 static void do_noflush_start(struct work_struct *ws) 2067 static void do_noflush_start(struct work_struct *ws)
2068 { 2068 {
2069 struct noflush_work *w = to_noflush(ws); 2069 struct noflush_work *w = to_noflush(ws);
2070 w->tc->requeue_mode = true; 2070 w->tc->requeue_mode = true;
2071 requeue_io(w->tc); 2071 requeue_io(w->tc);
2072 pool_work_complete(&w->pw); 2072 pool_work_complete(&w->pw);
2073 } 2073 }
2074 2074
2075 static void do_noflush_stop(struct work_struct *ws) 2075 static void do_noflush_stop(struct work_struct *ws)
2076 { 2076 {
2077 struct noflush_work *w = to_noflush(ws); 2077 struct noflush_work *w = to_noflush(ws);
2078 w->tc->requeue_mode = false; 2078 w->tc->requeue_mode = false;
2079 pool_work_complete(&w->pw); 2079 pool_work_complete(&w->pw);
2080 } 2080 }
2081 2081
2082 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *)) 2082 static void noflush_work(struct thin_c *tc, void (*fn)(struct work_struct *))
2083 { 2083 {
2084 struct noflush_work w; 2084 struct noflush_work w;
2085 2085
2086 w.tc = tc; 2086 w.tc = tc;
2087 pool_work_wait(&w.pw, tc->pool, fn); 2087 pool_work_wait(&w.pw, tc->pool, fn);
2088 } 2088 }
2089 2089
2090 /*----------------------------------------------------------------*/ 2090 /*----------------------------------------------------------------*/
2091 2091
2092 static enum pool_mode get_pool_mode(struct pool *pool) 2092 static enum pool_mode get_pool_mode(struct pool *pool)
2093 { 2093 {
2094 return pool->pf.mode; 2094 return pool->pf.mode;
2095 } 2095 }
2096 2096
2097 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode) 2097 static void notify_of_pool_mode_change(struct pool *pool, const char *new_mode)
2098 { 2098 {
2099 dm_table_event(pool->ti->table); 2099 dm_table_event(pool->ti->table);
2100 DMINFO("%s: switching pool to %s mode", 2100 DMINFO("%s: switching pool to %s mode",
2101 dm_device_name(pool->pool_md), new_mode); 2101 dm_device_name(pool->pool_md), new_mode);
2102 } 2102 }
2103 2103
2104 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode) 2104 static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
2105 { 2105 {
2106 struct pool_c *pt = pool->ti->private; 2106 struct pool_c *pt = pool->ti->private;
2107 bool needs_check = dm_pool_metadata_needs_check(pool->pmd); 2107 bool needs_check = dm_pool_metadata_needs_check(pool->pmd);
2108 enum pool_mode old_mode = get_pool_mode(pool); 2108 enum pool_mode old_mode = get_pool_mode(pool);
2109 unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ; 2109 unsigned long no_space_timeout = ACCESS_ONCE(no_space_timeout_secs) * HZ;
2110 2110
2111 /* 2111 /*
2112 * Never allow the pool to transition to PM_WRITE mode if user 2112 * Never allow the pool to transition to PM_WRITE mode if user
2113 * intervention is required to verify metadata and data consistency. 2113 * intervention is required to verify metadata and data consistency.
2114 */ 2114 */
2115 if (new_mode == PM_WRITE && needs_check) { 2115 if (new_mode == PM_WRITE && needs_check) {
2116 DMERR("%s: unable to switch pool to write mode until repaired.", 2116 DMERR("%s: unable to switch pool to write mode until repaired.",
2117 dm_device_name(pool->pool_md)); 2117 dm_device_name(pool->pool_md));
2118 if (old_mode != new_mode) 2118 if (old_mode != new_mode)
2119 new_mode = old_mode; 2119 new_mode = old_mode;
2120 else 2120 else
2121 new_mode = PM_READ_ONLY; 2121 new_mode = PM_READ_ONLY;
2122 } 2122 }
2123 /* 2123 /*
2124 * If we were in PM_FAIL mode, rollback of metadata failed. We're 2124 * If we were in PM_FAIL mode, rollback of metadata failed. We're
2125 * not going to recover without a thin_repair. So we never let the 2125 * not going to recover without a thin_repair. So we never let the
2126 * pool move out of the old mode. 2126 * pool move out of the old mode.
2127 */ 2127 */
2128 if (old_mode == PM_FAIL) 2128 if (old_mode == PM_FAIL)
2129 new_mode = old_mode; 2129 new_mode = old_mode;
2130 2130
2131 switch (new_mode) { 2131 switch (new_mode) {
2132 case PM_FAIL: 2132 case PM_FAIL:
2133 if (old_mode != new_mode) 2133 if (old_mode != new_mode)
2134 notify_of_pool_mode_change(pool, "failure"); 2134 notify_of_pool_mode_change(pool, "failure");
2135 dm_pool_metadata_read_only(pool->pmd); 2135 dm_pool_metadata_read_only(pool->pmd);
2136 pool->process_bio = process_bio_fail; 2136 pool->process_bio = process_bio_fail;
2137 pool->process_discard = process_bio_fail; 2137 pool->process_discard = process_bio_fail;
2138 pool->process_cell = process_cell_fail; 2138 pool->process_cell = process_cell_fail;
2139 pool->process_discard_cell = process_cell_fail; 2139 pool->process_discard_cell = process_cell_fail;
2140 pool->process_prepared_mapping = process_prepared_mapping_fail; 2140 pool->process_prepared_mapping = process_prepared_mapping_fail;
2141 pool->process_prepared_discard = process_prepared_discard_fail; 2141 pool->process_prepared_discard = process_prepared_discard_fail;
2142 2142
2143 error_retry_list(pool); 2143 error_retry_list(pool);
2144 break; 2144 break;
2145 2145
2146 case PM_READ_ONLY: 2146 case PM_READ_ONLY:
2147 if (old_mode != new_mode) 2147 if (old_mode != new_mode)
2148 notify_of_pool_mode_change(pool, "read-only"); 2148 notify_of_pool_mode_change(pool, "read-only");
2149 dm_pool_metadata_read_only(pool->pmd); 2149 dm_pool_metadata_read_only(pool->pmd);
2150 pool->process_bio = process_bio_read_only; 2150 pool->process_bio = process_bio_read_only;
2151 pool->process_discard = process_bio_success; 2151 pool->process_discard = process_bio_success;
2152 pool->process_cell = process_cell_read_only; 2152 pool->process_cell = process_cell_read_only;
2153 pool->process_discard_cell = process_cell_success; 2153 pool->process_discard_cell = process_cell_success;
2154 pool->process_prepared_mapping = process_prepared_mapping_fail; 2154 pool->process_prepared_mapping = process_prepared_mapping_fail;
2155 pool->process_prepared_discard = process_prepared_discard_passdown; 2155 pool->process_prepared_discard = process_prepared_discard_passdown;
2156 2156
2157 error_retry_list(pool); 2157 error_retry_list(pool);
2158 break; 2158 break;
2159 2159
2160 case PM_OUT_OF_DATA_SPACE: 2160 case PM_OUT_OF_DATA_SPACE:
2161 /* 2161 /*
2162 * Ideally we'd never hit this state; the low water mark 2162 * Ideally we'd never hit this state; the low water mark
2163 * would trigger userland to extend the pool before we 2163 * would trigger userland to extend the pool before we
2164 * completely run out of data space. However, many small 2164 * completely run out of data space. However, many small
2165 * IOs to unprovisioned space can consume data space at an 2165 * IOs to unprovisioned space can consume data space at an
2166 * alarming rate. Adjust your low water mark if you're 2166 * alarming rate. Adjust your low water mark if you're
2167 * frequently seeing this mode. 2167 * frequently seeing this mode.
2168 */ 2168 */
2169 if (old_mode != new_mode) 2169 if (old_mode != new_mode)
2170 notify_of_pool_mode_change(pool, "out-of-data-space"); 2170 notify_of_pool_mode_change(pool, "out-of-data-space");
2171 pool->process_bio = process_bio_read_only; 2171 pool->process_bio = process_bio_read_only;
2172 pool->process_discard = process_discard_bio; 2172 pool->process_discard = process_discard_bio;
2173 pool->process_cell = process_cell_read_only; 2173 pool->process_cell = process_cell_read_only;
2174 pool->process_discard_cell = process_discard_cell; 2174 pool->process_discard_cell = process_discard_cell;
2175 pool->process_prepared_mapping = process_prepared_mapping; 2175 pool->process_prepared_mapping = process_prepared_mapping;
2176 pool->process_prepared_discard = process_prepared_discard; 2176 pool->process_prepared_discard = process_prepared_discard;
2177 2177
2178 if (!pool->pf.error_if_no_space && no_space_timeout) 2178 if (!pool->pf.error_if_no_space && no_space_timeout)
2179 queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout); 2179 queue_delayed_work(pool->wq, &pool->no_space_timeout, no_space_timeout);
2180 break; 2180 break;
2181 2181
2182 case PM_WRITE: 2182 case PM_WRITE:
2183 if (old_mode != new_mode) 2183 if (old_mode != new_mode)
2184 notify_of_pool_mode_change(pool, "write"); 2184 notify_of_pool_mode_change(pool, "write");
2185 dm_pool_metadata_read_write(pool->pmd); 2185 dm_pool_metadata_read_write(pool->pmd);
2186 pool->process_bio = process_bio; 2186 pool->process_bio = process_bio;
2187 pool->process_discard = process_discard_bio; 2187 pool->process_discard = process_discard_bio;
2188 pool->process_cell = process_cell; 2188 pool->process_cell = process_cell;
2189 pool->process_discard_cell = process_discard_cell; 2189 pool->process_discard_cell = process_discard_cell;
2190 pool->process_prepared_mapping = process_prepared_mapping; 2190 pool->process_prepared_mapping = process_prepared_mapping;
2191 pool->process_prepared_discard = process_prepared_discard; 2191 pool->process_prepared_discard = process_prepared_discard;
2192 break; 2192 break;
2193 } 2193 }
2194 2194
2195 pool->pf.mode = new_mode; 2195 pool->pf.mode = new_mode;
2196 /* 2196 /*
2197 * The pool mode may have changed, sync it so bind_control_target() 2197 * The pool mode may have changed, sync it so bind_control_target()
2198 * doesn't cause an unexpected mode transition on resume. 2198 * doesn't cause an unexpected mode transition on resume.
2199 */ 2199 */
2200 pt->adjusted_pf.mode = new_mode; 2200 pt->adjusted_pf.mode = new_mode;
2201 } 2201 }
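Rather than testing the pool mode on every bio, set_pool_mode() installs a whole set of process_* handlers once per transition, and it refuses both to leave PM_FAIL and to re-enter PM_WRITE while the metadata needs_check flag is set; the hot paths then simply call whatever handlers are installed. The userspace sketch below shows that strategy-table idea with hypothetical do_normal/do_readonly/do_fail handlers and only two of the six pointers; in the real target PM_READ_ONLY and PM_OUT_OF_DATA_SPACE install different handler sets, which is collapsed here for brevity.

#include <stdio.h>

/* Userspace sketch of the mode -> handler-table switch in set_pool_mode(). */
enum pool_mode { PM_WRITE, PM_READ_ONLY, PM_OUT_OF_DATA_SPACE, PM_FAIL };

struct pool {
        enum pool_mode mode;
        void (*process_bio)(const char *what);
        void (*process_discard)(const char *what);
};

static void do_normal(const char *what)   { printf("%s: handled normally\n", what); }
static void do_readonly(const char *what) { printf("%s: served read-only\n", what); }
static void do_fail(const char *what)     { printf("%s: errored immediately\n", what); }

/* One place decides policy; the hot paths just call pool->process_*(). */
static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
{
        switch (new_mode) {
        case PM_WRITE:
                pool->process_bio = do_normal;
                pool->process_discard = do_normal;
                break;
        case PM_READ_ONLY:
        case PM_OUT_OF_DATA_SPACE:
                pool->process_bio = do_readonly;
                pool->process_discard = do_readonly;
                break;
        case PM_FAIL:
                pool->process_bio = do_fail;
                pool->process_discard = do_fail;
                break;
        }
        pool->mode = new_mode;
}

int main(void)
{
        struct pool pool;

        set_pool_mode(&pool, PM_WRITE);
        pool.process_bio("write bio");

        set_pool_mode(&pool, PM_READ_ONLY);
        pool.process_bio("write bio");
        pool.process_discard("discard");
        return 0;
}

Swapping the pointers in one place keeps the policy decisions out of the fast path and makes the permitted mode transitions easy to audit.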
2202 2202
2203 static void abort_transaction(struct pool *pool) 2203 static void abort_transaction(struct pool *pool)
2204 { 2204 {
2205 const char *dev_name = dm_device_name(pool->pool_md); 2205 const char *dev_name = dm_device_name(pool->pool_md);
2206 2206
2207 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name); 2207 DMERR_LIMIT("%s: aborting current metadata transaction", dev_name);
2208 if (dm_pool_abort_metadata(pool->pmd)) { 2208 if (dm_pool_abort_metadata(pool->pmd)) {
2209 DMERR("%s: failed to abort metadata transaction", dev_name); 2209 DMERR("%s: failed to abort metadata transaction", dev_name);
2210 set_pool_mode(pool, PM_FAIL); 2210 set_pool_mode(pool, PM_FAIL);
2211 } 2211 }
2212 2212
2213 if (dm_pool_metadata_set_needs_check(pool->pmd)) { 2213 if (dm_pool_metadata_set_needs_check(pool->pmd)) {
2214 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name); 2214 DMERR("%s: failed to set 'needs_check' flag in metadata", dev_name);
2215 set_pool_mode(pool, PM_FAIL); 2215 set_pool_mode(pool, PM_FAIL);
2216 } 2216 }
2217 } 2217 }
2218 2218
2219 static void metadata_operation_failed(struct pool *pool, const char *op, int r) 2219 static void metadata_operation_failed(struct pool *pool, const char *op, int r)
2220 { 2220 {
2221 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d", 2221 DMERR_LIMIT("%s: metadata operation '%s' failed: error = %d",
2222 dm_device_name(pool->pool_md), op, r); 2222 dm_device_name(pool->pool_md), op, r);
2223 2223
2224 abort_transaction(pool); 2224 abort_transaction(pool);
2225 set_pool_mode(pool, PM_READ_ONLY); 2225 set_pool_mode(pool, PM_READ_ONLY);
2226 } 2226 }
2227 2227
2228 /*----------------------------------------------------------------*/ 2228 /*----------------------------------------------------------------*/
2229 2229
2230 /* 2230 /*
2231 * Mapping functions. 2231 * Mapping functions.
2232 */ 2232 */
2233 2233
2234 /* 2234 /*
2235 * Called only while mapping a thin bio to hand it over to the workqueue. 2235 * Called only while mapping a thin bio to hand it over to the workqueue.
2236 */ 2236 */
2237 static void thin_defer_bio(struct thin_c *tc, struct bio *bio) 2237 static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
2238 { 2238 {
2239 unsigned long flags; 2239 unsigned long flags;
2240 struct pool *pool = tc->pool; 2240 struct pool *pool = tc->pool;
2241 2241
2242 spin_lock_irqsave(&tc->lock, flags); 2242 spin_lock_irqsave(&tc->lock, flags);
2243 bio_list_add(&tc->deferred_bio_list, bio); 2243 bio_list_add(&tc->deferred_bio_list, bio);
2244 spin_unlock_irqrestore(&tc->lock, flags); 2244 spin_unlock_irqrestore(&tc->lock, flags);
2245 2245
2246 wake_worker(pool); 2246 wake_worker(pool);
2247 } 2247 }
2248 2248
2249 static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio) 2249 static void thin_defer_bio_with_throttle(struct thin_c *tc, struct bio *bio)
2250 { 2250 {
2251 struct pool *pool = tc->pool; 2251 struct pool *pool = tc->pool;
2252 2252
2253 throttle_lock(&pool->throttle); 2253 throttle_lock(&pool->throttle);
2254 thin_defer_bio(tc, bio); 2254 thin_defer_bio(tc, bio);
2255 throttle_unlock(&pool->throttle); 2255 throttle_unlock(&pool->throttle);
2256 } 2256 }
2257 2257
2258 static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell) 2258 static void thin_defer_cell(struct thin_c *tc, struct dm_bio_prison_cell *cell)
2259 { 2259 {
2260 unsigned long flags; 2260 unsigned long flags;
2261 struct pool *pool = tc->pool; 2261 struct pool *pool = tc->pool;
2262 2262
2263 throttle_lock(&pool->throttle); 2263 throttle_lock(&pool->throttle);
2264 spin_lock_irqsave(&tc->lock, flags); 2264 spin_lock_irqsave(&tc->lock, flags);
2265 list_add_tail(&cell->user_list, &tc->deferred_cells); 2265 list_add_tail(&cell->user_list, &tc->deferred_cells);
2266 spin_unlock_irqrestore(&tc->lock, flags); 2266 spin_unlock_irqrestore(&tc->lock, flags);
2267 throttle_unlock(&pool->throttle); 2267 throttle_unlock(&pool->throttle);
2268 2268
2269 wake_worker(pool); 2269 wake_worker(pool);
2270 } 2270 }
2271 2271
2272 static void thin_hook_bio(struct thin_c *tc, struct bio *bio) 2272 static void thin_hook_bio(struct thin_c *tc, struct bio *bio)
2273 { 2273 {
2274 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 2274 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
2275 2275
2276 h->tc = tc; 2276 h->tc = tc;
2277 h->shared_read_entry = NULL; 2277 h->shared_read_entry = NULL;
2278 h->all_io_entry = NULL; 2278 h->all_io_entry = NULL;
2279 h->overwrite_mapping = NULL; 2279 h->overwrite_mapping = NULL;
2280 } 2280 }
2281 2281
2282 /* 2282 /*
2283 * Non-blocking function called from the thin target's map function. 2283 * Non-blocking function called from the thin target's map function.
2284 */ 2284 */
2285 static int thin_bio_map(struct dm_target *ti, struct bio *bio) 2285 static int thin_bio_map(struct dm_target *ti, struct bio *bio)
2286 { 2286 {
2287 int r; 2287 int r;
2288 struct thin_c *tc = ti->private; 2288 struct thin_c *tc = ti->private;
2289 dm_block_t block = get_bio_block(tc, bio); 2289 dm_block_t block = get_bio_block(tc, bio);
2290 struct dm_thin_device *td = tc->td; 2290 struct dm_thin_device *td = tc->td;
2291 struct dm_thin_lookup_result result; 2291 struct dm_thin_lookup_result result;
2292 struct dm_bio_prison_cell *virt_cell, *data_cell; 2292 struct dm_bio_prison_cell *virt_cell, *data_cell;
2293 struct dm_cell_key key; 2293 struct dm_cell_key key;
2294 2294
2295 thin_hook_bio(tc, bio); 2295 thin_hook_bio(tc, bio);
2296 2296
2297 if (tc->requeue_mode) { 2297 if (tc->requeue_mode) {
2298 bio_endio(bio, DM_ENDIO_REQUEUE); 2298 bio_endio(bio, DM_ENDIO_REQUEUE);
2299 return DM_MAPIO_SUBMITTED; 2299 return DM_MAPIO_SUBMITTED;
2300 } 2300 }
2301 2301
2302 if (get_pool_mode(tc->pool) == PM_FAIL) { 2302 if (get_pool_mode(tc->pool) == PM_FAIL) {
2303 bio_io_error(bio); 2303 bio_io_error(bio);
2304 return DM_MAPIO_SUBMITTED; 2304 return DM_MAPIO_SUBMITTED;
2305 } 2305 }
2306 2306
2307 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { 2307 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
2308 thin_defer_bio_with_throttle(tc, bio); 2308 thin_defer_bio_with_throttle(tc, bio);
2309 return DM_MAPIO_SUBMITTED; 2309 return DM_MAPIO_SUBMITTED;
2310 } 2310 }
2311 2311
2312 /* 2312 /*
2313 * We must hold the virtual cell before doing the lookup, otherwise 2313 * We must hold the virtual cell before doing the lookup, otherwise
2314 * there's a race with discard. 2314 * there's a race with discard.
2315 */ 2315 */
2316 build_virtual_key(tc->td, block, &key); 2316 build_virtual_key(tc->td, block, &key);
2317 if (bio_detain(tc->pool, &key, bio, &virt_cell)) 2317 if (bio_detain(tc->pool, &key, bio, &virt_cell))
2318 return DM_MAPIO_SUBMITTED; 2318 return DM_MAPIO_SUBMITTED;
2319 2319
2320 r = dm_thin_find_block(td, block, 0, &result); 2320 r = dm_thin_find_block(td, block, 0, &result);
2321 2321
2322 /* 2322 /*
2323 * Note that we defer readahead too. 2323 * Note that we defer readahead too.
2324 */ 2324 */
2325 switch (r) { 2325 switch (r) {
2326 case 0: 2326 case 0:
2327 if (unlikely(result.shared)) { 2327 if (unlikely(result.shared)) {
2328 /* 2328 /*
2329 * We have a race condition here between the 2329 * We have a race condition here between the
2330 * result.shared value returned by the lookup and 2330 * result.shared value returned by the lookup and
2331 * snapshot creation, which may cause new 2331 * snapshot creation, which may cause new
2332 * sharing. 2332 * sharing.
2333 * 2333 *
2334 * To avoid this always quiesce the origin before 2334 * To avoid this always quiesce the origin before
2335 * taking the snap. You want to do this anyway to 2335 * taking the snap. You want to do this anyway to
2336 * ensure a consistent application view 2336 * ensure a consistent application view
2337 * (i.e. lockfs). 2337 * (i.e. lockfs).
2338 * 2338 *
2339 * More distant ancestors are irrelevant. The 2339 * More distant ancestors are irrelevant. The
2340 * shared flag will be set in their case. 2340 * shared flag will be set in their case.
2341 */ 2341 */
2342 thin_defer_cell(tc, virt_cell); 2342 thin_defer_cell(tc, virt_cell);
2343 return DM_MAPIO_SUBMITTED; 2343 return DM_MAPIO_SUBMITTED;
2344 } 2344 }
2345 2345
2346 build_data_key(tc->td, result.block, &key); 2346 build_data_key(tc->td, result.block, &key);
2347 if (bio_detain(tc->pool, &key, bio, &data_cell)) { 2347 if (bio_detain(tc->pool, &key, bio, &data_cell)) {
2348 cell_defer_no_holder(tc, virt_cell); 2348 cell_defer_no_holder(tc, virt_cell);
2349 return DM_MAPIO_SUBMITTED; 2349 return DM_MAPIO_SUBMITTED;
2350 } 2350 }
2351 2351
2352 inc_all_io_entry(tc->pool, bio); 2352 inc_all_io_entry(tc->pool, bio);
2353 cell_defer_no_holder(tc, data_cell); 2353 cell_defer_no_holder(tc, data_cell);
2354 cell_defer_no_holder(tc, virt_cell); 2354 cell_defer_no_holder(tc, virt_cell);
2355 2355
2356 remap(tc, bio, result.block); 2356 remap(tc, bio, result.block);
2357 return DM_MAPIO_REMAPPED; 2357 return DM_MAPIO_REMAPPED;
2358 2358
2359 case -ENODATA: 2359 case -ENODATA:
2360 if (get_pool_mode(tc->pool) == PM_READ_ONLY) { 2360 if (get_pool_mode(tc->pool) == PM_READ_ONLY) {
2361 /* 2361 /*
2362 * This block isn't provisioned, and we have no way 2362 * This block isn't provisioned, and we have no way
2363 * of doing so. 2363 * of doing so.
2364 */ 2364 */
2365 handle_unserviceable_bio(tc->pool, bio); 2365 handle_unserviceable_bio(tc->pool, bio);
2366 cell_defer_no_holder(tc, virt_cell); 2366 cell_defer_no_holder(tc, virt_cell);
2367 return DM_MAPIO_SUBMITTED; 2367 return DM_MAPIO_SUBMITTED;
2368 } 2368 }
2369 /* fall through */ 2369 /* fall through */
2370 2370
2371 case -EWOULDBLOCK: 2371 case -EWOULDBLOCK:
2372 thin_defer_cell(tc, virt_cell); 2372 thin_defer_cell(tc, virt_cell);
2373 return DM_MAPIO_SUBMITTED; 2373 return DM_MAPIO_SUBMITTED;
2374 2374
2375 default: 2375 default:
2376 /* 2376 /*
2377 * Must always call bio_io_error on failure. 2377 * Must always call bio_io_error on failure.
2378 * dm_thin_find_block can fail with -EINVAL if the 2378 * dm_thin_find_block can fail with -EINVAL if the
2379 * pool is switched to fail-io mode. 2379 * pool is switched to fail-io mode.
2380 */ 2380 */
2381 bio_io_error(bio); 2381 bio_io_error(bio);
2382 cell_defer_no_holder(tc, virt_cell); 2382 cell_defer_no_holder(tc, virt_cell);
2383 return DM_MAPIO_SUBMITTED; 2383 return DM_MAPIO_SUBMITTED;
2384 } 2384 }
2385 } 2385 }
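thin_bio_map() is the fast path: requeue or error out early in requeue/fail modes, push flushes and discards to the worker, detain the virtual cell, look the block up, and then either remap in place, defer to the worker, or error the bio. The sketch below is a simplified userspace rendering of that decision tree only; the bio-prison cell detention is left out, handle_unserviceable_bio() (which may retry rather than error, depending on error_if_no_space) is collapsed into a plain error, and every name here is hypothetical.

#include <errno.h>
#include <stdio.h>

enum map_result { MAP_REMAPPED, MAP_DEFERRED, MAP_ERRORED, MAP_REQUEUED };

struct lookup {
        int rc;         /* 0, -ENODATA or -EWOULDBLOCK, as dm_thin_find_block returns */
        int shared;     /* is the data block shared with a snapshot? */
};

static enum map_result map_bio(int requeue_mode, int pool_failed, int is_flush_or_discard,
                               struct lookup lk, int pool_read_only)
{
        if (requeue_mode)
                return MAP_REQUEUED;            /* DM_ENDIO_REQUEUE */
        if (pool_failed)
                return MAP_ERRORED;             /* PM_FAIL: bio_io_error() */
        if (is_flush_or_discard)
                return MAP_DEFERRED;            /* REQ_FLUSH/FUA/DISCARD go to the worker */

        if (lk.rc == 0)
                /* shared blocks need copy-on-write first, so they are deferred too */
                return lk.shared ? MAP_DEFERRED : MAP_REMAPPED;

        if (lk.rc == -ENODATA && pool_read_only)
                return MAP_ERRORED;             /* unprovisioned and no way to provision */

        if (lk.rc == -ENODATA || lk.rc == -EWOULDBLOCK)
                return MAP_DEFERRED;            /* let the worker provision or retry */

        return MAP_ERRORED;                     /* e.g. -EINVAL once in fail-io mode */
}

int main(void)
{
        struct lookup hit = { 0, 0 }, snap = { 0, 1 }, miss = { -ENODATA, 0 };

        printf("hit=%d shared=%d miss(ro)=%d\n",
               map_bio(0, 0, 0, hit, 0),        /* remapped straight to the data device */
               map_bio(0, 0, 0, snap, 0),       /* deferred so sharing can be broken */
               map_bio(0, 0, 0, miss, 1));      /* errored: read-only pool cannot provision */
        return 0;
}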
2386 2386
2387 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) 2387 static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits)
2388 { 2388 {
2389 struct pool_c *pt = container_of(cb, struct pool_c, callbacks); 2389 struct pool_c *pt = container_of(cb, struct pool_c, callbacks);
2390 struct request_queue *q; 2390 struct request_queue *q;
2391 2391
2392 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE) 2392 if (get_pool_mode(pt->pool) == PM_OUT_OF_DATA_SPACE)
2393 return 1; 2393 return 1;
2394 2394
2395 q = bdev_get_queue(pt->data_dev->bdev); 2395 q = bdev_get_queue(pt->data_dev->bdev);
2396 return bdi_congested(&q->backing_dev_info, bdi_bits); 2396 return bdi_congested(&q->backing_dev_info, bdi_bits);
2397 } 2397 }
2398 2398
2399 static void requeue_bios(struct pool *pool) 2399 static void requeue_bios(struct pool *pool)
2400 { 2400 {
2401 unsigned long flags; 2401 unsigned long flags;
2402 struct thin_c *tc; 2402 struct thin_c *tc;
2403 2403
2404 rcu_read_lock(); 2404 rcu_read_lock();
2405 list_for_each_entry_rcu(tc, &pool->active_thins, list) { 2405 list_for_each_entry_rcu(tc, &pool->active_thins, list) {
2406 spin_lock_irqsave(&tc->lock, flags); 2406 spin_lock_irqsave(&tc->lock, flags);
2407 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list); 2407 bio_list_merge(&tc->deferred_bio_list, &tc->retry_on_resume_list);
2408 bio_list_init(&tc->retry_on_resume_list); 2408 bio_list_init(&tc->retry_on_resume_list);
2409 spin_unlock_irqrestore(&tc->lock, flags); 2409 spin_unlock_irqrestore(&tc->lock, flags);
2410 } 2410 }
2411 rcu_read_unlock(); 2411 rcu_read_unlock();
2412 } 2412 }
2413 2413
2414 /*---------------------------------------------------------------- 2414 /*----------------------------------------------------------------
2415 * Binding of control targets to a pool object 2415 * Binding of control targets to a pool object
2416 *--------------------------------------------------------------*/ 2416 *--------------------------------------------------------------*/
2417 static bool data_dev_supports_discard(struct pool_c *pt) 2417 static bool data_dev_supports_discard(struct pool_c *pt)
2418 { 2418 {
2419 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 2419 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
2420 2420
2421 return q && blk_queue_discard(q); 2421 return q && blk_queue_discard(q);
2422 } 2422 }
2423 2423
2424 static bool is_factor(sector_t block_size, uint32_t n) 2424 static bool is_factor(sector_t block_size, uint32_t n)
2425 { 2425 {
2426 return !sector_div(block_size, n); 2426 return !sector_div(block_size, n);
2427 } 2427 }
2428 2428
2429 /* 2429 /*
2430 * If discard_passdown was enabled verify that the data device 2430 * If discard_passdown was enabled verify that the data device
2431 * supports discards. Disable discard_passdown if not. 2431 * supports discards. Disable discard_passdown if not.
2432 */ 2432 */
2433 static void disable_passdown_if_not_supported(struct pool_c *pt) 2433 static void disable_passdown_if_not_supported(struct pool_c *pt)
2434 { 2434 {
2435 struct pool *pool = pt->pool; 2435 struct pool *pool = pt->pool;
2436 struct block_device *data_bdev = pt->data_dev->bdev; 2436 struct block_device *data_bdev = pt->data_dev->bdev;
2437 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits; 2437 struct queue_limits *data_limits = &bdev_get_queue(data_bdev)->limits;
2438 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT; 2438 sector_t block_size = pool->sectors_per_block << SECTOR_SHIFT;
2439 const char *reason = NULL; 2439 const char *reason = NULL;
2440 char buf[BDEVNAME_SIZE]; 2440 char buf[BDEVNAME_SIZE];
2441 2441
2442 if (!pt->adjusted_pf.discard_passdown) 2442 if (!pt->adjusted_pf.discard_passdown)
2443 return; 2443 return;
2444 2444
2445 if (!data_dev_supports_discard(pt)) 2445 if (!data_dev_supports_discard(pt))
2446 reason = "discard unsupported"; 2446 reason = "discard unsupported";
2447 2447
2448 else if (data_limits->max_discard_sectors < pool->sectors_per_block) 2448 else if (data_limits->max_discard_sectors < pool->sectors_per_block)
2449 reason = "max discard sectors smaller than a block"; 2449 reason = "max discard sectors smaller than a block";
2450 2450
2451 else if (data_limits->discard_granularity > block_size) 2451 else if (data_limits->discard_granularity > block_size)
2452 reason = "discard granularity larger than a block"; 2452 reason = "discard granularity larger than a block";
2453 2453
2454 else if (!is_factor(block_size, data_limits->discard_granularity)) 2454 else if (!is_factor(block_size, data_limits->discard_granularity))
2455 reason = "discard granularity not a factor of block size"; 2455 reason = "discard granularity not a factor of block size";
2456 2456
2457 if (reason) { 2457 if (reason) {
2458 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason); 2458 DMWARN("Data device (%s) %s: Disabling discard passdown.", bdevname(data_bdev, buf), reason);
2459 pt->adjusted_pf.discard_passdown = false; 2459 pt->adjusted_pf.discard_passdown = false;
2460 } 2460 }
2461 } 2461 }
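disable_passdown_if_not_supported() runs four checks before letting discards reach the data device: the device must support discard at all, it must accept at least one pool block per discard, its granularity must not exceed the pool block size, and the pool block size must be a multiple of that granularity (the is_factor()/sector_div() test above). A small userspace rendering of the same checks with made-up numbers follows; passdown_problem() and its parameters are hypothetical.

#include <stdint.h>
#include <stdio.h>

static int is_factor(uint64_t block_size, uint32_t n)
{
        return (block_size % n) == 0;           /* sector_div() in the kernel */
}

static const char *passdown_problem(int supports_discard,
                                    uint64_t max_discard_sectors,
                                    uint64_t discard_granularity_bytes,
                                    uint64_t sectors_per_block)
{
        uint64_t block_size_bytes = sectors_per_block << 9;     /* SECTOR_SHIFT */

        if (!supports_discard)
                return "discard unsupported";
        if (max_discard_sectors < sectors_per_block)
                return "max discard sectors smaller than a block";
        if (discard_granularity_bytes > block_size_bytes)
                return "discard granularity larger than a block";
        if (!is_factor(block_size_bytes, (uint32_t)discard_granularity_bytes))
                return "discard granularity not a factor of block size";
        return NULL;    /* passdown can stay enabled */
}

int main(void)
{
        /* 128-sector (64KiB) thin blocks, device advertises 4KiB granularity: OK */
        const char *r1 = passdown_problem(1, 65536, 4096, 128);
        /* same pool, but the device only discards 64 sectors at a time: disabled */
        const char *r2 = passdown_problem(1, 64, 4096, 128);

        printf("case1: %s\ncase2: %s\n", r1 ? r1 : "passdown ok", r2 ? r2 : "passdown ok");
        return 0;
}

If any check fails the pool keeps processing discards for its own mappings and simply stops passing them down, which is the DMWARN path above.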
2462 2462
2463 static int bind_control_target(struct pool *pool, struct dm_target *ti) 2463 static int bind_control_target(struct pool *pool, struct dm_target *ti)
2464 { 2464 {
2465 struct pool_c *pt = ti->private; 2465 struct pool_c *pt = ti->private;
2466 2466
2467 /* 2467 /*
2468 * We want to make sure that a pool in PM_FAIL mode is never upgraded. 2468 * We want to make sure that a pool in PM_FAIL mode is never upgraded.
2469 */ 2469 */
2470 enum pool_mode old_mode = get_pool_mode(pool); 2470 enum pool_mode old_mode = get_pool_mode(pool);
2471 enum pool_mode new_mode = pt->adjusted_pf.mode; 2471 enum pool_mode new_mode = pt->adjusted_pf.mode;
2472 2472
2473 /* 2473 /*
2474 * Don't change the pool's mode until set_pool_mode() below. 2474 * Don't change the pool's mode until set_pool_mode() below.
2475 * Otherwise the pool's process_* function pointers may 2475 * Otherwise the pool's process_* function pointers may
2476 * not match the desired pool mode. 2476 * not match the desired pool mode.
2477 */ 2477 */
2478 pt->adjusted_pf.mode = old_mode; 2478 pt->adjusted_pf.mode = old_mode;
2479 2479
2480 pool->ti = ti; 2480 pool->ti = ti;
2481 pool->pf = pt->adjusted_pf; 2481 pool->pf = pt->adjusted_pf;
2482 pool->low_water_blocks = pt->low_water_blocks; 2482 pool->low_water_blocks = pt->low_water_blocks;
2483 2483
2484 set_pool_mode(pool, new_mode); 2484 set_pool_mode(pool, new_mode);
2485 2485
2486 return 0; 2486 return 0;
2487 } 2487 }
2488 2488
2489 static void unbind_control_target(struct pool *pool, struct dm_target *ti) 2489 static void unbind_control_target(struct pool *pool, struct dm_target *ti)
2490 { 2490 {
2491 if (pool->ti == ti) 2491 if (pool->ti == ti)
2492 pool->ti = NULL; 2492 pool->ti = NULL;
2493 } 2493 }
2494 2494
2495 /*---------------------------------------------------------------- 2495 /*----------------------------------------------------------------
2496 * Pool creation 2496 * Pool creation
2497 *--------------------------------------------------------------*/ 2497 *--------------------------------------------------------------*/
2498 /* Initialize pool features. */ 2498 /* Initialize pool features. */
2499 static void pool_features_init(struct pool_features *pf) 2499 static void pool_features_init(struct pool_features *pf)
2500 { 2500 {
2501 pf->mode = PM_WRITE; 2501 pf->mode = PM_WRITE;
2502 pf->zero_new_blocks = true; 2502 pf->zero_new_blocks = true;
2503 pf->discard_enabled = true; 2503 pf->discard_enabled = true;
2504 pf->discard_passdown = true; 2504 pf->discard_passdown = true;
2505 pf->error_if_no_space = false; 2505 pf->error_if_no_space = false;
2506 } 2506 }
2507 2507
2508 static void __pool_destroy(struct pool *pool) 2508 static void __pool_destroy(struct pool *pool)
2509 { 2509 {
2510 __pool_table_remove(pool); 2510 __pool_table_remove(pool);
2511 2511
2512 if (dm_pool_metadata_close(pool->pmd) < 0) 2512 if (dm_pool_metadata_close(pool->pmd) < 0)
2513 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2513 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2514 2514
2515 dm_bio_prison_destroy(pool->prison); 2515 dm_bio_prison_destroy(pool->prison);
2516 dm_kcopyd_client_destroy(pool->copier); 2516 dm_kcopyd_client_destroy(pool->copier);
2517 2517
2518 if (pool->wq) 2518 if (pool->wq)
2519 destroy_workqueue(pool->wq); 2519 destroy_workqueue(pool->wq);
2520 2520
2521 if (pool->next_mapping) 2521 if (pool->next_mapping)
2522 mempool_free(pool->next_mapping, pool->mapping_pool); 2522 mempool_free(pool->next_mapping, pool->mapping_pool);
2523 mempool_destroy(pool->mapping_pool); 2523 mempool_destroy(pool->mapping_pool);
2524 dm_deferred_set_destroy(pool->shared_read_ds); 2524 dm_deferred_set_destroy(pool->shared_read_ds);
2525 dm_deferred_set_destroy(pool->all_io_ds); 2525 dm_deferred_set_destroy(pool->all_io_ds);
2526 kfree(pool); 2526 kfree(pool);
2527 } 2527 }
2528 2528
2529 static struct kmem_cache *_new_mapping_cache; 2529 static struct kmem_cache *_new_mapping_cache;
2530 2530
2531 static struct pool *pool_create(struct mapped_device *pool_md, 2531 static struct pool *pool_create(struct mapped_device *pool_md,
2532 struct block_device *metadata_dev, 2532 struct block_device *metadata_dev,
2533 unsigned long block_size, 2533 unsigned long block_size,
2534 int read_only, char **error) 2534 int read_only, char **error)
2535 { 2535 {
2536 int r; 2536 int r;
2537 void *err_p; 2537 void *err_p;
2538 struct pool *pool; 2538 struct pool *pool;
2539 struct dm_pool_metadata *pmd; 2539 struct dm_pool_metadata *pmd;
2540 bool format_device = read_only ? false : true; 2540 bool format_device = read_only ? false : true;
2541 2541
2542 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device); 2542 pmd = dm_pool_metadata_open(metadata_dev, block_size, format_device);
2543 if (IS_ERR(pmd)) { 2543 if (IS_ERR(pmd)) {
2544 *error = "Error creating metadata object"; 2544 *error = "Error creating metadata object";
2545 return (struct pool *)pmd; 2545 return (struct pool *)pmd;
2546 } 2546 }
2547 2547
2548 pool = kmalloc(sizeof(*pool), GFP_KERNEL); 2548 pool = kmalloc(sizeof(*pool), GFP_KERNEL);
2549 if (!pool) { 2549 if (!pool) {
2550 *error = "Error allocating memory for pool"; 2550 *error = "Error allocating memory for pool";
2551 err_p = ERR_PTR(-ENOMEM); 2551 err_p = ERR_PTR(-ENOMEM);
2552 goto bad_pool; 2552 goto bad_pool;
2553 } 2553 }
2554 2554
2555 pool->pmd = pmd; 2555 pool->pmd = pmd;
2556 pool->sectors_per_block = block_size; 2556 pool->sectors_per_block = block_size;
2557 if (block_size & (block_size - 1)) 2557 if (block_size & (block_size - 1))
2558 pool->sectors_per_block_shift = -1; 2558 pool->sectors_per_block_shift = -1;
2559 else 2559 else
2560 pool->sectors_per_block_shift = __ffs(block_size); 2560 pool->sectors_per_block_shift = __ffs(block_size);
2561 pool->low_water_blocks = 0; 2561 pool->low_water_blocks = 0;
2562 pool_features_init(&pool->pf); 2562 pool_features_init(&pool->pf);
2563 pool->prison = dm_bio_prison_create(); 2563 pool->prison = dm_bio_prison_create();
2564 if (!pool->prison) { 2564 if (!pool->prison) {
2565 *error = "Error creating pool's bio prison"; 2565 *error = "Error creating pool's bio prison";
2566 err_p = ERR_PTR(-ENOMEM); 2566 err_p = ERR_PTR(-ENOMEM);
2567 goto bad_prison; 2567 goto bad_prison;
2568 } 2568 }
2569 2569
2570 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle); 2570 pool->copier = dm_kcopyd_client_create(&dm_kcopyd_throttle);
2571 if (IS_ERR(pool->copier)) { 2571 if (IS_ERR(pool->copier)) {
2572 r = PTR_ERR(pool->copier); 2572 r = PTR_ERR(pool->copier);
2573 *error = "Error creating pool's kcopyd client"; 2573 *error = "Error creating pool's kcopyd client";
2574 err_p = ERR_PTR(r); 2574 err_p = ERR_PTR(r);
2575 goto bad_kcopyd_client; 2575 goto bad_kcopyd_client;
2576 } 2576 }
2577 2577
2578 /* 2578 /*
2579 * Create singlethreaded workqueue that will service all devices 2579 * Create singlethreaded workqueue that will service all devices
2580 * that use this metadata. 2580 * that use this metadata.
2581 */ 2581 */
2582 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM); 2582 pool->wq = alloc_ordered_workqueue("dm-" DM_MSG_PREFIX, WQ_MEM_RECLAIM);
2583 if (!pool->wq) { 2583 if (!pool->wq) {
2584 *error = "Error creating pool's workqueue"; 2584 *error = "Error creating pool's workqueue";
2585 err_p = ERR_PTR(-ENOMEM); 2585 err_p = ERR_PTR(-ENOMEM);
2586 goto bad_wq; 2586 goto bad_wq;
2587 } 2587 }
2588 2588
2589 throttle_init(&pool->throttle); 2589 throttle_init(&pool->throttle);
2590 INIT_WORK(&pool->worker, do_worker); 2590 INIT_WORK(&pool->worker, do_worker);
2591 INIT_DELAYED_WORK(&pool->waker, do_waker); 2591 INIT_DELAYED_WORK(&pool->waker, do_waker);
2592 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout); 2592 INIT_DELAYED_WORK(&pool->no_space_timeout, do_no_space_timeout);
2593 spin_lock_init(&pool->lock); 2593 spin_lock_init(&pool->lock);
2594 bio_list_init(&pool->deferred_flush_bios); 2594 bio_list_init(&pool->deferred_flush_bios);
2595 INIT_LIST_HEAD(&pool->prepared_mappings); 2595 INIT_LIST_HEAD(&pool->prepared_mappings);
2596 INIT_LIST_HEAD(&pool->prepared_discards); 2596 INIT_LIST_HEAD(&pool->prepared_discards);
2597 INIT_LIST_HEAD(&pool->active_thins); 2597 INIT_LIST_HEAD(&pool->active_thins);
2598 pool->low_water_triggered = false; 2598 pool->low_water_triggered = false;
2599 pool->suspended = true; 2599 pool->suspended = true;
2600 2600
2601 pool->shared_read_ds = dm_deferred_set_create(); 2601 pool->shared_read_ds = dm_deferred_set_create();
2602 if (!pool->shared_read_ds) { 2602 if (!pool->shared_read_ds) {
2603 *error = "Error creating pool's shared read deferred set"; 2603 *error = "Error creating pool's shared read deferred set";
2604 err_p = ERR_PTR(-ENOMEM); 2604 err_p = ERR_PTR(-ENOMEM);
2605 goto bad_shared_read_ds; 2605 goto bad_shared_read_ds;
2606 } 2606 }
2607 2607
2608 pool->all_io_ds = dm_deferred_set_create(); 2608 pool->all_io_ds = dm_deferred_set_create();
2609 if (!pool->all_io_ds) { 2609 if (!pool->all_io_ds) {
2610 *error = "Error creating pool's all io deferred set"; 2610 *error = "Error creating pool's all io deferred set";
2611 err_p = ERR_PTR(-ENOMEM); 2611 err_p = ERR_PTR(-ENOMEM);
2612 goto bad_all_io_ds; 2612 goto bad_all_io_ds;
2613 } 2613 }
2614 2614
2615 pool->next_mapping = NULL; 2615 pool->next_mapping = NULL;
2616 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE, 2616 pool->mapping_pool = mempool_create_slab_pool(MAPPING_POOL_SIZE,
2617 _new_mapping_cache); 2617 _new_mapping_cache);
2618 if (!pool->mapping_pool) { 2618 if (!pool->mapping_pool) {
2619 *error = "Error creating pool's mapping mempool"; 2619 *error = "Error creating pool's mapping mempool";
2620 err_p = ERR_PTR(-ENOMEM); 2620 err_p = ERR_PTR(-ENOMEM);
2621 goto bad_mapping_pool; 2621 goto bad_mapping_pool;
2622 } 2622 }
2623 2623
2624 pool->ref_count = 1; 2624 pool->ref_count = 1;
2625 pool->last_commit_jiffies = jiffies; 2625 pool->last_commit_jiffies = jiffies;
2626 pool->pool_md = pool_md; 2626 pool->pool_md = pool_md;
2627 pool->md_dev = metadata_dev; 2627 pool->md_dev = metadata_dev;
2628 __pool_table_insert(pool); 2628 __pool_table_insert(pool);
2629 2629
2630 return pool; 2630 return pool;
2631 2631
2632 bad_mapping_pool: 2632 bad_mapping_pool:
2633 dm_deferred_set_destroy(pool->all_io_ds); 2633 dm_deferred_set_destroy(pool->all_io_ds);
2634 bad_all_io_ds: 2634 bad_all_io_ds:
2635 dm_deferred_set_destroy(pool->shared_read_ds); 2635 dm_deferred_set_destroy(pool->shared_read_ds);
2636 bad_shared_read_ds: 2636 bad_shared_read_ds:
2637 destroy_workqueue(pool->wq); 2637 destroy_workqueue(pool->wq);
2638 bad_wq: 2638 bad_wq:
2639 dm_kcopyd_client_destroy(pool->copier); 2639 dm_kcopyd_client_destroy(pool->copier);
2640 bad_kcopyd_client: 2640 bad_kcopyd_client:
2641 dm_bio_prison_destroy(pool->prison); 2641 dm_bio_prison_destroy(pool->prison);
2642 bad_prison: 2642 bad_prison:
2643 kfree(pool); 2643 kfree(pool);
2644 bad_pool: 2644 bad_pool:
2645 if (dm_pool_metadata_close(pmd)) 2645 if (dm_pool_metadata_close(pmd))
2646 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 2646 DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
2647 2647
2648 return err_p; 2648 return err_p;
2649 } 2649 }
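pool_create() acquires its resources in a fixed order (metadata object, pool struct, bio prison, kcopyd client, workqueue, deferred sets, mapping mempool) and unwinds them through the cascading labels at the end, each label releasing exactly what was acquired before the failing step. This is the standard kernel goto-unwind idiom; a tiny self-contained userspace illustration (struct widget and its fields are invented) follows.

#include <stdio.h>
#include <stdlib.h>

struct widget { char *a, *b, *c; };

static struct widget *widget_create(void)
{
        struct widget *w = malloc(sizeof(*w));
        if (!w)
                goto bad_widget;

        w->a = malloc(16);
        if (!w->a)
                goto bad_a;

        w->b = malloc(16);
        if (!w->b)
                goto bad_b;

        w->c = malloc(16);
        if (!w->c)
                goto bad_c;

        return w;                       /* fully constructed */

bad_c:
        free(w->b);                     /* unwind in reverse order of acquisition */
bad_b:
        free(w->a);
bad_a:
        free(w);
bad_widget:
        return NULL;
}

int main(void)
{
        struct widget *w = widget_create();
        printf("%s\n", w ? "created" : "failed");
        return 0;
}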
2650 2650
2651 static void __pool_inc(struct pool *pool) 2651 static void __pool_inc(struct pool *pool)
2652 { 2652 {
2653 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2653 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2654 pool->ref_count++; 2654 pool->ref_count++;
2655 } 2655 }
2656 2656
2657 static void __pool_dec(struct pool *pool) 2657 static void __pool_dec(struct pool *pool)
2658 { 2658 {
2659 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex)); 2659 BUG_ON(!mutex_is_locked(&dm_thin_pool_table.mutex));
2660 BUG_ON(!pool->ref_count); 2660 BUG_ON(!pool->ref_count);
2661 if (!--pool->ref_count) 2661 if (!--pool->ref_count)
2662 __pool_destroy(pool); 2662 __pool_destroy(pool);
2663 } 2663 }
2664 2664
2665 static struct pool *__pool_find(struct mapped_device *pool_md, 2665 static struct pool *__pool_find(struct mapped_device *pool_md,
2666 struct block_device *metadata_dev, 2666 struct block_device *metadata_dev,
2667 unsigned long block_size, int read_only, 2667 unsigned long block_size, int read_only,
2668 char **error, int *created) 2668 char **error, int *created)
2669 { 2669 {
2670 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 2670 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
2671 2671
2672 if (pool) { 2672 if (pool) {
2673 if (pool->pool_md != pool_md) { 2673 if (pool->pool_md != pool_md) {
2674 *error = "metadata device already in use by a pool"; 2674 *error = "metadata device already in use by a pool";
2675 return ERR_PTR(-EBUSY); 2675 return ERR_PTR(-EBUSY);
2676 } 2676 }
2677 __pool_inc(pool); 2677 __pool_inc(pool);
2678 2678
2679 } else { 2679 } else {
2680 pool = __pool_table_lookup(pool_md); 2680 pool = __pool_table_lookup(pool_md);
2681 if (pool) { 2681 if (pool) {
2682 if (pool->md_dev != metadata_dev) { 2682 if (pool->md_dev != metadata_dev) {
2683 *error = "different pool cannot replace a pool"; 2683 *error = "different pool cannot replace a pool";
2684 return ERR_PTR(-EINVAL); 2684 return ERR_PTR(-EINVAL);
2685 } 2685 }
2686 __pool_inc(pool); 2686 __pool_inc(pool);
2687 2687
2688 } else { 2688 } else {
2689 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error); 2689 pool = pool_create(pool_md, metadata_dev, block_size, read_only, error);
2690 *created = 1; 2690 *created = 1;
2691 } 2691 }
2692 } 2692 }
2693 2693
2694 return pool; 2694 return pool;
2695 } 2695 }
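__pool_find() enforces the binding rules with two lookups over the pool table: a metadata device already bound to a different pool_md is rejected with -EBUSY, an existing pool_md cannot be re-bound to a different metadata device, an existing match is reused with its reference count bumped, and only a genuinely new combination falls through to pool_create(). The userspace sketch below mirrors that structure with strings in place of device pointers; find_pool(), the table array and the sample names are all invented.

#include <stdio.h>
#include <string.h>

struct pool { const char *pool_md; const char *md_dev; int ref_count; };

static struct pool *find_pool(struct pool *table, int n,
                              const char *pool_md, const char *md_dev,
                              const char **error)
{
        for (int i = 0; i < n; i++) {           /* lookup by metadata device */
                if (!strcmp(table[i].md_dev, md_dev)) {
                        if (strcmp(table[i].pool_md, pool_md)) {
                                *error = "metadata device already in use by a pool";
                                return NULL;
                        }
                        table[i].ref_count++;
                        return &table[i];
                }
        }
        for (int i = 0; i < n; i++) {           /* lookup by pool device */
                if (!strcmp(table[i].pool_md, pool_md)) {
                        if (strcmp(table[i].md_dev, md_dev)) {
                                *error = "different pool cannot replace a pool";
                                return NULL;
                        }
                        table[i].ref_count++;
                        return &table[i];
                }
        }
        *error = "not found: a new pool would be created here";
        return NULL;
}

int main(void)
{
        struct pool table[] = { { "dm-0", "sdb1", 1 } };
        const char *err = NULL;

        if (!find_pool(table, 1, "dm-1", "sdb1", &err))
                printf("reject: %s\n", err);    /* same metadata dev, different pool */
        if (find_pool(table, 1, "dm-0", "sdb1", &err))
                printf("reuse: ref_count=%d\n", table[0].ref_count);
        return 0;
}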
2696 2696
2697 /*---------------------------------------------------------------- 2697 /*----------------------------------------------------------------
2698 * Pool target methods 2698 * Pool target methods
2699 *--------------------------------------------------------------*/ 2699 *--------------------------------------------------------------*/
2700 static void pool_dtr(struct dm_target *ti) 2700 static void pool_dtr(struct dm_target *ti)
2701 { 2701 {
2702 struct pool_c *pt = ti->private; 2702 struct pool_c *pt = ti->private;
2703 2703
2704 mutex_lock(&dm_thin_pool_table.mutex); 2704 mutex_lock(&dm_thin_pool_table.mutex);
2705 2705
2706 unbind_control_target(pt->pool, ti); 2706 unbind_control_target(pt->pool, ti);
2707 __pool_dec(pt->pool); 2707 __pool_dec(pt->pool);
2708 dm_put_device(ti, pt->metadata_dev); 2708 dm_put_device(ti, pt->metadata_dev);
2709 dm_put_device(ti, pt->data_dev); 2709 dm_put_device(ti, pt->data_dev);
2710 kfree(pt); 2710 kfree(pt);
2711 2711
2712 mutex_unlock(&dm_thin_pool_table.mutex); 2712 mutex_unlock(&dm_thin_pool_table.mutex);
2713 } 2713 }
2714 2714
2715 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 2715 static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
2716 struct dm_target *ti) 2716 struct dm_target *ti)
2717 { 2717 {
2718 int r; 2718 int r;
2719 unsigned argc; 2719 unsigned argc;
2720 const char *arg_name; 2720 const char *arg_name;
2721 2721
2722 static struct dm_arg _args[] = { 2722 static struct dm_arg _args[] = {
2723 {0, 4, "Invalid number of pool feature arguments"}, 2723 {0, 4, "Invalid number of pool feature arguments"},
2724 }; 2724 };
2725 2725
2726 /* 2726 /*
2727 * No feature arguments supplied. 2727 * No feature arguments supplied.
2728 */ 2728 */
2729 if (!as->argc) 2729 if (!as->argc)
2730 return 0; 2730 return 0;
2731 2731
2732 r = dm_read_arg_group(_args, as, &argc, &ti->error); 2732 r = dm_read_arg_group(_args, as, &argc, &ti->error);
2733 if (r) 2733 if (r)
2734 return -EINVAL; 2734 return -EINVAL;
2735 2735
2736 while (argc && !r) { 2736 while (argc && !r) {
2737 arg_name = dm_shift_arg(as); 2737 arg_name = dm_shift_arg(as);
2738 argc--; 2738 argc--;
2739 2739
2740 if (!strcasecmp(arg_name, "skip_block_zeroing")) 2740 if (!strcasecmp(arg_name, "skip_block_zeroing"))
2741 pf->zero_new_blocks = false; 2741 pf->zero_new_blocks = false;
2742 2742
2743 else if (!strcasecmp(arg_name, "ignore_discard")) 2743 else if (!strcasecmp(arg_name, "ignore_discard"))
2744 pf->discard_enabled = false; 2744 pf->discard_enabled = false;
2745 2745
2746 else if (!strcasecmp(arg_name, "no_discard_passdown")) 2746 else if (!strcasecmp(arg_name, "no_discard_passdown"))
2747 pf->discard_passdown = false; 2747 pf->discard_passdown = false;
2748 2748
2749 else if (!strcasecmp(arg_name, "read_only")) 2749 else if (!strcasecmp(arg_name, "read_only"))
2750 pf->mode = PM_READ_ONLY; 2750 pf->mode = PM_READ_ONLY;
2751 2751
2752 else if (!strcasecmp(arg_name, "error_if_no_space")) 2752 else if (!strcasecmp(arg_name, "error_if_no_space"))
2753 pf->error_if_no_space = true; 2753 pf->error_if_no_space = true;
2754 2754
2755 else { 2755 else {
2756 ti->error = "Unrecognised pool feature requested"; 2756 ti->error = "Unrecognised pool feature requested";
2757 r = -EINVAL; 2757 r = -EINVAL;
2758 break; 2758 break;
2759 } 2759 }
2760 } 2760 }
2761 2761
2762 return r; 2762 return r;
2763 } 2763 }
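parse_pool_features() is a straightforward keyword parser: start from the defaults set up by pool_features_init() and flip one flag per recognised argument, rejecting anything else so a typo fails the table load instead of being silently ignored. A self-contained userspace version of the same idea is shown below; parse_features() and its struct layout are illustrative, not the kernel's.

#include <stdio.h>
#include <strings.h>    /* strcasecmp */

struct pool_features {
        int zero_new_blocks, discard_enabled, discard_passdown;
        int read_only, error_if_no_space;
};

static int parse_features(int argc, char **argv, struct pool_features *pf)
{
        /* defaults, as in pool_features_init() */
        pf->zero_new_blocks = 1;
        pf->discard_enabled = 1;
        pf->discard_passdown = 1;
        pf->read_only = 0;
        pf->error_if_no_space = 0;

        for (int i = 0; i < argc; i++) {
                if (!strcasecmp(argv[i], "skip_block_zeroing"))
                        pf->zero_new_blocks = 0;
                else if (!strcasecmp(argv[i], "ignore_discard"))
                        pf->discard_enabled = 0;
                else if (!strcasecmp(argv[i], "no_discard_passdown"))
                        pf->discard_passdown = 0;
                else if (!strcasecmp(argv[i], "read_only"))
                        pf->read_only = 1;
                else if (!strcasecmp(argv[i], "error_if_no_space"))
                        pf->error_if_no_space = 1;
                else {
                        fprintf(stderr, "unrecognised pool feature: %s\n", argv[i]);
                        return -1;
                }
        }
        return 0;
}

int main(int argc, char **argv)
{
        struct pool_features pf;

        if (parse_features(argc - 1, argv + 1, &pf))
                return 1;
        printf("zeroing=%d discard=%d passdown=%d read_only=%d error_if_no_space=%d\n",
               pf.zero_new_blocks, pf.discard_enabled, pf.discard_passdown,
               pf.read_only, pf.error_if_no_space);
        return 0;
}

Running the compiled sketch with, say, the arguments read_only and error_if_no_space would print the resulting flags, while an unknown keyword makes it exit non-zero, mirroring the -EINVAL path above.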
2764 2764
2765 static void metadata_low_callback(void *context) 2765 static void metadata_low_callback(void *context)
2766 { 2766 {
2767 struct pool *pool = context; 2767 struct pool *pool = context;
2768 2768
2769 DMWARN("%s: reached low water mark for metadata device: sending event.", 2769 DMWARN("%s: reached low water mark for metadata device: sending event.",
2770 dm_device_name(pool->pool_md)); 2770 dm_device_name(pool->pool_md));
2771 2771
2772 dm_table_event(pool->ti->table); 2772 dm_table_event(pool->ti->table);
2773 } 2773 }
2774 2774
2775 static sector_t get_dev_size(struct block_device *bdev) 2775 static sector_t get_dev_size(struct block_device *bdev)
2776 { 2776 {
2777 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 2777 return i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
2778 } 2778 }
2779 2779
2780 static void warn_if_metadata_device_too_big(struct block_device *bdev) 2780 static void warn_if_metadata_device_too_big(struct block_device *bdev)
2781 { 2781 {
2782 sector_t metadata_dev_size = get_dev_size(bdev); 2782 sector_t metadata_dev_size = get_dev_size(bdev);
2783 char buffer[BDEVNAME_SIZE]; 2783 char buffer[BDEVNAME_SIZE];
2784 2784
2785 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) 2785 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
2786 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", 2786 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
2787 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS); 2787 bdevname(bdev, buffer), THIN_METADATA_MAX_SECTORS);
2788 } 2788 }
2789 2789
2790 static sector_t get_metadata_dev_size(struct block_device *bdev) 2790 static sector_t get_metadata_dev_size(struct block_device *bdev)
2791 { 2791 {
2792 sector_t metadata_dev_size = get_dev_size(bdev); 2792 sector_t metadata_dev_size = get_dev_size(bdev);
2793 2793
2794 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS) 2794 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS)
2795 metadata_dev_size = THIN_METADATA_MAX_SECTORS; 2795 metadata_dev_size = THIN_METADATA_MAX_SECTORS;
2796 2796
2797 return metadata_dev_size; 2797 return metadata_dev_size;
2798 } 2798 }
2799 2799
2800 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev) 2800 static dm_block_t get_metadata_dev_size_in_blocks(struct block_device *bdev)
2801 { 2801 {
2802 sector_t metadata_dev_size = get_metadata_dev_size(bdev); 2802 sector_t metadata_dev_size = get_metadata_dev_size(bdev);
2803 2803
2804 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE); 2804 sector_div(metadata_dev_size, THIN_METADATA_BLOCK_SIZE);
2805 2805
2806 return metadata_dev_size; 2806 return metadata_dev_size;
2807 } 2807 }
2808 2808
2809 /* 2809 /*
2810 * When a metadata threshold is crossed a dm event is triggered, and 2810 * When a metadata threshold is crossed a dm event is triggered, and
2811 * userland should respond by growing the metadata device. We could let 2811 * userland should respond by growing the metadata device. We could let
2812 * userland set the threshold, like we do with the data threshold, but I'm 2812 * userland set the threshold, like we do with the data threshold, but I'm
2813 * not sure they know enough to do this well. 2813 * not sure they know enough to do this well.
2814 */ 2814 */
2815 static dm_block_t calc_metadata_threshold(struct pool_c *pt) 2815 static dm_block_t calc_metadata_threshold(struct pool_c *pt)
2816 { 2816 {
2817 /* 2817 /*
2818 * 4M is ample for all ops with the possible exception of thin 2818 * 4M is ample for all ops with the possible exception of thin
2819 * device deletion which is harmless if it fails (just retry the 2819 * device deletion which is harmless if it fails (just retry the
2820 * delete after you've grown the device). 2820 * delete after you've grown the device).
2821 */ 2821 */
2822 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4; 2822 dm_block_t quarter = get_metadata_dev_size_in_blocks(pt->metadata_dev->bdev) / 4;
2823 return min((dm_block_t)1024ULL /* 4M */, quarter); 2823 return min((dm_block_t)1024ULL /* 4M */, quarter);
2824 } 2824 }
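calc_metadata_threshold() asks for an event when roughly a quarter of the metadata device remains, but never more than 1024 metadata blocks (4MiB) ahead of exhaustion. The arithmetic below works that out for two device sizes; it assumes the usual 4KiB (8-sector) thin-pool metadata block size, which is defined outside this file.

#include <stdint.h>
#include <stdio.h>

#define METADATA_BLOCK_SECTORS 8ULL             /* assumed THIN_METADATA_BLOCK_SIZE */

static uint64_t threshold_blocks(uint64_t metadata_dev_sectors)
{
        uint64_t blocks = metadata_dev_sectors / METADATA_BLOCK_SECTORS;
        uint64_t quarter = blocks / 4;

        return quarter < 1024 ? quarter : 1024; /* min(4MiB worth of blocks, quarter) */
}

int main(void)
{
        /* 8MiB metadata device: 2048 blocks, so the threshold is 512 blocks (2MiB). */
        printf("8MiB dev -> %llu blocks\n",
               (unsigned long long)threshold_blocks(8ULL * 1024 * 1024 / 512));
        /* 1GiB metadata device: the quarter is huge, so the 1024-block cap wins. */
        printf("1GiB dev -> %llu blocks\n",
               (unsigned long long)threshold_blocks(1024ULL * 1024 * 1024 / 512));
        return 0;
}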
2825 2825
2826 /* 2826 /*
2827 * thin-pool <metadata dev> <data dev> 2827 * thin-pool <metadata dev> <data dev>
2828 * <data block size (sectors)> 2828 * <data block size (sectors)>
2829 * <low water mark (blocks)> 2829 * <low water mark (blocks)>
2830 * [<#feature args> [<arg>]*] 2830 * [<#feature args> [<arg>]*]
2831 * 2831 *
2832 * Optional feature arguments are: 2832 * Optional feature arguments are:
2833 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 2833 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
2834 * ignore_discard: disable discard 2834 * ignore_discard: disable discard
2835 * no_discard_passdown: don't pass discards down to the data device 2835 * no_discard_passdown: don't pass discards down to the data device
2836 * read_only: Don't allow any changes to be made to the pool metadata. 2836 * read_only: Don't allow any changes to be made to the pool metadata.
2837 * error_if_no_space: error IOs, instead of queueing, if no space. 2837 * error_if_no_space: error IOs, instead of queueing, if no space.
2838 */ 2838 */
2839 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 2839 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
2840 { 2840 {
2841 int r, pool_created = 0; 2841 int r, pool_created = 0;
2842 struct pool_c *pt; 2842 struct pool_c *pt;
2843 struct pool *pool; 2843 struct pool *pool;
2844 struct pool_features pf; 2844 struct pool_features pf;
2845 struct dm_arg_set as; 2845 struct dm_arg_set as;
2846 struct dm_dev *data_dev; 2846 struct dm_dev *data_dev;
2847 unsigned long block_size; 2847 unsigned long block_size;
2848 dm_block_t low_water_blocks; 2848 dm_block_t low_water_blocks;
2849 struct dm_dev *metadata_dev; 2849 struct dm_dev *metadata_dev;
2850 fmode_t metadata_mode; 2850 fmode_t metadata_mode;
2851 2851
2852 /* 2852 /*
2853 * FIXME Remove validation from scope of lock. 2853 * FIXME Remove validation from scope of lock.
2854 */ 2854 */
2855 mutex_lock(&dm_thin_pool_table.mutex); 2855 mutex_lock(&dm_thin_pool_table.mutex);
2856 2856
2857 if (argc < 4) { 2857 if (argc < 4) {
2858 ti->error = "Invalid argument count"; 2858 ti->error = "Invalid argument count";
2859 r = -EINVAL; 2859 r = -EINVAL;
2860 goto out_unlock; 2860 goto out_unlock;
2861 } 2861 }
2862 2862
2863 as.argc = argc; 2863 as.argc = argc;
2864 as.argv = argv; 2864 as.argv = argv;
2865 2865
2866 /* 2866 /*
2867 * Set default pool features. 2867 * Set default pool features.
2868 */ 2868 */
2869 pool_features_init(&pf); 2869 pool_features_init(&pf);
2870 2870
2871 dm_consume_args(&as, 4); 2871 dm_consume_args(&as, 4);
2872 r = parse_pool_features(&as, &pf, ti); 2872 r = parse_pool_features(&as, &pf, ti);
2873 if (r) 2873 if (r)
2874 goto out_unlock; 2874 goto out_unlock;
2875 2875
2876 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE); 2876 metadata_mode = FMODE_READ | ((pf.mode == PM_READ_ONLY) ? 0 : FMODE_WRITE);
2877 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev); 2877 r = dm_get_device(ti, argv[0], metadata_mode, &metadata_dev);
2878 if (r) { 2878 if (r) {
2879 ti->error = "Error opening metadata block device"; 2879 ti->error = "Error opening metadata block device";
2880 goto out_unlock; 2880 goto out_unlock;
2881 } 2881 }
2882 warn_if_metadata_device_too_big(metadata_dev->bdev); 2882 warn_if_metadata_device_too_big(metadata_dev->bdev);
2883 2883
2884 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 2884 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
2885 if (r) { 2885 if (r) {
2886 ti->error = "Error getting data device"; 2886 ti->error = "Error getting data device";
2887 goto out_metadata; 2887 goto out_metadata;
2888 } 2888 }
2889 2889
2890 if (kstrtoul(argv[2], 10, &block_size) || !block_size || 2890 if (kstrtoul(argv[2], 10, &block_size) || !block_size ||
2891 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS || 2891 block_size < DATA_DEV_BLOCK_SIZE_MIN_SECTORS ||
2892 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS || 2892 block_size > DATA_DEV_BLOCK_SIZE_MAX_SECTORS ||
2893 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) { 2893 block_size & (DATA_DEV_BLOCK_SIZE_MIN_SECTORS - 1)) {
2894 ti->error = "Invalid block size"; 2894 ti->error = "Invalid block size";
2895 r = -EINVAL; 2895 r = -EINVAL;
2896 goto out; 2896 goto out;
2897 } 2897 }
2898 2898
2899 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) { 2899 if (kstrtoull(argv[3], 10, (unsigned long long *)&low_water_blocks)) {
2900 ti->error = "Invalid low water mark"; 2900 ti->error = "Invalid low water mark";
2901 r = -EINVAL; 2901 r = -EINVAL;
2902 goto out; 2902 goto out;
2903 } 2903 }
2904 2904
2905 pt = kzalloc(sizeof(*pt), GFP_KERNEL); 2905 pt = kzalloc(sizeof(*pt), GFP_KERNEL);
2906 if (!pt) { 2906 if (!pt) {
2907 r = -ENOMEM; 2907 r = -ENOMEM;
2908 goto out; 2908 goto out;
2909 } 2909 }
2910 2910
2911 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 2911 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
2912 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created); 2912 block_size, pf.mode == PM_READ_ONLY, &ti->error, &pool_created);
2913 if (IS_ERR(pool)) { 2913 if (IS_ERR(pool)) {
2914 r = PTR_ERR(pool); 2914 r = PTR_ERR(pool);
2915 goto out_free_pt; 2915 goto out_free_pt;
2916 } 2916 }
2917 2917
2918 /* 2918 /*
2919 * 'pool_created' reflects whether this is the first table load. 2919 * 'pool_created' reflects whether this is the first table load.
2920 * Top level discard support is not allowed to be changed after 2920 * Top level discard support is not allowed to be changed after
2921 * initial load. This would require a pool reload to trigger thin 2921 * initial load. This would require a pool reload to trigger thin
2922 * device changes. 2922 * device changes.
2923 */ 2923 */
2924 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { 2924 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
2925 ti->error = "Discard support cannot be disabled once enabled"; 2925 ti->error = "Discard support cannot be disabled once enabled";
2926 r = -EINVAL; 2926 r = -EINVAL;
2927 goto out_flags_changed; 2927 goto out_flags_changed;
2928 } 2928 }
2929 2929
2930 pt->pool = pool; 2930 pt->pool = pool;
2931 pt->ti = ti; 2931 pt->ti = ti;
2932 pt->metadata_dev = metadata_dev; 2932 pt->metadata_dev = metadata_dev;
2933 pt->data_dev = data_dev; 2933 pt->data_dev = data_dev;
2934 pt->low_water_blocks = low_water_blocks; 2934 pt->low_water_blocks = low_water_blocks;
2935 pt->adjusted_pf = pt->requested_pf = pf; 2935 pt->adjusted_pf = pt->requested_pf = pf;
2936 ti->num_flush_bios = 1; 2936 ti->num_flush_bios = 1;
2937 2937
2938 /* 2938 /*
2939 * Only need to enable discards if the pool should pass 2939 * Only need to enable discards if the pool should pass
2940 * them down to the data device. The thin device's discard 2940 * them down to the data device. The thin device's discard
2941 * processing will cause mappings to be removed from the btree. 2941 * processing will cause mappings to be removed from the btree.
2942 */ 2942 */
2943 ti->discard_zeroes_data_unsupported = true; 2943 ti->discard_zeroes_data_unsupported = true;
2944 if (pf.discard_enabled && pf.discard_passdown) { 2944 if (pf.discard_enabled && pf.discard_passdown) {
2945 ti->num_discard_bios = 1; 2945 ti->num_discard_bios = 1;
2946 2946
2947 /* 2947 /*
2948 * Setting 'discards_supported' circumvents the normal 2948 * Setting 'discards_supported' circumvents the normal
2949 * stacking of discard limits (this keeps the pool and 2949 * stacking of discard limits (this keeps the pool and
2950 * thin devices' discard limits consistent). 2950 * thin devices' discard limits consistent).
2951 */ 2951 */
2952 ti->discards_supported = true; 2952 ti->discards_supported = true;
2953 } 2953 }
2954 ti->private = pt; 2954 ti->private = pt;
2955 2955
2956 r = dm_pool_register_metadata_threshold(pt->pool->pmd, 2956 r = dm_pool_register_metadata_threshold(pt->pool->pmd,
2957 calc_metadata_threshold(pt), 2957 calc_metadata_threshold(pt),
2958 metadata_low_callback, 2958 metadata_low_callback,
2959 pool); 2959 pool);
2960 if (r) 2960 if (r)
2961 goto out_free_pt; 2961 goto out_free_pt;
2962 2962
2963 pt->callbacks.congested_fn = pool_is_congested; 2963 pt->callbacks.congested_fn = pool_is_congested;
2964 dm_table_add_target_callbacks(ti->table, &pt->callbacks); 2964 dm_table_add_target_callbacks(ti->table, &pt->callbacks);
2965 2965
2966 mutex_unlock(&dm_thin_pool_table.mutex); 2966 mutex_unlock(&dm_thin_pool_table.mutex);
2967 2967
2968 return 0; 2968 return 0;
2969 2969
2970 out_flags_changed: 2970 out_flags_changed:
2971 __pool_dec(pool); 2971 __pool_dec(pool);
2972 out_free_pt: 2972 out_free_pt:
2973 kfree(pt); 2973 kfree(pt);
2974 out: 2974 out:
2975 dm_put_device(ti, data_dev); 2975 dm_put_device(ti, data_dev);
2976 out_metadata: 2976 out_metadata:
2977 dm_put_device(ti, metadata_dev); 2977 dm_put_device(ti, metadata_dev);
2978 out_unlock: 2978 out_unlock:
2979 mutex_unlock(&dm_thin_pool_table.mutex); 2979 mutex_unlock(&dm_thin_pool_table.mutex);
2980 2980
2981 return r; 2981 return r;
2982 } 2982 }
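
The tail of pool_ctr() above validates its third argument, the data block size, in 512-byte sectors: it must be non-zero, lie between 128 sectors (64 KiB) and 1 GiB, and be a multiple of 128 sectors, which is what the final bitmask test enforces. A minimal stand-alone sketch of that same check, with a couple of illustrative values (the helper name is mine, not part of the driver):

    #include <stdbool.h>
    #include <stdio.h>

    #define MIN_SECTORS 128UL                          /* 64 KiB in 512-byte sectors */
    #define MAX_SECTORS (1024UL * 1024 * 1024 / 512)   /* 1 GiB */

    /* Mirrors the pool_ctr() test: range check plus 64 KiB alignment. */
    static bool pool_block_size_ok(unsigned long bs)
    {
            return bs && bs >= MIN_SECTORS && bs <= MAX_SECTORS &&
                   !(bs & (MIN_SECTORS - 1));
    }

    int main(void)
    {
            printf("%d\n", pool_block_size_ok(1024)); /* 512 KiB block -> 1 (accepted) */
            printf("%d\n", pool_block_size_ok(192));  /* 96 KiB -> 0 (not a multiple of 64 KiB) */
            return 0;
    }
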
2983 2983
2984 static int pool_map(struct dm_target *ti, struct bio *bio) 2984 static int pool_map(struct dm_target *ti, struct bio *bio)
2985 { 2985 {
2986 int r; 2986 int r;
2987 struct pool_c *pt = ti->private; 2987 struct pool_c *pt = ti->private;
2988 struct pool *pool = pt->pool; 2988 struct pool *pool = pt->pool;
2989 unsigned long flags; 2989 unsigned long flags;
2990 2990
2991 /* 2991 /*
2992 * As this is a singleton target, ti->begin is always zero. 2992 * As this is a singleton target, ti->begin is always zero.
2993 */ 2993 */
2994 spin_lock_irqsave(&pool->lock, flags); 2994 spin_lock_irqsave(&pool->lock, flags);
2995 bio->bi_bdev = pt->data_dev->bdev; 2995 bio->bi_bdev = pt->data_dev->bdev;
2996 r = DM_MAPIO_REMAPPED; 2996 r = DM_MAPIO_REMAPPED;
2997 spin_unlock_irqrestore(&pool->lock, flags); 2997 spin_unlock_irqrestore(&pool->lock, flags);
2998 2998
2999 return r; 2999 return r;
3000 } 3000 }
3001 3001
3002 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit) 3002 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
3003 { 3003 {
3004 int r; 3004 int r;
3005 struct pool_c *pt = ti->private; 3005 struct pool_c *pt = ti->private;
3006 struct pool *pool = pt->pool; 3006 struct pool *pool = pt->pool;
3007 sector_t data_size = ti->len; 3007 sector_t data_size = ti->len;
3008 dm_block_t sb_data_size; 3008 dm_block_t sb_data_size;
3009 3009
3010 *need_commit = false; 3010 *need_commit = false;
3011 3011
3012 (void) sector_div(data_size, pool->sectors_per_block); 3012 (void) sector_div(data_size, pool->sectors_per_block);
3013 3013
3014 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size); 3014 r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
3015 if (r) { 3015 if (r) {
3016 DMERR("%s: failed to retrieve data device size", 3016 DMERR("%s: failed to retrieve data device size",
3017 dm_device_name(pool->pool_md)); 3017 dm_device_name(pool->pool_md));
3018 return r; 3018 return r;
3019 } 3019 }
3020 3020
3021 if (data_size < sb_data_size) { 3021 if (data_size < sb_data_size) {
3022 DMERR("%s: pool target (%llu blocks) too small: expected %llu", 3022 DMERR("%s: pool target (%llu blocks) too small: expected %llu",
3023 dm_device_name(pool->pool_md), 3023 dm_device_name(pool->pool_md),
3024 (unsigned long long)data_size, sb_data_size); 3024 (unsigned long long)data_size, sb_data_size);
3025 return -EINVAL; 3025 return -EINVAL;
3026 3026
3027 } else if (data_size > sb_data_size) { 3027 } else if (data_size > sb_data_size) {
3028 if (dm_pool_metadata_needs_check(pool->pmd)) { 3028 if (dm_pool_metadata_needs_check(pool->pmd)) {
3029 DMERR("%s: unable to grow the data device until repaired.", 3029 DMERR("%s: unable to grow the data device until repaired.",
3030 dm_device_name(pool->pool_md)); 3030 dm_device_name(pool->pool_md));
3031 return 0; 3031 return 0;
3032 } 3032 }
3033 3033
3034 if (sb_data_size) 3034 if (sb_data_size)
3035 DMINFO("%s: growing the data device from %llu to %llu blocks", 3035 DMINFO("%s: growing the data device from %llu to %llu blocks",
3036 dm_device_name(pool->pool_md), 3036 dm_device_name(pool->pool_md),
3037 sb_data_size, (unsigned long long)data_size); 3037 sb_data_size, (unsigned long long)data_size);
3038 r = dm_pool_resize_data_dev(pool->pmd, data_size); 3038 r = dm_pool_resize_data_dev(pool->pmd, data_size);
3039 if (r) { 3039 if (r) {
3040 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r); 3040 metadata_operation_failed(pool, "dm_pool_resize_data_dev", r);
3041 return r; 3041 return r;
3042 } 3042 }
3043 3043
3044 *need_commit = true; 3044 *need_commit = true;
3045 } 3045 }
3046 3046
3047 return 0; 3047 return 0;
3048 } 3048 }
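
maybe_resize_data_dev() converts the table length to pool blocks and compares it with the block count recorded in the metadata superblock: shrinking is refused with -EINVAL, growing calls dm_pool_resize_data_dev() and asks the caller to commit. A worked example, written as a comment, using invented numbers:

    /*
     * Illustrative numbers only (not taken from the patch):
     *
     *   ti->len           = 20971520 sectors   (10 GiB)
     *   sectors_per_block = 128                (64 KiB blocks)
     *   data_size         = 20971520 / 128 = 163840 blocks
     *
     *   sb_data_size == 131072  ->  grow to 163840 blocks, *need_commit = true
     *   sb_data_size == 163840  ->  nothing to do, *need_commit stays false
     *   sb_data_size == 262144  ->  -EINVAL, the new table is smaller than the pool
     */

maybe_resize_metadata_dev() below follows the same pattern for the metadata area.
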
3049 3049
3050 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit) 3050 static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
3051 { 3051 {
3052 int r; 3052 int r;
3053 struct pool_c *pt = ti->private; 3053 struct pool_c *pt = ti->private;
3054 struct pool *pool = pt->pool; 3054 struct pool *pool = pt->pool;
3055 dm_block_t metadata_dev_size, sb_metadata_dev_size; 3055 dm_block_t metadata_dev_size, sb_metadata_dev_size;
3056 3056
3057 *need_commit = false; 3057 *need_commit = false;
3058 3058
3059 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev); 3059 metadata_dev_size = get_metadata_dev_size_in_blocks(pool->md_dev);
3060 3060
3061 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size); 3061 r = dm_pool_get_metadata_dev_size(pool->pmd, &sb_metadata_dev_size);
3062 if (r) { 3062 if (r) {
3063 DMERR("%s: failed to retrieve metadata device size", 3063 DMERR("%s: failed to retrieve metadata device size",
3064 dm_device_name(pool->pool_md)); 3064 dm_device_name(pool->pool_md));
3065 return r; 3065 return r;
3066 } 3066 }
3067 3067
3068 if (metadata_dev_size < sb_metadata_dev_size) { 3068 if (metadata_dev_size < sb_metadata_dev_size) {
3069 DMERR("%s: metadata device (%llu blocks) too small: expected %llu", 3069 DMERR("%s: metadata device (%llu blocks) too small: expected %llu",
3070 dm_device_name(pool->pool_md), 3070 dm_device_name(pool->pool_md),
3071 metadata_dev_size, sb_metadata_dev_size); 3071 metadata_dev_size, sb_metadata_dev_size);
3072 return -EINVAL; 3072 return -EINVAL;
3073 3073
3074 } else if (metadata_dev_size > sb_metadata_dev_size) { 3074 } else if (metadata_dev_size > sb_metadata_dev_size) {
3075 if (dm_pool_metadata_needs_check(pool->pmd)) { 3075 if (dm_pool_metadata_needs_check(pool->pmd)) {
3076 DMERR("%s: unable to grow the metadata device until repaired.", 3076 DMERR("%s: unable to grow the metadata device until repaired.",
3077 dm_device_name(pool->pool_md)); 3077 dm_device_name(pool->pool_md));
3078 return 0; 3078 return 0;
3079 } 3079 }
3080 3080
3081 warn_if_metadata_device_too_big(pool->md_dev); 3081 warn_if_metadata_device_too_big(pool->md_dev);
3082 DMINFO("%s: growing the metadata device from %llu to %llu blocks", 3082 DMINFO("%s: growing the metadata device from %llu to %llu blocks",
3083 dm_device_name(pool->pool_md), 3083 dm_device_name(pool->pool_md),
3084 sb_metadata_dev_size, metadata_dev_size); 3084 sb_metadata_dev_size, metadata_dev_size);
3085 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size); 3085 r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
3086 if (r) { 3086 if (r) {
3087 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r); 3087 metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
3088 return r; 3088 return r;
3089 } 3089 }
3090 3090
3091 *need_commit = true; 3091 *need_commit = true;
3092 } 3092 }
3093 3093
3094 return 0; 3094 return 0;
3095 } 3095 }
3096 3096
3097 /* 3097 /*
3098 * Retrieves the number of blocks of the data device from 3098 * Retrieves the number of blocks of the data device from
3099 * the superblock and compares it to the actual device size, 3099 * the superblock and compares it to the actual device size,
3100 * thus resizing the data device in case it has grown. 3100 * thus resizing the data device in case it has grown.
3101 * 3101 *
3102 * This both copes with opening preallocated data devices in the ctr 3102 * This both copes with opening preallocated data devices in the ctr
3103 * being followed by a resume 3103 * being followed by a resume
3104 * -and- 3104 * -and-
3105 * calling the resume method individually after userspace has 3105 * calling the resume method individually after userspace has
3106 * grown the data device in reaction to a table event. 3106 * grown the data device in reaction to a table event.
3107 */ 3107 */
3108 static int pool_preresume(struct dm_target *ti) 3108 static int pool_preresume(struct dm_target *ti)
3109 { 3109 {
3110 int r; 3110 int r;
3111 bool need_commit1, need_commit2; 3111 bool need_commit1, need_commit2;
3112 struct pool_c *pt = ti->private; 3112 struct pool_c *pt = ti->private;
3113 struct pool *pool = pt->pool; 3113 struct pool *pool = pt->pool;
3114 3114
3115 /* 3115 /*
3116 * Take control of the pool object. 3116 * Take control of the pool object.
3117 */ 3117 */
3118 r = bind_control_target(pool, ti); 3118 r = bind_control_target(pool, ti);
3119 if (r) 3119 if (r)
3120 return r; 3120 return r;
3121 3121
3122 r = maybe_resize_data_dev(ti, &need_commit1); 3122 r = maybe_resize_data_dev(ti, &need_commit1);
3123 if (r) 3123 if (r)
3124 return r; 3124 return r;
3125 3125
3126 r = maybe_resize_metadata_dev(ti, &need_commit2); 3126 r = maybe_resize_metadata_dev(ti, &need_commit2);
3127 if (r) 3127 if (r)
3128 return r; 3128 return r;
3129 3129
3130 if (need_commit1 || need_commit2) 3130 if (need_commit1 || need_commit2)
3131 (void) commit(pool); 3131 (void) commit(pool);
3132 3132
3133 return 0; 3133 return 0;
3134 } 3134 }
3135 3135
3136 static void pool_suspend_active_thins(struct pool *pool) 3136 static void pool_suspend_active_thins(struct pool *pool)
3137 { 3137 {
3138 struct thin_c *tc; 3138 struct thin_c *tc;
3139 3139
3140 /* Suspend all active thin devices */ 3140 /* Suspend all active thin devices */
3141 tc = get_first_thin(pool); 3141 tc = get_first_thin(pool);
3142 while (tc) { 3142 while (tc) {
3143 dm_internal_suspend_noflush(tc->thin_md); 3143 dm_internal_suspend_noflush(tc->thin_md);
3144 tc = get_next_thin(pool, tc); 3144 tc = get_next_thin(pool, tc);
3145 } 3145 }
3146 } 3146 }
3147 3147
3148 static void pool_resume_active_thins(struct pool *pool) 3148 static void pool_resume_active_thins(struct pool *pool)
3149 { 3149 {
3150 struct thin_c *tc; 3150 struct thin_c *tc;
3151 3151
3152 /* Resume all active thin devices */ 3152 /* Resume all active thin devices */
3153 tc = get_first_thin(pool); 3153 tc = get_first_thin(pool);
3154 while (tc) { 3154 while (tc) {
3155 dm_internal_resume(tc->thin_md); 3155 dm_internal_resume(tc->thin_md);
3156 tc = get_next_thin(pool, tc); 3156 tc = get_next_thin(pool, tc);
3157 } 3157 }
3158 } 3158 }
3159 3159
3160 static void pool_resume(struct dm_target *ti) 3160 static void pool_resume(struct dm_target *ti)
3161 { 3161 {
3162 struct pool_c *pt = ti->private; 3162 struct pool_c *pt = ti->private;
3163 struct pool *pool = pt->pool; 3163 struct pool *pool = pt->pool;
3164 unsigned long flags; 3164 unsigned long flags;
3165 3165
3166 /* 3166 /*
3167 * Must requeue active_thins' bios and then resume 3167 * Must requeue active_thins' bios and then resume
3168 * active_thins _before_ clearing 'suspend' flag. 3168 * active_thins _before_ clearing 'suspend' flag.
3169 */ 3169 */
3170 requeue_bios(pool); 3170 requeue_bios(pool);
3171 pool_resume_active_thins(pool); 3171 pool_resume_active_thins(pool);
3172 3172
3173 spin_lock_irqsave(&pool->lock, flags); 3173 spin_lock_irqsave(&pool->lock, flags);
3174 pool->low_water_triggered = false; 3174 pool->low_water_triggered = false;
3175 pool->suspended = false; 3175 pool->suspended = false;
3176 spin_unlock_irqrestore(&pool->lock, flags); 3176 spin_unlock_irqrestore(&pool->lock, flags);
3177 3177
3178 do_waker(&pool->waker.work); 3178 do_waker(&pool->waker.work);
3179 } 3179 }
3180 3180
3181 static void pool_presuspend(struct dm_target *ti) 3181 static void pool_presuspend(struct dm_target *ti)
3182 { 3182 {
3183 struct pool_c *pt = ti->private; 3183 struct pool_c *pt = ti->private;
3184 struct pool *pool = pt->pool; 3184 struct pool *pool = pt->pool;
3185 unsigned long flags; 3185 unsigned long flags;
3186 3186
3187 spin_lock_irqsave(&pool->lock, flags); 3187 spin_lock_irqsave(&pool->lock, flags);
3188 pool->suspended = true; 3188 pool->suspended = true;
3189 spin_unlock_irqrestore(&pool->lock, flags); 3189 spin_unlock_irqrestore(&pool->lock, flags);
3190 3190
3191 pool_suspend_active_thins(pool); 3191 pool_suspend_active_thins(pool);
3192 } 3192 }
3193 3193
3194 static void pool_presuspend_undo(struct dm_target *ti) 3194 static void pool_presuspend_undo(struct dm_target *ti)
3195 { 3195 {
3196 struct pool_c *pt = ti->private; 3196 struct pool_c *pt = ti->private;
3197 struct pool *pool = pt->pool; 3197 struct pool *pool = pt->pool;
3198 unsigned long flags; 3198 unsigned long flags;
3199 3199
3200 pool_resume_active_thins(pool); 3200 pool_resume_active_thins(pool);
3201 3201
3202 spin_lock_irqsave(&pool->lock, flags); 3202 spin_lock_irqsave(&pool->lock, flags);
3203 pool->suspended = false; 3203 pool->suspended = false;
3204 spin_unlock_irqrestore(&pool->lock, flags); 3204 spin_unlock_irqrestore(&pool->lock, flags);
3205 } 3205 }
3206 3206
3207 static void pool_postsuspend(struct dm_target *ti) 3207 static void pool_postsuspend(struct dm_target *ti)
3208 { 3208 {
3209 struct pool_c *pt = ti->private; 3209 struct pool_c *pt = ti->private;
3210 struct pool *pool = pt->pool; 3210 struct pool *pool = pt->pool;
3211 3211
3212 cancel_delayed_work(&pool->waker); 3212 cancel_delayed_work(&pool->waker);
3213 cancel_delayed_work(&pool->no_space_timeout); 3213 cancel_delayed_work(&pool->no_space_timeout);
3214 flush_workqueue(pool->wq); 3214 flush_workqueue(pool->wq);
3215 (void) commit(pool); 3215 (void) commit(pool);
3216 } 3216 }
3217 3217
3218 static int check_arg_count(unsigned argc, unsigned args_required) 3218 static int check_arg_count(unsigned argc, unsigned args_required)
3219 { 3219 {
3220 if (argc != args_required) { 3220 if (argc != args_required) {
3221 DMWARN("Message received with %u arguments instead of %u.", 3221 DMWARN("Message received with %u arguments instead of %u.",
3222 argc, args_required); 3222 argc, args_required);
3223 return -EINVAL; 3223 return -EINVAL;
3224 } 3224 }
3225 3225
3226 return 0; 3226 return 0;
3227 } 3227 }
3228 3228
3229 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning) 3229 static int read_dev_id(char *arg, dm_thin_id *dev_id, int warning)
3230 { 3230 {
3231 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) && 3231 if (!kstrtoull(arg, 10, (unsigned long long *)dev_id) &&
3232 *dev_id <= MAX_DEV_ID) 3232 *dev_id <= MAX_DEV_ID)
3233 return 0; 3233 return 0;
3234 3234
3235 if (warning) 3235 if (warning)
3236 DMWARN("Message received with invalid device id: %s", arg); 3236 DMWARN("Message received with invalid device id: %s", arg);
3237 3237
3238 return -EINVAL; 3238 return -EINVAL;
3239 } 3239 }
3240 3240
3241 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool) 3241 static int process_create_thin_mesg(unsigned argc, char **argv, struct pool *pool)
3242 { 3242 {
3243 dm_thin_id dev_id; 3243 dm_thin_id dev_id;
3244 int r; 3244 int r;
3245 3245
3246 r = check_arg_count(argc, 2); 3246 r = check_arg_count(argc, 2);
3247 if (r) 3247 if (r)
3248 return r; 3248 return r;
3249 3249
3250 r = read_dev_id(argv[1], &dev_id, 1); 3250 r = read_dev_id(argv[1], &dev_id, 1);
3251 if (r) 3251 if (r)
3252 return r; 3252 return r;
3253 3253
3254 r = dm_pool_create_thin(pool->pmd, dev_id); 3254 r = dm_pool_create_thin(pool->pmd, dev_id);
3255 if (r) { 3255 if (r) {
3256 DMWARN("Creation of new thinly-provisioned device with id %s failed.", 3256 DMWARN("Creation of new thinly-provisioned device with id %s failed.",
3257 argv[1]); 3257 argv[1]);
3258 return r; 3258 return r;
3259 } 3259 }
3260 3260
3261 return 0; 3261 return 0;
3262 } 3262 }
3263 3263
3264 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool) 3264 static int process_create_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3265 { 3265 {
3266 dm_thin_id dev_id; 3266 dm_thin_id dev_id;
3267 dm_thin_id origin_dev_id; 3267 dm_thin_id origin_dev_id;
3268 int r; 3268 int r;
3269 3269
3270 r = check_arg_count(argc, 3); 3270 r = check_arg_count(argc, 3);
3271 if (r) 3271 if (r)
3272 return r; 3272 return r;
3273 3273
3274 r = read_dev_id(argv[1], &dev_id, 1); 3274 r = read_dev_id(argv[1], &dev_id, 1);
3275 if (r) 3275 if (r)
3276 return r; 3276 return r;
3277 3277
3278 r = read_dev_id(argv[2], &origin_dev_id, 1); 3278 r = read_dev_id(argv[2], &origin_dev_id, 1);
3279 if (r) 3279 if (r)
3280 return r; 3280 return r;
3281 3281
3282 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id); 3282 r = dm_pool_create_snap(pool->pmd, dev_id, origin_dev_id);
3283 if (r) { 3283 if (r) {
3284 DMWARN("Creation of new snapshot %s of device %s failed.", 3284 DMWARN("Creation of new snapshot %s of device %s failed.",
3285 argv[1], argv[2]); 3285 argv[1], argv[2]);
3286 return r; 3286 return r;
3287 } 3287 }
3288 3288
3289 return 0; 3289 return 0;
3290 } 3290 }
3291 3291
3292 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool) 3292 static int process_delete_mesg(unsigned argc, char **argv, struct pool *pool)
3293 { 3293 {
3294 dm_thin_id dev_id; 3294 dm_thin_id dev_id;
3295 int r; 3295 int r;
3296 3296
3297 r = check_arg_count(argc, 2); 3297 r = check_arg_count(argc, 2);
3298 if (r) 3298 if (r)
3299 return r; 3299 return r;
3300 3300
3301 r = read_dev_id(argv[1], &dev_id, 1); 3301 r = read_dev_id(argv[1], &dev_id, 1);
3302 if (r) 3302 if (r)
3303 return r; 3303 return r;
3304 3304
3305 r = dm_pool_delete_thin_device(pool->pmd, dev_id); 3305 r = dm_pool_delete_thin_device(pool->pmd, dev_id);
3306 if (r) 3306 if (r)
3307 DMWARN("Deletion of thin device %s failed.", argv[1]); 3307 DMWARN("Deletion of thin device %s failed.", argv[1]);
3308 3308
3309 return r; 3309 return r;
3310 } 3310 }
3311 3311
3312 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool) 3312 static int process_set_transaction_id_mesg(unsigned argc, char **argv, struct pool *pool)
3313 { 3313 {
3314 dm_thin_id old_id, new_id; 3314 dm_thin_id old_id, new_id;
3315 int r; 3315 int r;
3316 3316
3317 r = check_arg_count(argc, 3); 3317 r = check_arg_count(argc, 3);
3318 if (r) 3318 if (r)
3319 return r; 3319 return r;
3320 3320
3321 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) { 3321 if (kstrtoull(argv[1], 10, (unsigned long long *)&old_id)) {
3322 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]); 3322 DMWARN("set_transaction_id message: Unrecognised id %s.", argv[1]);
3323 return -EINVAL; 3323 return -EINVAL;
3324 } 3324 }
3325 3325
3326 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) { 3326 if (kstrtoull(argv[2], 10, (unsigned long long *)&new_id)) {
3327 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]); 3327 DMWARN("set_transaction_id message: Unrecognised new id %s.", argv[2]);
3328 return -EINVAL; 3328 return -EINVAL;
3329 } 3329 }
3330 3330
3331 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id); 3331 r = dm_pool_set_metadata_transaction_id(pool->pmd, old_id, new_id);
3332 if (r) { 3332 if (r) {
3333 DMWARN("Failed to change transaction id from %s to %s.", 3333 DMWARN("Failed to change transaction id from %s to %s.",
3334 argv[1], argv[2]); 3334 argv[1], argv[2]);
3335 return r; 3335 return r;
3336 } 3336 }
3337 3337
3338 return 0; 3338 return 0;
3339 } 3339 }
3340 3340
3341 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 3341 static int process_reserve_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3342 { 3342 {
3343 int r; 3343 int r;
3344 3344
3345 r = check_arg_count(argc, 1); 3345 r = check_arg_count(argc, 1);
3346 if (r) 3346 if (r)
3347 return r; 3347 return r;
3348 3348
3349 (void) commit(pool); 3349 (void) commit(pool);
3350 3350
3351 r = dm_pool_reserve_metadata_snap(pool->pmd); 3351 r = dm_pool_reserve_metadata_snap(pool->pmd);
3352 if (r) 3352 if (r)
3353 DMWARN("reserve_metadata_snap message failed."); 3353 DMWARN("reserve_metadata_snap message failed.");
3354 3354
3355 return r; 3355 return r;
3356 } 3356 }
3357 3357
3358 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool) 3358 static int process_release_metadata_snap_mesg(unsigned argc, char **argv, struct pool *pool)
3359 { 3359 {
3360 int r; 3360 int r;
3361 3361
3362 r = check_arg_count(argc, 1); 3362 r = check_arg_count(argc, 1);
3363 if (r) 3363 if (r)
3364 return r; 3364 return r;
3365 3365
3366 r = dm_pool_release_metadata_snap(pool->pmd); 3366 r = dm_pool_release_metadata_snap(pool->pmd);
3367 if (r) 3367 if (r)
3368 DMWARN("release_metadata_snap message failed."); 3368 DMWARN("release_metadata_snap message failed.");
3369 3369
3370 return r; 3370 return r;
3371 } 3371 }
3372 3372
3373 /* 3373 /*
3374 * Messages supported: 3374 * Messages supported:
3375 * create_thin <dev_id> 3375 * create_thin <dev_id>
3376 * create_snap <dev_id> <origin_id> 3376 * create_snap <dev_id> <origin_id>
3377 * delete <dev_id> 3377 * delete <dev_id>
3378 * set_transaction_id <current_trans_id> <new_trans_id> 3378 * set_transaction_id <current_trans_id> <new_trans_id>
3379 * reserve_metadata_snap 3379 * reserve_metadata_snap
3380 * release_metadata_snap 3380 * release_metadata_snap
3381 */ 3381 */
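
These messages are delivered through the ordinary device-mapper message interface, typically via dmsetup; the sector argument is conventionally 0 for this target. A few invocation examples, with an assumed pool device name and invented device ids:

    /*
     *   dmsetup message /dev/mapper/pool 0 create_thin 0
     *   dmsetup message /dev/mapper/pool 0 create_snap 1 0
     *   dmsetup message /dev/mapper/pool 0 delete 1
     *   dmsetup message /dev/mapper/pool 0 set_transaction_id 0 1
     *   dmsetup message /dev/mapper/pool 0 reserve_metadata_snap
     *   dmsetup message /dev/mapper/pool 0 release_metadata_snap
     *
     * set_transaction_id behaves as a compare-and-swap: it only succeeds
     * when the first argument matches the pool's current transaction id.
     */
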
3382 static int pool_message(struct dm_target *ti, unsigned argc, char **argv) 3382 static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
3383 { 3383 {
3384 int r = -EINVAL; 3384 int r = -EINVAL;
3385 struct pool_c *pt = ti->private; 3385 struct pool_c *pt = ti->private;
3386 struct pool *pool = pt->pool; 3386 struct pool *pool = pt->pool;
3387 3387
3388 if (get_pool_mode(pool) >= PM_READ_ONLY) {
3389 DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
3390 dm_device_name(pool->pool_md));
3391 return -EINVAL;
3392 }
3393
3388 if (!strcasecmp(argv[0], "create_thin")) 3394 if (!strcasecmp(argv[0], "create_thin"))
3389 r = process_create_thin_mesg(argc, argv, pool); 3395 r = process_create_thin_mesg(argc, argv, pool);
3390 3396
3391 else if (!strcasecmp(argv[0], "create_snap")) 3397 else if (!strcasecmp(argv[0], "create_snap"))
3392 r = process_create_snap_mesg(argc, argv, pool); 3398 r = process_create_snap_mesg(argc, argv, pool);
3393 3399
3394 else if (!strcasecmp(argv[0], "delete")) 3400 else if (!strcasecmp(argv[0], "delete"))
3395 r = process_delete_mesg(argc, argv, pool); 3401 r = process_delete_mesg(argc, argv, pool);
3396 3402
3397 else if (!strcasecmp(argv[0], "set_transaction_id")) 3403 else if (!strcasecmp(argv[0], "set_transaction_id"))
3398 r = process_set_transaction_id_mesg(argc, argv, pool); 3404 r = process_set_transaction_id_mesg(argc, argv, pool);
3399 3405
3400 else if (!strcasecmp(argv[0], "reserve_metadata_snap")) 3406 else if (!strcasecmp(argv[0], "reserve_metadata_snap"))
3401 r = process_reserve_metadata_snap_mesg(argc, argv, pool); 3407 r = process_reserve_metadata_snap_mesg(argc, argv, pool);
3402 3408
3403 else if (!strcasecmp(argv[0], "release_metadata_snap")) 3409 else if (!strcasecmp(argv[0], "release_metadata_snap"))
3404 r = process_release_metadata_snap_mesg(argc, argv, pool); 3410 r = process_release_metadata_snap_mesg(argc, argv, pool);
3405 3411
3406 else 3412 else
3407 DMWARN("Unrecognised thin pool target message received: %s", argv[0]); 3413 DMWARN("Unrecognised thin pool target message received: %s", argv[0]);
3408 3414
3409 if (!r) 3415 if (!r)
3410 (void) commit(pool); 3416 (void) commit(pool);
3411 3417
3412 return r; 3418 return r;
3413 } 3419 }
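
The six lines added by this commit sit at the very top of pool_message(), so a pool that is read-only or failed rejects every message with -EINVAL before any argument parsing or metadata access happens. The single >= comparison relies on the ordering of the pool_mode enum defined earlier in dm-thin.c (outside this hunk); as a reminder, and only as my paraphrase of that ordering:

    /* Assumed ordering (the enum itself is not part of this diff): */
    enum pool_mode {
            PM_WRITE,               /* metadata may be changed */
            PM_OUT_OF_DATA_SPACE,   /* metadata may be changed, though data may not be allocated */
            PM_READ_ONLY,           /* metadata may not be changed */
            PM_FAIL,                /* all I/O fails */
    };

With that ordering, get_pool_mode(pool) >= PM_READ_ONLY matches exactly the READ_ONLY and FAIL states, while PM_WRITE and PM_OUT_OF_DATA_SPACE pools, whose metadata can still be updated, continue to accept messages.
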
3414 3420
3415 static void emit_flags(struct pool_features *pf, char *result, 3421 static void emit_flags(struct pool_features *pf, char *result,
3416 unsigned sz, unsigned maxlen) 3422 unsigned sz, unsigned maxlen)
3417 { 3423 {
3418 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled + 3424 unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
3419 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) + 3425 !pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
3420 pf->error_if_no_space; 3426 pf->error_if_no_space;
3421 DMEMIT("%u ", count); 3427 DMEMIT("%u ", count);
3422 3428
3423 if (!pf->zero_new_blocks) 3429 if (!pf->zero_new_blocks)
3424 DMEMIT("skip_block_zeroing "); 3430 DMEMIT("skip_block_zeroing ");
3425 3431
3426 if (!pf->discard_enabled) 3432 if (!pf->discard_enabled)
3427 DMEMIT("ignore_discard "); 3433 DMEMIT("ignore_discard ");
3428 3434
3429 if (!pf->discard_passdown) 3435 if (!pf->discard_passdown)
3430 DMEMIT("no_discard_passdown "); 3436 DMEMIT("no_discard_passdown ");
3431 3437
3432 if (pf->mode == PM_READ_ONLY) 3438 if (pf->mode == PM_READ_ONLY)
3433 DMEMIT("read_only "); 3439 DMEMIT("read_only ");
3434 3440
3435 if (pf->error_if_no_space) 3441 if (pf->error_if_no_space)
3436 DMEMIT("error_if_no_space "); 3442 DMEMIT("error_if_no_space ");
3437 } 3443 }
3438 3444
3439 /* 3445 /*
3440 * Status line is: 3446 * Status line is:
3441 * <transaction id> <used metadata sectors>/<total metadata sectors> 3447 * <transaction id> <used metadata sectors>/<total metadata sectors>
3442 * <used data sectors>/<total data sectors> <held metadata root> 3448 * <used data sectors>/<total data sectors> <held metadata root>
3443 */ 3449 */
3444 static void pool_status(struct dm_target *ti, status_type_t type, 3450 static void pool_status(struct dm_target *ti, status_type_t type,
3445 unsigned status_flags, char *result, unsigned maxlen) 3451 unsigned status_flags, char *result, unsigned maxlen)
3446 { 3452 {
3447 int r; 3453 int r;
3448 unsigned sz = 0; 3454 unsigned sz = 0;
3449 uint64_t transaction_id; 3455 uint64_t transaction_id;
3450 dm_block_t nr_free_blocks_data; 3456 dm_block_t nr_free_blocks_data;
3451 dm_block_t nr_free_blocks_metadata; 3457 dm_block_t nr_free_blocks_metadata;
3452 dm_block_t nr_blocks_data; 3458 dm_block_t nr_blocks_data;
3453 dm_block_t nr_blocks_metadata; 3459 dm_block_t nr_blocks_metadata;
3454 dm_block_t held_root; 3460 dm_block_t held_root;
3455 char buf[BDEVNAME_SIZE]; 3461 char buf[BDEVNAME_SIZE];
3456 char buf2[BDEVNAME_SIZE]; 3462 char buf2[BDEVNAME_SIZE];
3457 struct pool_c *pt = ti->private; 3463 struct pool_c *pt = ti->private;
3458 struct pool *pool = pt->pool; 3464 struct pool *pool = pt->pool;
3459 3465
3460 switch (type) { 3466 switch (type) {
3461 case STATUSTYPE_INFO: 3467 case STATUSTYPE_INFO:
3462 if (get_pool_mode(pool) == PM_FAIL) { 3468 if (get_pool_mode(pool) == PM_FAIL) {
3463 DMEMIT("Fail"); 3469 DMEMIT("Fail");
3464 break; 3470 break;
3465 } 3471 }
3466 3472
3467 /* Commit to ensure statistics aren't out-of-date */ 3473 /* Commit to ensure statistics aren't out-of-date */
3468 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti)) 3474 if (!(status_flags & DM_STATUS_NOFLUSH_FLAG) && !dm_suspended(ti))
3469 (void) commit(pool); 3475 (void) commit(pool);
3470 3476
3471 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id); 3477 r = dm_pool_get_metadata_transaction_id(pool->pmd, &transaction_id);
3472 if (r) { 3478 if (r) {
3473 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d", 3479 DMERR("%s: dm_pool_get_metadata_transaction_id returned %d",
3474 dm_device_name(pool->pool_md), r); 3480 dm_device_name(pool->pool_md), r);
3475 goto err; 3481 goto err;
3476 } 3482 }
3477 3483
3478 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata); 3484 r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free_blocks_metadata);
3479 if (r) { 3485 if (r) {
3480 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d", 3486 DMERR("%s: dm_pool_get_free_metadata_block_count returned %d",
3481 dm_device_name(pool->pool_md), r); 3487 dm_device_name(pool->pool_md), r);
3482 goto err; 3488 goto err;
3483 } 3489 }
3484 3490
3485 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata); 3491 r = dm_pool_get_metadata_dev_size(pool->pmd, &nr_blocks_metadata);
3486 if (r) { 3492 if (r) {
3487 DMERR("%s: dm_pool_get_metadata_dev_size returned %d", 3493 DMERR("%s: dm_pool_get_metadata_dev_size returned %d",
3488 dm_device_name(pool->pool_md), r); 3494 dm_device_name(pool->pool_md), r);
3489 goto err; 3495 goto err;
3490 } 3496 }
3491 3497
3492 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data); 3498 r = dm_pool_get_free_block_count(pool->pmd, &nr_free_blocks_data);
3493 if (r) { 3499 if (r) {
3494 DMERR("%s: dm_pool_get_free_block_count returned %d", 3500 DMERR("%s: dm_pool_get_free_block_count returned %d",
3495 dm_device_name(pool->pool_md), r); 3501 dm_device_name(pool->pool_md), r);
3496 goto err; 3502 goto err;
3497 } 3503 }
3498 3504
3499 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data); 3505 r = dm_pool_get_data_dev_size(pool->pmd, &nr_blocks_data);
3500 if (r) { 3506 if (r) {
3501 DMERR("%s: dm_pool_get_data_dev_size returned %d", 3507 DMERR("%s: dm_pool_get_data_dev_size returned %d",
3502 dm_device_name(pool->pool_md), r); 3508 dm_device_name(pool->pool_md), r);
3503 goto err; 3509 goto err;
3504 } 3510 }
3505 3511
3506 r = dm_pool_get_metadata_snap(pool->pmd, &held_root); 3512 r = dm_pool_get_metadata_snap(pool->pmd, &held_root);
3507 if (r) { 3513 if (r) {
3508 DMERR("%s: dm_pool_get_metadata_snap returned %d", 3514 DMERR("%s: dm_pool_get_metadata_snap returned %d",
3509 dm_device_name(pool->pool_md), r); 3515 dm_device_name(pool->pool_md), r);
3510 goto err; 3516 goto err;
3511 } 3517 }
3512 3518
3513 DMEMIT("%llu %llu/%llu %llu/%llu ", 3519 DMEMIT("%llu %llu/%llu %llu/%llu ",
3514 (unsigned long long)transaction_id, 3520 (unsigned long long)transaction_id,
3515 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata), 3521 (unsigned long long)(nr_blocks_metadata - nr_free_blocks_metadata),
3516 (unsigned long long)nr_blocks_metadata, 3522 (unsigned long long)nr_blocks_metadata,
3517 (unsigned long long)(nr_blocks_data - nr_free_blocks_data), 3523 (unsigned long long)(nr_blocks_data - nr_free_blocks_data),
3518 (unsigned long long)nr_blocks_data); 3524 (unsigned long long)nr_blocks_data);
3519 3525
3520 if (held_root) 3526 if (held_root)
3521 DMEMIT("%llu ", held_root); 3527 DMEMIT("%llu ", held_root);
3522 else 3528 else
3523 DMEMIT("- "); 3529 DMEMIT("- ");
3524 3530
3525 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE) 3531 if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
3526 DMEMIT("out_of_data_space "); 3532 DMEMIT("out_of_data_space ");
3527 else if (pool->pf.mode == PM_READ_ONLY) 3533 else if (pool->pf.mode == PM_READ_ONLY)
3528 DMEMIT("ro "); 3534 DMEMIT("ro ");
3529 else 3535 else
3530 DMEMIT("rw "); 3536 DMEMIT("rw ");
3531 3537
3532 if (!pool->pf.discard_enabled) 3538 if (!pool->pf.discard_enabled)
3533 DMEMIT("ignore_discard "); 3539 DMEMIT("ignore_discard ");
3534 else if (pool->pf.discard_passdown) 3540 else if (pool->pf.discard_passdown)
3535 DMEMIT("discard_passdown "); 3541 DMEMIT("discard_passdown ");
3536 else 3542 else
3537 DMEMIT("no_discard_passdown "); 3543 DMEMIT("no_discard_passdown ");
3538 3544
3539 if (pool->pf.error_if_no_space) 3545 if (pool->pf.error_if_no_space)
3540 DMEMIT("error_if_no_space "); 3546 DMEMIT("error_if_no_space ");
3541 else 3547 else
3542 DMEMIT("queue_if_no_space "); 3548 DMEMIT("queue_if_no_space ");
3543 3549
3544 break; 3550 break;
3545 3551
3546 case STATUSTYPE_TABLE: 3552 case STATUSTYPE_TABLE:
3547 DMEMIT("%s %s %lu %llu ", 3553 DMEMIT("%s %s %lu %llu ",
3548 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev), 3554 format_dev_t(buf, pt->metadata_dev->bdev->bd_dev),
3549 format_dev_t(buf2, pt->data_dev->bdev->bd_dev), 3555 format_dev_t(buf2, pt->data_dev->bdev->bd_dev),
3550 (unsigned long)pool->sectors_per_block, 3556 (unsigned long)pool->sectors_per_block,
3551 (unsigned long long)pt->low_water_blocks); 3557 (unsigned long long)pt->low_water_blocks);
3552 emit_flags(&pt->requested_pf, result, sz, maxlen); 3558 emit_flags(&pt->requested_pf, result, sz, maxlen);
3553 break; 3559 break;
3554 } 3560 }
3555 return; 3561 return;
3556 3562
3557 err: 3563 err:
3558 DMEMIT("Error"); 3564 DMEMIT("Error");
3559 } 3565 }
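
For STATUSTYPE_INFO the fields are emitted in a fixed order: transaction id, used/total metadata blocks, used/total data blocks, the held metadata snapshot root (or '-'), the pool mode, the discard behaviour and the no-space policy; a pool in FAIL mode reports just "Fail". A hypothetical line, purely to show the positions (all numbers invented, counts are in pool blocks):

    /*
     *   2 145/4096 81920/163840 - rw discard_passdown queue_if_no_space
     *
     *   2                  transaction id
     *   145/4096           used/total metadata blocks
     *   81920/163840       used/total data blocks
     *   -                  no metadata snapshot is currently held
     *   rw                 mode (rw, ro or out_of_data_space)
     *   discard_passdown   discard behaviour
     *   queue_if_no_space  behaviour when the data device fills up
     */

STATUSTYPE_TABLE re-emits the constructor line, with emit_flags() appending only the non-default feature arguments.
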
3560 3566
3561 static int pool_iterate_devices(struct dm_target *ti, 3567 static int pool_iterate_devices(struct dm_target *ti,
3562 iterate_devices_callout_fn fn, void *data) 3568 iterate_devices_callout_fn fn, void *data)
3563 { 3569 {
3564 struct pool_c *pt = ti->private; 3570 struct pool_c *pt = ti->private;
3565 3571
3566 return fn(ti, pt->data_dev, 0, ti->len, data); 3572 return fn(ti, pt->data_dev, 0, ti->len, data);
3567 } 3573 }
3568 3574
3569 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, 3575 static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
3570 struct bio_vec *biovec, int max_size) 3576 struct bio_vec *biovec, int max_size)
3571 { 3577 {
3572 struct pool_c *pt = ti->private; 3578 struct pool_c *pt = ti->private;
3573 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev); 3579 struct request_queue *q = bdev_get_queue(pt->data_dev->bdev);
3574 3580
3575 if (!q->merge_bvec_fn) 3581 if (!q->merge_bvec_fn)
3576 return max_size; 3582 return max_size;
3577 3583
3578 bvm->bi_bdev = pt->data_dev->bdev; 3584 bvm->bi_bdev = pt->data_dev->bdev;
3579 3585
3580 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 3586 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
3581 } 3587 }
3582 3588
3583 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits) 3589 static void set_discard_limits(struct pool_c *pt, struct queue_limits *limits)
3584 { 3590 {
3585 struct pool *pool = pt->pool; 3591 struct pool *pool = pt->pool;
3586 struct queue_limits *data_limits; 3592 struct queue_limits *data_limits;
3587 3593
3588 limits->max_discard_sectors = pool->sectors_per_block; 3594 limits->max_discard_sectors = pool->sectors_per_block;
3589 3595
3590 /* 3596 /*
3591 * discard_granularity is just a hint, and not enforced. 3597 * discard_granularity is just a hint, and not enforced.
3592 */ 3598 */
3593 if (pt->adjusted_pf.discard_passdown) { 3599 if (pt->adjusted_pf.discard_passdown) {
3594 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits; 3600 data_limits = &bdev_get_queue(pt->data_dev->bdev)->limits;
3595 limits->discard_granularity = max(data_limits->discard_granularity, 3601 limits->discard_granularity = max(data_limits->discard_granularity,
3596 pool->sectors_per_block << SECTOR_SHIFT); 3602 pool->sectors_per_block << SECTOR_SHIFT);
3597 } else 3603 } else
3598 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; 3604 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
3599 } 3605 }
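
set_discard_limits() pins the discard limits to the pool's block size: a discard never spans more than one block, and the advertised granularity is at least one block (SECTOR_SHIFT is 9, so the shift converts sectors to bytes). With an assumed 128-sector block:

    /*
     *   sectors_per_block = 128:
     *
     *   max_discard_sectors = 128                     (one pool block)
     *   discard_granularity = 128 << 9 = 65536 bytes  (64 KiB)
     *
     * With discard_passdown enabled and a data device that itself reports,
     * say, 512 KiB granularity, the max() above raises discard_granularity
     * to 524288 bytes instead.
     */
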
3600 3606
3601 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 3607 static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
3602 { 3608 {
3603 struct pool_c *pt = ti->private; 3609 struct pool_c *pt = ti->private;
3604 struct pool *pool = pt->pool; 3610 struct pool *pool = pt->pool;
3605 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT; 3611 sector_t io_opt_sectors = limits->io_opt >> SECTOR_SHIFT;
3606 3612
3607 /* 3613 /*
3608 * If max_sectors is smaller than pool->sectors_per_block adjust it 3614 * If max_sectors is smaller than pool->sectors_per_block adjust it
3609 * to the highest possible power-of-2 factor of pool->sectors_per_block. 3615 * to the highest possible power-of-2 factor of pool->sectors_per_block.
3610 * This is especially beneficial when the pool's data device is a RAID 3616 * This is especially beneficial when the pool's data device is a RAID
3611 * device that has a full stripe width that matches pool->sectors_per_block 3617 * device that has a full stripe width that matches pool->sectors_per_block
3612 * -- because even though partial RAID stripe-sized IOs will be issued to a 3618 * -- because even though partial RAID stripe-sized IOs will be issued to a
3613 * single RAID stripe; when aggregated they will end on a full RAID stripe 3619 * single RAID stripe; when aggregated they will end on a full RAID stripe
3614 * boundary.. which avoids additional partial RAID stripe writes cascading 3620 * boundary.. which avoids additional partial RAID stripe writes cascading
3615 */ 3621 */
3616 if (limits->max_sectors < pool->sectors_per_block) { 3622 if (limits->max_sectors < pool->sectors_per_block) {
3617 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) { 3623 while (!is_factor(pool->sectors_per_block, limits->max_sectors)) {
3618 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0) 3624 if ((limits->max_sectors & (limits->max_sectors - 1)) == 0)
3619 limits->max_sectors--; 3625 limits->max_sectors--;
3620 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors); 3626 limits->max_sectors = rounddown_pow_of_two(limits->max_sectors);
3621 } 3627 }
3622 } 3628 }
3623 3629
3624 /* 3630 /*
3625 * If the system-determined stacked limits are compatible with the 3631 * If the system-determined stacked limits are compatible with the
3626 * pool's blocksize (io_opt is a factor) do not override them. 3632 * pool's blocksize (io_opt is a factor) do not override them.
3627 */ 3633 */
3628 if (io_opt_sectors < pool->sectors_per_block || 3634 if (io_opt_sectors < pool->sectors_per_block ||
3629 !is_factor(io_opt_sectors, pool->sectors_per_block)) { 3635 !is_factor(io_opt_sectors, pool->sectors_per_block)) {
3630 if (is_factor(pool->sectors_per_block, limits->max_sectors)) 3636 if (is_factor(pool->sectors_per_block, limits->max_sectors))
3631 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT); 3637 blk_limits_io_min(limits, limits->max_sectors << SECTOR_SHIFT);
3632 else 3638 else
3633 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT); 3639 blk_limits_io_min(limits, pool->sectors_per_block << SECTOR_SHIFT);
3634 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 3640 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
3635 } 3641 }
3636 3642
3637 /* 3643 /*
3638 * pt->adjusted_pf is a staging area for the actual features to use. 3644 * pt->adjusted_pf is a staging area for the actual features to use.
3639 * They get transferred to the live pool in bind_control_target() 3645 * They get transferred to the live pool in bind_control_target()
3640 * called from pool_preresume(). 3646 * called from pool_preresume().
3641 */ 3647 */
3642 if (!pt->adjusted_pf.discard_enabled) { 3648 if (!pt->adjusted_pf.discard_enabled) {
3643 /* 3649 /*
3644 * Must explicitly disallow stacking discard limits otherwise the 3650 * Must explicitly disallow stacking discard limits otherwise the
3645 * block layer will stack them if pool's data device has support. 3651 * block layer will stack them if pool's data device has support.
3646 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the 3652 * QUEUE_FLAG_DISCARD wouldn't be set but there is no way for the
3647 * user to see that, so make sure to set all discard limits to 0. 3653 * user to see that, so make sure to set all discard limits to 0.
3648 */ 3654 */
3649 limits->discard_granularity = 0; 3655 limits->discard_granularity = 0;
3650 return; 3656 return;
3651 } 3657 }
3652 3658
3653 disable_passdown_if_not_supported(pt); 3659 disable_passdown_if_not_supported(pt);
3654 3660
3655 set_discard_limits(pt, limits); 3661 set_discard_limits(pt, limits);
3656 } 3662 }
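
The adjustment loop at the top of pool_io_hints() only matters for block sizes that are not powers of two: it repeatedly drops max_sectors to the next lower power of two until the result divides sectors_per_block. A worked trace with assumed values:

    /*
     * Assume sectors_per_block = 3072 (a legal 1.5 MiB block) and stacked
     * limits->max_sectors = 2560:
     *
     *   2560 does not divide 3072, not a power of two -> round down to 2048
     *   2048 does not divide 3072, is a power of two  -> 2047, round down to 1024
     *   1024 divides 3072                             -> loop ends, max_sectors = 1024
     */

io_min and io_opt are then forced to the pool's block size only when the stacked io_opt is smaller than, or not a multiple of, that block size.
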
3657 3663
3658 static struct target_type pool_target = { 3664 static struct target_type pool_target = {
3659 .name = "thin-pool", 3665 .name = "thin-pool",
3660 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 3666 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
3661 DM_TARGET_IMMUTABLE, 3667 DM_TARGET_IMMUTABLE,
3662 .version = {1, 14, 0}, 3668 .version = {1, 14, 0},
3663 .module = THIS_MODULE, 3669 .module = THIS_MODULE,
3664 .ctr = pool_ctr, 3670 .ctr = pool_ctr,
3665 .dtr = pool_dtr, 3671 .dtr = pool_dtr,
3666 .map = pool_map, 3672 .map = pool_map,
3667 .presuspend = pool_presuspend, 3673 .presuspend = pool_presuspend,
3668 .presuspend_undo = pool_presuspend_undo, 3674 .presuspend_undo = pool_presuspend_undo,
3669 .postsuspend = pool_postsuspend, 3675 .postsuspend = pool_postsuspend,
3670 .preresume = pool_preresume, 3676 .preresume = pool_preresume,
3671 .resume = pool_resume, 3677 .resume = pool_resume,
3672 .message = pool_message, 3678 .message = pool_message,
3673 .status = pool_status, 3679 .status = pool_status,
3674 .merge = pool_merge, 3680 .merge = pool_merge,
3675 .iterate_devices = pool_iterate_devices, 3681 .iterate_devices = pool_iterate_devices,
3676 .io_hints = pool_io_hints, 3682 .io_hints = pool_io_hints,
3677 }; 3683 };
3678 3684
3679 /*---------------------------------------------------------------- 3685 /*----------------------------------------------------------------
3680 * Thin target methods 3686 * Thin target methods
3681 *--------------------------------------------------------------*/ 3687 *--------------------------------------------------------------*/
3682 static void thin_get(struct thin_c *tc) 3688 static void thin_get(struct thin_c *tc)
3683 { 3689 {
3684 atomic_inc(&tc->refcount); 3690 atomic_inc(&tc->refcount);
3685 } 3691 }
3686 3692
3687 static void thin_put(struct thin_c *tc) 3693 static void thin_put(struct thin_c *tc)
3688 { 3694 {
3689 if (atomic_dec_and_test(&tc->refcount)) 3695 if (atomic_dec_and_test(&tc->refcount))
3690 complete(&tc->can_destroy); 3696 complete(&tc->can_destroy);
3691 } 3697 }
3692 3698
3693 static void thin_dtr(struct dm_target *ti) 3699 static void thin_dtr(struct dm_target *ti)
3694 { 3700 {
3695 struct thin_c *tc = ti->private; 3701 struct thin_c *tc = ti->private;
3696 unsigned long flags; 3702 unsigned long flags;
3697 3703
3698 spin_lock_irqsave(&tc->pool->lock, flags); 3704 spin_lock_irqsave(&tc->pool->lock, flags);
3699 list_del_rcu(&tc->list); 3705 list_del_rcu(&tc->list);
3700 spin_unlock_irqrestore(&tc->pool->lock, flags); 3706 spin_unlock_irqrestore(&tc->pool->lock, flags);
3701 synchronize_rcu(); 3707 synchronize_rcu();
3702 3708
3703 thin_put(tc); 3709 thin_put(tc);
3704 wait_for_completion(&tc->can_destroy); 3710 wait_for_completion(&tc->can_destroy);
3705 3711
3706 mutex_lock(&dm_thin_pool_table.mutex); 3712 mutex_lock(&dm_thin_pool_table.mutex);
3707 3713
3708 __pool_dec(tc->pool); 3714 __pool_dec(tc->pool);
3709 dm_pool_close_thin_device(tc->td); 3715 dm_pool_close_thin_device(tc->td);
3710 dm_put_device(ti, tc->pool_dev); 3716 dm_put_device(ti, tc->pool_dev);
3711 if (tc->origin_dev) 3717 if (tc->origin_dev)
3712 dm_put_device(ti, tc->origin_dev); 3718 dm_put_device(ti, tc->origin_dev);
3713 kfree(tc); 3719 kfree(tc);
3714 3720
3715 mutex_unlock(&dm_thin_pool_table.mutex); 3721 mutex_unlock(&dm_thin_pool_table.mutex);
3716 } 3722 }
3717 3723
3718 /* 3724 /*
3719 * Thin target parameters: 3725 * Thin target parameters:
3720 * 3726 *
3721 * <pool_dev> <dev_id> [origin_dev] 3727 * <pool_dev> <dev_id> [origin_dev]
3722 * 3728 *
3723 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 3729 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
3724 * dev_id: the internal device identifier 3730 * dev_id: the internal device identifier
3725 * origin_dev: a device external to the pool that should act as the origin 3731 * origin_dev: a device external to the pool that should act as the origin
3726 * 3732 *
3727 * If the pool device has discards disabled, they get disabled for the thin 3733 * If the pool device has discards disabled, they get disabled for the thin
3728 * device as well. 3734 * device as well.
3729 */ 3735 */
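
A thin volume is activated by loading a "thin" table that names the pool device and an internal device id previously created via the pool's create_thin or create_snap message. Example command lines with invented names, sizes and ids:

    /*
     *   # 1 GiB thin volume backed by internal device id 0:
     *   dmsetup create thin0 --table "0 2097152 thin /dev/mapper/pool 0"
     *
     *   # Snapshot of an external origin: reads of unprovisioned blocks
     *   # fall through to /dev/origin.
     *   dmsetup create ext-snap --table "0 2097152 thin /dev/mapper/pool 1 /dev/origin"
     */
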
3730 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 3736 static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
3731 { 3737 {
3732 int r; 3738 int r;
3733 struct thin_c *tc; 3739 struct thin_c *tc;
3734 struct dm_dev *pool_dev, *origin_dev; 3740 struct dm_dev *pool_dev, *origin_dev;
3735 struct mapped_device *pool_md; 3741 struct mapped_device *pool_md;
3736 unsigned long flags; 3742 unsigned long flags;
3737 3743
3738 mutex_lock(&dm_thin_pool_table.mutex); 3744 mutex_lock(&dm_thin_pool_table.mutex);
3739 3745
3740 if (argc != 2 && argc != 3) { 3746 if (argc != 2 && argc != 3) {
3741 ti->error = "Invalid argument count"; 3747 ti->error = "Invalid argument count";
3742 r = -EINVAL; 3748 r = -EINVAL;
3743 goto out_unlock; 3749 goto out_unlock;
3744 } 3750 }
3745 3751
3746 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL); 3752 tc = ti->private = kzalloc(sizeof(*tc), GFP_KERNEL);
3747 if (!tc) { 3753 if (!tc) {
3748 ti->error = "Out of memory"; 3754 ti->error = "Out of memory";
3749 r = -ENOMEM; 3755 r = -ENOMEM;
3750 goto out_unlock; 3756 goto out_unlock;
3751 } 3757 }
3752 tc->thin_md = dm_table_get_md(ti->table); 3758 tc->thin_md = dm_table_get_md(ti->table);
3753 spin_lock_init(&tc->lock); 3759 spin_lock_init(&tc->lock);
3754 INIT_LIST_HEAD(&tc->deferred_cells); 3760 INIT_LIST_HEAD(&tc->deferred_cells);
3755 bio_list_init(&tc->deferred_bio_list); 3761 bio_list_init(&tc->deferred_bio_list);
3756 bio_list_init(&tc->retry_on_resume_list); 3762 bio_list_init(&tc->retry_on_resume_list);
3757 tc->sort_bio_list = RB_ROOT; 3763 tc->sort_bio_list = RB_ROOT;
3758 3764
3759 if (argc == 3) { 3765 if (argc == 3) {
3760 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); 3766 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
3761 if (r) { 3767 if (r) {
3762 ti->error = "Error opening origin device"; 3768 ti->error = "Error opening origin device";
3763 goto bad_origin_dev; 3769 goto bad_origin_dev;
3764 } 3770 }
3765 tc->origin_dev = origin_dev; 3771 tc->origin_dev = origin_dev;
3766 } 3772 }
3767 3773
3768 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 3774 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
3769 if (r) { 3775 if (r) {
3770 ti->error = "Error opening pool device"; 3776 ti->error = "Error opening pool device";
3771 goto bad_pool_dev; 3777 goto bad_pool_dev;
3772 } 3778 }
3773 tc->pool_dev = pool_dev; 3779 tc->pool_dev = pool_dev;
3774 3780
3775 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) { 3781 if (read_dev_id(argv[1], (unsigned long long *)&tc->dev_id, 0)) {
3776 ti->error = "Invalid device id"; 3782 ti->error = "Invalid device id";
3777 r = -EINVAL; 3783 r = -EINVAL;
3778 goto bad_common; 3784 goto bad_common;
3779 } 3785 }
3780 3786
3781 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev); 3787 pool_md = dm_get_md(tc->pool_dev->bdev->bd_dev);
3782 if (!pool_md) { 3788 if (!pool_md) {
3783 ti->error = "Couldn't get pool mapped device"; 3789 ti->error = "Couldn't get pool mapped device";
3784 r = -EINVAL; 3790 r = -EINVAL;
3785 goto bad_common; 3791 goto bad_common;
3786 } 3792 }
3787 3793
3788 tc->pool = __pool_table_lookup(pool_md); 3794 tc->pool = __pool_table_lookup(pool_md);
3789 if (!tc->pool) { 3795 if (!tc->pool) {
3790 ti->error = "Couldn't find pool object"; 3796 ti->error = "Couldn't find pool object";
3791 r = -EINVAL; 3797 r = -EINVAL;
3792 goto bad_pool_lookup; 3798 goto bad_pool_lookup;
3793 } 3799 }
3794 __pool_inc(tc->pool); 3800 __pool_inc(tc->pool);
3795 3801
3796 if (get_pool_mode(tc->pool) == PM_FAIL) { 3802 if (get_pool_mode(tc->pool) == PM_FAIL) {
3797 ti->error = "Couldn't open thin device, Pool is in fail mode"; 3803 ti->error = "Couldn't open thin device, Pool is in fail mode";
3798 r = -EINVAL; 3804 r = -EINVAL;
3799 goto bad_pool; 3805 goto bad_pool;
3800 } 3806 }
3801 3807
3802 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td); 3808 r = dm_pool_open_thin_device(tc->pool->pmd, tc->dev_id, &tc->td);
3803 if (r) { 3809 if (r) {
3804 ti->error = "Couldn't open thin internal device"; 3810 ti->error = "Couldn't open thin internal device";
3805 goto bad_pool; 3811 goto bad_pool;
3806 } 3812 }
3807 3813
3808 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block); 3814 r = dm_set_target_max_io_len(ti, tc->pool->sectors_per_block);
3809 if (r) 3815 if (r)
3810 goto bad; 3816 goto bad;
3811 3817
3812 ti->num_flush_bios = 1; 3818 ti->num_flush_bios = 1;
3813 ti->flush_supported = true; 3819 ti->flush_supported = true;
3814 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook); 3820 ti->per_bio_data_size = sizeof(struct dm_thin_endio_hook);
3815 3821
3816 /* In case the pool supports discards, pass them on. */ 3822 /* In case the pool supports discards, pass them on. */
3817 ti->discard_zeroes_data_unsupported = true; 3823 ti->discard_zeroes_data_unsupported = true;
3818 if (tc->pool->pf.discard_enabled) { 3824 if (tc->pool->pf.discard_enabled) {
3819 ti->discards_supported = true; 3825 ti->discards_supported = true;
3820 ti->num_discard_bios = 1; 3826 ti->num_discard_bios = 1;
3821 /* Discard bios must be split on a block boundary */ 3827 /* Discard bios must be split on a block boundary */
3822 ti->split_discard_bios = true; 3828 ti->split_discard_bios = true;
3823 } 3829 }
3824 3830
3825 mutex_unlock(&dm_thin_pool_table.mutex); 3831 mutex_unlock(&dm_thin_pool_table.mutex);
3826 3832
3827 spin_lock_irqsave(&tc->pool->lock, flags); 3833 spin_lock_irqsave(&tc->pool->lock, flags);
3828 if (tc->pool->suspended) { 3834 if (tc->pool->suspended) {
3829 spin_unlock_irqrestore(&tc->pool->lock, flags); 3835 spin_unlock_irqrestore(&tc->pool->lock, flags);
3830 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */ 3836 mutex_lock(&dm_thin_pool_table.mutex); /* reacquire for __pool_dec */
3831 ti->error = "Unable to activate thin device while pool is suspended"; 3837 ti->error = "Unable to activate thin device while pool is suspended";
3832 r = -EINVAL; 3838 r = -EINVAL;
3833 goto bad; 3839 goto bad;
3834 } 3840 }
3835 atomic_set(&tc->refcount, 1); 3841 atomic_set(&tc->refcount, 1);
3836 init_completion(&tc->can_destroy); 3842 init_completion(&tc->can_destroy);
3837 list_add_tail_rcu(&tc->list, &tc->pool->active_thins); 3843 list_add_tail_rcu(&tc->list, &tc->pool->active_thins);
3838 spin_unlock_irqrestore(&tc->pool->lock, flags); 3844 spin_unlock_irqrestore(&tc->pool->lock, flags);
3839 /* 3845 /*
3840 * This synchronize_rcu() call is needed here otherwise we risk a 3846 * This synchronize_rcu() call is needed here otherwise we risk a
3841 * wake_worker() call finding no bios to process (because the newly 3847 * wake_worker() call finding no bios to process (because the newly
3842 * added tc isn't yet visible). So this reduces latency since we 3848 * added tc isn't yet visible). So this reduces latency since we
3843 * aren't then dependent on the periodic commit to wake_worker(). 3849 * aren't then dependent on the periodic commit to wake_worker().
3844 */ 3850 */
3845 synchronize_rcu(); 3851 synchronize_rcu();
3846 3852
3847 dm_put(pool_md); 3853 dm_put(pool_md);
3848 3854
3849 return 0; 3855 return 0;
3850 3856
3851 bad: 3857 bad:
3852 dm_pool_close_thin_device(tc->td); 3858 dm_pool_close_thin_device(tc->td);
3853 bad_pool: 3859 bad_pool:
3854 __pool_dec(tc->pool); 3860 __pool_dec(tc->pool);
3855 bad_pool_lookup: 3861 bad_pool_lookup:
3856 dm_put(pool_md); 3862 dm_put(pool_md);
3857 bad_common: 3863 bad_common:
3858 dm_put_device(ti, tc->pool_dev); 3864 dm_put_device(ti, tc->pool_dev);
3859 bad_pool_dev: 3865 bad_pool_dev:
3860 if (tc->origin_dev) 3866 if (tc->origin_dev)
3861 dm_put_device(ti, tc->origin_dev); 3867 dm_put_device(ti, tc->origin_dev);
3862 bad_origin_dev: 3868 bad_origin_dev:
3863 kfree(tc); 3869 kfree(tc);
3864 out_unlock: 3870 out_unlock:
3865 mutex_unlock(&dm_thin_pool_table.mutex); 3871 mutex_unlock(&dm_thin_pool_table.mutex);
3866 3872
3867 return r; 3873 return r;
3868 } 3874 }
3869 3875
3870 static int thin_map(struct dm_target *ti, struct bio *bio) 3876 static int thin_map(struct dm_target *ti, struct bio *bio)
3871 { 3877 {
3872 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector); 3878 bio->bi_iter.bi_sector = dm_target_offset(ti, bio->bi_iter.bi_sector);
3873 3879
3874 return thin_bio_map(ti, bio); 3880 return thin_bio_map(ti, bio);
3875 } 3881 }
3876 3882
3877 static int thin_endio(struct dm_target *ti, struct bio *bio, int err) 3883 static int thin_endio(struct dm_target *ti, struct bio *bio, int err)
3878 { 3884 {
3879 unsigned long flags; 3885 unsigned long flags;
3880 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook)); 3886 struct dm_thin_endio_hook *h = dm_per_bio_data(bio, sizeof(struct dm_thin_endio_hook));
3881 struct list_head work; 3887 struct list_head work;
3882 struct dm_thin_new_mapping *m, *tmp; 3888 struct dm_thin_new_mapping *m, *tmp;
3883 struct pool *pool = h->tc->pool; 3889 struct pool *pool = h->tc->pool;
3884 3890
3885 if (h->shared_read_entry) { 3891 if (h->shared_read_entry) {
3886 INIT_LIST_HEAD(&work); 3892 INIT_LIST_HEAD(&work);
3887 dm_deferred_entry_dec(h->shared_read_entry, &work); 3893 dm_deferred_entry_dec(h->shared_read_entry, &work);
3888 3894
3889 spin_lock_irqsave(&pool->lock, flags); 3895 spin_lock_irqsave(&pool->lock, flags);
3890 list_for_each_entry_safe(m, tmp, &work, list) { 3896 list_for_each_entry_safe(m, tmp, &work, list) {
3891 list_del(&m->list); 3897 list_del(&m->list);
3892 __complete_mapping_preparation(m); 3898 __complete_mapping_preparation(m);
3893 } 3899 }
3894 spin_unlock_irqrestore(&pool->lock, flags); 3900 spin_unlock_irqrestore(&pool->lock, flags);
3895 } 3901 }
3896 3902
3897 if (h->all_io_entry) { 3903 if (h->all_io_entry) {
3898 INIT_LIST_HEAD(&work); 3904 INIT_LIST_HEAD(&work);
3899 dm_deferred_entry_dec(h->all_io_entry, &work); 3905 dm_deferred_entry_dec(h->all_io_entry, &work);
3900 if (!list_empty(&work)) { 3906 if (!list_empty(&work)) {
3901 spin_lock_irqsave(&pool->lock, flags); 3907 spin_lock_irqsave(&pool->lock, flags);
3902 list_for_each_entry_safe(m, tmp, &work, list) 3908 list_for_each_entry_safe(m, tmp, &work, list)
3903 list_add_tail(&m->list, &pool->prepared_discards); 3909 list_add_tail(&m->list, &pool->prepared_discards);
3904 spin_unlock_irqrestore(&pool->lock, flags); 3910 spin_unlock_irqrestore(&pool->lock, flags);
3905 wake_worker(pool); 3911 wake_worker(pool);
3906 } 3912 }
3907 } 3913 }
3908 3914
3909 return 0; 3915 return 0;
3910 } 3916 }

static void thin_presuspend(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	if (dm_noflush_suspending(ti))
		noflush_work(tc, do_noflush_start);
}

static void thin_postsuspend(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	/*
	 * The dm_noflush_suspending flag has been cleared by now, so
	 * unfortunately we must always run this.
	 */
	noflush_work(tc, do_noflush_stop);
}

static int thin_preresume(struct dm_target *ti)
{
	struct thin_c *tc = ti->private;

	if (tc->origin_dev)
		tc->origin_size = get_dev_size(tc->origin_dev->bdev);

	return 0;
}

/*
 * <nr mapped sectors> <highest mapped sector>
 */
static void thin_status(struct dm_target *ti, status_type_t type,
			unsigned status_flags, char *result, unsigned maxlen)
{
	int r;
	ssize_t sz = 0;
	dm_block_t mapped, highest;
	char buf[BDEVNAME_SIZE];
	struct thin_c *tc = ti->private;

	if (get_pool_mode(tc->pool) == PM_FAIL) {
		DMEMIT("Fail");
		return;
	}

	if (!tc->td)
		DMEMIT("-");
	else {
		switch (type) {
		case STATUSTYPE_INFO:
			r = dm_thin_get_mapped_count(tc->td, &mapped);
			if (r) {
				DMERR("dm_thin_get_mapped_count returned %d", r);
				goto err;
			}

			r = dm_thin_get_highest_mapped_block(tc->td, &highest);
			if (r < 0) {
				DMERR("dm_thin_get_highest_mapped_block returned %d", r);
				goto err;
			}

			DMEMIT("%llu ", mapped * tc->pool->sectors_per_block);
			if (r)
				DMEMIT("%llu", ((highest + 1) *
						tc->pool->sectors_per_block) - 1);
			else
				DMEMIT("-");
			break;

		case STATUSTYPE_TABLE:
			DMEMIT("%s %lu",
			       format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
			       (unsigned long) tc->dev_id);
			if (tc->origin_dev)
				DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
			break;
		}
	}

	return;

err:
	DMEMIT("Error");
}
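As a worked example of the STATUSTYPE_INFO arithmetic (hypothetical values, not taken from this patch): with 64KiB data blocks sectors_per_block is 128, so 100 mapped blocks and a highest mapped block of 99 are reported as "12800 12799", i.e. total mapped sectors followed by the last sector of the highest mapped block:

static void thin_status_arithmetic_example(void)
{
	/* Hypothetical values, for illustration only: 64KiB data blocks. */
	unsigned long long sectors_per_block = 128;
	unsigned long long mapped = 100, highest = 99;

	/* STATUSTYPE_INFO would emit "12800 12799" for these values. */
	pr_info("%llu %llu\n",
		mapped * sectors_per_block,			/* 12800 */
		(highest + 1) * sectors_per_block - 1);		/* 12799 */
}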

static int thin_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
		      struct bio_vec *biovec, int max_size)
{
	struct thin_c *tc = ti->private;
	struct request_queue *q = bdev_get_queue(tc->pool_dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = tc->pool_dev->bdev;
	bvm->bi_sector = dm_target_offset(ti, bvm->bi_sector);

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}

static int thin_iterate_devices(struct dm_target *ti,
				iterate_devices_callout_fn fn, void *data)
{
	sector_t blocks;
	struct thin_c *tc = ti->private;
	struct pool *pool = tc->pool;

	/*
	 * We can't call dm_pool_get_data_dev_size() since that blocks. So
	 * we follow a more convoluted path through to the pool's target.
	 */
	if (!pool->ti)
		return 0;	/* nothing is bound */

	blocks = pool->ti->len;
	(void) sector_div(blocks, pool->sectors_per_block);
	if (blocks)
		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);

	return 0;
}
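thin_iterate_devices() rounds the bound pool target's length down to whole data blocks before reporting the data device extent. A sketch of that rounding as it could appear in this file (the helper name and the example numbers in the comments are hypothetical):

static sector_t whole_block_sectors(sector_t len, unsigned sectors_per_block)
{
	sector_t blocks = len;

	/*
	 * sector_div() divides 'blocks' in place and returns the remainder,
	 * which the caller ignores; only the quotient matters here.
	 */
	(void) sector_div(blocks, sectors_per_block);

	/* e.g. len = 1000128 with 128-sector blocks -> blocks = 7813 */
	return blocks * sectors_per_block;	/* 1000064 usable sectors */
}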

static struct target_type thin_target = {
	.name = "thin",
	.version = {1, 14, 0},
	.module = THIS_MODULE,
	.ctr = thin_ctr,
	.dtr = thin_dtr,
	.map = thin_map,
	.end_io = thin_endio,
	.preresume = thin_preresume,
	.presuspend = thin_presuspend,
	.postsuspend = thin_postsuspend,
	.status = thin_status,
	.merge = thin_merge,
	.iterate_devices = thin_iterate_devices,
};

/*----------------------------------------------------------------*/

static int __init dm_thin_init(void)
{
	int r;

	pool_table_init();

	r = dm_register_target(&thin_target);
	if (r)
		return r;

	r = dm_register_target(&pool_target);
	if (r)
		goto bad_pool_target;

	r = -ENOMEM;

	_new_mapping_cache = KMEM_CACHE(dm_thin_new_mapping, 0);
	if (!_new_mapping_cache)
		goto bad_new_mapping_cache;

	return 0;

bad_new_mapping_cache:
	dm_unregister_target(&pool_target);
bad_pool_target:
	dm_unregister_target(&thin_target);

	return r;
}
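The unwind labels above release resources in reverse registration order, the usual pattern for init failure paths. For reference, KMEM_CACHE() is a convenience wrapper; the cache creation above is roughly equivalent to the open-coded call below (a sketch based on the macro's definition in include/linux/slab.h):

/*
 * Sketch: approximately what KMEM_CACHE(dm_thin_new_mapping, 0) expands to,
 * i.e. a slab cache named after the structure, using its natural size and
 * alignment, with no constructor.
 */
_new_mapping_cache = kmem_cache_create("dm_thin_new_mapping",
				       sizeof(struct dm_thin_new_mapping),
				       __alignof__(struct dm_thin_new_mapping),
				       0, NULL);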

static void dm_thin_exit(void)
{
	dm_unregister_target(&thin_target);
	dm_unregister_target(&pool_target);

	kmem_cache_destroy(_new_mapping_cache);
}

module_init(dm_thin_init);
module_exit(dm_thin_exit);

module_param_named(no_space_timeout, no_space_timeout_secs, uint, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(no_space_timeout, "Out of data space queue IO timeout in seconds");
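Since the parameter is declared with S_IRUGO | S_IWUSR it can be adjusted at runtime through sysfs as well as at module load time. Where the driver arms its out-of-data-space timeout it converts the value from seconds to jiffies; the sketch below shows that conversion in general terms (the pool field names are assumptions, not taken from this diff):

/*
 * Sketch only; 'wq' and 'no_space_timeout' are assumed pool members, not
 * quoted from this diff. A seconds-valued module parameter is turned into
 * jiffies before the delayed work item is queued.
 */
unsigned long timeout = no_space_timeout_secs * HZ;

if (timeout)
	queue_delayed_work(pool->wq, &pool->no_space_timeout, timeout);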

MODULE_DESCRIPTION(DM_NAME " thin provisioning target");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");