Eric Lee / smarc-ti-linux-kernel

1

/*

1

/*

2

3

*

3

*

4

* This program is free software; you can redistribute it and/or

4

* This program is free software; you can redistribute it and/or

5

* modify it under the terms of the GNU General Public

5

* modify it under the terms of the GNU General Public

6

* License v2 as published by the Free Software Foundation.

6

* License v2 as published by the Free Software Foundation.

7

*

7

*

8

* This program is distributed in the hope that it will be useful,

8

* This program is distributed in the hope that it will be useful,

9

* but WITHOUT ANY WARRANTY; without even the implied warranty of

9

* but WITHOUT ANY WARRANTY; without even the implied warranty of

10

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

10

* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU

11

* General Public License for more details.

11

* General Public License for more details.

12

*

12

*

13

* You should have received a copy of the GNU General Public

13

* You should have received a copy of the GNU General Public

14

* License along with this program; if not, write to the

14

* License along with this program; if not, write to the

15

* Free Software Foundation, Inc., 59 Temple Place - Suite 330,

15

* Free Software Foundation, Inc., 59 Temple Place - Suite 330,

16

* Boston, MA 02111-1307, USA.

16

* Boston, MA 02111-1307, USA.

17

*/

17

*/

18

19

#include <linux/blkdev.h>

19

#include <linux/blkdev.h>

20

#include <linux/ratelimit.h>

20

#include <linux/ratelimit.h>

21

#include "ctree.h"

21

#include "ctree.h"

22

#include "volumes.h"

22

#include "volumes.h"

23

#include "disk-io.h"

23

#include "disk-io.h"

24

#include "ordered-data.h"

24

#include "ordered-data.h"

25

#include "transaction.h"

25

#include "transaction.h"

26

#include "backref.h"

26

#include "backref.h"

27

#include "extent_io.h"

27

#include "extent_io.h"

28

#include "dev-replace.h"

28

#include "dev-replace.h"

29

#include "check-integrity.h"

29

#include "check-integrity.h"

30

#include "rcu-string.h"

30

#include "rcu-string.h"

31

#include "raid56.h"

31

#include "raid56.h"

32

33

/*

33

/*

34

* This is only the first step towards a full-featured scrub. It reads all

34

* This is only the first step towards a full-featured scrub. It reads all

35

* extents and super blocks and verifies the checksums. In case a bad checksum

35

* extents and super blocks and verifies the checksums. In case a bad checksum

36

* is found or the extent cannot be read, good data will be written back if

36

* is found or the extent cannot be read, good data will be written back if

37

* any can be found.

37

* any can be found.

38

*

38

*

39

* Future enhancements:

39

* Future enhancements:

40

* - In case an unrepairable extent is encountered, track which files are

40

* - In case an unrepairable extent is encountered, track which files are

41

* affected and report them

41

* affected and report them

42

* - track and record media errors, throw out bad devices

42

* - track and record media errors, throw out bad devices

43

* - add a mode to also read unallocated space

43

* - add a mode to also read unallocated space

44

*/

44

*/

45

46

struct scrub_block;
struct scrub_ctx;

/*
 * The following three values only influence the performance.
 * The last one configures the number of parallel and outstanding I/O
 * operations. The first two values configure an upper limit for the number
 * of (dynamically allocated) pages that are added to a bio.
 */
#define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
#define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
#define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */

/*
 * The following value times PAGE_SIZE needs to be large enough to match the
 * largest node/leaf/sector size that shall be supported.
 * Values larger than BTRFS_STRIPE_LEN are not supported.
 */
#define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */

65

66

struct scrub_page {

66

struct scrub_page {

67

struct scrub_block *sblock;

67

struct scrub_block *sblock;

68

struct page *page;

68

struct page *page;

69

struct btrfs_device *dev;

69

struct btrfs_device *dev;

70

u64 flags; /* extent flags */

70

u64 flags; /* extent flags */

71

u64 generation;

71

u64 generation;

72

u64 logical;

72

u64 logical;

73

u64 physical;

73

u64 physical;

74

u64 physical_for_dev_replace;

74

u64 physical_for_dev_replace;

75

atomic_t ref_count;

75

atomic_t ref_count;

76

struct {

76

struct {

77

unsigned int mirror_num:8;

77

unsigned int mirror_num:8;

78

unsigned int have_csum:1;

78

unsigned int have_csum:1;

79

unsigned int io_error:1;

79

unsigned int io_error:1;

80

};

80

};

81

u8 csum[BTRFS_CSUM_SIZE];

81

u8 csum[BTRFS_CSUM_SIZE];

82

};

82

};

83

84

struct scrub_bio {

84

struct scrub_bio {

85

int index;

85

int index;

86

struct scrub_ctx *sctx;

86

struct scrub_ctx *sctx;

87

struct btrfs_device *dev;

87

struct btrfs_device *dev;

88

struct bio *bio;

88

struct bio *bio;

89

int err;

89

int err;

90

u64 logical;

90

u64 logical;

91

u64 physical;

91

u64 physical;

92

#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO

92

#if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO

93

struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];

93

struct scrub_page *pagev[SCRUB_PAGES_PER_WR_BIO];

94

#else

94

#else

95

struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];

95

struct scrub_page *pagev[SCRUB_PAGES_PER_RD_BIO];

96

#endif

96

#endif

97

int page_count;

97

int page_count;

98

int next_free;

98

int next_free;

99

struct btrfs_work work;

99

struct btrfs_work work;

100

};

100

};

101

102

struct scrub_block {

102

struct scrub_block {

103

struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];

103

struct scrub_page *pagev[SCRUB_MAX_PAGES_PER_BLOCK];

104

int page_count;

104

int page_count;

105

atomic_t outstanding_pages;

105

atomic_t outstanding_pages;

106

atomic_t ref_count; /* free mem on transition to zero */

106

atomic_t ref_count; /* free mem on transition to zero */

107

struct scrub_ctx *sctx;

107

struct scrub_ctx *sctx;

108

struct {

108

struct {

109

unsigned int header_error:1;

109

unsigned int header_error:1;

110

unsigned int checksum_error:1;

110

unsigned int checksum_error:1;

111

unsigned int no_io_error_seen:1;

111

unsigned int no_io_error_seen:1;

112

unsigned int generation_error:1; /* also sets header_error */

112

unsigned int generation_error:1; /* also sets header_error */

113

};

113

};

114

};

114

};

115

116

struct scrub_wr_ctx {

116

struct scrub_wr_ctx {

117

struct scrub_bio *wr_curr_bio;

117

struct scrub_bio *wr_curr_bio;

118

struct btrfs_device *tgtdev;

118

struct btrfs_device *tgtdev;

119

int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */

119

int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */

120

atomic_t flush_all_writes;

120

atomic_t flush_all_writes;

121

struct mutex wr_lock;

121

struct mutex wr_lock;

122

};

122

};

123

124

struct scrub_ctx {

124

struct scrub_ctx {

125

struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];

125

struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];

126

struct btrfs_root *dev_root;

126

struct btrfs_root *dev_root;

127

int first_free;

127

int first_free;

128

int curr;

128

int curr;

129

atomic_t bios_in_flight;

129

atomic_t bios_in_flight;

130

atomic_t workers_pending;

130

atomic_t workers_pending;

131

spinlock_t list_lock;

131

spinlock_t list_lock;

132

wait_queue_head_t list_wait;

132

wait_queue_head_t list_wait;

133

u16 csum_size;

133

u16 csum_size;

134

struct list_head csum_list;

134

struct list_head csum_list;

135

atomic_t cancel_req;

135

atomic_t cancel_req;

136

int readonly;

136

int readonly;

137

int pages_per_rd_bio;

137

int pages_per_rd_bio;

138

u32 sectorsize;

138

u32 sectorsize;

139

u32 nodesize;

139

u32 nodesize;

140

u32 leafsize;

140

u32 leafsize;

141

142

int is_dev_replace;

142

int is_dev_replace;

143

struct scrub_wr_ctx wr_ctx;

143

struct scrub_wr_ctx wr_ctx;

144

145

/*

145

/*

146

* statistics

146

* statistics

147

*/

147

*/

148

struct btrfs_scrub_progress stat;

148

struct btrfs_scrub_progress stat;

149

spinlock_t stat_lock;

149

spinlock_t stat_lock;

150

};

150

};

151

152

struct scrub_fixup_nodatasum {

152

struct scrub_fixup_nodatasum {

153

struct scrub_ctx *sctx;

153

struct scrub_ctx *sctx;

154

struct btrfs_device *dev;

154

struct btrfs_device *dev;

155

u64 logical;

155

u64 logical;

156

struct btrfs_root *root;

156

struct btrfs_root *root;

157

struct btrfs_work work;

157

struct btrfs_work work;

158

int mirror_num;

158

int mirror_num;

159

};

159

};

160

161

struct scrub_nocow_inode {

161

struct scrub_nocow_inode {

162

u64 inum;

162

u64 inum;

163

u64 offset;

163

u64 offset;

164

u64 root;

164

u64 root;

165

struct list_head list;

165

struct list_head list;

166

};

166

};

167

168

struct scrub_copy_nocow_ctx {

168

struct scrub_copy_nocow_ctx {

169

struct scrub_ctx *sctx;

169

struct scrub_ctx *sctx;

170

u64 logical;

170

u64 logical;

171

u64 len;

171

u64 len;

172

int mirror_num;

172

int mirror_num;

173

u64 physical_for_dev_replace;

173

u64 physical_for_dev_replace;

174

struct list_head inodes;

174

struct list_head inodes;

175

struct btrfs_work work;

175

struct btrfs_work work;

176

};

176

};

177

178

struct scrub_warning {

178

struct scrub_warning {

179

struct btrfs_path *path;

179

struct btrfs_path *path;

180

u64 extent_item_size;

180

u64 extent_item_size;

181

char *scratch_buf;

181

char *scratch_buf;

182

char *msg_buf;

182

char *msg_buf;

183

const char *errstr;

183

const char *errstr;

184

sector_t sector;

184

sector_t sector;

185

u64 logical;

185

u64 logical;

186

struct btrfs_device *dev;

186

struct btrfs_device *dev;

187

int msg_bufsize;

187

int msg_bufsize;

188

int scratch_bufsize;

188

int scratch_bufsize;

189

};

189

};

190

191

192

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);

192

static void scrub_pending_bio_inc(struct scrub_ctx *sctx);

193

static void scrub_pending_bio_dec(struct scrub_ctx *sctx);

193

static void scrub_pending_bio_dec(struct scrub_ctx *sctx);

194

static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);

194

static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);

195

static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);

195

static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);

196

static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);

196

static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);

197

static int scrub_setup_recheck_block(struct scrub_ctx *sctx,

197

static int scrub_setup_recheck_block(struct scrub_ctx *sctx,

198

struct btrfs_fs_info *fs_info,

198

struct btrfs_fs_info *fs_info,

199

struct scrub_block *original_sblock,

199

struct scrub_block *original_sblock,

200

u64 length, u64 logical,

200

u64 length, u64 logical,

201

struct scrub_block *sblocks_for_recheck);

201

struct scrub_block *sblocks_for_recheck);

202

static void scrub_recheck_block(struct btrfs_fs_info *fs_info,

202

static void scrub_recheck_block(struct btrfs_fs_info *fs_info,

203

struct scrub_block *sblock, int is_metadata,

203

struct scrub_block *sblock, int is_metadata,

204

int have_csum, u8 *csum, u64 generation,

204

int have_csum, u8 *csum, u64 generation,

205

u16 csum_size);

205

u16 csum_size);

206

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,

206

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,

207

struct scrub_block *sblock,

207

struct scrub_block *sblock,

208

int is_metadata, int have_csum,

208

int is_metadata, int have_csum,

209

const u8 *csum, u64 generation,

209

const u8 *csum, u64 generation,

210

u16 csum_size);

210

u16 csum_size);

211

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,

211

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,

212

struct scrub_block *sblock_good,

212

struct scrub_block *sblock_good,

213

int force_write);

213

int force_write);

214

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,

214

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,

215

struct scrub_block *sblock_good,

215

struct scrub_block *sblock_good,

216

int page_num, int force_write);

216

int page_num, int force_write);

217

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);

217

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);

218

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,

218

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,

219

int page_num);

219

int page_num);

220

static int scrub_checksum_data(struct scrub_block *sblock);

220

static int scrub_checksum_data(struct scrub_block *sblock);

221

static int scrub_checksum_tree_block(struct scrub_block *sblock);

221

static int scrub_checksum_tree_block(struct scrub_block *sblock);

222

static int scrub_checksum_super(struct scrub_block *sblock);

222

static int scrub_checksum_super(struct scrub_block *sblock);

223

static void scrub_block_get(struct scrub_block *sblock);

223

static void scrub_block_get(struct scrub_block *sblock);

224

static void scrub_block_put(struct scrub_block *sblock);

224

static void scrub_block_put(struct scrub_block *sblock);

225

static void scrub_page_get(struct scrub_page *spage);

225

static void scrub_page_get(struct scrub_page *spage);

226

static void scrub_page_put(struct scrub_page *spage);

226

static void scrub_page_put(struct scrub_page *spage);

227

static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,

227

static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,

228

struct scrub_page *spage);

228

struct scrub_page *spage);

229

static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

229

static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

230

u64 physical, struct btrfs_device *dev, u64 flags,

230

u64 physical, struct btrfs_device *dev, u64 flags,

231

u64 gen, int mirror_num, u8 *csum, int force,

231

u64 gen, int mirror_num, u8 *csum, int force,

232

u64 physical_for_dev_replace);

232

u64 physical_for_dev_replace);

233

static void scrub_bio_end_io(struct bio *bio, int err);

233

static void scrub_bio_end_io(struct bio *bio, int err);

234

static void scrub_bio_end_io_worker(struct btrfs_work *work);

234

static void scrub_bio_end_io_worker(struct btrfs_work *work);

235

static void scrub_block_complete(struct scrub_block *sblock);

235

static void scrub_block_complete(struct scrub_block *sblock);

236

static void scrub_remap_extent(struct btrfs_fs_info *fs_info,

236

static void scrub_remap_extent(struct btrfs_fs_info *fs_info,

237

u64 extent_logical, u64 extent_len,

237

u64 extent_logical, u64 extent_len,

238

u64 *extent_physical,

238

u64 *extent_physical,

239

struct btrfs_device **extent_dev,

239

struct btrfs_device **extent_dev,

240

int *extent_mirror_num);

240

int *extent_mirror_num);

241

static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,

241

static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,

242

struct scrub_wr_ctx *wr_ctx,

242

struct scrub_wr_ctx *wr_ctx,

243

struct btrfs_fs_info *fs_info,

243

struct btrfs_fs_info *fs_info,

244

struct btrfs_device *dev,

244

struct btrfs_device *dev,

245

int is_dev_replace);

245

int is_dev_replace);

246

static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);

246

static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);

247

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,

247

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,

248

struct scrub_page *spage);

248

struct scrub_page *spage);

249

static void scrub_wr_submit(struct scrub_ctx *sctx);

249

static void scrub_wr_submit(struct scrub_ctx *sctx);

250

static void scrub_wr_bio_end_io(struct bio *bio, int err);

250

static void scrub_wr_bio_end_io(struct bio *bio, int err);

251

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);

251

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);

252

static int write_page_nocow(struct scrub_ctx *sctx,

252

static int write_page_nocow(struct scrub_ctx *sctx,

253

u64 physical_for_dev_replace, struct page *page);

253

u64 physical_for_dev_replace, struct page *page);

254

static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

254

static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,

255

struct scrub_copy_nocow_ctx *ctx);

255

struct scrub_copy_nocow_ctx *ctx);

256

static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

256

static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

257

int mirror_num, u64 physical_for_dev_replace);

257

int mirror_num, u64 physical_for_dev_replace);

258

static void copy_nocow_pages_worker(struct btrfs_work *work);

258

static void copy_nocow_pages_worker(struct btrfs_work *work);

259

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);

259

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);

260

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);

260

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);

261

262

263

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)

263

static void scrub_pending_bio_inc(struct scrub_ctx *sctx)

264

{

264

{

265

atomic_inc(&sctx->bios_in_flight);

265

atomic_inc(&sctx->bios_in_flight);

266

}

266

}

267

268

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)

268

static void scrub_pending_bio_dec(struct scrub_ctx *sctx)

269

{

269

{

270

atomic_dec(&sctx->bios_in_flight);

270

atomic_dec(&sctx->bios_in_flight);

271

wake_up(&sctx->list_wait);

271

wake_up(&sctx->list_wait);

272

}

272

}

273

274

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)

274

static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)

275

{

275

{

276

while (atomic_read(&fs_info->scrub_pause_req)) {

276

while (atomic_read(&fs_info->scrub_pause_req)) {

277

mutex_unlock(&fs_info->scrub_lock);

277

mutex_unlock(&fs_info->scrub_lock);

278

wait_event(fs_info->scrub_pause_wait,

278

wait_event(fs_info->scrub_pause_wait,

279

atomic_read(&fs_info->scrub_pause_req) == 0);

279

atomic_read(&fs_info->scrub_pause_req) == 0);

280

mutex_lock(&fs_info->scrub_lock);

280

mutex_lock(&fs_info->scrub_lock);

281

}

281

}

282

}

282

}

283

284

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)

284

static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)

285

{

285

{

286

atomic_inc(&fs_info->scrubs_paused);

286

atomic_inc(&fs_info->scrubs_paused);

287

wake_up(&fs_info->scrub_pause_wait);

287

wake_up(&fs_info->scrub_pause_wait);

288

289

mutex_lock(&fs_info->scrub_lock);

289

mutex_lock(&fs_info->scrub_lock);

290

__scrub_blocked_if_needed(fs_info);

290

__scrub_blocked_if_needed(fs_info);

291

atomic_dec(&fs_info->scrubs_paused);

291

atomic_dec(&fs_info->scrubs_paused);

292

mutex_unlock(&fs_info->scrub_lock);

292

mutex_unlock(&fs_info->scrub_lock);

293

294

wake_up(&fs_info->scrub_pause_wait);

294

wake_up(&fs_info->scrub_pause_wait);

295

}

295

}

296

297

/*

297

/*

298

* used for workers that require transaction commits (i.e., for the

298

* used for workers that require transaction commits (i.e., for the

299

* NOCOW case)

299

* NOCOW case)

300

*/

300

*/

301

static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)

301

static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)

302

{

302

{

303

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

303

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

304

305

/*

305

/*

306

* increment scrubs_running to prevent cancel requests from

306

* increment scrubs_running to prevent cancel requests from

307

* completing as long as a worker is running. we must also

307

* completing as long as a worker is running. we must also

308

* increment scrubs_paused to prevent deadlocking on pause

308

* increment scrubs_paused to prevent deadlocking on pause

309

* requests used for transactions commits (as the worker uses a

309

* requests used for transactions commits (as the worker uses a

310

* transaction context). it is safe to regard the worker

310

* transaction context). it is safe to regard the worker

311

* as paused for all matters practical. effectively, we only

311

* as paused for all matters practical. effectively, we only

312

* avoid cancellation requests from completing.

312

* avoid cancellation requests from completing.

313

*/

313

*/

314

mutex_lock(&fs_info->scrub_lock);

314

mutex_lock(&fs_info->scrub_lock);

315

atomic_inc(&fs_info->scrubs_running);

315

atomic_inc(&fs_info->scrubs_running);

316

atomic_inc(&fs_info->scrubs_paused);

316

atomic_inc(&fs_info->scrubs_paused);

317

mutex_unlock(&fs_info->scrub_lock);

317

mutex_unlock(&fs_info->scrub_lock);

318

319

/*

319

/*

320

* check if @scrubs_running=@scrubs_paused condition

320

* check if @scrubs_running=@scrubs_paused condition

321

* inside wait_event() is not an atomic operation.

321

* inside wait_event() is not an atomic operation.

322

* which means we may inc/dec @scrub_running/paused

322

* which means we may inc/dec @scrub_running/paused

323

* at any time. Let's wake up @scrub_pause_wait as

323

* at any time. Let's wake up @scrub_pause_wait as

324

* much as we can to let commit transaction blocked less.

324

* much as we can to let commit transaction blocked less.

325

*/

325

*/

326

wake_up(&fs_info->scrub_pause_wait);

326

wake_up(&fs_info->scrub_pause_wait);

327

328

atomic_inc(&sctx->workers_pending);

328

atomic_inc(&sctx->workers_pending);

329

}

329

}

330

331

/* used for workers that require transaction commits */

331

/* used for workers that require transaction commits */

332

static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)

332

static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)

333

{

333

{

334

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

334

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

335

336

/*

336

/*

337

* see scrub_pending_trans_workers_inc() why we're pretending

337

* see scrub_pending_trans_workers_inc() why we're pretending

338

* to be paused in the scrub counters

338

* to be paused in the scrub counters

339

*/

339

*/

340

mutex_lock(&fs_info->scrub_lock);

340

mutex_lock(&fs_info->scrub_lock);

341

atomic_dec(&fs_info->scrubs_running);

341

atomic_dec(&fs_info->scrubs_running);

342

atomic_dec(&fs_info->scrubs_paused);

342

atomic_dec(&fs_info->scrubs_paused);

343

mutex_unlock(&fs_info->scrub_lock);

343

mutex_unlock(&fs_info->scrub_lock);

344

atomic_dec(&sctx->workers_pending);

344

atomic_dec(&sctx->workers_pending);

345

wake_up(&fs_info->scrub_pause_wait);

345

wake_up(&fs_info->scrub_pause_wait);

346

wake_up(&sctx->list_wait);

346

wake_up(&sctx->list_wait);

347

}

347

}

348

349

static void scrub_free_csums(struct scrub_ctx *sctx)

349

static void scrub_free_csums(struct scrub_ctx *sctx)

350

{

350

{

351

while (!list_empty(&sctx->csum_list)) {

351

while (!list_empty(&sctx->csum_list)) {

352

struct btrfs_ordered_sum *sum;

352

struct btrfs_ordered_sum *sum;

353

sum = list_first_entry(&sctx->csum_list,

353

sum = list_first_entry(&sctx->csum_list,

354

struct btrfs_ordered_sum, list);

354

struct btrfs_ordered_sum, list);

355

list_del(&sum->list);

355

list_del(&sum->list);

356

kfree(sum);

356

kfree(sum);

357

}

357

}

358

}

358

}

359

360

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)

360

static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)

361

{

361

{

362

int i;

362

int i;

363

364

if (!sctx)

364

if (!sctx)

365

return;

365

return;

366

367

scrub_free_wr_ctx(&sctx->wr_ctx);

367

scrub_free_wr_ctx(&sctx->wr_ctx);

368

369

/* this can happen when scrub is cancelled */

369

/* this can happen when scrub is cancelled */

370

if (sctx->curr != -1) {

370

if (sctx->curr != -1) {

371

struct scrub_bio *sbio = sctx->bios[sctx->curr];

371

struct scrub_bio *sbio = sctx->bios[sctx->curr];

372

373

for (i = 0; i < sbio->page_count; i++) {

373

for (i = 0; i < sbio->page_count; i++) {

374

WARN_ON(!sbio->pagev[i]->page);

374

WARN_ON(!sbio->pagev[i]->page);

375

scrub_block_put(sbio->pagev[i]->sblock);

375

scrub_block_put(sbio->pagev[i]->sblock);

376

}

376

}

377

bio_put(sbio->bio);

377

bio_put(sbio->bio);

378

}

378

}

379

380

for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {

380

for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {

381

struct scrub_bio *sbio = sctx->bios[i];

381

struct scrub_bio *sbio = sctx->bios[i];

382

383

if (!sbio)

383

if (!sbio)

384

break;

384

break;

385

kfree(sbio);

385

kfree(sbio);

386

}

386

}

387

388

scrub_free_csums(sctx);

388

scrub_free_csums(sctx);

389

kfree(sctx);

389

kfree(sctx);

390

}

390

}

391

392

static noinline_for_stack

392

static noinline_for_stack

393

struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)

393

struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)

394

{

394

{

395

struct scrub_ctx *sctx;

395

struct scrub_ctx *sctx;

396

int i;

396

int i;

397

struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

397

struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;

398

int pages_per_rd_bio;

398

int pages_per_rd_bio;

399

int ret;

399

int ret;

400

401

/*

401

/*

402

* the setting of pages_per_rd_bio is correct for scrub but might

402

* the setting of pages_per_rd_bio is correct for scrub but might

403

* be wrong for the dev_replace code where we might read from

403

* be wrong for the dev_replace code where we might read from

404

* different devices in the initial huge bios. However, that

404

* different devices in the initial huge bios. However, that

405

* code is able to correctly handle the case when adding a page

405

* code is able to correctly handle the case when adding a page

406

* to a bio fails.

406

* to a bio fails.

407

*/

407

*/

408

if (dev->bdev)

408

if (dev->bdev)

409

pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,

409

pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,

410

bio_get_nr_vecs(dev->bdev));

410

bio_get_nr_vecs(dev->bdev));

411

else

411

else

412

pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;

412

pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;

413

sctx = kzalloc(sizeof(*sctx), GFP_NOFS);

413

sctx = kzalloc(sizeof(*sctx), GFP_NOFS);

414

if (!sctx)

414

if (!sctx)

415

goto nomem;

415

goto nomem;

416

sctx->is_dev_replace = is_dev_replace;

416

sctx->is_dev_replace = is_dev_replace;

417

sctx->pages_per_rd_bio = pages_per_rd_bio;

417

sctx->pages_per_rd_bio = pages_per_rd_bio;

418

sctx->curr = -1;

418

sctx->curr = -1;

419

sctx->dev_root = dev->dev_root;

419

sctx->dev_root = dev->dev_root;

420

for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {

420

for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {

421

struct scrub_bio *sbio;

421

struct scrub_bio *sbio;

422

423

sbio = kzalloc(sizeof(*sbio), GFP_NOFS);

423

sbio = kzalloc(sizeof(*sbio), GFP_NOFS);

424

if (!sbio)

424

if (!sbio)

425

goto nomem;

425

goto nomem;

426

sctx->bios[i] = sbio;

426

sctx->bios[i] = sbio;

427

428

sbio->index = i;

428

sbio->index = i;

429

sbio->sctx = sctx;

429

sbio->sctx = sctx;

430

sbio->page_count = 0;

430

sbio->page_count = 0;

431

btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,

431

btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,

432

NULL, NULL);

432

NULL, NULL);

433

434

if (i != SCRUB_BIOS_PER_SCTX - 1)

434

if (i != SCRUB_BIOS_PER_SCTX - 1)

435

sctx->bios[i]->next_free = i + 1;

435

sctx->bios[i]->next_free = i + 1;

436

else

436

else

437

sctx->bios[i]->next_free = -1;

437

sctx->bios[i]->next_free = -1;

438

}

438

}

439

sctx->first_free = 0;

439

sctx->first_free = 0;

440

sctx->nodesize = dev->dev_root->nodesize;

440

sctx->nodesize = dev->dev_root->nodesize;

441

sctx->leafsize = dev->dev_root->leafsize;

441

sctx->leafsize = dev->dev_root->leafsize;

442

sctx->sectorsize = dev->dev_root->sectorsize;

442

sctx->sectorsize = dev->dev_root->sectorsize;

443

atomic_set(&sctx->bios_in_flight, 0);

443

atomic_set(&sctx->bios_in_flight, 0);

444

atomic_set(&sctx->workers_pending, 0);

444

atomic_set(&sctx->workers_pending, 0);

445

atomic_set(&sctx->cancel_req, 0);

445

atomic_set(&sctx->cancel_req, 0);

446

sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);

446

sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);

447

INIT_LIST_HEAD(&sctx->csum_list);

447

INIT_LIST_HEAD(&sctx->csum_list);

448

449

spin_lock_init(&sctx->list_lock);

449

spin_lock_init(&sctx->list_lock);

450

spin_lock_init(&sctx->stat_lock);

450

spin_lock_init(&sctx->stat_lock);

451

init_waitqueue_head(&sctx->list_wait);

451

init_waitqueue_head(&sctx->list_wait);

452

453

ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,

453

ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,

454

fs_info->dev_replace.tgtdev, is_dev_replace);

454

fs_info->dev_replace.tgtdev, is_dev_replace);

455

if (ret) {

455

if (ret) {

456

scrub_free_ctx(sctx);

456

scrub_free_ctx(sctx);

457

return ERR_PTR(ret);

457

return ERR_PTR(ret);

458

}

458

}

459

return sctx;

459

return sctx;

460

461

nomem:

461

nomem:

462

scrub_free_ctx(sctx);

462

scrub_free_ctx(sctx);

463

return ERR_PTR(-ENOMEM);

463

return ERR_PTR(-ENOMEM);

464

}

464

}

465

466

/*
 * Per-inode callback handed to iterate_extent_inodes() by
 * scrub_print_warning(): emit one warning line for every path that resolves
 * to the inode referencing the damaged extent.
 *
 * @inum:     inode number referencing the extent
 * @offset:   offset passed in by the iterator; printed as-is and used to
 *            clamp the reported length
 * @root:     objectid of the root that holds the inode
 * @warn_ctx: the struct scrub_warning filled in by the caller
 *
 * Always returns 0.  If the root, the inode item or the paths cannot be
 * looked up, a single "path resolving failed" line carrying the error code
 * is printed instead of the per-path lines.
 */
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
				     void *warn_ctx)
{
	struct scrub_warning *swarn = warn_ctx;
	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
	struct btrfs_path *path = swarn->path;
	struct inode_fs_paths *ipath = NULL;
	struct btrfs_inode_item *item;
	struct btrfs_root *local_root;
	struct extent_buffer *leaf;
	struct btrfs_key root_key;
	u64 isize;
	u32 nlink;
	int ret;
	int i;

	/* look up the root the inode lives in */
	root_key.objectid = root;
	root_key.type = BTRFS_ROOT_ITEM_KEY;
	root_key.offset = (u64)-1;
	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
	if (IS_ERR(local_root)) {
		ret = PTR_ERR(local_root);
		goto err;
	}

	ret = inode_item_info(inum, 0, local_root, path);
	if (ret) {
		btrfs_release_path(path);
		goto err;
	}

	/* copy size and link count out of the leaf before dropping the path */
	leaf = path->nodes[0];
	item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
	isize = btrfs_inode_size(leaf, item);
	nlink = btrfs_inode_nlink(leaf, item);
	btrfs_release_path(path);

	ipath = init_ipath(4096, local_root, path);
	if (IS_ERR(ipath)) {
		ret = PTR_ERR(ipath);
		/* the shared exit below must not see an ERR_PTR */
		ipath = NULL;
		goto err;
	}

	ret = paths_from_inode(inum, ipath);
	if (ret < 0)
		goto err;

	/*
	 * we deliberately ignore the bit ipath might have been too small to
	 * hold all of the paths here
	 */
	for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
			"length %llu, links %u (path: %s)\n", swarn->errstr,
			swarn->logical, rcu_str_deref(swarn->dev->name),
			(unsigned long long)swarn->sector, root, inum, offset,
			min(isize - offset, (u64)PAGE_SIZE), nlink,
			(char *)(unsigned long)ipath->fspath->val[i]);
	}
	goto out;

err:
	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
		"resolving failed with ret=%d\n", swarn->errstr,
		swarn->logical, rcu_str_deref(swarn->dev->name),
		(unsigned long long)swarn->sector, root, inum, offset, ret);

out:
	free_ipath(ipath);
	return 0;
}

540

541

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)

541

static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)

542

{

542

{

543

struct btrfs_device *dev;

543

struct btrfs_device *dev;

544

struct btrfs_fs_info *fs_info;

544

struct btrfs_fs_info *fs_info;

545

struct btrfs_path *path;

545

struct btrfs_path *path;

546

struct btrfs_key found_key;

546

struct btrfs_key found_key;

547

struct extent_buffer *eb;

547

struct extent_buffer *eb;

548

struct btrfs_extent_item *ei;

548

struct btrfs_extent_item *ei;

549

struct scrub_warning swarn;

549

struct scrub_warning swarn;

550

unsigned long ptr = 0;

550

unsigned long ptr = 0;

551

u64 extent_item_pos;

551

u64 extent_item_pos;

552

u64 flags = 0;

552

u64 flags = 0;

553

u64 ref_root;

553

u64 ref_root;

554

u32 item_size;

554

u32 item_size;

555

u8 ref_level;

555

u8 ref_level;

556

const int bufsize = 4096;

556

const int bufsize = 4096;

557

int ret;

557

int ret;

558

559

WARN_ON(sblock->page_count < 1);

559

WARN_ON(sblock->page_count < 1);

560

dev = sblock->pagev[0]->dev;

560

dev = sblock->pagev[0]->dev;

561

fs_info = sblock->sctx->dev_root->fs_info;

561

fs_info = sblock->sctx->dev_root->fs_info;

562

563

path = btrfs_alloc_path();

563

path = btrfs_alloc_path();

564

565

swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);

565

swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);

566

swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);

566

swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);

567

swarn.sector = (sblock->pagev[0]->physical) >> 9;

567

swarn.sector = (sblock->pagev[0]->physical) >> 9;

568

swarn.logical = sblock->pagev[0]->logical;

568

swarn.logical = sblock->pagev[0]->logical;

569

swarn.errstr = errstr;

569

swarn.errstr = errstr;

570

swarn.dev = NULL;

570

swarn.dev = NULL;

571

swarn.msg_bufsize = bufsize;

571

swarn.msg_bufsize = bufsize;

572

swarn.scratch_bufsize = bufsize;

572

swarn.scratch_bufsize = bufsize;

573

574

if (!path || !swarn.scratch_buf || !swarn.msg_buf)

574

if (!path || !swarn.scratch_buf || !swarn.msg_buf)

575

goto out;

575

goto out;

576

577

ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,

577

ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,

578

&flags);

578

&flags);

579

if (ret < 0)

579

if (ret < 0)

580

goto out;

580

goto out;

581

582

extent_item_pos = swarn.logical - found_key.objectid;

582

extent_item_pos = swarn.logical - found_key.objectid;

583

swarn.extent_item_size = found_key.offset;

583

swarn.extent_item_size = found_key.offset;

584

585

eb = path->nodes[0];

585

eb = path->nodes[0];

586

ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);

586

ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);

587

item_size = btrfs_item_size_nr(eb, path->slots[0]);

587

item_size = btrfs_item_size_nr(eb, path->slots[0]);

588

589

if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {

589

if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {

590

do {

590

do {

591

ret = tree_backref_for_extent(&ptr, eb, ei, item_size,

591

ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,

592

&ref_root, &ref_level);

592

item_size, &ref_root,

593

&ref_level);

593

printk_in_rcu(KERN_WARNING

594

printk_in_rcu(KERN_WARNING

594

"BTRFS: %s at logical %llu on dev %s, "

595

"BTRFS: %s at logical %llu on dev %s, "

595

"sector %llu: metadata %s (level %d) in tree "

596

"sector %llu: metadata %s (level %d) in tree "

596

"%llu\n", errstr, swarn.logical,

597

"%llu\n", errstr, swarn.logical,

597

rcu_str_deref(dev->name),

598

rcu_str_deref(dev->name),

598

(unsigned long long)swarn.sector,

599

(unsigned long long)swarn.sector,

599

ref_level ? "node" : "leaf",

600

ref_level ? "node" : "leaf",

600

ret < 0 ? -1 : ref_level,

601

ret < 0 ? -1 : ref_level,

601

ret < 0 ? -1 : ref_root);

602

ret < 0 ? -1 : ref_root);

602

} while (ret != 1);

603

} while (ret != 1);

603

btrfs_release_path(path);

604

btrfs_release_path(path);

604

} else {

605

} else {

605

btrfs_release_path(path);

606

btrfs_release_path(path);

606

swarn.path = path;

607

swarn.path = path;

607

swarn.dev = dev;

608

swarn.dev = dev;

608

iterate_extent_inodes(fs_info, found_key.objectid,

609

iterate_extent_inodes(fs_info, found_key.objectid,

609

extent_item_pos, 1,

610

extent_item_pos, 1,

610

scrub_print_warning_inode, &swarn);

611

scrub_print_warning_inode, &swarn);

611

}

612

}

612

613

out:

614

out:

614

btrfs_free_path(path);

615

btrfs_free_path(path);

615

kfree(swarn.scratch_buf);

616

kfree(swarn.scratch_buf);

616

kfree(swarn.msg_buf);

617

kfree(swarn.msg_buf);

617

}

618

}

618

619

/*
 * Per-inode callback passed to iterate_inodes_from_logical() by
 * scrub_fixup_nodatasum(): try to repair the bad sector through the page of
 * one inode that references it.
 *
 * @inum:      inode number referencing the damaged extent
 * @offset:    file offset of the damaged page within that inode
 * @root:      objectid of the root that holds the inode
 * @fixup_ctx: the struct scrub_fixup_nodatasum describing the failure
 *
 * Returns 1 when the sector was corrected (which also tells the iterator to
 * stop), a negative errno when a step failed, and -EIO when nothing failed
 * but the sector was not corrected either.
 */
static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
{
	struct page *page = NULL;
	unsigned long index;
	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
	int ret;
	int corrected = 0;
	struct btrfs_key key;
	struct inode *inode = NULL;
	struct btrfs_fs_info *fs_info;
	u64 end = offset + PAGE_SIZE - 1;
	struct btrfs_root *local_root;
	int srcu_index;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	fs_info = fixup->root->fs_info;
	/* subvol_srcu is held across the root lookup and btrfs_iget() */
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	index = offset >> PAGE_CACHE_SHIFT;

	/* find_or_create_page() hands the page back locked */
	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
	if (!page) {
		ret = -ENOMEM;
		goto out;
	}

	if (PageUptodate(page)) {
		if (PageDirty(page)) {
			/*
			 * we need to write the data to the defect sector. the
			 * data that was in that sector is not in memory,
			 * because the page was modified. we must not write the
			 * modified page to that sector.
			 *
			 * TODO: what could be done here: wait for the delalloc
			 *       runner to write out that page (might involve
			 *       COW) and see whether the sector is still
			 *       referenced afterwards.
			 *
			 * For the meantime, we'll treat this error
			 * incorrectable, although there is a chance that a
			 * later scrub will find the bad sector again and that
			 * there's no dirty page in memory, then.
			 */
			ret = -EIO;
			/* drop the page lock, or the page stays locked */
			unlock_page(page);
			goto out;
		}
		fs_info = BTRFS_I(inode)->root->fs_info;
		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
					fixup->logical, page,
					fixup->mirror_num);
		unlock_page(page);
		corrected = !ret;
	} else {
		/*
		 * we need to get good data first. the general readpage path
		 * will call repair_io_failure for us, we just have to make
		 * sure we read the bad mirror.
		 */
		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
					EXTENT_DAMAGED, GFP_NOFS);
		if (ret) {
			/* set_extent_bits should give proper error */
			WARN_ON(ret > 0);
			if (ret > 0)
				ret = -EFAULT;
			/* no read was submitted, so unlock the page here */
			unlock_page(page);
			goto out;
		}

		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
						btrfs_get_extent,
						fixup->mirror_num);
		/* wait for the read submitted above to unlock the page */
		wait_on_page_locked(page);

		/* EXTENT_DAMAGED still set means the range was not repaired */
		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
						end, EXTENT_DAMAGED, 0, NULL);
		if (!corrected)
			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
						EXTENT_DAMAGED, GFP_NOFS);
	}

out:
	if (page)
		put_page(page);

	/* inode is always a valid pointer here; failures returned earlier */
	iput(inode);

	if (ret < 0)
		return ret;

	if (ret == 0 && corrected) {
		/*
		 * we only need to call readpage for one of the inodes belonging
		 * to this extent. so make iterate_extent_inodes stop
		 */
		return 1;
	}

	return -EIO;
}

736

737

static void scrub_fixup_nodatasum(struct btrfs_work *work)

738

static void scrub_fixup_nodatasum(struct btrfs_work *work)

738

{

739

{

739

int ret;

740

int ret;

740

struct scrub_fixup_nodatasum *fixup;

741

struct scrub_fixup_nodatasum *fixup;

741

struct scrub_ctx *sctx;

742

struct scrub_ctx *sctx;

742

struct btrfs_trans_handle *trans = NULL;

743

struct btrfs_trans_handle *trans = NULL;

743

struct btrfs_path *path;

744

struct btrfs_path *path;

744

int uncorrectable = 0;

745

int uncorrectable = 0;

745

746

fixup = container_of(work, struct scrub_fixup_nodatasum, work);

747

fixup = container_of(work, struct scrub_fixup_nodatasum, work);

747

sctx = fixup->sctx;

748

sctx = fixup->sctx;

748

749

path = btrfs_alloc_path();

750

path = btrfs_alloc_path();

750

if (!path) {

751

if (!path) {

751

spin_lock(&sctx->stat_lock);

752

spin_lock(&sctx->stat_lock);

752

++sctx->stat.malloc_errors;

753

++sctx->stat.malloc_errors;

753

spin_unlock(&sctx->stat_lock);

754

spin_unlock(&sctx->stat_lock);

754

uncorrectable = 1;

755

uncorrectable = 1;

755

goto out;

756

goto out;

756

}

757

}

757

758

trans = btrfs_join_transaction(fixup->root);

759

trans = btrfs_join_transaction(fixup->root);

759

if (IS_ERR(trans)) {

760

if (IS_ERR(trans)) {

760

uncorrectable = 1;

761

uncorrectable = 1;

761

goto out;

762

goto out;

762

}

763

}

763

764

/*

765

/*

765

* the idea is to trigger a regular read through the standard path. we

766

* the idea is to trigger a regular read through the standard path. we

766

* read a page from the (failed) logical address by specifying the

767

* read a page from the (failed) logical address by specifying the

767

* corresponding copynum of the failed sector. thus, that readpage is

768

* corresponding copynum of the failed sector. thus, that readpage is

768

* expected to fail.

769

* expected to fail.

769

* that is the point where on-the-fly error correction will kick in

770

* that is the point where on-the-fly error correction will kick in

770

* (once it's finished) and rewrite the failed sector if a good copy

771

* (once it's finished) and rewrite the failed sector if a good copy

771

* can be found.

772

* can be found.

772

*/

773

*/

773

ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,

774

ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,

774

path, scrub_fixup_readpage,

775

path, scrub_fixup_readpage,

775

fixup);

776

fixup);

776

if (ret < 0) {

777

if (ret < 0) {

777

uncorrectable = 1;

778

uncorrectable = 1;

778

goto out;

779

goto out;

779

}

780

}

780

WARN_ON(ret != 1);

781

WARN_ON(ret != 1);

781

782

spin_lock(&sctx->stat_lock);

783

spin_lock(&sctx->stat_lock);

783

++sctx->stat.corrected_errors;

784

++sctx->stat.corrected_errors;

784

spin_unlock(&sctx->stat_lock);

785

spin_unlock(&sctx->stat_lock);

785

786

out:

787

out:

787

if (trans && !IS_ERR(trans))

788

if (trans && !IS_ERR(trans))

788

btrfs_end_transaction(trans, fixup->root);

789

btrfs_end_transaction(trans, fixup->root);

789

if (uncorrectable) {

790

if (uncorrectable) {

790

spin_lock(&sctx->stat_lock);

791

spin_lock(&sctx->stat_lock);

791

++sctx->stat.uncorrectable_errors;

792

++sctx->stat.uncorrectable_errors;

792

spin_unlock(&sctx->stat_lock);

793

spin_unlock(&sctx->stat_lock);

793

btrfs_dev_replace_stats_inc(

794

btrfs_dev_replace_stats_inc(

794

&sctx->dev_root->fs_info->dev_replace.

795

&sctx->dev_root->fs_info->dev_replace.

795

num_uncorrectable_read_errors);

796

num_uncorrectable_read_errors);

796

printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "

797

printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "

797

"unable to fixup (nodatasum) error at logical %llu on dev %s\n",

798

"unable to fixup (nodatasum) error at logical %llu on dev %s\n",

798

fixup->logical, rcu_str_deref(fixup->dev->name));

799

fixup->logical, rcu_str_deref(fixup->dev->name));

799

}

800

}

800

801

btrfs_free_path(path);

802

btrfs_free_path(path);

802

kfree(fixup);

803

kfree(fixup);

803

804

scrub_pending_trans_workers_dec(sctx);

805

scrub_pending_trans_workers_dec(sctx);

805

}

806

}

806

807

/*

808

/*

808

* scrub_handle_errored_block gets called when either verification of the

809

* scrub_handle_errored_block gets called when either verification of the

809

* pages failed or the bio failed to read, e.g. with EIO. In the latter

810

* pages failed or the bio failed to read, e.g. with EIO. In the latter

810

* case, this function handles all pages in the bio, even though only one

811

* case, this function handles all pages in the bio, even though only one

811

* may be bad.

812

* may be bad.

812

* The goal of this function is to repair the errored block by using the

813

* The goal of this function is to repair the errored block by using the

813

* contents of one of the mirrors.

814

* contents of one of the mirrors.

814

*/

815

*/

815

static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)

816

static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)

816

{

817

{

817

struct scrub_ctx *sctx = sblock_to_check->sctx;

818

struct scrub_ctx *sctx = sblock_to_check->sctx;

818

struct btrfs_device *dev;

819

struct btrfs_device *dev;

819

struct btrfs_fs_info *fs_info;

820

struct btrfs_fs_info *fs_info;

820

u64 length;

821

u64 length;

821

u64 logical;

822

u64 logical;

822

u64 generation;

823

u64 generation;

823

unsigned int failed_mirror_index;

824

unsigned int failed_mirror_index;

824

unsigned int is_metadata;

825

unsigned int is_metadata;

825

unsigned int have_csum;

826

unsigned int have_csum;

826

u8 *csum;

827

u8 *csum;

827

struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */

828

struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */

828

struct scrub_block *sblock_bad;

829

struct scrub_block *sblock_bad;

829

int ret;

830

int ret;

830

int mirror_index;

831

int mirror_index;

831

int page_num;

832

int page_num;

832

int success;

833

int success;

833

static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,

834

static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,

834

DEFAULT_RATELIMIT_BURST);

835

DEFAULT_RATELIMIT_BURST);

835

836

BUG_ON(sblock_to_check->page_count < 1);

837

BUG_ON(sblock_to_check->page_count < 1);

837

fs_info = sctx->dev_root->fs_info;

838

fs_info = sctx->dev_root->fs_info;

838

if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {

839

if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {

839

/*

840

/*

840

* if we find an error in a super block, we just report it.

841

* if we find an error in a super block, we just report it.

841

* They will get written with the next transaction commit

842

* They will get written with the next transaction commit

842

* anyway

843

* anyway

843

*/

844

*/

844

spin_lock(&sctx->stat_lock);

845

spin_lock(&sctx->stat_lock);

845

++sctx->stat.super_errors;

846

++sctx->stat.super_errors;

846

spin_unlock(&sctx->stat_lock);

847

spin_unlock(&sctx->stat_lock);

847

return 0;

848

return 0;

848

}

849

}

849

length = sblock_to_check->page_count * PAGE_SIZE;

850

length = sblock_to_check->page_count * PAGE_SIZE;

850

logical = sblock_to_check->pagev[0]->logical;

851

logical = sblock_to_check->pagev[0]->logical;

851

generation = sblock_to_check->pagev[0]->generation;

852

generation = sblock_to_check->pagev[0]->generation;

852

BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);

853

BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);

853

failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;

854

failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;

854

is_metadata = !(sblock_to_check->pagev[0]->flags &

855

is_metadata = !(sblock_to_check->pagev[0]->flags &

855

BTRFS_EXTENT_FLAG_DATA);

856

BTRFS_EXTENT_FLAG_DATA);

856

have_csum = sblock_to_check->pagev[0]->have_csum;

857

have_csum = sblock_to_check->pagev[0]->have_csum;

857

csum = sblock_to_check->pagev[0]->csum;

858

csum = sblock_to_check->pagev[0]->csum;

858

dev = sblock_to_check->pagev[0]->dev;

859

dev = sblock_to_check->pagev[0]->dev;

859

860

if (sctx->is_dev_replace && !is_metadata && !have_csum) {

861

if (sctx->is_dev_replace && !is_metadata && !have_csum) {

861

sblocks_for_recheck = NULL;

862

sblocks_for_recheck = NULL;

862

goto nodatasum_case;

863

goto nodatasum_case;

863

}

864

}

864

865

/*

866

/*

866

* read all mirrors one after the other. This includes to

867

* read all mirrors one after the other. This includes to

867

* re-read the extent or metadata block that failed (that was

868

* re-read the extent or metadata block that failed (that was

868

* the cause that this fixup code is called) another time,

869

* the cause that this fixup code is called) another time,

869

* page by page this time in order to know which pages

870

* page by page this time in order to know which pages

870

* caused I/O errors and which ones are good (for all mirrors).

871

* caused I/O errors and which ones are good (for all mirrors).

871

* It is the goal to handle the situation when more than one

872

* It is the goal to handle the situation when more than one

872

* mirror contains I/O errors, but the errors do not

873

* mirror contains I/O errors, but the errors do not

873

* overlap, i.e. the data can be repaired by selecting the

874

* overlap, i.e. the data can be repaired by selecting the

874

* pages from those mirrors without I/O error on the

875

* pages from those mirrors without I/O error on the

875

* particular pages. One example (with blocks >= 2 * PAGE_SIZE)

876

* particular pages. One example (with blocks >= 2 * PAGE_SIZE)

876

* would be that mirror #1 has an I/O error on the first page,

877

* would be that mirror #1 has an I/O error on the first page,

877

* the second page is good, and mirror #2 has an I/O error on

878

* the second page is good, and mirror #2 has an I/O error on

878

* the second page, but the first page is good.

879

* the second page, but the first page is good.

879

* Then the first page of the first mirror can be repaired by

880

* Then the first page of the first mirror can be repaired by

880

* taking the first page of the second mirror, and the

881

* taking the first page of the second mirror, and the

881

* second page of the second mirror can be repaired by

882

* second page of the second mirror can be repaired by

882

* copying the contents of the 2nd page of the 1st mirror.

883

* copying the contents of the 2nd page of the 1st mirror.

883

* One more note: if the pages of one mirror contain I/O

884

* One more note: if the pages of one mirror contain I/O

884

* errors, the checksum cannot be verified. In order to get

885

* errors, the checksum cannot be verified. In order to get

885

* the best data for repairing, the first attempt is to find

886

* the best data for repairing, the first attempt is to find

886

* a mirror without I/O errors and with a validated checksum.

887

* a mirror without I/O errors and with a validated checksum.

887

* Only if this is not possible, the pages are picked from

888

* Only if this is not possible, the pages are picked from

888

* mirrors with I/O errors without considering the checksum.

889

* mirrors with I/O errors without considering the checksum.

889

* If the latter is the case, at the end, the checksum of the

890

* If the latter is the case, at the end, the checksum of the

890

* repaired area is verified in order to correctly maintain

891

* repaired area is verified in order to correctly maintain

891

* the statistics.

892

* the statistics.

892

*/

893

*/

893

894

sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *

895

sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *

895

sizeof(*sblocks_for_recheck),

896

sizeof(*sblocks_for_recheck),

896

GFP_NOFS);

897

GFP_NOFS);

897

if (!sblocks_for_recheck) {

898

if (!sblocks_for_recheck) {

898

spin_lock(&sctx->stat_lock);

899

spin_lock(&sctx->stat_lock);

899

sctx->stat.malloc_errors++;

900

sctx->stat.malloc_errors++;

900

sctx->stat.read_errors++;

901

sctx->stat.read_errors++;

901

sctx->stat.uncorrectable_errors++;

902

sctx->stat.uncorrectable_errors++;

902

spin_unlock(&sctx->stat_lock);

903

spin_unlock(&sctx->stat_lock);

903

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

904

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

904

goto out;

905

goto out;

905

}

906

}

906

907

/* setup the context, map the logical blocks and alloc the pages */

908

/* setup the context, map the logical blocks and alloc the pages */

908

ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,

909

ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,

909

logical, sblocks_for_recheck);

910

logical, sblocks_for_recheck);

910

if (ret) {

911

if (ret) {

911

spin_lock(&sctx->stat_lock);

912

spin_lock(&sctx->stat_lock);

912

sctx->stat.read_errors++;

913

sctx->stat.read_errors++;

913

sctx->stat.uncorrectable_errors++;

914

sctx->stat.uncorrectable_errors++;

914

spin_unlock(&sctx->stat_lock);

915

spin_unlock(&sctx->stat_lock);

915

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

916

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

916

goto out;

917

goto out;

917

}

918

}

918

BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);

919

BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);

919

sblock_bad = sblocks_for_recheck + failed_mirror_index;

920

sblock_bad = sblocks_for_recheck + failed_mirror_index;

920

921

/* build and submit the bios for the failed mirror, check checksums */

922

/* build and submit the bios for the failed mirror, check checksums */

922

scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,

923

scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,

923

csum, generation, sctx->csum_size);

924

csum, generation, sctx->csum_size);

924

925

if (!sblock_bad->header_error && !sblock_bad->checksum_error &&

926

if (!sblock_bad->header_error && !sblock_bad->checksum_error &&

926

sblock_bad->no_io_error_seen) {

927

sblock_bad->no_io_error_seen) {

927

/*

928

/*

928

* the error disappeared after reading page by page, or

929

* the error disappeared after reading page by page, or

929

* the area was part of a huge bio and other parts of the

930

* the area was part of a huge bio and other parts of the

930

* bio caused I/O errors, or the block layer merged several

931

* bio caused I/O errors, or the block layer merged several

931

* read requests into one and the error is caused by a

932

* read requests into one and the error is caused by a

932

* different bio (usually one of the two latter cases is

933

* different bio (usually one of the two latter cases is

933

* the cause)

934

* the cause)

934

*/

935

*/

935

spin_lock(&sctx->stat_lock);

936

spin_lock(&sctx->stat_lock);

936

sctx->stat.unverified_errors++;

937

sctx->stat.unverified_errors++;

937

spin_unlock(&sctx->stat_lock);

938

spin_unlock(&sctx->stat_lock);

938

939

if (sctx->is_dev_replace)

940

if (sctx->is_dev_replace)

940

scrub_write_block_to_dev_replace(sblock_bad);

941

scrub_write_block_to_dev_replace(sblock_bad);

941

goto out;

942

goto out;

942

}

943

}

943

944

if (!sblock_bad->no_io_error_seen) {

945

if (!sblock_bad->no_io_error_seen) {

945

spin_lock(&sctx->stat_lock);

946

spin_lock(&sctx->stat_lock);

946

sctx->stat.read_errors++;

947

sctx->stat.read_errors++;

947

spin_unlock(&sctx->stat_lock);

948

spin_unlock(&sctx->stat_lock);

948

if (__ratelimit(&_rs))

949

if (__ratelimit(&_rs))

949

scrub_print_warning("i/o error", sblock_to_check);

950

scrub_print_warning("i/o error", sblock_to_check);

950

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

951

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);

951

} else if (sblock_bad->checksum_error) {

952

} else if (sblock_bad->checksum_error) {

952

spin_lock(&sctx->stat_lock);

953

spin_lock(&sctx->stat_lock);

953

sctx->stat.csum_errors++;

954

sctx->stat.csum_errors++;

954

spin_unlock(&sctx->stat_lock);

955

spin_unlock(&sctx->stat_lock);

955

if (__ratelimit(&_rs))

956

if (__ratelimit(&_rs))

956

scrub_print_warning("checksum error", sblock_to_check);

957

scrub_print_warning("checksum error", sblock_to_check);

957

btrfs_dev_stat_inc_and_print(dev,

958

btrfs_dev_stat_inc_and_print(dev,

958

BTRFS_DEV_STAT_CORRUPTION_ERRS);

959

BTRFS_DEV_STAT_CORRUPTION_ERRS);

959

} else if (sblock_bad->header_error) {

960

} else if (sblock_bad->header_error) {

960

spin_lock(&sctx->stat_lock);

961

spin_lock(&sctx->stat_lock);

961

sctx->stat.verify_errors++;

962

sctx->stat.verify_errors++;

962

spin_unlock(&sctx->stat_lock);

963

spin_unlock(&sctx->stat_lock);

963

if (__ratelimit(&_rs))

964

if (__ratelimit(&_rs))

964

scrub_print_warning("checksum/header error",

965

scrub_print_warning("checksum/header error",

965

sblock_to_check);

966

sblock_to_check);

966

if (sblock_bad->generation_error)

967

if (sblock_bad->generation_error)

967

btrfs_dev_stat_inc_and_print(dev,

968

btrfs_dev_stat_inc_and_print(dev,

968

BTRFS_DEV_STAT_GENERATION_ERRS);

969

BTRFS_DEV_STAT_GENERATION_ERRS);

969

else

970

else

970

btrfs_dev_stat_inc_and_print(dev,

971

btrfs_dev_stat_inc_and_print(dev,

971

BTRFS_DEV_STAT_CORRUPTION_ERRS);

972

BTRFS_DEV_STAT_CORRUPTION_ERRS);

972

}

973

}

973

974

if (sctx->readonly) {

975

if (sctx->readonly) {

975

ASSERT(!sctx->is_dev_replace);

976

ASSERT(!sctx->is_dev_replace);

976

goto out;

977

goto out;

977

}

978

}

978

979

if (!is_metadata && !have_csum) {

980

if (!is_metadata && !have_csum) {

980

struct scrub_fixup_nodatasum *fixup_nodatasum;

981

struct scrub_fixup_nodatasum *fixup_nodatasum;

981

982

nodatasum_case:

983

nodatasum_case:

983

WARN_ON(sctx->is_dev_replace);

984

WARN_ON(sctx->is_dev_replace);

984

985

/*

986

/*

986

* !is_metadata and !have_csum, this means that the data

987

* !is_metadata and !have_csum, this means that the data

987

* might not be COW'ed, that it might be modified

988

* might not be COW'ed, that it might be modified

988

* concurrently. The general strategy to work on the

989

* concurrently. The general strategy to work on the

989

* commit root does not help in the case when COW is not

990

* commit root does not help in the case when COW is not

990

* used.

991

* used.

991

*/

992

*/

992

fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);

993

fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);

993

if (!fixup_nodatasum)

994

if (!fixup_nodatasum)

994

goto did_not_correct_error;

995

goto did_not_correct_error;

995

fixup_nodatasum->sctx = sctx;

996

fixup_nodatasum->sctx = sctx;

996

fixup_nodatasum->dev = dev;

997

fixup_nodatasum->dev = dev;

997

fixup_nodatasum->logical = logical;

998

fixup_nodatasum->logical = logical;

998

fixup_nodatasum->root = fs_info->extent_root;

999

fixup_nodatasum->root = fs_info->extent_root;

999

fixup_nodatasum->mirror_num = failed_mirror_index + 1;

1000

fixup_nodatasum->mirror_num = failed_mirror_index + 1;

1000

scrub_pending_trans_workers_inc(sctx);

1001

scrub_pending_trans_workers_inc(sctx);

1001

btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,

1002

btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,

1002

NULL, NULL);

1003

NULL, NULL);

1003

btrfs_queue_work(fs_info->scrub_workers,

1004

btrfs_queue_work(fs_info->scrub_workers,

1004

&fixup_nodatasum->work);

1005

&fixup_nodatasum->work);

1005

goto out;

1006

goto out;

1006

}

1007

}

1007

1008

/*

1009

/*

1009

* now build and submit the bios for the other mirrors, check

1010

* now build and submit the bios for the other mirrors, check

1010

* checksums.

1011

* checksums.

1011

* First try to pick the mirror which is completely without I/O

1012

* First try to pick the mirror which is completely without I/O

1012

* errors and also does not have a checksum error.

1013

* errors and also does not have a checksum error.

1013

* If one is found, and if a checksum is present, the full block

1014

* If one is found, and if a checksum is present, the full block

1014

* that is known to contain an error is rewritten. Afterwards

1015

* that is known to contain an error is rewritten. Afterwards

1015

* the block is known to be corrected.

1016

* the block is known to be corrected.

1016

* If a mirror is found which is completely correct, and no

1017

* If a mirror is found which is completely correct, and no

1017

* checksum is present, only those pages are rewritten that had

1018

* checksum is present, only those pages are rewritten that had

1018

* an I/O error in the block to be repaired, since it cannot be

1019

* an I/O error in the block to be repaired, since it cannot be

1019

* determined, which copy of the other pages is better (and it

1020

* determined, which copy of the other pages is better (and it

1020

* could happen otherwise that a correct page would be

1021

* could happen otherwise that a correct page would be

1021

* overwritten by a bad one).

1022

* overwritten by a bad one).

1022

*/

1023

*/

1023

for (mirror_index = 0;

1024

for (mirror_index = 0;

1024

mirror_index < BTRFS_MAX_MIRRORS &&

1025

mirror_index < BTRFS_MAX_MIRRORS &&

1025

sblocks_for_recheck[mirror_index].page_count > 0;

1026

sblocks_for_recheck[mirror_index].page_count > 0;

1026

mirror_index++) {

1027

mirror_index++) {

1027

struct scrub_block *sblock_other;

1028

struct scrub_block *sblock_other;

1028

1029

if (mirror_index == failed_mirror_index)

1030

if (mirror_index == failed_mirror_index)

1030

continue;

1031

continue;

1031

sblock_other = sblocks_for_recheck + mirror_index;

1032

sblock_other = sblocks_for_recheck + mirror_index;

1032

1033

/* build and submit the bios, check checksums */

1034

/* build and submit the bios, check checksums */

1034

scrub_recheck_block(fs_info, sblock_other, is_metadata,

1035

scrub_recheck_block(fs_info, sblock_other, is_metadata,

1035

have_csum, csum, generation,

1036

have_csum, csum, generation,

1036

sctx->csum_size);

1037

sctx->csum_size);

1037

1038

if (!sblock_other->header_error &&

1039

if (!sblock_other->header_error &&

1039

!sblock_other->checksum_error &&

1040

!sblock_other->checksum_error &&

1040

sblock_other->no_io_error_seen) {

1041

sblock_other->no_io_error_seen) {

1041

if (sctx->is_dev_replace) {

1042

if (sctx->is_dev_replace) {

1042

scrub_write_block_to_dev_replace(sblock_other);

1043

scrub_write_block_to_dev_replace(sblock_other);

1043

} else {

1044

} else {

1044

int force_write = is_metadata || have_csum;

1045

int force_write = is_metadata || have_csum;

1045

1046

ret = scrub_repair_block_from_good_copy(

1047

ret = scrub_repair_block_from_good_copy(

1047

sblock_bad, sblock_other,

1048

sblock_bad, sblock_other,

1048

force_write);

1049

force_write);

1049

}

1050

}

1050

if (0 == ret)

1051

if (0 == ret)

1051

goto corrected_error;

1052

goto corrected_error;

1052

}

1053

}

1053

}

1054

}

1054

1055

/*

1056

/*

1056

* for dev_replace, pick good pages and write to the target device.

1057

* for dev_replace, pick good pages and write to the target device.

1057

*/

1058

*/

1058

if (sctx->is_dev_replace) {

1059

if (sctx->is_dev_replace) {

1059

success = 1;

1060

success = 1;

1060

for (page_num = 0; page_num < sblock_bad->page_count;

1061

for (page_num = 0; page_num < sblock_bad->page_count;

1061

page_num++) {

1062

page_num++) {

1062

int sub_success;

1063

int sub_success;

1063

1064

sub_success = 0;

1065

sub_success = 0;

1065

for (mirror_index = 0;

1066

for (mirror_index = 0;

1066

mirror_index < BTRFS_MAX_MIRRORS &&

1067

mirror_index < BTRFS_MAX_MIRRORS &&

1067

sblocks_for_recheck[mirror_index].page_count > 0;

1068

sblocks_for_recheck[mirror_index].page_count > 0;

1068

mirror_index++) {

1069

mirror_index++) {

1069

struct scrub_block *sblock_other =

1070

struct scrub_block *sblock_other =

1070

sblocks_for_recheck + mirror_index;

1071

sblocks_for_recheck + mirror_index;

1071

struct scrub_page *page_other =

1072

struct scrub_page *page_other =

1072

sblock_other->pagev[page_num];

1073

sblock_other->pagev[page_num];

1073

1074

if (!page_other->io_error) {

1075

if (!page_other->io_error) {

1075

ret = scrub_write_page_to_dev_replace(

1076

ret = scrub_write_page_to_dev_replace(

1076

sblock_other, page_num);

1077

sblock_other, page_num);

1077

if (ret == 0) {

1078

if (ret == 0) {

1078

/* succeeded for this page */

1079

/* succeeded for this page */

1079

sub_success = 1;

1080

sub_success = 1;

1080

break;

1081

break;

1081

} else {

1082

} else {

1082

btrfs_dev_replace_stats_inc(

1083

btrfs_dev_replace_stats_inc(

1083

&sctx->dev_root->

1084

&sctx->dev_root->

1084

fs_info->dev_replace.

1085

fs_info->dev_replace.

1085

num_write_errors);

1086

num_write_errors);

1086

}

1087

}

1087

}

1088

}

1088

}

1089

}

1089

1090

if (!sub_success) {

1091

if (!sub_success) {

1091

/*

1092

/*

1092

* did not find a mirror to fetch the page

1093

* did not find a mirror to fetch the page

1093

* from. scrub_write_page_to_dev_replace()

1094

* from. scrub_write_page_to_dev_replace()

1094

* handles this case (page->io_error), by

1095

* handles this case (page->io_error), by

1095

* filling the block with zeros before

1096

* filling the block with zeros before

1096

* submitting the write request

1097

* submitting the write request

1097

*/

1098

*/

1098

success = 0;

1099

success = 0;

1099

ret = scrub_write_page_to_dev_replace(

1100

ret = scrub_write_page_to_dev_replace(

1100

sblock_bad, page_num);

1101

sblock_bad, page_num);

1101

if (ret)

1102

if (ret)

1102

btrfs_dev_replace_stats_inc(

1103

btrfs_dev_replace_stats_inc(

1103

&sctx->dev_root->fs_info->

1104

&sctx->dev_root->fs_info->

1104

dev_replace.num_write_errors);

1105

dev_replace.num_write_errors);

1105

}

1106

}

1106

}

1107

}

1107

1108

goto out;

1109

goto out;

1109

}

1110

}

1110

1111

/*

1112

/*

1112

* for regular scrub, repair those pages that are errored.

1113

* for regular scrub, repair those pages that are errored.

1113

* In case of I/O errors in the area that is supposed to be

1114

* In case of I/O errors in the area that is supposed to be

1114

* repaired, continue by picking good copies of those pages.

1115

* repaired, continue by picking good copies of those pages.

1115

* Select the good pages from mirrors to rewrite bad pages from

1116

* Select the good pages from mirrors to rewrite bad pages from

1116

* the area to fix. Afterwards verify the checksum of the block

1117

* the area to fix. Afterwards verify the checksum of the block

1117

* that is supposed to be repaired. This verification step is

1118

* that is supposed to be repaired. This verification step is

1118

* only done for the purpose of statistic counting and for the

1119

* only done for the purpose of statistic counting and for the

1119

* final scrub report, whether errors remain.

1120

* final scrub report, whether errors remain.

1120

* A perfect algorithm could make use of the checksum and try

1121

* A perfect algorithm could make use of the checksum and try

1121

* all possible combinations of pages from the different mirrors

1122

* all possible combinations of pages from the different mirrors

1122

* until the checksum verification succeeds. For example, when

1123

* until the checksum verification succeeds. For example, when

1123

* the 2nd page of mirror #1 faces I/O errors, and the 2nd page

1124

* the 2nd page of mirror #1 faces I/O errors, and the 2nd page

1124

* of mirror #2 is readable but the final checksum test fails,

1125

* of mirror #2 is readable but the final checksum test fails,

1125

* then the 2nd page of mirror #3 could be tried, whether now

1126

* then the 2nd page of mirror #3 could be tried, whether now

1126

* the final checksum succeedes. But this would be a rare

1127

* the final checksum succeedes. But this would be a rare

1127

* exception and is therefore not implemented. At least it is

1128

* exception and is therefore not implemented. At least it is

1128

* avoided that the good copy is overwritten.

1129

* avoided that the good copy is overwritten.

1129

* A more useful improvement would be to pick the sectors

1130

* A more useful improvement would be to pick the sectors

1130

* without I/O error based on sector sizes (512 bytes on legacy

1131

* without I/O error based on sector sizes (512 bytes on legacy

1131

* disks) instead of on PAGE_SIZE. Then maybe 512 byte of one

1132

* disks) instead of on PAGE_SIZE. Then maybe 512 byte of one

1132

* mirror could be repaired by taking 512 byte of a different

1133

* mirror could be repaired by taking 512 byte of a different

1133

* mirror, even if other 512 byte sectors in the same PAGE_SIZE

1134

* mirror, even if other 512 byte sectors in the same PAGE_SIZE

1134

* area are unreadable.

1135

* area are unreadable.

1135

*/

1136

*/

1136

1137

/* can only fix I/O errors from here on */

1138

/* can only fix I/O errors from here on */

1138

if (sblock_bad->no_io_error_seen)

1139

if (sblock_bad->no_io_error_seen)

1139

goto did_not_correct_error;

1140

goto did_not_correct_error;

1140

1141

success = 1;

1142

success = 1;

1142

for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {

1143

for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {

1143

struct scrub_page *page_bad = sblock_bad->pagev[page_num];

1144

struct scrub_page *page_bad = sblock_bad->pagev[page_num];

1144

1145

if (!page_bad->io_error)

1146

if (!page_bad->io_error)

1146

continue;

1147

continue;

1147

1148

for (mirror_index = 0;

1149

for (mirror_index = 0;

1149

mirror_index < BTRFS_MAX_MIRRORS &&

1150

mirror_index < BTRFS_MAX_MIRRORS &&

1150

sblocks_for_recheck[mirror_index].page_count > 0;

1151

sblocks_for_recheck[mirror_index].page_count > 0;

1151

mirror_index++) {

1152

mirror_index++) {

1152

struct scrub_block *sblock_other = sblocks_for_recheck +

1153

struct scrub_block *sblock_other = sblocks_for_recheck +

1153

mirror_index;

1154

mirror_index;

1154

struct scrub_page *page_other = sblock_other->pagev[

1155

struct scrub_page *page_other = sblock_other->pagev[

1155

page_num];

1156

page_num];

1156

1157

if (!page_other->io_error) {

1158

if (!page_other->io_error) {

1158

ret = scrub_repair_page_from_good_copy(

1159

ret = scrub_repair_page_from_good_copy(

1159

sblock_bad, sblock_other, page_num, 0);

1160

sblock_bad, sblock_other, page_num, 0);

1160

if (0 == ret) {

1161

if (0 == ret) {

1161

page_bad->io_error = 0;

1162

page_bad->io_error = 0;

1162

break; /* succeeded for this page */

1163

break; /* succeeded for this page */

1163

}

1164

}

1164

}

1165

}

1165

}

1166

}

1166

1167

if (page_bad->io_error) {

1168

if (page_bad->io_error) {

1168

/* did not find a mirror to copy the page from */

1169

/* did not find a mirror to copy the page from */

1169

success = 0;

1170

success = 0;

1170

}

1171

}

1171

}

1172

}

1172

1173

if (success) {

1174

if (success) {

1174

if (is_metadata || have_csum) {

1175

if (is_metadata || have_csum) {

1175

/*

1176

/*

1176

* need to verify the checksum now that all

1177

* need to verify the checksum now that all

1177

* sectors on disk are repaired (the write

1178

* sectors on disk are repaired (the write

1178

* request for data to be repaired is on its way).

1179

* request for data to be repaired is on its way).

1179

* Just be lazy and use scrub_recheck_block()

1180

* Just be lazy and use scrub_recheck_block()

1180

* which re-reads the data before the checksum

1181

* which re-reads the data before the checksum

1181

* is verified, but most likely the data comes out

1182

* is verified, but most likely the data comes out

1182

* of the page cache.

1183

* of the page cache.

1183

*/

1184

*/

1184

scrub_recheck_block(fs_info, sblock_bad,

1185

scrub_recheck_block(fs_info, sblock_bad,

1185

is_metadata, have_csum, csum,

1186

is_metadata, have_csum, csum,

1186

generation, sctx->csum_size);

1187

generation, sctx->csum_size);

1187

if (!sblock_bad->header_error &&

1188

if (!sblock_bad->header_error &&

1188

!sblock_bad->checksum_error &&

1189

!sblock_bad->checksum_error &&

1189

sblock_bad->no_io_error_seen)

1190

sblock_bad->no_io_error_seen)

1190

goto corrected_error;

1191

goto corrected_error;

1191

else

1192

else

1192

goto did_not_correct_error;

1193

goto did_not_correct_error;

1193

} else {

1194

} else {

1194

corrected_error:

1195

corrected_error:

1195

spin_lock(&sctx->stat_lock);

1196

spin_lock(&sctx->stat_lock);

1196

sctx->stat.corrected_errors++;

1197

sctx->stat.corrected_errors++;

1197

spin_unlock(&sctx->stat_lock);

1198

spin_unlock(&sctx->stat_lock);

1198

printk_ratelimited_in_rcu(KERN_ERR

1199

printk_ratelimited_in_rcu(KERN_ERR

1199

"BTRFS: fixed up error at logical %llu on dev %s\n",

1200

"BTRFS: fixed up error at logical %llu on dev %s\n",

1200

logical, rcu_str_deref(dev->name));

1201

logical, rcu_str_deref(dev->name));

1201

}

1202

}

1202

} else {

1203

} else {

1203

did_not_correct_error:

1204

did_not_correct_error:

1204

spin_lock(&sctx->stat_lock);

1205

spin_lock(&sctx->stat_lock);

1205

sctx->stat.uncorrectable_errors++;

1206

sctx->stat.uncorrectable_errors++;

1206

spin_unlock(&sctx->stat_lock);

1207

spin_unlock(&sctx->stat_lock);

1207

printk_ratelimited_in_rcu(KERN_ERR

1208

printk_ratelimited_in_rcu(KERN_ERR

1208

"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",

1209

"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",

1209

logical, rcu_str_deref(dev->name));

1210

logical, rcu_str_deref(dev->name));

1210

}

1211

}

1211

1212

out:

1213

out:

1213

if (sblocks_for_recheck) {

1214

if (sblocks_for_recheck) {

1214

for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;

1215

for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;

1215

mirror_index++) {

1216

mirror_index++) {

1216

struct scrub_block *sblock = sblocks_for_recheck +

1217

struct scrub_block *sblock = sblocks_for_recheck +

1217

mirror_index;

1218

mirror_index;

1218

int page_index;

1219

int page_index;

1219

1220

for (page_index = 0; page_index < sblock->page_count;

1221

for (page_index = 0; page_index < sblock->page_count;

1221

page_index++) {

1222

page_index++) {

1222

sblock->pagev[page_index]->sblock = NULL;

1223

sblock->pagev[page_index]->sblock = NULL;

1223

scrub_page_put(sblock->pagev[page_index]);

1224

scrub_page_put(sblock->pagev[page_index]);

1224

}

1225

}

1225

}

1226

}

1226

kfree(sblocks_for_recheck);

1227

kfree(sblocks_for_recheck);

1227

}

1228

}

1228

1229

return 0;

1230

return 0;

1230

}

1231

}

1231

1232

static int scrub_setup_recheck_block(struct scrub_ctx *sctx,

1233

static int scrub_setup_recheck_block(struct scrub_ctx *sctx,

1233

struct btrfs_fs_info *fs_info,

1234

struct btrfs_fs_info *fs_info,

1234

struct scrub_block *original_sblock,

1235

struct scrub_block *original_sblock,

1235

u64 length, u64 logical,

1236

u64 length, u64 logical,

1236

struct scrub_block *sblocks_for_recheck)

1237

struct scrub_block *sblocks_for_recheck)

1237

{

1238

{

1238

int page_index;

1239

int page_index;

1239

int mirror_index;

1240

int mirror_index;

1240

int ret;

1241

int ret;

1241

1242

/*

1243

/*

1243

* note: the two members ref_count and outstanding_pages

1244

* note: the two members ref_count and outstanding_pages

1244

* are not used (and not set) in the blocks that are used for

1245

* are not used (and not set) in the blocks that are used for

1245

* the recheck procedure

1246

* the recheck procedure

1246

*/

1247

*/

1247

1248

page_index = 0;

1249

page_index = 0;

1249

while (length > 0) {

1250

while (length > 0) {

1250

u64 sublen = min_t(u64, length, PAGE_SIZE);

1251

u64 sublen = min_t(u64, length, PAGE_SIZE);

1251

u64 mapped_length = sublen;

1252

u64 mapped_length = sublen;

1252

struct btrfs_bio *bbio = NULL;

1253

struct btrfs_bio *bbio = NULL;

1253

1254

/*

1255

/*

1255

* with a length of PAGE_SIZE, each returned stripe

1256

* with a length of PAGE_SIZE, each returned stripe

1256

* represents one mirror

1257

* represents one mirror

1257

*/

1258

*/

1258

ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,

1259

ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,

1259

&mapped_length, &bbio, 0);

1260

&mapped_length, &bbio, 0);

1260

if (ret || !bbio || mapped_length < sublen) {

1261

if (ret || !bbio || mapped_length < sublen) {

1261

kfree(bbio);

1262

kfree(bbio);

1262

return -EIO;

1263

return -EIO;

1263

}

1264

}

1264

1265

BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);

1266

BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);

1266

for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;

1267

for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;

1267

mirror_index++) {

1268

mirror_index++) {

1268

struct scrub_block *sblock;

1269

struct scrub_block *sblock;

1269

struct scrub_page *page;

1270

struct scrub_page *page;

1270

1271

if (mirror_index >= BTRFS_MAX_MIRRORS)

1272

if (mirror_index >= BTRFS_MAX_MIRRORS)

1272

continue;

1273

continue;

1273

1274

sblock = sblocks_for_recheck + mirror_index;

1275

sblock = sblocks_for_recheck + mirror_index;

1275

sblock->sctx = sctx;

1276

sblock->sctx = sctx;

1276

page = kzalloc(sizeof(*page), GFP_NOFS);

1277

page = kzalloc(sizeof(*page), GFP_NOFS);

1277

if (!page) {

1278

if (!page) {

1278

leave_nomem:

1279

leave_nomem:

1279

spin_lock(&sctx->stat_lock);

1280

spin_lock(&sctx->stat_lock);

1280

sctx->stat.malloc_errors++;

1281

sctx->stat.malloc_errors++;

1281

spin_unlock(&sctx->stat_lock);

1282

spin_unlock(&sctx->stat_lock);

1282

kfree(bbio);

1283

kfree(bbio);

1283

return -ENOMEM;

1284

return -ENOMEM;

1284

}

1285

}

1285

scrub_page_get(page);

1286

scrub_page_get(page);

1286

sblock->pagev[page_index] = page;

1287

sblock->pagev[page_index] = page;

1287

page->logical = logical;

1288

page->logical = logical;

1288

page->physical = bbio->stripes[mirror_index].physical;

1289

page->physical = bbio->stripes[mirror_index].physical;

1289

BUG_ON(page_index >= original_sblock->page_count);

1290

BUG_ON(page_index >= original_sblock->page_count);

1290

page->physical_for_dev_replace =

1291

page->physical_for_dev_replace =

1291

original_sblock->pagev[page_index]->

1292

original_sblock->pagev[page_index]->

1292

physical_for_dev_replace;

1293

physical_for_dev_replace;

1293

/* for missing devices, dev->bdev is NULL */

1294

/* for missing devices, dev->bdev is NULL */

1294

page->dev = bbio->stripes[mirror_index].dev;

1295

page->dev = bbio->stripes[mirror_index].dev;

1295

page->mirror_num = mirror_index + 1;

1296

page->mirror_num = mirror_index + 1;

1296

sblock->page_count++;

1297

sblock->page_count++;

1297

page->page = alloc_page(GFP_NOFS);

1298

page->page = alloc_page(GFP_NOFS);

1298

if (!page->page)

1299

if (!page->page)

1299

goto leave_nomem;

1300

goto leave_nomem;

1300

}

1301

}

1301

kfree(bbio);

1302

kfree(bbio);

1302

length -= sublen;

1303

length -= sublen;

1303

logical += sublen;

1304

logical += sublen;

1304

page_index++;

1305

page_index++;

1305

}

1306

}

1306

1307

return 0;

1308

return 0;

1308

}

1309

}

1309

1310

/*

1311

/*

1311

* this function will check the on disk data for checksum errors, header

1312

* this function will check the on disk data for checksum errors, header

1312

* errors and read I/O errors. If any I/O errors happen, the exact pages

1313

* errors and read I/O errors. If any I/O errors happen, the exact pages

1313

* which are errored are marked as being bad. The goal is to enable scrub

1314

* which are errored are marked as being bad. The goal is to enable scrub

1314

* to take those pages that are not errored from all the mirrors so that

1315

* to take those pages that are not errored from all the mirrors so that

1315

* the pages that are errored in the just handled mirror can be repaired.

1316

* the pages that are errored in the just handled mirror can be repaired.

1316

*/

1317

*/

1317

static void scrub_recheck_block(struct btrfs_fs_info *fs_info,

1318

static void scrub_recheck_block(struct btrfs_fs_info *fs_info,

1318

struct scrub_block *sblock, int is_metadata,

1319

struct scrub_block *sblock, int is_metadata,

1319

int have_csum, u8 *csum, u64 generation,

1320

int have_csum, u8 *csum, u64 generation,

1320

u16 csum_size)

1321

u16 csum_size)

1321

{

1322

{

1322

int page_num;

1323

int page_num;

1323

1324

sblock->no_io_error_seen = 1;

1325

sblock->no_io_error_seen = 1;

1325

sblock->header_error = 0;

1326

sblock->header_error = 0;

1326

sblock->checksum_error = 0;

1327

sblock->checksum_error = 0;

1327

1328

for (page_num = 0; page_num < sblock->page_count; page_num++) {

1329

for (page_num = 0; page_num < sblock->page_count; page_num++) {

1329

struct bio *bio;

1330

struct bio *bio;

1330

struct scrub_page *page = sblock->pagev[page_num];

1331

struct scrub_page *page = sblock->pagev[page_num];

1331

1332

if (page->dev->bdev == NULL) {

1333

if (page->dev->bdev == NULL) {

1333

page->io_error = 1;

1334

page->io_error = 1;

1334

sblock->no_io_error_seen = 0;

1335

sblock->no_io_error_seen = 0;

1335

continue;

1336

continue;

1336

}

1337

}

1337

1338

WARN_ON(!page->page);

1339

WARN_ON(!page->page);

1339

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

1340

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

1340

if (!bio) {

1341

if (!bio) {

1341

page->io_error = 1;

1342

page->io_error = 1;

1342

sblock->no_io_error_seen = 0;

1343

sblock->no_io_error_seen = 0;

1343

continue;

1344

continue;

1344

}

1345

}

1345

bio->bi_bdev = page->dev->bdev;

1346

bio->bi_bdev = page->dev->bdev;

1346

bio->bi_iter.bi_sector = page->physical >> 9;

1347

bio->bi_iter.bi_sector = page->physical >> 9;

1347

1348

bio_add_page(bio, page->page, PAGE_SIZE, 0);

1349

bio_add_page(bio, page->page, PAGE_SIZE, 0);

1349

if (btrfsic_submit_bio_wait(READ, bio))

1350

if (btrfsic_submit_bio_wait(READ, bio))

1350

sblock->no_io_error_seen = 0;

1351

sblock->no_io_error_seen = 0;

1351

1352

bio_put(bio);

1353

bio_put(bio);

1353

}

1354

}

1354

1355

if (sblock->no_io_error_seen)

1356

if (sblock->no_io_error_seen)

1356

scrub_recheck_block_checksum(fs_info, sblock, is_metadata,

1357

scrub_recheck_block_checksum(fs_info, sblock, is_metadata,

1357

have_csum, csum, generation,

1358

have_csum, csum, generation,

1358

csum_size);

1359

csum_size);

1359

1360

return;

1361

return;

1361

}

1362

}

1362

1363

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,

1364

static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,

1364

struct scrub_block *sblock,

1365

struct scrub_block *sblock,

1365

int is_metadata, int have_csum,

1366

int is_metadata, int have_csum,

1366

const u8 *csum, u64 generation,

1367

const u8 *csum, u64 generation,

1367

u16 csum_size)

1368

u16 csum_size)

1368

{

1369

{

1369

int page_num;

1370

int page_num;

1370

u8 calculated_csum[BTRFS_CSUM_SIZE];

1371

u8 calculated_csum[BTRFS_CSUM_SIZE];

1371

u32 crc = ~(u32)0;

1372

u32 crc = ~(u32)0;

1372

void *mapped_buffer;

1373

void *mapped_buffer;

1373

1374

WARN_ON(!sblock->pagev[0]->page);

1375

WARN_ON(!sblock->pagev[0]->page);

1375

if (is_metadata) {

1376

if (is_metadata) {

1376

struct btrfs_header *h;

1377

struct btrfs_header *h;

1377

1378

mapped_buffer = kmap_atomic(sblock->pagev[0]->page);

1379

mapped_buffer = kmap_atomic(sblock->pagev[0]->page);

1379

h = (struct btrfs_header *)mapped_buffer;

1380

h = (struct btrfs_header *)mapped_buffer;

1380

1381

if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||

1382

if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||

1382

memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||

1383

memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||

1383

memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,

1384

memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,

1384

BTRFS_UUID_SIZE)) {

1385

BTRFS_UUID_SIZE)) {

1385

sblock->header_error = 1;

1386

sblock->header_error = 1;

1386

} else if (generation != btrfs_stack_header_generation(h)) {

1387

} else if (generation != btrfs_stack_header_generation(h)) {

1387

sblock->header_error = 1;

1388

sblock->header_error = 1;

1388

sblock->generation_error = 1;

1389

sblock->generation_error = 1;

1389

}

1390

}

1390

csum = h->csum;

1391

csum = h->csum;

1391

} else {

1392

} else {

1392

if (!have_csum)

1393

if (!have_csum)

1393

return;

1394

return;

1394

1395

mapped_buffer = kmap_atomic(sblock->pagev[0]->page);

1396

mapped_buffer = kmap_atomic(sblock->pagev[0]->page);

1396

}

1397

}

1397

1398

for (page_num = 0;;) {

1399

for (page_num = 0;;) {

1399

if (page_num == 0 && is_metadata)

1400

if (page_num == 0 && is_metadata)

1400

crc = btrfs_csum_data(

1401

crc = btrfs_csum_data(

1401

((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,

1402

((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,

1402

crc, PAGE_SIZE - BTRFS_CSUM_SIZE);

1403

crc, PAGE_SIZE - BTRFS_CSUM_SIZE);

1403

else

1404

else

1404

crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);

1405

crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);

1405

1406

kunmap_atomic(mapped_buffer);

1407

kunmap_atomic(mapped_buffer);

1407

page_num++;

1408

page_num++;

1408

if (page_num >= sblock->page_count)

1409

if (page_num >= sblock->page_count)

1409

break;

1410

break;

1410

WARN_ON(!sblock->pagev[page_num]->page);

1411

WARN_ON(!sblock->pagev[page_num]->page);

1411

1412

mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);

1413

mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);

1413

}

1414

}

1414

1415

btrfs_csum_final(crc, calculated_csum);

1416

btrfs_csum_final(crc, calculated_csum);

1416

if (memcmp(calculated_csum, csum, csum_size))

1417

if (memcmp(calculated_csum, csum, csum_size))

1417

sblock->checksum_error = 1;

1418

sblock->checksum_error = 1;

1418

}

1419

}

1419

1420

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,

1421

static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,

1421

struct scrub_block *sblock_good,

1422

struct scrub_block *sblock_good,

1422

int force_write)

1423

int force_write)

1423

{

1424

{

1424

int page_num;

1425

int page_num;

1425

int ret = 0;

1426

int ret = 0;

1426

1427

for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {

1428

for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {

1428

int ret_sub;

1429

int ret_sub;

1429

1430

ret_sub = scrub_repair_page_from_good_copy(sblock_bad,

1431

ret_sub = scrub_repair_page_from_good_copy(sblock_bad,

1431

sblock_good,

1432

sblock_good,

1432

page_num,

1433

page_num,

1433

force_write);

1434

force_write);

1434

if (ret_sub)

1435

if (ret_sub)

1435

ret = ret_sub;

1436

ret = ret_sub;

1436

}

1437

}

1437

1438

return ret;

1439

return ret;

1439

}

1440

}

1440

1441

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,

1442

static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,

1442

struct scrub_block *sblock_good,

1443

struct scrub_block *sblock_good,

1443

int page_num, int force_write)

1444

int page_num, int force_write)

1444

{

1445

{

1445

struct scrub_page *page_bad = sblock_bad->pagev[page_num];

1446

struct scrub_page *page_bad = sblock_bad->pagev[page_num];

1446

struct scrub_page *page_good = sblock_good->pagev[page_num];

1447

struct scrub_page *page_good = sblock_good->pagev[page_num];

1447

1448

BUG_ON(page_bad->page == NULL);

1449

BUG_ON(page_bad->page == NULL);

1449

BUG_ON(page_good->page == NULL);

1450

BUG_ON(page_good->page == NULL);

1450

if (force_write || sblock_bad->header_error ||

1451

if (force_write || sblock_bad->header_error ||

1451

sblock_bad->checksum_error || page_bad->io_error) {

1452

sblock_bad->checksum_error || page_bad->io_error) {

1452

struct bio *bio;

1453

struct bio *bio;

1453

int ret;

1454

int ret;

1454

1455

if (!page_bad->dev->bdev) {

1456

if (!page_bad->dev->bdev) {

1456

printk_ratelimited(KERN_WARNING "BTRFS: "

1457

printk_ratelimited(KERN_WARNING "BTRFS: "

1457

"scrub_repair_page_from_good_copy(bdev == NULL) "

1458

"scrub_repair_page_from_good_copy(bdev == NULL) "

1458

"is unexpected!\n");

1459

"is unexpected!\n");

1459

return -EIO;

1460

return -EIO;

1460

}

1461

}

1461

1462

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

1463

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

1463

if (!bio)

1464

if (!bio)

1464

return -EIO;

1465

return -EIO;

1465

bio->bi_bdev = page_bad->dev->bdev;

1466

bio->bi_bdev = page_bad->dev->bdev;

1466

bio->bi_iter.bi_sector = page_bad->physical >> 9;

1467

bio->bi_iter.bi_sector = page_bad->physical >> 9;

1467

1468

ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);

1469

ret = bio_add_page(bio, page_good->page, PAGE_SIZE, 0);

1469

if (PAGE_SIZE != ret) {

1470

if (PAGE_SIZE != ret) {

1470

bio_put(bio);

1471

bio_put(bio);

1471

return -EIO;

1472

return -EIO;

1472

}

1473

}

1473

1474

if (btrfsic_submit_bio_wait(WRITE, bio)) {

1475

if (btrfsic_submit_bio_wait(WRITE, bio)) {

1475

btrfs_dev_stat_inc_and_print(page_bad->dev,

1476

btrfs_dev_stat_inc_and_print(page_bad->dev,

1476

BTRFS_DEV_STAT_WRITE_ERRS);

1477

BTRFS_DEV_STAT_WRITE_ERRS);

1477

btrfs_dev_replace_stats_inc(

1478

btrfs_dev_replace_stats_inc(

1478

&sblock_bad->sctx->dev_root->fs_info->

1479

&sblock_bad->sctx->dev_root->fs_info->

1479

dev_replace.num_write_errors);

1480

dev_replace.num_write_errors);

1480

bio_put(bio);

1481

bio_put(bio);

1481

return -EIO;

1482

return -EIO;

1482

}

1483

}

1483

bio_put(bio);

1484

bio_put(bio);

1484

}

1485

}

1485

1486

return 0;

1487

return 0;

1487

}

1488

}

1488

1489

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)

1490

static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)

1490

{

1491

{

1491

int page_num;

1492

int page_num;

1492

1493

for (page_num = 0; page_num < sblock->page_count; page_num++) {

1494

for (page_num = 0; page_num < sblock->page_count; page_num++) {

1494

int ret;

1495

int ret;

1495

1496

ret = scrub_write_page_to_dev_replace(sblock, page_num);

1497

ret = scrub_write_page_to_dev_replace(sblock, page_num);

1497

if (ret)

1498

if (ret)

1498

btrfs_dev_replace_stats_inc(

1499

btrfs_dev_replace_stats_inc(

1499

&sblock->sctx->dev_root->fs_info->dev_replace.

1500

&sblock->sctx->dev_root->fs_info->dev_replace.

1500

num_write_errors);

1501

num_write_errors);

1501

}

1502

}

1502

}

1503

}

1503

1504

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,

1505

static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,

1505

int page_num)

1506

int page_num)

1506

{

1507

{

1507

struct scrub_page *spage = sblock->pagev[page_num];

1508

struct scrub_page *spage = sblock->pagev[page_num];

1508

1509

BUG_ON(spage->page == NULL);

1510

BUG_ON(spage->page == NULL);

1510

if (spage->io_error) {

1511

if (spage->io_error) {

1511

void *mapped_buffer = kmap_atomic(spage->page);

1512

void *mapped_buffer = kmap_atomic(spage->page);

1512

1513

memset(mapped_buffer, 0, PAGE_CACHE_SIZE);

1514

memset(mapped_buffer, 0, PAGE_CACHE_SIZE);

1514

flush_dcache_page(spage->page);

1515

flush_dcache_page(spage->page);

1515

kunmap_atomic(mapped_buffer);

1516

kunmap_atomic(mapped_buffer);

1516

}

1517

}

1517

return scrub_add_page_to_wr_bio(sblock->sctx, spage);

1518

return scrub_add_page_to_wr_bio(sblock->sctx, spage);

1518

}

1519

}

1519

1520

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,

1521

static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,

1521

struct scrub_page *spage)

1522

struct scrub_page *spage)

1522

{

1523

{

1523

struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;

1524

struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;

1524

struct scrub_bio *sbio;

1525

struct scrub_bio *sbio;

1525

int ret;

1526

int ret;

1526

1527

mutex_lock(&wr_ctx->wr_lock);

1528

mutex_lock(&wr_ctx->wr_lock);

1528

again:

1529

again:

1529

if (!wr_ctx->wr_curr_bio) {

1530

if (!wr_ctx->wr_curr_bio) {

1530

wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),

1531

wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),

1531

GFP_NOFS);

1532

GFP_NOFS);

1532

if (!wr_ctx->wr_curr_bio) {

1533

if (!wr_ctx->wr_curr_bio) {

1533

mutex_unlock(&wr_ctx->wr_lock);

1534

mutex_unlock(&wr_ctx->wr_lock);

1534

return -ENOMEM;

1535

return -ENOMEM;

1535

}

1536

}

1536

wr_ctx->wr_curr_bio->sctx = sctx;

1537

wr_ctx->wr_curr_bio->sctx = sctx;

1537

wr_ctx->wr_curr_bio->page_count = 0;

1538

wr_ctx->wr_curr_bio->page_count = 0;

1538

}

1539

}

1539

sbio = wr_ctx->wr_curr_bio;

1540

sbio = wr_ctx->wr_curr_bio;

1540

if (sbio->page_count == 0) {

1541

if (sbio->page_count == 0) {

1541

struct bio *bio;

1542

struct bio *bio;

1542

1543

sbio->physical = spage->physical_for_dev_replace;

1544

sbio->physical = spage->physical_for_dev_replace;

1544

sbio->logical = spage->logical;

1545

sbio->logical = spage->logical;

1545

sbio->dev = wr_ctx->tgtdev;

1546

sbio->dev = wr_ctx->tgtdev;

1546

bio = sbio->bio;

1547

bio = sbio->bio;

1547

if (!bio) {

1548

if (!bio) {

1548

bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);

1549

bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);

1549

if (!bio) {

1550

if (!bio) {

1550

mutex_unlock(&wr_ctx->wr_lock);

1551

mutex_unlock(&wr_ctx->wr_lock);

1551

return -ENOMEM;

1552

return -ENOMEM;

1552

}

1553

}

1553

sbio->bio = bio;

1554

sbio->bio = bio;

1554

}

1555

}

1555

1556

bio->bi_private = sbio;

1557

bio->bi_private = sbio;

1557

bio->bi_end_io = scrub_wr_bio_end_io;

1558

bio->bi_end_io = scrub_wr_bio_end_io;

1558

bio->bi_bdev = sbio->dev->bdev;

1559

bio->bi_bdev = sbio->dev->bdev;

1559

bio->bi_iter.bi_sector = sbio->physical >> 9;

1560

bio->bi_iter.bi_sector = sbio->physical >> 9;

1560

sbio->err = 0;

1561

sbio->err = 0;

1561

} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=

1562

} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=

1562

spage->physical_for_dev_replace ||

1563

spage->physical_for_dev_replace ||

1563

sbio->logical + sbio->page_count * PAGE_SIZE !=

1564

sbio->logical + sbio->page_count * PAGE_SIZE !=

1564

spage->logical) {

1565

spage->logical) {

1565

scrub_wr_submit(sctx);

1566

scrub_wr_submit(sctx);

1566

goto again;

1567

goto again;

1567

}

1568

}

1568

1569

ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);

1570

ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);

1570

if (ret != PAGE_SIZE) {

1571

if (ret != PAGE_SIZE) {

1571

if (sbio->page_count < 1) {

1572

if (sbio->page_count < 1) {

1572

bio_put(sbio->bio);

1573

bio_put(sbio->bio);

1573

sbio->bio = NULL;

1574

sbio->bio = NULL;

1574

mutex_unlock(&wr_ctx->wr_lock);

1575

mutex_unlock(&wr_ctx->wr_lock);

1575

return -EIO;

1576

return -EIO;

1576

}

1577

}

1577

scrub_wr_submit(sctx);

1578

scrub_wr_submit(sctx);

1578

goto again;

1579

goto again;

1579

}

1580

}

1580

1581

sbio->pagev[sbio->page_count] = spage;

1582

sbio->pagev[sbio->page_count] = spage;

1582

scrub_page_get(spage);

1583

scrub_page_get(spage);

1583

sbio->page_count++;

1584

sbio->page_count++;

1584

if (sbio->page_count == wr_ctx->pages_per_wr_bio)

1585

if (sbio->page_count == wr_ctx->pages_per_wr_bio)

1585

scrub_wr_submit(sctx);

1586

scrub_wr_submit(sctx);

1586

mutex_unlock(&wr_ctx->wr_lock);

1587

mutex_unlock(&wr_ctx->wr_lock);

1587

1588

return 0;

1589

return 0;

1589

}

1590

}

1590

1591

static void scrub_wr_submit(struct scrub_ctx *sctx)

1592

static void scrub_wr_submit(struct scrub_ctx *sctx)

1592

{

1593

{

1593

struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;

1594

struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;

1594

struct scrub_bio *sbio;

1595

struct scrub_bio *sbio;

1595

1596

if (!wr_ctx->wr_curr_bio)

1597

if (!wr_ctx->wr_curr_bio)

1597

return;

1598

return;

1598

1599

sbio = wr_ctx->wr_curr_bio;

1600

sbio = wr_ctx->wr_curr_bio;

1600

wr_ctx->wr_curr_bio = NULL;

1601

wr_ctx->wr_curr_bio = NULL;

1601

WARN_ON(!sbio->bio->bi_bdev);

1602

WARN_ON(!sbio->bio->bi_bdev);

1602

scrub_pending_bio_inc(sctx);

1603

scrub_pending_bio_inc(sctx);

1603

/* process all writes in a single worker thread. Then the block layer

1604

/* process all writes in a single worker thread. Then the block layer

1604

* orders the requests before sending them to the driver which

1605

* orders the requests before sending them to the driver which

1605

* doubled the write performance on spinning disks when measured

1606

* doubled the write performance on spinning disks when measured

1606

* with Linux 3.5 */

1607

* with Linux 3.5 */

1607

btrfsic_submit_bio(WRITE, sbio->bio);

1608

btrfsic_submit_bio(WRITE, sbio->bio);

1608

}

1609

}

1609

1610

static void scrub_wr_bio_end_io(struct bio *bio, int err)

1611

static void scrub_wr_bio_end_io(struct bio *bio, int err)

1611

{

1612

{

1612

struct scrub_bio *sbio = bio->bi_private;

1613

struct scrub_bio *sbio = bio->bi_private;

1613

struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

1614

struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

1614

1615

sbio->err = err;

1616

sbio->err = err;

1616

sbio->bio = bio;

1617

sbio->bio = bio;

1617

1618

btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);

1619

btrfs_init_work(&sbio->work, scrub_wr_bio_end_io_worker, NULL, NULL);

1619

btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);

1620

btrfs_queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);

1620

}

1621

}

1621

1622

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)

1623

static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)

1623

{

1624

{

1624

struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);

1625

struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);

1625

struct scrub_ctx *sctx = sbio->sctx;

1626

struct scrub_ctx *sctx = sbio->sctx;

1626

int i;

1627

int i;

1627

1628

WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);

1629

WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);

1629

if (sbio->err) {

1630

if (sbio->err) {

1630

struct btrfs_dev_replace *dev_replace =

1631

struct btrfs_dev_replace *dev_replace =

1631

&sbio->sctx->dev_root->fs_info->dev_replace;

1632

&sbio->sctx->dev_root->fs_info->dev_replace;

1632

1633

for (i = 0; i < sbio->page_count; i++) {

1634

for (i = 0; i < sbio->page_count; i++) {

1634

struct scrub_page *spage = sbio->pagev[i];

1635

struct scrub_page *spage = sbio->pagev[i];

1635

1636

spage->io_error = 1;

1637

spage->io_error = 1;

1637

btrfs_dev_replace_stats_inc(&dev_replace->

1638

btrfs_dev_replace_stats_inc(&dev_replace->

1638

num_write_errors);

1639

num_write_errors);

1639

}

1640

}

1640

}

1641

}

1641

1642

for (i = 0; i < sbio->page_count; i++)

1643

for (i = 0; i < sbio->page_count; i++)

1643

scrub_page_put(sbio->pagev[i]);

1644

scrub_page_put(sbio->pagev[i]);

1644

1645

bio_put(sbio->bio);

1646

bio_put(sbio->bio);

1646

kfree(sbio);

1647

kfree(sbio);

1647

scrub_pending_bio_dec(sctx);

1648

scrub_pending_bio_dec(sctx);

1648

}

1649

}

1649

1650

static int scrub_checksum(struct scrub_block *sblock)

1651

static int scrub_checksum(struct scrub_block *sblock)

1651

{

1652

{

1652

u64 flags;

1653

u64 flags;

1653

int ret;

1654

int ret;

1654

1655

WARN_ON(sblock->page_count < 1);

1656

WARN_ON(sblock->page_count < 1);

1656

flags = sblock->pagev[0]->flags;

1657

flags = sblock->pagev[0]->flags;

1657

ret = 0;

1658

ret = 0;

1658

if (flags & BTRFS_EXTENT_FLAG_DATA)

1659

if (flags & BTRFS_EXTENT_FLAG_DATA)

1659

ret = scrub_checksum_data(sblock);

1660

ret = scrub_checksum_data(sblock);

1660

else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)

1661

else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)

1661

ret = scrub_checksum_tree_block(sblock);

1662

ret = scrub_checksum_tree_block(sblock);

1662

else if (flags & BTRFS_EXTENT_FLAG_SUPER)

1663

else if (flags & BTRFS_EXTENT_FLAG_SUPER)

1663

(void)scrub_checksum_super(sblock);

1664

(void)scrub_checksum_super(sblock);

1664

else

1665

else

1665

WARN_ON(1);

1666

WARN_ON(1);

1666

if (ret)

1667

if (ret)

1667

scrub_handle_errored_block(sblock);

1668

scrub_handle_errored_block(sblock);

1668

1669

return ret;

1670

return ret;

1670

}

1671

}

1671

1672

static int scrub_checksum_data(struct scrub_block *sblock)

1673

static int scrub_checksum_data(struct scrub_block *sblock)

1673

{

1674

{

1674

struct scrub_ctx *sctx = sblock->sctx;

1675

struct scrub_ctx *sctx = sblock->sctx;

1675

u8 csum[BTRFS_CSUM_SIZE];

1676

u8 csum[BTRFS_CSUM_SIZE];

1676

u8 *on_disk_csum;

1677

u8 *on_disk_csum;

1677

struct page *page;

1678

struct page *page;

1678

void *buffer;

1679

void *buffer;

1679

u32 crc = ~(u32)0;

1680

u32 crc = ~(u32)0;

1680

int fail = 0;

1681

int fail = 0;

1681

u64 len;

1682

u64 len;

1682

int index;

1683

int index;

1683

1684

BUG_ON(sblock->page_count < 1);

1685

BUG_ON(sblock->page_count < 1);

1685

if (!sblock->pagev[0]->have_csum)

1686

if (!sblock->pagev[0]->have_csum)

1686

return 0;

1687

return 0;

1687

1688

on_disk_csum = sblock->pagev[0]->csum;

1689

on_disk_csum = sblock->pagev[0]->csum;

1689

page = sblock->pagev[0]->page;

1690

page = sblock->pagev[0]->page;

1690

buffer = kmap_atomic(page);

1691

buffer = kmap_atomic(page);

1691

1692

len = sctx->sectorsize;

1693

len = sctx->sectorsize;

1693

index = 0;

1694

index = 0;

1694

for (;;) {

1695

for (;;) {

1695

u64 l = min_t(u64, len, PAGE_SIZE);

1696

u64 l = min_t(u64, len, PAGE_SIZE);

1696

1697

crc = btrfs_csum_data(buffer, crc, l);

1698

crc = btrfs_csum_data(buffer, crc, l);

1698

kunmap_atomic(buffer);

1699

kunmap_atomic(buffer);

1699

len -= l;

1700

len -= l;

1700

if (len == 0)

1701

if (len == 0)

1701

break;

1702

break;

1702

index++;

1703

index++;

1703

BUG_ON(index >= sblock->page_count);

1704

BUG_ON(index >= sblock->page_count);

1704

BUG_ON(!sblock->pagev[index]->page);

1705

BUG_ON(!sblock->pagev[index]->page);

1705

page = sblock->pagev[index]->page;

1706

page = sblock->pagev[index]->page;

1706

buffer = kmap_atomic(page);

1707

buffer = kmap_atomic(page);

1707

}

1708

}

1708

1709

btrfs_csum_final(crc, csum);

1710

btrfs_csum_final(crc, csum);

1710

if (memcmp(csum, on_disk_csum, sctx->csum_size))

1711

if (memcmp(csum, on_disk_csum, sctx->csum_size))

1711

fail = 1;

1712

fail = 1;

1712

1713

return fail;

1714

return fail;

1714

}

1715

}

1715

1716

static int scrub_checksum_tree_block(struct scrub_block *sblock)

1717

static int scrub_checksum_tree_block(struct scrub_block *sblock)

1717

{

1718

{

1718

struct scrub_ctx *sctx = sblock->sctx;

1719

struct scrub_ctx *sctx = sblock->sctx;

1719

struct btrfs_header *h;

1720

struct btrfs_header *h;

1720

struct btrfs_root *root = sctx->dev_root;

1721

struct btrfs_root *root = sctx->dev_root;

1721

struct btrfs_fs_info *fs_info = root->fs_info;

1722

struct btrfs_fs_info *fs_info = root->fs_info;

1722

u8 calculated_csum[BTRFS_CSUM_SIZE];

1723

u8 calculated_csum[BTRFS_CSUM_SIZE];

1723

u8 on_disk_csum[BTRFS_CSUM_SIZE];

1724

u8 on_disk_csum[BTRFS_CSUM_SIZE];

1724

struct page *page;

1725

struct page *page;

1725

void *mapped_buffer;

1726

void *mapped_buffer;

1726

u64 mapped_size;

1727

u64 mapped_size;

1727

void *p;

1728

void *p;

1728

u32 crc = ~(u32)0;

1729

u32 crc = ~(u32)0;

1729

int fail = 0;

1730

int fail = 0;

1730

int crc_fail = 0;

1731

int crc_fail = 0;

1731

u64 len;

1732

u64 len;

1732

int index;

1733

int index;

1733

1734

BUG_ON(sblock->page_count < 1);

1735

BUG_ON(sblock->page_count < 1);

1735

page = sblock->pagev[0]->page;

1736

page = sblock->pagev[0]->page;

1736

mapped_buffer = kmap_atomic(page);

1737

mapped_buffer = kmap_atomic(page);

1737

h = (struct btrfs_header *)mapped_buffer;

1738

h = (struct btrfs_header *)mapped_buffer;

1738

memcpy(on_disk_csum, h->csum, sctx->csum_size);

1739

memcpy(on_disk_csum, h->csum, sctx->csum_size);

1739

1740

/*

1741

/*

1741

* we don't use the getter functions here, as we

1742

* we don't use the getter functions here, as we

1742

* a) don't have an extent buffer and

1743

* a) don't have an extent buffer and

1743

* b) the page is already kmapped

1744

* b) the page is already kmapped

1744

*/

1745

*/

1745

1746

if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))

1747

if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))

1747

++fail;

1748

++fail;

1748

1749

if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))

1750

if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))

1750

++fail;

1751

++fail;

1751

1752

if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

1753

if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

1753

++fail;

1754

++fail;

1754

1755

if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,

1756

if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,

1756

BTRFS_UUID_SIZE))

1757

BTRFS_UUID_SIZE))

1757

++fail;

1758

++fail;

1758

1759

WARN_ON(sctx->nodesize != sctx->leafsize);

1760

WARN_ON(sctx->nodesize != sctx->leafsize);

1760

len = sctx->nodesize - BTRFS_CSUM_SIZE;

1761

len = sctx->nodesize - BTRFS_CSUM_SIZE;

1761

mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;

1762

mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;

1762

p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;

1763

p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;

1763

index = 0;

1764

index = 0;

1764

for (;;) {

1765

for (;;) {

1765

u64 l = min_t(u64, len, mapped_size);

1766

u64 l = min_t(u64, len, mapped_size);

1766

1767

crc = btrfs_csum_data(p, crc, l);

1768

crc = btrfs_csum_data(p, crc, l);

1768

kunmap_atomic(mapped_buffer);

1769

kunmap_atomic(mapped_buffer);

1769

len -= l;

1770

len -= l;

1770

if (len == 0)

1771

if (len == 0)

1771

break;

1772

break;

1772

index++;

1773

index++;

1773

BUG_ON(index >= sblock->page_count);

1774

BUG_ON(index >= sblock->page_count);

1774

BUG_ON(!sblock->pagev[index]->page);

1775

BUG_ON(!sblock->pagev[index]->page);

1775

page = sblock->pagev[index]->page;

1776

page = sblock->pagev[index]->page;

1776

mapped_buffer = kmap_atomic(page);

1777

mapped_buffer = kmap_atomic(page);

1777

mapped_size = PAGE_SIZE;

1778

mapped_size = PAGE_SIZE;

1778

p = mapped_buffer;

1779

p = mapped_buffer;

1779

}

1780

}

1780

1781

btrfs_csum_final(crc, calculated_csum);

1782

btrfs_csum_final(crc, calculated_csum);

1782

if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))

1783

if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))

1783

++crc_fail;

1784

++crc_fail;

1784

1785

return fail || crc_fail;

1786

return fail || crc_fail;

1786

}

1787

}

1787

1788

static int scrub_checksum_super(struct scrub_block *sblock)

1789

static int scrub_checksum_super(struct scrub_block *sblock)

1789

{

1790

{

1790

struct btrfs_super_block *s;

1791

struct btrfs_super_block *s;

1791

struct scrub_ctx *sctx = sblock->sctx;

1792

struct scrub_ctx *sctx = sblock->sctx;

1792

struct btrfs_root *root = sctx->dev_root;

1793

struct btrfs_root *root = sctx->dev_root;

1793

struct btrfs_fs_info *fs_info = root->fs_info;

1794

struct btrfs_fs_info *fs_info = root->fs_info;

1794

u8 calculated_csum[BTRFS_CSUM_SIZE];

1795

u8 calculated_csum[BTRFS_CSUM_SIZE];

1795

u8 on_disk_csum[BTRFS_CSUM_SIZE];

1796

u8 on_disk_csum[BTRFS_CSUM_SIZE];

1796

struct page *page;

1797

struct page *page;

1797

void *mapped_buffer;

1798

void *mapped_buffer;

1798

u64 mapped_size;

1799

u64 mapped_size;

1799

void *p;

1800

void *p;

1800

u32 crc = ~(u32)0;

1801

u32 crc = ~(u32)0;

1801

int fail_gen = 0;

1802

int fail_gen = 0;

1802

int fail_cor = 0;

1803

int fail_cor = 0;

1803

u64 len;

1804

u64 len;

1804

int index;

1805

int index;

1805

1806

BUG_ON(sblock->page_count < 1);

1807

BUG_ON(sblock->page_count < 1);

1807

page = sblock->pagev[0]->page;

1808

page = sblock->pagev[0]->page;

1808

mapped_buffer = kmap_atomic(page);

1809

mapped_buffer = kmap_atomic(page);

1809

s = (struct btrfs_super_block *)mapped_buffer;

1810

s = (struct btrfs_super_block *)mapped_buffer;

1810

memcpy(on_disk_csum, s->csum, sctx->csum_size);

1811

memcpy(on_disk_csum, s->csum, sctx->csum_size);

1811

1812

if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))

1813

if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))

1813

++fail_cor;

1814

++fail_cor;

1814

1815

if (sblock->pagev[0]->generation != btrfs_super_generation(s))

1816

if (sblock->pagev[0]->generation != btrfs_super_generation(s))

1816

++fail_gen;

1817

++fail_gen;

1817

1818

if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

1819

if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))

1819

++fail_cor;

1820

++fail_cor;

1820

1821

len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;

1822

len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;

1822

mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;

1823

mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE;

1823

p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;

1824

p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE;

1824

index = 0;

1825

index = 0;

1825

for (;;) {

1826

for (;;) {

1826

u64 l = min_t(u64, len, mapped_size);

1827

u64 l = min_t(u64, len, mapped_size);

1827

1828

crc = btrfs_csum_data(p, crc, l);

1829

crc = btrfs_csum_data(p, crc, l);

1829

kunmap_atomic(mapped_buffer);

1830

kunmap_atomic(mapped_buffer);

1830

len -= l;

1831

len -= l;

1831

if (len == 0)

1832

if (len == 0)

1832

break;

1833

break;

1833

index++;

1834

index++;

1834

BUG_ON(index >= sblock->page_count);

1835

BUG_ON(index >= sblock->page_count);

1835

BUG_ON(!sblock->pagev[index]->page);

1836

BUG_ON(!sblock->pagev[index]->page);

1836

page = sblock->pagev[index]->page;

1837

page = sblock->pagev[index]->page;

1837

mapped_buffer = kmap_atomic(page);

1838

mapped_buffer = kmap_atomic(page);

1838

mapped_size = PAGE_SIZE;

1839

mapped_size = PAGE_SIZE;

1839

p = mapped_buffer;

1840

p = mapped_buffer;

1840

}

1841

}

1841

1842

btrfs_csum_final(crc, calculated_csum);

1843

btrfs_csum_final(crc, calculated_csum);

1843

if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))

1844

if (memcmp(calculated_csum, on_disk_csum, sctx->csum_size))

1844

++fail_cor;

1845

++fail_cor;

1845

1846

if (fail_cor + fail_gen) {

1847

if (fail_cor + fail_gen) {

1847

/*

1848

/*

1848

* if we find an error in a super block, we just report it.

1849

* if we find an error in a super block, we just report it.

1849

* They will get written with the next transaction commit

1850

* They will get written with the next transaction commit

1850

* anyway

1851

* anyway

1851

*/

1852

*/

1852

spin_lock(&sctx->stat_lock);

1853

spin_lock(&sctx->stat_lock);

1853

++sctx->stat.super_errors;

1854

++sctx->stat.super_errors;

1854

spin_unlock(&sctx->stat_lock);

1855

spin_unlock(&sctx->stat_lock);

1855

if (fail_cor)

1856

if (fail_cor)

1856

btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,

1857

btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,

1857

BTRFS_DEV_STAT_CORRUPTION_ERRS);

1858

BTRFS_DEV_STAT_CORRUPTION_ERRS);

1858

else

1859

else

1859

btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,

1860

btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,

1860

BTRFS_DEV_STAT_GENERATION_ERRS);

1861

BTRFS_DEV_STAT_GENERATION_ERRS);

1861

}

1862

}

1862

1863

return fail_cor + fail_gen;

1864

return fail_cor + fail_gen;

1864

}

1865

}

1865

1866

static void scrub_block_get(struct scrub_block *sblock)

1867

static void scrub_block_get(struct scrub_block *sblock)

1867

{

1868

{

1868

atomic_inc(&sblock->ref_count);

1869

atomic_inc(&sblock->ref_count);

1869

}

1870

}

1870

1871

static void scrub_block_put(struct scrub_block *sblock)

1872

static void scrub_block_put(struct scrub_block *sblock)

1872

{

1873

{

1873

if (atomic_dec_and_test(&sblock->ref_count)) {

1874

if (atomic_dec_and_test(&sblock->ref_count)) {

1874

int i;

1875

int i;

1875

1876

for (i = 0; i < sblock->page_count; i++)

1877

for (i = 0; i < sblock->page_count; i++)

1877

scrub_page_put(sblock->pagev[i]);

1878

scrub_page_put(sblock->pagev[i]);

1878

kfree(sblock);

1879

kfree(sblock);

1879

}

1880

}

1880

}

1881

}

1881

1882

static void scrub_page_get(struct scrub_page *spage)

1883

static void scrub_page_get(struct scrub_page *spage)

1883

{

1884

{

1884

atomic_inc(&spage->ref_count);

1885

atomic_inc(&spage->ref_count);

1885

}

1886

}

1886

1887

static void scrub_page_put(struct scrub_page *spage)

1888

static void scrub_page_put(struct scrub_page *spage)

1888

{

1889

{

1889

if (atomic_dec_and_test(&spage->ref_count)) {

1890

if (atomic_dec_and_test(&spage->ref_count)) {

1890

if (spage->page)

1891

if (spage->page)

1891

__free_page(spage->page);

1892

__free_page(spage->page);

1892

kfree(spage);

1893

kfree(spage);

1893

}

1894

}

1894

}

1895

}

1895

1896

static void scrub_submit(struct scrub_ctx *sctx)

1897

static void scrub_submit(struct scrub_ctx *sctx)

1897

{

1898

{

1898

struct scrub_bio *sbio;

1899

struct scrub_bio *sbio;

1899

1900

if (sctx->curr == -1)

1901

if (sctx->curr == -1)

1901

return;

1902

return;

1902

1903

sbio = sctx->bios[sctx->curr];

1904

sbio = sctx->bios[sctx->curr];

1904

sctx->curr = -1;

1905

sctx->curr = -1;

1905

scrub_pending_bio_inc(sctx);

1906

scrub_pending_bio_inc(sctx);

1906

1907

if (!sbio->bio->bi_bdev) {

1908

if (!sbio->bio->bi_bdev) {

1908

/*

1909

/*

1909

* this case should not happen. If btrfs_map_block() is

1910

* this case should not happen. If btrfs_map_block() is

1910

* wrong, it could happen for dev-replace operations on

1911

* wrong, it could happen for dev-replace operations on

1911

* missing devices when no mirrors are available, but in

1912

* missing devices when no mirrors are available, but in

1912

* this case it should already fail the mount.

1913

* this case it should already fail the mount.

1913

* This case is handled correctly (but _very_ slowly).

1914

* This case is handled correctly (but _very_ slowly).

1914

*/

1915

*/

1915

printk_ratelimited(KERN_WARNING

1916

printk_ratelimited(KERN_WARNING

1916

"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");

1917

"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");

1917

bio_endio(sbio->bio, -EIO);

1918

bio_endio(sbio->bio, -EIO);

1918

} else {

1919

} else {

1919

btrfsic_submit_bio(READ, sbio->bio);

1920

btrfsic_submit_bio(READ, sbio->bio);

1920

}

1921

}

1921

}

1922

}

1922

1923

static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,

1924

static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,

1924

struct scrub_page *spage)

1925

struct scrub_page *spage)

1925

{

1926

{

1926

struct scrub_block *sblock = spage->sblock;

1927

struct scrub_block *sblock = spage->sblock;

1927

struct scrub_bio *sbio;

1928

struct scrub_bio *sbio;

1928

int ret;

1929

int ret;

1929

1930

again:

1931

again:

1931

/*

1932

/*

1932

* grab a fresh bio or wait for one to become available

1933

* grab a fresh bio or wait for one to become available

1933

*/

1934

*/

1934

while (sctx->curr == -1) {

1935

while (sctx->curr == -1) {

1935

spin_lock(&sctx->list_lock);

1936

spin_lock(&sctx->list_lock);

1936

sctx->curr = sctx->first_free;

1937

sctx->curr = sctx->first_free;

1937

if (sctx->curr != -1) {

1938

if (sctx->curr != -1) {

1938

sctx->first_free = sctx->bios[sctx->curr]->next_free;

1939

sctx->first_free = sctx->bios[sctx->curr]->next_free;

1939

sctx->bios[sctx->curr]->next_free = -1;

1940

sctx->bios[sctx->curr]->next_free = -1;

1940

sctx->bios[sctx->curr]->page_count = 0;

1941

sctx->bios[sctx->curr]->page_count = 0;

1941

spin_unlock(&sctx->list_lock);

1942

spin_unlock(&sctx->list_lock);

1942

} else {

1943

} else {

1943

spin_unlock(&sctx->list_lock);

1944

spin_unlock(&sctx->list_lock);

1944

wait_event(sctx->list_wait, sctx->first_free != -1);

1945

wait_event(sctx->list_wait, sctx->first_free != -1);

1945

}

1946

}

1946

}

1947

}

1947

sbio = sctx->bios[sctx->curr];

1948

sbio = sctx->bios[sctx->curr];

1948

if (sbio->page_count == 0) {

1949

if (sbio->page_count == 0) {

1949

struct bio *bio;

1950

struct bio *bio;

1950

1951

sbio->physical = spage->physical;

1952

sbio->physical = spage->physical;

1952

sbio->logical = spage->logical;

1953

sbio->logical = spage->logical;

1953

sbio->dev = spage->dev;

1954

sbio->dev = spage->dev;

1954

bio = sbio->bio;

1955

bio = sbio->bio;

1955

if (!bio) {

1956

if (!bio) {

1956

bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);

1957

bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);

1957

if (!bio)

1958

if (!bio)

1958

return -ENOMEM;

1959

return -ENOMEM;

1959

sbio->bio = bio;

1960

sbio->bio = bio;

1960

}

1961

}

1961

1962

bio->bi_private = sbio;

1963

bio->bi_private = sbio;

1963

bio->bi_end_io = scrub_bio_end_io;

1964

bio->bi_end_io = scrub_bio_end_io;

1964

bio->bi_bdev = sbio->dev->bdev;

1965

bio->bi_bdev = sbio->dev->bdev;

1965

bio->bi_iter.bi_sector = sbio->physical >> 9;

1966

bio->bi_iter.bi_sector = sbio->physical >> 9;

1966

sbio->err = 0;

1967

sbio->err = 0;

1967

} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=

1968

} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=

1968

spage->physical ||

1969

spage->physical ||

1969

sbio->logical + sbio->page_count * PAGE_SIZE !=

1970

sbio->logical + sbio->page_count * PAGE_SIZE !=

1970

spage->logical ||

1971

spage->logical ||

1971

sbio->dev != spage->dev) {

1972

sbio->dev != spage->dev) {

1972

scrub_submit(sctx);

1973

scrub_submit(sctx);

1973

goto again;

1974

goto again;

1974

}

1975

}

1975

1976

sbio->pagev[sbio->page_count] = spage;

1977

sbio->pagev[sbio->page_count] = spage;

1977

ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);

1978

ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);

1978

if (ret != PAGE_SIZE) {

1979

if (ret != PAGE_SIZE) {

1979

if (sbio->page_count < 1) {

1980

if (sbio->page_count < 1) {

1980

bio_put(sbio->bio);

1981

bio_put(sbio->bio);

1981

sbio->bio = NULL;

1982

sbio->bio = NULL;

1982

return -EIO;

1983

return -EIO;

1983

}

1984

}

1984

scrub_submit(sctx);

1985

scrub_submit(sctx);

1985

goto again;

1986

goto again;

1986

}

1987

}

1987

1988

scrub_block_get(sblock); /* one for the page added to the bio */

1989

scrub_block_get(sblock); /* one for the page added to the bio */

1989

atomic_inc(&sblock->outstanding_pages);

1990

atomic_inc(&sblock->outstanding_pages);

1990

sbio->page_count++;

1991

sbio->page_count++;

1991

if (sbio->page_count == sctx->pages_per_rd_bio)

1992

if (sbio->page_count == sctx->pages_per_rd_bio)

1992

scrub_submit(sctx);

1993

scrub_submit(sctx);

1993

1994

return 0;

1995

return 0;

1995

}

1996

}

1996

1997

static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

1998

static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,

1998

u64 physical, struct btrfs_device *dev, u64 flags,

1999

u64 physical, struct btrfs_device *dev, u64 flags,

1999

u64 gen, int mirror_num, u8 *csum, int force,

2000

u64 gen, int mirror_num, u8 *csum, int force,

2000

u64 physical_for_dev_replace)

2001

u64 physical_for_dev_replace)

2001

{

2002

{

2002

struct scrub_block *sblock;

2003

struct scrub_block *sblock;

2003

int index;

2004

int index;

2004

2005

sblock = kzalloc(sizeof(*sblock), GFP_NOFS);

2006

sblock = kzalloc(sizeof(*sblock), GFP_NOFS);

2006

if (!sblock) {

2007

if (!sblock) {

2007

spin_lock(&sctx->stat_lock);

2008

spin_lock(&sctx->stat_lock);

2008

sctx->stat.malloc_errors++;

2009

sctx->stat.malloc_errors++;

2009

spin_unlock(&sctx->stat_lock);

2010

spin_unlock(&sctx->stat_lock);

2010

return -ENOMEM;

2011

return -ENOMEM;

2011

}

2012

}

2012

2013

/* one ref inside this function, plus one for each page added to

2014

/* one ref inside this function, plus one for each page added to

2014

* a bio later on */

2015

* a bio later on */

2015

atomic_set(&sblock->ref_count, 1);

2016

atomic_set(&sblock->ref_count, 1);

2016

sblock->sctx = sctx;

2017

sblock->sctx = sctx;

2017

sblock->no_io_error_seen = 1;

2018

sblock->no_io_error_seen = 1;

2018

2019

for (index = 0; len > 0; index++) {

2020

for (index = 0; len > 0; index++) {

2020

struct scrub_page *spage;

2021

struct scrub_page *spage;

2021

u64 l = min_t(u64, len, PAGE_SIZE);

2022

u64 l = min_t(u64, len, PAGE_SIZE);

2022

2023

spage = kzalloc(sizeof(*spage), GFP_NOFS);

2024

spage = kzalloc(sizeof(*spage), GFP_NOFS);

2024

if (!spage) {

2025

if (!spage) {

2025

leave_nomem:

2026

leave_nomem:

2026

spin_lock(&sctx->stat_lock);

2027

spin_lock(&sctx->stat_lock);

2027

sctx->stat.malloc_errors++;

2028

sctx->stat.malloc_errors++;

2028

spin_unlock(&sctx->stat_lock);

2029

spin_unlock(&sctx->stat_lock);

2029

scrub_block_put(sblock);

2030

scrub_block_put(sblock);

2030

return -ENOMEM;

2031

return -ENOMEM;

2031

}

2032

}

2032

BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);

2033

BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);

2033

scrub_page_get(spage);

2034

scrub_page_get(spage);

2034

sblock->pagev[index] = spage;

2035

sblock->pagev[index] = spage;

2035

spage->sblock = sblock;

2036

spage->sblock = sblock;

2036

spage->dev = dev;

2037

spage->dev = dev;

2037

spage->flags = flags;

2038

spage->flags = flags;

2038

spage->generation = gen;

2039

spage->generation = gen;

2039

spage->logical = logical;

2040

spage->logical = logical;

2040

spage->physical = physical;

2041

spage->physical = physical;

2041

spage->physical_for_dev_replace = physical_for_dev_replace;

2042

spage->physical_for_dev_replace = physical_for_dev_replace;

2042

spage->mirror_num = mirror_num;

2043

spage->mirror_num = mirror_num;

2043

if (csum) {

2044

if (csum) {

2044

spage->have_csum = 1;

2045

spage->have_csum = 1;

2045

memcpy(spage->csum, csum, sctx->csum_size);

2046

memcpy(spage->csum, csum, sctx->csum_size);

2046

} else {

2047

} else {

2047

spage->have_csum = 0;

2048

spage->have_csum = 0;

2048

}

2049

}

2049

sblock->page_count++;

2050

sblock->page_count++;

2050

spage->page = alloc_page(GFP_NOFS);

2051

spage->page = alloc_page(GFP_NOFS);

2051

if (!spage->page)

2052

if (!spage->page)

2052

goto leave_nomem;

2053

goto leave_nomem;

2053

len -= l;

2054

len -= l;

2054

logical += l;

2055

logical += l;

2055

physical += l;

2056

physical += l;

2056

physical_for_dev_replace += l;

2057

physical_for_dev_replace += l;

2057

}

2058

}

2058

2059

WARN_ON(sblock->page_count == 0);

2060

WARN_ON(sblock->page_count == 0);

2060

for (index = 0; index < sblock->page_count; index++) {

2061

for (index = 0; index < sblock->page_count; index++) {

2061

struct scrub_page *spage = sblock->pagev[index];

2062

struct scrub_page *spage = sblock->pagev[index];

2062

int ret;

2063

int ret;

2063

2064

ret = scrub_add_page_to_rd_bio(sctx, spage);

2065

ret = scrub_add_page_to_rd_bio(sctx, spage);

2065

if (ret) {

2066

if (ret) {

2066

scrub_block_put(sblock);

2067

scrub_block_put(sblock);

2067

return ret;

2068

return ret;

2068

}

2069

}

2069

}

2070

}

2070

2071

if (force)

2072

if (force)

2072

scrub_submit(sctx);

2073

scrub_submit(sctx);

2073

2074

/* last one frees, either here or in bio completion for last page */

2075

/* last one frees, either here or in bio completion for last page */

2075

scrub_block_put(sblock);

2076

scrub_block_put(sblock);

2076

return 0;

2077

return 0;

2077

}

2078

}

2078

2079

static void scrub_bio_end_io(struct bio *bio, int err)

2080

static void scrub_bio_end_io(struct bio *bio, int err)

2080

{

2081

{

2081

struct scrub_bio *sbio = bio->bi_private;

2082

struct scrub_bio *sbio = bio->bi_private;

2082

struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

2083

struct btrfs_fs_info *fs_info = sbio->dev->dev_root->fs_info;

2083

2084

sbio->err = err;

2085

sbio->err = err;

2085

sbio->bio = bio;

2086

sbio->bio = bio;

2086

2087

btrfs_queue_work(fs_info->scrub_workers, &sbio->work);

2088

btrfs_queue_work(fs_info->scrub_workers, &sbio->work);

2088

}

2089

}

2089

2090

static void scrub_bio_end_io_worker(struct btrfs_work *work)

2091

static void scrub_bio_end_io_worker(struct btrfs_work *work)

2091

{

2092

{

2092

struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);

2093

struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);

2093

struct scrub_ctx *sctx = sbio->sctx;

2094

struct scrub_ctx *sctx = sbio->sctx;

2094

int i;

2095

int i;

2095

2096

BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);

2097

BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);

2097

if (sbio->err) {

2098

if (sbio->err) {

2098

for (i = 0; i < sbio->page_count; i++) {

2099

for (i = 0; i < sbio->page_count; i++) {

2099

struct scrub_page *spage = sbio->pagev[i];

2100

struct scrub_page *spage = sbio->pagev[i];

2100

2101

spage->io_error = 1;

2102

spage->io_error = 1;

2102

spage->sblock->no_io_error_seen = 0;

2103

spage->sblock->no_io_error_seen = 0;

2103

}

2104

}

2104

}

2105

}

2105

2106

/* now complete the scrub_block items that have all pages completed */

2107

/* now complete the scrub_block items that have all pages completed */

2107

for (i = 0; i < sbio->page_count; i++) {

2108

for (i = 0; i < sbio->page_count; i++) {

2108

struct scrub_page *spage = sbio->pagev[i];

2109

struct scrub_page *spage = sbio->pagev[i];

2109

struct scrub_block *sblock = spage->sblock;

2110

struct scrub_block *sblock = spage->sblock;

2110

2111

if (atomic_dec_and_test(&sblock->outstanding_pages))

2112

if (atomic_dec_and_test(&sblock->outstanding_pages))

2112

scrub_block_complete(sblock);

2113

scrub_block_complete(sblock);

2113

scrub_block_put(sblock);

2114

scrub_block_put(sblock);

2114

}

2115

}

2115

2116

bio_put(sbio->bio);

2117

bio_put(sbio->bio);

2117

sbio->bio = NULL;

2118

sbio->bio = NULL;

2118

spin_lock(&sctx->list_lock);

2119

spin_lock(&sctx->list_lock);

2119

sbio->next_free = sctx->first_free;

2120

sbio->next_free = sctx->first_free;

2120

sctx->first_free = sbio->index;

2121

sctx->first_free = sbio->index;

2121

spin_unlock(&sctx->list_lock);

2122

spin_unlock(&sctx->list_lock);

2122

2123

if (sctx->is_dev_replace &&

2124

if (sctx->is_dev_replace &&

2124

atomic_read(&sctx->wr_ctx.flush_all_writes)) {

2125

atomic_read(&sctx->wr_ctx.flush_all_writes)) {

2125

mutex_lock(&sctx->wr_ctx.wr_lock);

2126

mutex_lock(&sctx->wr_ctx.wr_lock);

2126

scrub_wr_submit(sctx);

2127

scrub_wr_submit(sctx);

2127

mutex_unlock(&sctx->wr_ctx.wr_lock);

2128

mutex_unlock(&sctx->wr_ctx.wr_lock);

2128

}

2129

}

2129

2130

scrub_pending_bio_dec(sctx);

2131

scrub_pending_bio_dec(sctx);

2131

}

2132

}

2132

2133

static void scrub_block_complete(struct scrub_block *sblock)

2134

static void scrub_block_complete(struct scrub_block *sblock)

2134

{

2135

{

2135

if (!sblock->no_io_error_seen) {

2136

if (!sblock->no_io_error_seen) {

2136

scrub_handle_errored_block(sblock);

2137

scrub_handle_errored_block(sblock);

2137

} else {

2138

} else {

2138

/*

2139

/*

2139

* if has checksum error, write via repair mechanism in

2140

* if has checksum error, write via repair mechanism in

2140

* dev replace case, otherwise write here in dev replace

2141

* dev replace case, otherwise write here in dev replace

2141

* case.

2142

* case.

2142

*/

2143

*/

2143

if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)

2144

if (!scrub_checksum(sblock) && sblock->sctx->is_dev_replace)

2144

scrub_write_block_to_dev_replace(sblock);

2145

scrub_write_block_to_dev_replace(sblock);

2145

}

2146

}

2146

}

2147

}

2147

2148

static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,

2149

static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,

2149

u8 *csum)

2150

u8 *csum)

2150

{

2151

{

2151

struct btrfs_ordered_sum *sum = NULL;

2152

struct btrfs_ordered_sum *sum = NULL;

2152

unsigned long index;

2153

unsigned long index;

2153

unsigned long num_sectors;

2154

unsigned long num_sectors;

2154

2155

while (!list_empty(&sctx->csum_list)) {

2156

while (!list_empty(&sctx->csum_list)) {

2156

sum = list_first_entry(&sctx->csum_list,

2157

sum = list_first_entry(&sctx->csum_list,

2157

struct btrfs_ordered_sum, list);

2158

struct btrfs_ordered_sum, list);

2158

if (sum->bytenr > logical)

2159

if (sum->bytenr > logical)

2159

return 0;

2160

return 0;

2160

if (sum->bytenr + sum->len > logical)

2161

if (sum->bytenr + sum->len > logical)

2161

break;

2162

break;

2162

2163

++sctx->stat.csum_discards;

2164

++sctx->stat.csum_discards;

2164

list_del(&sum->list);

2165

list_del(&sum->list);

2165

kfree(sum);

2166

kfree(sum);

2166

sum = NULL;

2167

sum = NULL;

2167

}

2168

}

2168

if (!sum)

2169

if (!sum)

2169

return 0;

2170

return 0;

2170

2171

index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;

2172

index = ((u32)(logical - sum->bytenr)) / sctx->sectorsize;

2172

num_sectors = sum->len / sctx->sectorsize;

2173

num_sectors = sum->len / sctx->sectorsize;

2173

memcpy(csum, sum->sums + index, sctx->csum_size);

2174

memcpy(csum, sum->sums + index, sctx->csum_size);

2174

if (index == num_sectors - 1) {

2175

if (index == num_sectors - 1) {

2175

list_del(&sum->list);

2176

list_del(&sum->list);

2176

kfree(sum);

2177

kfree(sum);

2177

}

2178

}

2178

return 1;

2179

return 1;

2179

}

2180

}

2180

2181

/* scrub extent tries to collect up to 64 kB for each bio */

2182

/* scrub extent tries to collect up to 64 kB for each bio */

2182

static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,

2183

static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,

2183

u64 physical, struct btrfs_device *dev, u64 flags,

2184

u64 physical, struct btrfs_device *dev, u64 flags,

2184

u64 gen, int mirror_num, u64 physical_for_dev_replace)

2185

u64 gen, int mirror_num, u64 physical_for_dev_replace)

2185

{

2186

{

2186

int ret;

2187

int ret;

2187

u8 csum[BTRFS_CSUM_SIZE];

2188

u8 csum[BTRFS_CSUM_SIZE];

2188

u32 blocksize;

2189

u32 blocksize;

2189

2190

if (flags & BTRFS_EXTENT_FLAG_DATA) {

2191

if (flags & BTRFS_EXTENT_FLAG_DATA) {

2191

blocksize = sctx->sectorsize;

2192

blocksize = sctx->sectorsize;

2192

spin_lock(&sctx->stat_lock);

2193

spin_lock(&sctx->stat_lock);

2193

sctx->stat.data_extents_scrubbed++;

2194

sctx->stat.data_extents_scrubbed++;

2194

sctx->stat.data_bytes_scrubbed += len;

2195

sctx->stat.data_bytes_scrubbed += len;

2195

spin_unlock(&sctx->stat_lock);

2196

spin_unlock(&sctx->stat_lock);

2196

} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {

2197

} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {

2197

WARN_ON(sctx->nodesize != sctx->leafsize);

2198

WARN_ON(sctx->nodesize != sctx->leafsize);

2198

blocksize = sctx->nodesize;

2199

blocksize = sctx->nodesize;

2199

spin_lock(&sctx->stat_lock);

2200

spin_lock(&sctx->stat_lock);

2200

sctx->stat.tree_extents_scrubbed++;

2201

sctx->stat.tree_extents_scrubbed++;

2201

sctx->stat.tree_bytes_scrubbed += len;

2202

sctx->stat.tree_bytes_scrubbed += len;

2202

spin_unlock(&sctx->stat_lock);

2203

spin_unlock(&sctx->stat_lock);

2203

} else {

2204

} else {

2204

blocksize = sctx->sectorsize;

2205

blocksize = sctx->sectorsize;

2205

WARN_ON(1);

2206

WARN_ON(1);

2206

}

2207

}

2207

2208

while (len) {

2209

while (len) {

2209

u64 l = min_t(u64, len, blocksize);

2210

u64 l = min_t(u64, len, blocksize);

2210

int have_csum = 0;

2211

int have_csum = 0;

2211

2212

if (flags & BTRFS_EXTENT_FLAG_DATA) {

2213

if (flags & BTRFS_EXTENT_FLAG_DATA) {

2213

/* push csums to sbio */

2214

/* push csums to sbio */

2214

have_csum = scrub_find_csum(sctx, logical, l, csum);

2215

have_csum = scrub_find_csum(sctx, logical, l, csum);

2215

if (have_csum == 0)

2216

if (have_csum == 0)

2216

++sctx->stat.no_csum;

2217

++sctx->stat.no_csum;

2217

if (sctx->is_dev_replace && !have_csum) {

2218

if (sctx->is_dev_replace && !have_csum) {

2218

ret = copy_nocow_pages(sctx, logical, l,

2219

ret = copy_nocow_pages(sctx, logical, l,

2219

mirror_num,

2220

mirror_num,

2220

physical_for_dev_replace);

2221

physical_for_dev_replace);

2221

goto behind_scrub_pages;

2222

goto behind_scrub_pages;

2222

}

2223

}

2223

}

2224

}

2224

ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,

2225

ret = scrub_pages(sctx, logical, l, physical, dev, flags, gen,

2225

mirror_num, have_csum ? csum : NULL, 0,

2226

mirror_num, have_csum ? csum : NULL, 0,

2226

physical_for_dev_replace);

2227

physical_for_dev_replace);

2227

behind_scrub_pages:

2228

behind_scrub_pages:

2228

if (ret)

2229

if (ret)

2229

return ret;

2230

return ret;

2230

len -= l;

2231

len -= l;

2231

logical += l;

2232

logical += l;

2232

physical += l;

2233

physical += l;

2233

physical_for_dev_replace += l;

2234

physical_for_dev_replace += l;

2234

}

2235

}

2235

return 0;

2236

return 0;

2236

}

2237

}

2237

2238

/*

2239

/*

2239

* Given a physical address, this will calculate it's

2240

* Given a physical address, this will calculate it's

2240

* logical offset. if this is a parity stripe, it will return

2241

* logical offset. if this is a parity stripe, it will return

2241

* the most left data stripe's logical offset.

2242

* the most left data stripe's logical offset.

2242

*

2243

*

2243

* return 0 if it is a data stripe, 1 means parity stripe.

2244

* return 0 if it is a data stripe, 1 means parity stripe.

2244

*/

2245

*/

2245

static int get_raid56_logic_offset(u64 physical, int num,

2246

static int get_raid56_logic_offset(u64 physical, int num,

2246

struct map_lookup *map, u64 *offset)

2247

struct map_lookup *map, u64 *offset)

2247

{

2248

{

2248

int i;

2249

int i;

2249

int j = 0;

2250

int j = 0;

2250

u64 stripe_nr;

2251

u64 stripe_nr;

2251

u64 last_offset;

2252

u64 last_offset;

2252

int stripe_index;

2253

int stripe_index;

2253

int rot;

2254

int rot;

2254

2255

last_offset = (physical - map->stripes[num].physical) *

2256

last_offset = (physical - map->stripes[num].physical) *

2256

nr_data_stripes(map);

2257

nr_data_stripes(map);

2257

*offset = last_offset;

2258

*offset = last_offset;

2258

for (i = 0; i < nr_data_stripes(map); i++) {

2259

for (i = 0; i < nr_data_stripes(map); i++) {

2259

*offset = last_offset + i * map->stripe_len;

2260

*offset = last_offset + i * map->stripe_len;

2260

2261

stripe_nr = *offset;

2262

stripe_nr = *offset;

2262

do_div(stripe_nr, map->stripe_len);

2263

do_div(stripe_nr, map->stripe_len);

2263

do_div(stripe_nr, nr_data_stripes(map));

2264

do_div(stripe_nr, nr_data_stripes(map));

2264

2265

/* Work out the disk rotation on this stripe-set */

2266

/* Work out the disk rotation on this stripe-set */

2266

rot = do_div(stripe_nr, map->num_stripes);

2267

rot = do_div(stripe_nr, map->num_stripes);

2267

/* calculate which stripe this data locates */

2268

/* calculate which stripe this data locates */

2268

rot += i;

2269

rot += i;

2269

stripe_index = rot % map->num_stripes;

2270

stripe_index = rot % map->num_stripes;

2270

if (stripe_index == num)

2271

if (stripe_index == num)

2271

return 0;

2272

return 0;

2272

if (stripe_index < num)

2273

if (stripe_index < num)

2273

j++;

2274

j++;

2274

}

2275

}

2275

*offset = last_offset + j * map->stripe_len;

2276

*offset = last_offset + j * map->stripe_len;

2276

return 1;

2277

return 1;

2277

}

2278

}

2278

2279

static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,

2280

static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,

2280

struct map_lookup *map,

2281

struct map_lookup *map,

2281

struct btrfs_device *scrub_dev,

2282

struct btrfs_device *scrub_dev,

2282

int num, u64 base, u64 length,

2283

int num, u64 base, u64 length,

2283

int is_dev_replace)

2284

int is_dev_replace)

2284

{

2285

{

2285

struct btrfs_path *path;

2286

struct btrfs_path *path;

2286

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

2287

struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;

2287

struct btrfs_root *root = fs_info->extent_root;

2288

struct btrfs_root *root = fs_info->extent_root;

2288

struct btrfs_root *csum_root = fs_info->csum_root;

2289

struct btrfs_root *csum_root = fs_info->csum_root;

2289

struct btrfs_extent_item *extent;

2290

struct btrfs_extent_item *extent;

2290

struct blk_plug plug;

2291

struct blk_plug plug;

2291

u64 flags;

2292

u64 flags;

2292

int ret;

2293

int ret;

2293

int slot;

2294

int slot;

2294

u64 nstripes;

2295

u64 nstripes;

2295

struct extent_buffer *l;

2296

struct extent_buffer *l;

2296

struct btrfs_key key;

2297

struct btrfs_key key;

2297

u64 physical;

2298

u64 physical;

2298

u64 logical;

2299

u64 logical;

2299

u64 logic_end;

2300

u64 logic_end;

2300

u64 physical_end;

2301

u64 physical_end;

2301

u64 generation;

2302

u64 generation;

2302

int mirror_num;

2303

int mirror_num;

2303

struct reada_control *reada1;

2304

struct reada_control *reada1;

2304

struct reada_control *reada2;

2305

struct reada_control *reada2;

2305

struct btrfs_key key_start;

2306

struct btrfs_key key_start;

2306

struct btrfs_key key_end;

2307

struct btrfs_key key_end;

2307

u64 increment = map->stripe_len;

2308

u64 increment = map->stripe_len;

2308

u64 offset;

2309

u64 offset;

2309

u64 extent_logical;

2310

u64 extent_logical;

2310

u64 extent_physical;

2311

u64 extent_physical;

2311

u64 extent_len;

2312

u64 extent_len;

2312

struct btrfs_device *extent_dev;

2313

struct btrfs_device *extent_dev;

2313

int extent_mirror_num;

2314

int extent_mirror_num;

2314

int stop_loop = 0;

2315

int stop_loop = 0;

2315

2316

nstripes = length;

2317

nstripes = length;

2317

physical = map->stripes[num].physical;

2318

physical = map->stripes[num].physical;

2318

offset = 0;

2319

offset = 0;

2319

do_div(nstripes, map->stripe_len);

2320

do_div(nstripes, map->stripe_len);

2320

if (map->type & BTRFS_BLOCK_GROUP_RAID0) {

2321

if (map->type & BTRFS_BLOCK_GROUP_RAID0) {

2321

offset = map->stripe_len * num;

2322

offset = map->stripe_len * num;

2322

increment = map->stripe_len * map->num_stripes;

2323

increment = map->stripe_len * map->num_stripes;

2323

mirror_num = 1;

2324

mirror_num = 1;

2324

} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {

2325

} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {

2325

int factor = map->num_stripes / map->sub_stripes;

2326

int factor = map->num_stripes / map->sub_stripes;

2326

offset = map->stripe_len * (num / map->sub_stripes);

2327

offset = map->stripe_len * (num / map->sub_stripes);

2327

increment = map->stripe_len * factor;

2328

increment = map->stripe_len * factor;

2328

mirror_num = num % map->sub_stripes + 1;

2329

mirror_num = num % map->sub_stripes + 1;

2329

} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {

2330

} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {

2330

increment = map->stripe_len;

2331

increment = map->stripe_len;

2331

mirror_num = num % map->num_stripes + 1;

2332

mirror_num = num % map->num_stripes + 1;

2332

} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {

2333

} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {

2333

increment = map->stripe_len;

2334

increment = map->stripe_len;

2334

mirror_num = num % map->num_stripes + 1;

2335

mirror_num = num % map->num_stripes + 1;

2335

} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2336

} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2336

BTRFS_BLOCK_GROUP_RAID6)) {

2337

BTRFS_BLOCK_GROUP_RAID6)) {

2337

get_raid56_logic_offset(physical, num, map, &offset);

2338

get_raid56_logic_offset(physical, num, map, &offset);

2338

increment = map->stripe_len * nr_data_stripes(map);

2339

increment = map->stripe_len * nr_data_stripes(map);

2339

mirror_num = 1;

2340

mirror_num = 1;

2340

} else {

2341

} else {

2341

increment = map->stripe_len;

2342

increment = map->stripe_len;

2342

mirror_num = 1;

2343

mirror_num = 1;

2343

}

2344

}

2344

2345

path = btrfs_alloc_path();

2346

path = btrfs_alloc_path();

2346

if (!path)

2347

if (!path)

2347

return -ENOMEM;

2348

return -ENOMEM;

2348

2349

/*

2350

/*

2350

* work on commit root. The related disk blocks are static as

2351

* work on commit root. The related disk blocks are static as

2351

* long as COW is applied. This means, it is save to rewrite

2352

* long as COW is applied. This means, it is save to rewrite

2352

* them to repair disk errors without any race conditions

2353

* them to repair disk errors without any race conditions

2353

*/

2354

*/

2354

path->search_commit_root = 1;

2355

path->search_commit_root = 1;

2355

path->skip_locking = 1;

2356

path->skip_locking = 1;

2356

2357

/*

2358

/*

2358

* trigger the readahead for extent tree csum tree and wait for

2359

* trigger the readahead for extent tree csum tree and wait for

2359

* completion. During readahead, the scrub is officially paused

2360

* completion. During readahead, the scrub is officially paused

2360

* to not hold off transaction commits

2361

* to not hold off transaction commits

2361

*/

2362

*/

2362

logical = base + offset;

2363

logical = base + offset;

2363

physical_end = physical + nstripes * map->stripe_len;

2364

physical_end = physical + nstripes * map->stripe_len;

2364

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2365

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2365

BTRFS_BLOCK_GROUP_RAID6)) {

2366

BTRFS_BLOCK_GROUP_RAID6)) {

2366

get_raid56_logic_offset(physical_end, num,

2367

get_raid56_logic_offset(physical_end, num,

2367

map, &logic_end);

2368

map, &logic_end);

2368

logic_end += base;

2369

logic_end += base;

2369

} else {

2370

} else {

2370

logic_end = logical + increment * nstripes;

2371

logic_end = logical + increment * nstripes;

2371

}

2372

}

2372

wait_event(sctx->list_wait,

2373

wait_event(sctx->list_wait,

2373

atomic_read(&sctx->bios_in_flight) == 0);

2374

atomic_read(&sctx->bios_in_flight) == 0);

2374

scrub_blocked_if_needed(fs_info);

2375

scrub_blocked_if_needed(fs_info);

2375

2376

/* FIXME it might be better to start readahead at commit root */

2377

/* FIXME it might be better to start readahead at commit root */

2377

key_start.objectid = logical;

2378

key_start.objectid = logical;

2378

key_start.type = BTRFS_EXTENT_ITEM_KEY;

2379

key_start.type = BTRFS_EXTENT_ITEM_KEY;

2379

key_start.offset = (u64)0;

2380

key_start.offset = (u64)0;

2380

key_end.objectid = logic_end;

2381

key_end.objectid = logic_end;

2381

key_end.type = BTRFS_METADATA_ITEM_KEY;

2382

key_end.type = BTRFS_METADATA_ITEM_KEY;

2382

key_end.offset = (u64)-1;

2383

key_end.offset = (u64)-1;

2383

reada1 = btrfs_reada_add(root, &key_start, &key_end);

2384

reada1 = btrfs_reada_add(root, &key_start, &key_end);

2384

2385

key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;

2386

key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;

2386

key_start.type = BTRFS_EXTENT_CSUM_KEY;

2387

key_start.type = BTRFS_EXTENT_CSUM_KEY;

2387

key_start.offset = logical;

2388

key_start.offset = logical;

2388

key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;

2389

key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;

2389

key_end.type = BTRFS_EXTENT_CSUM_KEY;

2390

key_end.type = BTRFS_EXTENT_CSUM_KEY;

2390

key_end.offset = logic_end;

2391

key_end.offset = logic_end;

2391

reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

2392

reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);

2392

2393

if (!IS_ERR(reada1))

2394

if (!IS_ERR(reada1))

2394

btrfs_reada_wait(reada1);

2395

btrfs_reada_wait(reada1);

2395

if (!IS_ERR(reada2))

2396

if (!IS_ERR(reada2))

2396

btrfs_reada_wait(reada2);

2397

btrfs_reada_wait(reada2);

2397

2398

2399

/*

2400

/*

2400

* collect all data csums for the stripe to avoid seeking during

2401

* collect all data csums for the stripe to avoid seeking during

2401

* the scrub. This might currently (crc32) end up to be about 1MB

2402

* the scrub. This might currently (crc32) end up to be about 1MB

2402

*/

2403

*/

2403

blk_start_plug(&plug);

2404

blk_start_plug(&plug);

2404

2405

/*

2406

/*

2406

* now find all extents for each stripe and scrub them

2407

* now find all extents for each stripe and scrub them

2407

*/

2408

*/

2408

ret = 0;

2409

ret = 0;

2409

while (physical < physical_end) {

2410

while (physical < physical_end) {

2410

/* for raid56, we skip parity stripe */

2411

/* for raid56, we skip parity stripe */

2411

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2412

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2412

BTRFS_BLOCK_GROUP_RAID6)) {

2413

BTRFS_BLOCK_GROUP_RAID6)) {

2413

ret = get_raid56_logic_offset(physical, num,

2414

ret = get_raid56_logic_offset(physical, num,

2414

map, &logical);

2415

map, &logical);

2415

logical += base;

2416

logical += base;

2416

if (ret)

2417

if (ret)

2417

goto skip;

2418

goto skip;

2418

}

2419

}

2419

/*

2420

/*

2420

* canceled?

2421

* canceled?

2421

*/

2422

*/

2422

if (atomic_read(&fs_info->scrub_cancel_req) ||

2423

if (atomic_read(&fs_info->scrub_cancel_req) ||

2423

atomic_read(&sctx->cancel_req)) {

2424

atomic_read(&sctx->cancel_req)) {

2424

ret = -ECANCELED;

2425

ret = -ECANCELED;

2425

goto out;

2426

goto out;

2426

}

2427

}

2427

/*

2428

/*

2428

* check to see if we have to pause

2429

* check to see if we have to pause

2429

*/

2430

*/

2430

if (atomic_read(&fs_info->scrub_pause_req)) {

2431

if (atomic_read(&fs_info->scrub_pause_req)) {

2431

/* push queued extents */

2432

/* push queued extents */

2432

atomic_set(&sctx->wr_ctx.flush_all_writes, 1);

2433

atomic_set(&sctx->wr_ctx.flush_all_writes, 1);

2433

scrub_submit(sctx);

2434

scrub_submit(sctx);

2434

mutex_lock(&sctx->wr_ctx.wr_lock);

2435

mutex_lock(&sctx->wr_ctx.wr_lock);

2435

scrub_wr_submit(sctx);

2436

scrub_wr_submit(sctx);

2436

mutex_unlock(&sctx->wr_ctx.wr_lock);

2437

mutex_unlock(&sctx->wr_ctx.wr_lock);

2437

wait_event(sctx->list_wait,

2438

wait_event(sctx->list_wait,

2438

atomic_read(&sctx->bios_in_flight) == 0);

2439

atomic_read(&sctx->bios_in_flight) == 0);

2439

atomic_set(&sctx->wr_ctx.flush_all_writes, 0);

2440

atomic_set(&sctx->wr_ctx.flush_all_writes, 0);

2440

scrub_blocked_if_needed(fs_info);

2441

scrub_blocked_if_needed(fs_info);

2441

}

2442

}

2442

2443

if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))

2444

if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))

2444

key.type = BTRFS_METADATA_ITEM_KEY;

2445

key.type = BTRFS_METADATA_ITEM_KEY;

2445

else

2446

else

2446

key.type = BTRFS_EXTENT_ITEM_KEY;

2447

key.type = BTRFS_EXTENT_ITEM_KEY;

2447

key.objectid = logical;

2448

key.objectid = logical;

2448

key.offset = (u64)-1;

2449

key.offset = (u64)-1;

2449

2450

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

2451

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

2451

if (ret < 0)

2452

if (ret < 0)

2452

goto out;

2453

goto out;

2453

2454

if (ret > 0) {

2455

if (ret > 0) {

2455

ret = btrfs_previous_extent_item(root, path, 0);

2456

ret = btrfs_previous_extent_item(root, path, 0);

2456

if (ret < 0)

2457

if (ret < 0)

2457

goto out;

2458

goto out;

2458

if (ret > 0) {

2459

if (ret > 0) {

2459

/* there's no smaller item, so stick with the

2460

/* there's no smaller item, so stick with the

2460

* larger one */

2461

* larger one */

2461

btrfs_release_path(path);

2462

btrfs_release_path(path);

2462

ret = btrfs_search_slot(NULL, root, &key,

2463

ret = btrfs_search_slot(NULL, root, &key,

2463

path, 0, 0);

2464

path, 0, 0);

2464

if (ret < 0)

2465

if (ret < 0)

2465

goto out;

2466

goto out;

2466

}

2467

}

2467

}

2468

}

2468

2469

stop_loop = 0;

2470

stop_loop = 0;

2470

while (1) {

2471

while (1) {

2471

u64 bytes;

2472

u64 bytes;

2472

2473

l = path->nodes[0];

2474

l = path->nodes[0];

2474

slot = path->slots[0];

2475

slot = path->slots[0];

2475

if (slot >= btrfs_header_nritems(l)) {

2476

if (slot >= btrfs_header_nritems(l)) {

2476

ret = btrfs_next_leaf(root, path);

2477

ret = btrfs_next_leaf(root, path);

2477

if (ret == 0)

2478

if (ret == 0)

2478

continue;

2479

continue;

2479

if (ret < 0)

2480

if (ret < 0)

2480

goto out;

2481

goto out;

2481

2482

stop_loop = 1;

2483

stop_loop = 1;

2483

break;

2484

break;

2484

}

2485

}

2485

btrfs_item_key_to_cpu(l, &key, slot);

2486

btrfs_item_key_to_cpu(l, &key, slot);

2486

2487

if (key.type == BTRFS_METADATA_ITEM_KEY)

2488

if (key.type == BTRFS_METADATA_ITEM_KEY)

2488

bytes = root->leafsize;

2489

bytes = root->leafsize;

2489

else

2490

else

2490

bytes = key.offset;

2491

bytes = key.offset;

2491

2492

if (key.objectid + bytes <= logical)

2493

if (key.objectid + bytes <= logical)

2493

goto next;

2494

goto next;

2494

2495

if (key.type != BTRFS_EXTENT_ITEM_KEY &&

2496

if (key.type != BTRFS_EXTENT_ITEM_KEY &&

2496

key.type != BTRFS_METADATA_ITEM_KEY)

2497

key.type != BTRFS_METADATA_ITEM_KEY)

2497

goto next;

2498

goto next;

2498

2499

if (key.objectid >= logical + map->stripe_len) {

2500

if (key.objectid >= logical + map->stripe_len) {

2500

/* out of this device extent */

2501

/* out of this device extent */

2501

if (key.objectid >= logic_end)

2502

if (key.objectid >= logic_end)

2502

stop_loop = 1;

2503

stop_loop = 1;

2503

break;

2504

break;

2504

}

2505

}

2505

2506

extent = btrfs_item_ptr(l, slot,

2507

extent = btrfs_item_ptr(l, slot,

2507

struct btrfs_extent_item);

2508

struct btrfs_extent_item);

2508

flags = btrfs_extent_flags(l, extent);

2509

flags = btrfs_extent_flags(l, extent);

2509

generation = btrfs_extent_generation(l, extent);

2510

generation = btrfs_extent_generation(l, extent);

2510

2511

if (key.objectid < logical &&

2512

if (key.objectid < logical &&

2512

(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {

2513

(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {

2513

btrfs_err(fs_info,

2514

btrfs_err(fs_info,

2514

"scrub: tree block %llu spanning "

2515

"scrub: tree block %llu spanning "

2515

"stripes, ignored. logical=%llu",

2516

"stripes, ignored. logical=%llu",

2516

key.objectid, logical);

2517

key.objectid, logical);

2517

goto next;

2518

goto next;

2518

}

2519

}

2519

2520

again:

2521

again:

2521

extent_logical = key.objectid;

2522

extent_logical = key.objectid;

2522

extent_len = bytes;

2523

extent_len = bytes;

2523

2524

/*

2525

/*

2525

* trim extent to this stripe

2526

* trim extent to this stripe

2526

*/

2527

*/

2527

if (extent_logical < logical) {

2528

if (extent_logical < logical) {

2528

extent_len -= logical - extent_logical;

2529

extent_len -= logical - extent_logical;

2529

extent_logical = logical;

2530

extent_logical = logical;

2530

}

2531

}

2531

if (extent_logical + extent_len >

2532

if (extent_logical + extent_len >

2532

logical + map->stripe_len) {

2533

logical + map->stripe_len) {

2533

extent_len = logical + map->stripe_len -

2534

extent_len = logical + map->stripe_len -

2534

extent_logical;

2535

extent_logical;

2535

}

2536

}

2536

2537

extent_physical = extent_logical - logical + physical;

2538

extent_physical = extent_logical - logical + physical;

2538

extent_dev = scrub_dev;

2539

extent_dev = scrub_dev;

2539

extent_mirror_num = mirror_num;

2540

extent_mirror_num = mirror_num;

2540

if (is_dev_replace)

2541

if (is_dev_replace)

2541

scrub_remap_extent(fs_info, extent_logical,

2542

scrub_remap_extent(fs_info, extent_logical,

2542

extent_len, &extent_physical,

2543

extent_len, &extent_physical,

2543

&extent_dev,

2544

&extent_dev,

2544

&extent_mirror_num);

2545

&extent_mirror_num);

2545

2546

ret = btrfs_lookup_csums_range(csum_root, logical,

2547

ret = btrfs_lookup_csums_range(csum_root, logical,

2547

logical + map->stripe_len - 1,

2548

logical + map->stripe_len - 1,

2548

&sctx->csum_list, 1);

2549

&sctx->csum_list, 1);

2549

if (ret)

2550

if (ret)

2550

goto out;

2551

goto out;

2551

2552

ret = scrub_extent(sctx, extent_logical, extent_len,

2553

ret = scrub_extent(sctx, extent_logical, extent_len,

2553

extent_physical, extent_dev, flags,

2554

extent_physical, extent_dev, flags,

2554

generation, extent_mirror_num,

2555

generation, extent_mirror_num,

2555

extent_logical - logical + physical);

2556

extent_logical - logical + physical);

2556

if (ret)

2557

if (ret)

2557

goto out;

2558

goto out;

2558

2559

scrub_free_csums(sctx);

2560

scrub_free_csums(sctx);

2560

if (extent_logical + extent_len <

2561

if (extent_logical + extent_len <

2561

key.objectid + bytes) {

2562

key.objectid + bytes) {

2562

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2563

if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |

2563

BTRFS_BLOCK_GROUP_RAID6)) {

2564

BTRFS_BLOCK_GROUP_RAID6)) {

2564

/*

2565

/*

2565

* loop until we find next data stripe

2566

* loop until we find next data stripe

2566

* or we have finished all stripes.

2567

* or we have finished all stripes.

2567

*/

2568

*/

2568

do {

2569

do {

2569

physical += map->stripe_len;

2570

physical += map->stripe_len;

2570

ret = get_raid56_logic_offset(

2571

ret = get_raid56_logic_offset(

2571

physical, num,

2572

physical, num,

2572

map, &logical);

2573

map, &logical);

2573

logical += base;

2574

logical += base;

2574

} while (physical < physical_end && ret);

2575

} while (physical < physical_end && ret);

2575

} else {

2576

} else {

2576

physical += map->stripe_len;

2577

physical += map->stripe_len;

2577

logical += increment;

2578

logical += increment;

2578

}

2579

}

2579

if (logical < key.objectid + bytes) {

2580

if (logical < key.objectid + bytes) {

2580

cond_resched();

2581

cond_resched();

2581

goto again;

2582

goto again;

2582

}

2583

}

2583

2584

if (physical >= physical_end) {

2585

if (physical >= physical_end) {

2585

stop_loop = 1;

2586

stop_loop = 1;

2586

break;

2587

break;

2587

}

2588

}

2588

}

2589

}

2589

path->slots[0]++;

2591

path->slots[0]++;

2591

}

2592

}

2592

btrfs_release_path(path);

2593

btrfs_release_path(path);

2593

skip:

2594

skip:

2594

logical += increment;

2595

logical += increment;

2595

physical += map->stripe_len;

2596

physical += map->stripe_len;

2596

spin_lock(&sctx->stat_lock);

2597

spin_lock(&sctx->stat_lock);

2597

if (stop_loop)

2598

if (stop_loop)

2598

sctx->stat.last_physical = map->stripes[num].physical +

2599

sctx->stat.last_physical = map->stripes[num].physical +

2599

length;

2600

length;

2600

else

2601

else

2601

sctx->stat.last_physical = physical;

2602

sctx->stat.last_physical = physical;

2602

spin_unlock(&sctx->stat_lock);

2603

spin_unlock(&sctx->stat_lock);

2603

if (stop_loop)

2604

if (stop_loop)

2604

break;

2605

break;

2605

}

2606

}

2606

out:

2607

out:

2607

/* push queued extents */

2608

/* push queued extents */

2608

scrub_submit(sctx);

2609

scrub_submit(sctx);

2609

mutex_lock(&sctx->wr_ctx.wr_lock);

2610

mutex_lock(&sctx->wr_ctx.wr_lock);

2610

scrub_wr_submit(sctx);

2611

scrub_wr_submit(sctx);

2611

mutex_unlock(&sctx->wr_ctx.wr_lock);

2612

mutex_unlock(&sctx->wr_ctx.wr_lock);

2612

2613

blk_finish_plug(&plug);

2614

blk_finish_plug(&plug);

2614

btrfs_free_path(path);

2615

btrfs_free_path(path);

2615

return ret < 0 ? ret : 0;

2616

return ret < 0 ? ret : 0;

2616

}

2617

}

2617

2618

static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,

2619

static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,

2619

struct btrfs_device *scrub_dev,

2620

struct btrfs_device *scrub_dev,

2620

u64 chunk_tree, u64 chunk_objectid,

2621

u64 chunk_tree, u64 chunk_objectid,

2621

u64 chunk_offset, u64 length,

2622

u64 chunk_offset, u64 length,

2622

u64 dev_offset, int is_dev_replace)

2623

u64 dev_offset, int is_dev_replace)

2623

{

2624

{

2624

struct btrfs_mapping_tree *map_tree =

2625

struct btrfs_mapping_tree *map_tree =

2625

&sctx->dev_root->fs_info->mapping_tree;

2626

&sctx->dev_root->fs_info->mapping_tree;

2626

struct map_lookup *map;

2627

struct map_lookup *map;

2627

struct extent_map *em;

2628

struct extent_map *em;

2628

int i;

2629

int i;

2629

int ret = 0;

2630

int ret = 0;

2630

2631

read_lock(&map_tree->map_tree.lock);

2632

read_lock(&map_tree->map_tree.lock);

2632

em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);

2633

em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1);

2633

read_unlock(&map_tree->map_tree.lock);

2634

read_unlock(&map_tree->map_tree.lock);

2634

2635

if (!em)

2636

if (!em)

2636

return -EINVAL;

2637

return -EINVAL;

2637

2638

map = (struct map_lookup *)em->bdev;

2639

map = (struct map_lookup *)em->bdev;

2639

if (em->start != chunk_offset)

2640

if (em->start != chunk_offset)

2640

goto out;

2641

goto out;

2641

2642

if (em->len < length)

2643

if (em->len < length)

2643

goto out;

2644

goto out;

2644

2645

for (i = 0; i < map->num_stripes; ++i) {

2646

for (i = 0; i < map->num_stripes; ++i) {

2646

if (map->stripes[i].dev->bdev == scrub_dev->bdev &&

2647

if (map->stripes[i].dev->bdev == scrub_dev->bdev &&

2647

map->stripes[i].physical == dev_offset) {

2648

map->stripes[i].physical == dev_offset) {

2648

ret = scrub_stripe(sctx, map, scrub_dev, i,

2649

ret = scrub_stripe(sctx, map, scrub_dev, i,

2649

chunk_offset, length,

2650

chunk_offset, length,

2650

is_dev_replace);

2651

is_dev_replace);

2651

if (ret)

2652

if (ret)

2652

goto out;

2653

goto out;

2653

}

2654

}

2654

}

2655

}

2655

out:

2656

out:

2656

free_extent_map(em);

2657

free_extent_map(em);

2657

2658

return ret;

2659

return ret;

2659

}

2660

}

2660

2661

static noinline_for_stack

2662

static noinline_for_stack

2662

int scrub_enumerate_chunks(struct scrub_ctx *sctx,

2663

int scrub_enumerate_chunks(struct scrub_ctx *sctx,

2663

struct btrfs_device *scrub_dev, u64 start, u64 end,

2664

struct btrfs_device *scrub_dev, u64 start, u64 end,

2664

int is_dev_replace)

2665

int is_dev_replace)

2665

{

2666

{

2666

struct btrfs_dev_extent *dev_extent = NULL;

2667

struct btrfs_dev_extent *dev_extent = NULL;

2667

struct btrfs_path *path;

2668

struct btrfs_path *path;

2668

struct btrfs_root *root = sctx->dev_root;

2669

struct btrfs_root *root = sctx->dev_root;

2669

struct btrfs_fs_info *fs_info = root->fs_info;

2670

struct btrfs_fs_info *fs_info = root->fs_info;

2670

u64 length;

2671

u64 length;

2671

u64 chunk_tree;

2672

u64 chunk_tree;

2672

u64 chunk_objectid;

2673

u64 chunk_objectid;

2673

u64 chunk_offset;

2674

u64 chunk_offset;

2674

int ret;

2675

int ret;

2675

int slot;

2676

int slot;

2676

struct extent_buffer *l;

2677

struct extent_buffer *l;

2677

struct btrfs_key key;

2678

struct btrfs_key key;

2678

struct btrfs_key found_key;

2679

struct btrfs_key found_key;

2679

struct btrfs_block_group_cache *cache;

2680

struct btrfs_block_group_cache *cache;

2680

struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

2681

struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;

2681

2682

path = btrfs_alloc_path();

2683

path = btrfs_alloc_path();

2683

if (!path)

2684

if (!path)

2684

return -ENOMEM;

2685

return -ENOMEM;

2685

2686

path->reada = 2;

2687

path->reada = 2;

2687

path->search_commit_root = 1;

2688

path->search_commit_root = 1;

2688

path->skip_locking = 1;

2689

path->skip_locking = 1;

2689

2690

key.objectid = scrub_dev->devid;

2691

key.objectid = scrub_dev->devid;

2691

key.offset = 0ull;

2692

key.offset = 0ull;

2692

key.type = BTRFS_DEV_EXTENT_KEY;

2693

key.type = BTRFS_DEV_EXTENT_KEY;

2693

2694

while (1) {

2695

while (1) {

2695

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

2696

ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);

2696

if (ret < 0)

2697

if (ret < 0)

2697

break;

2698

break;

2698

if (ret > 0) {

2699

if (ret > 0) {

2699

if (path->slots[0] >=

2700

if (path->slots[0] >=

2700

btrfs_header_nritems(path->nodes[0])) {

2701

btrfs_header_nritems(path->nodes[0])) {

2701

ret = btrfs_next_leaf(root, path);

2702

ret = btrfs_next_leaf(root, path);

2702

if (ret)

2703

if (ret)

2703

break;

2704

break;

2704

}

2705

}

2705

}

2706

}

2706

2707

l = path->nodes[0];

2708

l = path->nodes[0];

2708

slot = path->slots[0];

2709

slot = path->slots[0];

2709

2710

btrfs_item_key_to_cpu(l, &found_key, slot);

2711

btrfs_item_key_to_cpu(l, &found_key, slot);

2711

2712

if (found_key.objectid != scrub_dev->devid)

2713

if (found_key.objectid != scrub_dev->devid)

2713

break;

2714

break;

2714

2715

if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)

2716

if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)

2716

break;

2717

break;

2717

2718

if (found_key.offset >= end)

2719

if (found_key.offset >= end)

2719

break;

2720

break;

2720

2721

if (found_key.offset < key.offset)

2722

if (found_key.offset < key.offset)

2722

break;

2723

break;

2723

2724

dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);

2725

dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);

2725

length = btrfs_dev_extent_length(l, dev_extent);

2726

length = btrfs_dev_extent_length(l, dev_extent);

2726

2727

if (found_key.offset + length <= start) {

2728

if (found_key.offset + length <= start) {

2728

key.offset = found_key.offset + length;

2729

key.offset = found_key.offset + length;

2729

btrfs_release_path(path);

2730

btrfs_release_path(path);

2730

continue;

2731

continue;

2731

}

2732

}

2732

2733

chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);

2734

chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);

2734

chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);

2735

chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);

2735

chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

2736

chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);

2736

2737

/*

2738

/*

2738

* get a reference on the corresponding block group to prevent

2739

* get a reference on the corresponding block group to prevent

2739

* the chunk from going away while we scrub it

2740

* the chunk from going away while we scrub it

2740

*/

2741

*/

2741

cache = btrfs_lookup_block_group(fs_info, chunk_offset);

2742

cache = btrfs_lookup_block_group(fs_info, chunk_offset);

2742

if (!cache) {

2743

if (!cache) {

2743

ret = -ENOENT;

2744

ret = -ENOENT;

2744

break;

2745

break;

2745

}

2746

}

2746

dev_replace->cursor_right = found_key.offset + length;

2747

dev_replace->cursor_right = found_key.offset + length;

2747

dev_replace->cursor_left = found_key.offset;

2748

dev_replace->cursor_left = found_key.offset;

2748

dev_replace->item_needs_writeback = 1;

2749

dev_replace->item_needs_writeback = 1;

2749

ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,

2750

ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,

2750

chunk_offset, length, found_key.offset,

2751

chunk_offset, length, found_key.offset,

2751

is_dev_replace);

2752

is_dev_replace);

2752

2753

/*

2754

/*

2754

* flush, submit all pending read and write bios, afterwards

2755

* flush, submit all pending read and write bios, afterwards

2755

* wait for them.

2756

* wait for them.

2756

* Note that in the dev replace case, a read request causes

2757

* Note that in the dev replace case, a read request causes

2757

* write requests that are submitted in the read completion

2758

* write requests that are submitted in the read completion

2758

* worker. Therefore in the current situation, it is required

2759

* worker. Therefore in the current situation, it is required

2759

* that all write requests are flushed, so that all read and

2760

* that all write requests are flushed, so that all read and

2760

* write requests are really completed when bios_in_flight

2761

* write requests are really completed when bios_in_flight

2761

* changes to 0.

2762

* changes to 0.

2762

*/

2763

*/

2763

atomic_set(&sctx->wr_ctx.flush_all_writes, 1);

2764

atomic_set(&sctx->wr_ctx.flush_all_writes, 1);

2764

scrub_submit(sctx);

2765

scrub_submit(sctx);

2765

mutex_lock(&sctx->wr_ctx.wr_lock);

2766

mutex_lock(&sctx->wr_ctx.wr_lock);

2766

scrub_wr_submit(sctx);

2767

scrub_wr_submit(sctx);

2767

mutex_unlock(&sctx->wr_ctx.wr_lock);

2768

mutex_unlock(&sctx->wr_ctx.wr_lock);

2768

2769

wait_event(sctx->list_wait,

2770

wait_event(sctx->list_wait,

2770

atomic_read(&sctx->bios_in_flight) == 0);

2771

atomic_read(&sctx->bios_in_flight) == 0);

2771

atomic_inc(&fs_info->scrubs_paused);

2772

atomic_inc(&fs_info->scrubs_paused);

2772

wake_up(&fs_info->scrub_pause_wait);

2773

wake_up(&fs_info->scrub_pause_wait);

2773

2774

/*

2775

/*

2775

* must be called before we decrease @scrub_paused.

2776

* must be called before we decrease @scrub_paused.

2776

* make sure we don't block transaction commit while

2777

* make sure we don't block transaction commit while

2777

* we are waiting pending workers finished.

2778

* we are waiting pending workers finished.

2778

*/

2779

*/

2779

wait_event(sctx->list_wait,

2780

wait_event(sctx->list_wait,

2780

atomic_read(&sctx->workers_pending) == 0);

2781

atomic_read(&sctx->workers_pending) == 0);

2781

atomic_set(&sctx->wr_ctx.flush_all_writes, 0);

2782

atomic_set(&sctx->wr_ctx.flush_all_writes, 0);

2782

2783

mutex_lock(&fs_info->scrub_lock);

2784

mutex_lock(&fs_info->scrub_lock);

2784

__scrub_blocked_if_needed(fs_info);

2785

__scrub_blocked_if_needed(fs_info);

2785

atomic_dec(&fs_info->scrubs_paused);

2786

atomic_dec(&fs_info->scrubs_paused);

2786

mutex_unlock(&fs_info->scrub_lock);

2787

mutex_unlock(&fs_info->scrub_lock);

2787

wake_up(&fs_info->scrub_pause_wait);

2788

wake_up(&fs_info->scrub_pause_wait);

2788

2789

btrfs_put_block_group(cache);

2790

btrfs_put_block_group(cache);

2790

if (ret)

2791

if (ret)

2791

break;

2792

break;

2792

if (is_dev_replace &&

2793

if (is_dev_replace &&

2793

atomic64_read(&dev_replace->num_write_errors) > 0) {

2794

atomic64_read(&dev_replace->num_write_errors) > 0) {

2794

ret = -EIO;

2795

ret = -EIO;

2795

break;

2796

break;

2796

}

2797

}

2797

if (sctx->stat.malloc_errors > 0) {

2798

if (sctx->stat.malloc_errors > 0) {

2798

ret = -ENOMEM;

2799

ret = -ENOMEM;

2799

break;

2800

break;

2800

}

2801

}

2801

2802

dev_replace->cursor_left = dev_replace->cursor_right;

2803

dev_replace->cursor_left = dev_replace->cursor_right;

2803

dev_replace->item_needs_writeback = 1;

2804

dev_replace->item_needs_writeback = 1;

2804

2805

key.offset = found_key.offset + length;

2806

key.offset = found_key.offset + length;

2806

btrfs_release_path(path);

2807

btrfs_release_path(path);

2807

}

2808

}

2808

2809

btrfs_free_path(path);

2810

btrfs_free_path(path);

2810

2811

/*

2812

/*

2812

* ret can still be 1 from search_slot or next_leaf,

2813

* ret can still be 1 from search_slot or next_leaf,

2813

* that's not an error

2814

* that's not an error

2814

*/

2815

*/

2815

return ret < 0 ? ret : 0;

2816

return ret < 0 ? ret : 0;

2816

}

2817

}

2817

2818

static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,

2819

static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,

2819

struct btrfs_device *scrub_dev)

2820

struct btrfs_device *scrub_dev)

2820

{

2821

{

2821

int i;

2822

int i;

2822

u64 bytenr;

2823

u64 bytenr;

2823

u64 gen;

2824

u64 gen;

2824

int ret;

2825

int ret;

2825

struct btrfs_root *root = sctx->dev_root;

2826

struct btrfs_root *root = sctx->dev_root;

2826

2827

if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))

2828

if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state))

2828

return -EIO;

2829

return -EIO;

2829

2830

gen = root->fs_info->last_trans_committed;

2831

gen = root->fs_info->last_trans_committed;

2831

2832

for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {

2833

for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {

2833

bytenr = btrfs_sb_offset(i);

2834

bytenr = btrfs_sb_offset(i);

2834

if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)

2835

if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)

2835

break;

2836

break;

2836

2837

ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,

2838

ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,

2838

scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,

2839

scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,

2839

NULL, 1, bytenr);

2840

NULL, 1, bytenr);

2840

if (ret)

2841

if (ret)

2841

return ret;

2842

return ret;

2842

}

2843

}

2843

wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

2844

wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);

2844

2845

return 0;

2846

return 0;

2846

}

2847

}

2847

2848

/*

2849

/*

2849

* get a reference count on fs_info->scrub_workers. start worker if necessary

2850

* get a reference count on fs_info->scrub_workers. start worker if necessary

2850

*/

2851

*/

2851

static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,

2852

static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,

2852

int is_dev_replace)

2853

int is_dev_replace)

2853

{

2854

{

2854

int ret = 0;

2855

int ret = 0;

2855

int flags = WQ_FREEZABLE | WQ_UNBOUND;

2856

int flags = WQ_FREEZABLE | WQ_UNBOUND;

2856

int max_active = fs_info->thread_pool_size;

2857

int max_active = fs_info->thread_pool_size;

2857

2858

if (fs_info->scrub_workers_refcnt == 0) {

2859

if (fs_info->scrub_workers_refcnt == 0) {

2859

if (is_dev_replace)

2860

if (is_dev_replace)

2860

fs_info->scrub_workers =

2861

fs_info->scrub_workers =

2861

btrfs_alloc_workqueue("btrfs-scrub", flags,

2862

btrfs_alloc_workqueue("btrfs-scrub", flags,

2862

1, 4);

2863

1, 4);

2863

else

2864

else

2864

fs_info->scrub_workers =

2865

fs_info->scrub_workers =

2865

btrfs_alloc_workqueue("btrfs-scrub", flags,

2866

btrfs_alloc_workqueue("btrfs-scrub", flags,

2866

max_active, 4);

2867

max_active, 4);

2867

if (!fs_info->scrub_workers) {

2868

if (!fs_info->scrub_workers) {

2868

ret = -ENOMEM;

2869

ret = -ENOMEM;

2869

goto out;

2870

goto out;

2870

}

2871

}

2871

fs_info->scrub_wr_completion_workers =

2872

fs_info->scrub_wr_completion_workers =

2872

btrfs_alloc_workqueue("btrfs-scrubwrc", flags,

2873

btrfs_alloc_workqueue("btrfs-scrubwrc", flags,

2873

max_active, 2);

2874

max_active, 2);

2874

if (!fs_info->scrub_wr_completion_workers) {

2875

if (!fs_info->scrub_wr_completion_workers) {

2875

ret = -ENOMEM;

2876

ret = -ENOMEM;

2876

goto out;

2877

goto out;

2877

}

2878

}

2878

fs_info->scrub_nocow_workers =

2879

fs_info->scrub_nocow_workers =

2879

btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);

2880

btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);

2880

if (!fs_info->scrub_nocow_workers) {

2881

if (!fs_info->scrub_nocow_workers) {

2881

ret = -ENOMEM;

2882

ret = -ENOMEM;

2882

goto out;

2883

goto out;

2883

}

2884

}

2884

}

2885

}

2885

++fs_info->scrub_workers_refcnt;

2886

++fs_info->scrub_workers_refcnt;

2886

out:

2887

out:

2887

return ret;

2888

return ret;

2888

}

2889

}

2889

2890

static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)

2891

static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)

2891

{

2892

{

2892

if (--fs_info->scrub_workers_refcnt == 0) {

2893

if (--fs_info->scrub_workers_refcnt == 0) {

2893

btrfs_destroy_workqueue(fs_info->scrub_workers);

2894

btrfs_destroy_workqueue(fs_info->scrub_workers);

2894

btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);

2895

btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);

2895

btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);

2896

btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);

2896

}

2897

}

2897

WARN_ON(fs_info->scrub_workers_refcnt < 0);

2898

WARN_ON(fs_info->scrub_workers_refcnt < 0);

2898

}

2899

}

2899

2900

/*
 * Scrub (or, for dev-replace, copy) the device extents of one device.
 *
 * @fs_info:        filesystem the device belongs to
 * @devid:          id of the device to scrub
 * @start, @end:    physical range passed on to scrub_enumerate_chunks()
 * @progress:       if not NULL, receives a copy of the scrub statistics
 * @readonly:       stored in the scrub context's readonly field
 * @is_dev_replace: non-zero when called on behalf of a device replace;
 *                  the superblock scrub is skipped in that case
 *
 * Returns 0 on success or a negative errno:
 *   -EINVAL      filesystem is closing or a size assumption does not hold
 *   -ENODEV      device not found, or missing and not a dev-replace
 *   -EIO         device not in fs metadata or is a dev-replace target
 *   -EINPROGRESS device is already being scrubbed, or a dev-replace is
 *                ongoing and this is a plain scrub
 * plus whatever scrub_workers_get(), scrub_setup_ctx(), scrub_supers() or
 * scrub_enumerate_chunks() report.
 */
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
		    u64 end, struct btrfs_scrub_progress *progress,
		    int readonly, int is_dev_replace)
{
	struct scrub_ctx *sctx;
	int ret;
	struct btrfs_device *dev;

	if (btrfs_fs_closing(fs_info))
		return -EINVAL;

	/*
	 * check some assumptions
	 */
	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
		btrfs_err(fs_info,
			  "scrub: size assumption nodesize == leafsize (%d == %d) fails",
			  fs_info->chunk_root->nodesize,
			  fs_info->chunk_root->leafsize);
		return -EINVAL;
	}

	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
		/*
		 * in this case scrub is unable to calculate the checksum
		 * the way scrub is implemented. Do not handle this
		 * situation at all because it won't ever happen.
		 */
		btrfs_err(fs_info,
			  "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
			  fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
		return -EINVAL;
	}

	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
		/* not supported for data w/o checksums */
		btrfs_err(fs_info,
			  "scrub: size assumption sectorsize != PAGE_SIZE "
			  "(%d != %lu) fails",
			  fs_info->chunk_root->sectorsize, PAGE_SIZE);
		return -EINVAL;
	}

	if (fs_info->chunk_root->nodesize >
	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
	    fs_info->chunk_root->sectorsize >
	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
		/*
		 * would exhaust the array bounds of pagev member in
		 * struct scrub_block
		 */
		btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
			  "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
			  fs_info->chunk_root->nodesize,
			  SCRUB_MAX_PAGES_PER_BLOCK,
			  fs_info->chunk_root->sectorsize,
			  SCRUB_MAX_PAGES_PER_BLOCK);
		return -EINVAL;
	}


	mutex_lock(&fs_info->fs_devices->device_list_mutex);
	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
	if (!dev || (dev->missing && !is_dev_replace)) {
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -ENODEV;
	}

	mutex_lock(&fs_info->scrub_lock);
	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -EIO;
	}

	btrfs_dev_replace_lock(&fs_info->dev_replace);
	if (dev->scrub_device ||
	    (!is_dev_replace &&
	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
		btrfs_dev_replace_unlock(&fs_info->dev_replace);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return -EINPROGRESS;
	}
	btrfs_dev_replace_unlock(&fs_info->dev_replace);

	ret = scrub_workers_get(fs_info, is_dev_replace);
	if (ret) {
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return ret;
	}

	sctx = scrub_setup_ctx(dev, is_dev_replace);
	if (IS_ERR(sctx)) {
		/*
		 * Drop the worker reference while scrub_lock is still held:
		 * the matching scrub_workers_get() above and the final
		 * scrub_workers_put() below both run under scrub_lock, so
		 * the reference count must not be touched without it.
		 */
		scrub_workers_put(fs_info);
		mutex_unlock(&fs_info->scrub_lock);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
		return PTR_ERR(sctx);
	}
	sctx->readonly = readonly;
	dev->scrub_device = sctx;
	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	/*
	 * checking @scrub_pause_req here, we can avoid
	 * race between committing transaction and scrubbing.
	 */
	__scrub_blocked_if_needed(fs_info);
	atomic_inc(&fs_info->scrubs_running);
	mutex_unlock(&fs_info->scrub_lock);

	/* ret is 0 here (scrub_workers_get succeeded) if supers are skipped */
	if (!is_dev_replace) {
		/*
		 * by holding device list mutex, we can
		 * kick off writing super in log tree sync.
		 */
		mutex_lock(&fs_info->fs_devices->device_list_mutex);
		ret = scrub_supers(sctx, dev);
		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
	}

	if (!ret)
		ret = scrub_enumerate_chunks(sctx, dev, start, end,
					     is_dev_replace);

	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
	atomic_dec(&fs_info->scrubs_running);
	wake_up(&fs_info->scrub_pause_wait);

	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);

	if (progress)
		memcpy(progress, &sctx->stat, sizeof(*progress));

	mutex_lock(&fs_info->scrub_lock);
	dev->scrub_device = NULL;
	scrub_workers_put(fs_info);
	mutex_unlock(&fs_info->scrub_lock);

	scrub_free_ctx(sctx);

	return ret;
}

3045

void btrfs_scrub_pause(struct btrfs_root *root)

3046

void btrfs_scrub_pause(struct btrfs_root *root)

3046

{

3047

{

3047

struct btrfs_fs_info *fs_info = root->fs_info;

3048

struct btrfs_fs_info *fs_info = root->fs_info;

3048

3049

mutex_lock(&fs_info->scrub_lock);

3050

mutex_lock(&fs_info->scrub_lock);

3050

atomic_inc(&fs_info->scrub_pause_req);

3051

atomic_inc(&fs_info->scrub_pause_req);

3051

while (atomic_read(&fs_info->scrubs_paused) !=

3052

while (atomic_read(&fs_info->scrubs_paused) !=

3052

atomic_read(&fs_info->scrubs_running)) {

3053

atomic_read(&fs_info->scrubs_running)) {

3053

mutex_unlock(&fs_info->scrub_lock);

3054

mutex_unlock(&fs_info->scrub_lock);

3054

wait_event(fs_info->scrub_pause_wait,

3055

wait_event(fs_info->scrub_pause_wait,

3055

atomic_read(&fs_info->scrubs_paused) ==

3056

atomic_read(&fs_info->scrubs_paused) ==

3056

atomic_read(&fs_info->scrubs_running));

3057

atomic_read(&fs_info->scrubs_running));

3057

mutex_lock(&fs_info->scrub_lock);

3058

mutex_lock(&fs_info->scrub_lock);

3058

}

3059

}

3059

mutex_unlock(&fs_info->scrub_lock);

3060

mutex_unlock(&fs_info->scrub_lock);

3060

}

3061

}

3061

3062

void btrfs_scrub_continue(struct btrfs_root *root)

3063

void btrfs_scrub_continue(struct btrfs_root *root)

3063

{

3064

{

3064

struct btrfs_fs_info *fs_info = root->fs_info;

3065

struct btrfs_fs_info *fs_info = root->fs_info;

3065

3066

atomic_dec(&fs_info->scrub_pause_req);

3067

atomic_dec(&fs_info->scrub_pause_req);

3067

wake_up(&fs_info->scrub_pause_wait);

3068

wake_up(&fs_info->scrub_pause_wait);

3068

}

3069

}

3069

3070

int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)

3071

int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)

3071

{

3072

{

3072

mutex_lock(&fs_info->scrub_lock);

3073

mutex_lock(&fs_info->scrub_lock);

3073

if (!atomic_read(&fs_info->scrubs_running)) {

3074

if (!atomic_read(&fs_info->scrubs_running)) {

3074

mutex_unlock(&fs_info->scrub_lock);

3075

mutex_unlock(&fs_info->scrub_lock);

3075

return -ENOTCONN;

3076

return -ENOTCONN;

3076

}

3077

}

3077

3078

atomic_inc(&fs_info->scrub_cancel_req);

3079

atomic_inc(&fs_info->scrub_cancel_req);

3079

while (atomic_read(&fs_info->scrubs_running)) {

3080

while (atomic_read(&fs_info->scrubs_running)) {

3080

mutex_unlock(&fs_info->scrub_lock);

3081

mutex_unlock(&fs_info->scrub_lock);

3081

wait_event(fs_info->scrub_pause_wait,

3082

wait_event(fs_info->scrub_pause_wait,

3082

atomic_read(&fs_info->scrubs_running) == 0);

3083

atomic_read(&fs_info->scrubs_running) == 0);

3083

mutex_lock(&fs_info->scrub_lock);

3084

mutex_lock(&fs_info->scrub_lock);

3084

}

3085

}

3085

atomic_dec(&fs_info->scrub_cancel_req);

3086

atomic_dec(&fs_info->scrub_cancel_req);

3086

mutex_unlock(&fs_info->scrub_lock);

3087

mutex_unlock(&fs_info->scrub_lock);

3087

3088

return 0;

3089

return 0;

3089

}

3090

}

3090

3091

int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,

3092

int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,

3092

struct btrfs_device *dev)

3093

struct btrfs_device *dev)

3093

{

3094

{

3094

struct scrub_ctx *sctx;

3095

struct scrub_ctx *sctx;

3095

3096

mutex_lock(&fs_info->scrub_lock);

3097

mutex_lock(&fs_info->scrub_lock);

3097

sctx = dev->scrub_device;

3098

sctx = dev->scrub_device;

3098

if (!sctx) {

3099

if (!sctx) {

3099

mutex_unlock(&fs_info->scrub_lock);

3100

mutex_unlock(&fs_info->scrub_lock);

3100

return -ENOTCONN;

3101

return -ENOTCONN;

3101

}

3102

}

3102

atomic_inc(&sctx->cancel_req);

3103

atomic_inc(&sctx->cancel_req);

3103

while (dev->scrub_device) {

3104

while (dev->scrub_device) {

3104

mutex_unlock(&fs_info->scrub_lock);

3105

mutex_unlock(&fs_info->scrub_lock);

3105

wait_event(fs_info->scrub_pause_wait,

3106

wait_event(fs_info->scrub_pause_wait,

3106

dev->scrub_device == NULL);

3107

dev->scrub_device == NULL);

3107

mutex_lock(&fs_info->scrub_lock);

3108

mutex_lock(&fs_info->scrub_lock);

3108

}

3109

}

3109

mutex_unlock(&fs_info->scrub_lock);

3110

mutex_unlock(&fs_info->scrub_lock);

3110

3111

return 0;

3112

return 0;

3112

}

3113

}

3113

3114

/*
 * Copy the statistics of the scrub context attached to device @devid into
 * @progress.  The lookup and the copy are done under device_list_mutex.
 *
 * Returns 0 on success, -ENODEV if no device with @devid is found, and
 * -ENOTCONN if the device has no scrub context attached.
 */
int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
			 struct btrfs_scrub_progress *progress)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct btrfs_device *dev;
	struct scrub_ctx *sctx = NULL;
	int ret;

	mutex_lock(&fs_info->fs_devices->device_list_mutex);

	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
	if (dev)
		sctx = dev->scrub_device;

	if (!dev) {
		ret = -ENODEV;
	} else if (!sctx) {
		ret = -ENOTCONN;
	} else {
		memcpy(progress, &sctx->stat, sizeof(*progress));
		ret = 0;
	}

	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

	return ret;
}

3130

3131

static void scrub_remap_extent(struct btrfs_fs_info *fs_info,

3132

static void scrub_remap_extent(struct btrfs_fs_info *fs_info,

3132

u64 extent_logical, u64 extent_len,

3133

u64 extent_logical, u64 extent_len,

3133

u64 *extent_physical,

3134

u64 *extent_physical,

3134

struct btrfs_device **extent_dev,

3135

struct btrfs_device **extent_dev,

3135

int *extent_mirror_num)

3136

int *extent_mirror_num)

3136

{

3137

{

3137

u64 mapped_length;

3138

u64 mapped_length;

3138

struct btrfs_bio *bbio = NULL;

3139

struct btrfs_bio *bbio = NULL;

3139

int ret;

3140

int ret;

3140

3141

mapped_length = extent_len;

3142

mapped_length = extent_len;

3142

ret = btrfs_map_block(fs_info, READ, extent_logical,

3143

ret = btrfs_map_block(fs_info, READ, extent_logical,

3143

&mapped_length, &bbio, 0);

3144

&mapped_length, &bbio, 0);

3144

if (ret || !bbio || mapped_length < extent_len ||

3145

if (ret || !bbio || mapped_length < extent_len ||

3145

!bbio->stripes[0].dev->bdev) {

3146

!bbio->stripes[0].dev->bdev) {

3146

kfree(bbio);

3147

kfree(bbio);

3147

return;

3148

return;

3148

}

3149

}

3149

3150

*extent_physical = bbio->stripes[0].physical;

3151

*extent_physical = bbio->stripes[0].physical;

3151

*extent_mirror_num = bbio->mirror_num;

3152

*extent_mirror_num = bbio->mirror_num;

3152

*extent_dev = bbio->stripes[0].dev;

3153

*extent_dev = bbio->stripes[0].dev;

3153

kfree(bbio);

3154

kfree(bbio);

3154

}

3155

}

3155

3156

static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,

3157

static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,

3157

struct scrub_wr_ctx *wr_ctx,

3158

struct scrub_wr_ctx *wr_ctx,

3158

struct btrfs_fs_info *fs_info,

3159

struct btrfs_fs_info *fs_info,

3159

struct btrfs_device *dev,

3160

struct btrfs_device *dev,

3160

int is_dev_replace)

3161

int is_dev_replace)

3161

{

3162

{

3162

WARN_ON(wr_ctx->wr_curr_bio != NULL);

3163

WARN_ON(wr_ctx->wr_curr_bio != NULL);

3163

3164

mutex_init(&wr_ctx->wr_lock);

3165

mutex_init(&wr_ctx->wr_lock);

3165

wr_ctx->wr_curr_bio = NULL;

3166

wr_ctx->wr_curr_bio = NULL;

3166

if (!is_dev_replace)

3167

if (!is_dev_replace)

3167

return 0;

3168

return 0;

3168

3169

WARN_ON(!dev->bdev);

3170

WARN_ON(!dev->bdev);

3170

wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,

3171

wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,

3171

bio_get_nr_vecs(dev->bdev));

3172

bio_get_nr_vecs(dev->bdev));

3172

wr_ctx->tgtdev = dev;

3173

wr_ctx->tgtdev = dev;

3173

atomic_set(&wr_ctx->flush_all_writes, 0);

3174

atomic_set(&wr_ctx->flush_all_writes, 0);

3174

return 0;

3175

return 0;

3175

}

3176

}

3176

3177

static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)

3178

static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)

3178

{

3179

{

3179

mutex_lock(&wr_ctx->wr_lock);

3180

mutex_lock(&wr_ctx->wr_lock);

3180

kfree(wr_ctx->wr_curr_bio);

3181

kfree(wr_ctx->wr_curr_bio);

3181

wr_ctx->wr_curr_bio = NULL;

3182

wr_ctx->wr_curr_bio = NULL;

3182

mutex_unlock(&wr_ctx->wr_lock);

3183

mutex_unlock(&wr_ctx->wr_lock);

3183

}

3184

}

3184

3185

/*
 * Allocate a nocow copy context describing the given range and queue it on
 * fs_info->scrub_nocow_workers, where copy_nocow_pages_worker() handles it.
 *
 * Returns 0 once the work is queued, or -ENOMEM (after bumping
 * stat.malloc_errors) if the context cannot be allocated.
 */
static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
			    int mirror_num, u64 physical_for_dev_replace)
{
	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
	struct scrub_copy_nocow_ctx *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_NOFS);
	if (ctx == NULL) {
		spin_lock(&sctx->stat_lock);
		sctx->stat.malloc_errors++;
		spin_unlock(&sctx->stat_lock);
		return -ENOMEM;
	}

	/* Matched by scrub_pending_trans_workers_dec() in the worker. */
	scrub_pending_trans_workers_inc(sctx);

	ctx->sctx = sctx;
	ctx->logical = logical;
	ctx->len = len;
	ctx->mirror_num = mirror_num;
	ctx->physical_for_dev_replace = physical_for_dev_replace;
	btrfs_init_work(&ctx->work, copy_nocow_pages_worker, NULL, NULL);
	INIT_LIST_HEAD(&ctx->inodes);

	btrfs_queue_work(fs_info->scrub_nocow_workers, &ctx->work);

	return 0;
}

3213

3214

/*
 * Callback passed to iterate_inodes_from_logical(): remember one
 * (inum, offset, root) triple by appending it to the inode list of the
 * nocow context passed in @ctx.
 *
 * Returns 0 on success or -ENOMEM if the list entry cannot be allocated.
 */
static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
{
	struct scrub_copy_nocow_ctx *nocow_ctx = ctx;
	struct scrub_nocow_inode *item = kzalloc(sizeof(*item), GFP_NOFS);

	if (item == NULL)
		return -ENOMEM;

	item->inum = inum;
	item->offset = offset;
	item->root = root;
	list_add_tail(&item->list, &nocow_ctx->inodes);

	return 0;
}

3228

3229

#define COPY_COMPLETE 1

3230

#define COPY_COMPLETE 1

3230

3231

static void copy_nocow_pages_worker(struct btrfs_work *work)

3232

static void copy_nocow_pages_worker(struct btrfs_work *work)

3232

{

3233

{

3233

struct scrub_copy_nocow_ctx *nocow_ctx =

3234

struct scrub_copy_nocow_ctx *nocow_ctx =

3234

container_of(work, struct scrub_copy_nocow_ctx, work);

3235

container_of(work, struct scrub_copy_nocow_ctx, work);

3235

struct scrub_ctx *sctx = nocow_ctx->sctx;

3236

struct scrub_ctx *sctx = nocow_ctx->sctx;

3236

u64 logical = nocow_ctx->logical;

3237

u64 logical = nocow_ctx->logical;

3237

u64 len = nocow_ctx->len;

3238

u64 len = nocow_ctx->len;

3238

int mirror_num = nocow_ctx->mirror_num;

3239

int mirror_num = nocow_ctx->mirror_num;

3239

u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;

3240

u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;

3240

int ret;

3241

int ret;

3241

struct btrfs_trans_handle *trans = NULL;

3242

struct btrfs_trans_handle *trans = NULL;

3242

struct btrfs_fs_info *fs_info;

3243

struct btrfs_fs_info *fs_info;

3243

struct btrfs_path *path;

3244

struct btrfs_path *path;

3244

struct btrfs_root *root;

3245

struct btrfs_root *root;

3245

int not_written = 0;

3246

int not_written = 0;

3246

3247

fs_info = sctx->dev_root->fs_info;

3248

fs_info = sctx->dev_root->fs_info;

3248

root = fs_info->extent_root;

3249

root = fs_info->extent_root;

3249

3250

path = btrfs_alloc_path();

3251

path = btrfs_alloc_path();

3251

if (!path) {

3252

if (!path) {

3252

spin_lock(&sctx->stat_lock);

3253

spin_lock(&sctx->stat_lock);

3253

sctx->stat.malloc_errors++;

3254

sctx->stat.malloc_errors++;

3254

spin_unlock(&sctx->stat_lock);

3255

spin_unlock(&sctx->stat_lock);

3255

not_written = 1;

3256

not_written = 1;

3256

goto out;

3257

goto out;

3257

}

3258

}

3258

3259

trans = btrfs_join_transaction(root);

3260

trans = btrfs_join_transaction(root);

3260

if (IS_ERR(trans)) {

3261

if (IS_ERR(trans)) {

3261

not_written = 1;

3262

not_written = 1;

3262

goto out;

3263

goto out;

3263

}

3264

}

3264

3265

ret = iterate_inodes_from_logical(logical, fs_info, path,

3266

ret = iterate_inodes_from_logical(logical, fs_info, path,

3266

record_inode_for_nocow, nocow_ctx);

3267

record_inode_for_nocow, nocow_ctx);

3267

if (ret != 0 && ret != -ENOENT) {

3268

if (ret != 0 && ret != -ENOENT) {

3268

btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "

3269

btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "

3269

"phys %llu, len %llu, mir %u, ret %d",

3270

"phys %llu, len %llu, mir %u, ret %d",

3270

logical, physical_for_dev_replace, len, mirror_num,

3271

logical, physical_for_dev_replace, len, mirror_num,

3271

ret);

3272

ret);

3272

not_written = 1;

3273

not_written = 1;

3273

goto out;

3274

goto out;

3274

}

3275

}

3275

3276

btrfs_end_transaction(trans, root);

3277

btrfs_end_transaction(trans, root);

3277

trans = NULL;

3278

trans = NULL;

3278

while (!list_empty(&nocow_ctx->inodes)) {

3279

while (!list_empty(&nocow_ctx->inodes)) {

3279

struct scrub_nocow_inode *entry;

3280

struct scrub_nocow_inode *entry;

3280

entry = list_first_entry(&nocow_ctx->inodes,

3281

entry = list_first_entry(&nocow_ctx->inodes,

3281

struct scrub_nocow_inode,

3282

struct scrub_nocow_inode,

3282

list);

3283

list);

3283

list_del_init(&entry->list);

3284

list_del_init(&entry->list);

3284

ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,

3285

ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,

3285

entry->root, nocow_ctx);

3286

entry->root, nocow_ctx);

3286

kfree(entry);

3287

kfree(entry);

3287

if (ret == COPY_COMPLETE) {

3288

if (ret == COPY_COMPLETE) {

3288

ret = 0;

3289

ret = 0;

3289

break;

3290

break;

3290

} else if (ret) {

3291

} else if (ret) {

3291

break;

3292

break;

3292

}

3293

}

3293

}

3294

}

3294

out:

3295

out:

3295

while (!list_empty(&nocow_ctx->inodes)) {

3296

while (!list_empty(&nocow_ctx->inodes)) {

3296

struct scrub_nocow_inode *entry;

3297

struct scrub_nocow_inode *entry;

3297

entry = list_first_entry(&nocow_ctx->inodes,

3298

entry = list_first_entry(&nocow_ctx->inodes,

3298

struct scrub_nocow_inode,

3299

struct scrub_nocow_inode,

3299

list);

3300

list);

3300

list_del_init(&entry->list);

3301

list_del_init(&entry->list);

3301

kfree(entry);

3302

kfree(entry);

3302

}

3303

}

3303

if (trans && !IS_ERR(trans))

3304

if (trans && !IS_ERR(trans))

3304

btrfs_end_transaction(trans, root);

3305

btrfs_end_transaction(trans, root);

3305

if (not_written)

3306

if (not_written)

3306

btrfs_dev_replace_stats_inc(&fs_info->dev_replace.

3307

btrfs_dev_replace_stats_inc(&fs_info->dev_replace.

3307

num_uncorrectable_read_errors);

3308

num_uncorrectable_read_errors);

3308

3309

btrfs_free_path(path);

3310

btrfs_free_path(path);

3310

kfree(nocow_ctx);

3311

kfree(nocow_ctx);

3311

3312

scrub_pending_trans_workers_dec(sctx);

3313

scrub_pending_trans_workers_dec(sctx);

3313

}

3314

}

3314

3315

/*
 * Copy the file range [offset, offset + nocow_ctx->len) of inode @inum in
 * subvolume @root to the dev-replace target, one page at a time, via
 * write_page_nocow().
 *
 * The inode's i_mutex and the extent range are held for the duration of the
 * copy.  Dirty, up-to-date pages are skipped; pages that are not up to date
 * are read first.
 *
 * Returns:
 *   COPY_COMPLETE  once the page loop has run (see the note next to that
 *                  assignment regarding errors raised inside the loop),
 *   0              when this inode is skipped because an ordered extent is
 *                  pending on the range or the extent map no longer covers
 *                  the logical range,
 *   negative errno when the root, inode or extent map lookup fails, or a
 *                  page cannot be allocated (-ENOMEM).
 */
static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
				      struct scrub_copy_nocow_ctx *nocow_ctx)
{
	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
	struct btrfs_key key;
	struct inode *inode;
	struct page *page;
	struct btrfs_root *local_root;
	struct btrfs_ordered_extent *ordered;
	struct extent_map *em;
	struct extent_state *cached_state = NULL;
	struct extent_io_tree *io_tree;
	u64 physical_for_dev_replace;
	u64 len = nocow_ctx->len;
	u64 lockstart = offset, lockend = offset + len - 1;
	unsigned long index;
	int srcu_index;
	int ret = 0;
	int err = 0;

	key.objectid = root;
	key.type = BTRFS_ROOT_ITEM_KEY;
	key.offset = (u64)-1;

	/* The subvolume root and inode lookups run under subvol_srcu. */
	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);

	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
	if (IS_ERR(local_root)) {
		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
		return PTR_ERR(local_root);
	}

	key.type = BTRFS_INODE_ITEM_KEY;
	key.objectid = inum;
	key.offset = 0;
	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	/* Avoid truncate/dio/punch hole.. */
	mutex_lock(&inode->i_mutex);
	inode_dio_wait(inode);

	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
	io_tree = &BTRFS_I(inode)->io_tree;

	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
	if (ordered) {
		/* An ordered extent is pending on the range: skip (ret == 0). */
		btrfs_put_ordered_extent(ordered);
		goto out_unlock;
	}

	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
	if (IS_ERR(em)) {
		ret = PTR_ERR(em);
		goto out_unlock;
	}

	/*
	 * This extent does not actually cover the logical extent anymore,
	 * move on to the next inode.
	 */
	if (em->block_start > nocow_ctx->logical ||
	    em->block_start + em->block_len < nocow_ctx->logical + len) {
		free_extent_map(em);
		goto out_unlock;
	}
	free_extent_map(em);

	while (len >= PAGE_CACHE_SIZE) {
		index = offset >> PAGE_CACHE_SHIFT;
again:
		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
		if (!page) {
			btrfs_err(fs_info, "find_or_create_page() failed");
			ret = -ENOMEM;
			/*
			 * The extent range is still locked here, so leave
			 * through out_unlock; jumping past it would leak the
			 * lock taken by lock_extent_bits() above.
			 */
			goto out_unlock;
		}

		if (PageUptodate(page)) {
			/* Dirty pages are not copied here; skip them. */
			if (PageDirty(page))
				goto next_page;
		} else {
			ClearPageError(page);
			err = extent_read_full_page_nolock(io_tree, page,
							   btrfs_get_extent,
							   nocow_ctx->mirror_num);
			if (err) {
				ret = err;
				goto next_page;
			}

			lock_page(page);
			/*
			 * If the page has been removed from the page cache,
			 * the data on it is meaningless, because it may be
			 * stale; the new data may have been written into a
			 * new page in the page cache.  Look the page up again.
			 */
			if (page->mapping != inode->i_mapping) {
				unlock_page(page);
				page_cache_release(page);
				goto again;
			}
			if (!PageUptodate(page)) {
				ret = -EIO;
				goto next_page;
			}
		}
		err = write_page_nocow(nocow_ctx->sctx,
				       physical_for_dev_replace, page);
		if (err)
			ret = err;
next_page:
		unlock_page(page);
		page_cache_release(page);

		if (ret)
			break;

		offset += PAGE_CACHE_SIZE;
		physical_for_dev_replace += PAGE_CACHE_SIZE;
		len -= PAGE_CACHE_SIZE;
	}
	/*
	 * NOTE(review): this overwrites any error that ended the loop early,
	 * so read/write failures are reported as COPY_COMPLETE.  Kept as is,
	 * since the visible caller stops on either value -- confirm whether
	 * that is intended.
	 */
	ret = COPY_COMPLETE;
out_unlock:
	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
			     GFP_NOFS);
	mutex_unlock(&inode->i_mutex);
	iput(inode);
	return ret;
}

3450

3451

static int write_page_nocow(struct scrub_ctx *sctx,

3452

static int write_page_nocow(struct scrub_ctx *sctx,

3452

u64 physical_for_dev_replace, struct page *page)

3453

u64 physical_for_dev_replace, struct page *page)

3453

{

3454

{

3454

struct bio *bio;

3455

struct bio *bio;

3455

struct btrfs_device *dev;

3456

struct btrfs_device *dev;

3456

int ret;

3457

int ret;

3457

3458

dev = sctx->wr_ctx.tgtdev;

3459

dev = sctx->wr_ctx.tgtdev;

3459

if (!dev)

3460

if (!dev)

3460

return -EIO;

3461

return -EIO;

3461

if (!dev->bdev) {

3462

if (!dev->bdev) {

3462

printk_ratelimited(KERN_WARNING

3463

printk_ratelimited(KERN_WARNING

3463

"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");

3464

"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");

3464

return -EIO;

3465

return -EIO;

3465

}

3466

}

3466

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

3467

bio = btrfs_io_bio_alloc(GFP_NOFS, 1);

3467

if (!bio) {

3468

if (!bio) {

3468

spin_lock(&sctx->stat_lock);

3469

spin_lock(&sctx->stat_lock);

3469

sctx->stat.malloc_errors++;

3470

sctx->stat.malloc_errors++;

3470

spin_unlock(&sctx->stat_lock);

3471

spin_unlock(&sctx->stat_lock);

3471

return -ENOMEM;

3472

return -ENOMEM;

3472

}

3473

}

3473

bio->bi_iter.bi_size = 0;

3474

bio->bi_iter.bi_size = 0;

3474

bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;

3475

bio->bi_iter.bi_sector = physical_for_dev_replace >> 9;

3475

bio->bi_bdev = dev->bdev;

3476

bio->bi_bdev = dev->bdev;

3476

ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);

3477

ret = bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);

3477

if (ret != PAGE_CACHE_SIZE) {

3478

if (ret != PAGE_CACHE_SIZE) {

3478

leave_with_eio:

3479

leave_with_eio:

3479

bio_put(bio);

3480

bio_put(bio);

3480

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);

3481

btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);

3481

return -EIO;

3482

return -EIO;

3482

}

3483

}

3483

3484

if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))

3485

if (btrfsic_submit_bio_wait(WRITE_SYNC, bio))

3485

goto leave_with_eio;

3486

goto leave_with_eio;

3486

3487

bio_put(bio);

3488

bio_put(bio);

3488

return 0;

3489

return 0;

3489

}

3490

}

3490

3491

GITLAB

Btrfs: fix scrub_print_warning to handle skinny metadata extents

 /*
  * Copyright (C) 2011 STRATO.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License v2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 02111-1307, USA.
  */
 #include <linux/vmalloc.h>
 #include "ctree.h"
 #include "disk-io.h"
 #include "backref.h"
 #include "ulist.h"
 #include "transaction.h"
 #include "delayed-ref.h"
 #include "locking.h"
 /*
  * One (inode, file offset) pair referencing a data extent. Elements are
  * chained through @next into a singly linked, NULL-terminated list.
  */
 struct extent_inode_elem {
 	/* inode number, taken from the objectid of the file extent item's key */
 	u64 inum;
 	/* file offset: key offset plus the offset into the extent, if known */
 	u64 offset;
 	/* next element of the list, NULL at the tail */
 	struct extent_inode_elem *next;
 };
 /*
  * Prepend an (inode, offset) element for the file extent item @fi to *@eie.
  *
  * For extents without compression, encryption or other encoding the item
  * only matches when @extent_item_pos lies inside the range
  * [file_extent_offset, file_extent_offset + num_bytes); the recorded file
  * offset is then adjusted by the distance into that range. For encoded
  * extents no range check is done and the key offset is recorded as is.
  *
  * Returns 0 when an element was added, 1 when @extent_item_pos is outside
  * the item's range (nothing added) and -ENOMEM on allocation failure.
  */
 static int check_extent_in_eb(struct btrfs_key *key, struct extent_buffer *eb,
 				struct btrfs_file_extent_item *fi,
 				u64 extent_item_pos,
 				struct extent_inode_elem **eie)
 {
 	struct extent_inode_elem *elem;
 	u64 delta = 0;
 	int plain;

 	plain = !btrfs_file_extent_compression(eb, fi) &&
 		!btrfs_file_extent_encryption(eb, fi) &&
 		!btrfs_file_extent_other_encoding(eb, fi);
 	if (plain) {
 		u64 range_start = btrfs_file_extent_offset(eb, fi);
 		u64 range_len = btrfs_file_extent_num_bytes(eb, fi);

 		if (extent_item_pos < range_start ||
 		    extent_item_pos >= range_start + range_len)
 			return 1;
 		delta = extent_item_pos - range_start;
 	}

 	elem = kmalloc(sizeof(*elem), GFP_NOFS);
 	if (!elem)
 		return -ENOMEM;

 	/* push the new element onto the head of the caller's list */
 	elem->inum = key->objectid;
 	elem->offset = key->offset + delta;
 	elem->next = *eie;
 	*eie = elem;
 	return 0;
 }
 /*
  * Free every element of the list starting at @eie. A NULL list is fine.
  */
 static void free_inode_elem_list(struct extent_inode_elem *eie)
 {
 	while (eie) {
 		/* fetch the successor before the current element goes away */
 		struct extent_inode_elem *following = eie->next;

 		kfree(eie);
 		eie = following;
 	}
 }
 /*
  * Scan every item of leaf @eb for non-inline file extent items whose disk
  * bytenr equals @wanted_disk_byte and record the matching (inode, offset)
  * pairs in *@eie via check_extent_in_eb().
  *
  * Coming from a shared data ref we only know the leaf, not the key, so all
  * items have to be inspected.
  *
  * Returns 0 on success or the negative error from check_extent_in_eb().
  */
 static int find_extent_in_eb(struct extent_buffer *eb, u64 wanted_disk_byte,
 				u64 extent_item_pos,
 				struct extent_inode_elem **eie)
 {
 	struct btrfs_file_extent_item *fi;
 	struct btrfs_key key;
 	int nritems = btrfs_header_nritems(eb);
 	int slot;
 	int ret;

 	for (slot = 0; slot < nritems; slot++) {
 		btrfs_item_key_to_cpu(eb, &key, slot);
 		if (key.type != BTRFS_EXTENT_DATA_KEY)
 			continue;

 		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 		/* BTRFS_FILE_EXTENT_PREALLOC is handled, only inline is skipped */
 		if (btrfs_file_extent_type(eb, fi) == BTRFS_FILE_EXTENT_INLINE)
 			continue;
 		if (btrfs_file_extent_disk_bytenr(eb, fi) != wanted_disk_byte)
 			continue;

 		ret = check_extent_in_eb(&key, eb, fi, extent_item_pos, eie);
 		if (ret < 0)
 			return ret;
 	}
 	return 0;
 }
 /*
  * this structure records all encountered refs on the way up to the root
  */
 struct __prelim_ref {
 	/* links the ref into the caller's list of preliminary refs */
 	struct list_head list;
 	/* id of the root used for resolving; 0 when not known */
 	u64 root_id;
 	/* key used to find the parent; all zero when no key was supplied */
 	struct btrfs_key key_for_search;
 	/* tree level stored with the ref */
 	int level;
 	/* reference count; may go negative through delayed ref drops */
 	int count;
 	/* (inode, offset) pairs collected for this ref, NULL if none */
 	struct extent_inode_elem *inode_list;
 	/* logical address of the parent block; 0 while still unresolved */
 	u64 parent;
 	/* bytenr of the extent this ref points at */
 	u64 wanted_disk_byte;
 };
 /* slab cache backing every struct __prelim_ref allocation in this file */
 static struct kmem_cache *btrfs_prelim_ref_cache;

 /*
  * Create the slab cache for preliminary refs.
  *
  * Returns 0 on success, -ENOMEM if the cache could not be created.
  */
 int __init btrfs_prelim_ref_init(void)
 {
 	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
 					sizeof(struct __prelim_ref),
 					0,
 					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
 					NULL);
 	return btrfs_prelim_ref_cache ? 0 : -ENOMEM;
 }
 /*
  * Destroy the preliminary ref slab cache, if it was ever created.
  */
 void btrfs_prelim_ref_exit(void)
 {
 	if (!btrfs_prelim_ref_cache)
 		return;
 	kmem_cache_destroy(btrfs_prelim_ref_cache);
 }
 /*
  * Allocate a preliminary ref and append it to @head.
  *
  * the rules for all callers of this function are:
  * - obtaining the parent is the goal
  * - if you add a key, you must know that it is a correct key
  * - if you cannot add the parent or a correct key, then we will look into the
  *   block later to set a correct key
  *
  * delayed refs
  * ============
  *        backref type | shared | indirect | shared | indirect
  * information         |   tree |     tree |   data |     data
  * --------------------+--------+----------+--------+----------
  *      parent logical |    y   |     -    |    -   |     -
  *      key to resolve |    -   |     y    |    y   |     y
  *  tree block logical |    -   |     -    |    -   |     -
  *  root for resolving |    y   |     y    |    y   |     y
  *
  * - column 1:       we've the parent -> done
  * - column 2, 3, 4: we use the key to find the parent
  *
  * on disk refs (inline or keyed)
  * ==============================
  *        backref type | shared | indirect | shared | indirect
  * information         |   tree |     tree |   data |     data
  * --------------------+--------+----------+--------+----------
  *      parent logical |    y   |     -    |    y   |     -
  *      key to resolve |    -   |     -    |    -   |     y
  *  tree block logical |    y   |     y    |    y   |     y
  *  root for resolving |    -   |     y    |    y   |     y
  *
  * - column 1, 3: we've the parent -> done
  * - column 2:    we take the first key from the block to find the parent
  *                (see __add_missing_keys)
  * - column 4:    we use the key to find the parent
  *
  * additional information that's available but not required to find the parent
  * block might help in merging entries to gain some speed.
  *
  * Refs with @root_id == BTRFS_DATA_RELOC_TREE_OBJECTID are silently dropped.
  * A NULL @key stores an all-zero search key.
  *
  * Returns 0 on success (including the dropped case), -ENOMEM otherwise.
  */
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
 			    struct btrfs_key *key, int level,
 			    u64 parent, u64 wanted_disk_byte, int count,
 			    gfp_t gfp_mask)
 {
 	struct __prelim_ref *entry;

 	if (root_id == BTRFS_DATA_RELOC_TREE_OBJECTID)
 		return 0;

 	entry = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
 	if (!entry)
 		return -ENOMEM;

 	if (!key)
 		memset(&entry->key_for_search, 0,
 		       sizeof(entry->key_for_search));
 	else
 		entry->key_for_search = *key;

 	entry->root_id = root_id;
 	entry->level = level;
 	entry->parent = parent;
 	entry->wanted_disk_byte = wanted_disk_byte;
 	entry->count = count;
 	entry->inode_list = NULL;

 	list_add_tail(&entry->list, head);
 	return 0;
 }
 /*
  * Record in @parents every block that holds a reference matching @ref.
  *
  * For @level != 0 the node at path->nodes[level] is the only parent and is
  * added directly. For leaves, the items starting at the path's current slot
  * are walked (crossing into following leaves as needed) while they are
  * EXTENT_DATA items of the searched objectid; each one whose disk bytenr
  * equals ref->wanted_disk_byte adds its leaf to @parents. At most
  * @total_refs such items are looked at.
  *
  * If @extent_item_pos is given, the matching (inode, offset) elements are
  * attached as the aux value of the ulist node, or appended to the list that
  * is already stored there.
  *
  * Returns 0 on success, <0 on error.
  */
 static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
 			   struct ulist *parents, struct __prelim_ref *ref,
 			   int level, u64 time_seq, const u64 *extent_item_pos,
 			   u64 total_refs)
 {
 	int ret = 0;
 	int slot;
 	struct extent_buffer *eb;
 	struct btrfs_key key;
 	struct btrfs_key *key_for_search = &ref->key_for_search;
 	struct btrfs_file_extent_item *fi;
 	struct extent_inode_elem *eie = NULL, *old = NULL;
 	u64 disk_byte;
 	u64 wanted_disk_byte = ref->wanted_disk_byte;
 	u64 count = 0;
 	u64 old_aux;

 	if (level != 0) {
 		eb = path->nodes[level];
 		ret = ulist_add(parents, eb->start, 0, GFP_NOFS);
 		if (ret < 0)
 			return ret;
 		return 0;
 	}

 	/*
 	 * We normally enter this function with the path already pointing to
 	 * the first item to check. But sometimes, we may enter it with
 	 * slot==nritems. In that case, go to the next leaf before we continue.
 	 */
 	if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
 		ret = btrfs_next_old_leaf(root, path, time_seq);

 	while (!ret && count < total_refs) {
 		eb = path->nodes[0];
 		slot = path->slots[0];

 		btrfs_item_key_to_cpu(eb, &key, slot);

 		if (key.objectid != key_for_search->objectid ||
 		    key.type != BTRFS_EXTENT_DATA_KEY)
 			break;

 		fi = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
 		disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);

 		if (disk_byte == wanted_disk_byte) {
 			eie = NULL;
 			old = NULL;
 			old_aux = 0;
 			count++;
 			if (extent_item_pos) {
 				ret = check_extent_in_eb(&key, eb, fi,
 						*extent_item_pos,
 						&eie);
 				if (ret < 0)
 					break;
 			}
 			/* position outside this item: nothing to record */
 			if (ret > 0)
 				goto next;
 			/*
 			 * ulist_add_merge() stores a full u64 through its
 			 * old_aux argument. Hand it a real u64 instead of a
 			 * casted pointer variable, which would be overrun
 			 * where pointers are narrower than 64 bits.
 			 */
 			ret = ulist_add_merge(parents, eb->start,
 					      (uintptr_t)eie,
 					      &old_aux, GFP_NOFS);
 			if (ret < 0)
 				break;
 			if (!ret && extent_item_pos) {
 				/* leaf already recorded: append to its list */
 				old = (struct extent_inode_elem *)
 						(uintptr_t)old_aux;
 				while (old->next)
 					old = old->next;
 				old->next = eie;
 			}
 			/* ownership of eie moved to the ulist */
 			eie = NULL;
 		}
 next:
 		ret = btrfs_next_old_item(root, path, time_seq);
 	}

 	if (ret > 0)
 		ret = 0;
 	else if (ret < 0)
 		free_inode_elem_list(eie);
 	return ret;
 }
 /*
  * resolve an indirect backref in the form (root_id, key, level)
  * to a logical address
  *
  * The root is looked up inside an SRCU read section on
  * fs_info->subvol_srcu, then the tree is searched for ref->key_for_search
  * with path->lowest_level set to ref->level. The parents found are added
  * to @parents by add_all_parents().
  *
  * Returns 0 on success (also when the ref's level is one above the root
  * level, in which case nothing is added), <0 on error, and 1 if the path
  * holds no node at all after the search. The path is always released and
  * path->lowest_level reset to 0 before returning.
  */
 static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 				  struct btrfs_path *path, u64 time_seq,
 				  struct __prelim_ref *ref,
 				  struct ulist *parents,
 				  const u64 *extent_item_pos, u64 total_refs)
 {
 	struct btrfs_root *root;
 	struct btrfs_key root_key;
 	struct extent_buffer *eb;
 	int ret = 0;
 	int root_level;
 	int level = ref->level;
 	int index;
 	root_key.objectid = ref->root_id;
 	root_key.type = BTRFS_ROOT_ITEM_KEY;
 	root_key.offset = (u64)-1;
 	index = srcu_read_lock(&fs_info->subvol_srcu);
 	root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 	if (IS_ERR(root)) {
 		srcu_read_unlock(&fs_info->subvol_srcu, index);
 		ret = PTR_ERR(root);
 		goto out;
 	}
 	/* pick the root level that matches the view we are searching */
 	if (path->search_commit_root)
 		root_level = btrfs_header_level(root->commit_root);
 	else
 		root_level = btrfs_old_root_level(root, time_seq);
 	/* ref sits one level above the root: leave with ret == 0 */
 	if (root_level + 1 == level) {
 		srcu_read_unlock(&fs_info->subvol_srcu, index);
 		goto out;
 	}
 	path->lowest_level = level;
 	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
 	/* root node has been locked, we can release @subvol_srcu safely here */
 	srcu_read_unlock(&fs_info->subvol_srcu, index);
 	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
 		 "%d for key (%llu %u %llu)\n",
 		 ref->root_id, level, ref->count, ret,
 		 ref->key_for_search.objectid, ref->key_for_search.type,
 		 ref->key_for_search.offset);
 	if (ret < 0)
 		goto out;
 	/* descend to the first level at which the path actually has a node */
 	eb = path->nodes[level];
 	while (!eb) {
 		if (WARN_ON(!level)) {
 			ret = 1;
 			goto out;
 		}
 		level--;
 		eb = path->nodes[level];
 	}
 	ret = add_all_parents(root, path, parents, ref, level, time_seq,
 			      extent_item_pos, total_refs);
 out:
 	path->lowest_level = 0;
 	btrfs_release_path(path);
 	return ret;
 }
 /*
  * resolve all indirect backrefs from the list
  *
  * Refs that already have a parent, or whose count is 0, are skipped. For
  * every other ref the parents are collected into a temporary ulist; the
  * first parent is stored in the ref itself and each further parent gets a
  * copy of the ref inserted right behind it.
  *
  * Returns 0 on success, -ENOMEM on allocation failure, or the non-zero
  * result of __resolve_indirect_ref() other than -ENOENT.
  */
 static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 				   struct btrfs_path *path, u64 time_seq,
 				   struct list_head *head,
 				   const u64 *extent_item_pos, u64 total_refs)
 {
 	int err;
 	int ret = 0;
 	struct __prelim_ref *ref;
 	struct __prelim_ref *ref_safe;
 	struct __prelim_ref *new_ref;
 	struct ulist *parents;
 	struct ulist_node *node;
 	struct ulist_iterator uiter;
 	parents = ulist_alloc(GFP_NOFS);
 	if (!parents)
 		return -ENOMEM;
 	/*
 	 * _safe allows us to insert directly after the current item without
 	 * iterating over the newly inserted items.
 	 * we're also allowed to re-assign ref during iteration.
 	 */
 	list_for_each_entry_safe(ref, ref_safe, head, list) {
 		if (ref->parent)	/* already direct */
 			continue;
 		if (ref->count == 0)
 			continue;
 		err = __resolve_indirect_ref(fs_info, path, time_seq, ref,
 					     parents, extent_item_pos,
 					     total_refs);
 		/*
 		 * -ENOENT is the only result we tolerate; any other non-zero
 		 * result is handed back to the caller right away.
 		 */
 		if (err == -ENOENT) {
 			continue;
 		} else if (err) {
 			ret = err;
 			goto out;
 		}
 		/* we put the first parent into the ref at hand */
 		ULIST_ITER_INIT(&uiter);
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
 		ref->inode_list = node ?
 			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
 			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
 						   GFP_NOFS);
 			if (!new_ref) {
 				ret = -ENOMEM;
 				goto out;
 			}
 			/* clone the ref, then override parent and inode list */
 			memcpy(new_ref, ref, sizeof(*ref));
 			new_ref->parent = node->val;
 			new_ref->inode_list = (struct extent_inode_elem *)
 							(uintptr_t)node->aux;
 			list_add(&new_ref->list, &ref->list);
 		}
 		/* empty the scratch ulist for the next ref */
 		ulist_reinit(parents);
 	}
 out:
 	ulist_free(parents);
 	return ret;
 }
 static inline int ref_for_same_block(struct __prelim_ref *ref1,
 				     struct __prelim_ref *ref2)
 {
 	if (ref1->level != ref2->level)
 		return 0;
 	if (ref1->root_id != ref2->root_id)
 		return 0;
 	if (ref1->key_for_search.type != ref2->key_for_search.type)
 		return 0;
 	if (ref1->key_for_search.objectid != ref2->key_for_search.objectid)
 		return 0;
 	if (ref1->key_for_search.offset != ref2->key_for_search.offset)
 		return 0;
 	if (ref1->parent != ref2->parent)
 		return 0;
 	return 1;
 }
 /*
  * read tree blocks and add keys where required.
  */
 static int __add_missing_keys(struct btrfs_fs_info *fs_info,
 			      struct list_head *head)
 {
 	struct list_head *pos;
 	struct extent_buffer *eb;
 	list_for_each(pos, head) {
 		struct __prelim_ref *ref;
 		ref = list_entry(pos, struct __prelim_ref, list);
 		if (ref->parent)
 			continue;
 		if (ref->key_for_search.type)
 			continue;
 		BUG_ON(!ref->wanted_disk_byte);
 		eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte,
 				     fs_info->tree_root->leafsize, 0);
 		if (!eb || !extent_buffer_uptodate(eb)) {
 			free_extent_buffer(eb);
 			return -EIO;
 		}
 		btrfs_tree_read_lock(eb);
 		if (btrfs_header_level(eb) == 0)
 			btrfs_item_key_to_cpu(eb, &ref->key_for_search, 0);
 		else
 			btrfs_node_key_to_cpu(eb, &ref->key_for_search, 0);
 		btrfs_tree_read_unlock(eb);
 		free_extent_buffer(eb);
 	}
 	return 0;
 }
 /*
  * merge two lists of backrefs and adjust counts accordingly
  *
  * mode = 1: merge identical keys, if key is set
  *    FIXME: if we add more keys in __add_prelim_ref, we can merge more here.
  *           additionally, we could even add a key range for the blocks we
  *           looked into to merge even more (-> replace unresolved refs by those
  *           having a parent).
  * mode = 2: merge identical parents
  *
  * For every pair that qualifies, the later entry is folded into the earlier
  * one: its inode list is appended, its count added, and the entry itself is
  * unlinked and freed.
  *
  * Note: there used to be a "swap ref1/ref2 when only ref2 has a parent" step
  * in mode 1. ref_for_same_block() only matches refs with equal parents, so
  * that step could never run; had it run, the entry freed below would have
  * been the one the outer loop still stands on. It has been dropped. Anyone
  * relaxing ref_for_same_block() must re-introduce it without freeing the
  * outer cursor.
  */
 static void __merge_refs(struct list_head *head, int mode)
 {
 	struct list_head *pos1;

 	list_for_each(pos1, head) {
 		struct list_head *n2;
 		struct list_head *pos2;
 		struct __prelim_ref *ref1;

 		ref1 = list_entry(pos1, struct __prelim_ref, list);

 		/* n2 is fetched up front because pos2 may be freed below */
 		for (pos2 = pos1->next, n2 = pos2->next; pos2 != head;
 		     pos2 = n2, n2 = pos2->next) {
 			struct __prelim_ref *ref2;
 			struct extent_inode_elem *eie;

 			ref2 = list_entry(pos2, struct __prelim_ref, list);

 			if (mode == 1) {
 				if (!ref_for_same_block(ref1, ref2))
 					continue;
 			} else {
 				if (ref1->parent != ref2->parent)
 					continue;
 			}

 			/* append ref2's inode list to the tail of ref1's */
 			eie = ref1->inode_list;
 			while (eie && eie->next)
 				eie = eie->next;
 			if (eie)
 				eie->next = ref2->inode_list;
 			else
 				ref1->inode_list = ref2->inode_list;
 			ref1->count += ref2->count;

 			list_del(&ref2->list);
 			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
 		}
 	}
 }
 /*
  * add all currently queued delayed refs from this head whose seq nr is
  * smaller than or equal to @seq to the list
  *
  * Each qualifying node becomes one preliminary ref on @prefs whose count is
  * ref_mod, negated for drops; the same signed value is added to
  * *@total_refs. The whole walk happens with head->lock held, which is why
  * the allocations use GFP_ATOMIC.
  *
  * Returns 0 on success or the error from __add_prelim_ref().
  */
 static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			      struct list_head *prefs, u64 *total_refs)
 {
 	struct btrfs_delayed_extent_op *extent_op = head->extent_op;
 	/* this initial value is never used; n is re-assigned from rb_first() */
 	struct rb_node *n = &head->node.rb_node;
 	struct btrfs_key key;
 	/* stays all zero unless the extent op carries a key update */
 	struct btrfs_key op_key = {0};
 	int sgn;
 	int ret = 0;
 	if (extent_op && extent_op->update_key)
 		btrfs_disk_key_to_cpu(&op_key, &extent_op->key);
 	spin_lock(&head->lock);
 	n = rb_first(&head->ref_root);
 	while (n) {
 		struct btrfs_delayed_ref_node *node;
 		node = rb_entry(n, struct btrfs_delayed_ref_node,
 				rb_node);
 		/* advance first so the continue statements below are safe */
 		n = rb_next(n);
 		if (node->seq > seq)
 			continue;
 		switch (node->action) {
 		case BTRFS_ADD_DELAYED_EXTENT:
 		case BTRFS_UPDATE_DELAYED_HEAD:
 			WARN_ON(1);
 			continue;
 		case BTRFS_ADD_DELAYED_REF:
 			sgn = 1;
 			break;
 		case BTRFS_DROP_DELAYED_REF:
 			sgn = -1;
 			break;
 		default:
 			BUG_ON(1);
 		}
 		*total_refs += (node->ref_mod * sgn);
 		switch (node->type) {
 		case BTRFS_TREE_BLOCK_REF_KEY: {
 			/* indirect tree ref: no parent, resolved via op_key */
 			struct btrfs_delayed_tree_ref *ref;
 			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = __add_prelim_ref(prefs, ref->root, &op_key,
 					       ref->level + 1, 0, node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
 			/* shared tree ref: the parent is already known */
 			struct btrfs_delayed_tree_ref *ref;
 			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = __add_prelim_ref(prefs, ref->root, NULL,
 					       ref->level + 1, ref->parent,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			/* indirect data ref: key built from objectid/offset */
 			struct btrfs_delayed_data_ref *ref;
 			ref = btrfs_delayed_node_to_data_ref(node);
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			/* shared data ref: both key and parent are recorded */
 			struct btrfs_delayed_data_ref *ref;
 			ref = btrfs_delayed_node_to_data_ref(node);
 			key.objectid = ref->objectid;
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = ref->offset;
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
 					       ref->parent, node->bytenr,
 					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		default:
 			WARN_ON(1);
 		}
 		if (ret)
 			break;
 	}
 	spin_unlock(&head->lock);
 	return ret;
 }
 /*
  * add all inline backrefs for bytenr to the list
  *
  * @path must point at the extent item (EXTENT_ITEM or METADATA_ITEM) for
  * @bytenr. The item's reference count is added to *@total_refs and, for
  * tree blocks, the block's level is stored in *@info_level. Every inline
  * ref that follows the item header is turned into a preliminary ref on
  * @prefs.
  *
  * Returns 0 on success or the error from __add_prelim_ref().
  */
 static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			     struct btrfs_path *path, u64 bytenr,
 			     int *info_level, struct list_head *prefs,
 			     u64 *total_refs)
 {
 	int ret = 0;
 	int slot;
 	struct extent_buffer *leaf;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	unsigned long ptr;
 	unsigned long end;
 	struct btrfs_extent_item *ei;
 	u64 flags;
 	u64 item_size;
 	/*
 	 * enumerate all inline refs
 	 */
 	leaf = path->nodes[0];
 	slot = path->slots[0];
 	item_size = btrfs_item_size_nr(leaf, slot);
 	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, slot, struct btrfs_extent_item);
 	flags = btrfs_extent_flags(leaf, ei);
 	*total_refs += btrfs_extent_refs(leaf, ei);
 	btrfs_item_key_to_cpu(leaf, &found_key, slot);
 	/* [ptr, end) is the part of the item behind the extent item header */
 	ptr = (unsigned long)(ei + 1);
 	end = (unsigned long)ei + item_size;
 	if (found_key.type == BTRFS_EXTENT_ITEM_KEY &&
 	    flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		/* a tree block info structure precedes the inline refs */
 		struct btrfs_tree_block_info *info;
 		info = (struct btrfs_tree_block_info *)ptr;
 		*info_level = btrfs_tree_block_level(leaf, info);
 		ptr += sizeof(struct btrfs_tree_block_info);
 		BUG_ON(ptr > end);
 	} else if (found_key.type == BTRFS_METADATA_ITEM_KEY) {
 		/* for METADATA_ITEM keys the level is taken from the key offset */
 		*info_level = found_key.offset;
 	} else {
 		BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
 	}
 	while (ptr < end) {
 		struct btrfs_extent_inline_ref *iref;
 		u64 offset;
 		int type;
 		iref = (struct btrfs_extent_inline_ref *)ptr;
 		type = btrfs_extent_inline_ref_type(leaf, iref);
 		offset = btrfs_extent_inline_ref_offset(leaf, iref);
 		switch (type) {
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			/* offset is used as the parent's logical address */
 			ret = __add_prelim_ref(prefs, 0, NULL,
 						*info_level + 1, offset,
 						bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			/* shared data ref follows the inline ref header */
 			struct btrfs_shared_data_ref *sdref;
 			int count;
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
 					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			/* offset is used as the id of the referencing root */
 			ret = __add_prelim_ref(prefs, offset, NULL,
 					       *info_level + 1, 0,
 					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			/* the data ref is read starting at the offset field */
 			struct btrfs_extent_data_ref *dref;
 			int count;
 			u64 root;
 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
 			count = btrfs_extent_data_ref_count(leaf, dref);
 			key.objectid = btrfs_extent_data_ref_objectid(leaf,
 								      dref);
 			key.type = BTRFS_EXTENT_DATA_KEY;
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
 					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		default:
 			WARN_ON(1);
 		}
 		if (ret)
 			return ret;
 		/* step over this inline ref; its size depends on the type */
 		ptr += btrfs_extent_inline_ref_size(type);
 	}
 	return 0;
 }
 /*
  * add all non-inline backrefs for bytenr to the list
  *
  * Starting behind the item @path currently points at, the extent tree is
  * walked item by item for as long as the key's objectid equals @bytenr.
  * Key types below BTRFS_TREE_BLOCK_REF_KEY are skipped, the walk stops at
  * the first type above BTRFS_SHARED_DATA_REF_KEY, and every backref item in
  * between is turned into a preliminary ref on @prefs.
  *
  * Returns 0 on success, <0 on error.
  */
 static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			    struct btrfs_path *path, u64 bytenr,
 			    int info_level, struct list_head *prefs)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct extent_buffer *leaf;
 	struct btrfs_key found;
 	int slot;
 	int ret;

 	for (;;) {
 		ret = btrfs_next_item(extent_root, path);
 		if (ret < 0)
 			return ret;
 		/* no further items in the tree */
 		if (ret > 0)
 			return 0;

 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &found, slot);

 		if (found.objectid != bytenr)
 			return 0;
 		if (found.type < BTRFS_TREE_BLOCK_REF_KEY)
 			continue;
 		if (found.type > BTRFS_SHARED_DATA_REF_KEY)
 			return 0;

 		switch (found.type) {
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, found.offset, NULL,
 					       info_level + 1, 0,
 					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, 0, NULL,
 					       info_level + 1, found.offset,
 					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
 			struct btrfs_key data_key;
 			u64 root;
 			int count;

 			dref = btrfs_item_ptr(leaf, slot,
 					      struct btrfs_extent_data_ref);
 			count = btrfs_extent_data_ref_count(leaf, dref);
 			data_key.objectid =
 				btrfs_extent_data_ref_objectid(leaf, dref);
 			data_key.type = BTRFS_EXTENT_DATA_KEY;
 			data_key.offset =
 				btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &data_key, 0, 0,
 					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
 			int count;

 			sdref = btrfs_item_ptr(leaf, slot,
 					       struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, found.offset,
 					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		default:
 			WARN_ON(1);
 		}
 		if (ret)
 			return ret;
 	}
 }
 /*
  * this adds all existing backrefs (inline backrefs, backrefs and delayed
  * refs) for the given bytenr to the refs list, merges duplicates and resolves
  * indirect refs to their parent bytenr.
  * When roots are found, they're added to the roots list
  *
  * @trans:           running transaction, or NULL to search the commit root
  *                   without locking
  * @refs:            receives the parent blocks; with @extent_item_pos set,
  *                   each node's aux value carries the list of matching
  *                   (inode, offset) elements
  * @roots:           optional; receives the ids of roots found
  * @extent_item_pos: optional offset into the data extent to look for
  *
  * Returns a negative errno on failure. On success the value is
  * non-negative but not necessarily 0 (it may be the result of the last
  * ulist insertion), so callers should only test for < 0.
  *
  * FIXME some caching might speed things up
  */
 static int find_parent_nodes(struct btrfs_trans_handle *trans,
 			     struct btrfs_fs_info *fs_info, u64 bytenr,
 			     u64 time_seq, struct ulist *refs,
 			     struct ulist *roots, const u64 *extent_item_pos)
 {
 	struct btrfs_key key;
 	struct btrfs_path *path;
 	struct btrfs_delayed_ref_root *delayed_refs = NULL;
 	struct btrfs_delayed_ref_head *head;
 	int info_level = 0;
 	int ret;
 	struct list_head prefs_delayed;
 	struct list_head prefs;
 	struct __prelim_ref *ref;
 	struct extent_inode_elem *eie = NULL;
 	u64 total_refs = 0;
 	u64 old_aux;

 	INIT_LIST_HEAD(&prefs);
 	INIT_LIST_HEAD(&prefs_delayed);

 	key.objectid = bytenr;
 	key.offset = (u64)-1;
 	if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 		key.type = BTRFS_METADATA_ITEM_KEY;
 	else
 		key.type = BTRFS_EXTENT_ITEM_KEY;

 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	if (!trans) {
 		path->search_commit_root = 1;
 		path->skip_locking = 1;
 	}

 	/*
 	 * grab both a lock on the path and a lock on the delayed ref head.
 	 * We need both to get a consistent picture of how the refs look
 	 * at a specified point in time
 	 */
 again:
 	head = NULL;

 	ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto out;
 	/* an offset of (u64)-1 must never be matched exactly */
 	BUG_ON(ret == 0);

 	if (trans) {
 		/*
 		 * look if there are updates for this ref queued and lock the
 		 * head
 		 */
 		delayed_refs = &trans->transaction->delayed_refs;
 		spin_lock(&delayed_refs->lock);
 		head = btrfs_find_delayed_ref_head(trans, bytenr);
 		if (head) {
 			if (!mutex_trylock(&head->mutex)) {
 				atomic_inc(&head->node.refs);
 				spin_unlock(&delayed_refs->lock);

 				btrfs_release_path(path);

 				/*
 				 * Mutex was contended, block until it's
 				 * released and try again
 				 */
 				mutex_lock(&head->mutex);
 				mutex_unlock(&head->mutex);
 				btrfs_put_delayed_ref(&head->node);
 				goto again;
 			}
 			spin_unlock(&delayed_refs->lock);
 			ret = __add_delayed_refs(head, time_seq,
 						 &prefs_delayed, &total_refs);
 			mutex_unlock(&head->mutex);
 			if (ret)
 				goto out;
 		} else {
 			spin_unlock(&delayed_refs->lock);
 		}
 	}

 	/* the search ended right behind the extent item, if there is one */
 	if (path->slots[0]) {
 		struct extent_buffer *leaf;
 		int slot;

 		path->slots[0]--;
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(leaf, &key, slot);
 		if (key.objectid == bytenr &&
 		    (key.type == BTRFS_EXTENT_ITEM_KEY ||
 		     key.type == BTRFS_METADATA_ITEM_KEY)) {
 			ret = __add_inline_refs(fs_info, path, bytenr,
 						&info_level, &prefs,
 						&total_refs);
 			if (ret)
 				goto out;
 			ret = __add_keyed_refs(fs_info, path, bytenr,
 					       info_level, &prefs);
 			if (ret)
 				goto out;
 		}
 	}
 	btrfs_release_path(path);

 	list_splice_init(&prefs_delayed, &prefs);

 	ret = __add_missing_keys(fs_info, &prefs);
 	if (ret)
 		goto out;

 	__merge_refs(&prefs, 1);

 	ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs,
 				      extent_item_pos, total_refs);
 	if (ret)
 		goto out;

 	__merge_refs(&prefs, 2);

 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		WARN_ON(ref->count < 0);
 		if (roots && ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
 			ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS);
 			if (ret < 0)
 				goto out;
 		}
 		if (ref->count && ref->parent) {
 			if (extent_item_pos && !ref->inode_list &&
 			    ref->level == 0) {
 				u32 bsz;
 				struct extent_buffer *eb;

 				bsz = btrfs_level_size(fs_info->extent_root,
 							ref->level);
 				eb = read_tree_block(fs_info->extent_root,
 							   ref->parent, bsz, 0);
 				if (!eb || !extent_buffer_uptodate(eb)) {
 					free_extent_buffer(eb);
 					ret = -EIO;
 					goto out;
 				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie);
 				free_extent_buffer(eb);
 				if (ret < 0)
 					goto out;
 				ref->inode_list = eie;
 			}
 			/*
 			 * ulist_add_merge() stores a full u64 through its
 			 * old_aux argument. Hand it a real u64 instead of a
 			 * casted pointer variable, which would be overrun
 			 * where pointers are narrower than 64 bits.
 			 */
 			old_aux = 0;
 			ret = ulist_add_merge(refs, ref->parent,
 					      (uintptr_t)ref->inode_list,
 					      &old_aux, GFP_NOFS);
 			if (ret < 0)
 				goto out;
 			if (!ret && extent_item_pos) {
 				/*
 				 * we've recorded that parent, so we must extend
 				 * its inode list here
 				 */
 				eie = (struct extent_inode_elem *)
 						(uintptr_t)old_aux;
 				BUG_ON(!eie);
 				while (eie->next)
 					eie = eie->next;
 				eie->next = ref->inode_list;
 			}
 			/* the list is owned by @refs now, don't free it below */
 			eie = NULL;
 		}
 		list_del(&ref->list);
 		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}

 out:
 	btrfs_free_path(path);
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
 		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 	while (!list_empty(&prefs_delayed)) {
 		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
 				       list);
 		list_del(&ref->list);
 		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 	if (ret < 0)
 		free_inode_elem_list(eie);
 	return ret;
 }
 static void free_leaf_list(struct ulist *blocks)
 {
 	struct ulist_node *node = NULL;
 	struct extent_inode_elem *eie;
 	struct ulist_iterator uiter;
 	ULIST_ITER_INIT(&uiter);
 	while ((node = ulist_next(blocks, &uiter))) {
 		if (!node->aux)
 			continue;
 		eie = (struct extent_inode_elem *)(uintptr_t)node->aux;
 		free_inode_elem_list(eie);
 		node->aux = 0;
 	}
 	ulist_free(blocks);
 }
 /*
  * Finds all leafs with a reference to the specified combination of bytenr and
  * offset. The leafs are stored in a newly allocated ulist returned through
  * @leafs; when @extent_item_pos is given, each node's aux value points to the
  * list of matching (inode, offset) elements. Because of those lists the
  * result must be released with free_leaf_list(), not with a bare
  * ulist_free().
  *
  * returns 0 on success, <0 on error. -ENOENT from the backref walk is not
  * treated as an error. On error the ulist has already been freed and *@leafs
  * is set to NULL, so the caller never sees a dangling pointer.
  */
 static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
 				struct btrfs_fs_info *fs_info, u64 bytenr,
 				u64 time_seq, struct ulist **leafs,
 				const u64 *extent_item_pos)
 {
 	int ret;

 	*leafs = ulist_alloc(GFP_NOFS);
 	if (!*leafs)
 		return -ENOMEM;

 	ret = find_parent_nodes(trans, fs_info, bytenr,
 				time_seq, *leafs, NULL, extent_item_pos);
 	if (ret < 0 && ret != -ENOENT) {
 		free_leaf_list(*leafs);
 		*leafs = NULL;
 		return ret;
 	}

 	return 0;
 }
 /*
  * walk all backrefs for a given extent to find all roots that reference this
  * extent. Walking a backref means finding all extents that reference this
  * extent and in turn walk the backrefs of those, too. Naturally this is a
  * recursive process, but here it is implemented in an iterative fashion: We
  * find all referencing extents for the extent in question and put them on a
  * list. In turn, we find all referencing extents for those, further appending
  * to the list. The way we iterate the list allows adding more elements after
  * the current while iterating. The process stops when we reach the end of the
  * list. Found roots are added to the roots list.
  *
  * returns 0 on success, < 0 on error. -ENOENT from the backref walk is not
  * treated as an error. On success *@roots is a newly allocated ulist that
  * the caller must free. On error the ulist has already been freed and
  * *@roots is set to NULL, so that a caller freeing it again in its own
  * error path does not double free.
  */
 static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 				  struct btrfs_fs_info *fs_info, u64 bytenr,
 				  u64 time_seq, struct ulist **roots)
 {
 	struct ulist *tmp;
 	struct ulist_node *node = NULL;
 	struct ulist_iterator uiter;
 	int ret;

 	tmp = ulist_alloc(GFP_NOFS);
 	if (!tmp)
 		return -ENOMEM;
 	*roots = ulist_alloc(GFP_NOFS);
 	if (!*roots) {
 		ulist_free(tmp);
 		return -ENOMEM;
 	}

 	ULIST_ITER_INIT(&uiter);
 	while (1) {
 		/* parents of bytenr are appended to tmp, roots to *roots */
 		ret = find_parent_nodes(trans, fs_info, bytenr,
 					time_seq, tmp, *roots, NULL);
 		if (ret < 0 && ret != -ENOENT) {
 			ulist_free(tmp);
 			ulist_free(*roots);
 			*roots = NULL;
 			return ret;
 		}
 		/* continue with the next block not walked yet */
 		node = ulist_next(tmp, &uiter);
 		if (!node)
 			break;
 		bytenr = node->val;
 		cond_resched();
 	}

 	ulist_free(tmp);
 	return 0;
 }
 /*
  * Locked front end for __btrfs_find_all_roots(): when no transaction handle
  * is passed, commit_root_sem is held for reading around the walk.
  *
  * returns whatever __btrfs_find_all_roots() returns.
  */
 int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
 			 struct btrfs_fs_info *fs_info, u64 bytenr,
 			 u64 time_seq, struct ulist **roots)
 {
 	const int need_sem = !trans;
 	int err;
 	if (need_sem)
 		down_read(&fs_info->commit_root_sem);
 	err = __btrfs_find_all_roots(trans, fs_info, bytenr, time_seq, roots);
 	if (need_sem)
 		up_read(&fs_info->commit_root_sem);
 	return err;
 }
 /*
  * Position path at the item (inum INODE_ITEM ioff) in fs_root.
  * The key reported by btrfs_find_item() is not needed here and is dropped;
  * its return value is passed through unchanged.
  */
 int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 			struct btrfs_path *path)
 {
 	struct btrfs_key unused_key;
 	int err;
 	err = btrfs_find_item(fs_root, path, inum, ioff,
 			      BTRFS_INODE_ITEM_KEY, &unused_key);
 	return err;
 }
 /*
  * Position path at the item (inum INODE_REF ioff) in fs_root and report the
  * key that was found through found_key. The return value of
  * btrfs_find_item() is passed through unchanged.
  */
 static int inode_ref_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
 				struct btrfs_path *path,
 				struct btrfs_key *found_key)
 {
 	int err;
 	err = btrfs_find_item(fs_root, path, inum, ioff, BTRFS_INODE_REF_KEY,
 			      found_key);
 	return err;
 }
 /*
  * Find the first INODE_EXTREF item of inode_objectid whose key offset is at
  * or after start_off.
  *
  * On success path points at the item, *ret_extref is the item's address
  * inside the leaf and, if found_off is not NULL, *found_off receives the key
  * offset of the item.
  *
  * returns 0 on success, -ENOENT if there is no such item, <0 on other errors
  */
 int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  u64 start_off, struct btrfs_path *path,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off)
 {
 	struct btrfs_key search_key;
 	struct btrfs_key cur_key;
 	struct extent_buffer *leaf;
 	unsigned long item_addr;
 	int slot;
 	int err;
 	search_key.objectid = inode_objectid;
 	btrfs_set_key_type(&search_key, BTRFS_INODE_EXTREF_KEY);
 	search_key.offset = start_off;
 	err = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
 	if (err < 0)
 		return err;
 	/*
 	 * Without an exact match the search leaves us at the insertion
 	 * position, which may be one past the last item of a leaf. Step to
 	 * the following leaf until the slot refers to a real item.
 	 */
 	for (;;) {
 		leaf = path->nodes[0];
 		slot = path->slots[0];
 		if (slot < btrfs_header_nritems(leaf))
 			break;
 		err = btrfs_next_leaf(root, path);
 		if (err < 0)
 			return err;
 		if (err > 0)
 			return -ENOENT;
 	}
 	btrfs_item_key_to_cpu(leaf, &cur_key, slot);
 	/*
 	 * A different objectid or key type means we walked past the extended
 	 * refs of this inode: nothing more to find.
 	 */
 	if (cur_key.objectid != inode_objectid ||
 	    btrfs_key_type(&cur_key) != BTRFS_INODE_EXTREF_KEY)
 		return -ENOENT;
 	item_addr = btrfs_item_ptr_offset(leaf, path->slots[0]);
 	*ret_extref = (struct btrfs_inode_extref *)item_addr;
 	if (found_off)
 		*found_off = cur_key.offset;
 	return 0;
 }
 /*
  * Turn a name (taken from an iref/extref) into a path relative to the
  * current file system by walking the INODE_REF items of the parents.
  *
  * Path elements are separated by '/' and the result is 0-terminated. The
  * path is only resolved within fs_root, so it never starts with a '/'.
  *
  * The caller provides "size" bytes in "dest". The buffer is filled backwards,
  * starting at its end, and the start of the resulting string is returned;
  * normally this pointer lies within dest.
  *
  * If the path does not fit, nothing more is written but the pointer keeps
  * being decremented as if it had been. The returned value is then smaller
  * than dest and the distance tells how many bytes were missing. Callers must
  * check this!
  *
  * name_len/name_off describe the first (innermost) name inside eb_in, parent
  * is the inode number of the directory containing it.
  *
  * returns the start of the path, or an ERR_PTR() if a lookup failed.
  */
 char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
 			u32 name_len, unsigned long name_off,
 			struct extent_buffer *eb_in, u64 parent,
 			char *dest, u32 size)
 {
 	int slot;
 	u64 next_inum;
 	int ret;
 	/* signed on purpose: goes negative once the buffer is exhausted */
 	s64 bytes_left = ((s64)size) - 1;
 	struct extent_buffer *eb = eb_in;
 	struct btrfs_key found_key;
 	/* remember the caller's setting, it is restored before returning */
 	int leave_spinning = path->leave_spinning;
 	struct btrfs_inode_ref *iref;
 	if (bytes_left >= 0)
 		dest[bytes_left] = '\0';
 	path->leave_spinning = 1;
 	while (1) {
 		/* prepend the current name if there is still room for it */
 		bytes_left -= name_len;
 		if (bytes_left >= 0)
 			read_extent_buffer(eb, dest + bytes_left,
 					   name_off, name_len);
 		/* drop the lock and reference taken in the previous round */
 		if (eb != eb_in) {
 			btrfs_tree_read_unlock_blocking(eb);
 			free_extent_buffer(eb);
 		}
 		/* look up the INODE_REF of the directory we are in */
 		ret = inode_ref_info(parent, 0, fs_root, path, &found_key);
 		if (ret > 0)
 			ret = -ENOENT;
 		if (ret)
 			break;
 		/* the key offset of an INODE_REF is used as the next inode */
 		next_inum = found_key.offset;
 		/* regular exit ahead */
 		if (parent == next_inum)
 			break;
 		slot = path->slots[0];
 		eb = path->nodes[0];
 		/* make sure we can use eb after releasing the path */
 		if (eb != eb_in) {
 			atomic_inc(&eb->refs);
 			btrfs_tree_read_lock(eb);
 			btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		}
 		btrfs_release_path(path);
 		/* the name follows directly behind the ref structure */
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 		name_len = btrfs_inode_ref_name_len(eb, iref);
 		name_off = (unsigned long)(iref + 1);
 		parent = next_inum;
 		/* account for (and, if it fits, write) the separator */
 		--bytes_left;
 		if (bytes_left >= 0)
 			dest[bytes_left] = '/';
 	}
 	btrfs_release_path(path);
 	path->leave_spinning = leave_spinning;
 	if (ret)
 		return ERR_PTR(ret);
 	/* points before dest when the buffer was too small */
 	return dest + bytes_left;
 }
 /*
  * Look up the extent item that covers the logical address and leave path
  * pointing at it; the item's key is returned through found_key.
  *
  * On success *flags_ret is set to BTRFS_EXTENT_FLAG_DATA for data extents or
  * BTRFS_EXTENT_FLAG_TREE_BLOCK for tree blocks and 0 is returned.
  * returns -ENOENT if no extent covers logical, -EIO if flags_ret is NULL
  * (after a warning), or another value <0 if the tree search failed.
  */
 int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 			struct btrfs_path *path, struct btrfs_key *found_key,
 			u64 *flags_ret)
 {
 	struct btrfs_root *extent_root = fs_info->extent_root;
 	struct btrfs_key search_key;
 	struct btrfs_extent_item *ei;
 	struct extent_buffer *leaf;
 	u64 extent_flags;
 	u64 extent_len;
 	u32 item_size;
 	int err;
 	/* search behind every possible item for this bytenr, then step back */
 	search_key.objectid = logical;
 	search_key.type = btrfs_fs_incompat(fs_info, SKINNY_METADATA) ?
 			  BTRFS_METADATA_ITEM_KEY : BTRFS_EXTENT_ITEM_KEY;
 	search_key.offset = (u64)-1;
 	err = btrfs_search_slot(NULL, extent_root, &search_key, path, 0, 0);
 	if (err < 0)
 		return err;
 	err = btrfs_previous_extent_item(extent_root, path, 0);
 	if (err > 0)
 		return -ENOENT;
 	if (err)
 		return err;
 	leaf = path->nodes[0];
 	btrfs_item_key_to_cpu(leaf, found_key, path->slots[0]);
 	/* metadata items carry no length in the key, use the leaf size */
 	switch (found_key->type) {
 	case BTRFS_METADATA_ITEM_KEY:
 		extent_len = extent_root->leafsize;
 		break;
 	case BTRFS_EXTENT_ITEM_KEY:
 		extent_len = found_key->offset;
 		break;
 	default:
 		extent_len = 0;
 		break;
 	}
 	if (logical < found_key->objectid ||
 	    logical >= found_key->objectid + extent_len) {
 		pr_debug("logical %llu is not within any extent\n", logical);
 		return -ENOENT;
 	}
 	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
 	BUG_ON(item_size < sizeof(*ei));
 	ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
 	extent_flags = btrfs_extent_flags(leaf, ei);
 	pr_debug("logical %llu is at position %llu within the extent (%llu "
 		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
 		 logical, logical - found_key->objectid, found_key->objectid,
 		 found_key->offset, extent_flags, item_size);
 	WARN_ON(!flags_ret);
 	if (!flags_ret)
 		return -EIO;
 	/* tree block takes precedence when both bits happen to be set */
 	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		*flags_ret = BTRFS_EXTENT_FLAG_TREE_BLOCK;
 	else if (extent_flags & BTRFS_EXTENT_FLAG_DATA)
 		*flags_ret = BTRFS_EXTENT_FLAG_DATA;
 	else
 		BUG_ON(1);
 	return 0;
 }
 /*
  * Helper to iterate over the inline refs stored behind an extent item.
  *
  * *ptr is the iteration state: it must be 0 for the first call and is
  * advanced to the address of the following inline ref on every call. The
  * current ref and its type are returned through out_eiref and out_type.
  *
  * returns 0 if more refs exist (call again with the modified *ptr),
  * 1 after the last ref was processed,
  * -ENOENT if the item holds no inline refs at all.
  */
 static int __get_extent_inline_ref(unsigned long *ptr, struct extent_buffer *eb,
-				struct btrfs_extent_item *ei, u32 item_size,
+				   struct btrfs_key *key,
-				struct btrfs_extent_inline_ref **out_eiref,
+				   struct btrfs_extent_item *ei, u32 item_size,
-				int *out_type)
+				   struct btrfs_extent_inline_ref **out_eiref,
+				   int *out_type)
 {
 	unsigned long end;
 	u64 flags;
 	struct btrfs_tree_block_info *info;
 	if (!*ptr) {
 		/* first call */
 		flags = btrfs_extent_flags(eb, ei);
 		if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
-			info = (struct btrfs_tree_block_info *)(ei + 1);
+			if (key->type == BTRFS_METADATA_ITEM_KEY) {
-			*out_eiref =
+				/* a skinny metadata extent */
-				(struct btrfs_extent_inline_ref *)(info + 1);
+				*out_eiref =
+				     (struct btrfs_extent_inline_ref *)(ei + 1);
+			} else {
+				WARN_ON(key->type != BTRFS_EXTENT_ITEM_KEY);
+				info = (struct btrfs_tree_block_info *)(ei + 1);
+				*out_eiref =
+				   (struct btrfs_extent_inline_ref *)(info + 1);
+			}
 		} else {
 			*out_eiref = (struct btrfs_extent_inline_ref *)(ei + 1);
 		}
 		*ptr = (unsigned long)*out_eiref;
 		/* the first ref would start at or behind the end of the item */
 		if ((unsigned long)(*ptr) >= (unsigned long)ei + item_size)
 			return -ENOENT;
 	}
 	end = (unsigned long)ei + item_size;
-	*out_eiref = (struct btrfs_extent_inline_ref *)*ptr;
+	*out_eiref = (struct btrfs_extent_inline_ref *)(*ptr);
 	*out_type = btrfs_extent_inline_ref_type(eb, *out_eiref);
 	/* the size of an inline ref depends on its type */
 	*ptr += btrfs_extent_inline_ref_size(*out_type);
 	WARN_ON(*ptr > end);
 	if (*ptr == end)
 		return 1; /* last */
 	return 0;
 }
 /*
  * Reads the next tree block backref of an extent item. The root is returned
  * through out_root and the tree level through out_level. Inline refs of any
  * other type than TREE_BLOCK_REF or SHARED_BLOCK_REF are skipped.
  *
  * ptr must point to a 0 value for the first call and may be modified (see
  * __get_extent_inline_ref comment). Once the last ref has been handed out
  * *ptr is set to (unsigned long)-1, which makes the next call return 1.
  *
  * returns 0 if data was provided, 1 if there was no more data to provide or
  * <0 on error.
  */
 int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
-				struct btrfs_extent_item *ei, u32 item_size,
+			    struct btrfs_key *key, struct btrfs_extent_item *ei,
-				u64 *out_root, u8 *out_level)
+			    u32 item_size, u64 *out_root, u8 *out_level)
 {
 	int ret;
 	int type;
 	struct btrfs_tree_block_info *info;
 	struct btrfs_extent_inline_ref *eiref;
 	/* end marker stored by a previous call */
 	if (*ptr == (unsigned long)-1)
 		return 1;
 	while (1) {
-		ret = __get_extent_inline_ref(ptr, eb, ei, item_size,
+		ret = __get_extent_inline_ref(ptr, eb, key, ei, item_size,
-						&eiref, &type);
+					      &eiref, &type);
 		if (ret < 0)
 			return ret;
 		if (type == BTRFS_TREE_BLOCK_REF_KEY ||
 		    type == BTRFS_SHARED_BLOCK_REF_KEY)
 			break;
 		/* last ref reached and it was not a tree block ref */
 		if (ret == 1)
 			return 1;
 	}
 	/* we can treat both ref types equally here */
 	/*
 	 * NOTE(review): for a BTRFS_METADATA_ITEM_KEY item,
 	 * __get_extent_inline_ref() treats (ei + 1) as the first inline ref,
 	 * i.e. no btrfs_tree_block_info follows the extent item. Reading the
 	 * level from (ei + 1) below looks wrong for that case - confirm.
 	 */
 	info = (struct btrfs_tree_block_info *)(ei + 1);
 	*out_root = btrfs_extent_inline_ref_offset(eb, eiref);
 	*out_level = btrfs_tree_block_level(eb, info);
 	/* remember that this was the last ref for the next call */
 	if (ret == 1)
 		*ptr = (unsigned long)-1;
 	return 0;
 }
 /*
  * Call iterate() for every element of inode_list, passing root and ctx
  * along. The walk stops at the first non-zero return value, which is then
  * returned; an empty or fully processed list yields 0.
  */
 static int iterate_leaf_refs(struct extent_inode_elem *inode_list,
 				u64 root, u64 extent_item_objectid,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	struct extent_inode_elem *cur = inode_list;
 	int err = 0;
 	while (cur) {
 		pr_debug("ref for %llu resolved, key (%llu EXTEND_DATA %llu), "
 			 "root %llu\n", extent_item_objectid,
 			 cur->inum, cur->offset, root);
 		err = iterate(cur->inum, cur->offset, root, ctx);
 		if (err) {
 			pr_debug("stopping iteration for %llu due to ret=%d\n",
 				 extent_item_objectid, err);
 			return err;
 		}
 		cur = cur->next;
 	}
 	return err;
 }
 /*
  * calls iterate() for every inode that references the extent identified by
  * the given parameters.
  * when the iterator function returns a non-zero value, iteration stops.
  *
  * With search_commit_root == 0 the walk runs inside a joined transaction and
  * with a tree mod sequence number; otherwise commit_root_sem is held for
  * reading instead and no transaction is used.
  *
  * returns 0 on success, the non-zero value returned by iterate(), or <0 on
  * error.
  */
 int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 				u64 extent_item_objectid, u64 extent_item_pos,
 				int search_commit_root,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	int ret;
 	struct btrfs_trans_handle *trans = NULL;
 	struct ulist *refs = NULL;
 	struct ulist *roots = NULL;
 	struct ulist_node *ref_node = NULL;
 	struct ulist_node *root_node = NULL;
 	/* seq stays 0 when searching the commit root */
 	struct seq_list tree_mod_seq_elem = {};
 	struct ulist_iterator ref_uiter;
 	struct ulist_iterator root_uiter;
 	pr_debug("resolving all inodes for extent %llu\n",
 			extent_item_objectid);
 	if (!search_commit_root) {
 		trans = btrfs_join_transaction(fs_info->extent_root);
 		if (IS_ERR(trans))
 			return PTR_ERR(trans);
 		btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 	} else {
 		down_read(&fs_info->commit_root_sem);
 	}
 	/* step 1: all leaves that reference the extent at this position */
 	ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
 				   tree_mod_seq_elem.seq, &refs,
 				   &extent_item_pos);
 	/* on failure the leaf list was already released by the callee */
 	if (ret)
 		goto out;
 	ULIST_ITER_INIT(&ref_uiter);
 	while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
 		/* step 2: all roots through which this leaf is reachable */
 		ret = __btrfs_find_all_roots(trans, fs_info, ref_node->val,
 					     tree_mod_seq_elem.seq, &roots);
 		if (ret)
 			break;
 		ULIST_ITER_INIT(&root_uiter);
 		/* step 3: report the leaf's inode list once per root */
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
 				 "%#llx\n", root_node->val, ref_node->val,
 				 ref_node->aux);
 			/* aux carries a pointer to the extent_inode_elem list */
 			ret = iterate_leaf_refs((struct extent_inode_elem *)
 						(uintptr_t)ref_node->aux,
 						root_node->val,
 						extent_item_objectid,
 						iterate, ctx);
 		}
 		ulist_free(roots);
 	}
 	free_leaf_list(refs);
 out:
 	/* undo whatever was set up at the top, in reverse order */
 	if (!search_commit_root) {
 		btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
 		btrfs_end_transaction(trans, fs_info->extent_root);
 	} else {
 		up_read(&fs_info->commit_root_sem);
 	}
 	return ret;
 }
 /*
  * Resolve a logical address to its extent and call iterate() for every
  * inode that references the byte at that address.
  *
  * path is only used for the extent lookup and is released again before the
  * inodes are iterated; its search_commit_root setting decides whether the
  * commit root is searched.
  *
  * returns -EINVAL if logical belongs to a tree block, otherwise the result
  * of the lookup (if <0) or of iterate_extent_inodes().
  */
 int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
 				struct btrfs_path *path,
 				iterate_extent_inodes_t *iterate, void *ctx)
 {
 	struct btrfs_key extent_key;
 	u64 extent_flags = 0;
 	const int use_commit_root = path->search_commit_root;
 	int err;
 	err = extent_from_logical(fs_info, logical, path, &extent_key,
 				  &extent_flags);
 	btrfs_release_path(path);
 	if (err < 0)
 		return err;
 	if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
 		return -EINVAL;
 	/* third argument: position of logical inside the extent */
 	return iterate_extent_inodes(fs_info, extent_key.objectid,
 				     logical - extent_key.objectid,
 				     use_commit_root, iterate, ctx);
 }
 /*
  * Callback type used by iterate_inode_refs() and iterate_inode_extrefs().
  * It is called once per name with the parent directory's inode number and
  * the length and location (name_off within eb) of the name. A non-zero
  * return value ends the walk over the current item.
  */
 typedef int (iterate_irefs_t)(u64 parent, u32 name_len, unsigned long name_off,
 			      struct extent_buffer *eb, void *ctx);
 /*
  * Call iterate() for every name stored in the INODE_REF items of inode inum
  * in fs_root.
  *
  * Each item is processed on a private clone of its leaf so that the path
  * can be released before iterate() is called.
  *
  * returns 0 if at least one item was found and all callbacks returned 0,
  * -ENOENT if there is no INODE_REF item, the non-zero value returned by
  * iterate(), or <0 on error.
  */
 static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 			      struct btrfs_path *path,
 			      iterate_irefs_t *iterate, void *ctx)
 {
 	int ret = 0;
 	int slot;
 	u32 cur;
 	u32 len;
 	u32 name_len;
 	u64 parent = 0;
 	int found = 0;
 	struct extent_buffer *eb;
 	struct btrfs_item *item;
 	struct btrfs_inode_ref *iref;
 	struct btrfs_key found_key;
 	while (!ret) {
 		/* continue behind the item handled in the previous round */
 		ret = inode_ref_info(inum, parent ? parent+1 : 0, fs_root, path,
 				     &found_key);
 		if (ret < 0)
 			break;
 		if (ret) {
 			/* no further item; only an error if there was none */
 			ret = found ? 0 : -ENOENT;
 			break;
 		}
 		++found;
 		/* the key offset of an INODE_REF item is the parent inode */
 		parent = found_key.offset;
 		slot = path->slots[0];
 		eb = btrfs_clone_extent_buffer(path->nodes[0]);
 		if (!eb) {
 			ret = -ENOMEM;
 			break;
 		}
 		extent_buffer_get(eb);
 		btrfs_tree_read_lock(eb);
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 		item = btrfs_item_nr(slot);
 		iref = btrfs_item_ptr(eb, slot, struct btrfs_inode_ref);
 		/* one item may pack several refs, each followed by its name */
 		for (cur = 0; cur < btrfs_item_size(eb, item); cur += len) {
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
 			pr_debug("following ref at offset %u for inode %llu in "
 				 "tree %llu\n", cur, found_key.objectid,
 				 fs_root->objectid);
 			ret = iterate(parent, name_len,
 				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
 				break;
 			len = sizeof(*iref) + name_len;
 			iref = (struct btrfs_inode_ref *)((char *)iref + len);
 		}
 		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 	}
 	btrfs_release_path(path);
 	return ret;
 }
 /*
  * Call iterate() for every name stored in the INODE_EXTREF items of inode
  * inum in fs_root.
  *
  * Each item is processed on a private clone of its leaf so that the path
  * can be released before iterate() is called. After the release the path no
  * longer references the leaf, so all item accesses go through the clone.
  *
  * The walk stops as soon as iterate() returns a non-zero value, which is
  * then returned. Otherwise the result of the failing lookup is returned;
  * btrfs_find_one_extref() reports "no (more) items" as -ENOENT, which
  * iterate_irefs() knows how to handle.
  */
 static int iterate_inode_extrefs(u64 inum, struct btrfs_root *fs_root,
 				 struct btrfs_path *path,
 				 iterate_irefs_t *iterate, void *ctx)
 {
 	int ret;
 	int slot;
 	u64 offset = 0;
 	u64 parent;
 	int found = 0;
 	struct extent_buffer *eb;
 	struct btrfs_inode_extref *extref;
 	u32 item_size;
 	u32 cur_offset;
 	unsigned long ptr;
 	while (1) {
 		ret = btrfs_find_one_extref(fs_root, inum, offset, path, &extref,
 					    &offset);
 		if (ret < 0)
 			break;
 		if (ret) {
 			ret = found ? 0 : -ENOENT;
 			break;
 		}
 		++found;
 		slot = path->slots[0];
 		eb = btrfs_clone_extent_buffer(path->nodes[0]);
 		if (!eb) {
 			ret = -ENOMEM;
 			break;
 		}
 		extent_buffer_get(eb);
 		btrfs_tree_read_lock(eb);
 		btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
 		btrfs_release_path(path);
 		/*
 		 * path->nodes[0] is gone after the release above, the item
 		 * must be read from the cloned buffer.
 		 */
 		item_size = btrfs_item_size_nr(eb, slot);
 		ptr = btrfs_item_ptr_offset(eb, slot);
 		cur_offset = 0;
 		/* one item may pack several extrefs, each with its name */
 		while (cur_offset < item_size) {
 			u32 name_len;
 			extref = (struct btrfs_inode_extref *)(ptr + cur_offset);
 			parent = btrfs_inode_extref_parent(eb, extref);
 			name_len = btrfs_inode_extref_name_len(eb, extref);
 			ret = iterate(parent, name_len,
 				      (unsigned long)&extref->name, eb, ctx);
 			if (ret)
 				break;
 			cur_offset += name_len;
 			cur_offset += sizeof(*extref);
 		}
 		btrfs_tree_read_unlock_blocking(eb);
 		free_extent_buffer(eb);
 		/*
 		 * Do not let the next lookup overwrite a stop request or an
 		 * error from iterate(); iterate_inode_refs() stops as well.
 		 */
 		if (ret)
 			break;
 		offset++;
 	}
 	btrfs_release_path(path);
 	return ret;
 }
 /*
  * Run iterate() over all names of inode inum: first the regular INODE_REF
  * items, then the extended refs.
  *
  * A missing kind of ref (-ENOENT) is tolerated for the regular refs. For
  * the extended refs it is tolerated only if the regular refs were iterated
  * successfully. Any other non-zero result from the regular refs is returned
  * right away.
  */
 static int iterate_irefs(u64 inum, struct btrfs_root *fs_root,
 			 struct btrfs_path *path, iterate_irefs_t *iterate,
 			 void *ctx)
 {
 	int refs_err;
 	int extrefs_err;
 	refs_err = iterate_inode_refs(inum, fs_root, path, iterate, ctx);
 	if (refs_err && refs_err != -ENOENT)
 		return refs_err;
 	extrefs_err = iterate_inode_extrefs(inum, fs_root, path, iterate, ctx);
 	if (extrefs_err == -ENOENT && !refs_err)
 		return 0;
 	return extrefs_err;
 }
 /*
  * iterate_irefs_t callback: resolve one name of an inode to a path and
  * record it in the data container of the inode_fs_paths passed as ctx.
  *
  * val[] of the container grows from the front with one entry per path, the
  * path strings are written from the end of the remaining space. If a path
  * did not fit it is only counted in elem_missed/bytes_missing.
  *
  * returns 0 if the path was stored or counted as missed
  * returns <0 if resolving the path failed
  */
 static int inode_to_path(u64 inum, u32 name_len, unsigned long name_off,
 			 struct extent_buffer *eb, void *ctx)
 {
 	struct inode_fs_paths *ipath = ctx;
 	struct btrfs_data_container *out = ipath->fspath;
 	const int s_ptr = sizeof(char *);
 	int i = out->elem_cnt;
 	char *fspath_min;
 	char *fspath;
 	u32 bytes_left = 0;
 	/* one more val[] slot is needed for the path we are about to add */
 	if (out->bytes_left > s_ptr)
 		bytes_left = out->bytes_left - s_ptr;
 	fspath_min = (char *)out->val + (i + 1) * s_ptr;
 	fspath = btrfs_ref_to_path(ipath->fs_root, ipath->btrfs_path, name_len,
 				   name_off, eb, inum, fspath_min, bytes_left);
 	if (IS_ERR(fspath))
 		return PTR_ERR(fspath);
 	if (fspath <= fspath_min) {
 		/* did not fit: the pointer ran below the usable area */
 		++out->elem_missed;
 		out->bytes_missing += fspath_min - fspath;
 		out->bytes_left = 0;
 		return 0;
 	}
 	out->val[i] = (u64)(unsigned long)fspath;
 	++out->elem_cnt;
 	out->bytes_left = fspath - fspath_min;
 	return 0;
 }
 /*
  * Dump all file system paths of inode inum into ipath, provided it has been
  * created large enough. Every path is zero-terminated and can be accessed
  * through ipath->fspath->val[i].
  *
  * On return ipath->fspath->elem_cnt paths are available in
  * ipath->fspath->val[]. If the allocated space was not sufficient, the
  * number of paths that did not fit is recorded in
  * ipath->fspath->elem_missed (zero otherwise) and
  * ipath->fspath->bytes_missing holds the number of bytes that would have
  * been needed to return all paths.
  */
 int paths_from_inode(u64 inum, struct inode_fs_paths *ipath)
 {
 	int err;
 	err = iterate_irefs(inum, ipath->fs_root, ipath->btrfs_path,
 			    inode_to_path, ipath);
 	return err;
 }
 /*
  * Allocate a data container of total_bytes bytes with vmalloc(), but never
  * less than the container header itself.
  *
  * bytes_left is set to the space remaining behind the header. If
  * total_bytes is too small for even the header, bytes_left is 0 and
  * bytes_missing records the shortfall.
  *
  * returns the container (to be released with vfree()) or ERR_PTR(-ENOMEM).
  */
 struct btrfs_data_container *init_data_container(u32 total_bytes)
 {
 	struct btrfs_data_container *data;
 	size_t alloc_bytes = total_bytes;
 	if (alloc_bytes < sizeof(*data))
 		alloc_bytes = sizeof(*data);
 	data = vmalloc(alloc_bytes);
 	if (!data)
 		return ERR_PTR(-ENOMEM);
 	data->elem_cnt = 0;
 	data->elem_missed = 0;
 	if (total_bytes < sizeof(*data)) {
 		data->bytes_left = 0;
 		data->bytes_missing = sizeof(*data) - total_bytes;
 		return data;
 	}
 	data->bytes_left = total_bytes - sizeof(*data);
 	data->bytes_missing = 0;
 	return data;
 }
 /*
  * allocates space to return multiple file system paths for an inode.
  * total_bytes to allocate are passed, note that space usable for actual path
  * information will be total_bytes - sizeof(struct btrfs_data_container), see
  * init_data_container().
  * fs_root and path are stored in the returned structure and used later when
  * the paths are resolved.
  * the returned pointer must be freed with free_ipath() in the end.
  *
  * returns the new structure or an ERR_PTR() on allocation failure.
  */
 struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
 					struct btrfs_path *path)
 {
 	struct inode_fs_paths *ifp;
 	struct btrfs_data_container *fspath;
 	fspath = init_data_container(total_bytes);
 	if (IS_ERR(fspath))
 		return (void *)fspath;
 	ifp = kmalloc(sizeof(*ifp), GFP_NOFS);
 	if (!ifp) {
 		/* the container was vmalloc()ed, kfree() must not be used */
 		vfree(fspath);
 		return ERR_PTR(-ENOMEM);
 	}
 	ifp->btrfs_path = path;
 	ifp->fspath = fspath;
 	ifp->fs_root = fs_root;
 	return ifp;
 }
 /*
  * Release a structure obtained from init_ipath(), including its data
  * container. Passing NULL is allowed and does nothing.
  */
 void free_ipath(struct inode_fs_paths *ipath)
 {
 	if (ipath) {
 		vfree(ipath->fspath);
 		kfree(ipath);
 	}
 }

1	/*	1	/*
2	* Copyright (C) 2011 STRATO. All rights reserved.	2	* Copyright (C) 2011 STRATO. All rights reserved.
3	*	3	*
4	* This program is free software; you can redistribute it and/or	4	* This program is free software; you can redistribute it and/or
5	* modify it under the terms of the GNU General Public	5	* modify it under the terms of the GNU General Public
6	* License v2 as published by the Free Software Foundation.	6	* License v2 as published by the Free Software Foundation.
7	*	7	*
8	* This program is distributed in the hope that it will be useful,	8	* This program is distributed in the hope that it will be useful,
9	* but WITHOUT ANY WARRANTY; without even the implied warranty of	9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU	10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11	* General Public License for more details.	11	* General Public License for more details.
12	*	12	*
13	* You should have received a copy of the GNU General Public	13	* You should have received a copy of the GNU General Public
14	* License along with this program; if not, write to the	14	* License along with this program; if not, write to the
15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,	15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16	* Boston, MA 021110-1307, USA.	16	* Boston, MA 021110-1307, USA.
17	*/	17	*/
18		18
19	#ifndef __BTRFS_BACKREF__	19	#ifndef __BTRFS_BACKREF__
20	#define __BTRFS_BACKREF__	20	#define __BTRFS_BACKREF__
21		21
22	#include <linux/btrfs.h>	22	#include <linux/btrfs.h>
23	#include "ulist.h"	23	#include "ulist.h"
24	#include "extent_io.h"	24	#include "extent_io.h"
25		25
26	struct inode_fs_paths {	26	struct inode_fs_paths {
27	struct btrfs_path *btrfs_path;	27	struct btrfs_path *btrfs_path;
28	struct btrfs_root *fs_root;	28	struct btrfs_root *fs_root;
29	struct btrfs_data_container *fspath;	29	struct btrfs_data_container *fspath;
30	};	30	};
31		31
32	typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,	32	typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
33	void *ctx);	33	void *ctx);
34		34
35	int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,	35	int inode_item_info(u64 inum, u64 ioff, struct btrfs_root *fs_root,
36	struct btrfs_path *path);	36	struct btrfs_path *path);
37		37
38	int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,	38	int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
39	struct btrfs_path *path, struct btrfs_key *found_key,	39	struct btrfs_path *path, struct btrfs_key *found_key,
40	u64 *flags);	40	u64 *flags);
41		41
42	int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,	42	int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
43	struct btrfs_extent_item *ei, u32 item_size,	43	struct btrfs_key *key, struct btrfs_extent_item *ei,
44	u64 *out_root, u8 *out_level);	44	u32 item_size, u64 *out_root, u8 *out_level);
45		45
46	int iterate_extent_inodes(struct btrfs_fs_info *fs_info,	46	int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
47	u64 extent_item_objectid,	47	u64 extent_item_objectid,
48	u64 extent_offset, int search_commit_root,	48	u64 extent_offset, int search_commit_root,
49	iterate_extent_inodes_t *iterate, void *ctx);	49	iterate_extent_inodes_t *iterate, void *ctx);
50		50
51	int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,	51	int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
52	struct btrfs_path *path,	52	struct btrfs_path *path,
53	iterate_extent_inodes_t *iterate, void *ctx);	53	iterate_extent_inodes_t *iterate, void *ctx);
54		54
55	int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);	55	int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
56		56
57	int btrfs_find_all_roots(struct btrfs_trans_handle *trans,	57	int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
58	struct btrfs_fs_info *fs_info, u64 bytenr,	58	struct btrfs_fs_info *fs_info, u64 bytenr,
59	u64 time_seq, struct ulist **roots);	59	u64 time_seq, struct ulist **roots);
60	char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,	60	char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
61	u32 name_len, unsigned long name_off,	61	u32 name_len, unsigned long name_off,
62	struct extent_buffer *eb_in, u64 parent,	62	struct extent_buffer *eb_in, u64 parent,
63	char *dest, u32 size);	63	char *dest, u32 size);
64		64
65	struct btrfs_data_container *init_data_container(u32 total_bytes);	65	struct btrfs_data_container *init_data_container(u32 total_bytes);
66	struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,	66	struct inode_fs_paths *init_ipath(s32 total_bytes, struct btrfs_root *fs_root,
67	struct btrfs_path *path);	67	struct btrfs_path *path);
68	void free_ipath(struct inode_fs_paths *ipath);	68	void free_ipath(struct inode_fs_paths *ipath);
69		69
70	int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,	70	int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
71	u64 start_off, struct btrfs_path *path,	71	u64 start_off, struct btrfs_path *path,
72	struct btrfs_inode_extref **ret_extref,	72	struct btrfs_inode_extref **ret_extref,
73	u64 *found_off);	73	u64 *found_off);
74		74
75	int __init btrfs_prelim_ref_init(void);	75	int __init btrfs_prelim_ref_init(void);
76	void btrfs_prelim_ref_exit(void);	76	void btrfs_prelim_ref_exit(void);
77	#endif	77	#endif
78		78

 /*
  * Copyright (C) 2011, 2012 STRATO.  All rights reserved.
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public
  * License v2 as published by the Free Software Foundation.
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  * General Public License for more details.
  *
  * You should have received a copy of the GNU General Public
  * License along with this program; if not, write to the
  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  * Boston, MA 021110-1307, USA.
  */
 #include <linux/blkdev.h>
 #include <linux/ratelimit.h>
 #include "ctree.h"
 #include "volumes.h"
 #include "disk-io.h"
 #include "ordered-data.h"
 #include "transaction.h"
 #include "backref.h"
 #include "extent_io.h"
 #include "dev-replace.h"
 #include "check-integrity.h"
 #include "rcu-string.h"
 #include "raid56.h"
 /*
  * This is only the first step towards a full-featured scrub. It reads all
  * extents and super blocks and verifies the checksums. In case a bad checksum
  * is found or the extent cannot be read, good data will be written back if
  * any can be found.
  *
  * Future enhancements:
  *  - In case an unrepairable extent is encountered, track which files are
  *    affected and report them
  *  - track and record media errors, throw out bad devices
  *  - add a mode to also read unallocated space
  */
 struct scrub_block;
 struct scrub_ctx;
 /*
  * The following three values only influence the performance.
  * The last one configures the number of parallel and outstanding I/O
  * operations. The first two values configure an upper limit for the number
  * of (dynamically allocated) pages that are added to a read bio and to a
  * write bio respectively.
  * The byte figures in the trailing comments correspond to 4k pages.
  */
 #define SCRUB_PAGES_PER_RD_BIO	32	/* 128k per bio */
 #define SCRUB_PAGES_PER_WR_BIO	32	/* 128k per bio */
 #define SCRUB_BIOS_PER_SCTX	64	/* 8MB per device in flight */
 /*
  * The following value times PAGE_SIZE needs to be large enough to match the
  * largest node/leaf/sector size that shall be supported.
  * Values larger than BTRFS_STRIPE_LEN are not supported.
  * It is also the size of the pagev[] array in struct scrub_block.
  */
 #define SCRUB_MAX_PAGES_PER_BLOCK	16	/* 64k per node/leaf/sector */
 /*
  * Bookkeeping for one page handled by scrub: the page itself, the block and
  * device it belongs to, its addresses and the expected checksum.
  * NOTE(review): the field notes below are inferred from the names, the
  * users are not part of this excerpt - confirm.
  */
 struct scrub_page {
 	struct scrub_block	*sblock;
 	struct page		*page;
 	struct btrfs_device	*dev;
 	u64			flags;  /* extent flags */
 	u64			generation;
 	u64			logical;
 	u64			physical;
 	u64			physical_for_dev_replace;
 	/* presumably managed by scrub_page_get()/scrub_page_put() */
 	atomic_t		ref_count;
 	struct {
 		unsigned int	mirror_num:8;
 		unsigned int	have_csum:1;
 		unsigned int	io_error:1;
 	};
 	/* presumably only meaningful when have_csum is set */
 	u8			csum[BTRFS_CSUM_SIZE];
 };
 /*
  * One bio used by scrub together with the scrub pages attached to it.
  * NOTE(review): the users of this structure are not part of this excerpt;
  * field meanings are to be confirmed there.
  */
 struct scrub_bio {
 	int			index;
 	struct scrub_ctx	*sctx;
 	struct btrfs_device	*dev;
 	struct bio		*bio;
 	int			err;
 	u64			logical;
 	u64			physical;
 	/* size pagev[] for the larger of the read and write per-bio limits */
 #if SCRUB_PAGES_PER_WR_BIO >= SCRUB_PAGES_PER_RD_BIO
 	struct scrub_page	*pagev[SCRUB_PAGES_PER_WR_BIO];
 #else
 	struct scrub_page	*pagev[SCRUB_PAGES_PER_RD_BIO];
 #endif
 	int			page_count;
 	int			next_free;
 	struct btrfs_work	work;
 };
 /*
  * A block handled by scrub, made up of at most SCRUB_MAX_PAGES_PER_BLOCK
  * scrub pages, together with the error state collected for it.
  */
 struct scrub_block {
 	struct scrub_page	*pagev[SCRUB_MAX_PAGES_PER_BLOCK];
 	/* presumably the number of used entries in pagev[] - confirm */
 	int			page_count;
 	atomic_t		outstanding_pages;
 	atomic_t		ref_count; /* free mem on transition to zero */
 	struct scrub_ctx	*sctx;
 	struct {
 		unsigned int	header_error:1;
 		unsigned int	checksum_error:1;
 		unsigned int	no_io_error_seen:1;
 		unsigned int	generation_error:1; /* also sets header_error */
 	};
 };
 /*
  * Write side state of a scrub context; embedded in struct scrub_ctx as
  * wr_ctx.
  * NOTE(review): presumably wr_lock protects wr_curr_bio and tgtdev is the
  * device that is written to - confirm against the users.
  */
 struct scrub_wr_ctx {
 	struct scrub_bio *wr_curr_bio;
 	struct btrfs_device *tgtdev;
 	int pages_per_wr_bio; /* <= SCRUB_PAGES_PER_WR_BIO */
 	atomic_t flush_all_writes;
 	struct mutex wr_lock;
 };
 /*
  * State of one scrub run: a fixed pool of SCRUB_BIOS_PER_SCTX scrub bios,
  * counters for work in flight, size parameters, the embedded write context
  * and the progress statistics.
  * NOTE(review): the users of this structure are not part of this excerpt;
  * field meanings are to be confirmed there.
  */
 struct scrub_ctx {
 	struct scrub_bio	*bios[SCRUB_BIOS_PER_SCTX];
 	struct btrfs_root	*dev_root;
 	int			first_free;
 	int			curr;
 	atomic_t		bios_in_flight;
 	atomic_t		workers_pending;
 	spinlock_t		list_lock;
 	wait_queue_head_t	list_wait;
 	u16			csum_size;
 	struct list_head	csum_list;
 	atomic_t		cancel_req;
 	int			readonly;
 	int			pages_per_rd_bio;
 	u32			sectorsize;
 	u32			nodesize;
 	u32			leafsize;
 	int			is_dev_replace;
 	struct scrub_wr_ctx	wr_ctx;
 	/*
 	 * statistics
 	 */
 	struct btrfs_scrub_progress stat;
 	/* presumably protects stat - confirm */
 	spinlock_t		stat_lock;
 };
 /*
  * Work item carrying what is needed to fix up a block at a logical address
  * on a device.
  * NOTE(review): going by the name this is used for data without checksums;
  * the worker is not part of this excerpt - confirm.
  */
 struct scrub_fixup_nodatasum {
 	struct scrub_ctx	*sctx;
 	struct btrfs_device	*dev;
 	u64			logical;
 	struct btrfs_root	*root;
 	struct btrfs_work	work;
 	int			mirror_num;
 };
 /*
  * List element identifying a file position by inode number, offset and
  * root.
  * NOTE(review): presumably linked into scrub_copy_nocow_ctx.inodes -
  * confirm.
  */
 struct scrub_nocow_inode {
 	u64			inum;
 	u64			offset;
 	u64			root;
 	struct list_head	list;
 };
 /*
  * Work item describing a logical range (logical, len) together with a list
  * of inodes and the physical address used for device replace.
  * NOTE(review): going by the name this drives copying of nocow data; the
  * worker is not part of this excerpt - confirm.
  */
 struct scrub_copy_nocow_ctx {
 	struct scrub_ctx	*sctx;
 	u64			logical;
 	u64			len;
 	int			mirror_num;
 	u64			physical_for_dev_replace;
 	/* presumably a list of struct scrub_nocow_inode - confirm */
 	struct list_head	inodes;
 	struct btrfs_work	work;
 };
 /*
  * Everything needed to print a warning about a bad block: the location
  * (device, sector, logical address), the error text and two buffers with
  * their sizes.
  * NOTE(review): how scratch_buf and msg_buf are used is not visible in this
  * excerpt - confirm against the users.
  */
 struct scrub_warning {
 	struct btrfs_path	*path;
 	u64			extent_item_size;
 	char			*scratch_buf;
 	char			*msg_buf;
 	const char		*errstr;
 	sector_t		sector;
 	u64			logical;
 	struct btrfs_device	*dev;
 	/* sizes of msg_buf and scratch_buf */
 	int			msg_bufsize;
 	int			scratch_bufsize;
 };
 /*
  * Forward declarations of the file-local helpers, so that the definitions
  * below can reference each other regardless of their order.
  */
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx);
 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx);
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
 				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck);
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
 				u16 csum_size);
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
 					 const u8 *csum, u64 generation,
 					 u16 csum_size);
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write);
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write);
 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 					   int page_num);
 static int scrub_checksum_data(struct scrub_block *sblock);
 static int scrub_checksum_tree_block(struct scrub_block *sblock);
 static int scrub_checksum_super(struct scrub_block *sblock);
 static void scrub_block_get(struct scrub_block *sblock);
 static void scrub_block_put(struct scrub_block *sblock);
 static void scrub_page_get(struct scrub_page *spage);
 static void scrub_page_put(struct scrub_page *spage);
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 				    struct scrub_page *spage);
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
 		       u64 gen, int mirror_num, u8 *csum, int force,
 		       u64 physical_for_dev_replace);
 static void scrub_bio_end_io(struct bio *bio, int err);
 static void scrub_bio_end_io_worker(struct btrfs_work *work);
 static void scrub_block_complete(struct scrub_block *sblock);
 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 			       u64 extent_logical, u64 extent_len,
 			       u64 *extent_physical,
 			       struct btrfs_device **extent_dev,
 			       int *extent_mirror_num);
 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 			      struct scrub_wr_ctx *wr_ctx,
 			      struct btrfs_fs_info *fs_info,
 			      struct btrfs_device *dev,
 			      int is_dev_replace);
 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx);
 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 				    struct scrub_page *spage);
 static void scrub_wr_submit(struct scrub_ctx *sctx);
 static void scrub_wr_bio_end_io(struct bio *bio, int err);
 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
 static int write_page_nocow(struct scrub_ctx *sctx,
 			    u64 physical_for_dev_replace, struct page *page);
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				      struct scrub_copy_nocow_ctx *ctx);
 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 			    int mirror_num, u64 physical_for_dev_replace);
 static void copy_nocow_pages_worker(struct btrfs_work *work);
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
 {
 	atomic_inc(&sctx->bios_in_flight);
 }
 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
 {
 	atomic_dec(&sctx->bios_in_flight);
 	wake_up(&sctx->list_wait);
 }
 /*
  * Sleep for as long as a pause request is pending. scrub_lock is dropped
  * for the duration of each sleep and re-taken afterwards, so the function
  * is entered and left with scrub_lock held.
  */
 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 {
 	for (;;) {
 		if (!atomic_read(&fs_info->scrub_pause_req))
 			break;
 		mutex_unlock(&fs_info->scrub_lock);
 		wait_event(fs_info->scrub_pause_wait,
 			   !atomic_read(&fs_info->scrub_pause_req));
 		mutex_lock(&fs_info->scrub_lock);
 	}
 }
 /*
  * Count this scrub as paused, wait until no pause request is pending,
  * then drop out of the paused count again. Waiters on scrub_pause_wait
  * are woken after both the increment and the decrement.
  */
 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
 {
 	struct mutex *lock = &fs_info->scrub_lock;
 	atomic_inc(&fs_info->scrubs_paused);
 	wake_up(&fs_info->scrub_pause_wait);
 	mutex_lock(lock);
 	__scrub_blocked_if_needed(fs_info);
 	atomic_dec(&fs_info->scrubs_paused);
 	mutex_unlock(lock);
 	wake_up(&fs_info->scrub_pause_wait);
 }
 /*
  * Register a worker that needs to commit transactions (i.e. the NOCOW
  * case) with both the filesystem-wide scrub counters and @sctx.
  */
 static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *info = sctx->dev_root->fs_info;
 	struct mutex *lock = &info->scrub_lock;
 	/*
 	 * scrubs_running is raised so that a cancel request cannot
 	 * complete while the worker runs. scrubs_paused is raised as well
 	 * because the worker uses a transaction context, and a pause
 	 * request issued for a transaction commit would otherwise
 	 * deadlock. For all practical matters the worker may be regarded
 	 * as paused; the only real effect is that cancellation requests
 	 * are kept from completing.
 	 */
 	mutex_lock(lock);
 	atomic_inc(&info->scrubs_running);
 	atomic_inc(&info->scrubs_paused);
 	mutex_unlock(lock);
 	/*
 	 * The @scrubs_running == @scrubs_paused test inside wait_event()
 	 * is not atomic, so the two counters may change at any time.
 	 * Wake @scrub_pause_wait as often as possible so that a
 	 * transaction commit stays blocked for as short as possible.
 	 */
 	wake_up(&info->scrub_pause_wait);
 	atomic_inc(&sctx->workers_pending);
 }
 /*
  * Counterpart of scrub_pending_trans_workers_inc(): unregister a worker
  * that required transaction commits and wake everybody who may be waiting
  * for it.
  */
 static void scrub_pending_trans_workers_dec(struct scrub_ctx *sctx)
 {
 	struct btrfs_fs_info *info = sctx->dev_root->fs_info;
 	struct mutex *lock = &info->scrub_lock;
 	/*
 	 * scrub_pending_trans_workers_inc() explains why the worker was
 	 * counted as paused in the scrub counters
 	 */
 	mutex_lock(lock);
 	atomic_dec(&info->scrubs_running);
 	atomic_dec(&info->scrubs_paused);
 	mutex_unlock(lock);
 	atomic_dec(&sctx->workers_pending);
 	wake_up(&info->scrub_pause_wait);
 	wake_up(&sctx->list_wait);
 }
 /* Unlink and free every btrfs_ordered_sum queued on @sctx->csum_list. */
 static void scrub_free_csums(struct scrub_ctx *sctx)
 {
 	struct list_head *head = &sctx->csum_list;
 	struct btrfs_ordered_sum *entry;
 	for (;;) {
 		if (list_empty(head))
 			break;
 		entry = list_first_entry(head, struct btrfs_ordered_sum, list);
 		list_del(&entry->list);
 		kfree(entry);
 	}
 }
 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
 {
 	int i;
 	if (!sctx)
 		return;
 	scrub_free_wr_ctx(&sctx->wr_ctx);
 	/* this can happen when scrub is cancelled */
 	if (sctx->curr != -1) {
 		struct scrub_bio *sbio = sctx->bios[sctx->curr];
 		for (i = 0; i < sbio->page_count; i++) {
 			WARN_ON(!sbio->pagev[i]->page);
 			scrub_block_put(sbio->pagev[i]->sblock);
 		}
 		bio_put(sbio->bio);
 	}
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio = sctx->bios[i];
 		if (!sbio)
 			break;
 		kfree(sbio);
 	}
 	scrub_free_csums(sctx);
 	kfree(sctx);
 }
 static noinline_for_stack
 struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace)
 {
 	struct scrub_ctx *sctx;
 	int		i;
 	struct btrfs_fs_info *fs_info = dev->dev_root->fs_info;
 	int pages_per_rd_bio;
 	int ret;
 	/*
 	 * the setting of pages_per_rd_bio is correct for scrub but might
 	 * be wrong for the dev_replace code where we might read from
 	 * different devices in the initial huge bios. However, that
 	 * code is able to correctly handle the case when adding a page
 	 * to a bio fails.
 	 */
 	if (dev->bdev)
 		pages_per_rd_bio = min_t(int, SCRUB_PAGES_PER_RD_BIO,
 					 bio_get_nr_vecs(dev->bdev));
 	else
 		pages_per_rd_bio = SCRUB_PAGES_PER_RD_BIO;
 	sctx = kzalloc(sizeof(*sctx), GFP_NOFS);
 	if (!sctx)
 		goto nomem;
 	sctx->is_dev_replace = is_dev_replace;
 	sctx->pages_per_rd_bio = pages_per_rd_bio;
 	sctx->curr = -1;
 	sctx->dev_root = dev->dev_root;
 	for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
 		struct scrub_bio *sbio;
 		sbio = kzalloc(sizeof(*sbio), GFP_NOFS);
 		if (!sbio)
 			goto nomem;
 		sctx->bios[i] = sbio;
 		sbio->index = i;
 		sbio->sctx = sctx;
 		sbio->page_count = 0;
 		btrfs_init_work(&sbio->work, scrub_bio_end_io_worker,
 				NULL, NULL);
 		if (i != SCRUB_BIOS_PER_SCTX - 1)
 			sctx->bios[i]->next_free = i + 1;
 		else
 			sctx->bios[i]->next_free = -1;
 	}
 	sctx->first_free = 0;
 	sctx->nodesize = dev->dev_root->nodesize;
 	sctx->leafsize = dev->dev_root->leafsize;
 	sctx->sectorsize = dev->dev_root->sectorsize;
 	atomic_set(&sctx->bios_in_flight, 0);
 	atomic_set(&sctx->workers_pending, 0);
 	atomic_set(&sctx->cancel_req, 0);
 	sctx->csum_size = btrfs_super_csum_size(fs_info->super_copy);
 	INIT_LIST_HEAD(&sctx->csum_list);
 	spin_lock_init(&sctx->list_lock);
 	spin_lock_init(&sctx->stat_lock);
 	init_waitqueue_head(&sctx->list_wait);
 	ret = scrub_setup_wr_ctx(sctx, &sctx->wr_ctx, fs_info,
 				 fs_info->dev_replace.tgtdev, is_dev_replace);
 	if (ret) {
 		scrub_free_ctx(sctx);
 		return ERR_PTR(ret);
 	}
 	return sctx;
 nomem:
 	scrub_free_ctx(sctx);
 	return ERR_PTR(-ENOMEM);
 }
 /*
  * Callback handed to iterate_extent_inodes() by scrub_print_warning().
  *
  * Logs one warning per path that resolves to inode @inum of subvolume
  * @root. When the lookup fails, a single warning carrying the error code
  * is logged instead. @warn_ctx is the struct scrub_warning prepared by the
  * caller. Always returns 0.
  */
 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
 				     void *warn_ctx)
 {
 	struct scrub_warning *swarn = warn_ctx;
 	struct btrfs_fs_info *fs_info = swarn->dev->dev_root->fs_info;
 	struct btrfs_key root_key = {
 		.objectid = root,
 		.type = BTRFS_ROOT_ITEM_KEY,
 		.offset = (u64)-1,
 	};
 	struct inode_fs_paths *ipath = NULL;
 	struct btrfs_inode_item *item;
 	struct extent_buffer *leaf;
 	struct btrfs_root *local_root;
 	u64 isize;
 	u32 nlink;
 	int ret;
 	int idx;
 	local_root = btrfs_read_fs_root_no_name(fs_info, &root_key);
 	if (IS_ERR(local_root)) {
 		ret = PTR_ERR(local_root);
 		goto err;
 	}
 	ret = inode_item_info(inum, 0, local_root, swarn->path);
 	if (ret) {
 		btrfs_release_path(swarn->path);
 		goto err;
 	}
 	/* read size and link count, then let go of the path for reuse */
 	leaf = swarn->path->nodes[0];
 	item = btrfs_item_ptr(leaf, swarn->path->slots[0],
 			      struct btrfs_inode_item);
 	isize = btrfs_inode_size(leaf, item);
 	nlink = btrfs_inode_nlink(leaf, item);
 	btrfs_release_path(swarn->path);
 	ipath = init_ipath(4096, local_root, swarn->path);
 	if (IS_ERR(ipath)) {
 		ret = PTR_ERR(ipath);
 		/* free_ipath() below must not see an error pointer */
 		ipath = NULL;
 		goto err;
 	}
 	ret = paths_from_inode(inum, ipath);
 	if (ret < 0)
 		goto err;
 	/*
 	 * ipath might have been too small to hold all of the paths; that
 	 * is deliberately ignored here
 	 */
 	for (idx = 0; idx < ipath->fspath->elem_cnt; idx++)
 		printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 			"%s, sector %llu, root %llu, inode %llu, offset %llu, "
 			"length %llu, links %u (path: %s)\n", swarn->errstr,
 			swarn->logical, rcu_str_deref(swarn->dev->name),
 			(unsigned long long)swarn->sector, root, inum, offset,
 			min(isize - offset, (u64)PAGE_SIZE), nlink,
 			(char *)(unsigned long)ipath->fspath->val[idx]);
 	goto out;
 err:
 	printk_in_rcu(KERN_WARNING "BTRFS: %s at logical %llu on dev "
 		"%s, sector %llu, root %llu, inode %llu, offset %llu: path "
 		"resolving failed with ret=%d\n", swarn->errstr,
 		swarn->logical, rcu_str_deref(swarn->dev->name),
 		(unsigned long long)swarn->sector, root, inum, offset, ret);
 out:
 	free_ipath(ipath);
 	return 0;
 }
 /*
  * Log a warning that describes where the errored block @sblock lives.
  *
  * The extent item covering the block's logical address is looked up.
  * For a tree block, one line is logged per tree backref (tree id and
  * level). For a data extent, scrub_print_warning_inode() is run for every
  * inode referencing it. @errstr is the short error description placed at
  * the start of each message.
  *
  * Nothing is logged when an allocation or the extent lookup fails.
  */
 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
 {
 	struct btrfs_device *dev;
 	struct btrfs_fs_info *fs_info;
 	struct btrfs_path *path;
 	struct btrfs_key found_key;
 	struct extent_buffer *eb;
 	struct btrfs_extent_item *ei;
 	struct scrub_warning swarn;
 	unsigned long ptr = 0;
 	u64 extent_item_pos;
 	u64 flags = 0;
 	/*
 	 * initialised because the message below reads both values even
 	 * when the backref lookup failed and left them untouched
 	 */
 	u64 ref_root = 0;
 	u32 item_size;
 	u8 ref_level = 0;
 	const int bufsize = 4096;
 	int ret;
 	WARN_ON(sblock->page_count < 1);
 	dev = sblock->pagev[0]->dev;
 	fs_info = sblock->sctx->dev_root->fs_info;
 	path = btrfs_alloc_path();
 	swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS);
 	swarn.msg_buf = kmalloc(bufsize, GFP_NOFS);
 	/* convert the physical byte offset into a 512-byte sector number */
 	swarn.sector = (sblock->pagev[0]->physical) >> 9;
 	swarn.logical = sblock->pagev[0]->logical;
 	swarn.errstr = errstr;
 	swarn.dev = NULL;
 	swarn.msg_bufsize = bufsize;
 	swarn.scratch_bufsize = bufsize;
 	/* all three allocations are checked together; out: frees them all */
 	if (!path || !swarn.scratch_buf || !swarn.msg_buf)
 		goto out;
 	ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
 				  &flags);
 	if (ret < 0)
 		goto out;
 	extent_item_pos = swarn.logical - found_key.objectid;
 	swarn.extent_item_size = found_key.offset;
 	eb = path->nodes[0];
 	ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
 	item_size = btrfs_item_size_nr(eb, path->slots[0]);
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		do {
 			ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
 						      item_size, &ref_root,
 						      &ref_level);
 			printk_in_rcu(KERN_WARNING
 				"BTRFS: %s at logical %llu on dev %s, "
 				"sector %llu: metadata %s (level %d) in tree "
 				"%llu\n", errstr, swarn.logical,
 				rcu_str_deref(dev->name),
 				(unsigned long long)swarn.sector,
 				ref_level ? "node" : "leaf",
 				ret < 0 ? -1 : ref_level,
 				ret < 0 ? -1 : ref_root);
 			/*
 			 * a failed lookup is reported once; retrying it
 			 * with the same arguments could loop forever
 			 */
 			if (ret < 0)
 				break;
 		} while (ret != 1);
 		btrfs_release_path(path);
 	} else {
 		/* the path is reused by the per-inode callback */
 		btrfs_release_path(path);
 		swarn.path = path;
 		swarn.dev = dev;
 		iterate_extent_inodes(fs_info, found_key.objectid,
 					extent_item_pos, 1,
 					scrub_print_warning_inode, &swarn);
 	}
 out:
 	btrfs_free_path(path);
 	kfree(swarn.scratch_buf);
 	kfree(swarn.msg_buf);
 }
 /*
  * Callback handed to iterate_inodes_from_logical() by
  * scrub_fixup_nodatasum(): try to repair the page at @offset of inode
  * @inum in subvolume @root.
  *
  * @fixup_ctx is the struct scrub_fixup_nodatasum describing the failure.
  *
  * Returns 1 when the page was corrected (which stops the iteration), or a
  * negative errno otherwise; -EIO is used when the repair was attempted
  * but did not succeed.
  */
 static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx)
 {
 	struct page *page = NULL;
 	unsigned long index;
 	struct scrub_fixup_nodatasum *fixup = fixup_ctx;
 	int ret;
 	int corrected = 0;
 	struct btrfs_key key;
 	struct inode *inode = NULL;
 	struct btrfs_fs_info *fs_info;
 	u64 end = offset + PAGE_SIZE - 1;
 	struct btrfs_root *local_root;
 	int srcu_index;
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 	fs_info = fixup->root->fs_info;
 	/* subvol_srcu is held across the root lookup and btrfs_iget() */
 	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 	if (IS_ERR(local_root)) {
 		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 		return PTR_ERR(local_root);
 	}
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.objectid = inum;
 	key.offset = 0;
 	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	index = offset >> PAGE_CACHE_SHIFT;
 	/* on success the page comes back locked and with a reference held */
 	page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 	if (!page) {
 		ret = -ENOMEM;
 		goto out;
 	}
 	if (PageUptodate(page)) {
 		if (PageDirty(page)) {
 			/*
 			 * we need to write the data to the defect sector. the
 			 * data that was in that sector is not in memory,
 			 * because the page was modified. we must not write the
 			 * modified page to that sector.
 			 *
 			 * TODO: what could be done here: wait for the delalloc
 			 *       runner to write out that page (might involve
 			 *       COW) and see whether the sector is still
 			 *       referenced afterwards.
 			 *
 			 * For the meantime, we'll treat this error
 			 * incorrectable, although there is a chance that a
 			 * later scrub will find the bad sector again and that
 			 * there's no dirty page in memory, then.
 			 */
 			/* drop the page lock; out: only drops the reference */
 			unlock_page(page);
 			ret = -EIO;
 			goto out;
 		}
 		fs_info = BTRFS_I(inode)->root->fs_info;
 		ret = repair_io_failure(fs_info, offset, PAGE_SIZE,
 					fixup->logical, page,
 					fixup->mirror_num);
 		unlock_page(page);
 		corrected = !ret;
 	} else {
 		/*
 		 * we need to get good data first. the general readpage path
 		 * will call repair_io_failure for us, we just have to make
 		 * sure we read the bad mirror.
 		 */
 		ret = set_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 					EXTENT_DAMAGED, GFP_NOFS);
 		if (ret) {
 			/* set_extent_bits should give proper error */
 			WARN_ON(ret > 0);
 			if (ret > 0)
 				ret = -EFAULT;
 			/* no read was started, so the page is still locked */
 			unlock_page(page);
 			goto out;
 		}
 		ret = extent_read_full_page(&BTRFS_I(inode)->io_tree, page,
 						btrfs_get_extent,
 						fixup->mirror_num);
 		wait_on_page_locked(page);
 		/* the page counts as corrected once EXTENT_DAMAGED is gone */
 		corrected = !test_range_bit(&BTRFS_I(inode)->io_tree, offset,
 						end, EXTENT_DAMAGED, 0, NULL);
 		if (!corrected)
 			clear_extent_bits(&BTRFS_I(inode)->io_tree, offset, end,
 						EXTENT_DAMAGED, GFP_NOFS);
 	}
 out:
 	if (page)
 		put_page(page);
 	if (inode)
 		iput(inode);
 	if (ret < 0)
 		return ret;
 	if (ret == 0 && corrected) {
 		/*
 		 * we only need to call readpage for one of the inodes belonging
 		 * to this extent. so make iterate_extent_inodes stop
 		 */
 		return 1;
 	}
 	return -EIO;
 }
 /*
  * Worker queued by scrub_handle_errored_block() for an errored data block
  * that has no checksum. It tries to get the block repaired through the
  * regular read path, updates the scrub statistics with the outcome, frees
  * the work item and drops the pending-worker count taken by the submitter.
  */
 static void scrub_fixup_nodatasum(struct btrfs_work *work)
 {
 	struct scrub_fixup_nodatasum *fixup =
 		container_of(work, struct scrub_fixup_nodatasum, work);
 	struct scrub_ctx *sctx = fixup->sctx;
 	struct btrfs_trans_handle *trans = NULL;
 	struct btrfs_path *path;
 	/* assume failure; only the fully successful path clears this */
 	int failed = 1;
 	int ret;
 	path = btrfs_alloc_path();
 	if (!path) {
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.malloc_errors;
 		spin_unlock(&sctx->stat_lock);
 		goto out;
 	}
 	trans = btrfs_join_transaction(fixup->root);
 	if (IS_ERR(trans))
 		goto out;
 	/*
 	 * Trigger a regular read through the standard path: a page is read
 	 * from the (failed) logical address with the copynum of the failed
 	 * sector given explicitly, so that readpage is expected to fail.
 	 * That is where on-the-fly error correction kicks in (once it's
 	 * finished) and rewrites the failed sector if a good copy can be
 	 * found.
 	 */
 	ret = iterate_inodes_from_logical(fixup->logical, fixup->root->fs_info,
 					  path, scrub_fixup_readpage, fixup);
 	if (ret < 0)
 		goto out;
 	WARN_ON(ret != 1);
 	failed = 0;
 	spin_lock(&sctx->stat_lock);
 	++sctx->stat.corrected_errors;
 	spin_unlock(&sctx->stat_lock);
 out:
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, fixup->root);
 	if (failed) {
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.uncorrectable_errors;
 		spin_unlock(&sctx->stat_lock);
 		btrfs_dev_replace_stats_inc(
 			&sctx->dev_root->fs_info->dev_replace.
 			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR "BTRFS: "
 		    "unable to fixup (nodatasum) error at logical %llu on dev %s\n",
 			fixup->logical, rcu_str_deref(fixup->dev->name));
 	}
 	btrfs_free_path(path);
 	kfree(fixup);
 	scrub_pending_trans_workers_dec(sctx);
 }
 /*
  * scrub_handle_errored_block gets called when either verification of the
  * pages failed or the bio failed to read, e.g. with EIO. In the latter
  * case, this function handles all pages in the bio, even though only one
  * may be bad.
  * The goal of this function is to repair the errored block by using the
  * contents of one of the mirrors.
  *
  * The outcome is reported only through the scrub statistics, the device
  * statistics and the kernel log; the function itself always returns 0.
  */
 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
 {
 	struct scrub_ctx *sctx = sblock_to_check->sctx;
 	struct btrfs_device *dev;
 	struct btrfs_fs_info *fs_info;
 	u64 length;
 	u64 logical;
 	u64 generation;
 	unsigned int failed_mirror_index;
 	unsigned int is_metadata;
 	unsigned int have_csum;
 	u8 *csum;
 	struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
 	struct scrub_block *sblock_bad;
 	int ret;
 	int mirror_index;
 	int page_num;
 	int success;
 	/* one rate limit shared by all warnings printed from this function */
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 	BUG_ON(sblock_to_check->page_count < 1);
 	fs_info = sctx->dev_root->fs_info;
 	if (sblock_to_check->pagev[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
 		/*
 		 * if we find an error in a super block, we just report it.
 		 * They will get written with the next transaction commit
 		 * anyway
 		 */
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.super_errors;
 		spin_unlock(&sctx->stat_lock);
 		return 0;
 	}
 	/* everything that describes the block is taken from its first page */
 	length = sblock_to_check->page_count * PAGE_SIZE;
 	logical = sblock_to_check->pagev[0]->logical;
 	generation = sblock_to_check->pagev[0]->generation;
 	BUG_ON(sblock_to_check->pagev[0]->mirror_num < 1);
 	/* mirror_num is 1-based, the recheck array is indexed from 0 */
 	failed_mirror_index = sblock_to_check->pagev[0]->mirror_num - 1;
 	is_metadata = !(sblock_to_check->pagev[0]->flags &
 			BTRFS_EXTENT_FLAG_DATA);
 	have_csum = sblock_to_check->pagev[0]->have_csum;
 	csum = sblock_to_check->pagev[0]->csum;
 	dev = sblock_to_check->pagev[0]->dev;
 	if (sctx->is_dev_replace && !is_metadata && !have_csum) {
 		/* NULL makes the cleanup at out: skip the recheck blocks */
 		sblocks_for_recheck = NULL;
 		goto nodatasum_case;
 	}
 	/*
 	 * read all mirrors one after the other. This includes to
 	 * re-read the extent or metadata block that failed (that was
 	 * the cause that this fixup code is called) another time,
 	 * page by page this time in order to know which pages
 	 * caused I/O errors and which ones are good (for all mirrors).
 	 * It is the goal to handle the situation when more than one
 	 * mirror contains I/O errors, but the errors do not
 	 * overlap, i.e. the data can be repaired by selecting the
 	 * pages from those mirrors without I/O error on the
 	 * particular pages. One example (with blocks >= 2 * PAGE_SIZE)
 	 * would be that mirror #1 has an I/O error on the first page,
 	 * the second page is good, and mirror #2 has an I/O error on
 	 * the second page, but the first page is good.
 	 * Then the first page of the first mirror can be repaired by
 	 * taking the first page of the second mirror, and the
 	 * second page of the second mirror can be repaired by
 	 * copying the contents of the 2nd page of the 1st mirror.
 	 * One more note: if the pages of one mirror contain I/O
 	 * errors, the checksum cannot be verified. In order to get
 	 * the best data for repairing, the first attempt is to find
 	 * a mirror without I/O errors and with a validated checksum.
 	 * Only if this is not possible, the pages are picked from
 	 * mirrors with I/O errors without considering the checksum.
 	 * If the latter is the case, at the end, the checksum of the
 	 * repaired area is verified in order to correctly maintain
 	 * the statistics.
 	 */
 	sblocks_for_recheck = kzalloc(BTRFS_MAX_MIRRORS *
 				     sizeof(*sblocks_for_recheck),
 				     GFP_NOFS);
 	if (!sblocks_for_recheck) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		sctx->stat.read_errors++;
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	/* setup the context, map the logical blocks and alloc the pages */
 	ret = scrub_setup_recheck_block(sctx, fs_info, sblock_to_check, length,
 					logical, sblocks_for_recheck);
 	if (ret) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.read_errors++;
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 		goto out;
 	}
 	BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
 	sblock_bad = sblocks_for_recheck + failed_mirror_index;
 	/* build and submit the bios for the failed mirror, check checksums */
 	scrub_recheck_block(fs_info, sblock_bad, is_metadata, have_csum,
 			    csum, generation, sctx->csum_size);
 	if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
 	    sblock_bad->no_io_error_seen) {
 		/*
 		 * the error disappeared after reading page by page, or
 		 * the area was part of a huge bio and other parts of the
 		 * bio caused I/O errors, or the block layer merged several
 		 * read requests into one and the error is caused by a
 		 * different bio (usually one of the two latter cases is
 		 * the cause)
 		 */
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.unverified_errors++;
 		spin_unlock(&sctx->stat_lock);
 		if (sctx->is_dev_replace)
 			scrub_write_block_to_dev_replace(sblock_bad);
 		goto out;
 	}
 	/* classify the error: I/O error first, then checksum, then header */
 	if (!sblock_bad->no_io_error_seen) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.read_errors++;
 		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("i/o error", sblock_to_check);
 		btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
 	} else if (sblock_bad->checksum_error) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.csum_errors++;
 		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum error", sblock_to_check);
 		btrfs_dev_stat_inc_and_print(dev,
 					     BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	} else if (sblock_bad->header_error) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.verify_errors++;
 		spin_unlock(&sctx->stat_lock);
 		if (__ratelimit(&_rs))
 			scrub_print_warning("checksum/header error",
 					    sblock_to_check);
 		if (sblock_bad->generation_error)
 			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_GENERATION_ERRS);
 		else
 			btrfs_dev_stat_inc_and_print(dev,
 				BTRFS_DEV_STAT_CORRUPTION_ERRS);
 	}
 	/* a read-only scrub only reports, it never attempts a repair */
 	if (sctx->readonly) {
 		ASSERT(!sctx->is_dev_replace);
 		goto out;
 	}
 	if (!is_metadata && !have_csum) {
 		struct scrub_fixup_nodatasum *fixup_nodatasum;
 nodatasum_case:
 		/*
 		 * NOTE(review): the dev-replace shortcut near the top of the
 		 * function jumps to this label, so this WARN_ON fires on that
 		 * path - confirm that this is intended.
 		 */
 		WARN_ON(sctx->is_dev_replace);
 		/*
 		 * !is_metadata and !have_csum, this means that the data
 		 * might not be COW'ed, that it might be modified
 		 * concurrently. The general strategy to work on the
 		 * commit root does not help in the case when COW is not
 		 * used.
 		 */
 		fixup_nodatasum = kzalloc(sizeof(*fixup_nodatasum), GFP_NOFS);
 		if (!fixup_nodatasum)
 			goto did_not_correct_error;
 		fixup_nodatasum->sctx = sctx;
 		fixup_nodatasum->dev = dev;
 		fixup_nodatasum->logical = logical;
 		fixup_nodatasum->root = fs_info->extent_root;
 		fixup_nodatasum->mirror_num = failed_mirror_index + 1;
 		/* the worker drops this count and frees the work item */
 		scrub_pending_trans_workers_inc(sctx);
 		btrfs_init_work(&fixup_nodatasum->work, scrub_fixup_nodatasum,
 				NULL, NULL);
 		btrfs_queue_work(fs_info->scrub_workers,
 				 &fixup_nodatasum->work);
 		goto out;
 	}
 	/*
 	 * now build and submit the bios for the other mirrors, check
 	 * checksums.
 	 * First try to pick the mirror which is completely without I/O
 	 * errors and also does not have a checksum error.
 	 * If one is found, and if a checksum is present, the full block
 	 * that is known to contain an error is rewritten. Afterwards
 	 * the block is known to be corrected.
 	 * If a mirror is found which is completely correct, and no
 	 * checksum is present, only those pages are rewritten that had
 	 * an I/O error in the block to be repaired, since it cannot be
 	 * determined, which copy of the other pages is better (and it
 	 * could happen otherwise that a correct page would be
 	 * overwritten by a bad one).
 	 */
 	for (mirror_index = 0;
 	     mirror_index < BTRFS_MAX_MIRRORS &&
 	     sblocks_for_recheck[mirror_index].page_count > 0;
 	     mirror_index++) {
 		struct scrub_block *sblock_other;
 		if (mirror_index == failed_mirror_index)
 			continue;
 		sblock_other = sblocks_for_recheck + mirror_index;
 		/* build and submit the bios, check checksums */
 		scrub_recheck_block(fs_info, sblock_other, is_metadata,
 				    have_csum, csum, generation,
 				    sctx->csum_size);
 		if (!sblock_other->header_error &&
 		    !sblock_other->checksum_error &&
 		    sblock_other->no_io_error_seen) {
 			if (sctx->is_dev_replace) {
 				scrub_write_block_to_dev_replace(sblock_other);
 			} else {
 				int force_write = is_metadata || have_csum;
 				ret = scrub_repair_block_from_good_copy(
 						sblock_bad, sblock_other,
 						force_write);
 			}
 			/*
 			 * in the dev-replace branch ret is not assigned in
 			 * this loop; it still holds the 0 returned by
 			 * scrub_setup_recheck_block() above
 			 */
 			if (0 == ret)
 				goto corrected_error;
 		}
 	}
 	/*
 	 * for dev_replace, pick good pages and write to the target device.
 	 */
 	if (sctx->is_dev_replace) {
 		/* success is assigned in this branch but not read before out: */
 		success = 1;
 		for (page_num = 0; page_num < sblock_bad->page_count;
 		     page_num++) {
 			int sub_success;
 			sub_success = 0;
 			for (mirror_index = 0;
 			     mirror_index < BTRFS_MAX_MIRRORS &&
 			     sblocks_for_recheck[mirror_index].page_count > 0;
 			     mirror_index++) {
 				struct scrub_block *sblock_other =
 					sblocks_for_recheck + mirror_index;
 				struct scrub_page *page_other =
 					sblock_other->pagev[page_num];
 				if (!page_other->io_error) {
 					ret = scrub_write_page_to_dev_replace(
 							sblock_other, page_num);
 					if (ret == 0) {
 						/* succeeded for this page */
 						sub_success = 1;
 						break;
 					} else {
 						btrfs_dev_replace_stats_inc(
 							&sctx->dev_root->
 							fs_info->dev_replace.
 							num_write_errors);
 					}
 				}
 			}
 			if (!sub_success) {
 				/*
 				 * did not find a mirror to fetch the page
 				 * from. scrub_write_page_to_dev_replace()
 				 * handles this case (page->io_error), by
 				 * filling the block with zeros before
 				 * submitting the write request
 				 */
 				success = 0;
 				ret = scrub_write_page_to_dev_replace(
 						sblock_bad, page_num);
 				if (ret)
 					btrfs_dev_replace_stats_inc(
 						&sctx->dev_root->fs_info->
 						dev_replace.num_write_errors);
 			}
 		}
 		goto out;
 	}
 	/*
 	 * for regular scrub, repair those pages that are errored.
 	 * In case of I/O errors in the area that is supposed to be
 	 * repaired, continue by picking good copies of those pages.
 	 * Select the good pages from mirrors to rewrite bad pages from
 	 * the area to fix. Afterwards verify the checksum of the block
 	 * that is supposed to be repaired. This verification step is
 	 * only done for the purpose of statistic counting and for the
 	 * final scrub report, whether errors remain.
 	 * A perfect algorithm could make use of the checksum and try
 	 * all possible combinations of pages from the different mirrors
 	 * until the checksum verification succeeds. For example, when
 	 * the 2nd page of mirror #1 faces I/O errors, and the 2nd page
 	 * of mirror #2 is readable but the final checksum test fails,
 	 * then the 2nd page of mirror #3 could be tried, whether now
 	 * the final checksum succeedes. But this would be a rare
 	 * exception and is therefore not implemented. At least it is
 	 * avoided that the good copy is overwritten.
 	 * A more useful improvement would be to pick the sectors
 	 * without I/O error based on sector sizes (512 bytes on legacy
 	 * disks) instead of on PAGE_SIZE. Then maybe 512 byte of one
 	 * mirror could be repaired by taking 512 byte of a different
 	 * mirror, even if other 512 byte sectors in the same PAGE_SIZE
 	 * area are unreadable.
 	 */
 	/* can only fix I/O errors from here on */
 	if (sblock_bad->no_io_error_seen)
 		goto did_not_correct_error;
 	success = 1;
 	for (page_num = 0; page_num < sblock_bad->page_count; page_num++) {
 		struct scrub_page *page_bad = sblock_bad->pagev[page_num];
 		if (!page_bad->io_error)
 			continue;
 		for (mirror_index = 0;
 		     mirror_index < BTRFS_MAX_MIRRORS &&
 		     sblocks_for_recheck[mirror_index].page_count > 0;
 		     mirror_index++) {
 			struct scrub_block *sblock_other = sblocks_for_recheck +
 							   mirror_index;
 			struct scrub_page *page_other = sblock_other->pagev[
 							page_num];
 			if (!page_other->io_error) {
 				ret = scrub_repair_page_from_good_copy(
 					sblock_bad, sblock_other, page_num, 0);
 				if (0 == ret) {
 					page_bad->io_error = 0;
 					break; /* succeeded for this page */
 				}
 			}
 		}
 		if (page_bad->io_error) {
 			/* did not find a mirror to copy the page from */
 			success = 0;
 		}
 	}
 	if (success) {
 		if (is_metadata || have_csum) {
 			/*
 			 * need to verify the checksum now that all
 			 * sectors on disk are repaired (the write
 			 * request for data to be repaired is on its way).
 			 * Just be lazy and use scrub_recheck_block()
 			 * which re-reads the data before the checksum
 			 * is verified, but most likely the data comes out
 			 * of the page cache.
 			 */
 			scrub_recheck_block(fs_info, sblock_bad,
 					    is_metadata, have_csum, csum,
 					    generation, sctx->csum_size);
 			if (!sblock_bad->header_error &&
 			    !sblock_bad->checksum_error &&
 			    sblock_bad->no_io_error_seen)
 				goto corrected_error;
 			else
 				goto did_not_correct_error;
 		} else {
 			/* also reached by goto from the repair paths above */
 corrected_error:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.corrected_errors++;
 			spin_unlock(&sctx->stat_lock);
 			printk_ratelimited_in_rcu(KERN_ERR
 				"BTRFS: fixed up error at logical %llu on dev %s\n",
 				logical, rcu_str_deref(dev->name));
 		}
 	} else {
 		/* also reached by goto from the failure paths above */
 did_not_correct_error:
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.uncorrectable_errors++;
 		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"BTRFS: unable to fixup (regular) error at logical %llu on dev %s\n",
 			logical, rcu_str_deref(dev->name));
 	}
 out:
 	/* release the pages of every recheck block, then the array itself */
 	if (sblocks_for_recheck) {
 		for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
 		     mirror_index++) {
 			struct scrub_block *sblock = sblocks_for_recheck +
 						     mirror_index;
 			int page_index;
 			for (page_index = 0; page_index < sblock->page_count;
 			     page_index++) {
 				sblock->pagev[page_index]->sblock = NULL;
 				scrub_page_put(sblock->pagev[page_index]);
 			}
 		}
 		kfree(sblocks_for_recheck);
 	}
 	return 0;
 }
 /*
  * Build the per-mirror recheck blocks for the range
  * [logical, logical + length).
  *
  * The range is walked in pieces of at most PAGE_SIZE. Each piece is mapped
  * with btrfs_map_block(REQ_GET_READ_MIRRORS) and, for every returned stripe
  * (limited to BTRFS_MAX_MIRRORS), a scrub_page backed by a freshly allocated
  * page is stored at sblocks_for_recheck[mirror_index].pagev[page_index].
  *
  * Returns 0 on success, -EIO if the mapping fails or covers less than the
  * requested piece, and -ENOMEM (after incrementing stat.malloc_errors) if an
  * allocation fails. On failure nothing is released here: scrub_pages that
  * were already attached stay in sblocks_for_recheck.
  */
 static int scrub_setup_recheck_block(struct scrub_ctx *sctx,
 				     struct btrfs_fs_info *fs_info,
 				     struct scrub_block *original_sblock,
 				     u64 length, u64 logical,
 				     struct scrub_block *sblocks_for_recheck)
 {
 	int page_index;
 	int mirror_index;
 	int ret;
 	/*
 	 * note: the two members ref_count and outstanding_pages
 	 * are not used (and not set) in the blocks that are used for
 	 * the recheck procedure
 	 */
 	page_index = 0;
 	while (length > 0) {
 		u64 sublen = min_t(u64, length, PAGE_SIZE);
 		u64 mapped_length = sublen;
 		struct btrfs_bio *bbio = NULL;
 		/*
 		 * with a length of PAGE_SIZE, each returned stripe
 		 * represents one mirror
 		 */
 		ret = btrfs_map_block(fs_info, REQ_GET_READ_MIRRORS, logical,
 				      &mapped_length, &bbio, 0);
 		if (ret || !bbio || mapped_length < sublen) {
 			kfree(bbio);
 			return -EIO;
 		}
 		BUG_ON(page_index >= SCRUB_PAGES_PER_RD_BIO);
 		for (mirror_index = 0; mirror_index < (int)bbio->num_stripes;
 		     mirror_index++) {
 			struct scrub_block *sblock;
 			struct scrub_page *page;
 			/* stripes beyond the recheck array are ignored */
 			if (mirror_index >= BTRFS_MAX_MIRRORS)
 				continue;
 			sblock = sblocks_for_recheck + mirror_index;
 			sblock->sctx = sctx;
 			page = kzalloc(sizeof(*page), GFP_NOFS);
 			if (!page) {
 				/* also the target of the alloc_page() failure below */
 leave_nomem:
 				spin_lock(&sctx->stat_lock);
 				sctx->stat.malloc_errors++;
 				spin_unlock(&sctx->stat_lock);
 				kfree(bbio);
 				return -ENOMEM;
 			}
 			/* first reference on the zero-initialized descriptor */
 			scrub_page_get(page);
 			sblock->pagev[page_index] = page;
 			page->logical = logical;
 			page->physical = bbio->stripes[mirror_index].physical;
 			BUG_ON(page_index >= original_sblock->page_count);
 			page->physical_for_dev_replace =
 				original_sblock->pagev[page_index]->
 				physical_for_dev_replace;
 			/* for missing devices, dev->bdev is NULL */
 			page->dev = bbio->stripes[mirror_index].dev;
 			page->mirror_num = mirror_index + 1;
 			/*
 			 * the descriptor is counted before its data page is
 			 * allocated, so it stays reachable through pagev[]
 			 * even if alloc_page() fails
 			 */
 			sblock->page_count++;
 			page->page = alloc_page(GFP_NOFS);
 			if (!page->page)
 				goto leave_nomem;
 		}
 		kfree(bbio);
 		length -= sublen;
 		logical += sublen;
 		page_index++;
 	}
 	return 0;
 }
 /*
  * Re-read every page of @sblock from disk and re-evaluate its state.
  *
  * This function checks the on disk data for checksum errors, header
  * errors and read I/O errors. If any I/O errors happen, the exact pages
  * which are errored are marked as being bad (page->io_error). The goal is
  * to enable scrub to take those pages that are not errored from all the
  * mirrors so that the pages that are errored in the just handled mirror
  * can be repaired.
  *
  * The block's no_io_error_seen, header_error and checksum_error flags are
  * reset first. Each page is then read synchronously with a one-page bio.
  * A page whose device has no bdev, whose bio cannot be allocated, or whose
  * read fails gets io_error set and clears no_io_error_seen on the block.
  *
  * Only if all pages were read without error is the checksum (and, for
  * metadata, the header) verified via scrub_recheck_block_checksum().
  *
  * @fs_info:     passed through to the header verification
  * @sblock:      block whose pages are re-read and whose flags are updated
  * @is_metadata: non-zero if the block is a tree block
  * @have_csum:   non-zero if @csum is valid (data blocks only)
  * @csum:        expected data checksum
  * @generation:  expected generation (metadata only)
  * @csum_size:   number of checksum bytes to compare
  */
 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
 				struct scrub_block *sblock, int is_metadata,
 				int have_csum, u8 *csum, u64 generation,
 				u16 csum_size)
 {
 	int page_num;

 	sblock->no_io_error_seen = 1;
 	sblock->header_error = 0;
 	sblock->checksum_error = 0;

 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		struct bio *bio;
 		struct scrub_page *page = sblock->pagev[page_num];

 		/* missing device: nothing to read from */
 		if (page->dev->bdev == NULL) {
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
 			continue;
 		}

 		WARN_ON(!page->page);
 		bio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 		if (!bio) {
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
 			continue;
 		}
 		bio->bi_bdev = page->dev->bdev;
 		bio->bi_iter.bi_sector = page->physical >> 9;

 		bio_add_page(bio, page->page, PAGE_SIZE, 0);
 		if (btrfsic_submit_bio_wait(READ, bio)) {
 			/*
 			 * Mark the exact page that failed, not only the
 			 * block. The page-wise repair only rewrites pages
 			 * with io_error set and only copies from mirror
 			 * pages with io_error clear.
 			 */
 			page->io_error = 1;
 			sblock->no_io_error_seen = 0;
 		}

 		bio_put(bio);
 	}

 	if (sblock->no_io_error_seen)
 		scrub_recheck_block_checksum(fs_info, sblock, is_metadata,
 					     have_csum, csum, generation,
 					     csum_size);

 	return;
 }
 /*
  * Verify the header (metadata only) and the checksum of a block whose
  * pages have all been read successfully.
  *
  * For metadata, the header's bytenr, fsid and chunk tree uuid are compared
  * against the expected values and @generation; mismatches set
  * sblock->header_error (and generation_error for a generation mismatch).
  * The expected checksum is then taken from the header itself. For data,
  * the expected checksum is @csum; without @have_csum nothing is checked.
  *
  * The crc is computed over PAGE_SIZE bytes of every page in the block,
  * skipping the first BTRFS_CSUM_SIZE bytes of page 0 for metadata. A
  * mismatch over @csum_size bytes sets sblock->checksum_error.
  */
 static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 					 struct scrub_block *sblock,
 					 int is_metadata, int have_csum,
 					 const u8 *csum, u64 generation,
 					 u16 csum_size)
 {
 	int page_num;
 	u8 calculated_csum[BTRFS_CSUM_SIZE];
 	u8 on_disk_csum[BTRFS_CSUM_SIZE];
 	u32 crc = ~(u32)0;
 	void *mapped_buffer;

 	WARN_ON(!sblock->pagev[0]->page);
 	if (is_metadata) {
 		struct btrfs_header *h;

 		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 		h = (struct btrfs_header *)mapped_buffer;

 		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 			   BTRFS_UUID_SIZE)) {
 			sblock->header_error = 1;
 		} else if (generation != btrfs_stack_header_generation(h)) {
 			sblock->header_error = 1;
 			sblock->generation_error = 1;
 		}
 		/*
 		 * Copy the on-disk checksum while the page is still mapped:
 		 * the atomic mapping is dropped inside the loop below, so a
 		 * pointer into it must not be used for the final compare.
 		 */
 		memcpy(on_disk_csum, h->csum, csum_size);
 		csum = on_disk_csum;
 	} else {
 		if (!have_csum)
 			return;

 		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 	}

 	for (page_num = 0;;) {
 		if (page_num == 0 && is_metadata)
 			crc = btrfs_csum_data(
 				((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE,
 				crc, PAGE_SIZE - BTRFS_CSUM_SIZE);
 		else
 			crc = btrfs_csum_data(mapped_buffer, crc, PAGE_SIZE);

 		kunmap_atomic(mapped_buffer);
 		page_num++;
 		if (page_num >= sblock->page_count)
 			break;
 		WARN_ON(!sblock->pagev[page_num]->page);

 		mapped_buffer = kmap_atomic(sblock->pagev[page_num]->page);
 	}

 	btrfs_csum_final(crc, calculated_csum);
 	if (memcmp(calculated_csum, csum, csum_size))
 		sblock->checksum_error = 1;
 }
 /*
  * Repair every page of @sblock_bad from the page at the same index in
  * @sblock_good. All pages are attempted even if one fails; the return
  * value is 0 if every page succeeded, otherwise the error of the last
  * page that failed.
  */
 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
 					     struct scrub_block *sblock_good,
 					     int force_write)
 {
 	int last_err = 0;
 	int idx = 0;

 	while (idx < sblock_bad->page_count) {
 		int rc = scrub_repair_page_from_good_copy(sblock_bad,
 							  sblock_good,
 							  idx, force_write);

 		if (rc != 0)
 			last_err = rc;
 		idx++;
 	}

 	return last_err;
 }
 /*
  * Overwrite page @page_num of @sblock_bad on disk with the contents of the
  * same page of @sblock_good.
  *
  * The write is only issued if @force_write is set, or the bad block has a
  * header or checksum error, or the bad page itself has an I/O error;
  * otherwise 0 is returned without touching the disk.
  *
  * Returns 0 on success (or when nothing had to be written) and -EIO if the
  * target device has no bdev, the bio cannot be set up, or the write fails.
  * A failed write also bumps the device write error stat and the
  * dev-replace write error counter.
  */
 static int scrub_repair_page_from_good_copy(struct scrub_block *sblock_bad,
 					    struct scrub_block *sblock_good,
 					    int page_num, int force_write)
 {
 	struct scrub_page *dst = sblock_bad->pagev[page_num];
 	struct scrub_page *src = sblock_good->pagev[page_num];
 	struct bio *wbio;
 	int added;
 	int rc = 0;

 	BUG_ON(dst->page == NULL);
 	BUG_ON(src->page == NULL);

 	/* nothing wrong with this page and no forced rewrite requested */
 	if (!force_write && !sblock_bad->header_error &&
 	    !sblock_bad->checksum_error && !dst->io_error)
 		return 0;

 	if (!dst->dev->bdev) {
 		printk_ratelimited(KERN_WARNING "BTRFS: "
 			"scrub_repair_page_from_good_copy(bdev == NULL) "
 			"is unexpected!\n");
 		return -EIO;
 	}

 	wbio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!wbio)
 		return -EIO;
 	wbio->bi_bdev = dst->dev->bdev;
 	wbio->bi_iter.bi_sector = dst->physical >> 9;

 	added = bio_add_page(wbio, src->page, PAGE_SIZE, 0);
 	if (added != PAGE_SIZE) {
 		rc = -EIO;
 		goto release;
 	}

 	if (btrfsic_submit_bio_wait(WRITE, wbio)) {
 		btrfs_dev_stat_inc_and_print(dst->dev,
 			BTRFS_DEV_STAT_WRITE_ERRS);
 		btrfs_dev_replace_stats_inc(
 			&sblock_bad->sctx->dev_root->fs_info->
 			dev_replace.num_write_errors);
 		rc = -EIO;
 	}

 release:
 	bio_put(wbio);
 	return rc;
 }
 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
 {
 	int page_num;
 	for (page_num = 0; page_num < sblock->page_count; page_num++) {
 		int ret;
 		ret = scrub_write_page_to_dev_replace(sblock, page_num);
 		if (ret)
 			btrfs_dev_replace_stats_inc(
 				&sblock->sctx->dev_root->fs_info->dev_replace.
 				num_write_errors);
 	}
 }
 /*
  * Hand page @page_num of @sblock to the write bio machinery. A page that
  * is flagged with io_error is filled with zeroes first. Returns whatever
  * scrub_add_page_to_wr_bio() returns.
  */
 static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
 					   int page_num)
 {
 	struct scrub_page *pg = sblock->pagev[page_num];

 	BUG_ON(pg->page == NULL);

 	if (pg->io_error) {
 		void *kaddr = kmap_atomic(pg->page);

 		memset(kaddr, 0, PAGE_CACHE_SIZE);
 		flush_dcache_page(pg->page);
 		kunmap_atomic(kaddr);
 	}

 	return scrub_add_page_to_wr_bio(sblock->sctx, pg);
 }
 /*
  * Append @spage to the write bio that is currently being assembled in
  * sctx->wr_ctx, allocating the scrub_bio and its bio on demand.
  *
  * The whole operation runs under wr_ctx->wr_lock. If @spage does not
  * directly follow the pages already collected (by physical_for_dev_replace
  * and by logical address), or the bio refuses the page, the current bio is
  * submitted and a new one is started. A bio that reaches
  * wr_ctx->pages_per_wr_bio pages is submitted immediately.
  *
  * Returns 0 on success, -ENOMEM if the scrub_bio or bio cannot be
  * allocated, and -EIO if the page cannot be added to an empty bio.
  */
 static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
 				    struct scrub_page *spage)
 {
 	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
 	struct scrub_bio *sbio;
 	int ret;
 	mutex_lock(&wr_ctx->wr_lock);
 again:
 	/* no write bio under construction: start a fresh one */
 	if (!wr_ctx->wr_curr_bio) {
 		wr_ctx->wr_curr_bio = kzalloc(sizeof(*wr_ctx->wr_curr_bio),
 					      GFP_NOFS);
 		if (!wr_ctx->wr_curr_bio) {
 			mutex_unlock(&wr_ctx->wr_lock);
 			return -ENOMEM;
 		}
 		wr_ctx->wr_curr_bio->sctx = sctx;
 		wr_ctx->wr_curr_bio->page_count = 0;
 	}
 	sbio = wr_ctx->wr_curr_bio;
 	if (sbio->page_count == 0) {
 		struct bio *bio;
 		/* first page: it defines where the bio starts */
 		sbio->physical = spage->physical_for_dev_replace;
 		sbio->logical = spage->logical;
 		sbio->dev = wr_ctx->tgtdev;
 		bio = sbio->bio;
 		if (!bio) {
 			bio = btrfs_io_bio_alloc(GFP_NOFS, wr_ctx->pages_per_wr_bio);
 			if (!bio) {
 				mutex_unlock(&wr_ctx->wr_lock);
 				return -ENOMEM;
 			}
 			sbio->bio = bio;
 		}
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_wr_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical_for_dev_replace ||
 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
 		   spage->logical) {
 		/* not contiguous with the collected pages: flush and retry */
 		scrub_wr_submit(sctx);
 		goto again;
 	}
 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
 	if (ret != PAGE_SIZE) {
 		if (sbio->page_count < 1) {
 			/* even an empty bio does not take the page: give up */
 			bio_put(sbio->bio);
 			sbio->bio = NULL;
 			mutex_unlock(&wr_ctx->wr_lock);
 			return -EIO;
 		}
 		/* bio is full: submit it and retry with a new one */
 		scrub_wr_submit(sctx);
 		goto again;
 	}
 	sbio->pagev[sbio->page_count] = spage;
 	/* reference dropped again in scrub_wr_bio_end_io_worker() */
 	scrub_page_get(spage);
 	sbio->page_count++;
 	if (sbio->page_count == wr_ctx->pages_per_wr_bio)
 		scrub_wr_submit(sctx);
 	mutex_unlock(&wr_ctx->wr_lock);
 	return 0;
 }
 /*
  * Submit the write bio currently being assembled, if there is one, and
  * detach it from the write context so that a new one gets started.
  * The pending-bio count of @sctx is raised before the submission.
  */
 static void scrub_wr_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_wr_ctx *wr_ctx = &sctx->wr_ctx;
 	struct scrub_bio *pending = wr_ctx->wr_curr_bio;

 	if (pending == NULL)
 		return;

 	wr_ctx->wr_curr_bio = NULL;
 	WARN_ON(!pending->bio->bi_bdev);
 	scrub_pending_bio_inc(sctx);
 	/*
 	 * Process all writes in a single worker thread. Then the block
 	 * layer orders the requests before sending them to the driver,
 	 * which doubled the write performance on spinning disks when
 	 * measured with Linux 3.5.
 	 */
 	btrfsic_submit_bio(WRITE, pending->bio);
 }
 /*
  * Completion callback of a dev-replace write bio: record the result in
  * the owning scrub_bio and defer the remaining work to
  * scrub_wr_bio_end_io_worker() on the scrub_wr_completion_workers queue.
  */
 static void scrub_wr_bio_end_io(struct bio *bio, int err)
 {
 	struct scrub_bio *done = bio->bi_private;
 	struct btrfs_fs_info *info = done->dev->dev_root->fs_info;

 	done->bio = bio;
 	done->err = err;

 	btrfs_init_work(&done->work, scrub_wr_bio_end_io_worker, NULL, NULL);
 	btrfs_queue_work(info->scrub_wr_completion_workers, &done->work);
 }
 static void scrub_wr_bio_end_io_worker(struct btrfs_work *work)
 {
 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 	WARN_ON(sbio->page_count > SCRUB_PAGES_PER_WR_BIO);
 	if (sbio->err) {
 		struct btrfs_dev_replace *dev_replace =
 			&sbio->sctx->dev_root->fs_info->dev_replace;
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
 			spage->io_error = 1;
 			btrfs_dev_replace_stats_inc(&dev_replace->
 						    num_write_errors);
 		}
 	}
 	for (i = 0; i < sbio->page_count; i++)
 		scrub_page_put(sbio->pagev[i]);
 	bio_put(sbio->bio);
 	kfree(sbio);
 	scrub_pending_bio_dec(sctx);
 }
 /*
  * Verify @sblock according to the extent flags of its first page: data
  * checksum, tree block checksum/header, or super block. A failing data or
  * tree block is passed to scrub_handle_errored_block(). The result of the
  * super block check is deliberately not propagated.
  *
  * Returns non-zero if a data or tree block failed verification.
  */
 static int scrub_checksum(struct scrub_block *sblock)
 {
 	u64 extent_flags;
 	int failed = 0;

 	WARN_ON(sblock->page_count < 1);
 	extent_flags = sblock->pagev[0]->flags;

 	if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
 		failed = scrub_checksum_data(sblock);
 	} else if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		failed = scrub_checksum_tree_block(sblock);
 	} else if (extent_flags & BTRFS_EXTENT_FLAG_SUPER) {
 		(void)scrub_checksum_super(sblock);
 	} else {
 		WARN_ON(1);
 	}

 	if (failed)
 		scrub_handle_errored_block(sblock);

 	return failed;
 }
 /*
  * Compute the checksum over sctx->sectorsize bytes of the data block
  * @sblock and compare it with the checksum stored in its first page.
  *
  * Returns 0 if the block has no checksum or the checksum matches, and 1
  * on a mismatch.
  */
 static int scrub_checksum_data(struct scrub_block *sblock)
 {
 	struct scrub_ctx *sctx = sblock->sctx;
 	u8 computed[BTRFS_CSUM_SIZE];
 	u8 *expected;
 	u32 crc = ~(u32)0;
 	u64 remaining;
 	int idx = 0;

 	BUG_ON(sblock->page_count < 1);
 	if (!sblock->pagev[0]->have_csum)
 		return 0;

 	expected = sblock->pagev[0]->csum;
 	remaining = sctx->sectorsize;

 	for (;;) {
 		void *kaddr = kmap_atomic(sblock->pagev[idx]->page);
 		u64 chunk = min_t(u64, remaining, PAGE_SIZE);

 		crc = btrfs_csum_data(kaddr, crc, chunk);
 		kunmap_atomic(kaddr);

 		remaining -= chunk;
 		if (remaining == 0)
 			break;

 		/* the next page must exist before it gets mapped */
 		idx++;
 		BUG_ON(idx >= sblock->page_count);
 		BUG_ON(!sblock->pagev[idx]->page);
 	}

 	btrfs_csum_final(crc, computed);

 	return memcmp(computed, expected, sctx->csum_size) ? 1 : 0;
 }
 /*
  * Verify the header and the checksum of the tree block @sblock.
  *
  * The header's bytenr, generation, fsid and chunk tree uuid are compared
  * with the expected values, and the crc over the block (sctx->nodesize
  * bytes minus the leading BTRFS_CSUM_SIZE bytes) is compared with the
  * checksum stored in the header.
  *
  * Returns 1 if any of these checks fails, 0 otherwise.
  */
 static int scrub_checksum_tree_block(struct scrub_block *sblock)
 {
 	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_header *hdr;
 	u8 computed[BTRFS_CSUM_SIZE];
 	u8 stored[BTRFS_CSUM_SIZE];
 	void *kaddr;
 	u64 skip = BTRFS_CSUM_SIZE;
 	u64 remaining;
 	u32 crc = ~(u32)0;
 	int header_bad;
 	int csum_bad;
 	int idx = 0;

 	BUG_ON(sblock->page_count < 1);
 	kaddr = kmap_atomic(sblock->pagev[0]->page);
 	hdr = (struct btrfs_header *)kaddr;
 	memcpy(stored, hdr->csum, sctx->csum_size);

 	/*
 	 * we don't use the getter functions here, as we
 	 * a) don't have an extent buffer and
 	 * b) the page is already kmapped
 	 */
 	header_bad =
 		sblock->pagev[0]->logical != btrfs_stack_header_bytenr(hdr) ||
 		sblock->pagev[0]->generation !=
 			btrfs_stack_header_generation(hdr) ||
 		memcmp(hdr->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		memcmp(hdr->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 		       BTRFS_UUID_SIZE);

 	WARN_ON(sctx->nodesize != sctx->leafsize);
 	remaining = sctx->nodesize - BTRFS_CSUM_SIZE;

 	for (;;) {
 		u64 room = PAGE_SIZE - skip;
 		u64 chunk = min_t(u64, remaining, room);

 		crc = btrfs_csum_data(((u8 *)kaddr) + skip, crc, chunk);
 		kunmap_atomic(kaddr);

 		remaining -= chunk;
 		if (remaining == 0)
 			break;

 		idx++;
 		BUG_ON(idx >= sblock->page_count);
 		BUG_ON(!sblock->pagev[idx]->page);
 		kaddr = kmap_atomic(sblock->pagev[idx]->page);
 		/* only the first page starts with the checksum field */
 		skip = 0;
 	}

 	btrfs_csum_final(crc, computed);
 	csum_bad = memcmp(computed, stored, sctx->csum_size) != 0;

 	return header_bad || csum_bad;
 }
 /*
  * Verify a super block copy held in @sblock.
  *
  * The bytenr, the fsid and the crc over BTRFS_SUPER_INFO_SIZE bytes
  * (minus the leading BTRFS_CSUM_SIZE bytes) count as corruption errors, a
  * wrong generation counts as a generation error. Any error is only
  * reported: stat.super_errors is incremented and the matching device stat
  * is bumped (corruption takes precedence over generation).
  *
  * Returns the total number of failed checks, 0 if the super block is fine.
  */
 static int scrub_checksum_super(struct scrub_block *sblock)
 {
 	struct scrub_ctx *sctx = sblock->sctx;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_super_block *sb;
 	u8 computed[BTRFS_CSUM_SIZE];
 	u8 stored[BTRFS_CSUM_SIZE];
 	void *kaddr;
 	u64 skip = BTRFS_CSUM_SIZE;
 	u64 remaining = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE;
 	u32 crc = ~(u32)0;
 	int gen_errors = 0;
 	int cor_errors = 0;
 	int idx = 0;

 	BUG_ON(sblock->page_count < 1);
 	kaddr = kmap_atomic(sblock->pagev[0]->page);
 	sb = (struct btrfs_super_block *)kaddr;
 	memcpy(stored, sb->csum, sctx->csum_size);

 	if (sblock->pagev[0]->logical != btrfs_super_bytenr(sb))
 		cor_errors++;
 	if (sblock->pagev[0]->generation != btrfs_super_generation(sb))
 		gen_errors++;
 	if (memcmp(sb->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
 		cor_errors++;

 	for (;;) {
 		u64 room = PAGE_SIZE - skip;
 		u64 chunk = min_t(u64, remaining, room);

 		crc = btrfs_csum_data(((u8 *)kaddr) + skip, crc, chunk);
 		kunmap_atomic(kaddr);

 		remaining -= chunk;
 		if (remaining == 0)
 			break;

 		idx++;
 		BUG_ON(idx >= sblock->page_count);
 		BUG_ON(!sblock->pagev[idx]->page);
 		kaddr = kmap_atomic(sblock->pagev[idx]->page);
 		/* only the first page starts with the checksum field */
 		skip = 0;
 	}

 	btrfs_csum_final(crc, computed);
 	if (memcmp(computed, stored, sctx->csum_size))
 		cor_errors++;

 	if (cor_errors + gen_errors) {
 		/*
 		 * if we find an error in a super block, we just report it.
 		 * They will get written with the next transaction commit
 		 * anyway
 		 */
 		spin_lock(&sctx->stat_lock);
 		++sctx->stat.super_errors;
 		spin_unlock(&sctx->stat_lock);

 		btrfs_dev_stat_inc_and_print(sblock->pagev[0]->dev,
 			cor_errors ? BTRFS_DEV_STAT_CORRUPTION_ERRS :
 				     BTRFS_DEV_STAT_GENERATION_ERRS);
 	}

 	return cor_errors + gen_errors;
 }
 /* Take an additional reference on @sblock by incrementing its ref_count. */
 static void scrub_block_get(struct scrub_block *sblock)
 {
 	atomic_inc(&sblock->ref_count);
 }
 /*
  * Drop one reference on @sblock. When the last reference goes away, the
  * block's reference on each of its pages is dropped and the block itself
  * is freed.
  */
 static void scrub_block_put(struct scrub_block *sblock)
 {
 	int idx;

 	if (!atomic_dec_and_test(&sblock->ref_count))
 		return;

 	for (idx = 0; idx < sblock->page_count; idx++)
 		scrub_page_put(sblock->pagev[idx]);

 	kfree(sblock);
 }
 /* Take an additional reference on @spage by incrementing its ref_count. */
 static void scrub_page_get(struct scrub_page *spage)
 {
 	atomic_inc(&spage->ref_count);
 }
 static void scrub_page_put(struct scrub_page *spage)
 {
 	if (atomic_dec_and_test(&spage->ref_count)) {
 		if (spage->page)
 			__free_page(spage->page);
 		kfree(spage);
 	}
 }
 /*
  * Submit the read bio that is currently being filled (sctx->curr), if
  * any, and mark the context as having no current bio. The pending-bio
  * count is raised before the bio is sent off. A bio without a block
  * device is completed right away with -EIO instead of being submitted.
  */
 static void scrub_submit(struct scrub_ctx *sctx)
 {
 	struct scrub_bio *sbio;

 	if (sctx->curr == -1)
 		return;

 	sbio = sctx->bios[sctx->curr];
 	sctx->curr = -1;
 	scrub_pending_bio_inc(sctx);

 	if (sbio->bio->bi_bdev) {
 		btrfsic_submit_bio(READ, sbio->bio);
 		return;
 	}

 	/*
 	 * this case should not happen. If btrfs_map_block() is
 	 * wrong, it could happen for dev-replace operations on
 	 * missing devices when no mirrors are available, but in
 	 * this case it should already fail the mount.
 	 * This case is handled correctly (but _very_ slowly).
 	 */
 	printk_ratelimited(KERN_WARNING
 		"BTRFS: scrub_submit(bio bdev == NULL) is unexpected!\n");
 	bio_endio(sbio->bio, -EIO);
 }
 /*
  * Append @spage to the read bio that is currently being assembled in
  * @sctx, taking a scrub_bio from the free list (and waiting for one if
  * the list is empty) when there is no current bio.
  *
  * If @spage does not directly follow the pages already collected (by
  * physical address, logical address and device), or the bio refuses the
  * page, the current bio is submitted and a new one is started. A bio that
  * reaches sctx->pages_per_rd_bio pages is submitted immediately.
  *
  * For every page added, the owning block gets one more reference and its
  * outstanding_pages count is raised.
  *
  * Returns 0 on success, -ENOMEM if the bio cannot be allocated, and -EIO
  * if the page cannot be added to an empty bio.
  */
 static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
 				    struct scrub_page *spage)
 {
 	struct scrub_block *sblock = spage->sblock;
 	struct scrub_bio *sbio;
 	int ret;
 again:
 	/*
 	 * grab a fresh bio or wait for one to become available
 	 */
 	while (sctx->curr == -1) {
 		spin_lock(&sctx->list_lock);
 		sctx->curr = sctx->first_free;
 		if (sctx->curr != -1) {
 			/* unlink the head of the free list and reset it */
 			sctx->first_free = sctx->bios[sctx->curr]->next_free;
 			sctx->bios[sctx->curr]->next_free = -1;
 			sctx->bios[sctx->curr]->page_count = 0;
 			spin_unlock(&sctx->list_lock);
 		} else {
 			spin_unlock(&sctx->list_lock);
 			wait_event(sctx->list_wait, sctx->first_free != -1);
 		}
 	}
 	sbio = sctx->bios[sctx->curr];
 	if (sbio->page_count == 0) {
 		struct bio *bio;
 		/* first page: it defines where the bio starts */
 		sbio->physical = spage->physical;
 		sbio->logical = spage->logical;
 		sbio->dev = spage->dev;
 		bio = sbio->bio;
 		if (!bio) {
 			bio = btrfs_io_bio_alloc(GFP_NOFS, sctx->pages_per_rd_bio);
 			if (!bio)
 				return -ENOMEM;
 			sbio->bio = bio;
 		}
 		bio->bi_private = sbio;
 		bio->bi_end_io = scrub_bio_end_io;
 		bio->bi_bdev = sbio->dev->bdev;
 		bio->bi_iter.bi_sector = sbio->physical >> 9;
 		sbio->err = 0;
 	} else if (sbio->physical + sbio->page_count * PAGE_SIZE !=
 		   spage->physical ||
 		   sbio->logical + sbio->page_count * PAGE_SIZE !=
 		   spage->logical ||
 		   sbio->dev != spage->dev) {
 		/* not contiguous with the collected pages: flush and retry */
 		scrub_submit(sctx);
 		goto again;
 	}
 	/* the slot is filled first; page_count is only raised on success */
 	sbio->pagev[sbio->page_count] = spage;
 	ret = bio_add_page(sbio->bio, spage->page, PAGE_SIZE, 0);
 	if (ret != PAGE_SIZE) {
 		if (sbio->page_count < 1) {
 			/* even an empty bio does not take the page: give up */
 			bio_put(sbio->bio);
 			sbio->bio = NULL;
 			return -EIO;
 		}
 		/* bio is full: submit it and retry with a new one */
 		scrub_submit(sctx);
 		goto again;
 	}
 	scrub_block_get(sblock); /* one for the page added to the bio */
 	atomic_inc(&sblock->outstanding_pages);
 	sbio->page_count++;
 	if (sbio->page_count == sctx->pages_per_rd_bio)
 		scrub_submit(sctx);
 	return 0;
 }
 /*
  * Create a scrub_block covering @len bytes starting at @logical/@physical
  * on @dev and queue all of its pages for reading.
  *
  * The range is split into pieces of at most PAGE_SIZE; each piece gets a
  * scrub_page that records @flags, @gen, @mirror_num, the running logical,
  * physical and physical_for_dev_replace addresses and, if @csum is not
  * NULL, a copy of sctx->csum_size bytes of the checksum. If @force is
  * non-zero, the read bio being assembled is submitted before returning.
  *
  * Returns 0 on success, -ENOMEM if an allocation fails (stat.malloc_errors
  * is incremented), or the error returned by scrub_add_page_to_rd_bio().
  */
 static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 		       u64 physical, struct btrfs_device *dev, u64 flags,
 		       u64 gen, int mirror_num, u8 *csum, int force,
 		       u64 physical_for_dev_replace)
 {
 	struct scrub_block *sblock;
 	int index;
 	sblock = kzalloc(sizeof(*sblock), GFP_NOFS);
 	if (!sblock) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
 	/* one ref inside this function, plus one for each page added to
 	 * a bio later on */
 	atomic_set(&sblock->ref_count, 1);
 	sblock->sctx = sctx;
 	sblock->no_io_error_seen = 1;
 	for (index = 0; len > 0; index++) {
 		struct scrub_page *spage;
 		u64 l = min_t(u64, len, PAGE_SIZE);
 		spage = kzalloc(sizeof(*spage), GFP_NOFS);
 		if (!spage) {
 			/* also the target of the alloc_page() failure below */
 leave_nomem:
 			spin_lock(&sctx->stat_lock);
 			sctx->stat.malloc_errors++;
 			spin_unlock(&sctx->stat_lock);
 			/* drops the function's ref; frees block and pages */
 			scrub_block_put(sblock);
 			return -ENOMEM;
 		}
 		BUG_ON(index >= SCRUB_MAX_PAGES_PER_BLOCK);
 		/* first reference on the zero-initialized descriptor */
 		scrub_page_get(spage);
 		sblock->pagev[index] = spage;
 		spage->sblock = sblock;
 		spage->dev = dev;
 		spage->flags = flags;
 		spage->generation = gen;
 		spage->logical = logical;
 		spage->physical = physical;
 		spage->physical_for_dev_replace = physical_for_dev_replace;
 		spage->mirror_num = mirror_num;
 		if (csum) {
 			spage->have_csum = 1;
 			memcpy(spage->csum, csum, sctx->csum_size);
 		} else {
 			spage->have_csum = 0;
 		}
 		/*
 		 * the descriptor is counted before its data page is
 		 * allocated, so scrub_block_put() also releases it if
 		 * alloc_page() fails
 		 */
 		sblock->page_count++;
 		spage->page = alloc_page(GFP_NOFS);
 		if (!spage->page)
 			goto leave_nomem;
 		len -= l;
 		logical += l;
 		physical += l;
 		physical_for_dev_replace += l;
 	}
 	WARN_ON(sblock->page_count == 0);
 	for (index = 0; index < sblock->page_count; index++) {
 		struct scrub_page *spage = sblock->pagev[index];
 		int ret;
 		ret = scrub_add_page_to_rd_bio(sctx, spage);
 		if (ret) {
 			/* drop only the reference held by this function */
 			scrub_block_put(sblock);
 			return ret;
 		}
 	}
 	if (force)
 		scrub_submit(sctx);
 	/* last one frees, either here or in bio completion for last page */
 	scrub_block_put(sblock);
 	return 0;
 }
 /*
  * Completion callback of a scrub read bio: record the result in the
  * owning scrub_bio and queue its work item on the scrub_workers queue.
  */
 static void scrub_bio_end_io(struct bio *bio, int err)
 {
 	struct scrub_bio *done = bio->bi_private;
 	struct btrfs_fs_info *info = done->dev->dev_root->fs_info;

 	done->bio = bio;
 	done->err = err;

 	btrfs_queue_work(info->scrub_workers, &done->work);
 }
 /*
  * Worker side of a completed scrub read bio.
  *
  * On a read error every page of the bio is flagged with io_error and its
  * block loses no_io_error_seen. Then, for each page, the block's
  * outstanding_pages count is lowered; the block whose last page just
  * completed is passed to scrub_block_complete(). The block reference
  * taken when the page was added to the bio is dropped in any case.
  *
  * Finally the bio is released, the scrub_bio is put back at the head of
  * the free list, pending dev-replace writes are submitted if a flush was
  * requested, and the pending-bio count is lowered.
  */
 static void scrub_bio_end_io_worker(struct btrfs_work *work)
 {
 	struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
 	struct scrub_ctx *sctx = sbio->sctx;
 	int i;
 	BUG_ON(sbio->page_count > SCRUB_PAGES_PER_RD_BIO);
 	if (sbio->err) {
 		/* the error is per bio, so all of its pages are suspect */
 		for (i = 0; i < sbio->page_count; i++) {
 			struct scrub_page *spage = sbio->pagev[i];
 			spage->io_error = 1;
 			spage->sblock->no_io_error_seen = 0;
 		}
 	}
 	/* now complete the scrub_block items that have all pages completed */
 	for (i = 0; i < sbio->page_count; i++) {
 		struct scrub_page *spage = sbio->pagev[i];
 		struct scrub_block *sblock = spage->sblock;
 		if (atomic_dec_and_test(&sblock->outstanding_pages))
 			scrub_block_complete(sblock);
 		/* pairs with scrub_block_get() in scrub_add_page_to_rd_bio() */
 		scrub_block_put(sblock);
 	}
 	bio_put(sbio->bio);
 	sbio->bio = NULL;
 	/* return the scrub_bio to the head of the free list */
 	spin_lock(&sctx->list_lock);
 	sbio->next_free = sctx->first_free;
 	sctx->first_free = sbio->index;
 	spin_unlock(&sctx->list_lock);
 	if (sctx->is_dev_replace &&
 	    atomic_read(&sctx->wr_ctx.flush_all_writes)) {
 		mutex_lock(&sctx->wr_ctx.wr_lock);
 		scrub_wr_submit(sctx);
 		mutex_unlock(&sctx->wr_ctx.wr_lock);
 	}
 	scrub_pending_bio_dec(sctx);
 }
 /*
  * Called once all pages of @sblock have been read.
  *
  * A block that saw an I/O error goes straight to the error handler.
  * Otherwise it is verified with scrub_checksum(); if that finds no error
  * and a dev-replace is running, the block is written to the target here.
  * A block with a verification error is not written here.
  */
 static void scrub_block_complete(struct scrub_block *sblock)
 {
 	if (!sblock->no_io_error_seen) {
 		scrub_handle_errored_block(sblock);
 		return;
 	}

 	if (scrub_checksum(sblock))
 		return;

 	if (sblock->sctx->is_dev_replace)
 		scrub_write_block_to_dev_replace(sblock);
 }
 /*
  * Look up the checksum for the sector at @logical in sctx->csum_list.
  *
  * Entries at the head of the list that end at or before @logical are
  * discarded (and counted in stat.csum_discards). If the first remaining
  * entry covers @logical, sctx->csum_size bytes of the matching checksum
  * are copied to @csum; the entry is freed once its last sector has been
  * handed out.
  *
  * Returns 1 if a checksum was copied to @csum, 0 if none is available.
  */
 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u64 len,
 			   u8 *csum)
 {
 	struct btrfs_ordered_sum *entry;
 	unsigned long sector;
 	unsigned long nr_sectors;

 	for (;;) {
 		if (list_empty(&sctx->csum_list))
 			return 0;

 		entry = list_first_entry(&sctx->csum_list,
 					 struct btrfs_ordered_sum, list);
 		/* the list starts behind @logical: no checksum for it */
 		if (entry->bytenr > logical)
 			return 0;
 		if (entry->bytenr + entry->len > logical)
 			break;

 		/* entry lies entirely before @logical: drop it */
 		++sctx->stat.csum_discards;
 		list_del(&entry->list);
 		kfree(entry);
 	}

 	sector = ((u32)(logical - entry->bytenr)) / sctx->sectorsize;
 	nr_sectors = entry->len / sctx->sectorsize;
 	memcpy(csum, entry->sums + sector, sctx->csum_size);

 	if (sector == nr_sectors - 1) {
 		list_del(&entry->list);
 		kfree(entry);
 	}

 	return 1;
 }
 /*
  * Scrub one extent, block by block.
  *
  * scrub extent tries to collect up to 64 kB for each bio.
  *
  * The extent is accounted in the data or tree statistics and then walked
  * in steps of the block size (sectorsize for data, nodesize for tree
  * blocks). Data blocks get their checksum looked up first. During a
  * dev-replace, data blocks without a checksum are copied with
  * copy_nocow_pages(); every other block is queued with scrub_pages().
  *
  * Returns 0 on success or the first error reported for a block.
  */
 static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len,
 			u64 physical, struct btrfs_device *dev, u64 flags,
 			u64 gen, int mirror_num, u64 physical_for_dev_replace)
 {
 	const int is_data = (flags & BTRFS_EXTENT_FLAG_DATA) != 0;
 	u8 csum[BTRFS_CSUM_SIZE];
 	u32 blocksize;
 	int ret;

 	if (is_data) {
 		blocksize = sctx->sectorsize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.data_extents_scrubbed++;
 		sctx->stat.data_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		WARN_ON(sctx->nodesize != sctx->leafsize);
 		blocksize = sctx->nodesize;
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.tree_extents_scrubbed++;
 		sctx->stat.tree_bytes_scrubbed += len;
 		spin_unlock(&sctx->stat_lock);
 	} else {
 		blocksize = sctx->sectorsize;
 		WARN_ON(1);
 	}

 	while (len) {
 		u64 step = min_t(u64, len, blocksize);
 		int have_csum = 0;
 		int use_nocow_copy = 0;

 		if (is_data) {
 			/* push csums to sbio */
 			have_csum = scrub_find_csum(sctx, logical, step, csum);
 			if (have_csum == 0)
 				++sctx->stat.no_csum;
 			use_nocow_copy = sctx->is_dev_replace && !have_csum;
 		}

 		if (use_nocow_copy)
 			ret = copy_nocow_pages(sctx, logical, step,
 					       mirror_num,
 					       physical_for_dev_replace);
 		else
 			ret = scrub_pages(sctx, logical, step, physical, dev,
 					  flags, gen, mirror_num,
 					  have_csum ? csum : NULL, 0,
 					  physical_for_dev_replace);
 		if (ret)
 			return ret;

 		len -= step;
 		logical += step;
 		physical += step;
 		physical_for_dev_replace += step;
 	}

 	return 0;
 }
 /*
  * Given a physical address on stripe @num of a RAID5/6 chunk, calculate
  * its logical offset and store it in @offset. If the address belongs to
  * a parity stripe, the logical offset of the leftmost data stripe of that
  * stripe-set is stored instead.
  *
  * Returns 0 if it is a data stripe, 1 if it is a parity stripe.
  */
 static int get_raid56_logic_offset(u64 physical, int num,
 				   struct map_lookup *map, u64 *offset)
 {
 	u64 set_start;
 	u64 candidate;
 	u64 stripe_nr;
 	int lower = 0;
 	int data_idx;
 	int rot;
 	int dev_idx;

 	set_start = (physical - map->stripes[num].physical) *
 		    nr_data_stripes(map);
 	*offset = set_start;

 	for (data_idx = 0; data_idx < nr_data_stripes(map); data_idx++) {
 		candidate = set_start + data_idx * map->stripe_len;

 		stripe_nr = candidate;
 		do_div(stripe_nr, map->stripe_len);
 		do_div(stripe_nr, nr_data_stripes(map));

 		/* Work out the disk rotation on this stripe-set */
 		rot = do_div(stripe_nr, map->num_stripes);
 		/* calculate on which stripe this data is located */
 		rot += data_idx;
 		dev_idx = rot % map->num_stripes;

 		if (dev_idx == num) {
 			*offset = candidate;
 			return 0;
 		}
 		if (dev_idx < num)
 			lower++;
 	}

 	*offset = set_start + lower * map->stripe_len;
 	return 1;
 }
 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 					   struct map_lookup *map,
 					   struct btrfs_device *scrub_dev,
 					   int num, u64 base, u64 length,
 					   int is_dev_replace)
 {
 	struct btrfs_path *path;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_root *csum_root = fs_info->csum_root;
 	struct btrfs_extent_item *extent;
 	struct blk_plug plug;
 	u64 flags;
 	int ret;
 	int slot;
 	u64 nstripes;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	u64 physical;
 	u64 logical;
 	u64 logic_end;
 	u64 physical_end;
 	u64 generation;
 	int mirror_num;
 	struct reada_control *reada1;
 	struct reada_control *reada2;
 	struct btrfs_key key_start;
 	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
 	u64 extent_logical;
 	u64 extent_physical;
 	u64 extent_len;
 	struct btrfs_device *extent_dev;
 	int extent_mirror_num;
 	int stop_loop = 0;
 	nstripes = length;
 	physical = map->stripes[num].physical;
 	offset = 0;
 	do_div(nstripes, map->stripe_len);
 	if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
 		offset = map->stripe_len * num;
 		increment = map->stripe_len * map->num_stripes;
 		mirror_num = 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
 		int factor = map->num_stripes / map->sub_stripes;
 		offset = map->stripe_len * (num / map->sub_stripes);
 		increment = map->stripe_len * factor;
 		mirror_num = num % map->sub_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_RAID1) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
 		increment = map->stripe_len;
 		mirror_num = num % map->num_stripes + 1;
 	} else if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
 				BTRFS_BLOCK_GROUP_RAID6)) {
 		get_raid56_logic_offset(physical, num, map, &offset);
 		increment = map->stripe_len * nr_data_stripes(map);
 		mirror_num = 1;
 	} else {
 		increment = map->stripe_len;
 		mirror_num = 1;
 	}
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	/*
 	 * work on commit root. The related disk blocks are static as
 	 * long as COW is applied. This means it is safe to rewrite
 	 * them to repair disk errors without any race conditions
 	 */
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 	/*
 	 * trigger the readahead for extent tree csum tree and wait for
 	 * completion. During readahead, the scrub is officially paused
 	 * to not hold off transaction commits
 	 */
 	logical = base + offset;
 	physical_end = physical + nstripes * map->stripe_len;
 	if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
 			 BTRFS_BLOCK_GROUP_RAID6)) {
 		get_raid56_logic_offset(physical_end, num,
 					map, &logic_end);
 		logic_end += base;
 	} else {
 		logic_end = logical + increment * nstripes;
 	}
 	wait_event(sctx->list_wait,
 		   atomic_read(&sctx->bios_in_flight) == 0);
 	scrub_blocked_if_needed(fs_info);
 	/* FIXME it might be better to start readahead at commit root */
 	key_start.objectid = logical;
 	key_start.type = BTRFS_EXTENT_ITEM_KEY;
 	key_start.offset = (u64)0;
 	key_end.objectid = logic_end;
 	key_end.type = BTRFS_METADATA_ITEM_KEY;
 	key_end.offset = (u64)-1;
 	reada1 = btrfs_reada_add(root, &key_start, &key_end);
 	key_start.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 	key_start.type = BTRFS_EXTENT_CSUM_KEY;
 	key_start.offset = logical;
 	key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
 	key_end.type = BTRFS_EXTENT_CSUM_KEY;
 	key_end.offset = logic_end;
 	reada2 = btrfs_reada_add(csum_root, &key_start, &key_end);
 	if (!IS_ERR(reada1))
 		btrfs_reada_wait(reada1);
 	if (!IS_ERR(reada2))
 		btrfs_reada_wait(reada2);
 	/*
 	 * collect all data csums for the stripe to avoid seeking during
 	 * the scrub. This might currently (crc32) end up to be about 1MB
 	 */
 	blk_start_plug(&plug);
 	/*
 	 * now find all extents for each stripe and scrub them
 	 */
 	ret = 0;
 	while (physical < physical_end) {
 		/* for raid56, we skip parity stripe */
 		if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
 				BTRFS_BLOCK_GROUP_RAID6)) {
 			ret = get_raid56_logic_offset(physical, num,
 					map, &logical);
 			logical += base;
 			if (ret)
 				goto skip;
 		}
 		/*
 		 * canceled?
 		 */
 		if (atomic_read(&fs_info->scrub_cancel_req) ||
 		    atomic_read(&sctx->cancel_req)) {
 			ret = -ECANCELED;
 			goto out;
 		}
 		/*
 		 * check to see if we have to pause
 		 */
 		if (atomic_read(&fs_info->scrub_pause_req)) {
 			/* push queued extents */
 			atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
 			scrub_submit(sctx);
 			mutex_lock(&sctx->wr_ctx.wr_lock);
 			scrub_wr_submit(sctx);
 			mutex_unlock(&sctx->wr_ctx.wr_lock);
 			wait_event(sctx->list_wait,
 				   atomic_read(&sctx->bios_in_flight) == 0);
 			atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 			scrub_blocked_if_needed(fs_info);
 		}
 		if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
 			key.type = BTRFS_METADATA_ITEM_KEY;
 		else
 			key.type = BTRFS_EXTENT_ITEM_KEY;
 		key.objectid = logical;
 		key.offset = (u64)-1;
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
 		if (ret > 0) {
 			ret = btrfs_previous_extent_item(root, path, 0);
 			if (ret < 0)
 				goto out;
 			if (ret > 0) {
 				/* there's no smaller item, so stick with the
 				 * larger one */
 				btrfs_release_path(path);
 				ret = btrfs_search_slot(NULL, root, &key,
 							path, 0, 0);
 				if (ret < 0)
 					goto out;
 			}
 		}
 		stop_loop = 0;
 		while (1) {
 			u64 bytes;
 			l = path->nodes[0];
 			slot = path->slots[0];
 			if (slot >= btrfs_header_nritems(l)) {
 				ret = btrfs_next_leaf(root, path);
 				if (ret == 0)
 					continue;
 				if (ret < 0)
 					goto out;
 				stop_loop = 1;
 				break;
 			}
 			btrfs_item_key_to_cpu(l, &key, slot);
 			if (key.type == BTRFS_METADATA_ITEM_KEY)
 				bytes = root->leafsize;
 			else
 				bytes = key.offset;
 			if (key.objectid + bytes <= logical)
 				goto next;
 			if (key.type != BTRFS_EXTENT_ITEM_KEY &&
 			    key.type != BTRFS_METADATA_ITEM_KEY)
 				goto next;
 			if (key.objectid >= logical + map->stripe_len) {
 				/* out of this device extent */
 				if (key.objectid >= logic_end)
 					stop_loop = 1;
 				break;
 			}
 			extent = btrfs_item_ptr(l, slot,
 						struct btrfs_extent_item);
 			flags = btrfs_extent_flags(l, extent);
 			generation = btrfs_extent_generation(l, extent);
 			if (key.objectid < logical &&
 			    (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) {
 				btrfs_err(fs_info,
 					   "scrub: tree block %llu spanning "
 					   "stripes, ignored. logical=%llu",
 				       key.objectid, logical);
 				goto next;
 			}
 again:
 			extent_logical = key.objectid;
 			extent_len = bytes;
 			/*
 			 * trim extent to this stripe
 			 */
 			if (extent_logical < logical) {
 				extent_len -= logical - extent_logical;
 				extent_logical = logical;
 			}
 			if (extent_logical + extent_len >
 			    logical + map->stripe_len) {
 				extent_len = logical + map->stripe_len -
 					     extent_logical;
 			}
 			extent_physical = extent_logical - logical + physical;
 			extent_dev = scrub_dev;
 			extent_mirror_num = mirror_num;
 			if (is_dev_replace)
 				scrub_remap_extent(fs_info, extent_logical,
 						   extent_len, &extent_physical,
 						   &extent_dev,
 						   &extent_mirror_num);
 			ret = btrfs_lookup_csums_range(csum_root, logical,
 						logical + map->stripe_len - 1,
 						&sctx->csum_list, 1);
 			if (ret)
 				goto out;
 			ret = scrub_extent(sctx, extent_logical, extent_len,
 					   extent_physical, extent_dev, flags,
 					   generation, extent_mirror_num,
 					   extent_logical - logical + physical);
 			if (ret)
 				goto out;
 			scrub_free_csums(sctx);
 			if (extent_logical + extent_len <
 			    key.objectid + bytes) {
 				if (map->type & (BTRFS_BLOCK_GROUP_RAID5 |
 					BTRFS_BLOCK_GROUP_RAID6)) {
 					/*
 					 * loop until we find next data stripe
 					 * or we have finished all stripes.
 					 */
 					do {
 						physical += map->stripe_len;
 						ret = get_raid56_logic_offset(
 								physical, num,
 								map, &logical);
 						logical += base;
 					} while (physical < physical_end && ret);
 				} else {
 					physical += map->stripe_len;
 					logical += increment;
 				}
 				if (logical < key.objectid + bytes) {
 					cond_resched();
 					goto again;
 				}
 				if (physical >= physical_end) {
 					stop_loop = 1;
 					break;
 				}
 			}
 next:
 			path->slots[0]++;
 		}
 		btrfs_release_path(path);
 skip:
 		logical += increment;
 		physical += map->stripe_len;
 		spin_lock(&sctx->stat_lock);
 		if (stop_loop)
 			sctx->stat.last_physical = map->stripes[num].physical +
 						   length;
 		else
 			sctx->stat.last_physical = physical;
 		spin_unlock(&sctx->stat_lock);
 		if (stop_loop)
 			break;
 	}
 out:
 	/* push queued extents */
 	scrub_submit(sctx);
 	mutex_lock(&sctx->wr_ctx.wr_lock);
 	scrub_wr_submit(sctx);
 	mutex_unlock(&sctx->wr_ctx.wr_lock);
 	blk_finish_plug(&plug);
 	btrfs_free_path(path);
 	return ret < 0 ? ret : 0;
 }
 /*
  * Scrub the part of one chunk that lives on @scrub_dev.
  *
  * Looks up the extent mapping at @chunk_offset and, if it starts exactly
  * there and is at least @length long, runs scrub_stripe() for every stripe
  * of the chunk that sits on @scrub_dev's block device at @dev_offset.
  *
  * Returns -EINVAL when no mapping is found, the first non-zero result of
  * scrub_stripe(), or 0 otherwise (including when the mapping does not match
  * the requested start/length).
  */
 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
 					  struct btrfs_device *scrub_dev,
 					  u64 chunk_tree, u64 chunk_objectid,
 					  u64 chunk_offset, u64 length,
 					  u64 dev_offset, int is_dev_replace)
 {
 	struct btrfs_mapping_tree *mapping =
 		&sctx->dev_root->fs_info->mapping_tree;
 	struct extent_map *em;
 	struct map_lookup *lookup;
 	int rc = 0;
 	int idx;
 	read_lock(&mapping->map_tree.lock);
 	em = lookup_extent_mapping(&mapping->map_tree, chunk_offset, 1);
 	read_unlock(&mapping->map_tree.lock);
 	if (!em)
 		return -EINVAL;
 	lookup = (struct map_lookup *)em->bdev;
 	/* only act on a mapping that starts here and covers the whole length */
 	if (em->start == chunk_offset && em->len >= length) {
 		for (idx = 0; idx < lookup->num_stripes; ++idx) {
 			if (lookup->stripes[idx].dev->bdev != scrub_dev->bdev ||
 			    lookup->stripes[idx].physical != dev_offset)
 				continue;
 			rc = scrub_stripe(sctx, lookup, scrub_dev, idx,
 					  chunk_offset, length,
 					  is_dev_replace);
 			if (rc)
 				break;
 		}
 	}
 	free_extent_map(em);
 	return rc;
 }
 /*
  * Walk the BTRFS_DEV_EXTENT_KEY items of @scrub_dev in sctx->dev_root (using
  * the commit root, without locking) and call scrub_chunk() for each device
  * extent that is not entirely before @start. The walk stops at the first
  * device extent whose offset is >= @end, at the first item that belongs to
  * another device or key type, or on error.
  *
  * After every chunk all queued read/write bios are submitted and waited for,
  * and the scrub counts itself as paused while it waits for pending workers.
  *
  * Returns 0 on success or a negative errno (-ENOMEM, -ENOENT, -EIO or an
  * error propagated from the tree search / scrub_chunk()).
  */
 static noinline_for_stack
 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
 			   struct btrfs_device *scrub_dev, u64 start, u64 end,
 			   int is_dev_replace)
 {
 	struct btrfs_dev_extent *dev_extent = NULL;
 	struct btrfs_path *path;
 	struct btrfs_root *root = sctx->dev_root;
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	u64 length;
 	u64 chunk_tree;
 	u64 chunk_objectid;
 	u64 chunk_offset;
 	int ret;
 	int slot;
 	struct extent_buffer *l;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 	path->reada = 2;
 	path->search_commit_root = 1;
 	path->skip_locking = 1;
 	/* start the search at the first dev extent item of this device */
 	key.objectid = scrub_dev->devid;
 	key.offset = 0ull;
 	key.type = BTRFS_DEV_EXTENT_KEY;
 	while (1) {
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			break;
 		if (ret > 0) {
 			/* no exact match and the slot is past the leaf end */
 			if (path->slots[0] >=
 			    btrfs_header_nritems(path->nodes[0])) {
 				ret = btrfs_next_leaf(root, path);
 				if (ret)
 					break;
 			}
 		}
 		l = path->nodes[0];
 		slot = path->slots[0];
 		btrfs_item_key_to_cpu(l, &found_key, slot);
 		/* stop once the item no longer describes a dev extent of ours */
 		if (found_key.objectid != scrub_dev->devid)
 			break;
 		if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY)
 			break;
 		if (found_key.offset >= end)
 			break;
 		if (found_key.offset < key.offset)
 			break;
 		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
 		length = btrfs_dev_extent_length(l, dev_extent);
 		/* extent ends at or before @start: search again right after it */
 		if (found_key.offset + length <= start) {
 			key.offset = found_key.offset + length;
 			btrfs_release_path(path);
 			continue;
 		}
 		chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent);
 		chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent);
 		chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
 		/*
 		 * get a reference on the corresponding block group to prevent
 		 * the chunk from going away while we scrub it
 		 */
 		cache = btrfs_lookup_block_group(fs_info, chunk_offset);
 		if (!cache) {
 			ret = -ENOENT;
 			break;
 		}
 		/* record the device range being processed in the replace cursors */
 		dev_replace->cursor_right = found_key.offset + length;
 		dev_replace->cursor_left = found_key.offset;
 		dev_replace->item_needs_writeback = 1;
 		ret = scrub_chunk(sctx, scrub_dev, chunk_tree, chunk_objectid,
 				  chunk_offset, length, found_key.offset,
 				  is_dev_replace);
 		/*
 		 * flush, submit all pending read and write bios, afterwards
 		 * wait for them.
 		 * Note that in the dev replace case, a read request causes
 		 * write requests that are submitted in the read completion
 		 * worker. Therefore in the current situation, it is required
 		 * that all write requests are flushed, so that all read and
 		 * write requests are really completed when bios_in_flight
 		 * changes to 0.
 		 */
 		atomic_set(&sctx->wr_ctx.flush_all_writes, 1);
 		scrub_submit(sctx);
 		mutex_lock(&sctx->wr_ctx.wr_lock);
 		scrub_wr_submit(sctx);
 		mutex_unlock(&sctx->wr_ctx.wr_lock);
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->bios_in_flight) == 0);
 		/* count ourselves as paused and notify scrub_pause_wait waiters */
 		atomic_inc(&fs_info->scrubs_paused);
 		wake_up(&fs_info->scrub_pause_wait);
 		/*
 		 * must be called before we decrease @scrub_paused.
 		 * make sure we don't block transaction commit while
 		 * we are waiting pending workers finished.
 		 */
 		wait_event(sctx->list_wait,
 			   atomic_read(&sctx->workers_pending) == 0);
 		atomic_set(&sctx->wr_ctx.flush_all_writes, 0);
 		mutex_lock(&fs_info->scrub_lock);
 		__scrub_blocked_if_needed(fs_info);
 		atomic_dec(&fs_info->scrubs_paused);
 		mutex_unlock(&fs_info->scrub_lock);
 		wake_up(&fs_info->scrub_pause_wait);
 		btrfs_put_block_group(cache);
 		/* ret still holds the result of scrub_chunk() here */
 		if (ret)
 			break;
 		if (is_dev_replace &&
 		    atomic64_read(&dev_replace->num_write_errors) > 0) {
 			ret = -EIO;
 			break;
 		}
 		if (sctx->stat.malloc_errors > 0) {
 			ret = -ENOMEM;
 			break;
 		}
 		/* chunk done: advance the left cursor and the search position */
 		dev_replace->cursor_left = dev_replace->cursor_right;
 		dev_replace->item_needs_writeback = 1;
 		key.offset = found_key.offset + length;
 		btrfs_release_path(path);
 	}
 	btrfs_free_path(path);
 	/*
 	 * ret can still be 1 from search_slot or next_leaf,
 	 * that's not an error
 	 */
 	return ret < 0 ? ret : 0;
 }
 /*
  * Queue every super block copy that fits on @scrub_dev for scrubbing and
  * wait until all scrub bios in flight have completed.
  *
  * Returns -EIO if the filesystem is flagged with BTRFS_FS_STATE_ERROR, the
  * first non-zero result of scrub_pages(), or 0.
  */
 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
 					   struct btrfs_device *scrub_dev)
 {
 	struct btrfs_root *dev_root = sctx->dev_root;
 	u64 generation;
 	int mirror;
 	if (test_bit(BTRFS_FS_STATE_ERROR, &dev_root->fs_info->fs_state))
 		return -EIO;
 	generation = dev_root->fs_info->last_trans_committed;
 	for (mirror = 0; mirror < BTRFS_SUPER_MIRROR_MAX; mirror++) {
 		u64 sb_bytenr = btrfs_sb_offset(mirror);
 		int err;
 		/* this copy (and any later one) lies beyond the device end */
 		if (sb_bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes)
 			break;
 		err = scrub_pages(sctx, sb_bytenr, BTRFS_SUPER_INFO_SIZE,
 				  sb_bytenr, scrub_dev,
 				  BTRFS_EXTENT_FLAG_SUPER, generation, mirror,
 				  NULL, 1, sb_bytenr);
 		if (err)
 			return err;
 	}
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	return 0;
 }
 /*
  * get a reference count on fs_info->scrub_workers. start worker if necessary
  */
 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
 						int is_dev_replace)
 {
 	int ret = 0;
 	int flags = WQ_FREEZABLE | WQ_UNBOUND;
 	int max_active = fs_info->thread_pool_size;
 	if (fs_info->scrub_workers_refcnt == 0) {
 		if (is_dev_replace)
 			fs_info->scrub_workers =
 				btrfs_alloc_workqueue("btrfs-scrub", flags,
 						      1, 4);
 		else
 			fs_info->scrub_workers =
 				btrfs_alloc_workqueue("btrfs-scrub", flags,
 						      max_active, 4);
 		if (!fs_info->scrub_workers) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		fs_info->scrub_wr_completion_workers =
 			btrfs_alloc_workqueue("btrfs-scrubwrc", flags,
 					      max_active, 2);
 		if (!fs_info->scrub_wr_completion_workers) {
 			ret = -ENOMEM;
 			goto out;
 		}
 		fs_info->scrub_nocow_workers =
 			btrfs_alloc_workqueue("btrfs-scrubnc", flags, 1, 0);
 		if (!fs_info->scrub_nocow_workers) {
 			ret = -ENOMEM;
 			goto out;
 		}
 	}
 	++fs_info->scrub_workers_refcnt;
 out:
 	return ret;
 }
 /*
  * Drop one reference on the scrub workqueues; when the count reaches zero
  * all three queues are destroyed. Warns if the count goes negative.
  */
 static noinline_for_stack void scrub_workers_put(struct btrfs_fs_info *fs_info)
 {
 	fs_info->scrub_workers_refcnt--;
 	if (!fs_info->scrub_workers_refcnt) {
 		btrfs_destroy_workqueue(fs_info->scrub_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_wr_completion_workers);
 		btrfs_destroy_workqueue(fs_info->scrub_nocow_workers);
 	}
 	WARN_ON(fs_info->scrub_workers_refcnt < 0);
 }
 /*
  * Scrub (or, with @is_dev_replace, copy) the device with id @devid in the
  * device byte range passed as @start/@end to scrub_enumerate_chunks().
  *
  * @progress:       if non-NULL, receives a copy of the final scrub statistics
  * @readonly:       stored in the scrub context (sctx->readonly)
  * @is_dev_replace: non-zero when called on behalf of a device replace; the
  *                  super blocks are not scrubbed in that case
  *
  * Returns 0 on success or a negative errno:
  *   -EINVAL       filesystem is closing or a size assumption does not hold
  *   -ENODEV       device not found, or missing and not a dev-replace
  *   -EIO          device not in fs metadata or is a dev-replace target
  *   -EINPROGRESS  the device is already being scrubbed, or a dev-replace is
  *                 ongoing and this is a plain scrub
  *   or whatever scrub_workers_get(), scrub_setup_ctx(), scrub_supers() or
  *   scrub_enumerate_chunks() report.
  */
 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 		    u64 end, struct btrfs_scrub_progress *progress,
 		    int readonly, int is_dev_replace)
 {
 	struct scrub_ctx *sctx;
 	int ret;
 	struct btrfs_device *dev;
 	if (btrfs_fs_closing(fs_info))
 		return -EINVAL;
 	/*
 	 * check some assumptions
 	 */
 	if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) {
 		btrfs_err(fs_info,
 			   "scrub: size assumption nodesize == leafsize (%d == %d) fails",
 		       fs_info->chunk_root->nodesize,
 		       fs_info->chunk_root->leafsize);
 		return -EINVAL;
 	}
 	if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) {
 		/*
 		 * in this case scrub is unable to calculate the checksum
 		 * the way scrub is implemented. Do not handle this
 		 * situation at all because it won't ever happen.
 		 */
 		btrfs_err(fs_info,
 			   "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
 		       fs_info->chunk_root->nodesize, BTRFS_STRIPE_LEN);
 		return -EINVAL;
 	}
 	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
 		/* not supported for data w/o checksums */
 		btrfs_err(fs_info,
 			   "scrub: size assumption sectorsize != PAGE_SIZE "
 			   "(%d != %lu) fails",
 		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
 		return -EINVAL;
 	}
 	if (fs_info->chunk_root->nodesize >
 	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK ||
 	    fs_info->chunk_root->sectorsize >
 	    PAGE_SIZE * SCRUB_MAX_PAGES_PER_BLOCK) {
 		/*
 		 * would exhaust the array bounds of pagev member in
 		 * struct scrub_block
 		 */
 		btrfs_err(fs_info, "scrub: size assumption nodesize and sectorsize "
 			   "<= SCRUB_MAX_PAGES_PER_BLOCK (%d <= %d && %d <= %d) fails",
 		       fs_info->chunk_root->nodesize,
 		       SCRUB_MAX_PAGES_PER_BLOCK,
 		       fs_info->chunk_root->sectorsize,
 		       SCRUB_MAX_PAGES_PER_BLOCK);
 		return -EINVAL;
 	}
 	mutex_lock(&fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(fs_info, devid, NULL, NULL);
 	if (!dev || (dev->missing && !is_dev_replace)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -ENODEV;
 	}
 	mutex_lock(&fs_info->scrub_lock);
 	if (!dev->in_fs_metadata || dev->is_tgtdev_for_dev_replace) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -EIO;
 	}
 	btrfs_dev_replace_lock(&fs_info->dev_replace);
 	if (dev->scrub_device ||
 	    (!is_dev_replace &&
 	     btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
 		btrfs_dev_replace_unlock(&fs_info->dev_replace);
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return -EINPROGRESS;
 	}
 	btrfs_dev_replace_unlock(&fs_info->dev_replace);
 	ret = scrub_workers_get(fs_info, is_dev_replace);
 	if (ret) {
 		mutex_unlock(&fs_info->scrub_lock);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		return ret;
 	}
 	sctx = scrub_setup_ctx(dev, is_dev_replace);
 	if (IS_ERR(sctx)) {
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 		/*
 		 * The worker reference count is only modified under
 		 * scrub_lock (see the get above and the put at the end of
 		 * this function), so drop it before releasing the lock.
 		 */
 		scrub_workers_put(fs_info);
 		mutex_unlock(&fs_info->scrub_lock);
 		return PTR_ERR(sctx);
 	}
 	sctx->readonly = readonly;
 	dev->scrub_device = sctx;
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 	/*
 	 * checking @scrub_pause_req here, we can avoid
 	 * race between committing transaction and scrubbing.
 	 */
 	__scrub_blocked_if_needed(fs_info);
 	atomic_inc(&fs_info->scrubs_running);
 	mutex_unlock(&fs_info->scrub_lock);
 	if (!is_dev_replace) {
 		/*
 		 * by holding device list mutex, we can
 		 * kick off writing super in log tree sync.
 		 */
 		mutex_lock(&fs_info->fs_devices->device_list_mutex);
 		ret = scrub_supers(sctx, dev);
 		mutex_unlock(&fs_info->fs_devices->device_list_mutex);
 	}
 	/* ret is 0 here unless scrub_supers() failed */
 	if (!ret)
 		ret = scrub_enumerate_chunks(sctx, dev, start, end,
 					     is_dev_replace);
 	wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
 	atomic_dec(&fs_info->scrubs_running);
 	wake_up(&fs_info->scrub_pause_wait);
 	wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
 	if (progress)
 		memcpy(progress, &sctx->stat, sizeof(*progress));
 	mutex_lock(&fs_info->scrub_lock);
 	/*
 	 * NOTE(review): btrfs_scrub_cancel_dev() sleeps on scrub_pause_wait
 	 * until scrub_device is NULL, but the wake_up above is issued before
 	 * this store - confirm another wake-up always follows.
 	 */
 	dev->scrub_device = NULL;
 	scrub_workers_put(fs_info);
 	mutex_unlock(&fs_info->scrub_lock);
 	scrub_free_ctx(sctx);
 	return ret;
 }
 /*
  * Raise the pause request counter and block until the number of paused
  * scrubs equals the number of running scrubs. scrub_lock is dropped while
  * sleeping and re-taken before the counters are compared again.
  */
 void btrfs_scrub_pause(struct btrfs_root *root)
 {
 	struct btrfs_fs_info *info = root->fs_info;
 	mutex_lock(&info->scrub_lock);
 	atomic_inc(&info->scrub_pause_req);
 	for (;;) {
 		if (atomic_read(&info->scrubs_paused) ==
 		    atomic_read(&info->scrubs_running))
 			break;
 		mutex_unlock(&info->scrub_lock);
 		wait_event(info->scrub_pause_wait,
 			   atomic_read(&info->scrubs_paused) ==
 			   atomic_read(&info->scrubs_running));
 		mutex_lock(&info->scrub_lock);
 	}
 	mutex_unlock(&info->scrub_lock);
 }
 /*
  * Withdraw one pause request and wake everybody sleeping on
  * scrub_pause_wait.
  */
 void btrfs_scrub_continue(struct btrfs_root *root)
 {
 	atomic_dec(&root->fs_info->scrub_pause_req);
 	wake_up(&root->fs_info->scrub_pause_wait);
 }
 /*
  * Request cancellation of all running scrubs and wait until none is
  * running any more.
  *
  * Returns -ENOTCONN if no scrub is running on entry, 0 otherwise.
  */
 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
 {
 	int rc = -ENOTCONN;
 	mutex_lock(&fs_info->scrub_lock);
 	if (atomic_read(&fs_info->scrubs_running)) {
 		atomic_inc(&fs_info->scrub_cancel_req);
 		/* sleep without the lock, re-check with it held */
 		while (atomic_read(&fs_info->scrubs_running)) {
 			mutex_unlock(&fs_info->scrub_lock);
 			wait_event(fs_info->scrub_pause_wait,
 				   atomic_read(&fs_info->scrubs_running) == 0);
 			mutex_lock(&fs_info->scrub_lock);
 		}
 		atomic_dec(&fs_info->scrub_cancel_req);
 		rc = 0;
 	}
 	mutex_unlock(&fs_info->scrub_lock);
 	return rc;
 }
 /*
  * Request cancellation of the scrub attached to @dev and wait until the
  * device no longer has a scrub context.
  *
  * Returns -ENOTCONN if @dev has no scrub context on entry, 0 otherwise.
  */
 int btrfs_scrub_cancel_dev(struct btrfs_fs_info *fs_info,
 			   struct btrfs_device *dev)
 {
 	struct scrub_ctx *ctx;
 	int rc = 0;
 	mutex_lock(&fs_info->scrub_lock);
 	ctx = dev->scrub_device;
 	if (!ctx) {
 		rc = -ENOTCONN;
 		goto unlock;
 	}
 	atomic_inc(&ctx->cancel_req);
 	/* sleep without the lock, re-check with it held */
 	while (dev->scrub_device) {
 		mutex_unlock(&fs_info->scrub_lock);
 		wait_event(fs_info->scrub_pause_wait,
 			   dev->scrub_device == NULL);
 		mutex_lock(&fs_info->scrub_lock);
 	}
 unlock:
 	mutex_unlock(&fs_info->scrub_lock);
 	return rc;
 }
 /*
  * Copy the current scrub statistics of device @devid into @progress.
  *
  * Returns 0 when a scrub context exists (and @progress was filled),
  * -ENOTCONN when the device has no scrub context, -ENODEV when the device
  * is not found.
  */
 int btrfs_scrub_progress(struct btrfs_root *root, u64 devid,
 			 struct btrfs_scrub_progress *progress)
 {
 	struct btrfs_device *dev;
 	struct scrub_ctx *sctx;
 	int rc;
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	dev = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 	if (!dev) {
 		rc = -ENODEV;
 	} else {
 		sctx = dev->scrub_device;
 		if (sctx) {
 			memcpy(progress, &sctx->stat, sizeof(*progress));
 			rc = 0;
 		} else {
 			rc = -ENOTCONN;
 		}
 	}
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 	return rc;
 }
 /*
  * Map @extent_logical for reading and, if the mapping succeeds, covers at
  * least @extent_len bytes and its first stripe has a block device,
  * overwrite *@extent_physical, *@extent_dev and *@extent_mirror_num with
  * the first stripe's values. Otherwise the outputs are left untouched.
  */
 static void scrub_remap_extent(struct btrfs_fs_info *fs_info,
 			       u64 extent_logical, u64 extent_len,
 			       u64 *extent_physical,
 			       struct btrfs_device **extent_dev,
 			       int *extent_mirror_num)
 {
 	struct btrfs_bio *bbio = NULL;
 	u64 mapped = extent_len;
 	int err;
 	err = btrfs_map_block(fs_info, READ, extent_logical,
 			      &mapped, &bbio, 0);
 	if (!err && bbio && mapped >= extent_len &&
 	    bbio->stripes[0].dev->bdev) {
 		*extent_physical = bbio->stripes[0].physical;
 		*extent_mirror_num = bbio->mirror_num;
 		*extent_dev = bbio->stripes[0].dev;
 	}
 	kfree(bbio);
 }
 /*
  * Initialise the write context @wr_ctx: its lock and current-bio pointer
  * always, and for dev-replace also the per-bio page limit, the target
  * device and the flush flag. @sctx and @fs_info are not used here.
  *
  * Always returns 0.
  */
 static int scrub_setup_wr_ctx(struct scrub_ctx *sctx,
 			      struct scrub_wr_ctx *wr_ctx,
 			      struct btrfs_fs_info *fs_info,
 			      struct btrfs_device *dev,
 			      int is_dev_replace)
 {
 	WARN_ON(wr_ctx->wr_curr_bio != NULL);
 	mutex_init(&wr_ctx->wr_lock);
 	wr_ctx->wr_curr_bio = NULL;
 	if (is_dev_replace) {
 		WARN_ON(!dev->bdev);
 		wr_ctx->pages_per_wr_bio = min_t(int, SCRUB_PAGES_PER_WR_BIO,
 						 bio_get_nr_vecs(dev->bdev));
 		wr_ctx->tgtdev = dev;
 		atomic_set(&wr_ctx->flush_all_writes, 0);
 	}
 	return 0;
 }
 /*
  * Free the current write bio of @wr_ctx, if any, and clear the pointer.
  * Done with wr_lock held.
  */
 static void scrub_free_wr_ctx(struct scrub_wr_ctx *wr_ctx)
 {
 	struct mutex *lock = &wr_ctx->wr_lock;
 	mutex_lock(lock);
 	kfree(wr_ctx->wr_curr_bio);
 	wr_ctx->wr_curr_bio = NULL;
 	mutex_unlock(lock);
 }
 /*
  * Allocate a nocow copy request describing @logical/@len/@mirror_num and
  * the dev-replace target offset, and queue it on the nocow workqueue where
  * copy_nocow_pages_worker() handles it.
  *
  * Returns 0 once queued, or -ENOMEM (after bumping stat.malloc_errors) if
  * the request cannot be allocated.
  */
 static int copy_nocow_pages(struct scrub_ctx *sctx, u64 logical, u64 len,
 			    int mirror_num, u64 physical_for_dev_replace)
 {
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct scrub_copy_nocow_ctx *job;
 	job = kzalloc(sizeof(*job), GFP_NOFS);
 	if (!job) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
 	/* the worker drops this count again when it finishes */
 	scrub_pending_trans_workers_inc(sctx);
 	job->sctx = sctx;
 	job->logical = logical;
 	job->len = len;
 	job->mirror_num = mirror_num;
 	job->physical_for_dev_replace = physical_for_dev_replace;
 	INIT_LIST_HEAD(&job->inodes);
 	btrfs_init_work(&job->work, copy_nocow_pages_worker, NULL, NULL);
 	btrfs_queue_work(fs_info->scrub_nocow_workers, &job->work);
 	return 0;
 }
 /*
  * Callback handed to iterate_inodes_from_logical(): remember one
  * (inode number, file offset, root) triple at the tail of the inode list
  * of the nocow context passed in @ctx.
  *
  * Returns 0, or -ENOMEM if the list entry cannot be allocated.
  */
 static int record_inode_for_nocow(u64 inum, u64 offset, u64 root, void *ctx)
 {
 	struct scrub_copy_nocow_ctx *job = ctx;
 	struct scrub_nocow_inode *item = kzalloc(sizeof(*item), GFP_NOFS);
 	if (!item)
 		return -ENOMEM;
 	item->root = root;
 	item->inum = inum;
 	item->offset = offset;
 	list_add_tail(&item->list, &job->inodes);
 	return 0;
 }
 /* returned by copy_nocow_pages_for_inode() once its page loop has run */
 #define COPY_COMPLETE 1
 /*
  * Work function for a request queued by copy_nocow_pages().
  *
  * Collects the inodes referencing the logical address (inside a joined
  * transaction), then tries them one after another with
  * copy_nocow_pages_for_inode() until one returns non-zero. Failure to
  * allocate a path, join the transaction or collect the inodes is counted
  * as an uncorrectable dev-replace read error. The request, any inode
  * entries left on its list and the path are freed, and the pending worker
  * count is dropped, before returning.
  */
 static void copy_nocow_pages_worker(struct btrfs_work *work)
 {
 	struct scrub_copy_nocow_ctx *nocow_ctx =
 		container_of(work, struct scrub_copy_nocow_ctx, work);
 	struct scrub_ctx *sctx = nocow_ctx->sctx;
 	struct btrfs_fs_info *fs_info = sctx->dev_root->fs_info;
 	struct btrfs_root *root = fs_info->extent_root;
 	struct btrfs_trans_handle *trans = NULL;
 	struct scrub_nocow_inode *entry;
 	struct btrfs_path *path;
 	u64 logical = nocow_ctx->logical;
 	u64 len = nocow_ctx->len;
 	u64 physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	int mirror_num = nocow_ctx->mirror_num;
 	int failed = 0;
 	int ret;
 	path = btrfs_alloc_path();
 	if (!path) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		spin_unlock(&sctx->stat_lock);
 		failed = 1;
 		goto out;
 	}
 	trans = btrfs_join_transaction(root);
 	if (IS_ERR(trans)) {
 		failed = 1;
 		goto out;
 	}
 	ret = iterate_inodes_from_logical(logical, fs_info, path,
 					  record_inode_for_nocow, nocow_ctx);
 	/* -ENOENT is tolerated; the inode list is then simply empty */
 	if (ret != 0 && ret != -ENOENT) {
 		btrfs_warn(fs_info, "iterate_inodes_from_logical() failed: log %llu, "
 			"phys %llu, len %llu, mir %u, ret %d",
 			logical, physical_for_dev_replace, len, mirror_num,
 			ret);
 		failed = 1;
 		goto out;
 	}
 	btrfs_end_transaction(trans, root);
 	trans = NULL;
 	for (;;) {
 		if (list_empty(&nocow_ctx->inodes))
 			break;
 		entry = list_first_entry(&nocow_ctx->inodes,
 					 struct scrub_nocow_inode,
 					 list);
 		list_del_init(&entry->list);
 		ret = copy_nocow_pages_for_inode(entry->inum, entry->offset,
 						 entry->root, nocow_ctx);
 		kfree(entry);
 		/* COPY_COMPLETE and errors alike end the walk */
 		if (ret)
 			break;
 	}
 out:
 	/* drop whatever inode entries were not consumed above */
 	for (;;) {
 		if (list_empty(&nocow_ctx->inodes))
 			break;
 		entry = list_first_entry(&nocow_ctx->inodes,
 					 struct scrub_nocow_inode,
 					 list);
 		list_del_init(&entry->list);
 		kfree(entry);
 	}
 	if (trans && !IS_ERR(trans))
 		btrfs_end_transaction(trans, root);
 	if (failed)
 		btrfs_dev_replace_stats_inc(&fs_info->dev_replace.
 					    num_uncorrectable_read_errors);
 	btrfs_free_path(path);
 	kfree(nocow_ctx);
 	scrub_pending_trans_workers_dec(sctx);
 }
 /*
  * Copy the pages of one inode that back the nocow extent described by
  * @nocow_ctx to the dev-replace target.
  *
  * @inum:   inode number
  * @offset: file offset inside the inode at which the extent is referenced
  * @root:   objectid of the subvolume root the inode lives in
  *
  * The inode's i_mutex is held and the file range [offset, offset + len - 1]
  * is extent-locked for the whole operation; both are released on every
  * exit path after they were taken.
  *
  * Returns:
  *   COPY_COMPLETE  the page loop ran (a per-page error inside the loop only
  *                  stops the loop, it is not reported to the caller)
  *   0              an ordered extent is pending on the range, or the file
  *                  extent no longer covers the logical range - the caller
  *                  moves on to the next inode
  *   negative errno root/inode/extent lookup failed, or -ENOMEM when a page
  *                  could not be obtained
  */
 static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root,
 				      struct scrub_copy_nocow_ctx *nocow_ctx)
 {
 	struct btrfs_fs_info *fs_info = nocow_ctx->sctx->dev_root->fs_info;
 	struct btrfs_key key;
 	struct inode *inode;
 	struct page *page;
 	struct btrfs_root *local_root;
 	struct btrfs_ordered_extent *ordered;
 	struct extent_map *em;
 	struct extent_state *cached_state = NULL;
 	struct extent_io_tree *io_tree;
 	u64 physical_for_dev_replace;
 	u64 len = nocow_ctx->len;
 	u64 lockstart = offset, lockend = offset + len - 1;
 	unsigned long index;
 	int srcu_index;
 	int ret = 0;
 	int err = 0;
 	key.objectid = root;
 	key.type = BTRFS_ROOT_ITEM_KEY;
 	key.offset = (u64)-1;
 	/* subvol_srcu is held across the root lookup and the iget */
 	srcu_index = srcu_read_lock(&fs_info->subvol_srcu);
 	local_root = btrfs_read_fs_root_no_name(fs_info, &key);
 	if (IS_ERR(local_root)) {
 		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 		return PTR_ERR(local_root);
 	}
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.objectid = inum;
 	key.offset = 0;
 	inode = btrfs_iget(fs_info->sb, &key, local_root, NULL);
 	srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
 	if (IS_ERR(inode))
 		return PTR_ERR(inode);
 	/* Avoid truncate/dio/punch hole.. */
 	mutex_lock(&inode->i_mutex);
 	inode_dio_wait(inode);
 	physical_for_dev_replace = nocow_ctx->physical_for_dev_replace;
 	io_tree = &BTRFS_I(inode)->io_tree;
 	lock_extent_bits(io_tree, lockstart, lockend, 0, &cached_state);
 	ordered = btrfs_lookup_ordered_range(inode, lockstart, len);
 	if (ordered) {
 		btrfs_put_ordered_extent(ordered);
 		goto out_unlock;
 	}
 	em = btrfs_get_extent(inode, NULL, 0, lockstart, len, 0);
 	if (IS_ERR(em)) {
 		ret = PTR_ERR(em);
 		goto out_unlock;
 	}
 	/*
 	 * This extent does not actually cover the logical extent anymore,
 	 * move on to the next inode.
 	 */
 	if (em->block_start > nocow_ctx->logical ||
 	    em->block_start + em->block_len < nocow_ctx->logical + len) {
 		free_extent_map(em);
 		goto out_unlock;
 	}
 	free_extent_map(em);
 	while (len >= PAGE_CACHE_SIZE) {
 		index = offset >> PAGE_CACHE_SHIFT;
 again:
 		page = find_or_create_page(inode->i_mapping, index, GFP_NOFS);
 		if (!page) {
 			btrfs_err(fs_info, "find_or_create_page() failed");
 			ret = -ENOMEM;
 			/*
 			 * The extent range is still locked here, so leave
 			 * through the path that unlocks it.
 			 */
 			goto out_unlock;
 		}
 		if (PageUptodate(page)) {
 			/* uptodate and dirty: skip writing this page */
 			if (PageDirty(page))
 				goto next_page;
 		} else {
 			ClearPageError(page);
 			err = extent_read_full_page_nolock(io_tree, page,
 							   btrfs_get_extent,
 							   nocow_ctx->mirror_num);
 			if (err) {
 				ret = err;
 				goto next_page;
 			}
 			lock_page(page);
 			/*
 			 * If the page has been removed from the page cache,
 			 * the data on it is meaningless, because it may be
 			 * old one, the new data may be written into the new
 			 * page in the page cache.
 			 */
 			if (page->mapping != inode->i_mapping) {
 				unlock_page(page);
 				page_cache_release(page);
 				goto again;
 			}
 			if (!PageUptodate(page)) {
 				ret = -EIO;
 				goto next_page;
 			}
 		}
 		err = write_page_nocow(nocow_ctx->sctx,
 				       physical_for_dev_replace, page);
 		if (err)
 			ret = err;
 next_page:
 		unlock_page(page);
 		page_cache_release(page);
 		if (ret)
 			break;
 		offset += PAGE_CACHE_SIZE;
 		physical_for_dev_replace += PAGE_CACHE_SIZE;
 		len -= PAGE_CACHE_SIZE;
 	}
 	/* reached whether the loop finished or stopped on a page error */
 	ret = COPY_COMPLETE;
 out_unlock:
 	unlock_extent_cached(io_tree, lockstart, lockend, &cached_state,
 			     GFP_NOFS);
 	mutex_unlock(&inode->i_mutex);
 	iput(inode);
 	return ret;
 }
 /*
  * Synchronously write one page to the dev-replace target device at byte
  * offset @physical_for_dev_replace.
  *
  * Returns 0 on success, -ENOMEM (after bumping stat.malloc_errors) if no
  * bio can be allocated, or -EIO when there is no target device / block
  * device, the page cannot be added to the bio, or the write fails. The
  * last two cases also bump the device's write error statistic.
  */
 static int write_page_nocow(struct scrub_ctx *sctx,
 			    u64 physical_for_dev_replace, struct page *page)
 {
 	struct btrfs_device *tgt = sctx->wr_ctx.tgtdev;
 	struct bio *wbio;
 	int added;
 	if (!tgt)
 		return -EIO;
 	if (!tgt->bdev) {
 		printk_ratelimited(KERN_WARNING
 			"BTRFS: scrub write_page_nocow(bdev == NULL) is unexpected!\n");
 		return -EIO;
 	}
 	wbio = btrfs_io_bio_alloc(GFP_NOFS, 1);
 	if (!wbio) {
 		spin_lock(&sctx->stat_lock);
 		sctx->stat.malloc_errors++;
 		spin_unlock(&sctx->stat_lock);
 		return -ENOMEM;
 	}
 	wbio->bi_bdev = tgt->bdev;
 	/* byte offset converted to 512-byte sectors */
 	wbio->bi_iter.bi_sector = physical_for_dev_replace >> 9;
 	wbio->bi_iter.bi_size = 0;
 	added = bio_add_page(wbio, page, PAGE_CACHE_SIZE, 0);
 	if (added != PAGE_CACHE_SIZE)
 		goto write_failed;
 	if (btrfsic_submit_bio_wait(WRITE_SYNC, wbio))
 		goto write_failed;
 	bio_put(wbio);
 	return 0;
 write_failed:
 	bio_put(wbio);
 	btrfs_dev_stat_inc_and_print(tgt, BTRFS_DEV_STAT_WRITE_ERRS);
 	return -EIO;
 }