Commit 475901aff15841fb0a81e7546517407779a9b061

Authored by Jonathan Brassow
Committed by NeilBrown
1 parent 4c0ca26bd2

MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)

The MD RAID10 'far' and 'offset' algorithms make copies of entire stripe
widths - copying them to a different location on the same devices after
shifting the stripe.  An example layout of each follows below:

	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 L    G    H    I    J    K
	            ...

		"offset" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 G    H    I    J    K    L
	 L    G    H    I    J    K
	            ...

Redundancy for these algorithms is gained by shifting the copied stripes
one device to the right.  This patch proposes that the array be divided into
sets of adjacent devices and when the stripe copies are shifted, they wrap
on set boundaries rather than the array size boundary.  That is, for the
purposes of shifting, the copies are confined to their sets within the
array.  The sets are 'near_copies * far_copies' in size.

The above "far" algorithm example would change to:
	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 B    A    D    C    F    E  --> Copy of stripe0, shifted 1, 2-dev sets
	 H    G    J    I    L    K      Dev sets are 1-2, 3-4, 5-6
	            ...
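
For clarity, the shift computation (mirroring the __raid10_find_phys()
change further down in the diff) reduces to the following sketch.  The
helper name 'far_copy_dev' is purely illustrative, and raid_disks is
assumed to be a multiple of far_set_size (= near_copies * far_copies):

	/* Illustrative sketch only - not the kernel code itself */
	static int far_copy_dev(int d, int near_copies, int far_set_size,
				int use_far_sets, int raid_disks)
	{
		int set = d / far_set_size;

		if (!use_far_sets)
			/* old behaviour: copies wrap on the array boundary */
			return (d + near_copies) % raid_disks;

		/* new behaviour: copies wrap on their set boundary */
		return (d + near_copies) % far_set_size + set * far_set_size;
	}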

This has the effect of improving the redundancy of the array.  We can
always sustain at least one failure, but sometimes more than one can
be handled.  In the first examples, the pairs of devices that CANNOT fail
together are:
	(1,2) (2,3) (3,4) (4,5) (5,6) (1,6)  [40% of possible pairs]
In the example where the copies are confined to sets, the pairs of
devices that cannot fail together are:
	(1,2) (3,4) (5,6)                    [20% of possible pairs]
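
These percentages can be checked with a small stand-alone program.  The
snippet below is illustrative only: it hard-codes near_copies = 1,
far_copies = 2 and 6 devices, and re-implements the shift rather than
calling into the driver.  Passing far_set_size = 6 reproduces the old
behaviour, far_set_size = 2 the new set-confined behaviour:

	#include <stdio.h>

	/* device holding the far copy of the chunk whose first copy is on d */
	static int far_dev(int d, int far_set_size)
	{
		int set = d / far_set_size;
		return (d + 1) % far_set_size + set * far_set_size;
	}

	int main(void)
	{
		int sizes[] = { 6, 2 };	/* old vs. set-confined shifting */
		int s, i, j;

		for (s = 0; s < 2; s++) {
			int fatal = 0, total = 0;
			for (i = 0; i < 6; i++)
				for (j = i + 1; j < 6; j++) {
					total++;
					/* pair (i,j) is fatal if some chunk
					 * keeps both of its copies on i and j */
					if (far_dev(i, sizes[s]) == j ||
					    far_dev(j, sizes[s]) == i)
						fatal++;
				}
			printf("far_set_size=%d: %d of %d pairs cannot fail together\n",
			       sizes[s], fatal, total);
		}
		return 0;
	}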

We cannot simply replace the old algorithms, so the 17th bit of the 'layout'
variable is used to indicate whether we use the old or new method of computing
the shift.  (This is similar to the way the 16th bit indicates whether the
"far" algorithm or the "offset" algorithm is being used.)

This patch only handles the cases where the number of total raid disks is
a multiple of 'far_copies'.  A follow-on patch addresses the condition where
this is not true.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

Showing 2 changed files with 45 additions and 18 deletions

1 /* 1 /*
2 * raid10.c : Multiple Devices driver for Linux 2 * raid10.c : Multiple Devices driver for Linux
3 * 3 *
4 * Copyright (C) 2000-2004 Neil Brown 4 * Copyright (C) 2000-2004 Neil Brown
5 * 5 *
6 * RAID-10 support for md. 6 * RAID-10 support for md.
7 * 7 *
8 * Base on code in raid1.c. See raid1.c for further copyright information. 8 * Base on code in raid1.c. See raid1.c for further copyright information.
9 * 9 *
10 * 10 *
11 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option) 13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version. 14 * any later version.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free 17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/blkdev.h> 23 #include <linux/blkdev.h>
24 #include <linux/module.h> 24 #include <linux/module.h>
25 #include <linux/seq_file.h> 25 #include <linux/seq_file.h>
26 #include <linux/ratelimit.h> 26 #include <linux/ratelimit.h>
27 #include <linux/kthread.h> 27 #include <linux/kthread.h>
28 #include "md.h" 28 #include "md.h"
29 #include "raid10.h" 29 #include "raid10.h"
30 #include "raid0.h" 30 #include "raid0.h"
31 #include "bitmap.h" 31 #include "bitmap.h"
32 32
33 /* 33 /*
34 * RAID10 provides a combination of RAID0 and RAID1 functionality. 34 * RAID10 provides a combination of RAID0 and RAID1 functionality.
35 * The layout of data is defined by 35 * The layout of data is defined by
36 * chunk_size 36 * chunk_size
37 * raid_disks 37 * raid_disks
38 * near_copies (stored in low byte of layout) 38 * near_copies (stored in low byte of layout)
39 * far_copies (stored in second byte of layout) 39 * far_copies (stored in second byte of layout)
40 * far_offset (stored in bit 16 of layout ) 40 * far_offset (stored in bit 16 of layout )
41 * use_far_sets (stored in bit 17 of layout )
41 * 42 *
42 * The data to be stored is divided into chunks using chunksize. 43 * The data to be stored is divided into chunks using chunksize. Each device
43 * Each device is divided into far_copies sections. 44 * is divided into far_copies sections. In each section, chunks are laid out
44 * In each section, chunks are laid out in a style similar to raid0, but 45 * in a style similar to raid0, but near_copies copies of each chunk is stored
45 * near_copies copies of each chunk is stored (each on a different drive). 46 * (each on a different drive). The starting device for each section is offset
46 * The starting device for each section is offset near_copies from the starting 47 * near_copies from the starting device of the previous section. Thus there
47 * device of the previous section. 48 * are (near_copies * far_copies) of each chunk, and each is on a different
48 * Thus they are (near_copies*far_copies) of each chunk, and each is on a different 49 * drive. near_copies and far_copies must be at least one, and their product
49 * drive. 50 * is at most raid_disks.
50 * near_copies and far_copies must be at least one, and their product is at most
51 * raid_disks.
52 * 51 *
53 * If far_offset is true, then the far_copies are handled a bit differently. 52 * If far_offset is true, then the far_copies are handled a bit differently.
54 * The copies are still in different stripes, but instead of be very far apart 53 * The copies are still in different stripes, but instead of being very far
55 * on disk, there are adjacent stripes. 54 * apart on disk, there are adjacent stripes.
55 *
56 * The far and offset algorithms are handled slightly differently if
57 * 'use_far_sets' is true. In this case, the array's devices are grouped into
58 * sets that are (near_copies * far_copies) in size. The far copied stripes
59 * are still shifted by 'near_copies' devices, but this shifting stays confined
60 * to the set rather than the entire array. This is done to improve the number
61 * of device combinations that can fail without causing the array to fail.
62 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
63 * on a device):
64 * A B C D A B C D E
65 * ... ...
66 * D A B C E A B C D
67 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
68 * [A B] [C D] [A B] [C D E]
69 * |...| |...| |...| | ... |
70 * [B A] [D C] [B A] [E C D]
56 */ 71 */
57 72
58 /* 73 /*
59 * Number of guaranteed r10bios in case of extreme VM load: 74 * Number of guaranteed r10bios in case of extreme VM load:
60 */ 75 */
61 #define NR_RAID10_BIOS 256 76 #define NR_RAID10_BIOS 256
62 77
63 /* when we get a read error on a read-only array, we redirect to another 78 /* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to 79 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio 80 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 81 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */ 82 */
68 #define IO_BLOCKED ((struct bio *)1) 83 #define IO_BLOCKED ((struct bio *)1)
69 /* When we successfully write to a known bad-block, we need to remove the 84 /* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record 85 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD 86 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */ 87 */
73 #define IO_MADE_GOOD ((struct bio *)2) 88 #define IO_MADE_GOOD ((struct bio *)2)
74 89
75 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 90 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76 91
77 /* When there are this many requests queued to be written by 92 /* When there are this many requests queued to be written by
78 * the raid10 thread, we become 'congested' to provide back-pressure 93 * the raid10 thread, we become 'congested' to provide back-pressure
79 * for writeback. 94 * for writeback.
80 */ 95 */
81 static int max_queued_requests = 1024; 96 static int max_queued_requests = 1024;
82 97
83 static void allow_barrier(struct r10conf *conf); 98 static void allow_barrier(struct r10conf *conf);
84 static void lower_barrier(struct r10conf *conf); 99 static void lower_barrier(struct r10conf *conf);
85 static int enough(struct r10conf *conf, int ignore); 100 static int enough(struct r10conf *conf, int ignore);
86 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 101 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped); 102 int *skipped);
88 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 103 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89 static void end_reshape_write(struct bio *bio, int error); 104 static void end_reshape_write(struct bio *bio, int error);
90 static void end_reshape(struct r10conf *conf); 105 static void end_reshape(struct r10conf *conf);
91 106
92 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 107 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93 { 108 {
94 struct r10conf *conf = data; 109 struct r10conf *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]); 110 int size = offsetof(struct r10bio, devs[conf->copies]);
96 111
97 /* allocate a r10bio with room for raid_disks entries in the 112 /* allocate a r10bio with room for raid_disks entries in the
98 * bios array */ 113 * bios array */
99 return kzalloc(size, gfp_flags); 114 return kzalloc(size, gfp_flags);
100 } 115 }
101 116
102 static void r10bio_pool_free(void *r10_bio, void *data) 117 static void r10bio_pool_free(void *r10_bio, void *data)
103 { 118 {
104 kfree(r10_bio); 119 kfree(r10_bio);
105 } 120 }
106 121
107 /* Maximum size of each resync request */ 122 /* Maximum size of each resync request */
108 #define RESYNC_BLOCK_SIZE (64*1024) 123 #define RESYNC_BLOCK_SIZE (64*1024)
109 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 124 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
110 /* amount of memory to reserve for resync requests */ 125 /* amount of memory to reserve for resync requests */
111 #define RESYNC_WINDOW (1024*1024) 126 #define RESYNC_WINDOW (1024*1024)
112 /* maximum number of concurrent requests, memory permitting */ 127 /* maximum number of concurrent requests, memory permitting */
113 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) 128 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
114 129
115 /* 130 /*
116 * When performing a resync, we need to read and compare, so 131 * When performing a resync, we need to read and compare, so
117 * we need as many pages are there are copies. 132 * we need as many pages are there are copies.
118 * When performing a recovery, we need 2 bios, one for read, 133 * When performing a recovery, we need 2 bios, one for read,
119 * one for write (we recover only one drive per r10buf) 134 * one for write (we recover only one drive per r10buf)
120 * 135 *
121 */ 136 */
122 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 137 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 { 138 {
124 struct r10conf *conf = data; 139 struct r10conf *conf = data;
125 struct page *page; 140 struct page *page;
126 struct r10bio *r10_bio; 141 struct r10bio *r10_bio;
127 struct bio *bio; 142 struct bio *bio;
128 int i, j; 143 int i, j;
129 int nalloc; 144 int nalloc;
130 145
131 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 146 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
132 if (!r10_bio) 147 if (!r10_bio)
133 return NULL; 148 return NULL;
134 149
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 150 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 151 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies; /* resync */ 152 nalloc = conf->copies; /* resync */
138 else 153 else
139 nalloc = 2; /* recovery */ 154 nalloc = 2; /* recovery */
140 155
141 /* 156 /*
142 * Allocate bios. 157 * Allocate bios.
143 */ 158 */
144 for (j = nalloc ; j-- ; ) { 159 for (j = nalloc ; j-- ; ) {
145 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); 160 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
146 if (!bio) 161 if (!bio)
147 goto out_free_bio; 162 goto out_free_bio;
148 r10_bio->devs[j].bio = bio; 163 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement) 164 if (!conf->have_replacement)
150 continue; 165 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); 166 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio) 167 if (!bio)
153 goto out_free_bio; 168 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio; 169 r10_bio->devs[j].repl_bio = bio;
155 } 170 }
156 /* 171 /*
157 * Allocate RESYNC_PAGES data pages and attach them 172 * Allocate RESYNC_PAGES data pages and attach them
158 * where needed. 173 * where needed.
159 */ 174 */
160 for (j = 0 ; j < nalloc; j++) { 175 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio; 176 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio; 177 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) { 178 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC, 179 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) { 180 &conf->mddev->recovery)) {
166 /* we can share bv_page's during recovery 181 /* we can share bv_page's during recovery
167 * and reshape */ 182 * and reshape */
168 struct bio *rbio = r10_bio->devs[0].bio; 183 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page; 184 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page); 185 get_page(page);
171 } else 186 } else
172 page = alloc_page(gfp_flags); 187 page = alloc_page(gfp_flags);
173 if (unlikely(!page)) 188 if (unlikely(!page))
174 goto out_free_pages; 189 goto out_free_pages;
175 190
176 bio->bi_io_vec[i].bv_page = page; 191 bio->bi_io_vec[i].bv_page = page;
177 if (rbio) 192 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page; 193 rbio->bi_io_vec[i].bv_page = page;
179 } 194 }
180 } 195 }
181 196
182 return r10_bio; 197 return r10_bio;
183 198
184 out_free_pages: 199 out_free_pages:
185 for ( ; i > 0 ; i--) 200 for ( ; i > 0 ; i--)
186 safe_put_page(bio->bi_io_vec[i-1].bv_page); 201 safe_put_page(bio->bi_io_vec[i-1].bv_page);
187 while (j--) 202 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++) 203 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 204 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0; 205 j = 0;
191 out_free_bio: 206 out_free_bio:
192 for ( ; j < nalloc; j++) { 207 for ( ; j < nalloc; j++) {
193 if (r10_bio->devs[j].bio) 208 if (r10_bio->devs[j].bio)
194 bio_put(r10_bio->devs[j].bio); 209 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio) 210 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio); 211 bio_put(r10_bio->devs[j].repl_bio);
197 } 212 }
198 r10bio_pool_free(r10_bio, conf); 213 r10bio_pool_free(r10_bio, conf);
199 return NULL; 214 return NULL;
200 } 215 }
201 216
202 static void r10buf_pool_free(void *__r10_bio, void *data) 217 static void r10buf_pool_free(void *__r10_bio, void *data)
203 { 218 {
204 int i; 219 int i;
205 struct r10conf *conf = data; 220 struct r10conf *conf = data;
206 struct r10bio *r10bio = __r10_bio; 221 struct r10bio *r10bio = __r10_bio;
207 int j; 222 int j;
208 223
209 for (j=0; j < conf->copies; j++) { 224 for (j=0; j < conf->copies; j++) {
210 struct bio *bio = r10bio->devs[j].bio; 225 struct bio *bio = r10bio->devs[j].bio;
211 if (bio) { 226 if (bio) {
212 for (i = 0; i < RESYNC_PAGES; i++) { 227 for (i = 0; i < RESYNC_PAGES; i++) {
213 safe_put_page(bio->bi_io_vec[i].bv_page); 228 safe_put_page(bio->bi_io_vec[i].bv_page);
214 bio->bi_io_vec[i].bv_page = NULL; 229 bio->bi_io_vec[i].bv_page = NULL;
215 } 230 }
216 bio_put(bio); 231 bio_put(bio);
217 } 232 }
218 bio = r10bio->devs[j].repl_bio; 233 bio = r10bio->devs[j].repl_bio;
219 if (bio) 234 if (bio)
220 bio_put(bio); 235 bio_put(bio);
221 } 236 }
222 r10bio_pool_free(r10bio, conf); 237 r10bio_pool_free(r10bio, conf);
223 } 238 }
224 239
225 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 240 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
226 { 241 {
227 int i; 242 int i;
228 243
229 for (i = 0; i < conf->copies; i++) { 244 for (i = 0; i < conf->copies; i++) {
230 struct bio **bio = & r10_bio->devs[i].bio; 245 struct bio **bio = & r10_bio->devs[i].bio;
231 if (!BIO_SPECIAL(*bio)) 246 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio); 247 bio_put(*bio);
233 *bio = NULL; 248 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio; 249 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) 250 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio); 251 bio_put(*bio);
237 *bio = NULL; 252 *bio = NULL;
238 } 253 }
239 } 254 }
240 255
241 static void free_r10bio(struct r10bio *r10_bio) 256 static void free_r10bio(struct r10bio *r10_bio)
242 { 257 {
243 struct r10conf *conf = r10_bio->mddev->private; 258 struct r10conf *conf = r10_bio->mddev->private;
244 259
245 put_all_bios(conf, r10_bio); 260 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool); 261 mempool_free(r10_bio, conf->r10bio_pool);
247 } 262 }
248 263
249 static void put_buf(struct r10bio *r10_bio) 264 static void put_buf(struct r10bio *r10_bio)
250 { 265 {
251 struct r10conf *conf = r10_bio->mddev->private; 266 struct r10conf *conf = r10_bio->mddev->private;
252 267
253 mempool_free(r10_bio, conf->r10buf_pool); 268 mempool_free(r10_bio, conf->r10buf_pool);
254 269
255 lower_barrier(conf); 270 lower_barrier(conf);
256 } 271 }
257 272
258 static void reschedule_retry(struct r10bio *r10_bio) 273 static void reschedule_retry(struct r10bio *r10_bio)
259 { 274 {
260 unsigned long flags; 275 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev; 276 struct mddev *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private; 277 struct r10conf *conf = mddev->private;
263 278
264 spin_lock_irqsave(&conf->device_lock, flags); 279 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list); 280 list_add(&r10_bio->retry_list, &conf->retry_list);
266 conf->nr_queued ++; 281 conf->nr_queued ++;
267 spin_unlock_irqrestore(&conf->device_lock, flags); 282 spin_unlock_irqrestore(&conf->device_lock, flags);
268 283
269 /* wake up frozen array... */ 284 /* wake up frozen array... */
270 wake_up(&conf->wait_barrier); 285 wake_up(&conf->wait_barrier);
271 286
272 md_wakeup_thread(mddev->thread); 287 md_wakeup_thread(mddev->thread);
273 } 288 }
274 289
275 /* 290 /*
276 * raid_end_bio_io() is called when we have finished servicing a mirrored 291 * raid_end_bio_io() is called when we have finished servicing a mirrored
277 * operation and are ready to return a success/failure code to the buffer 292 * operation and are ready to return a success/failure code to the buffer
278 * cache layer. 293 * cache layer.
279 */ 294 */
280 static void raid_end_bio_io(struct r10bio *r10_bio) 295 static void raid_end_bio_io(struct r10bio *r10_bio)
281 { 296 {
282 struct bio *bio = r10_bio->master_bio; 297 struct bio *bio = r10_bio->master_bio;
283 int done; 298 int done;
284 struct r10conf *conf = r10_bio->mddev->private; 299 struct r10conf *conf = r10_bio->mddev->private;
285 300
286 if (bio->bi_phys_segments) { 301 if (bio->bi_phys_segments) {
287 unsigned long flags; 302 unsigned long flags;
288 spin_lock_irqsave(&conf->device_lock, flags); 303 spin_lock_irqsave(&conf->device_lock, flags);
289 bio->bi_phys_segments--; 304 bio->bi_phys_segments--;
290 done = (bio->bi_phys_segments == 0); 305 done = (bio->bi_phys_segments == 0);
291 spin_unlock_irqrestore(&conf->device_lock, flags); 306 spin_unlock_irqrestore(&conf->device_lock, flags);
292 } else 307 } else
293 done = 1; 308 done = 1;
294 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 309 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
295 clear_bit(BIO_UPTODATE, &bio->bi_flags); 310 clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 if (done) { 311 if (done) {
297 bio_endio(bio, 0); 312 bio_endio(bio, 0);
298 /* 313 /*
299 * Wake up any possible resync thread that waits for the device 314 * Wake up any possible resync thread that waits for the device
300 * to go idle. 315 * to go idle.
301 */ 316 */
302 allow_barrier(conf); 317 allow_barrier(conf);
303 } 318 }
304 free_r10bio(r10_bio); 319 free_r10bio(r10_bio);
305 } 320 }
306 321
307 /* 322 /*
308 * Update disk head position estimator based on IRQ completion info. 323 * Update disk head position estimator based on IRQ completion info.
309 */ 324 */
310 static inline void update_head_pos(int slot, struct r10bio *r10_bio) 325 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
311 { 326 {
312 struct r10conf *conf = r10_bio->mddev->private; 327 struct r10conf *conf = r10_bio->mddev->private;
313 328
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 329 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors); 330 r10_bio->devs[slot].addr + (r10_bio->sectors);
316 } 331 }
317 332
318 /* 333 /*
319 * Find the disk number which triggered given bio 334 * Find the disk number which triggered given bio
320 */ 335 */
321 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 336 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
322 struct bio *bio, int *slotp, int *replp) 337 struct bio *bio, int *slotp, int *replp)
323 { 338 {
324 int slot; 339 int slot;
325 int repl = 0; 340 int repl = 0;
326 341
327 for (slot = 0; slot < conf->copies; slot++) { 342 for (slot = 0; slot < conf->copies; slot++) {
328 if (r10_bio->devs[slot].bio == bio) 343 if (r10_bio->devs[slot].bio == bio)
329 break; 344 break;
330 if (r10_bio->devs[slot].repl_bio == bio) { 345 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1; 346 repl = 1;
332 break; 347 break;
333 } 348 }
334 } 349 }
335 350
336 BUG_ON(slot == conf->copies); 351 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio); 352 update_head_pos(slot, r10_bio);
338 353
339 if (slotp) 354 if (slotp)
340 *slotp = slot; 355 *slotp = slot;
341 if (replp) 356 if (replp)
342 *replp = repl; 357 *replp = repl;
343 return r10_bio->devs[slot].devnum; 358 return r10_bio->devs[slot].devnum;
344 } 359 }
345 360
346 static void raid10_end_read_request(struct bio *bio, int error) 361 static void raid10_end_read_request(struct bio *bio, int error)
347 { 362 {
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 363 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private; 364 struct r10bio *r10_bio = bio->bi_private;
350 int slot, dev; 365 int slot, dev;
351 struct md_rdev *rdev; 366 struct md_rdev *rdev;
352 struct r10conf *conf = r10_bio->mddev->private; 367 struct r10conf *conf = r10_bio->mddev->private;
353 368
354 369
355 slot = r10_bio->read_slot; 370 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum; 371 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev; 372 rdev = r10_bio->devs[slot].rdev;
358 /* 373 /*
359 * this branch is our 'one mirror IO has finished' event handler: 374 * this branch is our 'one mirror IO has finished' event handler:
360 */ 375 */
361 update_head_pos(slot, r10_bio); 376 update_head_pos(slot, r10_bio);
362 377
363 if (uptodate) { 378 if (uptodate) {
364 /* 379 /*
365 * Set R10BIO_Uptodate in our master bio, so that 380 * Set R10BIO_Uptodate in our master bio, so that
366 * we will return a good error code to the higher 381 * we will return a good error code to the higher
367 * levels even if IO on some other mirrored buffer fails. 382 * levels even if IO on some other mirrored buffer fails.
368 * 383 *
369 * The 'master' represents the composite IO operation to 384 * The 'master' represents the composite IO operation to
370 * user-side. So if something waits for IO, then it will 385 * user-side. So if something waits for IO, then it will
371 * wait for the 'master' bio. 386 * wait for the 'master' bio.
372 */ 387 */
373 set_bit(R10BIO_Uptodate, &r10_bio->state); 388 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else { 389 } else {
375 /* If all other devices that store this block have 390 /* If all other devices that store this block have
376 * failed, we want to return the error upwards rather 391 * failed, we want to return the error upwards rather
377 * than fail the last device. Here we redefine 392 * than fail the last device. Here we redefine
378 * "uptodate" to mean "Don't want to retry" 393 * "uptodate" to mean "Don't want to retry"
379 */ 394 */
380 unsigned long flags; 395 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags); 396 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk)) 397 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1; 398 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags); 399 spin_unlock_irqrestore(&conf->device_lock, flags);
385 } 400 }
386 if (uptodate) { 401 if (uptodate) {
387 raid_end_bio_io(r10_bio); 402 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev); 403 rdev_dec_pending(rdev, conf->mddev);
389 } else { 404 } else {
390 /* 405 /*
391 * oops, read error - keep the refcount on the rdev 406 * oops, read error - keep the refcount on the rdev
392 */ 407 */
393 char b[BDEVNAME_SIZE]; 408 char b[BDEVNAME_SIZE];
394 printk_ratelimited(KERN_ERR 409 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n", 410 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev), 411 mdname(conf->mddev),
397 bdevname(rdev->bdev, b), 412 bdevname(rdev->bdev, b),
398 (unsigned long long)r10_bio->sector); 413 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state); 414 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio); 415 reschedule_retry(r10_bio);
401 } 416 }
402 } 417 }
403 418
404 static void close_write(struct r10bio *r10_bio) 419 static void close_write(struct r10bio *r10_bio)
405 { 420 {
406 /* clear the bitmap if all writes complete successfully */ 421 /* clear the bitmap if all writes complete successfully */
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 422 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
408 r10_bio->sectors, 423 r10_bio->sectors,
409 !test_bit(R10BIO_Degraded, &r10_bio->state), 424 !test_bit(R10BIO_Degraded, &r10_bio->state),
410 0); 425 0);
411 md_write_end(r10_bio->mddev); 426 md_write_end(r10_bio->mddev);
412 } 427 }
413 428
414 static void one_write_done(struct r10bio *r10_bio) 429 static void one_write_done(struct r10bio *r10_bio)
415 { 430 {
416 if (atomic_dec_and_test(&r10_bio->remaining)) { 431 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 432 if (test_bit(R10BIO_WriteError, &r10_bio->state))
418 reschedule_retry(r10_bio); 433 reschedule_retry(r10_bio);
419 else { 434 else {
420 close_write(r10_bio); 435 close_write(r10_bio);
421 if (test_bit(R10BIO_MadeGood, &r10_bio->state)) 436 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
422 reschedule_retry(r10_bio); 437 reschedule_retry(r10_bio);
423 else 438 else
424 raid_end_bio_io(r10_bio); 439 raid_end_bio_io(r10_bio);
425 } 440 }
426 } 441 }
427 } 442 }
428 443
429 static void raid10_end_write_request(struct bio *bio, int error) 444 static void raid10_end_write_request(struct bio *bio, int error)
430 { 445 {
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 446 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private; 447 struct r10bio *r10_bio = bio->bi_private;
433 int dev; 448 int dev;
434 int dec_rdev = 1; 449 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private; 450 struct r10conf *conf = r10_bio->mddev->private;
436 int slot, repl; 451 int slot, repl;
437 struct md_rdev *rdev = NULL; 452 struct md_rdev *rdev = NULL;
438 453
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 454 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
440 455
441 if (repl) 456 if (repl)
442 rdev = conf->mirrors[dev].replacement; 457 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) { 458 if (!rdev) {
444 smp_rmb(); 459 smp_rmb();
445 repl = 0; 460 repl = 0;
446 rdev = conf->mirrors[dev].rdev; 461 rdev = conf->mirrors[dev].rdev;
447 } 462 }
448 /* 463 /*
449 * this branch is our 'one mirror IO has finished' event handler: 464 * this branch is our 'one mirror IO has finished' event handler:
450 */ 465 */
451 if (!uptodate) { 466 if (!uptodate) {
452 if (repl) 467 if (repl)
453 /* Never record new bad blocks to replacement, 468 /* Never record new bad blocks to replacement,
454 * just fail it. 469 * just fail it.
455 */ 470 */
456 md_error(rdev->mddev, rdev); 471 md_error(rdev->mddev, rdev);
457 else { 472 else {
458 set_bit(WriteErrorSeen, &rdev->flags); 473 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 474 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED, 475 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery); 476 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state); 477 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0; 478 dec_rdev = 0;
464 } 479 }
465 } else { 480 } else {
466 /* 481 /*
467 * Set R10BIO_Uptodate in our master bio, so that 482 * Set R10BIO_Uptodate in our master bio, so that
468 * we will return a good error code for to the higher 483 * we will return a good error code for to the higher
469 * levels even if IO on some other mirrored buffer fails. 484 * levels even if IO on some other mirrored buffer fails.
470 * 485 *
471 * The 'master' represents the composite IO operation to 486 * The 'master' represents the composite IO operation to
472 * user-side. So if something waits for IO, then it will 487 * user-side. So if something waits for IO, then it will
473 * wait for the 'master' bio. 488 * wait for the 'master' bio.
474 */ 489 */
475 sector_t first_bad; 490 sector_t first_bad;
476 int bad_sectors; 491 int bad_sectors;
477 492
478 set_bit(R10BIO_Uptodate, &r10_bio->state); 493 set_bit(R10BIO_Uptodate, &r10_bio->state);
479 494
480 /* Maybe we can clear some bad blocks. */ 495 /* Maybe we can clear some bad blocks. */
481 if (is_badblock(rdev, 496 if (is_badblock(rdev,
482 r10_bio->devs[slot].addr, 497 r10_bio->devs[slot].addr,
483 r10_bio->sectors, 498 r10_bio->sectors,
484 &first_bad, &bad_sectors)) { 499 &first_bad, &bad_sectors)) {
485 bio_put(bio); 500 bio_put(bio);
486 if (repl) 501 if (repl)
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; 502 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else 503 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD; 504 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0; 505 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state); 506 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 } 507 }
493 } 508 }
494 509
495 /* 510 /*
496 * 511 *
497 * Let's see if all mirrored write operations have finished 512 * Let's see if all mirrored write operations have finished
498 * already. 513 * already.
499 */ 514 */
500 one_write_done(r10_bio); 515 one_write_done(r10_bio);
501 if (dec_rdev) 516 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev); 517 rdev_dec_pending(rdev, conf->mddev);
503 } 518 }
504 519
505 /* 520 /*
506 * RAID10 layout manager 521 * RAID10 layout manager
507 * As well as the chunksize and raid_disks count, there are two 522 * As well as the chunksize and raid_disks count, there are two
508 * parameters: near_copies and far_copies. 523 * parameters: near_copies and far_copies.
509 * near_copies * far_copies must be <= raid_disks. 524 * near_copies * far_copies must be <= raid_disks.
510 * Normally one of these will be 1. 525 * Normally one of these will be 1.
511 * If both are 1, we get raid0. 526 * If both are 1, we get raid0.
512 * If near_copies == raid_disks, we get raid1. 527 * If near_copies == raid_disks, we get raid1.
513 * 528 *
514 * Chunks are laid out in raid0 style with near_copies copies of the 529 * Chunks are laid out in raid0 style with near_copies copies of the
515 * first chunk, followed by near_copies copies of the next chunk and 530 * first chunk, followed by near_copies copies of the next chunk and
516 * so on. 531 * so on.
517 * If far_copies > 1, then after 1/far_copies of the array has been assigned 532 * If far_copies > 1, then after 1/far_copies of the array has been assigned
518 * as described above, we start again with a device offset of near_copies. 533 * as described above, we start again with a device offset of near_copies.
519 * So we effectively have another copy of the whole array further down all 534 * So we effectively have another copy of the whole array further down all
520 * the drives, but with blocks on different drives. 535 * the drives, but with blocks on different drives.
521 * With this layout, and block is never stored twice on the one device. 536 * With this layout, and block is never stored twice on the one device.
522 * 537 *
523 * raid10_find_phys finds the sector offset of a given virtual sector 538 * raid10_find_phys finds the sector offset of a given virtual sector
524 * on each device that it is on. 539 * on each device that it is on.
525 * 540 *
526 * raid10_find_virt does the reverse mapping, from a device and a 541 * raid10_find_virt does the reverse mapping, from a device and a
527 * sector offset to a virtual address 542 * sector offset to a virtual address
528 */ 543 */
529 544
530 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 545 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
531 { 546 {
532 int n,f; 547 int n,f;
533 sector_t sector; 548 sector_t sector;
534 sector_t chunk; 549 sector_t chunk;
535 sector_t stripe; 550 sector_t stripe;
536 int dev; 551 int dev;
537 int slot = 0; 552 int slot = 0;
538 553
539 /* now calculate first sector/dev */ 554 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 555 chunk = r10bio->sector >> geo->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask; 556 sector = r10bio->sector & geo->chunk_mask;
542 557
543 chunk *= geo->near_copies; 558 chunk *= geo->near_copies;
544 stripe = chunk; 559 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks); 560 dev = sector_div(stripe, geo->raid_disks);
546 if (geo->far_offset) 561 if (geo->far_offset)
547 stripe *= geo->far_copies; 562 stripe *= geo->far_copies;
548 563
549 sector += stripe << geo->chunk_shift; 564 sector += stripe << geo->chunk_shift;
550 565
551 /* and calculate all the others */ 566 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 567 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev; 568 int d = dev;
569 int set;
554 sector_t s = sector; 570 sector_t s = sector;
555 r10bio->devs[slot].devnum = d; 571 r10bio->devs[slot].devnum = d;
556 r10bio->devs[slot].addr = s; 572 r10bio->devs[slot].addr = s;
557 slot++; 573 slot++;
558 574
559 for (f = 1; f < geo->far_copies; f++) { 575 for (f = 1; f < geo->far_copies; f++) {
576 set = d / geo->far_set_size;
560 d += geo->near_copies; 577 d += geo->near_copies;
561 d %= geo->raid_disks; 578 d %= geo->far_set_size;
579 d += geo->far_set_size * set;
580
562 s += geo->stride; 581 s += geo->stride;
563 r10bio->devs[slot].devnum = d; 582 r10bio->devs[slot].devnum = d;
564 r10bio->devs[slot].addr = s; 583 r10bio->devs[slot].addr = s;
565 slot++; 584 slot++;
566 } 585 }
567 dev++; 586 dev++;
568 if (dev >= geo->raid_disks) { 587 if (dev >= geo->raid_disks) {
569 dev = 0; 588 dev = 0;
570 sector += (geo->chunk_mask + 1); 589 sector += (geo->chunk_mask + 1);
571 } 590 }
572 } 591 }
573 } 592 }
574 593
575 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 594 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
576 { 595 {
577 struct geom *geo = &conf->geo; 596 struct geom *geo = &conf->geo;
578 597
579 if (conf->reshape_progress != MaxSector && 598 if (conf->reshape_progress != MaxSector &&
580 ((r10bio->sector >= conf->reshape_progress) != 599 ((r10bio->sector >= conf->reshape_progress) !=
581 conf->mddev->reshape_backwards)) { 600 conf->mddev->reshape_backwards)) {
582 set_bit(R10BIO_Previous, &r10bio->state); 601 set_bit(R10BIO_Previous, &r10bio->state);
583 geo = &conf->prev; 602 geo = &conf->prev;
584 } else 603 } else
585 clear_bit(R10BIO_Previous, &r10bio->state); 604 clear_bit(R10BIO_Previous, &r10bio->state);
586 605
587 __raid10_find_phys(geo, r10bio); 606 __raid10_find_phys(geo, r10bio);
588 } 607 }
589 608
590 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 609 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
591 { 610 {
592 sector_t offset, chunk, vchunk; 611 sector_t offset, chunk, vchunk;
593 /* Never use conf->prev as this is only called during resync 612 /* Never use conf->prev as this is only called during resync
594 * or recovery, so reshape isn't happening 613 * or recovery, so reshape isn't happening
595 */ 614 */
596 struct geom *geo = &conf->geo; 615 struct geom *geo = &conf->geo;
616 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
617 int far_set_size = geo->far_set_size;
597 618
598 offset = sector & geo->chunk_mask; 619 offset = sector & geo->chunk_mask;
599 if (geo->far_offset) { 620 if (geo->far_offset) {
600 int fc; 621 int fc;
601 chunk = sector >> geo->chunk_shift; 622 chunk = sector >> geo->chunk_shift;
602 fc = sector_div(chunk, geo->far_copies); 623 fc = sector_div(chunk, geo->far_copies);
603 dev -= fc * geo->near_copies; 624 dev -= fc * geo->near_copies;
604 if (dev < 0) 625 if (dev < far_set_start)
605 dev += geo->raid_disks; 626 dev += far_set_size;
606 } else { 627 } else {
607 while (sector >= geo->stride) { 628 while (sector >= geo->stride) {
608 sector -= geo->stride; 629 sector -= geo->stride;
609 if (dev < geo->near_copies) 630 if (dev < (geo->near_copies + far_set_start))
610 dev += geo->raid_disks - geo->near_copies; 631 dev += far_set_size - geo->near_copies;
611 else 632 else
612 dev -= geo->near_copies; 633 dev -= geo->near_copies;
613 } 634 }
614 chunk = sector >> geo->chunk_shift; 635 chunk = sector >> geo->chunk_shift;
615 } 636 }
616 vchunk = chunk * geo->raid_disks + dev; 637 vchunk = chunk * geo->raid_disks + dev;
617 sector_div(vchunk, geo->near_copies); 638 sector_div(vchunk, geo->near_copies);
618 return (vchunk << geo->chunk_shift) + offset; 639 return (vchunk << geo->chunk_shift) + offset;
619 } 640 }
620 641
621 /** 642 /**
622 * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged 643 * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
623 * @q: request queue 644 * @q: request queue
624 * @bvm: properties of new bio 645 * @bvm: properties of new bio
625 * @biovec: the request that could be merged to it. 646 * @biovec: the request that could be merged to it.
626 * 647 *
627 * Return amount of bytes we can accept at this offset 648 * Return amount of bytes we can accept at this offset
628 * This requires checking for end-of-chunk if near_copies != raid_disks, 649 * This requires checking for end-of-chunk if near_copies != raid_disks,
629 * and for subordinate merge_bvec_fns if merge_check_needed. 650 * and for subordinate merge_bvec_fns if merge_check_needed.
630 */ 651 */
631 static int raid10_mergeable_bvec(struct request_queue *q, 652 static int raid10_mergeable_bvec(struct request_queue *q,
632 struct bvec_merge_data *bvm, 653 struct bvec_merge_data *bvm,
633 struct bio_vec *biovec) 654 struct bio_vec *biovec)
634 { 655 {
635 struct mddev *mddev = q->queuedata; 656 struct mddev *mddev = q->queuedata;
636 struct r10conf *conf = mddev->private; 657 struct r10conf *conf = mddev->private;
637 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 658 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
638 int max; 659 int max;
639 unsigned int chunk_sectors; 660 unsigned int chunk_sectors;
640 unsigned int bio_sectors = bvm->bi_size >> 9; 661 unsigned int bio_sectors = bvm->bi_size >> 9;
641 struct geom *geo = &conf->geo; 662 struct geom *geo = &conf->geo;
642 663
643 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; 664 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
644 if (conf->reshape_progress != MaxSector && 665 if (conf->reshape_progress != MaxSector &&
645 ((sector >= conf->reshape_progress) != 666 ((sector >= conf->reshape_progress) !=
646 conf->mddev->reshape_backwards)) 667 conf->mddev->reshape_backwards))
647 geo = &conf->prev; 668 geo = &conf->prev;
648 669
649 if (geo->near_copies < geo->raid_disks) { 670 if (geo->near_copies < geo->raid_disks) {
650 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 671 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
651 + bio_sectors)) << 9; 672 + bio_sectors)) << 9;
652 if (max < 0) 673 if (max < 0)
653 /* bio_add cannot handle a negative return */ 674 /* bio_add cannot handle a negative return */
654 max = 0; 675 max = 0;
655 if (max <= biovec->bv_len && bio_sectors == 0) 676 if (max <= biovec->bv_len && bio_sectors == 0)
656 return biovec->bv_len; 677 return biovec->bv_len;
657 } else 678 } else
658 max = biovec->bv_len; 679 max = biovec->bv_len;
659 680
660 if (mddev->merge_check_needed) { 681 if (mddev->merge_check_needed) {
661 struct { 682 struct {
662 struct r10bio r10_bio; 683 struct r10bio r10_bio;
663 struct r10dev devs[conf->copies]; 684 struct r10dev devs[conf->copies];
664 } on_stack; 685 } on_stack;
665 struct r10bio *r10_bio = &on_stack.r10_bio; 686 struct r10bio *r10_bio = &on_stack.r10_bio;
666 int s; 687 int s;
667 if (conf->reshape_progress != MaxSector) { 688 if (conf->reshape_progress != MaxSector) {
668 /* Cannot give any guidance during reshape */ 689 /* Cannot give any guidance during reshape */
669 if (max <= biovec->bv_len && bio_sectors == 0) 690 if (max <= biovec->bv_len && bio_sectors == 0)
670 return biovec->bv_len; 691 return biovec->bv_len;
671 return 0; 692 return 0;
672 } 693 }
673 r10_bio->sector = sector; 694 r10_bio->sector = sector;
674 raid10_find_phys(conf, r10_bio); 695 raid10_find_phys(conf, r10_bio);
675 rcu_read_lock(); 696 rcu_read_lock();
676 for (s = 0; s < conf->copies; s++) { 697 for (s = 0; s < conf->copies; s++) {
677 int disk = r10_bio->devs[s].devnum; 698 int disk = r10_bio->devs[s].devnum;
678 struct md_rdev *rdev = rcu_dereference( 699 struct md_rdev *rdev = rcu_dereference(
679 conf->mirrors[disk].rdev); 700 conf->mirrors[disk].rdev);
680 if (rdev && !test_bit(Faulty, &rdev->flags)) { 701 if (rdev && !test_bit(Faulty, &rdev->flags)) {
681 struct request_queue *q = 702 struct request_queue *q =
682 bdev_get_queue(rdev->bdev); 703 bdev_get_queue(rdev->bdev);
683 if (q->merge_bvec_fn) { 704 if (q->merge_bvec_fn) {
684 bvm->bi_sector = r10_bio->devs[s].addr 705 bvm->bi_sector = r10_bio->devs[s].addr
685 + rdev->data_offset; 706 + rdev->data_offset;
686 bvm->bi_bdev = rdev->bdev; 707 bvm->bi_bdev = rdev->bdev;
687 max = min(max, q->merge_bvec_fn( 708 max = min(max, q->merge_bvec_fn(
688 q, bvm, biovec)); 709 q, bvm, biovec));
689 } 710 }
690 } 711 }
691 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
692 if (rdev && !test_bit(Faulty, &rdev->flags)) { 713 if (rdev && !test_bit(Faulty, &rdev->flags)) {
693 struct request_queue *q = 714 struct request_queue *q =
694 bdev_get_queue(rdev->bdev); 715 bdev_get_queue(rdev->bdev);
695 if (q->merge_bvec_fn) { 716 if (q->merge_bvec_fn) {
696 bvm->bi_sector = r10_bio->devs[s].addr 717 bvm->bi_sector = r10_bio->devs[s].addr
697 + rdev->data_offset; 718 + rdev->data_offset;
698 bvm->bi_bdev = rdev->bdev; 719 bvm->bi_bdev = rdev->bdev;
699 max = min(max, q->merge_bvec_fn( 720 max = min(max, q->merge_bvec_fn(
700 q, bvm, biovec)); 721 q, bvm, biovec));
701 } 722 }
702 } 723 }
703 } 724 }
704 rcu_read_unlock(); 725 rcu_read_unlock();
705 } 726 }
706 return max; 727 return max;
707 } 728 }
708 729
709 /* 730 /*
710 * This routine returns the disk from which the requested read should 731 * This routine returns the disk from which the requested read should
711 * be done. There is a per-array 'next expected sequential IO' sector 732 * be done. There is a per-array 'next expected sequential IO' sector
712 * number - if this matches on the next IO then we use the last disk. 733 * number - if this matches on the next IO then we use the last disk.
713 * There is also a per-disk 'last know head position' sector that is 734 * There is also a per-disk 'last know head position' sector that is
714 * maintained from IRQ contexts, both the normal and the resync IO 735 * maintained from IRQ contexts, both the normal and the resync IO
715 * completion handlers update this position correctly. If there is no 736 * completion handlers update this position correctly. If there is no
716 * perfect sequential match then we pick the disk whose head is closest. 737 * perfect sequential match then we pick the disk whose head is closest.
717 * 738 *
718 * If there are 2 mirrors in the same 2 devices, performance degrades 739 * If there are 2 mirrors in the same 2 devices, performance degrades
719 * because position is mirror, not device based. 740 * because position is mirror, not device based.
720 * 741 *
721 * The rdev for the device selected will have nr_pending incremented. 742 * The rdev for the device selected will have nr_pending incremented.
722 */ 743 */
723 744
724 /* 745 /*
725 * FIXME: possibly should rethink readbalancing and do it differently 746 * FIXME: possibly should rethink readbalancing and do it differently
726 * depending on near_copies / far_copies geometry. 747 * depending on near_copies / far_copies geometry.
727 */ 748 */
728 static struct md_rdev *read_balance(struct r10conf *conf, 749 static struct md_rdev *read_balance(struct r10conf *conf,
729 struct r10bio *r10_bio, 750 struct r10bio *r10_bio,
730 int *max_sectors) 751 int *max_sectors)
731 { 752 {
732 const sector_t this_sector = r10_bio->sector; 753 const sector_t this_sector = r10_bio->sector;
733 int disk, slot; 754 int disk, slot;
734 int sectors = r10_bio->sectors; 755 int sectors = r10_bio->sectors;
735 int best_good_sectors; 756 int best_good_sectors;
736 sector_t new_distance, best_dist; 757 sector_t new_distance, best_dist;
737 struct md_rdev *best_rdev, *rdev = NULL; 758 struct md_rdev *best_rdev, *rdev = NULL;
738 int do_balance; 759 int do_balance;
739 int best_slot; 760 int best_slot;
740 struct geom *geo = &conf->geo; 761 struct geom *geo = &conf->geo;
741 762
742 raid10_find_phys(conf, r10_bio); 763 raid10_find_phys(conf, r10_bio);
743 rcu_read_lock(); 764 rcu_read_lock();
744 retry: 765 retry:
745 sectors = r10_bio->sectors; 766 sectors = r10_bio->sectors;
746 best_slot = -1; 767 best_slot = -1;
747 best_rdev = NULL; 768 best_rdev = NULL;
748 best_dist = MaxSector; 769 best_dist = MaxSector;
749 best_good_sectors = 0; 770 best_good_sectors = 0;
750 do_balance = 1; 771 do_balance = 1;
751 /* 772 /*
752 * Check if we can balance. We can balance on the whole 773 * Check if we can balance. We can balance on the whole
753 * device if no resync is going on (recovery is ok), or below 774 * device if no resync is going on (recovery is ok), or below
754 * the resync window. We take the first readable disk when 775 * the resync window. We take the first readable disk when
755 * above the resync window. 776 * above the resync window.
756 */ 777 */
757 if (conf->mddev->recovery_cp < MaxSector 778 if (conf->mddev->recovery_cp < MaxSector
758 && (this_sector + sectors >= conf->next_resync)) 779 && (this_sector + sectors >= conf->next_resync))
759 do_balance = 0; 780 do_balance = 0;
760 781
761 for (slot = 0; slot < conf->copies ; slot++) { 782 for (slot = 0; slot < conf->copies ; slot++) {
762 sector_t first_bad; 783 sector_t first_bad;
763 int bad_sectors; 784 int bad_sectors;
764 sector_t dev_sector; 785 sector_t dev_sector;
765 786
766 if (r10_bio->devs[slot].bio == IO_BLOCKED) 787 if (r10_bio->devs[slot].bio == IO_BLOCKED)
767 continue; 788 continue;
768 disk = r10_bio->devs[slot].devnum; 789 disk = r10_bio->devs[slot].devnum;
769 rdev = rcu_dereference(conf->mirrors[disk].replacement); 790 rdev = rcu_dereference(conf->mirrors[disk].replacement);
770 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 791 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
771 test_bit(Unmerged, &rdev->flags) || 792 test_bit(Unmerged, &rdev->flags) ||
772 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 793 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
773 rdev = rcu_dereference(conf->mirrors[disk].rdev); 794 rdev = rcu_dereference(conf->mirrors[disk].rdev);
774 if (rdev == NULL || 795 if (rdev == NULL ||
775 test_bit(Faulty, &rdev->flags) || 796 test_bit(Faulty, &rdev->flags) ||
776 test_bit(Unmerged, &rdev->flags)) 797 test_bit(Unmerged, &rdev->flags))
777 continue; 798 continue;
778 if (!test_bit(In_sync, &rdev->flags) && 799 if (!test_bit(In_sync, &rdev->flags) &&
779 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 800 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
780 continue; 801 continue;
781 802
782 dev_sector = r10_bio->devs[slot].addr; 803 dev_sector = r10_bio->devs[slot].addr;
783 if (is_badblock(rdev, dev_sector, sectors, 804 if (is_badblock(rdev, dev_sector, sectors,
784 &first_bad, &bad_sectors)) { 805 &first_bad, &bad_sectors)) {
785 if (best_dist < MaxSector) 806 if (best_dist < MaxSector)
786 /* Already have a better slot */ 807 /* Already have a better slot */
787 continue; 808 continue;
788 if (first_bad <= dev_sector) { 809 if (first_bad <= dev_sector) {
789 /* Cannot read here. If this is the 810 /* Cannot read here. If this is the
790 * 'primary' device, then we must not read 811 * 'primary' device, then we must not read
791 * beyond 'bad_sectors' from another device. 812 * beyond 'bad_sectors' from another device.
792 */ 813 */
793 bad_sectors -= (dev_sector - first_bad); 814 bad_sectors -= (dev_sector - first_bad);
794 if (!do_balance && sectors > bad_sectors) 815 if (!do_balance && sectors > bad_sectors)
795 sectors = bad_sectors; 816 sectors = bad_sectors;
796 if (best_good_sectors > sectors) 817 if (best_good_sectors > sectors)
797 best_good_sectors = sectors; 818 best_good_sectors = sectors;
798 } else { 819 } else {
799 sector_t good_sectors = 820 sector_t good_sectors =
800 first_bad - dev_sector; 821 first_bad - dev_sector;
801 if (good_sectors > best_good_sectors) { 822 if (good_sectors > best_good_sectors) {
802 best_good_sectors = good_sectors; 823 best_good_sectors = good_sectors;
803 best_slot = slot; 824 best_slot = slot;
804 best_rdev = rdev; 825 best_rdev = rdev;
805 } 826 }
806 if (!do_balance) 827 if (!do_balance)
807 /* Must read from here */ 828 /* Must read from here */
808 break; 829 break;
809 } 830 }
810 continue; 831 continue;
811 } else 832 } else
812 best_good_sectors = sectors; 833 best_good_sectors = sectors;
813 834
814 if (!do_balance) 835 if (!do_balance)
815 break; 836 break;
816 837
817 /* This optimisation is debatable, and completely destroys 838 /* This optimisation is debatable, and completely destroys
818 * sequential read speed for 'far copies' arrays. So only 839 * sequential read speed for 'far copies' arrays. So only
819 * keep it for 'near' arrays, and review those later. 840 * keep it for 'near' arrays, and review those later.
820 */ 841 */
821 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 842 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
822 break; 843 break;
823 844
824 /* for far > 1 always use the lowest address */ 845 /* for far > 1 always use the lowest address */
825 if (geo->far_copies > 1) 846 if (geo->far_copies > 1)
826 new_distance = r10_bio->devs[slot].addr; 847 new_distance = r10_bio->devs[slot].addr;
827 else 848 else
828 new_distance = abs(r10_bio->devs[slot].addr - 849 new_distance = abs(r10_bio->devs[slot].addr -
829 conf->mirrors[disk].head_position); 850 conf->mirrors[disk].head_position);
830 if (new_distance < best_dist) { 851 if (new_distance < best_dist) {
831 best_dist = new_distance; 852 best_dist = new_distance;
832 best_slot = slot; 853 best_slot = slot;
833 best_rdev = rdev; 854 best_rdev = rdev;
834 } 855 }
835 } 856 }
836 if (slot >= conf->copies) { 857 if (slot >= conf->copies) {
837 slot = best_slot; 858 slot = best_slot;
838 rdev = best_rdev; 859 rdev = best_rdev;
839 } 860 }
840 861
841 if (slot >= 0) { 862 if (slot >= 0) {
842 atomic_inc(&rdev->nr_pending); 863 atomic_inc(&rdev->nr_pending);
843 if (test_bit(Faulty, &rdev->flags)) { 864 if (test_bit(Faulty, &rdev->flags)) {
844 /* Cannot risk returning a device that failed 865 /* Cannot risk returning a device that failed
845 * before we inc'ed nr_pending 866 * before we inc'ed nr_pending
846 */ 867 */
847 rdev_dec_pending(rdev, conf->mddev); 868 rdev_dec_pending(rdev, conf->mddev);
848 goto retry; 869 goto retry;
849 } 870 }
850 r10_bio->read_slot = slot; 871 r10_bio->read_slot = slot;
851 } else 872 } else
852 rdev = NULL; 873 rdev = NULL;
853 rcu_read_unlock(); 874 rcu_read_unlock();
854 *max_sectors = best_good_sectors; 875 *max_sectors = best_good_sectors;
855 876
856 return rdev; 877 return rdev;
857 } 878 }
858 879
859 int md_raid10_congested(struct mddev *mddev, int bits) 880 int md_raid10_congested(struct mddev *mddev, int bits)
860 { 881 {
861 struct r10conf *conf = mddev->private; 882 struct r10conf *conf = mddev->private;
862 int i, ret = 0; 883 int i, ret = 0;
863 884
864 if ((bits & (1 << BDI_async_congested)) && 885 if ((bits & (1 << BDI_async_congested)) &&
865 conf->pending_count >= max_queued_requests) 886 conf->pending_count >= max_queued_requests)
866 return 1; 887 return 1;
867 888
868 rcu_read_lock(); 889 rcu_read_lock();
869 for (i = 0; 890 for (i = 0;
870 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 891 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
871 && ret == 0; 892 && ret == 0;
872 i++) { 893 i++) {
873 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 894 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
874 if (rdev && !test_bit(Faulty, &rdev->flags)) { 895 if (rdev && !test_bit(Faulty, &rdev->flags)) {
875 struct request_queue *q = bdev_get_queue(rdev->bdev); 896 struct request_queue *q = bdev_get_queue(rdev->bdev);
876 897
877 ret |= bdi_congested(&q->backing_dev_info, bits); 898 ret |= bdi_congested(&q->backing_dev_info, bits);
878 } 899 }
879 } 900 }
880 rcu_read_unlock(); 901 rcu_read_unlock();
881 return ret; 902 return ret;
882 } 903 }
883 EXPORT_SYMBOL_GPL(md_raid10_congested); 904 EXPORT_SYMBOL_GPL(md_raid10_congested);
884 905
885 static int raid10_congested(void *data, int bits) 906 static int raid10_congested(void *data, int bits)
886 { 907 {
887 struct mddev *mddev = data; 908 struct mddev *mddev = data;
888 909
889 return mddev_congested(mddev, bits) || 910 return mddev_congested(mddev, bits) ||
890 md_raid10_congested(mddev, bits); 911 md_raid10_congested(mddev, bits);
891 } 912 }
892 913
893 static void flush_pending_writes(struct r10conf *conf) 914 static void flush_pending_writes(struct r10conf *conf)
894 { 915 {
895 /* Any writes that have been queued but are awaiting 916 /* Any writes that have been queued but are awaiting
896 * bitmap updates get flushed here. 917 * bitmap updates get flushed here.
897 */ 918 */
898 spin_lock_irq(&conf->device_lock); 919 spin_lock_irq(&conf->device_lock);
899 920
900 if (conf->pending_bio_list.head) { 921 if (conf->pending_bio_list.head) {
901 struct bio *bio; 922 struct bio *bio;
902 bio = bio_list_get(&conf->pending_bio_list); 923 bio = bio_list_get(&conf->pending_bio_list);
903 conf->pending_count = 0; 924 conf->pending_count = 0;
904 spin_unlock_irq(&conf->device_lock); 925 spin_unlock_irq(&conf->device_lock);
905 /* flush any pending bitmap writes to disk 926 /* flush any pending bitmap writes to disk
906 * before proceeding w/ I/O */ 927 * before proceeding w/ I/O */
907 bitmap_unplug(conf->mddev->bitmap); 928 bitmap_unplug(conf->mddev->bitmap);
908 wake_up(&conf->wait_barrier); 929 wake_up(&conf->wait_barrier);
909 930
910 while (bio) { /* submit pending writes */ 931 while (bio) { /* submit pending writes */
911 struct bio *next = bio->bi_next; 932 struct bio *next = bio->bi_next;
912 bio->bi_next = NULL; 933 bio->bi_next = NULL;
913 if (unlikely((bio->bi_rw & REQ_DISCARD) && 934 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
914 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 935 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
915 /* Just ignore it */ 936 /* Just ignore it */
916 bio_endio(bio, 0); 937 bio_endio(bio, 0);
917 else 938 else
918 generic_make_request(bio); 939 generic_make_request(bio);
919 bio = next; 940 bio = next;
920 } 941 }
921 } else 942 } else
922 spin_unlock_irq(&conf->device_lock); 943 spin_unlock_irq(&conf->device_lock);
923 } 944 }
924 945
925 /* Barriers.... 946 /* Barriers....
926 * Sometimes we need to suspend IO while we do something else, 947 * Sometimes we need to suspend IO while we do something else,
927 * either some resync/recovery, or reconfigure the array. 948 * either some resync/recovery, or reconfigure the array.
928 * To do this we raise a 'barrier'. 949 * To do this we raise a 'barrier'.
929 * The 'barrier' is a counter that can be raised multiple times 950 * The 'barrier' is a counter that can be raised multiple times
930 * to count how many activities are happening which preclude 951 * to count how many activities are happening which preclude
931 * normal IO. 952 * normal IO.
932 * We can only raise the barrier if there is no pending IO. 953 * We can only raise the barrier if there is no pending IO.
933 * i.e. if nr_pending == 0. 954 * i.e. if nr_pending == 0.
934 * We choose only to raise the barrier if no-one is waiting for the 955 * We choose only to raise the barrier if no-one is waiting for the
935 * barrier to go down. This means that as soon as an IO request 956 * barrier to go down. This means that as soon as an IO request
936 * is ready, no other operations which require a barrier will start 957 * is ready, no other operations which require a barrier will start
937 * until the IO request has had a chance. 958 * until the IO request has had a chance.
938 * 959 *
939 * So: regular IO calls 'wait_barrier'. When that returns there 960 * So: regular IO calls 'wait_barrier'. When that returns there
940 * is no background IO happening. It must arrange to call 961 * is no background IO happening. It must arrange to call
941 * allow_barrier when it has finished its IO. 962 * allow_barrier when it has finished its IO.
942 * background IO calls must call raise_barrier. Once that returns 963 * background IO calls must call raise_barrier. Once that returns
943 * there is no normal IO happening. It must arrange to call 964 * there is no normal IO happening. It must arrange to call
944 * lower_barrier when the particular background IO completes. 965 * lower_barrier when the particular background IO completes.
945 */ 966 */
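From the caller's side, the contract described above pairs up as follows. This is only an illustrative sketch of how the four functions defined below are meant to be used, not an actual call site in this file:

static void normal_io_example(struct r10conf *conf)
{
	wait_barrier(conf);	/* blocks while resync/recovery holds the barrier */
	/* ... issue the request; nr_pending keeps the barrier from rising ... */
	allow_barrier(conf);	/* called once the IO has finished */
}

static void background_io_example(struct r10conf *conf)
{
	raise_barrier(conf, 0);	/* waits for pending normal IO to drain */
	/* ... do one unit of resync/recovery IO ... */
	lower_barrier(conf);	/* lets normal IO resume */
}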
946 967
947 static void raise_barrier(struct r10conf *conf, int force) 968 static void raise_barrier(struct r10conf *conf, int force)
948 { 969 {
949 BUG_ON(force && !conf->barrier); 970 BUG_ON(force && !conf->barrier);
950 spin_lock_irq(&conf->resync_lock); 971 spin_lock_irq(&conf->resync_lock);
951 972
952 /* Wait until no block IO is waiting (unless 'force') */ 973 /* Wait until no block IO is waiting (unless 'force') */
953 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 974 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
954 conf->resync_lock); 975 conf->resync_lock);
955 976
956 /* block any new IO from starting */ 977 /* block any new IO from starting */
957 conf->barrier++; 978 conf->barrier++;
958 979
959 /* Now wait for all pending IO to complete */ 980 /* Now wait for all pending IO to complete */
960 wait_event_lock_irq(conf->wait_barrier, 981 wait_event_lock_irq(conf->wait_barrier,
961 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 982 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
962 conf->resync_lock); 983 conf->resync_lock);
963 984
964 spin_unlock_irq(&conf->resync_lock); 985 spin_unlock_irq(&conf->resync_lock);
965 } 986 }
966 987
967 static void lower_barrier(struct r10conf *conf) 988 static void lower_barrier(struct r10conf *conf)
968 { 989 {
969 unsigned long flags; 990 unsigned long flags;
970 spin_lock_irqsave(&conf->resync_lock, flags); 991 spin_lock_irqsave(&conf->resync_lock, flags);
971 conf->barrier--; 992 conf->barrier--;
972 spin_unlock_irqrestore(&conf->resync_lock, flags); 993 spin_unlock_irqrestore(&conf->resync_lock, flags);
973 wake_up(&conf->wait_barrier); 994 wake_up(&conf->wait_barrier);
974 } 995 }
975 996
976 static void wait_barrier(struct r10conf *conf) 997 static void wait_barrier(struct r10conf *conf)
977 { 998 {
978 spin_lock_irq(&conf->resync_lock); 999 spin_lock_irq(&conf->resync_lock);
979 if (conf->barrier) { 1000 if (conf->barrier) {
980 conf->nr_waiting++; 1001 conf->nr_waiting++;
981 /* Wait for the barrier to drop. 1002 /* Wait for the barrier to drop.
982 * However if there are already pending 1003 * However if there are already pending
983 * requests (preventing the barrier from 1004 * requests (preventing the barrier from
984 * rising completely), and the 1005 * rising completely), and the
985 * pre-process bio queue isn't empty, 1006 * pre-process bio queue isn't empty,
986 * then don't wait, as we need to empty 1007 * then don't wait, as we need to empty
987 * that queue to get the nr_pending 1008 * that queue to get the nr_pending
988 * count down. 1009 * count down.
989 */ 1010 */
990 wait_event_lock_irq(conf->wait_barrier, 1011 wait_event_lock_irq(conf->wait_barrier,
991 !conf->barrier || 1012 !conf->barrier ||
992 (conf->nr_pending && 1013 (conf->nr_pending &&
993 current->bio_list && 1014 current->bio_list &&
994 !bio_list_empty(current->bio_list)), 1015 !bio_list_empty(current->bio_list)),
995 conf->resync_lock); 1016 conf->resync_lock);
996 conf->nr_waiting--; 1017 conf->nr_waiting--;
997 } 1018 }
998 conf->nr_pending++; 1019 conf->nr_pending++;
999 spin_unlock_irq(&conf->resync_lock); 1020 spin_unlock_irq(&conf->resync_lock);
1000 } 1021 }
1001 1022
1002 static void allow_barrier(struct r10conf *conf) 1023 static void allow_barrier(struct r10conf *conf)
1003 { 1024 {
1004 unsigned long flags; 1025 unsigned long flags;
1005 spin_lock_irqsave(&conf->resync_lock, flags); 1026 spin_lock_irqsave(&conf->resync_lock, flags);
1006 conf->nr_pending--; 1027 conf->nr_pending--;
1007 spin_unlock_irqrestore(&conf->resync_lock, flags); 1028 spin_unlock_irqrestore(&conf->resync_lock, flags);
1008 wake_up(&conf->wait_barrier); 1029 wake_up(&conf->wait_barrier);
1009 } 1030 }
1010 1031
1011 static void freeze_array(struct r10conf *conf) 1032 static void freeze_array(struct r10conf *conf)
1012 { 1033 {
1013 /* stop sync IO and normal IO and wait for everything to 1034 /* stop sync IO and normal IO and wait for everything to
1014 * go quiet. 1035 * go quiet.
1015 * We increment barrier and nr_waiting, and then 1036 * We increment barrier and nr_waiting, and then
1016 * wait until nr_pending matches nr_queued+1 1037 * wait until nr_pending matches nr_queued+1
1017 * This is called in the context of one normal IO request 1038 * This is called in the context of one normal IO request
1018 * that has failed. Thus any sync request that might be pending 1039 * that has failed. Thus any sync request that might be pending
1019 * will be blocked by nr_pending, and we need to wait for 1040 * will be blocked by nr_pending, and we need to wait for
1020 * pending IO requests to complete or be queued for re-try. 1041 * pending IO requests to complete or be queued for re-try.
1021 * Thus the number queued (nr_queued) plus this request (1) 1042 * Thus the number queued (nr_queued) plus this request (1)
1022 * must match the number of pending IOs (nr_pending) before 1043 * must match the number of pending IOs (nr_pending) before
1023 * we continue. 1044 * we continue.
1024 */ 1045 */
1025 spin_lock_irq(&conf->resync_lock); 1046 spin_lock_irq(&conf->resync_lock);
1026 conf->barrier++; 1047 conf->barrier++;
1027 conf->nr_waiting++; 1048 conf->nr_waiting++;
1028 wait_event_lock_irq_cmd(conf->wait_barrier, 1049 wait_event_lock_irq_cmd(conf->wait_barrier,
1029 conf->nr_pending == conf->nr_queued+1, 1050 conf->nr_pending == conf->nr_queued+1,
1030 conf->resync_lock, 1051 conf->resync_lock,
1031 flush_pending_writes(conf)); 1052 flush_pending_writes(conf));
1032 1053
1033 spin_unlock_irq(&conf->resync_lock); 1054 spin_unlock_irq(&conf->resync_lock);
1034 } 1055 }
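As a concrete illustration of the wait condition above (the counts are hypothetical): suppose three normal requests are in flight when one of them fails and freeze_array() is entered on its behalf.

	nr_pending = 3, nr_queued = 0   ->  3 != 0 + 1, keep waiting
	one outstanding request completes:
	nr_pending = 2, nr_queued = 0   ->  2 != 0 + 1, keep waiting
	the remaining request is queued for retry:
	nr_pending = 2, nr_queued = 1   ->  2 == 1 + 1, the array is frozen

The flush_pending_writes() call passed as the command to wait_event_lock_irq_cmd() keeps queued writes moving so the counts can actually converge.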
1035 1056
1036 static void unfreeze_array(struct r10conf *conf) 1057 static void unfreeze_array(struct r10conf *conf)
1037 { 1058 {
1038 /* reverse the effect of the freeze */ 1059 /* reverse the effect of the freeze */
1039 spin_lock_irq(&conf->resync_lock); 1060 spin_lock_irq(&conf->resync_lock);
1040 conf->barrier--; 1061 conf->barrier--;
1041 conf->nr_waiting--; 1062 conf->nr_waiting--;
1042 wake_up(&conf->wait_barrier); 1063 wake_up(&conf->wait_barrier);
1043 spin_unlock_irq(&conf->resync_lock); 1064 spin_unlock_irq(&conf->resync_lock);
1044 } 1065 }
1045 1066
1046 static sector_t choose_data_offset(struct r10bio *r10_bio, 1067 static sector_t choose_data_offset(struct r10bio *r10_bio,
1047 struct md_rdev *rdev) 1068 struct md_rdev *rdev)
1048 { 1069 {
1049 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || 1070 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1050 test_bit(R10BIO_Previous, &r10_bio->state)) 1071 test_bit(R10BIO_Previous, &r10_bio->state))
1051 return rdev->data_offset; 1072 return rdev->data_offset;
1052 else 1073 else
1053 return rdev->new_data_offset; 1074 return rdev->new_data_offset;
1054 } 1075 }
1055 1076
1056 struct raid10_plug_cb { 1077 struct raid10_plug_cb {
1057 struct blk_plug_cb cb; 1078 struct blk_plug_cb cb;
1058 struct bio_list pending; 1079 struct bio_list pending;
1059 int pending_cnt; 1080 int pending_cnt;
1060 }; 1081 };
1061 1082
1062 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) 1083 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1063 { 1084 {
1064 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, 1085 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1065 cb); 1086 cb);
1066 struct mddev *mddev = plug->cb.data; 1087 struct mddev *mddev = plug->cb.data;
1067 struct r10conf *conf = mddev->private; 1088 struct r10conf *conf = mddev->private;
1068 struct bio *bio; 1089 struct bio *bio;
1069 1090
1070 if (from_schedule || current->bio_list) { 1091 if (from_schedule || current->bio_list) {
1071 spin_lock_irq(&conf->device_lock); 1092 spin_lock_irq(&conf->device_lock);
1072 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1093 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1073 conf->pending_count += plug->pending_cnt; 1094 conf->pending_count += plug->pending_cnt;
1074 spin_unlock_irq(&conf->device_lock); 1095 spin_unlock_irq(&conf->device_lock);
1075 md_wakeup_thread(mddev->thread); 1096 md_wakeup_thread(mddev->thread);
1076 kfree(plug); 1097 kfree(plug);
1077 return; 1098 return;
1078 } 1099 }
1079 1100
1080 /* we aren't scheduling, so we can do the write-out directly. */ 1101 /* we aren't scheduling, so we can do the write-out directly. */
1081 bio = bio_list_get(&plug->pending); 1102 bio = bio_list_get(&plug->pending);
1082 bitmap_unplug(mddev->bitmap); 1103 bitmap_unplug(mddev->bitmap);
1083 wake_up(&conf->wait_barrier); 1104 wake_up(&conf->wait_barrier);
1084 1105
1085 while (bio) { /* submit pending writes */ 1106 while (bio) { /* submit pending writes */
1086 struct bio *next = bio->bi_next; 1107 struct bio *next = bio->bi_next;
1087 bio->bi_next = NULL; 1108 bio->bi_next = NULL;
1088 generic_make_request(bio); 1109 generic_make_request(bio);
1089 bio = next; 1110 bio = next;
1090 } 1111 }
1091 kfree(plug); 1112 kfree(plug);
1092 } 1113 }
1093 1114
1094 static void make_request(struct mddev *mddev, struct bio * bio) 1115 static void make_request(struct mddev *mddev, struct bio * bio)
1095 { 1116 {
1096 struct r10conf *conf = mddev->private; 1117 struct r10conf *conf = mddev->private;
1097 struct r10bio *r10_bio; 1118 struct r10bio *r10_bio;
1098 struct bio *read_bio; 1119 struct bio *read_bio;
1099 int i; 1120 int i;
1100 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1121 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1101 int chunk_sects = chunk_mask + 1; 1122 int chunk_sects = chunk_mask + 1;
1102 const int rw = bio_data_dir(bio); 1123 const int rw = bio_data_dir(bio);
1103 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1124 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1104 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1125 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1105 const unsigned long do_discard = (bio->bi_rw 1126 const unsigned long do_discard = (bio->bi_rw
1106 & (REQ_DISCARD | REQ_SECURE)); 1127 & (REQ_DISCARD | REQ_SECURE));
1107 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); 1128 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1108 unsigned long flags; 1129 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 1130 struct md_rdev *blocked_rdev;
1110 struct blk_plug_cb *cb; 1131 struct blk_plug_cb *cb;
1111 struct raid10_plug_cb *plug = NULL; 1132 struct raid10_plug_cb *plug = NULL;
1112 int sectors_handled; 1133 int sectors_handled;
1113 int max_sectors; 1134 int max_sectors;
1114 int sectors; 1135 int sectors;
1115 1136
1116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1137 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1117 md_flush_request(mddev, bio); 1138 md_flush_request(mddev, bio);
1118 return; 1139 return;
1119 } 1140 }
1120 1141
1121 /* If this request crosses a chunk boundary, we need to 1142 /* If this request crosses a chunk boundary, we need to
1122 * split it. This will only happen for requests of 1 PAGE or less. 1143 * split it. This will only happen for requests of 1 PAGE or less.
1123 */ 1144 */
1124 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 1145 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1125 > chunk_sects 1146 > chunk_sects
1126 && (conf->geo.near_copies < conf->geo.raid_disks 1147 && (conf->geo.near_copies < conf->geo.raid_disks
1127 || conf->prev.near_copies < conf->prev.raid_disks))) { 1148 || conf->prev.near_copies < conf->prev.raid_disks))) {
1128 struct bio_pair *bp; 1149 struct bio_pair *bp;
1129 /* Sanity check -- queue functions should prevent this happening */ 1150 /* Sanity check -- queue functions should prevent this happening */
1130 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 1151 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
1131 bio->bi_idx != 0) 1152 bio->bi_idx != 0)
1132 goto bad_map; 1153 goto bad_map;
1133 /* This is a one page bio that upper layers 1154 /* This is a one page bio that upper layers
1134 * refuse to split for us, so we need to split it. 1155 * refuse to split for us, so we need to split it.
1135 */ 1156 */
1136 bp = bio_split(bio, 1157 bp = bio_split(bio,
1137 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 1158 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1138 1159
1139 /* Each of these 'make_request' calls will call 'wait_barrier'. 1160 /* Each of these 'make_request' calls will call 'wait_barrier'.
1140 * If the first succeeds but the second blocks due to the resync 1161 * If the first succeeds but the second blocks due to the resync
1141 * thread raising the barrier, we will deadlock because the 1162 * thread raising the barrier, we will deadlock because the
1142 * IO to the underlying device will be queued in generic_make_request 1163 * IO to the underlying device will be queued in generic_make_request
1143 * and will never complete, so will never reduce nr_pending. 1164 * and will never complete, so will never reduce nr_pending.
1144 * So increment nr_waiting here so no new raise_barriers will 1165 * So increment nr_waiting here so no new raise_barriers will
1145 * succeed, and so the second wait_barrier cannot block. 1166 * succeed, and so the second wait_barrier cannot block.
1146 */ 1167 */
1147 spin_lock_irq(&conf->resync_lock); 1168 spin_lock_irq(&conf->resync_lock);
1148 conf->nr_waiting++; 1169 conf->nr_waiting++;
1149 spin_unlock_irq(&conf->resync_lock); 1170 spin_unlock_irq(&conf->resync_lock);
1150 1171
1151 make_request(mddev, &bp->bio1); 1172 make_request(mddev, &bp->bio1);
1152 make_request(mddev, &bp->bio2); 1173 make_request(mddev, &bp->bio2);
1153 1174
1154 spin_lock_irq(&conf->resync_lock); 1175 spin_lock_irq(&conf->resync_lock);
1155 conf->nr_waiting--; 1176 conf->nr_waiting--;
1156 wake_up(&conf->wait_barrier); 1177 wake_up(&conf->wait_barrier);
1157 spin_unlock_irq(&conf->resync_lock); 1178 spin_unlock_irq(&conf->resync_lock);
1158 1179
1159 bio_pair_release(bp); 1180 bio_pair_release(bp);
1160 return; 1181 return;
1161 bad_map: 1182 bad_map:
1162 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 1183 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1163 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 1184 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1164 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 1185 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1165 1186
1166 bio_io_error(bio); 1187 bio_io_error(bio);
1167 return; 1188 return;
1168 } 1189 }
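To make the split point above concrete, a worked example with hypothetical values:

	chunk_sects           = 128   (64K chunks)
	bio->bi_sector        = 1000
	offset within chunk   = 1000 & (128 - 1) = 104
	first fragment        = 128 - 104        =  24 sectors
	second fragment start = 1000 + 24        = 1024, the next chunk boundary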
1169 1190
1170 md_write_start(mddev, bio); 1191 md_write_start(mddev, bio);
1171 1192
1172 /* 1193 /*
1173 * Register the new request and wait if the reconstruction 1194 * Register the new request and wait if the reconstruction
1174 * thread has put up a bar for new requests. 1195 * thread has put up a bar for new requests.
1175 * Continue immediately if no resync is active currently. 1196 * Continue immediately if no resync is active currently.
1176 */ 1197 */
1177 wait_barrier(conf); 1198 wait_barrier(conf);
1178 1199
1179 sectors = bio->bi_size >> 9; 1200 sectors = bio->bi_size >> 9;
1180 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1201 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1181 bio->bi_sector < conf->reshape_progress && 1202 bio->bi_sector < conf->reshape_progress &&
1182 bio->bi_sector + sectors > conf->reshape_progress) { 1203 bio->bi_sector + sectors > conf->reshape_progress) {
1183 /* IO spans the reshape position. Need to wait for 1204 /* IO spans the reshape position. Need to wait for
1184 * reshape to pass 1205 * reshape to pass
1185 */ 1206 */
1186 allow_barrier(conf); 1207 allow_barrier(conf);
1187 wait_event(conf->wait_barrier, 1208 wait_event(conf->wait_barrier,
1188 conf->reshape_progress <= bio->bi_sector || 1209 conf->reshape_progress <= bio->bi_sector ||
1189 conf->reshape_progress >= bio->bi_sector + sectors); 1210 conf->reshape_progress >= bio->bi_sector + sectors);
1190 wait_barrier(conf); 1211 wait_barrier(conf);
1191 } 1212 }
1192 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1213 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1193 bio_data_dir(bio) == WRITE && 1214 bio_data_dir(bio) == WRITE &&
1194 (mddev->reshape_backwards 1215 (mddev->reshape_backwards
1195 ? (bio->bi_sector < conf->reshape_safe && 1216 ? (bio->bi_sector < conf->reshape_safe &&
1196 bio->bi_sector + sectors > conf->reshape_progress) 1217 bio->bi_sector + sectors > conf->reshape_progress)
1197 : (bio->bi_sector + sectors > conf->reshape_safe && 1218 : (bio->bi_sector + sectors > conf->reshape_safe &&
1198 bio->bi_sector < conf->reshape_progress))) { 1219 bio->bi_sector < conf->reshape_progress))) {
1199 /* Need to update reshape_position in metadata */ 1220 /* Need to update reshape_position in metadata */
1200 mddev->reshape_position = conf->reshape_progress; 1221 mddev->reshape_position = conf->reshape_progress;
1201 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1222 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1202 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1223 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1203 md_wakeup_thread(mddev->thread); 1224 md_wakeup_thread(mddev->thread);
1204 wait_event(mddev->sb_wait, 1225 wait_event(mddev->sb_wait,
1205 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 1226 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1206 1227
1207 conf->reshape_safe = mddev->reshape_position; 1228 conf->reshape_safe = mddev->reshape_position;
1208 } 1229 }
1209 1230
1210 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1231 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1211 1232
1212 r10_bio->master_bio = bio; 1233 r10_bio->master_bio = bio;
1213 r10_bio->sectors = sectors; 1234 r10_bio->sectors = sectors;
1214 1235
1215 r10_bio->mddev = mddev; 1236 r10_bio->mddev = mddev;
1216 r10_bio->sector = bio->bi_sector; 1237 r10_bio->sector = bio->bi_sector;
1217 r10_bio->state = 0; 1238 r10_bio->state = 0;
1218 1239
1219 /* We might need to issue multiple reads to different 1240 /* We might need to issue multiple reads to different
1220 * devices if there are bad blocks around, so we keep 1241 * devices if there are bad blocks around, so we keep
1221 * track of the number of reads in bio->bi_phys_segments. 1242 * track of the number of reads in bio->bi_phys_segments.
1222 * If this is 0, there is only one r10_bio and no locking 1243 * If this is 0, there is only one r10_bio and no locking
1223 * will be needed when the request completes. If it is 1244 * will be needed when the request completes. If it is
1224 * non-zero, then it is the number of not-completed requests. 1245 * non-zero, then it is the number of not-completed requests.
1225 */ 1246 */
1226 bio->bi_phys_segments = 0; 1247 bio->bi_phys_segments = 0;
1227 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 1248 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1228 1249
1229 if (rw == READ) { 1250 if (rw == READ) {
1230 /* 1251 /*
1231 * read balancing logic: 1252 * read balancing logic:
1232 */ 1253 */
1233 struct md_rdev *rdev; 1254 struct md_rdev *rdev;
1234 int slot; 1255 int slot;
1235 1256
1236 read_again: 1257 read_again:
1237 rdev = read_balance(conf, r10_bio, &max_sectors); 1258 rdev = read_balance(conf, r10_bio, &max_sectors);
1238 if (!rdev) { 1259 if (!rdev) {
1239 raid_end_bio_io(r10_bio); 1260 raid_end_bio_io(r10_bio);
1240 return; 1261 return;
1241 } 1262 }
1242 slot = r10_bio->read_slot; 1263 slot = r10_bio->read_slot;
1243 1264
1244 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1265 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1245 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 1266 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1246 max_sectors); 1267 max_sectors);
1247 1268
1248 r10_bio->devs[slot].bio = read_bio; 1269 r10_bio->devs[slot].bio = read_bio;
1249 r10_bio->devs[slot].rdev = rdev; 1270 r10_bio->devs[slot].rdev = rdev;
1250 1271
1251 read_bio->bi_sector = r10_bio->devs[slot].addr + 1272 read_bio->bi_sector = r10_bio->devs[slot].addr +
1252 choose_data_offset(r10_bio, rdev); 1273 choose_data_offset(r10_bio, rdev);
1253 read_bio->bi_bdev = rdev->bdev; 1274 read_bio->bi_bdev = rdev->bdev;
1254 read_bio->bi_end_io = raid10_end_read_request; 1275 read_bio->bi_end_io = raid10_end_read_request;
1255 read_bio->bi_rw = READ | do_sync; 1276 read_bio->bi_rw = READ | do_sync;
1256 read_bio->bi_private = r10_bio; 1277 read_bio->bi_private = r10_bio;
1257 1278
1258 if (max_sectors < r10_bio->sectors) { 1279 if (max_sectors < r10_bio->sectors) {
1259 /* Could not read all from this device, so we will 1280 /* Could not read all from this device, so we will
1260 * need another r10_bio. 1281 * need another r10_bio.
1261 */ 1282 */
1262 sectors_handled = (r10_bio->sectors + max_sectors 1283 sectors_handled = (r10_bio->sectors + max_sectors
1263 - bio->bi_sector); 1284 - bio->bi_sector);
1264 r10_bio->sectors = max_sectors; 1285 r10_bio->sectors = max_sectors;
1265 spin_lock_irq(&conf->device_lock); 1286 spin_lock_irq(&conf->device_lock);
1266 if (bio->bi_phys_segments == 0) 1287 if (bio->bi_phys_segments == 0)
1267 bio->bi_phys_segments = 2; 1288 bio->bi_phys_segments = 2;
1268 else 1289 else
1269 bio->bi_phys_segments++; 1290 bio->bi_phys_segments++;
1270 spin_unlock(&conf->device_lock); 1291 spin_unlock(&conf->device_lock);
1271 /* Cannot call generic_make_request directly 1292 /* Cannot call generic_make_request directly
1272 * as that will be queued in __generic_make_request 1293 * as that will be queued in __generic_make_request
1273 * and subsequent mempool_alloc might block 1294 * and subsequent mempool_alloc might block
1274 * waiting for it, so hand the bio over to raid10d. 1295 * waiting for it, so hand the bio over to raid10d.
1275 */ 1296 */
1276 reschedule_retry(r10_bio); 1297 reschedule_retry(r10_bio);
1277 1298
1278 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1299 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1279 1300
1280 r10_bio->master_bio = bio; 1301 r10_bio->master_bio = bio;
1281 r10_bio->sectors = ((bio->bi_size >> 9) 1302 r10_bio->sectors = ((bio->bi_size >> 9)
1282 - sectors_handled); 1303 - sectors_handled);
1283 r10_bio->state = 0; 1304 r10_bio->state = 0;
1284 r10_bio->mddev = mddev; 1305 r10_bio->mddev = mddev;
1285 r10_bio->sector = bio->bi_sector + sectors_handled; 1306 r10_bio->sector = bio->bi_sector + sectors_handled;
1286 goto read_again; 1307 goto read_again;
1287 } else 1308 } else
1288 generic_make_request(read_bio); 1309 generic_make_request(read_bio);
1289 return; 1310 return;
1290 } 1311 }
1291 1312
1292 /* 1313 /*
1293 * WRITE: 1314 * WRITE:
1294 */ 1315 */
1295 if (conf->pending_count >= max_queued_requests) { 1316 if (conf->pending_count >= max_queued_requests) {
1296 md_wakeup_thread(mddev->thread); 1317 md_wakeup_thread(mddev->thread);
1297 wait_event(conf->wait_barrier, 1318 wait_event(conf->wait_barrier,
1298 conf->pending_count < max_queued_requests); 1319 conf->pending_count < max_queued_requests);
1299 } 1320 }
1300 /* first select target devices under rcu_lock and 1321 /* first select target devices under rcu_lock and
1301 * inc refcount on their rdev. Record them by setting 1322 * inc refcount on their rdev. Record them by setting
1302 * bios[x] to bio 1323 * bios[x] to bio
1303 * If there are known/acknowledged bad blocks on any device 1324 * If there are known/acknowledged bad blocks on any device
1304 * on which we have seen a write error, we want to avoid 1325 * on which we have seen a write error, we want to avoid
1305 * writing to those blocks. This potentially requires several 1326 * writing to those blocks. This potentially requires several
1306 * writes to write around the bad blocks. Each set of writes 1327 * writes to write around the bad blocks. Each set of writes
1307 * gets its own r10_bio with a set of bios attached. The number 1328 * gets its own r10_bio with a set of bios attached. The number
1308 * of r10_bios is recorded in bio->bi_phys_segments just as with 1329 * of r10_bios is recorded in bio->bi_phys_segments just as with
1309 * the read case. 1330 * the read case.
1310 */ 1331 */
1311 1332
1312 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1333 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1313 raid10_find_phys(conf, r10_bio); 1334 raid10_find_phys(conf, r10_bio);
1314 retry_write: 1335 retry_write:
1315 blocked_rdev = NULL; 1336 blocked_rdev = NULL;
1316 rcu_read_lock(); 1337 rcu_read_lock();
1317 max_sectors = r10_bio->sectors; 1338 max_sectors = r10_bio->sectors;
1318 1339
1319 for (i = 0; i < conf->copies; i++) { 1340 for (i = 0; i < conf->copies; i++) {
1320 int d = r10_bio->devs[i].devnum; 1341 int d = r10_bio->devs[i].devnum;
1321 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1342 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1322 struct md_rdev *rrdev = rcu_dereference( 1343 struct md_rdev *rrdev = rcu_dereference(
1323 conf->mirrors[d].replacement); 1344 conf->mirrors[d].replacement);
1324 if (rdev == rrdev) 1345 if (rdev == rrdev)
1325 rrdev = NULL; 1346 rrdev = NULL;
1326 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1347 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1327 atomic_inc(&rdev->nr_pending); 1348 atomic_inc(&rdev->nr_pending);
1328 blocked_rdev = rdev; 1349 blocked_rdev = rdev;
1329 break; 1350 break;
1330 } 1351 }
1331 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1352 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1332 atomic_inc(&rrdev->nr_pending); 1353 atomic_inc(&rrdev->nr_pending);
1333 blocked_rdev = rrdev; 1354 blocked_rdev = rrdev;
1334 break; 1355 break;
1335 } 1356 }
1336 if (rdev && (test_bit(Faulty, &rdev->flags) 1357 if (rdev && (test_bit(Faulty, &rdev->flags)
1337 || test_bit(Unmerged, &rdev->flags))) 1358 || test_bit(Unmerged, &rdev->flags)))
1338 rdev = NULL; 1359 rdev = NULL;
1339 if (rrdev && (test_bit(Faulty, &rrdev->flags) 1360 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1340 || test_bit(Unmerged, &rrdev->flags))) 1361 || test_bit(Unmerged, &rrdev->flags)))
1341 rrdev = NULL; 1362 rrdev = NULL;
1342 1363
1343 r10_bio->devs[i].bio = NULL; 1364 r10_bio->devs[i].bio = NULL;
1344 r10_bio->devs[i].repl_bio = NULL; 1365 r10_bio->devs[i].repl_bio = NULL;
1345 1366
1346 if (!rdev && !rrdev) { 1367 if (!rdev && !rrdev) {
1347 set_bit(R10BIO_Degraded, &r10_bio->state); 1368 set_bit(R10BIO_Degraded, &r10_bio->state);
1348 continue; 1369 continue;
1349 } 1370 }
1350 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1371 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1351 sector_t first_bad; 1372 sector_t first_bad;
1352 sector_t dev_sector = r10_bio->devs[i].addr; 1373 sector_t dev_sector = r10_bio->devs[i].addr;
1353 int bad_sectors; 1374 int bad_sectors;
1354 int is_bad; 1375 int is_bad;
1355 1376
1356 is_bad = is_badblock(rdev, dev_sector, 1377 is_bad = is_badblock(rdev, dev_sector,
1357 max_sectors, 1378 max_sectors,
1358 &first_bad, &bad_sectors); 1379 &first_bad, &bad_sectors);
1359 if (is_bad < 0) { 1380 if (is_bad < 0) {
1360 /* Mustn't write here until the bad block 1381 /* Mustn't write here until the bad block
1361 * is acknowledged 1382 * is acknowledged
1362 */ 1383 */
1363 atomic_inc(&rdev->nr_pending); 1384 atomic_inc(&rdev->nr_pending);
1364 set_bit(BlockedBadBlocks, &rdev->flags); 1385 set_bit(BlockedBadBlocks, &rdev->flags);
1365 blocked_rdev = rdev; 1386 blocked_rdev = rdev;
1366 break; 1387 break;
1367 } 1388 }
1368 if (is_bad && first_bad <= dev_sector) { 1389 if (is_bad && first_bad <= dev_sector) {
1369 /* Cannot write here at all */ 1390 /* Cannot write here at all */
1370 bad_sectors -= (dev_sector - first_bad); 1391 bad_sectors -= (dev_sector - first_bad);
1371 if (bad_sectors < max_sectors) 1392 if (bad_sectors < max_sectors)
1372 /* Mustn't write more than bad_sectors 1393 /* Mustn't write more than bad_sectors
1373 * to other devices yet 1394 * to other devices yet
1374 */ 1395 */
1375 max_sectors = bad_sectors; 1396 max_sectors = bad_sectors;
1376 /* We don't set R10BIO_Degraded as that 1397 /* We don't set R10BIO_Degraded as that
1377 * only applies if the disk is missing, 1398 * only applies if the disk is missing,
1378 * so it might be re-added, and we want to 1399 * so it might be re-added, and we want to
1379 * know to recover this chunk. 1400 * know to recover this chunk.
1380 * In this case the device is here, and the 1401 * In this case the device is here, and the
1381 * fact that this chunk is not in-sync is 1402 * fact that this chunk is not in-sync is
1382 * recorded in the bad block log. 1403 * recorded in the bad block log.
1383 */ 1404 */
1384 continue; 1405 continue;
1385 } 1406 }
1386 if (is_bad) { 1407 if (is_bad) {
1387 int good_sectors = first_bad - dev_sector; 1408 int good_sectors = first_bad - dev_sector;
1388 if (good_sectors < max_sectors) 1409 if (good_sectors < max_sectors)
1389 max_sectors = good_sectors; 1410 max_sectors = good_sectors;
1390 } 1411 }
1391 } 1412 }
1392 if (rdev) { 1413 if (rdev) {
1393 r10_bio->devs[i].bio = bio; 1414 r10_bio->devs[i].bio = bio;
1394 atomic_inc(&rdev->nr_pending); 1415 atomic_inc(&rdev->nr_pending);
1395 } 1416 }
1396 if (rrdev) { 1417 if (rrdev) {
1397 r10_bio->devs[i].repl_bio = bio; 1418 r10_bio->devs[i].repl_bio = bio;
1398 atomic_inc(&rrdev->nr_pending); 1419 atomic_inc(&rrdev->nr_pending);
1399 } 1420 }
1400 } 1421 }
1401 rcu_read_unlock(); 1422 rcu_read_unlock();
1402 1423
1403 if (unlikely(blocked_rdev)) { 1424 if (unlikely(blocked_rdev)) {
1404 /* Have to wait for this device to get unblocked, then retry */ 1425 /* Have to wait for this device to get unblocked, then retry */
1405 int j; 1426 int j;
1406 int d; 1427 int d;
1407 1428
1408 for (j = 0; j < i; j++) { 1429 for (j = 0; j < i; j++) {
1409 if (r10_bio->devs[j].bio) { 1430 if (r10_bio->devs[j].bio) {
1410 d = r10_bio->devs[j].devnum; 1431 d = r10_bio->devs[j].devnum;
1411 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1432 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1412 } 1433 }
1413 if (r10_bio->devs[j].repl_bio) { 1434 if (r10_bio->devs[j].repl_bio) {
1414 struct md_rdev *rdev; 1435 struct md_rdev *rdev;
1415 d = r10_bio->devs[j].devnum; 1436 d = r10_bio->devs[j].devnum;
1416 rdev = conf->mirrors[d].replacement; 1437 rdev = conf->mirrors[d].replacement;
1417 if (!rdev) { 1438 if (!rdev) {
1418 /* Race with remove_disk */ 1439 /* Race with remove_disk */
1419 smp_mb(); 1440 smp_mb();
1420 rdev = conf->mirrors[d].rdev; 1441 rdev = conf->mirrors[d].rdev;
1421 } 1442 }
1422 rdev_dec_pending(rdev, mddev); 1443 rdev_dec_pending(rdev, mddev);
1423 } 1444 }
1424 } 1445 }
1425 allow_barrier(conf); 1446 allow_barrier(conf);
1426 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1447 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1427 wait_barrier(conf); 1448 wait_barrier(conf);
1428 goto retry_write; 1449 goto retry_write;
1429 } 1450 }
1430 1451
1431 if (max_sectors < r10_bio->sectors) { 1452 if (max_sectors < r10_bio->sectors) {
1432 /* We are splitting this into multiple parts, so 1453 /* We are splitting this into multiple parts, so
1433 * we need to prepare for allocating another r10_bio. 1454 * we need to prepare for allocating another r10_bio.
1434 */ 1455 */
1435 r10_bio->sectors = max_sectors; 1456 r10_bio->sectors = max_sectors;
1436 spin_lock_irq(&conf->device_lock); 1457 spin_lock_irq(&conf->device_lock);
1437 if (bio->bi_phys_segments == 0) 1458 if (bio->bi_phys_segments == 0)
1438 bio->bi_phys_segments = 2; 1459 bio->bi_phys_segments = 2;
1439 else 1460 else
1440 bio->bi_phys_segments++; 1461 bio->bi_phys_segments++;
1441 spin_unlock_irq(&conf->device_lock); 1462 spin_unlock_irq(&conf->device_lock);
1442 } 1463 }
1443 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; 1464 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1444 1465
1445 atomic_set(&r10_bio->remaining, 1); 1466 atomic_set(&r10_bio->remaining, 1);
1446 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1467 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1447 1468
1448 for (i = 0; i < conf->copies; i++) { 1469 for (i = 0; i < conf->copies; i++) {
1449 struct bio *mbio; 1470 struct bio *mbio;
1450 int d = r10_bio->devs[i].devnum; 1471 int d = r10_bio->devs[i].devnum;
1451 if (r10_bio->devs[i].bio) { 1472 if (r10_bio->devs[i].bio) {
1452 struct md_rdev *rdev = conf->mirrors[d].rdev; 1473 struct md_rdev *rdev = conf->mirrors[d].rdev;
1453 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1474 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1454 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1475 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1455 max_sectors); 1476 max_sectors);
1456 r10_bio->devs[i].bio = mbio; 1477 r10_bio->devs[i].bio = mbio;
1457 1478
1458 mbio->bi_sector = (r10_bio->devs[i].addr+ 1479 mbio->bi_sector = (r10_bio->devs[i].addr+
1459 choose_data_offset(r10_bio, 1480 choose_data_offset(r10_bio,
1460 rdev)); 1481 rdev));
1461 mbio->bi_bdev = rdev->bdev; 1482 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request; 1483 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = 1484 mbio->bi_rw =
1464 WRITE | do_sync | do_fua | do_discard | do_same; 1485 WRITE | do_sync | do_fua | do_discard | do_same;
1465 mbio->bi_private = r10_bio; 1486 mbio->bi_private = r10_bio;
1466 1487
1467 atomic_inc(&r10_bio->remaining); 1488 atomic_inc(&r10_bio->remaining);
1468 1489
1469 cb = blk_check_plugged(raid10_unplug, mddev, 1490 cb = blk_check_plugged(raid10_unplug, mddev,
1470 sizeof(*plug)); 1491 sizeof(*plug));
1471 if (cb) 1492 if (cb)
1472 plug = container_of(cb, struct raid10_plug_cb, 1493 plug = container_of(cb, struct raid10_plug_cb,
1473 cb); 1494 cb);
1474 else 1495 else
1475 plug = NULL; 1496 plug = NULL;
1476 spin_lock_irqsave(&conf->device_lock, flags); 1497 spin_lock_irqsave(&conf->device_lock, flags);
1477 if (plug) { 1498 if (plug) {
1478 bio_list_add(&plug->pending, mbio); 1499 bio_list_add(&plug->pending, mbio);
1479 plug->pending_cnt++; 1500 plug->pending_cnt++;
1480 } else { 1501 } else {
1481 bio_list_add(&conf->pending_bio_list, mbio); 1502 bio_list_add(&conf->pending_bio_list, mbio);
1482 conf->pending_count++; 1503 conf->pending_count++;
1483 } 1504 }
1484 spin_unlock_irqrestore(&conf->device_lock, flags); 1505 spin_unlock_irqrestore(&conf->device_lock, flags);
1485 if (!plug) 1506 if (!plug)
1486 md_wakeup_thread(mddev->thread); 1507 md_wakeup_thread(mddev->thread);
1487 } 1508 }
1488 1509
1489 if (r10_bio->devs[i].repl_bio) { 1510 if (r10_bio->devs[i].repl_bio) {
1490 struct md_rdev *rdev = conf->mirrors[d].replacement; 1511 struct md_rdev *rdev = conf->mirrors[d].replacement;
1491 if (rdev == NULL) { 1512 if (rdev == NULL) {
1492 /* Replacement just got moved to main 'rdev' */ 1513 /* Replacement just got moved to main 'rdev' */
1493 smp_mb(); 1514 smp_mb();
1494 rdev = conf->mirrors[d].rdev; 1515 rdev = conf->mirrors[d].rdev;
1495 } 1516 }
1496 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1517 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1497 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1518 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1498 max_sectors); 1519 max_sectors);
1499 r10_bio->devs[i].repl_bio = mbio; 1520 r10_bio->devs[i].repl_bio = mbio;
1500 1521
1501 mbio->bi_sector = (r10_bio->devs[i].addr + 1522 mbio->bi_sector = (r10_bio->devs[i].addr +
1502 choose_data_offset( 1523 choose_data_offset(
1503 r10_bio, rdev)); 1524 r10_bio, rdev));
1504 mbio->bi_bdev = rdev->bdev; 1525 mbio->bi_bdev = rdev->bdev;
1505 mbio->bi_end_io = raid10_end_write_request; 1526 mbio->bi_end_io = raid10_end_write_request;
1506 mbio->bi_rw = 1527 mbio->bi_rw =
1507 WRITE | do_sync | do_fua | do_discard | do_same; 1528 WRITE | do_sync | do_fua | do_discard | do_same;
1508 mbio->bi_private = r10_bio; 1529 mbio->bi_private = r10_bio;
1509 1530
1510 atomic_inc(&r10_bio->remaining); 1531 atomic_inc(&r10_bio->remaining);
1511 spin_lock_irqsave(&conf->device_lock, flags); 1532 spin_lock_irqsave(&conf->device_lock, flags);
1512 bio_list_add(&conf->pending_bio_list, mbio); 1533 bio_list_add(&conf->pending_bio_list, mbio);
1513 conf->pending_count++; 1534 conf->pending_count++;
1514 spin_unlock_irqrestore(&conf->device_lock, flags); 1535 spin_unlock_irqrestore(&conf->device_lock, flags);
1515 if (!mddev_check_plugged(mddev)) 1536 if (!mddev_check_plugged(mddev))
1516 md_wakeup_thread(mddev->thread); 1537 md_wakeup_thread(mddev->thread);
1517 } 1538 }
1518 } 1539 }
1519 1540
1520 /* Don't remove the bias on 'remaining' (one_write_done) until 1541 /* Don't remove the bias on 'remaining' (one_write_done) until
1521 * after checking if we need to go around again. 1542 * after checking if we need to go around again.
1522 */ 1543 */
1523 1544
1524 if (sectors_handled < (bio->bi_size >> 9)) { 1545 if (sectors_handled < (bio->bi_size >> 9)) {
1525 one_write_done(r10_bio); 1546 one_write_done(r10_bio);
1526 /* We need another r10_bio. It has already been counted 1547 /* We need another r10_bio. It has already been counted
1527 * in bio->bi_phys_segments. 1548 * in bio->bi_phys_segments.
1528 */ 1549 */
1529 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1550 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1530 1551
1531 r10_bio->master_bio = bio; 1552 r10_bio->master_bio = bio;
1532 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1553 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1533 1554
1534 r10_bio->mddev = mddev; 1555 r10_bio->mddev = mddev;
1535 r10_bio->sector = bio->bi_sector + sectors_handled; 1556 r10_bio->sector = bio->bi_sector + sectors_handled;
1536 r10_bio->state = 0; 1557 r10_bio->state = 0;
1537 goto retry_write; 1558 goto retry_write;
1538 } 1559 }
1539 one_write_done(r10_bio); 1560 one_write_done(r10_bio);
1540 1561
1541 /* In case raid10d snuck in to freeze_array */ 1562 /* In case raid10d snuck in to freeze_array */
1542 wake_up(&conf->wait_barrier); 1563 wake_up(&conf->wait_barrier);
1543 } 1564 }
1544 1565
1545 static void status(struct seq_file *seq, struct mddev *mddev) 1566 static void status(struct seq_file *seq, struct mddev *mddev)
1546 { 1567 {
1547 struct r10conf *conf = mddev->private; 1568 struct r10conf *conf = mddev->private;
1548 int i; 1569 int i;
1549 1570
1550 if (conf->geo.near_copies < conf->geo.raid_disks) 1571 if (conf->geo.near_copies < conf->geo.raid_disks)
1551 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1572 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1552 if (conf->geo.near_copies > 1) 1573 if (conf->geo.near_copies > 1)
1553 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1574 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1554 if (conf->geo.far_copies > 1) { 1575 if (conf->geo.far_copies > 1) {
1555 if (conf->geo.far_offset) 1576 if (conf->geo.far_offset)
1556 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1577 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1557 else 1578 else
1558 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1579 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1559 } 1580 }
1560 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1581 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1561 conf->geo.raid_disks - mddev->degraded); 1582 conf->geo.raid_disks - mddev->degraded);
1562 for (i = 0; i < conf->geo.raid_disks; i++) 1583 for (i = 0; i < conf->geo.raid_disks; i++)
1563 seq_printf(seq, "%s", 1584 seq_printf(seq, "%s",
1564 conf->mirrors[i].rdev && 1585 conf->mirrors[i].rdev &&
1565 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1586 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1566 seq_printf(seq, "]"); 1587 seq_printf(seq, "]");
1567 } 1588 }
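For illustration only: a healthy four-disk array with two near-copies and 512K chunks (hypothetical values) would be reported by the function above as

	512K chunks 2 near-copies [4/4] [UUUU]

with an '_' replacing the corresponding 'U' for any missing or out-of-sync member.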
1568 1589
1569 /* check if there are enough drives for 1590 /* check if there are enough drives for
1570 * every block to appear on at least one. 1591 * every block to appear on at least one.
1571 * Don't consider the device numbered 'ignore' 1592 * Don't consider the device numbered 'ignore'
1572 * as we might be about to remove it. 1593 * as we might be about to remove it.
1573 */ 1594 */
1574 static int _enough(struct r10conf *conf, struct geom *geo, int ignore) 1595 static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1575 { 1596 {
1576 int first = 0; 1597 int first = 0;
1577 1598
1578 do { 1599 do {
1579 int n = conf->copies; 1600 int n = conf->copies;
1580 int cnt = 0; 1601 int cnt = 0;
1581 int this = first; 1602 int this = first;
1582 while (n--) { 1603 while (n--) {
1583 if (conf->mirrors[this].rdev && 1604 if (conf->mirrors[this].rdev &&
1584 this != ignore) 1605 this != ignore)
1585 cnt++; 1606 cnt++;
1586 this = (this+1) % geo->raid_disks; 1607 this = (this+1) % geo->raid_disks;
1587 } 1608 }
1588 if (cnt == 0) 1609 if (cnt == 0)
1589 return 0; 1610 return 0;
1590 first = (first + geo->near_copies) % geo->raid_disks; 1611 first = (first + geo->near_copies) % geo->raid_disks;
1591 } while (first != 0); 1612 } while (first != 0);
1592 return 1; 1613 return 1;
1593 } 1614 }
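A standalone model of the loop above may make the windowing easier to follow; it is illustrative only (present[] stands in for conf->mirrors[].rdev being non-NULL, and plain parameters replace struct geom):

static int enough_model(const int *present, int raid_disks,
			int near_copies, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		while (n--) {
			if (present[this] && this != ignore)
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* some block would lose its last copy */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);
	return 1;
}

With raid_disks = 4, near_copies = 2, copies = 2 and ignore = -1, losing devices 0 and 1 empties the {0,1} window so the model returns 0, while losing devices 0 and 2 still leaves a working device in every window and it returns 1.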
1594 1615
1595 static int enough(struct r10conf *conf, int ignore) 1616 static int enough(struct r10conf *conf, int ignore)
1596 { 1617 {
1597 return _enough(conf, &conf->geo, ignore) && 1618 return _enough(conf, &conf->geo, ignore) &&
1598 _enough(conf, &conf->prev, ignore); 1619 _enough(conf, &conf->prev, ignore);
1599 } 1620 }
1600 1621
1601 static void error(struct mddev *mddev, struct md_rdev *rdev) 1622 static void error(struct mddev *mddev, struct md_rdev *rdev)
1602 { 1623 {
1603 char b[BDEVNAME_SIZE]; 1624 char b[BDEVNAME_SIZE];
1604 struct r10conf *conf = mddev->private; 1625 struct r10conf *conf = mddev->private;
1605 1626
1606 /* 1627 /*
1607 * If it is not operational, then we have already marked it as dead 1628 * If it is not operational, then we have already marked it as dead
1608 * else if it is the last working disk, ignore the error, let the 1629 * else if it is the last working disk, ignore the error, let the
1609 * next level up know. 1630 * next level up know.
1610 * else mark the drive as failed 1631 * else mark the drive as failed
1611 */ 1632 */
1612 if (test_bit(In_sync, &rdev->flags) 1633 if (test_bit(In_sync, &rdev->flags)
1613 && !enough(conf, rdev->raid_disk)) 1634 && !enough(conf, rdev->raid_disk))
1614 /* 1635 /*
1615 * Don't fail the drive, just return an IO error. 1636 * Don't fail the drive, just return an IO error.
1616 */ 1637 */
1617 return; 1638 return;
1618 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1639 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1619 unsigned long flags; 1640 unsigned long flags;
1620 spin_lock_irqsave(&conf->device_lock, flags); 1641 spin_lock_irqsave(&conf->device_lock, flags);
1621 mddev->degraded++; 1642 mddev->degraded++;
1622 spin_unlock_irqrestore(&conf->device_lock, flags); 1643 spin_unlock_irqrestore(&conf->device_lock, flags);
1623 /* 1644 /*
1624 * if recovery is running, make sure it aborts. 1645 * if recovery is running, make sure it aborts.
1625 */ 1646 */
1626 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1647 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1627 } 1648 }
1628 set_bit(Blocked, &rdev->flags); 1649 set_bit(Blocked, &rdev->flags);
1629 set_bit(Faulty, &rdev->flags); 1650 set_bit(Faulty, &rdev->flags);
1630 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1651 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1631 printk(KERN_ALERT 1652 printk(KERN_ALERT
1632 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1653 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1633 "md/raid10:%s: Operation continuing on %d devices.\n", 1654 "md/raid10:%s: Operation continuing on %d devices.\n",
1634 mdname(mddev), bdevname(rdev->bdev, b), 1655 mdname(mddev), bdevname(rdev->bdev, b),
1635 mdname(mddev), conf->geo.raid_disks - mddev->degraded); 1656 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1636 } 1657 }
1637 1658
1638 static void print_conf(struct r10conf *conf) 1659 static void print_conf(struct r10conf *conf)
1639 { 1660 {
1640 int i; 1661 int i;
1641 struct raid10_info *tmp; 1662 struct raid10_info *tmp;
1642 1663
1643 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1664 printk(KERN_DEBUG "RAID10 conf printout:\n");
1644 if (!conf) { 1665 if (!conf) {
1645 printk(KERN_DEBUG "(!conf)\n"); 1666 printk(KERN_DEBUG "(!conf)\n");
1646 return; 1667 return;
1647 } 1668 }
1648 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1669 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1649 conf->geo.raid_disks); 1670 conf->geo.raid_disks);
1650 1671
1651 for (i = 0; i < conf->geo.raid_disks; i++) { 1672 for (i = 0; i < conf->geo.raid_disks; i++) {
1652 char b[BDEVNAME_SIZE]; 1673 char b[BDEVNAME_SIZE];
1653 tmp = conf->mirrors + i; 1674 tmp = conf->mirrors + i;
1654 if (tmp->rdev) 1675 if (tmp->rdev)
1655 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1676 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1656 i, !test_bit(In_sync, &tmp->rdev->flags), 1677 i, !test_bit(In_sync, &tmp->rdev->flags),
1657 !test_bit(Faulty, &tmp->rdev->flags), 1678 !test_bit(Faulty, &tmp->rdev->flags),
1658 bdevname(tmp->rdev->bdev,b)); 1679 bdevname(tmp->rdev->bdev,b));
1659 } 1680 }
1660 } 1681 }
1661 1682
1662 static void close_sync(struct r10conf *conf) 1683 static void close_sync(struct r10conf *conf)
1663 { 1684 {
1664 wait_barrier(conf); 1685 wait_barrier(conf);
1665 allow_barrier(conf); 1686 allow_barrier(conf);
1666 1687
1667 mempool_destroy(conf->r10buf_pool); 1688 mempool_destroy(conf->r10buf_pool);
1668 conf->r10buf_pool = NULL; 1689 conf->r10buf_pool = NULL;
1669 } 1690 }
1670 1691
1671 static int raid10_spare_active(struct mddev *mddev) 1692 static int raid10_spare_active(struct mddev *mddev)
1672 { 1693 {
1673 int i; 1694 int i;
1674 struct r10conf *conf = mddev->private; 1695 struct r10conf *conf = mddev->private;
1675 struct raid10_info *tmp; 1696 struct raid10_info *tmp;
1676 int count = 0; 1697 int count = 0;
1677 unsigned long flags; 1698 unsigned long flags;
1678 1699
1679 /* 1700 /*
1680 * Find all non-in_sync disks within the RAID10 configuration 1701 * Find all non-in_sync disks within the RAID10 configuration
1681 * and mark them in_sync 1702 * and mark them in_sync
1682 */ 1703 */
1683 for (i = 0; i < conf->geo.raid_disks; i++) { 1704 for (i = 0; i < conf->geo.raid_disks; i++) {
1684 tmp = conf->mirrors + i; 1705 tmp = conf->mirrors + i;
1685 if (tmp->replacement 1706 if (tmp->replacement
1686 && tmp->replacement->recovery_offset == MaxSector 1707 && tmp->replacement->recovery_offset == MaxSector
1687 && !test_bit(Faulty, &tmp->replacement->flags) 1708 && !test_bit(Faulty, &tmp->replacement->flags)
1688 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 1709 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1689 /* Replacement has just become active */ 1710 /* Replacement has just become active */
1690 if (!tmp->rdev 1711 if (!tmp->rdev
1691 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 1712 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1692 count++; 1713 count++;
1693 if (tmp->rdev) { 1714 if (tmp->rdev) {
1694 /* Replaced device not technically faulty, 1715 /* Replaced device not technically faulty,
1695 * but we need to be sure it gets removed 1716 * but we need to be sure it gets removed
1696 * and never re-added. 1717 * and never re-added.
1697 */ 1718 */
1698 set_bit(Faulty, &tmp->rdev->flags); 1719 set_bit(Faulty, &tmp->rdev->flags);
1699 sysfs_notify_dirent_safe( 1720 sysfs_notify_dirent_safe(
1700 tmp->rdev->sysfs_state); 1721 tmp->rdev->sysfs_state);
1701 } 1722 }
1702 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 1723 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1703 } else if (tmp->rdev 1724 } else if (tmp->rdev
1704 && !test_bit(Faulty, &tmp->rdev->flags) 1725 && !test_bit(Faulty, &tmp->rdev->flags)
1705 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 1726 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1706 count++; 1727 count++;
1707 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 1728 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1708 } 1729 }
1709 } 1730 }
1710 spin_lock_irqsave(&conf->device_lock, flags); 1731 spin_lock_irqsave(&conf->device_lock, flags);
1711 mddev->degraded -= count; 1732 mddev->degraded -= count;
1712 spin_unlock_irqrestore(&conf->device_lock, flags); 1733 spin_unlock_irqrestore(&conf->device_lock, flags);
1713 1734
1714 print_conf(conf); 1735 print_conf(conf);
1715 return count; 1736 return count;
1716 } 1737 }
1717 1738
1718 1739
1719 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1740 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1720 { 1741 {
1721 struct r10conf *conf = mddev->private; 1742 struct r10conf *conf = mddev->private;
1722 int err = -EEXIST; 1743 int err = -EEXIST;
1723 int mirror; 1744 int mirror;
1724 int first = 0; 1745 int first = 0;
1725 int last = conf->geo.raid_disks - 1; 1746 int last = conf->geo.raid_disks - 1;
1726 struct request_queue *q = bdev_get_queue(rdev->bdev); 1747 struct request_queue *q = bdev_get_queue(rdev->bdev);
1727 1748
1728 if (mddev->recovery_cp < MaxSector) 1749 if (mddev->recovery_cp < MaxSector)
1729 /* only hot-add to in-sync arrays, as recovery is 1750 /* only hot-add to in-sync arrays, as recovery is
1730 * very different from resync 1751 * very different from resync
1731 */ 1752 */
1732 return -EBUSY; 1753 return -EBUSY;
1733 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) 1754 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1734 return -EINVAL; 1755 return -EINVAL;
1735 1756
1736 if (rdev->raid_disk >= 0) 1757 if (rdev->raid_disk >= 0)
1737 first = last = rdev->raid_disk; 1758 first = last = rdev->raid_disk;
1738 1759
1739 if (q->merge_bvec_fn) { 1760 if (q->merge_bvec_fn) {
1740 set_bit(Unmerged, &rdev->flags); 1761 set_bit(Unmerged, &rdev->flags);
1741 mddev->merge_check_needed = 1; 1762 mddev->merge_check_needed = 1;
1742 } 1763 }
1743 1764
1744 if (rdev->saved_raid_disk >= first && 1765 if (rdev->saved_raid_disk >= first &&
1745 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1766 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1746 mirror = rdev->saved_raid_disk; 1767 mirror = rdev->saved_raid_disk;
1747 else 1768 else
1748 mirror = first; 1769 mirror = first;
1749 for ( ; mirror <= last ; mirror++) { 1770 for ( ; mirror <= last ; mirror++) {
1750 struct raid10_info *p = &conf->mirrors[mirror]; 1771 struct raid10_info *p = &conf->mirrors[mirror];
1751 if (p->recovery_disabled == mddev->recovery_disabled) 1772 if (p->recovery_disabled == mddev->recovery_disabled)
1752 continue; 1773 continue;
1753 if (p->rdev) { 1774 if (p->rdev) {
1754 if (!test_bit(WantReplacement, &p->rdev->flags) || 1775 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1755 p->replacement != NULL) 1776 p->replacement != NULL)
1756 continue; 1777 continue;
1757 clear_bit(In_sync, &rdev->flags); 1778 clear_bit(In_sync, &rdev->flags);
1758 set_bit(Replacement, &rdev->flags); 1779 set_bit(Replacement, &rdev->flags);
1759 rdev->raid_disk = mirror; 1780 rdev->raid_disk = mirror;
1760 err = 0; 1781 err = 0;
1761 disk_stack_limits(mddev->gendisk, rdev->bdev, 1782 disk_stack_limits(mddev->gendisk, rdev->bdev,
1762 rdev->data_offset << 9); 1783 rdev->data_offset << 9);
1763 conf->fullsync = 1; 1784 conf->fullsync = 1;
1764 rcu_assign_pointer(p->replacement, rdev); 1785 rcu_assign_pointer(p->replacement, rdev);
1765 break; 1786 break;
1766 } 1787 }
1767 1788
1768 disk_stack_limits(mddev->gendisk, rdev->bdev, 1789 disk_stack_limits(mddev->gendisk, rdev->bdev,
1769 rdev->data_offset << 9); 1790 rdev->data_offset << 9);
1770 1791
1771 p->head_position = 0; 1792 p->head_position = 0;
1772 p->recovery_disabled = mddev->recovery_disabled - 1; 1793 p->recovery_disabled = mddev->recovery_disabled - 1;
1773 rdev->raid_disk = mirror; 1794 rdev->raid_disk = mirror;
1774 err = 0; 1795 err = 0;
1775 if (rdev->saved_raid_disk != mirror) 1796 if (rdev->saved_raid_disk != mirror)
1776 conf->fullsync = 1; 1797 conf->fullsync = 1;
1777 rcu_assign_pointer(p->rdev, rdev); 1798 rcu_assign_pointer(p->rdev, rdev);
1778 break; 1799 break;
1779 } 1800 }
1780 if (err == 0 && test_bit(Unmerged, &rdev->flags)) { 1801 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1781 /* Some requests might not have seen this new 1802 /* Some requests might not have seen this new
1782 * merge_bvec_fn. We must wait for them to complete 1803 * merge_bvec_fn. We must wait for them to complete
1783 * before merging the device fully. 1804 * before merging the device fully.
1784 * First we make sure any code which has tested 1805 * First we make sure any code which has tested
1785 * our function has submitted the request, then 1806 * our function has submitted the request, then
1786 * we wait for all outstanding requests to complete. 1807 * we wait for all outstanding requests to complete.
1787 */ 1808 */
1788 synchronize_sched(); 1809 synchronize_sched();
1789 raise_barrier(conf, 0); 1810 raise_barrier(conf, 0);
1790 lower_barrier(conf); 1811 lower_barrier(conf);
1791 clear_bit(Unmerged, &rdev->flags); 1812 clear_bit(Unmerged, &rdev->flags);
1792 } 1813 }
1793 md_integrity_add_rdev(rdev, mddev); 1814 md_integrity_add_rdev(rdev, mddev);
1794 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) 1815 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1795 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 1816 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1796 1817
1797 print_conf(conf); 1818 print_conf(conf);
1798 return err; 1819 return err;
1799 } 1820 }
1800 1821
1801 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1822 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1802 { 1823 {
1803 struct r10conf *conf = mddev->private; 1824 struct r10conf *conf = mddev->private;
1804 int err = 0; 1825 int err = 0;
1805 int number = rdev->raid_disk; 1826 int number = rdev->raid_disk;
1806 struct md_rdev **rdevp; 1827 struct md_rdev **rdevp;
1807 struct raid10_info *p = conf->mirrors + number; 1828 struct raid10_info *p = conf->mirrors + number;
1808 1829
1809 print_conf(conf); 1830 print_conf(conf);
1810 if (rdev == p->rdev) 1831 if (rdev == p->rdev)
1811 rdevp = &p->rdev; 1832 rdevp = &p->rdev;
1812 else if (rdev == p->replacement) 1833 else if (rdev == p->replacement)
1813 rdevp = &p->replacement; 1834 rdevp = &p->replacement;
1814 else 1835 else
1815 return 0; 1836 return 0;
1816 1837
1817 if (test_bit(In_sync, &rdev->flags) || 1838 if (test_bit(In_sync, &rdev->flags) ||
1818 atomic_read(&rdev->nr_pending)) { 1839 atomic_read(&rdev->nr_pending)) {
1819 err = -EBUSY; 1840 err = -EBUSY;
1820 goto abort; 1841 goto abort;
1821 } 1842 }
1822 /* Only remove faulty devices if recovery 1843 /* Only remove faulty devices if recovery
1823 * is not possible. 1844 * is not possible.
1824 */ 1845 */
1825 if (!test_bit(Faulty, &rdev->flags) && 1846 if (!test_bit(Faulty, &rdev->flags) &&
1826 mddev->recovery_disabled != p->recovery_disabled && 1847 mddev->recovery_disabled != p->recovery_disabled &&
1827 (!p->replacement || p->replacement == rdev) && 1848 (!p->replacement || p->replacement == rdev) &&
1828 number < conf->geo.raid_disks && 1849 number < conf->geo.raid_disks &&
1829 enough(conf, -1)) { 1850 enough(conf, -1)) {
1830 err = -EBUSY; 1851 err = -EBUSY;
1831 goto abort; 1852 goto abort;
1832 } 1853 }
1833 *rdevp = NULL; 1854 *rdevp = NULL;
1834 synchronize_rcu(); 1855 synchronize_rcu();
1835 if (atomic_read(&rdev->nr_pending)) { 1856 if (atomic_read(&rdev->nr_pending)) {
1836 /* lost the race, try later */ 1857 /* lost the race, try later */
1837 err = -EBUSY; 1858 err = -EBUSY;
1838 *rdevp = rdev; 1859 *rdevp = rdev;
1839 goto abort; 1860 goto abort;
1840 } else if (p->replacement) { 1861 } else if (p->replacement) {
1841 /* We must have just cleared 'rdev' */ 1862 /* We must have just cleared 'rdev' */
1842 p->rdev = p->replacement; 1863 p->rdev = p->replacement;
1843 clear_bit(Replacement, &p->replacement->flags); 1864 clear_bit(Replacement, &p->replacement->flags);
1844 smp_mb(); /* Make sure other CPUs may see both as identical 1865 smp_mb(); /* Make sure other CPUs may see both as identical
1845 * but will never see neither -- if they are careful. 1866 * but will never see neither -- if they are careful.
1846 */ 1867 */
1847 p->replacement = NULL; 1868 p->replacement = NULL;
1848 clear_bit(WantReplacement, &rdev->flags); 1869 clear_bit(WantReplacement, &rdev->flags);
1849 } else 1870 } else
1850 /* We might have just removed the Replacement as faulty. 1871 /* We might have just removed the Replacement as faulty.
1851 * Clear the flag just in case 1872 * Clear the flag just in case
1852 */ 1873 */
1853 clear_bit(WantReplacement, &rdev->flags); 1874 clear_bit(WantReplacement, &rdev->flags);
1854 1875
1855 err = md_integrity_register(mddev); 1876 err = md_integrity_register(mddev);
1856 1877
1857 abort: 1878 abort:
1858 1879
1859 print_conf(conf); 1880 print_conf(conf);
1860 return err; 1881 return err;
1861 } 1882 }
1862 1883
1863 1884
1864 static void end_sync_read(struct bio *bio, int error) 1885 static void end_sync_read(struct bio *bio, int error)
1865 { 1886 {
1866 struct r10bio *r10_bio = bio->bi_private; 1887 struct r10bio *r10_bio = bio->bi_private;
1867 struct r10conf *conf = r10_bio->mddev->private; 1888 struct r10conf *conf = r10_bio->mddev->private;
1868 int d; 1889 int d;
1869 1890
1870 if (bio == r10_bio->master_bio) { 1891 if (bio == r10_bio->master_bio) {
1871 /* this is a reshape read */ 1892 /* this is a reshape read */
1872 d = r10_bio->read_slot; /* really the read dev */ 1893 d = r10_bio->read_slot; /* really the read dev */
1873 } else 1894 } else
1874 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1895 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1875 1896
1876 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1897 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1877 set_bit(R10BIO_Uptodate, &r10_bio->state); 1898 set_bit(R10BIO_Uptodate, &r10_bio->state);
1878 else 1899 else
1879 /* The write handler will notice the lack of 1900 /* The write handler will notice the lack of
1880 * R10BIO_Uptodate and record any errors etc 1901 * R10BIO_Uptodate and record any errors etc
1881 */ 1902 */
1882 atomic_add(r10_bio->sectors, 1903 atomic_add(r10_bio->sectors,
1883 &conf->mirrors[d].rdev->corrected_errors); 1904 &conf->mirrors[d].rdev->corrected_errors);
1884 1905
1885 /* for reconstruct, we always reschedule after a read. 1906 /* for reconstruct, we always reschedule after a read.
1886 * for resync, only after all reads 1907 * for resync, only after all reads
1887 */ 1908 */
1888 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); 1909 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1889 if (test_bit(R10BIO_IsRecover, &r10_bio->state) || 1910 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1890 atomic_dec_and_test(&r10_bio->remaining)) { 1911 atomic_dec_and_test(&r10_bio->remaining)) {
1891 /* we have read all the blocks, 1912 /* we have read all the blocks,
1892 * do the comparison in process context in raid10d 1913 * do the comparison in process context in raid10d
1893 */ 1914 */
1894 reschedule_retry(r10_bio); 1915 reschedule_retry(r10_bio);
1895 } 1916 }
1896 } 1917 }
1897 1918
1898 static void end_sync_request(struct r10bio *r10_bio) 1919 static void end_sync_request(struct r10bio *r10_bio)
1899 { 1920 {
1900 struct mddev *mddev = r10_bio->mddev; 1921 struct mddev *mddev = r10_bio->mddev;
1901 1922
1902 while (atomic_dec_and_test(&r10_bio->remaining)) { 1923 while (atomic_dec_and_test(&r10_bio->remaining)) {
1903 if (r10_bio->master_bio == NULL) { 1924 if (r10_bio->master_bio == NULL) {
1904 /* the primary of several recovery bios */ 1925 /* the primary of several recovery bios */
1905 sector_t s = r10_bio->sectors; 1926 sector_t s = r10_bio->sectors;
1906 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1927 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1907 test_bit(R10BIO_WriteError, &r10_bio->state)) 1928 test_bit(R10BIO_WriteError, &r10_bio->state))
1908 reschedule_retry(r10_bio); 1929 reschedule_retry(r10_bio);
1909 else 1930 else
1910 put_buf(r10_bio); 1931 put_buf(r10_bio);
1911 md_done_sync(mddev, s, 1); 1932 md_done_sync(mddev, s, 1);
1912 break; 1933 break;
1913 } else { 1934 } else {
1914 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 1935 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1915 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1936 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1916 test_bit(R10BIO_WriteError, &r10_bio->state)) 1937 test_bit(R10BIO_WriteError, &r10_bio->state))
1917 reschedule_retry(r10_bio); 1938 reschedule_retry(r10_bio);
1918 else 1939 else
1919 put_buf(r10_bio); 1940 put_buf(r10_bio);
1920 r10_bio = r10_bio2; 1941 r10_bio = r10_bio2;
1921 } 1942 }
1922 } 1943 }
1923 } 1944 }
1924 1945
1925 static void end_sync_write(struct bio *bio, int error) 1946 static void end_sync_write(struct bio *bio, int error)
1926 { 1947 {
1927 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1948 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1928 struct r10bio *r10_bio = bio->bi_private; 1949 struct r10bio *r10_bio = bio->bi_private;
1929 struct mddev *mddev = r10_bio->mddev; 1950 struct mddev *mddev = r10_bio->mddev;
1930 struct r10conf *conf = mddev->private; 1951 struct r10conf *conf = mddev->private;
1931 int d; 1952 int d;
1932 sector_t first_bad; 1953 sector_t first_bad;
1933 int bad_sectors; 1954 int bad_sectors;
1934 int slot; 1955 int slot;
1935 int repl; 1956 int repl;
1936 struct md_rdev *rdev = NULL; 1957 struct md_rdev *rdev = NULL;
1937 1958
1938 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1959 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1939 if (repl) 1960 if (repl)
1940 rdev = conf->mirrors[d].replacement; 1961 rdev = conf->mirrors[d].replacement;
1941 else 1962 else
1942 rdev = conf->mirrors[d].rdev; 1963 rdev = conf->mirrors[d].rdev;
1943 1964
1944 if (!uptodate) { 1965 if (!uptodate) {
1945 if (repl) 1966 if (repl)
1946 md_error(mddev, rdev); 1967 md_error(mddev, rdev);
1947 else { 1968 else {
1948 set_bit(WriteErrorSeen, &rdev->flags); 1969 set_bit(WriteErrorSeen, &rdev->flags);
1949 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1970 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1950 set_bit(MD_RECOVERY_NEEDED, 1971 set_bit(MD_RECOVERY_NEEDED,
1951 &rdev->mddev->recovery); 1972 &rdev->mddev->recovery);
1952 set_bit(R10BIO_WriteError, &r10_bio->state); 1973 set_bit(R10BIO_WriteError, &r10_bio->state);
1953 } 1974 }
1954 } else if (is_badblock(rdev, 1975 } else if (is_badblock(rdev,
1955 r10_bio->devs[slot].addr, 1976 r10_bio->devs[slot].addr,
1956 r10_bio->sectors, 1977 r10_bio->sectors,
1957 &first_bad, &bad_sectors)) 1978 &first_bad, &bad_sectors))
1958 set_bit(R10BIO_MadeGood, &r10_bio->state); 1979 set_bit(R10BIO_MadeGood, &r10_bio->state);
1959 1980
1960 rdev_dec_pending(rdev, mddev); 1981 rdev_dec_pending(rdev, mddev);
1961 1982
1962 end_sync_request(r10_bio); 1983 end_sync_request(r10_bio);
1963 } 1984 }
1964 1985
1965 /* 1986 /*
1966 * Note: sync and recover are handled very differently for raid10 1987 * Note: sync and recover are handled very differently for raid10
1967 * This code is for resync. 1988 * This code is for resync.
1968 * For resync, we read through virtual addresses and read all blocks. 1989 * For resync, we read through virtual addresses and read all blocks.
1969 * If there is any error, we schedule a write. The lowest numbered 1990 * If there is any error, we schedule a write. The lowest numbered
1970 * drive is authoritative. 1991 * drive is authoritative.
1971 * However, requests arrive by physical address, so we need to map. 1992 * However, requests arrive by physical address, so we need to map.
1972 * For every physical address there are raid_disks/copies virtual addresses, 1993 * For every physical address there are raid_disks/copies virtual addresses,
1973 * which is always at least one, but is not necessarily an integer. 1994 * which is always at least one, but is not necessarily an integer.
1974 * This means that a physical address can span multiple chunks, so we may 1995 * This means that a physical address can span multiple chunks, so we may
1975 * have to submit multiple io requests for a single sync request. 1996 * have to submit multiple io requests for a single sync request.
1976 */ 1997 */
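/* Editorial worked example (illustrative values, not from the source):
 * with 5 raid disks and 2 copies, each physical address corresponds to
 * 5/2 = 2.5 virtual addresses, so a physical range can straddle a chunk
 * boundary and its resync must be split into more than one request.
 */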
1977 /* 1998 /*
1978 * We check if all blocks are in-sync and only write to blocks that 1999 * We check if all blocks are in-sync and only write to blocks that
1979 * aren't in sync 2000 * aren't in sync
1980 */ 2001 */
1981 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2002 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1982 { 2003 {
1983 struct r10conf *conf = mddev->private; 2004 struct r10conf *conf = mddev->private;
1984 int i, first; 2005 int i, first;
1985 struct bio *tbio, *fbio; 2006 struct bio *tbio, *fbio;
1986 int vcnt; 2007 int vcnt;
1987 2008
1988 atomic_set(&r10_bio->remaining, 1); 2009 atomic_set(&r10_bio->remaining, 1);
1989 2010
1990 /* find the first device with a block */ 2011 /* find the first device with a block */
1991 for (i=0; i<conf->copies; i++) 2012 for (i=0; i<conf->copies; i++)
1992 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) 2013 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1993 break; 2014 break;
1994 2015
1995 if (i == conf->copies) 2016 if (i == conf->copies)
1996 goto done; 2017 goto done;
1997 2018
1998 first = i; 2019 first = i;
1999 fbio = r10_bio->devs[i].bio; 2020 fbio = r10_bio->devs[i].bio;
2000 2021
2001 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 2022 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
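/* Editorial worked example (assuming 4KiB pages, so PAGE_SIZE >> 9 == 8
 * sectors per page): r10_bio->sectors == 20 gives
 * vcnt = (20 + 8 - 1) >> (PAGE_SHIFT - 9) = 27 >> 3 = 3 bio_vec pages.
 */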
2002 /* now find blocks with errors */ 2023 /* now find blocks with errors */
2003 for (i=0 ; i < conf->copies ; i++) { 2024 for (i=0 ; i < conf->copies ; i++) {
2004 int j, d; 2025 int j, d;
2005 2026
2006 tbio = r10_bio->devs[i].bio; 2027 tbio = r10_bio->devs[i].bio;
2007 2028
2008 if (tbio->bi_end_io != end_sync_read) 2029 if (tbio->bi_end_io != end_sync_read)
2009 continue; 2030 continue;
2010 if (i == first) 2031 if (i == first)
2011 continue; 2032 continue;
2012 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) { 2033 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2013 /* We know that the bi_io_vec layout is the same for 2034 /* We know that the bi_io_vec layout is the same for
2014 * both 'first' and 'i', so we just compare them. 2035 * both 'first' and 'i', so we just compare them.
2015 * All vec entries are PAGE_SIZE; 2036 * All vec entries are PAGE_SIZE;
2016 */ 2037 */
2017 for (j = 0; j < vcnt; j++) 2038 for (j = 0; j < vcnt; j++)
2018 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 2039 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2019 page_address(tbio->bi_io_vec[j].bv_page), 2040 page_address(tbio->bi_io_vec[j].bv_page),
2020 fbio->bi_io_vec[j].bv_len)) 2041 fbio->bi_io_vec[j].bv_len))
2021 break; 2042 break;
2022 if (j == vcnt) 2043 if (j == vcnt)
2023 continue; 2044 continue;
2024 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2045 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2025 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2046 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2026 /* Don't fix anything. */ 2047 /* Don't fix anything. */
2027 continue; 2048 continue;
2028 } 2049 }
2029 /* Ok, we need to write this bio, either to correct an 2050 /* Ok, we need to write this bio, either to correct an
2030 * inconsistency or to correct an unreadable block. 2051 * inconsistency or to correct an unreadable block.
2031 * First we need to fixup bv_offset, bv_len and 2052 * First we need to fixup bv_offset, bv_len and
2032 * bi_vecs, as the read request might have corrupted these 2053 * bi_vecs, as the read request might have corrupted these
2033 */ 2054 */
2034 tbio->bi_vcnt = vcnt; 2055 tbio->bi_vcnt = vcnt;
2035 tbio->bi_size = r10_bio->sectors << 9; 2056 tbio->bi_size = r10_bio->sectors << 9;
2036 tbio->bi_idx = 0; 2057 tbio->bi_idx = 0;
2037 tbio->bi_phys_segments = 0; 2058 tbio->bi_phys_segments = 0;
2038 tbio->bi_flags &= ~(BIO_POOL_MASK - 1); 2059 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
2039 tbio->bi_flags |= 1 << BIO_UPTODATE; 2060 tbio->bi_flags |= 1 << BIO_UPTODATE;
2040 tbio->bi_next = NULL; 2061 tbio->bi_next = NULL;
2041 tbio->bi_rw = WRITE; 2062 tbio->bi_rw = WRITE;
2042 tbio->bi_private = r10_bio; 2063 tbio->bi_private = r10_bio;
2043 tbio->bi_sector = r10_bio->devs[i].addr; 2064 tbio->bi_sector = r10_bio->devs[i].addr;
2044 2065
2045 for (j=0; j < vcnt ; j++) { 2066 for (j=0; j < vcnt ; j++) {
2046 tbio->bi_io_vec[j].bv_offset = 0; 2067 tbio->bi_io_vec[j].bv_offset = 0;
2047 tbio->bi_io_vec[j].bv_len = PAGE_SIZE; 2068 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2048 2069
2049 memcpy(page_address(tbio->bi_io_vec[j].bv_page), 2070 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2050 page_address(fbio->bi_io_vec[j].bv_page), 2071 page_address(fbio->bi_io_vec[j].bv_page),
2051 PAGE_SIZE); 2072 PAGE_SIZE);
2052 } 2073 }
2053 tbio->bi_end_io = end_sync_write; 2074 tbio->bi_end_io = end_sync_write;
2054 2075
2055 d = r10_bio->devs[i].devnum; 2076 d = r10_bio->devs[i].devnum;
2056 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2077 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2057 atomic_inc(&r10_bio->remaining); 2078 atomic_inc(&r10_bio->remaining);
2058 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); 2079 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
2059 2080
2060 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2081 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2061 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2082 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2062 generic_make_request(tbio); 2083 generic_make_request(tbio);
2063 } 2084 }
2064 2085
2065 /* Now write out to any replacement devices 2086 /* Now write out to any replacement devices
2066 * that are active 2087 * that are active
2067 */ 2088 */
2068 for (i = 0; i < conf->copies; i++) { 2089 for (i = 0; i < conf->copies; i++) {
2069 int j, d; 2090 int j, d;
2070 2091
2071 tbio = r10_bio->devs[i].repl_bio; 2092 tbio = r10_bio->devs[i].repl_bio;
2072 if (!tbio || !tbio->bi_end_io) 2093 if (!tbio || !tbio->bi_end_io)
2073 continue; 2094 continue;
2074 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 2095 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2075 && r10_bio->devs[i].bio != fbio) 2096 && r10_bio->devs[i].bio != fbio)
2076 for (j = 0; j < vcnt; j++) 2097 for (j = 0; j < vcnt; j++)
2077 memcpy(page_address(tbio->bi_io_vec[j].bv_page), 2098 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2078 page_address(fbio->bi_io_vec[j].bv_page), 2099 page_address(fbio->bi_io_vec[j].bv_page),
2079 PAGE_SIZE); 2100 PAGE_SIZE);
2080 d = r10_bio->devs[i].devnum; 2101 d = r10_bio->devs[i].devnum;
2081 atomic_inc(&r10_bio->remaining); 2102 atomic_inc(&r10_bio->remaining);
2082 md_sync_acct(conf->mirrors[d].replacement->bdev, 2103 md_sync_acct(conf->mirrors[d].replacement->bdev,
2083 tbio->bi_size >> 9); 2104 tbio->bi_size >> 9);
2084 generic_make_request(tbio); 2105 generic_make_request(tbio);
2085 } 2106 }
2086 2107
2087 done: 2108 done:
2088 if (atomic_dec_and_test(&r10_bio->remaining)) { 2109 if (atomic_dec_and_test(&r10_bio->remaining)) {
2089 md_done_sync(mddev, r10_bio->sectors, 1); 2110 md_done_sync(mddev, r10_bio->sectors, 1);
2090 put_buf(r10_bio); 2111 put_buf(r10_bio);
2091 } 2112 }
2092 } 2113 }
2093 2114
2094 /* 2115 /*
2095 * Now for the recovery code. 2116 * Now for the recovery code.
2096 * Recovery happens across physical sectors. 2117 * Recovery happens across physical sectors.
2097 * We recover all non-in_sync drives by finding the virtual address of 2118 * We recover all non-in_sync drives by finding the virtual address of
2098 * each, and then choose a working drive that also has that virt address. 2119 * each, and then choose a working drive that also has that virt address.
2099 * There is a separate r10_bio for each non-in_sync drive. 2120 * There is a separate r10_bio for each non-in_sync drive.
2100 * Only the first two slots are in use: the first for reading, 2121 * Only the first two slots are in use: the first for reading,
2101 * the second for writing. 2122 * the second for writing.
2102 * 2123 *
2103 */ 2124 */
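/* Editorial note on the function below: r10_bio->devs[0] is the read
 * slot (a working drive holding the same virtual address, 'dr') and
 * r10_bio->devs[1] is the write slot (the drive being recovered, 'dw'),
 * matching the two slots described above.
 */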
2104 static void fix_recovery_read_error(struct r10bio *r10_bio) 2125 static void fix_recovery_read_error(struct r10bio *r10_bio)
2105 { 2126 {
2106 /* We got a read error during recovery. 2127 /* We got a read error during recovery.
2107 * We repeat the read in smaller page-sized sections. 2128 * We repeat the read in smaller page-sized sections.
2108 * If a read succeeds, write it to the new device or record 2129 * If a read succeeds, write it to the new device or record
2109 * a bad block if we cannot. 2130 * a bad block if we cannot.
2110 * If a read fails, record a bad block on both old and 2131 * If a read fails, record a bad block on both old and
2111 * new devices. 2132 * new devices.
2112 */ 2133 */
2113 struct mddev *mddev = r10_bio->mddev; 2134 struct mddev *mddev = r10_bio->mddev;
2114 struct r10conf *conf = mddev->private; 2135 struct r10conf *conf = mddev->private;
2115 struct bio *bio = r10_bio->devs[0].bio; 2136 struct bio *bio = r10_bio->devs[0].bio;
2116 sector_t sect = 0; 2137 sector_t sect = 0;
2117 int sectors = r10_bio->sectors; 2138 int sectors = r10_bio->sectors;
2118 int idx = 0; 2139 int idx = 0;
2119 int dr = r10_bio->devs[0].devnum; 2140 int dr = r10_bio->devs[0].devnum;
2120 int dw = r10_bio->devs[1].devnum; 2141 int dw = r10_bio->devs[1].devnum;
2121 2142
2122 while (sectors) { 2143 while (sectors) {
2123 int s = sectors; 2144 int s = sectors;
2124 struct md_rdev *rdev; 2145 struct md_rdev *rdev;
2125 sector_t addr; 2146 sector_t addr;
2126 int ok; 2147 int ok;
2127 2148
2128 if (s > (PAGE_SIZE>>9)) 2149 if (s > (PAGE_SIZE>>9))
2129 s = PAGE_SIZE >> 9; 2150 s = PAGE_SIZE >> 9;
2130 2151
2131 rdev = conf->mirrors[dr].rdev; 2152 rdev = conf->mirrors[dr].rdev;
2132 addr = r10_bio->devs[0].addr + sect, 2153 addr = r10_bio->devs[0].addr + sect,
2133 ok = sync_page_io(rdev, 2154 ok = sync_page_io(rdev,
2134 addr, 2155 addr,
2135 s << 9, 2156 s << 9,
2136 bio->bi_io_vec[idx].bv_page, 2157 bio->bi_io_vec[idx].bv_page,
2137 READ, false); 2158 READ, false);
2138 if (ok) { 2159 if (ok) {
2139 rdev = conf->mirrors[dw].rdev; 2160 rdev = conf->mirrors[dw].rdev;
2140 addr = r10_bio->devs[1].addr + sect; 2161 addr = r10_bio->devs[1].addr + sect;
2141 ok = sync_page_io(rdev, 2162 ok = sync_page_io(rdev,
2142 addr, 2163 addr,
2143 s << 9, 2164 s << 9,
2144 bio->bi_io_vec[idx].bv_page, 2165 bio->bi_io_vec[idx].bv_page,
2145 WRITE, false); 2166 WRITE, false);
2146 if (!ok) { 2167 if (!ok) {
2147 set_bit(WriteErrorSeen, &rdev->flags); 2168 set_bit(WriteErrorSeen, &rdev->flags);
2148 if (!test_and_set_bit(WantReplacement, 2169 if (!test_and_set_bit(WantReplacement,
2149 &rdev->flags)) 2170 &rdev->flags))
2150 set_bit(MD_RECOVERY_NEEDED, 2171 set_bit(MD_RECOVERY_NEEDED,
2151 &rdev->mddev->recovery); 2172 &rdev->mddev->recovery);
2152 } 2173 }
2153 } 2174 }
2154 if (!ok) { 2175 if (!ok) {
2155 /* We don't worry if we cannot set a bad block - 2176 /* We don't worry if we cannot set a bad block -
2156 * it really is bad so there is no loss in not 2177 * it really is bad so there is no loss in not
2157 * recording it yet 2178 * recording it yet
2158 */ 2179 */
2159 rdev_set_badblocks(rdev, addr, s, 0); 2180 rdev_set_badblocks(rdev, addr, s, 0);
2160 2181
2161 if (rdev != conf->mirrors[dw].rdev) { 2182 if (rdev != conf->mirrors[dw].rdev) {
2162 /* need bad block on destination too */ 2183 /* need bad block on destination too */
2163 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2184 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2164 addr = r10_bio->devs[1].addr + sect; 2185 addr = r10_bio->devs[1].addr + sect;
2165 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2186 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2166 if (!ok) { 2187 if (!ok) {
2167 /* just abort the recovery */ 2188 /* just abort the recovery */
2168 printk(KERN_NOTICE 2189 printk(KERN_NOTICE
2169 "md/raid10:%s: recovery aborted" 2190 "md/raid10:%s: recovery aborted"
2170 " due to read error\n", 2191 " due to read error\n",
2171 mdname(mddev)); 2192 mdname(mddev));
2172 2193
2173 conf->mirrors[dw].recovery_disabled 2194 conf->mirrors[dw].recovery_disabled
2174 = mddev->recovery_disabled; 2195 = mddev->recovery_disabled;
2175 set_bit(MD_RECOVERY_INTR, 2196 set_bit(MD_RECOVERY_INTR,
2176 &mddev->recovery); 2197 &mddev->recovery);
2177 break; 2198 break;
2178 } 2199 }
2179 } 2200 }
2180 } 2201 }
2181 2202
2182 sectors -= s; 2203 sectors -= s;
2183 sect += s; 2204 sect += s;
2184 idx++; 2205 idx++;
2185 } 2206 }
2186 } 2207 }
2187 2208
2188 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2209 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2189 { 2210 {
2190 struct r10conf *conf = mddev->private; 2211 struct r10conf *conf = mddev->private;
2191 int d; 2212 int d;
2192 struct bio *wbio, *wbio2; 2213 struct bio *wbio, *wbio2;
2193 2214
2194 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2215 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2195 fix_recovery_read_error(r10_bio); 2216 fix_recovery_read_error(r10_bio);
2196 end_sync_request(r10_bio); 2217 end_sync_request(r10_bio);
2197 return; 2218 return;
2198 } 2219 }
2199 2220
2200 /* 2221 /*
2201 * share the pages with the first bio 2222 * share the pages with the first bio
2202 * and submit the write request 2223 * and submit the write request
2203 */ 2224 */
2204 d = r10_bio->devs[1].devnum; 2225 d = r10_bio->devs[1].devnum;
2205 wbio = r10_bio->devs[1].bio; 2226 wbio = r10_bio->devs[1].bio;
2206 wbio2 = r10_bio->devs[1].repl_bio; 2227 wbio2 = r10_bio->devs[1].repl_bio;
2207 if (wbio->bi_end_io) { 2228 if (wbio->bi_end_io) {
2208 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2229 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2209 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 2230 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2210 generic_make_request(wbio); 2231 generic_make_request(wbio);
2211 } 2232 }
2212 if (wbio2 && wbio2->bi_end_io) { 2233 if (wbio2 && wbio2->bi_end_io) {
2213 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2234 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2214 md_sync_acct(conf->mirrors[d].replacement->bdev, 2235 md_sync_acct(conf->mirrors[d].replacement->bdev,
2215 wbio2->bi_size >> 9); 2236 wbio2->bi_size >> 9);
2216 generic_make_request(wbio2); 2237 generic_make_request(wbio2);
2217 } 2238 }
2218 } 2239 }
2219 2240
2220 2241
2221 /* 2242 /*
2222 * Used by fix_read_error() to decay the per rdev read_errors. 2243 * Used by fix_read_error() to decay the per rdev read_errors.
2223 * We halve the read error count for every hour that has elapsed 2244 * We halve the read error count for every hour that has elapsed
2224 * since the last recorded read error. 2245 * since the last recorded read error.
2225 * 2246 *
2226 */ 2247 */
2227 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2248 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2228 { 2249 {
2229 struct timespec cur_time_mon; 2250 struct timespec cur_time_mon;
2230 unsigned long hours_since_last; 2251 unsigned long hours_since_last;
2231 unsigned int read_errors = atomic_read(&rdev->read_errors); 2252 unsigned int read_errors = atomic_read(&rdev->read_errors);
2232 2253
2233 ktime_get_ts(&cur_time_mon); 2254 ktime_get_ts(&cur_time_mon);
2234 2255
2235 if (rdev->last_read_error.tv_sec == 0 && 2256 if (rdev->last_read_error.tv_sec == 0 &&
2236 rdev->last_read_error.tv_nsec == 0) { 2257 rdev->last_read_error.tv_nsec == 0) {
2237 /* first time we've seen a read error */ 2258 /* first time we've seen a read error */
2238 rdev->last_read_error = cur_time_mon; 2259 rdev->last_read_error = cur_time_mon;
2239 return; 2260 return;
2240 } 2261 }
2241 2262
2242 hours_since_last = (cur_time_mon.tv_sec - 2263 hours_since_last = (cur_time_mon.tv_sec -
2243 rdev->last_read_error.tv_sec) / 3600; 2264 rdev->last_read_error.tv_sec) / 3600;
2244 2265
2245 rdev->last_read_error = cur_time_mon; 2266 rdev->last_read_error = cur_time_mon;
2246 2267
2247 /* 2268 /*
2248 * if hours_since_last is > the number of bits in read_errors 2269 * if hours_since_last is > the number of bits in read_errors
2249 * just set read errors to 0. We do this to avoid 2270 * just set read errors to 0. We do this to avoid
2250 * overflowing the shift of read_errors by hours_since_last. 2271 * overflowing the shift of read_errors by hours_since_last.
2251 */ 2272 */
2252 if (hours_since_last >= 8 * sizeof(read_errors)) 2273 if (hours_since_last >= 8 * sizeof(read_errors))
2253 atomic_set(&rdev->read_errors, 0); 2274 atomic_set(&rdev->read_errors, 0);
2254 else 2275 else
2255 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2276 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2256 } 2277 }
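Below is a minimal userspace sketch of the same decay rule, included here as an editorial illustration only (the helper name and the printf harness are invented for the example and are not part of the driver):

	#include <stdio.h>

	/* Halve the error count once per elapsed hour; reset it to zero when
	 * the shift would reach the bit width of the counter, mirroring the
	 * overflow check in check_decay_read_errors() above.
	 */
	static unsigned int decay_read_errors(unsigned int read_errors,
					      unsigned long hours_since_last)
	{
		if (hours_since_last >= 8 * sizeof(read_errors))
			return 0;
		return read_errors >> hours_since_last;
	}

	int main(void)
	{
		printf("%u\n", decay_read_errors(40, 3));	/* prints 5 */
		printf("%u\n", decay_read_errors(40, 40));	/* prints 0 */
		return 0;
	}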
2257 2278
2258 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2279 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2259 int sectors, struct page *page, int rw) 2280 int sectors, struct page *page, int rw)
2260 { 2281 {
2261 sector_t first_bad; 2282 sector_t first_bad;
2262 int bad_sectors; 2283 int bad_sectors;
2263 2284
2264 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2285 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2265 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) 2286 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2266 return -1; 2287 return -1;
2267 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 2288 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2268 /* success */ 2289 /* success */
2269 return 1; 2290 return 1;
2270 if (rw == WRITE) { 2291 if (rw == WRITE) {
2271 set_bit(WriteErrorSeen, &rdev->flags); 2292 set_bit(WriteErrorSeen, &rdev->flags);
2272 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2293 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2273 set_bit(MD_RECOVERY_NEEDED, 2294 set_bit(MD_RECOVERY_NEEDED,
2274 &rdev->mddev->recovery); 2295 &rdev->mddev->recovery);
2275 } 2296 }
2276 /* need to record an error - either for the block or the device */ 2297 /* need to record an error - either for the block or the device */
2277 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2298 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2278 md_error(rdev->mddev, rdev); 2299 md_error(rdev->mddev, rdev);
2279 return 0; 2300 return 0;
2280 } 2301 }
2281 2302
2282 /* 2303 /*
2283 * This is a kernel thread which: 2304 * This is a kernel thread which:
2284 * 2305 *
2285 * 1. Retries failed read operations on working mirrors. 2306 * 1. Retries failed read operations on working mirrors.
2286 * 2. Updates the raid superblock when problems are encountered. 2307 * 2. Updates the raid superblock when problems are encountered.
2287 * 3. Performs writes following reads for array synchronising. 2308 * 3. Performs writes following reads for array synchronising.
2288 */ 2309 */
2289 2310
2290 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2311 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2291 { 2312 {
2292 int sect = 0; /* Offset from r10_bio->sector */ 2313 int sect = 0; /* Offset from r10_bio->sector */
2293 int sectors = r10_bio->sectors; 2314 int sectors = r10_bio->sectors;
2294 struct md_rdev*rdev; 2315 struct md_rdev*rdev;
2295 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2316 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2296 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2317 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2297 2318
2298 /* still own a reference to this rdev, so it cannot 2319 /* still own a reference to this rdev, so it cannot
2299 * have been cleared recently. 2320 * have been cleared recently.
2300 */ 2321 */
2301 rdev = conf->mirrors[d].rdev; 2322 rdev = conf->mirrors[d].rdev;
2302 2323
2303 if (test_bit(Faulty, &rdev->flags)) 2324 if (test_bit(Faulty, &rdev->flags))
2304 /* drive has already been failed, just ignore any 2325 /* drive has already been failed, just ignore any
2305 more fix_read_error() attempts */ 2326 more fix_read_error() attempts */
2306 return; 2327 return;
2307 2328
2308 check_decay_read_errors(mddev, rdev); 2329 check_decay_read_errors(mddev, rdev);
2309 atomic_inc(&rdev->read_errors); 2330 atomic_inc(&rdev->read_errors);
2310 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2331 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2311 char b[BDEVNAME_SIZE]; 2332 char b[BDEVNAME_SIZE];
2312 bdevname(rdev->bdev, b); 2333 bdevname(rdev->bdev, b);
2313 2334
2314 printk(KERN_NOTICE 2335 printk(KERN_NOTICE
2315 "md/raid10:%s: %s: Raid device exceeded " 2336 "md/raid10:%s: %s: Raid device exceeded "
2316 "read_error threshold [cur %d:max %d]\n", 2337 "read_error threshold [cur %d:max %d]\n",
2317 mdname(mddev), b, 2338 mdname(mddev), b,
2318 atomic_read(&rdev->read_errors), max_read_errors); 2339 atomic_read(&rdev->read_errors), max_read_errors);
2319 printk(KERN_NOTICE 2340 printk(KERN_NOTICE
2320 "md/raid10:%s: %s: Failing raid device\n", 2341 "md/raid10:%s: %s: Failing raid device\n",
2321 mdname(mddev), b); 2342 mdname(mddev), b);
2322 md_error(mddev, conf->mirrors[d].rdev); 2343 md_error(mddev, conf->mirrors[d].rdev);
2323 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2344 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2324 return; 2345 return;
2325 } 2346 }
2326 2347
2327 while(sectors) { 2348 while(sectors) {
2328 int s = sectors; 2349 int s = sectors;
2329 int sl = r10_bio->read_slot; 2350 int sl = r10_bio->read_slot;
2330 int success = 0; 2351 int success = 0;
2331 int start; 2352 int start;
2332 2353
2333 if (s > (PAGE_SIZE>>9)) 2354 if (s > (PAGE_SIZE>>9))
2334 s = PAGE_SIZE >> 9; 2355 s = PAGE_SIZE >> 9;
2335 2356
2336 rcu_read_lock(); 2357 rcu_read_lock();
2337 do { 2358 do {
2338 sector_t first_bad; 2359 sector_t first_bad;
2339 int bad_sectors; 2360 int bad_sectors;
2340 2361
2341 d = r10_bio->devs[sl].devnum; 2362 d = r10_bio->devs[sl].devnum;
2342 rdev = rcu_dereference(conf->mirrors[d].rdev); 2363 rdev = rcu_dereference(conf->mirrors[d].rdev);
2343 if (rdev && 2364 if (rdev &&
2344 !test_bit(Unmerged, &rdev->flags) && 2365 !test_bit(Unmerged, &rdev->flags) &&
2345 test_bit(In_sync, &rdev->flags) && 2366 test_bit(In_sync, &rdev->flags) &&
2346 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2367 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2347 &first_bad, &bad_sectors) == 0) { 2368 &first_bad, &bad_sectors) == 0) {
2348 atomic_inc(&rdev->nr_pending); 2369 atomic_inc(&rdev->nr_pending);
2349 rcu_read_unlock(); 2370 rcu_read_unlock();
2350 success = sync_page_io(rdev, 2371 success = sync_page_io(rdev,
2351 r10_bio->devs[sl].addr + 2372 r10_bio->devs[sl].addr +
2352 sect, 2373 sect,
2353 s<<9, 2374 s<<9,
2354 conf->tmppage, READ, false); 2375 conf->tmppage, READ, false);
2355 rdev_dec_pending(rdev, mddev); 2376 rdev_dec_pending(rdev, mddev);
2356 rcu_read_lock(); 2377 rcu_read_lock();
2357 if (success) 2378 if (success)
2358 break; 2379 break;
2359 } 2380 }
2360 sl++; 2381 sl++;
2361 if (sl == conf->copies) 2382 if (sl == conf->copies)
2362 sl = 0; 2383 sl = 0;
2363 } while (!success && sl != r10_bio->read_slot); 2384 } while (!success && sl != r10_bio->read_slot);
2364 rcu_read_unlock(); 2385 rcu_read_unlock();
2365 2386
2366 if (!success) { 2387 if (!success) {
2367 /* Cannot read from anywhere, just mark the block 2388 /* Cannot read from anywhere, just mark the block
2368 * as bad on the first device to discourage future 2389 * as bad on the first device to discourage future
2369 * reads. 2390 * reads.
2370 */ 2391 */
2371 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2392 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2372 rdev = conf->mirrors[dn].rdev; 2393 rdev = conf->mirrors[dn].rdev;
2373 2394
2374 if (!rdev_set_badblocks( 2395 if (!rdev_set_badblocks(
2375 rdev, 2396 rdev,
2376 r10_bio->devs[r10_bio->read_slot].addr 2397 r10_bio->devs[r10_bio->read_slot].addr
2377 + sect, 2398 + sect,
2378 s, 0)) { 2399 s, 0)) {
2379 md_error(mddev, rdev); 2400 md_error(mddev, rdev);
2380 r10_bio->devs[r10_bio->read_slot].bio 2401 r10_bio->devs[r10_bio->read_slot].bio
2381 = IO_BLOCKED; 2402 = IO_BLOCKED;
2382 } 2403 }
2383 break; 2404 break;
2384 } 2405 }
2385 2406
2386 start = sl; 2407 start = sl;
2387 /* write it back and re-read */ 2408 /* write it back and re-read */
2388 rcu_read_lock(); 2409 rcu_read_lock();
2389 while (sl != r10_bio->read_slot) { 2410 while (sl != r10_bio->read_slot) {
2390 char b[BDEVNAME_SIZE]; 2411 char b[BDEVNAME_SIZE];
2391 2412
2392 if (sl==0) 2413 if (sl==0)
2393 sl = conf->copies; 2414 sl = conf->copies;
2394 sl--; 2415 sl--;
2395 d = r10_bio->devs[sl].devnum; 2416 d = r10_bio->devs[sl].devnum;
2396 rdev = rcu_dereference(conf->mirrors[d].rdev); 2417 rdev = rcu_dereference(conf->mirrors[d].rdev);
2397 if (!rdev || 2418 if (!rdev ||
2398 test_bit(Unmerged, &rdev->flags) || 2419 test_bit(Unmerged, &rdev->flags) ||
2399 !test_bit(In_sync, &rdev->flags)) 2420 !test_bit(In_sync, &rdev->flags))
2400 continue; 2421 continue;
2401 2422
2402 atomic_inc(&rdev->nr_pending); 2423 atomic_inc(&rdev->nr_pending);
2403 rcu_read_unlock(); 2424 rcu_read_unlock();
2404 if (r10_sync_page_io(rdev, 2425 if (r10_sync_page_io(rdev,
2405 r10_bio->devs[sl].addr + 2426 r10_bio->devs[sl].addr +
2406 sect, 2427 sect,
2407 s, conf->tmppage, WRITE) 2428 s, conf->tmppage, WRITE)
2408 == 0) { 2429 == 0) {
2409 /* Well, this device is dead */ 2430 /* Well, this device is dead */
2410 printk(KERN_NOTICE 2431 printk(KERN_NOTICE
2411 "md/raid10:%s: read correction " 2432 "md/raid10:%s: read correction "
2412 "write failed" 2433 "write failed"
2413 " (%d sectors at %llu on %s)\n", 2434 " (%d sectors at %llu on %s)\n",
2414 mdname(mddev), s, 2435 mdname(mddev), s,
2415 (unsigned long long)( 2436 (unsigned long long)(
2416 sect + 2437 sect +
2417 choose_data_offset(r10_bio, 2438 choose_data_offset(r10_bio,
2418 rdev)), 2439 rdev)),
2419 bdevname(rdev->bdev, b)); 2440 bdevname(rdev->bdev, b));
2420 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2441 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2421 "drive\n", 2442 "drive\n",
2422 mdname(mddev), 2443 mdname(mddev),
2423 bdevname(rdev->bdev, b)); 2444 bdevname(rdev->bdev, b));
2424 } 2445 }
2425 rdev_dec_pending(rdev, mddev); 2446 rdev_dec_pending(rdev, mddev);
2426 rcu_read_lock(); 2447 rcu_read_lock();
2427 } 2448 }
2428 sl = start; 2449 sl = start;
2429 while (sl != r10_bio->read_slot) { 2450 while (sl != r10_bio->read_slot) {
2430 char b[BDEVNAME_SIZE]; 2451 char b[BDEVNAME_SIZE];
2431 2452
2432 if (sl==0) 2453 if (sl==0)
2433 sl = conf->copies; 2454 sl = conf->copies;
2434 sl--; 2455 sl--;
2435 d = r10_bio->devs[sl].devnum; 2456 d = r10_bio->devs[sl].devnum;
2436 rdev = rcu_dereference(conf->mirrors[d].rdev); 2457 rdev = rcu_dereference(conf->mirrors[d].rdev);
2437 if (!rdev || 2458 if (!rdev ||
2438 !test_bit(In_sync, &rdev->flags)) 2459 !test_bit(In_sync, &rdev->flags))
2439 continue; 2460 continue;
2440 2461
2441 atomic_inc(&rdev->nr_pending); 2462 atomic_inc(&rdev->nr_pending);
2442 rcu_read_unlock(); 2463 rcu_read_unlock();
2443 switch (r10_sync_page_io(rdev, 2464 switch (r10_sync_page_io(rdev,
2444 r10_bio->devs[sl].addr + 2465 r10_bio->devs[sl].addr +
2445 sect, 2466 sect,
2446 s, conf->tmppage, 2467 s, conf->tmppage,
2447 READ)) { 2468 READ)) {
2448 case 0: 2469 case 0:
2449 /* Well, this device is dead */ 2470 /* Well, this device is dead */
2450 printk(KERN_NOTICE 2471 printk(KERN_NOTICE
2451 "md/raid10:%s: unable to read back " 2472 "md/raid10:%s: unable to read back "
2452 "corrected sectors" 2473 "corrected sectors"
2453 " (%d sectors at %llu on %s)\n", 2474 " (%d sectors at %llu on %s)\n",
2454 mdname(mddev), s, 2475 mdname(mddev), s,
2455 (unsigned long long)( 2476 (unsigned long long)(
2456 sect + 2477 sect +
2457 choose_data_offset(r10_bio, rdev)), 2478 choose_data_offset(r10_bio, rdev)),
2458 bdevname(rdev->bdev, b)); 2479 bdevname(rdev->bdev, b));
2459 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2480 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2460 "drive\n", 2481 "drive\n",
2461 mdname(mddev), 2482 mdname(mddev),
2462 bdevname(rdev->bdev, b)); 2483 bdevname(rdev->bdev, b));
2463 break; 2484 break;
2464 case 1: 2485 case 1:
2465 printk(KERN_INFO 2486 printk(KERN_INFO
2466 "md/raid10:%s: read error corrected" 2487 "md/raid10:%s: read error corrected"
2467 " (%d sectors at %llu on %s)\n", 2488 " (%d sectors at %llu on %s)\n",
2468 mdname(mddev), s, 2489 mdname(mddev), s,
2469 (unsigned long long)( 2490 (unsigned long long)(
2470 sect + 2491 sect +
2471 choose_data_offset(r10_bio, rdev)), 2492 choose_data_offset(r10_bio, rdev)),
2472 bdevname(rdev->bdev, b)); 2493 bdevname(rdev->bdev, b));
2473 atomic_add(s, &rdev->corrected_errors); 2494 atomic_add(s, &rdev->corrected_errors);
2474 } 2495 }
2475 2496
2476 rdev_dec_pending(rdev, mddev); 2497 rdev_dec_pending(rdev, mddev);
2477 rcu_read_lock(); 2498 rcu_read_lock();
2478 } 2499 }
2479 rcu_read_unlock(); 2500 rcu_read_unlock();
2480 2501
2481 sectors -= s; 2502 sectors -= s;
2482 sect += s; 2503 sect += s;
2483 } 2504 }
2484 } 2505 }
2485 2506
2486 static void bi_complete(struct bio *bio, int error) 2507 static void bi_complete(struct bio *bio, int error)
2487 { 2508 {
2488 complete((struct completion *)bio->bi_private); 2509 complete((struct completion *)bio->bi_private);
2489 } 2510 }
2490 2511
2491 static int submit_bio_wait(int rw, struct bio *bio) 2512 static int submit_bio_wait(int rw, struct bio *bio)
2492 { 2513 {
2493 struct completion event; 2514 struct completion event;
2494 rw |= REQ_SYNC; 2515 rw |= REQ_SYNC;
2495 2516
2496 init_completion(&event); 2517 init_completion(&event);
2497 bio->bi_private = &event; 2518 bio->bi_private = &event;
2498 bio->bi_end_io = bi_complete; 2519 bio->bi_end_io = bi_complete;
2499 submit_bio(rw, bio); 2520 submit_bio(rw, bio);
2500 wait_for_completion(&event); 2521 wait_for_completion(&event);
2501 2522
2502 return test_bit(BIO_UPTODATE, &bio->bi_flags); 2523 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2503 } 2524 }
2504 2525
2505 static int narrow_write_error(struct r10bio *r10_bio, int i) 2526 static int narrow_write_error(struct r10bio *r10_bio, int i)
2506 { 2527 {
2507 struct bio *bio = r10_bio->master_bio; 2528 struct bio *bio = r10_bio->master_bio;
2508 struct mddev *mddev = r10_bio->mddev; 2529 struct mddev *mddev = r10_bio->mddev;
2509 struct r10conf *conf = mddev->private; 2530 struct r10conf *conf = mddev->private;
2510 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2531 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2511 /* bio has the data to be written to slot 'i' where 2532 /* bio has the data to be written to slot 'i' where
2512 * we just recently had a write error. 2533 * we just recently had a write error.
2513 * We repeatedly clone the bio and trim down to one block, 2534 * We repeatedly clone the bio and trim down to one block,
2514 * then try the write. Where the write fails we record 2535 * then try the write. Where the write fails we record
2515 * a bad block. 2536 * a bad block.
2516 * It is conceivable that the bio doesn't exactly align with 2537 * It is conceivable that the bio doesn't exactly align with
2517 * blocks. We must handle this. 2538 * blocks. We must handle this.
2518 * 2539 *
2519 * We currently own a reference to the rdev. 2540 * We currently own a reference to the rdev.
2520 */ 2541 */
2521 2542
2522 int block_sectors; 2543 int block_sectors;
2523 sector_t sector; 2544 sector_t sector;
2524 int sectors; 2545 int sectors;
2525 int sect_to_write = r10_bio->sectors; 2546 int sect_to_write = r10_bio->sectors;
2526 int ok = 1; 2547 int ok = 1;
2527 2548
2528 if (rdev->badblocks.shift < 0) 2549 if (rdev->badblocks.shift < 0)
2529 return 0; 2550 return 0;
2530 2551
2531 block_sectors = 1 << rdev->badblocks.shift; 2552 block_sectors = 1 << rdev->badblocks.shift;
2532 sector = r10_bio->sector; 2553 sector = r10_bio->sector;
2533 sectors = ((r10_bio->sector + block_sectors) 2554 sectors = ((r10_bio->sector + block_sectors)
2534 & ~(sector_t)(block_sectors - 1)) 2555 & ~(sector_t)(block_sectors - 1))
2535 - sector; 2556 - sector;
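/* Editorial worked example (illustrative values, not from the source):
 * with rdev->badblocks.shift == 3, block_sectors is 8. For sector == 21
 * the expression above yields ((21 + 8) & ~7) - 21 = 24 - 21 = 3, so the
 * first write is trimmed to 3 sectors and ends on the 8-sector boundary
 * at 24; each later pass then writes a full block of 8 sectors (see
 * 'sectors = block_sectors' at the end of the while loop below).
 */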
2536 2557
2537 while (sect_to_write) { 2558 while (sect_to_write) {
2538 struct bio *wbio; 2559 struct bio *wbio;
2539 if (sectors > sect_to_write) 2560 if (sectors > sect_to_write)
2540 sectors = sect_to_write; 2561 sectors = sect_to_write;
2541 /* Write at 'sector' for 'sectors' */ 2562 /* Write at 'sector' for 'sectors' */
2542 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2563 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2543 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2564 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2544 wbio->bi_sector = (r10_bio->devs[i].addr+ 2565 wbio->bi_sector = (r10_bio->devs[i].addr+
2545 choose_data_offset(r10_bio, rdev) + 2566 choose_data_offset(r10_bio, rdev) +
2546 (sector - r10_bio->sector)); 2567 (sector - r10_bio->sector));
2547 wbio->bi_bdev = rdev->bdev; 2568 wbio->bi_bdev = rdev->bdev;
2548 if (submit_bio_wait(WRITE, wbio) == 0) 2569 if (submit_bio_wait(WRITE, wbio) == 0)
2549 /* Failure! */ 2570 /* Failure! */
2550 ok = rdev_set_badblocks(rdev, sector, 2571 ok = rdev_set_badblocks(rdev, sector,
2551 sectors, 0) 2572 sectors, 0)
2552 && ok; 2573 && ok;
2553 2574
2554 bio_put(wbio); 2575 bio_put(wbio);
2555 sect_to_write -= sectors; 2576 sect_to_write -= sectors;
2556 sector += sectors; 2577 sector += sectors;
2557 sectors = block_sectors; 2578 sectors = block_sectors;
2558 } 2579 }
2559 return ok; 2580 return ok;
2560 } 2581 }
2561 2582
2562 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2583 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2563 { 2584 {
2564 int slot = r10_bio->read_slot; 2585 int slot = r10_bio->read_slot;
2565 struct bio *bio; 2586 struct bio *bio;
2566 struct r10conf *conf = mddev->private; 2587 struct r10conf *conf = mddev->private;
2567 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2588 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2568 char b[BDEVNAME_SIZE]; 2589 char b[BDEVNAME_SIZE];
2569 unsigned long do_sync; 2590 unsigned long do_sync;
2570 int max_sectors; 2591 int max_sectors;
2571 2592
2572 /* we got a read error. Maybe the drive is bad. Maybe just 2593 /* we got a read error. Maybe the drive is bad. Maybe just
2573 * the block and we can fix it. 2594 * the block and we can fix it.
2574 * We freeze all other IO, and try reading the block from 2595 * We freeze all other IO, and try reading the block from
2575 * other devices. When we find one, we re-write 2596 * other devices. When we find one, we re-write
2576 * and check whether that fixes the read error. 2597 * and check whether that fixes the read error.
2577 * This is all done synchronously while the array is 2598 * This is all done synchronously while the array is
2578 * frozen. 2599 * frozen.
2579 */ 2600 */
2580 bio = r10_bio->devs[slot].bio; 2601 bio = r10_bio->devs[slot].bio;
2581 bdevname(bio->bi_bdev, b); 2602 bdevname(bio->bi_bdev, b);
2582 bio_put(bio); 2603 bio_put(bio);
2583 r10_bio->devs[slot].bio = NULL; 2604 r10_bio->devs[slot].bio = NULL;
2584 2605
2585 if (mddev->ro == 0) { 2606 if (mddev->ro == 0) {
2586 freeze_array(conf); 2607 freeze_array(conf);
2587 fix_read_error(conf, mddev, r10_bio); 2608 fix_read_error(conf, mddev, r10_bio);
2588 unfreeze_array(conf); 2609 unfreeze_array(conf);
2589 } else 2610 } else
2590 r10_bio->devs[slot].bio = IO_BLOCKED; 2611 r10_bio->devs[slot].bio = IO_BLOCKED;
2591 2612
2592 rdev_dec_pending(rdev, mddev); 2613 rdev_dec_pending(rdev, mddev);
2593 2614
2594 read_more: 2615 read_more:
2595 rdev = read_balance(conf, r10_bio, &max_sectors); 2616 rdev = read_balance(conf, r10_bio, &max_sectors);
2596 if (rdev == NULL) { 2617 if (rdev == NULL) {
2597 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2618 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2598 " read error for block %llu\n", 2619 " read error for block %llu\n",
2599 mdname(mddev), b, 2620 mdname(mddev), b,
2600 (unsigned long long)r10_bio->sector); 2621 (unsigned long long)r10_bio->sector);
2601 raid_end_bio_io(r10_bio); 2622 raid_end_bio_io(r10_bio);
2602 return; 2623 return;
2603 } 2624 }
2604 2625
2605 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2626 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2606 slot = r10_bio->read_slot; 2627 slot = r10_bio->read_slot;
2607 printk_ratelimited( 2628 printk_ratelimited(
2608 KERN_ERR 2629 KERN_ERR
2609 "md/raid10:%s: %s: redirecting " 2630 "md/raid10:%s: %s: redirecting "
2610 "sector %llu to another mirror\n", 2631 "sector %llu to another mirror\n",
2611 mdname(mddev), 2632 mdname(mddev),
2612 bdevname(rdev->bdev, b), 2633 bdevname(rdev->bdev, b),
2613 (unsigned long long)r10_bio->sector); 2634 (unsigned long long)r10_bio->sector);
2614 bio = bio_clone_mddev(r10_bio->master_bio, 2635 bio = bio_clone_mddev(r10_bio->master_bio,
2615 GFP_NOIO, mddev); 2636 GFP_NOIO, mddev);
2616 md_trim_bio(bio, 2637 md_trim_bio(bio,
2617 r10_bio->sector - bio->bi_sector, 2638 r10_bio->sector - bio->bi_sector,
2618 max_sectors); 2639 max_sectors);
2619 r10_bio->devs[slot].bio = bio; 2640 r10_bio->devs[slot].bio = bio;
2620 r10_bio->devs[slot].rdev = rdev; 2641 r10_bio->devs[slot].rdev = rdev;
2621 bio->bi_sector = r10_bio->devs[slot].addr 2642 bio->bi_sector = r10_bio->devs[slot].addr
2622 + choose_data_offset(r10_bio, rdev); 2643 + choose_data_offset(r10_bio, rdev);
2623 bio->bi_bdev = rdev->bdev; 2644 bio->bi_bdev = rdev->bdev;
2624 bio->bi_rw = READ | do_sync; 2645 bio->bi_rw = READ | do_sync;
2625 bio->bi_private = r10_bio; 2646 bio->bi_private = r10_bio;
2626 bio->bi_end_io = raid10_end_read_request; 2647 bio->bi_end_io = raid10_end_read_request;
2627 if (max_sectors < r10_bio->sectors) { 2648 if (max_sectors < r10_bio->sectors) {
2628 /* Drat - have to split this up more */ 2649 /* Drat - have to split this up more */
2629 struct bio *mbio = r10_bio->master_bio; 2650 struct bio *mbio = r10_bio->master_bio;
2630 int sectors_handled = 2651 int sectors_handled =
2631 r10_bio->sector + max_sectors 2652 r10_bio->sector + max_sectors
2632 - mbio->bi_sector; 2653 - mbio->bi_sector;
2633 r10_bio->sectors = max_sectors; 2654 r10_bio->sectors = max_sectors;
2634 spin_lock_irq(&conf->device_lock); 2655 spin_lock_irq(&conf->device_lock);
2635 if (mbio->bi_phys_segments == 0) 2656 if (mbio->bi_phys_segments == 0)
2636 mbio->bi_phys_segments = 2; 2657 mbio->bi_phys_segments = 2;
2637 else 2658 else
2638 mbio->bi_phys_segments++; 2659 mbio->bi_phys_segments++;
2639 spin_unlock_irq(&conf->device_lock); 2660 spin_unlock_irq(&conf->device_lock);
2640 generic_make_request(bio); 2661 generic_make_request(bio);
2641 2662
2642 r10_bio = mempool_alloc(conf->r10bio_pool, 2663 r10_bio = mempool_alloc(conf->r10bio_pool,
2643 GFP_NOIO); 2664 GFP_NOIO);
2644 r10_bio->master_bio = mbio; 2665 r10_bio->master_bio = mbio;
2645 r10_bio->sectors = (mbio->bi_size >> 9) 2666 r10_bio->sectors = (mbio->bi_size >> 9)
2646 - sectors_handled; 2667 - sectors_handled;
2647 r10_bio->state = 0; 2668 r10_bio->state = 0;
2648 set_bit(R10BIO_ReadError, 2669 set_bit(R10BIO_ReadError,
2649 &r10_bio->state); 2670 &r10_bio->state);
2650 r10_bio->mddev = mddev; 2671 r10_bio->mddev = mddev;
2651 r10_bio->sector = mbio->bi_sector 2672 r10_bio->sector = mbio->bi_sector
2652 + sectors_handled; 2673 + sectors_handled;
2653 2674
2654 goto read_more; 2675 goto read_more;
2655 } else 2676 } else
2656 generic_make_request(bio); 2677 generic_make_request(bio);
2657 } 2678 }
2658 2679
2659 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2680 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2660 { 2681 {
2661 /* Some sort of write request has finished and it 2682 /* Some sort of write request has finished and it
2662 * succeeded in writing where we thought there was a 2683 * succeeded in writing where we thought there was a
2663 * bad block. So forget the bad block. 2684 * bad block. So forget the bad block.
2664 * Or possibly it failed and we need to record 2685 * Or possibly it failed and we need to record
2665 * a bad block. 2686 * a bad block.
2666 */ 2687 */
2667 int m; 2688 int m;
2668 struct md_rdev *rdev; 2689 struct md_rdev *rdev;
2669 2690
2670 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2691 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2671 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2692 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2672 for (m = 0; m < conf->copies; m++) { 2693 for (m = 0; m < conf->copies; m++) {
2673 int dev = r10_bio->devs[m].devnum; 2694 int dev = r10_bio->devs[m].devnum;
2674 rdev = conf->mirrors[dev].rdev; 2695 rdev = conf->mirrors[dev].rdev;
2675 if (r10_bio->devs[m].bio == NULL) 2696 if (r10_bio->devs[m].bio == NULL)
2676 continue; 2697 continue;
2677 if (test_bit(BIO_UPTODATE, 2698 if (test_bit(BIO_UPTODATE,
2678 &r10_bio->devs[m].bio->bi_flags)) { 2699 &r10_bio->devs[m].bio->bi_flags)) {
2679 rdev_clear_badblocks( 2700 rdev_clear_badblocks(
2680 rdev, 2701 rdev,
2681 r10_bio->devs[m].addr, 2702 r10_bio->devs[m].addr,
2682 r10_bio->sectors, 0); 2703 r10_bio->sectors, 0);
2683 } else { 2704 } else {
2684 if (!rdev_set_badblocks( 2705 if (!rdev_set_badblocks(
2685 rdev, 2706 rdev,
2686 r10_bio->devs[m].addr, 2707 r10_bio->devs[m].addr,
2687 r10_bio->sectors, 0)) 2708 r10_bio->sectors, 0))
2688 md_error(conf->mddev, rdev); 2709 md_error(conf->mddev, rdev);
2689 } 2710 }
2690 rdev = conf->mirrors[dev].replacement; 2711 rdev = conf->mirrors[dev].replacement;
2691 if (r10_bio->devs[m].repl_bio == NULL) 2712 if (r10_bio->devs[m].repl_bio == NULL)
2692 continue; 2713 continue;
2693 if (test_bit(BIO_UPTODATE, 2714 if (test_bit(BIO_UPTODATE,
2694 &r10_bio->devs[m].repl_bio->bi_flags)) { 2715 &r10_bio->devs[m].repl_bio->bi_flags)) {
2695 rdev_clear_badblocks( 2716 rdev_clear_badblocks(
2696 rdev, 2717 rdev,
2697 r10_bio->devs[m].addr, 2718 r10_bio->devs[m].addr,
2698 r10_bio->sectors, 0); 2719 r10_bio->sectors, 0);
2699 } else { 2720 } else {
2700 if (!rdev_set_badblocks( 2721 if (!rdev_set_badblocks(
2701 rdev, 2722 rdev,
2702 r10_bio->devs[m].addr, 2723 r10_bio->devs[m].addr,
2703 r10_bio->sectors, 0)) 2724 r10_bio->sectors, 0))
2704 md_error(conf->mddev, rdev); 2725 md_error(conf->mddev, rdev);
2705 } 2726 }
2706 } 2727 }
2707 put_buf(r10_bio); 2728 put_buf(r10_bio);
2708 } else { 2729 } else {
2709 for (m = 0; m < conf->copies; m++) { 2730 for (m = 0; m < conf->copies; m++) {
2710 int dev = r10_bio->devs[m].devnum; 2731 int dev = r10_bio->devs[m].devnum;
2711 struct bio *bio = r10_bio->devs[m].bio; 2732 struct bio *bio = r10_bio->devs[m].bio;
2712 rdev = conf->mirrors[dev].rdev; 2733 rdev = conf->mirrors[dev].rdev;
2713 if (bio == IO_MADE_GOOD) { 2734 if (bio == IO_MADE_GOOD) {
2714 rdev_clear_badblocks( 2735 rdev_clear_badblocks(
2715 rdev, 2736 rdev,
2716 r10_bio->devs[m].addr, 2737 r10_bio->devs[m].addr,
2717 r10_bio->sectors, 0); 2738 r10_bio->sectors, 0);
2718 rdev_dec_pending(rdev, conf->mddev); 2739 rdev_dec_pending(rdev, conf->mddev);
2719 } else if (bio != NULL && 2740 } else if (bio != NULL &&
2720 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2741 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2721 if (!narrow_write_error(r10_bio, m)) { 2742 if (!narrow_write_error(r10_bio, m)) {
2722 md_error(conf->mddev, rdev); 2743 md_error(conf->mddev, rdev);
2723 set_bit(R10BIO_Degraded, 2744 set_bit(R10BIO_Degraded,
2724 &r10_bio->state); 2745 &r10_bio->state);
2725 } 2746 }
2726 rdev_dec_pending(rdev, conf->mddev); 2747 rdev_dec_pending(rdev, conf->mddev);
2727 } 2748 }
2728 bio = r10_bio->devs[m].repl_bio; 2749 bio = r10_bio->devs[m].repl_bio;
2729 rdev = conf->mirrors[dev].replacement; 2750 rdev = conf->mirrors[dev].replacement;
2730 if (rdev && bio == IO_MADE_GOOD) { 2751 if (rdev && bio == IO_MADE_GOOD) {
2731 rdev_clear_badblocks( 2752 rdev_clear_badblocks(
2732 rdev, 2753 rdev,
2733 r10_bio->devs[m].addr, 2754 r10_bio->devs[m].addr,
2734 r10_bio->sectors, 0); 2755 r10_bio->sectors, 0);
2735 rdev_dec_pending(rdev, conf->mddev); 2756 rdev_dec_pending(rdev, conf->mddev);
2736 } 2757 }
2737 } 2758 }
2738 if (test_bit(R10BIO_WriteError, 2759 if (test_bit(R10BIO_WriteError,
2739 &r10_bio->state)) 2760 &r10_bio->state))
2740 close_write(r10_bio); 2761 close_write(r10_bio);
2741 raid_end_bio_io(r10_bio); 2762 raid_end_bio_io(r10_bio);
2742 } 2763 }
2743 } 2764 }
2744 2765
2745 static void raid10d(struct md_thread *thread) 2766 static void raid10d(struct md_thread *thread)
2746 { 2767 {
2747 struct mddev *mddev = thread->mddev; 2768 struct mddev *mddev = thread->mddev;
2748 struct r10bio *r10_bio; 2769 struct r10bio *r10_bio;
2749 unsigned long flags; 2770 unsigned long flags;
2750 struct r10conf *conf = mddev->private; 2771 struct r10conf *conf = mddev->private;
2751 struct list_head *head = &conf->retry_list; 2772 struct list_head *head = &conf->retry_list;
2752 struct blk_plug plug; 2773 struct blk_plug plug;
2753 2774
2754 md_check_recovery(mddev); 2775 md_check_recovery(mddev);
2755 2776
2756 blk_start_plug(&plug); 2777 blk_start_plug(&plug);
2757 for (;;) { 2778 for (;;) {
2758 2779
2759 flush_pending_writes(conf); 2780 flush_pending_writes(conf);
2760 2781
2761 spin_lock_irqsave(&conf->device_lock, flags); 2782 spin_lock_irqsave(&conf->device_lock, flags);
2762 if (list_empty(head)) { 2783 if (list_empty(head)) {
2763 spin_unlock_irqrestore(&conf->device_lock, flags); 2784 spin_unlock_irqrestore(&conf->device_lock, flags);
2764 break; 2785 break;
2765 } 2786 }
2766 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 2787 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2767 list_del(head->prev); 2788 list_del(head->prev);
2768 conf->nr_queued--; 2789 conf->nr_queued--;
2769 spin_unlock_irqrestore(&conf->device_lock, flags); 2790 spin_unlock_irqrestore(&conf->device_lock, flags);
2770 2791
2771 mddev = r10_bio->mddev; 2792 mddev = r10_bio->mddev;
2772 conf = mddev->private; 2793 conf = mddev->private;
2773 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2794 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2774 test_bit(R10BIO_WriteError, &r10_bio->state)) 2795 test_bit(R10BIO_WriteError, &r10_bio->state))
2775 handle_write_completed(conf, r10_bio); 2796 handle_write_completed(conf, r10_bio);
2776 else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 2797 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2777 reshape_request_write(mddev, r10_bio); 2798 reshape_request_write(mddev, r10_bio);
2778 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2799 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2779 sync_request_write(mddev, r10_bio); 2800 sync_request_write(mddev, r10_bio);
2780 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2801 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2781 recovery_request_write(mddev, r10_bio); 2802 recovery_request_write(mddev, r10_bio);
2782 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 2803 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2783 handle_read_error(mddev, r10_bio); 2804 handle_read_error(mddev, r10_bio);
2784 else { 2805 else {
2785 /* just a partial read to be scheduled from a 2806 /* just a partial read to be scheduled from a
2786 * separate context 2807 * separate context
2787 */ 2808 */
2788 int slot = r10_bio->read_slot; 2809 int slot = r10_bio->read_slot;
2789 generic_make_request(r10_bio->devs[slot].bio); 2810 generic_make_request(r10_bio->devs[slot].bio);
2790 } 2811 }
2791 2812
2792 cond_resched(); 2813 cond_resched();
2793 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 2814 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2794 md_check_recovery(mddev); 2815 md_check_recovery(mddev);
2795 } 2816 }
2796 blk_finish_plug(&plug); 2817 blk_finish_plug(&plug);
2797 } 2818 }
2798 2819
2799 2820
2800 static int init_resync(struct r10conf *conf) 2821 static int init_resync(struct r10conf *conf)
2801 { 2822 {
2802 int buffs; 2823 int buffs;
2803 int i; 2824 int i;
2804 2825
2805 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2826 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2806 BUG_ON(conf->r10buf_pool); 2827 BUG_ON(conf->r10buf_pool);
2807 conf->have_replacement = 0; 2828 conf->have_replacement = 0;
2808 for (i = 0; i < conf->geo.raid_disks; i++) 2829 for (i = 0; i < conf->geo.raid_disks; i++)
2809 if (conf->mirrors[i].replacement) 2830 if (conf->mirrors[i].replacement)
2810 conf->have_replacement = 1; 2831 conf->have_replacement = 1;
2811 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2832 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2812 if (!conf->r10buf_pool) 2833 if (!conf->r10buf_pool)
2813 return -ENOMEM; 2834 return -ENOMEM;
2814 conf->next_resync = 0; 2835 conf->next_resync = 0;
2815 return 0; 2836 return 0;
2816 } 2837 }
2817 2838
2818 /* 2839 /*
2819 * perform a "sync" on one "block" 2840 * perform a "sync" on one "block"
2820 * 2841 *
2821 * We need to make sure that no normal I/O request - particularly write 2842 * We need to make sure that no normal I/O request - particularly write
2822 * requests - conflict with active sync requests. 2843 * requests - conflict with active sync requests.
2823 * 2844 *
2824 * This is achieved by tracking pending requests and a 'barrier' concept 2845 * This is achieved by tracking pending requests and a 'barrier' concept
2825 * that can be installed to exclude normal IO requests. 2846 * that can be installed to exclude normal IO requests.
2826 * 2847 *
2827 * Resync and recovery are handled very differently. 2848 * Resync and recovery are handled very differently.
2828 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. 2849 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2829 * 2850 *
2830 * For resync, we iterate over virtual addresses, read all copies, 2851 * For resync, we iterate over virtual addresses, read all copies,
2831 * and update if there are differences. If only one copy is live, 2852 * and update if there are differences. If only one copy is live,
2832 * skip it. 2853 * skip it.
2833 * For recovery, we iterate over physical addresses, read a good 2854 * For recovery, we iterate over physical addresses, read a good
2834 * value for each non-in_sync drive, and over-write. 2855 * value for each non-in_sync drive, and over-write.
2835 * 2856 *
2836 * So, for recovery we may have several outstanding complex requests for a 2857 * So, for recovery we may have several outstanding complex requests for a
2837 * given address, one for each out-of-sync device. We model this by allocating 2858 * given address, one for each out-of-sync device. We model this by allocating
2838 * a number of r10_bio structures, one for each out-of-sync device. 2859 * a number of r10_bio structures, one for each out-of-sync device.
2839 * As we setup these structures, we collect all bio's together into a list 2860 * As we setup these structures, we collect all bio's together into a list
2840 * which we then process collectively to add pages, and then process again 2861 * which we then process collectively to add pages, and then process again
2841 * to pass to generic_make_request. 2862 * to pass to generic_make_request.
2842 * 2863 *
2843 * The r10_bio structures are linked using a borrowed master_bio pointer. 2864 * The r10_bio structures are linked using a borrowed master_bio pointer.
2844 * This link is counted in ->remaining. When the r10_bio that points to NULL 2865 * This link is counted in ->remaining. When the r10_bio that points to NULL
2845 * has its remaining count decremented to 0, the whole complex operation 2866 * has its remaining count decremented to 0, the whole complex operation
2846 * is complete. 2867 * is complete.
2847 * 2868 *
2848 */ 2869 */
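The chaining described in the comment above is the trickiest part of the recovery path, so here is a minimal userspace sketch of just that ownership pattern. struct fake_r10bio, chain_new() and chain_put() are made-up stand-ins for the real r10_bio handling, not kernel code; the point is only how the borrowed master_bio link plus the ->remaining count unwind the chain.

	/* Sketch of the borrowed-pointer chain: each node links to the previous
	 * r10_bio and holds one reference on it; ->remaining also counts the
	 * in-flight bios for the node itself.  Freeing the node whose link is
	 * NULL marks the whole complex operation as complete. */
	#include <stdio.h>
	#include <stdlib.h>

	struct fake_r10bio {
		struct fake_r10bio *master;	/* borrowed link to the previous r10_bio */
		int remaining;			/* links in + bios still in flight */
	};

	static struct fake_r10bio *chain_new(struct fake_r10bio *prev)
	{
		struct fake_r10bio *rb = calloc(1, sizeof(*rb));
		rb->master = prev;
		if (prev)
			prev->remaining++;	/* new node references its predecessor */
		return rb;
	}

	/* Drop one reference; when a node hits zero, free it and propagate
	 * the drop along the chain, as the comment above describes. */
	static void chain_put(struct fake_r10bio *rb)
	{
		while (rb && --rb->remaining == 0) {
			struct fake_r10bio *prev = rb->master;
			free(rb);
			if (!prev)
				printf("whole complex operation complete\n");
			rb = prev;
		}
	}

	int main(void)
	{
		struct fake_r10bio *a = chain_new(NULL);	/* primary, master == NULL */
		struct fake_r10bio *b;

		a->remaining++;		/* one recovery write outstanding for a */
		b = chain_new(a);	/* second r10_bio for another out-of-sync device */
		b->remaining++;		/* one recovery write outstanding for b */

		chain_put(b);		/* b's write completes: b freed, a still busy */
		chain_put(a);		/* a's write completes: chain fully released */
		return 0;
	}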
2849 2870
2850 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, 2871 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2851 int *skipped, int go_faster) 2872 int *skipped, int go_faster)
2852 { 2873 {
2853 struct r10conf *conf = mddev->private; 2874 struct r10conf *conf = mddev->private;
2854 struct r10bio *r10_bio; 2875 struct r10bio *r10_bio;
2855 struct bio *biolist = NULL, *bio; 2876 struct bio *biolist = NULL, *bio;
2856 sector_t max_sector, nr_sectors; 2877 sector_t max_sector, nr_sectors;
2857 int i; 2878 int i;
2858 int max_sync; 2879 int max_sync;
2859 sector_t sync_blocks; 2880 sector_t sync_blocks;
2860 sector_t sectors_skipped = 0; 2881 sector_t sectors_skipped = 0;
2861 int chunks_skipped = 0; 2882 int chunks_skipped = 0;
2862 sector_t chunk_mask = conf->geo.chunk_mask; 2883 sector_t chunk_mask = conf->geo.chunk_mask;
2863 2884
2864 if (!conf->r10buf_pool) 2885 if (!conf->r10buf_pool)
2865 if (init_resync(conf)) 2886 if (init_resync(conf))
2866 return 0; 2887 return 0;
2867 2888
2868 skipped: 2889 skipped:
2869 max_sector = mddev->dev_sectors; 2890 max_sector = mddev->dev_sectors;
2870 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 2891 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2871 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2892 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2872 max_sector = mddev->resync_max_sectors; 2893 max_sector = mddev->resync_max_sectors;
2873 if (sector_nr >= max_sector) { 2894 if (sector_nr >= max_sector) {
2874 /* If we aborted, we need to abort the 2895 /* If we aborted, we need to abort the
2875 * sync on the 'current' bitmap chunks (there can 2896 * sync on the 'current' bitmap chunks (there can
2876 * be several when recovering multiple devices). 2897 * be several when recovering multiple devices).
2877 * as we may have started syncing it but not finished. 2898 * as we may have started syncing it but not finished.
2878 * We can find the current address in 2899 * We can find the current address in
2879 * mddev->curr_resync, but for recovery, 2900 * mddev->curr_resync, but for recovery,
2880 * we need to convert that to several 2901 * we need to convert that to several
2881 * virtual addresses. 2902 * virtual addresses.
2882 */ 2903 */
2883 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2904 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2884 end_reshape(conf); 2905 end_reshape(conf);
2885 return 0; 2906 return 0;
2886 } 2907 }
2887 2908
2888 if (mddev->curr_resync < max_sector) { /* aborted */ 2909 if (mddev->curr_resync < max_sector) { /* aborted */
2889 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2910 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2890 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2911 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2891 &sync_blocks, 1); 2912 &sync_blocks, 1);
2892 else for (i = 0; i < conf->geo.raid_disks; i++) { 2913 else for (i = 0; i < conf->geo.raid_disks; i++) {
2893 sector_t sect = 2914 sector_t sect =
2894 raid10_find_virt(conf, mddev->curr_resync, i); 2915 raid10_find_virt(conf, mddev->curr_resync, i);
2895 bitmap_end_sync(mddev->bitmap, sect, 2916 bitmap_end_sync(mddev->bitmap, sect,
2896 &sync_blocks, 1); 2917 &sync_blocks, 1);
2897 } 2918 }
2898 } else { 2919 } else {
2899 /* completed sync */ 2920 /* completed sync */
2900 if ((!mddev->bitmap || conf->fullsync) 2921 if ((!mddev->bitmap || conf->fullsync)
2901 && conf->have_replacement 2922 && conf->have_replacement
2902 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2923 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2903 /* Completed a full sync so the replacements 2924 /* Completed a full sync so the replacements
2904 * are now fully recovered. 2925 * are now fully recovered.
2905 */ 2926 */
2906 for (i = 0; i < conf->geo.raid_disks; i++) 2927 for (i = 0; i < conf->geo.raid_disks; i++)
2907 if (conf->mirrors[i].replacement) 2928 if (conf->mirrors[i].replacement)
2908 conf->mirrors[i].replacement 2929 conf->mirrors[i].replacement
2909 ->recovery_offset 2930 ->recovery_offset
2910 = MaxSector; 2931 = MaxSector;
2911 } 2932 }
2912 conf->fullsync = 0; 2933 conf->fullsync = 0;
2913 } 2934 }
2914 bitmap_close_sync(mddev->bitmap); 2935 bitmap_close_sync(mddev->bitmap);
2915 close_sync(conf); 2936 close_sync(conf);
2916 *skipped = 1; 2937 *skipped = 1;
2917 return sectors_skipped; 2938 return sectors_skipped;
2918 } 2939 }
2919 2940
2920 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2941 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2921 return reshape_request(mddev, sector_nr, skipped); 2942 return reshape_request(mddev, sector_nr, skipped);
2922 2943
2923 if (chunks_skipped >= conf->geo.raid_disks) { 2944 if (chunks_skipped >= conf->geo.raid_disks) {
2924 /* if there has been nothing to do on any drive, 2945 /* if there has been nothing to do on any drive,
2925 * then there is nothing to do at all.. 2946 * then there is nothing to do at all..
2926 */ 2947 */
2927 *skipped = 1; 2948 *skipped = 1;
2928 return (max_sector - sector_nr) + sectors_skipped; 2949 return (max_sector - sector_nr) + sectors_skipped;
2929 } 2950 }
2930 2951
2931 if (max_sector > mddev->resync_max) 2952 if (max_sector > mddev->resync_max)
2932 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 2953 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2933 2954
2934 /* make sure whole request will fit in a chunk - if chunks 2955 /* make sure whole request will fit in a chunk - if chunks
2935 * are meaningful 2956 * are meaningful
2936 */ 2957 */
2937 if (conf->geo.near_copies < conf->geo.raid_disks && 2958 if (conf->geo.near_copies < conf->geo.raid_disks &&
2938 max_sector > (sector_nr | chunk_mask)) 2959 max_sector > (sector_nr | chunk_mask))
2939 max_sector = (sector_nr | chunk_mask) + 1; 2960 max_sector = (sector_nr | chunk_mask) + 1;
2940 /* 2961 /*
2941 * If there is non-resync activity waiting for us then 2962 * If there is non-resync activity waiting for us then
2942 * put in a delay to throttle resync. 2963 * put in a delay to throttle resync.
2943 */ 2964 */
2944 if (!go_faster && conf->nr_waiting) 2965 if (!go_faster && conf->nr_waiting)
2945 msleep_interruptible(1000); 2966 msleep_interruptible(1000);
2946 2967
2947 /* Again, very different code for resync and recovery. 2968 /* Again, very different code for resync and recovery.
2948 * Both must result in an r10bio with a list of bios that 2969 * Both must result in an r10bio with a list of bios that
2949 * have bi_end_io, bi_sector, bi_bdev set, 2970 * have bi_end_io, bi_sector, bi_bdev set,
2950 * and bi_private set to the r10bio. 2971 * and bi_private set to the r10bio.
2951 * For recovery, we may actually create several r10bios 2972 * For recovery, we may actually create several r10bios
2952 * with 2 bios in each, that correspond to the bios in the main one. 2973 * with 2 bios in each, that correspond to the bios in the main one.
2953 * In this case, the subordinate r10bios link back through a 2974 * In this case, the subordinate r10bios link back through a
2954 * borrowed master_bio pointer, and the counter in the master 2975 * borrowed master_bio pointer, and the counter in the master
2955 * includes a ref from each subordinate. 2976 * includes a ref from each subordinate.
2956 */ 2977 */
2957 /* First, we decide what to do and set ->bi_end_io 2978 /* First, we decide what to do and set ->bi_end_io
2958 * To end_sync_read if we want to read, and 2979 * To end_sync_read if we want to read, and
2959 * end_sync_write if we will want to write. 2980 * end_sync_write if we will want to write.
2960 */ 2981 */
2961 2982
2962 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2983 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2963 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2984 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2964 /* recovery... the complicated one */ 2985 /* recovery... the complicated one */
2965 int j; 2986 int j;
2966 r10_bio = NULL; 2987 r10_bio = NULL;
2967 2988
2968 for (i = 0 ; i < conf->geo.raid_disks; i++) { 2989 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2969 int still_degraded; 2990 int still_degraded;
2970 struct r10bio *rb2; 2991 struct r10bio *rb2;
2971 sector_t sect; 2992 sector_t sect;
2972 int must_sync; 2993 int must_sync;
2973 int any_working; 2994 int any_working;
2974 struct raid10_info *mirror = &conf->mirrors[i]; 2995 struct raid10_info *mirror = &conf->mirrors[i];
2975 2996
2976 if ((mirror->rdev == NULL || 2997 if ((mirror->rdev == NULL ||
2977 test_bit(In_sync, &mirror->rdev->flags)) 2998 test_bit(In_sync, &mirror->rdev->flags))
2978 && 2999 &&
2979 (mirror->replacement == NULL || 3000 (mirror->replacement == NULL ||
2980 test_bit(Faulty, 3001 test_bit(Faulty,
2981 &mirror->replacement->flags))) 3002 &mirror->replacement->flags)))
2982 continue; 3003 continue;
2983 3004
2984 still_degraded = 0; 3005 still_degraded = 0;
2985 /* want to reconstruct this device */ 3006 /* want to reconstruct this device */
2986 rb2 = r10_bio; 3007 rb2 = r10_bio;
2987 sect = raid10_find_virt(conf, sector_nr, i); 3008 sect = raid10_find_virt(conf, sector_nr, i);
2988 if (sect >= mddev->resync_max_sectors) { 3009 if (sect >= mddev->resync_max_sectors) {
2989 /* last stripe is not complete - don't 3010 /* last stripe is not complete - don't
2990 * try to recover this sector. 3011 * try to recover this sector.
2991 */ 3012 */
2992 continue; 3013 continue;
2993 } 3014 }
2994 /* Unless we are doing a full sync, or a replacement 3015 /* Unless we are doing a full sync, or a replacement
2995 * we only need to recover the block if it is set in 3016 * we only need to recover the block if it is set in
2996 * the bitmap 3017 * the bitmap
2997 */ 3018 */
2998 must_sync = bitmap_start_sync(mddev->bitmap, sect, 3019 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2999 &sync_blocks, 1); 3020 &sync_blocks, 1);
3000 if (sync_blocks < max_sync) 3021 if (sync_blocks < max_sync)
3001 max_sync = sync_blocks; 3022 max_sync = sync_blocks;
3002 if (!must_sync && 3023 if (!must_sync &&
3003 mirror->replacement == NULL && 3024 mirror->replacement == NULL &&
3004 !conf->fullsync) { 3025 !conf->fullsync) {
3005 /* yep, skip the sync_blocks here, but don't assume 3026 /* yep, skip the sync_blocks here, but don't assume
3006 * that there will never be anything to do here 3027 * that there will never be anything to do here
3007 */ 3028 */
3008 chunks_skipped = -1; 3029 chunks_skipped = -1;
3009 continue; 3030 continue;
3010 } 3031 }
3011 3032
3012 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 3033 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3013 raise_barrier(conf, rb2 != NULL); 3034 raise_barrier(conf, rb2 != NULL);
3014 atomic_set(&r10_bio->remaining, 0); 3035 atomic_set(&r10_bio->remaining, 0);
3015 3036
3016 r10_bio->master_bio = (struct bio*)rb2; 3037 r10_bio->master_bio = (struct bio*)rb2;
3017 if (rb2) 3038 if (rb2)
3018 atomic_inc(&rb2->remaining); 3039 atomic_inc(&rb2->remaining);
3019 r10_bio->mddev = mddev; 3040 r10_bio->mddev = mddev;
3020 set_bit(R10BIO_IsRecover, &r10_bio->state); 3041 set_bit(R10BIO_IsRecover, &r10_bio->state);
3021 r10_bio->sector = sect; 3042 r10_bio->sector = sect;
3022 3043
3023 raid10_find_phys(conf, r10_bio); 3044 raid10_find_phys(conf, r10_bio);
3024 3045
3025 /* Need to check if the array will still be 3046 /* Need to check if the array will still be
3026 * degraded 3047 * degraded
3027 */ 3048 */
3028 for (j = 0; j < conf->geo.raid_disks; j++) 3049 for (j = 0; j < conf->geo.raid_disks; j++)
3029 if (conf->mirrors[j].rdev == NULL || 3050 if (conf->mirrors[j].rdev == NULL ||
3030 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 3051 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3031 still_degraded = 1; 3052 still_degraded = 1;
3032 break; 3053 break;
3033 } 3054 }
3034 3055
3035 must_sync = bitmap_start_sync(mddev->bitmap, sect, 3056 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3036 &sync_blocks, still_degraded); 3057 &sync_blocks, still_degraded);
3037 3058
3038 any_working = 0; 3059 any_working = 0;
3039 for (j=0; j<conf->copies;j++) { 3060 for (j=0; j<conf->copies;j++) {
3040 int k; 3061 int k;
3041 int d = r10_bio->devs[j].devnum; 3062 int d = r10_bio->devs[j].devnum;
3042 sector_t from_addr, to_addr; 3063 sector_t from_addr, to_addr;
3043 struct md_rdev *rdev; 3064 struct md_rdev *rdev;
3044 sector_t sector, first_bad; 3065 sector_t sector, first_bad;
3045 int bad_sectors; 3066 int bad_sectors;
3046 if (!conf->mirrors[d].rdev || 3067 if (!conf->mirrors[d].rdev ||
3047 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 3068 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3048 continue; 3069 continue;
3049 /* This is where we read from */ 3070 /* This is where we read from */
3050 any_working = 1; 3071 any_working = 1;
3051 rdev = conf->mirrors[d].rdev; 3072 rdev = conf->mirrors[d].rdev;
3052 sector = r10_bio->devs[j].addr; 3073 sector = r10_bio->devs[j].addr;
3053 3074
3054 if (is_badblock(rdev, sector, max_sync, 3075 if (is_badblock(rdev, sector, max_sync,
3055 &first_bad, &bad_sectors)) { 3076 &first_bad, &bad_sectors)) {
3056 if (first_bad > sector) 3077 if (first_bad > sector)
3057 max_sync = first_bad - sector; 3078 max_sync = first_bad - sector;
3058 else { 3079 else {
3059 bad_sectors -= (sector 3080 bad_sectors -= (sector
3060 - first_bad); 3081 - first_bad);
3061 if (max_sync > bad_sectors) 3082 if (max_sync > bad_sectors)
3062 max_sync = bad_sectors; 3083 max_sync = bad_sectors;
3063 continue; 3084 continue;
3064 } 3085 }
3065 } 3086 }
3066 bio = r10_bio->devs[0].bio; 3087 bio = r10_bio->devs[0].bio;
3067 bio->bi_next = biolist; 3088 bio->bi_next = biolist;
3068 biolist = bio; 3089 biolist = bio;
3069 bio->bi_private = r10_bio; 3090 bio->bi_private = r10_bio;
3070 bio->bi_end_io = end_sync_read; 3091 bio->bi_end_io = end_sync_read;
3071 bio->bi_rw = READ; 3092 bio->bi_rw = READ;
3072 from_addr = r10_bio->devs[j].addr; 3093 from_addr = r10_bio->devs[j].addr;
3073 bio->bi_sector = from_addr + rdev->data_offset; 3094 bio->bi_sector = from_addr + rdev->data_offset;
3074 bio->bi_bdev = rdev->bdev; 3095 bio->bi_bdev = rdev->bdev;
3075 atomic_inc(&rdev->nr_pending); 3096 atomic_inc(&rdev->nr_pending);
3076 /* and we write to 'i' (if not in_sync) */ 3097 /* and we write to 'i' (if not in_sync) */
3077 3098
3078 for (k=0; k<conf->copies; k++) 3099 for (k=0; k<conf->copies; k++)
3079 if (r10_bio->devs[k].devnum == i) 3100 if (r10_bio->devs[k].devnum == i)
3080 break; 3101 break;
3081 BUG_ON(k == conf->copies); 3102 BUG_ON(k == conf->copies);
3082 to_addr = r10_bio->devs[k].addr; 3103 to_addr = r10_bio->devs[k].addr;
3083 r10_bio->devs[0].devnum = d; 3104 r10_bio->devs[0].devnum = d;
3084 r10_bio->devs[0].addr = from_addr; 3105 r10_bio->devs[0].addr = from_addr;
3085 r10_bio->devs[1].devnum = i; 3106 r10_bio->devs[1].devnum = i;
3086 r10_bio->devs[1].addr = to_addr; 3107 r10_bio->devs[1].addr = to_addr;
3087 3108
3088 rdev = mirror->rdev; 3109 rdev = mirror->rdev;
3089 if (!test_bit(In_sync, &rdev->flags)) { 3110 if (!test_bit(In_sync, &rdev->flags)) {
3090 bio = r10_bio->devs[1].bio; 3111 bio = r10_bio->devs[1].bio;
3091 bio->bi_next = biolist; 3112 bio->bi_next = biolist;
3092 biolist = bio; 3113 biolist = bio;
3093 bio->bi_private = r10_bio; 3114 bio->bi_private = r10_bio;
3094 bio->bi_end_io = end_sync_write; 3115 bio->bi_end_io = end_sync_write;
3095 bio->bi_rw = WRITE; 3116 bio->bi_rw = WRITE;
3096 bio->bi_sector = to_addr 3117 bio->bi_sector = to_addr
3097 + rdev->data_offset; 3118 + rdev->data_offset;
3098 bio->bi_bdev = rdev->bdev; 3119 bio->bi_bdev = rdev->bdev;
3099 atomic_inc(&r10_bio->remaining); 3120 atomic_inc(&r10_bio->remaining);
3100 } else 3121 } else
3101 r10_bio->devs[1].bio->bi_end_io = NULL; 3122 r10_bio->devs[1].bio->bi_end_io = NULL;
3102 3123
3103 /* and maybe write to replacement */ 3124 /* and maybe write to replacement */
3104 bio = r10_bio->devs[1].repl_bio; 3125 bio = r10_bio->devs[1].repl_bio;
3105 if (bio) 3126 if (bio)
3106 bio->bi_end_io = NULL; 3127 bio->bi_end_io = NULL;
3107 rdev = mirror->replacement; 3128 rdev = mirror->replacement;
3108 /* Note: if rdev != NULL, then bio 3129 /* Note: if rdev != NULL, then bio
3109 * cannot be NULL as r10buf_pool_alloc will 3130 * cannot be NULL as r10buf_pool_alloc will
3110 * have allocated it. 3131 * have allocated it.
3111 * So the second test here is pointless. 3132 * So the second test here is pointless.
3112 * But it keeps semantic-checkers happy, and 3133 * But it keeps semantic-checkers happy, and
3113 * this comment keeps human reviewers 3134 * this comment keeps human reviewers
3114 * happy. 3135 * happy.
3115 */ 3136 */
3116 if (rdev == NULL || bio == NULL || 3137 if (rdev == NULL || bio == NULL ||
3117 test_bit(Faulty, &rdev->flags)) 3138 test_bit(Faulty, &rdev->flags))
3118 break; 3139 break;
3119 bio->bi_next = biolist; 3140 bio->bi_next = biolist;
3120 biolist = bio; 3141 biolist = bio;
3121 bio->bi_private = r10_bio; 3142 bio->bi_private = r10_bio;
3122 bio->bi_end_io = end_sync_write; 3143 bio->bi_end_io = end_sync_write;
3123 bio->bi_rw = WRITE; 3144 bio->bi_rw = WRITE;
3124 bio->bi_sector = to_addr + rdev->data_offset; 3145 bio->bi_sector = to_addr + rdev->data_offset;
3125 bio->bi_bdev = rdev->bdev; 3146 bio->bi_bdev = rdev->bdev;
3126 atomic_inc(&r10_bio->remaining); 3147 atomic_inc(&r10_bio->remaining);
3127 break; 3148 break;
3128 } 3149 }
3129 if (j == conf->copies) { 3150 if (j == conf->copies) {
3130 /* Cannot recover, so abort the recovery or 3151 /* Cannot recover, so abort the recovery or
3131 * record a bad block */ 3152 * record a bad block */
3132 put_buf(r10_bio); 3153 put_buf(r10_bio);
3133 if (rb2) 3154 if (rb2)
3134 atomic_dec(&rb2->remaining); 3155 atomic_dec(&rb2->remaining);
3135 r10_bio = rb2; 3156 r10_bio = rb2;
3136 if (any_working) { 3157 if (any_working) {
3137 /* problem is that there are bad blocks 3158 /* problem is that there are bad blocks
3138 * on other device(s) 3159 * on other device(s)
3139 */ 3160 */
3140 int k; 3161 int k;
3141 for (k = 0; k < conf->copies; k++) 3162 for (k = 0; k < conf->copies; k++)
3142 if (r10_bio->devs[k].devnum == i) 3163 if (r10_bio->devs[k].devnum == i)
3143 break; 3164 break;
3144 if (!test_bit(In_sync, 3165 if (!test_bit(In_sync,
3145 &mirror->rdev->flags) 3166 &mirror->rdev->flags)
3146 && !rdev_set_badblocks( 3167 && !rdev_set_badblocks(
3147 mirror->rdev, 3168 mirror->rdev,
3148 r10_bio->devs[k].addr, 3169 r10_bio->devs[k].addr,
3149 max_sync, 0)) 3170 max_sync, 0))
3150 any_working = 0; 3171 any_working = 0;
3151 if (mirror->replacement && 3172 if (mirror->replacement &&
3152 !rdev_set_badblocks( 3173 !rdev_set_badblocks(
3153 mirror->replacement, 3174 mirror->replacement,
3154 r10_bio->devs[k].addr, 3175 r10_bio->devs[k].addr,
3155 max_sync, 0)) 3176 max_sync, 0))
3156 any_working = 0; 3177 any_working = 0;
3157 } 3178 }
3158 if (!any_working) { 3179 if (!any_working) {
3159 if (!test_and_set_bit(MD_RECOVERY_INTR, 3180 if (!test_and_set_bit(MD_RECOVERY_INTR,
3160 &mddev->recovery)) 3181 &mddev->recovery))
3161 printk(KERN_INFO "md/raid10:%s: insufficient " 3182 printk(KERN_INFO "md/raid10:%s: insufficient "
3162 "working devices for recovery.\n", 3183 "working devices for recovery.\n",
3163 mdname(mddev)); 3184 mdname(mddev));
3164 mirror->recovery_disabled 3185 mirror->recovery_disabled
3165 = mddev->recovery_disabled; 3186 = mddev->recovery_disabled;
3166 } 3187 }
3167 break; 3188 break;
3168 } 3189 }
3169 } 3190 }
3170 if (biolist == NULL) { 3191 if (biolist == NULL) {
3171 while (r10_bio) { 3192 while (r10_bio) {
3172 struct r10bio *rb2 = r10_bio; 3193 struct r10bio *rb2 = r10_bio;
3173 r10_bio = (struct r10bio*) rb2->master_bio; 3194 r10_bio = (struct r10bio*) rb2->master_bio;
3174 rb2->master_bio = NULL; 3195 rb2->master_bio = NULL;
3175 put_buf(rb2); 3196 put_buf(rb2);
3176 } 3197 }
3177 goto giveup; 3198 goto giveup;
3178 } 3199 }
3179 } else { 3200 } else {
3180 /* resync. Schedule a read for every block at this virt offset */ 3201 /* resync. Schedule a read for every block at this virt offset */
3181 int count = 0; 3202 int count = 0;
3182 3203
3183 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 3204 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3184 3205
3185 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 3206 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3186 &sync_blocks, mddev->degraded) && 3207 &sync_blocks, mddev->degraded) &&
3187 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3208 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3188 &mddev->recovery)) { 3209 &mddev->recovery)) {
3189 /* We can skip this block */ 3210 /* We can skip this block */
3190 *skipped = 1; 3211 *skipped = 1;
3191 return sync_blocks + sectors_skipped; 3212 return sync_blocks + sectors_skipped;
3192 } 3213 }
3193 if (sync_blocks < max_sync) 3214 if (sync_blocks < max_sync)
3194 max_sync = sync_blocks; 3215 max_sync = sync_blocks;
3195 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 3216 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3196 3217
3197 r10_bio->mddev = mddev; 3218 r10_bio->mddev = mddev;
3198 atomic_set(&r10_bio->remaining, 0); 3219 atomic_set(&r10_bio->remaining, 0);
3199 raise_barrier(conf, 0); 3220 raise_barrier(conf, 0);
3200 conf->next_resync = sector_nr; 3221 conf->next_resync = sector_nr;
3201 3222
3202 r10_bio->master_bio = NULL; 3223 r10_bio->master_bio = NULL;
3203 r10_bio->sector = sector_nr; 3224 r10_bio->sector = sector_nr;
3204 set_bit(R10BIO_IsSync, &r10_bio->state); 3225 set_bit(R10BIO_IsSync, &r10_bio->state);
3205 raid10_find_phys(conf, r10_bio); 3226 raid10_find_phys(conf, r10_bio);
3206 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3227 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3207 3228
3208 for (i = 0; i < conf->copies; i++) { 3229 for (i = 0; i < conf->copies; i++) {
3209 int d = r10_bio->devs[i].devnum; 3230 int d = r10_bio->devs[i].devnum;
3210 sector_t first_bad, sector; 3231 sector_t first_bad, sector;
3211 int bad_sectors; 3232 int bad_sectors;
3212 3233
3213 if (r10_bio->devs[i].repl_bio) 3234 if (r10_bio->devs[i].repl_bio)
3214 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3235 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3215 3236
3216 bio = r10_bio->devs[i].bio; 3237 bio = r10_bio->devs[i].bio;
3217 bio->bi_end_io = NULL; 3238 bio->bi_end_io = NULL;
3218 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3239 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3219 if (conf->mirrors[d].rdev == NULL || 3240 if (conf->mirrors[d].rdev == NULL ||
3220 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 3241 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3221 continue; 3242 continue;
3222 sector = r10_bio->devs[i].addr; 3243 sector = r10_bio->devs[i].addr;
3223 if (is_badblock(conf->mirrors[d].rdev, 3244 if (is_badblock(conf->mirrors[d].rdev,
3224 sector, max_sync, 3245 sector, max_sync,
3225 &first_bad, &bad_sectors)) { 3246 &first_bad, &bad_sectors)) {
3226 if (first_bad > sector) 3247 if (first_bad > sector)
3227 max_sync = first_bad - sector; 3248 max_sync = first_bad - sector;
3228 else { 3249 else {
3229 bad_sectors -= (sector - first_bad); 3250 bad_sectors -= (sector - first_bad);
3230 if (max_sync > bad_sectors) 3251 if (max_sync > bad_sectors)
3231 max_sync = bad_sectors; 3252 max_sync = bad_sectors;
3232 continue; 3253 continue;
3233 } 3254 }
3234 } 3255 }
3235 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 3256 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3236 atomic_inc(&r10_bio->remaining); 3257 atomic_inc(&r10_bio->remaining);
3237 bio->bi_next = biolist; 3258 bio->bi_next = biolist;
3238 biolist = bio; 3259 biolist = bio;
3239 bio->bi_private = r10_bio; 3260 bio->bi_private = r10_bio;
3240 bio->bi_end_io = end_sync_read; 3261 bio->bi_end_io = end_sync_read;
3241 bio->bi_rw = READ; 3262 bio->bi_rw = READ;
3242 bio->bi_sector = sector + 3263 bio->bi_sector = sector +
3243 conf->mirrors[d].rdev->data_offset; 3264 conf->mirrors[d].rdev->data_offset;
3244 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 3265 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3245 count++; 3266 count++;
3246 3267
3247 if (conf->mirrors[d].replacement == NULL || 3268 if (conf->mirrors[d].replacement == NULL ||
3248 test_bit(Faulty, 3269 test_bit(Faulty,
3249 &conf->mirrors[d].replacement->flags)) 3270 &conf->mirrors[d].replacement->flags))
3250 continue; 3271 continue;
3251 3272
3252 /* Need to set up for writing to the replacement */ 3273 /* Need to set up for writing to the replacement */
3253 bio = r10_bio->devs[i].repl_bio; 3274 bio = r10_bio->devs[i].repl_bio;
3254 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3275 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3255 3276
3256 sector = r10_bio->devs[i].addr; 3277 sector = r10_bio->devs[i].addr;
3257 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 3278 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3258 bio->bi_next = biolist; 3279 bio->bi_next = biolist;
3259 biolist = bio; 3280 biolist = bio;
3260 bio->bi_private = r10_bio; 3281 bio->bi_private = r10_bio;
3261 bio->bi_end_io = end_sync_write; 3282 bio->bi_end_io = end_sync_write;
3262 bio->bi_rw = WRITE; 3283 bio->bi_rw = WRITE;
3263 bio->bi_sector = sector + 3284 bio->bi_sector = sector +
3264 conf->mirrors[d].replacement->data_offset; 3285 conf->mirrors[d].replacement->data_offset;
3265 bio->bi_bdev = conf->mirrors[d].replacement->bdev; 3286 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3266 count++; 3287 count++;
3267 } 3288 }
3268 3289
3269 if (count < 2) { 3290 if (count < 2) {
3270 for (i=0; i<conf->copies; i++) { 3291 for (i=0; i<conf->copies; i++) {
3271 int d = r10_bio->devs[i].devnum; 3292 int d = r10_bio->devs[i].devnum;
3272 if (r10_bio->devs[i].bio->bi_end_io) 3293 if (r10_bio->devs[i].bio->bi_end_io)
3273 rdev_dec_pending(conf->mirrors[d].rdev, 3294 rdev_dec_pending(conf->mirrors[d].rdev,
3274 mddev); 3295 mddev);
3275 if (r10_bio->devs[i].repl_bio && 3296 if (r10_bio->devs[i].repl_bio &&
3276 r10_bio->devs[i].repl_bio->bi_end_io) 3297 r10_bio->devs[i].repl_bio->bi_end_io)
3277 rdev_dec_pending( 3298 rdev_dec_pending(
3278 conf->mirrors[d].replacement, 3299 conf->mirrors[d].replacement,
3279 mddev); 3300 mddev);
3280 } 3301 }
3281 put_buf(r10_bio); 3302 put_buf(r10_bio);
3282 biolist = NULL; 3303 biolist = NULL;
3283 goto giveup; 3304 goto giveup;
3284 } 3305 }
3285 } 3306 }
3286 3307
3287 for (bio = biolist; bio ; bio=bio->bi_next) { 3308 for (bio = biolist; bio ; bio=bio->bi_next) {
3288 3309
3289 bio->bi_flags &= ~(BIO_POOL_MASK - 1); 3310 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3290 if (bio->bi_end_io) 3311 if (bio->bi_end_io)
3291 bio->bi_flags |= 1 << BIO_UPTODATE; 3312 bio->bi_flags |= 1 << BIO_UPTODATE;
3292 bio->bi_vcnt = 0; 3313 bio->bi_vcnt = 0;
3293 bio->bi_idx = 0; 3314 bio->bi_idx = 0;
3294 bio->bi_phys_segments = 0; 3315 bio->bi_phys_segments = 0;
3295 bio->bi_size = 0; 3316 bio->bi_size = 0;
3296 } 3317 }
3297 3318
3298 nr_sectors = 0; 3319 nr_sectors = 0;
3299 if (sector_nr + max_sync < max_sector) 3320 if (sector_nr + max_sync < max_sector)
3300 max_sector = sector_nr + max_sync; 3321 max_sector = sector_nr + max_sync;
3301 do { 3322 do {
3302 struct page *page; 3323 struct page *page;
3303 int len = PAGE_SIZE; 3324 int len = PAGE_SIZE;
3304 if (sector_nr + (len>>9) > max_sector) 3325 if (sector_nr + (len>>9) > max_sector)
3305 len = (max_sector - sector_nr) << 9; 3326 len = (max_sector - sector_nr) << 9;
3306 if (len == 0) 3327 if (len == 0)
3307 break; 3328 break;
3308 for (bio= biolist ; bio ; bio=bio->bi_next) { 3329 for (bio= biolist ; bio ; bio=bio->bi_next) {
3309 struct bio *bio2; 3330 struct bio *bio2;
3310 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 3331 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3311 if (bio_add_page(bio, page, len, 0)) 3332 if (bio_add_page(bio, page, len, 0))
3312 continue; 3333 continue;
3313 3334
3314 /* stop here */ 3335 /* stop here */
3315 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 3336 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3316 for (bio2 = biolist; 3337 for (bio2 = biolist;
3317 bio2 && bio2 != bio; 3338 bio2 && bio2 != bio;
3318 bio2 = bio2->bi_next) { 3339 bio2 = bio2->bi_next) {
3319 /* remove last page from this bio */ 3340 /* remove last page from this bio */
3320 bio2->bi_vcnt--; 3341 bio2->bi_vcnt--;
3321 bio2->bi_size -= len; 3342 bio2->bi_size -= len;
3322 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 3343 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3323 } 3344 }
3324 goto bio_full; 3345 goto bio_full;
3325 } 3346 }
3326 nr_sectors += len>>9; 3347 nr_sectors += len>>9;
3327 sector_nr += len>>9; 3348 sector_nr += len>>9;
3328 } while (biolist->bi_vcnt < RESYNC_PAGES); 3349 } while (biolist->bi_vcnt < RESYNC_PAGES);
3329 bio_full: 3350 bio_full:
3330 r10_bio->sectors = nr_sectors; 3351 r10_bio->sectors = nr_sectors;
3331 3352
3332 while (biolist) { 3353 while (biolist) {
3333 bio = biolist; 3354 bio = biolist;
3334 biolist = biolist->bi_next; 3355 biolist = biolist->bi_next;
3335 3356
3336 bio->bi_next = NULL; 3357 bio->bi_next = NULL;
3337 r10_bio = bio->bi_private; 3358 r10_bio = bio->bi_private;
3338 r10_bio->sectors = nr_sectors; 3359 r10_bio->sectors = nr_sectors;
3339 3360
3340 if (bio->bi_end_io == end_sync_read) { 3361 if (bio->bi_end_io == end_sync_read) {
3341 md_sync_acct(bio->bi_bdev, nr_sectors); 3362 md_sync_acct(bio->bi_bdev, nr_sectors);
3342 generic_make_request(bio); 3363 generic_make_request(bio);
3343 } 3364 }
3344 } 3365 }
3345 3366
3346 if (sectors_skipped) 3367 if (sectors_skipped)
3347 /* pretend they weren't skipped, it makes 3368 /* pretend they weren't skipped, it makes
3348 * no important difference in this case 3369 * no important difference in this case
3349 */ 3370 */
3350 md_done_sync(mddev, sectors_skipped, 1); 3371 md_done_sync(mddev, sectors_skipped, 1);
3351 3372
3352 return sectors_skipped + nr_sectors; 3373 return sectors_skipped + nr_sectors;
3353 giveup: 3374 giveup:
3354 /* There is nowhere to write, so all non-sync 3375 /* There is nowhere to write, so all non-sync
3355 * drives must be failed or in resync, all drives 3376 * drives must be failed or in resync, all drives
3356 * have a bad block, so try the next chunk... 3377 * have a bad block, so try the next chunk...
3357 */ 3378 */
3358 if (sector_nr + max_sync < max_sector) 3379 if (sector_nr + max_sync < max_sector)
3359 max_sector = sector_nr + max_sync; 3380 max_sector = sector_nr + max_sync;
3360 3381
3361 sectors_skipped += (max_sector - sector_nr); 3382 sectors_skipped += (max_sector - sector_nr);
3362 chunks_skipped ++; 3383 chunks_skipped ++;
3363 sector_nr = max_sector; 3384 sector_nr = max_sector;
3364 goto skipped; 3385 goto skipped;
3365 } 3386 }
3366 3387
3367 static sector_t 3388 static sector_t
3368 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 3389 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3369 { 3390 {
3370 sector_t size; 3391 sector_t size;
3371 struct r10conf *conf = mddev->private; 3392 struct r10conf *conf = mddev->private;
3372 3393
3373 if (!raid_disks) 3394 if (!raid_disks)
3374 raid_disks = min(conf->geo.raid_disks, 3395 raid_disks = min(conf->geo.raid_disks,
3375 conf->prev.raid_disks); 3396 conf->prev.raid_disks);
3376 if (!sectors) 3397 if (!sectors)
3377 sectors = conf->dev_sectors; 3398 sectors = conf->dev_sectors;
3378 3399
3379 size = sectors >> conf->geo.chunk_shift; 3400 size = sectors >> conf->geo.chunk_shift;
3380 sector_div(size, conf->geo.far_copies); 3401 sector_div(size, conf->geo.far_copies);
3381 size = size * raid_disks; 3402 size = size * raid_disks;
3382 sector_div(size, conf->geo.near_copies); 3403 sector_div(size, conf->geo.near_copies);
3383 3404
3384 return size << conf->geo.chunk_shift; 3405 return size << conf->geo.chunk_shift;
3385 } 3406 }
3386 3407
3387 static void calc_sectors(struct r10conf *conf, sector_t size) 3408 static void calc_sectors(struct r10conf *conf, sector_t size)
3388 { 3409 {
3389 /* Calculate the number of sectors-per-device that will 3410 /* Calculate the number of sectors-per-device that will
3390 * actually be used, and set conf->dev_sectors and 3411 * actually be used, and set conf->dev_sectors and
3391 * conf->stride 3412 * conf->stride
3392 */ 3413 */
3393 3414
3394 size = size >> conf->geo.chunk_shift; 3415 size = size >> conf->geo.chunk_shift;
3395 sector_div(size, conf->geo.far_copies); 3416 sector_div(size, conf->geo.far_copies);
3396 size = size * conf->geo.raid_disks; 3417 size = size * conf->geo.raid_disks;
3397 sector_div(size, conf->geo.near_copies); 3418 sector_div(size, conf->geo.near_copies);
3398 /* 'size' is now the number of chunks in the array */ 3419 /* 'size' is now the number of chunks in the array */
3399 /* calculate "used chunks per device" */ 3420 /* calculate "used chunks per device" */
3400 size = size * conf->copies; 3421 size = size * conf->copies;
3401 3422
3402 /* We need to round up when dividing by raid_disks to 3423 /* We need to round up when dividing by raid_disks to
3403 * get the stride size. 3424 * get the stride size.
3404 */ 3425 */
3405 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3426 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3406 3427
3407 conf->dev_sectors = size << conf->geo.chunk_shift; 3428 conf->dev_sectors = size << conf->geo.chunk_shift;
3408 3429
3409 if (conf->geo.far_offset) 3430 if (conf->geo.far_offset)
3410 conf->geo.stride = 1 << conf->geo.chunk_shift; 3431 conf->geo.stride = 1 << conf->geo.chunk_shift;
3411 else { 3432 else {
3412 sector_div(size, conf->geo.far_copies); 3433 sector_div(size, conf->geo.far_copies);
3413 conf->geo.stride = size << conf->geo.chunk_shift; 3434 conf->geo.stride = size << conf->geo.chunk_shift;
3414 } 3435 }
3415 } 3436 }
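To make the arithmetic in raid10_size() and calc_sectors() above concrete, here is a worked userspace sketch for one assumed geometry: 6 devices of 131072 sectors each, 64KiB chunks (chunk_shift = 7), near_copies = 1, far_copies = 2, classic "far" layout (far_offset = 0). Plain division stands in for sector_div(); the numbers in the comments are what those functions would compute for these inputs.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 131072;	/* sectors offered by each device */
		int chunk_shift = 7;			/* 64KiB chunks = 128 sectors */
		int raid_disks = 6, near_copies = 1, far_copies = 2, far_offset = 0;
		int copies = near_copies * far_copies;
		unsigned long long array_sectors, dev_sectors, stride;

		size >>= chunk_shift;			/* chunks per device:        1024 */
		size /= far_copies;			/*                            512 */
		size *= raid_disks;			/*                           3072 */
		size /= near_copies;			/* data chunks in the array: 3072 */
		array_sectors = size << chunk_shift;	/* raid10_size():          393216 */

		size *= copies;				/*                           6144 */
		size = (size + raid_disks - 1) / raid_disks;	/* used chunks/device: 1024 */
		dev_sectors = size << chunk_shift;	/* conf->dev_sectors:      131072 */

		if (far_offset)
			stride = 1ULL << chunk_shift;
		else
			stride = (size / far_copies) << chunk_shift;	/* stride: 65536 */

		printf("array=%llu dev=%llu stride=%llu\n",
		       array_sectors, dev_sectors, stride);
		return 0;
	}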
3416 3437
3417 enum geo_type {geo_new, geo_old, geo_start}; 3438 enum geo_type {geo_new, geo_old, geo_start};
3418 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3439 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3419 { 3440 {
3420 int nc, fc, fo; 3441 int nc, fc, fo;
3421 int layout, chunk, disks; 3442 int layout, chunk, disks;
3422 switch (new) { 3443 switch (new) {
3423 case geo_old: 3444 case geo_old:
3424 layout = mddev->layout; 3445 layout = mddev->layout;
3425 chunk = mddev->chunk_sectors; 3446 chunk = mddev->chunk_sectors;
3426 disks = mddev->raid_disks - mddev->delta_disks; 3447 disks = mddev->raid_disks - mddev->delta_disks;
3427 break; 3448 break;
3428 case geo_new: 3449 case geo_new:
3429 layout = mddev->new_layout; 3450 layout = mddev->new_layout;
3430 chunk = mddev->new_chunk_sectors; 3451 chunk = mddev->new_chunk_sectors;
3431 disks = mddev->raid_disks; 3452 disks = mddev->raid_disks;
3432 break; 3453 break;
3433 default: /* avoid 'may be unused' warnings */ 3454 default: /* avoid 'may be unused' warnings */
3434 case geo_start: /* new when starting reshape - raid_disks not 3455 case geo_start: /* new when starting reshape - raid_disks not
3435 * updated yet. */ 3456 * updated yet. */
3436 layout = mddev->new_layout; 3457 layout = mddev->new_layout;
3437 chunk = mddev->new_chunk_sectors; 3458 chunk = mddev->new_chunk_sectors;
3438 disks = mddev->raid_disks + mddev->delta_disks; 3459 disks = mddev->raid_disks + mddev->delta_disks;
3439 break; 3460 break;
3440 } 3461 }
3441 if (layout >> 17) 3462 if (layout >> 18)
3442 return -1; 3463 return -1;
3443 if (chunk < (PAGE_SIZE >> 9) || 3464 if (chunk < (PAGE_SIZE >> 9) ||
3444 !is_power_of_2(chunk)) 3465 !is_power_of_2(chunk))
3445 return -2; 3466 return -2;
3446 nc = layout & 255; 3467 nc = layout & 255;
3447 fc = (layout >> 8) & 255; 3468 fc = (layout >> 8) & 255;
3448 fo = layout & (1<<16); 3469 fo = layout & (1<<16);
3449 geo->raid_disks = disks; 3470 geo->raid_disks = disks;
3450 geo->near_copies = nc; 3471 geo->near_copies = nc;
3451 geo->far_copies = fc; 3472 geo->far_copies = fc;
3452 geo->far_offset = fo; 3473 geo->far_offset = fo;
3474 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3453 geo->chunk_mask = chunk - 1; 3475 geo->chunk_mask = chunk - 1;
3454 geo->chunk_shift = ffz(~chunk); 3476 geo->chunk_shift = ffz(~chunk);
3455 return nc*fc; 3477 return nc*fc;
3456 } 3478 }
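For reference, the layout word that setup_geo() above decodes packs near_copies into bits 0-7, far_copies into bits 8-15, the "offset" flag into bit 16 and, new with this patch, the "confine shifted copies to sets" flag into bit 17. The userspace sketch below mirrors that decoding for one sample layout; struct geo_example and decode_layout() are hypothetical stand-ins, not kernel code.

	#include <stdio.h>

	struct geo_example {			/* simplified stand-in for struct geom */
		int raid_disks;
		int near_copies;
		int far_copies;
		int far_offset;
		int far_set_size;
	};

	/* Decode a raid10 layout word the same way setup_geo() does. */
	static int decode_layout(int layout, int disks, struct geo_example *g)
	{
		if (layout >> 18)		/* only bits 0-17 are defined now */
			return -1;
		g->raid_disks  = disks;
		g->near_copies = layout & 255;
		g->far_copies  = (layout >> 8) & 255;
		g->far_offset  = layout & (1 << 16);
		if (g->far_copies == 0)
			return -2;		/* sketch-only guard; setup_conf()
						 * rejects copies < 2 anyway */
		/* bit 17: the shifted copies wrap inside sets of
		 * disks / far_copies devices instead of across the whole array */
		g->far_set_size = (layout & (1 << 17)) ? disks / g->far_copies : disks;
		return g->near_copies * g->far_copies;	/* total copies */
	}

	int main(void)
	{
		struct geo_example g;
		/* 6 devices, near_copies = 1, far_copies = 2, "far" (not offset),
		 * with the new bit 17 set */
		int layout = (1 << 17) | (2 << 8) | 1;

		if (decode_layout(layout, 6, &g) < 0)
			return 1;
		printf("near=%d far=%d offset=%d set_size=%d\n",
		       g.near_copies, g.far_copies, g.far_offset ? 1 : 0,
		       g.far_set_size);
		/* prints: near=1 far=2 offset=0 set_size=3 */
		return 0;
	}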
3457 3479
3458 static struct r10conf *setup_conf(struct mddev *mddev) 3480 static struct r10conf *setup_conf(struct mddev *mddev)
3459 { 3481 {
3460 struct r10conf *conf = NULL; 3482 struct r10conf *conf = NULL;
3461 int err = -EINVAL; 3483 int err = -EINVAL;
3462 struct geom geo; 3484 struct geom geo;
3463 int copies; 3485 int copies;
3464 3486
3465 copies = setup_geo(&geo, mddev, geo_new); 3487 copies = setup_geo(&geo, mddev, geo_new);
3466 3488
3467 if (copies == -2) { 3489 if (copies == -2) {
3468 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3490 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3469 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3491 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3470 mdname(mddev), PAGE_SIZE); 3492 mdname(mddev), PAGE_SIZE);
3471 goto out; 3493 goto out;
3472 } 3494 }
3473 3495
3474 if (copies < 2 || copies > mddev->raid_disks) { 3496 if (copies < 2 || copies > mddev->raid_disks) {
3475 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3497 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3476 mdname(mddev), mddev->new_layout); 3498 mdname(mddev), mddev->new_layout);
3477 goto out; 3499 goto out;
3478 } 3500 }
3479 3501
3480 err = -ENOMEM; 3502 err = -ENOMEM;
3481 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 3503 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3482 if (!conf) 3504 if (!conf)
3483 goto out; 3505 goto out;
3484 3506
3485 /* FIXME calc properly */ 3507 /* FIXME calc properly */
3486 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + 3508 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3487 max(0,mddev->delta_disks)), 3509 max(0,mddev->delta_disks)),
3488 GFP_KERNEL); 3510 GFP_KERNEL);
3489 if (!conf->mirrors) 3511 if (!conf->mirrors)
3490 goto out; 3512 goto out;
3491 3513
3492 conf->tmppage = alloc_page(GFP_KERNEL); 3514 conf->tmppage = alloc_page(GFP_KERNEL);
3493 if (!conf->tmppage) 3515 if (!conf->tmppage)
3494 goto out; 3516 goto out;
3495 3517
3496 conf->geo = geo; 3518 conf->geo = geo;
3497 conf->copies = copies; 3519 conf->copies = copies;
3498 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 3520 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3499 r10bio_pool_free, conf); 3521 r10bio_pool_free, conf);
3500 if (!conf->r10bio_pool) 3522 if (!conf->r10bio_pool)
3501 goto out; 3523 goto out;
3502 3524
3503 calc_sectors(conf, mddev->dev_sectors); 3525 calc_sectors(conf, mddev->dev_sectors);
3504 if (mddev->reshape_position == MaxSector) { 3526 if (mddev->reshape_position == MaxSector) {
3505 conf->prev = conf->geo; 3527 conf->prev = conf->geo;
3506 conf->reshape_progress = MaxSector; 3528 conf->reshape_progress = MaxSector;
3507 } else { 3529 } else {
3508 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 3530 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3509 err = -EINVAL; 3531 err = -EINVAL;
3510 goto out; 3532 goto out;
3511 } 3533 }
3512 conf->reshape_progress = mddev->reshape_position; 3534 conf->reshape_progress = mddev->reshape_position;
3513 if (conf->prev.far_offset) 3535 if (conf->prev.far_offset)
3514 conf->prev.stride = 1 << conf->prev.chunk_shift; 3536 conf->prev.stride = 1 << conf->prev.chunk_shift;
3515 else 3537 else
3516 /* far_copies must be 1 */ 3538 /* far_copies must be 1 */
3517 conf->prev.stride = conf->dev_sectors; 3539 conf->prev.stride = conf->dev_sectors;
3518 } 3540 }
3519 spin_lock_init(&conf->device_lock); 3541 spin_lock_init(&conf->device_lock);
3520 INIT_LIST_HEAD(&conf->retry_list); 3542 INIT_LIST_HEAD(&conf->retry_list);
3521 3543
3522 spin_lock_init(&conf->resync_lock); 3544 spin_lock_init(&conf->resync_lock);
3523 init_waitqueue_head(&conf->wait_barrier); 3545 init_waitqueue_head(&conf->wait_barrier);
3524 3546
3525 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 3547 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3526 if (!conf->thread) 3548 if (!conf->thread)
3527 goto out; 3549 goto out;
3528 3550
3529 conf->mddev = mddev; 3551 conf->mddev = mddev;
3530 return conf; 3552 return conf;
3531 3553
3532 out: 3554 out:
3533 if (err == -ENOMEM) 3555 if (err == -ENOMEM)
3534 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3556 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3535 mdname(mddev)); 3557 mdname(mddev));
3536 if (conf) { 3558 if (conf) {
3537 if (conf->r10bio_pool) 3559 if (conf->r10bio_pool)
3538 mempool_destroy(conf->r10bio_pool); 3560 mempool_destroy(conf->r10bio_pool);
3539 kfree(conf->mirrors); 3561 kfree(conf->mirrors);
3540 safe_put_page(conf->tmppage); 3562 safe_put_page(conf->tmppage);
3541 kfree(conf); 3563 kfree(conf);
3542 } 3564 }
3543 return ERR_PTR(err); 3565 return ERR_PTR(err);
3544 } 3566 }
3545 3567
3546 static int run(struct mddev *mddev) 3568 static int run(struct mddev *mddev)
3547 { 3569 {
3548 struct r10conf *conf; 3570 struct r10conf *conf;
3549 int i, disk_idx, chunk_size; 3571 int i, disk_idx, chunk_size;
3550 struct raid10_info *disk; 3572 struct raid10_info *disk;
3551 struct md_rdev *rdev; 3573 struct md_rdev *rdev;
3552 sector_t size; 3574 sector_t size;
3553 sector_t min_offset_diff = 0; 3575 sector_t min_offset_diff = 0;
3554 int first = 1; 3576 int first = 1;
3555 bool discard_supported = false; 3577 bool discard_supported = false;
3556 3578
3557 if (mddev->private == NULL) { 3579 if (mddev->private == NULL) {
3558 conf = setup_conf(mddev); 3580 conf = setup_conf(mddev);
3559 if (IS_ERR(conf)) 3581 if (IS_ERR(conf))
3560 return PTR_ERR(conf); 3582 return PTR_ERR(conf);
3561 mddev->private = conf; 3583 mddev->private = conf;
3562 } 3584 }
3563 conf = mddev->private; 3585 conf = mddev->private;
3564 if (!conf) 3586 if (!conf)
3565 goto out; 3587 goto out;
3566 3588
3567 mddev->thread = conf->thread; 3589 mddev->thread = conf->thread;
3568 conf->thread = NULL; 3590 conf->thread = NULL;
3569 3591
3570 chunk_size = mddev->chunk_sectors << 9; 3592 chunk_size = mddev->chunk_sectors << 9;
3571 if (mddev->queue) { 3593 if (mddev->queue) {
3572 blk_queue_max_discard_sectors(mddev->queue, 3594 blk_queue_max_discard_sectors(mddev->queue,
3573 mddev->chunk_sectors); 3595 mddev->chunk_sectors);
3574 blk_queue_max_write_same_sectors(mddev->queue, 3596 blk_queue_max_write_same_sectors(mddev->queue,
3575 mddev->chunk_sectors); 3597 mddev->chunk_sectors);
3576 blk_queue_io_min(mddev->queue, chunk_size); 3598 blk_queue_io_min(mddev->queue, chunk_size);
3577 if (conf->geo.raid_disks % conf->geo.near_copies) 3599 if (conf->geo.raid_disks % conf->geo.near_copies)
3578 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3600 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3579 else 3601 else
3580 blk_queue_io_opt(mddev->queue, chunk_size * 3602 blk_queue_io_opt(mddev->queue, chunk_size *
3581 (conf->geo.raid_disks / conf->geo.near_copies)); 3603 (conf->geo.raid_disks / conf->geo.near_copies));
3582 } 3604 }
3583 3605
3584 rdev_for_each(rdev, mddev) { 3606 rdev_for_each(rdev, mddev) {
3585 long long diff; 3607 long long diff;
3586 struct request_queue *q; 3608 struct request_queue *q;
3587 3609
3588 disk_idx = rdev->raid_disk; 3610 disk_idx = rdev->raid_disk;
3589 if (disk_idx < 0) 3611 if (disk_idx < 0)
3590 continue; 3612 continue;
3591 if (disk_idx >= conf->geo.raid_disks && 3613 if (disk_idx >= conf->geo.raid_disks &&
3592 disk_idx >= conf->prev.raid_disks) 3614 disk_idx >= conf->prev.raid_disks)
3593 continue; 3615 continue;
3594 disk = conf->mirrors + disk_idx; 3616 disk = conf->mirrors + disk_idx;
3595 3617
3596 if (test_bit(Replacement, &rdev->flags)) { 3618 if (test_bit(Replacement, &rdev->flags)) {
3597 if (disk->replacement) 3619 if (disk->replacement)
3598 goto out_free_conf; 3620 goto out_free_conf;
3599 disk->replacement = rdev; 3621 disk->replacement = rdev;
3600 } else { 3622 } else {
3601 if (disk->rdev) 3623 if (disk->rdev)
3602 goto out_free_conf; 3624 goto out_free_conf;
3603 disk->rdev = rdev; 3625 disk->rdev = rdev;
3604 } 3626 }
3605 q = bdev_get_queue(rdev->bdev); 3627 q = bdev_get_queue(rdev->bdev);
3606 if (q->merge_bvec_fn) 3628 if (q->merge_bvec_fn)
3607 mddev->merge_check_needed = 1; 3629 mddev->merge_check_needed = 1;
3608 diff = (rdev->new_data_offset - rdev->data_offset); 3630 diff = (rdev->new_data_offset - rdev->data_offset);
3609 if (!mddev->reshape_backwards) 3631 if (!mddev->reshape_backwards)
3610 diff = -diff; 3632 diff = -diff;
3611 if (diff < 0) 3633 if (diff < 0)
3612 diff = 0; 3634 diff = 0;
3613 if (first || diff < min_offset_diff) 3635 if (first || diff < min_offset_diff)
3614 min_offset_diff = diff; 3636 min_offset_diff = diff;
3615 3637
3616 if (mddev->gendisk) 3638 if (mddev->gendisk)
3617 disk_stack_limits(mddev->gendisk, rdev->bdev, 3639 disk_stack_limits(mddev->gendisk, rdev->bdev,
3618 rdev->data_offset << 9); 3640 rdev->data_offset << 9);
3619 3641
3620 disk->head_position = 0; 3642 disk->head_position = 0;
3621 3643
3622 if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 3644 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3623 discard_supported = true; 3645 discard_supported = true;
3624 } 3646 }
3625 3647
3626 if (mddev->queue) { 3648 if (mddev->queue) {
3627 if (discard_supported) 3649 if (discard_supported)
3628 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 3650 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3629 mddev->queue); 3651 mddev->queue);
3630 else 3652 else
3631 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 3653 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3632 mddev->queue); 3654 mddev->queue);
3633 } 3655 }
3634 /* need to check that every block has at least one working mirror */ 3656 /* need to check that every block has at least one working mirror */
3635 if (!enough(conf, -1)) { 3657 if (!enough(conf, -1)) {
3636 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3658 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3637 mdname(mddev)); 3659 mdname(mddev));
3638 goto out_free_conf; 3660 goto out_free_conf;
3639 } 3661 }
3640 3662
3641 if (conf->reshape_progress != MaxSector) { 3663 if (conf->reshape_progress != MaxSector) {
3642 /* must ensure that shape change is supported */ 3664 /* must ensure that shape change is supported */
3643 if (conf->geo.far_copies != 1 && 3665 if (conf->geo.far_copies != 1 &&
3644 conf->geo.far_offset == 0) 3666 conf->geo.far_offset == 0)
3645 goto out_free_conf; 3667 goto out_free_conf;
3646 if (conf->prev.far_copies != 1 && 3668 if (conf->prev.far_copies != 1 &&
3647 conf->geo.far_offset == 0) 3669 conf->geo.far_offset == 0)
3648 goto out_free_conf; 3670 goto out_free_conf;
3649 } 3671 }
3650 3672
3651 mddev->degraded = 0; 3673 mddev->degraded = 0;
3652 for (i = 0; 3674 for (i = 0;
3653 i < conf->geo.raid_disks 3675 i < conf->geo.raid_disks
3654 || i < conf->prev.raid_disks; 3676 || i < conf->prev.raid_disks;
3655 i++) { 3677 i++) {
3656 3678
3657 disk = conf->mirrors + i; 3679 disk = conf->mirrors + i;
3658 3680
3659 if (!disk->rdev && disk->replacement) { 3681 if (!disk->rdev && disk->replacement) {
3660 /* The replacement is all we have - use it */ 3682 /* The replacement is all we have - use it */
3661 disk->rdev = disk->replacement; 3683 disk->rdev = disk->replacement;
3662 disk->replacement = NULL; 3684 disk->replacement = NULL;
3663 clear_bit(Replacement, &disk->rdev->flags); 3685 clear_bit(Replacement, &disk->rdev->flags);
3664 } 3686 }
3665 3687
3666 if (!disk->rdev || 3688 if (!disk->rdev ||
3667 !test_bit(In_sync, &disk->rdev->flags)) { 3689 !test_bit(In_sync, &disk->rdev->flags)) {
3668 disk->head_position = 0; 3690 disk->head_position = 0;
3669 mddev->degraded++; 3691 mddev->degraded++;
3670 if (disk->rdev) 3692 if (disk->rdev)
3671 conf->fullsync = 1; 3693 conf->fullsync = 1;
3672 } 3694 }
3673 disk->recovery_disabled = mddev->recovery_disabled - 1; 3695 disk->recovery_disabled = mddev->recovery_disabled - 1;
3674 } 3696 }
3675 3697
3676 if (mddev->recovery_cp != MaxSector) 3698 if (mddev->recovery_cp != MaxSector)
3677 printk(KERN_NOTICE "md/raid10:%s: not clean" 3699 printk(KERN_NOTICE "md/raid10:%s: not clean"
3678 " -- starting background reconstruction\n", 3700 " -- starting background reconstruction\n",
3679 mdname(mddev)); 3701 mdname(mddev));
3680 printk(KERN_INFO 3702 printk(KERN_INFO
3681 "md/raid10:%s: active with %d out of %d devices\n", 3703 "md/raid10:%s: active with %d out of %d devices\n",
3682 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 3704 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3683 conf->geo.raid_disks); 3705 conf->geo.raid_disks);
3684 /* 3706 /*
3685 * Ok, everything is just fine now 3707 * Ok, everything is just fine now
3686 */ 3708 */
3687 mddev->dev_sectors = conf->dev_sectors; 3709 mddev->dev_sectors = conf->dev_sectors;
3688 size = raid10_size(mddev, 0, 0); 3710 size = raid10_size(mddev, 0, 0);
3689 md_set_array_sectors(mddev, size); 3711 md_set_array_sectors(mddev, size);
3690 mddev->resync_max_sectors = size; 3712 mddev->resync_max_sectors = size;
3691 3713
3692 if (mddev->queue) { 3714 if (mddev->queue) {
3693 int stripe = conf->geo.raid_disks * 3715 int stripe = conf->geo.raid_disks *
3694 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3716 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3695 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3717 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3696 mddev->queue->backing_dev_info.congested_data = mddev; 3718 mddev->queue->backing_dev_info.congested_data = mddev;
3697 3719
3698 /* Calculate max read-ahead size. 3720 /* Calculate max read-ahead size.
3699 * We need to readahead at least twice a whole stripe.... 3721 * We need to readahead at least twice a whole stripe....
3700 * maybe... 3722 * maybe...
3701 */ 3723 */
3702 stripe /= conf->geo.near_copies; 3724 stripe /= conf->geo.near_copies;
3703 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3725 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3704 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3726 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3705 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3727 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3706 } 3728 }
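	/*
	 * Worked example of the read-ahead sizing above (illustrative
	 * numbers only): 6 devices, near_copies = 2, 512KiB chunks
	 * (chunk_sectors = 1024) and PAGE_SIZE = 4096:
	 *
	 *	stripe  = 6 * ((1024 << 9) / 4096) = 6 * 128 = 768 pages
	 *	stripe /= 2                        = 384 pages
	 *
	 * so ra_pages is raised to at least 2 * 384 = 768 pages (3MiB),
	 * i.e. twice the data covered by one full stripe.
	 */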
3707 3729
3708 3730
3709 if (md_integrity_register(mddev)) 3731 if (md_integrity_register(mddev))
3710 goto out_free_conf; 3732 goto out_free_conf;
3711 3733
3712 if (conf->reshape_progress != MaxSector) { 3734 if (conf->reshape_progress != MaxSector) {
3713 unsigned long before_length, after_length; 3735 unsigned long before_length, after_length;
3714 3736
3715 before_length = ((1 << conf->prev.chunk_shift) * 3737 before_length = ((1 << conf->prev.chunk_shift) *
3716 conf->prev.far_copies); 3738 conf->prev.far_copies);
3717 after_length = ((1 << conf->geo.chunk_shift) * 3739 after_length = ((1 << conf->geo.chunk_shift) *
3718 conf->geo.far_copies); 3740 conf->geo.far_copies);
3719 3741
3720 if (max(before_length, after_length) > min_offset_diff) { 3742 if (max(before_length, after_length) > min_offset_diff) {
3721 /* This cannot work */ 3743 /* This cannot work */
3722 printk("md/raid10: offset difference not enough to continue reshape\n"); 3744 printk("md/raid10: offset difference not enough to continue reshape\n");
3723 goto out_free_conf; 3745 goto out_free_conf;
3724 } 3746 }
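	/*
	 * For illustration (hypothetical values): restarting a reshape with
	 * 512KiB chunks (1 << chunk_shift = 1024 sectors) in both geometries,
	 * an old 'offset' layout with far_copies = 2 and a new layout with
	 * far_copies = 1 gives before_length = 2048 and after_length = 1024
	 * sectors, so every device needs at least 2048 sectors (1MiB) between
	 * its old and new data_offset for the restart to be accepted.
	 */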
3725 conf->offset_diff = min_offset_diff; 3747 conf->offset_diff = min_offset_diff;
3726 3748
3727 conf->reshape_safe = conf->reshape_progress; 3749 conf->reshape_safe = conf->reshape_progress;
3728 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3750 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3729 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3751 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3730 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 3752 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3731 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3753 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3732 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3754 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3733 "reshape"); 3755 "reshape");
3734 } 3756 }
3735 3757
3736 return 0; 3758 return 0;
3737 3759
3738 out_free_conf: 3760 out_free_conf:
3739 md_unregister_thread(&mddev->thread); 3761 md_unregister_thread(&mddev->thread);
3740 if (conf->r10bio_pool) 3762 if (conf->r10bio_pool)
3741 mempool_destroy(conf->r10bio_pool); 3763 mempool_destroy(conf->r10bio_pool);
3742 safe_put_page(conf->tmppage); 3764 safe_put_page(conf->tmppage);
3743 kfree(conf->mirrors); 3765 kfree(conf->mirrors);
3744 kfree(conf); 3766 kfree(conf);
3745 mddev->private = NULL; 3767 mddev->private = NULL;
3746 out: 3768 out:
3747 return -EIO; 3769 return -EIO;
3748 } 3770 }
3749 3771
3750 static int stop(struct mddev *mddev) 3772 static int stop(struct mddev *mddev)
3751 { 3773 {
3752 struct r10conf *conf = mddev->private; 3774 struct r10conf *conf = mddev->private;
3753 3775
3754 raise_barrier(conf, 0); 3776 raise_barrier(conf, 0);
3755 lower_barrier(conf); 3777 lower_barrier(conf);
3756 3778
3757 md_unregister_thread(&mddev->thread); 3779 md_unregister_thread(&mddev->thread);
3758 if (mddev->queue) 3780 if (mddev->queue)
3759 /* the unplug fn references 'conf'*/ 3781 /* the unplug fn references 'conf'*/
3760 blk_sync_queue(mddev->queue); 3782 blk_sync_queue(mddev->queue);
3761 3783
3762 if (conf->r10bio_pool) 3784 if (conf->r10bio_pool)
3763 mempool_destroy(conf->r10bio_pool); 3785 mempool_destroy(conf->r10bio_pool);
3764 kfree(conf->mirrors); 3786 kfree(conf->mirrors);
3765 kfree(conf); 3787 kfree(conf);
3766 mddev->private = NULL; 3788 mddev->private = NULL;
3767 return 0; 3789 return 0;
3768 } 3790 }
3769 3791
3770 static void raid10_quiesce(struct mddev *mddev, int state) 3792 static void raid10_quiesce(struct mddev *mddev, int state)
3771 { 3793 {
3772 struct r10conf *conf = mddev->private; 3794 struct r10conf *conf = mddev->private;
3773 3795
3774 switch(state) { 3796 switch(state) {
3775 case 1: 3797 case 1:
3776 raise_barrier(conf, 0); 3798 raise_barrier(conf, 0);
3777 break; 3799 break;
3778 case 0: 3800 case 0:
3779 lower_barrier(conf); 3801 lower_barrier(conf);
3780 break; 3802 break;
3781 } 3803 }
3782 } 3804 }
3783 3805
3784 static int raid10_resize(struct mddev *mddev, sector_t sectors) 3806 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3785 { 3807 {
3786 /* Resize of 'far' arrays is not supported. 3808 /* Resize of 'far' arrays is not supported.
3787 * For 'near' and 'offset' arrays we can set the 3809 * For 'near' and 'offset' arrays we can set the
3788 * number of sectors used to be an appropriate multiple 3810 * number of sectors used to be an appropriate multiple
3789 * of the chunk size. 3811 * of the chunk size.
3790 * For 'offset', this is far_copies*chunksize. 3812 * For 'offset', this is far_copies*chunksize.
3791 * For 'near' the multiplier is the LCM of 3813 * For 'near' the multiplier is the LCM of
3792 * near_copies and raid_disks. 3814 * near_copies and raid_disks.
3793 * So if far_copies > 1 && !far_offset, fail. 3815 * So if far_copies > 1 && !far_offset, fail.
3794 * Else find LCM(raid_disks, near_copies)*far_copies and 3816 * Else find LCM(raid_disks, near_copies)*far_copies and
3795 * multiply by chunk_size. Then round to this number. 3817 * multiply by chunk_size. Then round to this number.
3796 * This is mostly done by raid10_size() 3818 * This is mostly done by raid10_size()
3797 */ 3819 */
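	/*
	 * Illustrative example of the rounding rule above (hypothetical
	 * geometry): raid_disks = 6, near_copies = 2, far_copies = 1 and
	 * 512KiB chunks give LCM(6, 2) * 1 = 6 chunks, so the array size is
	 * kept a multiple of 3MiB; raid10_size() and calc_sectors() below
	 * perform the actual rounding.
	 */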
3798 struct r10conf *conf = mddev->private; 3820 struct r10conf *conf = mddev->private;
3799 sector_t oldsize, size; 3821 sector_t oldsize, size;
3800 3822
3801 if (mddev->reshape_position != MaxSector) 3823 if (mddev->reshape_position != MaxSector)
3802 return -EBUSY; 3824 return -EBUSY;
3803 3825
3804 if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 3826 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3805 return -EINVAL; 3827 return -EINVAL;
3806 3828
3807 oldsize = raid10_size(mddev, 0, 0); 3829 oldsize = raid10_size(mddev, 0, 0);
3808 size = raid10_size(mddev, sectors, 0); 3830 size = raid10_size(mddev, sectors, 0);
3809 if (mddev->external_size && 3831 if (mddev->external_size &&
3810 mddev->array_sectors > size) 3832 mddev->array_sectors > size)
3811 return -EINVAL; 3833 return -EINVAL;
3812 if (mddev->bitmap) { 3834 if (mddev->bitmap) {
3813 int ret = bitmap_resize(mddev->bitmap, size, 0, 0); 3835 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3814 if (ret) 3836 if (ret)
3815 return ret; 3837 return ret;
3816 } 3838 }
3817 md_set_array_sectors(mddev, size); 3839 md_set_array_sectors(mddev, size);
3818 set_capacity(mddev->gendisk, mddev->array_sectors); 3840 set_capacity(mddev->gendisk, mddev->array_sectors);
3819 revalidate_disk(mddev->gendisk); 3841 revalidate_disk(mddev->gendisk);
3820 if (sectors > mddev->dev_sectors && 3842 if (sectors > mddev->dev_sectors &&
3821 mddev->recovery_cp > oldsize) { 3843 mddev->recovery_cp > oldsize) {
3822 mddev->recovery_cp = oldsize; 3844 mddev->recovery_cp = oldsize;
3823 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3845 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3824 } 3846 }
3825 calc_sectors(conf, sectors); 3847 calc_sectors(conf, sectors);
3826 mddev->dev_sectors = conf->dev_sectors; 3848 mddev->dev_sectors = conf->dev_sectors;
3827 mddev->resync_max_sectors = size; 3849 mddev->resync_max_sectors = size;
3828 return 0; 3850 return 0;
3829 } 3851 }
3830 3852
3831 static void *raid10_takeover_raid0(struct mddev *mddev) 3853 static void *raid10_takeover_raid0(struct mddev *mddev)
3832 { 3854 {
3833 struct md_rdev *rdev; 3855 struct md_rdev *rdev;
3834 struct r10conf *conf; 3856 struct r10conf *conf;
3835 3857
3836 if (mddev->degraded > 0) { 3858 if (mddev->degraded > 0) {
3837 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 3859 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3838 mdname(mddev)); 3860 mdname(mddev));
3839 return ERR_PTR(-EINVAL); 3861 return ERR_PTR(-EINVAL);
3840 } 3862 }
3841 3863
3842 /* Set new parameters */ 3864 /* Set new parameters */
3843 mddev->new_level = 10; 3865 mddev->new_level = 10;
3844 /* new layout: far_copies = 1, near_copies = 2 */ 3866 /* new layout: far_copies = 1, near_copies = 2 */
3845 mddev->new_layout = (1<<8) + 2; 3867 mddev->new_layout = (1<<8) + 2;
3846 mddev->new_chunk_sectors = mddev->chunk_sectors; 3868 mddev->new_chunk_sectors = mddev->chunk_sectors;
3847 mddev->delta_disks = mddev->raid_disks; 3869 mddev->delta_disks = mddev->raid_disks;
3848 mddev->raid_disks *= 2; 3870 mddev->raid_disks *= 2;
3849 /* make sure it will be not marked as dirty */ 3871 /* make sure it will be not marked as dirty */
3850 mddev->recovery_cp = MaxSector; 3872 mddev->recovery_cp = MaxSector;
3851 3873
3852 conf = setup_conf(mddev); 3874 conf = setup_conf(mddev);
3853 if (!IS_ERR(conf)) { 3875 if (!IS_ERR(conf)) {
3854 rdev_for_each(rdev, mddev) 3876 rdev_for_each(rdev, mddev)
3855 if (rdev->raid_disk >= 0) 3877 if (rdev->raid_disk >= 0)
3856 rdev->new_raid_disk = rdev->raid_disk * 2; 3878 rdev->new_raid_disk = rdev->raid_disk * 2;
3857 conf->barrier = 1; 3879 conf->barrier = 1;
3858 } 3880 }
3859 3881
3860 return conf; 3882 return conf;
3861 } 3883 }
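/*
 * Sketch only, not part of the driver: how the raid10 'layout' word used
 * above decodes, following the commit description: near copies in bits 0-7,
 * far copies in bits 8-15, bit 16 selecting the "offset" algorithm and
 * bit 17 selecting the new "wrap copies within far sets" behaviour.  For
 * the takeover value (1 << 8) + 2 this yields near = 2, far = 1.  The
 * helper name is hypothetical.
 */
static void __maybe_unused raid10_layout_example(int layout)
{
	int near_copies  = layout & 255;	/* (1<<8)+2 -> 2 */
	int far_copies   = (layout >> 8) & 255;	/* (1<<8)+2 -> 1 */
	int far_offset   = (layout >> 16) & 1;	/* "offset" vs "far" */
	int use_far_sets = (layout >> 17) & 1;	/* new in this patch series */

	pr_debug("near=%d far=%d offset=%d far_sets=%d\n",
		 near_copies, far_copies, far_offset, use_far_sets);
}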
3862 3884
3863 static void *raid10_takeover(struct mddev *mddev) 3885 static void *raid10_takeover(struct mddev *mddev)
3864 { 3886 {
3865 struct r0conf *raid0_conf; 3887 struct r0conf *raid0_conf;
3866 3888
3867 /* raid10 can take over: 3889 /* raid10 can take over:
3868 * raid0 - providing it has only two drives 3890 * raid0 - providing it has only two drives
3869 */ 3891 */
3870 if (mddev->level == 0) { 3892 if (mddev->level == 0) {
3871 /* for raid0 takeover only one zone is supported */ 3893 /* for raid0 takeover only one zone is supported */
3872 raid0_conf = mddev->private; 3894 raid0_conf = mddev->private;
3873 if (raid0_conf->nr_strip_zones > 1) { 3895 if (raid0_conf->nr_strip_zones > 1) {
3874 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 3896 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3875 " with more than one zone.\n", 3897 " with more than one zone.\n",
3876 mdname(mddev)); 3898 mdname(mddev));
3877 return ERR_PTR(-EINVAL); 3899 return ERR_PTR(-EINVAL);
3878 } 3900 }
3879 return raid10_takeover_raid0(mddev); 3901 return raid10_takeover_raid0(mddev);
3880 } 3902 }
3881 return ERR_PTR(-EINVAL); 3903 return ERR_PTR(-EINVAL);
3882 } 3904 }
3883 3905
3884 static int raid10_check_reshape(struct mddev *mddev) 3906 static int raid10_check_reshape(struct mddev *mddev)
3885 { 3907 {
3886 /* Called when there is a request to change 3908 /* Called when there is a request to change
3887 * - layout (to ->new_layout) 3909 * - layout (to ->new_layout)
3888 * - chunk size (to ->new_chunk_sectors) 3910 * - chunk size (to ->new_chunk_sectors)
3889 * - raid_disks (by delta_disks) 3911 * - raid_disks (by delta_disks)
3890 * or when trying to restart a reshape that was ongoing. 3912 * or when trying to restart a reshape that was ongoing.
3891 * 3913 *
3892 * We need to validate the request and possibly allocate 3914 * We need to validate the request and possibly allocate
3893 * space if that might be an issue later. 3915 * space if that might be an issue later.
3894 * 3916 *
3895 * Currently we reject any reshape of a 'far' mode array, 3917 * Currently we reject any reshape of a 'far' mode array,
3896 * allow chunk size to change if new is generally acceptable, 3918 * allow chunk size to change if new is generally acceptable,
3897 * allow raid_disks to increase, and allow 3919 * allow raid_disks to increase, and allow
3898 * a switch between 'near' mode and 'offset' mode. 3920 * a switch between 'near' mode and 'offset' mode.
3899 */ 3921 */
3900 struct r10conf *conf = mddev->private; 3922 struct r10conf *conf = mddev->private;
3901 struct geom geo; 3923 struct geom geo;
3902 3924
3903 if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 3925 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3904 return -EINVAL; 3926 return -EINVAL;
3905 3927
3906 if (setup_geo(&geo, mddev, geo_start) != conf->copies) 3928 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3907 /* mustn't change number of copies */ 3929 /* mustn't change number of copies */
3908 return -EINVAL; 3930 return -EINVAL;
3909 if (geo.far_copies > 1 && !geo.far_offset) 3931 if (geo.far_copies > 1 && !geo.far_offset)
3910 /* Cannot switch to 'far' mode */ 3932 /* Cannot switch to 'far' mode */
3911 return -EINVAL; 3933 return -EINVAL;
3912 3934
3913 if (mddev->array_sectors & geo.chunk_mask) 3935 if (mddev->array_sectors & geo.chunk_mask)
3914 /* not factor of array size */ 3936 /* not factor of array size */
3915 return -EINVAL; 3937 return -EINVAL;
3916 3938
3917 if (!enough(conf, -1)) 3939 if (!enough(conf, -1))
3918 return -EINVAL; 3940 return -EINVAL;
3919 3941
3920 kfree(conf->mirrors_new); 3942 kfree(conf->mirrors_new);
3921 conf->mirrors_new = NULL; 3943 conf->mirrors_new = NULL;
3922 if (mddev->delta_disks > 0) { 3944 if (mddev->delta_disks > 0) {
3923 /* allocate new 'mirrors' list */ 3945 /* allocate new 'mirrors' list */
3924 conf->mirrors_new = kzalloc( 3946 conf->mirrors_new = kzalloc(
3925 sizeof(struct raid10_info) 3947 sizeof(struct raid10_info)
3926 *(mddev->raid_disks + 3948 *(mddev->raid_disks +
3927 mddev->delta_disks), 3949 mddev->delta_disks),
3928 GFP_KERNEL); 3950 GFP_KERNEL);
3929 if (!conf->mirrors_new) 3951 if (!conf->mirrors_new)
3930 return -ENOMEM; 3952 return -ENOMEM;
3931 } 3953 }
3932 return 0; 3954 return 0;
3933 } 3955 }
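/*
 * For example (hypothetical geometries): a 4-device near=2 array may be
 * reshaped to 6 devices, or switched to an offset=2 layout, since the
 * number of copies stays at 2; a request that would change the copy count,
 * or whose target layout is plain 'far' (far_copies > 1 with
 * far_offset == 0), is rejected with -EINVAL by the checks above.
 */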
3934 3956
3935 /* 3957 /*
3936 * Need to check if array has failed when deciding whether to: 3958 * Need to check if array has failed when deciding whether to:
3937 * - start an array 3959 * - start an array
3938 * - remove non-faulty devices 3960 * - remove non-faulty devices
3939 * - add a spare 3961 * - add a spare
3940 * - allow a reshape 3962 * - allow a reshape
3941 * This determination is simple when no reshape is happening. 3963 * This determination is simple when no reshape is happening.
3942 * However if there is a reshape, we need to carefully check 3964 * However if there is a reshape, we need to carefully check
3943 * both the before and after sections. 3965 * both the before and after sections.
3944 * This is because some failed devices may only affect one 3966 * This is because some failed devices may only affect one
3945 * of the two sections, and some non-in_sync devices may 3967 * of the two sections, and some non-in_sync devices may
3946 * be insync in the section most affected by failed devices. 3968 * be insync in the section most affected by failed devices.
3947 */ 3969 */
3948 static int calc_degraded(struct r10conf *conf) 3970 static int calc_degraded(struct r10conf *conf)
3949 { 3971 {
3950 int degraded, degraded2; 3972 int degraded, degraded2;
3951 int i; 3973 int i;
3952 3974
3953 rcu_read_lock(); 3975 rcu_read_lock();
3954 degraded = 0; 3976 degraded = 0;
3955 /* 'prev' section first */ 3977 /* 'prev' section first */
3956 for (i = 0; i < conf->prev.raid_disks; i++) { 3978 for (i = 0; i < conf->prev.raid_disks; i++) {
3957 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3979 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3958 if (!rdev || test_bit(Faulty, &rdev->flags)) 3980 if (!rdev || test_bit(Faulty, &rdev->flags))
3959 degraded++; 3981 degraded++;
3960 else if (!test_bit(In_sync, &rdev->flags)) 3982 else if (!test_bit(In_sync, &rdev->flags))
3961 /* When we can reduce the number of devices in 3983 /* When we can reduce the number of devices in
3962 * an array, this might not contribute to 3984 * an array, this might not contribute to
3963 * 'degraded'. It does now. 3985 * 'degraded'. It does now.
3964 */ 3986 */
3965 degraded++; 3987 degraded++;
3966 } 3988 }
3967 rcu_read_unlock(); 3989 rcu_read_unlock();
3968 if (conf->geo.raid_disks == conf->prev.raid_disks) 3990 if (conf->geo.raid_disks == conf->prev.raid_disks)
3969 return degraded; 3991 return degraded;
3970 rcu_read_lock(); 3992 rcu_read_lock();
3971 degraded2 = 0; 3993 degraded2 = 0;
3972 for (i = 0; i < conf->geo.raid_disks; i++) { 3994 for (i = 0; i < conf->geo.raid_disks; i++) {
3973 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3995 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3974 if (!rdev || test_bit(Faulty, &rdev->flags)) 3996 if (!rdev || test_bit(Faulty, &rdev->flags))
3975 degraded2++; 3997 degraded2++;
3976 else if (!test_bit(In_sync, &rdev->flags)) { 3998 else if (!test_bit(In_sync, &rdev->flags)) {
3977 /* If reshape is increasing the number of devices, 3999 /* If reshape is increasing the number of devices,
3978 * this section has already been recovered, so 4000 * this section has already been recovered, so
3979 * it doesn't contribute to degraded. 4001 * it doesn't contribute to degraded.
3980 * else it does. 4002 * else it does.
3981 */ 4003 */
3982 if (conf->geo.raid_disks <= conf->prev.raid_disks) 4004 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3983 degraded2++; 4005 degraded2++;
3984 } 4006 }
3985 } 4007 }
3986 rcu_read_unlock(); 4008 rcu_read_unlock();
3987 if (degraded2 > degraded) 4009 if (degraded2 > degraded)
3988 return degraded2; 4010 return degraded2;
3989 return degraded; 4011 return degraded;
3990 } 4012 }
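/*
 * Example of the two-pass count above (hypothetical array): while growing
 * from 4 to 6 devices, a device being rebuilt into slot 2 (present but not
 * In_sync) counts against the 'prev' pass but not against the 'geo' pass,
 * because a section that is growing is treated as already recovered; the
 * larger of the two counts is what ends up in mddev->degraded.
 */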
3991 4013
3992 static int raid10_start_reshape(struct mddev *mddev) 4014 static int raid10_start_reshape(struct mddev *mddev)
3993 { 4015 {
3994 /* A 'reshape' has been requested. This commits 4016 /* A 'reshape' has been requested. This commits
3995 * the various 'new' fields and sets MD_RECOVERY_RESHAPE 4017 * the various 'new' fields and sets MD_RECOVERY_RESHAPE
3996 * This also checks if there are enough spares and adds them 4018 * This also checks if there are enough spares and adds them
3997 * to the array. 4019 * to the array.
3998 * We currently require enough spares to make the final 4020 * We currently require enough spares to make the final
3999 * array non-degraded. We also require that the difference 4021 * array non-degraded. We also require that the difference
4000 * between old and new data_offset - on each device - is 4022 * between old and new data_offset - on each device - is
4001 * enough that we never risk over-writing. 4023 * enough that we never risk over-writing.
4002 */ 4024 */
4003 4025
4004 unsigned long before_length, after_length; 4026 unsigned long before_length, after_length;
4005 sector_t min_offset_diff = 0; 4027 sector_t min_offset_diff = 0;
4006 int first = 1; 4028 int first = 1;
4007 struct geom new; 4029 struct geom new;
4008 struct r10conf *conf = mddev->private; 4030 struct r10conf *conf = mddev->private;
4009 struct md_rdev *rdev; 4031 struct md_rdev *rdev;
4010 int spares = 0; 4032 int spares = 0;
4011 int ret; 4033 int ret;
4012 4034
4013 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4035 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4014 return -EBUSY; 4036 return -EBUSY;
4015 4037
4016 if (setup_geo(&new, mddev, geo_start) != conf->copies) 4038 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4017 return -EINVAL; 4039 return -EINVAL;
4018 4040
4019 before_length = ((1 << conf->prev.chunk_shift) * 4041 before_length = ((1 << conf->prev.chunk_shift) *
4020 conf->prev.far_copies); 4042 conf->prev.far_copies);
4021 after_length = ((1 << conf->geo.chunk_shift) * 4043 after_length = ((1 << conf->geo.chunk_shift) *
4022 conf->geo.far_copies); 4044 conf->geo.far_copies);
4023 4045
4024 rdev_for_each(rdev, mddev) { 4046 rdev_for_each(rdev, mddev) {
4025 if (!test_bit(In_sync, &rdev->flags) 4047 if (!test_bit(In_sync, &rdev->flags)
4026 && !test_bit(Faulty, &rdev->flags)) 4048 && !test_bit(Faulty, &rdev->flags))
4027 spares++; 4049 spares++;
4028 if (rdev->raid_disk >= 0) { 4050 if (rdev->raid_disk >= 0) {
4029 long long diff = (rdev->new_data_offset 4051 long long diff = (rdev->new_data_offset
4030 - rdev->data_offset); 4052 - rdev->data_offset);
4031 if (!mddev->reshape_backwards) 4053 if (!mddev->reshape_backwards)
4032 diff = -diff; 4054 diff = -diff;
4033 if (diff < 0) 4055 if (diff < 0)
4034 diff = 0; 4056 diff = 0;
4035 if (first || diff < min_offset_diff) 4057 if (first || diff < min_offset_diff)
4036 min_offset_diff = diff; 4058 min_offset_diff = diff;
4037 } 4059 }
4038 } 4060 }
4039 4061
4040 if (max(before_length, after_length) > min_offset_diff) 4062 if (max(before_length, after_length) > min_offset_diff)
4041 return -EINVAL; 4063 return -EINVAL;
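	/*
	 * Worked example with made-up offsets: for a forward reshape where
	 * every member moves its data start from data_offset = 8192 down to
	 * new_data_offset = 4096, diff = -(4096 - 8192) = 4096 sectors on
	 * each device, so min_offset_diff = 4096.  With 512KiB chunks
	 * (1024 sectors) and far_copies = 1 in both geometries,
	 * max(1024, 1024) <= 4096 and the reshape is allowed to start.
	 */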
4042 4064
4043 if (spares < mddev->delta_disks) 4065 if (spares < mddev->delta_disks)
4044 return -EINVAL; 4066 return -EINVAL;
4045 4067
4046 conf->offset_diff = min_offset_diff; 4068 conf->offset_diff = min_offset_diff;
4047 spin_lock_irq(&conf->device_lock); 4069 spin_lock_irq(&conf->device_lock);
4048 if (conf->mirrors_new) { 4070 if (conf->mirrors_new) {
4049 memcpy(conf->mirrors_new, conf->mirrors, 4071 memcpy(conf->mirrors_new, conf->mirrors,
4050 sizeof(struct raid10_info)*conf->prev.raid_disks); 4072 sizeof(struct raid10_info)*conf->prev.raid_disks);
4051 smp_mb(); 4073 smp_mb();
4052 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 4074 kfree(conf->mirrors_old); /* FIXME and elsewhere */
4053 conf->mirrors_old = conf->mirrors; 4075 conf->mirrors_old = conf->mirrors;
4054 conf->mirrors = conf->mirrors_new; 4076 conf->mirrors = conf->mirrors_new;
4055 conf->mirrors_new = NULL; 4077 conf->mirrors_new = NULL;
4056 } 4078 }
4057 setup_geo(&conf->geo, mddev, geo_start); 4079 setup_geo(&conf->geo, mddev, geo_start);
4058 smp_mb(); 4080 smp_mb();
4059 if (mddev->reshape_backwards) { 4081 if (mddev->reshape_backwards) {
4060 sector_t size = raid10_size(mddev, 0, 0); 4082 sector_t size = raid10_size(mddev, 0, 0);
4061 if (size < mddev->array_sectors) { 4083 if (size < mddev->array_sectors) {
4062 spin_unlock_irq(&conf->device_lock); 4084 spin_unlock_irq(&conf->device_lock);
4063 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", 4085 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
4064 mdname(mddev)); 4086 mdname(mddev));
4065 return -EINVAL; 4087 return -EINVAL;
4066 } 4088 }
4067 mddev->resync_max_sectors = size; 4089 mddev->resync_max_sectors = size;
4068 conf->reshape_progress = size; 4090 conf->reshape_progress = size;
4069 } else 4091 } else
4070 conf->reshape_progress = 0; 4092 conf->reshape_progress = 0;
4071 spin_unlock_irq(&conf->device_lock); 4093 spin_unlock_irq(&conf->device_lock);
4072 4094
4073 if (mddev->delta_disks && mddev->bitmap) { 4095 if (mddev->delta_disks && mddev->bitmap) {
4074 ret = bitmap_resize(mddev->bitmap, 4096 ret = bitmap_resize(mddev->bitmap,
4075 raid10_size(mddev, 0, 4097 raid10_size(mddev, 0,
4076 conf->geo.raid_disks), 4098 conf->geo.raid_disks),
4077 0, 0); 4099 0, 0);
4078 if (ret) 4100 if (ret)
4079 goto abort; 4101 goto abort;
4080 } 4102 }
4081 if (mddev->delta_disks > 0) { 4103 if (mddev->delta_disks > 0) {
4082 rdev_for_each(rdev, mddev) 4104 rdev_for_each(rdev, mddev)
4083 if (rdev->raid_disk < 0 && 4105 if (rdev->raid_disk < 0 &&
4084 !test_bit(Faulty, &rdev->flags)) { 4106 !test_bit(Faulty, &rdev->flags)) {
4085 if (raid10_add_disk(mddev, rdev) == 0) { 4107 if (raid10_add_disk(mddev, rdev) == 0) {
4086 if (rdev->raid_disk >= 4108 if (rdev->raid_disk >=
4087 conf->prev.raid_disks) 4109 conf->prev.raid_disks)
4088 set_bit(In_sync, &rdev->flags); 4110 set_bit(In_sync, &rdev->flags);
4089 else 4111 else
4090 rdev->recovery_offset = 0; 4112 rdev->recovery_offset = 0;
4091 4113
4092 if (sysfs_link_rdev(mddev, rdev)) 4114 if (sysfs_link_rdev(mddev, rdev))
4093 /* Failure here is OK */; 4115 /* Failure here is OK */;
4094 } 4116 }
4095 } else if (rdev->raid_disk >= conf->prev.raid_disks 4117 } else if (rdev->raid_disk >= conf->prev.raid_disks
4096 && !test_bit(Faulty, &rdev->flags)) { 4118 && !test_bit(Faulty, &rdev->flags)) {
4097 /* This is a spare that was manually added */ 4119 /* This is a spare that was manually added */
4098 set_bit(In_sync, &rdev->flags); 4120 set_bit(In_sync, &rdev->flags);
4099 } 4121 }
4100 } 4122 }
4101 /* When a reshape changes the number of devices, 4123 /* When a reshape changes the number of devices,
4102 * ->degraded is measured against the larger of the 4124 * ->degraded is measured against the larger of the
4103 * pre and post numbers. 4125 * pre and post numbers.
4104 */ 4126 */
4105 spin_lock_irq(&conf->device_lock); 4127 spin_lock_irq(&conf->device_lock);
4106 mddev->degraded = calc_degraded(conf); 4128 mddev->degraded = calc_degraded(conf);
4107 spin_unlock_irq(&conf->device_lock); 4129 spin_unlock_irq(&conf->device_lock);
4108 mddev->raid_disks = conf->geo.raid_disks; 4130 mddev->raid_disks = conf->geo.raid_disks;
4109 mddev->reshape_position = conf->reshape_progress; 4131 mddev->reshape_position = conf->reshape_progress;
4110 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4132 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4111 4133
4112 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4134 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4113 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4135 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4114 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4136 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4115 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4137 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4116 4138
4117 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4139 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4118 "reshape"); 4140 "reshape");
4119 if (!mddev->sync_thread) { 4141 if (!mddev->sync_thread) {
4120 ret = -EAGAIN; 4142 ret = -EAGAIN;
4121 goto abort; 4143 goto abort;
4122 } 4144 }
4123 conf->reshape_checkpoint = jiffies; 4145 conf->reshape_checkpoint = jiffies;
4124 md_wakeup_thread(mddev->sync_thread); 4146 md_wakeup_thread(mddev->sync_thread);
4125 md_new_event(mddev); 4147 md_new_event(mddev);
4126 return 0; 4148 return 0;
4127 4149
4128 abort: 4150 abort:
4129 mddev->recovery = 0; 4151 mddev->recovery = 0;
4130 spin_lock_irq(&conf->device_lock); 4152 spin_lock_irq(&conf->device_lock);
4131 conf->geo = conf->prev; 4153 conf->geo = conf->prev;
4132 mddev->raid_disks = conf->geo.raid_disks; 4154 mddev->raid_disks = conf->geo.raid_disks;
4133 rdev_for_each(rdev, mddev) 4155 rdev_for_each(rdev, mddev)
4134 rdev->new_data_offset = rdev->data_offset; 4156 rdev->new_data_offset = rdev->data_offset;
4135 smp_wmb(); 4157 smp_wmb();
4136 conf->reshape_progress = MaxSector; 4158 conf->reshape_progress = MaxSector;
4137 mddev->reshape_position = MaxSector; 4159 mddev->reshape_position = MaxSector;
4138 spin_unlock_irq(&conf->device_lock); 4160 spin_unlock_irq(&conf->device_lock);
4139 return ret; 4161 return ret;
4140 } 4162 }
4141 4163
4142 /* Calculate the last device-address that could contain 4164 /* Calculate the last device-address that could contain
4143 * any block from the chunk that includes the array-address 's' 4165 * any block from the chunk that includes the array-address 's'
4144 * and report the next address. 4166 * and report the next address.
4145 * i.e. the address returned will be chunk-aligned and after 4167 * i.e. the address returned will be chunk-aligned and after
4146 * any data that is in the chunk containing 's'. 4168 * any data that is in the chunk containing 's'.
4147 */ 4169 */
4148 static sector_t last_dev_address(sector_t s, struct geom *geo) 4170 static sector_t last_dev_address(sector_t s, struct geom *geo)
4149 { 4171 {
4150 s = (s | geo->chunk_mask) + 1; 4172 s = (s | geo->chunk_mask) + 1;
4151 s >>= geo->chunk_shift; 4173 s >>= geo->chunk_shift;
4152 s *= geo->near_copies; 4174 s *= geo->near_copies;
4153 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4175 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4154 s *= geo->far_copies; 4176 s *= geo->far_copies;
4155 s <<= geo->chunk_shift; 4177 s <<= geo->chunk_shift;
4156 return s; 4178 return s;
4157 } 4179 }
4158 4180
4159 /* Calculate the first device-address that could contain 4181 /* Calculate the first device-address that could contain
4160 * any block from the chunk that includes the array-address 's'. 4182 * any block from the chunk that includes the array-address 's'.
4161 * This too will be the start of a chunk 4183 * This too will be the start of a chunk
4162 */ 4184 */
4163 static sector_t first_dev_address(sector_t s, struct geom *geo) 4185 static sector_t first_dev_address(sector_t s, struct geom *geo)
4164 { 4186 {
4165 s >>= geo->chunk_shift; 4187 s >>= geo->chunk_shift;
4166 s *= geo->near_copies; 4188 s *= geo->near_copies;
4167 sector_div(s, geo->raid_disks); 4189 sector_div(s, geo->raid_disks);
4168 s *= geo->far_copies; 4190 s *= geo->far_copies;
4169 s <<= geo->chunk_shift; 4191 s <<= geo->chunk_shift;
4170 return s; 4192 return s;
4171 } 4193 }
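/*
 * Worked example for the two helpers above (hypothetical geometry:
 * raid_disks = 6, near_copies = 2, far_copies = 1, 512KiB chunks, so
 * chunk_shift = 10 and chunk_mask = 1023), for array address s = 5000:
 *
 *	last_dev_address:  (5000 | 1023) + 1 = 5120 -> 5 chunks
 *			   5 * 2 = 10; DIV_ROUND_UP(10, 6) = 2
 *			   2 * 1 = 2 chunks -> 2 << 10 = 2048
 *	first_dev_address: 5000 >> 10 = 4 chunks; 4 * 2 = 8
 *			   8 / 6 = 1 chunk; 1 * 1 = 1 -> 1 << 10 = 1024
 *
 * so any block of the chunk containing array sector 5000 lives between
 * device sectors 1024 and 2048 (exclusive) on some member.
 */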
4172 4194
4173 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4195 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4174 int *skipped) 4196 int *skipped)
4175 { 4197 {
4176 /* We simply copy at most one chunk (smallest of old and new) 4198 /* We simply copy at most one chunk (smallest of old and new)
4177 * at a time, possibly less if that exceeds RESYNC_PAGES, 4199 * at a time, possibly less if that exceeds RESYNC_PAGES,
4178 * or we hit a bad block or something. 4200 * or we hit a bad block or something.
4179 * This might mean we pause for normal IO in the middle of 4201 * This might mean we pause for normal IO in the middle of
4180 * a chunk, but that is not a problem as mddev->reshape_position 4202 * a chunk, but that is not a problem as mddev->reshape_position
4181 * can record any location. 4203 * can record any location.
4182 * 4204 *
4183 * If we will want to write to a location that isn't 4205 * If we will want to write to a location that isn't
4184 * yet recorded as 'safe' (i.e. in metadata on disk) then 4206 * yet recorded as 'safe' (i.e. in metadata on disk) then
4185 * we need to flush all reshape requests and update the metadata. 4207 * we need to flush all reshape requests and update the metadata.
4186 * 4208 *
4187 * When reshaping forwards (e.g. to more devices), we interpret 4209 * When reshaping forwards (e.g. to more devices), we interpret
4188 * 'safe' as the earliest block which might not have been copied 4210 * 'safe' as the earliest block which might not have been copied
4189 * down yet. We divide this by previous stripe size and multiply 4211 * down yet. We divide this by previous stripe size and multiply
4190 * by previous stripe length to get lowest device offset that we 4212 * by previous stripe length to get lowest device offset that we
4191 * cannot write to yet. 4213 * cannot write to yet.
4192 * We interpret 'sector_nr' as an address that we want to write to. 4214 * We interpret 'sector_nr' as an address that we want to write to.
4193 * From this we use last_dev_address() to find where we might 4215 * From this we use last_dev_address() to find where we might
4194 * write to, and first_dev_address() on the 'safe' position. 4216 * write to, and first_dev_address() on the 'safe' position.
4195 * If this 'next' write position is after the 'safe' position, 4217 * If this 'next' write position is after the 'safe' position,
4196 * we must update the metadata to increase the 'safe' position. 4218 * we must update the metadata to increase the 'safe' position.
4197 * 4219 *
4198 * When reshaping backwards, we round in the opposite direction 4220 * When reshaping backwards, we round in the opposite direction
4199 * and perform the reverse test: next write position must not be 4221 * and perform the reverse test: next write position must not be
4200 * less than current safe position. 4222 * less than current safe position.
4201 * 4223 *
4202 * In all this the minimum difference in data offsets 4224 * In all this the minimum difference in data offsets
4203 * (conf->offset_diff - always positive) allows a bit of slack, 4225 * (conf->offset_diff - always positive) allows a bit of slack,
4204 * so next can be after 'safe', but not by more than offset_diff 4226 * so next can be after 'safe', but not by more than offset_diff
4205 * 4227 *
4206 * We need to prepare all the bios here before we start any IO 4228 * We need to prepare all the bios here before we start any IO
4207 * to ensure the size we choose is acceptable to all devices. 4229 * to ensure the size we choose is acceptable to all devices.
4208 * That means one for each copy for write-out and an extra one for 4230 * That means one for each copy for write-out and an extra one for
4209 * read-in. 4231 * read-in.
4210 * We store the read-in bio in ->master_bio and the others in 4232 * We store the read-in bio in ->master_bio and the others in
4211 * ->devs[x].bio and ->devs[x].repl_bio. 4233 * ->devs[x].bio and ->devs[x].repl_bio.
4212 */ 4234 */
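	/*
	 * Concrete reading of the forward case, with made-up numbers: if the
	 * chunk around reshape_progress maps to a last possible device
	 * address of 2048 in the new layout ('next'), and reshape_safe maps
	 * to a first possible device address of 1024 in the old layout
	 * ('safe'), then with offset_diff = 4096 we have
	 * next <= safe + offset_diff and no metadata flush is needed; once
	 * 'next' creeps past safe + offset_diff the superblock must be
	 * written before any more data is copied.
	 */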
4213 struct r10conf *conf = mddev->private; 4235 struct r10conf *conf = mddev->private;
4214 struct r10bio *r10_bio; 4236 struct r10bio *r10_bio;
4215 sector_t next, safe, last; 4237 sector_t next, safe, last;
4216 int max_sectors; 4238 int max_sectors;
4217 int nr_sectors; 4239 int nr_sectors;
4218 int s; 4240 int s;
4219 struct md_rdev *rdev; 4241 struct md_rdev *rdev;
4220 int need_flush = 0; 4242 int need_flush = 0;
4221 struct bio *blist; 4243 struct bio *blist;
4222 struct bio *bio, *read_bio; 4244 struct bio *bio, *read_bio;
4223 int sectors_done = 0; 4245 int sectors_done = 0;
4224 4246
4225 if (sector_nr == 0) { 4247 if (sector_nr == 0) {
4226 /* If restarting in the middle, skip the initial sectors */ 4248 /* If restarting in the middle, skip the initial sectors */
4227 if (mddev->reshape_backwards && 4249 if (mddev->reshape_backwards &&
4228 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4250 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4229 sector_nr = (raid10_size(mddev, 0, 0) 4251 sector_nr = (raid10_size(mddev, 0, 0)
4230 - conf->reshape_progress); 4252 - conf->reshape_progress);
4231 } else if (!mddev->reshape_backwards && 4253 } else if (!mddev->reshape_backwards &&
4232 conf->reshape_progress > 0) 4254 conf->reshape_progress > 0)
4233 sector_nr = conf->reshape_progress; 4255 sector_nr = conf->reshape_progress;
4234 if (sector_nr) { 4256 if (sector_nr) {
4235 mddev->curr_resync_completed = sector_nr; 4257 mddev->curr_resync_completed = sector_nr;
4236 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4258 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4237 *skipped = 1; 4259 *skipped = 1;
4238 return sector_nr; 4260 return sector_nr;
4239 } 4261 }
4240 } 4262 }
4241 4263
4242 /* We don't use sector_nr to track where we are up to 4264 /* We don't use sector_nr to track where we are up to
4243 * as that doesn't work well for ->reshape_backwards. 4265 * as that doesn't work well for ->reshape_backwards.
4244 * So just use ->reshape_progress. 4266 * So just use ->reshape_progress.
4245 */ 4267 */
4246 if (mddev->reshape_backwards) { 4268 if (mddev->reshape_backwards) {
4247 /* 'next' is the earliest device address that we might 4269 /* 'next' is the earliest device address that we might
4248 * write to for this chunk in the new layout 4270 * write to for this chunk in the new layout
4249 */ 4271 */
4250 next = first_dev_address(conf->reshape_progress - 1, 4272 next = first_dev_address(conf->reshape_progress - 1,
4251 &conf->geo); 4273 &conf->geo);
4252 4274
4253 /* 'safe' is the last device address that we might read from 4275 /* 'safe' is the last device address that we might read from
4254 * in the old layout after a restart 4276 * in the old layout after a restart
4255 */ 4277 */
4256 safe = last_dev_address(conf->reshape_safe - 1, 4278 safe = last_dev_address(conf->reshape_safe - 1,
4257 &conf->prev); 4279 &conf->prev);
4258 4280
4259 if (next + conf->offset_diff < safe) 4281 if (next + conf->offset_diff < safe)
4260 need_flush = 1; 4282 need_flush = 1;
4261 4283
4262 last = conf->reshape_progress - 1; 4284 last = conf->reshape_progress - 1;
4263 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4285 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4264 & conf->prev.chunk_mask); 4286 & conf->prev.chunk_mask);
4265 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) 4287 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4266 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; 4288 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4267 } else { 4289 } else {
4268 /* 'next' is after the last device address that we 4290 /* 'next' is after the last device address that we
4269 * might write to for this chunk in the new layout 4291 * might write to for this chunk in the new layout
4270 */ 4292 */
4271 next = last_dev_address(conf->reshape_progress, &conf->geo); 4293 next = last_dev_address(conf->reshape_progress, &conf->geo);
4272 4294
4273 /* 'safe' is the earliest device address that we might 4295 /* 'safe' is the earliest device address that we might
4274 * read from in the old layout after a restart 4296 * read from in the old layout after a restart
4275 */ 4297 */
4276 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4298 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4277 4299
4278 /* Need to update metadata if 'next' might be beyond 'safe' 4300 /* Need to update metadata if 'next' might be beyond 'safe'
4279 * as that would possibly corrupt data 4301 * as that would possibly corrupt data
4280 */ 4302 */
4281 if (next > safe + conf->offset_diff) 4303 if (next > safe + conf->offset_diff)
4282 need_flush = 1; 4304 need_flush = 1;
4283 4305
4284 sector_nr = conf->reshape_progress; 4306 sector_nr = conf->reshape_progress;
4285 last = sector_nr | (conf->geo.chunk_mask 4307 last = sector_nr | (conf->geo.chunk_mask
4286 & conf->prev.chunk_mask); 4308 & conf->prev.chunk_mask);
4287 4309
4288 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) 4310 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4289 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; 4311 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4290 } 4312 }
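	/*
	 * Illustration of the copy window chosen above (hypothetical chunk
	 * sizes): with old chunks of 2048 sectors (chunk_mask 0x7ff) and new
	 * chunks of 1024 sectors (chunk_mask 0x3ff), the combined mask is
	 * 0x3ff, so each pass covers at most one 1024-sector chunk (the
	 * smaller of the two), further capped at RESYNC_BLOCK_SIZE/512
	 * sectors of buffer space.
	 */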
4291 4313
4292 if (need_flush || 4314 if (need_flush ||
4293 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4315 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4294 /* Need to update reshape_position in metadata */ 4316 /* Need to update reshape_position in metadata */
4295 wait_barrier(conf); 4317 wait_barrier(conf);
4296 mddev->reshape_position = conf->reshape_progress; 4318 mddev->reshape_position = conf->reshape_progress;
4297 if (mddev->reshape_backwards) 4319 if (mddev->reshape_backwards)
4298 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4320 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4299 - conf->reshape_progress; 4321 - conf->reshape_progress;
4300 else 4322 else
4301 mddev->curr_resync_completed = conf->reshape_progress; 4323 mddev->curr_resync_completed = conf->reshape_progress;
4302 conf->reshape_checkpoint = jiffies; 4324 conf->reshape_checkpoint = jiffies;
4303 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4325 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4304 md_wakeup_thread(mddev->thread); 4326 md_wakeup_thread(mddev->thread);
4305 wait_event(mddev->sb_wait, mddev->flags == 0 || 4327 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4306 kthread_should_stop()); 4328 kthread_should_stop());
4307 conf->reshape_safe = mddev->reshape_position; 4329 conf->reshape_safe = mddev->reshape_position;
4308 allow_barrier(conf); 4330 allow_barrier(conf);
4309 } 4331 }
4310 4332
4311 read_more: 4333 read_more:
4312 /* Now schedule reads for blocks from sector_nr to last */ 4334 /* Now schedule reads for blocks from sector_nr to last */
4313 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 4335 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4314 raise_barrier(conf, sectors_done != 0); 4336 raise_barrier(conf, sectors_done != 0);
4315 atomic_set(&r10_bio->remaining, 0); 4337 atomic_set(&r10_bio->remaining, 0);
4316 r10_bio->mddev = mddev; 4338 r10_bio->mddev = mddev;
4317 r10_bio->sector = sector_nr; 4339 r10_bio->sector = sector_nr;
4318 set_bit(R10BIO_IsReshape, &r10_bio->state); 4340 set_bit(R10BIO_IsReshape, &r10_bio->state);
4319 r10_bio->sectors = last - sector_nr + 1; 4341 r10_bio->sectors = last - sector_nr + 1;
4320 rdev = read_balance(conf, r10_bio, &max_sectors); 4342 rdev = read_balance(conf, r10_bio, &max_sectors);
4321 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4343 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4322 4344
4323 if (!rdev) { 4345 if (!rdev) {
4324 /* Cannot read from here, so need to record bad blocks 4346 /* Cannot read from here, so need to record bad blocks
4325 * on all the target devices. 4347 * on all the target devices.
4326 */ 4348 */
4327 // FIXME 4349 // FIXME
4328 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4350 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4329 return sectors_done; 4351 return sectors_done;
4330 } 4352 }
4331 4353
4332 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); 4354 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4333 4355
4334 read_bio->bi_bdev = rdev->bdev; 4356 read_bio->bi_bdev = rdev->bdev;
4335 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4357 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4336 + rdev->data_offset); 4358 + rdev->data_offset);
4337 read_bio->bi_private = r10_bio; 4359 read_bio->bi_private = r10_bio;
4338 read_bio->bi_end_io = end_sync_read; 4360 read_bio->bi_end_io = end_sync_read;
4339 read_bio->bi_rw = READ; 4361 read_bio->bi_rw = READ;
4340 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4362 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4341 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4363 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4342 read_bio->bi_vcnt = 0; 4364 read_bio->bi_vcnt = 0;
4343 read_bio->bi_idx = 0; 4365 read_bio->bi_idx = 0;
4344 read_bio->bi_size = 0; 4366 read_bio->bi_size = 0;
4345 r10_bio->master_bio = read_bio; 4367 r10_bio->master_bio = read_bio;
4346 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4368 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4347 4369
4348 /* Now find the locations in the new layout */ 4370 /* Now find the locations in the new layout */
4349 __raid10_find_phys(&conf->geo, r10_bio); 4371 __raid10_find_phys(&conf->geo, r10_bio);
4350 4372
4351 blist = read_bio; 4373 blist = read_bio;
4352 read_bio->bi_next = NULL; 4374 read_bio->bi_next = NULL;
4353 4375
4354 for (s = 0; s < conf->copies*2; s++) { 4376 for (s = 0; s < conf->copies*2; s++) {
4355 struct bio *b; 4377 struct bio *b;
4356 int d = r10_bio->devs[s/2].devnum; 4378 int d = r10_bio->devs[s/2].devnum;
4357 struct md_rdev *rdev2; 4379 struct md_rdev *rdev2;
4358 if (s&1) { 4380 if (s&1) {
4359 rdev2 = conf->mirrors[d].replacement; 4381 rdev2 = conf->mirrors[d].replacement;
4360 b = r10_bio->devs[s/2].repl_bio; 4382 b = r10_bio->devs[s/2].repl_bio;
4361 } else { 4383 } else {
4362 rdev2 = conf->mirrors[d].rdev; 4384 rdev2 = conf->mirrors[d].rdev;
4363 b = r10_bio->devs[s/2].bio; 4385 b = r10_bio->devs[s/2].bio;
4364 } 4386 }
4365 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4387 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4366 continue; 4388 continue;
4367 b->bi_bdev = rdev2->bdev; 4389 b->bi_bdev = rdev2->bdev;
4368 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4390 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4369 b->bi_private = r10_bio; 4391 b->bi_private = r10_bio;
4370 b->bi_end_io = end_reshape_write; 4392 b->bi_end_io = end_reshape_write;
4371 b->bi_rw = WRITE; 4393 b->bi_rw = WRITE;
4372 b->bi_flags &= ~(BIO_POOL_MASK - 1); 4394 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4373 b->bi_flags |= 1 << BIO_UPTODATE; 4395 b->bi_flags |= 1 << BIO_UPTODATE;
4374 b->bi_next = blist; 4396 b->bi_next = blist;
4375 b->bi_vcnt = 0; 4397 b->bi_vcnt = 0;
4376 b->bi_idx = 0; 4398 b->bi_idx = 0;
4377 b->bi_size = 0; 4399 b->bi_size = 0;
4378 blist = b; 4400 blist = b;
4379 } 4401 }
4380 4402
4381 /* Now add as many pages as possible to all of these bios. */ 4403 /* Now add as many pages as possible to all of these bios. */
4382 4404
4383 nr_sectors = 0; 4405 nr_sectors = 0;
4384 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4406 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4385 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; 4407 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4386 int len = (max_sectors - s) << 9; 4408 int len = (max_sectors - s) << 9;
4387 if (len > PAGE_SIZE) 4409 if (len > PAGE_SIZE)
4388 len = PAGE_SIZE; 4410 len = PAGE_SIZE;
4389 for (bio = blist; bio ; bio = bio->bi_next) { 4411 for (bio = blist; bio ; bio = bio->bi_next) {
4390 struct bio *bio2; 4412 struct bio *bio2;
4391 if (bio_add_page(bio, page, len, 0)) 4413 if (bio_add_page(bio, page, len, 0))
4392 continue; 4414 continue;
4393 4415
4394 /* Didn't fit, must stop */ 4416 /* Didn't fit, must stop */
4395 for (bio2 = blist; 4417 for (bio2 = blist;
4396 bio2 && bio2 != bio; 4418 bio2 && bio2 != bio;
4397 bio2 = bio2->bi_next) { 4419 bio2 = bio2->bi_next) {
4398 /* Remove last page from this bio */ 4420 /* Remove last page from this bio */
4399 bio2->bi_vcnt--; 4421 bio2->bi_vcnt--;
4400 bio2->bi_size -= len; 4422 bio2->bi_size -= len;
4401 bio2->bi_flags &= ~(1<<BIO_SEG_VALID); 4423 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4402 } 4424 }
4403 goto bio_full; 4425 goto bio_full;
4404 } 4426 }
4405 sector_nr += len >> 9; 4427 sector_nr += len >> 9;
4406 nr_sectors += len >> 9; 4428 nr_sectors += len >> 9;
4407 } 4429 }
4408 bio_full: 4430 bio_full:
4409 r10_bio->sectors = nr_sectors; 4431 r10_bio->sectors = nr_sectors;
4410 4432
4411 /* Now submit the read */ 4433 /* Now submit the read */
4412 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); 4434 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4413 atomic_inc(&r10_bio->remaining); 4435 atomic_inc(&r10_bio->remaining);
4414 read_bio->bi_next = NULL; 4436 read_bio->bi_next = NULL;
4415 generic_make_request(read_bio); 4437 generic_make_request(read_bio);
4416 sector_nr += nr_sectors; 4438 sector_nr += nr_sectors;
4417 sectors_done += nr_sectors; 4439 sectors_done += nr_sectors;
4418 if (sector_nr <= last) 4440 if (sector_nr <= last)
4419 goto read_more; 4441 goto read_more;
4420 4442
4421 /* Now that we have done the whole section we can 4443 /* Now that we have done the whole section we can
4422 * update reshape_progress 4444 * update reshape_progress
4423 */ 4445 */
4424 if (mddev->reshape_backwards) 4446 if (mddev->reshape_backwards)
4425 conf->reshape_progress -= sectors_done; 4447 conf->reshape_progress -= sectors_done;
4426 else 4448 else
4427 conf->reshape_progress += sectors_done; 4449 conf->reshape_progress += sectors_done;
4428 4450
4429 return sectors_done; 4451 return sectors_done;
4430 } 4452 }
4431 4453
4432 static void end_reshape_request(struct r10bio *r10_bio); 4454 static void end_reshape_request(struct r10bio *r10_bio);
4433 static int handle_reshape_read_error(struct mddev *mddev, 4455 static int handle_reshape_read_error(struct mddev *mddev,
4434 struct r10bio *r10_bio); 4456 struct r10bio *r10_bio);
4435 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4457 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4436 { 4458 {
4437 /* Reshape read completed. Hopefully we have a block 4459 /* Reshape read completed. Hopefully we have a block
4438 * to write out. 4460 * to write out.
4439 * If we got a read error then we do sync 1-page reads from 4461 * If we got a read error then we do sync 1-page reads from
4440 * elsewhere until we find the data - or give up. 4462 * elsewhere until we find the data - or give up.
4441 */ 4463 */
4442 struct r10conf *conf = mddev->private; 4464 struct r10conf *conf = mddev->private;
4443 int s; 4465 int s;
4444 4466
4445 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4467 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4446 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4468 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4447 /* Reshape has been aborted */ 4469 /* Reshape has been aborted */
4448 md_done_sync(mddev, r10_bio->sectors, 0); 4470 md_done_sync(mddev, r10_bio->sectors, 0);
4449 return; 4471 return;
4450 } 4472 }
4451 4473
4452 /* We definitely have the data in the pages, schedule the 4474 /* We definitely have the data in the pages, schedule the
4453 * writes. 4475 * writes.
4454 */ 4476 */
4455 atomic_set(&r10_bio->remaining, 1); 4477 atomic_set(&r10_bio->remaining, 1);
4456 for (s = 0; s < conf->copies*2; s++) { 4478 for (s = 0; s < conf->copies*2; s++) {
4457 struct bio *b; 4479 struct bio *b;
4458 int d = r10_bio->devs[s/2].devnum; 4480 int d = r10_bio->devs[s/2].devnum;
4459 struct md_rdev *rdev; 4481 struct md_rdev *rdev;
4460 if (s&1) { 4482 if (s&1) {
4461 rdev = conf->mirrors[d].replacement; 4483 rdev = conf->mirrors[d].replacement;
4462 b = r10_bio->devs[s/2].repl_bio; 4484 b = r10_bio->devs[s/2].repl_bio;
4463 } else { 4485 } else {
4464 rdev = conf->mirrors[d].rdev; 4486 rdev = conf->mirrors[d].rdev;
4465 b = r10_bio->devs[s/2].bio; 4487 b = r10_bio->devs[s/2].bio;
4466 } 4488 }
4467 if (!rdev || test_bit(Faulty, &rdev->flags)) 4489 if (!rdev || test_bit(Faulty, &rdev->flags))
4468 continue; 4490 continue;
4469 atomic_inc(&rdev->nr_pending); 4491 atomic_inc(&rdev->nr_pending);
4470 md_sync_acct(b->bi_bdev, r10_bio->sectors); 4492 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4471 atomic_inc(&r10_bio->remaining); 4493 atomic_inc(&r10_bio->remaining);
4472 b->bi_next = NULL; 4494 b->bi_next = NULL;
4473 generic_make_request(b); 4495 generic_make_request(b);
4474 } 4496 }
4475 end_reshape_request(r10_bio); 4497 end_reshape_request(r10_bio);
4476 } 4498 }
4477 4499
4478 static void end_reshape(struct r10conf *conf) 4500 static void end_reshape(struct r10conf *conf)
4479 { 4501 {
4480 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 4502 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4481 return; 4503 return;
4482 4504
4483 spin_lock_irq(&conf->device_lock); 4505 spin_lock_irq(&conf->device_lock);
4484 conf->prev = conf->geo; 4506 conf->prev = conf->geo;
4485 md_finish_reshape(conf->mddev); 4507 md_finish_reshape(conf->mddev);
4486 smp_wmb(); 4508 smp_wmb();
4487 conf->reshape_progress = MaxSector; 4509 conf->reshape_progress = MaxSector;
4488 spin_unlock_irq(&conf->device_lock); 4510 spin_unlock_irq(&conf->device_lock);
4489 4511
4490 /* read-ahead size must cover two whole stripes, which is 4512 /* read-ahead size must cover two whole stripes, which is
4491 * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies 4513 * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies
4492 */ 4514 */
4493 if (conf->mddev->queue) { 4515 if (conf->mddev->queue) {
4494 int stripe = conf->geo.raid_disks * 4516 int stripe = conf->geo.raid_disks *
4495 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); 4517 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4496 stripe /= conf->geo.near_copies; 4518 stripe /= conf->geo.near_copies;
4497 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4519 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4498 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4520 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4499 } 4521 }
4500 conf->fullsync = 0; 4522 conf->fullsync = 0;
4501 } 4523 }
4502 4524
4503 4525
4504 static int handle_reshape_read_error(struct mddev *mddev, 4526 static int handle_reshape_read_error(struct mddev *mddev,
4505 struct r10bio *r10_bio) 4527 struct r10bio *r10_bio)
4506 { 4528 {
4507 /* Use sync reads to get the blocks from somewhere else */ 4529 /* Use sync reads to get the blocks from somewhere else */
4508 int sectors = r10_bio->sectors; 4530 int sectors = r10_bio->sectors;
4509 struct r10conf *conf = mddev->private; 4531 struct r10conf *conf = mddev->private;
4510 struct { 4532 struct {
4511 struct r10bio r10_bio; 4533 struct r10bio r10_bio;
4512 struct r10dev devs[conf->copies]; 4534 struct r10dev devs[conf->copies];
4513 } on_stack; 4535 } on_stack;
4514 struct r10bio *r10b = &on_stack.r10_bio; 4536 struct r10bio *r10b = &on_stack.r10_bio;
4515 int slot = 0; 4537 int slot = 0;
4516 int idx = 0; 4538 int idx = 0;
4517 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; 4539 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4518 4540
4519 r10b->sector = r10_bio->sector; 4541 r10b->sector = r10_bio->sector;
4520 __raid10_find_phys(&conf->prev, r10b); 4542 __raid10_find_phys(&conf->prev, r10b);
4521 4543
4522 while (sectors) { 4544 while (sectors) {
4523 int s = sectors; 4545 int s = sectors;
4524 int success = 0; 4546 int success = 0;
4525 int first_slot = slot; 4547 int first_slot = slot;
4526 4548
4527 if (s > (PAGE_SIZE >> 9)) 4549 if (s > (PAGE_SIZE >> 9))
4528 s = PAGE_SIZE >> 9; 4550 s = PAGE_SIZE >> 9;
4529 4551
4530 while (!success) { 4552 while (!success) {
4531 int d = r10b->devs[slot].devnum; 4553 int d = r10b->devs[slot].devnum;
4532 struct md_rdev *rdev = conf->mirrors[d].rdev; 4554 struct md_rdev *rdev = conf->mirrors[d].rdev;
4533 sector_t addr; 4555 sector_t addr;
4534 if (rdev == NULL || 4556 if (rdev == NULL ||
4535 test_bit(Faulty, &rdev->flags) || 4557 test_bit(Faulty, &rdev->flags) ||
4536 !test_bit(In_sync, &rdev->flags)) 4558 !test_bit(In_sync, &rdev->flags))
4537 goto failed; 4559 goto failed;
4538 4560
4539 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 4561 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4540 success = sync_page_io(rdev, 4562 success = sync_page_io(rdev,
4541 addr, 4563 addr,
4542 s << 9, 4564 s << 9,
4543 bvec[idx].bv_page, 4565 bvec[idx].bv_page,
4544 READ, false); 4566 READ, false);
4545 if (success) 4567 if (success)
4546 break; 4568 break;
4547 failed: 4569 failed:
4548 slot++; 4570 slot++;
4549 if (slot >= conf->copies) 4571 if (slot >= conf->copies)
4550 slot = 0; 4572 slot = 0;
4551 if (slot == first_slot) 4573 if (slot == first_slot)
4552 break; 4574 break;
4553 } 4575 }
4554 if (!success) { 4576 if (!success) {
4555 /* couldn't read this block, must give up */ 4577 /* couldn't read this block, must give up */
4556 set_bit(MD_RECOVERY_INTR, 4578 set_bit(MD_RECOVERY_INTR,
4557 &mddev->recovery); 4579 &mddev->recovery);
4558 return -EIO; 4580 return -EIO;
4559 } 4581 }
4560 sectors -= s; 4582 sectors -= s;
4561 idx++; 4583 idx++;
4562 } 4584 }
4563 return 0; 4585 return 0;
4564 } 4586 }
4565 4587
4566 static void end_reshape_write(struct bio *bio, int error) 4588 static void end_reshape_write(struct bio *bio, int error)
4567 { 4589 {
4568 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 4590 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4569 struct r10bio *r10_bio = bio->bi_private; 4591 struct r10bio *r10_bio = bio->bi_private;
4570 struct mddev *mddev = r10_bio->mddev; 4592 struct mddev *mddev = r10_bio->mddev;
4571 struct r10conf *conf = mddev->private; 4593 struct r10conf *conf = mddev->private;
4572 int d; 4594 int d;
4573 int slot; 4595 int slot;
4574 int repl; 4596 int repl;
4575 struct md_rdev *rdev = NULL; 4597 struct md_rdev *rdev = NULL;
4576 4598
4577 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 4599 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4578 if (repl) 4600 if (repl)
4579 rdev = conf->mirrors[d].replacement; 4601 rdev = conf->mirrors[d].replacement;
4580 if (!rdev) { 4602 if (!rdev) {
4581 smp_mb(); 4603 smp_mb();
4582 rdev = conf->mirrors[d].rdev; 4604 rdev = conf->mirrors[d].rdev;
4583 } 4605 }
4584 4606
4585 if (!uptodate) { 4607 if (!uptodate) {
4586 /* FIXME should record badblock */ 4608 /* FIXME should record badblock */
4587 md_error(mddev, rdev); 4609 md_error(mddev, rdev);
4588 } 4610 }
4589 4611
4590 rdev_dec_pending(rdev, mddev); 4612 rdev_dec_pending(rdev, mddev);
4591 end_reshape_request(r10_bio); 4613 end_reshape_request(r10_bio);
4592 } 4614 }
4593 4615
4594 static void end_reshape_request(struct r10bio *r10_bio) 4616 static void end_reshape_request(struct r10bio *r10_bio)
4595 { 4617 {
4596 if (!atomic_dec_and_test(&r10_bio->remaining)) 4618 if (!atomic_dec_and_test(&r10_bio->remaining))
4597 return; 4619 return;
4598 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 4620 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4599 bio_put(r10_bio->master_bio); 4621 bio_put(r10_bio->master_bio);
4600 put_buf(r10_bio); 4622 put_buf(r10_bio);
4601 } 4623 }
4602 4624
4603 static void raid10_finish_reshape(struct mddev *mddev) 4625 static void raid10_finish_reshape(struct mddev *mddev)
4604 { 4626 {
4605 struct r10conf *conf = mddev->private; 4627 struct r10conf *conf = mddev->private;
4606 4628
4607 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 4629 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4608 return; 4630 return;
4609 4631
4610 if (mddev->delta_disks > 0) { 4632 if (mddev->delta_disks > 0) {
4611 sector_t size = raid10_size(mddev, 0, 0); 4633 sector_t size = raid10_size(mddev, 0, 0);
4612 md_set_array_sectors(mddev, size); 4634 md_set_array_sectors(mddev, size);
4613 if (mddev->recovery_cp > mddev->resync_max_sectors) { 4635 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4614 mddev->recovery_cp = mddev->resync_max_sectors; 4636 mddev->recovery_cp = mddev->resync_max_sectors;
4615 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4637 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4616 } 4638 }
4617 mddev->resync_max_sectors = size; 4639 mddev->resync_max_sectors = size;
4618 set_capacity(mddev->gendisk, mddev->array_sectors); 4640 set_capacity(mddev->gendisk, mddev->array_sectors);
4619 revalidate_disk(mddev->gendisk); 4641 revalidate_disk(mddev->gendisk);
4620 } else { 4642 } else {
4621 int d; 4643 int d;
4622 for (d = conf->geo.raid_disks ; 4644 for (d = conf->geo.raid_disks ;
4623 d < conf->geo.raid_disks - mddev->delta_disks; 4645 d < conf->geo.raid_disks - mddev->delta_disks;
4624 d++) { 4646 d++) {
4625 struct md_rdev *rdev = conf->mirrors[d].rdev; 4647 struct md_rdev *rdev = conf->mirrors[d].rdev;
4626 if (rdev) 4648 if (rdev)
4627 clear_bit(In_sync, &rdev->flags); 4649 clear_bit(In_sync, &rdev->flags);
4628 rdev = conf->mirrors[d].replacement; 4650 rdev = conf->mirrors[d].replacement;
4629 if (rdev) 4651 if (rdev)
4630 clear_bit(In_sync, &rdev->flags); 4652 clear_bit(In_sync, &rdev->flags);
4631 } 4653 }
4632 } 4654 }
4633 mddev->layout = mddev->new_layout; 4655 mddev->layout = mddev->new_layout;
4634 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 4656 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4635 mddev->reshape_position = MaxSector; 4657 mddev->reshape_position = MaxSector;
4636 mddev->delta_disks = 0; 4658 mddev->delta_disks = 0;
4637 mddev->reshape_backwards = 0; 4659 mddev->reshape_backwards = 0;
4638 } 4660 }
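
A note on the shrink branch of raid10_finish_reshape() above: mddev->delta_disks is negative there, so the loop bound conf->geo.raid_disks - mddev->delta_disks is the old, larger disk count, and the loop clears In_sync on exactly the devices that have just been dropped. As a worked example (hypothetical numbers, not from a real array): shrinking from 6 to 4 disks leaves geo.raid_disks == 4 and delta_disks == -2, so d runs over 4 and 5 and those two mirrors, plus their replacements if present, are marked out of sync.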
4639 4661
4640 static struct md_personality raid10_personality = 4662 static struct md_personality raid10_personality =
4641 { 4663 {
4642 .name = "raid10", 4664 .name = "raid10",
4643 .level = 10, 4665 .level = 10,
4644 .owner = THIS_MODULE, 4666 .owner = THIS_MODULE,
4645 .make_request = make_request, 4667 .make_request = make_request,
4646 .run = run, 4668 .run = run,
4647 .stop = stop, 4669 .stop = stop,
4648 .status = status, 4670 .status = status,
4649 .error_handler = error, 4671 .error_handler = error,
4650 .hot_add_disk = raid10_add_disk, 4672 .hot_add_disk = raid10_add_disk,
4651 .hot_remove_disk= raid10_remove_disk, 4673 .hot_remove_disk= raid10_remove_disk,
4652 .spare_active = raid10_spare_active, 4674 .spare_active = raid10_spare_active,
4653 .sync_request = sync_request, 4675 .sync_request = sync_request,
4654 .quiesce = raid10_quiesce, 4676 .quiesce = raid10_quiesce,
4655 .size = raid10_size, 4677 .size = raid10_size,
4656 .resize = raid10_resize, 4678 .resize = raid10_resize,
4657 .takeover = raid10_takeover, 4679 .takeover = raid10_takeover,
4658 .check_reshape = raid10_check_reshape, 4680 .check_reshape = raid10_check_reshape,
4659 .start_reshape = raid10_start_reshape, 4681 .start_reshape = raid10_start_reshape,
4660 .finish_reshape = raid10_finish_reshape, 4682 .finish_reshape = raid10_finish_reshape,
4661 }; 4683 };
4662 4684
4663 static int __init raid_init(void) 4685 static int __init raid_init(void)
4664 { 4686 {
4665 return register_md_personality(&raid10_personality); 4687 return register_md_personality(&raid10_personality);
4666 } 4688 }
4667 4689
4668 static void raid_exit(void) 4690 static void raid_exit(void)
4669 { 4691 {
4670 unregister_md_personality(&raid10_personality); 4692 unregister_md_personality(&raid10_personality);
4671 } 4693 }
4672 4694
4673 module_init(raid_init); 4695 module_init(raid_init);
4674 module_exit(raid_exit); 4696 module_exit(raid_exit);
4675 MODULE_LICENSE("GPL"); 4697 MODULE_LICENSE("GPL");
4676 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); 4698 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4677 MODULE_ALIAS("md-personality-9"); /* RAID10 */ 4699 MODULE_ALIAS("md-personality-9"); /* RAID10 */
4678 MODULE_ALIAS("md-raid10"); 4700 MODULE_ALIAS("md-raid10");
4679 MODULE_ALIAS("md-level-10"); 4701 MODULE_ALIAS("md-level-10");
4680 4702
1 #ifndef _RAID10_H 1 #ifndef _RAID10_H
2 #define _RAID10_H 2 #define _RAID10_H
3 3
4 struct raid10_info { 4 struct raid10_info {
5 struct md_rdev *rdev, *replacement; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 8 * mddev->recovery_disabled
9 * when we shouldn't try 9 * when we shouldn't try
10 * recovering this device. 10 * recovering this device.
11 */ 11 */
12 }; 12 };
13 13
14 struct r10conf { 14 struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct raid10_info *mirrors; 16 struct raid10_info *mirrors;
17 struct raid10_info *mirrors_new, *mirrors_old; 17 struct raid10_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
21 struct geom { 21 struct geom {
22 int raid_disks; 22 int raid_disks;
23 int near_copies; /* number of copies laid out 23 int near_copies; /* number of copies laid out
24 * raid0 style */ 24 * raid0 style */
25 int far_copies; /* number of copies laid out 25 int far_copies; /* number of copies laid out
26 * at large strides across drives 26 * at large strides across drives
27 */ 27 */
28 int far_offset; /* far_copies are offset by 1 28 int far_offset; /* far_copies are offset by 1
29 * stripe instead of many 29 * stripe instead of many
30 */ 30 */
31 sector_t stride; /* distance between far copies. 31 sector_t stride; /* distance between far copies.
32 * This is size / far_copies unless 32 * This is size / far_copies unless
33 * far_offset, in which case it is 33 * far_offset, in which case it is
34 * 1 stripe. 34 * 1 stripe.
35 */ 35 */
36 int far_set_size; /* The number of devices in a set,
37 * where a 'set' are devices that
38 * contain far/offset copies of
39 * each other.
40 */
36 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 42 sector_t chunk_mask;
38 } prev, geo; 43 } prev, geo;
39 int copies; /* near_copies * far_copies. 44 int copies; /* near_copies * far_copies.
40 * must be <= raid_disks 45 * must be <= raid_disks
41 */ 46 */
42 47
43 sector_t dev_sectors; /* temp copy of 48 sector_t dev_sectors; /* temp copy of
44 * mddev->dev_sectors */ 49 * mddev->dev_sectors */
45 sector_t reshape_progress; 50 sector_t reshape_progress;
46 sector_t reshape_safe; 51 sector_t reshape_safe;
47 unsigned long reshape_checkpoint; 52 unsigned long reshape_checkpoint;
48 sector_t offset_diff; 53 sector_t offset_diff;
49 54
50 struct list_head retry_list; 55 struct list_head retry_list;
51 /* queue pending writes and submit them on unplug */ 56 /* queue pending writes and submit them on unplug */
52 struct bio_list pending_bio_list; 57 struct bio_list pending_bio_list;
53 int pending_count; 58 int pending_count;
54 59
55 spinlock_t resync_lock; 60 spinlock_t resync_lock;
56 int nr_pending; 61 int nr_pending;
57 int nr_waiting; 62 int nr_waiting;
58 int nr_queued; 63 int nr_queued;
59 int barrier; 64 int barrier;
60 sector_t next_resync; 65 sector_t next_resync;
61 int fullsync; /* set to 1 if a full sync is needed, 66 int fullsync; /* set to 1 if a full sync is needed,
62 * (fresh device added). 67 * (fresh device added).
63 * Cleared when a sync completes. 68 * Cleared when a sync completes.
64 */ 69 */
65 int have_replacement; /* There is at least one 70 int have_replacement; /* There is at least one
66 * replacement device. 71 * replacement device.
67 */ 72 */
68 wait_queue_head_t wait_barrier; 73 wait_queue_head_t wait_barrier;
69 74
70 mempool_t *r10bio_pool; 75 mempool_t *r10bio_pool;
71 mempool_t *r10buf_pool; 76 mempool_t *r10buf_pool;
72 struct page *tmppage; 77 struct page *tmppage;
73 78
74 /* When taking over an array from a different personality, we store 79 /* When taking over an array from a different personality, we store
75 * the new thread here until we fully activate the array. 80 * the new thread here until we fully activate the array.
76 */ 81 */
77 struct md_thread *thread; 82 struct md_thread *thread;
78 }; 83 };
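
The new far_set_size field above is the header-side half of this patch: it records how many adjacent devices form a set, and the shifted far/offset copies wrap inside that set rather than around the whole array (setting far_set_size equal to raid_disks recovers the old behaviour). A rough sketch of the resulting device calculation, ignoring near-copy placement and using a hypothetical helper rather than the driver's real layout code:

	/* Sketch only: which device holds far copy number 'copy' of the
	 * chunk whose primary copy lives on device 'dev'.  With
	 * far_set_size == raid_disks this degenerates to the old
	 * whole-array shift.
	 */
	static int far_copy_device(int dev, int copy, int far_set_size)
	{
		int set_start = dev - (dev % far_set_size);	/* first device of the set */
		int shifted   = (dev % far_set_size + copy) % far_set_size;

		return set_start + shifted;
	}

For example, with far_set_size == 2 the far copy of device 0's data lands on device 1 and vice versa, so each adjacent pair of devices only mirrors within itself.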
79 84
80 /* 85 /*
81 * this is our 'private' RAID10 bio. 86 * this is our 'private' RAID10 bio.
82 * 87 *
83 * it contains information about what kind of IO operations were started 88 * it contains information about what kind of IO operations were started
84 * for this RAID10 operation, and about their status: 89 * for this RAID10 operation, and about their status:
85 */ 90 */
86 91
87 struct r10bio { 92 struct r10bio {
88 atomic_t remaining; /* 'have we finished' count, 93 atomic_t remaining; /* 'have we finished' count,
89 * used from IRQ handlers 94 * used from IRQ handlers
90 */ 95 */
91 sector_t sector; /* virtual sector number */ 96 sector_t sector; /* virtual sector number */
92 int sectors; 97 int sectors;
93 unsigned long state; 98 unsigned long state;
94 struct mddev *mddev; 99 struct mddev *mddev;
95 /* 100 /*
96 * original bio going to /dev/mdx 101 * original bio going to /dev/mdx
97 */ 102 */
98 struct bio *master_bio; 103 struct bio *master_bio;
99 /* 104 /*
100 * if the IO is in READ direction, then this is where we read 105 * if the IO is in READ direction, then this is where we read
101 */ 106 */
102 int read_slot; 107 int read_slot;
103 108
104 struct list_head retry_list; 109 struct list_head retry_list;
105 /* 110 /*
106 * if the IO is in WRITE direction, then multiple bios are used, 111 * if the IO is in WRITE direction, then multiple bios are used,
107 * one for each copy. 112 * one for each copy.
108 * When resyncing we also use one for each copy. 113 * When resyncing we also use one for each copy.
109 * When reconstructing, we use 2 bios, one for read, one for write. 114 * When reconstructing, we use 2 bios, one for read, one for write.
110 * We choose the number when they are allocated. 115 * We choose the number when they are allocated.
111 * We sometimes need an extra bio to write to the replacement. 116 * We sometimes need an extra bio to write to the replacement.
112 */ 117 */
113 struct r10dev { 118 struct r10dev {
114 struct bio *bio; 119 struct bio *bio;
115 union { 120 union {
116 struct bio *repl_bio; /* used for resync and 121 struct bio *repl_bio; /* used for resync and
117 * writes */ 122 * writes */
118 struct md_rdev *rdev; /* used for reads 123 struct md_rdev *rdev; /* used for reads
119 * (read_slot >= 0) */ 124 * (read_slot >= 0) */
120 }; 125 };
121 sector_t addr; 126 sector_t addr;
122 int devnum; 127 int devnum;
123 } devs[0]; 128 } devs[0];
124 }; 129 };
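
devs[0] at the end of struct r10bio is a zero-length trailing array: the number of struct r10dev entries is fixed only when the r10bio is allocated, one per copy, and each entry carries the bio for that copy plus, via the union, either a replacement bio or the rdev used for a read. A hedged sketch of how such an object is sized (a hypothetical helper; the driver actually obtains r10bios from the r10bio_pool/r10buf_pool mempools declared above):

	/* Sketch only: size the allocation for the trailing devs[] array. */
	static struct r10bio *alloc_r10bio(struct r10conf *conf, gfp_t gfp)
	{
		return kzalloc(sizeof(struct r10bio) +
			       conf->copies * sizeof(struct r10dev),
			       gfp);
	}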
125 130
126 /* bits for r10bio.state */ 131 /* bits for r10bio.state */
127 enum r10bio_state { 132 enum r10bio_state {
128 R10BIO_Uptodate, 133 R10BIO_Uptodate,
129 R10BIO_IsSync, 134 R10BIO_IsSync,
130 R10BIO_IsRecover, 135 R10BIO_IsRecover,
131 R10BIO_IsReshape, 136 R10BIO_IsReshape,
132 R10BIO_Degraded, 137 R10BIO_Degraded,
133 /* Set ReadError on bios that experience a read error 138 /* Set ReadError on bios that experience a read error
134 * so that raid10d knows what to do with them. 139 * so that raid10d knows what to do with them.
135 */ 140 */
136 R10BIO_ReadError, 141 R10BIO_ReadError,
137 /* If a write for this request means we can clear some 142 /* If a write for this request means we can clear some
138 * known-bad-block records, we set this flag. 143 * known-bad-block records, we set this flag.
139 */ 144 */
140 R10BIO_MadeGood, 145 R10BIO_MadeGood,
141 R10BIO_WriteError, 146 R10BIO_WriteError,
142 /* During a reshape we might be performing IO on the 147 /* During a reshape we might be performing IO on the
143 * 'previous' part of the array, in which case this 148 * 'previous' part of the array, in which case this
144 * flag is set 149 * flag is set
145 */ 150 */
146 R10BIO_Previous, 151 R10BIO_Previous,
147 }; 152 };
148 153
149 extern int md_raid10_congested(struct mddev *mddev, int bits); 154 extern int md_raid10_congested(struct mddev *mddev, int bits);
150 155
151 #endif 156 #endif
152 157