Doug / smarc-fsl-linux-kernel

1

/*

1

/*

2

* raid10.c : Multiple Devices driver for Linux

2

* raid10.c : Multiple Devices driver for Linux

3

*

3

*

4

5

*

5

*

6

* RAID-10 support for md.

6

* RAID-10 support for md.

7

*

7

*

8

* Base on code in raid1.c. See raid1.c for further copyright information.

8

* Base on code in raid1.c. See raid1.c for further copyright information.

9

*

9

*

10

*

10

*

11

* This program is free software; you can redistribute it and/or modify

11

* This program is free software; you can redistribute it and/or modify

12

* it under the terms of the GNU General Public License as published by

12

* it under the terms of the GNU General Public License as published by

13

* the Free Software Foundation; either version 2, or (at your option)

13

* the Free Software Foundation; either version 2, or (at your option)

14

* any later version.

14

* any later version.

15

*

15

*

16

* You should have received a copy of the GNU General Public License

16

* You should have received a copy of the GNU General Public License

17

* (for example /usr/src/linux/COPYING); if not, write to the Free

17

* (for example /usr/src/linux/COPYING); if not, write to the Free

18

* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

18

* Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

19

*/

19

*/

20

21

#include <linux/slab.h>

21

#include <linux/slab.h>

22

#include <linux/delay.h>

22

#include <linux/delay.h>

23

#include <linux/blkdev.h>

23

#include <linux/blkdev.h>

24

#include <linux/module.h>

24

#include <linux/module.h>

25

#include <linux/seq_file.h>

25

#include <linux/seq_file.h>

26

#include <linux/ratelimit.h>

26

#include <linux/ratelimit.h>

27

#include "md.h"

27

#include "md.h"

28

#include "raid10.h"

28

#include "raid10.h"

29

#include "raid0.h"

29

#include "raid0.h"

30

#include "bitmap.h"

30

#include "bitmap.h"

31

32

/*

32

/*

33

* RAID10 provides a combination of RAID0 and RAID1 functionality.

33

* RAID10 provides a combination of RAID0 and RAID1 functionality.

34

* The layout of data is defined by

34

* The layout of data is defined by

35

* chunk_size

35

* chunk_size

36

* raid_disks

36

* raid_disks

37

* near_copies (stored in low byte of layout)

37

* near_copies (stored in low byte of layout)

38

* far_copies (stored in second byte of layout)

38

* far_copies (stored in second byte of layout)

39

* far_offset (stored in bit 16 of layout )

39

* far_offset (stored in bit 16 of layout )

40

*

40

*

41

* The data to be stored is divided into chunks using chunksize.

41

* The data to be stored is divided into chunks using chunksize.

42

* Each device is divided into far_copies sections.

42

* Each device is divided into far_copies sections.

43

* In each section, chunks are laid out in a style similar to raid0, but

43

* In each section, chunks are laid out in a style similar to raid0, but

44

* near_copies copies of each chunk is stored (each on a different drive).

44

* near_copies copies of each chunk is stored (each on a different drive).

45

* The starting device for each section is offset near_copies from the starting

45

* The starting device for each section is offset near_copies from the starting

46

* device of the previous section.

46

* device of the previous section.

47

* Thus they are (near_copies*far_copies) of each chunk, and each is on a different

47

* Thus they are (near_copies*far_copies) of each chunk, and each is on a different

48

* drive.

48

* drive.

49

* near_copies and far_copies must be at least one, and their product is at most

49

* near_copies and far_copies must be at least one, and their product is at most

50

* raid_disks.

50

* raid_disks.

51

*

51

*

52

* If far_offset is true, then the far_copies are handled a bit differently.

52

* If far_offset is true, then the far_copies are handled a bit differently.

53

* The copies are still in different stripes, but instead of be very far apart

53

* The copies are still in different stripes, but instead of be very far apart

54

* on disk, there are adjacent stripes.

54

* on disk, there are adjacent stripes.

55

*/

55

*/

56

57

/*

57

/*

58

* Number of guaranteed r10bios in case of extreme VM load:

58

* Number of guaranteed r10bios in case of extreme VM load:

59

*/

59

*/

60

#define NR_RAID10_BIOS 256

60

#define NR_RAID10_BIOS 256

61

62

/* When there are this many requests queue to be written by

62

/* When there are this many requests queue to be written by

63

* the raid10 thread, we become 'congested' to provide back-pressure

63

* the raid10 thread, we become 'congested' to provide back-pressure

64

* for writeback.

64

* for writeback.

65

*/

65

*/

66

static int max_queued_requests = 1024;

66

static int max_queued_requests = 1024;

67

68

static void allow_barrier(struct r10conf *conf);

68

static void allow_barrier(struct r10conf *conf);

69

static void lower_barrier(struct r10conf *conf);

69

static void lower_barrier(struct r10conf *conf);

70

71

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)

71

static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)

72

{

72

{

73

struct r10conf *conf = data;

73

struct r10conf *conf = data;

74

int size = offsetof(struct r10bio, devs[conf->copies]);

74

int size = offsetof(struct r10bio, devs[conf->copies]);

75

76

/* allocate a r10bio with room for raid_disks entries in the bios array */

76

/* allocate a r10bio with room for raid_disks entries in the

77

* bios array */

77

return kzalloc(size, gfp_flags);

78

return kzalloc(size, gfp_flags);

78

}

79

}

79

80

static void r10bio_pool_free(void *r10_bio, void *data)

81

static void r10bio_pool_free(void *r10_bio, void *data)

81

{

82

{

82

kfree(r10_bio);

83

kfree(r10_bio);

83

}

84

}

84

85

/* Maximum size of each resync request */

86

/* Maximum size of each resync request */

86

#define RESYNC_BLOCK_SIZE (64*1024)

87

#define RESYNC_BLOCK_SIZE (64*1024)

87

#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

88

#define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)

88

/* amount of memory to reserve for resync requests */

89

/* amount of memory to reserve for resync requests */

89

#define RESYNC_WINDOW (1024*1024)

90

#define RESYNC_WINDOW (1024*1024)

90

/* maximum number of concurrent requests, memory permitting */

91

/* maximum number of concurrent requests, memory permitting */

91

#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

92

#define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)

92

93

/*

94

/*

94

* When performing a resync, we need to read and compare, so

95

* When performing a resync, we need to read and compare, so

95

* we need as many pages are there are copies.

96

* we need as many pages are there are copies.

96

* When performing a recovery, we need 2 bios, one for read,

97

* When performing a recovery, we need 2 bios, one for read,

97

* one for write (we recover only one drive per r10buf)

98

* one for write (we recover only one drive per r10buf)

98

*

99

*

99

*/

100

*/

100

static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)

101

static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)

101

{

102

{

102

struct r10conf *conf = data;

103

struct r10conf *conf = data;

103

struct page *page;

104

struct page *page;

104

struct r10bio *r10_bio;

105

struct r10bio *r10_bio;

105

struct bio *bio;

106

struct bio *bio;

106

int i, j;

107

int i, j;

107

int nalloc;

108

int nalloc;

108

109

r10_bio = r10bio_pool_alloc(gfp_flags, conf);

110

r10_bio = r10bio_pool_alloc(gfp_flags, conf);

110

if (!r10_bio)

111

if (!r10_bio)

111

return NULL;

112

return NULL;

112

113

if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))

114

if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))

114

nalloc = conf->copies; /* resync */

115

nalloc = conf->copies; /* resync */

115

else

116

else

116

nalloc = 2; /* recovery */

117

nalloc = 2; /* recovery */

117

118

/*

119

/*

119

* Allocate bios.

120

* Allocate bios.

120

*/

121

*/

121

for (j = nalloc ; j-- ; ) {

122

for (j = nalloc ; j-- ; ) {

122

bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);

123

bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);

123

if (!bio)

124

if (!bio)

124

goto out_free_bio;

125

goto out_free_bio;

125

r10_bio->devs[j].bio = bio;

126

r10_bio->devs[j].bio = bio;

127

if (!conf->have_replacement)

128

continue;

129

bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);

130

if (!bio)

131

goto out_free_bio;

132

r10_bio->devs[j].repl_bio = bio;

126

}

133

}

127

/*

134

/*

128

* Allocate RESYNC_PAGES data pages and attach them

135

* Allocate RESYNC_PAGES data pages and attach them

129

* where needed.

136

* where needed.

130

*/

137

*/

131

for (j = 0 ; j < nalloc; j++) {

138

for (j = 0 ; j < nalloc; j++) {

139

struct bio *rbio = r10_bio->devs[j].repl_bio;

132

bio = r10_bio->devs[j].bio;

140

bio = r10_bio->devs[j].bio;

133

for (i = 0; i < RESYNC_PAGES; i++) {

141

for (i = 0; i < RESYNC_PAGES; i++) {

134

if (j == 1 && !test_bit(MD_RECOVERY_SYNC,

142

if (j == 1 && !test_bit(MD_RECOVERY_SYNC,

135

&conf->mddev->recovery)) {

143

&conf->mddev->recovery)) {

136

/* we can share bv_page's during recovery */

144

/* we can share bv_page's during recovery */

137

struct bio *rbio = r10_bio->devs[0].bio;

145

struct bio *rbio = r10_bio->devs[0].bio;

138

page = rbio->bi_io_vec[i].bv_page;

146

page = rbio->bi_io_vec[i].bv_page;

139

get_page(page);

147

get_page(page);

140

} else

148

} else

141

page = alloc_page(gfp_flags);

149

page = alloc_page(gfp_flags);

142

if (unlikely(!page))

150

if (unlikely(!page))

143

goto out_free_pages;

151

goto out_free_pages;

144

152

145

bio->bi_io_vec[i].bv_page = page;

153

bio->bi_io_vec[i].bv_page = page;

154

if (rbio)

155

rbio->bi_io_vec[i].bv_page = page;

146

}

156

}

147

}

157

}

148

158

149

return r10_bio;

159

return r10_bio;

150

160

151

out_free_pages:

161

out_free_pages:

152

for ( ; i > 0 ; i--)

162

for ( ; i > 0 ; i--)

153

safe_put_page(bio->bi_io_vec[i-1].bv_page);

163

safe_put_page(bio->bi_io_vec[i-1].bv_page);

154

while (j--)

164

while (j--)

155

for (i = 0; i < RESYNC_PAGES ; i++)

165

for (i = 0; i < RESYNC_PAGES ; i++)

156

safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);

166

safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);

157

j = -1;

167

j = -1;

158

out_free_bio:

168

out_free_bio:

159

while ( ++j < nalloc )

169

while (++j < nalloc) {

160

bio_put(r10_bio->devs[j].bio);

170

bio_put(r10_bio->devs[j].bio);

171

if (r10_bio->devs[j].repl_bio)

172

bio_put(r10_bio->devs[j].repl_bio);

173

}

161

r10bio_pool_free(r10_bio, conf);

174

r10bio_pool_free(r10_bio, conf);

162

return NULL;

175

return NULL;

163

}

176

}

164

177

165

static void r10buf_pool_free(void *__r10_bio, void *data)

178

static void r10buf_pool_free(void *__r10_bio, void *data)

166

{

179

{

167

int i;

180

int i;

168

struct r10conf *conf = data;

181

struct r10conf *conf = data;

169

struct r10bio *r10bio = __r10_bio;

182

struct r10bio *r10bio = __r10_bio;

170

int j;

183

int j;

171

184

172

for (j=0; j < conf->copies; j++) {

185

for (j=0; j < conf->copies; j++) {

173

struct bio *bio = r10bio->devs[j].bio;

186

struct bio *bio = r10bio->devs[j].bio;

174

if (bio) {

187

if (bio) {

175

for (i = 0; i < RESYNC_PAGES; i++) {

188

for (i = 0; i < RESYNC_PAGES; i++) {

176

safe_put_page(bio->bi_io_vec[i].bv_page);

189

safe_put_page(bio->bi_io_vec[i].bv_page);

177

bio->bi_io_vec[i].bv_page = NULL;

190

bio->bi_io_vec[i].bv_page = NULL;

178

}

191

}

179

bio_put(bio);

192

bio_put(bio);

180

}

193

}

194

bio = r10bio->devs[j].repl_bio;

195

if (bio)

196

bio_put(bio);

181

}

197

}

182

r10bio_pool_free(r10bio, conf);

198

r10bio_pool_free(r10bio, conf);

183

}

199

}

184

200

185

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)

201

static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)

186

{

202

{

187

int i;

203

int i;

188

204

189

for (i = 0; i < conf->copies; i++) {

205

for (i = 0; i < conf->copies; i++) {

190

struct bio **bio = & r10_bio->devs[i].bio;

206

struct bio **bio = & r10_bio->devs[i].bio;

191

if (!BIO_SPECIAL(*bio))

207

if (!BIO_SPECIAL(*bio))

192

bio_put(*bio);

208

bio_put(*bio);

193

*bio = NULL;

209

*bio = NULL;

210

bio = &r10_bio->devs[i].repl_bio;

211

if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))

212

bio_put(*bio);

213

*bio = NULL;

194

}

214

}

195

}

215

}

196

216

197

static void free_r10bio(struct r10bio *r10_bio)

217

static void free_r10bio(struct r10bio *r10_bio)

198

{

218

{

199

struct r10conf *conf = r10_bio->mddev->private;

219

struct r10conf *conf = r10_bio->mddev->private;

200

220

201

put_all_bios(conf, r10_bio);

221

put_all_bios(conf, r10_bio);

202

mempool_free(r10_bio, conf->r10bio_pool);

222

mempool_free(r10_bio, conf->r10bio_pool);

203

}

223

}

204

224

205

static void put_buf(struct r10bio *r10_bio)

225

static void put_buf(struct r10bio *r10_bio)

206

{

226

{

207

struct r10conf *conf = r10_bio->mddev->private;

227

struct r10conf *conf = r10_bio->mddev->private;

208

228

209

mempool_free(r10_bio, conf->r10buf_pool);

229

mempool_free(r10_bio, conf->r10buf_pool);

210

230

211

lower_barrier(conf);

231

lower_barrier(conf);

212

}

232

}

213

233

214

static void reschedule_retry(struct r10bio *r10_bio)

234

static void reschedule_retry(struct r10bio *r10_bio)

215

{

235

{

216

unsigned long flags;

236

unsigned long flags;

217

struct mddev *mddev = r10_bio->mddev;

237

struct mddev *mddev = r10_bio->mddev;

218

struct r10conf *conf = mddev->private;

238

struct r10conf *conf = mddev->private;

219

239

220

spin_lock_irqsave(&conf->device_lock, flags);

240

spin_lock_irqsave(&conf->device_lock, flags);

221

list_add(&r10_bio->retry_list, &conf->retry_list);

241

list_add(&r10_bio->retry_list, &conf->retry_list);

222

conf->nr_queued ++;

242

conf->nr_queued ++;

223

spin_unlock_irqrestore(&conf->device_lock, flags);

243

spin_unlock_irqrestore(&conf->device_lock, flags);

224

244

225

/* wake up frozen array... */

245

/* wake up frozen array... */

226

wake_up(&conf->wait_barrier);

246

wake_up(&conf->wait_barrier);

227

247

228

md_wakeup_thread(mddev->thread);

248

md_wakeup_thread(mddev->thread);

229

}

249

}

230

250

231

/*

251

/*

232

* raid_end_bio_io() is called when we have finished servicing a mirrored

252

* raid_end_bio_io() is called when we have finished servicing a mirrored

233

* operation and are ready to return a success/failure code to the buffer

253

* operation and are ready to return a success/failure code to the buffer

234

* cache layer.

254

* cache layer.

235

*/

255

*/

236

static void raid_end_bio_io(struct r10bio *r10_bio)

256

static void raid_end_bio_io(struct r10bio *r10_bio)

237

{

257

{

238

struct bio *bio = r10_bio->master_bio;

258

struct bio *bio = r10_bio->master_bio;

239

int done;

259

int done;

240

struct r10conf *conf = r10_bio->mddev->private;

260

struct r10conf *conf = r10_bio->mddev->private;

241

261

242

if (bio->bi_phys_segments) {

262

if (bio->bi_phys_segments) {

243

unsigned long flags;

263

unsigned long flags;

244

spin_lock_irqsave(&conf->device_lock, flags);

264

spin_lock_irqsave(&conf->device_lock, flags);

245

bio->bi_phys_segments--;

265

bio->bi_phys_segments--;

246

done = (bio->bi_phys_segments == 0);

266

done = (bio->bi_phys_segments == 0);

247

spin_unlock_irqrestore(&conf->device_lock, flags);

267

spin_unlock_irqrestore(&conf->device_lock, flags);

248

} else

268

} else

249

done = 1;

269

done = 1;

250

if (!test_bit(R10BIO_Uptodate, &r10_bio->state))

270

if (!test_bit(R10BIO_Uptodate, &r10_bio->state))

251

clear_bit(BIO_UPTODATE, &bio->bi_flags);

271

clear_bit(BIO_UPTODATE, &bio->bi_flags);

252

if (done) {

272

if (done) {

253

bio_endio(bio, 0);

273

bio_endio(bio, 0);

254

/*

274

/*

255

* Wake up any possible resync thread that waits for the device

275

* Wake up any possible resync thread that waits for the device

256

* to go idle.

276

* to go idle.

257

*/

277

*/

258

allow_barrier(conf);

278

allow_barrier(conf);

259

}

279

}

260

free_r10bio(r10_bio);

280

free_r10bio(r10_bio);

261

}

281

}

262

282

263

/*

283

/*

264

* Update disk head position estimator based on IRQ completion info.

284

* Update disk head position estimator based on IRQ completion info.

265

*/

285

*/

266

static inline void update_head_pos(int slot, struct r10bio *r10_bio)

286

static inline void update_head_pos(int slot, struct r10bio *r10_bio)

267

{

287

{

268

struct r10conf *conf = r10_bio->mddev->private;

288

struct r10conf *conf = r10_bio->mddev->private;

269

289

270

conf->mirrors[r10_bio->devs[slot].devnum].head_position =

290

conf->mirrors[r10_bio->devs[slot].devnum].head_position =

271

r10_bio->devs[slot].addr + (r10_bio->sectors);

291

r10_bio->devs[slot].addr + (r10_bio->sectors);

272

}

292

}

273

293

274

/*

294

/*

275

* Find the disk number which triggered given bio

295

* Find the disk number which triggered given bio

276

*/

296

*/

277

static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,

297

static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,

278

struct bio *bio, int *slotp)

298

struct bio *bio, int *slotp, int *replp)

279

{

299

{

280

int slot;

300

int slot;

301

int repl = 0;

281

302

282

for (slot = 0; slot < conf->copies; slot++)

303

for (slot = 0; slot < conf->copies; slot++) {

283

if (r10_bio->devs[slot].bio == bio)

304

if (r10_bio->devs[slot].bio == bio)

284

break;

305

break;

306

if (r10_bio->devs[slot].repl_bio == bio) {

307

repl = 1;

308

break;

309

}

310

}

285

311

286

BUG_ON(slot == conf->copies);

312

BUG_ON(slot == conf->copies);

287

update_head_pos(slot, r10_bio);

313

update_head_pos(slot, r10_bio);

288

314

289

if (slotp)

315

if (slotp)

290

*slotp = slot;

316

*slotp = slot;

317

if (replp)

318

*replp = repl;

291

return r10_bio->devs[slot].devnum;

319

return r10_bio->devs[slot].devnum;

292

}

320

}

293

321

294

static void raid10_end_read_request(struct bio *bio, int error)

322

static void raid10_end_read_request(struct bio *bio, int error)

295

{

323

{

296

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

324

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

297

struct r10bio *r10_bio = bio->bi_private;

325

struct r10bio *r10_bio = bio->bi_private;

298

int slot, dev;

326

int slot, dev;

299

struct r10conf *conf = r10_bio->mddev->private;

327

struct r10conf *conf = r10_bio->mddev->private;

300

328

301

329

302

slot = r10_bio->read_slot;

330

slot = r10_bio->read_slot;

303

dev = r10_bio->devs[slot].devnum;

331

dev = r10_bio->devs[slot].devnum;

304

/*

332

/*

305

* this branch is our 'one mirror IO has finished' event handler:

333

* this branch is our 'one mirror IO has finished' event handler:

306

*/

334

*/

307

update_head_pos(slot, r10_bio);

335

update_head_pos(slot, r10_bio);

308

336

309

if (uptodate) {

337

if (uptodate) {

310

/*

338

/*

311

* Set R10BIO_Uptodate in our master bio, so that

339

* Set R10BIO_Uptodate in our master bio, so that

312

* we will return a good error code to the higher

340

* we will return a good error code to the higher

313

* levels even if IO on some other mirrored buffer fails.

341

* levels even if IO on some other mirrored buffer fails.

314

*

342

*

315

* The 'master' represents the composite IO operation to

343

* The 'master' represents the composite IO operation to

316

* user-side. So if something waits for IO, then it will

344

* user-side. So if something waits for IO, then it will

317

* wait for the 'master' bio.

345

* wait for the 'master' bio.

318

*/

346

*/

319

set_bit(R10BIO_Uptodate, &r10_bio->state);

347

set_bit(R10BIO_Uptodate, &r10_bio->state);

320

raid_end_bio_io(r10_bio);

348

raid_end_bio_io(r10_bio);

321

rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);

349

rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);

322

} else {

350

} else {

323

/*

351

/*

324

* oops, read error - keep the refcount on the rdev

352

* oops, read error - keep the refcount on the rdev

325

*/

353

*/

326

char b[BDEVNAME_SIZE];

354

char b[BDEVNAME_SIZE];

327

printk_ratelimited(KERN_ERR

355

printk_ratelimited(KERN_ERR

328

"md/raid10:%s: %s: rescheduling sector %llu\n",

356

"md/raid10:%s: %s: rescheduling sector %llu\n",

329

mdname(conf->mddev),

357

mdname(conf->mddev),

330

bdevname(conf->mirrors[dev].rdev->bdev, b),

358

bdevname(conf->mirrors[dev].rdev->bdev, b),

331

(unsigned long long)r10_bio->sector);

359

(unsigned long long)r10_bio->sector);

332

set_bit(R10BIO_ReadError, &r10_bio->state);

360

set_bit(R10BIO_ReadError, &r10_bio->state);

333

reschedule_retry(r10_bio);

361

reschedule_retry(r10_bio);

334

}

362

}

335

}

363

}

336

364

337

static void close_write(struct r10bio *r10_bio)

365

static void close_write(struct r10bio *r10_bio)

338

{

366

{

339

/* clear the bitmap if all writes complete successfully */

367

/* clear the bitmap if all writes complete successfully */

340

bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,

368

bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,

341

r10_bio->sectors,

369

r10_bio->sectors,

342

!test_bit(R10BIO_Degraded, &r10_bio->state),

370

!test_bit(R10BIO_Degraded, &r10_bio->state),

343

0);

371

0);

344

md_write_end(r10_bio->mddev);

372

md_write_end(r10_bio->mddev);

345

}

373

}

346

374

347

static void one_write_done(struct r10bio *r10_bio)

375

static void one_write_done(struct r10bio *r10_bio)

348

{

376

{

349

if (atomic_dec_and_test(&r10_bio->remaining)) {

377

if (atomic_dec_and_test(&r10_bio->remaining)) {

350

if (test_bit(R10BIO_WriteError, &r10_bio->state))

378

if (test_bit(R10BIO_WriteError, &r10_bio->state))

351

reschedule_retry(r10_bio);

379

reschedule_retry(r10_bio);

352

else {

380

else {

353

close_write(r10_bio);

381

close_write(r10_bio);

354

if (test_bit(R10BIO_MadeGood, &r10_bio->state))

382

if (test_bit(R10BIO_MadeGood, &r10_bio->state))

355

reschedule_retry(r10_bio);

383

reschedule_retry(r10_bio);

356

else

384

else

357

raid_end_bio_io(r10_bio);

385

raid_end_bio_io(r10_bio);

358

}

386

}

359

}

387

}

360

}

388

}

361

389

362

static void raid10_end_write_request(struct bio *bio, int error)

390

static void raid10_end_write_request(struct bio *bio, int error)

363

{

391

{

364

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

392

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

365

struct r10bio *r10_bio = bio->bi_private;

393

struct r10bio *r10_bio = bio->bi_private;

366

int dev;

394

int dev;

367

int dec_rdev = 1;

395

int dec_rdev = 1;

368

struct r10conf *conf = r10_bio->mddev->private;

396

struct r10conf *conf = r10_bio->mddev->private;

369

int slot;

397

int slot;

370

398

371

dev = find_bio_disk(conf, r10_bio, bio, &slot);

399

dev = find_bio_disk(conf, r10_bio, bio, &slot, NULL);

372

400

373

/*

401

/*

374

* this branch is our 'one mirror IO has finished' event handler:

402

* this branch is our 'one mirror IO has finished' event handler:

375

*/

403

*/

376

if (!uptodate) {

404

if (!uptodate) {

377

set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);

405

set_bit(WriteErrorSeen, &conf->mirrors[dev].rdev->flags);

378

set_bit(R10BIO_WriteError, &r10_bio->state);

406

set_bit(R10BIO_WriteError, &r10_bio->state);

379

dec_rdev = 0;

407

dec_rdev = 0;

380

} else {

408

} else {

381

/*

409

/*

382

* Set R10BIO_Uptodate in our master bio, so that

410

* Set R10BIO_Uptodate in our master bio, so that

383

* we will return a good error code for to the higher

411

* we will return a good error code for to the higher

384

* levels even if IO on some other mirrored buffer fails.

412

* levels even if IO on some other mirrored buffer fails.

385

*

413

*

386

* The 'master' represents the composite IO operation to

414

* The 'master' represents the composite IO operation to

387

* user-side. So if something waits for IO, then it will

415

* user-side. So if something waits for IO, then it will

388

* wait for the 'master' bio.

416

* wait for the 'master' bio.

389

*/

417

*/

390

sector_t first_bad;

418

sector_t first_bad;

391

int bad_sectors;

419

int bad_sectors;

392

420

393

set_bit(R10BIO_Uptodate, &r10_bio->state);

421

set_bit(R10BIO_Uptodate, &r10_bio->state);

394

422

395

/* Maybe we can clear some bad blocks. */

423

/* Maybe we can clear some bad blocks. */

396

if (is_badblock(conf->mirrors[dev].rdev,

424

if (is_badblock(conf->mirrors[dev].rdev,

397

r10_bio->devs[slot].addr,

425

r10_bio->devs[slot].addr,

398

r10_bio->sectors,

426

r10_bio->sectors,

399

&first_bad, &bad_sectors)) {

427

&first_bad, &bad_sectors)) {

400

bio_put(bio);

428

bio_put(bio);

401

r10_bio->devs[slot].bio = IO_MADE_GOOD;

429

r10_bio->devs[slot].bio = IO_MADE_GOOD;

402

dec_rdev = 0;

430

dec_rdev = 0;

403

set_bit(R10BIO_MadeGood, &r10_bio->state);

431

set_bit(R10BIO_MadeGood, &r10_bio->state);

404

}

432

}

405

}

433

}

406

434

407

/*

435

/*

408

*

436

*

409

* Let's see if all mirrored write operations have finished

437

* Let's see if all mirrored write operations have finished

410

* already.

438

* already.

411

*/

439

*/

412

one_write_done(r10_bio);

440

one_write_done(r10_bio);

413

if (dec_rdev)

441

if (dec_rdev)

414

rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);

442

rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);

415

}

443

}

416

444

417

445

418

/*

446

/*

419

* RAID10 layout manager

447

* RAID10 layout manager

420

* As well as the chunksize and raid_disks count, there are two

448

* As well as the chunksize and raid_disks count, there are two

421

* parameters: near_copies and far_copies.

449

* parameters: near_copies and far_copies.

422

* near_copies * far_copies must be <= raid_disks.

450

* near_copies * far_copies must be <= raid_disks.

423

* Normally one of these will be 1.

451

* Normally one of these will be 1.

424

* If both are 1, we get raid0.

452

* If both are 1, we get raid0.

425

* If near_copies == raid_disks, we get raid1.

453

* If near_copies == raid_disks, we get raid1.

426

*

454

*

427

* Chunks are laid out in raid0 style with near_copies copies of the

455

* Chunks are laid out in raid0 style with near_copies copies of the

428

* first chunk, followed by near_copies copies of the next chunk and

456

* first chunk, followed by near_copies copies of the next chunk and

429

* so on.

457

* so on.

430

* If far_copies > 1, then after 1/far_copies of the array has been assigned

458

* If far_copies > 1, then after 1/far_copies of the array has been assigned

431

* as described above, we start again with a device offset of near_copies.

459

* as described above, we start again with a device offset of near_copies.

432

* So we effectively have another copy of the whole array further down all

460

* So we effectively have another copy of the whole array further down all

433

* the drives, but with blocks on different drives.

461

* the drives, but with blocks on different drives.

434

* With this layout, and block is never stored twice on the one device.

462

* With this layout, and block is never stored twice on the one device.

435

*

463

*

436

* raid10_find_phys finds the sector offset of a given virtual sector

464

* raid10_find_phys finds the sector offset of a given virtual sector

437

* on each device that it is on.

465

* on each device that it is on.

438

*

466

*

439

* raid10_find_virt does the reverse mapping, from a device and a

467

* raid10_find_virt does the reverse mapping, from a device and a

440

* sector offset to a virtual address

468

* sector offset to a virtual address

441

*/

469

*/

442

470

443

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)

471

static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)

444

{

472

{

445

int n,f;

473

int n,f;

446

sector_t sector;

474

sector_t sector;

447

sector_t chunk;

475

sector_t chunk;

448

sector_t stripe;

476

sector_t stripe;

449

int dev;

477

int dev;

450

478

451

int slot = 0;

479

int slot = 0;

452

480

453

/* now calculate first sector/dev */

481

/* now calculate first sector/dev */

454

chunk = r10bio->sector >> conf->chunk_shift;

482

chunk = r10bio->sector >> conf->chunk_shift;

455

sector = r10bio->sector & conf->chunk_mask;

483

sector = r10bio->sector & conf->chunk_mask;

456

484

457

chunk *= conf->near_copies;

485

chunk *= conf->near_copies;

458

stripe = chunk;

486

stripe = chunk;

459

dev = sector_div(stripe, conf->raid_disks);

487

dev = sector_div(stripe, conf->raid_disks);

460

if (conf->far_offset)

488

if (conf->far_offset)

461

stripe *= conf->far_copies;

489

stripe *= conf->far_copies;

462

490

463

sector += stripe << conf->chunk_shift;

491

sector += stripe << conf->chunk_shift;

464

492

465

/* and calculate all the others */

493

/* and calculate all the others */

466

for (n=0; n < conf->near_copies; n++) {

494

for (n=0; n < conf->near_copies; n++) {

467

int d = dev;

495

int d = dev;

468

sector_t s = sector;

496

sector_t s = sector;

469

r10bio->devs[slot].addr = sector;

497

r10bio->devs[slot].addr = sector;

470

r10bio->devs[slot].devnum = d;

498

r10bio->devs[slot].devnum = d;

471

slot++;

499

slot++;

472

500

473

for (f = 1; f < conf->far_copies; f++) {

501

for (f = 1; f < conf->far_copies; f++) {

474

d += conf->near_copies;

502

d += conf->near_copies;

475

if (d >= conf->raid_disks)

503

if (d >= conf->raid_disks)

476

d -= conf->raid_disks;

504

d -= conf->raid_disks;

477

s += conf->stride;

505

s += conf->stride;

478

r10bio->devs[slot].devnum = d;

506

r10bio->devs[slot].devnum = d;

479

r10bio->devs[slot].addr = s;

507

r10bio->devs[slot].addr = s;

480

slot++;

508

slot++;

481

}

509

}

482

dev++;

510

dev++;

483

if (dev >= conf->raid_disks) {

511

if (dev >= conf->raid_disks) {

484

dev = 0;

512

dev = 0;

485

sector += (conf->chunk_mask + 1);

513

sector += (conf->chunk_mask + 1);

486

}

514

}

487

}

515

}

488

BUG_ON(slot != conf->copies);

516

BUG_ON(slot != conf->copies);

489

}

517

}

490

518

491

/*
 * raid10_find_virt - translate a per-device sector back to an array sector.
 * @conf:   array geometry (chunk_mask/chunk_shift, near/far copies, stride,
 *          raid_disks, far_offset)
 * @sector: sector address on the member device
 * @dev:    index of the member device
 *
 * The offset inside the chunk is carried over unchanged.  The chunk number
 * and device index are first normalised back to the first copy, then
 * combined into an array-wide chunk number which is divided by near_copies.
 */
static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
{
	const sector_t in_chunk = sector & conf->chunk_mask;
	sector_t dev_chunk, array_chunk;

	if (!conf->far_offset) {
		/*
		 * Far copies sit 'stride' sectors apart: peel off one stride
		 * per copy and rotate the device index back by near_copies
		 * (wrapping around raid_disks) each time.
		 */
		for (; sector >= conf->stride; sector -= conf->stride) {
			if (dev >= conf->near_copies)
				dev -= conf->near_copies;
			else
				dev += conf->raid_disks - conf->near_copies;
		}
		dev_chunk = sector >> conf->chunk_shift;
	} else {
		int copy;

		/*
		 * 'offset' layout: sector_div() leaves the quotient in
		 * dev_chunk and hands back the remainder, i.e. which far
		 * copy this chunk is.
		 */
		dev_chunk = sector >> conf->chunk_shift;
		copy = sector_div(dev_chunk, conf->far_copies);
		dev -= copy * conf->near_copies;
		if (dev < 0)
			dev += conf->raid_disks;
	}

	array_chunk = dev_chunk * conf->raid_disks + dev;
	sector_div(array_chunk, conf->near_copies);
	return (array_chunk << conf->chunk_shift) + in_chunk;
}

517

545

518

/**

546

/**

519

* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged

547

* raid10_mergeable_bvec -- tell bio layer if a two requests can be merged

520

* @q: request queue

548

* @q: request queue

521

* @bvm: properties of new bio

549

* @bvm: properties of new bio

522

* @biovec: the request that could be merged to it.

550

* @biovec: the request that could be merged to it.

523

*

551

*

524

* Return amount of bytes we can accept at this offset

552

* Return amount of bytes we can accept at this offset

525

* If near_copies == raid_disk, there are no striping issues,

553

* If near_copies == raid_disk, there are no striping issues,

526

* but in that case, the function isn't called at all.

554

* but in that case, the function isn't called at all.

527

*/

555

*/

528

static int raid10_mergeable_bvec(struct request_queue *q,

556

static int raid10_mergeable_bvec(struct request_queue *q,

529

struct bvec_merge_data *bvm,

557

struct bvec_merge_data *bvm,

530

struct bio_vec *biovec)

558

struct bio_vec *biovec)

531

{

559

{

532

struct mddev *mddev = q->queuedata;

560

struct mddev *mddev = q->queuedata;

533

sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);

561

sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);

534

int max;

562

int max;

535

unsigned int chunk_sectors = mddev->chunk_sectors;

563

unsigned int chunk_sectors = mddev->chunk_sectors;

536

unsigned int bio_sectors = bvm->bi_size >> 9;

564

unsigned int bio_sectors = bvm->bi_size >> 9;

537

565

538

max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;

566

max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;

539

if (max < 0) max = 0; /* bio_add cannot handle a negative return */

567

if (max < 0) max = 0; /* bio_add cannot handle a negative return */

540

if (max <= biovec->bv_len && bio_sectors == 0)

568

if (max <= biovec->bv_len && bio_sectors == 0)

541

return biovec->bv_len;

569

return biovec->bv_len;

542

else

570

else

543

return max;

571

return max;

544

}

572

}

545

573

546

/*

574

/*

547

* This routine returns the disk from which the requested read should

575

* This routine returns the disk from which the requested read should

548

* be done. There is a per-array 'next expected sequential IO' sector

576

* be done. There is a per-array 'next expected sequential IO' sector

549

* number - if this matches on the next IO then we use the last disk.

577

* number - if this matches on the next IO then we use the last disk.

550

* There is also a per-disk 'last know head position' sector that is

578

* There is also a per-disk 'last know head position' sector that is

551

* maintained from IRQ contexts, both the normal and the resync IO

579

* maintained from IRQ contexts, both the normal and the resync IO

552

* completion handlers update this position correctly. If there is no

580

* completion handlers update this position correctly. If there is no

553

* perfect sequential match then we pick the disk whose head is closest.

581

* perfect sequential match then we pick the disk whose head is closest.

554

*

582

*

555

* If there are 2 mirrors in the same 2 devices, performance degrades

583

* If there are 2 mirrors in the same 2 devices, performance degrades

556

* because position is mirror, not device based.

584

* because position is mirror, not device based.

557

*

585

*

558

* The rdev for the device selected will have nr_pending incremented.

586

* The rdev for the device selected will have nr_pending incremented.

559

*/

587

*/

560

588

561

/*

589

/*

562

* FIXME: possibly should rethink readbalancing and do it differently

590

* FIXME: possibly should rethink readbalancing and do it differently

563

* depending on near_copies / far_copies geometry.

591

* depending on near_copies / far_copies geometry.

564

*/

592

*/

565

static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)

593

static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)

566

{

594

{

567

const sector_t this_sector = r10_bio->sector;

595

const sector_t this_sector = r10_bio->sector;

568

int disk, slot;

596

int disk, slot;

569

int sectors = r10_bio->sectors;

597

int sectors = r10_bio->sectors;

570

int best_good_sectors;

598

int best_good_sectors;

571

sector_t new_distance, best_dist;

599

sector_t new_distance, best_dist;

572

struct md_rdev *rdev;

600

struct md_rdev *rdev;

573

int do_balance;

601

int do_balance;

574

int best_slot;

602

int best_slot;

575

603

576

raid10_find_phys(conf, r10_bio);

604

raid10_find_phys(conf, r10_bio);

577

rcu_read_lock();

605

rcu_read_lock();

578

retry:

606

retry:

579

sectors = r10_bio->sectors;

607

sectors = r10_bio->sectors;

580

best_slot = -1;

608

best_slot = -1;

581

best_dist = MaxSector;

609

best_dist = MaxSector;

582

best_good_sectors = 0;

610

best_good_sectors = 0;

583

do_balance = 1;

611

do_balance = 1;

584

/*

612

/*

585

* Check if we can balance. We can balance on the whole

613

* Check if we can balance. We can balance on the whole

586

* device if no resync is going on (recovery is ok), or below

614

* device if no resync is going on (recovery is ok), or below

587

* the resync window. We take the first readable disk when

615

* the resync window. We take the first readable disk when

588

* above the resync window.

616

* above the resync window.

589

*/

617

*/

590

if (conf->mddev->recovery_cp < MaxSector

618

if (conf->mddev->recovery_cp < MaxSector

591

&& (this_sector + sectors >= conf->next_resync))

619

&& (this_sector + sectors >= conf->next_resync))

592

do_balance = 0;

620

do_balance = 0;

593

621

594

for (slot = 0; slot < conf->copies ; slot++) {

622

for (slot = 0; slot < conf->copies ; slot++) {

595

sector_t first_bad;

623

sector_t first_bad;

596

int bad_sectors;

624

int bad_sectors;

597

sector_t dev_sector;

625

sector_t dev_sector;

598

626

599

if (r10_bio->devs[slot].bio == IO_BLOCKED)

627

if (r10_bio->devs[slot].bio == IO_BLOCKED)

600

continue;

628

continue;

601

disk = r10_bio->devs[slot].devnum;

629

disk = r10_bio->devs[slot].devnum;

602

rdev = rcu_dereference(conf->mirrors[disk].rdev);

630

rdev = rcu_dereference(conf->mirrors[disk].rdev);

603

if (rdev == NULL)

631

if (rdev == NULL)

604

continue;

632

continue;

605

if (!test_bit(In_sync, &rdev->flags))

633

if (!test_bit(In_sync, &rdev->flags))

606

continue;

634

continue;

607

635

608

dev_sector = r10_bio->devs[slot].addr;

636

dev_sector = r10_bio->devs[slot].addr;

609

if (is_badblock(rdev, dev_sector, sectors,

637

if (is_badblock(rdev, dev_sector, sectors,

610

&first_bad, &bad_sectors)) {

638

&first_bad, &bad_sectors)) {

611

if (best_dist < MaxSector)

639

if (best_dist < MaxSector)

612

/* Already have a better slot */

640

/* Already have a better slot */

613

continue;

641

continue;

614

if (first_bad <= dev_sector) {

642

if (first_bad <= dev_sector) {

615

/* Cannot read here. If this is the

643

/* Cannot read here. If this is the

616

* 'primary' device, then we must not read

644

* 'primary' device, then we must not read

617

* beyond 'bad_sectors' from another device.

645

* beyond 'bad_sectors' from another device.

618

*/

646

*/

619

bad_sectors -= (dev_sector - first_bad);

647

bad_sectors -= (dev_sector - first_bad);

620

if (!do_balance && sectors > bad_sectors)

648

if (!do_balance && sectors > bad_sectors)

621

sectors = bad_sectors;

649

sectors = bad_sectors;

622

if (best_good_sectors > sectors)

650

if (best_good_sectors > sectors)

623

best_good_sectors = sectors;

651

best_good_sectors = sectors;

624

} else {

652

} else {

625

sector_t good_sectors =

653

sector_t good_sectors =

626

first_bad - dev_sector;

654

first_bad - dev_sector;

627

if (good_sectors > best_good_sectors) {

655

if (good_sectors > best_good_sectors) {

628

best_good_sectors = good_sectors;

656

best_good_sectors = good_sectors;

629

best_slot = slot;

657

best_slot = slot;

630

}

658

}

631

if (!do_balance)

659

if (!do_balance)

632

/* Must read from here */

660

/* Must read from here */

633

break;

661

break;

634

}

662

}

635

continue;

663

continue;

636

} else

664

} else

637

best_good_sectors = sectors;

665

best_good_sectors = sectors;

638

666

639

if (!do_balance)

667

if (!do_balance)

640

break;

668

break;

641

669

642

/* This optimisation is debatable, and completely destroys

670

/* This optimisation is debatable, and completely destroys

643

* sequential read speed for 'far copies' arrays. So only

671

* sequential read speed for 'far copies' arrays. So only

644

* keep it for 'near' arrays, and review those later.

672

* keep it for 'near' arrays, and review those later.

645

*/

673

*/

646

if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))

674

if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))

647

break;

675

break;

648

676

649

/* for far > 1 always use the lowest address */

677

/* for far > 1 always use the lowest address */

650

if (conf->far_copies > 1)

678

if (conf->far_copies > 1)

651

new_distance = r10_bio->devs[slot].addr;

679

new_distance = r10_bio->devs[slot].addr;

652

else

680

else

653

new_distance = abs(r10_bio->devs[slot].addr -

681

new_distance = abs(r10_bio->devs[slot].addr -

654

conf->mirrors[disk].head_position);

682

conf->mirrors[disk].head_position);

655

if (new_distance < best_dist) {

683

if (new_distance < best_dist) {

656

best_dist = new_distance;

684

best_dist = new_distance;

657

best_slot = slot;

685

best_slot = slot;

658

}

686

}

659

}

687

}

660

if (slot == conf->copies)

688

if (slot == conf->copies)

661

slot = best_slot;

689

slot = best_slot;

662

690

663

if (slot >= 0) {

691

if (slot >= 0) {

664

disk = r10_bio->devs[slot].devnum;

692

disk = r10_bio->devs[slot].devnum;

665

rdev = rcu_dereference(conf->mirrors[disk].rdev);

693

rdev = rcu_dereference(conf->mirrors[disk].rdev);

666

if (!rdev)

694

if (!rdev)

667

goto retry;

695

goto retry;

668

atomic_inc(&rdev->nr_pending);

696

atomic_inc(&rdev->nr_pending);

669

if (test_bit(Faulty, &rdev->flags)) {

697

if (test_bit(Faulty, &rdev->flags)) {

670

/* Cannot risk returning a device that failed

698

/* Cannot risk returning a device that failed

671

* before we inc'ed nr_pending

699

* before we inc'ed nr_pending

672

*/

700

*/

673

rdev_dec_pending(rdev, conf->mddev);

701

rdev_dec_pending(rdev, conf->mddev);

674

goto retry;

702

goto retry;

675

}

703

}

676

r10_bio->read_slot = slot;

704

r10_bio->read_slot = slot;

677

} else

705

} else

678

disk = -1;

706

disk = -1;

679

rcu_read_unlock();

707

rcu_read_unlock();

680

*max_sectors = best_good_sectors;

708

*max_sectors = best_good_sectors;

681

709

682

return disk;

710

return disk;

683

}

711

}

684

712

685

static int raid10_congested(void *data, int bits)

713

static int raid10_congested(void *data, int bits)

686

{

714

{

687

struct mddev *mddev = data;

715

struct mddev *mddev = data;

688

struct r10conf *conf = mddev->private;

716

struct r10conf *conf = mddev->private;

689

int i, ret = 0;

717

int i, ret = 0;

690

718

691

if ((bits & (1 << BDI_async_congested)) &&

719

if ((bits & (1 << BDI_async_congested)) &&

692

conf->pending_count >= max_queued_requests)

720

conf->pending_count >= max_queued_requests)

693

return 1;

721

return 1;

694

722

695

if (mddev_congested(mddev, bits))

723

if (mddev_congested(mddev, bits))

696

return 1;

724

return 1;

697

rcu_read_lock();

725

rcu_read_lock();

698

for (i = 0; i < conf->raid_disks && ret == 0; i++) {

726

for (i = 0; i < conf->raid_disks && ret == 0; i++) {

699

struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);

727

struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);

700

if (rdev && !test_bit(Faulty, &rdev->flags)) {

728

if (rdev && !test_bit(Faulty, &rdev->flags)) {

701

struct request_queue *q = bdev_get_queue(rdev->bdev);

729

struct request_queue *q = bdev_get_queue(rdev->bdev);

702

730

703

ret |= bdi_congested(&q->backing_dev_info, bits);

731

ret |= bdi_congested(&q->backing_dev_info, bits);

704

}

732

}

705

}

733

}

706

rcu_read_unlock();

734

rcu_read_unlock();

707

return ret;

735

return ret;

708

}

736

}

709

737

710

static void flush_pending_writes(struct r10conf *conf)

738

static void flush_pending_writes(struct r10conf *conf)

711

{

739

{

712

/* Any writes that have been queued but are awaiting

740

/* Any writes that have been queued but are awaiting

713

* bitmap updates get flushed here.

741

* bitmap updates get flushed here.

714

*/

742

*/

715

spin_lock_irq(&conf->device_lock);

743

spin_lock_irq(&conf->device_lock);

716

744

717

if (conf->pending_bio_list.head) {

745

if (conf->pending_bio_list.head) {

718

struct bio *bio;

746

struct bio *bio;

719

bio = bio_list_get(&conf->pending_bio_list);

747

bio = bio_list_get(&conf->pending_bio_list);

720

conf->pending_count = 0;

748

conf->pending_count = 0;

721

spin_unlock_irq(&conf->device_lock);

749

spin_unlock_irq(&conf->device_lock);

722

/* flush any pending bitmap writes to disk

750

/* flush any pending bitmap writes to disk

723

* before proceeding w/ I/O */

751

* before proceeding w/ I/O */

724

bitmap_unplug(conf->mddev->bitmap);

752

bitmap_unplug(conf->mddev->bitmap);

725

wake_up(&conf->wait_barrier);

753

wake_up(&conf->wait_barrier);

726

754

727

while (bio) { /* submit pending writes */

755

while (bio) { /* submit pending writes */

728

struct bio *next = bio->bi_next;

756

struct bio *next = bio->bi_next;

729

bio->bi_next = NULL;

757

bio->bi_next = NULL;

730

generic_make_request(bio);

758

generic_make_request(bio);

731

bio = next;

759

bio = next;

732

}

760

}

733

} else

761

} else

734

spin_unlock_irq(&conf->device_lock);

762

spin_unlock_irq(&conf->device_lock);

735

}

763

}

736

764

737

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * Background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */

758

786

759

static void raise_barrier(struct r10conf *conf, int force)

787

static void raise_barrier(struct r10conf *conf, int force)

760

{

788

{

761

BUG_ON(force && !conf->barrier);

789

BUG_ON(force && !conf->barrier);

762

spin_lock_irq(&conf->resync_lock);

790

spin_lock_irq(&conf->resync_lock);

763

791

764

/* Wait until no block IO is waiting (unless 'force') */

792

/* Wait until no block IO is waiting (unless 'force') */

765

wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,

793

wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,

766

conf->resync_lock, );

794

conf->resync_lock, );

767

795

768

/* block any new IO from starting */

796

/* block any new IO from starting */

769

conf->barrier++;

797

conf->barrier++;

770

798

771

/* Now wait for all pending IO to complete */

799

/* Now wait for all pending IO to complete */

772

wait_event_lock_irq(conf->wait_barrier,

800

wait_event_lock_irq(conf->wait_barrier,

773

!conf->nr_pending && conf->barrier < RESYNC_DEPTH,

801

!conf->nr_pending && conf->barrier < RESYNC_DEPTH,

774

conf->resync_lock, );

802

conf->resync_lock, );

775

803

776

spin_unlock_irq(&conf->resync_lock);

804

spin_unlock_irq(&conf->resync_lock);

777

}

805

}

778

806

779

static void lower_barrier(struct r10conf *conf)

807

static void lower_barrier(struct r10conf *conf)

780

{

808

{

781

unsigned long flags;

809

unsigned long flags;

782

spin_lock_irqsave(&conf->resync_lock, flags);

810

spin_lock_irqsave(&conf->resync_lock, flags);

783

conf->barrier--;

811

conf->barrier--;

784

spin_unlock_irqrestore(&conf->resync_lock, flags);

812

spin_unlock_irqrestore(&conf->resync_lock, flags);

785

wake_up(&conf->wait_barrier);

813

wake_up(&conf->wait_barrier);

786

}

814

}

787

815

788

static void wait_barrier(struct r10conf *conf)

816

static void wait_barrier(struct r10conf *conf)

789

{

817

{

790

spin_lock_irq(&conf->resync_lock);

818

spin_lock_irq(&conf->resync_lock);

791

if (conf->barrier) {

819

if (conf->barrier) {

792

conf->nr_waiting++;

820

conf->nr_waiting++;

793

wait_event_lock_irq(conf->wait_barrier, !conf->barrier,

821

wait_event_lock_irq(conf->wait_barrier, !conf->barrier,

794

conf->resync_lock,

822

conf->resync_lock,

795

);

823

);

796

conf->nr_waiting--;

824

conf->nr_waiting--;

797

}

825

}

798

conf->nr_pending++;

826

conf->nr_pending++;

799

spin_unlock_irq(&conf->resync_lock);

827

spin_unlock_irq(&conf->resync_lock);

800

}

828

}

801

829

802

static void allow_barrier(struct r10conf *conf)

830

static void allow_barrier(struct r10conf *conf)

803

{

831

{

804

unsigned long flags;

832

unsigned long flags;

805

spin_lock_irqsave(&conf->resync_lock, flags);

833

spin_lock_irqsave(&conf->resync_lock, flags);

806

conf->nr_pending--;

834

conf->nr_pending--;

807

spin_unlock_irqrestore(&conf->resync_lock, flags);

835

spin_unlock_irqrestore(&conf->resync_lock, flags);

808

wake_up(&conf->wait_barrier);

836

wake_up(&conf->wait_barrier);

809

}

837

}

810

838

811

static void freeze_array(struct r10conf *conf)

839

static void freeze_array(struct r10conf *conf)

812

{

840

{

813

/* stop syncio and normal IO and wait for everything to

841

/* stop syncio and normal IO and wait for everything to

814

* go quiet.

842

* go quiet.

815

* We increment barrier and nr_waiting, and then

843

* We increment barrier and nr_waiting, and then

816

* wait until nr_pending match nr_queued+1

844

* wait until nr_pending match nr_queued+1

817

* This is called in the context of one normal IO request

845

* This is called in the context of one normal IO request

818

* that has failed. Thus any sync request that might be pending

846

* that has failed. Thus any sync request that might be pending

819

* will be blocked by nr_pending, and we need to wait for

847

* will be blocked by nr_pending, and we need to wait for

820

* pending IO requests to complete or be queued for re-try.

848

* pending IO requests to complete or be queued for re-try.

821

* Thus the number queued (nr_queued) plus this request (1)

849

* Thus the number queued (nr_queued) plus this request (1)

822

* must match the number of pending IOs (nr_pending) before

850

* must match the number of pending IOs (nr_pending) before

823

* we continue.

851

* we continue.

824

*/

852

*/

825

spin_lock_irq(&conf->resync_lock);

853

spin_lock_irq(&conf->resync_lock);

826

conf->barrier++;

854

conf->barrier++;

827

conf->nr_waiting++;

855

conf->nr_waiting++;

828

wait_event_lock_irq(conf->wait_barrier,

856

wait_event_lock_irq(conf->wait_barrier,

829

conf->nr_pending == conf->nr_queued+1,

857

conf->nr_pending == conf->nr_queued+1,

830

conf->resync_lock,

858

conf->resync_lock,

831

flush_pending_writes(conf));

859

flush_pending_writes(conf));

832

860

833

spin_unlock_irq(&conf->resync_lock);

861

spin_unlock_irq(&conf->resync_lock);

834

}

862

}

835

863

836

static void unfreeze_array(struct r10conf *conf)

864

static void unfreeze_array(struct r10conf *conf)

837

{

865

{

838

/* reverse the effect of the freeze */

866

/* reverse the effect of the freeze */

839

spin_lock_irq(&conf->resync_lock);

867

spin_lock_irq(&conf->resync_lock);

840

conf->barrier--;

868

conf->barrier--;

841

conf->nr_waiting--;

869

conf->nr_waiting--;

842

wake_up(&conf->wait_barrier);

870

wake_up(&conf->wait_barrier);

843

spin_unlock_irq(&conf->resync_lock);

871

spin_unlock_irq(&conf->resync_lock);

844

}

872

}

845

873

846

static void make_request(struct mddev *mddev, struct bio * bio)

874

static void make_request(struct mddev *mddev, struct bio * bio)

847

{

875

{

848

struct r10conf *conf = mddev->private;

876

struct r10conf *conf = mddev->private;

849

struct mirror_info *mirror;

877

struct mirror_info *mirror;

850

struct r10bio *r10_bio;

878

struct r10bio *r10_bio;

851

struct bio *read_bio;

879

struct bio *read_bio;

852

int i;

880

int i;

853

int chunk_sects = conf->chunk_mask + 1;

881

int chunk_sects = conf->chunk_mask + 1;

854

const int rw = bio_data_dir(bio);

882

const int rw = bio_data_dir(bio);

855

const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);

883

const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);

856

const unsigned long do_fua = (bio->bi_rw & REQ_FUA);

884

const unsigned long do_fua = (bio->bi_rw & REQ_FUA);

857

unsigned long flags;

885

unsigned long flags;

858

struct md_rdev *blocked_rdev;

886

struct md_rdev *blocked_rdev;

859

int plugged;

887

int plugged;

860

int sectors_handled;

888

int sectors_handled;

861

int max_sectors;

889

int max_sectors;

862

890

863

if (unlikely(bio->bi_rw & REQ_FLUSH)) {

891

if (unlikely(bio->bi_rw & REQ_FLUSH)) {

864

md_flush_request(mddev, bio);

892

md_flush_request(mddev, bio);

865

return;

893

return;

866

}

894

}

867

895

868

/* If this request crosses a chunk boundary, we need to

896

/* If this request crosses a chunk boundary, we need to

869

* split it. This will only happen for 1 PAGE (or less) requests.

897

* split it. This will only happen for 1 PAGE (or less) requests.

870

*/

898

*/

871

if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)

899

if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)

872

> chunk_sects &&

900

> chunk_sects &&

873

conf->near_copies < conf->raid_disks)) {

901

conf->near_copies < conf->raid_disks)) {

874

struct bio_pair *bp;

902

struct bio_pair *bp;

875

/* Sanity check -- queue functions should prevent this happening */

903

/* Sanity check -- queue functions should prevent this happening */

876

if (bio->bi_vcnt != 1 ||

904

if (bio->bi_vcnt != 1 ||

877

bio->bi_idx != 0)

905

bio->bi_idx != 0)

878

goto bad_map;

906

goto bad_map;

879

/* This is a one page bio that upper layers

907

/* This is a one page bio that upper layers

880

* refuse to split for us, so we need to split it.

908

* refuse to split for us, so we need to split it.

881

*/

909

*/

882

bp = bio_split(bio,

910

bp = bio_split(bio,

883

chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );

911

chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );

884

912

885

/* Each of these 'make_request' calls will call 'wait_barrier'.

913

/* Each of these 'make_request' calls will call 'wait_barrier'.

886

* If the first succeeds but the second blocks due to the resync

914

* If the first succeeds but the second blocks due to the resync

887

* thread raising the barrier, we will deadlock because the

915

* thread raising the barrier, we will deadlock because the

888

* IO to the underlying device will be queued in generic_make_request

916

* IO to the underlying device will be queued in generic_make_request

889

* and will never complete, so will never reduce nr_pending.

917

* and will never complete, so will never reduce nr_pending.

890

* So increment nr_waiting here so no new raise_barriers will

918

* So increment nr_waiting here so no new raise_barriers will

891

* succeed, and so the second wait_barrier cannot block.

919

* succeed, and so the second wait_barrier cannot block.

892

*/

920

*/

893

spin_lock_irq(&conf->resync_lock);

921

spin_lock_irq(&conf->resync_lock);

894

conf->nr_waiting++;

922

conf->nr_waiting++;

895

spin_unlock_irq(&conf->resync_lock);

923

spin_unlock_irq(&conf->resync_lock);

896

924

897

make_request(mddev, &bp->bio1);

925

make_request(mddev, &bp->bio1);

898

make_request(mddev, &bp->bio2);

926

make_request(mddev, &bp->bio2);

899

927

900

spin_lock_irq(&conf->resync_lock);

928

spin_lock_irq(&conf->resync_lock);

901

conf->nr_waiting--;

929

conf->nr_waiting--;

902

wake_up(&conf->wait_barrier);

930

wake_up(&conf->wait_barrier);

903

spin_unlock_irq(&conf->resync_lock);

931

spin_unlock_irq(&conf->resync_lock);

904

932

905

bio_pair_release(bp);

933

bio_pair_release(bp);

906

return;

934

return;

907

bad_map:

935

bad_map:

908

printk("md/raid10:%s: make_request bug: can't convert block across chunks"

936

printk("md/raid10:%s: make_request bug: can't convert block across chunks"

909

" or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,

937

" or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,

910

(unsigned long long)bio->bi_sector, bio->bi_size >> 10);

938

(unsigned long long)bio->bi_sector, bio->bi_size >> 10);

911

939

912

bio_io_error(bio);

940

bio_io_error(bio);

913

return;

941

return;

914

}

942

}

915

943

916

md_write_start(mddev, bio);

944

md_write_start(mddev, bio);

917

945

918

/*

946

/*

919

* Register the new request and wait if the reconstruction

947

* Register the new request and wait if the reconstruction

920

* thread has put up a bar for new requests.

948

* thread has put up a bar for new requests.

921

* Continue immediately if no resync is active currently.

949

* Continue immediately if no resync is active currently.

922

*/

950

*/

923

wait_barrier(conf);

951

wait_barrier(conf);

924

952

925

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

953

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

926

954

927

r10_bio->master_bio = bio;

955

r10_bio->master_bio = bio;

928

r10_bio->sectors = bio->bi_size >> 9;

956

r10_bio->sectors = bio->bi_size >> 9;

929

957

930

r10_bio->mddev = mddev;

958

r10_bio->mddev = mddev;

931

r10_bio->sector = bio->bi_sector;

959

r10_bio->sector = bio->bi_sector;

932

r10_bio->state = 0;

960

r10_bio->state = 0;

933

961

934

/* We might need to issue multiple reads to different

962

/* We might need to issue multiple reads to different

935

* devices if there are bad blocks around, so we keep

963

* devices if there are bad blocks around, so we keep

936

* track of the number of reads in bio->bi_phys_segments.

964

* track of the number of reads in bio->bi_phys_segments.

937

* If this is 0, there is only one r10_bio and no locking

965

* If this is 0, there is only one r10_bio and no locking

938

* will be needed when the request completes. If it is

966

* will be needed when the request completes. If it is

939

* non-zero, then it is the number of not-completed requests.

967

* non-zero, then it is the number of not-completed requests.

940

*/

968

*/

941

bio->bi_phys_segments = 0;

969

bio->bi_phys_segments = 0;

942

clear_bit(BIO_SEG_VALID, &bio->bi_flags);

970

clear_bit(BIO_SEG_VALID, &bio->bi_flags);

943

971

944

if (rw == READ) {

972

if (rw == READ) {

945

/*

973

/*

946

* read balancing logic:

974

* read balancing logic:

947

*/

975

*/

948

int disk;

976

int disk;

949

int slot;

977

int slot;

950

978

951

read_again:

979

read_again:

952

disk = read_balance(conf, r10_bio, &max_sectors);

980

disk = read_balance(conf, r10_bio, &max_sectors);

953

slot = r10_bio->read_slot;

981

slot = r10_bio->read_slot;

954

if (disk < 0) {

982

if (disk < 0) {

955

raid_end_bio_io(r10_bio);

983

raid_end_bio_io(r10_bio);

956

return;

984

return;

957

}

985

}

958

mirror = conf->mirrors + disk;

986

mirror = conf->mirrors + disk;

959

987

960

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);

988

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);

961

md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,

989

md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,

962

max_sectors);

990

max_sectors);

963

991

964

r10_bio->devs[slot].bio = read_bio;

992

r10_bio->devs[slot].bio = read_bio;

965

993

966

read_bio->bi_sector = r10_bio->devs[slot].addr +

994

read_bio->bi_sector = r10_bio->devs[slot].addr +

967

mirror->rdev->data_offset;

995

mirror->rdev->data_offset;

968

read_bio->bi_bdev = mirror->rdev->bdev;

996

read_bio->bi_bdev = mirror->rdev->bdev;

969

read_bio->bi_end_io = raid10_end_read_request;

997

read_bio->bi_end_io = raid10_end_read_request;

970

read_bio->bi_rw = READ | do_sync;

998

read_bio->bi_rw = READ | do_sync;

971

read_bio->bi_private = r10_bio;

999

read_bio->bi_private = r10_bio;

972

1000

973

if (max_sectors < r10_bio->sectors) {

1001

if (max_sectors < r10_bio->sectors) {

974

/* Could not read all from this device, so we will

1002

/* Could not read all from this device, so we will

975

* need another r10_bio.

1003

* need another r10_bio.

976

*/

1004

*/

977

sectors_handled = (r10_bio->sectors + max_sectors

1005

sectors_handled = (r10_bio->sectors + max_sectors

978

- bio->bi_sector);

1006

- bio->bi_sector);

979

r10_bio->sectors = max_sectors;

1007

r10_bio->sectors = max_sectors;

980

spin_lock_irq(&conf->device_lock);

1008

spin_lock_irq(&conf->device_lock);

981

if (bio->bi_phys_segments == 0)

1009

if (bio->bi_phys_segments == 0)

982

bio->bi_phys_segments = 2;

1010

bio->bi_phys_segments = 2;

983

else

1011

else

984

bio->bi_phys_segments++;

1012

bio->bi_phys_segments++;

985

spin_unlock(&conf->device_lock);

1013

spin_unlock(&conf->device_lock);

986

/* Cannot call generic_make_request directly

1014

/* Cannot call generic_make_request directly

987

* as that will be queued in __generic_make_request

1015

* as that will be queued in __generic_make_request

988

* and subsequent mempool_alloc might block

1016

* and subsequent mempool_alloc might block

989

* waiting for it. so hand bio over to raid10d.

1017

* waiting for it. so hand bio over to raid10d.

990

*/

1018

*/

991

reschedule_retry(r10_bio);

1019

reschedule_retry(r10_bio);

992

1020

993

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

1021

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

994

1022

995

r10_bio->master_bio = bio;

1023

r10_bio->master_bio = bio;

996

r10_bio->sectors = ((bio->bi_size >> 9)

1024

r10_bio->sectors = ((bio->bi_size >> 9)

997

- sectors_handled);

1025

- sectors_handled);

998

r10_bio->state = 0;

1026

r10_bio->state = 0;

999

r10_bio->mddev = mddev;

1027

r10_bio->mddev = mddev;

1000

r10_bio->sector = bio->bi_sector + sectors_handled;

1028

r10_bio->sector = bio->bi_sector + sectors_handled;

1001

goto read_again;

1029

goto read_again;

1002

} else

1030

} else

1003

generic_make_request(read_bio);

1031

generic_make_request(read_bio);

1004

return;

1032

return;

1005

}

1033

}

1006

1034

1007

/*

1035

/*

1008

* WRITE:

1036

* WRITE:

1009

*/

1037

*/

1010

if (conf->pending_count >= max_queued_requests) {

1038

if (conf->pending_count >= max_queued_requests) {

1011

md_wakeup_thread(mddev->thread);

1039

md_wakeup_thread(mddev->thread);

1012

wait_event(conf->wait_barrier,

1040

wait_event(conf->wait_barrier,

1013

conf->pending_count < max_queued_requests);

1041

conf->pending_count < max_queued_requests);

1014

}

1042

}

1015

/* first select target devices under rcu_lock and

1043

/* first select target devices under rcu_lock and

1016

* inc refcount on their rdev. Record them by setting

1044

* inc refcount on their rdev. Record them by setting

1017

* bios[x] to bio

1045

* bios[x] to bio

1018

* If there are known/acknowledged bad blocks on any device

1046

* If there are known/acknowledged bad blocks on any device

1019

* on which we have seen a write error, we want to avoid

1047

* on which we have seen a write error, we want to avoid

1020

* writing to those blocks. This potentially requires several

1048

* writing to those blocks. This potentially requires several

1021

* writes to write around the bad blocks. Each set of writes

1049

* writes to write around the bad blocks. Each set of writes

1022

* gets its own r10_bio with a set of bios attached. The number

1050

* gets its own r10_bio with a set of bios attached. The number

1023

* of r10_bios is recored in bio->bi_phys_segments just as with

1051

* of r10_bios is recored in bio->bi_phys_segments just as with

1024

* the read case.

1052

* the read case.

1025

*/

1053

*/

1026

plugged = mddev_check_plugged(mddev);

1054

plugged = mddev_check_plugged(mddev);

1027

1055

1056

r10_bio->read_slot = -1; /* make sure repl_bio gets freed */

1028

raid10_find_phys(conf, r10_bio);

1057

raid10_find_phys(conf, r10_bio);

1029

retry_write:

1058

retry_write:

1030

blocked_rdev = NULL;

1059

blocked_rdev = NULL;

1031

rcu_read_lock();

1060

rcu_read_lock();

1032

max_sectors = r10_bio->sectors;

1061

max_sectors = r10_bio->sectors;

1033

1062

1034

for (i = 0; i < conf->copies; i++) {

1063

for (i = 0; i < conf->copies; i++) {

1035

int d = r10_bio->devs[i].devnum;

1064

int d = r10_bio->devs[i].devnum;

1036

struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);

1065

struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);

1037

if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {

1066

if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {

1038

atomic_inc(&rdev->nr_pending);

1067

atomic_inc(&rdev->nr_pending);

1039

blocked_rdev = rdev;

1068

blocked_rdev = rdev;

1040

break;

1069

break;

1041

}

1070

}

1042

r10_bio->devs[i].bio = NULL;

1071

r10_bio->devs[i].bio = NULL;

1043

if (!rdev || test_bit(Faulty, &rdev->flags)) {

1072

if (!rdev || test_bit(Faulty, &rdev->flags)) {

1044

set_bit(R10BIO_Degraded, &r10_bio->state);

1073

set_bit(R10BIO_Degraded, &r10_bio->state);

1045

continue;

1074

continue;

1046

}

1075

}

1047

if (test_bit(WriteErrorSeen, &rdev->flags)) {

1076

if (test_bit(WriteErrorSeen, &rdev->flags)) {

1048

sector_t first_bad;

1077

sector_t first_bad;

1049

sector_t dev_sector = r10_bio->devs[i].addr;

1078

sector_t dev_sector = r10_bio->devs[i].addr;

1050

int bad_sectors;

1079

int bad_sectors;

1051

int is_bad;

1080

int is_bad;

1052

1081

1053

is_bad = is_badblock(rdev, dev_sector,

1082

is_bad = is_badblock(rdev, dev_sector,

1054

max_sectors,

1083

max_sectors,

1055

&first_bad, &bad_sectors);

1084

&first_bad, &bad_sectors);

1056

if (is_bad < 0) {

1085

if (is_bad < 0) {

1057

/* Mustn't write here until the bad block

1086

/* Mustn't write here until the bad block

1058

* is acknowledged

1087

* is acknowledged

1059

*/

1088

*/

1060

atomic_inc(&rdev->nr_pending);

1089

atomic_inc(&rdev->nr_pending);

1061

set_bit(BlockedBadBlocks, &rdev->flags);

1090

set_bit(BlockedBadBlocks, &rdev->flags);

1062

blocked_rdev = rdev;

1091

blocked_rdev = rdev;

1063

break;

1092

break;

1064

}

1093

}

1065

if (is_bad && first_bad <= dev_sector) {

1094

if (is_bad && first_bad <= dev_sector) {

1066

/* Cannot write here at all */

1095

/* Cannot write here at all */

1067

bad_sectors -= (dev_sector - first_bad);

1096

bad_sectors -= (dev_sector - first_bad);

1068

if (bad_sectors < max_sectors)

1097

if (bad_sectors < max_sectors)

1069

/* Mustn't write more than bad_sectors

1098

/* Mustn't write more than bad_sectors

1070

* to other devices yet

1099

* to other devices yet

1071

*/

1100

*/

1072

max_sectors = bad_sectors;

1101

max_sectors = bad_sectors;

1073

/* We don't set R10BIO_Degraded as that

1102

/* We don't set R10BIO_Degraded as that

1074

* only applies if the disk is missing,

1103

* only applies if the disk is missing,

1075

* so it might be re-added, and we want to

1104

* so it might be re-added, and we want to

1076

* know to recover this chunk.

1105

* know to recover this chunk.

1077

* In this case the device is here, and the

1106

* In this case the device is here, and the

1078

* fact that this chunk is not in-sync is

1107

* fact that this chunk is not in-sync is

1079

* recorded in the bad block log.

1108

* recorded in the bad block log.

1080

*/

1109

*/

1081

continue;

1110

continue;

1082

}

1111

}

1083

if (is_bad) {

1112

if (is_bad) {

1084

int good_sectors = first_bad - dev_sector;

1113

int good_sectors = first_bad - dev_sector;

1085

if (good_sectors < max_sectors)

1114

if (good_sectors < max_sectors)

1086

max_sectors = good_sectors;

1115

max_sectors = good_sectors;

1087

}

1116

}

1088

}

1117

}

1089

r10_bio->devs[i].bio = bio;

1118

r10_bio->devs[i].bio = bio;

1090

atomic_inc(&rdev->nr_pending);

1119

atomic_inc(&rdev->nr_pending);

1091

}

1120

}

1092

rcu_read_unlock();

1121

rcu_read_unlock();

1093

1122

1094

if (unlikely(blocked_rdev)) {

1123

if (unlikely(blocked_rdev)) {

1095

/* Have to wait for this device to get unblocked, then retry */

1124

/* Have to wait for this device to get unblocked, then retry */

1096

int j;

1125

int j;

1097

int d;

1126

int d;

1098

1127

1099

for (j = 0; j < i; j++)

1128

for (j = 0; j < i; j++)

1100

if (r10_bio->devs[j].bio) {

1129

if (r10_bio->devs[j].bio) {

1101

d = r10_bio->devs[j].devnum;

1130

d = r10_bio->devs[j].devnum;

1102

rdev_dec_pending(conf->mirrors[d].rdev, mddev);

1131

rdev_dec_pending(conf->mirrors[d].rdev, mddev);

1103

}

1132

}

1104

allow_barrier(conf);

1133

allow_barrier(conf);

1105

md_wait_for_blocked_rdev(blocked_rdev, mddev);

1134

md_wait_for_blocked_rdev(blocked_rdev, mddev);

1106

wait_barrier(conf);

1135

wait_barrier(conf);

1107

goto retry_write;

1136

goto retry_write;

1108

}

1137

}

1109

1138

1110

if (max_sectors < r10_bio->sectors) {

1139

if (max_sectors < r10_bio->sectors) {

1111

/* We are splitting this into multiple parts, so

1140

/* We are splitting this into multiple parts, so

1112

* we need to prepare for allocating another r10_bio.

1141

* we need to prepare for allocating another r10_bio.

1113

*/

1142

*/

1114

r10_bio->sectors = max_sectors;

1143

r10_bio->sectors = max_sectors;

1115

spin_lock_irq(&conf->device_lock);

1144

spin_lock_irq(&conf->device_lock);

1116

if (bio->bi_phys_segments == 0)

1145

if (bio->bi_phys_segments == 0)

1117

bio->bi_phys_segments = 2;

1146

bio->bi_phys_segments = 2;

1118

else

1147

else

1119

bio->bi_phys_segments++;

1148

bio->bi_phys_segments++;

1120

spin_unlock_irq(&conf->device_lock);

1149

spin_unlock_irq(&conf->device_lock);

1121

}

1150

}

1122

sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;

1151

sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;

1123

1152

1124

atomic_set(&r10_bio->remaining, 1);

1153

atomic_set(&r10_bio->remaining, 1);

1125

bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);

1154

bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);

1126

1155

1127

for (i = 0; i < conf->copies; i++) {

1156

for (i = 0; i < conf->copies; i++) {

1128

struct bio *mbio;

1157

struct bio *mbio;

1129

int d = r10_bio->devs[i].devnum;

1158

int d = r10_bio->devs[i].devnum;

1130

if (!r10_bio->devs[i].bio)

1159

if (!r10_bio->devs[i].bio)

1131

continue;

1160

continue;

1132

1161

1133

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);

1162

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);

1134

md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,

1163

md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,

1135

max_sectors);

1164

max_sectors);

1136

r10_bio->devs[i].bio = mbio;

1165

r10_bio->devs[i].bio = mbio;

1137

1166

1138

mbio->bi_sector = (r10_bio->devs[i].addr+

1167

mbio->bi_sector = (r10_bio->devs[i].addr+

1139

conf->mirrors[d].rdev->data_offset);

1168

conf->mirrors[d].rdev->data_offset);

1140

mbio->bi_bdev = conf->mirrors[d].rdev->bdev;

1169

mbio->bi_bdev = conf->mirrors[d].rdev->bdev;

1141

mbio->bi_end_io = raid10_end_write_request;

1170

mbio->bi_end_io = raid10_end_write_request;

1142

mbio->bi_rw = WRITE | do_sync | do_fua;

1171

mbio->bi_rw = WRITE | do_sync | do_fua;

1143

mbio->bi_private = r10_bio;

1172

mbio->bi_private = r10_bio;

1144

1173

1145

atomic_inc(&r10_bio->remaining);

1174

atomic_inc(&r10_bio->remaining);

1146

spin_lock_irqsave(&conf->device_lock, flags);

1175

spin_lock_irqsave(&conf->device_lock, flags);

1147

bio_list_add(&conf->pending_bio_list, mbio);

1176

bio_list_add(&conf->pending_bio_list, mbio);

1148

conf->pending_count++;

1177

conf->pending_count++;

1149

spin_unlock_irqrestore(&conf->device_lock, flags);

1178

spin_unlock_irqrestore(&conf->device_lock, flags);

1150

}

1179

}

1151

1180

1152

/* Don't remove the bias on 'remaining' (one_write_done) until

1181

/* Don't remove the bias on 'remaining' (one_write_done) until

1153

* after checking if we need to go around again.

1182

* after checking if we need to go around again.

1154

*/

1183

*/

1155

1184

1156

if (sectors_handled < (bio->bi_size >> 9)) {

1185

if (sectors_handled < (bio->bi_size >> 9)) {

1157

one_write_done(r10_bio);

1186

one_write_done(r10_bio);

1158

/* We need another r10_bio. It has already been counted

1187

/* We need another r10_bio. It has already been counted

1159

* in bio->bi_phys_segments.

1188

* in bio->bi_phys_segments.

1160

*/

1189

*/

1161

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

1190

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

1162

1191

1163

r10_bio->master_bio = bio;

1192

r10_bio->master_bio = bio;

1164

r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;

1193

r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;

1165

1194

1166

r10_bio->mddev = mddev;

1195

r10_bio->mddev = mddev;

1167

r10_bio->sector = bio->bi_sector + sectors_handled;

1196

r10_bio->sector = bio->bi_sector + sectors_handled;

1168

r10_bio->state = 0;

1197

r10_bio->state = 0;

1169

goto retry_write;

1198

goto retry_write;

1170

}

1199

}

1171

one_write_done(r10_bio);

1200

one_write_done(r10_bio);

1172

1201

1173

/* In case raid10d snuck in to freeze_array */

1202

/* In case raid10d snuck in to freeze_array */

1174

wake_up(&conf->wait_barrier);

1203

wake_up(&conf->wait_barrier);

1175

1204

1176

if (do_sync || !mddev->bitmap || !plugged)

1205

if (do_sync || !mddev->bitmap || !plugged)

1177

md_wakeup_thread(mddev->thread);

1206

md_wakeup_thread(mddev->thread);

1178

}

1207

}

1179

1208

1180

static void status(struct seq_file *seq, struct mddev *mddev)

1209

static void status(struct seq_file *seq, struct mddev *mddev)

1181

{

1210

{

1182

struct r10conf *conf = mddev->private;

1211

struct r10conf *conf = mddev->private;

1183

int i;

1212

int i;

1184

1213

1185

if (conf->near_copies < conf->raid_disks)

1214

if (conf->near_copies < conf->raid_disks)

1186

seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);

1215

seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);

1187

if (conf->near_copies > 1)

1216

if (conf->near_copies > 1)

1188

seq_printf(seq, " %d near-copies", conf->near_copies);

1217

seq_printf(seq, " %d near-copies", conf->near_copies);

1189

if (conf->far_copies > 1) {

1218

if (conf->far_copies > 1) {

1190

if (conf->far_offset)

1219

if (conf->far_offset)

1191

seq_printf(seq, " %d offset-copies", conf->far_copies);

1220

seq_printf(seq, " %d offset-copies", conf->far_copies);

1192

else

1221

else

1193

seq_printf(seq, " %d far-copies", conf->far_copies);

1222

seq_printf(seq, " %d far-copies", conf->far_copies);

1194

}

1223

}

1195

seq_printf(seq, " [%d/%d] [", conf->raid_disks,

1224

seq_printf(seq, " [%d/%d] [", conf->raid_disks,

1196

conf->raid_disks - mddev->degraded);

1225

conf->raid_disks - mddev->degraded);

1197

for (i = 0; i < conf->raid_disks; i++)

1226

for (i = 0; i < conf->raid_disks; i++)

1198

seq_printf(seq, "%s",

1227

seq_printf(seq, "%s",

1199

conf->mirrors[i].rdev &&

1228

conf->mirrors[i].rdev &&

1200

test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");

1229

test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");

1201

seq_printf(seq, "]");

1230

seq_printf(seq, "]");

1202

}

1231

}

1203

1232

1204

/* check if there are enough drives for

1233

/* check if there are enough drives for

1205

* every block to appear on atleast one.

1234

* every block to appear on atleast one.

1206

* Don't consider the device numbered 'ignore'

1235

* Don't consider the device numbered 'ignore'

1207

* as we might be about to remove it.

1236

* as we might be about to remove it.

1208

*/

1237

*/

1209

static int enough(struct r10conf *conf, int ignore)

1238

static int enough(struct r10conf *conf, int ignore)

1210

{

1239

{

1211

int first = 0;

1240

int first = 0;

1212

1241

1213

do {

1242

do {

1214

int n = conf->copies;

1243

int n = conf->copies;

1215

int cnt = 0;

1244

int cnt = 0;

1216

while (n--) {

1245

while (n--) {

1217

if (conf->mirrors[first].rdev &&

1246

if (conf->mirrors[first].rdev &&

1218

first != ignore)

1247

first != ignore)

1219

cnt++;

1248

cnt++;

1220

first = (first+1) % conf->raid_disks;

1249

first = (first+1) % conf->raid_disks;

1221

}

1250

}

1222

if (cnt == 0)

1251

if (cnt == 0)

1223

return 0;

1252

return 0;

1224

} while (first != 0);

1253

} while (first != 0);

1225

return 1;

1254

return 1;

1226

}

1255

}

1227

1256

1228

static void error(struct mddev *mddev, struct md_rdev *rdev)

1257

static void error(struct mddev *mddev, struct md_rdev *rdev)

1229

{

1258

{

1230

char b[BDEVNAME_SIZE];

1259

char b[BDEVNAME_SIZE];

1231

struct r10conf *conf = mddev->private;

1260

struct r10conf *conf = mddev->private;

1232

1261

1233

/*

1262

/*

1234

* If it is not operational, then we have already marked it as dead

1263

* If it is not operational, then we have already marked it as dead

1235

* else if it is the last working disks, ignore the error, let the

1264

* else if it is the last working disks, ignore the error, let the

1236

* next level up know.

1265

* next level up know.

1237

* else mark the drive as failed

1266

* else mark the drive as failed

1238

*/

1267

*/

1239

if (test_bit(In_sync, &rdev->flags)

1268

if (test_bit(In_sync, &rdev->flags)

1240

&& !enough(conf, rdev->raid_disk))

1269

&& !enough(conf, rdev->raid_disk))

1241

/*

1270

/*

1242

* Don't fail the drive, just return an IO error.

1271

* Don't fail the drive, just return an IO error.

1243

*/

1272

*/

1244

return;

1273

return;

1245

if (test_and_clear_bit(In_sync, &rdev->flags)) {

1274

if (test_and_clear_bit(In_sync, &rdev->flags)) {

1246

unsigned long flags;

1275

unsigned long flags;

1247

spin_lock_irqsave(&conf->device_lock, flags);

1276

spin_lock_irqsave(&conf->device_lock, flags);

1248

mddev->degraded++;

1277

mddev->degraded++;

1249

spin_unlock_irqrestore(&conf->device_lock, flags);

1278

spin_unlock_irqrestore(&conf->device_lock, flags);

1250

/*

1279

/*

1251

* if recovery is running, make sure it aborts.

1280

* if recovery is running, make sure it aborts.

1252

*/

1281

*/

1253

set_bit(MD_RECOVERY_INTR, &mddev->recovery);

1282

set_bit(MD_RECOVERY_INTR, &mddev->recovery);

1254

}

1283

}

1255

set_bit(Blocked, &rdev->flags);

1284

set_bit(Blocked, &rdev->flags);

1256

set_bit(Faulty, &rdev->flags);

1285

set_bit(Faulty, &rdev->flags);

1257

set_bit(MD_CHANGE_DEVS, &mddev->flags);

1286

set_bit(MD_CHANGE_DEVS, &mddev->flags);

1258

printk(KERN_ALERT

1287

printk(KERN_ALERT

1259

"md/raid10:%s: Disk failure on %s, disabling device.\n"

1288

"md/raid10:%s: Disk failure on %s, disabling device.\n"

1260

"md/raid10:%s: Operation continuing on %d devices.\n",

1289

"md/raid10:%s: Operation continuing on %d devices.\n",

1261

mdname(mddev), bdevname(rdev->bdev, b),

1290

mdname(mddev), bdevname(rdev->bdev, b),

1262

mdname(mddev), conf->raid_disks - mddev->degraded);

1291

mdname(mddev), conf->raid_disks - mddev->degraded);

1263

}

1292

}

1264

1293

1265

static void print_conf(struct r10conf *conf)

1294

static void print_conf(struct r10conf *conf)

1266

{

1295

{

1267

int i;

1296

int i;

1268

struct mirror_info *tmp;

1297

struct mirror_info *tmp;

1269

1298

1270

printk(KERN_DEBUG "RAID10 conf printout:\n");

1299

printk(KERN_DEBUG "RAID10 conf printout:\n");

1271

if (!conf) {

1300

if (!conf) {

1272

printk(KERN_DEBUG "(!conf)\n");

1301

printk(KERN_DEBUG "(!conf)\n");

1273

return;

1302

return;

1274

}

1303

}

1275

printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,

1304

printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,

1276

conf->raid_disks);

1305

conf->raid_disks);

1277

1306

1278

for (i = 0; i < conf->raid_disks; i++) {

1307

for (i = 0; i < conf->raid_disks; i++) {

1279

char b[BDEVNAME_SIZE];

1308

char b[BDEVNAME_SIZE];

1280

tmp = conf->mirrors + i;

1309

tmp = conf->mirrors + i;

1281

if (tmp->rdev)

1310

if (tmp->rdev)

1282

printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",

1311

printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",

1283

i, !test_bit(In_sync, &tmp->rdev->flags),

1312

i, !test_bit(In_sync, &tmp->rdev->flags),

1284

!test_bit(Faulty, &tmp->rdev->flags),

1313

!test_bit(Faulty, &tmp->rdev->flags),

1285

bdevname(tmp->rdev->bdev,b));

1314

bdevname(tmp->rdev->bdev,b));

1286

}

1315

}

1287

}

1316

}

1288

1317

1289

static void close_sync(struct r10conf *conf)

1318

static void close_sync(struct r10conf *conf)

1290

{

1319

{

1291

wait_barrier(conf);

1320

wait_barrier(conf);

1292

allow_barrier(conf);

1321

allow_barrier(conf);

1293

1322

1294

mempool_destroy(conf->r10buf_pool);

1323

mempool_destroy(conf->r10buf_pool);

1295

conf->r10buf_pool = NULL;

1324

conf->r10buf_pool = NULL;

1296

}

1325

}

1297

1326

1298

static int raid10_spare_active(struct mddev *mddev)

1327

static int raid10_spare_active(struct mddev *mddev)

1299

{

1328

{

1300

int i;

1329

int i;

1301

struct r10conf *conf = mddev->private;

1330

struct r10conf *conf = mddev->private;

1302

struct mirror_info *tmp;

1331

struct mirror_info *tmp;

1303

int count = 0;

1332

int count = 0;

1304

unsigned long flags;

1333

unsigned long flags;

1305

1334

1306

/*

1335

/*

1307

* Find all non-in_sync disks within the RAID10 configuration

1336

* Find all non-in_sync disks within the RAID10 configuration

1308

* and mark them in_sync

1337

* and mark them in_sync

1309

*/

1338

*/

1310

for (i = 0; i < conf->raid_disks; i++) {

1339

for (i = 0; i < conf->raid_disks; i++) {

1311

tmp = conf->mirrors + i;

1340

tmp = conf->mirrors + i;

1312

if (tmp->rdev

1341

if (tmp->rdev

1313

&& !test_bit(Faulty, &tmp->rdev->flags)

1342

&& !test_bit(Faulty, &tmp->rdev->flags)

1314

&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {

1343

&& !test_and_set_bit(In_sync, &tmp->rdev->flags)) {

1315

count++;

1344

count++;

1316

sysfs_notify_dirent(tmp->rdev->sysfs_state);

1345

sysfs_notify_dirent(tmp->rdev->sysfs_state);

1317

}

1346

}

1318

}

1347

}

1319

spin_lock_irqsave(&conf->device_lock, flags);

1348

spin_lock_irqsave(&conf->device_lock, flags);

1320

mddev->degraded -= count;

1349

mddev->degraded -= count;

1321

spin_unlock_irqrestore(&conf->device_lock, flags);

1350

spin_unlock_irqrestore(&conf->device_lock, flags);

1322

1351

1323

print_conf(conf);

1352

print_conf(conf);

1324

return count;

1353

return count;

1325

}

1354

}

1326

1355

1327

1356

1328

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)

1357

static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)

1329

{

1358

{

1330

struct r10conf *conf = mddev->private;

1359

struct r10conf *conf = mddev->private;

1331

int err = -EEXIST;

1360

int err = -EEXIST;

1332

int mirror;

1361

int mirror;

1333

int first = 0;

1362

int first = 0;

1334

int last = conf->raid_disks - 1;

1363

int last = conf->raid_disks - 1;

1335

1364

1336

if (mddev->recovery_cp < MaxSector)

1365

if (mddev->recovery_cp < MaxSector)

1337

/* only hot-add to in-sync arrays, as recovery is

1366

/* only hot-add to in-sync arrays, as recovery is

1338

* very different from resync

1367

* very different from resync

1339

*/

1368

*/

1340

return -EBUSY;

1369

return -EBUSY;

1341

if (!enough(conf, -1))

1370

if (!enough(conf, -1))

1342

return -EINVAL;

1371

return -EINVAL;

1343

1372

1344

if (rdev->raid_disk >= 0)

1373

if (rdev->raid_disk >= 0)

1345

first = last = rdev->raid_disk;

1374

first = last = rdev->raid_disk;

1346

1375

1347

if (rdev->saved_raid_disk >= first &&

1376

if (rdev->saved_raid_disk >= first &&

1348

conf->mirrors[rdev->saved_raid_disk].rdev == NULL)

1377

conf->mirrors[rdev->saved_raid_disk].rdev == NULL)

1349

mirror = rdev->saved_raid_disk;

1378

mirror = rdev->saved_raid_disk;

1350

else

1379

else

1351

mirror = first;

1380

mirror = first;

1352

for ( ; mirror <= last ; mirror++) {

1381

for ( ; mirror <= last ; mirror++) {

1353

struct mirror_info *p = &conf->mirrors[mirror];

1382

struct mirror_info *p = &conf->mirrors[mirror];

1354

if (p->recovery_disabled == mddev->recovery_disabled)

1383

if (p->recovery_disabled == mddev->recovery_disabled)

1355

continue;

1384

continue;

1356

if (p->rdev)

1385

if (p->rdev)

1357

continue;

1386

continue;

1358

1387

1359

disk_stack_limits(mddev->gendisk, rdev->bdev,

1388

disk_stack_limits(mddev->gendisk, rdev->bdev,

1360

rdev->data_offset << 9);

1389

rdev->data_offset << 9);

1361

/* as we don't honour merge_bvec_fn, we must

1390

/* as we don't honour merge_bvec_fn, we must

1362

* never risk violating it, so limit

1391

* never risk violating it, so limit

1363

* ->max_segments to one lying with a single

1392

* ->max_segments to one lying with a single

1364

* page, as a one page request is never in

1393

* page, as a one page request is never in

1365

* violation.

1394

* violation.

1366

*/

1395

*/

1367

if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {

1396

if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {

1368

blk_queue_max_segments(mddev->queue, 1);

1397

blk_queue_max_segments(mddev->queue, 1);

1369

blk_queue_segment_boundary(mddev->queue,

1398

blk_queue_segment_boundary(mddev->queue,

1370

PAGE_CACHE_SIZE - 1);

1399

PAGE_CACHE_SIZE - 1);

1371

}

1400

}

1372

1401

1373

p->head_position = 0;

1402

p->head_position = 0;

1374

p->recovery_disabled = mddev->recovery_disabled - 1;

1403

p->recovery_disabled = mddev->recovery_disabled - 1;

1375

rdev->raid_disk = mirror;

1404

rdev->raid_disk = mirror;

1376

err = 0;

1405

err = 0;

1377

if (rdev->saved_raid_disk != mirror)

1406

if (rdev->saved_raid_disk != mirror)

1378

conf->fullsync = 1;

1407

conf->fullsync = 1;

1379

rcu_assign_pointer(p->rdev, rdev);

1408

rcu_assign_pointer(p->rdev, rdev);

1380

break;

1409

break;

1381

}

1410

}

1382

1411

1383

md_integrity_add_rdev(rdev, mddev);

1412

md_integrity_add_rdev(rdev, mddev);

1384

print_conf(conf);

1413

print_conf(conf);

1385

return err;

1414

return err;

1386

}

1415

}

1387

1416

1388

static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)

1417

static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)

1389

{

1418

{

1390

struct r10conf *conf = mddev->private;

1419

struct r10conf *conf = mddev->private;

1391

int err = 0;

1420

int err = 0;

1392

int number = rdev->raid_disk;

1421

int number = rdev->raid_disk;

1393

struct mirror_info *p = conf->mirrors+ number;

1422

struct mirror_info *p = conf->mirrors+ number;

1394

1423

1395

print_conf(conf);

1424

print_conf(conf);

1396

if (rdev == p->rdev) {

1425

if (rdev == p->rdev) {

1397

if (test_bit(In_sync, &rdev->flags) ||

1426

if (test_bit(In_sync, &rdev->flags) ||

1398

atomic_read(&rdev->nr_pending)) {

1427

atomic_read(&rdev->nr_pending)) {

1399

err = -EBUSY;

1428

err = -EBUSY;

1400

goto abort;

1429

goto abort;

1401

}

1430

}

1402

/* Only remove faulty devices in recovery

1431

/* Only remove faulty devices in recovery

1403

* is not possible.

1432

* is not possible.

1404

*/

1433

*/

1405

if (!test_bit(Faulty, &rdev->flags) &&

1434

if (!test_bit(Faulty, &rdev->flags) &&

1406

mddev->recovery_disabled != p->recovery_disabled &&

1435

mddev->recovery_disabled != p->recovery_disabled &&

1407

enough(conf, -1)) {

1436

enough(conf, -1)) {

1408

err = -EBUSY;

1437

err = -EBUSY;

1409

goto abort;

1438

goto abort;

1410

}

1439

}

1411

p->rdev = NULL;

1440

p->rdev = NULL;

1412

synchronize_rcu();

1441

synchronize_rcu();

1413

if (atomic_read(&rdev->nr_pending)) {

1442

if (atomic_read(&rdev->nr_pending)) {

1414

/* lost the race, try later */

1443

/* lost the race, try later */

1415

err = -EBUSY;

1444

err = -EBUSY;

1416

p->rdev = rdev;

1445

p->rdev = rdev;

1417

goto abort;

1446

goto abort;

1418

}

1447

}

1419

err = md_integrity_register(mddev);

1448

err = md_integrity_register(mddev);

1420

}

1449

}

1421

abort:

1450

abort:

1422

1451

1423

print_conf(conf);

1452

print_conf(conf);

1424

return err;

1453

return err;

1425

}

1454

}

1426

1455

1427

1456

1428

static void end_sync_read(struct bio *bio, int error)

1457

static void end_sync_read(struct bio *bio, int error)

1429

{

1458

{

1430

struct r10bio *r10_bio = bio->bi_private;

1459

struct r10bio *r10_bio = bio->bi_private;

1431

struct r10conf *conf = r10_bio->mddev->private;

1460

struct r10conf *conf = r10_bio->mddev->private;

1432

int d;

1461

int d;

1433

1462

1434

d = find_bio_disk(conf, r10_bio, bio, NULL);

1463

d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);

1435

1464

1436

if (test_bit(BIO_UPTODATE, &bio->bi_flags))

1465

if (test_bit(BIO_UPTODATE, &bio->bi_flags))

1437

set_bit(R10BIO_Uptodate, &r10_bio->state);

1466

set_bit(R10BIO_Uptodate, &r10_bio->state);

1438

else

1467

else

1439

/* The write handler will notice the lack of

1468

/* The write handler will notice the lack of

1440

* R10BIO_Uptodate and record any errors etc

1469

* R10BIO_Uptodate and record any errors etc

1441

*/

1470

*/

1442

atomic_add(r10_bio->sectors,

1471

atomic_add(r10_bio->sectors,

1443

&conf->mirrors[d].rdev->corrected_errors);

1472

&conf->mirrors[d].rdev->corrected_errors);

1444

1473

1445

/* for reconstruct, we always reschedule after a read.

1474

/* for reconstruct, we always reschedule after a read.

1446

* for resync, only after all reads

1475

* for resync, only after all reads

1447

*/

1476

*/

1448

rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);

1477

rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);

1449

if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||

1478

if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||

1450

atomic_dec_and_test(&r10_bio->remaining)) {

1479

atomic_dec_and_test(&r10_bio->remaining)) {

1451

/* we have read all the blocks,

1480

/* we have read all the blocks,

1452

* do the comparison in process context in raid10d

1481

* do the comparison in process context in raid10d

1453

*/

1482

*/

1454

reschedule_retry(r10_bio);

1483

reschedule_retry(r10_bio);

1455

}

1484

}

1456

}

1485

}

1457

1486

1458

static void end_sync_request(struct r10bio *r10_bio)

1487

static void end_sync_request(struct r10bio *r10_bio)

1459

{

1488

{

1460

struct mddev *mddev = r10_bio->mddev;

1489

struct mddev *mddev = r10_bio->mddev;

1461

1490

1462

while (atomic_dec_and_test(&r10_bio->remaining)) {

1491

while (atomic_dec_and_test(&r10_bio->remaining)) {

1463

if (r10_bio->master_bio == NULL) {

1492

if (r10_bio->master_bio == NULL) {

1464

/* the primary of several recovery bios */

1493

/* the primary of several recovery bios */

1465

sector_t s = r10_bio->sectors;

1494

sector_t s = r10_bio->sectors;

1466

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

1495

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

1467

test_bit(R10BIO_WriteError, &r10_bio->state))

1496

test_bit(R10BIO_WriteError, &r10_bio->state))

1468

reschedule_retry(r10_bio);

1497

reschedule_retry(r10_bio);

1469

else

1498

else

1470

put_buf(r10_bio);

1499

put_buf(r10_bio);

1471

md_done_sync(mddev, s, 1);

1500

md_done_sync(mddev, s, 1);

1472

break;

1501

break;

1473

} else {

1502

} else {

1474

struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;

1503

struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;

1475

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

1504

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

1476

test_bit(R10BIO_WriteError, &r10_bio->state))

1505

test_bit(R10BIO_WriteError, &r10_bio->state))

1477

reschedule_retry(r10_bio);

1506

reschedule_retry(r10_bio);

1478

else

1507

else

1479

put_buf(r10_bio);

1508

put_buf(r10_bio);

1480

r10_bio = r10_bio2;

1509

r10_bio = r10_bio2;

1481

}

1510

}

1482

}

1511

}

1483

}

1512

}

1484

1513

1485

static void end_sync_write(struct bio *bio, int error)

1514

static void end_sync_write(struct bio *bio, int error)

1486

{

1515

{

1487

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

1516

int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);

1488

struct r10bio *r10_bio = bio->bi_private;

1517

struct r10bio *r10_bio = bio->bi_private;

1489

struct mddev *mddev = r10_bio->mddev;

1518

struct mddev *mddev = r10_bio->mddev;

1490

struct r10conf *conf = mddev->private;

1519

struct r10conf *conf = mddev->private;

1491

int d;

1520

int d;

1492

sector_t first_bad;

1521

sector_t first_bad;

1493

int bad_sectors;

1522

int bad_sectors;

1494

int slot;

1523

int slot;

1495

1524

1496

d = find_bio_disk(conf, r10_bio, bio, &slot);

1525

d = find_bio_disk(conf, r10_bio, bio, &slot, NULL);

1497

1526

1498

if (!uptodate) {

1527

if (!uptodate) {

1499

set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);

1528

set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);

1500

set_bit(R10BIO_WriteError, &r10_bio->state);

1529

set_bit(R10BIO_WriteError, &r10_bio->state);

1501

} else if (is_badblock(conf->mirrors[d].rdev,

1530

} else if (is_badblock(conf->mirrors[d].rdev,

1502

r10_bio->devs[slot].addr,

1531

r10_bio->devs[slot].addr,

1503

r10_bio->sectors,

1532

r10_bio->sectors,

1504

&first_bad, &bad_sectors))

1533

&first_bad, &bad_sectors))

1505

set_bit(R10BIO_MadeGood, &r10_bio->state);

1534

set_bit(R10BIO_MadeGood, &r10_bio->state);

1506

1535

1507

rdev_dec_pending(conf->mirrors[d].rdev, mddev);

1536

rdev_dec_pending(conf->mirrors[d].rdev, mddev);

1508

1537

1509

end_sync_request(r10_bio);

1538

end_sync_request(r10_bio);

1510

}

1539

}

1511

1540

1512

/*

1541

/*

1513

* Note: sync and recover are handled very differently for raid10

1542

* Note: sync and recover are handled very differently for raid10

1514

* This code is for resync.

1543

* This code is for resync.

1515

* For resync, we read through virtual addresses and read all blocks.

1544

* For resync, we read through virtual addresses and read all blocks.

1516

* If there is any error, we schedule a write. The lowest numbered

1545

* If there is any error, we schedule a write. The lowest numbered

1517

* drive is authoritative.

1546

* drive is authoritative.

1518

* However requests come for physical address, so we need to map.

1547

* However requests come for physical address, so we need to map.

1519

* For every physical address there are raid_disks/copies virtual addresses,

1548

* For every physical address there are raid_disks/copies virtual addresses,

1520

* which is always at least one, but is not necessarily an integer.

1549

* which is always at least one, but is not necessarily an integer.

1521

* This means that a physical address can span multiple chunks, so we may

1550

* This means that a physical address can span multiple chunks, so we may

1522

* have to submit multiple io requests for a single sync request.

1551

* have to submit multiple io requests for a single sync request.

1523

*/

1552

*/

1524

/*

1553

/*

1525

* We check if all blocks are in-sync and only write to blocks that

1554

* We check if all blocks are in-sync and only write to blocks that

1526

* aren't in sync

1555

* aren't in sync

1527

*/

1556

*/

1528

static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)

1557

static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)

1529

{

1558

{

1530

struct r10conf *conf = mddev->private;

1559

struct r10conf *conf = mddev->private;

1531

int i, first;

1560

int i, first;

1532

struct bio *tbio, *fbio;

1561

struct bio *tbio, *fbio;

1533

1562

1534

atomic_set(&r10_bio->remaining, 1);

1563

atomic_set(&r10_bio->remaining, 1);

1535

1564

1536

/* find the first device with a block */

1565

/* find the first device with a block */

1537

for (i=0; i<conf->copies; i++)

1566

for (i=0; i<conf->copies; i++)

1538

if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))

1567

if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))

1539

break;

1568

break;

1540

1569

1541

if (i == conf->copies)

1570

if (i == conf->copies)

1542

goto done;

1571

goto done;

1543

1572

1544

first = i;

1573

first = i;

1545

fbio = r10_bio->devs[i].bio;

1574

fbio = r10_bio->devs[i].bio;

1546

1575

1547

/* now find blocks with errors */

1576

/* now find blocks with errors */

1548

for (i=0 ; i < conf->copies ; i++) {

1577

for (i=0 ; i < conf->copies ; i++) {

1549

int j, d;

1578

int j, d;

1550

int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);

1579

int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);

1551

1580

1552

tbio = r10_bio->devs[i].bio;

1581

tbio = r10_bio->devs[i].bio;

1553

1582

1554

if (tbio->bi_end_io != end_sync_read)

1583

if (tbio->bi_end_io != end_sync_read)

1555

continue;

1584

continue;

1556

if (i == first)

1585

if (i == first)

1557

continue;

1586

continue;

1558

if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {

1587

if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {

1559

/* We know that the bi_io_vec layout is the same for

1588

/* We know that the bi_io_vec layout is the same for

1560

* both 'first' and 'i', so we just compare them.

1589

* both 'first' and 'i', so we just compare them.

1561

* All vec entries are PAGE_SIZE;

1590

* All vec entries are PAGE_SIZE;

1562

*/

1591

*/

1563

for (j = 0; j < vcnt; j++)

1592

for (j = 0; j < vcnt; j++)

1564

if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),

1593

if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),

1565

page_address(tbio->bi_io_vec[j].bv_page),

1594

page_address(tbio->bi_io_vec[j].bv_page),

1566

PAGE_SIZE))

1595

PAGE_SIZE))

1567

break;

1596

break;

1568

if (j == vcnt)

1597

if (j == vcnt)

1569

continue;

1598

continue;

1570

mddev->resync_mismatches += r10_bio->sectors;

1599

mddev->resync_mismatches += r10_bio->sectors;

1571

if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))

1600

if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))

1572

/* Don't fix anything. */

1601

/* Don't fix anything. */

1573

continue;

1602

continue;

1574

}

1603

}

1575

/* Ok, we need to write this bio, either to correct an

1604

/* Ok, we need to write this bio, either to correct an

1576

* inconsistency or to correct an unreadable block.

1605

* inconsistency or to correct an unreadable block.

1577

* First we need to fixup bv_offset, bv_len and

1606

* First we need to fixup bv_offset, bv_len and

1578

* bi_vecs, as the read request might have corrupted these

1607

* bi_vecs, as the read request might have corrupted these

1579

*/

1608

*/

1580

tbio->bi_vcnt = vcnt;

1609

tbio->bi_vcnt = vcnt;

1581

tbio->bi_size = r10_bio->sectors << 9;

1610

tbio->bi_size = r10_bio->sectors << 9;

1582

tbio->bi_idx = 0;

1611

tbio->bi_idx = 0;

1583

tbio->bi_phys_segments = 0;

1612

tbio->bi_phys_segments = 0;

1584

tbio->bi_flags &= ~(BIO_POOL_MASK - 1);

1613

tbio->bi_flags &= ~(BIO_POOL_MASK - 1);

1585

tbio->bi_flags |= 1 << BIO_UPTODATE;

1614

tbio->bi_flags |= 1 << BIO_UPTODATE;

1586

tbio->bi_next = NULL;

1615

tbio->bi_next = NULL;

1587

tbio->bi_rw = WRITE;

1616

tbio->bi_rw = WRITE;

1588

tbio->bi_private = r10_bio;

1617

tbio->bi_private = r10_bio;

1589

tbio->bi_sector = r10_bio->devs[i].addr;

1618

tbio->bi_sector = r10_bio->devs[i].addr;

1590

1619

1591

for (j=0; j < vcnt ; j++) {

1620

for (j=0; j < vcnt ; j++) {

1592

tbio->bi_io_vec[j].bv_offset = 0;

1621

tbio->bi_io_vec[j].bv_offset = 0;

1593

tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

1622

tbio->bi_io_vec[j].bv_len = PAGE_SIZE;

1594

1623

1595

memcpy(page_address(tbio->bi_io_vec[j].bv_page),

1624

memcpy(page_address(tbio->bi_io_vec[j].bv_page),

1596

page_address(fbio->bi_io_vec[j].bv_page),

1625

page_address(fbio->bi_io_vec[j].bv_page),

1597

PAGE_SIZE);

1626

PAGE_SIZE);

1598

}

1627

}

1599

tbio->bi_end_io = end_sync_write;

1628

tbio->bi_end_io = end_sync_write;

1600

1629

1601

d = r10_bio->devs[i].devnum;

1630

d = r10_bio->devs[i].devnum;

1602

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

1631

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

1603

atomic_inc(&r10_bio->remaining);

1632

atomic_inc(&r10_bio->remaining);

1604

md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

1633

md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);

1605

1634

1606

tbio->bi_sector += conf->mirrors[d].rdev->data_offset;

1635

tbio->bi_sector += conf->mirrors[d].rdev->data_offset;

1607

tbio->bi_bdev = conf->mirrors[d].rdev->bdev;

1636

tbio->bi_bdev = conf->mirrors[d].rdev->bdev;

1608

generic_make_request(tbio);

1637

generic_make_request(tbio);

1609

}

1638

}

1610

1639

1611

done:

1640

done:

1612

if (atomic_dec_and_test(&r10_bio->remaining)) {

1641

if (atomic_dec_and_test(&r10_bio->remaining)) {

1613

md_done_sync(mddev, r10_bio->sectors, 1);

1642

md_done_sync(mddev, r10_bio->sectors, 1);

1614

put_buf(r10_bio);

1643

put_buf(r10_bio);

1615

}

1644

}

1616

}

1645

}

1617

1646

1618

/*

1647

/*

1619

* Now for the recovery code.

1648

* Now for the recovery code.

1620

* Recovery happens across physical sectors.

1649

* Recovery happens across physical sectors.

1621

* We recover all non-is_sync drives by finding the virtual address of

1650

* We recover all non-is_sync drives by finding the virtual address of

1622

* each, and then choose a working drive that also has that virt address.

1651

* each, and then choose a working drive that also has that virt address.

1623

* There is a separate r10_bio for each non-in_sync drive.

1652

* There is a separate r10_bio for each non-in_sync drive.

1624

* Only the first two slots are in use. The first for reading,

1653

* Only the first two slots are in use. The first for reading,

1625

* The second for writing.

1654

* The second for writing.

1626

*

1655

*

1627

*/

1656

*/

1628

static void fix_recovery_read_error(struct r10bio *r10_bio)

1657

static void fix_recovery_read_error(struct r10bio *r10_bio)

1629

{

1658

{

1630

/* We got a read error during recovery.

1659

/* We got a read error during recovery.

1631

* We repeat the read in smaller page-sized sections.

1660

* We repeat the read in smaller page-sized sections.

1632

* If a read succeeds, write it to the new device or record

1661

* If a read succeeds, write it to the new device or record

1633

* a bad block if we cannot.

1662

* a bad block if we cannot.

1634

* If a read fails, record a bad block on both old and

1663

* If a read fails, record a bad block on both old and

1635

* new devices.

1664

* new devices.

1636

*/

1665

*/

1637

struct mddev *mddev = r10_bio->mddev;

1666

struct mddev *mddev = r10_bio->mddev;

1638

struct r10conf *conf = mddev->private;

1667

struct r10conf *conf = mddev->private;

1639

struct bio *bio = r10_bio->devs[0].bio;

1668

struct bio *bio = r10_bio->devs[0].bio;

1640

sector_t sect = 0;

1669

sector_t sect = 0;

1641

int sectors = r10_bio->sectors;

1670

int sectors = r10_bio->sectors;

1642

int idx = 0;

1671

int idx = 0;

1643

int dr = r10_bio->devs[0].devnum;

1672

int dr = r10_bio->devs[0].devnum;

1644

int dw = r10_bio->devs[1].devnum;

1673

int dw = r10_bio->devs[1].devnum;

1645

1674

1646

while (sectors) {

1675

while (sectors) {

1647

int s = sectors;

1676

int s = sectors;

1648

struct md_rdev *rdev;

1677

struct md_rdev *rdev;

1649

sector_t addr;

1678

sector_t addr;

1650

int ok;

1679

int ok;

1651

1680

1652

if (s > (PAGE_SIZE>>9))

1681

if (s > (PAGE_SIZE>>9))

1653

s = PAGE_SIZE >> 9;

1682

s = PAGE_SIZE >> 9;

1654

1683

1655

rdev = conf->mirrors[dr].rdev;

1684

rdev = conf->mirrors[dr].rdev;

1656

addr = r10_bio->devs[0].addr + sect,

1685

addr = r10_bio->devs[0].addr + sect,

1657

ok = sync_page_io(rdev,

1686

ok = sync_page_io(rdev,

1658

addr,

1687

addr,

1659

s << 9,

1688

s << 9,

1660

bio->bi_io_vec[idx].bv_page,

1689

bio->bi_io_vec[idx].bv_page,

1661

READ, false);

1690

READ, false);

1662

if (ok) {

1691

if (ok) {

1663

rdev = conf->mirrors[dw].rdev;

1692

rdev = conf->mirrors[dw].rdev;

1664

addr = r10_bio->devs[1].addr + sect;

1693

addr = r10_bio->devs[1].addr + sect;

1665

ok = sync_page_io(rdev,

1694

ok = sync_page_io(rdev,

1666

addr,

1695

addr,

1667

s << 9,

1696

s << 9,

1668

bio->bi_io_vec[idx].bv_page,

1697

bio->bi_io_vec[idx].bv_page,

1669

WRITE, false);

1698

WRITE, false);

1670

if (!ok)

1699

if (!ok)

1671

set_bit(WriteErrorSeen, &rdev->flags);

1700

set_bit(WriteErrorSeen, &rdev->flags);

1672

}

1701

}

1673

if (!ok) {

1702

if (!ok) {

1674

/* We don't worry if we cannot set a bad block -

1703

/* We don't worry if we cannot set a bad block -

1675

* it really is bad so there is no loss in not

1704

* it really is bad so there is no loss in not

1676

* recording it yet

1705

* recording it yet

1677

*/

1706

*/

1678

rdev_set_badblocks(rdev, addr, s, 0);

1707

rdev_set_badblocks(rdev, addr, s, 0);

1679

1708

1680

if (rdev != conf->mirrors[dw].rdev) {

1709

if (rdev != conf->mirrors[dw].rdev) {

1681

/* need bad block on destination too */

1710

/* need bad block on destination too */

1682

struct md_rdev *rdev2 = conf->mirrors[dw].rdev;

1711

struct md_rdev *rdev2 = conf->mirrors[dw].rdev;

1683

addr = r10_bio->devs[1].addr + sect;

1712

addr = r10_bio->devs[1].addr + sect;

1684

ok = rdev_set_badblocks(rdev2, addr, s, 0);

1713

ok = rdev_set_badblocks(rdev2, addr, s, 0);

1685

if (!ok) {

1714

if (!ok) {

1686

/* just abort the recovery */

1715

/* just abort the recovery */

1687

printk(KERN_NOTICE

1716

printk(KERN_NOTICE

1688

"md/raid10:%s: recovery aborted"

1717

"md/raid10:%s: recovery aborted"

1689

" due to read error\n",

1718

" due to read error\n",

1690

mdname(mddev));

1719

mdname(mddev));

1691

1720

1692

conf->mirrors[dw].recovery_disabled

1721

conf->mirrors[dw].recovery_disabled

1693

= mddev->recovery_disabled;

1722

= mddev->recovery_disabled;

1694

set_bit(MD_RECOVERY_INTR,

1723

set_bit(MD_RECOVERY_INTR,

1695

&mddev->recovery);

1724

&mddev->recovery);

1696

break;

1725

break;

1697

}

1726

}

1698

}

1727

}

1699

}

1728

}

1700

1729

1701

sectors -= s;

1730

sectors -= s;

1702

sect += s;

1731

sect += s;

1703

idx++;

1732

idx++;

1704

}

1733

}

1705

}

1734

}

1706

1735

1707

static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)

1736

static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)

1708

{

1737

{

1709

struct r10conf *conf = mddev->private;

1738

struct r10conf *conf = mddev->private;

1710

int d;

1739

int d;

1711

struct bio *wbio;

1740

struct bio *wbio;

1712

1741

1713

if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {

1742

if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {

1714

fix_recovery_read_error(r10_bio);

1743

fix_recovery_read_error(r10_bio);

1715

end_sync_request(r10_bio);

1744

end_sync_request(r10_bio);

1716

return;

1745

return;

1717

}

1746

}

1718

1747

1719

/*

1748

/*

1720

* share the pages with the first bio

1749

* share the pages with the first bio

1721

* and submit the write request

1750

* and submit the write request

1722

*/

1751

*/

1723

wbio = r10_bio->devs[1].bio;

1752

wbio = r10_bio->devs[1].bio;

1724

d = r10_bio->devs[1].devnum;

1753

d = r10_bio->devs[1].devnum;

1725

1754

1726

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

1755

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

1727

md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);

1756

md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);

1728

generic_make_request(wbio);

1757

generic_make_request(wbio);

1729

}

1758

}

1730

1759

1731

1760

1732

/*

1761

/*

1733

* Used by fix_read_error() to decay the per rdev read_errors.

1762

* Used by fix_read_error() to decay the per rdev read_errors.

1734

* We halve the read error count for every hour that has elapsed

1763

* We halve the read error count for every hour that has elapsed

1735

* since the last recorded read error.

1764

* since the last recorded read error.

1736

*

1765

*

1737

*/

1766

*/

1738

static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)

1767

static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)

1739

{

1768

{

1740

struct timespec cur_time_mon;

1769

struct timespec cur_time_mon;

1741

unsigned long hours_since_last;

1770

unsigned long hours_since_last;

1742

unsigned int read_errors = atomic_read(&rdev->read_errors);

1771

unsigned int read_errors = atomic_read(&rdev->read_errors);

1743

1772

1744

ktime_get_ts(&cur_time_mon);

1773

ktime_get_ts(&cur_time_mon);

1745

1774

1746

if (rdev->last_read_error.tv_sec == 0 &&

1775

if (rdev->last_read_error.tv_sec == 0 &&

1747

rdev->last_read_error.tv_nsec == 0) {

1776

rdev->last_read_error.tv_nsec == 0) {

1748

/* first time we've seen a read error */

1777

/* first time we've seen a read error */

1749

rdev->last_read_error = cur_time_mon;

1778

rdev->last_read_error = cur_time_mon;

1750

return;

1779

return;

1751

}

1780

}

1752

1781

1753

hours_since_last = (cur_time_mon.tv_sec -

1782

hours_since_last = (cur_time_mon.tv_sec -

1754

rdev->last_read_error.tv_sec) / 3600;

1783

rdev->last_read_error.tv_sec) / 3600;

1755

1784

1756

rdev->last_read_error = cur_time_mon;

1785

rdev->last_read_error = cur_time_mon;

1757

1786

1758

/*

1787

/*

1759

* if hours_since_last is > the number of bits in read_errors

1788

* if hours_since_last is > the number of bits in read_errors

1760

* just set read errors to 0. We do this to avoid

1789

* just set read errors to 0. We do this to avoid

1761

* overflowing the shift of read_errors by hours_since_last.

1790

* overflowing the shift of read_errors by hours_since_last.

1762

*/

1791

*/

1763

if (hours_since_last >= 8 * sizeof(read_errors))

1792

if (hours_since_last >= 8 * sizeof(read_errors))

1764

atomic_set(&rdev->read_errors, 0);

1793

atomic_set(&rdev->read_errors, 0);

1765

else

1794

else

1766

atomic_set(&rdev->read_errors, read_errors >> hours_since_last);

1795

atomic_set(&rdev->read_errors, read_errors >> hours_since_last);

1767

}

1796

}

1768

1797

1769

/*
 * Synchronous single-page I/O that takes the bad-block list into
 * account.
 *
 * @rdev:    device to access
 * @sector:  start sector, passed unchanged to sync_page_io()
 * @sectors: length of the transfer in sectors (not bytes)
 * @page:    data page
 * @rw:      READ or WRITE
 *
 * Returns -1 if the range contains a known bad block and the I/O was
 * therefore not attempted (always for reads; for writes only once the
 * device has WriteErrorSeen set), 1 if the I/O succeeded, and 0 if it
 * failed.  On failure the range is recorded as bad; if that cannot be
 * done the device is failed with md_error().
 */
static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
			    int sectors, struct page *page, int rw)
{
	sector_t first_bad;
	int bad_sectors;

	if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)) {
		if (rw == READ)
			return -1;
		if (test_bit(WriteErrorSeen, &rdev->flags))
			return -1;
	}

	if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) {
		/* success */
		return 1;
	}

	if (rw == WRITE)
		set_bit(WriteErrorSeen, &rdev->flags);

	/* need to record an error - either for the block or the device */
	if (!rdev_set_badblocks(rdev, sector, sectors, 0))
		md_error(rdev->mddev, rdev);

	return 0;
}

1788

1817

1789

/*

1818

/*

1790

* This is a kernel thread which:

1819

* This is a kernel thread which:

1791

*

1820

*

1792

* 1. Retries failed read operations on working mirrors.

1821

* 1. Retries failed read operations on working mirrors.

1793

* 2. Updates the raid superblock when problems are encountered.

1822

* 2. Updates the raid superblock when problems are encountered.

1794

* 3. Performs writes following reads for array synchronising.

1823

* 3. Performs writes following reads for array synchronising.

1795

*/

1824

*/

1796

1825

1797

static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)

1826

static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)

1798

{

1827

{

1799

int sect = 0; /* Offset from r10_bio->sector */

1828

int sect = 0; /* Offset from r10_bio->sector */

1800

int sectors = r10_bio->sectors;

1829

int sectors = r10_bio->sectors;

1801

struct md_rdev*rdev;

1830

struct md_rdev*rdev;

1802

int max_read_errors = atomic_read(&mddev->max_corr_read_errors);

1831

int max_read_errors = atomic_read(&mddev->max_corr_read_errors);

1803

int d = r10_bio->devs[r10_bio->read_slot].devnum;

1832

int d = r10_bio->devs[r10_bio->read_slot].devnum;

1804

1833

1805

/* still own a reference to this rdev, so it cannot

1834

/* still own a reference to this rdev, so it cannot

1806

* have been cleared recently.

1835

* have been cleared recently.

1807

*/

1836

*/

1808

rdev = conf->mirrors[d].rdev;

1837

rdev = conf->mirrors[d].rdev;

1809

1838

1810

if (test_bit(Faulty, &rdev->flags))

1839

if (test_bit(Faulty, &rdev->flags))

1811

/* drive has already been failed, just ignore any

1840

/* drive has already been failed, just ignore any

1812

more fix_read_error() attempts */

1841

more fix_read_error() attempts */

1813

return;

1842

return;

1814

1843

1815

check_decay_read_errors(mddev, rdev);

1844

check_decay_read_errors(mddev, rdev);

1816

atomic_inc(&rdev->read_errors);

1845

atomic_inc(&rdev->read_errors);

1817

if (atomic_read(&rdev->read_errors) > max_read_errors) {

1846

if (atomic_read(&rdev->read_errors) > max_read_errors) {

1818

char b[BDEVNAME_SIZE];

1847

char b[BDEVNAME_SIZE];

1819

bdevname(rdev->bdev, b);

1848

bdevname(rdev->bdev, b);

1820

1849

1821

printk(KERN_NOTICE

1850

printk(KERN_NOTICE

1822

"md/raid10:%s: %s: Raid device exceeded "

1851

"md/raid10:%s: %s: Raid device exceeded "

1823

"read_error threshold [cur %d:max %d]\n",

1852

"read_error threshold [cur %d:max %d]\n",

1824

mdname(mddev), b,

1853

mdname(mddev), b,

1825

atomic_read(&rdev->read_errors), max_read_errors);

1854

atomic_read(&rdev->read_errors), max_read_errors);

1826

printk(KERN_NOTICE

1855

printk(KERN_NOTICE

1827

"md/raid10:%s: %s: Failing raid device\n",

1856

"md/raid10:%s: %s: Failing raid device\n",

1828

mdname(mddev), b);

1857

mdname(mddev), b);

1829

md_error(mddev, conf->mirrors[d].rdev);

1858

md_error(mddev, conf->mirrors[d].rdev);

1830

return;

1859

return;

1831

}

1860

}

1832

1861

1833

while(sectors) {

1862

while(sectors) {

1834

int s = sectors;

1863

int s = sectors;

1835

int sl = r10_bio->read_slot;

1864

int sl = r10_bio->read_slot;

1836

int success = 0;

1865

int success = 0;

1837

int start;

1866

int start;

1838

1867

1839

if (s > (PAGE_SIZE>>9))

1868

if (s > (PAGE_SIZE>>9))

1840

s = PAGE_SIZE >> 9;

1869

s = PAGE_SIZE >> 9;

1841

1870

1842

rcu_read_lock();

1871

rcu_read_lock();

1843

do {

1872

do {

1844

sector_t first_bad;

1873

sector_t first_bad;

1845

int bad_sectors;

1874

int bad_sectors;

1846

1875

1847

d = r10_bio->devs[sl].devnum;

1876

d = r10_bio->devs[sl].devnum;

1848

rdev = rcu_dereference(conf->mirrors[d].rdev);

1877

rdev = rcu_dereference(conf->mirrors[d].rdev);

1849

if (rdev &&

1878

if (rdev &&

1850

test_bit(In_sync, &rdev->flags) &&

1879

test_bit(In_sync, &rdev->flags) &&

1851

is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,

1880

is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,

1852

&first_bad, &bad_sectors) == 0) {

1881

&first_bad, &bad_sectors) == 0) {

1853

atomic_inc(&rdev->nr_pending);

1882

atomic_inc(&rdev->nr_pending);

1854

rcu_read_unlock();

1883

rcu_read_unlock();

1855

success = sync_page_io(rdev,

1884

success = sync_page_io(rdev,

1856

r10_bio->devs[sl].addr +

1885

r10_bio->devs[sl].addr +

1857

sect,

1886

sect,

1858

s<<9,

1887

s<<9,

1859

conf->tmppage, READ, false);

1888

conf->tmppage, READ, false);

1860

rdev_dec_pending(rdev, mddev);

1889

rdev_dec_pending(rdev, mddev);

1861

rcu_read_lock();

1890

rcu_read_lock();

1862

if (success)

1891

if (success)

1863

break;

1892

break;

1864

}

1893

}

1865

sl++;

1894

sl++;

1866

if (sl == conf->copies)

1895

if (sl == conf->copies)

1867

sl = 0;

1896

sl = 0;

1868

} while (!success && sl != r10_bio->read_slot);

1897

} while (!success && sl != r10_bio->read_slot);

1869

rcu_read_unlock();

1898

rcu_read_unlock();

1870

1899

1871

if (!success) {

1900

if (!success) {

1872

/* Cannot read from anywhere, just mark the block

1901

/* Cannot read from anywhere, just mark the block

1873

* as bad on the first device to discourage future

1902

* as bad on the first device to discourage future

1874

* reads.

1903

* reads.

1875

*/

1904

*/

1876

int dn = r10_bio->devs[r10_bio->read_slot].devnum;

1905

int dn = r10_bio->devs[r10_bio->read_slot].devnum;

1877

rdev = conf->mirrors[dn].rdev;

1906

rdev = conf->mirrors[dn].rdev;

1878

1907

1879

if (!rdev_set_badblocks(

1908

if (!rdev_set_badblocks(

1880

rdev,

1909

rdev,

1881

r10_bio->devs[r10_bio->read_slot].addr

1910

r10_bio->devs[r10_bio->read_slot].addr

1882

+ sect,

1911

+ sect,

1883

s, 0))

1912

s, 0))

1884

md_error(mddev, rdev);

1913

md_error(mddev, rdev);

1885

break;

1914

break;

1886

}

1915

}

1887

1916

1888

start = sl;

1917

start = sl;

1889

/* write it back and re-read */

1918

/* write it back and re-read */

1890

rcu_read_lock();

1919

rcu_read_lock();

1891

while (sl != r10_bio->read_slot) {

1920

while (sl != r10_bio->read_slot) {

1892

char b[BDEVNAME_SIZE];

1921

char b[BDEVNAME_SIZE];

1893

1922

1894

if (sl==0)

1923

if (sl==0)

1895

sl = conf->copies;

1924

sl = conf->copies;

1896

sl--;

1925

sl--;

1897

d = r10_bio->devs[sl].devnum;

1926

d = r10_bio->devs[sl].devnum;

1898

rdev = rcu_dereference(conf->mirrors[d].rdev);

1927

rdev = rcu_dereference(conf->mirrors[d].rdev);

1899

if (!rdev ||

1928

if (!rdev ||

1900

!test_bit(In_sync, &rdev->flags))

1929

!test_bit(In_sync, &rdev->flags))

1901

continue;

1930

continue;

1902

1931

1903

atomic_inc(&rdev->nr_pending);

1932

atomic_inc(&rdev->nr_pending);

1904

rcu_read_unlock();

1933

rcu_read_unlock();

1905

if (r10_sync_page_io(rdev,

1934

if (r10_sync_page_io(rdev,

1906

r10_bio->devs[sl].addr +

1935

r10_bio->devs[sl].addr +

1907

sect,

1936

sect,

1908

s<<9, conf->tmppage, WRITE)

1937

s<<9, conf->tmppage, WRITE)

1909

== 0) {

1938

== 0) {

1910

/* Well, this device is dead */

1939

/* Well, this device is dead */

1911

printk(KERN_NOTICE

1940

printk(KERN_NOTICE

1912

"md/raid10:%s: read correction "

1941

"md/raid10:%s: read correction "

1913

"write failed"

1942

"write failed"

1914

" (%d sectors at %llu on %s)\n",

1943

" (%d sectors at %llu on %s)\n",

1915

mdname(mddev), s,

1944

mdname(mddev), s,

1916

(unsigned long long)(

1945

(unsigned long long)(

1917

sect + rdev->data_offset),

1946

sect + rdev->data_offset),

1918

bdevname(rdev->bdev, b));

1947

bdevname(rdev->bdev, b));

1919

printk(KERN_NOTICE "md/raid10:%s: %s: failing "

1948

printk(KERN_NOTICE "md/raid10:%s: %s: failing "

1920

"drive\n",

1949

"drive\n",

1921

mdname(mddev),

1950

mdname(mddev),

1922

bdevname(rdev->bdev, b));

1951

bdevname(rdev->bdev, b));

1923

}

1952

}

1924

rdev_dec_pending(rdev, mddev);

1953

rdev_dec_pending(rdev, mddev);

1925

rcu_read_lock();

1954

rcu_read_lock();

1926

}

1955

}

1927

sl = start;

1956

sl = start;

1928

while (sl != r10_bio->read_slot) {

1957

while (sl != r10_bio->read_slot) {

1929

char b[BDEVNAME_SIZE];

1958

char b[BDEVNAME_SIZE];

1930

1959

1931

if (sl==0)

1960

if (sl==0)

1932

sl = conf->copies;

1961

sl = conf->copies;

1933

sl--;

1962

sl--;

1934

d = r10_bio->devs[sl].devnum;

1963

d = r10_bio->devs[sl].devnum;

1935

rdev = rcu_dereference(conf->mirrors[d].rdev);

1964

rdev = rcu_dereference(conf->mirrors[d].rdev);

1936

if (!rdev ||

1965

if (!rdev ||

1937

!test_bit(In_sync, &rdev->flags))

1966

!test_bit(In_sync, &rdev->flags))

1938

continue;

1967

continue;

1939

1968

1940

atomic_inc(&rdev->nr_pending);

1969

atomic_inc(&rdev->nr_pending);

1941

rcu_read_unlock();

1970

rcu_read_unlock();

1942

switch (r10_sync_page_io(rdev,

1971

switch (r10_sync_page_io(rdev,

1943

r10_bio->devs[sl].addr +

1972

r10_bio->devs[sl].addr +

1944

sect,

1973

sect,

1945

s<<9, conf->tmppage,

1974

s<<9, conf->tmppage,

1946

READ)) {

1975

READ)) {

1947

case 0:

1976

case 0:

1948

/* Well, this device is dead */

1977

/* Well, this device is dead */

1949

printk(KERN_NOTICE

1978

printk(KERN_NOTICE

1950

"md/raid10:%s: unable to read back "

1979

"md/raid10:%s: unable to read back "

1951

"corrected sectors"

1980

"corrected sectors"

1952

" (%d sectors at %llu on %s)\n",

1981

" (%d sectors at %llu on %s)\n",

1953

mdname(mddev), s,

1982

mdname(mddev), s,

1954

(unsigned long long)(

1983

(unsigned long long)(

1955

sect + rdev->data_offset),

1984

sect + rdev->data_offset),

1956

bdevname(rdev->bdev, b));

1985

bdevname(rdev->bdev, b));

1957

printk(KERN_NOTICE "md/raid10:%s: %s: failing "

1986

printk(KERN_NOTICE "md/raid10:%s: %s: failing "

1958

"drive\n",

1987

"drive\n",

1959

mdname(mddev),

1988

mdname(mddev),

1960

bdevname(rdev->bdev, b));

1989

bdevname(rdev->bdev, b));

1961

break;

1990

break;

1962

case 1:

1991

case 1:

1963

printk(KERN_INFO

1992

printk(KERN_INFO

1964

"md/raid10:%s: read error corrected"

1993

"md/raid10:%s: read error corrected"

1965

" (%d sectors at %llu on %s)\n",

1994

" (%d sectors at %llu on %s)\n",

1966

mdname(mddev), s,

1995

mdname(mddev), s,

1967

(unsigned long long)(

1996

(unsigned long long)(

1968

sect + rdev->data_offset),

1997

sect + rdev->data_offset),

1969

bdevname(rdev->bdev, b));

1998

bdevname(rdev->bdev, b));

1970

atomic_add(s, &rdev->corrected_errors);

1999

atomic_add(s, &rdev->corrected_errors);

1971

}

2000

}

1972

2001

1973

rdev_dec_pending(rdev, mddev);

2002

rdev_dec_pending(rdev, mddev);

1974

rcu_read_lock();

2003

rcu_read_lock();

1975

}

2004

}

1976

rcu_read_unlock();

2005

rcu_read_unlock();

1977

2006

1978

sectors -= s;

2007

sectors -= s;

1979

sect += s;

2008

sect += s;

1980

}

2009

}

1981

}

2010

}

1982

2011

1983

static void bi_complete(struct bio *bio, int error)

2012

static void bi_complete(struct bio *bio, int error)

1984

{

2013

{

1985

complete((struct completion *)bio->bi_private);

2014

complete((struct completion *)bio->bi_private);

1986

}

2015

}

1987

2016

1988

static int submit_bio_wait(int rw, struct bio *bio)

2017

static int submit_bio_wait(int rw, struct bio *bio)

1989

{

2018

{

1990

struct completion event;

2019

struct completion event;

1991

rw |= REQ_SYNC;

2020

rw |= REQ_SYNC;

1992

2021

1993

init_completion(&event);

2022

init_completion(&event);

1994

bio->bi_private = &event;

2023

bio->bi_private = &event;

1995

bio->bi_end_io = bi_complete;

2024

bio->bi_end_io = bi_complete;

1996

submit_bio(rw, bio);

2025

submit_bio(rw, bio);

1997

wait_for_completion(&event);

2026

wait_for_completion(&event);

1998

2027

1999

return test_bit(BIO_UPTODATE, &bio->bi_flags);

2028

return test_bit(BIO_UPTODATE, &bio->bi_flags);

2000

}

2029

}

2001

2030

2002

static int narrow_write_error(struct r10bio *r10_bio, int i)

2031

static int narrow_write_error(struct r10bio *r10_bio, int i)

2003

{

2032

{

2004

struct bio *bio = r10_bio->master_bio;

2033

struct bio *bio = r10_bio->master_bio;

2005

struct mddev *mddev = r10_bio->mddev;

2034

struct mddev *mddev = r10_bio->mddev;

2006

struct r10conf *conf = mddev->private;

2035

struct r10conf *conf = mddev->private;

2007

struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;

2036

struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;

2008

/* bio has the data to be written to slot 'i' where

2037

/* bio has the data to be written to slot 'i' where

2009

* we just recently had a write error.

2038

* we just recently had a write error.

2010

* We repeatedly clone the bio and trim down to one block,

2039

* We repeatedly clone the bio and trim down to one block,

2011

* then try the write. Where the write fails we record

2040

* then try the write. Where the write fails we record

2012

* a bad block.

2041

* a bad block.

2013

* It is conceivable that the bio doesn't exactly align with

2042

* It is conceivable that the bio doesn't exactly align with

2014

* blocks. We must handle this.

2043

* blocks. We must handle this.

2015

*

2044

*

2016

* We currently own a reference to the rdev.

2045

* We currently own a reference to the rdev.

2017

*/

2046

*/

2018

2047

2019

int block_sectors;

2048

int block_sectors;

2020

sector_t sector;

2049

sector_t sector;

2021

int sectors;

2050

int sectors;

2022

int sect_to_write = r10_bio->sectors;

2051

int sect_to_write = r10_bio->sectors;

2023

int ok = 1;

2052

int ok = 1;

2024

2053

2025

if (rdev->badblocks.shift < 0)

2054

if (rdev->badblocks.shift < 0)

2026

return 0;

2055

return 0;

2027

2056

2028

block_sectors = 1 << rdev->badblocks.shift;

2057

block_sectors = 1 << rdev->badblocks.shift;

2029

sector = r10_bio->sector;

2058

sector = r10_bio->sector;

2030

sectors = ((r10_bio->sector + block_sectors)

2059

sectors = ((r10_bio->sector + block_sectors)

2031

& ~(sector_t)(block_sectors - 1))

2060

& ~(sector_t)(block_sectors - 1))

2032

- sector;

2061

- sector;

2033

2062

2034

while (sect_to_write) {

2063

while (sect_to_write) {

2035

struct bio *wbio;

2064

struct bio *wbio;

2036

if (sectors > sect_to_write)

2065

if (sectors > sect_to_write)

2037

sectors = sect_to_write;

2066

sectors = sect_to_write;

2038

/* Write at 'sector' for 'sectors' */

2067

/* Write at 'sector' for 'sectors' */

2039

wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);

2068

wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);

2040

md_trim_bio(wbio, sector - bio->bi_sector, sectors);

2069

md_trim_bio(wbio, sector - bio->bi_sector, sectors);

2041

wbio->bi_sector = (r10_bio->devs[i].addr+

2070

wbio->bi_sector = (r10_bio->devs[i].addr+

2042

rdev->data_offset+

2071

rdev->data_offset+

2043

(sector - r10_bio->sector));

2072

(sector - r10_bio->sector));

2044

wbio->bi_bdev = rdev->bdev;

2073

wbio->bi_bdev = rdev->bdev;

2045

if (submit_bio_wait(WRITE, wbio) == 0)

2074

if (submit_bio_wait(WRITE, wbio) == 0)

2046

/* Failure! */

2075

/* Failure! */

2047

ok = rdev_set_badblocks(rdev, sector,

2076

ok = rdev_set_badblocks(rdev, sector,

2048

sectors, 0)

2077

sectors, 0)

2049

&& ok;

2078

&& ok;

2050

2079

2051

bio_put(wbio);

2080

bio_put(wbio);

2052

sect_to_write -= sectors;

2081

sect_to_write -= sectors;

2053

sector += sectors;

2082

sector += sectors;

2054

sectors = block_sectors;

2083

sectors = block_sectors;

2055

}

2084

}

2056

return ok;

2085

return ok;

2057

}

2086

}

2058

2087

2059

static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)

2088

static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)

2060

{

2089

{

2061

int slot = r10_bio->read_slot;

2090

int slot = r10_bio->read_slot;

2062

int mirror = r10_bio->devs[slot].devnum;

2091

int mirror = r10_bio->devs[slot].devnum;

2063

struct bio *bio;

2092

struct bio *bio;

2064

struct r10conf *conf = mddev->private;

2093

struct r10conf *conf = mddev->private;

2065

struct md_rdev *rdev;

2094

struct md_rdev *rdev;

2066

char b[BDEVNAME_SIZE];

2095

char b[BDEVNAME_SIZE];

2067

unsigned long do_sync;

2096

unsigned long do_sync;

2068

int max_sectors;

2097

int max_sectors;

2069

2098

2070

/* we got a read error. Maybe the drive is bad. Maybe just

2099

/* we got a read error. Maybe the drive is bad. Maybe just

2071

* the block and we can fix it.

2100

* the block and we can fix it.

2072

* We freeze all other IO, and try reading the block from

2101

* We freeze all other IO, and try reading the block from

2073

* other devices. When we find one, we re-write

2102

* other devices. When we find one, we re-write

2074

* and check it that fixes the read error.

2103

* and check it that fixes the read error.

2075

* This is all done synchronously while the array is

2104

* This is all done synchronously while the array is

2076

* frozen.

2105

* frozen.

2077

*/

2106

*/

2078

if (mddev->ro == 0) {

2107

if (mddev->ro == 0) {

2079

freeze_array(conf);

2108

freeze_array(conf);

2080

fix_read_error(conf, mddev, r10_bio);

2109

fix_read_error(conf, mddev, r10_bio);

2081

unfreeze_array(conf);

2110

unfreeze_array(conf);

2082

}

2111

}

2083

rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);

2112

rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);

2084

2113

2085

bio = r10_bio->devs[slot].bio;

2114

bio = r10_bio->devs[slot].bio;

2086

bdevname(bio->bi_bdev, b);

2115

bdevname(bio->bi_bdev, b);

2087

r10_bio->devs[slot].bio =

2116

r10_bio->devs[slot].bio =

2088

mddev->ro ? IO_BLOCKED : NULL;

2117

mddev->ro ? IO_BLOCKED : NULL;

2089

read_more:

2118

read_more:

2090

mirror = read_balance(conf, r10_bio, &max_sectors);

2119

mirror = read_balance(conf, r10_bio, &max_sectors);

2091

if (mirror == -1) {

2120

if (mirror == -1) {

2092

printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"

2121

printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"

2093

" read error for block %llu\n",

2122

" read error for block %llu\n",

2094

mdname(mddev), b,

2123

mdname(mddev), b,

2095

(unsigned long long)r10_bio->sector);

2124

(unsigned long long)r10_bio->sector);

2096

raid_end_bio_io(r10_bio);

2125

raid_end_bio_io(r10_bio);

2097

bio_put(bio);

2126

bio_put(bio);

2098

return;

2127

return;

2099

}

2128

}

2100

2129

2101

do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);

2130

do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);

2102

if (bio)

2131

if (bio)

2103

bio_put(bio);

2132

bio_put(bio);

2104

slot = r10_bio->read_slot;

2133

slot = r10_bio->read_slot;

2105

rdev = conf->mirrors[mirror].rdev;

2134

rdev = conf->mirrors[mirror].rdev;

2106

printk_ratelimited(

2135

printk_ratelimited(

2107

KERN_ERR

2136

KERN_ERR

2108

"md/raid10:%s: %s: redirecting"

2137

"md/raid10:%s: %s: redirecting"

2109

"sector %llu to another mirror\n",

2138

"sector %llu to another mirror\n",

2110

mdname(mddev),

2139

mdname(mddev),

2111

bdevname(rdev->bdev, b),

2140

bdevname(rdev->bdev, b),

2112

(unsigned long long)r10_bio->sector);

2141

(unsigned long long)r10_bio->sector);

2113

bio = bio_clone_mddev(r10_bio->master_bio,

2142

bio = bio_clone_mddev(r10_bio->master_bio,

2114

GFP_NOIO, mddev);

2143

GFP_NOIO, mddev);

2115

md_trim_bio(bio,

2144

md_trim_bio(bio,

2116

r10_bio->sector - bio->bi_sector,

2145

r10_bio->sector - bio->bi_sector,

2117

max_sectors);

2146

max_sectors);

2118

r10_bio->devs[slot].bio = bio;

2147

r10_bio->devs[slot].bio = bio;

2119

bio->bi_sector = r10_bio->devs[slot].addr

2148

bio->bi_sector = r10_bio->devs[slot].addr

2120

+ rdev->data_offset;

2149

+ rdev->data_offset;

2121

bio->bi_bdev = rdev->bdev;

2150

bio->bi_bdev = rdev->bdev;

2122

bio->bi_rw = READ | do_sync;

2151

bio->bi_rw = READ | do_sync;

2123

bio->bi_private = r10_bio;

2152

bio->bi_private = r10_bio;

2124

bio->bi_end_io = raid10_end_read_request;

2153

bio->bi_end_io = raid10_end_read_request;

2125

if (max_sectors < r10_bio->sectors) {

2154

if (max_sectors < r10_bio->sectors) {

2126

/* Drat - have to split this up more */

2155

/* Drat - have to split this up more */

2127

struct bio *mbio = r10_bio->master_bio;

2156

struct bio *mbio = r10_bio->master_bio;

2128

int sectors_handled =

2157

int sectors_handled =

2129

r10_bio->sector + max_sectors

2158

r10_bio->sector + max_sectors

2130

- mbio->bi_sector;

2159

- mbio->bi_sector;

2131

r10_bio->sectors = max_sectors;

2160

r10_bio->sectors = max_sectors;

2132

spin_lock_irq(&conf->device_lock);

2161

spin_lock_irq(&conf->device_lock);

2133

if (mbio->bi_phys_segments == 0)

2162

if (mbio->bi_phys_segments == 0)

2134

mbio->bi_phys_segments = 2;

2163

mbio->bi_phys_segments = 2;

2135

else

2164

else

2136

mbio->bi_phys_segments++;

2165

mbio->bi_phys_segments++;

2137

spin_unlock_irq(&conf->device_lock);

2166

spin_unlock_irq(&conf->device_lock);

2138

generic_make_request(bio);

2167

generic_make_request(bio);

2139

bio = NULL;

2168

bio = NULL;

2140

2169

2141

r10_bio = mempool_alloc(conf->r10bio_pool,

2170

r10_bio = mempool_alloc(conf->r10bio_pool,

2142

GFP_NOIO);

2171

GFP_NOIO);

2143

r10_bio->master_bio = mbio;

2172

r10_bio->master_bio = mbio;

2144

r10_bio->sectors = (mbio->bi_size >> 9)

2173

r10_bio->sectors = (mbio->bi_size >> 9)

2145

- sectors_handled;

2174

- sectors_handled;

2146

r10_bio->state = 0;

2175

r10_bio->state = 0;

2147

set_bit(R10BIO_ReadError,

2176

set_bit(R10BIO_ReadError,

2148

&r10_bio->state);

2177

&r10_bio->state);

2149

r10_bio->mddev = mddev;

2178

r10_bio->mddev = mddev;

2150

r10_bio->sector = mbio->bi_sector

2179

r10_bio->sector = mbio->bi_sector

2151

+ sectors_handled;

2180

+ sectors_handled;

2152

2181

2153

goto read_more;

2182

goto read_more;

2154

} else

2183

} else

2155

generic_make_request(bio);

2184

generic_make_request(bio);

2156

}

2185

}

2157

2186

2158

static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)

2187

static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)

2159

{

2188

{

2160

/* Some sort of write request has finished and it

2189

/* Some sort of write request has finished and it

2161

* succeeded in writing where we thought there was a

2190

* succeeded in writing where we thought there was a

2162

* bad block. So forget the bad block.

2191

* bad block. So forget the bad block.

2163

* Or possibly if failed and we need to record

2192

* Or possibly if failed and we need to record

2164

* a bad block.

2193

* a bad block.

2165

*/

2194

*/

2166

int m;

2195

int m;

2167

struct md_rdev *rdev;

2196

struct md_rdev *rdev;

2168

2197

2169

if (test_bit(R10BIO_IsSync, &r10_bio->state) ||

2198

if (test_bit(R10BIO_IsSync, &r10_bio->state) ||

2170

test_bit(R10BIO_IsRecover, &r10_bio->state)) {

2199

test_bit(R10BIO_IsRecover, &r10_bio->state)) {

2171

for (m = 0; m < conf->copies; m++) {

2200

for (m = 0; m < conf->copies; m++) {

2172

int dev = r10_bio->devs[m].devnum;

2201

int dev = r10_bio->devs[m].devnum;

2173

rdev = conf->mirrors[dev].rdev;

2202

rdev = conf->mirrors[dev].rdev;

2174

if (r10_bio->devs[m].bio == NULL)

2203

if (r10_bio->devs[m].bio == NULL)

2175

continue;

2204

continue;

2176

if (test_bit(BIO_UPTODATE,

2205

if (test_bit(BIO_UPTODATE,

2177

&r10_bio->devs[m].bio->bi_flags)) {

2206

&r10_bio->devs[m].bio->bi_flags)) {

2178

rdev_clear_badblocks(

2207

rdev_clear_badblocks(

2179

rdev,

2208

rdev,

2180

r10_bio->devs[m].addr,

2209

r10_bio->devs[m].addr,

2181

r10_bio->sectors);

2210

r10_bio->sectors);

2182

} else {

2211

} else {

2183

if (!rdev_set_badblocks(

2212

if (!rdev_set_badblocks(

2184

rdev,

2213

rdev,

2185

r10_bio->devs[m].addr,

2214

r10_bio->devs[m].addr,

2186

r10_bio->sectors, 0))

2215

r10_bio->sectors, 0))

2187

md_error(conf->mddev, rdev);

2216

md_error(conf->mddev, rdev);

2188

}

2217

}

2189

}

2218

}

2190

put_buf(r10_bio);

2219

put_buf(r10_bio);

2191

} else {

2220

} else {

2192

for (m = 0; m < conf->copies; m++) {

2221

for (m = 0; m < conf->copies; m++) {

2193

int dev = r10_bio->devs[m].devnum;

2222

int dev = r10_bio->devs[m].devnum;

2194

struct bio *bio = r10_bio->devs[m].bio;

2223

struct bio *bio = r10_bio->devs[m].bio;

2195

rdev = conf->mirrors[dev].rdev;

2224

rdev = conf->mirrors[dev].rdev;

2196

if (bio == IO_MADE_GOOD) {

2225

if (bio == IO_MADE_GOOD) {

2197

rdev_clear_badblocks(

2226

rdev_clear_badblocks(

2198

rdev,

2227

rdev,

2199

r10_bio->devs[m].addr,

2228

r10_bio->devs[m].addr,

2200

r10_bio->sectors);

2229

r10_bio->sectors);

2201

rdev_dec_pending(rdev, conf->mddev);

2230

rdev_dec_pending(rdev, conf->mddev);

2202

} else if (bio != NULL &&

2231

} else if (bio != NULL &&

2203

!test_bit(BIO_UPTODATE, &bio->bi_flags)) {

2232

!test_bit(BIO_UPTODATE, &bio->bi_flags)) {

2204

if (!narrow_write_error(r10_bio, m)) {

2233

if (!narrow_write_error(r10_bio, m)) {

2205

md_error(conf->mddev, rdev);

2234

md_error(conf->mddev, rdev);

2206

set_bit(R10BIO_Degraded,

2235

set_bit(R10BIO_Degraded,

2207

&r10_bio->state);

2236

&r10_bio->state);

2208

}

2237

}

2209

rdev_dec_pending(rdev, conf->mddev);

2238

rdev_dec_pending(rdev, conf->mddev);

2210

}

2239

}

2211

}

2240

}

2212

if (test_bit(R10BIO_WriteError,

2241

if (test_bit(R10BIO_WriteError,

2213

&r10_bio->state))

2242

&r10_bio->state))

2214

close_write(r10_bio);

2243

close_write(r10_bio);

2215

raid_end_bio_io(r10_bio);

2244

raid_end_bio_io(r10_bio);

2216

}

2245

}

2217

}

2246

}

2218

2247

2219

static void raid10d(struct mddev *mddev)

2248

static void raid10d(struct mddev *mddev)

2220

{

2249

{

2221

struct r10bio *r10_bio;

2250

struct r10bio *r10_bio;

2222

unsigned long flags;

2251

unsigned long flags;

2223

struct r10conf *conf = mddev->private;

2252

struct r10conf *conf = mddev->private;

2224

struct list_head *head = &conf->retry_list;

2253

struct list_head *head = &conf->retry_list;

2225

struct blk_plug plug;

2254

struct blk_plug plug;

2226

2255

2227

md_check_recovery(mddev);

2256

md_check_recovery(mddev);

2228

2257

2229

blk_start_plug(&plug);

2258

blk_start_plug(&plug);

2230

for (;;) {

2259

for (;;) {

2231

2260

2232

flush_pending_writes(conf);

2261

flush_pending_writes(conf);

2233

2262

2234

spin_lock_irqsave(&conf->device_lock, flags);

2263

spin_lock_irqsave(&conf->device_lock, flags);

2235

if (list_empty(head)) {

2264

if (list_empty(head)) {

2236

spin_unlock_irqrestore(&conf->device_lock, flags);

2265

spin_unlock_irqrestore(&conf->device_lock, flags);

2237

break;

2266

break;

2238

}

2267

}

2239

r10_bio = list_entry(head->prev, struct r10bio, retry_list);

2268

r10_bio = list_entry(head->prev, struct r10bio, retry_list);

2240

list_del(head->prev);

2269

list_del(head->prev);

2241

conf->nr_queued--;

2270

conf->nr_queued--;

2242

spin_unlock_irqrestore(&conf->device_lock, flags);

2271

spin_unlock_irqrestore(&conf->device_lock, flags);

2243

2272

2244

mddev = r10_bio->mddev;

2273

mddev = r10_bio->mddev;

2245

conf = mddev->private;

2274

conf = mddev->private;

2246

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

2275

if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||

2247

test_bit(R10BIO_WriteError, &r10_bio->state))

2276

test_bit(R10BIO_WriteError, &r10_bio->state))

2248

handle_write_completed(conf, r10_bio);

2277

handle_write_completed(conf, r10_bio);

2249

else if (test_bit(R10BIO_IsSync, &r10_bio->state))

2278

else if (test_bit(R10BIO_IsSync, &r10_bio->state))

2250

sync_request_write(mddev, r10_bio);

2279

sync_request_write(mddev, r10_bio);

2251

else if (test_bit(R10BIO_IsRecover, &r10_bio->state))

2280

else if (test_bit(R10BIO_IsRecover, &r10_bio->state))

2252

recovery_request_write(mddev, r10_bio);

2281

recovery_request_write(mddev, r10_bio);

2253

else if (test_bit(R10BIO_ReadError, &r10_bio->state))

2282

else if (test_bit(R10BIO_ReadError, &r10_bio->state))

2254

handle_read_error(mddev, r10_bio);

2283

handle_read_error(mddev, r10_bio);

2255

else {

2284

else {

2256

/* just a partial read to be scheduled from a

2285

/* just a partial read to be scheduled from a

2257

* separate context

2286

* separate context

2258

*/

2287

*/

2259

int slot = r10_bio->read_slot;

2288

int slot = r10_bio->read_slot;

2260

generic_make_request(r10_bio->devs[slot].bio);

2289

generic_make_request(r10_bio->devs[slot].bio);

2261

}

2290

}

2262

2291

2263

cond_resched();

2292

cond_resched();

2264

if (mddev->flags & ~(1<<MD_CHANGE_PENDING))

2293

if (mddev->flags & ~(1<<MD_CHANGE_PENDING))

2265

md_check_recovery(mddev);

2294

md_check_recovery(mddev);

2266

}

2295

}

2267

blk_finish_plug(&plug);

2296

blk_finish_plug(&plug);

2268

}

2297

}

2269

2298

2270

2299

2271

static int init_resync(struct r10conf *conf)

2300

static int init_resync(struct r10conf *conf)

2272

{

2301

{

2273

int buffs;

2302

int buffs;

2303

int i;

2274

2304

2275

buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;

2305

buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;

2276

BUG_ON(conf->r10buf_pool);

2306

BUG_ON(conf->r10buf_pool);

2307

conf->have_replacement = 0;

2308

for (i = 0; i < conf->raid_disks; i++)

2309

if (conf->mirrors[i].replacement)

2310

conf->have_replacement = 1;

2277

conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);

2311

conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);

2278

if (!conf->r10buf_pool)

2312

if (!conf->r10buf_pool)

2279

return -ENOMEM;

2313

return -ENOMEM;

2280

conf->next_resync = 0;

2314

conf->next_resync = 0;

2281

return 0;

2315

return 0;

2282

}

2316

}

2283

2317

2284

/*
 * perform a "sync" on one "block"
 *
 * We need to make sure that no normal I/O request - particularly write
 * requests - conflict with active sync requests.
 *
 * This is achieved by tracking pending requests and a 'barrier' concept
 * that can be installed to exclude normal IO requests.
 *
 * Resync and recovery are handled very differently.
 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
 *
 * For resync, we iterate over virtual addresses, read all copies,
 * and update if there are differences.  If only one copy is live,
 * skip it.
 * For recovery, we iterate over physical addresses, read a good
 * value for each non-in_sync drive, and over-write.
 *
 * So, for recovery we may have several outstanding complex requests for a
 * given address, one for each out-of-sync device.  We model this by allocating
 * a number of r10_bio structures, one for each out-of-sync device.
 * As we set up these structures, we collect all bios together into a list
 * which we then process collectively to add pages, and then process again
 * to pass to generic_make_request.
 *
 * The r10_bio structures are linked using a borrowed master_bio pointer.
 * This link is counted in ->remaining.  When the r10_bio that points to NULL
 * has its remaining count decremented to 0, the whole complex operation
 * is complete.
 *
 */

2315

2349

2316

static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,

2350

static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,

2317

int *skipped, int go_faster)

2351

int *skipped, int go_faster)

2318

{

2352

{

2319

struct r10conf *conf = mddev->private;

2353

struct r10conf *conf = mddev->private;

2320

struct r10bio *r10_bio;

2354

struct r10bio *r10_bio;

2321

struct bio *biolist = NULL, *bio;

2355

struct bio *biolist = NULL, *bio;

2322

sector_t max_sector, nr_sectors;

2356

sector_t max_sector, nr_sectors;

2323

int i;

2357

int i;

2324

int max_sync;

2358

int max_sync;

2325

sector_t sync_blocks;

2359

sector_t sync_blocks;

2326

sector_t sectors_skipped = 0;

2360

sector_t sectors_skipped = 0;

2327

int chunks_skipped = 0;

2361

int chunks_skipped = 0;

2328

2362

2329

if (!conf->r10buf_pool)

2363

if (!conf->r10buf_pool)

2330

if (init_resync(conf))

2364

if (init_resync(conf))

2331

return 0;

2365

return 0;

2332

2366

2333

skipped:

2367

skipped:

2334

max_sector = mddev->dev_sectors;

2368

max_sector = mddev->dev_sectors;

2335

if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))

2369

if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))

2336

max_sector = mddev->resync_max_sectors;

2370

max_sector = mddev->resync_max_sectors;

2337

if (sector_nr >= max_sector) {

2371

if (sector_nr >= max_sector) {

2338

/* If we aborted, we need to abort the

2372

/* If we aborted, we need to abort the

2339

* sync on the 'current' bitmap chucks (there can

2373

* sync on the 'current' bitmap chucks (there can

2340

* be several when recovering multiple devices).

2374

* be several when recovering multiple devices).

2341

* as we may have started syncing it but not finished.

2375

* as we may have started syncing it but not finished.

2342

* We can find the current address in

2376

* We can find the current address in

2343

* mddev->curr_resync, but for recovery,

2377

* mddev->curr_resync, but for recovery,

2344

* we need to convert that to several

2378

* we need to convert that to several

2345

* virtual addresses.

2379

* virtual addresses.

2346

*/

2380

*/

2347

if (mddev->curr_resync < max_sector) { /* aborted */

2381

if (mddev->curr_resync < max_sector) { /* aborted */

2348

if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))

2382

if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))

2349

bitmap_end_sync(mddev->bitmap, mddev->curr_resync,

2383

bitmap_end_sync(mddev->bitmap, mddev->curr_resync,

2350

&sync_blocks, 1);

2384

&sync_blocks, 1);

2351

else for (i=0; i<conf->raid_disks; i++) {

2385

else for (i=0; i<conf->raid_disks; i++) {

2352

sector_t sect =

2386

sector_t sect =

2353

raid10_find_virt(conf, mddev->curr_resync, i);

2387

raid10_find_virt(conf, mddev->curr_resync, i);

2354

bitmap_end_sync(mddev->bitmap, sect,

2388

bitmap_end_sync(mddev->bitmap, sect,

2355

&sync_blocks, 1);

2389

&sync_blocks, 1);

2356

}

2390

}

2357

} else /* completed sync */

2391

} else /* completed sync */

2358

conf->fullsync = 0;

2392

conf->fullsync = 0;

2359

2393

2360

bitmap_close_sync(mddev->bitmap);

2394

bitmap_close_sync(mddev->bitmap);

2361

close_sync(conf);

2395

close_sync(conf);

2362

*skipped = 1;

2396

*skipped = 1;

2363

return sectors_skipped;

2397

return sectors_skipped;

2364

}

2398

}

2365

if (chunks_skipped >= conf->raid_disks) {

2399

if (chunks_skipped >= conf->raid_disks) {

2366

/* if there has been nothing to do on any drive,

2400

/* if there has been nothing to do on any drive,

2367

* then there is nothing to do at all..

2401

* then there is nothing to do at all..

2368

*/

2402

*/

2369

*skipped = 1;

2403

*skipped = 1;

2370

return (max_sector - sector_nr) + sectors_skipped;

2404

return (max_sector - sector_nr) + sectors_skipped;

2371

}

2405

}

2372

2406

2373

if (max_sector > mddev->resync_max)

2407

if (max_sector > mddev->resync_max)

2374

max_sector = mddev->resync_max; /* Don't do IO beyond here */

2408

max_sector = mddev->resync_max; /* Don't do IO beyond here */

2375

2409

2376

/* make sure whole request will fit in a chunk - if chunks

2410

/* make sure whole request will fit in a chunk - if chunks

2377

* are meaningful

2411

* are meaningful

2378

*/

2412

*/

2379

if (conf->near_copies < conf->raid_disks &&

2413

if (conf->near_copies < conf->raid_disks &&

2380

max_sector > (sector_nr | conf->chunk_mask))

2414

max_sector > (sector_nr | conf->chunk_mask))

2381

max_sector = (sector_nr | conf->chunk_mask) + 1;

2415

max_sector = (sector_nr | conf->chunk_mask) + 1;

2382

/*

2416

/*

2383

* If there is non-resync activity waiting for us then

2417

* If there is non-resync activity waiting for us then

2384

* put in a delay to throttle resync.

2418

* put in a delay to throttle resync.

2385

*/

2419

*/

2386

if (!go_faster && conf->nr_waiting)

2420

if (!go_faster && conf->nr_waiting)

2387

msleep_interruptible(1000);

2421

msleep_interruptible(1000);

2388

2422

2389

/* Again, very different code for resync and recovery.

2423

/* Again, very different code for resync and recovery.

2390

* Both must result in an r10bio with a list of bios that

2424

* Both must result in an r10bio with a list of bios that

2391

* have bi_end_io, bi_sector, bi_bdev set,

2425

* have bi_end_io, bi_sector, bi_bdev set,

2392

* and bi_private set to the r10bio.

2426

* and bi_private set to the r10bio.

2393

* For recovery, we may actually create several r10bios

2427

* For recovery, we may actually create several r10bios

2394

* with 2 bios in each, that correspond to the bios in the main one.

2428

* with 2 bios in each, that correspond to the bios in the main one.

2395

* In this case, the subordinate r10bios link back through a

2429

* In this case, the subordinate r10bios link back through a

2396

* borrowed master_bio pointer, and the counter in the master

2430

* borrowed master_bio pointer, and the counter in the master

2397

* includes a ref from each subordinate.

2431

* includes a ref from each subordinate.

2398

*/

2432

*/

2399

/* First, we decide what to do and set ->bi_end_io

2433

/* First, we decide what to do and set ->bi_end_io

2400

* To end_sync_read if we want to read, and

2434

* To end_sync_read if we want to read, and

2401

* end_sync_write if we will want to write.

2435

* end_sync_write if we will want to write.

2402

*/

2436

*/

2403

2437

2404

max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);

2438

max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);

2405

if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {

2439

if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {

2406

/* recovery... the complicated one */

2440

/* recovery... the complicated one */

2407

int j;

2441

int j;

2408

r10_bio = NULL;

2442

r10_bio = NULL;

2409

2443

2410

for (i=0 ; i<conf->raid_disks; i++) {

2444

for (i=0 ; i<conf->raid_disks; i++) {

2411

int still_degraded;

2445

int still_degraded;

2412

struct r10bio *rb2;

2446

struct r10bio *rb2;

2413

sector_t sect;

2447

sector_t sect;

2414

int must_sync;

2448

int must_sync;

2415

int any_working;

2449

int any_working;

2416

2450

2417

if (conf->mirrors[i].rdev == NULL ||

2451

if (conf->mirrors[i].rdev == NULL ||

2418

test_bit(In_sync, &conf->mirrors[i].rdev->flags))

2452

test_bit(In_sync, &conf->mirrors[i].rdev->flags))

2419

continue;

2453

continue;

2420

2454

2421

still_degraded = 0;

2455

still_degraded = 0;

2422

/* want to reconstruct this device */

2456

/* want to reconstruct this device */

2423

rb2 = r10_bio;

2457

rb2 = r10_bio;

2424

sect = raid10_find_virt(conf, sector_nr, i);

2458

sect = raid10_find_virt(conf, sector_nr, i);

2425

/* Unless we are doing a full sync, we only need

2459

/* Unless we are doing a full sync, we only need

2426

* to recover the block if it is set in the bitmap

2460

* to recover the block if it is set in the bitmap

2427

*/

2461

*/

2428

must_sync = bitmap_start_sync(mddev->bitmap, sect,

2462

must_sync = bitmap_start_sync(mddev->bitmap, sect,

2429

&sync_blocks, 1);

2463

&sync_blocks, 1);

2430

if (sync_blocks < max_sync)

2464

if (sync_blocks < max_sync)

2431

max_sync = sync_blocks;

2465

max_sync = sync_blocks;

2432

if (!must_sync &&

2466

if (!must_sync &&

2433

!conf->fullsync) {

2467

!conf->fullsync) {

2434

/* yep, skip the sync_blocks here, but don't assume

2468

/* yep, skip the sync_blocks here, but don't assume

2435

* that there will never be anything to do here

2469

* that there will never be anything to do here

2436

*/

2470

*/

2437

chunks_skipped = -1;

2471

chunks_skipped = -1;

2438

continue;

2472

continue;

2439

}

2473

}

2440

2474

2441

r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

2475

r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

2442

raise_barrier(conf, rb2 != NULL);

2476

raise_barrier(conf, rb2 != NULL);

2443

atomic_set(&r10_bio->remaining, 0);

2477

atomic_set(&r10_bio->remaining, 0);

2444

2478

2445

r10_bio->master_bio = (struct bio*)rb2;

2479

r10_bio->master_bio = (struct bio*)rb2;

2446

if (rb2)

2480

if (rb2)

2447

atomic_inc(&rb2->remaining);

2481

atomic_inc(&rb2->remaining);

2448

r10_bio->mddev = mddev;

2482

r10_bio->mddev = mddev;

2449

set_bit(R10BIO_IsRecover, &r10_bio->state);

2483

set_bit(R10BIO_IsRecover, &r10_bio->state);

2450

r10_bio->sector = sect;

2484

r10_bio->sector = sect;

2451

2485

2452

raid10_find_phys(conf, r10_bio);

2486

raid10_find_phys(conf, r10_bio);

2453

2487

2454

/* Need to check if the array will still be

2488

/* Need to check if the array will still be

2455

* degraded

2489

* degraded

2456

*/

2490

*/

2457

for (j=0; j<conf->raid_disks; j++)

2491

for (j=0; j<conf->raid_disks; j++)

2458

if (conf->mirrors[j].rdev == NULL ||

2492

if (conf->mirrors[j].rdev == NULL ||

2459

test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {

2493

test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {

2460

still_degraded = 1;

2494

still_degraded = 1;

2461

break;

2495

break;

2462

}

2496

}

2463

2497

2464

must_sync = bitmap_start_sync(mddev->bitmap, sect,

2498

must_sync = bitmap_start_sync(mddev->bitmap, sect,

2465

&sync_blocks, still_degraded);

2499

&sync_blocks, still_degraded);

2466

2500

2467

any_working = 0;

2501

any_working = 0;

2468

for (j=0; j<conf->copies;j++) {

2502

for (j=0; j<conf->copies;j++) {

2469

int k;

2503

int k;

2470

int d = r10_bio->devs[j].devnum;

2504

int d = r10_bio->devs[j].devnum;

2471

sector_t from_addr, to_addr;

2505

sector_t from_addr, to_addr;

2472

struct md_rdev *rdev;

2506

struct md_rdev *rdev;

2473

sector_t sector, first_bad;

2507

sector_t sector, first_bad;

2474

int bad_sectors;

2508

int bad_sectors;

2475

if (!conf->mirrors[d].rdev ||

2509

if (!conf->mirrors[d].rdev ||

2476

!test_bit(In_sync, &conf->mirrors[d].rdev->flags))

2510

!test_bit(In_sync, &conf->mirrors[d].rdev->flags))

2477

continue;

2511

continue;

2478

/* This is where we read from */

2512

/* This is where we read from */

2479

any_working = 1;

2513

any_working = 1;

2480

rdev = conf->mirrors[d].rdev;

2514

rdev = conf->mirrors[d].rdev;

2481

sector = r10_bio->devs[j].addr;

2515

sector = r10_bio->devs[j].addr;

2482

2516

2483

if (is_badblock(rdev, sector, max_sync,

2517

if (is_badblock(rdev, sector, max_sync,

2484

&first_bad, &bad_sectors)) {

2518

&first_bad, &bad_sectors)) {

2485

if (first_bad > sector)

2519

if (first_bad > sector)

2486

max_sync = first_bad - sector;

2520

max_sync = first_bad - sector;

2487

else {

2521

else {

2488

bad_sectors -= (sector

2522

bad_sectors -= (sector

2489

- first_bad);

2523

- first_bad);

2490

if (max_sync > bad_sectors)

2524

if (max_sync > bad_sectors)

2491

max_sync = bad_sectors;

2525

max_sync = bad_sectors;

2492

continue;

2526

continue;

2493

}

2527

}

2494

}

2528

}

2495

bio = r10_bio->devs[0].bio;

2529

bio = r10_bio->devs[0].bio;

2496

bio->bi_next = biolist;

2530

bio->bi_next = biolist;

2497

biolist = bio;

2531

biolist = bio;

2498

bio->bi_private = r10_bio;

2532

bio->bi_private = r10_bio;

2499

bio->bi_end_io = end_sync_read;

2533

bio->bi_end_io = end_sync_read;

2500

bio->bi_rw = READ;

2534

bio->bi_rw = READ;

2501

from_addr = r10_bio->devs[j].addr;

2535

from_addr = r10_bio->devs[j].addr;

2502

bio->bi_sector = from_addr +

2536

bio->bi_sector = from_addr +

2503

conf->mirrors[d].rdev->data_offset;

2537

conf->mirrors[d].rdev->data_offset;

2504

bio->bi_bdev = conf->mirrors[d].rdev->bdev;

2538

bio->bi_bdev = conf->mirrors[d].rdev->bdev;

2505

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

2539

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

2506

atomic_inc(&r10_bio->remaining);

2540

atomic_inc(&r10_bio->remaining);

2507

/* and we write to 'i' */

2541

/* and we write to 'i' */

2508

2542

2509

for (k=0; k<conf->copies; k++)

2543

for (k=0; k<conf->copies; k++)

2510

if (r10_bio->devs[k].devnum == i)

2544

if (r10_bio->devs[k].devnum == i)

2511

break;

2545

break;

2512

BUG_ON(k == conf->copies);

2546

BUG_ON(k == conf->copies);

2513

bio = r10_bio->devs[1].bio;

2547

bio = r10_bio->devs[1].bio;

2514

bio->bi_next = biolist;

2548

bio->bi_next = biolist;

2515

biolist = bio;

2549

biolist = bio;

2516

bio->bi_private = r10_bio;

2550

bio->bi_private = r10_bio;

2517

bio->bi_end_io = end_sync_write;

2551

bio->bi_end_io = end_sync_write;

2518

bio->bi_rw = WRITE;

2552

bio->bi_rw = WRITE;

2519

to_addr = r10_bio->devs[k].addr;

2553

to_addr = r10_bio->devs[k].addr;

2520

bio->bi_sector = to_addr +

2554

bio->bi_sector = to_addr +

2521

conf->mirrors[i].rdev->data_offset;

2555

conf->mirrors[i].rdev->data_offset;

2522

bio->bi_bdev = conf->mirrors[i].rdev->bdev;

2556

bio->bi_bdev = conf->mirrors[i].rdev->bdev;

2523

2557

2524

r10_bio->devs[0].devnum = d;

2558

r10_bio->devs[0].devnum = d;

2525

r10_bio->devs[0].addr = from_addr;

2559

r10_bio->devs[0].addr = from_addr;

2526

r10_bio->devs[1].devnum = i;

2560

r10_bio->devs[1].devnum = i;

2527

r10_bio->devs[1].addr = to_addr;

2561

r10_bio->devs[1].addr = to_addr;

2528

2562

2529

break;

2563

break;

2530

}

2564

}

2531

if (j == conf->copies) {

2565

if (j == conf->copies) {

2532

/* Cannot recover, so abort the recovery or

2566

/* Cannot recover, so abort the recovery or

2533

* record a bad block */

2567

* record a bad block */

2534

put_buf(r10_bio);

2568

put_buf(r10_bio);

2535

if (rb2)

2569

if (rb2)

2536

atomic_dec(&rb2->remaining);

2570

atomic_dec(&rb2->remaining);

2537

r10_bio = rb2;

2571

r10_bio = rb2;

2538

if (any_working) {

2572

if (any_working) {

2539

/* problem is that there are bad blocks

2573

/* problem is that there are bad blocks

2540

* on other device(s)

2574

* on other device(s)

2541

*/

2575

*/

2542

int k;

2576

int k;

2543

for (k = 0; k < conf->copies; k++)

2577

for (k = 0; k < conf->copies; k++)

2544

if (r10_bio->devs[k].devnum == i)

2578

if (r10_bio->devs[k].devnum == i)

2545

break;

2579

break;

2546

if (!rdev_set_badblocks(

2580

if (!rdev_set_badblocks(

2547

conf->mirrors[i].rdev,

2581

conf->mirrors[i].rdev,

2548

r10_bio->devs[k].addr,

2582

r10_bio->devs[k].addr,

2549

max_sync, 0))

2583

max_sync, 0))

2550

any_working = 0;

2584

any_working = 0;

2551

}

2585

}

2552

if (!any_working) {

2586

if (!any_working) {

2553

if (!test_and_set_bit(MD_RECOVERY_INTR,

2587

if (!test_and_set_bit(MD_RECOVERY_INTR,

2554

&mddev->recovery))

2588

&mddev->recovery))

2555

printk(KERN_INFO "md/raid10:%s: insufficient "

2589

printk(KERN_INFO "md/raid10:%s: insufficient "

2556

"working devices for recovery.\n",

2590

"working devices for recovery.\n",

2557

mdname(mddev));

2591

mdname(mddev));

2558

conf->mirrors[i].recovery_disabled

2592

conf->mirrors[i].recovery_disabled

2559

= mddev->recovery_disabled;

2593

= mddev->recovery_disabled;

2560

}

2594

}

2561

break;

2595

break;

2562

}

2596

}

2563

}

2597

}

2564

if (biolist == NULL) {

2598

if (biolist == NULL) {

2565

while (r10_bio) {

2599

while (r10_bio) {

2566

struct r10bio *rb2 = r10_bio;

2600

struct r10bio *rb2 = r10_bio;

2567

r10_bio = (struct r10bio*) rb2->master_bio;

2601

r10_bio = (struct r10bio*) rb2->master_bio;

2568

rb2->master_bio = NULL;

2602

rb2->master_bio = NULL;

2569

put_buf(rb2);

2603

put_buf(rb2);

2570

}

2604

}

2571

goto giveup;

2605

goto giveup;

2572

}

2606

}

2573

} else {

2607

} else {

2574

/* resync. Schedule a read for every block at this virt offset */

2608

/* resync. Schedule a read for every block at this virt offset */

2575

int count = 0;

2609

int count = 0;

2576

2610

2577

bitmap_cond_end_sync(mddev->bitmap, sector_nr);

2611

bitmap_cond_end_sync(mddev->bitmap, sector_nr);

2578

2612

2579

if (!bitmap_start_sync(mddev->bitmap, sector_nr,

2613

if (!bitmap_start_sync(mddev->bitmap, sector_nr,

2580

&sync_blocks, mddev->degraded) &&

2614

&sync_blocks, mddev->degraded) &&

2581

!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,

2615

!conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,

2582

&mddev->recovery)) {

2616

&mddev->recovery)) {

2583

/* We can skip this block */

2617

/* We can skip this block */

2584

*skipped = 1;

2618

*skipped = 1;

2585

return sync_blocks + sectors_skipped;

2619

return sync_blocks + sectors_skipped;

2586

}

2620

}

2587

if (sync_blocks < max_sync)

2621

if (sync_blocks < max_sync)

2588

max_sync = sync_blocks;

2622

max_sync = sync_blocks;

2589

r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

2623

r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);

2590

2624

2591

r10_bio->mddev = mddev;

2625

r10_bio->mddev = mddev;

2592

atomic_set(&r10_bio->remaining, 0);

2626

atomic_set(&r10_bio->remaining, 0);

2593

raise_barrier(conf, 0);

2627

raise_barrier(conf, 0);

2594

conf->next_resync = sector_nr;

2628

conf->next_resync = sector_nr;

2595

2629

2596

r10_bio->master_bio = NULL;

2630

r10_bio->master_bio = NULL;

2597

r10_bio->sector = sector_nr;

2631

r10_bio->sector = sector_nr;

2598

set_bit(R10BIO_IsSync, &r10_bio->state);

2632

set_bit(R10BIO_IsSync, &r10_bio->state);

2599

raid10_find_phys(conf, r10_bio);

2633

raid10_find_phys(conf, r10_bio);

2600

r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;

2634

r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;

2601

2635

2602

for (i=0; i<conf->copies; i++) {

2636

for (i=0; i<conf->copies; i++) {

2603

int d = r10_bio->devs[i].devnum;

2637

int d = r10_bio->devs[i].devnum;

2604

sector_t first_bad, sector;

2638

sector_t first_bad, sector;

2605

int bad_sectors;

2639

int bad_sectors;

2606

2640

2607

bio = r10_bio->devs[i].bio;

2641

bio = r10_bio->devs[i].bio;

2608

bio->bi_end_io = NULL;

2642

bio->bi_end_io = NULL;

2609

clear_bit(BIO_UPTODATE, &bio->bi_flags);

2643

clear_bit(BIO_UPTODATE, &bio->bi_flags);

2610

if (conf->mirrors[d].rdev == NULL ||

2644

if (conf->mirrors[d].rdev == NULL ||

2611

test_bit(Faulty, &conf->mirrors[d].rdev->flags))

2645

test_bit(Faulty, &conf->mirrors[d].rdev->flags))

2612

continue;

2646

continue;

2613

sector = r10_bio->devs[i].addr;

2647

sector = r10_bio->devs[i].addr;

2614

if (is_badblock(conf->mirrors[d].rdev,

2648

if (is_badblock(conf->mirrors[d].rdev,

2615

sector, max_sync,

2649

sector, max_sync,

2616

&first_bad, &bad_sectors)) {

2650

&first_bad, &bad_sectors)) {

2617

if (first_bad > sector)

2651

if (first_bad > sector)

2618

max_sync = first_bad - sector;

2652

max_sync = first_bad - sector;

2619

else {

2653

else {

2620

bad_sectors -= (sector - first_bad);

2654

bad_sectors -= (sector - first_bad);

2621

if (max_sync > bad_sectors)

2655

if (max_sync > bad_sectors)

2622

max_sync = max_sync;

2656

max_sync = max_sync;

2623

continue;

2657

continue;

2624

}

2658

}

2625

}

2659

}

2626

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

2660

atomic_inc(&conf->mirrors[d].rdev->nr_pending);

2627

atomic_inc(&r10_bio->remaining);

2661

atomic_inc(&r10_bio->remaining);

2628

bio->bi_next = biolist;

2662

bio->bi_next = biolist;

2629

biolist = bio;

2663

biolist = bio;

2630

bio->bi_private = r10_bio;

2664

bio->bi_private = r10_bio;

2631

bio->bi_end_io = end_sync_read;

2665

bio->bi_end_io = end_sync_read;

2632

bio->bi_rw = READ;

2666

bio->bi_rw = READ;

2633

bio->bi_sector = sector +

2667

bio->bi_sector = sector +

2634

conf->mirrors[d].rdev->data_offset;

2668

conf->mirrors[d].rdev->data_offset;

2635

bio->bi_bdev = conf->mirrors[d].rdev->bdev;

2669

bio->bi_bdev = conf->mirrors[d].rdev->bdev;

2636

count++;

2670

count++;

2637

}

2671

}

2638

2672

2639

if (count < 2) {

2673

if (count < 2) {

2640

for (i=0; i<conf->copies; i++) {

2674

for (i=0; i<conf->copies; i++) {

2641

int d = r10_bio->devs[i].devnum;

2675

int d = r10_bio->devs[i].devnum;

2642

if (r10_bio->devs[i].bio->bi_end_io)

2676

if (r10_bio->devs[i].bio->bi_end_io)

2643

rdev_dec_pending(conf->mirrors[d].rdev,

2677

rdev_dec_pending(conf->mirrors[d].rdev,

2644

mddev);

2678

mddev);

2645

}

2679

}

2646

put_buf(r10_bio);

2680

put_buf(r10_bio);

2647

biolist = NULL;

2681

biolist = NULL;

2648

goto giveup;

2682

goto giveup;

2649

}

2683

}

2650

}

2684

}

2651

2685

2652

for (bio = biolist; bio ; bio=bio->bi_next) {

2686

for (bio = biolist; bio ; bio=bio->bi_next) {

2653

2687

2654

bio->bi_flags &= ~(BIO_POOL_MASK - 1);

2688

bio->bi_flags &= ~(BIO_POOL_MASK - 1);

2655

if (bio->bi_end_io)

2689

if (bio->bi_end_io)

2656

bio->bi_flags |= 1 << BIO_UPTODATE;

2690

bio->bi_flags |= 1 << BIO_UPTODATE;

2657

bio->bi_vcnt = 0;

2691

bio->bi_vcnt = 0;

2658

bio->bi_idx = 0;

2692

bio->bi_idx = 0;

2659

bio->bi_phys_segments = 0;

2693

bio->bi_phys_segments = 0;

2660

bio->bi_size = 0;

2694

bio->bi_size = 0;

2661

}

2695

}

2662

2696

2663

nr_sectors = 0;

2697

nr_sectors = 0;

2664

if (sector_nr + max_sync < max_sector)

2698

if (sector_nr + max_sync < max_sector)

2665

max_sector = sector_nr + max_sync;

2699

max_sector = sector_nr + max_sync;

2666

do {

2700

do {

2667

struct page *page;

2701

struct page *page;

2668

int len = PAGE_SIZE;

2702

int len = PAGE_SIZE;

2669

if (sector_nr + (len>>9) > max_sector)

2703

if (sector_nr + (len>>9) > max_sector)

2670

len = (max_sector - sector_nr) << 9;

2704

len = (max_sector - sector_nr) << 9;

2671

if (len == 0)

2705

if (len == 0)

2672

break;

2706

break;

2673

for (bio= biolist ; bio ; bio=bio->bi_next) {

2707

for (bio= biolist ; bio ; bio=bio->bi_next) {

2674

struct bio *bio2;

2708

struct bio *bio2;

2675

page = bio->bi_io_vec[bio->bi_vcnt].bv_page;

2709

page = bio->bi_io_vec[bio->bi_vcnt].bv_page;

2676

if (bio_add_page(bio, page, len, 0))

2710

if (bio_add_page(bio, page, len, 0))

2677

continue;

2711

continue;

2678

2712

2679

/* stop here */

2713

/* stop here */

2680

bio->bi_io_vec[bio->bi_vcnt].bv_page = page;

2714

bio->bi_io_vec[bio->bi_vcnt].bv_page = page;

2681

for (bio2 = biolist;

2715

for (bio2 = biolist;

2682

bio2 && bio2 != bio;

2716

bio2 && bio2 != bio;

2683

bio2 = bio2->bi_next) {

2717

bio2 = bio2->bi_next) {

2684

/* remove last page from this bio */

2718

/* remove last page from this bio */

2685

bio2->bi_vcnt--;

2719

bio2->bi_vcnt--;

2686

bio2->bi_size -= len;

2720

bio2->bi_size -= len;

2687

bio2->bi_flags &= ~(1<< BIO_SEG_VALID);

2721

bio2->bi_flags &= ~(1<< BIO_SEG_VALID);

2688

}

2722

}

2689

goto bio_full;

2723

goto bio_full;

2690

}

2724

}

2691

nr_sectors += len>>9;

2725

nr_sectors += len>>9;

2692

sector_nr += len>>9;

2726

sector_nr += len>>9;

2693

} while (biolist->bi_vcnt < RESYNC_PAGES);

2727

} while (biolist->bi_vcnt < RESYNC_PAGES);

2694

bio_full:

2728

bio_full:

2695

r10_bio->sectors = nr_sectors;

2729

r10_bio->sectors = nr_sectors;

2696

2730

2697

while (biolist) {

2731

while (biolist) {

2698

bio = biolist;

2732

bio = biolist;

2699

biolist = biolist->bi_next;

2733

biolist = biolist->bi_next;

2700

2734

2701

bio->bi_next = NULL;

2735

bio->bi_next = NULL;

2702

r10_bio = bio->bi_private;

2736

r10_bio = bio->bi_private;

2703

r10_bio->sectors = nr_sectors;

2737

r10_bio->sectors = nr_sectors;

2704

2738

2705

if (bio->bi_end_io == end_sync_read) {

2739

if (bio->bi_end_io == end_sync_read) {

2706

md_sync_acct(bio->bi_bdev, nr_sectors);

2740

md_sync_acct(bio->bi_bdev, nr_sectors);

2707

generic_make_request(bio);

2741

generic_make_request(bio);

2708

}

2742

}

2709

}

2743

}

2710

2744

2711

if (sectors_skipped)

2745

if (sectors_skipped)

2712

/* pretend they weren't skipped, it makes

2746

/* pretend they weren't skipped, it makes

2713

* no important difference in this case

2747

* no important difference in this case

2714

*/

2748

*/

2715

md_done_sync(mddev, sectors_skipped, 1);

2749

md_done_sync(mddev, sectors_skipped, 1);

2716

2750

2717

return sectors_skipped + nr_sectors;

2751

return sectors_skipped + nr_sectors;

2718

giveup:

2752

giveup:

2719

/* There is nowhere to write, so all non-sync

2753

/* There is nowhere to write, so all non-sync

2720

* drives must be failed or in resync, all drives

2754

* drives must be failed or in resync, all drives

2721

* have a bad block, so try the next chunk...

2755

* have a bad block, so try the next chunk...

2722

*/

2756

*/

2723

if (sector_nr + max_sync < max_sector)

2757

if (sector_nr + max_sync < max_sector)

2724

max_sector = sector_nr + max_sync;

2758

max_sector = sector_nr + max_sync;

2725

2759

2726

sectors_skipped += (max_sector - sector_nr);

2760

sectors_skipped += (max_sector - sector_nr);

2727

chunks_skipped ++;

2761

chunks_skipped ++;

2728

sector_nr = max_sector;

2762

sector_nr = max_sector;

2729

goto skipped;

2763

goto skipped;

2730

}

2764

}

2731

2765

2732

static sector_t

2766

static sector_t

2733

raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)

2767

raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)

2734

{

2768

{

2735

sector_t size;

2769

sector_t size;

2736

struct r10conf *conf = mddev->private;

2770

struct r10conf *conf = mddev->private;

2737

2771

2738

if (!raid_disks)

2772

if (!raid_disks)

2739

raid_disks = conf->raid_disks;

2773

raid_disks = conf->raid_disks;

2740

if (!sectors)

2774

if (!sectors)

2741

sectors = conf->dev_sectors;

2775

sectors = conf->dev_sectors;

2742

2776

2743

size = sectors >> conf->chunk_shift;

2777

size = sectors >> conf->chunk_shift;

2744

sector_div(size, conf->far_copies);

2778

sector_div(size, conf->far_copies);

2745

size = size * raid_disks;

2779

size = size * raid_disks;

2746

sector_div(size, conf->near_copies);

2780

sector_div(size, conf->near_copies);

2747

2781

2748

return size << conf->chunk_shift;

2782

return size << conf->chunk_shift;

2749

}

2783

}

2750

2784

2751

2785

2752

static struct r10conf *setup_conf(struct mddev *mddev)

2786

static struct r10conf *setup_conf(struct mddev *mddev)

2753

{

2787

{

2754

struct r10conf *conf = NULL;

2788

struct r10conf *conf = NULL;

2755

int nc, fc, fo;

2789

int nc, fc, fo;

2756

sector_t stride, size;

2790

sector_t stride, size;

2757

int err = -EINVAL;

2791

int err = -EINVAL;

2758

2792

2759

if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||

2793

if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||

2760

!is_power_of_2(mddev->new_chunk_sectors)) {

2794

!is_power_of_2(mddev->new_chunk_sectors)) {

2761

printk(KERN_ERR "md/raid10:%s: chunk size must be "

2795

printk(KERN_ERR "md/raid10:%s: chunk size must be "

2762

"at least PAGE_SIZE(%ld) and be a power of 2.\n",

2796

"at least PAGE_SIZE(%ld) and be a power of 2.\n",

2763

mdname(mddev), PAGE_SIZE);

2797

mdname(mddev), PAGE_SIZE);

2764

goto out;

2798

goto out;

2765

}

2799

}

2766

2800

2767

nc = mddev->new_layout & 255;

2801

nc = mddev->new_layout & 255;

2768

fc = (mddev->new_layout >> 8) & 255;

2802

fc = (mddev->new_layout >> 8) & 255;

2769

fo = mddev->new_layout & (1<<16);

2803

fo = mddev->new_layout & (1<<16);

2770

2804

2771

if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||

2805

if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||

2772

(mddev->new_layout >> 17)) {

2806

(mddev->new_layout >> 17)) {

2773

printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",

2807

printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",

2774

mdname(mddev), mddev->new_layout);

2808

mdname(mddev), mddev->new_layout);

2775

goto out;

2809

goto out;

2776

}

2810

}

2777

2811

2778

err = -ENOMEM;

2812

err = -ENOMEM;

2779

conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);

2813

conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);

2780

if (!conf)

2814

if (!conf)

2781

goto out;

2815

goto out;

2782

2816

2783

conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,

2817

conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,

2784

GFP_KERNEL);

2818

GFP_KERNEL);

2785

if (!conf->mirrors)

2819

if (!conf->mirrors)

2786

goto out;

2820

goto out;

2787

2821

2788

conf->tmppage = alloc_page(GFP_KERNEL);

2822

conf->tmppage = alloc_page(GFP_KERNEL);

2789

if (!conf->tmppage)

2823

if (!conf->tmppage)

2790

goto out;

2824

goto out;

2791

2825

2792

2826

2793

conf->raid_disks = mddev->raid_disks;

2827

conf->raid_disks = mddev->raid_disks;

2794

conf->near_copies = nc;

2828

conf->near_copies = nc;

2795

conf->far_copies = fc;

2829

conf->far_copies = fc;

2796

conf->copies = nc*fc;

2830

conf->copies = nc*fc;

2797

conf->far_offset = fo;

2831

conf->far_offset = fo;

2798

conf->chunk_mask = mddev->new_chunk_sectors - 1;

2832

conf->chunk_mask = mddev->new_chunk_sectors - 1;

2799

conf->chunk_shift = ffz(~mddev->new_chunk_sectors);

2833

conf->chunk_shift = ffz(~mddev->new_chunk_sectors);

2800

2834

2801

conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,

2835

conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,

2802

r10bio_pool_free, conf);

2836

r10bio_pool_free, conf);

2803

if (!conf->r10bio_pool)

2837

if (!conf->r10bio_pool)

2804

goto out;

2838

goto out;

2805

2839

2806

size = mddev->dev_sectors >> conf->chunk_shift;

2840

size = mddev->dev_sectors >> conf->chunk_shift;

2807

sector_div(size, fc);

2841

sector_div(size, fc);

2808

size = size * conf->raid_disks;

2842

size = size * conf->raid_disks;

2809

sector_div(size, nc);

2843

sector_div(size, nc);

2810

/* 'size' is now the number of chunks in the array */

2844

/* 'size' is now the number of chunks in the array */

2811

/* calculate "used chunks per device" in 'stride' */

2845

/* calculate "used chunks per device" in 'stride' */

2812

stride = size * conf->copies;

2846

stride = size * conf->copies;

2813

2847

2814

/* We need to round up when dividing by raid_disks to

2848

/* We need to round up when dividing by raid_disks to

2815

* get the stride size.

2849

* get the stride size.

2816

*/

2850

*/

2817

stride += conf->raid_disks - 1;

2851

stride += conf->raid_disks - 1;

2818

sector_div(stride, conf->raid_disks);

2852

sector_div(stride, conf->raid_disks);

2819

2853

2820

conf->dev_sectors = stride << conf->chunk_shift;

2854

conf->dev_sectors = stride << conf->chunk_shift;

2821

2855

2822

if (fo)

2856

if (fo)

2823

stride = 1;

2857

stride = 1;

2824

else

2858

else

2825

sector_div(stride, fc);

2859

sector_div(stride, fc);

2826

conf->stride = stride << conf->chunk_shift;

2860

conf->stride = stride << conf->chunk_shift;

2827

2861

2828

2862

2829

spin_lock_init(&conf->device_lock);

2863

spin_lock_init(&conf->device_lock);

2830

INIT_LIST_HEAD(&conf->retry_list);

2864

INIT_LIST_HEAD(&conf->retry_list);

2831

2865

2832

spin_lock_init(&conf->resync_lock);

2866

spin_lock_init(&conf->resync_lock);

2833

init_waitqueue_head(&conf->wait_barrier);

2867

init_waitqueue_head(&conf->wait_barrier);

2834

2868

2835

conf->thread = md_register_thread(raid10d, mddev, NULL);

2869

conf->thread = md_register_thread(raid10d, mddev, NULL);

2836

if (!conf->thread)

2870

if (!conf->thread)

2837

goto out;

2871

goto out;

2838

2872

2839

conf->mddev = mddev;

2873

conf->mddev = mddev;

2840

return conf;

2874

return conf;

2841

2875

2842

out:

2876

out:

2843

printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",

2877

printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",

2844

mdname(mddev));

2878

mdname(mddev));

2845

if (conf) {

2879

if (conf) {

2846

if (conf->r10bio_pool)

2880

if (conf->r10bio_pool)

2847

mempool_destroy(conf->r10bio_pool);

2881

mempool_destroy(conf->r10bio_pool);

2848

kfree(conf->mirrors);

2882

kfree(conf->mirrors);

2849

safe_put_page(conf->tmppage);

2883

safe_put_page(conf->tmppage);

2850

kfree(conf);

2884

kfree(conf);

2851

}

2885

}

2852

return ERR_PTR(err);

2886

return ERR_PTR(err);

2853

}

2887

}

2854

2888

2855

static int run(struct mddev *mddev)

2889

static int run(struct mddev *mddev)

2856

{

2890

{

2857

struct r10conf *conf;

2891

struct r10conf *conf;

2858

int i, disk_idx, chunk_size;

2892

int i, disk_idx, chunk_size;

2859

struct mirror_info *disk;

2893

struct mirror_info *disk;

2860

struct md_rdev *rdev;

2894

struct md_rdev *rdev;

2861

sector_t size;

2895

sector_t size;

2862

2896

2863

/*

2897

/*

2864

* copy the already verified devices into our private RAID10

2898

* copy the already verified devices into our private RAID10

2865

* bookkeeping area. [whatever we allocate in run(),

2899

* bookkeeping area. [whatever we allocate in run(),

2866

* should be freed in stop()]

2900

* should be freed in stop()]

2867

*/

2901

*/

2868

2902

2869

if (mddev->private == NULL) {

2903

if (mddev->private == NULL) {

2870

conf = setup_conf(mddev);

2904

conf = setup_conf(mddev);

2871

if (IS_ERR(conf))

2905

if (IS_ERR(conf))

2872

return PTR_ERR(conf);

2906

return PTR_ERR(conf);

2873

mddev->private = conf;

2907

mddev->private = conf;

2874

}

2908

}

2875

conf = mddev->private;

2909

conf = mddev->private;

2876

if (!conf)

2910

if (!conf)

2877

goto out;

2911

goto out;

2878

2912

2879

mddev->thread = conf->thread;

2913

mddev->thread = conf->thread;

2880

conf->thread = NULL;

2914

conf->thread = NULL;

2881

2915

2882

chunk_size = mddev->chunk_sectors << 9;

2916

chunk_size = mddev->chunk_sectors << 9;

2883

blk_queue_io_min(mddev->queue, chunk_size);

2917

blk_queue_io_min(mddev->queue, chunk_size);

2884

if (conf->raid_disks % conf->near_copies)

2918

if (conf->raid_disks % conf->near_copies)

2885

blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);

2919

blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);

2886

else

2920

else

2887

blk_queue_io_opt(mddev->queue, chunk_size *

2921

blk_queue_io_opt(mddev->queue, chunk_size *

2888

(conf->raid_disks / conf->near_copies));

2922

(conf->raid_disks / conf->near_copies));

2889

2923

2890

list_for_each_entry(rdev, &mddev->disks, same_set) {

2924

list_for_each_entry(rdev, &mddev->disks, same_set) {

2891

2925

2892

disk_idx = rdev->raid_disk;

2926

disk_idx = rdev->raid_disk;

2893

if (disk_idx >= conf->raid_disks

2927

if (disk_idx >= conf->raid_disks

2894

|| disk_idx < 0)

2928

|| disk_idx < 0)

2895

continue;

2929

continue;

2896

disk = conf->mirrors + disk_idx;

2930

disk = conf->mirrors + disk_idx;

2897

2931

2898

disk->rdev = rdev;

2932

disk->rdev = rdev;

2899

disk_stack_limits(mddev->gendisk, rdev->bdev,

2933

disk_stack_limits(mddev->gendisk, rdev->bdev,

2900

rdev->data_offset << 9);

2934

rdev->data_offset << 9);

2901

/* as we don't honour merge_bvec_fn, we must never risk

2935

/* as we don't honour merge_bvec_fn, we must never risk

2902

* violating it, so limit max_segments to 1 lying

2936

* violating it, so limit max_segments to 1 lying

2903

* within a single page.

2937

* within a single page.

2904

*/

2938

*/

2905

if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {

2939

if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {

2906

blk_queue_max_segments(mddev->queue, 1);

2940

blk_queue_max_segments(mddev->queue, 1);

2907

blk_queue_segment_boundary(mddev->queue,

2941

blk_queue_segment_boundary(mddev->queue,

2908

PAGE_CACHE_SIZE - 1);

2942

PAGE_CACHE_SIZE - 1);

2909

}

2943

}

2910

2944

2911

disk->head_position = 0;

2945

disk->head_position = 0;

2912

}

2946

}

2913

/* need to check that every block has at least one working mirror */

2947

/* need to check that every block has at least one working mirror */

2914

if (!enough(conf, -1)) {

2948

if (!enough(conf, -1)) {

2915

printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",

2949

printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",

2916

mdname(mddev));

2950

mdname(mddev));

2917

goto out_free_conf;

2951

goto out_free_conf;

2918

}

2952

}

2919

2953

2920

mddev->degraded = 0;

2954

mddev->degraded = 0;

2921

for (i = 0; i < conf->raid_disks; i++) {

2955

for (i = 0; i < conf->raid_disks; i++) {

2922

2956

2923

disk = conf->mirrors + i;

2957

disk = conf->mirrors + i;

2924

2958

2925

if (!disk->rdev ||

2959

if (!disk->rdev ||

2926

!test_bit(In_sync, &disk->rdev->flags)) {

2960

!test_bit(In_sync, &disk->rdev->flags)) {

2927

disk->head_position = 0;

2961

disk->head_position = 0;

2928

mddev->degraded++;

2962

mddev->degraded++;

2929

if (disk->rdev)

2963

if (disk->rdev)

2930

conf->fullsync = 1;

2964

conf->fullsync = 1;

2931

}

2965

}

2932

disk->recovery_disabled = mddev->recovery_disabled - 1;

2966

disk->recovery_disabled = mddev->recovery_disabled - 1;

2933

}

2967

}

2934

2968

2935

if (mddev->recovery_cp != MaxSector)

2969

if (mddev->recovery_cp != MaxSector)

2936

printk(KERN_NOTICE "md/raid10:%s: not clean"

2970

printk(KERN_NOTICE "md/raid10:%s: not clean"

2937

" -- starting background reconstruction\n",

2971

" -- starting background reconstruction\n",

2938

mdname(mddev));

2972

mdname(mddev));

2939

printk(KERN_INFO

2973

printk(KERN_INFO

2940

"md/raid10:%s: active with %d out of %d devices\n",

2974

"md/raid10:%s: active with %d out of %d devices\n",

2941

mdname(mddev), conf->raid_disks - mddev->degraded,

2975

mdname(mddev), conf->raid_disks - mddev->degraded,

2942

conf->raid_disks);

2976

conf->raid_disks);

2943

/*

2977

/*

2944

* Ok, everything is just fine now

2978

* Ok, everything is just fine now

2945

*/

2979

*/

2946

mddev->dev_sectors = conf->dev_sectors;

2980

mddev->dev_sectors = conf->dev_sectors;

2947

size = raid10_size(mddev, 0, 0);

2981

size = raid10_size(mddev, 0, 0);

2948

md_set_array_sectors(mddev, size);

2982

md_set_array_sectors(mddev, size);

2949

mddev->resync_max_sectors = size;

2983

mddev->resync_max_sectors = size;

2950

2984

2951

mddev->queue->backing_dev_info.congested_fn = raid10_congested;

2985

mddev->queue->backing_dev_info.congested_fn = raid10_congested;

2952

mddev->queue->backing_dev_info.congested_data = mddev;

2986

mddev->queue->backing_dev_info.congested_data = mddev;

2953

2987

2954

/* Calculate max read-ahead size.

2988

/* Calculate max read-ahead size.

2955

* We need to readahead at least twice a whole stripe....

2989

* We need to readahead at least twice a whole stripe....

2956

* maybe...

2990

* maybe...

2957

*/

2991

*/

2958

{

2992

{

2959

int stripe = conf->raid_disks *

2993

int stripe = conf->raid_disks *

2960

((mddev->chunk_sectors << 9) / PAGE_SIZE);

2994

((mddev->chunk_sectors << 9) / PAGE_SIZE);

2961

stripe /= conf->near_copies;

2995

stripe /= conf->near_copies;

2962

if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)

2996

if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)

2963

mddev->queue->backing_dev_info.ra_pages = 2* stripe;

2997

mddev->queue->backing_dev_info.ra_pages = 2* stripe;

2964

}

2998

}

2965

2999

2966

if (conf->near_copies < conf->raid_disks)

3000

if (conf->near_copies < conf->raid_disks)

2967

blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);

3001

blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);

2968

3002

2969

if (md_integrity_register(mddev))

3003

if (md_integrity_register(mddev))

2970

goto out_free_conf;

3004

goto out_free_conf;

2971

3005

2972

return 0;

3006

return 0;

2973

3007

2974

out_free_conf:

3008

out_free_conf:

2975

md_unregister_thread(&mddev->thread);

3009

md_unregister_thread(&mddev->thread);

2976

if (conf->r10bio_pool)

3010

if (conf->r10bio_pool)

2977

mempool_destroy(conf->r10bio_pool);

3011

mempool_destroy(conf->r10bio_pool);

2978

safe_put_page(conf->tmppage);

3012

safe_put_page(conf->tmppage);

2979

kfree(conf->mirrors);

3013

kfree(conf->mirrors);

2980

kfree(conf);

3014

kfree(conf);

2981

mddev->private = NULL;

3015

mddev->private = NULL;

2982

out:

3016

out:

2983

return -EIO;

3017

return -EIO;

2984

}

3018

}

2985

3019

2986

static int stop(struct mddev *mddev)

3020

static int stop(struct mddev *mddev)

2987

{

3021

{

2988

struct r10conf *conf = mddev->private;

3022

struct r10conf *conf = mddev->private;

2989

3023

2990

raise_barrier(conf, 0);

3024

raise_barrier(conf, 0);

2991

lower_barrier(conf);

3025

lower_barrier(conf);

2992

3026

2993

md_unregister_thread(&mddev->thread);

3027

md_unregister_thread(&mddev->thread);

2994

blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/

3028

blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/

2995

if (conf->r10bio_pool)

3029

if (conf->r10bio_pool)

2996

mempool_destroy(conf->r10bio_pool);

3030

mempool_destroy(conf->r10bio_pool);

2997

kfree(conf->mirrors);

3031

kfree(conf->mirrors);

2998

kfree(conf);

3032

kfree(conf);

2999

mddev->private = NULL;

3033

mddev->private = NULL;

3000

return 0;

3034

return 0;

3001

}

3035

}

3002

3036

3003

static void raid10_quiesce(struct mddev *mddev, int state)

3037

static void raid10_quiesce(struct mddev *mddev, int state)

3004

{

3038

{

3005

struct r10conf *conf = mddev->private;

3039

struct r10conf *conf = mddev->private;

3006

3040

3007

switch(state) {

3041

switch(state) {

3008

case 1:

3042

case 1:

3009

raise_barrier(conf, 0);

3043

raise_barrier(conf, 0);

3010

break;

3044

break;

3011

case 0:

3045

case 0:

3012

lower_barrier(conf);

3046

lower_barrier(conf);

3013

break;

3047

break;

3014

}

3048

}

3015

}

3049

}

3016

3050

3017

static void *raid10_takeover_raid0(struct mddev *mddev)

3051

static void *raid10_takeover_raid0(struct mddev *mddev)

3018

{

3052

{

3019

struct md_rdev *rdev;

3053

struct md_rdev *rdev;

3020

struct r10conf *conf;

3054

struct r10conf *conf;

3021

3055

3022

if (mddev->degraded > 0) {

3056

if (mddev->degraded > 0) {

3023

printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",

3057

printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",

3024

mdname(mddev));

3058

mdname(mddev));

3025

return ERR_PTR(-EINVAL);

3059

return ERR_PTR(-EINVAL);

3026

}

3060

}

3027

3061

3028

/* Set new parameters */

3062

/* Set new parameters */

3029

mddev->new_level = 10;

3063

mddev->new_level = 10;

3030

/* new layout: far_copies = 1, near_copies = 2 */

3064

/* new layout: far_copies = 1, near_copies = 2 */

3031

mddev->new_layout = (1<<8) + 2;

3065

mddev->new_layout = (1<<8) + 2;

3032

mddev->new_chunk_sectors = mddev->chunk_sectors;

3066

mddev->new_chunk_sectors = mddev->chunk_sectors;

3033

mddev->delta_disks = mddev->raid_disks;

3067

mddev->delta_disks = mddev->raid_disks;

3034

mddev->raid_disks *= 2;

3068

mddev->raid_disks *= 2;

3035

/* make sure it will be not marked as dirty */

3069

/* make sure it will be not marked as dirty */

3036

mddev->recovery_cp = MaxSector;

3070

mddev->recovery_cp = MaxSector;

3037

3071

3038

conf = setup_conf(mddev);

3072

conf = setup_conf(mddev);

3039

if (!IS_ERR(conf)) {

3073

if (!IS_ERR(conf)) {

3040

list_for_each_entry(rdev, &mddev->disks, same_set)

3074

list_for_each_entry(rdev, &mddev->disks, same_set)

3041

if (rdev->raid_disk >= 0)

3075

if (rdev->raid_disk >= 0)

3042

rdev->new_raid_disk = rdev->raid_disk * 2;

3076

rdev->new_raid_disk = rdev->raid_disk * 2;

3043

conf->barrier = 1;

3077

conf->barrier = 1;

3044

}

3078

}

3045

3079

3046

return conf;

3080

return conf;

3047

}

3081

}

3048

3082

3049

static void *raid10_takeover(struct mddev *mddev)

3083

static void *raid10_takeover(struct mddev *mddev)

3050

{

3084

{

3051

struct r0conf *raid0_conf;

3085

struct r0conf *raid0_conf;

3052

3086

3053

/* raid10 can take over:

3087

/* raid10 can take over:

3054

* raid0 - providing it has only two drives

3088

* raid0 - providing it has only two drives

3055

*/

3089

*/

3056

if (mddev->level == 0) {

3090

if (mddev->level == 0) {

3057

/* for raid0 takeover only one zone is supported */

3091

/* for raid0 takeover only one zone is supported */

3058

raid0_conf = mddev->private;

3092

raid0_conf = mddev->private;

3059

if (raid0_conf->nr_strip_zones > 1) {

3093

if (raid0_conf->nr_strip_zones > 1) {

3060

printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"

3094

printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"

3061

" with more than one zone.\n",

3095

" with more than one zone.\n",

3062

mdname(mddev));

3096

mdname(mddev));

3063

return ERR_PTR(-EINVAL);

3097

return ERR_PTR(-EINVAL);

3064

}

3098

}

3065

return raid10_takeover_raid0(mddev);

3099

return raid10_takeover_raid0(mddev);

3066

}

3100

}

3067

return ERR_PTR(-EINVAL);

3101

return ERR_PTR(-EINVAL);

3068

}

3102

}

3069

3103

3070

static struct md_personality raid10_personality =

3104

static struct md_personality raid10_personality =

3071

{

3105

{

3072

.name = "raid10",

3106

.name = "raid10",

3073

.level = 10,

3107

.level = 10,

3074

.owner = THIS_MODULE,

3108

.owner = THIS_MODULE,

3075

.make_request = make_request,

3109

.make_request = make_request,

3076

.run = run,

3110

.run = run,

3077

.stop = stop,

3111

.stop = stop,

3078

.status = status,

3112

.status = status,

3079

.error_handler = error,

3113

.error_handler = error,

3080

.hot_add_disk = raid10_add_disk,

3114

.hot_add_disk = raid10_add_disk,

3081

.hot_remove_disk= raid10_remove_disk,

3115

.hot_remove_disk= raid10_remove_disk,

3082

.spare_active = raid10_spare_active,

3116

.spare_active = raid10_spare_active,

3083

.sync_request = sync_request,

3117

.sync_request = sync_request,

3084

.quiesce = raid10_quiesce,

3118

.quiesce = raid10_quiesce,

3085

.size = raid10_size,

3119

.size = raid10_size,

3086

.takeover = raid10_takeover,

3120

.takeover = raid10_takeover,

3087

};

3121

};

3088

3122

3089

static int __init raid_init(void)

3123

static int __init raid_init(void)

3090

{

3124

{

3091

return register_md_personality(&raid10_personality);

3125

return register_md_personality(&raid10_personality);

3092

}

3126

}

3093

3127

3094

static void raid_exit(void)

3128

static void raid_exit(void)

3095

{

3129

{

3096

unregister_md_personality(&raid10_personality);

3130

unregister_md_personality(&raid10_personality);

3097

}

3131

}

3098

3132

3099

module_init(raid_init);

3133

module_init(raid_init);

3100

module_exit(raid_exit);

3134

module_exit(raid_exit);

3101

MODULE_LICENSE("GPL");

3135

MODULE_LICENSE("GPL");

3102

MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");

3136

MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");

3103

MODULE_ALIAS("md-personality-9"); /* RAID10 */

3137

MODULE_ALIAS("md-personality-9"); /* RAID10 */

3104

MODULE_ALIAS("md-raid10");

3138

MODULE_ALIAS("md-raid10");

3105

MODULE_ALIAS("md-level-10");

3139

MODULE_ALIAS("md-level-10");

3106

3140

3107

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);

3141

module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);

3108

3142

GITLAB

md/raid10: prepare data structures for handling replacement.

 /*
  * raid10.c : Multiple Devices driver for Linux
  *
  * Copyright (C) 2000-2004 Neil Brown
  *
  * RAID-10 support for md.
  *
  * Based on code in raid1.c.  See raid1.c for further copyright information.
  *
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2, or (at your option)
  * any later version.
  *
  * You should have received a copy of the GNU General Public License
  * (for example /usr/src/linux/COPYING); if not, write to the Free
  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  */
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/blkdev.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
 #include "md.h"
 #include "raid10.h"
 #include "raid0.h"
 #include "bitmap.h"
 /*
  * RAID10 provides a combination of RAID0 and RAID1 functionality.
  * The layout of data is defined by
  *    chunk_size
  *    raid_disks
  *    near_copies (stored in low byte of layout)
  *    far_copies (stored in second byte of layout)
  *    far_offset (stored in bit 16 of layout )
  *
  * The data to be stored is divided into chunks using chunksize.
  * Each device is divided into far_copies sections.
  * In each section, chunks are laid out in a style similar to raid0, but
  * near_copies copies of each chunk is stored (each on a different drive).
  * The starting device for each section is offset near_copies from the starting
  * device of the previous section.
  * Thus there are (near_copies*far_copies) copies of each chunk, and each is on a different
  * drive.
  * near_copies and far_copies must be at least one, and their product is at most
  * raid_disks.
  *
  * If far_offset is true, then the far_copies are handled a bit differently.
  * The copies are still in different stripes, but instead of being very far apart
  * on disk, they are in adjacent stripes.
  */
 /*
  * Number of guaranteed r10bios in case of extreme VM load:
  */
 #define	NR_RAID10_BIOS 256
 /* When there are this many requests queued to be written by
  * the raid10 thread, we become 'congested' to provide back-pressure
  * for writeback.
  */
 static int max_queued_requests = 1024;
 /* Forward declarations: both are defined further down, after their first use. */
 static void allow_barrier(struct r10conf *conf);
 static void lower_barrier(struct r10conf *conf);
 /*
  * Allocate one zeroed r10bio.  'data' is the r10conf; the allocation size
  * is the offset of devs[conf->copies], i.e. the fixed part of the struct
  * plus conf->copies entries of the trailing devs[] array.
  * NOTE(review): the comment in the body speaks of "raid_disks entries",
  * but the size is computed from conf->copies - confirm which is intended.
  */
 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
 	int size = offsetof(struct r10bio, devs[conf->copies]);
-	/* allocate a r10bio with room for raid_disks entries in the bios array */
+	/* allocate a r10bio with room for raid_disks entries in the
+	 * bios array */
 	return kzalloc(size, gfp_flags);
 }
 /* Release an r10bio with kfree(); the 'data' argument is not used. */
 static void r10bio_pool_free(void *r10_bio, void *data)
 {
 	kfree(r10_bio);
 }
 /* Maximum size of each resync request, in bytes */
 #define RESYNC_BLOCK_SIZE (64*1024)
 /* number of pages needed to hold RESYNC_BLOCK_SIZE bytes, rounded up */
 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
 /* amount of memory to reserve for resync requests */
 #define RESYNC_WINDOW (1024*1024)
 /* maximum number of concurrent requests, memory permitting */
 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
 /*
  * When performing a resync, we need to read and compare, so
  * we need as many pages as there are copies.
  * When performing a recovery, we need 2 bios, one for read,
  * one for write (we recover only one drive per r10buf)
  *
  */
 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
 {
 	struct r10conf *conf = data;
 	struct page *page;
 	struct r10bio *r10_bio;
 	struct bio *bio;
 	int i, j;
 	int nalloc;
 	r10_bio = r10bio_pool_alloc(gfp_flags, conf);
 	if (!r10_bio)
 		return NULL;
 	if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery))
 		nalloc = conf->copies; /* resync */
 	else
 		nalloc = 2; /* recovery */
 	/*
 	 * Allocate bios.
 	 * j counts down from nalloc-1 to 0, so on failure the slots above j
 	 * are the ones that have already been filled in.
 	 */
 	for (j = nalloc ; j-- ; ) {
 		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 		if (!bio)
 			goto out_free_bio;
 		r10_bio->devs[j].bio = bio;
+		if (!conf->have_replacement)
+			continue;
+		bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
 		/*
 		 * NOTE(review): devs[j].bio has already been stored for this j,
 		 * but the out_free_bio loop starts at j+1, so it looks like that
 		 * bio is not released when this second allocation fails - confirm.
 		 */
+		if (!bio)
+			goto out_free_bio;
+		r10_bio->devs[j].repl_bio = bio;
 	}
 	/*
 	 * Allocate RESYNC_PAGES data pages and attach them
 	 * where needed.
 	 */
 	for (j = 0 ; j < nalloc; j++) {
+		struct bio *rbio = r10_bio->devs[j].repl_bio;
 		bio = r10_bio->devs[j].bio;
 		for (i = 0; i < RESYNC_PAGES; i++) {
 			if (j == 1 && !test_bit(MD_RECOVERY_SYNC,
 						&conf->mddev->recovery)) {
 				/* we can share bv_page's during recovery */
 				/* this inner 'rbio' shadows the outer one for this scope only */
 				struct bio *rbio = r10_bio->devs[0].bio;
 				page = rbio->bi_io_vec[i].bv_page;
 				get_page(page);
 			} else
 				page = alloc_page(gfp_flags);
 			if (unlikely(!page))
 				goto out_free_pages;
 			bio->bi_io_vec[i].bv_page = page;
 			/* the replacement bio points at the same page; no extra reference is taken here */
+			if (rbio)
+				rbio->bi_io_vec[i].bv_page = page;
 		}
 	}
 	return r10_bio;
 out_free_pages:
 	/* drop the pages already attached to the bio being filled ... */
 	for ( ; i > 0 ; i--)
 		safe_put_page(bio->bi_io_vec[i-1].bv_page);
 	/* ... then every page of the slots completed before it */
 	while (j--)
 		for (i = 0; i < RESYNC_PAGES ; i++)
 			safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
 	/* restart at -1 so the loop below visits every slot 0..nalloc-1 */
 	j = -1;
 out_free_bio:
-	while ( ++j < nalloc )
+	while (++j < nalloc) {
 		bio_put(r10_bio->devs[j].bio);
+		if (r10_bio->devs[j].repl_bio)
+			bio_put(r10_bio->devs[j].repl_bio);
+	}
 	r10bio_pool_free(r10_bio, conf);
 	return NULL;
 }
 /*
  * Tear down an r10bio and the bios attached to it.  'data' is the r10conf.
  * For each of the conf->copies slots: every page of devs[j].bio is put and
  * its bv_page cleared, then the bio is put.  A replacement bio, when
  * present, is only bio_put(); its pages are not put separately here.
  * Finally the r10bio itself goes back through r10bio_pool_free().
  */
 static void r10buf_pool_free(void *__r10_bio, void *data)
 {
 	int i;
 	struct r10conf *conf = data;
 	struct r10bio *r10bio = __r10_bio;
 	int j;
 	for (j=0; j < conf->copies; j++) {
 		struct bio *bio = r10bio->devs[j].bio;
 		if (bio) {
 			for (i = 0; i < RESYNC_PAGES; i++) {
 				safe_put_page(bio->bi_io_vec[i].bv_page);
 				bio->bi_io_vec[i].bv_page = NULL;
 			}
 			bio_put(bio);
 		}
+		bio = r10bio->devs[j].repl_bio;
+		if (bio)
+			bio_put(bio);
 	}
 	r10bio_pool_free(r10bio, conf);
 }
 /*
  * Drop the per-slot bios of an r10bio and reset the slot pointers to NULL.
  * devs[i].bio is put unless BIO_SPECIAL() says otherwise; devs[i].repl_bio
  * is additionally only put when r10_bio->read_slot is negative.
  * NOTE(review): BIO_SPECIAL is defined outside this view - presumably it
  * filters NULL and marker values such as IO_BLOCKED / IO_MADE_GOOD; confirm.
  */
 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int i;
 	for (i = 0; i < conf->copies; i++) {
 		struct bio **bio = & r10_bio->devs[i].bio;
 		if (!BIO_SPECIAL(*bio))
 			bio_put(*bio);
 		*bio = NULL;
+		bio = &r10_bio->devs[i].repl_bio;
+		if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
+			bio_put(*bio);
+		*bio = NULL;
 	}
 }
 /* Drop every bio still attached to r10_bio, then return it to r10bio_pool. */
 static void free_r10bio(struct r10bio *r10_bio)
 {
 	struct r10conf *cf = r10_bio->mddev->private;
 	put_all_bios(cf, r10_bio);
 	mempool_free(r10_bio, cf->r10bio_pool);
 }
 /* Return r10_bio to r10buf_pool, then lower the barrier once. */
 static void put_buf(struct r10bio *r10_bio)
 {
 	struct r10conf *cf = r10_bio->mddev->private;
 	mempool_free(r10_bio, cf->r10buf_pool);
 	lower_barrier(cf);
 }
 /*
  * Queue r10_bio on conf->retry_list and count it in nr_queued (both under
  * device_lock), then wake anyone sleeping on wait_barrier and the md thread.
  */
 static void reschedule_retry(struct r10bio *r10_bio)
 {
 	unsigned long flags;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	spin_lock_irqsave(&conf->device_lock, flags);
 	list_add(&r10_bio->retry_list, &conf->retry_list);
 	conf->nr_queued ++;
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	/* wake up frozen array... */
 	wake_up(&conf->wait_barrier);
 	md_wakeup_thread(mddev->thread);
 }
 /*
  * raid_end_bio_io() is called when we have finished servicing a mirrored
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
 static void raid_end_bio_io(struct r10bio *r10_bio)
 {
 	struct bio *bio = r10_bio->master_bio;
 	int done;
 	struct r10conf *conf = r10_bio->mddev->private;
 	/*
 	 * A non-zero bi_phys_segments is treated here as a count of parts
 	 * of the master bio still outstanding: drop one under device_lock
 	 * and only complete the master bio when the count reaches zero.
 	 */
 	if (bio->bi_phys_segments) {
 		unsigned long flags;
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio->bi_phys_segments--;
 		done = (bio->bi_phys_segments == 0);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	} else
 		done = 1;
 	/* a part that never became uptodate marks the whole master bio as failed */
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
 		clear_bit(BIO_UPTODATE, &bio->bi_flags);
 	if (done) {
 		bio_endio(bio, 0);
 		/*
 		 * Wake up any possible resync thread that waits for the device
 		 * to go idle.
 		 */
 		allow_barrier(conf);
 	}
 	/* the r10bio is released whether or not the master bio completed */
 	free_r10bio(r10_bio);
 }
 /*
  * Update disk head position estimator based on IRQ completion info.
  */
 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = r10_bio->mddev->private;
 	conf->mirrors[r10_bio->devs[slot].devnum].head_position =
 		r10_bio->devs[slot].addr + (r10_bio->sectors);
 }
 /*
  * Find the disk number which triggered given bio
  */
 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
-			 struct bio *bio, int *slotp)
+			 struct bio *bio, int *slotp, int *replp)
 {
 	int slot;
+	int repl = 0;
 	/* scan the slots for the one whose bio (or replacement bio) is 'bio' */
-	for (slot = 0; slot < conf->copies; slot++)
+	for (slot = 0; slot < conf->copies; slot++) {
 		if (r10_bio->devs[slot].bio == bio)
 			break;
+		if (r10_bio->devs[slot].repl_bio == bio) {
+			repl = 1;
+			break;
+		}
+	}
 	/* the bio must belong to this r10bio */
 	BUG_ON(slot == conf->copies);
 	update_head_pos(slot, r10_bio);
 	/* both output pointers are optional */
 	if (slotp)
 		*slotp = slot;
+	if (replp)
+		*replp = repl;
 	return r10_bio->devs[slot].devnum;
 }
 /*
  * Completion handling for a read bio whose bi_private is the r10bio.
  * Success is taken from BIO_UPTODATE (the 'error' argument is not read).
  * On success the r10bio is marked uptodate, the master bio is ended and the
  * rdev's pending count is dropped; on failure the r10bio is flagged with
  * R10BIO_ReadError and queued for retry, and the rdev reference is kept.
  */
 static void raid10_end_read_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int slot, dev;
 	struct r10conf *conf = r10_bio->mddev->private;
 	slot = r10_bio->read_slot;
 	dev = r10_bio->devs[slot].devnum;
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	update_head_pos(slot, r10_bio);
 	if (uptodate) {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code to the higher
 		 * levels even if IO on some other mirrored buffer fails.
 		 *
 		 * The 'master' represents the composite IO operation to
 		 * user-side. So if something waits for IO, then it will
 		 * wait for the 'master' bio.
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
 		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		printk_ratelimited(KERN_ERR
 				   "md/raid10:%s: %s: rescheduling sector %llu\n",
 				   mdname(conf->mddev),
 				   bdevname(conf->mirrors[dev].rdev->bdev, b),
 				   (unsigned long long)r10_bio->sector);
 		set_bit(R10BIO_ReadError, &r10_bio->state);
 		reschedule_retry(r10_bio);
 	}
 }
 static void close_write(struct r10bio *r10_bio)
 {
 	/* clear the bitmap if all writes complete successfully */
 	bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
 			r10_bio->sectors,
 			!test_bit(R10BIO_Degraded, &r10_bio->state),
 			0);
 	md_write_end(r10_bio->mddev);
 }
 /*
  * Account for one finished write.  Only the caller that brings 'remaining'
  * to zero goes on to finish the request.
  */
 static void one_write_done(struct r10bio *r10_bio)
 {
 	if (!atomic_dec_and_test(&r10_bio->remaining))
 		return;
 	/* a write error is handed to the retry path without closing the write */
 	if (test_bit(R10BIO_WriteError, &r10_bio->state)) {
 		reschedule_retry(r10_bio);
 		return;
 	}
 	close_write(r10_bio);
 	/* R10BIO_MadeGood also goes through the retry path; otherwise finish now */
 	if (test_bit(R10BIO_MadeGood, &r10_bio->state))
 		reschedule_retry(r10_bio);
 	else
 		raid_end_bio_io(r10_bio);
 }
 /*
  * Completion handling for a write bio whose bi_private is the r10bio.
  * Success is taken from BIO_UPTODATE (the 'error' argument is not read).
  * dec_rdev records whether the rdev's pending count is dropped at the end;
  * it is cleared on a write error and when the slot is marked IO_MADE_GOOD.
  */
 static void raid10_end_write_request(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	int dev;
 	int dec_rdev = 1;
 	struct r10conf *conf = r10_bio->mddev->private;
 	int slot;
-	dev = find_bio_disk(conf, r10_bio, bio, &slot);
+	dev = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
 	/*
 	 * this branch is our 'one mirror IO has finished' event handler:
 	 */
 	if (!uptodate) {
 		set_bit(WriteErrorSeen,	&conf->mirrors[dev].rdev->flags);
 		set_bit(R10BIO_WriteError, &r10_bio->state);
 		dec_rdev = 0;
 	} else {
 		/*
 		 * Set R10BIO_Uptodate in our master bio, so that
 		 * we will return a good error code for to the higher
 		 * levels even if IO on some other mirrored buffer fails.
 		 *
 		 * The 'master' represents the composite IO operation to
 		 * user-side. So if something waits for IO, then it will
 		 * wait for the 'master' bio.
 		 */
 		sector_t first_bad;
 		int bad_sectors;
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		/* Maybe we can clear some bad blocks. */
 		if (is_badblock(conf->mirrors[dev].rdev,
 				r10_bio->devs[slot].addr,
 				r10_bio->sectors,
 				&first_bad, &bad_sectors)) {
 			/* the bio is dropped and the slot holds a marker value instead */
 			bio_put(bio);
 			r10_bio->devs[slot].bio = IO_MADE_GOOD;
 			dec_rdev = 0;
 			set_bit(R10BIO_MadeGood, &r10_bio->state);
 		}
 	}
 	/*
 	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	one_write_done(r10_bio);
 	if (dec_rdev)
 		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 /*
  * RAID10 layout manager
  * As well as the chunksize and raid_disks count, there are two
  * parameters: near_copies and far_copies.
  * near_copies * far_copies must be <= raid_disks.
  * Normally one of these will be 1.
  * If both are 1, we get raid0.
  * If near_copies == raid_disks, we get raid1.
  *
  * Chunks are laid out in raid0 style with near_copies copies of the
  * first chunk, followed by near_copies copies of the next chunk and
  * so on.
  * If far_copies > 1, then after 1/far_copies of the array has been assigned
  * as described above, we start again with a device offset of near_copies.
  * So we effectively have another copy of the whole array further down all
  * the drives, but with blocks on different drives.
  * With this layout, any block is never stored twice on the one device.
  *
  * raid10_find_phys finds the sector offset of a given virtual sector
  * on each device that it is on.
  *
  * raid10_find_virt does the reverse mapping, from a device and a
  * sector offset to a virtual address
  */
 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
 {
 	int n,f;
 	sector_t sector;
 	sector_t chunk;
 	sector_t stripe;
 	int dev;
 	int slot = 0;
 	/* now calculate first sector/dev */
 	chunk = r10bio->sector >> conf->chunk_shift;
 	sector = r10bio->sector & conf->chunk_mask;
 	chunk *= conf->near_copies;
 	stripe = chunk;
 	/* sector_div() leaves the quotient in 'stripe' and returns the remainder */
 	dev = sector_div(stripe, conf->raid_disks);
 	if (conf->far_offset)
 		stripe *= conf->far_copies;
 	sector += stripe << conf->chunk_shift;
 	/* and calculate all the others */
 	/* fills devs[] in order: each near copy is followed by its far copies */
 	for (n=0; n < conf->near_copies; n++) {
 		int d = dev;
 		sector_t s = sector;
 		r10bio->devs[slot].addr = sector;
 		r10bio->devs[slot].devnum = d;
 		slot++;
 		for (f = 1; f < conf->far_copies; f++) {
 			/* each far copy: near_copies devices on (wrapping), one stride further */
 			d += conf->near_copies;
 			if (d >= conf->raid_disks)
 				d -= conf->raid_disks;
 			s += conf->stride;
 			r10bio->devs[slot].devnum = d;
 			r10bio->devs[slot].addr = s;
 			slot++;
 		}
 		dev++;
 		/* wrapping past the last device moves on by one chunk (chunk_mask + 1 sectors) */
 		if (dev >= conf->raid_disks) {
 			dev = 0;
 			sector += (conf->chunk_mask + 1);
 		}
 	}
 	/* exactly conf->copies slots must have been filled */
 	BUG_ON(slot != conf->copies);
 }
 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
 {
 	sector_t offset, chunk, vchunk;
 	/* offset within the chunk is carried over unchanged */
 	offset = sector & conf->chunk_mask;
 	if (conf->far_offset) {
 		int fc;
 		chunk = sector >> conf->chunk_shift;
 		/* sector_div() leaves the quotient in 'chunk' and returns the remainder */
 		fc = sector_div(chunk, conf->far_copies);
 		/* step the device back by near_copies per far-copy index, wrapping */
 		dev -= fc * conf->near_copies;
 		if (dev < 0)
 			dev += conf->raid_disks;
 	} else {
 		/* peel off one stride at a time, rotating the device back each time */
 		while (sector >= conf->stride) {
 			sector -= conf->stride;
 			if (dev < conf->near_copies)
 				dev += conf->raid_disks - conf->near_copies;
 			else
 				dev -= conf->near_copies;
 		}
 		chunk = sector >> conf->chunk_shift;
 	}
 	vchunk = chunk * conf->raid_disks + dev;
 	/* only the quotient is wanted; it is left in 'vchunk' */
 	sector_div(vchunk, conf->near_copies);
 	return (vchunk << conf->chunk_shift) + offset;
 }
 /**
  *	raid10_mergeable_bvec -- tell bio layer if two requests can be merged
  *	@q: request queue
  *	@bvm: properties of new bio
  *	@biovec: the request that could be merged to it.
  *
  *	Return amount of bytes we can accept at this offset
  *      If near_copies == raid_disk, there are no striping issues,
  *      but in that case, the function isn't called at all.
  */
 static int raid10_mergeable_bvec(struct request_queue *q,
 				 struct bvec_merge_data *bvm,
 				 struct bio_vec *biovec)
 {
 	struct mddev *mddev = q->queuedata;
 	unsigned int chunk_sectors = mddev->chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
 	/* bytes between the end of the bio as it stands and the end of its chunk */
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0)
 		max = 0; /* bio_add cannot handle a negative return */
 	/*
 	 * While the bio is still empty and the room left is no larger than
 	 * this vector, report the vector's full length.
 	 */
 	if (bio_sectors == 0 && max <= biovec->bv_len)
 		return biovec->bv_len;
 	return max;
 }
 /*
  * This routine returns the disk from which the requested read should
  * be done. There is a per-array 'next expected sequential IO' sector
  * number - if this matches on the next IO then we use the last disk.
  * There is also a per-disk 'last known head position' sector that is
  * maintained from IRQ contexts, both the normal and the resync IO
  * completion handlers update this position correctly. If there is no
  * perfect sequential match then we pick the disk whose head is closest.
  *
  * If there are 2 mirrors in the same 2 devices, performance degrades
  * because position is mirror, not device based.
  *
  * The rdev for the device selected will have nr_pending incremented.
  */
 /*
  * FIXME: possibly should rethink readbalancing and do it differently
  * depending on near_copies / far_copies geometry.
  */
 /*
  * Returns the chosen disk number, or -1 when no slot is usable.  On success
  * r10_bio->read_slot is set and the rdev's nr_pending has been incremented.
  * *max_sectors receives the number of sectors that can be read from the
  * chosen device (best_good_sectors).
  */
 static int read_balance(struct r10conf *conf, struct r10bio *r10_bio, int *max_sectors)
 {
 	const sector_t this_sector = r10_bio->sector;
 	int disk, slot;
 	int sectors = r10_bio->sectors;
 	int best_good_sectors;
 	sector_t new_distance, best_dist;
 	struct md_rdev *rdev;
 	int do_balance;
 	int best_slot;
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
 retry:
 	/* every retry starts the search again from a clean state */
 	sectors = r10_bio->sectors;
 	best_slot = -1;
 	best_dist = MaxSector;
 	best_good_sectors = 0;
 	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
 	 * the resync window. We take the first readable disk when
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
 	    && (this_sector + sectors >= conf->next_resync))
 		do_balance = 0;
 	for (slot = 0; slot < conf->copies ; slot++) {
 		sector_t first_bad;
 		int bad_sectors;
 		sector_t dev_sector;
 		/* skip slots that are blocked, have no device, or are not in sync */
 		if (r10_bio->devs[slot].bio == IO_BLOCKED)
 			continue;
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (rdev == NULL)
 			continue;
 		if (!test_bit(In_sync, &rdev->flags))
 			continue;
 		dev_sector = r10_bio->devs[slot].addr;
 		if (is_badblock(rdev, dev_sector, sectors,
 				&first_bad, &bad_sectors)) {
 			if (best_dist < MaxSector)
 				/* Already have a better slot */
 				continue;
 			if (first_bad <= dev_sector) {
 				/* Cannot read here.  If this is the
 				 * 'primary' device, then we must not read
 				 * beyond 'bad_sectors' from another device.
 				 */
 				bad_sectors -= (dev_sector - first_bad);
 				if (!do_balance && sectors > bad_sectors)
 					sectors = bad_sectors;
 				if (best_good_sectors > sectors)
 					best_good_sectors = sectors;
 			} else {
 				/* the range starts good: remember the longest good prefix seen */
 				sector_t good_sectors =
 					first_bad - dev_sector;
 				if (good_sectors > best_good_sectors) {
 					best_good_sectors = good_sectors;
 					best_slot = slot;
 				}
 				if (!do_balance)
 					/* Must read from here */
 					break;
 			}
 			continue;
 		} else
 			best_good_sectors = sectors;
 		/* not balancing: the first fully readable slot is used */
 		if (!do_balance)
 			break;
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
 		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
 			new_distance = r10_bio->devs[slot].addr;
 		else
 			new_distance = abs(r10_bio->devs[slot].addr -
 					   conf->mirrors[disk].head_position);
 		if (new_distance < best_dist) {
 			best_dist = new_distance;
 			best_slot = slot;
 		}
 	}
 	/* the loop ran to completion without a 'break': fall back to the best candidate */
 	if (slot == conf->copies)
 		slot = best_slot;
 	if (slot >= 0) {
 		disk = r10_bio->devs[slot].devnum;
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		/* the device may have gone away since the scan; search again */
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
 		if (test_bit(Faulty, &rdev->flags)) {
 			/* Cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
 			rdev_dec_pending(rdev, conf->mddev);
 			goto retry;
 		}
 		r10_bio->read_slot = slot;
 	} else
 		disk = -1;
 	rcu_read_unlock();
 	*max_sectors = best_good_sectors;
 	return disk;
 }
 /*
  * Congestion query; 'data' is the mddev.  Returns non-zero when the array
  * itself, its queue of pending writes, or any non-faulty member device
  * reports congestion for 'bits'.
  */
 static int raid10_congested(void *data, int bits)
 {
 	struct mddev *mddev = data;
 	struct r10conf *conf = mddev->private;
 	int congested = 0;
 	int disk;
 	/* too many writes queued for the raid10 thread counts as async congestion */
 	if ((bits & (1 << BDI_async_congested)) &&
 	    conf->pending_count >= max_queued_requests)
 		return 1;
 	if (mddev_congested(mddev, bits))
 		return 1;
 	rcu_read_lock();
 	/* stop at the first member device that reports congestion */
 	for (disk = 0; disk < conf->raid_disks; disk++) {
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		struct request_queue *q;
 		if (!rdev || test_bit(Faulty, &rdev->flags))
 			continue;
 		q = bdev_get_queue(rdev->bdev);
 		congested |= bdi_congested(&q->backing_dev_info, bits);
 		if (congested)
 			break;
 	}
 	rcu_read_unlock();
 	return congested;
 }
 static void flush_pending_writes(struct r10conf *conf)
 {
 	/* Any writes that have been queued but are awaiting
 	 * bitmap updates get flushed here.
 	 */
 	spin_lock_irq(&conf->device_lock);
 	if (conf->pending_bio_list.head) {
 		struct bio *bio;
 		/* take the whole list and reset the count while still holding the lock */
 		bio = bio_list_get(&conf->pending_bio_list);
 		conf->pending_count = 0;
 		spin_unlock_irq(&conf->device_lock);
 		/* flush any pending bitmap writes to disk
 		 * before proceeding w/ I/O */
 		bitmap_unplug(conf->mddev->bitmap);
 		wake_up(&conf->wait_barrier);
 		while (bio) { /* submit pending writes */
 			/* detach each bio from the chain before submitting it */
 			struct bio *next = bio->bi_next;
 			bio->bi_next = NULL;
 			generic_make_request(bio);
 			bio = next;
 		}
 	} else
 		spin_unlock_irq(&conf->device_lock);
 }
 /* Barriers....
  * Sometimes we need to suspend IO while we do something else,
  * either some resync/recovery, or reconfigure the array.
  * To do this we raise a 'barrier'.
  * The 'barrier' is a counter that can be raised multiple times
  * to count how many activities are happening which preclude
  * normal IO.
  * We can only raise the barrier if there is no pending IO.
  * i.e. if nr_pending == 0.
  * We choose only to raise the barrier if no-one is waiting for the
  * barrier to go down.  This means that as soon as an IO request
  * is ready, no other operations which require a barrier will start
  * until the IO request has had a chance.
  *
  * So: regular IO calls 'wait_barrier'.  When that returns there
  *    is no background IO happening.  It must arrange to call
  *    allow_barrier when it has finished its IO.
  * background IO calls must call raise_barrier.  Once that returns
  *    there is no normal IO happening.  It must arrange to call
  *    lower_barrier when the particular background IO completes.
  */
 static void raise_barrier(struct r10conf *conf, int force)
 {
 	/* 'force' may only be used when a barrier is already raised */
 	BUG_ON(force && !conf->barrier);
 	spin_lock_irq(&conf->resync_lock);
 	/* Wait until no block IO is waiting (unless 'force') */
 	wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
 			    conf->resync_lock, );
 	/* block any new IO from starting */
 	conf->barrier++;
 	/* Now wait for all pending IO to complete */
 	/* ... and for the barrier count to be below RESYNC_DEPTH */
 	wait_event_lock_irq(conf->wait_barrier,
 			    !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
 			    conf->resync_lock, );
 	spin_unlock_irq(&conf->resync_lock);
 }
 /*
  * Drop one barrier count under resync_lock and wake everyone sleeping on
  * wait_barrier.  Uses the irqsave lock variant, unlike raise_barrier().
  */
 static void lower_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->barrier--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 /*
  * Entry point for regular IO: sleep while a barrier is raised (counting
  * ourselves in nr_waiting meanwhile), then count the request in nr_pending.
  */
 static void wait_barrier(struct r10conf *conf)
 {
 	spin_lock_irq(&conf->resync_lock);
 	if (conf->barrier) {
 		conf->nr_waiting++;
 		wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
 				    conf->resync_lock,
 				    );
 		conf->nr_waiting--;
 	}
 	conf->nr_pending++;
 	spin_unlock_irq(&conf->resync_lock);
 }
 /*
  * Counterpart of wait_barrier(): drop one nr_pending count under
  * resync_lock and wake everyone sleeping on wait_barrier.
  */
 static void allow_barrier(struct r10conf *conf)
 {
 	unsigned long flags;
 	spin_lock_irqsave(&conf->resync_lock, flags);
 	conf->nr_pending--;
 	spin_unlock_irqrestore(&conf->resync_lock, flags);
 	wake_up(&conf->wait_barrier);
 }
 static void freeze_array(struct r10conf *conf)
 {
 	/* stop syncio and normal IO and wait for everything to
 	 * go quiet.
 	 * We increment barrier and nr_waiting, and then
 	 * wait until nr_pending match nr_queued+1
 	 * This is called in the context of one normal IO request
 	 * that has failed. Thus any sync request that might be pending
 	 * will be blocked by nr_pending, and we need to wait for
 	 * pending IO requests to complete or be queued for re-try.
 	 * Thus the number queued (nr_queued) plus this request (1)
 	 * must match the number of pending IOs (nr_pending) before
 	 * we continue.
 	 */
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier++;
 	conf->nr_waiting++;
 	/* flush_pending_writes() is passed as the extra command of the wait macro */
 	wait_event_lock_irq(conf->wait_barrier,
 			    conf->nr_pending == conf->nr_queued+1,
 			    conf->resync_lock,
 			    flush_pending_writes(conf));
 	spin_unlock_irq(&conf->resync_lock);
 }
 /*
  * Undo freeze_array(): decrement barrier and nr_waiting again and wake the
  * waiters.  Here the wake_up() is issued while resync_lock is still held.
  */
 static void unfreeze_array(struct r10conf *conf)
 {
 	/* reverse the effect of the freeze */
 	spin_lock_irq(&conf->resync_lock);
 	conf->barrier--;
 	conf->nr_waiting--;
 	wake_up(&conf->wait_barrier);
 	spin_unlock_irq(&conf->resync_lock);
 }
 /*
  * make_request - handle one bio submitted to the RAID10 array.
  * @mddev: the md array the bio is addressed to
  * @bio:   the incoming (master) bio
  *
  * Flush requests are passed to md_flush_request().  A bio that crosses a
  * chunk boundary is split in two with bio_split() and each half is fed
  * back through make_request().  Otherwise an r10_bio is allocated to track
  * the request and:
  *   - READ:  read_balance() picks a device; if it can only serve part of
  *            the range, the request is carried out in several pieces, each
  *            with its own r10_bio.
  *   - WRITE: one clone per usable copy is queued on conf->pending_bio_list;
  *            known bad blocks are written around, again possibly using
  *            several r10_bios.
  * The number of outstanding r10_bios for a split request is kept in
  * bio->bi_phys_segments.
  */
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
 	struct r10conf *conf = mddev->private;
 	struct mirror_info *mirror;
 	struct r10bio *r10_bio;
 	struct bio *read_bio;
 	int i;
 	int chunk_sects = conf->chunk_mask + 1;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
 	unsigned long flags;
 	struct md_rdev *blocked_rdev;
 	int plugged;
 	int sectors_handled;
 	int max_sectors;
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
 		return;
 	}
 	/* If this request crosses a chunk boundary, we need to
 	 * split it.  This will only happen for 1 PAGE (or less) requests.
 	 */
 	if (unlikely( (bio->bi_sector & conf->chunk_mask) + (bio->bi_size >> 9)
 		      > chunk_sects &&
 		    conf->near_copies < conf->raid_disks)) {
 		struct bio_pair *bp;
 		/* Sanity check -- queue functions should prevent this happening */
 		if (bio->bi_vcnt != 1 ||
 		    bio->bi_idx != 0)
 			goto bad_map;
 		/* This is a one page bio that upper layers
 		 * refuse to split for us, so we need to split it.
 		 */
 		bp = bio_split(bio,
 			       chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
 		/* Each of these 'make_request' calls will call 'wait_barrier'.
 		 * If the first succeeds but the second blocks due to the resync
 		 * thread raising the barrier, we will deadlock because the
 		 * IO to the underlying device will be queued in generic_make_request
 		 * and will never complete, so will never reduce nr_pending.
 		 * So increment nr_waiting here so no new raise_barriers will
 		 * succeed, and so the second wait_barrier cannot block.
 		 */
 		spin_lock_irq(&conf->resync_lock);
 		conf->nr_waiting++;
 		spin_unlock_irq(&conf->resync_lock);
 		make_request(mddev, &bp->bio1);
 		make_request(mddev, &bp->bio2);
 		spin_lock_irq(&conf->resync_lock);
 		conf->nr_waiting--;
 		wake_up(&conf->wait_barrier);
 		spin_unlock_irq(&conf->resync_lock);
 		bio_pair_release(bp);
 		return;
 	bad_map:
 		printk("md/raid10:%s: make_request bug: can't convert block across chunks"
 		       " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
 		       (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
 		bio_io_error(bio);
 		return;
 	}
 	md_write_start(mddev, bio);
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
 	 */
 	wait_barrier(conf);
 	r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 	r10_bio->master_bio = bio;
 	r10_bio->sectors = bio->bi_size >> 9;
 	r10_bio->mddev = mddev;
 	r10_bio->sector = bio->bi_sector;
 	r10_bio->state = 0;
 	/* We might need to issue multiple reads to different
 	 * devices if there are bad blocks around, so we keep
 	 * track of the number of reads in bio->bi_phys_segments.
 	 * If this is 0, there is only one r10_bio and no locking
 	 * will be needed when the request completes.  If it is
 	 * non-zero, then it is the number of not-completed requests.
 	 */
 	bio->bi_phys_segments = 0;
 	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
 	if (rw == READ) {
 		/*
 		 * read balancing logic:
 		 */
 		int disk;
 		int slot;
 read_again:
 		disk = read_balance(conf, r10_bio, &max_sectors);
 		slot = r10_bio->read_slot;
 		if (disk < 0) {
 			raid_end_bio_io(r10_bio);
 			return;
 		}
 		mirror = conf->mirrors + disk;
 		read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
 			    max_sectors);
 		r10_bio->devs[slot].bio = read_bio;
 		read_bio->bi_sector = r10_bio->devs[slot].addr +
 			mirror->rdev->data_offset;
 		read_bio->bi_bdev = mirror->rdev->bdev;
 		read_bio->bi_end_io = raid10_end_read_request;
 		read_bio->bi_rw = READ | do_sync;
 		read_bio->bi_private = r10_bio;
 		if (max_sectors < r10_bio->sectors) {
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
 			/* Progress through the master bio is the end of this
 			 * piece (start sector + length) relative to the start
 			 * of the bio, exactly as in the write path below.
 			 */
 			sectors_handled = (r10_bio->sector + max_sectors
 					   - bio->bi_sector);
 			r10_bio->sectors = max_sectors;
 			spin_lock_irq(&conf->device_lock);
 			if (bio->bi_phys_segments == 0)
 				bio->bi_phys_segments = 2;
 			else
 				bio->bi_phys_segments++;
 			/* must re-enable interrupts: the lock was taken
 			 * with spin_lock_irq()
 			 */
 			spin_unlock_irq(&conf->device_lock);
 			/* Cannot call generic_make_request directly
 			 * as that will be queued in __generic_make_request
 			 * and subsequent mempool_alloc might block
 			 * waiting for it.  so hand bio over to raid10d.
 			 */
 			reschedule_retry(r10_bio);
 			r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 			r10_bio->master_bio = bio;
 			r10_bio->sectors = ((bio->bi_size >> 9)
 					    - sectors_handled);
 			r10_bio->state = 0;
 			r10_bio->mddev = mddev;
 			r10_bio->sector = bio->bi_sector + sectors_handled;
 			goto read_again;
 		} else
 			generic_make_request(read_bio);
 		return;
 	}
 	/*
 	 * WRITE:
 	 */
 	if (conf->pending_count >= max_queued_requests) {
 		md_wakeup_thread(mddev->thread);
 		wait_event(conf->wait_barrier,
 			   conf->pending_count < max_queued_requests);
 	}
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev.  Record them by setting
 	 * bios[x] to bio
 	 * If there are known/acknowledged bad blocks on any device
 	 * on which we have seen a write error, we want to avoid
 	 * writing to those blocks.  This potentially requires several
 	 * writes to write around the bad blocks.  Each set of writes
 	 * gets its own r10_bio with a set of bios attached.  The number
 	 * of r10_bios is recorded in bio->bi_phys_segments just as with
 	 * the read case.
 	 */
 	plugged = mddev_check_plugged(mddev);
 	r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
 	raid10_find_phys(conf, r10_bio);
 retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
 	max_sectors = r10_bio->sectors;
 	for (i = 0;  i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
 		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
 			atomic_inc(&rdev->nr_pending);
 			blocked_rdev = rdev;
 			break;
 		}
 		r10_bio->devs[i].bio = NULL;
 		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 			continue;
 		}
 		if (test_bit(WriteErrorSeen, &rdev->flags)) {
 			sector_t first_bad;
 			sector_t dev_sector = r10_bio->devs[i].addr;
 			int bad_sectors;
 			int is_bad;
 			is_bad = is_badblock(rdev, dev_sector,
 					     max_sectors,
 					     &first_bad, &bad_sectors);
 			if (is_bad < 0) {
 				/* Mustn't write here until the bad block
 				 * is acknowledged
 				 */
 				atomic_inc(&rdev->nr_pending);
 				set_bit(BlockedBadBlocks, &rdev->flags);
 				blocked_rdev = rdev;
 				break;
 			}
 			if (is_bad && first_bad <= dev_sector) {
 				/* Cannot write here at all */
 				bad_sectors -= (dev_sector - first_bad);
 				if (bad_sectors < max_sectors)
 					/* Mustn't write more than bad_sectors
 					 * to other devices yet
 					 */
 					max_sectors = bad_sectors;
 				/* We don't set R10BIO_Degraded as that
 				 * only applies if the disk is missing,
 				 * so it might be re-added, and we want to
 				 * know to recover this chunk.
 				 * In this case the device is here, and the
 				 * fact that this chunk is not in-sync is
 				 * recorded in the bad block log.
 				 */
 				continue;
 			}
 			if (is_bad) {
 				int good_sectors = first_bad - dev_sector;
 				if (good_sectors < max_sectors)
 					max_sectors = good_sectors;
 			}
 		}
 		r10_bio->devs[i].bio = bio;
 		atomic_inc(&rdev->nr_pending);
 	}
 	rcu_read_unlock();
 	if (unlikely(blocked_rdev)) {
 		/* Have to wait for this device to get unblocked, then retry */
 		int j;
 		int d;
 		/* drop the references taken on the slots selected so far */
 		for (j = 0; j < i; j++)
 			if (r10_bio->devs[j].bio) {
 				d = r10_bio->devs[j].devnum;
 				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 			}
 		allow_barrier(conf);
 		md_wait_for_blocked_rdev(blocked_rdev, mddev);
 		wait_barrier(conf);
 		goto retry_write;
 	}
 	if (max_sectors < r10_bio->sectors) {
 		/* We are splitting this into multiple parts, so
 		 * we need to prepare for allocating another r10_bio.
 		 */
 		r10_bio->sectors = max_sectors;
 		spin_lock_irq(&conf->device_lock);
 		if (bio->bi_phys_segments == 0)
 			bio->bi_phys_segments = 2;
 		else
 			bio->bi_phys_segments++;
 		spin_unlock_irq(&conf->device_lock);
 	}
 	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
 	/* bias 'remaining' by one so the r10_bio cannot complete while
 	 * clones are still being queued
 	 */
 	atomic_set(&r10_bio->remaining, 1);
 	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
 		int d = r10_bio->devs[i].devnum;
 		if (!r10_bio->devs[i].bio)
 			continue;
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
 			    max_sectors);
 		r10_bio->devs[i].bio = mbio;
 		mbio->bi_sector	= (r10_bio->devs[i].addr+
 				   conf->mirrors[d].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io	= raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
 		mbio->bi_private = r10_bio;
 		atomic_inc(&r10_bio->remaining);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		bio_list_add(&conf->pending_bio_list, mbio);
 		conf->pending_count++;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
 	/* Don't remove the bias on 'remaining' (one_write_done) until
 	 * after checking if we need to go around again.
 	 */
 	if (sectors_handled < (bio->bi_size >> 9)) {
 		one_write_done(r10_bio);
 		/* We need another r10_bio.  It has already been counted
 		 * in bio->bi_phys_segments.
 		 */
 		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
 		r10_bio->master_bio = bio;
 		r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
 		r10_bio->mddev = mddev;
 		r10_bio->sector = bio->bi_sector + sectors_handled;
 		r10_bio->state = 0;
 		goto retry_write;
 	}
 	one_write_done(r10_bio);
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 }
 static void status(struct seq_file *seq, struct mddev *mddev)
 {
 	struct r10conf *conf = mddev->private;
 	int i;
 	if (conf->near_copies < conf->raid_disks)
 		seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
 	if (conf->near_copies > 1)
 		seq_printf(seq, " %d near-copies", conf->near_copies);
 	if (conf->far_copies > 1) {
 		if (conf->far_offset)
 			seq_printf(seq, " %d offset-copies", conf->far_copies);
 		else
 			seq_printf(seq, " %d far-copies", conf->far_copies);
 	}
 	seq_printf(seq, " [%d/%d] [", conf->raid_disks,
 					conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf(seq, "%s",
 			      conf->mirrors[i].rdev &&
 			      test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
 	seq_printf(seq, "]");
 }
 /* check if there are enough drives for
  * every block to appear on at least one.
  * Don't consider the device numbered 'ignore'
  * as we might be about to remove it.
  */
 /*
  * enough - test whether every group of 'copies' consecutive slots still
  * has at least one device present.
  * @conf:   RAID10 private configuration
  * @ignore: slot number to treat as absent (-1 to ignore none)
  *
  * Walks the slots in groups of conf->copies, wrapping modulo
  * conf->raid_disks, until the walk returns to slot 0.
  * Returns 1 if every group has a device, 0 as soon as one group has none.
  */
 static int enough(struct r10conf *conf, int ignore)
 {
 	int pos = 0;
 	do {
 		int present = 0;
 		int k;
 		for (k = 0; k < conf->copies; k++) {
 			if (pos != ignore && conf->mirrors[pos].rdev)
 				present++;
 			pos = (pos + 1) % conf->raid_disks;
 		}
 		if (!present)
 			return 0;
 	} while (pos != 0);
 	return 1;
 }
 /*
  * error - mark a member device as failed.
  * @mddev: array the device belongs to
  * @rdev:  device that reported the error
  *
  * If the device is In_sync and the array would not have enough devices
  * without it, nothing is changed and the caller is left to report the
  * IO error.  Otherwise In_sync is cleared (bumping mddev->degraded and
  * interrupting any recovery), the device is flagged Blocked and Faulty,
  * MD_CHANGE_DEVS is set and the failure is logged.
  */
 static void error(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	char name[BDEVNAME_SIZE];
 	unsigned long irqflags;
 	/* Refuse to fail the last device holding some block. */
 	if (test_bit(In_sync, &rdev->flags) &&
 	    !enough(conf, rdev->raid_disk))
 		return;
 	if (test_and_clear_bit(In_sync, &rdev->flags)) {
 		spin_lock_irqsave(&conf->device_lock, irqflags);
 		mddev->degraded++;
 		spin_unlock_irqrestore(&conf->device_lock, irqflags);
 		/* a running recovery must abort */
 		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	}
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid10:%s: Operation continuing on %d devices.\n",
 	       mdname(mddev), bdevname(rdev->bdev, name),
 	       mdname(mddev), conf->raid_disks - mddev->degraded);
 }
 static void print_conf(struct r10conf *conf)
 {
 	int i;
 	struct mirror_info *tmp;
 	printk(KERN_DEBUG "RAID10 conf printout:\n");
 	if (!conf) {
 		printk(KERN_DEBUG "(!conf)\n");
 		return;
 	}
 	printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 		conf->raid_disks);
 	for (i = 0; i < conf->raid_disks; i++) {
 		char b[BDEVNAME_SIZE];
 		tmp = conf->mirrors + i;
 		if (tmp->rdev)
 			printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
 				i, !test_bit(In_sync, &tmp->rdev->flags),
 			        !test_bit(Faulty, &tmp->rdev->flags),
 				bdevname(tmp->rdev->bdev,b));
 	}
 }
 /*
  * close_sync - release the resync/recovery buffer pool.
  * @conf: RAID10 private configuration
  *
  * Passes once through wait_barrier()/allow_barrier() before destroying
  * conf->r10buf_pool and clearing the pointer.
  * NOTE(review): the barrier round-trip presumably ensures no resync
  * barrier is still raised before the pool goes away - confirm against
  * raise_barrier()/lower_barrier().
  */
 static void close_sync(struct r10conf *conf)
 {
 	wait_barrier(conf);
 	allow_barrier(conf);
 	mempool_destroy(conf->r10buf_pool);
 	/* leave no dangling pointer to the destroyed pool */
 	conf->r10buf_pool = NULL;
 }
 /*
  * raid10_spare_active - promote every present, non-Faulty device that is
  * not yet In_sync to In_sync.
  * @mddev: array to update
  *
  * Each promoted device gets a sysfs state notification.  mddev->degraded
  * is reduced by the number promoted under conf->device_lock.
  * Returns the number of devices promoted.
  */
 static int raid10_spare_active(struct mddev *mddev)
 {
 	struct r10conf *conf = mddev->private;
 	unsigned long irqflags;
 	int activated = 0;
 	int slot;
 	for (slot = 0; slot < conf->raid_disks; slot++) {
 		struct mirror_info *m = &conf->mirrors[slot];
 		if (!m->rdev)
 			continue;
 		if (test_bit(Faulty, &m->rdev->flags))
 			continue;
 		/* already in sync: nothing to promote */
 		if (test_and_set_bit(In_sync, &m->rdev->flags))
 			continue;
 		activated++;
 		sysfs_notify_dirent(m->rdev->sysfs_state);
 	}
 	spin_lock_irqsave(&conf->device_lock, irqflags);
 	mddev->degraded -= activated;
 	spin_unlock_irqrestore(&conf->device_lock, irqflags);
 	print_conf(conf);
 	return activated;
 }
 /*
  * raid10_add_disk - hot-add a device into a free slot of the array.
  * @mddev: array to add to
  * @rdev:  device being added
  *
  * Refuses with -EBUSY while a resync is outstanding (recovery_cp <
  * MaxSector) and with -EINVAL when the array does not have enough
  * devices.  If rdev->raid_disk is set only that slot is considered;
  * otherwise the search starts at rdev->saved_raid_disk when that slot is
  * valid and empty, else at slot 0.  Slots whose recovery_disabled matches
  * the array's, or which are already populated, are skipped.
  *
  * Returns 0 on success, -EEXIST if no slot could be used.
  */
 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = -EEXIST;
 	int mirror;
 	int first = 0;
 	int last = conf->raid_disks - 1;
 	if (mddev->recovery_cp < MaxSector)
 		/* only hot-add to in-sync arrays, as recovery is
 		 * very different from resync
 		 */
 		return -EBUSY;
 	if (!enough(conf, -1))
 		return -EINVAL;
 	if (rdev->raid_disk >= 0)
 		first = last = rdev->raid_disk;
 	/* saved_raid_disk comes from the device's recorded state; make
 	 * sure it is a valid slot before using it to index mirrors[].
 	 */
 	if (rdev->saved_raid_disk >= first &&
 	    rdev->saved_raid_disk < conf->raid_disks &&
 	    conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
 		mirror = rdev->saved_raid_disk;
 	else
 		mirror = first;
 	for ( ; mirror <= last ; mirror++) {
 		struct mirror_info *p = &conf->mirrors[mirror];
 		if (p->recovery_disabled == mddev->recovery_disabled)
 			continue;
 		if (p->rdev)
 			continue;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must
 		 * never risk violating it, so limit
 		 * ->max_segments to one lying with a single
 		 * page, as a one page request is never in
 		 * violation.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
 			blk_queue_max_segments(mddev->queue, 1);
 			blk_queue_segment_boundary(mddev->queue,
 						   PAGE_CACHE_SIZE - 1);
 		}
 		p->head_position = 0;
 		/* make sure this slot no longer compares equal to the
 		 * array's recovery_disabled value
 		 */
 		p->recovery_disabled = mddev->recovery_disabled - 1;
 		rdev->raid_disk = mirror;
 		err = 0;
 		/* device did not return to its previous slot */
 		if (rdev->saved_raid_disk != mirror)
 			conf->fullsync = 1;
 		rcu_assign_pointer(p->rdev, rdev);
 		break;
 	}
 	md_integrity_add_rdev(rdev, mddev);
 	print_conf(conf);
 	return err;
 }
 /*
  * raid10_remove_disk - detach a device from its slot.
  * @mddev: array to remove from
  * @rdev:  device to remove
  *
  * Returns 0 on success (or when @rdev is not the device in its slot),
  * -EBUSY when the device is In_sync, still has pending IO, or may not be
  * removed yet; otherwise the result of md_integrity_register().
  */
 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct r10conf *conf = mddev->private;
 	int err = 0;
 	int number = rdev->raid_disk;
 	struct mirror_info *p = conf->mirrors+ number;
 	print_conf(conf);
 	if (rdev == p->rdev) {
 		if (test_bit(In_sync, &rdev->flags) ||
 		    atomic_read(&rdev->nr_pending)) {
 			err = -EBUSY;
 			goto abort;
 		}
 		/* A device that is not Faulty may only be removed when
 		 * recovery onto this slot has been disabled or the array
 		 * no longer has enough devices; otherwise refuse.
 		 */
 		if (!test_bit(Faulty, &rdev->flags) &&
 		    mddev->recovery_disabled != p->recovery_disabled &&
 		    enough(conf, -1)) {
 			err = -EBUSY;
 			goto abort;
 		}
 		/* Unpublish the device, then wait for RCU readers that
 		 * might still have seen the old pointer.
 		 */
 		p->rdev = NULL;
 		synchronize_rcu();
 		if (atomic_read(&rdev->nr_pending)) {
 			/* lost the race, try later */
 			err = -EBUSY;
 			p->rdev = rdev;
 			goto abort;
 		}
 		err = md_integrity_register(mddev);
 	}
 abort:
 	print_conf(conf);
 	return err;
 }
 /*
  * end_sync_read - completion handler for a resync/recovery read bio.
  * @bio:   the completed read; bi_private points at the owning r10_bio
  * @error: completion status (unused; BIO_UPTODATE is tested instead)
  *
  * Records success in R10BIO_Uptodate, or on failure adds the request's
  * sector count to the device's corrected_errors counter.  Drops the
  * device reference and hands the r10_bio to raid10d when appropriate.
  */
 static void end_sync_read(struct bio *bio, int error)
 {
 	struct r10bio *r10_bio = bio->bi_private;
 	struct r10conf *conf = r10_bio->mddev->private;
 	int d;
 	/* only the device number is needed; both optional out-parameters
 	 * are passed as NULL
 	 */
 	d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
 	if (test_bit(BIO_UPTODATE, &bio->bi_flags))
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 	else
 		/* The write handler will notice the lack of
 		 * R10BIO_Uptodate and record any errors etc
 		 */
 		atomic_add(r10_bio->sectors,
 			   &conf->mirrors[d].rdev->corrected_errors);
 	/* for reconstruct, we always reschedule after a read.
 	 * for resync, only after all reads
 	 */
 	rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
 	if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
 	    atomic_dec_and_test(&r10_bio->remaining)) {
 		/* we have read all the blocks,
 		 * do the comparison in process context in raid10d
 		 */
 		reschedule_retry(r10_bio);
 	}
 }
 /*
  * end_sync_request - drop one reference on a sync/recovery r10_bio and,
  * each time a count reaches zero, finish that r10_bio and move on to the
  * one it points at through master_bio.
  * @r10_bio: the request whose 'remaining' count is to be dropped
  *
  * A finished r10_bio is handed to raid10d if R10BIO_MadeGood or
  * R10BIO_WriteError is set, otherwise its buffers are released.  When
  * the r10_bio without a master_bio finishes, md_done_sync() is called
  * with its sector count and the walk stops.
  */
 static void end_sync_request(struct r10bio *r10_bio)
 {
 	struct mddev *mddev = r10_bio->mddev;
 	while (atomic_dec_and_test(&r10_bio->remaining)) {
 		/* capture everything needed before r10_bio is given away */
 		struct r10bio *parent = (struct r10bio *)r10_bio->master_bio;
 		sector_t done = r10_bio->sectors;
 		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
 		    test_bit(R10BIO_WriteError, &r10_bio->state))
 			reschedule_retry(r10_bio);
 		else
 			put_buf(r10_bio);
 		if (parent == NULL) {
 			/* that was the primary of several recovery bios */
 			md_done_sync(mddev, done, 1);
 			break;
 		}
 		r10_bio = parent;
 	}
 }
 /*
  * end_sync_write - completion handler for a resync/recovery write bio.
  * @bio:   the completed write; bi_private points at the owning r10_bio
  * @error: completion status (unused; BIO_UPTODATE is tested instead)
  *
  * On failure flags the device WriteErrorSeen and the request
  * R10BIO_WriteError.  On success, if the written range overlaps a
  * recorded bad block, flags the request R10BIO_MadeGood.  Then drops the
  * device reference and one reference on the r10_bio.
  */
 static void end_sync_write(struct bio *bio, int error)
 {
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct r10bio *r10_bio = bio->bi_private;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	int d;
 	sector_t first_bad;
 	int bad_sectors;
 	int slot;
 	/* 'slot' receives the index into r10_bio->devs[]; the last
 	 * out-parameter is not needed and is passed as NULL
 	 */
 	d = find_bio_disk(conf, r10_bio, bio, &slot, NULL);
 	if (!uptodate) {
 		set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags);
 		set_bit(R10BIO_WriteError, &r10_bio->state);
 	} else if (is_badblock(conf->mirrors[d].rdev,
 			     r10_bio->devs[slot].addr,
 			     r10_bio->sectors,
 			     &first_bad, &bad_sectors))
 		set_bit(R10BIO_MadeGood, &r10_bio->state);
 	rdev_dec_pending(conf->mirrors[d].rdev, mddev);
 	end_sync_request(r10_bio);
 }
 /*
  * Note: sync and recover are handled very differently for raid10
  * This code is for resync.
  * For resync, we read through virtual addresses and read all blocks.
  * If there is any error, we schedule a write.  The lowest numbered
  * drive is authoritative.
  * However requests come for physical address, so we need to map.
  * For every physical address there are raid_disks/copies virtual addresses,
  * which is always at least one, but is not necessarily an integer.
  * This means that a physical address can span multiple chunks, so we may
  * have to submit multiple io requests for a single sync request.
  */
 /*
  * We check if all blocks are in-sync and only write to blocks that
  * aren't in sync
  */
 /*
  * sync_request_write - second half of a resync step: compare the copies
  * that were read and rewrite those that differ or could not be read.
  * @mddev:   array being resynced
  * @r10_bio: resync request whose read bios have all completed
  *
  * The first slot whose read bio is BIO_UPTODATE supplies the reference
  * data.  If no slot is up to date nothing is written.  The request is
  * completed (md_done_sync + put_buf) here only when no write was issued;
  * otherwise 'remaining' is dropped by the write completions.
  */
 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int i, first;
 	struct bio *tbio, *fbio;
 	/* bias of one, removed at 'done', so the request cannot complete
 	 * while writes are still being submitted
 	 */
 	atomic_set(&r10_bio->remaining, 1);
 	/* find the first device with a block */
 	for (i=0; i<conf->copies; i++)
 		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
 			break;
 	if (i == conf->copies)
 		goto done;
 	first = i;
 	fbio = r10_bio->devs[i].bio;
 	/* now find blocks with errors */
 	for (i=0 ; i < conf->copies ; i++) {
 		int  j, d;
 		int vcnt = r10_bio->sectors >> (PAGE_SHIFT-9);
 		tbio = r10_bio->devs[i].bio;
 		/* only slots that were actually read take part */
 		if (tbio->bi_end_io != end_sync_read)
 			continue;
 		if (i == first)
 			continue;
 		if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
 			/* We know that the bi_io_vec layout is the same for
 			 * both 'first' and 'i', so we just compare them.
 			 * All vec entries are PAGE_SIZE;
 			 */
 			for (j = 0; j < vcnt; j++)
 				if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
 					   page_address(tbio->bi_io_vec[j].bv_page),
 					   PAGE_SIZE))
 					break;
 			/* all pages identical: this copy is fine */
 			if (j == vcnt)
 				continue;
 			mddev->resync_mismatches += r10_bio->sectors;
 			if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
 				/* Don't fix anything. */
 				continue;
 		}
 		/* Ok, we need to write this bio, either to correct an
 		 * inconsistency or to correct an unreadable block.
 		 * First we need to fixup bv_offset, bv_len and
 		 * bi_vecs, as the read request might have corrupted these
 		 */
 		tbio->bi_vcnt = vcnt;
 		tbio->bi_size = r10_bio->sectors << 9;
 		tbio->bi_idx = 0;
 		tbio->bi_phys_segments = 0;
 		tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
 		tbio->bi_flags |= 1 << BIO_UPTODATE;
 		tbio->bi_next = NULL;
 		tbio->bi_rw = WRITE;
 		tbio->bi_private = r10_bio;
 		tbio->bi_sector = r10_bio->devs[i].addr;
 		/* copy the reference data into this slot's pages */
 		for (j=0; j < vcnt ; j++) {
 			tbio->bi_io_vec[j].bv_offset = 0;
 			tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
 			memcpy(page_address(tbio->bi_io_vec[j].bv_page),
 			       page_address(fbio->bi_io_vec[j].bv_page),
 			       PAGE_SIZE);
 		}
 		tbio->bi_end_io = end_sync_write;
 		d = r10_bio->devs[i].devnum;
 		atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 		atomic_inc(&r10_bio->remaining);
 		md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
 		tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
 		tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		generic_make_request(tbio);
 	}
 done:
 	if (atomic_dec_and_test(&r10_bio->remaining)) {
 		md_done_sync(mddev, r10_bio->sectors, 1);
 		put_buf(r10_bio);
 	}
 }
 /*
  * Now for the recovery code.
  * Recovery happens across physical sectors.
  * We recover all non-in_sync drives by finding the virtual address of
  * each, and then choose a working drive that also has that virt address.
  * There is a separate r10_bio for each non-in_sync drive.
  * Only the first two slots are in use. The first for reading,
  * The second for writing.
  *
  */
 /*
  * fix_recovery_read_error - salvage what we can after a failed
  * recovery read.
  * @r10_bio: recovery request; devs[0] is the source, devs[1] the target
  *
  * The range is retried in pieces of at most one page using the pages of
  * the source bio.  Pieces that read are written to the target device;
  * pieces that cannot be read or written are recorded as bad blocks.  If
  * a bad block cannot be recorded on the target, recovery onto that slot
  * is disabled and MD_RECOVERY_INTR is set.
  */
 static void fix_recovery_read_error(struct r10bio *r10_bio)
 {
 	/* We got a read error during recovery.
 	 * We repeat the read in smaller page-sized sections.
 	 * If a read succeeds, write it to the new device or record
 	 * a bad block if we cannot.
 	 * If a read fails, record a bad block on both old and
 	 * new devices.
 	 */
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	struct bio *bio = r10_bio->devs[0].bio;
 	sector_t sect = 0;
 	int sectors = r10_bio->sectors;
 	int idx = 0;
 	int dr = r10_bio->devs[0].devnum;
 	int dw = r10_bio->devs[1].devnum;
 	while (sectors) {
 		int s = sectors;
 		struct md_rdev *rdev;
 		sector_t addr;
 		int ok;
 		if (s > (PAGE_SIZE>>9))
 			s = PAGE_SIZE >> 9;
 		rdev = conf->mirrors[dr].rdev;
 		addr = r10_bio->devs[0].addr + sect;
 		ok = sync_page_io(rdev,
 				  addr,
 				  s << 9,
 				  bio->bi_io_vec[idx].bv_page,
 				  READ, false);
 		if (ok) {
 			/* from here on rdev/addr refer to the target */
 			rdev = conf->mirrors[dw].rdev;
 			addr = r10_bio->devs[1].addr + sect;
 			ok = sync_page_io(rdev,
 					  addr,
 					  s << 9,
 					  bio->bi_io_vec[idx].bv_page,
 					  WRITE, false);
 			if (!ok)
 				set_bit(WriteErrorSeen, &rdev->flags);
 		}
 		if (!ok) {
 			/* We don't worry if we cannot set a bad block -
 			 * it really is bad so there is no loss in not
 			 * recording it yet
 			 */
 			rdev_set_badblocks(rdev, addr, s, 0);
 			if (rdev != conf->mirrors[dw].rdev) {
 				/* need bad block on destination too */
 				struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
 				addr = r10_bio->devs[1].addr + sect;
 				ok = rdev_set_badblocks(rdev2, addr, s, 0);
 				if (!ok) {
 					/* just abort the recovery */
 					printk(KERN_NOTICE
 					       "md/raid10:%s: recovery aborted"
 					       " due to read error\n",
 					       mdname(mddev));
 					conf->mirrors[dw].recovery_disabled
 						= mddev->recovery_disabled;
 					set_bit(MD_RECOVERY_INTR,
 						&mddev->recovery);
 					break;
 				}
 			}
 		}
 		sectors -= s;
 		sect += s;
 		idx++;
 	}
 }
 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	struct r10conf *conf = mddev->private;
 	int d;
 	struct bio *wbio;
 	if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
 		fix_recovery_read_error(r10_bio);
 		end_sync_request(r10_bio);
 		return;
 	}
 	/*
 	 * share the pages with the first bio
 	 * and submit the write request
 	 */
 	wbio = r10_bio->devs[1].bio;
 	d = r10_bio->devs[1].devnum;
 	atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 	md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
 	generic_make_request(wbio);
 }
 /*
  * Used by fix_read_error() to decay the per rdev read_errors.
  * We halve the read error count for every hour that has elapsed
  * since the last recorded read error.
  *
  */
 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
 {
 	struct timespec cur_time_mon;
 	unsigned long hours_since_last;
 	unsigned int read_errors = atomic_read(&rdev->read_errors);
 	ktime_get_ts(&cur_time_mon);
 	if (rdev->last_read_error.tv_sec == 0 &&
 	    rdev->last_read_error.tv_nsec == 0) {
 		/* first time we've seen a read error */
 		rdev->last_read_error = cur_time_mon;
 		return;
 	}
 	hours_since_last = (cur_time_mon.tv_sec -
 			    rdev->last_read_error.tv_sec) / 3600;
 	rdev->last_read_error = cur_time_mon;
 	/*
 	 * if hours_since_last is > the number of bits in read_errors
 	 * just set read errors to 0. We do this to avoid
 	 * overflowing the shift of read_errors by hours_since_last.
 	 */
 	if (hours_since_last >= 8 * sizeof(read_errors))
 		atomic_set(&rdev->read_errors, 0);
 	else
 		atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
 }
 /*
  * r10_sync_page_io - synchronous single-page IO that respects and
  * maintains the bad block log.
  * @rdev:    device to access
  * @sector:  start sector on the device
  * @sectors: length in sectors (converted to bytes here)
  * @page:    data page
  * @rw:      READ or WRITE
  *
  * Returns -1 without doing IO when the range overlaps a known bad block
  * and this is a read or the device has seen a write error; 1 when the
  * IO succeeded; 0 when it failed.  On failure the range is recorded as
  * bad, and the device is failed if that cannot be recorded.
  */
 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
 			    int sectors, struct page *page, int rw)
 {
 	sector_t bad_start;
 	int bad_len;
 	int known_bad = is_badblock(rdev, sector, sectors,
 				    &bad_start, &bad_len);
 	if (known_bad &&
 	    (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
 		return -1;
 	if (!sync_page_io(rdev, sector, sectors << 9, page, rw, false)) {
 		if (rw == WRITE)
 			set_bit(WriteErrorSeen, &rdev->flags);
 		/* record the error against the block, else the device */
 		if (!rdev_set_badblocks(rdev, sector, sectors, 0))
 			md_error(rdev->mddev, rdev);
 		return 0;
 	}
 	return 1;
 }
 /*
  * This is a kernel thread which:
  *
  *	1.	Retries failed read operations on working mirrors.
  *	2.	Updates the raid superblock when problems are encountered.
  *	3.	Performs writes following reads for array synchronising.
  */
 /*
  * fix_read_error - try to repair a read error on the device in
  * r10_bio->read_slot using the other copies.
  * @conf:    RAID10 private configuration
  * @mddev:   the array
  * @r10_bio: the request whose read failed
  *
  * Does nothing if the device is already Faulty.  The device's read
  * error count is decayed and incremented; if it exceeds
  * mddev->max_corr_read_errors the device is failed.  Otherwise the
  * range is processed in pieces of at most one page: the piece is read
  * into conf->tmppage from some In_sync copy without a known bad block,
  * written back to the copies between that one and the failed one, and
  * then re-read from them to confirm the correction.  If no copy can be
  * read, a bad block is recorded on the failed device (or the device is
  * failed when that is not possible) and processing stops.
  */
 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int sect = 0; /* Offset from r10_bio->sector */
 	int sectors = r10_bio->sectors;
 	struct md_rdev*rdev;
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 	/* still own a reference to this rdev, so it cannot
 	 * have been cleared recently.
 	 */
 	rdev = conf->mirrors[d].rdev;
 	if (test_bit(Faulty, &rdev->flags))
 		/* drive has already been failed, just ignore any
 		   more fix_read_error() attempts */
 		return;
 	check_decay_read_errors(mddev, rdev);
 	atomic_inc(&rdev->read_errors);
 	if (atomic_read(&rdev->read_errors) > max_read_errors) {
 		char b[BDEVNAME_SIZE];
 		bdevname(rdev->bdev, b);
 		printk(KERN_NOTICE
 		       "md/raid10:%s: %s: Raid device exceeded "
 		       "read_error threshold [cur %d:max %d]\n",
 		       mdname(mddev), b,
 		       atomic_read(&rdev->read_errors), max_read_errors);
 		printk(KERN_NOTICE
 		       "md/raid10:%s: %s: Failing raid device\n",
 		       mdname(mddev), b);
 		md_error(mddev, conf->mirrors[d].rdev);
 		return;
 	}
 	while(sectors) {
 		int s = sectors;
 		int sl = r10_bio->read_slot;
 		int success = 0;
 		int start;
 		if (s > (PAGE_SIZE>>9))
 			s = PAGE_SIZE >> 9;
 		/* Step 1: find a copy we can read this piece from,
 		 * starting with the slot that failed.
 		 */
 		rcu_read_lock();
 		do {
 			sector_t first_bad;
 			int bad_sectors;
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (rdev &&
 			    test_bit(In_sync, &rdev->flags) &&
 			    is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
 					&first_bad, &bad_sectors) == 0) {
 				/* pin the device so the RCU lock can be
 				 * dropped across the blocking IO
 				 */
 				atomic_inc(&rdev->nr_pending);
 				rcu_read_unlock();
 				success = sync_page_io(rdev,
 						       r10_bio->devs[sl].addr +
 						       sect,
 						       s<<9,
 						       conf->tmppage, READ, false);
 				rdev_dec_pending(rdev, mddev);
 				rcu_read_lock();
 				if (success)
 					break;
 			}
 			sl++;
 			if (sl == conf->copies)
 				sl = 0;
 		} while (!success && sl != r10_bio->read_slot);
 		rcu_read_unlock();
 		if (!success) {
 			/* Cannot read from anywhere, just mark the block
 			 * as bad on the first device to discourage future
 			 * reads.
 			 */
 			int dn = r10_bio->devs[r10_bio->read_slot].devnum;
 			rdev = conf->mirrors[dn].rdev;
 			if (!rdev_set_badblocks(
 				    rdev,
 				    r10_bio->devs[r10_bio->read_slot].addr
 				    + sect,
 				    s, 0))
 				md_error(mddev, rdev);
 			break;
 		}
 		start = sl;
 		/* write it back and re-read */
 		rcu_read_lock();
 		while (sl != r10_bio->read_slot) {
 			char b[BDEVNAME_SIZE];
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			/* r10_sync_page_io() takes a sector count and
 			 * converts to bytes itself, so pass 's', not s<<9.
 			 */
 			if (r10_sync_page_io(rdev,
 					     r10_bio->devs[sl].addr +
 					     sect,
 					     s, conf->tmppage, WRITE)
 			    == 0) {
 				/* Well, this device is dead */
 				printk(KERN_NOTICE
 				       "md/raid10:%s: read correction "
 				       "write failed"
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
 					       sect + rdev->data_offset),
 				       bdevname(rdev->bdev, b));
 				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 				       "drive\n",
 				       mdname(mddev),
 				       bdevname(rdev->bdev, b));
 			}
 			rdev_dec_pending(rdev, mddev);
 			rcu_read_lock();
 		}
 		/* Second pass over the same slots: read back what was
 		 * just written to confirm the correction.
 		 */
 		sl = start;
 		while (sl != r10_bio->read_slot) {
 			char b[BDEVNAME_SIZE];
 			if (sl==0)
 				sl = conf->copies;
 			sl--;
 			d = r10_bio->devs[sl].devnum;
 			rdev = rcu_dereference(conf->mirrors[d].rdev);
 			if (!rdev ||
 			    !test_bit(In_sync, &rdev->flags))
 				continue;
 			atomic_inc(&rdev->nr_pending);
 			rcu_read_unlock();
 			/* sector count here too, see above */
 			switch (r10_sync_page_io(rdev,
 					     r10_bio->devs[sl].addr +
 					     sect,
 					     s, conf->tmppage,
 						 READ)) {
 			case 0:
 				/* Well, this device is dead */
 				printk(KERN_NOTICE
 				       "md/raid10:%s: unable to read back "
 				       "corrected sectors"
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
 					       sect + rdev->data_offset),
 				       bdevname(rdev->bdev, b));
 				printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 				       "drive\n",
 				       mdname(mddev),
 				       bdevname(rdev->bdev, b));
 				break;
 			case 1:
 				printk(KERN_INFO
 				       "md/raid10:%s: read error corrected"
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
 				       (unsigned long long)(
 					       sect + rdev->data_offset),
 				       bdevname(rdev->bdev, b));
 				atomic_add(s, &rdev->corrected_errors);
 			}
 			rdev_dec_pending(rdev, mddev);
 			rcu_read_lock();
 		}
 		rcu_read_unlock();
 		sectors -= s;
 		sect += s;
 	}
 }
 /*
  * bi_complete - bio end_io callback that signals the completion stored
  * in bio->bi_private.  The @error argument is not used.
  */
 static void bi_complete(struct bio *bio, int error)
 {
 	struct completion *done = (struct completion *)bio->bi_private;
 	complete(done);
 }
 static int submit_bio_wait(int rw, struct bio *bio)
 {
 	struct completion event;
 	rw |= REQ_SYNC;
 	init_completion(&event);
 	bio->bi_private = &event;
 	bio->bi_end_io = bi_complete;
 	submit_bio(rw, bio);
 	wait_for_completion(&event);
 	return test_bit(BIO_UPTODATE, &bio->bi_flags);
 }
 static int narrow_write_error(struct r10bio *r10_bio, int i)
 {
 	struct bio *bio = r10_bio->master_bio;
 	struct mddev *mddev = r10_bio->mddev;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
 	/* bio has the data to be written to slot 'i' where
 	 * we just recently had a write error.
 	 * We repeatedly clone the bio and trim down to one block,
 	 * then try the write.  Where the write fails we record
 	 * a bad block.
 	 * It is conceivable that the bio doesn't exactly align with
 	 * blocks.  We must handle this.
 	 *
 	 * We currently own a reference to the rdev.
 	 */
 	int block_sectors;
 	sector_t sector;
 	int sectors;
 	int sect_to_write = r10_bio->sectors;
 	int ok = 1;
 	if (rdev->badblocks.shift < 0)
 		return 0;
 	block_sectors = 1 << rdev->badblocks.shift;
 	sector = r10_bio->sector;
 	sectors = ((r10_bio->sector + block_sectors)
 		   & ~(sector_t)(block_sectors - 1))
 		- sector;
 	while (sect_to_write) {
 		struct bio *wbio;
 		if (sectors > sect_to_write)
 			sectors = sect_to_write;
 		/* Write at 'sector' for 'sectors' */
 		wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
 		md_trim_bio(wbio, sector - bio->bi_sector, sectors);
 		wbio->bi_sector = (r10_bio->devs[i].addr+
 				   rdev->data_offset+
 				   (sector - r10_bio->sector));
 		wbio->bi_bdev = rdev->bdev;
 		if (submit_bio_wait(WRITE, wbio) == 0)
 			/* Failure! */
 			ok = rdev_set_badblocks(rdev, sector,
 						sectors, 0)
 				&& ok;
 		bio_put(wbio);
 		sect_to_write -= sectors;
 		sector += sectors;
 		sectors = block_sectors;
 	}
 	return ok;
 }
 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
 {
 	int slot = r10_bio->read_slot;
 	int mirror = r10_bio->devs[slot].devnum;
 	struct bio *bio;
 	struct r10conf *conf = mddev->private;
 	struct md_rdev *rdev;
 	char b[BDEVNAME_SIZE];
 	unsigned long do_sync;
 	int max_sectors;
 	/* we got a read error. Maybe the drive is bad.  Maybe just
 	 * the block and we can fix it.
 	 * We freeze all other IO, and try reading the block from
 	 * other devices.  When we find one, we re-write
 	 * and check it that fixes the read error.
 	 * This is all done synchronously while the array is
 	 * frozen.
 	 */
 	if (mddev->ro == 0) {
 		freeze_array(conf);
 		fix_read_error(conf, mddev, r10_bio);
 		unfreeze_array(conf);
 	}
 	rdev_dec_pending(conf->mirrors[mirror].rdev, mddev);
 	bio = r10_bio->devs[slot].bio;
 	bdevname(bio->bi_bdev, b);
 	r10_bio->devs[slot].bio =
 		mddev->ro ? IO_BLOCKED : NULL;
 read_more:
 	mirror = read_balance(conf, r10_bio, &max_sectors);
 	if (mirror == -1) {
 		printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
 		       " read error for block %llu\n",
 		       mdname(mddev), b,
 		       (unsigned long long)r10_bio->sector);
 		raid_end_bio_io(r10_bio);
 		bio_put(bio);
 		return;
 	}
 	do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
 	if (bio)
 		bio_put(bio);
 	slot = r10_bio->read_slot;
 	rdev = conf->mirrors[mirror].rdev;
 	printk_ratelimited(
 		KERN_ERR
 		"md/raid10:%s: %s: redirecting"
 		"sector %llu to another mirror\n",
 		mdname(mddev),
 		bdevname(rdev->bdev, b),
 		(unsigned long long)r10_bio->sector);
 	bio = bio_clone_mddev(r10_bio->master_bio,
 			      GFP_NOIO, mddev);
 	md_trim_bio(bio,
 		    r10_bio->sector - bio->bi_sector,
 		    max_sectors);
 	r10_bio->devs[slot].bio = bio;
 	bio->bi_sector = r10_bio->devs[slot].addr
 		+ rdev->data_offset;
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_rw = READ | do_sync;
 	bio->bi_private = r10_bio;
 	bio->bi_end_io = raid10_end_read_request;
 	if (max_sectors < r10_bio->sectors) {
 		/* Drat - have to split this up more */
 		struct bio *mbio = r10_bio->master_bio;
 		int sectors_handled =
 			r10_bio->sector + max_sectors
 			- mbio->bi_sector;
 		r10_bio->sectors = max_sectors;
 		spin_lock_irq(&conf->device_lock);
 		if (mbio->bi_phys_segments == 0)
 			mbio->bi_phys_segments = 2;
 		else
 			mbio->bi_phys_segments++;
 		spin_unlock_irq(&conf->device_lock);
 		generic_make_request(bio);
 		bio = NULL;
 		r10_bio = mempool_alloc(conf->r10bio_pool,
 					GFP_NOIO);
 		r10_bio->master_bio = mbio;
 		r10_bio->sectors = (mbio->bi_size >> 9)
 			- sectors_handled;
 		r10_bio->state = 0;
 		set_bit(R10BIO_ReadError,
 			&r10_bio->state);
 		r10_bio->mddev = mddev;
 		r10_bio->sector = mbio->bi_sector
 			+ sectors_handled;
 		goto read_more;
 	} else
 		generic_make_request(bio);
 }
 /*
  * handle_write_completed - bad-block bookkeeping after a write finished.
  *
  * Either a write succeeded over a range that was believed bad, in which
  * case the bad-block record is dropped, or a write failed and a bad
  * block has to be recorded.
  *
  * For resync/recovery r10_bios every slot with a bio is inspected and
  * the buffer is released with put_buf().  For ordinary writes, slots
  * marked IO_MADE_GOOD have their bad blocks cleared, failed slots go
  * through narrow_write_error(), and the request is finished with
  * raid_end_bio_io().
  */
 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 {
 	int slot;
 	if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
 	    test_bit(R10BIO_IsRecover, &r10_bio->state)) {
 		for (slot = 0; slot < conf->copies; slot++) {
 			struct bio *bio = r10_bio->devs[slot].bio;
 			struct md_rdev *rdev =
 				conf->mirrors[r10_bio->devs[slot].devnum].rdev;
 			if (bio == NULL)
 				continue;
 			if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
 				/* write failed: record it, or fail the device */
 				if (!rdev_set_badblocks(rdev,
 							r10_bio->devs[slot].addr,
 							r10_bio->sectors, 0))
 					md_error(conf->mddev, rdev);
 				continue;
 			}
 			rdev_clear_badblocks(rdev,
 					     r10_bio->devs[slot].addr,
 					     r10_bio->sectors);
 		}
 		put_buf(r10_bio);
 		return;
 	}
 	for (slot = 0; slot < conf->copies; slot++) {
 		struct bio *bio = r10_bio->devs[slot].bio;
 		struct md_rdev *rdev =
 			conf->mirrors[r10_bio->devs[slot].devnum].rdev;
 		if (bio == IO_MADE_GOOD) {
 			rdev_clear_badblocks(rdev,
 					     r10_bio->devs[slot].addr,
 					     r10_bio->sectors);
 			rdev_dec_pending(rdev, conf->mddev);
 			continue;
 		}
 		if (bio == NULL || test_bit(BIO_UPTODATE, &bio->bi_flags))
 			continue;
 		/* failed write: try to narrow it down to bad blocks */
 		if (!narrow_write_error(r10_bio, slot)) {
 			md_error(conf->mddev, rdev);
 			set_bit(R10BIO_Degraded, &r10_bio->state);
 		}
 		rdev_dec_pending(rdev, conf->mddev);
 	}
 	if (test_bit(R10BIO_WriteError, &r10_bio->state))
 		close_write(r10_bio);
 	raid_end_bio_io(r10_bio);
 }
 static void raid10d(struct mddev *mddev)
 {
 	struct r10bio *r10_bio;
 	unsigned long flags;
 	struct r10conf *conf = mddev->private;
 	struct list_head *head = &conf->retry_list;
 	struct blk_plug plug;
 	md_check_recovery(mddev);
 	blk_start_plug(&plug);
 	for (;;) {
 		flush_pending_writes(conf);
 		spin_lock_irqsave(&conf->device_lock, flags);
 		if (list_empty(head)) {
 			spin_unlock_irqrestore(&conf->device_lock, flags);
 			break;
 		}
 		r10_bio = list_entry(head->prev, struct r10bio, retry_list);
 		list_del(head->prev);
 		conf->nr_queued--;
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 		mddev = r10_bio->mddev;
 		conf = mddev->private;
 		if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
 		    test_bit(R10BIO_WriteError, &r10_bio->state))
 			handle_write_completed(conf, r10_bio);
 		else if (test_bit(R10BIO_IsSync, &r10_bio->state))
 			sync_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
 			recovery_request_write(mddev, r10_bio);
 		else if (test_bit(R10BIO_ReadError, &r10_bio->state))
 			handle_read_error(mddev, r10_bio);
 		else {
 			/* just a partial read to be scheduled from a
 			 * separate context
 			 */
 			int slot = r10_bio->read_slot;
 			generic_make_request(r10_bio->devs[slot].bio);
 		}
 		cond_resched();
 		if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
 			md_check_recovery(mddev);
 	}
 	blk_finish_plug(&plug);
 }
 /*
  * init_resync - prepare 'conf' for a resync/recovery pass.
  *
  * Creates the r10buf mempool (it must not exist yet), records whether
  * any mirror currently has a replacement device, and resets
  * next_resync to 0.
  * Returns 0 on success, -ENOMEM if the pool cannot be created.
  */
 static int init_resync(struct r10conf *conf)
 {
 	int buffs;
+	int i;
 	/* number of buffers needed to cover one resync window */
 	buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
 	BUG_ON(conf->r10buf_pool);
 	/* have_replacement is set if at least one mirror has a replacement */
+	conf->have_replacement = 0;
+	for (i = 0; i < conf->raid_disks; i++)
+		if (conf->mirrors[i].replacement)
+			conf->have_replacement = 1;
 	conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
 	if (!conf->r10buf_pool)
 		return -ENOMEM;
 	conf->next_resync = 0;
 	return 0;
 }
 /*
  * perform a "sync" on one "block"
  *
  * We need to make sure that no normal I/O request - particularly write
  * requests - conflict with active sync requests.
  *
  * This is achieved by tracking pending requests and a 'barrier' concept
  * that can be installed to exclude normal IO requests.
  *
  * Resync and recovery are handled very differently.
  * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
  *
  * For resync, we iterate over virtual addresses, read all copies,
  * and update if there are differences.  If only one copy is live,
  * skip it.
  * For recovery, we iterate over physical addresses, read a good
  * value for each non-in_sync drive, and over-write.
  *
  * So, for recovery we may have several outstanding complex requests for a
  * given address, one for each out-of-sync device.  We model this by allocating
  * a number of r10_bio structures, one for each out-of-sync device.
  * As we setup these structures, we collect all bio's together into a list
  * which we then process collectively to add pages, and then process again
  * to pass to generic_make_request.
  *
  * The r10_bio structures are linked using a borrowed master_bio pointer.
  * This link is counted in ->remaining.  When the r10_bio that points to NULL
  * has its remaining count decremented to 0, the whole complex operation
  * is complete.
  *
  */
 /*
  * sync_request - schedule resync or recovery I/O starting at 'sector_nr'.
  *
  * @mddev:	the array being synced
  * @sector_nr:	where to start (virtual address for resync, per-device
  *		address for recovery)
  * @skipped:	set to 1 when the call returns without submitting I/O
  * @go_faster:	when zero and normal I/O is waiting (conf->nr_waiting),
  *		sleep up to one second before issuing more sync I/O
  *
  * Returns the number of sectors dealt with by this call, including any
  * that were skipped.  Returns 0 if the resync buffers cannot be set up.
  */
 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
 			     int *skipped, int go_faster)
 {
 	struct r10conf *conf = mddev->private;
 	struct r10bio *r10_bio;
 	struct bio *biolist = NULL, *bio;
 	sector_t max_sector, nr_sectors;
 	int i;
 	int max_sync;
 	sector_t sync_blocks;
 	sector_t sectors_skipped = 0;
 	int chunks_skipped = 0;
 	if (!conf->r10buf_pool)
 		if (init_resync(conf))
 			return 0;
  skipped:
 	max_sector = mddev->dev_sectors;
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 		max_sector = mddev->resync_max_sectors;
 	if (sector_nr >= max_sector) {
 		/* If we aborted, we need to abort the
 		 * sync on the 'current' bitmap chunks (there can
 		 * be several when recovering multiple devices).
 		 * as we may have started syncing it but not finished.
 		 * We can find the current address in
 		 * mddev->curr_resync, but for recovery,
 		 * we need to convert that to several
 		 * virtual addresses.
 		 */
 		if (mddev->curr_resync < max_sector) { /* aborted */
 			if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
 				bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
 						&sync_blocks, 1);
 			else for (i=0; i<conf->raid_disks; i++) {
 				sector_t sect =
 					raid10_find_virt(conf, mddev->curr_resync, i);
 				bitmap_end_sync(mddev->bitmap, sect,
 						&sync_blocks, 1);
 			}
 		} else /* completed sync */
 			conf->fullsync = 0;
 		bitmap_close_sync(mddev->bitmap);
 		close_sync(conf);
 		*skipped = 1;
 		return sectors_skipped;
 	}
 	if (chunks_skipped >= conf->raid_disks) {
 		/* if there has been nothing to do on any drive,
 		 * then there is nothing to do at all..
 		 */
 		*skipped = 1;
 		return (max_sector - sector_nr) + sectors_skipped;
 	}
 	if (max_sector > mddev->resync_max)
 		max_sector = mddev->resync_max; /* Don't do IO beyond here */
 	/* make sure whole request will fit in a chunk - if chunks
 	 * are meaningful
 	 */
 	if (conf->near_copies < conf->raid_disks &&
 	    max_sector > (sector_nr | conf->chunk_mask))
 		max_sector = (sector_nr | conf->chunk_mask) + 1;
 	/*
 	 * If there is non-resync activity waiting for us then
 	 * put in a delay to throttle resync.
 	 */
 	if (!go_faster && conf->nr_waiting)
 		msleep_interruptible(1000);
 	/* Again, very different code for resync and recovery.
 	 * Both must result in an r10bio with a list of bios that
 	 * have bi_end_io, bi_sector, bi_bdev set,
 	 * and bi_private set to the r10bio.
 	 * For recovery, we may actually create several r10bios
 	 * with 2 bios in each, that correspond to the bios in the main one.
 	 * In this case, the subordinate r10bios link back through a
 	 * borrowed master_bio pointer, and the counter in the master
 	 * includes a ref from each subordinate.
 	 */
 	/* First, we decide what to do and set ->bi_end_io
 	 * To end_sync_read if we want to read, and
 	 * end_sync_write if we will want to write.
 	 */
 	max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
 	if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
 		/* recovery... the complicated one */
 		int j;
 		r10_bio = NULL;
 		for (i=0 ; i<conf->raid_disks; i++) {
 			int still_degraded;
 			struct r10bio *rb2;
 			sector_t sect;
 			int must_sync;
 			int any_working;
 			if (conf->mirrors[i].rdev == NULL ||
 			    test_bit(In_sync, &conf->mirrors[i].rdev->flags))
 				continue;
 			still_degraded = 0;
 			/* want to reconstruct this device */
 			rb2 = r10_bio;
 			sect = raid10_find_virt(conf, sector_nr, i);
 			/* Unless we are doing a full sync, we only need
 			 * to recover the block if it is set in the bitmap
 			 */
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, 1);
 			if (sync_blocks < max_sync)
 				max_sync = sync_blocks;
 			if (!must_sync &&
 			    !conf->fullsync) {
 				/* yep, skip the sync_blocks here, but don't assume
 				 * that there will never be anything to do here
 				 */
 				chunks_skipped = -1;
 				continue;
 			}
 			r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 			raise_barrier(conf, rb2 != NULL);
 			atomic_set(&r10_bio->remaining, 0);
 			r10_bio->master_bio = (struct bio*)rb2;
 			if (rb2)
 				atomic_inc(&rb2->remaining);
 			r10_bio->mddev = mddev;
 			set_bit(R10BIO_IsRecover, &r10_bio->state);
 			r10_bio->sector = sect;
 			raid10_find_phys(conf, r10_bio);
 			/* Need to check if the array will still be
 			 * degraded
 			 */
 			for (j=0; j<conf->raid_disks; j++)
 				if (conf->mirrors[j].rdev == NULL ||
 				    test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
 					still_degraded = 1;
 					break;
 				}
 			must_sync = bitmap_start_sync(mddev->bitmap, sect,
 						      &sync_blocks, still_degraded);
 			any_working = 0;
 			for (j=0; j<conf->copies;j++) {
 				int k;
 				int d = r10_bio->devs[j].devnum;
 				sector_t from_addr, to_addr;
 				struct md_rdev *rdev;
 				sector_t sector, first_bad;
 				int bad_sectors;
 				if (!conf->mirrors[d].rdev ||
 				    !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
 					continue;
 				/* This is where we read from */
 				any_working = 1;
 				rdev = conf->mirrors[d].rdev;
 				sector = r10_bio->devs[j].addr;
 				if (is_badblock(rdev, sector, max_sync,
 						&first_bad, &bad_sectors)) {
 					if (first_bad > sector)
 						max_sync = first_bad - sector;
 					else {
 						bad_sectors -= (sector
 								- first_bad);
 						if (max_sync > bad_sectors)
 							max_sync = bad_sectors;
 						continue;
 					}
 				}
 				bio = r10_bio->devs[0].bio;
 				bio->bi_next = biolist;
 				biolist = bio;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_read;
 				bio->bi_rw = READ;
 				from_addr = r10_bio->devs[j].addr;
 				bio->bi_sector = from_addr +
 					conf->mirrors[d].rdev->data_offset;
 				bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 				atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 				atomic_inc(&r10_bio->remaining);
 				/* and we write to 'i' */
 				for (k=0; k<conf->copies; k++)
 					if (r10_bio->devs[k].devnum == i)
 						break;
 				BUG_ON(k == conf->copies);
 				bio = r10_bio->devs[1].bio;
 				bio->bi_next = biolist;
 				biolist = bio;
 				bio->bi_private = r10_bio;
 				bio->bi_end_io = end_sync_write;
 				bio->bi_rw = WRITE;
 				to_addr = r10_bio->devs[k].addr;
 				bio->bi_sector = to_addr +
 					conf->mirrors[i].rdev->data_offset;
 				bio->bi_bdev = conf->mirrors[i].rdev->bdev;
 				r10_bio->devs[0].devnum = d;
 				r10_bio->devs[0].addr = from_addr;
 				r10_bio->devs[1].devnum = i;
 				r10_bio->devs[1].addr = to_addr;
 				break;
 			}
 			if (j == conf->copies) {
 				/* Cannot recover, so abort the recovery or
 				 * record a bad block */
 				if (any_working) {
 					/* problem is that there are bad blocks
 					 * on other device(s)
 					 */
 					int k;
 					/* r10_bio must still be the one set
 					 * up above: its devs[] are needed to
 					 * find the target address on 'i'.
 					 */
 					for (k = 0; k < conf->copies; k++)
 						if (r10_bio->devs[k].devnum == i)
 							break;
 					if (!rdev_set_badblocks(
 						    conf->mirrors[i].rdev,
 						    r10_bio->devs[k].addr,
 						    max_sync, 0))
 						any_working = 0;
 				}
 				if (!any_working)  {
 					if (!test_and_set_bit(MD_RECOVERY_INTR,
 							      &mddev->recovery))
 						printk(KERN_INFO "md/raid10:%s: insufficient "
 						       "working devices for recovery.\n",
 						       mdname(mddev));
 					conf->mirrors[i].recovery_disabled
 						= mddev->recovery_disabled;
 				}
 				/* Only now release the r10_bio and fall back
 				 * to the previous one in the chain.
 				 */
 				put_buf(r10_bio);
 				if (rb2)
 					atomic_dec(&rb2->remaining);
 				r10_bio = rb2;
 				break;
 			}
 		}
 		if (biolist == NULL) {
 			while (r10_bio) {
 				struct r10bio *rb2 = r10_bio;
 				r10_bio = (struct r10bio*) rb2->master_bio;
 				rb2->master_bio = NULL;
 				put_buf(rb2);
 			}
 			goto giveup;
 		}
 	} else {
 		/* resync. Schedule a read for every block at this virt offset */
 		int count = 0;
 		bitmap_cond_end_sync(mddev->bitmap, sector_nr);
 		if (!bitmap_start_sync(mddev->bitmap, sector_nr,
 				       &sync_blocks, mddev->degraded) &&
 		    !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
 						 &mddev->recovery)) {
 			/* We can skip this block */
 			*skipped = 1;
 			return sync_blocks + sectors_skipped;
 		}
 		if (sync_blocks < max_sync)
 			max_sync = sync_blocks;
 		r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
 		r10_bio->mddev = mddev;
 		atomic_set(&r10_bio->remaining, 0);
 		raise_barrier(conf, 0);
 		conf->next_resync = sector_nr;
 		r10_bio->master_bio = NULL;
 		r10_bio->sector = sector_nr;
 		set_bit(R10BIO_IsSync, &r10_bio->state);
 		raid10_find_phys(conf, r10_bio);
 		r10_bio->sectors = (sector_nr | conf->chunk_mask) - sector_nr +1;
 		for (i=0; i<conf->copies; i++) {
 			int d = r10_bio->devs[i].devnum;
 			sector_t first_bad, sector;
 			int bad_sectors;
 			bio = r10_bio->devs[i].bio;
 			bio->bi_end_io = NULL;
 			clear_bit(BIO_UPTODATE, &bio->bi_flags);
 			if (conf->mirrors[d].rdev == NULL ||
 			    test_bit(Faulty, &conf->mirrors[d].rdev->flags))
 				continue;
 			sector = r10_bio->devs[i].addr;
 			if (is_badblock(conf->mirrors[d].rdev,
 					sector, max_sync,
 					&first_bad, &bad_sectors)) {
 				if (first_bad > sector)
 					max_sync = first_bad - sector;
 				else {
 					/* This copy starts inside a bad
 					 * range: skip it, and do not sync
 					 * past the end of that range.
 					 */
 					bad_sectors -= (sector - first_bad);
 					if (max_sync > bad_sectors)
 						max_sync = bad_sectors;
 					continue;
 				}
 			}
 			atomic_inc(&conf->mirrors[d].rdev->nr_pending);
 			atomic_inc(&r10_bio->remaining);
 			bio->bi_next = biolist;
 			biolist = bio;
 			bio->bi_private = r10_bio;
 			bio->bi_end_io = end_sync_read;
 			bio->bi_rw = READ;
 			bio->bi_sector = sector +
 				conf->mirrors[d].rdev->data_offset;
 			bio->bi_bdev = conf->mirrors[d].rdev->bdev;
 			count++;
 		}
 		if (count < 2) {
 			/* fewer than two readable copies: nothing to compare */
 			for (i=0; i<conf->copies; i++) {
 				int d = r10_bio->devs[i].devnum;
 				if (r10_bio->devs[i].bio->bi_end_io)
 					rdev_dec_pending(conf->mirrors[d].rdev,
 							 mddev);
 			}
 			put_buf(r10_bio);
 			biolist = NULL;
 			goto giveup;
 		}
 	}
 	/* Reset every bio on the list before pages are added */
 	for (bio = biolist; bio ; bio=bio->bi_next) {
 		bio->bi_flags &= ~(BIO_POOL_MASK - 1);
 		if (bio->bi_end_io)
 			bio->bi_flags |= 1 << BIO_UPTODATE;
 		bio->bi_vcnt = 0;
 		bio->bi_idx = 0;
 		bio->bi_phys_segments = 0;
 		bio->bi_size = 0;
 	}
 	nr_sectors = 0;
 	if (sector_nr + max_sync < max_sector)
 		max_sector = sector_nr + max_sync;
 	/* Add one page at a time to every bio on the list, so that all
 	 * of them end up covering the same number of sectors.
 	 */
 	do {
 		struct page *page;
 		int len = PAGE_SIZE;
 		if (sector_nr + (len>>9) > max_sector)
 			len = (max_sector - sector_nr) << 9;
 		if (len == 0)
 			break;
 		for (bio= biolist ; bio ; bio=bio->bi_next) {
 			struct bio *bio2;
 			page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
 			if (bio_add_page(bio, page, len, 0))
 				continue;
 			/* stop here */
 			bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
 			for (bio2 = biolist;
 			     bio2 && bio2 != bio;
 			     bio2 = bio2->bi_next) {
 				/* remove last page from this bio */
 				bio2->bi_vcnt--;
 				bio2->bi_size -= len;
 				bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
 			}
 			goto bio_full;
 		}
 		nr_sectors += len>>9;
 		sector_nr += len>>9;
 	} while (biolist->bi_vcnt < RESYNC_PAGES);
  bio_full:
 	r10_bio->sectors = nr_sectors;
 	while (biolist) {
 		bio = biolist;
 		biolist = biolist->bi_next;
 		bio->bi_next = NULL;
 		r10_bio = bio->bi_private;
 		r10_bio->sectors = nr_sectors;
 		/* only the reads are submitted here */
 		if (bio->bi_end_io == end_sync_read) {
 			md_sync_acct(bio->bi_bdev, nr_sectors);
 			generic_make_request(bio);
 		}
 	}
 	if (sectors_skipped)
 		/* pretend they weren't skipped, it makes
 		 * no important difference in this case
 		 */
 		md_done_sync(mddev, sectors_skipped, 1);
 	return sectors_skipped + nr_sectors;
  giveup:
 	/* There is nowhere to write, so all non-sync
 	 * drives must be failed or in resync, all drives
 	 * have a bad block, so try the next chunk...
 	 */
 	if (sector_nr + max_sync < max_sector)
 		max_sector = sector_nr + max_sync;
 	sectors_skipped += (max_sector - sector_nr);
 	chunks_skipped ++;
 	sector_nr = max_sector;
 	goto skipped;
 }
 /*
  * raid10_size - array size in sectors for a given geometry.
  *
  * @sectors:	per-device size; 0 means use conf->dev_sectors
  * @raid_disks:	number of devices; 0 means use conf->raid_disks
  *
  * Works in whole chunks: per-device chunks are divided by far_copies,
  * multiplied by the device count and divided by near_copies.
  */
 static sector_t
 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 {
 	struct r10conf *conf = mddev->private;
 	int disks = raid_disks ? raid_disks : conf->raid_disks;
 	sector_t dev_size = sectors ? sectors : conf->dev_sectors;
 	sector_t chunks = dev_size >> conf->chunk_shift;
 	sector_div(chunks, conf->far_copies);
 	chunks = chunks * disks;
 	sector_div(chunks, conf->near_copies);
 	return chunks << conf->chunk_shift;
 }
 /*
  * setup_conf - allocate and initialise the r10conf for 'mddev'.
  *
  * Validates the new chunk size (at least one page, power of 2) and the
  * new layout (near copies in bits 0-7, far copies in bits 8-15,
  * far-offset flag in bit 16, nothing above), then allocates the conf,
  * its mirror table, a scratch page, the r10bio pool and the management
  * thread, and derives the geometry fields.
  *
  * Returns the new conf, or ERR_PTR(-EINVAL) for a bad chunk size or
  * layout, or ERR_PTR(-ENOMEM) when an allocation fails.
  */
 static struct r10conf *setup_conf(struct mddev *mddev)
 {
 	struct r10conf *conf = NULL;
 	int nc, fc, fo;
 	sector_t stride, size;
 	int err = -EINVAL;
 	if (mddev->new_chunk_sectors < (PAGE_SIZE >> 9) ||
 	    !is_power_of_2(mddev->new_chunk_sectors)) {
 		printk(KERN_ERR "md/raid10:%s: chunk size must be "
 		       "at least PAGE_SIZE(%ld) and be a power of 2.\n",
 		       mdname(mddev), PAGE_SIZE);
 		goto out;
 	}
 	nc = mddev->new_layout & 255;
 	fc = (mddev->new_layout >> 8) & 255;
 	fo = mddev->new_layout & (1<<16);
 	if ((nc*fc) <2 || (nc*fc) > mddev->raid_disks ||
 	    (mddev->new_layout >> 17)) {
 		printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
 		       mdname(mddev), mddev->new_layout);
 		goto out;
 	}
 	/* From here on every failure is an allocation failure */
 	err = -ENOMEM;
 	conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
 	if (!conf)
 		goto out;
 	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
 				GFP_KERNEL);
 	if (!conf->mirrors)
 		goto out;
 	conf->tmppage = alloc_page(GFP_KERNEL);
 	if (!conf->tmppage)
 		goto out;
 	conf->raid_disks = mddev->raid_disks;
 	conf->near_copies = nc;
 	conf->far_copies = fc;
 	conf->copies = nc*fc;
 	conf->far_offset = fo;
 	conf->chunk_mask = mddev->new_chunk_sectors - 1;
 	conf->chunk_shift = ffz(~mddev->new_chunk_sectors);
 	conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
 					   r10bio_pool_free, conf);
 	if (!conf->r10bio_pool)
 		goto out;
 	size = mddev->dev_sectors >> conf->chunk_shift;
 	sector_div(size, fc);
 	size = size * conf->raid_disks;
 	sector_div(size, nc);
 	/* 'size' is now the number of chunks in the array */
 	/* calculate "used chunks per device" in 'stride' */
 	stride = size * conf->copies;
 	/* We need to round up when dividing by raid_disks to
 	 * get the stride size.
 	 */
 	stride += conf->raid_disks - 1;
 	sector_div(stride, conf->raid_disks);
 	conf->dev_sectors = stride << conf->chunk_shift;
 	if (fo)
 		stride = 1;
 	else
 		sector_div(stride, fc);
 	conf->stride = stride << conf->chunk_shift;
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
 	conf->thread = md_register_thread(raid10d, mddev, NULL);
 	if (!conf->thread)
 		goto out;
 	conf->mddev = mddev;
 	return conf;
  out:
 	/* The -EINVAL paths have already printed a specific message;
 	 * only report an allocation failure when that is what happened.
 	 */
 	if (err == -ENOMEM)
 		printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
 		       mdname(mddev));
 	if (conf) {
 		if (conf->r10bio_pool)
 			mempool_destroy(conf->r10bio_pool);
 		kfree(conf->mirrors);
 		safe_put_page(conf->tmppage);
 		kfree(conf);
 	}
 	return ERR_PTR(err);
 }
 /*
  * run - start the raid10 personality on 'mddev'.
  *
  * Uses the conf already stored in mddev->private if there is one,
  * otherwise builds it with setup_conf().  Then attaches the member
  * devices to their mirror slots, sets queue limits, counts degraded
  * slots, publishes the array size and read-ahead, and registers
  * integrity support.
  *
  * Returns 0 on success, the setup_conf() error if that fails, or -EIO
  * for any later failure (in which case the conf is freed).
  */
 static int run(struct mddev *mddev)
 {
 	struct r10conf *conf;
 	int i, disk_idx, chunk_size;
 	struct mirror_info *disk;
 	struct md_rdev *rdev;
 	sector_t size;
 	/*
 	 * copy the already verified devices into our private RAID10
 	 * bookkeeping area. [whatever we allocate in run(),
 	 * should be freed in stop()]
 	 */
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
 		if (IS_ERR(conf))
 			return PTR_ERR(conf);
 		mddev->private = conf;
 	}
 	conf = mddev->private;
 	if (!conf)
 		goto out;
 	/* hand the thread created by setup_conf() over to the mddev */
 	mddev->thread = conf->thread;
 	conf->thread = NULL;
 	chunk_size = mddev->chunk_sectors << 9;
 	blk_queue_io_min(mddev->queue, chunk_size);
 	/* optimal I/O size: chunk size times raid_disks, divided by
 	 * near_copies when that divides evenly
 	 */
 	if (conf->raid_disks % conf->near_copies)
 		blk_queue_io_opt(mddev->queue, chunk_size * conf->raid_disks);
 	else
 		blk_queue_io_opt(mddev->queue, chunk_size *
 				 (conf->raid_disks / conf->near_copies));
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
 		/* ignore devices without a valid slot in this array */
 		if (disk_idx >= conf->raid_disks
 		    || disk_idx < 0)
 			continue;
 		disk = conf->mirrors + disk_idx;
 		disk->rdev = rdev;
 		disk_stack_limits(mddev->gendisk, rdev->bdev,
 				  rdev->data_offset << 9);
 		/* as we don't honour merge_bvec_fn, we must never risk
 		 * violating it, so limit max_segments to 1 lying
 		 * within a single page.
 		 */
 		if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
 			blk_queue_max_segments(mddev->queue, 1);
 			blk_queue_segment_boundary(mddev->queue,
 						   PAGE_CACHE_SIZE - 1);
 		}
 		disk->head_position = 0;
 	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
 		printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
 	/* count slots that are empty or not In_sync; a present but
 	 * not-in-sync device forces a full sync
 	 */
 	mddev->degraded = 0;
 	for (i = 0; i < conf->raid_disks; i++) {
 		disk = conf->mirrors + i;
 		if (!disk->rdev ||
 		    !test_bit(In_sync, &disk->rdev->flags)) {
 			disk->head_position = 0;
 			mddev->degraded++;
 			if (disk->rdev)
 				conf->fullsync = 1;
 		}
 		/* differs from mddev->recovery_disabled, so recovery of
 		 * this slot is not considered disabled
 		 */
 		disk->recovery_disabled = mddev->recovery_disabled - 1;
 	}
 	if (mddev->recovery_cp != MaxSector)
 		printk(KERN_NOTICE "md/raid10:%s: not clean"
 		       " -- starting background reconstruction\n",
 		       mdname(mddev));
 	printk(KERN_INFO
 		"md/raid10:%s: active with %d out of %d devices\n",
 		mdname(mddev), conf->raid_disks - mddev->degraded,
 		conf->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
 	mddev->dev_sectors = conf->dev_sectors;
 	size = raid10_size(mddev, 0, 0);
 	md_set_array_sectors(mddev, size);
 	mddev->resync_max_sectors = size;
 	mddev->queue->backing_dev_info.congested_fn = raid10_congested;
 	mddev->queue->backing_dev_info.congested_data = mddev;
 	/* Calculate max read-ahead size.
 	 * We need to readahead at least twice a whole stripe....
 	 * maybe...
 	 */
 	{
 		int stripe = conf->raid_disks *
 			((mddev->chunk_sectors << 9) / PAGE_SIZE);
 		stripe /= conf->near_copies;
 		/* only ever raise ra_pages, never lower it */
 		if (mddev->queue->backing_dev_info.ra_pages < 2* stripe)
 			mddev->queue->backing_dev_info.ra_pages = 2* stripe;
 	}
 	if (conf->near_copies < conf->raid_disks)
 		blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
 	if (md_integrity_register(mddev))
 		goto out_free_conf;
 	return 0;
 out_free_conf:
 	/* undo everything setup_conf() allocated */
 	md_unregister_thread(&mddev->thread);
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
 out:
 	return -EIO;
 }
 /*
  * stop - shut the raid10 personality down and free its conf.
  *
  * Raises and immediately lowers the barrier (presumably to wait for
  * outstanding I/O to drain - confirm against raise_barrier()), stops
  * the management thread, syncs the queue, then releases everything
  * setup_conf() allocated, including the scratch page.
  * Always returns 0.
  */
 static int stop(struct mddev *mddev)
 {
 	struct r10conf *conf = mddev->private;
 	raise_barrier(conf, 0);
 	lower_barrier(conf);
 	md_unregister_thread(&mddev->thread);
 	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
 	if (conf->r10bio_pool)
 		mempool_destroy(conf->r10bio_pool);
 	/* tmppage is allocated in setup_conf() and must not be leaked */
 	safe_put_page(conf->tmppage);
 	kfree(conf->mirrors);
 	kfree(conf);
 	mddev->private = NULL;
 	return 0;
 }
 /*
  * raid10_quiesce - raise (state 1) or lower (state 0) the I/O barrier.
  * Any other value of 'state' is ignored.
  */
 static void raid10_quiesce(struct mddev *mddev, int state)
 {
 	struct r10conf *conf = mddev->private;
 	if (state == 1)
 		raise_barrier(conf, 0);
 	else if (state == 0)
 		lower_barrier(conf);
 }
 static void *raid10_takeover_raid0(struct mddev *mddev)
 {
 	struct md_rdev *rdev;
 	struct r10conf *conf;
 	if (mddev->degraded > 0) {
 		printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 	/* Set new parameters */
 	mddev->new_level = 10;
 	/* new layout: far_copies = 1, near_copies = 2 */
 	mddev->new_layout = (1<<8) + 2;
 	mddev->new_chunk_sectors = mddev->chunk_sectors;
 	mddev->delta_disks = mddev->raid_disks;
 	mddev->raid_disks *= 2;
 	/* make sure it will be not marked as dirty */
 	mddev->recovery_cp = MaxSector;
 	conf = setup_conf(mddev);
 	if (!IS_ERR(conf)) {
 		list_for_each_entry(rdev, &mddev->disks, same_set)
 			if (rdev->raid_disk >= 0)
 				rdev->new_raid_disk = rdev->raid_disk * 2;
 		conf->barrier = 1;
 	}
 	return conf;
 }
 /*
  * raid10_takeover - personality 'takeover' hook.
  *
  * Only a level-0 array with a single strip zone can be taken over;
  * that case is delegated to raid10_takeover_raid0().  Everything else
  * yields ERR_PTR(-EINVAL).
  */
 static void *raid10_takeover(struct mddev *mddev)
 {
 	struct r0conf *raid0_conf;
 	if (mddev->level != 0)
 		return ERR_PTR(-EINVAL);
 	/* for raid0 takeover only one zone is supported */
 	raid0_conf = mddev->private;
 	if (raid0_conf->nr_strip_zones > 1) {
 		printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
 		       " with more than one zone.\n",
 		       mdname(mddev));
 		return ERR_PTR(-EINVAL);
 	}
 	return raid10_takeover_raid0(mddev);
 }
 /*
  * Operations table through which the md core drives this personality.
  * It is registered in raid_init() and unregistered in raid_exit().
  */
 static struct md_personality raid10_personality =
 {
 	.name		= "raid10",
 	.level		= 10,
 	.owner		= THIS_MODULE,
 	.make_request	= make_request,
 	.run		= run,
 	.stop		= stop,
 	.status		= status,
 	.error_handler	= error,
 	.hot_add_disk	= raid10_add_disk,
 	.hot_remove_disk= raid10_remove_disk,
 	.spare_active	= raid10_spare_active,
 	.sync_request	= sync_request,
 	.quiesce	= raid10_quiesce,
 	.size		= raid10_size,
 	.takeover	= raid10_takeover,
 };
 /* Module entry point: register the raid10 personality with md. */
 static int __init raid_init(void)
 {
 	int err = register_md_personality(&raid10_personality);
 	return err;
 }
 /* Module exit point: unregister the raid10 personality from md. */
 static void raid_exit(void)
 {
 	unregister_md_personality(&raid10_personality);
 }
 /* Module entry/exit hooks and metadata */
 module_init(raid_init);
 module_exit(raid_exit);
 MODULE_LICENSE("GPL");
 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
 MODULE_ALIAS("md-personality-9"); /* RAID10 */
 MODULE_ALIAS("md-raid10");
 MODULE_ALIAS("md-level-10");
 /* max_queued_requests as an int module parameter: readable by all,
  * writable by the owner
  */
 module_param(max_queued_requests, int, S_IRUGO|S_IWUSR);

1	#ifndef _RAID10_H	1	#ifndef _RAID10_H
2	#define _RAID10_H	2	#define _RAID10_H
3		3
4	struct mirror_info {	4	struct mirror_info {
5	struct md_rdev *rdev;	5	struct md_rdev rdev, replacement;
6	sector_t head_position;	6	sector_t head_position;
7	int recovery_disabled; /* matches	7	int recovery_disabled; /* matches
8	* mddev->recovery_disabled	8	* mddev->recovery_disabled
9	* when we shouldn't try	9	* when we shouldn't try
10	* recovering this device.	10	* recovering this device.
11	*/	11	*/
12	};	12	};
13		13
14	struct r10conf {	14	struct r10conf {
15	struct mddev *mddev;	15	struct mddev *mddev;
16	struct mirror_info *mirrors;	16	struct mirror_info *mirrors;
17	int raid_disks;	17	int raid_disks;
18	spinlock_t device_lock;	18	spinlock_t device_lock;
19		19
20	/* geometry */	20	/* geometry */
21	int near_copies; /* number of copies laid out raid0 style */	21	int near_copies; /* number of copies laid out
		22	* raid0 style */
22	int far_copies; /* number of copies laid out	23	int far_copies; /* number of copies laid out
23	* at large strides across drives	24	* at large strides across drives
24	*/	25	*/
25	int far_offset; /* far_copies are offset by 1 stripe	26	int far_offset; /* far_copies are offset by 1
26	* instead of many	27	* stripe instead of many
27	*/	28	*/
28	int copies; /* near_copies * far_copies.	29	int copies; /* near_copies * far_copies.
29	* must be <= raid_disks	30	* must be <= raid_disks
30	*/	31	*/
31	sector_t stride; /* distance between far copies.	32	sector_t stride; /* distance between far copies.
32	* This is size / far_copies unless	33	* This is size / far_copies unless
33	* far_offset, in which case it is	34	* far_offset, in which case it is
34	* 1 stripe.	35	* 1 stripe.
35	*/	36	*/
36		37
37	sector_t dev_sectors; /* temp copy of mddev->dev_sectors */	38	sector_t dev_sectors; /* temp copy of
		39	* mddev->dev_sectors */
38		40
39	int chunk_shift; /* shift from chunks to sectors */	41	int chunk_shift; /* shift from chunks to sectors */
40	sector_t chunk_mask;	42	sector_t chunk_mask;
41		43
42	struct list_head retry_list;	44	struct list_head retry_list;
43	/* queue pending writes and submit them on unplug */	45	/* queue pending writes and submit them on unplug */
44	struct bio_list pending_bio_list;	46	struct bio_list pending_bio_list;
45	int pending_count;	47	int pending_count;
46		48
47	spinlock_t resync_lock;	49	spinlock_t resync_lock;
48	int nr_pending;	50	int nr_pending;
49	int nr_waiting;	51	int nr_waiting;
50	int nr_queued;	52	int nr_queued;
51	int barrier;	53	int barrier;
52	sector_t next_resync;	54	sector_t next_resync;
53	int fullsync; /* set to 1 if a full sync is needed,	55	int fullsync; /* set to 1 if a full sync is needed,
54	* (fresh device added).	56	* (fresh device added).
55	* Cleared when a sync completes.	57	* Cleared when a sync completes.
56	*/	58	*/
57		59	int have_replacement; /* There is at least one
		60	* replacement device.
		61	*/
58	wait_queue_head_t wait_barrier;	62	wait_queue_head_t wait_barrier;
59		63
60	mempool_t *r10bio_pool;	64	mempool_t *r10bio_pool;
61	mempool_t *r10buf_pool;	65	mempool_t *r10buf_pool;
62	struct page *tmppage;	66	struct page *tmppage;
63		67
64	/* When taking over an array from a different personality, we store	68	/* When taking over an array from a different personality, we store
65	* the new thread here until we fully activate the array.	69	* the new thread here until we fully activate the array.
66	*/	70	*/
67	struct md_thread *thread;	71	struct md_thread *thread;
68	};	72	};
69		73
70	/*	74	/*
71	* this is our 'private' RAID10 bio.	75	* this is our 'private' RAID10 bio.
72	*	76	*
73	* it contains information about what kind of IO operations were started	77	* it contains information about what kind of IO operations were started
74	* for this RAID10 operation, and about their status:	78	* for this RAID10 operation, and about their status:
75	*/	79	*/
76		80
77	struct r10bio {	81	struct r10bio {
78	atomic_t remaining; /* 'have we finished' count,	82	atomic_t remaining; /* 'have we finished' count,
79	* used from IRQ handlers	83	* used from IRQ handlers
80	*/	84	*/
81	sector_t sector; /* virtual sector number */	85	sector_t sector; /* virtual sector number */
82	int sectors;	86	int sectors;
83	unsigned long state;	87	unsigned long state;
84	struct mddev *mddev;	88	struct mddev *mddev;
85	/*	89	/*
86	* original bio going to /dev/mdx	90	* original bio going to /dev/mdx
87	*/	91	*/
88	struct bio *master_bio;	92	struct bio *master_bio;
89	/*	93	/*
90	* if the IO is in READ direction, then this is where we read	94	* if the IO is in READ direction, then this is where we read
91	*/	95	*/
92	int read_slot;	96	int read_slot;
93		97
94	struct list_head retry_list;	98	struct list_head retry_list;
95	/*	99	/*
96	* if the IO is in WRITE direction, then multiple bios are used,	100	* if the IO is in WRITE direction, then multiple bios are used,
97	* one for each copy.	101	* one for each copy.
98	* When resyncing we also use one for each copy.	102	* When resyncing we also use one for each copy.
99	* When reconstructing, we use 2 bios, one for read, one for write.	103	* When reconstructing, we use 2 bios, one for read, one for write.
100	* We choose the number when they are allocated.	104	* We choose the number when they are allocated.
		105	* We sometimes need an extra bio to write to the replacement.
101	*/	106	*/
102	struct {	107	struct {
103	struct bio *bio;	108	struct bio *bio;
104	sector_t addr;	109	union {
105	int devnum;	110	struct bio *repl_bio; /* used for resync and
		111	* writes */
		112	struct md_rdev *rdev; /* used for reads
		113	* (read_slot >= 0) */
		114	};
		115	sector_t addr;
		116	int devnum;
106	} devs[0];	117	} devs[0];
107	};	118	};
108		119
109	/* when we get a read error on a read-only array, we redirect to another	120	/* when we get a read error on a read-only array, we redirect to another
110	* device without failing the first device, or trying to over-write to	121	* device without failing the first device, or trying to over-write to
111	* correct the read error. To keep track of bad blocks on a per-bio	122	* correct the read error. To keep track of bad blocks on a per-bio
112	* level, we store IO_BLOCKED in the appropriate 'bios' pointer	123	* level, we store IO_BLOCKED in the appropriate 'bios' pointer
113	*/	124	*/
114	#define IO_BLOCKED ((struct bio*)1)	125	#define IO_BLOCKED ((struct bio*)1)
115	/* When we successfully write to a known bad-block, we need to remove the	126	/* When we successfully write to a known bad-block, we need to remove the
116	* bad-block marking which must be done from process context. So we record	127	* bad-block marking which must be done from process context. So we record
117	* the success by setting devs[n].bio to IO_MADE_GOOD	128	* the success by setting devs[n].bio to IO_MADE_GOOD
118	*/	129	*/
119	#define IO_MADE_GOOD ((struct bio *)2)	130	#define IO_MADE_GOOD ((struct bio *)2)
120		131
121	#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)	132	#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
122		133
123	/* bits for r10bio.state */	134	/* bits for r10bio.state */
124	#define R10BIO_Uptodate 0	135	enum r10bio_state {
125	#define R10BIO_IsSync 1	136	R10BIO_Uptodate,
126	#define R10BIO_IsRecover 2	137	R10BIO_IsSync,
127	#define R10BIO_Degraded 3	138	R10BIO_IsRecover,
		139	R10BIO_Degraded,
128	/* Set ReadError on bios that experience a read error	140	/* Set ReadError on bios that experience a read error
129	* so that raid10d knows what to do with them.	141	* so that raid10d knows what to do with them.
130	*/	142	*/
131	#define R10BIO_ReadError 4	143	R10BIO_ReadError,
132	/* If a write for this request means we can clear some	144	/* If a write for this request means we can clear some
133	* known-bad-block records, we set this flag.	145	* known-bad-block records, we set this flag.
134	*/	146	*/
135	#define R10BIO_MadeGood 5	147	R10BIO_MadeGood,
136	#define R10BIO_WriteError 6	148	R10BIO_WriteError,
		149	};
137	#endif	150	#endif
138		151