Commit 62de608da0b0ab17d81a233b50d1e952b9816f69

Authored by NeilBrown
Committed by Linus Torvalds
1 parent bea2771871

[PATCH] md: Improve detection of lack of barrier support in raid1

Move the 'do barriers work' test further down in make_request() so that if the
first write to a raid1 array is a BIO_RW_BARRIER write, the barrier-support check
performed by the superblock write (triggered from md_write_start) has already run
and the right thing happens.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 11 additions and 5 deletions
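
Before the full listing, a condensed sketch of the change in make_request() may help.
Previously the barriers_work test ran before md_write_start(); but md_write_start()
is what can trigger the first superblock write, and that write is what discovers
whether the underlying devices actually support barriers. Moving the test after
md_write_start() (and pairing it with md_write_end() for writes, since
md_write_start() has already been called) means that even a leading BIO_RW_BARRIER
request sees an up-to-date barriers_work value. The snippet below is an abbreviated
paraphrase of the hunk, not the complete function:

	static int make_request(request_queue_t *q, struct bio *bio)
	{
		mddev_t *mddev = q->queuedata;
		const int rw = bio_data_dir(bio);
		/* ... other local declarations omitted ... */

		/*
		 * Old placement (removed by this patch): the barrier-support
		 * test sat here, before md_write_start(), so barriers_work
		 * could still be stale if no superblock write had happened yet.
		 */

		md_write_start(mddev, bio);	/* may cause the first superblock
						 * write, which probes barrier support */

		/* New placement: barriers_work is now trustworthy. */
		if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
			if (rw == WRITE)
				md_write_end(mddev);	/* balance md_write_start() */
			bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
			return 0;
		}

		/* ... the rest of make_request() is unchanged ... */
		return 0;
	}

In the changed hunk within the listing below, removed lines are marked with '-'
(old line numbers) and added lines with '+' (new line numbers); unchanged context
shows both the old and new line numbers.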

1 /* 1 /*
2 * raid1.c : Multiple Devices driver for Linux 2 * raid1.c : Multiple Devices driver for Linux
3 * 3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat 4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 * 5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman 6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 * 7 *
8 * RAID-1 management functions. 8 * RAID-1 management functions.
9 * 9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 * 11 *
12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> 12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> 13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 * 14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support 15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync: 16 * bitmapped intelligence in resync:
17 * 17 *
18 * - bitmap marked during normal i/o 18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync 19 * - bitmap used to skip nondirty blocks during sync
20 * 20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: 21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code 22 * - persistent bitmap code
23 * 23 *
24 * This program is free software; you can redistribute it and/or modify 24 * This program is free software; you can redistribute it and/or modify
25 * it under the terms of the GNU General Public License as published by 25 * it under the terms of the GNU General Public License as published by
26 * the Free Software Foundation; either version 2, or (at your option) 26 * the Free Software Foundation; either version 2, or (at your option)
27 * any later version. 27 * any later version.
28 * 28 *
29 * You should have received a copy of the GNU General Public License 29 * You should have received a copy of the GNU General Public License
30 * (for example /usr/src/linux/COPYING); if not, write to the Free 30 * (for example /usr/src/linux/COPYING); if not, write to the Free
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34 #include "dm-bio-list.h" 34 #include "dm-bio-list.h"
35 #include <linux/raid/raid1.h> 35 #include <linux/raid/raid1.h>
36 #include <linux/raid/bitmap.h> 36 #include <linux/raid/bitmap.h>
37 37
38 #define DEBUG 0 38 #define DEBUG 0
39 #if DEBUG 39 #if DEBUG
40 #define PRINTK(x...) printk(x) 40 #define PRINTK(x...) printk(x)
41 #else 41 #else
42 #define PRINTK(x...) 42 #define PRINTK(x...)
43 #endif 43 #endif
44 44
45 /* 45 /*
46 * Number of guaranteed r1bios in case of extreme VM load: 46 * Number of guaranteed r1bios in case of extreme VM load:
47 */ 47 */
48 #define NR_RAID1_BIOS 256 48 #define NR_RAID1_BIOS 256
49 49
50 50
51 static void unplug_slaves(mddev_t *mddev); 51 static void unplug_slaves(mddev_t *mddev);
52 52
53 static void allow_barrier(conf_t *conf); 53 static void allow_barrier(conf_t *conf);
54 static void lower_barrier(conf_t *conf); 54 static void lower_barrier(conf_t *conf);
55 55
56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
57 { 57 {
58 struct pool_info *pi = data; 58 struct pool_info *pi = data;
59 r1bio_t *r1_bio; 59 r1bio_t *r1_bio;
60 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
61 61
62 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 /* allocate a r1bio with room for raid_disks entries in the bios array */
63 r1_bio = kzalloc(size, gfp_flags); 63 r1_bio = kzalloc(size, gfp_flags);
64 if (!r1_bio) 64 if (!r1_bio)
65 unplug_slaves(pi->mddev); 65 unplug_slaves(pi->mddev);
66 66
67 return r1_bio; 67 return r1_bio;
68 } 68 }
69 69
70 static void r1bio_pool_free(void *r1_bio, void *data) 70 static void r1bio_pool_free(void *r1_bio, void *data)
71 { 71 {
72 kfree(r1_bio); 72 kfree(r1_bio);
73 } 73 }
74 74
75 #define RESYNC_BLOCK_SIZE (64*1024) 75 #define RESYNC_BLOCK_SIZE (64*1024)
76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE 76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE
77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
79 #define RESYNC_WINDOW (2048*1024) 79 #define RESYNC_WINDOW (2048*1024)
80 80
81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
82 { 82 {
83 struct pool_info *pi = data; 83 struct pool_info *pi = data;
84 struct page *page; 84 struct page *page;
85 r1bio_t *r1_bio; 85 r1bio_t *r1_bio;
86 struct bio *bio; 86 struct bio *bio;
87 int i, j; 87 int i, j;
88 88
89 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 89 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
90 if (!r1_bio) { 90 if (!r1_bio) {
91 unplug_slaves(pi->mddev); 91 unplug_slaves(pi->mddev);
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 /* 95 /*
96 * Allocate bios : 1 for reading, n-1 for writing 96 * Allocate bios : 1 for reading, n-1 for writing
97 */ 97 */
98 for (j = pi->raid_disks ; j-- ; ) { 98 for (j = pi->raid_disks ; j-- ; ) {
99 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 99 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
100 if (!bio) 100 if (!bio)
101 goto out_free_bio; 101 goto out_free_bio;
102 r1_bio->bios[j] = bio; 102 r1_bio->bios[j] = bio;
103 } 103 }
104 /* 104 /*
105 * Allocate RESYNC_PAGES data pages and attach them to 105 * Allocate RESYNC_PAGES data pages and attach them to
106 * the first bio. 106 * the first bio.
107 * If this is a user-requested check/repair, allocate 107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio. 108 * RESYNC_PAGES for each bio.
109 */ 109 */
110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) 110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
111 j = pi->raid_disks; 111 j = pi->raid_disks;
112 else 112 else
113 j = 1; 113 j = 1;
114 while(j--) { 114 while(j--) {
115 bio = r1_bio->bios[j]; 115 bio = r1_bio->bios[j];
116 for (i = 0; i < RESYNC_PAGES; i++) { 116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags); 117 page = alloc_page(gfp_flags);
118 if (unlikely(!page)) 118 if (unlikely(!page))
119 goto out_free_pages; 119 goto out_free_pages;
120 120
121 bio->bi_io_vec[i].bv_page = page; 121 bio->bi_io_vec[i].bv_page = page;
122 } 122 }
123 } 123 }
124 /* If not user-requests, copy the page pointers to all bios */ 124 /* If not user-requests, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { 125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++) 126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++) 127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page = 128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page; 129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
130 } 130 }
131 131
132 r1_bio->master_bio = NULL; 132 r1_bio->master_bio = NULL;
133 133
134 return r1_bio; 134 return r1_bio;
135 135
136 out_free_pages: 136 out_free_pages:
137 for (i=0; i < RESYNC_PAGES ; i++) 137 for (i=0; i < RESYNC_PAGES ; i++)
138 for (j=0 ; j < pi->raid_disks; j++) 138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); 139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1; 140 j = -1;
141 out_free_bio: 141 out_free_bio:
142 while ( ++j < pi->raid_disks ) 142 while ( ++j < pi->raid_disks )
143 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
144 r1bio_pool_free(r1_bio, data); 144 r1bio_pool_free(r1_bio, data);
145 return NULL; 145 return NULL;
146 } 146 }
147 147
148 static void r1buf_pool_free(void *__r1_bio, void *data) 148 static void r1buf_pool_free(void *__r1_bio, void *data)
149 { 149 {
150 struct pool_info *pi = data; 150 struct pool_info *pi = data;
151 int i,j; 151 int i,j;
152 r1bio_t *r1bio = __r1_bio; 152 r1bio_t *r1bio = __r1_bio;
153 153
154 for (i = 0; i < RESYNC_PAGES; i++) 154 for (i = 0; i < RESYNC_PAGES; i++)
155 for (j = pi->raid_disks; j-- ;) { 155 for (j = pi->raid_disks; j-- ;) {
156 if (j == 0 || 156 if (j == 0 ||
157 r1bio->bios[j]->bi_io_vec[i].bv_page != 157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page) 158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); 159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 } 160 }
161 for (i=0 ; i < pi->raid_disks; i++) 161 for (i=0 ; i < pi->raid_disks; i++)
162 bio_put(r1bio->bios[i]); 162 bio_put(r1bio->bios[i]);
163 163
164 r1bio_pool_free(r1bio, data); 164 r1bio_pool_free(r1bio, data);
165 } 165 }
166 166
167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) 167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
168 { 168 {
169 int i; 169 int i;
170 170
171 for (i = 0; i < conf->raid_disks; i++) { 171 for (i = 0; i < conf->raid_disks; i++) {
172 struct bio **bio = r1_bio->bios + i; 172 struct bio **bio = r1_bio->bios + i;
173 if (*bio && *bio != IO_BLOCKED) 173 if (*bio && *bio != IO_BLOCKED)
174 bio_put(*bio); 174 bio_put(*bio);
175 *bio = NULL; 175 *bio = NULL;
176 } 176 }
177 } 177 }
178 178
179 static void free_r1bio(r1bio_t *r1_bio) 179 static void free_r1bio(r1bio_t *r1_bio)
180 { 180 {
181 conf_t *conf = mddev_to_conf(r1_bio->mddev); 181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
182 182
183 /* 183 /*
184 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
185 * to go idle. 185 * to go idle.
186 */ 186 */
187 allow_barrier(conf); 187 allow_barrier(conf);
188 188
189 put_all_bios(conf, r1_bio); 189 put_all_bios(conf, r1_bio);
190 mempool_free(r1_bio, conf->r1bio_pool); 190 mempool_free(r1_bio, conf->r1bio_pool);
191 } 191 }
192 192
193 static void put_buf(r1bio_t *r1_bio) 193 static void put_buf(r1bio_t *r1_bio)
194 { 194 {
195 conf_t *conf = mddev_to_conf(r1_bio->mddev); 195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
196 int i; 196 int i;
197 197
198 for (i=0; i<conf->raid_disks; i++) { 198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i]; 199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io) 200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 } 202 }
203 203
204 mempool_free(r1_bio, conf->r1buf_pool); 204 mempool_free(r1_bio, conf->r1buf_pool);
205 205
206 lower_barrier(conf); 206 lower_barrier(conf);
207 } 207 }
208 208
209 static void reschedule_retry(r1bio_t *r1_bio) 209 static void reschedule_retry(r1bio_t *r1_bio)
210 { 210 {
211 unsigned long flags; 211 unsigned long flags;
212 mddev_t *mddev = r1_bio->mddev; 212 mddev_t *mddev = r1_bio->mddev;
213 conf_t *conf = mddev_to_conf(mddev); 213 conf_t *conf = mddev_to_conf(mddev);
214 214
215 spin_lock_irqsave(&conf->device_lock, flags); 215 spin_lock_irqsave(&conf->device_lock, flags);
216 list_add(&r1_bio->retry_list, &conf->retry_list); 216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++; 217 conf->nr_queued ++;
218 spin_unlock_irqrestore(&conf->device_lock, flags); 218 spin_unlock_irqrestore(&conf->device_lock, flags);
219 219
220 wake_up(&conf->wait_barrier); 220 wake_up(&conf->wait_barrier);
221 md_wakeup_thread(mddev->thread); 221 md_wakeup_thread(mddev->thread);
222 } 222 }
223 223
224 /* 224 /*
225 * raid_end_bio_io() is called when we have finished servicing a mirrored 225 * raid_end_bio_io() is called when we have finished servicing a mirrored
226 * operation and are ready to return a success/failure code to the buffer 226 * operation and are ready to return a success/failure code to the buffer
227 * cache layer. 227 * cache layer.
228 */ 228 */
229 static void raid_end_bio_io(r1bio_t *r1_bio) 229 static void raid_end_bio_io(r1bio_t *r1_bio)
230 { 230 {
231 struct bio *bio = r1_bio->master_bio; 231 struct bio *bio = r1_bio->master_bio;
232 232
233 /* if nobody has done the final endio yet, do it now */ 233 /* if nobody has done the final endio yet, do it now */
234 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 234 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
235 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", 235 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
236 (bio_data_dir(bio) == WRITE) ? "write" : "read", 236 (bio_data_dir(bio) == WRITE) ? "write" : "read",
237 (unsigned long long) bio->bi_sector, 237 (unsigned long long) bio->bi_sector,
238 (unsigned long long) bio->bi_sector + 238 (unsigned long long) bio->bi_sector +
239 (bio->bi_size >> 9) - 1); 239 (bio->bi_size >> 9) - 1);
240 240
241 bio_endio(bio, bio->bi_size, 241 bio_endio(bio, bio->bi_size,
242 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 242 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
243 } 243 }
244 free_r1bio(r1_bio); 244 free_r1bio(r1_bio);
245 } 245 }
246 246
247 /* 247 /*
248 * Update disk head position estimator based on IRQ completion info. 248 * Update disk head position estimator based on IRQ completion info.
249 */ 249 */
250 static inline void update_head_pos(int disk, r1bio_t *r1_bio) 250 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
251 { 251 {
252 conf_t *conf = mddev_to_conf(r1_bio->mddev); 252 conf_t *conf = mddev_to_conf(r1_bio->mddev);
253 253
254 conf->mirrors[disk].head_position = 254 conf->mirrors[disk].head_position =
255 r1_bio->sector + (r1_bio->sectors); 255 r1_bio->sector + (r1_bio->sectors);
256 } 256 }
257 257
258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) 258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
259 { 259 {
260 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 260 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
261 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 261 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
262 int mirror; 262 int mirror;
263 conf_t *conf = mddev_to_conf(r1_bio->mddev); 263 conf_t *conf = mddev_to_conf(r1_bio->mddev);
264 264
265 if (bio->bi_size) 265 if (bio->bi_size)
266 return 1; 266 return 1;
267 267
268 mirror = r1_bio->read_disk; 268 mirror = r1_bio->read_disk;
269 /* 269 /*
270 * this branch is our 'one mirror IO has finished' event handler: 270 * this branch is our 'one mirror IO has finished' event handler:
271 */ 271 */
272 update_head_pos(mirror, r1_bio); 272 update_head_pos(mirror, r1_bio);
273 273
274 if (uptodate || conf->working_disks <= 1) { 274 if (uptodate || conf->working_disks <= 1) {
275 /* 275 /*
276 * Set R1BIO_Uptodate in our master bio, so that 276 * Set R1BIO_Uptodate in our master bio, so that
277 * we will return a good error code for to the higher 277 * we will return a good error code for to the higher
278 * levels even if IO on some other mirrored buffer fails. 278 * levels even if IO on some other mirrored buffer fails.
279 * 279 *
280 * The 'master' represents the composite IO operation to 280 * The 'master' represents the composite IO operation to
281 * user-side. So if something waits for IO, then it will 281 * user-side. So if something waits for IO, then it will
282 * wait for the 'master' bio. 282 * wait for the 'master' bio.
283 */ 283 */
284 if (uptodate) 284 if (uptodate)
285 set_bit(R1BIO_Uptodate, &r1_bio->state); 285 set_bit(R1BIO_Uptodate, &r1_bio->state);
286 286
287 raid_end_bio_io(r1_bio); 287 raid_end_bio_io(r1_bio);
288 } else { 288 } else {
289 /* 289 /*
290 * oops, read error: 290 * oops, read error:
291 */ 291 */
292 char b[BDEVNAME_SIZE]; 292 char b[BDEVNAME_SIZE];
293 if (printk_ratelimit()) 293 if (printk_ratelimit())
294 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 294 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
295 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 295 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
296 reschedule_retry(r1_bio); 296 reschedule_retry(r1_bio);
297 } 297 }
298 298
299 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 299 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
300 return 0; 300 return 0;
301 } 301 }
302 302
303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) 303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
304 { 304 {
305 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 305 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
306 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 306 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
307 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 307 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
308 conf_t *conf = mddev_to_conf(r1_bio->mddev); 308 conf_t *conf = mddev_to_conf(r1_bio->mddev);
309 struct bio *to_put = NULL; 309 struct bio *to_put = NULL;
310 310
311 if (bio->bi_size) 311 if (bio->bi_size)
312 return 1; 312 return 1;
313 313
314 for (mirror = 0; mirror < conf->raid_disks; mirror++) 314 for (mirror = 0; mirror < conf->raid_disks; mirror++)
315 if (r1_bio->bios[mirror] == bio) 315 if (r1_bio->bios[mirror] == bio)
316 break; 316 break;
317 317
318 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 318 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
319 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 319 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
320 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 320 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
321 r1_bio->mddev->barriers_work = 0; 321 r1_bio->mddev->barriers_work = 0;
322 } else { 322 } else {
323 /* 323 /*
324 * this branch is our 'one mirror IO has finished' event handler: 324 * this branch is our 'one mirror IO has finished' event handler:
325 */ 325 */
326 r1_bio->bios[mirror] = NULL; 326 r1_bio->bios[mirror] = NULL;
327 to_put = bio; 327 to_put = bio;
328 if (!uptodate) { 328 if (!uptodate) {
329 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 329 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
330 /* an I/O failed, we can't clear the bitmap */ 330 /* an I/O failed, we can't clear the bitmap */
331 set_bit(R1BIO_Degraded, &r1_bio->state); 331 set_bit(R1BIO_Degraded, &r1_bio->state);
332 } else 332 } else
333 /* 333 /*
334 * Set R1BIO_Uptodate in our master bio, so that 334 * Set R1BIO_Uptodate in our master bio, so that
335 * we will return a good error code for to the higher 335 * we will return a good error code for to the higher
336 * levels even if IO on some other mirrored buffer fails. 336 * levels even if IO on some other mirrored buffer fails.
337 * 337 *
338 * The 'master' represents the composite IO operation to 338 * The 'master' represents the composite IO operation to
339 * user-side. So if something waits for IO, then it will 339 * user-side. So if something waits for IO, then it will
340 * wait for the 'master' bio. 340 * wait for the 'master' bio.
341 */ 341 */
342 set_bit(R1BIO_Uptodate, &r1_bio->state); 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
343 343
344 update_head_pos(mirror, r1_bio); 344 update_head_pos(mirror, r1_bio);
345 345
346 if (behind) { 346 if (behind) {
347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
348 atomic_dec(&r1_bio->behind_remaining); 348 atomic_dec(&r1_bio->behind_remaining);
349 349
350 /* In behind mode, we ACK the master bio once the I/O has safely 350 /* In behind mode, we ACK the master bio once the I/O has safely
351 * reached all non-writemostly disks. Setting the Returned bit 351 * reached all non-writemostly disks. Setting the Returned bit
352 * ensures that this gets done only once -- we don't ever want to 352 * ensures that this gets done only once -- we don't ever want to
353 * return -EIO here, instead we'll wait */ 353 * return -EIO here, instead we'll wait */
354 354
355 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 355 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
356 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 356 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
357 /* Maybe we can return now */ 357 /* Maybe we can return now */
358 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 358 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
359 struct bio *mbio = r1_bio->master_bio; 359 struct bio *mbio = r1_bio->master_bio;
360 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 360 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
361 (unsigned long long) mbio->bi_sector, 361 (unsigned long long) mbio->bi_sector,
362 (unsigned long long) mbio->bi_sector + 362 (unsigned long long) mbio->bi_sector +
363 (mbio->bi_size >> 9) - 1); 363 (mbio->bi_size >> 9) - 1);
364 bio_endio(mbio, mbio->bi_size, 0); 364 bio_endio(mbio, mbio->bi_size, 0);
365 } 365 }
366 } 366 }
367 } 367 }
368 } 368 }
369 /* 369 /*
370 * 370 *
371 * Let's see if all mirrored write operations have finished 371 * Let's see if all mirrored write operations have finished
372 * already. 372 * already.
373 */ 373 */
374 if (atomic_dec_and_test(&r1_bio->remaining)) { 374 if (atomic_dec_and_test(&r1_bio->remaining)) {
375 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 375 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
376 reschedule_retry(r1_bio); 376 reschedule_retry(r1_bio);
377 /* Don't dec_pending yet, we want to hold 377 /* Don't dec_pending yet, we want to hold
378 * the reference over the retry 378 * the reference over the retry
379 */ 379 */
380 goto out; 380 goto out;
381 } 381 }
382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
383 /* free extra copy of the data pages */ 383 /* free extra copy of the data pages */
384 int i = bio->bi_vcnt; 384 int i = bio->bi_vcnt;
385 while (i--) 385 while (i--)
386 safe_put_page(bio->bi_io_vec[i].bv_page); 386 safe_put_page(bio->bi_io_vec[i].bv_page);
387 } 387 }
388 /* clear the bitmap if all writes complete successfully */ 388 /* clear the bitmap if all writes complete successfully */
389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390 r1_bio->sectors, 390 r1_bio->sectors,
391 !test_bit(R1BIO_Degraded, &r1_bio->state), 391 !test_bit(R1BIO_Degraded, &r1_bio->state),
392 behind); 392 behind);
393 md_write_end(r1_bio->mddev); 393 md_write_end(r1_bio->mddev);
394 raid_end_bio_io(r1_bio); 394 raid_end_bio_io(r1_bio);
395 } 395 }
396 396
397 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 397 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
398 out: 398 out:
399 if (to_put) 399 if (to_put)
400 bio_put(to_put); 400 bio_put(to_put);
401 401
402 return 0; 402 return 0;
403 } 403 }
404 404
405 405
406 /* 406 /*
407 * This routine returns the disk from which the requested read should 407 * This routine returns the disk from which the requested read should
408 * be done. There is a per-array 'next expected sequential IO' sector 408 * be done. There is a per-array 'next expected sequential IO' sector
409 * number - if this matches on the next IO then we use the last disk. 409 * number - if this matches on the next IO then we use the last disk.
410 * There is also a per-disk 'last know head position' sector that is 410 * There is also a per-disk 'last know head position' sector that is
411 * maintained from IRQ contexts, both the normal and the resync IO 411 * maintained from IRQ contexts, both the normal and the resync IO
412 * completion handlers update this position correctly. If there is no 412 * completion handlers update this position correctly. If there is no
413 * perfect sequential match then we pick the disk whose head is closest. 413 * perfect sequential match then we pick the disk whose head is closest.
414 * 414 *
415 * If there are 2 mirrors in the same 2 devices, performance degrades 415 * If there are 2 mirrors in the same 2 devices, performance degrades
416 * because position is mirror, not device based. 416 * because position is mirror, not device based.
417 * 417 *
418 * The rdev for the device selected will have nr_pending incremented. 418 * The rdev for the device selected will have nr_pending incremented.
419 */ 419 */
420 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 420 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
421 { 421 {
422 const unsigned long this_sector = r1_bio->sector; 422 const unsigned long this_sector = r1_bio->sector;
423 int new_disk = conf->last_used, disk = new_disk; 423 int new_disk = conf->last_used, disk = new_disk;
424 int wonly_disk = -1; 424 int wonly_disk = -1;
425 const int sectors = r1_bio->sectors; 425 const int sectors = r1_bio->sectors;
426 sector_t new_distance, current_distance; 426 sector_t new_distance, current_distance;
427 mdk_rdev_t *rdev; 427 mdk_rdev_t *rdev;
428 428
429 rcu_read_lock(); 429 rcu_read_lock();
430 /* 430 /*
431 * Check if we can balance. We can balance on the whole 431 * Check if we can balance. We can balance on the whole
432 * device if no resync is going on, or below the resync window. 432 * device if no resync is going on, or below the resync window.
433 * We take the first readable disk when above the resync window. 433 * We take the first readable disk when above the resync window.
434 */ 434 */
435 retry: 435 retry:
436 if (conf->mddev->recovery_cp < MaxSector && 436 if (conf->mddev->recovery_cp < MaxSector &&
437 (this_sector + sectors >= conf->next_resync)) { 437 (this_sector + sectors >= conf->next_resync)) {
438 /* Choose the first operation device, for consistancy */ 438 /* Choose the first operation device, for consistancy */
439 new_disk = 0; 439 new_disk = 0;
440 440
441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
442 r1_bio->bios[new_disk] == IO_BLOCKED || 442 r1_bio->bios[new_disk] == IO_BLOCKED ||
443 !rdev || !test_bit(In_sync, &rdev->flags) 443 !rdev || !test_bit(In_sync, &rdev->flags)
444 || test_bit(WriteMostly, &rdev->flags); 444 || test_bit(WriteMostly, &rdev->flags);
445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { 445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
446 446
447 if (rdev && test_bit(In_sync, &rdev->flags) && 447 if (rdev && test_bit(In_sync, &rdev->flags) &&
448 r1_bio->bios[new_disk] != IO_BLOCKED) 448 r1_bio->bios[new_disk] != IO_BLOCKED)
449 wonly_disk = new_disk; 449 wonly_disk = new_disk;
450 450
451 if (new_disk == conf->raid_disks - 1) { 451 if (new_disk == conf->raid_disks - 1) {
452 new_disk = wonly_disk; 452 new_disk = wonly_disk;
453 break; 453 break;
454 } 454 }
455 } 455 }
456 goto rb_out; 456 goto rb_out;
457 } 457 }
458 458
459 459
460 /* make sure the disk is operational */ 460 /* make sure the disk is operational */
461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
462 r1_bio->bios[new_disk] == IO_BLOCKED || 462 r1_bio->bios[new_disk] == IO_BLOCKED ||
463 !rdev || !test_bit(In_sync, &rdev->flags) || 463 !rdev || !test_bit(In_sync, &rdev->flags) ||
464 test_bit(WriteMostly, &rdev->flags); 464 test_bit(WriteMostly, &rdev->flags);
465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { 465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
466 466
467 if (rdev && test_bit(In_sync, &rdev->flags) && 467 if (rdev && test_bit(In_sync, &rdev->flags) &&
468 r1_bio->bios[new_disk] != IO_BLOCKED) 468 r1_bio->bios[new_disk] != IO_BLOCKED)
469 wonly_disk = new_disk; 469 wonly_disk = new_disk;
470 470
471 if (new_disk <= 0) 471 if (new_disk <= 0)
472 new_disk = conf->raid_disks; 472 new_disk = conf->raid_disks;
473 new_disk--; 473 new_disk--;
474 if (new_disk == disk) { 474 if (new_disk == disk) {
475 new_disk = wonly_disk; 475 new_disk = wonly_disk;
476 break; 476 break;
477 } 477 }
478 } 478 }
479 479
480 if (new_disk < 0) 480 if (new_disk < 0)
481 goto rb_out; 481 goto rb_out;
482 482
483 disk = new_disk; 483 disk = new_disk;
484 /* now disk == new_disk == starting point for search */ 484 /* now disk == new_disk == starting point for search */
485 485
486 /* 486 /*
487 * Don't change to another disk for sequential reads: 487 * Don't change to another disk for sequential reads:
488 */ 488 */
489 if (conf->next_seq_sect == this_sector) 489 if (conf->next_seq_sect == this_sector)
490 goto rb_out; 490 goto rb_out;
491 if (this_sector == conf->mirrors[new_disk].head_position) 491 if (this_sector == conf->mirrors[new_disk].head_position)
492 goto rb_out; 492 goto rb_out;
493 493
494 current_distance = abs(this_sector - conf->mirrors[disk].head_position); 494 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
495 495
496 /* Find the disk whose head is closest */ 496 /* Find the disk whose head is closest */
497 497
498 do { 498 do {
499 if (disk <= 0) 499 if (disk <= 0)
500 disk = conf->raid_disks; 500 disk = conf->raid_disks;
501 disk--; 501 disk--;
502 502
503 rdev = rcu_dereference(conf->mirrors[disk].rdev); 503 rdev = rcu_dereference(conf->mirrors[disk].rdev);
504 504
505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || 505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
506 !test_bit(In_sync, &rdev->flags) || 506 !test_bit(In_sync, &rdev->flags) ||
507 test_bit(WriteMostly, &rdev->flags)) 507 test_bit(WriteMostly, &rdev->flags))
508 continue; 508 continue;
509 509
510 if (!atomic_read(&rdev->nr_pending)) { 510 if (!atomic_read(&rdev->nr_pending)) {
511 new_disk = disk; 511 new_disk = disk;
512 break; 512 break;
513 } 513 }
514 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 514 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
515 if (new_distance < current_distance) { 515 if (new_distance < current_distance) {
516 current_distance = new_distance; 516 current_distance = new_distance;
517 new_disk = disk; 517 new_disk = disk;
518 } 518 }
519 } while (disk != conf->last_used); 519 } while (disk != conf->last_used);
520 520
521 rb_out: 521 rb_out:
522 522
523 523
524 if (new_disk >= 0) { 524 if (new_disk >= 0) {
525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
526 if (!rdev) 526 if (!rdev)
527 goto retry; 527 goto retry;
528 atomic_inc(&rdev->nr_pending); 528 atomic_inc(&rdev->nr_pending);
529 if (!test_bit(In_sync, &rdev->flags)) { 529 if (!test_bit(In_sync, &rdev->flags)) {
530 /* cannot risk returning a device that failed 530 /* cannot risk returning a device that failed
531 * before we inc'ed nr_pending 531 * before we inc'ed nr_pending
532 */ 532 */
533 rdev_dec_pending(rdev, conf->mddev); 533 rdev_dec_pending(rdev, conf->mddev);
534 goto retry; 534 goto retry;
535 } 535 }
536 conf->next_seq_sect = this_sector + sectors; 536 conf->next_seq_sect = this_sector + sectors;
537 conf->last_used = new_disk; 537 conf->last_used = new_disk;
538 } 538 }
539 rcu_read_unlock(); 539 rcu_read_unlock();
540 540
541 return new_disk; 541 return new_disk;
542 } 542 }
543 543
544 static void unplug_slaves(mddev_t *mddev) 544 static void unplug_slaves(mddev_t *mddev)
545 { 545 {
546 conf_t *conf = mddev_to_conf(mddev); 546 conf_t *conf = mddev_to_conf(mddev);
547 int i; 547 int i;
548 548
549 rcu_read_lock(); 549 rcu_read_lock();
550 for (i=0; i<mddev->raid_disks; i++) { 550 for (i=0; i<mddev->raid_disks; i++) {
551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
553 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 553 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
554 554
555 atomic_inc(&rdev->nr_pending); 555 atomic_inc(&rdev->nr_pending);
556 rcu_read_unlock(); 556 rcu_read_unlock();
557 557
558 if (r_queue->unplug_fn) 558 if (r_queue->unplug_fn)
559 r_queue->unplug_fn(r_queue); 559 r_queue->unplug_fn(r_queue);
560 560
561 rdev_dec_pending(rdev, mddev); 561 rdev_dec_pending(rdev, mddev);
562 rcu_read_lock(); 562 rcu_read_lock();
563 } 563 }
564 } 564 }
565 rcu_read_unlock(); 565 rcu_read_unlock();
566 } 566 }
567 567
568 static void raid1_unplug(request_queue_t *q) 568 static void raid1_unplug(request_queue_t *q)
569 { 569 {
570 mddev_t *mddev = q->queuedata; 570 mddev_t *mddev = q->queuedata;
571 571
572 unplug_slaves(mddev); 572 unplug_slaves(mddev);
573 md_wakeup_thread(mddev->thread); 573 md_wakeup_thread(mddev->thread);
574 } 574 }
575 575
576 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, 576 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
577 sector_t *error_sector) 577 sector_t *error_sector)
578 { 578 {
579 mddev_t *mddev = q->queuedata; 579 mddev_t *mddev = q->queuedata;
580 conf_t *conf = mddev_to_conf(mddev); 580 conf_t *conf = mddev_to_conf(mddev);
581 int i, ret = 0; 581 int i, ret = 0;
582 582
583 rcu_read_lock(); 583 rcu_read_lock();
584 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 584 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
585 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 585 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
586 if (rdev && !test_bit(Faulty, &rdev->flags)) { 586 if (rdev && !test_bit(Faulty, &rdev->flags)) {
587 struct block_device *bdev = rdev->bdev; 587 struct block_device *bdev = rdev->bdev;
588 request_queue_t *r_queue = bdev_get_queue(bdev); 588 request_queue_t *r_queue = bdev_get_queue(bdev);
589 589
590 if (!r_queue->issue_flush_fn) 590 if (!r_queue->issue_flush_fn)
591 ret = -EOPNOTSUPP; 591 ret = -EOPNOTSUPP;
592 else { 592 else {
593 atomic_inc(&rdev->nr_pending); 593 atomic_inc(&rdev->nr_pending);
594 rcu_read_unlock(); 594 rcu_read_unlock();
595 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 595 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
596 error_sector); 596 error_sector);
597 rdev_dec_pending(rdev, mddev); 597 rdev_dec_pending(rdev, mddev);
598 rcu_read_lock(); 598 rcu_read_lock();
599 } 599 }
600 } 600 }
601 } 601 }
602 rcu_read_unlock(); 602 rcu_read_unlock();
603 return ret; 603 return ret;
604 } 604 }
605 605
606 /* Barriers.... 606 /* Barriers....
607 * Sometimes we need to suspend IO while we do something else, 607 * Sometimes we need to suspend IO while we do something else,
608 * either some resync/recovery, or reconfigure the array. 608 * either some resync/recovery, or reconfigure the array.
609 * To do this we raise a 'barrier'. 609 * To do this we raise a 'barrier'.
610 * The 'barrier' is a counter that can be raised multiple times 610 * The 'barrier' is a counter that can be raised multiple times
611 * to count how many activities are happening which preclude 611 * to count how many activities are happening which preclude
612 * normal IO. 612 * normal IO.
613 * We can only raise the barrier if there is no pending IO. 613 * We can only raise the barrier if there is no pending IO.
614 * i.e. if nr_pending == 0. 614 * i.e. if nr_pending == 0.
615 * We choose only to raise the barrier if no-one is waiting for the 615 * We choose only to raise the barrier if no-one is waiting for the
616 * barrier to go down. This means that as soon as an IO request 616 * barrier to go down. This means that as soon as an IO request
617 * is ready, no other operations which require a barrier will start 617 * is ready, no other operations which require a barrier will start
618 * until the IO request has had a chance. 618 * until the IO request has had a chance.
619 * 619 *
620 * So: regular IO calls 'wait_barrier'. When that returns there 620 * So: regular IO calls 'wait_barrier'. When that returns there
621 * is no backgroup IO happening, It must arrange to call 621 * is no backgroup IO happening, It must arrange to call
622 * allow_barrier when it has finished its IO. 622 * allow_barrier when it has finished its IO.
623 * backgroup IO calls must call raise_barrier. Once that returns 623 * backgroup IO calls must call raise_barrier. Once that returns
624 * there is no normal IO happeing. It must arrange to call 624 * there is no normal IO happeing. It must arrange to call
625 * lower_barrier when the particular background IO completes. 625 * lower_barrier when the particular background IO completes.
626 */ 626 */
627 #define RESYNC_DEPTH 32 627 #define RESYNC_DEPTH 32
628 628
629 static void raise_barrier(conf_t *conf) 629 static void raise_barrier(conf_t *conf)
630 { 630 {
631 spin_lock_irq(&conf->resync_lock); 631 spin_lock_irq(&conf->resync_lock);
632 632
633 /* Wait until no block IO is waiting */ 633 /* Wait until no block IO is waiting */
634 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 634 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
635 conf->resync_lock, 635 conf->resync_lock,
636 raid1_unplug(conf->mddev->queue)); 636 raid1_unplug(conf->mddev->queue));
637 637
638 /* block any new IO from starting */ 638 /* block any new IO from starting */
639 conf->barrier++; 639 conf->barrier++;
640 640
641 /* No wait for all pending IO to complete */ 641 /* No wait for all pending IO to complete */
642 wait_event_lock_irq(conf->wait_barrier, 642 wait_event_lock_irq(conf->wait_barrier,
643 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 643 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
644 conf->resync_lock, 644 conf->resync_lock,
645 raid1_unplug(conf->mddev->queue)); 645 raid1_unplug(conf->mddev->queue));
646 646
647 spin_unlock_irq(&conf->resync_lock); 647 spin_unlock_irq(&conf->resync_lock);
648 } 648 }
649 649
650 static void lower_barrier(conf_t *conf) 650 static void lower_barrier(conf_t *conf)
651 { 651 {
652 unsigned long flags; 652 unsigned long flags;
653 spin_lock_irqsave(&conf->resync_lock, flags); 653 spin_lock_irqsave(&conf->resync_lock, flags);
654 conf->barrier--; 654 conf->barrier--;
655 spin_unlock_irqrestore(&conf->resync_lock, flags); 655 spin_unlock_irqrestore(&conf->resync_lock, flags);
656 wake_up(&conf->wait_barrier); 656 wake_up(&conf->wait_barrier);
657 } 657 }
658 658
659 static void wait_barrier(conf_t *conf) 659 static void wait_barrier(conf_t *conf)
660 { 660 {
661 spin_lock_irq(&conf->resync_lock); 661 spin_lock_irq(&conf->resync_lock);
662 if (conf->barrier) { 662 if (conf->barrier) {
663 conf->nr_waiting++; 663 conf->nr_waiting++;
664 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 664 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
665 conf->resync_lock, 665 conf->resync_lock,
666 raid1_unplug(conf->mddev->queue)); 666 raid1_unplug(conf->mddev->queue));
667 conf->nr_waiting--; 667 conf->nr_waiting--;
668 } 668 }
669 conf->nr_pending++; 669 conf->nr_pending++;
670 spin_unlock_irq(&conf->resync_lock); 670 spin_unlock_irq(&conf->resync_lock);
671 } 671 }
672 672
673 static void allow_barrier(conf_t *conf) 673 static void allow_barrier(conf_t *conf)
674 { 674 {
675 unsigned long flags; 675 unsigned long flags;
676 spin_lock_irqsave(&conf->resync_lock, flags); 676 spin_lock_irqsave(&conf->resync_lock, flags);
677 conf->nr_pending--; 677 conf->nr_pending--;
678 spin_unlock_irqrestore(&conf->resync_lock, flags); 678 spin_unlock_irqrestore(&conf->resync_lock, flags);
679 wake_up(&conf->wait_barrier); 679 wake_up(&conf->wait_barrier);
680 } 680 }
681 681
682 static void freeze_array(conf_t *conf) 682 static void freeze_array(conf_t *conf)
683 { 683 {
684 /* stop syncio and normal IO and wait for everything to 684 /* stop syncio and normal IO and wait for everything to
685 * go quite. 685 * go quite.
686 * We increment barrier and nr_waiting, and then 686 * We increment barrier and nr_waiting, and then
687 * wait until barrier+nr_pending match nr_queued+2 687 * wait until barrier+nr_pending match nr_queued+2
688 */ 688 */
689 spin_lock_irq(&conf->resync_lock); 689 spin_lock_irq(&conf->resync_lock);
690 conf->barrier++; 690 conf->barrier++;
691 conf->nr_waiting++; 691 conf->nr_waiting++;
692 wait_event_lock_irq(conf->wait_barrier, 692 wait_event_lock_irq(conf->wait_barrier,
693 conf->barrier+conf->nr_pending == conf->nr_queued+2, 693 conf->barrier+conf->nr_pending == conf->nr_queued+2,
694 conf->resync_lock, 694 conf->resync_lock,
695 raid1_unplug(conf->mddev->queue)); 695 raid1_unplug(conf->mddev->queue));
696 spin_unlock_irq(&conf->resync_lock); 696 spin_unlock_irq(&conf->resync_lock);
697 } 697 }
698 static void unfreeze_array(conf_t *conf) 698 static void unfreeze_array(conf_t *conf)
699 { 699 {
700 /* reverse the effect of the freeze */ 700 /* reverse the effect of the freeze */
701 spin_lock_irq(&conf->resync_lock); 701 spin_lock_irq(&conf->resync_lock);
702 conf->barrier--; 702 conf->barrier--;
703 conf->nr_waiting--; 703 conf->nr_waiting--;
704 wake_up(&conf->wait_barrier); 704 wake_up(&conf->wait_barrier);
705 spin_unlock_irq(&conf->resync_lock); 705 spin_unlock_irq(&conf->resync_lock);
706 } 706 }
707 707
708 708
709 /* duplicate the data pages for behind I/O */ 709 /* duplicate the data pages for behind I/O */
710 static struct page **alloc_behind_pages(struct bio *bio) 710 static struct page **alloc_behind_pages(struct bio *bio)
711 { 711 {
712 int i; 712 int i;
713 struct bio_vec *bvec; 713 struct bio_vec *bvec;
714 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), 714 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
715 GFP_NOIO); 715 GFP_NOIO);
716 if (unlikely(!pages)) 716 if (unlikely(!pages))
717 goto do_sync_io; 717 goto do_sync_io;
718 718
719 bio_for_each_segment(bvec, bio, i) { 719 bio_for_each_segment(bvec, bio, i) {
720 pages[i] = alloc_page(GFP_NOIO); 720 pages[i] = alloc_page(GFP_NOIO);
721 if (unlikely(!pages[i])) 721 if (unlikely(!pages[i]))
722 goto do_sync_io; 722 goto do_sync_io;
723 memcpy(kmap(pages[i]) + bvec->bv_offset, 723 memcpy(kmap(pages[i]) + bvec->bv_offset,
724 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 724 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
725 kunmap(pages[i]); 725 kunmap(pages[i]);
726 kunmap(bvec->bv_page); 726 kunmap(bvec->bv_page);
727 } 727 }
728 728
729 return pages; 729 return pages;
730 730
731 do_sync_io: 731 do_sync_io:
732 if (pages) 732 if (pages)
733 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 733 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
734 put_page(pages[i]); 734 put_page(pages[i]);
735 kfree(pages); 735 kfree(pages);
736 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 736 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
737 return NULL; 737 return NULL;
738 } 738 }
739 739
740 static int make_request(request_queue_t *q, struct bio * bio) 740 static int make_request(request_queue_t *q, struct bio * bio)
741 { 741 {
742 mddev_t *mddev = q->queuedata; 742 mddev_t *mddev = q->queuedata;
743 conf_t *conf = mddev_to_conf(mddev); 743 conf_t *conf = mddev_to_conf(mddev);
744 mirror_info_t *mirror; 744 mirror_info_t *mirror;
745 r1bio_t *r1_bio; 745 r1bio_t *r1_bio;
746 struct bio *read_bio; 746 struct bio *read_bio;
747 int i, targets = 0, disks; 747 int i, targets = 0, disks;
748 mdk_rdev_t *rdev; 748 mdk_rdev_t *rdev;
749 struct bitmap *bitmap = mddev->bitmap; 749 struct bitmap *bitmap = mddev->bitmap;
750 unsigned long flags; 750 unsigned long flags;
751 struct bio_list bl; 751 struct bio_list bl;
752 struct page **behind_pages = NULL; 752 struct page **behind_pages = NULL;
753 const int rw = bio_data_dir(bio); 753 const int rw = bio_data_dir(bio);
754 int do_barriers; 754 int do_barriers;
755 755
- 756 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
- 757 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
- 758 return 0;
- 759 }
- 760
761 /* 756 /*
762 * Register the new request and wait if the reconstruction 757 * Register the new request and wait if the reconstruction
763 * thread has put up a bar for new requests. 758 * thread has put up a bar for new requests.
764 * Continue immediately if no resync is active currently. 759 * Continue immediately if no resync is active currently.
+ 760 * We test barriers_work *after* md_write_start as md_write_start
+ 761 * may cause the first superblock write, and that will check out
+ 762 * if barriers work.
765 */ 763 */
+ 764
766 md_write_start(mddev, bio); /* wait on superblock update early */ 765 md_write_start(mddev, bio); /* wait on superblock update early */
+ 766
+ 767 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+ 768 if (rw == WRITE)
+ 769 md_write_end(mddev);
+ 770 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ 771 return 0;
+ 772 }
767 773
768 wait_barrier(conf); 774 wait_barrier(conf);
769 775
770 disk_stat_inc(mddev->gendisk, ios[rw]); 776 disk_stat_inc(mddev->gendisk, ios[rw]);
771 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 777 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
772 778
773 /* 779 /*
774 * make_request() can abort the operation when READA is being 780 * make_request() can abort the operation when READA is being
775 * used and no empty request is available. 781 * used and no empty request is available.
776 * 782 *
777 */ 783 */
778 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 784 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
779 785
780 r1_bio->master_bio = bio; 786 r1_bio->master_bio = bio;
781 r1_bio->sectors = bio->bi_size >> 9; 787 r1_bio->sectors = bio->bi_size >> 9;
782 r1_bio->state = 0; 788 r1_bio->state = 0;
783 r1_bio->mddev = mddev; 789 r1_bio->mddev = mddev;
784 r1_bio->sector = bio->bi_sector; 790 r1_bio->sector = bio->bi_sector;
785 791
786 if (rw == READ) { 792 if (rw == READ) {
787 /* 793 /*
788 * read balancing logic: 794 * read balancing logic:
789 */ 795 */
790 int rdisk = read_balance(conf, r1_bio); 796 int rdisk = read_balance(conf, r1_bio);
791 797
792 if (rdisk < 0) { 798 if (rdisk < 0) {
793 /* couldn't find anywhere to read from */ 799 /* couldn't find anywhere to read from */
794 raid_end_bio_io(r1_bio); 800 raid_end_bio_io(r1_bio);
795 return 0; 801 return 0;
796 } 802 }
797 mirror = conf->mirrors + rdisk; 803 mirror = conf->mirrors + rdisk;
798 804
799 r1_bio->read_disk = rdisk; 805 r1_bio->read_disk = rdisk;
800 806
801 read_bio = bio_clone(bio, GFP_NOIO); 807 read_bio = bio_clone(bio, GFP_NOIO);
802 808
803 r1_bio->bios[rdisk] = read_bio; 809 r1_bio->bios[rdisk] = read_bio;
804 810
805 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 811 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
806 read_bio->bi_bdev = mirror->rdev->bdev; 812 read_bio->bi_bdev = mirror->rdev->bdev;
807 read_bio->bi_end_io = raid1_end_read_request; 813 read_bio->bi_end_io = raid1_end_read_request;
808 read_bio->bi_rw = READ; 814 read_bio->bi_rw = READ;
809 read_bio->bi_private = r1_bio; 815 read_bio->bi_private = r1_bio;
810 816
811 generic_make_request(read_bio); 817 generic_make_request(read_bio);
812 return 0; 818 return 0;
813 } 819 }
814 820
815 /* 821 /*
816 * WRITE: 822 * WRITE:
817 */ 823 */
818 /* first select target devices under spinlock and 824 /* first select target devices under spinlock and
819 * inc refcount on their rdev. Record them by setting 825 * inc refcount on their rdev. Record them by setting
820 * bios[x] to bio 826 * bios[x] to bio
821 */ 827 */
822 disks = conf->raid_disks; 828 disks = conf->raid_disks;
823 #if 0 829 #if 0
824 { static int first=1; 830 { static int first=1;
825 if (first) printk("First Write sector %llu disks %d\n", 831 if (first) printk("First Write sector %llu disks %d\n",
826 (unsigned long long)r1_bio->sector, disks); 832 (unsigned long long)r1_bio->sector, disks);
827 first = 0; 833 first = 0;
828 } 834 }
829 #endif 835 #endif
830 rcu_read_lock(); 836 rcu_read_lock();
831 for (i = 0; i < disks; i++) { 837 for (i = 0; i < disks; i++) {
832 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && 838 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
833 !test_bit(Faulty, &rdev->flags)) { 839 !test_bit(Faulty, &rdev->flags)) {
834 atomic_inc(&rdev->nr_pending); 840 atomic_inc(&rdev->nr_pending);
835 if (test_bit(Faulty, &rdev->flags)) { 841 if (test_bit(Faulty, &rdev->flags)) {
836 rdev_dec_pending(rdev, mddev); 842 rdev_dec_pending(rdev, mddev);
837 r1_bio->bios[i] = NULL; 843 r1_bio->bios[i] = NULL;
838 } else 844 } else
839 r1_bio->bios[i] = bio; 845 r1_bio->bios[i] = bio;
840 targets++; 846 targets++;
841 } else 847 } else
842 r1_bio->bios[i] = NULL; 848 r1_bio->bios[i] = NULL;
843 } 849 }
844 rcu_read_unlock(); 850 rcu_read_unlock();
845 851
846 BUG_ON(targets == 0); /* we never fail the last device */ 852 BUG_ON(targets == 0); /* we never fail the last device */
847 853
848 if (targets < conf->raid_disks) { 854 if (targets < conf->raid_disks) {
849 /* array is degraded, we will not clear the bitmap 855 /* array is degraded, we will not clear the bitmap
850 * on I/O completion (see raid1_end_write_request) */ 856 * on I/O completion (see raid1_end_write_request) */
851 set_bit(R1BIO_Degraded, &r1_bio->state); 857 set_bit(R1BIO_Degraded, &r1_bio->state);
852 } 858 }
853 859
854 /* do behind I/O ? */ 860 /* do behind I/O ? */
855 if (bitmap && 861 if (bitmap &&
856 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 862 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
857 (behind_pages = alloc_behind_pages(bio)) != NULL) 863 (behind_pages = alloc_behind_pages(bio)) != NULL)
858 set_bit(R1BIO_BehindIO, &r1_bio->state); 864 set_bit(R1BIO_BehindIO, &r1_bio->state);
859 865
860 atomic_set(&r1_bio->remaining, 0); 866 atomic_set(&r1_bio->remaining, 0);
861 atomic_set(&r1_bio->behind_remaining, 0); 867 atomic_set(&r1_bio->behind_remaining, 0);
862 868
863 do_barriers = bio_barrier(bio); 869 do_barriers = bio_barrier(bio);
864 if (do_barriers) 870 if (do_barriers)
865 set_bit(R1BIO_Barrier, &r1_bio->state); 871 set_bit(R1BIO_Barrier, &r1_bio->state);
866 872
867 bio_list_init(&bl); 873 bio_list_init(&bl);
868 for (i = 0; i < disks; i++) { 874 for (i = 0; i < disks; i++) {
869 struct bio *mbio; 875 struct bio *mbio;
870 if (!r1_bio->bios[i]) 876 if (!r1_bio->bios[i])
871 continue; 877 continue;
872 878
873 mbio = bio_clone(bio, GFP_NOIO); 879 mbio = bio_clone(bio, GFP_NOIO);
874 r1_bio->bios[i] = mbio; 880 r1_bio->bios[i] = mbio;
875 881
876 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 882 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
877 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 883 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
878 mbio->bi_end_io = raid1_end_write_request; 884 mbio->bi_end_io = raid1_end_write_request;
879 mbio->bi_rw = WRITE | do_barriers; 885 mbio->bi_rw = WRITE | do_barriers;
880 mbio->bi_private = r1_bio; 886 mbio->bi_private = r1_bio;
881 887
882 if (behind_pages) { 888 if (behind_pages) {
883 struct bio_vec *bvec; 889 struct bio_vec *bvec;
884 int j; 890 int j;
885 891
886 /* Yes, I really want the '__' version so that 892 /* Yes, I really want the '__' version so that
887 * we clear any unused pointer in the io_vec, rather 893 * we clear any unused pointer in the io_vec, rather
888 * than leave them unchanged. This is important 894 * than leave them unchanged. This is important
889 * because when we come to free the pages, we won't 895 * because when we come to free the pages, we won't
890 * know the originial bi_idx, so we just free 896 * know the originial bi_idx, so we just free
891 * them all 897 * them all
892 */ 898 */
893 __bio_for_each_segment(bvec, mbio, j, 0) 899 __bio_for_each_segment(bvec, mbio, j, 0)
894 bvec->bv_page = behind_pages[j]; 900 bvec->bv_page = behind_pages[j];
895 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 901 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
896 atomic_inc(&r1_bio->behind_remaining); 902 atomic_inc(&r1_bio->behind_remaining);
897 } 903 }
898 904
899 atomic_inc(&r1_bio->remaining); 905 atomic_inc(&r1_bio->remaining);
900 906
901 bio_list_add(&bl, mbio); 907 bio_list_add(&bl, mbio);
902 } 908 }
903 kfree(behind_pages); /* the behind pages are attached to the bios now */ 909 kfree(behind_pages); /* the behind pages are attached to the bios now */
904 910
905 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 911 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
906 test_bit(R1BIO_BehindIO, &r1_bio->state)); 912 test_bit(R1BIO_BehindIO, &r1_bio->state));
907 spin_lock_irqsave(&conf->device_lock, flags); 913 spin_lock_irqsave(&conf->device_lock, flags);
908 bio_list_merge(&conf->pending_bio_list, &bl); 914 bio_list_merge(&conf->pending_bio_list, &bl);
909 bio_list_init(&bl); 915 bio_list_init(&bl);
910 916
911 blk_plug_device(mddev->queue); 917 blk_plug_device(mddev->queue);
912 spin_unlock_irqrestore(&conf->device_lock, flags); 918 spin_unlock_irqrestore(&conf->device_lock, flags);
913 919
914 #if 0 920 #if 0
915 while ((bio = bio_list_pop(&bl)) != NULL) 921 while ((bio = bio_list_pop(&bl)) != NULL)
916 generic_make_request(bio); 922 generic_make_request(bio);
917 #endif 923 #endif
918 924
919 return 0; 925 return 0;
920 } 926 }
921 927
922 static void status(struct seq_file *seq, mddev_t *mddev) 928 static void status(struct seq_file *seq, mddev_t *mddev)
923 { 929 {
924 conf_t *conf = mddev_to_conf(mddev); 930 conf_t *conf = mddev_to_conf(mddev);
925 int i; 931 int i;
926 932
927 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 933 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
928 conf->working_disks); 934 conf->working_disks);
929 for (i = 0; i < conf->raid_disks; i++) 935 for (i = 0; i < conf->raid_disks; i++)
930 seq_printf(seq, "%s", 936 seq_printf(seq, "%s",
931 conf->mirrors[i].rdev && 937 conf->mirrors[i].rdev &&
932 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 938 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
933 seq_printf(seq, "]"); 939 seq_printf(seq, "]");
934 } 940 }
935 941
936 942
937 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 943 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
938 { 944 {
939 char b[BDEVNAME_SIZE]; 945 char b[BDEVNAME_SIZE];
940 conf_t *conf = mddev_to_conf(mddev); 946 conf_t *conf = mddev_to_conf(mddev);
941 947
942 /* 948 /*
943 * If it is not operational, then we have already marked it as dead 949 * If it is not operational, then we have already marked it as dead
944 * else if it is the last working disks, ignore the error, let the 950 * else if it is the last working disks, ignore the error, let the
945 * next level up know. 951 * next level up know.
946 * else mark the drive as failed 952 * else mark the drive as failed
947 */ 953 */
948 if (test_bit(In_sync, &rdev->flags) 954 if (test_bit(In_sync, &rdev->flags)
949 && conf->working_disks == 1) 955 && conf->working_disks == 1)
950 /* 956 /*
951 * Don't fail the drive, act as though we were just a 957 * Don't fail the drive, act as though we were just a
952 * normal single drive 958 * normal single drive
953 */ 959 */
954 return; 960 return;
955 if (test_bit(In_sync, &rdev->flags)) { 961 if (test_bit(In_sync, &rdev->flags)) {
956 mddev->degraded++; 962 mddev->degraded++;
957 conf->working_disks--; 963 conf->working_disks--;
958 /* 964 /*
959 * if recovery is running, make sure it aborts. 965 * if recovery is running, make sure it aborts.
960 */ 966 */
961 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 967 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
962 } 968 }
963 clear_bit(In_sync, &rdev->flags); 969 clear_bit(In_sync, &rdev->flags);
964 set_bit(Faulty, &rdev->flags); 970 set_bit(Faulty, &rdev->flags);
965 mddev->sb_dirty = 1; 971 mddev->sb_dirty = 1;
966 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 972 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
967 " Operation continuing on %d devices\n", 973 " Operation continuing on %d devices\n",
968 bdevname(rdev->bdev,b), conf->working_disks); 974 bdevname(rdev->bdev,b), conf->working_disks);
969 } 975 }
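
The failure policy in error() is easy to lose among the flag tests: a failure on the last In_sync member is ignored so the array keeps behaving like a single plain disk, while any other In_sync failure degrades the array and forces a running recovery to abort before the device is finally marked Faulty. A stand-alone sketch of just that decision, with plain integers standing in for the rdev flags and conf counters (not the kernel code itself):

#include <stdio.h>

/* 'in_sync', 'working_disks' and 'degraded' stand in for the rdev flag and
 * the mddev/conf counters; the real function also sets Faulty and dirties
 * the superblock. */
static void fail_disk(int in_sync, int *working_disks, int *degraded)
{
	if (in_sync && *working_disks == 1)
		return;			/* last readable copy: ignore the error */
	if (in_sync) {
		(*degraded)++;		/* array becomes degraded ... */
		(*working_disks)--;	/* ... and recovery must abort */
	}
}

int main(void)
{
	int working = 2, degraded = 0;

	fail_disk(1, &working, &degraded);
	printf("working=%d degraded=%d\n", working, degraded);	/* 1 1 */
	return 0;
}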
970 976
971 static void print_conf(conf_t *conf) 977 static void print_conf(conf_t *conf)
972 { 978 {
973 int i; 979 int i;
974 mirror_info_t *tmp; 980 mirror_info_t *tmp;
975 981
976 printk("RAID1 conf printout:\n"); 982 printk("RAID1 conf printout:\n");
977 if (!conf) { 983 if (!conf) {
978 printk("(!conf)\n"); 984 printk("(!conf)\n");
979 return; 985 return;
980 } 986 }
981 printk(" --- wd:%d rd:%d\n", conf->working_disks, 987 printk(" --- wd:%d rd:%d\n", conf->working_disks,
982 conf->raid_disks); 988 conf->raid_disks);
983 989
984 for (i = 0; i < conf->raid_disks; i++) { 990 for (i = 0; i < conf->raid_disks; i++) {
985 char b[BDEVNAME_SIZE]; 991 char b[BDEVNAME_SIZE];
986 tmp = conf->mirrors + i; 992 tmp = conf->mirrors + i;
987 if (tmp->rdev) 993 if (tmp->rdev)
988 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 994 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
989 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), 995 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
990 bdevname(tmp->rdev->bdev,b)); 996 bdevname(tmp->rdev->bdev,b));
991 } 997 }
992 } 998 }
993 999
994 static void close_sync(conf_t *conf) 1000 static void close_sync(conf_t *conf)
995 { 1001 {
996 wait_barrier(conf); 1002 wait_barrier(conf);
997 allow_barrier(conf); 1003 allow_barrier(conf);
998 1004
999 mempool_destroy(conf->r1buf_pool); 1005 mempool_destroy(conf->r1buf_pool);
1000 conf->r1buf_pool = NULL; 1006 conf->r1buf_pool = NULL;
1001 } 1007 }
1002 1008
1003 static int raid1_spare_active(mddev_t *mddev) 1009 static int raid1_spare_active(mddev_t *mddev)
1004 { 1010 {
1005 int i; 1011 int i;
1006 conf_t *conf = mddev->private; 1012 conf_t *conf = mddev->private;
1007 mirror_info_t *tmp; 1013 mirror_info_t *tmp;
1008 1014
1009 /* 1015 /*
1010 * Find all failed disks within the RAID1 configuration 1016 * Find all failed disks within the RAID1 configuration
1011 * and mark them readable 1017 * and mark them readable
1012 */ 1018 */
1013 for (i = 0; i < conf->raid_disks; i++) { 1019 for (i = 0; i < conf->raid_disks; i++) {
1014 tmp = conf->mirrors + i; 1020 tmp = conf->mirrors + i;
1015 if (tmp->rdev 1021 if (tmp->rdev
1016 && !test_bit(Faulty, &tmp->rdev->flags) 1022 && !test_bit(Faulty, &tmp->rdev->flags)
1017 && !test_bit(In_sync, &tmp->rdev->flags)) { 1023 && !test_bit(In_sync, &tmp->rdev->flags)) {
1018 conf->working_disks++; 1024 conf->working_disks++;
1019 mddev->degraded--; 1025 mddev->degraded--;
1020 set_bit(In_sync, &tmp->rdev->flags); 1026 set_bit(In_sync, &tmp->rdev->flags);
1021 } 1027 }
1022 } 1028 }
1023 1029
1024 print_conf(conf); 1030 print_conf(conf);
1025 return 0; 1031 return 0;
1026 } 1032 }
1027 1033
1028 1034
1029 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1035 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1030 { 1036 {
1031 conf_t *conf = mddev->private; 1037 conf_t *conf = mddev->private;
1032 int found = 0; 1038 int found = 0;
1033 int mirror = 0; 1039 int mirror = 0;
1034 mirror_info_t *p; 1040 mirror_info_t *p;
1035 1041
1036 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1042 for (mirror=0; mirror < mddev->raid_disks; mirror++)
1037 if ( !(p=conf->mirrors+mirror)->rdev) { 1043 if ( !(p=conf->mirrors+mirror)->rdev) {
1038 1044
1039 blk_queue_stack_limits(mddev->queue, 1045 blk_queue_stack_limits(mddev->queue,
1040 rdev->bdev->bd_disk->queue); 1046 rdev->bdev->bd_disk->queue);
1041 /* as we don't honour merge_bvec_fn, we must never risk 1047 /* as we don't honour merge_bvec_fn, we must never risk
1042 * violating it, so limit ->max_sector to one PAGE, as 1048 * violating it, so limit ->max_sector to one PAGE, as
1043 * a one page request is never in violation. 1049 * a one page request is never in violation.
1044 */ 1050 */
1045 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1051 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1046 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1052 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1047 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1053 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1048 1054
1049 p->head_position = 0; 1055 p->head_position = 0;
1050 rdev->raid_disk = mirror; 1056 rdev->raid_disk = mirror;
1051 found = 1; 1057 found = 1;
1052 /* As all devices are equivalent, we don't need a full recovery 1058 /* As all devices are equivalent, we don't need a full recovery
1053 * if this device was recently part of the array 1059 * if this device was recently part of the array
1054 */ 1060 */
1055 if (rdev->saved_raid_disk < 0) 1061 if (rdev->saved_raid_disk < 0)
1056 conf->fullsync = 1; 1062 conf->fullsync = 1;
1057 rcu_assign_pointer(p->rdev, rdev); 1063 rcu_assign_pointer(p->rdev, rdev);
1058 break; 1064 break;
1059 } 1065 }
1060 1066
1061 print_conf(conf); 1067 print_conf(conf);
1062 return found; 1068 return found;
1063 } 1069 }
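
The PAGE_SIZE>>9 expression used when stacking queue limits converts the page size in bytes into 512-byte sectors; clamping max_sectors to that value guarantees one-page requests, which can never violate a member device's merge_bvec_fn. A trivial user-space check of the arithmetic, assuming a 4096-byte page (the actual value is architecture dependent):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed; arch dependent */
	unsigned long max_sectors = page_size >> 9;	/* 8 sectors == one page */

	printf("max_sectors clamped to %lu sectors (%lu bytes)\n",
	       max_sectors, max_sectors << 9);
	return 0;
}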
1064 1070
1065 static int raid1_remove_disk(mddev_t *mddev, int number) 1071 static int raid1_remove_disk(mddev_t *mddev, int number)
1066 { 1072 {
1067 conf_t *conf = mddev->private; 1073 conf_t *conf = mddev->private;
1068 int err = 0; 1074 int err = 0;
1069 mdk_rdev_t *rdev; 1075 mdk_rdev_t *rdev;
1070 mirror_info_t *p = conf->mirrors+ number; 1076 mirror_info_t *p = conf->mirrors+ number;
1071 1077
1072 print_conf(conf); 1078 print_conf(conf);
1073 rdev = p->rdev; 1079 rdev = p->rdev;
1074 if (rdev) { 1080 if (rdev) {
1075 if (test_bit(In_sync, &rdev->flags) || 1081 if (test_bit(In_sync, &rdev->flags) ||
1076 atomic_read(&rdev->nr_pending)) { 1082 atomic_read(&rdev->nr_pending)) {
1077 err = -EBUSY; 1083 err = -EBUSY;
1078 goto abort; 1084 goto abort;
1079 } 1085 }
1080 p->rdev = NULL; 1086 p->rdev = NULL;
1081 synchronize_rcu(); 1087 synchronize_rcu();
1082 if (atomic_read(&rdev->nr_pending)) { 1088 if (atomic_read(&rdev->nr_pending)) {
1083 /* lost the race, try later */ 1089 /* lost the race, try later */
1084 err = -EBUSY; 1090 err = -EBUSY;
1085 p->rdev = rdev; 1091 p->rdev = rdev;
1086 } 1092 }
1087 } 1093 }
1088 abort: 1094 abort:
1089 1095
1090 print_conf(conf); 1096 print_conf(conf);
1091 return err; 1097 return err;
1092 } 1098 }
1093 1099
1094 1100
1095 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1101 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1096 { 1102 {
1097 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1103 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1098 int i; 1104 int i;
1099 1105
1100 if (bio->bi_size) 1106 if (bio->bi_size)
1101 return 1; 1107 return 1;
1102 1108
1103 for (i=r1_bio->mddev->raid_disks; i--; ) 1109 for (i=r1_bio->mddev->raid_disks; i--; )
1104 if (r1_bio->bios[i] == bio) 1110 if (r1_bio->bios[i] == bio)
1105 break; 1111 break;
1106 BUG_ON(i < 0); 1112 BUG_ON(i < 0);
1107 update_head_pos(i, r1_bio); 1113 update_head_pos(i, r1_bio);
1108 /* 1114 /*
1109 * we have read a block, now it needs to be re-written, 1115 * we have read a block, now it needs to be re-written,
1110 * or re-read if the read failed. 1116 * or re-read if the read failed.
1111 * We don't do much here, just schedule handling by raid1d 1117 * We don't do much here, just schedule handling by raid1d
1112 */ 1118 */
1113 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1119 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1114 set_bit(R1BIO_Uptodate, &r1_bio->state); 1120 set_bit(R1BIO_Uptodate, &r1_bio->state);
1115 1121
1116 if (atomic_dec_and_test(&r1_bio->remaining)) 1122 if (atomic_dec_and_test(&r1_bio->remaining))
1117 reschedule_retry(r1_bio); 1123 reschedule_retry(r1_bio);
1118 return 0; 1124 return 0;
1119 } 1125 }
1120 1126
1121 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) 1127 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1122 { 1128 {
1123 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1129 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1124 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1130 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1125 mddev_t *mddev = r1_bio->mddev; 1131 mddev_t *mddev = r1_bio->mddev;
1126 conf_t *conf = mddev_to_conf(mddev); 1132 conf_t *conf = mddev_to_conf(mddev);
1127 int i; 1133 int i;
1128 int mirror=0; 1134 int mirror=0;
1129 1135
1130 if (bio->bi_size) 1136 if (bio->bi_size)
1131 return 1; 1137 return 1;
1132 1138
1133 for (i = 0; i < conf->raid_disks; i++) 1139 for (i = 0; i < conf->raid_disks; i++)
1134 if (r1_bio->bios[i] == bio) { 1140 if (r1_bio->bios[i] == bio) {
1135 mirror = i; 1141 mirror = i;
1136 break; 1142 break;
1137 } 1143 }
1138 if (!uptodate) { 1144 if (!uptodate) {
1139 int sync_blocks = 0; 1145 int sync_blocks = 0;
1140 sector_t s = r1_bio->sector; 1146 sector_t s = r1_bio->sector;
1141 long sectors_to_go = r1_bio->sectors; 1147 long sectors_to_go = r1_bio->sectors;
1142 /* make sure these bits don't get cleared. */ 1148 /* make sure these bits don't get cleared. */
1143 do { 1149 do {
1144 bitmap_end_sync(mddev->bitmap, r1_bio->sector, 1150 bitmap_end_sync(mddev->bitmap, r1_bio->sector,
1145 &sync_blocks, 1); 1151 &sync_blocks, 1);
1146 s += sync_blocks; 1152 s += sync_blocks;
1147 sectors_to_go -= sync_blocks; 1153 sectors_to_go -= sync_blocks;
1148 } while (sectors_to_go > 0); 1154 } while (sectors_to_go > 0);
1149 md_error(mddev, conf->mirrors[mirror].rdev); 1155 md_error(mddev, conf->mirrors[mirror].rdev);
1150 } 1156 }
1151 1157
1152 update_head_pos(mirror, r1_bio); 1158 update_head_pos(mirror, r1_bio);
1153 1159
1154 if (atomic_dec_and_test(&r1_bio->remaining)) { 1160 if (atomic_dec_and_test(&r1_bio->remaining)) {
1155 md_done_sync(mddev, r1_bio->sectors, uptodate); 1161 md_done_sync(mddev, r1_bio->sectors, uptodate);
1156 put_buf(r1_bio); 1162 put_buf(r1_bio);
1157 } 1163 }
1158 return 0; 1164 return 0;
1159 } 1165 }
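
The do/while in end_sync_write() walks the failed range one bitmap chunk at a time: bitmap_end_sync() reports how many sectors it covered through *sync_blocks and the loop advances by that amount until the whole r1_bio range has been accounted for. A user-space sketch of the same accounting, with an invented 64-sector chunk size:

#include <stdio.h>

/* Walk [sector, sector + sectors_to_go) in the chunk-sized steps that
 * bitmap_end_sync() would report back via *sync_blocks; the 64-sector
 * chunk is a made-up value. */
int main(void)
{
	long long sector = 1000, sectors_to_go = 200;
	const int chunk = 64;

	do {
		int sync_blocks = chunk;	/* what bitmap_end_sync() would set */
		sector += sync_blocks;
		sectors_to_go -= sync_blocks;
	} while (sectors_to_go > 0);

	printf("stopped past sector %lld\n", sector);	/* 1256 */
	return 0;
}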
1160 1166
1161 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1167 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1162 { 1168 {
1163 conf_t *conf = mddev_to_conf(mddev); 1169 conf_t *conf = mddev_to_conf(mddev);
1164 int i; 1170 int i;
1165 int disks = conf->raid_disks; 1171 int disks = conf->raid_disks;
1166 struct bio *bio, *wbio; 1172 struct bio *bio, *wbio;
1167 1173
1168 bio = r1_bio->bios[r1_bio->read_disk]; 1174 bio = r1_bio->bios[r1_bio->read_disk];
1169 1175
1170 1176
1171 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1177 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1172 /* We have read all readable devices. If we haven't 1178 /* We have read all readable devices. If we haven't
1173 * got the block, then there is no hope left. 1179 * got the block, then there is no hope left.
1174 * If we have, then we want to do a comparison 1180 * If we have, then we want to do a comparison
1175 * and skip the write if everything is the same. 1181 * and skip the write if everything is the same.
1176 * If any blocks failed to read, then we need to 1182 * If any blocks failed to read, then we need to
1177 * attempt an over-write 1183 * attempt an over-write
1178 */ 1184 */
1179 int primary; 1185 int primary;
1180 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1186 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1181 for (i=0; i<mddev->raid_disks; i++) 1187 for (i=0; i<mddev->raid_disks; i++)
1182 if (r1_bio->bios[i]->bi_end_io == end_sync_read) 1188 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1183 md_error(mddev, conf->mirrors[i].rdev); 1189 md_error(mddev, conf->mirrors[i].rdev);
1184 1190
1185 md_done_sync(mddev, r1_bio->sectors, 1); 1191 md_done_sync(mddev, r1_bio->sectors, 1);
1186 put_buf(r1_bio); 1192 put_buf(r1_bio);
1187 return; 1193 return;
1188 } 1194 }
1189 for (primary=0; primary<mddev->raid_disks; primary++) 1195 for (primary=0; primary<mddev->raid_disks; primary++)
1190 if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 1196 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1191 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { 1197 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1192 r1_bio->bios[primary]->bi_end_io = NULL; 1198 r1_bio->bios[primary]->bi_end_io = NULL;
1193 rdev_dec_pending(conf->mirrors[primary].rdev, mddev); 1199 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1194 break; 1200 break;
1195 } 1201 }
1196 r1_bio->read_disk = primary; 1202 r1_bio->read_disk = primary;
1197 for (i=0; i<mddev->raid_disks; i++) 1203 for (i=0; i<mddev->raid_disks; i++)
1198 if (r1_bio->bios[i]->bi_end_io == end_sync_read && 1204 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1199 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { 1205 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1200 int j; 1206 int j;
1201 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); 1207 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1202 struct bio *pbio = r1_bio->bios[primary]; 1208 struct bio *pbio = r1_bio->bios[primary];
1203 struct bio *sbio = r1_bio->bios[i]; 1209 struct bio *sbio = r1_bio->bios[i];
1204 for (j = vcnt; j-- ; ) 1210 for (j = vcnt; j-- ; )
1205 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), 1211 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1206 page_address(sbio->bi_io_vec[j].bv_page), 1212 page_address(sbio->bi_io_vec[j].bv_page),
1207 PAGE_SIZE)) 1213 PAGE_SIZE))
1208 break; 1214 break;
1209 if (j >= 0) 1215 if (j >= 0)
1210 mddev->resync_mismatches += r1_bio->sectors; 1216 mddev->resync_mismatches += r1_bio->sectors;
1211 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 1217 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1212 sbio->bi_end_io = NULL; 1218 sbio->bi_end_io = NULL;
1213 rdev_dec_pending(conf->mirrors[i].rdev, mddev); 1219 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1214 } else { 1220 } else {
1215 /* fixup the bio for reuse */ 1221 /* fixup the bio for reuse */
1216 sbio->bi_vcnt = vcnt; 1222 sbio->bi_vcnt = vcnt;
1217 sbio->bi_size = r1_bio->sectors << 9; 1223 sbio->bi_size = r1_bio->sectors << 9;
1218 sbio->bi_idx = 0; 1224 sbio->bi_idx = 0;
1219 sbio->bi_phys_segments = 0; 1225 sbio->bi_phys_segments = 0;
1220 sbio->bi_hw_segments = 0; 1226 sbio->bi_hw_segments = 0;
1221 sbio->bi_hw_front_size = 0; 1227 sbio->bi_hw_front_size = 0;
1222 sbio->bi_hw_back_size = 0; 1228 sbio->bi_hw_back_size = 0;
1223 sbio->bi_flags &= ~(BIO_POOL_MASK - 1); 1229 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1224 sbio->bi_flags |= 1 << BIO_UPTODATE; 1230 sbio->bi_flags |= 1 << BIO_UPTODATE;
1225 sbio->bi_next = NULL; 1231 sbio->bi_next = NULL;
1226 sbio->bi_sector = r1_bio->sector + 1232 sbio->bi_sector = r1_bio->sector +
1227 conf->mirrors[i].rdev->data_offset; 1233 conf->mirrors[i].rdev->data_offset;
1228 sbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1234 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1229 } 1235 }
1230 } 1236 }
1231 } 1237 }
1232 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1238 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1233 /* ouch - failed to read all of that. 1239 /* ouch - failed to read all of that.
1234 * Try some synchronous reads of other devices to get 1240 * Try some synchronous reads of other devices to get
1235 * good data, much like with normal read errors. Only 1241 * good data, much like with normal read errors. Only
1236 * read into the pages we already have so that we don't 1242 * read into the pages we already have so that we don't
1237 * need to re-issue the read request. 1243 * need to re-issue the read request.
1238 * We don't need to freeze the array, because being in an 1244 * We don't need to freeze the array, because being in an
1239 * active sync request, there is no normal IO, and 1245 * active sync request, there is no normal IO, and
1240 * no overlapping syncs. 1246 * no overlapping syncs.
1241 */ 1247 */
1242 sector_t sect = r1_bio->sector; 1248 sector_t sect = r1_bio->sector;
1243 int sectors = r1_bio->sectors; 1249 int sectors = r1_bio->sectors;
1244 int idx = 0; 1250 int idx = 0;
1245 1251
1246 while(sectors) { 1252 while(sectors) {
1247 int s = sectors; 1253 int s = sectors;
1248 int d = r1_bio->read_disk; 1254 int d = r1_bio->read_disk;
1249 int success = 0; 1255 int success = 0;
1250 mdk_rdev_t *rdev; 1256 mdk_rdev_t *rdev;
1251 1257
1252 if (s > (PAGE_SIZE>>9)) 1258 if (s > (PAGE_SIZE>>9))
1253 s = PAGE_SIZE >> 9; 1259 s = PAGE_SIZE >> 9;
1254 do { 1260 do {
1255 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1261 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1256 rdev = conf->mirrors[d].rdev; 1262 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev->bdev, 1263 if (sync_page_io(rdev->bdev,
1258 sect + rdev->data_offset, 1264 sect + rdev->data_offset,
1259 s<<9, 1265 s<<9,
1260 bio->bi_io_vec[idx].bv_page, 1266 bio->bi_io_vec[idx].bv_page,
1261 READ)) { 1267 READ)) {
1262 success = 1; 1268 success = 1;
1263 break; 1269 break;
1264 } 1270 }
1265 } 1271 }
1266 d++; 1272 d++;
1267 if (d == conf->raid_disks) 1273 if (d == conf->raid_disks)
1268 d = 0; 1274 d = 0;
1269 } while (!success && d != r1_bio->read_disk); 1275 } while (!success && d != r1_bio->read_disk);
1270 1276
1271 if (success) { 1277 if (success) {
1272 int start = d; 1278 int start = d;
1273 /* write it back and re-read */ 1279 /* write it back and re-read */
1274 set_bit(R1BIO_Uptodate, &r1_bio->state); 1280 set_bit(R1BIO_Uptodate, &r1_bio->state);
1275 while (d != r1_bio->read_disk) { 1281 while (d != r1_bio->read_disk) {
1276 if (d == 0) 1282 if (d == 0)
1277 d = conf->raid_disks; 1283 d = conf->raid_disks;
1278 d--; 1284 d--;
1279 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1285 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1280 continue; 1286 continue;
1281 rdev = conf->mirrors[d].rdev; 1287 rdev = conf->mirrors[d].rdev;
1282 atomic_add(s, &rdev->corrected_errors); 1288 atomic_add(s, &rdev->corrected_errors);
1283 if (sync_page_io(rdev->bdev, 1289 if (sync_page_io(rdev->bdev,
1284 sect + rdev->data_offset, 1290 sect + rdev->data_offset,
1285 s<<9, 1291 s<<9,
1286 bio->bi_io_vec[idx].bv_page, 1292 bio->bi_io_vec[idx].bv_page,
1287 WRITE) == 0) 1293 WRITE) == 0)
1288 md_error(mddev, rdev); 1294 md_error(mddev, rdev);
1289 } 1295 }
1290 d = start; 1296 d = start;
1291 while (d != r1_bio->read_disk) { 1297 while (d != r1_bio->read_disk) {
1292 if (d == 0) 1298 if (d == 0)
1293 d = conf->raid_disks; 1299 d = conf->raid_disks;
1294 d--; 1300 d--;
1295 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1301 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1296 continue; 1302 continue;
1297 rdev = conf->mirrors[d].rdev; 1303 rdev = conf->mirrors[d].rdev;
1298 if (sync_page_io(rdev->bdev, 1304 if (sync_page_io(rdev->bdev,
1299 sect + rdev->data_offset, 1305 sect + rdev->data_offset,
1300 s<<9, 1306 s<<9,
1301 bio->bi_io_vec[idx].bv_page, 1307 bio->bi_io_vec[idx].bv_page,
1302 READ) == 0) 1308 READ) == 0)
1303 md_error(mddev, rdev); 1309 md_error(mddev, rdev);
1304 } 1310 }
1305 } else { 1311 } else {
1306 char b[BDEVNAME_SIZE]; 1312 char b[BDEVNAME_SIZE];
1307 /* Cannot read from anywhere, array is toast */ 1313 /* Cannot read from anywhere, array is toast */
1308 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1314 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1309 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1315 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1310 " for block %llu\n", 1316 " for block %llu\n",
1311 bdevname(bio->bi_bdev,b), 1317 bdevname(bio->bi_bdev,b),
1312 (unsigned long long)r1_bio->sector); 1318 (unsigned long long)r1_bio->sector);
1313 md_done_sync(mddev, r1_bio->sectors, 0); 1319 md_done_sync(mddev, r1_bio->sectors, 0);
1314 put_buf(r1_bio); 1320 put_buf(r1_bio);
1315 return; 1321 return;
1316 } 1322 }
1317 sectors -= s; 1323 sectors -= s;
1318 sect += s; 1324 sect += s;
1319 idx ++; 1325 idx ++;
1320 } 1326 }
1321 } 1327 }
1322 1328
1323 /* 1329 /*
1324 * schedule writes 1330 * schedule writes
1325 */ 1331 */
1326 atomic_set(&r1_bio->remaining, 1); 1332 atomic_set(&r1_bio->remaining, 1);
1327 for (i = 0; i < disks ; i++) { 1333 for (i = 0; i < disks ; i++) {
1328 wbio = r1_bio->bios[i]; 1334 wbio = r1_bio->bios[i];
1329 if (wbio->bi_end_io == NULL || 1335 if (wbio->bi_end_io == NULL ||
1330 (wbio->bi_end_io == end_sync_read && 1336 (wbio->bi_end_io == end_sync_read &&
1331 (i == r1_bio->read_disk || 1337 (i == r1_bio->read_disk ||
1332 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) 1338 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1333 continue; 1339 continue;
1334 1340
1335 wbio->bi_rw = WRITE; 1341 wbio->bi_rw = WRITE;
1336 wbio->bi_end_io = end_sync_write; 1342 wbio->bi_end_io = end_sync_write;
1337 atomic_inc(&r1_bio->remaining); 1343 atomic_inc(&r1_bio->remaining);
1338 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1344 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1339 1345
1340 generic_make_request(wbio); 1346 generic_make_request(wbio);
1341 } 1347 }
1342 1348
1343 if (atomic_dec_and_test(&r1_bio->remaining)) { 1349 if (atomic_dec_and_test(&r1_bio->remaining)) {
1344 /* if we're here, all write(s) have completed, so clean up */ 1350 /* if we're here, all write(s) have completed, so clean up */
1345 md_done_sync(mddev, r1_bio->sectors, 1); 1351 md_done_sync(mddev, r1_bio->sectors, 1);
1346 put_buf(r1_bio); 1352 put_buf(r1_bio);
1347 } 1353 }
1348 } 1354 }
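
For a user-requested check/repair pass (MD_RECOVERY_REQUESTED), sync_request_write() compares every other readable mirror against the chosen primary page by page with memcmp(), counts the whole window as mismatched if any page differs, and skips the rewrite entirely on a check-only (MD_RECOVERY_CHECK) pass. A self-contained sketch of that comparison over plain buffers; sizes and contents here are invented for illustration:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Compare one resync window held by two mirrors, page by page. */
static int window_matches(const unsigned char *primary,
			  const unsigned char *secondary, int sectors)
{
	int vcnt = sectors / (PAGE_SIZE >> 9);	/* pages in the window */
	int j;

	for (j = vcnt; j--; )
		if (memcmp(primary + (size_t)j * PAGE_SIZE,
			   secondary + (size_t)j * PAGE_SIZE, PAGE_SIZE))
			return 0;		/* any differing page fails the window */
	return 1;
}

int main(void)
{
	static unsigned char a[2 * PAGE_SIZE], b[2 * PAGE_SIZE];

	b[PAGE_SIZE + 1] = 0xff;		/* corrupt the secondary copy */
	printf("clean: %d, corrupted: %d\n",
	       window_matches(a, a, 16), window_matches(a, b, 16));
	return 0;
}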
1349 1355
1350 /* 1356 /*
1351 * This is a kernel thread which: 1357 * This is a kernel thread which:
1352 * 1358 *
1353 * 1. Retries failed read operations on working mirrors. 1359 * 1. Retries failed read operations on working mirrors.
1354 * 2. Updates the raid superblock when problems are encountered. 1360 * 2. Updates the raid superblock when problems are encountered.
1355 * 3. Performs writes following reads for array synchronising. 1361 * 3. Performs writes following reads for array synchronising.
1356 */ 1362 */
1357 1363
1358 static void raid1d(mddev_t *mddev) 1364 static void raid1d(mddev_t *mddev)
1359 { 1365 {
1360 r1bio_t *r1_bio; 1366 r1bio_t *r1_bio;
1361 struct bio *bio; 1367 struct bio *bio;
1362 unsigned long flags; 1368 unsigned long flags;
1363 conf_t *conf = mddev_to_conf(mddev); 1369 conf_t *conf = mddev_to_conf(mddev);
1364 struct list_head *head = &conf->retry_list; 1370 struct list_head *head = &conf->retry_list;
1365 int unplug=0; 1371 int unplug=0;
1366 mdk_rdev_t *rdev; 1372 mdk_rdev_t *rdev;
1367 1373
1368 md_check_recovery(mddev); 1374 md_check_recovery(mddev);
1369 1375
1370 for (;;) { 1376 for (;;) {
1371 char b[BDEVNAME_SIZE]; 1377 char b[BDEVNAME_SIZE];
1372 spin_lock_irqsave(&conf->device_lock, flags); 1378 spin_lock_irqsave(&conf->device_lock, flags);
1373 1379
1374 if (conf->pending_bio_list.head) { 1380 if (conf->pending_bio_list.head) {
1375 bio = bio_list_get(&conf->pending_bio_list); 1381 bio = bio_list_get(&conf->pending_bio_list);
1376 blk_remove_plug(mddev->queue); 1382 blk_remove_plug(mddev->queue);
1377 spin_unlock_irqrestore(&conf->device_lock, flags); 1383 spin_unlock_irqrestore(&conf->device_lock, flags);
1378 /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 1384 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1379 if (bitmap_unplug(mddev->bitmap) != 0) 1385 if (bitmap_unplug(mddev->bitmap) != 0)
1380 printk("%s: bitmap file write failed!\n", mdname(mddev)); 1386 printk("%s: bitmap file write failed!\n", mdname(mddev));
1381 1387
1382 while (bio) { /* submit pending writes */ 1388 while (bio) { /* submit pending writes */
1383 struct bio *next = bio->bi_next; 1389 struct bio *next = bio->bi_next;
1384 bio->bi_next = NULL; 1390 bio->bi_next = NULL;
1385 generic_make_request(bio); 1391 generic_make_request(bio);
1386 bio = next; 1392 bio = next;
1387 } 1393 }
1388 unplug = 1; 1394 unplug = 1;
1389 1395
1390 continue; 1396 continue;
1391 } 1397 }
1392 1398
1393 if (list_empty(head)) 1399 if (list_empty(head))
1394 break; 1400 break;
1395 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1401 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1396 list_del(head->prev); 1402 list_del(head->prev);
1397 conf->nr_queued--; 1403 conf->nr_queued--;
1398 spin_unlock_irqrestore(&conf->device_lock, flags); 1404 spin_unlock_irqrestore(&conf->device_lock, flags);
1399 1405
1400 mddev = r1_bio->mddev; 1406 mddev = r1_bio->mddev;
1401 conf = mddev_to_conf(mddev); 1407 conf = mddev_to_conf(mddev);
1402 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1408 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1403 sync_request_write(mddev, r1_bio); 1409 sync_request_write(mddev, r1_bio);
1404 unplug = 1; 1410 unplug = 1;
1405 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 1411 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1406 /* some requests in the r1bio were BIO_RW_BARRIER 1412 /* some requests in the r1bio were BIO_RW_BARRIER
1407 * requests which failed with -EOPNOTSUPP. Hohumm.. 1413 * requests which failed with -EOPNOTSUPP. Hohumm..
1408 * Better resubmit without the barrier. 1414 * Better resubmit without the barrier.
1409 * We know which devices to resubmit for, because 1415 * We know which devices to resubmit for, because
1410 * all others have had their bios[] entry cleared. 1416 * all others have had their bios[] entry cleared.
1411 */ 1417 */
1412 int i; 1418 int i;
1413 clear_bit(R1BIO_BarrierRetry, &r1_bio->state); 1419 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1414 clear_bit(R1BIO_Barrier, &r1_bio->state); 1420 clear_bit(R1BIO_Barrier, &r1_bio->state);
1415 for (i=0; i < conf->raid_disks; i++) 1421 for (i=0; i < conf->raid_disks; i++)
1416 if (r1_bio->bios[i]) 1422 if (r1_bio->bios[i])
1417 atomic_inc(&r1_bio->remaining); 1423 atomic_inc(&r1_bio->remaining);
1418 for (i=0; i < conf->raid_disks; i++) 1424 for (i=0; i < conf->raid_disks; i++)
1419 if (r1_bio->bios[i]) { 1425 if (r1_bio->bios[i]) {
1420 struct bio_vec *bvec; 1426 struct bio_vec *bvec;
1421 int j; 1427 int j;
1422 1428
1423 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1429 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1424 /* copy pages from the failed bio, as 1430 /* copy pages from the failed bio, as
1425 * this might be a write-behind device */ 1431 * this might be a write-behind device */
1426 __bio_for_each_segment(bvec, bio, j, 0) 1432 __bio_for_each_segment(bvec, bio, j, 0)
1427 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; 1433 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1428 bio_put(r1_bio->bios[i]); 1434 bio_put(r1_bio->bios[i]);
1429 bio->bi_sector = r1_bio->sector + 1435 bio->bi_sector = r1_bio->sector +
1430 conf->mirrors[i].rdev->data_offset; 1436 conf->mirrors[i].rdev->data_offset;
1431 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1437 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1432 bio->bi_end_io = raid1_end_write_request; 1438 bio->bi_end_io = raid1_end_write_request;
1433 bio->bi_rw = WRITE; 1439 bio->bi_rw = WRITE;
1434 bio->bi_private = r1_bio; 1440 bio->bi_private = r1_bio;
1435 r1_bio->bios[i] = bio; 1441 r1_bio->bios[i] = bio;
1436 generic_make_request(bio); 1442 generic_make_request(bio);
1437 } 1443 }
1438 } else { 1444 } else {
1439 int disk; 1445 int disk;
1440 1446
1441 /* we got a read error. Maybe the drive is bad. Maybe just 1447 /* we got a read error. Maybe the drive is bad. Maybe just
1442 * the block and we can fix it. 1448 * the block and we can fix it.
1443 * We freeze all other IO, and try reading the block from 1449 * We freeze all other IO, and try reading the block from
1444 * other devices. When we find one, we re-write 1450 * other devices. When we find one, we re-write
1445 * and check whether that fixes the read error. 1451 * and check whether that fixes the read error.
1446 * This is all done synchronously while the array is 1452 * This is all done synchronously while the array is
1447 * frozen 1453 * frozen
1448 */ 1454 */
1449 sector_t sect = r1_bio->sector; 1455 sector_t sect = r1_bio->sector;
1450 int sectors = r1_bio->sectors; 1456 int sectors = r1_bio->sectors;
1451 freeze_array(conf); 1457 freeze_array(conf);
1452 if (mddev->ro == 0) while(sectors) { 1458 if (mddev->ro == 0) while(sectors) {
1453 int s = sectors; 1459 int s = sectors;
1454 int d = r1_bio->read_disk; 1460 int d = r1_bio->read_disk;
1455 int success = 0; 1461 int success = 0;
1456 1462
1457 if (s > (PAGE_SIZE>>9)) 1463 if (s > (PAGE_SIZE>>9))
1458 s = PAGE_SIZE >> 9; 1464 s = PAGE_SIZE >> 9;
1459 1465
1460 do { 1466 do {
1461 rdev = conf->mirrors[d].rdev; 1467 rdev = conf->mirrors[d].rdev;
1462 if (rdev && 1468 if (rdev &&
1463 test_bit(In_sync, &rdev->flags) && 1469 test_bit(In_sync, &rdev->flags) &&
1464 sync_page_io(rdev->bdev, 1470 sync_page_io(rdev->bdev,
1465 sect + rdev->data_offset, 1471 sect + rdev->data_offset,
1466 s<<9, 1472 s<<9,
1467 conf->tmppage, READ)) 1473 conf->tmppage, READ))
1468 success = 1; 1474 success = 1;
1469 else { 1475 else {
1470 d++; 1476 d++;
1471 if (d == conf->raid_disks) 1477 if (d == conf->raid_disks)
1472 d = 0; 1478 d = 0;
1473 } 1479 }
1474 } while (!success && d != r1_bio->read_disk); 1480 } while (!success && d != r1_bio->read_disk);
1475 1481
1476 if (success) { 1482 if (success) {
1477 /* write it back and re-read */ 1483 /* write it back and re-read */
1478 int start = d; 1484 int start = d;
1479 while (d != r1_bio->read_disk) { 1485 while (d != r1_bio->read_disk) {
1480 if (d==0) 1486 if (d==0)
1481 d = conf->raid_disks; 1487 d = conf->raid_disks;
1482 d--; 1488 d--;
1483 rdev = conf->mirrors[d].rdev; 1489 rdev = conf->mirrors[d].rdev;
1484 atomic_add(s, &rdev->corrected_errors); 1490 atomic_add(s, &rdev->corrected_errors);
1485 if (rdev && 1491 if (rdev &&
1486 test_bit(In_sync, &rdev->flags)) { 1492 test_bit(In_sync, &rdev->flags)) {
1487 if (sync_page_io(rdev->bdev, 1493 if (sync_page_io(rdev->bdev,
1488 sect + rdev->data_offset, 1494 sect + rdev->data_offset,
1489 s<<9, conf->tmppage, WRITE) == 0) 1495 s<<9, conf->tmppage, WRITE) == 0)
1490 /* Well, this device is dead */ 1496 /* Well, this device is dead */
1491 md_error(mddev, rdev); 1497 md_error(mddev, rdev);
1492 } 1498 }
1493 } 1499 }
1494 d = start; 1500 d = start;
1495 while (d != r1_bio->read_disk) { 1501 while (d != r1_bio->read_disk) {
1496 if (d==0) 1502 if (d==0)
1497 d = conf->raid_disks; 1503 d = conf->raid_disks;
1498 d--; 1504 d--;
1499 rdev = conf->mirrors[d].rdev; 1505 rdev = conf->mirrors[d].rdev;
1500 if (rdev && 1506 if (rdev &&
1501 test_bit(In_sync, &rdev->flags)) { 1507 test_bit(In_sync, &rdev->flags)) {
1502 if (sync_page_io(rdev->bdev, 1508 if (sync_page_io(rdev->bdev,
1503 sect + rdev->data_offset, 1509 sect + rdev->data_offset,
1504 s<<9, conf->tmppage, READ) == 0) 1510 s<<9, conf->tmppage, READ) == 0)
1505 /* Well, this device is dead */ 1511 /* Well, this device is dead */
1506 md_error(mddev, rdev); 1512 md_error(mddev, rdev);
1507 } 1513 }
1508 } 1514 }
1509 } else { 1515 } else {
1510 /* Cannot read from anywhere -- bye bye array */ 1516 /* Cannot read from anywhere -- bye bye array */
1511 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1517 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1512 break; 1518 break;
1513 } 1519 }
1514 sectors -= s; 1520 sectors -= s;
1515 sect += s; 1521 sect += s;
1516 } 1522 }
1517 1523
1518 unfreeze_array(conf); 1524 unfreeze_array(conf);
1519 1525
1520 bio = r1_bio->bios[r1_bio->read_disk]; 1526 bio = r1_bio->bios[r1_bio->read_disk];
1521 if ((disk=read_balance(conf, r1_bio)) == -1) { 1527 if ((disk=read_balance(conf, r1_bio)) == -1) {
1522 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1528 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
1523 " read error for block %llu\n", 1529 " read error for block %llu\n",
1524 bdevname(bio->bi_bdev,b), 1530 bdevname(bio->bi_bdev,b),
1525 (unsigned long long)r1_bio->sector); 1531 (unsigned long long)r1_bio->sector);
1526 raid_end_bio_io(r1_bio); 1532 raid_end_bio_io(r1_bio);
1527 } else { 1533 } else {
1528 r1_bio->bios[r1_bio->read_disk] = 1534 r1_bio->bios[r1_bio->read_disk] =
1529 mddev->ro ? IO_BLOCKED : NULL; 1535 mddev->ro ? IO_BLOCKED : NULL;
1530 r1_bio->read_disk = disk; 1536 r1_bio->read_disk = disk;
1531 bio_put(bio); 1537 bio_put(bio);
1532 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1538 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1533 r1_bio->bios[r1_bio->read_disk] = bio; 1539 r1_bio->bios[r1_bio->read_disk] = bio;
1534 rdev = conf->mirrors[disk].rdev; 1540 rdev = conf->mirrors[disk].rdev;
1535 if (printk_ratelimit()) 1541 if (printk_ratelimit())
1536 printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1542 printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
1537 " another mirror\n", 1543 " another mirror\n",
1538 bdevname(rdev->bdev,b), 1544 bdevname(rdev->bdev,b),
1539 (unsigned long long)r1_bio->sector); 1545 (unsigned long long)r1_bio->sector);
1540 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1546 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1541 bio->bi_bdev = rdev->bdev; 1547 bio->bi_bdev = rdev->bdev;
1542 bio->bi_end_io = raid1_end_read_request; 1548 bio->bi_end_io = raid1_end_read_request;
1543 bio->bi_rw = READ; 1549 bio->bi_rw = READ;
1544 bio->bi_private = r1_bio; 1550 bio->bi_private = r1_bio;
1545 unplug = 1; 1551 unplug = 1;
1546 generic_make_request(bio); 1552 generic_make_request(bio);
1547 } 1553 }
1548 } 1554 }
1549 } 1555 }
1550 spin_unlock_irqrestore(&conf->device_lock, flags); 1556 spin_unlock_irqrestore(&conf->device_lock, flags);
1551 if (unplug) 1557 if (unplug)
1552 unplug_slaves(mddev); 1558 unplug_slaves(mddev);
1553 } 1559 }
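
The "submit pending writes" loop near the top of raid1d() drains the bio list that was collected under the device lock: each bio is detached from its bi_next chain before being handed to generic_make_request(), so the block layer never sees the list linkage. A user-space sketch of that list walk using a stand-in structure:

#include <stdio.h>
#include <stddef.h>

/* Stand-in for struct bio and its bi_next chaining. */
struct fake_bio {
	int id;
	struct fake_bio *bi_next;
};

static void submit(struct fake_bio *bio)
{
	printf("submitting bio %d\n", bio->id);
}

int main(void)
{
	struct fake_bio c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct fake_bio *bio = &a;		/* what bio_list_get() handed back */

	while (bio) {				/* submit pending writes */
		struct fake_bio *next = bio->bi_next;
		bio->bi_next = NULL;		/* detach before submitting */
		submit(bio);
		bio = next;
	}
	return 0;
}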
1554 1560
1555 1561
1556 static int init_resync(conf_t *conf) 1562 static int init_resync(conf_t *conf)
1557 { 1563 {
1558 int buffs; 1564 int buffs;
1559 1565
1560 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 1566 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1561 BUG_ON(conf->r1buf_pool); 1567 BUG_ON(conf->r1buf_pool);
1562 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, 1568 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
1563 conf->poolinfo); 1569 conf->poolinfo);
1564 if (!conf->r1buf_pool) 1570 if (!conf->r1buf_pool)
1565 return -ENOMEM; 1571 return -ENOMEM;
1566 conf->next_resync = 0; 1572 conf->next_resync = 0;
1567 return 0; 1573 return 0;
1568 } 1574 }
1569 1575
1570 /* 1576 /*
1571 * perform a "sync" on one "block" 1577 * perform a "sync" on one "block"
1572 * 1578 *
1573 * We need to make sure that no normal I/O request - particularly write 1579 * We need to make sure that no normal I/O request - particularly write
1574 * requests - conflict with active sync requests. 1580 * requests - conflict with active sync requests.
1575 * 1581 *
1576 * This is achieved by tracking pending requests and a 'barrier' concept 1582 * This is achieved by tracking pending requests and a 'barrier' concept
1577 * that can be installed to exclude normal IO requests. 1583 * that can be installed to exclude normal IO requests.
1578 */ 1584 */
1579 1585
1580 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1586 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1581 { 1587 {
1582 conf_t *conf = mddev_to_conf(mddev); 1588 conf_t *conf = mddev_to_conf(mddev);
1583 r1bio_t *r1_bio; 1589 r1bio_t *r1_bio;
1584 struct bio *bio; 1590 struct bio *bio;
1585 sector_t max_sector, nr_sectors; 1591 sector_t max_sector, nr_sectors;
1586 int disk = -1; 1592 int disk = -1;
1587 int i; 1593 int i;
1588 int wonly = -1; 1594 int wonly = -1;
1589 int write_targets = 0, read_targets = 0; 1595 int write_targets = 0, read_targets = 0;
1590 int sync_blocks; 1596 int sync_blocks;
1591 int still_degraded = 0; 1597 int still_degraded = 0;
1592 1598
1593 if (!conf->r1buf_pool) 1599 if (!conf->r1buf_pool)
1594 { 1600 {
1595 /* 1601 /*
1596 printk("sync start - bitmap %p\n", mddev->bitmap); 1602 printk("sync start - bitmap %p\n", mddev->bitmap);
1597 */ 1603 */
1598 if (init_resync(conf)) 1604 if (init_resync(conf))
1599 return 0; 1605 return 0;
1600 } 1606 }
1601 1607
1602 max_sector = mddev->size << 1; 1608 max_sector = mddev->size << 1;
1603 if (sector_nr >= max_sector) { 1609 if (sector_nr >= max_sector) {
1604 /* If we aborted, we need to abort the 1610 /* If we aborted, we need to abort the
1605 * sync on the 'current' bitmap chunk (there will 1611 * sync on the 'current' bitmap chunk (there will
1606 * only be one in raid1 resync). 1612 * only be one in raid1 resync).
1607 * We can find the current address in mddev->curr_resync 1613 * We can find the current address in mddev->curr_resync
1608 */ 1614 */
1609 if (mddev->curr_resync < max_sector) /* aborted */ 1615 if (mddev->curr_resync < max_sector) /* aborted */
1610 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1616 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1611 &sync_blocks, 1); 1617 &sync_blocks, 1);
1612 else /* completed sync */ 1618 else /* completed sync */
1613 conf->fullsync = 0; 1619 conf->fullsync = 0;
1614 1620
1615 bitmap_close_sync(mddev->bitmap); 1621 bitmap_close_sync(mddev->bitmap);
1616 close_sync(conf); 1622 close_sync(conf);
1617 return 0; 1623 return 0;
1618 } 1624 }
1619 1625
1620 /* before building a request, check if we can skip these blocks.. 1626 /* before building a request, check if we can skip these blocks..
1621 * This call to bitmap_start_sync doesn't actually record anything 1627 * This call to bitmap_start_sync doesn't actually record anything
1622 */ 1628 */
1623 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1629 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1624 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1630 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1625 /* We can skip this block, and probably several more */ 1631 /* We can skip this block, and probably several more */
1626 *skipped = 1; 1632 *skipped = 1;
1627 return sync_blocks; 1633 return sync_blocks;
1628 } 1634 }
1629 /* 1635 /*
1630 * If there is non-resync activity waiting for a turn, 1636 * If there is non-resync activity waiting for a turn,
1631 * and resync is going fast enough, 1637 * and resync is going fast enough,
1632 * then let it through before starting on this new sync request. 1638 * then let it through before starting on this new sync request.
1633 */ 1639 */
1634 if (!go_faster && conf->nr_waiting) 1640 if (!go_faster && conf->nr_waiting)
1635 msleep_interruptible(1000); 1641 msleep_interruptible(1000);
1636 1642
1637 raise_barrier(conf); 1643 raise_barrier(conf);
1638 1644
1639 conf->next_resync = sector_nr; 1645 conf->next_resync = sector_nr;
1640 1646
1641 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1647 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1642 rcu_read_lock(); 1648 rcu_read_lock();
1643 /* 1649 /*
1644 * If we get a correctable read error during resync or recovery, 1650 * If we get a correctable read error during resync or recovery,
1645 * we might want to read from a different device. So we 1651 * we might want to read from a different device. So we
1646 * flag all drives that could conceivably be read from for READ, 1652 * flag all drives that could conceivably be read from for READ,
1647 * and any others (which will be non-In_sync devices) for WRITE. 1653 * and any others (which will be non-In_sync devices) for WRITE.
1648 * If a read fails, we try reading from something else for which READ 1654 * If a read fails, we try reading from something else for which READ
1649 * is OK. 1655 * is OK.
1650 */ 1656 */
1651 1657
1652 r1_bio->mddev = mddev; 1658 r1_bio->mddev = mddev;
1653 r1_bio->sector = sector_nr; 1659 r1_bio->sector = sector_nr;
1654 r1_bio->state = 0; 1660 r1_bio->state = 0;
1655 set_bit(R1BIO_IsSync, &r1_bio->state); 1661 set_bit(R1BIO_IsSync, &r1_bio->state);
1656 1662
1657 for (i=0; i < conf->raid_disks; i++) { 1663 for (i=0; i < conf->raid_disks; i++) {
1658 mdk_rdev_t *rdev; 1664 mdk_rdev_t *rdev;
1659 bio = r1_bio->bios[i]; 1665 bio = r1_bio->bios[i];
1660 1666
1661 /* take from bio_init */ 1667 /* take from bio_init */
1662 bio->bi_next = NULL; 1668 bio->bi_next = NULL;
1663 bio->bi_flags |= 1 << BIO_UPTODATE; 1669 bio->bi_flags |= 1 << BIO_UPTODATE;
1664 bio->bi_rw = 0; 1670 bio->bi_rw = 0;
1665 bio->bi_vcnt = 0; 1671 bio->bi_vcnt = 0;
1666 bio->bi_idx = 0; 1672 bio->bi_idx = 0;
1667 bio->bi_phys_segments = 0; 1673 bio->bi_phys_segments = 0;
1668 bio->bi_hw_segments = 0; 1674 bio->bi_hw_segments = 0;
1669 bio->bi_size = 0; 1675 bio->bi_size = 0;
1670 bio->bi_end_io = NULL; 1676 bio->bi_end_io = NULL;
1671 bio->bi_private = NULL; 1677 bio->bi_private = NULL;
1672 1678
1673 rdev = rcu_dereference(conf->mirrors[i].rdev); 1679 rdev = rcu_dereference(conf->mirrors[i].rdev);
1674 if (rdev == NULL || 1680 if (rdev == NULL ||
1675 test_bit(Faulty, &rdev->flags)) { 1681 test_bit(Faulty, &rdev->flags)) {
1676 still_degraded = 1; 1682 still_degraded = 1;
1677 continue; 1683 continue;
1678 } else if (!test_bit(In_sync, &rdev->flags)) { 1684 } else if (!test_bit(In_sync, &rdev->flags)) {
1679 bio->bi_rw = WRITE; 1685 bio->bi_rw = WRITE;
1680 bio->bi_end_io = end_sync_write; 1686 bio->bi_end_io = end_sync_write;
1681 write_targets ++; 1687 write_targets ++;
1682 } else { 1688 } else {
1683 /* may need to read from here */ 1689 /* may need to read from here */
1684 bio->bi_rw = READ; 1690 bio->bi_rw = READ;
1685 bio->bi_end_io = end_sync_read; 1691 bio->bi_end_io = end_sync_read;
1686 if (test_bit(WriteMostly, &rdev->flags)) { 1692 if (test_bit(WriteMostly, &rdev->flags)) {
1687 if (wonly < 0) 1693 if (wonly < 0)
1688 wonly = i; 1694 wonly = i;
1689 } else { 1695 } else {
1690 if (disk < 0) 1696 if (disk < 0)
1691 disk = i; 1697 disk = i;
1692 } 1698 }
1693 read_targets++; 1699 read_targets++;
1694 } 1700 }
1695 atomic_inc(&rdev->nr_pending); 1701 atomic_inc(&rdev->nr_pending);
1696 bio->bi_sector = sector_nr + rdev->data_offset; 1702 bio->bi_sector = sector_nr + rdev->data_offset;
1697 bio->bi_bdev = rdev->bdev; 1703 bio->bi_bdev = rdev->bdev;
1698 bio->bi_private = r1_bio; 1704 bio->bi_private = r1_bio;
1699 } 1705 }
1700 rcu_read_unlock(); 1706 rcu_read_unlock();
1701 if (disk < 0) 1707 if (disk < 0)
1702 disk = wonly; 1708 disk = wonly;
1703 r1_bio->read_disk = disk; 1709 r1_bio->read_disk = disk;
1704 1710
1705 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) 1711 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1706 /* extra read targets are also write targets */ 1712 /* extra read targets are also write targets */
1707 write_targets += read_targets-1; 1713 write_targets += read_targets-1;
1708 1714
1709 if (write_targets == 0 || read_targets == 0) { 1715 if (write_targets == 0 || read_targets == 0) {
1710 /* There is nowhere to write, so all non-sync 1716 /* There is nowhere to write, so all non-sync
1711 * drives must be failed - so we are finished 1717 * drives must be failed - so we are finished
1712 */ 1718 */
1713 sector_t rv = max_sector - sector_nr; 1719 sector_t rv = max_sector - sector_nr;
1714 *skipped = 1; 1720 *skipped = 1;
1715 put_buf(r1_bio); 1721 put_buf(r1_bio);
1716 return rv; 1722 return rv;
1717 } 1723 }
1718 1724
1719 nr_sectors = 0; 1725 nr_sectors = 0;
1720 sync_blocks = 0; 1726 sync_blocks = 0;
1721 do { 1727 do {
1722 struct page *page; 1728 struct page *page;
1723 int len = PAGE_SIZE; 1729 int len = PAGE_SIZE;
1724 if (sector_nr + (len>>9) > max_sector) 1730 if (sector_nr + (len>>9) > max_sector)
1725 len = (max_sector - sector_nr) << 9; 1731 len = (max_sector - sector_nr) << 9;
1726 if (len == 0) 1732 if (len == 0)
1727 break; 1733 break;
1728 if (sync_blocks == 0) { 1734 if (sync_blocks == 0) {
1729 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1735 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1730 &sync_blocks, still_degraded) && 1736 &sync_blocks, still_degraded) &&
1731 !conf->fullsync && 1737 !conf->fullsync &&
1732 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1738 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1733 break; 1739 break;
1734 BUG_ON(sync_blocks < (PAGE_SIZE>>9)); 1740 BUG_ON(sync_blocks < (PAGE_SIZE>>9));
1735 if (len > (sync_blocks<<9)) 1741 if (len > (sync_blocks<<9))
1736 len = sync_blocks<<9; 1742 len = sync_blocks<<9;
1737 } 1743 }
1738 1744
1739 for (i=0 ; i < conf->raid_disks; i++) { 1745 for (i=0 ; i < conf->raid_disks; i++) {
1740 bio = r1_bio->bios[i]; 1746 bio = r1_bio->bios[i];
1741 if (bio->bi_end_io) { 1747 if (bio->bi_end_io) {
1742 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 1748 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1743 if (bio_add_page(bio, page, len, 0) == 0) { 1749 if (bio_add_page(bio, page, len, 0) == 0) {
1744 /* stop here */ 1750 /* stop here */
1745 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 1751 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1746 while (i > 0) { 1752 while (i > 0) {
1747 i--; 1753 i--;
1748 bio = r1_bio->bios[i]; 1754 bio = r1_bio->bios[i];
1749 if (bio->bi_end_io==NULL) 1755 if (bio->bi_end_io==NULL)
1750 continue; 1756 continue;
1751 /* remove last page from this bio */ 1757 /* remove last page from this bio */
1752 bio->bi_vcnt--; 1758 bio->bi_vcnt--;
1753 bio->bi_size -= len; 1759 bio->bi_size -= len;
1754 bio->bi_flags &= ~(1<< BIO_SEG_VALID); 1760 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1755 } 1761 }
1756 goto bio_full; 1762 goto bio_full;
1757 } 1763 }
1758 } 1764 }
1759 } 1765 }
1760 nr_sectors += len>>9; 1766 nr_sectors += len>>9;
1761 sector_nr += len>>9; 1767 sector_nr += len>>9;
1762 sync_blocks -= (len>>9); 1768 sync_blocks -= (len>>9);
1763 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1769 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1764 bio_full: 1770 bio_full:
1765 r1_bio->sectors = nr_sectors; 1771 r1_bio->sectors = nr_sectors;
1766 1772
1767 /* For a user-requested sync, we read all readable devices and do a 1773 /* For a user-requested sync, we read all readable devices and do a
1768 * compare 1774 * compare
1769 */ 1775 */
1770 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1776 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1771 atomic_set(&r1_bio->remaining, read_targets); 1777 atomic_set(&r1_bio->remaining, read_targets);
1772 for (i=0; i<conf->raid_disks; i++) { 1778 for (i=0; i<conf->raid_disks; i++) {
1773 bio = r1_bio->bios[i]; 1779 bio = r1_bio->bios[i];
1774 if (bio->bi_end_io == end_sync_read) { 1780 if (bio->bi_end_io == end_sync_read) {
1775 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); 1781 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
1776 generic_make_request(bio); 1782 generic_make_request(bio);
1777 } 1783 }
1778 } 1784 }
1779 } else { 1785 } else {
1780 atomic_set(&r1_bio->remaining, 1); 1786 atomic_set(&r1_bio->remaining, 1);
1781 bio = r1_bio->bios[r1_bio->read_disk]; 1787 bio = r1_bio->bios[r1_bio->read_disk];
1782 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, 1788 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
1783 nr_sectors); 1789 nr_sectors);
1784 generic_make_request(bio); 1790 generic_make_request(bio);
1785 1791
1786 } 1792 }
1787 1793
1788 return nr_sectors; 1794 return nr_sectors;
1789 } 1795 }
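
Inside the request-building loop of sync_request(), each pass tries to add one page worth of data (len starts at PAGE_SIZE) but clamps it when the resync position is within a page of the end of the device, converting between bytes and 512-byte sectors with the <<9 / >>9 shifts. A small worked example of that clamp using made-up numbers:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long max_sector = 1000;	/* hypothetical end of the array */
	unsigned long long sector_nr = 997;	/* resync position, 3 sectors left */
	unsigned long long len = PAGE_SIZE;	/* bytes we would like to add */

	if (sector_nr + (len >> 9) > max_sector)
		len = (max_sector - sector_nr) << 9;

	printf("adding %llu bytes (%llu sectors)\n", len, len >> 9);	/* 1536, 3 */
	return 0;
}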
1790 1796
1791 static int run(mddev_t *mddev) 1797 static int run(mddev_t *mddev)
1792 { 1798 {
1793 conf_t *conf; 1799 conf_t *conf;
1794 int i, j, disk_idx; 1800 int i, j, disk_idx;
1795 mirror_info_t *disk; 1801 mirror_info_t *disk;
1796 mdk_rdev_t *rdev; 1802 mdk_rdev_t *rdev;
1797 struct list_head *tmp; 1803 struct list_head *tmp;
1798 1804
1799 if (mddev->level != 1) { 1805 if (mddev->level != 1) {
1800 printk("raid1: %s: raid level not set to mirroring (%d)\n", 1806 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1801 mdname(mddev), mddev->level); 1807 mdname(mddev), mddev->level);
1802 goto out; 1808 goto out;
1803 } 1809 }
1804 if (mddev->reshape_position != MaxSector) { 1810 if (mddev->reshape_position != MaxSector) {
1805 printk("raid1: %s: reshape_position set but not supported\n", 1811 printk("raid1: %s: reshape_position set but not supported\n",
1806 mdname(mddev)); 1812 mdname(mddev));
1807 goto out; 1813 goto out;
1808 } 1814 }
1809 /* 1815 /*
1810 * copy the already verified devices into our private RAID1 1816 * copy the already verified devices into our private RAID1
1811 * bookkeeping area. [whatever we allocate in run(), 1817 * bookkeeping area. [whatever we allocate in run(),
1812 * should be freed in stop()] 1818 * should be freed in stop()]
1813 */ 1819 */
1814 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 1820 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1815 mddev->private = conf; 1821 mddev->private = conf;
1816 if (!conf) 1822 if (!conf)
1817 goto out_no_mem; 1823 goto out_no_mem;
1818 1824
1819 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1825 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1820 GFP_KERNEL); 1826 GFP_KERNEL);
1821 if (!conf->mirrors) 1827 if (!conf->mirrors)
1822 goto out_no_mem; 1828 goto out_no_mem;
1823 1829
1824 conf->tmppage = alloc_page(GFP_KERNEL); 1830 conf->tmppage = alloc_page(GFP_KERNEL);
1825 if (!conf->tmppage) 1831 if (!conf->tmppage)
1826 goto out_no_mem; 1832 goto out_no_mem;
1827 1833
1828 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1834 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1829 if (!conf->poolinfo) 1835 if (!conf->poolinfo)
1830 goto out_no_mem; 1836 goto out_no_mem;
1831 conf->poolinfo->mddev = mddev; 1837 conf->poolinfo->mddev = mddev;
1832 conf->poolinfo->raid_disks = mddev->raid_disks; 1838 conf->poolinfo->raid_disks = mddev->raid_disks;
1833 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1839 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1834 r1bio_pool_free, 1840 r1bio_pool_free,
1835 conf->poolinfo); 1841 conf->poolinfo);
1836 if (!conf->r1bio_pool) 1842 if (!conf->r1bio_pool)
1837 goto out_no_mem; 1843 goto out_no_mem;
1838 1844
1839 ITERATE_RDEV(mddev, rdev, tmp) { 1845 ITERATE_RDEV(mddev, rdev, tmp) {
1840 disk_idx = rdev->raid_disk; 1846 disk_idx = rdev->raid_disk;
1841 if (disk_idx >= mddev->raid_disks 1847 if (disk_idx >= mddev->raid_disks
1842 || disk_idx < 0) 1848 || disk_idx < 0)
1843 continue; 1849 continue;
1844 disk = conf->mirrors + disk_idx; 1850 disk = conf->mirrors + disk_idx;
1845 1851
1846 disk->rdev = rdev; 1852 disk->rdev = rdev;
1847 1853
1848 blk_queue_stack_limits(mddev->queue, 1854 blk_queue_stack_limits(mddev->queue,
1849 rdev->bdev->bd_disk->queue); 1855 rdev->bdev->bd_disk->queue);
1850 /* as we don't honour merge_bvec_fn, we must never risk 1856 /* as we don't honour merge_bvec_fn, we must never risk
1851 * violating it, so limit ->max_sector to one PAGE, as 1857 * violating it, so limit ->max_sector to one PAGE, as
1852 * a one page request is never in violation. 1858 * a one page request is never in violation.
1853 */ 1859 */
1854 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1860 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1855 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1861 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1856 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1862 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1857 1863
1858 disk->head_position = 0; 1864 disk->head_position = 0;
1859 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) 1865 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
1860 conf->working_disks++; 1866 conf->working_disks++;
1861 } 1867 }
1862 conf->raid_disks = mddev->raid_disks; 1868 conf->raid_disks = mddev->raid_disks;
1863 conf->mddev = mddev; 1869 conf->mddev = mddev;
1864 spin_lock_init(&conf->device_lock); 1870 spin_lock_init(&conf->device_lock);
1865 INIT_LIST_HEAD(&conf->retry_list); 1871 INIT_LIST_HEAD(&conf->retry_list);
1866 if (conf->working_disks == 1) 1872 if (conf->working_disks == 1)
1867 mddev->recovery_cp = MaxSector; 1873 mddev->recovery_cp = MaxSector;
1868 1874
1869 spin_lock_init(&conf->resync_lock); 1875 spin_lock_init(&conf->resync_lock);
1870 init_waitqueue_head(&conf->wait_barrier); 1876 init_waitqueue_head(&conf->wait_barrier);
1871 1877
1872 bio_list_init(&conf->pending_bio_list); 1878 bio_list_init(&conf->pending_bio_list);
1873 bio_list_init(&conf->flushing_bio_list); 1879 bio_list_init(&conf->flushing_bio_list);
1874 1880
1875 if (!conf->working_disks) { 1881 if (!conf->working_disks) {
1876 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 1882 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1877 mdname(mddev)); 1883 mdname(mddev));
1878 goto out_free_conf; 1884 goto out_free_conf;
1879 } 1885 }
1880 1886
1881 mddev->degraded = 0; 1887 mddev->degraded = 0;
1882 for (i = 0; i < conf->raid_disks; i++) { 1888 for (i = 0; i < conf->raid_disks; i++) {
1883 1889
1884 disk = conf->mirrors + i; 1890 disk = conf->mirrors + i;
1885 1891
1886 if (!disk->rdev) { 1892 if (!disk->rdev) {
1887 disk->head_position = 0; 1893 disk->head_position = 0;
1888 mddev->degraded++; 1894 mddev->degraded++;
1889 } 1895 }
1890 } 1896 }
1891 1897
1892 /* 1898 /*
1893 * find the first working one and use it as a starting point 1899 * find the first working one and use it as a starting point
1894 * to read balancing. 1900 * to read balancing.
1895 */ 1901 */
1896 for (j = 0; j < conf->raid_disks && 1902 for (j = 0; j < conf->raid_disks &&
1897 (!conf->mirrors[j].rdev || 1903 (!conf->mirrors[j].rdev ||
1898 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) 1904 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
1899 /* nothing */; 1905 /* nothing */;
1900 conf->last_used = j; 1906 conf->last_used = j;
1901 1907
1902 1908
1903 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 1909 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1904 if (!mddev->thread) { 1910 if (!mddev->thread) {
1905 printk(KERN_ERR 1911 printk(KERN_ERR
1906 "raid1: couldn't allocate thread for %s\n", 1912 "raid1: couldn't allocate thread for %s\n",
1907 mdname(mddev)); 1913 mdname(mddev));
1908 goto out_free_conf; 1914 goto out_free_conf;
1909 } 1915 }
1910 1916
1911 printk(KERN_INFO 1917 printk(KERN_INFO
1912 "raid1: raid set %s active with %d out of %d mirrors\n", 1918 "raid1: raid set %s active with %d out of %d mirrors\n",
1913 mdname(mddev), mddev->raid_disks - mddev->degraded, 1919 mdname(mddev), mddev->raid_disks - mddev->degraded,
1914 mddev->raid_disks); 1920 mddev->raid_disks);
1915 /* 1921 /*
1916 * Ok, everything is just fine now 1922 * Ok, everything is just fine now
1917 */ 1923 */
1918 mddev->array_size = mddev->size; 1924 mddev->array_size = mddev->size;
1919 1925
1920 mddev->queue->unplug_fn = raid1_unplug; 1926 mddev->queue->unplug_fn = raid1_unplug;
1921 mddev->queue->issue_flush_fn = raid1_issue_flush; 1927 mddev->queue->issue_flush_fn = raid1_issue_flush;
1922 1928
1923 return 0; 1929 return 0;
1924 1930
1925 out_no_mem: 1931 out_no_mem:
1926 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", 1932 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
1927 mdname(mddev)); 1933 mdname(mddev));
1928 1934
1929 out_free_conf: 1935 out_free_conf:
1930 if (conf) { 1936 if (conf) {
1931 if (conf->r1bio_pool) 1937 if (conf->r1bio_pool)
1932 mempool_destroy(conf->r1bio_pool); 1938 mempool_destroy(conf->r1bio_pool);
1933 kfree(conf->mirrors); 1939 kfree(conf->mirrors);
1934 safe_put_page(conf->tmppage); 1940 safe_put_page(conf->tmppage);
1935 kfree(conf->poolinfo); 1941 kfree(conf->poolinfo);
1936 kfree(conf); 1942 kfree(conf);
1937 mddev->private = NULL; 1943 mddev->private = NULL;
1938 } 1944 }
1939 out: 1945 out:
1940 return -EIO; 1946 return -EIO;
1941 } 1947 }
1942 1948
static int stop(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	struct bitmap *bitmap = mddev->bitmap;
	int behind_wait = 0;

	/* wait for behind writes to complete */
	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
		behind_wait++;
		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ); /* wait a second */
		/* need to kick something here to make sure I/O goes? */
	}

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
	if (conf->r1bio_pool)
		mempool_destroy(conf->r1bio_pool);
	kfree(conf->mirrors);
	kfree(conf->poolinfo);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}

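/*
 * raid1_resize() adjusts the exported array size. Note the mixed units:
 * 'sectors' is in 512-byte sectors while array_size and size are kept in
 * kilobytes, hence the >>1 and <<1 conversions below.
 */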
static int raid1_resize(mddev_t *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	mddev->array_size = sectors>>1;
	set_capacity(mddev->gendisk, mddev->array_size << 1);
	mddev->changed = 1;
	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
		mddev->recovery_cp = mddev->size << 1;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->size = mddev->array_size;
	mddev->resync_max_sectors = sectors;
	return 0;
}

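/*
 * raid1_reshape() changes the number of mirrors (used as .check_reshape
 * below). It builds a resized r1bio mempool and mirrors array up front,
 * then swaps them in under raise_barrier()/lower_barrier() so no normal
 * I/O is in flight while conf is being rewritten.
 */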
static int raid1_reshape(mddev_t *mddev)
{
	/* We need to:
	 * 1/ resize the r1bio_pool
	 * 2/ resize conf->mirrors
	 *
	 * We allocate a new r1bio_pool if we can.
	 * Then raise a device barrier and wait until all IO stops.
	 * Then resize conf->mirrors and swap in the new r1bio pool.
	 *
	 * At the same time, we "pack" the devices so that all the missing
	 * devices have the higher raid_disk numbers.
	 */
	mempool_t *newpool, *oldpool;
	struct pool_info *newpoolinfo;
	mirror_info_t *newmirrors;
	conf_t *conf = mddev_to_conf(mddev);
	int cnt, raid_disks;

	int d, d2;

	/* Cannot change chunk_size, layout, or level */
	if (mddev->chunk_size != mddev->new_chunk ||
	    mddev->layout != mddev->new_layout ||
	    mddev->level != mddev->new_level) {
		mddev->new_chunk = mddev->chunk_size;
		mddev->new_layout = mddev->layout;
		mddev->new_level = mddev->level;
		return -EINVAL;
	}

	raid_disks = mddev->raid_disks + mddev->delta_disks;

	if (raid_disks < conf->raid_disks) {
		cnt=0;
		for (d= 0; d < conf->raid_disks; d++)
			if (conf->mirrors[d].rdev)
				cnt++;
		if (cnt > raid_disks)
			return -EBUSY;
	}

	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
	if (!newpoolinfo)
		return -ENOMEM;
	newpoolinfo->mddev = mddev;
	newpoolinfo->raid_disks = raid_disks;

	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
				 r1bio_pool_free, newpoolinfo);
	if (!newpool) {
		kfree(newpoolinfo);
		return -ENOMEM;
	}
	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
	if (!newmirrors) {
		kfree(newpoolinfo);
		mempool_destroy(newpool);
		return -ENOMEM;
	}

	raise_barrier(conf);

	/* ok, everything is stopped */
	oldpool = conf->r1bio_pool;
	conf->r1bio_pool = newpool;

	for (d=d2=0; d < conf->raid_disks; d++)
		if (conf->mirrors[d].rdev) {
			conf->mirrors[d].rdev->raid_disk = d2;
			newmirrors[d2++].rdev = conf->mirrors[d].rdev;
		}
	kfree(conf->mirrors);
	conf->mirrors = newmirrors;
	kfree(conf->poolinfo);
	conf->poolinfo = newpoolinfo;

	mddev->degraded += (raid_disks - conf->raid_disks);
	conf->raid_disks = mddev->raid_disks = raid_disks;
	mddev->delta_disks = 0;

	conf->last_used = 0; /* just make sure it is in-range */
	lower_barrier(conf);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	mempool_destroy(oldpool);
	return 0;
}

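/*
 * raid1_quiesce(): the md core calls this through the .quiesce hook to
 * freeze or resume I/O on the array, conceptually:
 *
 *	pers->quiesce(mddev, 1);	suspend: raise_barrier() blocks new requests
 *	... work that needs a quiet array ...
 *	pers->quiesce(mddev, 0);	resume: lower_barrier()
 *
 * (illustrative call sequence only, not quoted from md.c)
 */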
static void raid1_quiesce(mddev_t *mddev, int state)
{
	conf_t *conf = mddev_to_conf(mddev);

	switch(state) {
	case 1:
		raise_barrier(conf);
		break;
	case 0:
		lower_barrier(conf);
		break;
	}
}


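/*
 * Personality operations table: registers raid1 as md level 1 and gives
 * the md core the hooks it dispatches array operations through.
 */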
static struct mdk_personality raid1_personality =
{
	.name		= "raid1",
	.level		= 1,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid1_add_disk,
	.hot_remove_disk= raid1_remove_disk,
	.spare_active	= raid1_spare_active,
	.sync_request	= sync_request,
	.resize		= raid1_resize,
	.check_reshape	= raid1_reshape,
	.quiesce	= raid1_quiesce,
};

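/*
 * Module init/exit simply register and unregister the personality.
 */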
static int __init raid_init(void)
{
	return register_md_personality(&raid1_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid1_personality);
}

module_init(raid_init);
module_exit(raid_exit);