Commit 62de608da0b0ab17d81a233b50d1e952b9816f69

Authored by NeilBrown
Committed by Linus Torvalds
1 parent bea2771871

[PATCH] md: Improve detection of lack of barrier support in raid1

Move the 'do barriers work' test further down in make_request() so that if the
first write to a raid1 array is a BIO_RW_BARRIER write, the barrier-support check
performed by the superblock write (triggered from md_write_start) has already run
and the right thing happens.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 1 changed file with 11 additions and 5 deletions
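
Before the full listing, a condensed sketch of the change in make_request() may help.
Previously the barriers_work test ran before md_write_start(); but md_write_start()
is what can trigger the first superblock write, and that write is what discovers
whether the underlying devices actually support barriers. Moving the test after
md_write_start() (and pairing it with md_write_end() for writes, since
md_write_start() has already been called) means that even a leading BIO_RW_BARRIER
request sees an up-to-date barriers_work value. The snippet below is an abbreviated
paraphrase of the hunk, not the complete function:

	static int make_request(request_queue_t *q, struct bio *bio)
	{
		mddev_t *mddev = q->queuedata;
		const int rw = bio_data_dir(bio);
		/* ... other local declarations omitted ... */

		/*
		 * Old placement (removed by this patch): the barrier-support
		 * test sat here, before md_write_start(), so barriers_work
		 * could still be stale if no superblock write had happened yet.
		 */

		md_write_start(mddev, bio);	/* may cause the first superblock
						 * write, which probes barrier support */

		/* New placement: barriers_work is now trustworthy. */
		if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
			if (rw == WRITE)
				md_write_end(mddev);	/* balance md_write_start() */
			bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
			return 0;
		}

		/* ... the rest of make_request() is unchanged ... */
		return 0;
	}

In the changed hunk within the listing below, removed lines are marked with '-'
(old line numbers) and added lines with '+' (new line numbers); unchanged context
shows both the old and new line numbers.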

1 /* 1 /*
2 * raid1.c : Multiple Devices driver for Linux 2 * raid1.c : Multiple Devices driver for Linux
3 * 3 *
4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat 4 * Copyright (C) 1999, 2000, 2001 Ingo Molnar, Red Hat
5 * 5 *
6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman 6 * Copyright (C) 1996, 1997, 1998 Ingo Molnar, Miguel de Icaza, Gadi Oxman
7 * 7 *
8 * RAID-1 management functions. 8 * RAID-1 management functions.
9 * 9 *
10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000 10 * Better read-balancing code written by Mika Kuoppala <miku@iki.fi>, 2000
11 * 11 *
12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk> 12 * Fixes to reconstruction by Jakob Østergaard" <jakob@ostenfeld.dk>
13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au> 13 * Various fixes by Neil Brown <neilb@cse.unsw.edu.au>
14 * 14 *
15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support 15 * Changes by Peter T. Breuer <ptb@it.uc3m.es> 31/1/2003 to support
16 * bitmapped intelligence in resync: 16 * bitmapped intelligence in resync:
17 * 17 *
18 * - bitmap marked during normal i/o 18 * - bitmap marked during normal i/o
19 * - bitmap used to skip nondirty blocks during sync 19 * - bitmap used to skip nondirty blocks during sync
20 * 20 *
21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology: 21 * Additions to bitmap code, (C) 2003-2004 Paul Clements, SteelEye Technology:
22 * - persistent bitmap code 22 * - persistent bitmap code
23 * 23 *
24 * This program is free software; you can redistribute it and/or modify 24 * This program is free software; you can redistribute it and/or modify
25 * it under the terms of the GNU General Public License as published by 25 * it under the terms of the GNU General Public License as published by
26 * the Free Software Foundation; either version 2, or (at your option) 26 * the Free Software Foundation; either version 2, or (at your option)
27 * any later version. 27 * any later version.
28 * 28 *
29 * You should have received a copy of the GNU General Public License 29 * You should have received a copy of the GNU General Public License
30 * (for example /usr/src/linux/COPYING); if not, write to the Free 30 * (for example /usr/src/linux/COPYING); if not, write to the Free
31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 31 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
32 */ 32 */
33 33
34 #include "dm-bio-list.h" 34 #include "dm-bio-list.h"
35 #include <linux/raid/raid1.h> 35 #include <linux/raid/raid1.h>
36 #include <linux/raid/bitmap.h> 36 #include <linux/raid/bitmap.h>
37 37
38 #define DEBUG 0 38 #define DEBUG 0
39 #if DEBUG 39 #if DEBUG
40 #define PRINTK(x...) printk(x) 40 #define PRINTK(x...) printk(x)
41 #else 41 #else
42 #define PRINTK(x...) 42 #define PRINTK(x...)
43 #endif 43 #endif
44 44
45 /* 45 /*
46 * Number of guaranteed r1bios in case of extreme VM load: 46 * Number of guaranteed r1bios in case of extreme VM load:
47 */ 47 */
48 #define NR_RAID1_BIOS 256 48 #define NR_RAID1_BIOS 256
49 49
50 50
51 static void unplug_slaves(mddev_t *mddev); 51 static void unplug_slaves(mddev_t *mddev);
52 52
53 static void allow_barrier(conf_t *conf); 53 static void allow_barrier(conf_t *conf);
54 static void lower_barrier(conf_t *conf); 54 static void lower_barrier(conf_t *conf);
55 55
56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) 56 static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data)
57 { 57 {
58 struct pool_info *pi = data; 58 struct pool_info *pi = data;
59 r1bio_t *r1_bio; 59 r1bio_t *r1_bio;
60 int size = offsetof(r1bio_t, bios[pi->raid_disks]); 60 int size = offsetof(r1bio_t, bios[pi->raid_disks]);
61 61
62 /* allocate a r1bio with room for raid_disks entries in the bios array */ 62 /* allocate a r1bio with room for raid_disks entries in the bios array */
63 r1_bio = kzalloc(size, gfp_flags); 63 r1_bio = kzalloc(size, gfp_flags);
64 if (!r1_bio) 64 if (!r1_bio)
65 unplug_slaves(pi->mddev); 65 unplug_slaves(pi->mddev);
66 66
67 return r1_bio; 67 return r1_bio;
68 } 68 }
69 69
70 static void r1bio_pool_free(void *r1_bio, void *data) 70 static void r1bio_pool_free(void *r1_bio, void *data)
71 { 71 {
72 kfree(r1_bio); 72 kfree(r1_bio);
73 } 73 }
74 74
75 #define RESYNC_BLOCK_SIZE (64*1024) 75 #define RESYNC_BLOCK_SIZE (64*1024)
76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE 76 //#define RESYNC_BLOCK_SIZE PAGE_SIZE
77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9) 77 #define RESYNC_SECTORS (RESYNC_BLOCK_SIZE >> 9)
78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 78 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
79 #define RESYNC_WINDOW (2048*1024) 79 #define RESYNC_WINDOW (2048*1024)
80 80
81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) 81 static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data)
82 { 82 {
83 struct pool_info *pi = data; 83 struct pool_info *pi = data;
84 struct page *page; 84 struct page *page;
85 r1bio_t *r1_bio; 85 r1bio_t *r1_bio;
86 struct bio *bio; 86 struct bio *bio;
87 int i, j; 87 int i, j;
88 88
89 r1_bio = r1bio_pool_alloc(gfp_flags, pi); 89 r1_bio = r1bio_pool_alloc(gfp_flags, pi);
90 if (!r1_bio) { 90 if (!r1_bio) {
91 unplug_slaves(pi->mddev); 91 unplug_slaves(pi->mddev);
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 /* 95 /*
96 * Allocate bios : 1 for reading, n-1 for writing 96 * Allocate bios : 1 for reading, n-1 for writing
97 */ 97 */
98 for (j = pi->raid_disks ; j-- ; ) { 98 for (j = pi->raid_disks ; j-- ; ) {
99 bio = bio_alloc(gfp_flags, RESYNC_PAGES); 99 bio = bio_alloc(gfp_flags, RESYNC_PAGES);
100 if (!bio) 100 if (!bio)
101 goto out_free_bio; 101 goto out_free_bio;
102 r1_bio->bios[j] = bio; 102 r1_bio->bios[j] = bio;
103 } 103 }
104 /* 104 /*
105 * Allocate RESYNC_PAGES data pages and attach them to 105 * Allocate RESYNC_PAGES data pages and attach them to
106 * the first bio. 106 * the first bio.
107 * If this is a user-requested check/repair, allocate 107 * If this is a user-requested check/repair, allocate
108 * RESYNC_PAGES for each bio. 108 * RESYNC_PAGES for each bio.
109 */ 109 */
110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) 110 if (test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery))
111 j = pi->raid_disks; 111 j = pi->raid_disks;
112 else 112 else
113 j = 1; 113 j = 1;
114 while(j--) { 114 while(j--) {
115 bio = r1_bio->bios[j]; 115 bio = r1_bio->bios[j];
116 for (i = 0; i < RESYNC_PAGES; i++) { 116 for (i = 0; i < RESYNC_PAGES; i++) {
117 page = alloc_page(gfp_flags); 117 page = alloc_page(gfp_flags);
118 if (unlikely(!page)) 118 if (unlikely(!page))
119 goto out_free_pages; 119 goto out_free_pages;
120 120
121 bio->bi_io_vec[i].bv_page = page; 121 bio->bi_io_vec[i].bv_page = page;
122 } 122 }
123 } 123 }
124 /* If not user-requests, copy the page pointers to all bios */ 124 /* If not user-requests, copy the page pointers to all bios */
125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) { 125 if (!test_bit(MD_RECOVERY_REQUESTED, &pi->mddev->recovery)) {
126 for (i=0; i<RESYNC_PAGES ; i++) 126 for (i=0; i<RESYNC_PAGES ; i++)
127 for (j=1; j<pi->raid_disks; j++) 127 for (j=1; j<pi->raid_disks; j++)
128 r1_bio->bios[j]->bi_io_vec[i].bv_page = 128 r1_bio->bios[j]->bi_io_vec[i].bv_page =
129 r1_bio->bios[0]->bi_io_vec[i].bv_page; 129 r1_bio->bios[0]->bi_io_vec[i].bv_page;
130 } 130 }
131 131
132 r1_bio->master_bio = NULL; 132 r1_bio->master_bio = NULL;
133 133
134 return r1_bio; 134 return r1_bio;
135 135
136 out_free_pages: 136 out_free_pages:
137 for (i=0; i < RESYNC_PAGES ; i++) 137 for (i=0; i < RESYNC_PAGES ; i++)
138 for (j=0 ; j < pi->raid_disks; j++) 138 for (j=0 ; j < pi->raid_disks; j++)
139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page); 139 safe_put_page(r1_bio->bios[j]->bi_io_vec[i].bv_page);
140 j = -1; 140 j = -1;
141 out_free_bio: 141 out_free_bio:
142 while ( ++j < pi->raid_disks ) 142 while ( ++j < pi->raid_disks )
143 bio_put(r1_bio->bios[j]); 143 bio_put(r1_bio->bios[j]);
144 r1bio_pool_free(r1_bio, data); 144 r1bio_pool_free(r1_bio, data);
145 return NULL; 145 return NULL;
146 } 146 }
147 147
148 static void r1buf_pool_free(void *__r1_bio, void *data) 148 static void r1buf_pool_free(void *__r1_bio, void *data)
149 { 149 {
150 struct pool_info *pi = data; 150 struct pool_info *pi = data;
151 int i,j; 151 int i,j;
152 r1bio_t *r1bio = __r1_bio; 152 r1bio_t *r1bio = __r1_bio;
153 153
154 for (i = 0; i < RESYNC_PAGES; i++) 154 for (i = 0; i < RESYNC_PAGES; i++)
155 for (j = pi->raid_disks; j-- ;) { 155 for (j = pi->raid_disks; j-- ;) {
156 if (j == 0 || 156 if (j == 0 ||
157 r1bio->bios[j]->bi_io_vec[i].bv_page != 157 r1bio->bios[j]->bi_io_vec[i].bv_page !=
158 r1bio->bios[0]->bi_io_vec[i].bv_page) 158 r1bio->bios[0]->bi_io_vec[i].bv_page)
159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page); 159 safe_put_page(r1bio->bios[j]->bi_io_vec[i].bv_page);
160 } 160 }
161 for (i=0 ; i < pi->raid_disks; i++) 161 for (i=0 ; i < pi->raid_disks; i++)
162 bio_put(r1bio->bios[i]); 162 bio_put(r1bio->bios[i]);
163 163
164 r1bio_pool_free(r1bio, data); 164 r1bio_pool_free(r1bio, data);
165 } 165 }
166 166
167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio) 167 static void put_all_bios(conf_t *conf, r1bio_t *r1_bio)
168 { 168 {
169 int i; 169 int i;
170 170
171 for (i = 0; i < conf->raid_disks; i++) { 171 for (i = 0; i < conf->raid_disks; i++) {
172 struct bio **bio = r1_bio->bios + i; 172 struct bio **bio = r1_bio->bios + i;
173 if (*bio && *bio != IO_BLOCKED) 173 if (*bio && *bio != IO_BLOCKED)
174 bio_put(*bio); 174 bio_put(*bio);
175 *bio = NULL; 175 *bio = NULL;
176 } 176 }
177 } 177 }
178 178
179 static void free_r1bio(r1bio_t *r1_bio) 179 static void free_r1bio(r1bio_t *r1_bio)
180 { 180 {
181 conf_t *conf = mddev_to_conf(r1_bio->mddev); 181 conf_t *conf = mddev_to_conf(r1_bio->mddev);
182 182
183 /* 183 /*
184 * Wake up any possible resync thread that waits for the device 184 * Wake up any possible resync thread that waits for the device
185 * to go idle. 185 * to go idle.
186 */ 186 */
187 allow_barrier(conf); 187 allow_barrier(conf);
188 188
189 put_all_bios(conf, r1_bio); 189 put_all_bios(conf, r1_bio);
190 mempool_free(r1_bio, conf->r1bio_pool); 190 mempool_free(r1_bio, conf->r1bio_pool);
191 } 191 }
192 192
193 static void put_buf(r1bio_t *r1_bio) 193 static void put_buf(r1bio_t *r1_bio)
194 { 194 {
195 conf_t *conf = mddev_to_conf(r1_bio->mddev); 195 conf_t *conf = mddev_to_conf(r1_bio->mddev);
196 int i; 196 int i;
197 197
198 for (i=0; i<conf->raid_disks; i++) { 198 for (i=0; i<conf->raid_disks; i++) {
199 struct bio *bio = r1_bio->bios[i]; 199 struct bio *bio = r1_bio->bios[i];
200 if (bio->bi_end_io) 200 if (bio->bi_end_io)
201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev); 201 rdev_dec_pending(conf->mirrors[i].rdev, r1_bio->mddev);
202 } 202 }
203 203
204 mempool_free(r1_bio, conf->r1buf_pool); 204 mempool_free(r1_bio, conf->r1buf_pool);
205 205
206 lower_barrier(conf); 206 lower_barrier(conf);
207 } 207 }
208 208
209 static void reschedule_retry(r1bio_t *r1_bio) 209 static void reschedule_retry(r1bio_t *r1_bio)
210 { 210 {
211 unsigned long flags; 211 unsigned long flags;
212 mddev_t *mddev = r1_bio->mddev; 212 mddev_t *mddev = r1_bio->mddev;
213 conf_t *conf = mddev_to_conf(mddev); 213 conf_t *conf = mddev_to_conf(mddev);
214 214
215 spin_lock_irqsave(&conf->device_lock, flags); 215 spin_lock_irqsave(&conf->device_lock, flags);
216 list_add(&r1_bio->retry_list, &conf->retry_list); 216 list_add(&r1_bio->retry_list, &conf->retry_list);
217 conf->nr_queued ++; 217 conf->nr_queued ++;
218 spin_unlock_irqrestore(&conf->device_lock, flags); 218 spin_unlock_irqrestore(&conf->device_lock, flags);
219 219
220 wake_up(&conf->wait_barrier); 220 wake_up(&conf->wait_barrier);
221 md_wakeup_thread(mddev->thread); 221 md_wakeup_thread(mddev->thread);
222 } 222 }
223 223
224 /* 224 /*
225 * raid_end_bio_io() is called when we have finished servicing a mirrored 225 * raid_end_bio_io() is called when we have finished servicing a mirrored
226 * operation and are ready to return a success/failure code to the buffer 226 * operation and are ready to return a success/failure code to the buffer
227 * cache layer. 227 * cache layer.
228 */ 228 */
229 static void raid_end_bio_io(r1bio_t *r1_bio) 229 static void raid_end_bio_io(r1bio_t *r1_bio)
230 { 230 {
231 struct bio *bio = r1_bio->master_bio; 231 struct bio *bio = r1_bio->master_bio;
232 232
233 /* if nobody has done the final endio yet, do it now */ 233 /* if nobody has done the final endio yet, do it now */
234 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 234 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
235 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", 235 PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n",
236 (bio_data_dir(bio) == WRITE) ? "write" : "read", 236 (bio_data_dir(bio) == WRITE) ? "write" : "read",
237 (unsigned long long) bio->bi_sector, 237 (unsigned long long) bio->bi_sector,
238 (unsigned long long) bio->bi_sector + 238 (unsigned long long) bio->bi_sector +
239 (bio->bi_size >> 9) - 1); 239 (bio->bi_size >> 9) - 1);
240 240
241 bio_endio(bio, bio->bi_size, 241 bio_endio(bio, bio->bi_size,
242 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); 242 test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
243 } 243 }
244 free_r1bio(r1_bio); 244 free_r1bio(r1_bio);
245 } 245 }
246 246
247 /* 247 /*
248 * Update disk head position estimator based on IRQ completion info. 248 * Update disk head position estimator based on IRQ completion info.
249 */ 249 */
250 static inline void update_head_pos(int disk, r1bio_t *r1_bio) 250 static inline void update_head_pos(int disk, r1bio_t *r1_bio)
251 { 251 {
252 conf_t *conf = mddev_to_conf(r1_bio->mddev); 252 conf_t *conf = mddev_to_conf(r1_bio->mddev);
253 253
254 conf->mirrors[disk].head_position = 254 conf->mirrors[disk].head_position =
255 r1_bio->sector + (r1_bio->sectors); 255 r1_bio->sector + (r1_bio->sectors);
256 } 256 }
257 257
258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error) 258 static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int error)
259 { 259 {
260 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 260 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
261 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 261 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
262 int mirror; 262 int mirror;
263 conf_t *conf = mddev_to_conf(r1_bio->mddev); 263 conf_t *conf = mddev_to_conf(r1_bio->mddev);
264 264
265 if (bio->bi_size) 265 if (bio->bi_size)
266 return 1; 266 return 1;
267 267
268 mirror = r1_bio->read_disk; 268 mirror = r1_bio->read_disk;
269 /* 269 /*
270 * this branch is our 'one mirror IO has finished' event handler: 270 * this branch is our 'one mirror IO has finished' event handler:
271 */ 271 */
272 update_head_pos(mirror, r1_bio); 272 update_head_pos(mirror, r1_bio);
273 273
274 if (uptodate || conf->working_disks <= 1) { 274 if (uptodate || conf->working_disks <= 1) {
275 /* 275 /*
276 * Set R1BIO_Uptodate in our master bio, so that 276 * Set R1BIO_Uptodate in our master bio, so that
277 * we will return a good error code for to the higher 277 * we will return a good error code for to the higher
278 * levels even if IO on some other mirrored buffer fails. 278 * levels even if IO on some other mirrored buffer fails.
279 * 279 *
280 * The 'master' represents the composite IO operation to 280 * The 'master' represents the composite IO operation to
281 * user-side. So if something waits for IO, then it will 281 * user-side. So if something waits for IO, then it will
282 * wait for the 'master' bio. 282 * wait for the 'master' bio.
283 */ 283 */
284 if (uptodate) 284 if (uptodate)
285 set_bit(R1BIO_Uptodate, &r1_bio->state); 285 set_bit(R1BIO_Uptodate, &r1_bio->state);
286 286
287 raid_end_bio_io(r1_bio); 287 raid_end_bio_io(r1_bio);
288 } else { 288 } else {
289 /* 289 /*
290 * oops, read error: 290 * oops, read error:
291 */ 291 */
292 char b[BDEVNAME_SIZE]; 292 char b[BDEVNAME_SIZE];
293 if (printk_ratelimit()) 293 if (printk_ratelimit())
294 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n", 294 printk(KERN_ERR "raid1: %s: rescheduling sector %llu\n",
295 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector); 295 bdevname(conf->mirrors[mirror].rdev->bdev,b), (unsigned long long)r1_bio->sector);
296 reschedule_retry(r1_bio); 296 reschedule_retry(r1_bio);
297 } 297 }
298 298
299 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 299 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
300 return 0; 300 return 0;
301 } 301 }
302 302
303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error) 303 static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int error)
304 { 304 {
305 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 305 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
306 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 306 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
307 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state); 307 int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
308 conf_t *conf = mddev_to_conf(r1_bio->mddev); 308 conf_t *conf = mddev_to_conf(r1_bio->mddev);
309 struct bio *to_put = NULL; 309 struct bio *to_put = NULL;
310 310
311 if (bio->bi_size) 311 if (bio->bi_size)
312 return 1; 312 return 1;
313 313
314 for (mirror = 0; mirror < conf->raid_disks; mirror++) 314 for (mirror = 0; mirror < conf->raid_disks; mirror++)
315 if (r1_bio->bios[mirror] == bio) 315 if (r1_bio->bios[mirror] == bio)
316 break; 316 break;
317 317
318 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { 318 if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
319 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); 319 set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
320 set_bit(R1BIO_BarrierRetry, &r1_bio->state); 320 set_bit(R1BIO_BarrierRetry, &r1_bio->state);
321 r1_bio->mddev->barriers_work = 0; 321 r1_bio->mddev->barriers_work = 0;
322 } else { 322 } else {
323 /* 323 /*
324 * this branch is our 'one mirror IO has finished' event handler: 324 * this branch is our 'one mirror IO has finished' event handler:
325 */ 325 */
326 r1_bio->bios[mirror] = NULL; 326 r1_bio->bios[mirror] = NULL;
327 to_put = bio; 327 to_put = bio;
328 if (!uptodate) { 328 if (!uptodate) {
329 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); 329 md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
330 /* an I/O failed, we can't clear the bitmap */ 330 /* an I/O failed, we can't clear the bitmap */
331 set_bit(R1BIO_Degraded, &r1_bio->state); 331 set_bit(R1BIO_Degraded, &r1_bio->state);
332 } else 332 } else
333 /* 333 /*
334 * Set R1BIO_Uptodate in our master bio, so that 334 * Set R1BIO_Uptodate in our master bio, so that
335 * we will return a good error code for to the higher 335 * we will return a good error code for to the higher
336 * levels even if IO on some other mirrored buffer fails. 336 * levels even if IO on some other mirrored buffer fails.
337 * 337 *
338 * The 'master' represents the composite IO operation to 338 * The 'master' represents the composite IO operation to
339 * user-side. So if something waits for IO, then it will 339 * user-side. So if something waits for IO, then it will
340 * wait for the 'master' bio. 340 * wait for the 'master' bio.
341 */ 341 */
342 set_bit(R1BIO_Uptodate, &r1_bio->state); 342 set_bit(R1BIO_Uptodate, &r1_bio->state);
343 343
344 update_head_pos(mirror, r1_bio); 344 update_head_pos(mirror, r1_bio);
345 345
346 if (behind) { 346 if (behind) {
347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) 347 if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
348 atomic_dec(&r1_bio->behind_remaining); 348 atomic_dec(&r1_bio->behind_remaining);
349 349
350 /* In behind mode, we ACK the master bio once the I/O has safely 350 /* In behind mode, we ACK the master bio once the I/O has safely
351 * reached all non-writemostly disks. Setting the Returned bit 351 * reached all non-writemostly disks. Setting the Returned bit
352 * ensures that this gets done only once -- we don't ever want to 352 * ensures that this gets done only once -- we don't ever want to
353 * return -EIO here, instead we'll wait */ 353 * return -EIO here, instead we'll wait */
354 354
355 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && 355 if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
356 test_bit(R1BIO_Uptodate, &r1_bio->state)) { 356 test_bit(R1BIO_Uptodate, &r1_bio->state)) {
357 /* Maybe we can return now */ 357 /* Maybe we can return now */
358 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { 358 if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
359 struct bio *mbio = r1_bio->master_bio; 359 struct bio *mbio = r1_bio->master_bio;
360 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", 360 PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
361 (unsigned long long) mbio->bi_sector, 361 (unsigned long long) mbio->bi_sector,
362 (unsigned long long) mbio->bi_sector + 362 (unsigned long long) mbio->bi_sector +
363 (mbio->bi_size >> 9) - 1); 363 (mbio->bi_size >> 9) - 1);
364 bio_endio(mbio, mbio->bi_size, 0); 364 bio_endio(mbio, mbio->bi_size, 0);
365 } 365 }
366 } 366 }
367 } 367 }
368 } 368 }
369 /* 369 /*
370 * 370 *
371 * Let's see if all mirrored write operations have finished 371 * Let's see if all mirrored write operations have finished
372 * already. 372 * already.
373 */ 373 */
374 if (atomic_dec_and_test(&r1_bio->remaining)) { 374 if (atomic_dec_and_test(&r1_bio->remaining)) {
375 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 375 if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
376 reschedule_retry(r1_bio); 376 reschedule_retry(r1_bio);
377 /* Don't dec_pending yet, we want to hold 377 /* Don't dec_pending yet, we want to hold
378 * the reference over the retry 378 * the reference over the retry
379 */ 379 */
380 goto out; 380 goto out;
381 } 381 }
382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { 382 if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
383 /* free extra copy of the data pages */ 383 /* free extra copy of the data pages */
384 int i = bio->bi_vcnt; 384 int i = bio->bi_vcnt;
385 while (i--) 385 while (i--)
386 safe_put_page(bio->bi_io_vec[i].bv_page); 386 safe_put_page(bio->bi_io_vec[i].bv_page);
387 } 387 }
388 /* clear the bitmap if all writes complete successfully */ 388 /* clear the bitmap if all writes complete successfully */
389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, 389 bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
390 r1_bio->sectors, 390 r1_bio->sectors,
391 !test_bit(R1BIO_Degraded, &r1_bio->state), 391 !test_bit(R1BIO_Degraded, &r1_bio->state),
392 behind); 392 behind);
393 md_write_end(r1_bio->mddev); 393 md_write_end(r1_bio->mddev);
394 raid_end_bio_io(r1_bio); 394 raid_end_bio_io(r1_bio);
395 } 395 }
396 396
397 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); 397 rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
398 out: 398 out:
399 if (to_put) 399 if (to_put)
400 bio_put(to_put); 400 bio_put(to_put);
401 401
402 return 0; 402 return 0;
403 } 403 }
404 404
405 405
406 /* 406 /*
407 * This routine returns the disk from which the requested read should 407 * This routine returns the disk from which the requested read should
408 * be done. There is a per-array 'next expected sequential IO' sector 408 * be done. There is a per-array 'next expected sequential IO' sector
409 * number - if this matches on the next IO then we use the last disk. 409 * number - if this matches on the next IO then we use the last disk.
410 * There is also a per-disk 'last know head position' sector that is 410 * There is also a per-disk 'last know head position' sector that is
411 * maintained from IRQ contexts, both the normal and the resync IO 411 * maintained from IRQ contexts, both the normal and the resync IO
412 * completion handlers update this position correctly. If there is no 412 * completion handlers update this position correctly. If there is no
413 * perfect sequential match then we pick the disk whose head is closest. 413 * perfect sequential match then we pick the disk whose head is closest.
414 * 414 *
415 * If there are 2 mirrors in the same 2 devices, performance degrades 415 * If there are 2 mirrors in the same 2 devices, performance degrades
416 * because position is mirror, not device based. 416 * because position is mirror, not device based.
417 * 417 *
418 * The rdev for the device selected will have nr_pending incremented. 418 * The rdev for the device selected will have nr_pending incremented.
419 */ 419 */
420 static int read_balance(conf_t *conf, r1bio_t *r1_bio) 420 static int read_balance(conf_t *conf, r1bio_t *r1_bio)
421 { 421 {
422 const unsigned long this_sector = r1_bio->sector; 422 const unsigned long this_sector = r1_bio->sector;
423 int new_disk = conf->last_used, disk = new_disk; 423 int new_disk = conf->last_used, disk = new_disk;
424 int wonly_disk = -1; 424 int wonly_disk = -1;
425 const int sectors = r1_bio->sectors; 425 const int sectors = r1_bio->sectors;
426 sector_t new_distance, current_distance; 426 sector_t new_distance, current_distance;
427 mdk_rdev_t *rdev; 427 mdk_rdev_t *rdev;
428 428
429 rcu_read_lock(); 429 rcu_read_lock();
430 /* 430 /*
431 * Check if we can balance. We can balance on the whole 431 * Check if we can balance. We can balance on the whole
432 * device if no resync is going on, or below the resync window. 432 * device if no resync is going on, or below the resync window.
433 * We take the first readable disk when above the resync window. 433 * We take the first readable disk when above the resync window.
434 */ 434 */
435 retry: 435 retry:
436 if (conf->mddev->recovery_cp < MaxSector && 436 if (conf->mddev->recovery_cp < MaxSector &&
437 (this_sector + sectors >= conf->next_resync)) { 437 (this_sector + sectors >= conf->next_resync)) {
438 /* Choose the first operation device, for consistancy */ 438 /* Choose the first operation device, for consistancy */
439 new_disk = 0; 439 new_disk = 0;
440 440
441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 441 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
442 r1_bio->bios[new_disk] == IO_BLOCKED || 442 r1_bio->bios[new_disk] == IO_BLOCKED ||
443 !rdev || !test_bit(In_sync, &rdev->flags) 443 !rdev || !test_bit(In_sync, &rdev->flags)
444 || test_bit(WriteMostly, &rdev->flags); 444 || test_bit(WriteMostly, &rdev->flags);
445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { 445 rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) {
446 446
447 if (rdev && test_bit(In_sync, &rdev->flags) && 447 if (rdev && test_bit(In_sync, &rdev->flags) &&
448 r1_bio->bios[new_disk] != IO_BLOCKED) 448 r1_bio->bios[new_disk] != IO_BLOCKED)
449 wonly_disk = new_disk; 449 wonly_disk = new_disk;
450 450
451 if (new_disk == conf->raid_disks - 1) { 451 if (new_disk == conf->raid_disks - 1) {
452 new_disk = wonly_disk; 452 new_disk = wonly_disk;
453 break; 453 break;
454 } 454 }
455 } 455 }
456 goto rb_out; 456 goto rb_out;
457 } 457 }
458 458
459 459
460 /* make sure the disk is operational */ 460 /* make sure the disk is operational */
461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 461 for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
462 r1_bio->bios[new_disk] == IO_BLOCKED || 462 r1_bio->bios[new_disk] == IO_BLOCKED ||
463 !rdev || !test_bit(In_sync, &rdev->flags) || 463 !rdev || !test_bit(In_sync, &rdev->flags) ||
464 test_bit(WriteMostly, &rdev->flags); 464 test_bit(WriteMostly, &rdev->flags);
465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { 465 rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) {
466 466
467 if (rdev && test_bit(In_sync, &rdev->flags) && 467 if (rdev && test_bit(In_sync, &rdev->flags) &&
468 r1_bio->bios[new_disk] != IO_BLOCKED) 468 r1_bio->bios[new_disk] != IO_BLOCKED)
469 wonly_disk = new_disk; 469 wonly_disk = new_disk;
470 470
471 if (new_disk <= 0) 471 if (new_disk <= 0)
472 new_disk = conf->raid_disks; 472 new_disk = conf->raid_disks;
473 new_disk--; 473 new_disk--;
474 if (new_disk == disk) { 474 if (new_disk == disk) {
475 new_disk = wonly_disk; 475 new_disk = wonly_disk;
476 break; 476 break;
477 } 477 }
478 } 478 }
479 479
480 if (new_disk < 0) 480 if (new_disk < 0)
481 goto rb_out; 481 goto rb_out;
482 482
483 disk = new_disk; 483 disk = new_disk;
484 /* now disk == new_disk == starting point for search */ 484 /* now disk == new_disk == starting point for search */
485 485
486 /* 486 /*
487 * Don't change to another disk for sequential reads: 487 * Don't change to another disk for sequential reads:
488 */ 488 */
489 if (conf->next_seq_sect == this_sector) 489 if (conf->next_seq_sect == this_sector)
490 goto rb_out; 490 goto rb_out;
491 if (this_sector == conf->mirrors[new_disk].head_position) 491 if (this_sector == conf->mirrors[new_disk].head_position)
492 goto rb_out; 492 goto rb_out;
493 493
494 current_distance = abs(this_sector - conf->mirrors[disk].head_position); 494 current_distance = abs(this_sector - conf->mirrors[disk].head_position);
495 495
496 /* Find the disk whose head is closest */ 496 /* Find the disk whose head is closest */
497 497
498 do { 498 do {
499 if (disk <= 0) 499 if (disk <= 0)
500 disk = conf->raid_disks; 500 disk = conf->raid_disks;
501 disk--; 501 disk--;
502 502
503 rdev = rcu_dereference(conf->mirrors[disk].rdev); 503 rdev = rcu_dereference(conf->mirrors[disk].rdev);
504 504
505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || 505 if (!rdev || r1_bio->bios[disk] == IO_BLOCKED ||
506 !test_bit(In_sync, &rdev->flags) || 506 !test_bit(In_sync, &rdev->flags) ||
507 test_bit(WriteMostly, &rdev->flags)) 507 test_bit(WriteMostly, &rdev->flags))
508 continue; 508 continue;
509 509
510 if (!atomic_read(&rdev->nr_pending)) { 510 if (!atomic_read(&rdev->nr_pending)) {
511 new_disk = disk; 511 new_disk = disk;
512 break; 512 break;
513 } 513 }
514 new_distance = abs(this_sector - conf->mirrors[disk].head_position); 514 new_distance = abs(this_sector - conf->mirrors[disk].head_position);
515 if (new_distance < current_distance) { 515 if (new_distance < current_distance) {
516 current_distance = new_distance; 516 current_distance = new_distance;
517 new_disk = disk; 517 new_disk = disk;
518 } 518 }
519 } while (disk != conf->last_used); 519 } while (disk != conf->last_used);
520 520
521 rb_out: 521 rb_out:
522 522
523 523
524 if (new_disk >= 0) { 524 if (new_disk >= 0) {
525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev); 525 rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
526 if (!rdev) 526 if (!rdev)
527 goto retry; 527 goto retry;
528 atomic_inc(&rdev->nr_pending); 528 atomic_inc(&rdev->nr_pending);
529 if (!test_bit(In_sync, &rdev->flags)) { 529 if (!test_bit(In_sync, &rdev->flags)) {
530 /* cannot risk returning a device that failed 530 /* cannot risk returning a device that failed
531 * before we inc'ed nr_pending 531 * before we inc'ed nr_pending
532 */ 532 */
533 rdev_dec_pending(rdev, conf->mddev); 533 rdev_dec_pending(rdev, conf->mddev);
534 goto retry; 534 goto retry;
535 } 535 }
536 conf->next_seq_sect = this_sector + sectors; 536 conf->next_seq_sect = this_sector + sectors;
537 conf->last_used = new_disk; 537 conf->last_used = new_disk;
538 } 538 }
539 rcu_read_unlock(); 539 rcu_read_unlock();
540 540
541 return new_disk; 541 return new_disk;
542 } 542 }
543 543
544 static void unplug_slaves(mddev_t *mddev) 544 static void unplug_slaves(mddev_t *mddev)
545 { 545 {
546 conf_t *conf = mddev_to_conf(mddev); 546 conf_t *conf = mddev_to_conf(mddev);
547 int i; 547 int i;
548 548
549 rcu_read_lock(); 549 rcu_read_lock();
550 for (i=0; i<mddev->raid_disks; i++) { 550 for (i=0; i<mddev->raid_disks; i++) {
551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 551 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 552 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
553 request_queue_t *r_queue = bdev_get_queue(rdev->bdev); 553 request_queue_t *r_queue = bdev_get_queue(rdev->bdev);
554 554
555 atomic_inc(&rdev->nr_pending); 555 atomic_inc(&rdev->nr_pending);
556 rcu_read_unlock(); 556 rcu_read_unlock();
557 557
558 if (r_queue->unplug_fn) 558 if (r_queue->unplug_fn)
559 r_queue->unplug_fn(r_queue); 559 r_queue->unplug_fn(r_queue);
560 560
561 rdev_dec_pending(rdev, mddev); 561 rdev_dec_pending(rdev, mddev);
562 rcu_read_lock(); 562 rcu_read_lock();
563 } 563 }
564 } 564 }
565 rcu_read_unlock(); 565 rcu_read_unlock();
566 } 566 }
567 567
568 static void raid1_unplug(request_queue_t *q) 568 static void raid1_unplug(request_queue_t *q)
569 { 569 {
570 mddev_t *mddev = q->queuedata; 570 mddev_t *mddev = q->queuedata;
571 571
572 unplug_slaves(mddev); 572 unplug_slaves(mddev);
573 md_wakeup_thread(mddev->thread); 573 md_wakeup_thread(mddev->thread);
574 } 574 }
575 575
576 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk, 576 static int raid1_issue_flush(request_queue_t *q, struct gendisk *disk,
577 sector_t *error_sector) 577 sector_t *error_sector)
578 { 578 {
579 mddev_t *mddev = q->queuedata; 579 mddev_t *mddev = q->queuedata;
580 conf_t *conf = mddev_to_conf(mddev); 580 conf_t *conf = mddev_to_conf(mddev);
581 int i, ret = 0; 581 int i, ret = 0;
582 582
583 rcu_read_lock(); 583 rcu_read_lock();
584 for (i=0; i<mddev->raid_disks && ret == 0; i++) { 584 for (i=0; i<mddev->raid_disks && ret == 0; i++) {
585 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); 585 mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
586 if (rdev && !test_bit(Faulty, &rdev->flags)) { 586 if (rdev && !test_bit(Faulty, &rdev->flags)) {
587 struct block_device *bdev = rdev->bdev; 587 struct block_device *bdev = rdev->bdev;
588 request_queue_t *r_queue = bdev_get_queue(bdev); 588 request_queue_t *r_queue = bdev_get_queue(bdev);
589 589
590 if (!r_queue->issue_flush_fn) 590 if (!r_queue->issue_flush_fn)
591 ret = -EOPNOTSUPP; 591 ret = -EOPNOTSUPP;
592 else { 592 else {
593 atomic_inc(&rdev->nr_pending); 593 atomic_inc(&rdev->nr_pending);
594 rcu_read_unlock(); 594 rcu_read_unlock();
595 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk, 595 ret = r_queue->issue_flush_fn(r_queue, bdev->bd_disk,
596 error_sector); 596 error_sector);
597 rdev_dec_pending(rdev, mddev); 597 rdev_dec_pending(rdev, mddev);
598 rcu_read_lock(); 598 rcu_read_lock();
599 } 599 }
600 } 600 }
601 } 601 }
602 rcu_read_unlock(); 602 rcu_read_unlock();
603 return ret; 603 return ret;
604 } 604 }
605 605
606 /* Barriers.... 606 /* Barriers....
607 * Sometimes we need to suspend IO while we do something else, 607 * Sometimes we need to suspend IO while we do something else,
608 * either some resync/recovery, or reconfigure the array. 608 * either some resync/recovery, or reconfigure the array.
609 * To do this we raise a 'barrier'. 609 * To do this we raise a 'barrier'.
610 * The 'barrier' is a counter that can be raised multiple times 610 * The 'barrier' is a counter that can be raised multiple times
611 * to count how many activities are happening which preclude 611 * to count how many activities are happening which preclude
612 * normal IO. 612 * normal IO.
613 * We can only raise the barrier if there is no pending IO. 613 * We can only raise the barrier if there is no pending IO.
614 * i.e. if nr_pending == 0. 614 * i.e. if nr_pending == 0.
615 * We choose only to raise the barrier if no-one is waiting for the 615 * We choose only to raise the barrier if no-one is waiting for the
616 * barrier to go down. This means that as soon as an IO request 616 * barrier to go down. This means that as soon as an IO request
617 * is ready, no other operations which require a barrier will start 617 * is ready, no other operations which require a barrier will start
618 * until the IO request has had a chance. 618 * until the IO request has had a chance.
619 * 619 *
620 * So: regular IO calls 'wait_barrier'. When that returns there 620 * So: regular IO calls 'wait_barrier'. When that returns there
621 * is no backgroup IO happening, It must arrange to call 621 * is no backgroup IO happening, It must arrange to call
622 * allow_barrier when it has finished its IO. 622 * allow_barrier when it has finished its IO.
623 * backgroup IO calls must call raise_barrier. Once that returns 623 * backgroup IO calls must call raise_barrier. Once that returns
624 * there is no normal IO happeing. It must arrange to call 624 * there is no normal IO happeing. It must arrange to call
625 * lower_barrier when the particular background IO completes. 625 * lower_barrier when the particular background IO completes.
626 */ 626 */
627 #define RESYNC_DEPTH 32 627 #define RESYNC_DEPTH 32
628 628
629 static void raise_barrier(conf_t *conf) 629 static void raise_barrier(conf_t *conf)
630 { 630 {
631 spin_lock_irq(&conf->resync_lock); 631 spin_lock_irq(&conf->resync_lock);
632 632
633 /* Wait until no block IO is waiting */ 633 /* Wait until no block IO is waiting */
634 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, 634 wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting,
635 conf->resync_lock, 635 conf->resync_lock,
636 raid1_unplug(conf->mddev->queue)); 636 raid1_unplug(conf->mddev->queue));
637 637
638 /* block any new IO from starting */ 638 /* block any new IO from starting */
639 conf->barrier++; 639 conf->barrier++;
640 640
641 /* No wait for all pending IO to complete */ 641 /* No wait for all pending IO to complete */
642 wait_event_lock_irq(conf->wait_barrier, 642 wait_event_lock_irq(conf->wait_barrier,
643 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 643 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
644 conf->resync_lock, 644 conf->resync_lock,
645 raid1_unplug(conf->mddev->queue)); 645 raid1_unplug(conf->mddev->queue));
646 646
647 spin_unlock_irq(&conf->resync_lock); 647 spin_unlock_irq(&conf->resync_lock);
648 } 648 }
649 649
650 static void lower_barrier(conf_t *conf) 650 static void lower_barrier(conf_t *conf)
651 { 651 {
652 unsigned long flags; 652 unsigned long flags;
653 spin_lock_irqsave(&conf->resync_lock, flags); 653 spin_lock_irqsave(&conf->resync_lock, flags);
654 conf->barrier--; 654 conf->barrier--;
655 spin_unlock_irqrestore(&conf->resync_lock, flags); 655 spin_unlock_irqrestore(&conf->resync_lock, flags);
656 wake_up(&conf->wait_barrier); 656 wake_up(&conf->wait_barrier);
657 } 657 }
658 658
659 static void wait_barrier(conf_t *conf) 659 static void wait_barrier(conf_t *conf)
660 { 660 {
661 spin_lock_irq(&conf->resync_lock); 661 spin_lock_irq(&conf->resync_lock);
662 if (conf->barrier) { 662 if (conf->barrier) {
663 conf->nr_waiting++; 663 conf->nr_waiting++;
664 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 664 wait_event_lock_irq(conf->wait_barrier, !conf->barrier,
665 conf->resync_lock, 665 conf->resync_lock,
666 raid1_unplug(conf->mddev->queue)); 666 raid1_unplug(conf->mddev->queue));
667 conf->nr_waiting--; 667 conf->nr_waiting--;
668 } 668 }
669 conf->nr_pending++; 669 conf->nr_pending++;
670 spin_unlock_irq(&conf->resync_lock); 670 spin_unlock_irq(&conf->resync_lock);
671 } 671 }
672 672
673 static void allow_barrier(conf_t *conf) 673 static void allow_barrier(conf_t *conf)
674 { 674 {
675 unsigned long flags; 675 unsigned long flags;
676 spin_lock_irqsave(&conf->resync_lock, flags); 676 spin_lock_irqsave(&conf->resync_lock, flags);
677 conf->nr_pending--; 677 conf->nr_pending--;
678 spin_unlock_irqrestore(&conf->resync_lock, flags); 678 spin_unlock_irqrestore(&conf->resync_lock, flags);
679 wake_up(&conf->wait_barrier); 679 wake_up(&conf->wait_barrier);
680 } 680 }
681 681
682 static void freeze_array(conf_t *conf) 682 static void freeze_array(conf_t *conf)
683 { 683 {
684 /* stop syncio and normal IO and wait for everything to 684 /* stop syncio and normal IO and wait for everything to
685 * go quite. 685 * go quite.
686 * We increment barrier and nr_waiting, and then 686 * We increment barrier and nr_waiting, and then
687 * wait until barrier+nr_pending match nr_queued+2 687 * wait until barrier+nr_pending match nr_queued+2
688 */ 688 */
689 spin_lock_irq(&conf->resync_lock); 689 spin_lock_irq(&conf->resync_lock);
690 conf->barrier++; 690 conf->barrier++;
691 conf->nr_waiting++; 691 conf->nr_waiting++;
692 wait_event_lock_irq(conf->wait_barrier, 692 wait_event_lock_irq(conf->wait_barrier,
693 conf->barrier+conf->nr_pending == conf->nr_queued+2, 693 conf->barrier+conf->nr_pending == conf->nr_queued+2,
694 conf->resync_lock, 694 conf->resync_lock,
695 raid1_unplug(conf->mddev->queue)); 695 raid1_unplug(conf->mddev->queue));
696 spin_unlock_irq(&conf->resync_lock); 696 spin_unlock_irq(&conf->resync_lock);
697 } 697 }
698 static void unfreeze_array(conf_t *conf) 698 static void unfreeze_array(conf_t *conf)
699 { 699 {
700 /* reverse the effect of the freeze */ 700 /* reverse the effect of the freeze */
701 spin_lock_irq(&conf->resync_lock); 701 spin_lock_irq(&conf->resync_lock);
702 conf->barrier--; 702 conf->barrier--;
703 conf->nr_waiting--; 703 conf->nr_waiting--;
704 wake_up(&conf->wait_barrier); 704 wake_up(&conf->wait_barrier);
705 spin_unlock_irq(&conf->resync_lock); 705 spin_unlock_irq(&conf->resync_lock);
706 } 706 }
707 707
708 708
709 /* duplicate the data pages for behind I/O */ 709 /* duplicate the data pages for behind I/O */
710 static struct page **alloc_behind_pages(struct bio *bio) 710 static struct page **alloc_behind_pages(struct bio *bio)
711 { 711 {
712 int i; 712 int i;
713 struct bio_vec *bvec; 713 struct bio_vec *bvec;
714 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), 714 struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *),
715 GFP_NOIO); 715 GFP_NOIO);
716 if (unlikely(!pages)) 716 if (unlikely(!pages))
717 goto do_sync_io; 717 goto do_sync_io;
718 718
719 bio_for_each_segment(bvec, bio, i) { 719 bio_for_each_segment(bvec, bio, i) {
720 pages[i] = alloc_page(GFP_NOIO); 720 pages[i] = alloc_page(GFP_NOIO);
721 if (unlikely(!pages[i])) 721 if (unlikely(!pages[i]))
722 goto do_sync_io; 722 goto do_sync_io;
723 memcpy(kmap(pages[i]) + bvec->bv_offset, 723 memcpy(kmap(pages[i]) + bvec->bv_offset,
724 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); 724 kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
725 kunmap(pages[i]); 725 kunmap(pages[i]);
726 kunmap(bvec->bv_page); 726 kunmap(bvec->bv_page);
727 } 727 }
728 728
729 return pages; 729 return pages;
730 730
731 do_sync_io: 731 do_sync_io:
732 if (pages) 732 if (pages)
733 for (i = 0; i < bio->bi_vcnt && pages[i]; i++) 733 for (i = 0; i < bio->bi_vcnt && pages[i]; i++)
734 put_page(pages[i]); 734 put_page(pages[i]);
735 kfree(pages); 735 kfree(pages);
736 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); 736 PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
737 return NULL; 737 return NULL;
738 } 738 }
739 739
740 static int make_request(request_queue_t *q, struct bio * bio) 740 static int make_request(request_queue_t *q, struct bio * bio)
741 { 741 {
742 mddev_t *mddev = q->queuedata; 742 mddev_t *mddev = q->queuedata;
743 conf_t *conf = mddev_to_conf(mddev); 743 conf_t *conf = mddev_to_conf(mddev);
744 mirror_info_t *mirror; 744 mirror_info_t *mirror;
745 r1bio_t *r1_bio; 745 r1bio_t *r1_bio;
746 struct bio *read_bio; 746 struct bio *read_bio;
747 int i, targets = 0, disks; 747 int i, targets = 0, disks;
748 mdk_rdev_t *rdev; 748 mdk_rdev_t *rdev;
749 struct bitmap *bitmap = mddev->bitmap; 749 struct bitmap *bitmap = mddev->bitmap;
750 unsigned long flags; 750 unsigned long flags;
751 struct bio_list bl; 751 struct bio_list bl;
752 struct page **behind_pages = NULL; 752 struct page **behind_pages = NULL;
753 const int rw = bio_data_dir(bio); 753 const int rw = bio_data_dir(bio);
754 int do_barriers; 754 int do_barriers;
755 755
- 756 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
- 757 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
- 758 return 0;
- 759 }
- 760
761 /* 756 /*
762 * Register the new request and wait if the reconstruction 757 * Register the new request and wait if the reconstruction
763 * thread has put up a bar for new requests. 758 * thread has put up a bar for new requests.
764 * Continue immediately if no resync is active currently. 759 * Continue immediately if no resync is active currently.
+ 760 * We test barriers_work *after* md_write_start as md_write_start
+ 761 * may cause the first superblock write, and that will check out
+ 762 * if barriers work.
765 */ 763 */
+ 764
766 md_write_start(mddev, bio); /* wait on superblock update early */ 765 md_write_start(mddev, bio); /* wait on superblock update early */
+ 766
+ 767 if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
+ 768 if (rw == WRITE)
+ 769 md_write_end(mddev);
+ 770 bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
+ 771 return 0;
+ 772 }
767 773
768 wait_barrier(conf); 774 wait_barrier(conf);
769 775
770 disk_stat_inc(mddev->gendisk, ios[rw]); 776 disk_stat_inc(mddev->gendisk, ios[rw]);
771 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio)); 777 disk_stat_add(mddev->gendisk, sectors[rw], bio_sectors(bio));
772 778
773 /* 779 /*
774 * make_request() can abort the operation when READA is being 780 * make_request() can abort the operation when READA is being
775 * used and no empty request is available. 781 * used and no empty request is available.
776 * 782 *
777 */ 783 */
778 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO); 784 r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
779 785
780 r1_bio->master_bio = bio; 786 r1_bio->master_bio = bio;
781 r1_bio->sectors = bio->bi_size >> 9; 787 r1_bio->sectors = bio->bi_size >> 9;
782 r1_bio->state = 0; 788 r1_bio->state = 0;
783 r1_bio->mddev = mddev; 789 r1_bio->mddev = mddev;
784 r1_bio->sector = bio->bi_sector; 790 r1_bio->sector = bio->bi_sector;
785 791
786 if (rw == READ) { 792 if (rw == READ) {
787 /* 793 /*
788 * read balancing logic: 794 * read balancing logic:
789 */ 795 */
790 int rdisk = read_balance(conf, r1_bio); 796 int rdisk = read_balance(conf, r1_bio);
791 797
792 if (rdisk < 0) { 798 if (rdisk < 0) {
793 /* couldn't find anywhere to read from */ 799 /* couldn't find anywhere to read from */
794 raid_end_bio_io(r1_bio); 800 raid_end_bio_io(r1_bio);
795 return 0; 801 return 0;
796 } 802 }
797 mirror = conf->mirrors + rdisk; 803 mirror = conf->mirrors + rdisk;
798 804
799 r1_bio->read_disk = rdisk; 805 r1_bio->read_disk = rdisk;
800 806
801 read_bio = bio_clone(bio, GFP_NOIO); 807 read_bio = bio_clone(bio, GFP_NOIO);
802 808
803 r1_bio->bios[rdisk] = read_bio; 809 r1_bio->bios[rdisk] = read_bio;
804 810
805 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset; 811 read_bio->bi_sector = r1_bio->sector + mirror->rdev->data_offset;
806 read_bio->bi_bdev = mirror->rdev->bdev; 812 read_bio->bi_bdev = mirror->rdev->bdev;
807 read_bio->bi_end_io = raid1_end_read_request; 813 read_bio->bi_end_io = raid1_end_read_request;
808 read_bio->bi_rw = READ; 814 read_bio->bi_rw = READ;
809 read_bio->bi_private = r1_bio; 815 read_bio->bi_private = r1_bio;
810 816
811 generic_make_request(read_bio); 817 generic_make_request(read_bio);
812 return 0; 818 return 0;
813 } 819 }
814 820
815 /* 821 /*
816 * WRITE: 822 * WRITE:
817 */ 823 */
818 /* first select target devices under spinlock and 824 /* first select target devices under spinlock and
819 * inc refcount on their rdev. Record them by setting 825 * inc refcount on their rdev. Record them by setting
820 * bios[x] to bio 826 * bios[x] to bio
821 */ 827 */
822 disks = conf->raid_disks; 828 disks = conf->raid_disks;
823 #if 0 829 #if 0
824 { static int first=1; 830 { static int first=1;
825 if (first) printk("First Write sector %llu disks %d\n", 831 if (first) printk("First Write sector %llu disks %d\n",
826 (unsigned long long)r1_bio->sector, disks); 832 (unsigned long long)r1_bio->sector, disks);
827 first = 0; 833 first = 0;
828 } 834 }
829 #endif 835 #endif
830 rcu_read_lock(); 836 rcu_read_lock();
831 for (i = 0; i < disks; i++) { 837 for (i = 0; i < disks; i++) {
832 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL && 838 if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
833 !test_bit(Faulty, &rdev->flags)) { 839 !test_bit(Faulty, &rdev->flags)) {
834 atomic_inc(&rdev->nr_pending); 840 atomic_inc(&rdev->nr_pending);
835 if (test_bit(Faulty, &rdev->flags)) { 841 if (test_bit(Faulty, &rdev->flags)) {
836 rdev_dec_pending(rdev, mddev); 842 rdev_dec_pending(rdev, mddev);
837 r1_bio->bios[i] = NULL; 843 r1_bio->bios[i] = NULL;
838 } else 844 } else
839 r1_bio->bios[i] = bio; 845 r1_bio->bios[i] = bio;
840 targets++; 846 targets++;
841 } else 847 } else
842 r1_bio->bios[i] = NULL; 848 r1_bio->bios[i] = NULL;
843 } 849 }
844 rcu_read_unlock(); 850 rcu_read_unlock();
845 851
846 BUG_ON(targets == 0); /* we never fail the last device */ 852 BUG_ON(targets == 0); /* we never fail the last device */
847 853
848 if (targets < conf->raid_disks) { 854 if (targets < conf->raid_disks) {
849 /* array is degraded, we will not clear the bitmap 855 /* array is degraded, we will not clear the bitmap
850 * on I/O completion (see raid1_end_write_request) */ 856 * on I/O completion (see raid1_end_write_request) */
851 set_bit(R1BIO_Degraded, &r1_bio->state); 857 set_bit(R1BIO_Degraded, &r1_bio->state);
852 } 858 }
853 859
854 /* do behind I/O ? */ 860 /* do behind I/O ? */
855 if (bitmap && 861 if (bitmap &&
856 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && 862 atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind &&
857 (behind_pages = alloc_behind_pages(bio)) != NULL) 863 (behind_pages = alloc_behind_pages(bio)) != NULL)
858 set_bit(R1BIO_BehindIO, &r1_bio->state); 864 set_bit(R1BIO_BehindIO, &r1_bio->state);
859 865
860 atomic_set(&r1_bio->remaining, 0); 866 atomic_set(&r1_bio->remaining, 0);
861 atomic_set(&r1_bio->behind_remaining, 0); 867 atomic_set(&r1_bio->behind_remaining, 0);
862 868
863 do_barriers = bio_barrier(bio); 869 do_barriers = bio_barrier(bio);
864 if (do_barriers) 870 if (do_barriers)
865 set_bit(R1BIO_Barrier, &r1_bio->state); 871 set_bit(R1BIO_Barrier, &r1_bio->state);
866 872
867 bio_list_init(&bl); 873 bio_list_init(&bl);
868 for (i = 0; i < disks; i++) { 874 for (i = 0; i < disks; i++) {
869 struct bio *mbio; 875 struct bio *mbio;
870 if (!r1_bio->bios[i]) 876 if (!r1_bio->bios[i])
871 continue; 877 continue;
872 878
873 mbio = bio_clone(bio, GFP_NOIO); 879 mbio = bio_clone(bio, GFP_NOIO);
874 r1_bio->bios[i] = mbio; 880 r1_bio->bios[i] = mbio;
875 881
876 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; 882 mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
877 mbio->bi_bdev = conf->mirrors[i].rdev->bdev; 883 mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
878 mbio->bi_end_io = raid1_end_write_request; 884 mbio->bi_end_io = raid1_end_write_request;
879 mbio->bi_rw = WRITE | do_barriers; 885 mbio->bi_rw = WRITE | do_barriers;
880 mbio->bi_private = r1_bio; 886 mbio->bi_private = r1_bio;
881 887
882 if (behind_pages) { 888 if (behind_pages) {
883 struct bio_vec *bvec; 889 struct bio_vec *bvec;
884 int j; 890 int j;
885 891
886 /* Yes, I really want the '__' version so that 892 /* Yes, I really want the '__' version so that
887 * we clear any unused pointer in the io_vec, rather 893 * we clear any unused pointer in the io_vec, rather
888 * than leave them unchanged. This is important 894 * than leave them unchanged. This is important
889 * because when we come to free the pages, we won't 895 * because when we come to free the pages, we won't
890 * know the originial bi_idx, so we just free 896 * know the originial bi_idx, so we just free
891 * them all 897 * them all
892 */ 898 */
893 __bio_for_each_segment(bvec, mbio, j, 0) 899 __bio_for_each_segment(bvec, mbio, j, 0)
894 bvec->bv_page = behind_pages[j]; 900 bvec->bv_page = behind_pages[j];
895 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) 901 if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
896 atomic_inc(&r1_bio->behind_remaining); 902 atomic_inc(&r1_bio->behind_remaining);
897 } 903 }
898 904
899 atomic_inc(&r1_bio->remaining); 905 atomic_inc(&r1_bio->remaining);
900 906
901 bio_list_add(&bl, mbio); 907 bio_list_add(&bl, mbio);
902 } 908 }
903 kfree(behind_pages); /* the behind pages are attached to the bios now */ 909 kfree(behind_pages); /* the behind pages are attached to the bios now */
904 910
905 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, 911 bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
906 test_bit(R1BIO_BehindIO, &r1_bio->state)); 912 test_bit(R1BIO_BehindIO, &r1_bio->state));
907 spin_lock_irqsave(&conf->device_lock, flags); 913 spin_lock_irqsave(&conf->device_lock, flags);
908 bio_list_merge(&conf->pending_bio_list, &bl); 914 bio_list_merge(&conf->pending_bio_list, &bl);
909 bio_list_init(&bl); 915 bio_list_init(&bl);
910 916
911 blk_plug_device(mddev->queue); 917 blk_plug_device(mddev->queue);
912 spin_unlock_irqrestore(&conf->device_lock, flags); 918 spin_unlock_irqrestore(&conf->device_lock, flags);
913 919
914 #if 0 920 #if 0
915 while ((bio = bio_list_pop(&bl)) != NULL) 921 while ((bio = bio_list_pop(&bl)) != NULL)
916 generic_make_request(bio); 922 generic_make_request(bio);
917 #endif 923 #endif
918 924
919 return 0; 925 return 0;
920 } 926 }
921 927
922 static void status(struct seq_file *seq, mddev_t *mddev) 928 static void status(struct seq_file *seq, mddev_t *mddev)
923 { 929 {
924 conf_t *conf = mddev_to_conf(mddev); 930 conf_t *conf = mddev_to_conf(mddev);
925 int i; 931 int i;
926 932
927 seq_printf(seq, " [%d/%d] [", conf->raid_disks, 933 seq_printf(seq, " [%d/%d] [", conf->raid_disks,
928 conf->working_disks); 934 conf->working_disks);
929 for (i = 0; i < conf->raid_disks; i++) 935 for (i = 0; i < conf->raid_disks; i++)
930 seq_printf(seq, "%s", 936 seq_printf(seq, "%s",
931 conf->mirrors[i].rdev && 937 conf->mirrors[i].rdev &&
932 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 938 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
933 seq_printf(seq, "]"); 939 seq_printf(seq, "]");
934 } 940 }
935 941
936 942
937 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 943 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
938 { 944 {
939 char b[BDEVNAME_SIZE]; 945 char b[BDEVNAME_SIZE];
940 conf_t *conf = mddev_to_conf(mddev); 946 conf_t *conf = mddev_to_conf(mddev);
941 947
942 /* 948 /*
943 * If it is not operational, then we have already marked it as dead 949 * If it is not operational, then we have already marked it as dead
944 * else if it is the last working disks, ignore the error, let the 950 * else if it is the last working disks, ignore the error, let the
945 * next level up know. 951 * next level up know.
946 * else mark the drive as failed 952 * else mark the drive as failed
947 */ 953 */
948 if (test_bit(In_sync, &rdev->flags) 954 if (test_bit(In_sync, &rdev->flags)
949 && conf->working_disks == 1) 955 && conf->working_disks == 1)
950 /* 956 /*
951 * Don't fail the drive, act as though we were just a 957 * Don't fail the drive, act as though we were just a
952 * normal single drive 958 * normal single drive
953 */ 959 */
954 return; 960 return;
955 if (test_bit(In_sync, &rdev->flags)) { 961 if (test_bit(In_sync, &rdev->flags)) {
956 mddev->degraded++; 962 mddev->degraded++;
957 conf->working_disks--; 963 conf->working_disks--;
958 /* 964 /*
959 * if recovery is running, make sure it aborts. 965 * if recovery is running, make sure it aborts.
960 */ 966 */
961 set_bit(MD_RECOVERY_ERR, &mddev->recovery); 967 set_bit(MD_RECOVERY_ERR, &mddev->recovery);
962 } 968 }
963 clear_bit(In_sync, &rdev->flags); 969 clear_bit(In_sync, &rdev->flags);
964 set_bit(Faulty, &rdev->flags); 970 set_bit(Faulty, &rdev->flags);
965 mddev->sb_dirty = 1; 971 mddev->sb_dirty = 1;
966 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n" 972 printk(KERN_ALERT "raid1: Disk failure on %s, disabling device. \n"
967 " Operation continuing on %d devices\n", 973 " Operation continuing on %d devices\n",
968 bdevname(rdev->bdev,b), conf->working_disks); 974 bdevname(rdev->bdev,b), conf->working_disks);
969 } 975 }
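
The failure policy in error() is easy to lose among the flag tests: a failure on the last In_sync member is ignored so the array keeps behaving like a single plain disk, while any other In_sync failure degrades the array and forces a running recovery to abort before the device is finally marked Faulty. A stand-alone sketch of just that decision, with plain integers standing in for the rdev flags and conf counters (not the kernel code itself):

#include <stdio.h>

/* 'in_sync', 'working_disks' and 'degraded' stand in for the rdev flag and
 * the mddev/conf counters; the real function also sets Faulty and dirties
 * the superblock. */
static void fail_disk(int in_sync, int *working_disks, int *degraded)
{
	if (in_sync && *working_disks == 1)
		return;			/* last readable copy: ignore the error */
	if (in_sync) {
		(*degraded)++;		/* array becomes degraded ... */
		(*working_disks)--;	/* ... and recovery must abort */
	}
}

int main(void)
{
	int working = 2, degraded = 0;

	fail_disk(1, &working, &degraded);
	printf("working=%d degraded=%d\n", working, degraded);	/* 1 1 */
	return 0;
}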
970 976
971 static void print_conf(conf_t *conf) 977 static void print_conf(conf_t *conf)
972 { 978 {
973 int i; 979 int i;
974 mirror_info_t *tmp; 980 mirror_info_t *tmp;
975 981
976 printk("RAID1 conf printout:\n"); 982 printk("RAID1 conf printout:\n");
977 if (!conf) { 983 if (!conf) {
978 printk("(!conf)\n"); 984 printk("(!conf)\n");
979 return; 985 return;
980 } 986 }
981 printk(" --- wd:%d rd:%d\n", conf->working_disks, 987 printk(" --- wd:%d rd:%d\n", conf->working_disks,
982 conf->raid_disks); 988 conf->raid_disks);
983 989
984 for (i = 0; i < conf->raid_disks; i++) { 990 for (i = 0; i < conf->raid_disks; i++) {
985 char b[BDEVNAME_SIZE]; 991 char b[BDEVNAME_SIZE];
986 tmp = conf->mirrors + i; 992 tmp = conf->mirrors + i;
987 if (tmp->rdev) 993 if (tmp->rdev)
988 printk(" disk %d, wo:%d, o:%d, dev:%s\n", 994 printk(" disk %d, wo:%d, o:%d, dev:%s\n",
989 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags), 995 i, !test_bit(In_sync, &tmp->rdev->flags), !test_bit(Faulty, &tmp->rdev->flags),
990 bdevname(tmp->rdev->bdev,b)); 996 bdevname(tmp->rdev->bdev,b));
991 } 997 }
992 } 998 }
993 999
994 static void close_sync(conf_t *conf) 1000 static void close_sync(conf_t *conf)
995 { 1001 {
996 wait_barrier(conf); 1002 wait_barrier(conf);
997 allow_barrier(conf); 1003 allow_barrier(conf);
998 1004
999 mempool_destroy(conf->r1buf_pool); 1005 mempool_destroy(conf->r1buf_pool);
1000 conf->r1buf_pool = NULL; 1006 conf->r1buf_pool = NULL;
1001 } 1007 }
1002 1008
1003 static int raid1_spare_active(mddev_t *mddev) 1009 static int raid1_spare_active(mddev_t *mddev)
1004 { 1010 {
1005 int i; 1011 int i;
1006 conf_t *conf = mddev->private; 1012 conf_t *conf = mddev->private;
1007 mirror_info_t *tmp; 1013 mirror_info_t *tmp;
1008 1014
1009 /* 1015 /*
1010 * Find all failed disks within the RAID1 configuration 1016 * Find all failed disks within the RAID1 configuration
1011 * and mark them readable 1017 * and mark them readable
1012 */ 1018 */
1013 for (i = 0; i < conf->raid_disks; i++) { 1019 for (i = 0; i < conf->raid_disks; i++) {
1014 tmp = conf->mirrors + i; 1020 tmp = conf->mirrors + i;
1015 if (tmp->rdev 1021 if (tmp->rdev
1016 && !test_bit(Faulty, &tmp->rdev->flags) 1022 && !test_bit(Faulty, &tmp->rdev->flags)
1017 && !test_bit(In_sync, &tmp->rdev->flags)) { 1023 && !test_bit(In_sync, &tmp->rdev->flags)) {
1018 conf->working_disks++; 1024 conf->working_disks++;
1019 mddev->degraded--; 1025 mddev->degraded--;
1020 set_bit(In_sync, &tmp->rdev->flags); 1026 set_bit(In_sync, &tmp->rdev->flags);
1021 } 1027 }
1022 } 1028 }
1023 1029
1024 print_conf(conf); 1030 print_conf(conf);
1025 return 0; 1031 return 0;
1026 } 1032 }
1027 1033
1028 1034
1029 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 1035 static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1030 { 1036 {
1031 conf_t *conf = mddev->private; 1037 conf_t *conf = mddev->private;
1032 int found = 0; 1038 int found = 0;
1033 int mirror = 0; 1039 int mirror = 0;
1034 mirror_info_t *p; 1040 mirror_info_t *p;
1035 1041
1036 for (mirror=0; mirror < mddev->raid_disks; mirror++) 1042 for (mirror=0; mirror < mddev->raid_disks; mirror++)
1037 if ( !(p=conf->mirrors+mirror)->rdev) { 1043 if ( !(p=conf->mirrors+mirror)->rdev) {
1038 1044
1039 blk_queue_stack_limits(mddev->queue, 1045 blk_queue_stack_limits(mddev->queue,
1040 rdev->bdev->bd_disk->queue); 1046 rdev->bdev->bd_disk->queue);
1041 /* as we don't honour merge_bvec_fn, we must never risk 1047 /* as we don't honour merge_bvec_fn, we must never risk
1042 * violating it, so limit ->max_sector to one PAGE, as 1048 * violating it, so limit ->max_sector to one PAGE, as
1043 * a one page request is never in violation. 1049 * a one page request is never in violation.
1044 */ 1050 */
1045 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1051 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1046 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1052 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1047 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1053 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1048 1054
1049 p->head_position = 0; 1055 p->head_position = 0;
1050 rdev->raid_disk = mirror; 1056 rdev->raid_disk = mirror;
1051 found = 1; 1057 found = 1;
1052 /* As all devices are equivalent, we don't need a full recovery 1058 /* As all devices are equivalent, we don't need a full recovery
1053 * if this device was recently part of the array 1059 * if this device was recently part of the array
1054 */ 1060 */
1055 if (rdev->saved_raid_disk < 0) 1061 if (rdev->saved_raid_disk < 0)
1056 conf->fullsync = 1; 1062 conf->fullsync = 1;
1057 rcu_assign_pointer(p->rdev, rdev); 1063 rcu_assign_pointer(p->rdev, rdev);
1058 break; 1064 break;
1059 } 1065 }
1060 1066
1061 print_conf(conf); 1067 print_conf(conf);
1062 return found; 1068 return found;
1063 } 1069 }
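
The PAGE_SIZE>>9 expression used when stacking queue limits converts the page size in bytes into 512-byte sectors; clamping max_sectors to that value guarantees one-page requests, which can never violate a member device's merge_bvec_fn. A trivial user-space check of the arithmetic, assuming a 4096-byte page (the actual value is architecture dependent):

#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;			/* assumed; arch dependent */
	unsigned long max_sectors = page_size >> 9;	/* 8 sectors == one page */

	printf("max_sectors clamped to %lu sectors (%lu bytes)\n",
	       max_sectors, max_sectors << 9);
	return 0;
}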
1064 1070
1065 static int raid1_remove_disk(mddev_t *mddev, int number) 1071 static int raid1_remove_disk(mddev_t *mddev, int number)
1066 { 1072 {
1067 conf_t *conf = mddev->private; 1073 conf_t *conf = mddev->private;
1068 int err = 0; 1074 int err = 0;
1069 mdk_rdev_t *rdev; 1075 mdk_rdev_t *rdev;
1070 mirror_info_t *p = conf->mirrors+ number; 1076 mirror_info_t *p = conf->mirrors+ number;
1071 1077
1072 print_conf(conf); 1078 print_conf(conf);
1073 rdev = p->rdev; 1079 rdev = p->rdev;
1074 if (rdev) { 1080 if (rdev) {
1075 if (test_bit(In_sync, &rdev->flags) || 1081 if (test_bit(In_sync, &rdev->flags) ||
1076 atomic_read(&rdev->nr_pending)) { 1082 atomic_read(&rdev->nr_pending)) {
1077 err = -EBUSY; 1083 err = -EBUSY;
1078 goto abort; 1084 goto abort;
1079 } 1085 }
1080 p->rdev = NULL; 1086 p->rdev = NULL;
1081 synchronize_rcu(); 1087 synchronize_rcu();
1082 if (atomic_read(&rdev->nr_pending)) { 1088 if (atomic_read(&rdev->nr_pending)) {
1083 /* lost the race, try later */ 1089 /* lost the race, try later */
1084 err = -EBUSY; 1090 err = -EBUSY;
1085 p->rdev = rdev; 1091 p->rdev = rdev;
1086 } 1092 }
1087 } 1093 }
1088 abort: 1094 abort:
1089 1095
1090 print_conf(conf); 1096 print_conf(conf);
1091 return err; 1097 return err;
1092 } 1098 }
1093 1099
1094 1100
1095 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error) 1101 static int end_sync_read(struct bio *bio, unsigned int bytes_done, int error)
1096 { 1102 {
1097 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1103 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1098 int i; 1104 int i;
1099 1105
1100 if (bio->bi_size) 1106 if (bio->bi_size)
1101 return 1; 1107 return 1;
1102 1108
1103 for (i=r1_bio->mddev->raid_disks; i--; ) 1109 for (i=r1_bio->mddev->raid_disks; i--; )
1104 if (r1_bio->bios[i] == bio) 1110 if (r1_bio->bios[i] == bio)
1105 break; 1111 break;
1106 BUG_ON(i < 0); 1112 BUG_ON(i < 0);
1107 update_head_pos(i, r1_bio); 1113 update_head_pos(i, r1_bio);
1108 /* 1114 /*
1109 * we have read a block, now it needs to be re-written, 1115 * we have read a block, now it needs to be re-written,
1110 * or re-read if the read failed. 1116 * or re-read if the read failed.
1111 * We don't do much here, just schedule handling by raid1d 1117 * We don't do much here, just schedule handling by raid1d
1112 */ 1118 */
1113 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1119 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1114 set_bit(R1BIO_Uptodate, &r1_bio->state); 1120 set_bit(R1BIO_Uptodate, &r1_bio->state);
1115 1121
1116 if (atomic_dec_and_test(&r1_bio->remaining)) 1122 if (atomic_dec_and_test(&r1_bio->remaining))
1117 reschedule_retry(r1_bio); 1123 reschedule_retry(r1_bio);
1118 return 0; 1124 return 0;
1119 } 1125 }
1120 1126
1121 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error) 1127 static int end_sync_write(struct bio *bio, unsigned int bytes_done, int error)
1122 { 1128 {
1123 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1129 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1124 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); 1130 r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
1125 mddev_t *mddev = r1_bio->mddev; 1131 mddev_t *mddev = r1_bio->mddev;
1126 conf_t *conf = mddev_to_conf(mddev); 1132 conf_t *conf = mddev_to_conf(mddev);
1127 int i; 1133 int i;
1128 int mirror=0; 1134 int mirror=0;
1129 1135
1130 if (bio->bi_size) 1136 if (bio->bi_size)
1131 return 1; 1137 return 1;
1132 1138
1133 for (i = 0; i < conf->raid_disks; i++) 1139 for (i = 0; i < conf->raid_disks; i++)
1134 if (r1_bio->bios[i] == bio) { 1140 if (r1_bio->bios[i] == bio) {
1135 mirror = i; 1141 mirror = i;
1136 break; 1142 break;
1137 } 1143 }
1138 if (!uptodate) { 1144 if (!uptodate) {
1139 int sync_blocks = 0; 1145 int sync_blocks = 0;
1140 sector_t s = r1_bio->sector; 1146 sector_t s = r1_bio->sector;
1141 long sectors_to_go = r1_bio->sectors; 1147 long sectors_to_go = r1_bio->sectors;
1142 /* make sure these bits don't get cleared. */ 1148 /* make sure these bits don't get cleared. */
1143 do { 1149 do {
1144 bitmap_end_sync(mddev->bitmap, r1_bio->sector, 1150 bitmap_end_sync(mddev->bitmap, r1_bio->sector,
1145 &sync_blocks, 1); 1151 &sync_blocks, 1);
1146 s += sync_blocks; 1152 s += sync_blocks;
1147 sectors_to_go -= sync_blocks; 1153 sectors_to_go -= sync_blocks;
1148 } while (sectors_to_go > 0); 1154 } while (sectors_to_go > 0);
1149 md_error(mddev, conf->mirrors[mirror].rdev); 1155 md_error(mddev, conf->mirrors[mirror].rdev);
1150 } 1156 }
1151 1157
1152 update_head_pos(mirror, r1_bio); 1158 update_head_pos(mirror, r1_bio);
1153 1159
1154 if (atomic_dec_and_test(&r1_bio->remaining)) { 1160 if (atomic_dec_and_test(&r1_bio->remaining)) {
1155 md_done_sync(mddev, r1_bio->sectors, uptodate); 1161 md_done_sync(mddev, r1_bio->sectors, uptodate);
1156 put_buf(r1_bio); 1162 put_buf(r1_bio);
1157 } 1163 }
1158 return 0; 1164 return 0;
1159 } 1165 }
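
The do/while in end_sync_write() walks the failed range one bitmap chunk at a time: bitmap_end_sync() reports how many sectors it covered through *sync_blocks and the loop advances by that amount until the whole r1_bio range has been accounted for. A user-space sketch of the same accounting, with an invented 64-sector chunk size:

#include <stdio.h>

/* Walk [sector, sector + sectors_to_go) in the chunk-sized steps that
 * bitmap_end_sync() would report back via *sync_blocks; the 64-sector
 * chunk is a made-up value. */
int main(void)
{
	long long sector = 1000, sectors_to_go = 200;
	const int chunk = 64;

	do {
		int sync_blocks = chunk;	/* what bitmap_end_sync() would set */
		sector += sync_blocks;
		sectors_to_go -= sync_blocks;
	} while (sectors_to_go > 0);

	printf("stopped past sector %lld\n", sector);	/* 1256 */
	return 0;
}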
1160 1166
1161 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) 1167 static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
1162 { 1168 {
1163 conf_t *conf = mddev_to_conf(mddev); 1169 conf_t *conf = mddev_to_conf(mddev);
1164 int i; 1170 int i;
1165 int disks = conf->raid_disks; 1171 int disks = conf->raid_disks;
1166 struct bio *bio, *wbio; 1172 struct bio *bio, *wbio;
1167 1173
1168 bio = r1_bio->bios[r1_bio->read_disk]; 1174 bio = r1_bio->bios[r1_bio->read_disk];
1169 1175
1170 1176
1171 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1177 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1172 /* We have read all readable devices. If we haven't 1178 /* We have read all readable devices. If we haven't
1173 * got the block, then there is no hope left. 1179 * got the block, then there is no hope left.
1174 * If we have, then we want to do a comparison 1180 * If we have, then we want to do a comparison
1175 * and skip the write if everything is the same. 1181 * and skip the write if everything is the same.
1176 * If any blocks failed to read, then we need to 1182 * If any blocks failed to read, then we need to
1177 * attempt an over-write 1183 * attempt an over-write
1178 */ 1184 */
1179 int primary; 1185 int primary;
1180 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1186 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1181 for (i=0; i<mddev->raid_disks; i++) 1187 for (i=0; i<mddev->raid_disks; i++)
1182 if (r1_bio->bios[i]->bi_end_io == end_sync_read) 1188 if (r1_bio->bios[i]->bi_end_io == end_sync_read)
1183 md_error(mddev, conf->mirrors[i].rdev); 1189 md_error(mddev, conf->mirrors[i].rdev);
1184 1190
1185 md_done_sync(mddev, r1_bio->sectors, 1); 1191 md_done_sync(mddev, r1_bio->sectors, 1);
1186 put_buf(r1_bio); 1192 put_buf(r1_bio);
1187 return; 1193 return;
1188 } 1194 }
1189 for (primary=0; primary<mddev->raid_disks; primary++) 1195 for (primary=0; primary<mddev->raid_disks; primary++)
1190 if (r1_bio->bios[primary]->bi_end_io == end_sync_read && 1196 if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
1191 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { 1197 test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
1192 r1_bio->bios[primary]->bi_end_io = NULL; 1198 r1_bio->bios[primary]->bi_end_io = NULL;
1193 rdev_dec_pending(conf->mirrors[primary].rdev, mddev); 1199 rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
1194 break; 1200 break;
1195 } 1201 }
1196 r1_bio->read_disk = primary; 1202 r1_bio->read_disk = primary;
1197 for (i=0; i<mddev->raid_disks; i++) 1203 for (i=0; i<mddev->raid_disks; i++)
1198 if (r1_bio->bios[i]->bi_end_io == end_sync_read && 1204 if (r1_bio->bios[i]->bi_end_io == end_sync_read &&
1199 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) { 1205 test_bit(BIO_UPTODATE, &r1_bio->bios[i]->bi_flags)) {
1200 int j; 1206 int j;
1201 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); 1207 int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
1202 struct bio *pbio = r1_bio->bios[primary]; 1208 struct bio *pbio = r1_bio->bios[primary];
1203 struct bio *sbio = r1_bio->bios[i]; 1209 struct bio *sbio = r1_bio->bios[i];
1204 for (j = vcnt; j-- ; ) 1210 for (j = vcnt; j-- ; )
1205 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page), 1211 if (memcmp(page_address(pbio->bi_io_vec[j].bv_page),
1206 page_address(sbio->bi_io_vec[j].bv_page), 1212 page_address(sbio->bi_io_vec[j].bv_page),
1207 PAGE_SIZE)) 1213 PAGE_SIZE))
1208 break; 1214 break;
1209 if (j >= 0) 1215 if (j >= 0)
1210 mddev->resync_mismatches += r1_bio->sectors; 1216 mddev->resync_mismatches += r1_bio->sectors;
1211 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 1217 if (j < 0 || test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
1212 sbio->bi_end_io = NULL; 1218 sbio->bi_end_io = NULL;
1213 rdev_dec_pending(conf->mirrors[i].rdev, mddev); 1219 rdev_dec_pending(conf->mirrors[i].rdev, mddev);
1214 } else { 1220 } else {
1215 /* fixup the bio for reuse */ 1221 /* fixup the bio for reuse */
1216 sbio->bi_vcnt = vcnt; 1222 sbio->bi_vcnt = vcnt;
1217 sbio->bi_size = r1_bio->sectors << 9; 1223 sbio->bi_size = r1_bio->sectors << 9;
1218 sbio->bi_idx = 0; 1224 sbio->bi_idx = 0;
1219 sbio->bi_phys_segments = 0; 1225 sbio->bi_phys_segments = 0;
1220 sbio->bi_hw_segments = 0; 1226 sbio->bi_hw_segments = 0;
1221 sbio->bi_hw_front_size = 0; 1227 sbio->bi_hw_front_size = 0;
1222 sbio->bi_hw_back_size = 0; 1228 sbio->bi_hw_back_size = 0;
1223 sbio->bi_flags &= ~(BIO_POOL_MASK - 1); 1229 sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
1224 sbio->bi_flags |= 1 << BIO_UPTODATE; 1230 sbio->bi_flags |= 1 << BIO_UPTODATE;
1225 sbio->bi_next = NULL; 1231 sbio->bi_next = NULL;
1226 sbio->bi_sector = r1_bio->sector + 1232 sbio->bi_sector = r1_bio->sector +
1227 conf->mirrors[i].rdev->data_offset; 1233 conf->mirrors[i].rdev->data_offset;
1228 sbio->bi_bdev = conf->mirrors[i].rdev->bdev; 1234 sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
1229 } 1235 }
1230 } 1236 }
1231 } 1237 }
1232 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { 1238 if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
1233 /* ouch - failed to read all of that. 1239 /* ouch - failed to read all of that.
1234 * Try some synchronous reads of other devices to get 1240 * Try some synchronous reads of other devices to get
1235 * good data, much like with normal read errors. Only 1241 * good data, much like with normal read errors. Only
1236 * read into the pages we already have so that we don't 1242 * read into the pages we already have so that we don't
1237 * need to re-issue the read request. 1243 * need to re-issue the read request.
1238 * We don't need to freeze the array, because being in an 1244 * We don't need to freeze the array, because being in an
1239 * active sync request, there is no normal IO, and 1245 * active sync request, there is no normal IO, and
1240 * no overlapping syncs. 1246 * no overlapping syncs.
1241 */ 1247 */
1242 sector_t sect = r1_bio->sector; 1248 sector_t sect = r1_bio->sector;
1243 int sectors = r1_bio->sectors; 1249 int sectors = r1_bio->sectors;
1244 int idx = 0; 1250 int idx = 0;
1245 1251
1246 while(sectors) { 1252 while(sectors) {
1247 int s = sectors; 1253 int s = sectors;
1248 int d = r1_bio->read_disk; 1254 int d = r1_bio->read_disk;
1249 int success = 0; 1255 int success = 0;
1250 mdk_rdev_t *rdev; 1256 mdk_rdev_t *rdev;
1251 1257
1252 if (s > (PAGE_SIZE>>9)) 1258 if (s > (PAGE_SIZE>>9))
1253 s = PAGE_SIZE >> 9; 1259 s = PAGE_SIZE >> 9;
1254 do { 1260 do {
1255 if (r1_bio->bios[d]->bi_end_io == end_sync_read) { 1261 if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
1256 rdev = conf->mirrors[d].rdev; 1262 rdev = conf->mirrors[d].rdev;
1257 if (sync_page_io(rdev->bdev, 1263 if (sync_page_io(rdev->bdev,
1258 sect + rdev->data_offset, 1264 sect + rdev->data_offset,
1259 s<<9, 1265 s<<9,
1260 bio->bi_io_vec[idx].bv_page, 1266 bio->bi_io_vec[idx].bv_page,
1261 READ)) { 1267 READ)) {
1262 success = 1; 1268 success = 1;
1263 break; 1269 break;
1264 } 1270 }
1265 } 1271 }
1266 d++; 1272 d++;
1267 if (d == conf->raid_disks) 1273 if (d == conf->raid_disks)
1268 d = 0; 1274 d = 0;
1269 } while (!success && d != r1_bio->read_disk); 1275 } while (!success && d != r1_bio->read_disk);
1270 1276
1271 if (success) { 1277 if (success) {
1272 int start = d; 1278 int start = d;
1273 /* write it back and re-read */ 1279 /* write it back and re-read */
1274 set_bit(R1BIO_Uptodate, &r1_bio->state); 1280 set_bit(R1BIO_Uptodate, &r1_bio->state);
1275 while (d != r1_bio->read_disk) { 1281 while (d != r1_bio->read_disk) {
1276 if (d == 0) 1282 if (d == 0)
1277 d = conf->raid_disks; 1283 d = conf->raid_disks;
1278 d--; 1284 d--;
1279 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1285 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1280 continue; 1286 continue;
1281 rdev = conf->mirrors[d].rdev; 1287 rdev = conf->mirrors[d].rdev;
1282 atomic_add(s, &rdev->corrected_errors); 1288 atomic_add(s, &rdev->corrected_errors);
1283 if (sync_page_io(rdev->bdev, 1289 if (sync_page_io(rdev->bdev,
1284 sect + rdev->data_offset, 1290 sect + rdev->data_offset,
1285 s<<9, 1291 s<<9,
1286 bio->bi_io_vec[idx].bv_page, 1292 bio->bi_io_vec[idx].bv_page,
1287 WRITE) == 0) 1293 WRITE) == 0)
1288 md_error(mddev, rdev); 1294 md_error(mddev, rdev);
1289 } 1295 }
1290 d = start; 1296 d = start;
1291 while (d != r1_bio->read_disk) { 1297 while (d != r1_bio->read_disk) {
1292 if (d == 0) 1298 if (d == 0)
1293 d = conf->raid_disks; 1299 d = conf->raid_disks;
1294 d--; 1300 d--;
1295 if (r1_bio->bios[d]->bi_end_io != end_sync_read) 1301 if (r1_bio->bios[d]->bi_end_io != end_sync_read)
1296 continue; 1302 continue;
1297 rdev = conf->mirrors[d].rdev; 1303 rdev = conf->mirrors[d].rdev;
1298 if (sync_page_io(rdev->bdev, 1304 if (sync_page_io(rdev->bdev,
1299 sect + rdev->data_offset, 1305 sect + rdev->data_offset,
1300 s<<9, 1306 s<<9,
1301 bio->bi_io_vec[idx].bv_page, 1307 bio->bi_io_vec[idx].bv_page,
1302 READ) == 0) 1308 READ) == 0)
1303 md_error(mddev, rdev); 1309 md_error(mddev, rdev);
1304 } 1310 }
1305 } else { 1311 } else {
1306 char b[BDEVNAME_SIZE]; 1312 char b[BDEVNAME_SIZE];
1307 /* Cannot read from anywhere, array is toast */ 1313 /* Cannot read from anywhere, array is toast */
1308 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1314 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1309 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error" 1315 printk(KERN_ALERT "raid1: %s: unrecoverable I/O read error"
1310 " for block %llu\n", 1316 " for block %llu\n",
1311 bdevname(bio->bi_bdev,b), 1317 bdevname(bio->bi_bdev,b),
1312 (unsigned long long)r1_bio->sector); 1318 (unsigned long long)r1_bio->sector);
1313 md_done_sync(mddev, r1_bio->sectors, 0); 1319 md_done_sync(mddev, r1_bio->sectors, 0);
1314 put_buf(r1_bio); 1320 put_buf(r1_bio);
1315 return; 1321 return;
1316 } 1322 }
1317 sectors -= s; 1323 sectors -= s;
1318 sect += s; 1324 sect += s;
1319 idx ++; 1325 idx ++;
1320 } 1326 }
1321 } 1327 }
1322 1328
1323 /* 1329 /*
1324 * schedule writes 1330 * schedule writes
1325 */ 1331 */
1326 atomic_set(&r1_bio->remaining, 1); 1332 atomic_set(&r1_bio->remaining, 1);
1327 for (i = 0; i < disks ; i++) { 1333 for (i = 0; i < disks ; i++) {
1328 wbio = r1_bio->bios[i]; 1334 wbio = r1_bio->bios[i];
1329 if (wbio->bi_end_io == NULL || 1335 if (wbio->bi_end_io == NULL ||
1330 (wbio->bi_end_io == end_sync_read && 1336 (wbio->bi_end_io == end_sync_read &&
1331 (i == r1_bio->read_disk || 1337 (i == r1_bio->read_disk ||
1332 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery)))) 1338 !test_bit(MD_RECOVERY_SYNC, &mddev->recovery))))
1333 continue; 1339 continue;
1334 1340
1335 wbio->bi_rw = WRITE; 1341 wbio->bi_rw = WRITE;
1336 wbio->bi_end_io = end_sync_write; 1342 wbio->bi_end_io = end_sync_write;
1337 atomic_inc(&r1_bio->remaining); 1343 atomic_inc(&r1_bio->remaining);
1338 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9); 1344 md_sync_acct(conf->mirrors[i].rdev->bdev, wbio->bi_size >> 9);
1339 1345
1340 generic_make_request(wbio); 1346 generic_make_request(wbio);
1341 } 1347 }
1342 1348
1343 if (atomic_dec_and_test(&r1_bio->remaining)) { 1349 if (atomic_dec_and_test(&r1_bio->remaining)) {
1344 /* if we're here, all write(s) have completed, so clean up */ 1350 /* if we're here, all write(s) have completed, so clean up */
1345 md_done_sync(mddev, r1_bio->sectors, 1); 1351 md_done_sync(mddev, r1_bio->sectors, 1);
1346 put_buf(r1_bio); 1352 put_buf(r1_bio);
1347 } 1353 }
1348 } 1354 }
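
For a user-requested check/repair pass (MD_RECOVERY_REQUESTED), sync_request_write() compares every other readable mirror against the chosen primary page by page with memcmp(), counts the whole window as mismatched if any page differs, and skips the rewrite entirely on a check-only (MD_RECOVERY_CHECK) pass. A self-contained sketch of that comparison over plain buffers; sizes and contents here are invented for illustration:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Compare one resync window held by two mirrors, page by page. */
static int window_matches(const unsigned char *primary,
			  const unsigned char *secondary, int sectors)
{
	int vcnt = sectors / (PAGE_SIZE >> 9);	/* pages in the window */
	int j;

	for (j = vcnt; j--; )
		if (memcmp(primary + (size_t)j * PAGE_SIZE,
			   secondary + (size_t)j * PAGE_SIZE, PAGE_SIZE))
			return 0;		/* any differing page fails the window */
	return 1;
}

int main(void)
{
	static unsigned char a[2 * PAGE_SIZE], b[2 * PAGE_SIZE];

	b[PAGE_SIZE + 1] = 0xff;		/* corrupt the secondary copy */
	printf("clean: %d, corrupted: %d\n",
	       window_matches(a, a, 16), window_matches(a, b, 16));
	return 0;
}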
1349 1355
1350 /* 1356 /*
1351 * This is a kernel thread which: 1357 * This is a kernel thread which:
1352 * 1358 *
1353 * 1. Retries failed read operations on working mirrors. 1359 * 1. Retries failed read operations on working mirrors.
1354 * 2. Updates the raid superblock when problems are encountered. 1360 * 2. Updates the raid superblock when problems are encountered.
1355 * 3. Performs writes following reads for array synchronising. 1361 * 3. Performs writes following reads for array synchronising.
1356 */ 1362 */
1357 1363
1358 static void raid1d(mddev_t *mddev) 1364 static void raid1d(mddev_t *mddev)
1359 { 1365 {
1360 r1bio_t *r1_bio; 1366 r1bio_t *r1_bio;
1361 struct bio *bio; 1367 struct bio *bio;
1362 unsigned long flags; 1368 unsigned long flags;
1363 conf_t *conf = mddev_to_conf(mddev); 1369 conf_t *conf = mddev_to_conf(mddev);
1364 struct list_head *head = &conf->retry_list; 1370 struct list_head *head = &conf->retry_list;
1365 int unplug=0; 1371 int unplug=0;
1366 mdk_rdev_t *rdev; 1372 mdk_rdev_t *rdev;
1367 1373
1368 md_check_recovery(mddev); 1374 md_check_recovery(mddev);
1369 1375
1370 for (;;) { 1376 for (;;) {
1371 char b[BDEVNAME_SIZE]; 1377 char b[BDEVNAME_SIZE];
1372 spin_lock_irqsave(&conf->device_lock, flags); 1378 spin_lock_irqsave(&conf->device_lock, flags);
1373 1379
1374 if (conf->pending_bio_list.head) { 1380 if (conf->pending_bio_list.head) {
1375 bio = bio_list_get(&conf->pending_bio_list); 1381 bio = bio_list_get(&conf->pending_bio_list);
1376 blk_remove_plug(mddev->queue); 1382 blk_remove_plug(mddev->queue);
1377 spin_unlock_irqrestore(&conf->device_lock, flags); 1383 spin_unlock_irqrestore(&conf->device_lock, flags);
1378 /* flush any pending bitmap writes to disk before proceeding w/ I/O */ 1384 /* flush any pending bitmap writes to disk before proceeding w/ I/O */
1379 if (bitmap_unplug(mddev->bitmap) != 0) 1385 if (bitmap_unplug(mddev->bitmap) != 0)
1380 printk("%s: bitmap file write failed!\n", mdname(mddev)); 1386 printk("%s: bitmap file write failed!\n", mdname(mddev));
1381 1387
1382 while (bio) { /* submit pending writes */ 1388 while (bio) { /* submit pending writes */
1383 struct bio *next = bio->bi_next; 1389 struct bio *next = bio->bi_next;
1384 bio->bi_next = NULL; 1390 bio->bi_next = NULL;
1385 generic_make_request(bio); 1391 generic_make_request(bio);
1386 bio = next; 1392 bio = next;
1387 } 1393 }
1388 unplug = 1; 1394 unplug = 1;
1389 1395
1390 continue; 1396 continue;
1391 } 1397 }
1392 1398
1393 if (list_empty(head)) 1399 if (list_empty(head))
1394 break; 1400 break;
1395 r1_bio = list_entry(head->prev, r1bio_t, retry_list); 1401 r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1396 list_del(head->prev); 1402 list_del(head->prev);
1397 conf->nr_queued--; 1403 conf->nr_queued--;
1398 spin_unlock_irqrestore(&conf->device_lock, flags); 1404 spin_unlock_irqrestore(&conf->device_lock, flags);
1399 1405
1400 mddev = r1_bio->mddev; 1406 mddev = r1_bio->mddev;
1401 conf = mddev_to_conf(mddev); 1407 conf = mddev_to_conf(mddev);
1402 if (test_bit(R1BIO_IsSync, &r1_bio->state)) { 1408 if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1403 sync_request_write(mddev, r1_bio); 1409 sync_request_write(mddev, r1_bio);
1404 unplug = 1; 1410 unplug = 1;
1405 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { 1411 } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
1406 /* some requests in the r1bio were BIO_RW_BARRIER 1412 /* some requests in the r1bio were BIO_RW_BARRIER
1407 * requests which failed with -EOPNOTSUPP. Hohumm.. 1413 * requests which failed with -EOPNOTSUPP. Hohumm..
1408 * Better resubmit without the barrier. 1414 * Better resubmit without the barrier.
1409 * We know which devices to resubmit for, because 1415 * We know which devices to resubmit for, because
1410 * all others have had their bios[] entry cleared. 1416 * all others have had their bios[] entry cleared.
1411 */ 1417 */
1412 int i; 1418 int i;
1413 clear_bit(R1BIO_BarrierRetry, &r1_bio->state); 1419 clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
1414 clear_bit(R1BIO_Barrier, &r1_bio->state); 1420 clear_bit(R1BIO_Barrier, &r1_bio->state);
1415 for (i=0; i < conf->raid_disks; i++) 1421 for (i=0; i < conf->raid_disks; i++)
1416 if (r1_bio->bios[i]) 1422 if (r1_bio->bios[i])
1417 atomic_inc(&r1_bio->remaining); 1423 atomic_inc(&r1_bio->remaining);
1418 for (i=0; i < conf->raid_disks; i++) 1424 for (i=0; i < conf->raid_disks; i++)
1419 if (r1_bio->bios[i]) { 1425 if (r1_bio->bios[i]) {
1420 struct bio_vec *bvec; 1426 struct bio_vec *bvec;
1421 int j; 1427 int j;
1422 1428
1423 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1429 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1424 /* copy pages from the failed bio, as 1430 /* copy pages from the failed bio, as
1425 * this might be a write-behind device */ 1431 * this might be a write-behind device */
1426 __bio_for_each_segment(bvec, bio, j, 0) 1432 __bio_for_each_segment(bvec, bio, j, 0)
1427 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; 1433 bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
1428 bio_put(r1_bio->bios[i]); 1434 bio_put(r1_bio->bios[i]);
1429 bio->bi_sector = r1_bio->sector + 1435 bio->bi_sector = r1_bio->sector +
1430 conf->mirrors[i].rdev->data_offset; 1436 conf->mirrors[i].rdev->data_offset;
1431 bio->bi_bdev = conf->mirrors[i].rdev->bdev; 1437 bio->bi_bdev = conf->mirrors[i].rdev->bdev;
1432 bio->bi_end_io = raid1_end_write_request; 1438 bio->bi_end_io = raid1_end_write_request;
1433 bio->bi_rw = WRITE; 1439 bio->bi_rw = WRITE;
1434 bio->bi_private = r1_bio; 1440 bio->bi_private = r1_bio;
1435 r1_bio->bios[i] = bio; 1441 r1_bio->bios[i] = bio;
1436 generic_make_request(bio); 1442 generic_make_request(bio);
1437 } 1443 }
1438 } else { 1444 } else {
1439 int disk; 1445 int disk;
1440 1446
1441 /* we got a read error. Maybe the drive is bad. Maybe just 1447 /* we got a read error. Maybe the drive is bad. Maybe just
1442 * the block and we can fix it. 1448 * the block and we can fix it.
1443 * We freeze all other IO, and try reading the block from 1449 * We freeze all other IO, and try reading the block from
1444 * other devices. When we find one, we re-write 1450 * other devices. When we find one, we re-write
1445 * and check whether that fixes the read error. 1451 * and check whether that fixes the read error.
1446 * This is all done synchronously while the array is 1452 * This is all done synchronously while the array is
1447 * frozen 1453 * frozen
1448 */ 1454 */
1449 sector_t sect = r1_bio->sector; 1455 sector_t sect = r1_bio->sector;
1450 int sectors = r1_bio->sectors; 1456 int sectors = r1_bio->sectors;
1451 freeze_array(conf); 1457 freeze_array(conf);
1452 if (mddev->ro == 0) while(sectors) { 1458 if (mddev->ro == 0) while(sectors) {
1453 int s = sectors; 1459 int s = sectors;
1454 int d = r1_bio->read_disk; 1460 int d = r1_bio->read_disk;
1455 int success = 0; 1461 int success = 0;
1456 1462
1457 if (s > (PAGE_SIZE>>9)) 1463 if (s > (PAGE_SIZE>>9))
1458 s = PAGE_SIZE >> 9; 1464 s = PAGE_SIZE >> 9;
1459 1465
1460 do { 1466 do {
1461 rdev = conf->mirrors[d].rdev; 1467 rdev = conf->mirrors[d].rdev;
1462 if (rdev && 1468 if (rdev &&
1463 test_bit(In_sync, &rdev->flags) && 1469 test_bit(In_sync, &rdev->flags) &&
1464 sync_page_io(rdev->bdev, 1470 sync_page_io(rdev->bdev,
1465 sect + rdev->data_offset, 1471 sect + rdev->data_offset,
1466 s<<9, 1472 s<<9,
1467 conf->tmppage, READ)) 1473 conf->tmppage, READ))
1468 success = 1; 1474 success = 1;
1469 else { 1475 else {
1470 d++; 1476 d++;
1471 if (d == conf->raid_disks) 1477 if (d == conf->raid_disks)
1472 d = 0; 1478 d = 0;
1473 } 1479 }
1474 } while (!success && d != r1_bio->read_disk); 1480 } while (!success && d != r1_bio->read_disk);
1475 1481
1476 if (success) { 1482 if (success) {
1477 /* write it back and re-read */ 1483 /* write it back and re-read */
1478 int start = d; 1484 int start = d;
1479 while (d != r1_bio->read_disk) { 1485 while (d != r1_bio->read_disk) {
1480 if (d==0) 1486 if (d==0)
1481 d = conf->raid_disks; 1487 d = conf->raid_disks;
1482 d--; 1488 d--;
1483 rdev = conf->mirrors[d].rdev; 1489 rdev = conf->mirrors[d].rdev;
1484 atomic_add(s, &rdev->corrected_errors); 1490 atomic_add(s, &rdev->corrected_errors);
1485 if (rdev && 1491 if (rdev &&
1486 test_bit(In_sync, &rdev->flags)) { 1492 test_bit(In_sync, &rdev->flags)) {
1487 if (sync_page_io(rdev->bdev, 1493 if (sync_page_io(rdev->bdev,
1488 sect + rdev->data_offset, 1494 sect + rdev->data_offset,
1489 s<<9, conf->tmppage, WRITE) == 0) 1495 s<<9, conf->tmppage, WRITE) == 0)
1490 /* Well, this device is dead */ 1496 /* Well, this device is dead */
1491 md_error(mddev, rdev); 1497 md_error(mddev, rdev);
1492 } 1498 }
1493 } 1499 }
1494 d = start; 1500 d = start;
1495 while (d != r1_bio->read_disk) { 1501 while (d != r1_bio->read_disk) {
1496 if (d==0) 1502 if (d==0)
1497 d = conf->raid_disks; 1503 d = conf->raid_disks;
1498 d--; 1504 d--;
1499 rdev = conf->mirrors[d].rdev; 1505 rdev = conf->mirrors[d].rdev;
1500 if (rdev && 1506 if (rdev &&
1501 test_bit(In_sync, &rdev->flags)) { 1507 test_bit(In_sync, &rdev->flags)) {
1502 if (sync_page_io(rdev->bdev, 1508 if (sync_page_io(rdev->bdev,
1503 sect + rdev->data_offset, 1509 sect + rdev->data_offset,
1504 s<<9, conf->tmppage, READ) == 0) 1510 s<<9, conf->tmppage, READ) == 0)
1505 /* Well, this device is dead */ 1511 /* Well, this device is dead */
1506 md_error(mddev, rdev); 1512 md_error(mddev, rdev);
1507 } 1513 }
1508 } 1514 }
1509 } else { 1515 } else {
1510 /* Cannot read from anywhere -- bye bye array */ 1516 /* Cannot read from anywhere -- bye bye array */
1511 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); 1517 md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
1512 break; 1518 break;
1513 } 1519 }
1514 sectors -= s; 1520 sectors -= s;
1515 sect += s; 1521 sect += s;
1516 } 1522 }
1517 1523
1518 unfreeze_array(conf); 1524 unfreeze_array(conf);
1519 1525
1520 bio = r1_bio->bios[r1_bio->read_disk]; 1526 bio = r1_bio->bios[r1_bio->read_disk];
1521 if ((disk=read_balance(conf, r1_bio)) == -1) { 1527 if ((disk=read_balance(conf, r1_bio)) == -1) {
1522 printk(KERN_ALERT "raid1: %s: unrecoverable I/O" 1528 printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
1523 " read error for block %llu\n", 1529 " read error for block %llu\n",
1524 bdevname(bio->bi_bdev,b), 1530 bdevname(bio->bi_bdev,b),
1525 (unsigned long long)r1_bio->sector); 1531 (unsigned long long)r1_bio->sector);
1526 raid_end_bio_io(r1_bio); 1532 raid_end_bio_io(r1_bio);
1527 } else { 1533 } else {
1528 r1_bio->bios[r1_bio->read_disk] = 1534 r1_bio->bios[r1_bio->read_disk] =
1529 mddev->ro ? IO_BLOCKED : NULL; 1535 mddev->ro ? IO_BLOCKED : NULL;
1530 r1_bio->read_disk = disk; 1536 r1_bio->read_disk = disk;
1531 bio_put(bio); 1537 bio_put(bio);
1532 bio = bio_clone(r1_bio->master_bio, GFP_NOIO); 1538 bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
1533 r1_bio->bios[r1_bio->read_disk] = bio; 1539 r1_bio->bios[r1_bio->read_disk] = bio;
1534 rdev = conf->mirrors[disk].rdev; 1540 rdev = conf->mirrors[disk].rdev;
1535 if (printk_ratelimit()) 1541 if (printk_ratelimit())
1536 printk(KERN_ERR "raid1: %s: redirecting sector %llu to" 1542 printk(KERN_ERR "raid1: %s: redirecting sector %llu to"
1537 " another mirror\n", 1543 " another mirror\n",
1538 bdevname(rdev->bdev,b), 1544 bdevname(rdev->bdev,b),
1539 (unsigned long long)r1_bio->sector); 1545 (unsigned long long)r1_bio->sector);
1540 bio->bi_sector = r1_bio->sector + rdev->data_offset; 1546 bio->bi_sector = r1_bio->sector + rdev->data_offset;
1541 bio->bi_bdev = rdev->bdev; 1547 bio->bi_bdev = rdev->bdev;
1542 bio->bi_end_io = raid1_end_read_request; 1548 bio->bi_end_io = raid1_end_read_request;
1543 bio->bi_rw = READ; 1549 bio->bi_rw = READ;
1544 bio->bi_private = r1_bio; 1550 bio->bi_private = r1_bio;
1545 unplug = 1; 1551 unplug = 1;
1546 generic_make_request(bio); 1552 generic_make_request(bio);
1547 } 1553 }
1548 } 1554 }
1549 } 1555 }
1550 spin_unlock_irqrestore(&conf->device_lock, flags); 1556 spin_unlock_irqrestore(&conf->device_lock, flags);
1551 if (unplug) 1557 if (unplug)
1552 unplug_slaves(mddev); 1558 unplug_slaves(mddev);
1553 } 1559 }
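
The "submit pending writes" loop near the top of raid1d() drains the bio list that was collected under the device lock: each bio is detached from its bi_next chain before being handed to generic_make_request(), so the block layer never sees the list linkage. A user-space sketch of that list walk using a stand-in structure:

#include <stdio.h>
#include <stddef.h>

/* Stand-in for struct bio and its bi_next chaining. */
struct fake_bio {
	int id;
	struct fake_bio *bi_next;
};

static void submit(struct fake_bio *bio)
{
	printf("submitting bio %d\n", bio->id);
}

int main(void)
{
	struct fake_bio c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct fake_bio *bio = &a;		/* what bio_list_get() handed back */

	while (bio) {				/* submit pending writes */
		struct fake_bio *next = bio->bi_next;
		bio->bi_next = NULL;		/* detach before submitting */
		submit(bio);
		bio = next;
	}
	return 0;
}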
1554 1560
1555 1561
1556 static int init_resync(conf_t *conf) 1562 static int init_resync(conf_t *conf)
1557 { 1563 {
1558 int buffs; 1564 int buffs;
1559 1565
1560 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 1566 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
1561 BUG_ON(conf->r1buf_pool); 1567 BUG_ON(conf->r1buf_pool);
1562 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free, 1568 conf->r1buf_pool = mempool_create(buffs, r1buf_pool_alloc, r1buf_pool_free,
1563 conf->poolinfo); 1569 conf->poolinfo);
1564 if (!conf->r1buf_pool) 1570 if (!conf->r1buf_pool)
1565 return -ENOMEM; 1571 return -ENOMEM;
1566 conf->next_resync = 0; 1572 conf->next_resync = 0;
1567 return 0; 1573 return 0;
1568 } 1574 }
1569 1575
1570 /* 1576 /*
1571 * perform a "sync" on one "block" 1577 * perform a "sync" on one "block"
1572 * 1578 *
1573 * We need to make sure that no normal I/O request - particularly write 1579 * We need to make sure that no normal I/O request - particularly write
1574 * requests - conflict with active sync requests. 1580 * requests - conflict with active sync requests.
1575 * 1581 *
1576 * This is achieved by tracking pending requests and a 'barrier' concept 1582 * This is achieved by tracking pending requests and a 'barrier' concept
1577 * that can be installed to exclude normal IO requests. 1583 * that can be installed to exclude normal IO requests.
1578 */ 1584 */
1579 1585
1580 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 1586 static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
1581 { 1587 {
1582 conf_t *conf = mddev_to_conf(mddev); 1588 conf_t *conf = mddev_to_conf(mddev);
1583 r1bio_t *r1_bio; 1589 r1bio_t *r1_bio;
1584 struct bio *bio; 1590 struct bio *bio;
1585 sector_t max_sector, nr_sectors; 1591 sector_t max_sector, nr_sectors;
1586 int disk = -1; 1592 int disk = -1;
1587 int i; 1593 int i;
1588 int wonly = -1; 1594 int wonly = -1;
1589 int write_targets = 0, read_targets = 0; 1595 int write_targets = 0, read_targets = 0;
1590 int sync_blocks; 1596 int sync_blocks;
1591 int still_degraded = 0; 1597 int still_degraded = 0;
1592 1598
1593 if (!conf->r1buf_pool) 1599 if (!conf->r1buf_pool)
1594 { 1600 {
1595 /* 1601 /*
1596 printk("sync start - bitmap %p\n", mddev->bitmap); 1602 printk("sync start - bitmap %p\n", mddev->bitmap);
1597 */ 1603 */
1598 if (init_resync(conf)) 1604 if (init_resync(conf))
1599 return 0; 1605 return 0;
1600 } 1606 }
1601 1607
1602 max_sector = mddev->size << 1; 1608 max_sector = mddev->size << 1;
1603 if (sector_nr >= max_sector) { 1609 if (sector_nr >= max_sector) {
1604 /* If we aborted, we need to abort the 1610 /* If we aborted, we need to abort the
1605 * sync on the 'current' bitmap chunk (there will 1611 * sync on the 'current' bitmap chunk (there will
1606 * only be one in raid1 resync). 1612 * only be one in raid1 resync).
1607 * We can find the current address in mddev->curr_resync 1613 * We can find the current address in mddev->curr_resync
1608 */ 1614 */
1609 if (mddev->curr_resync < max_sector) /* aborted */ 1615 if (mddev->curr_resync < max_sector) /* aborted */
1610 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 1616 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
1611 &sync_blocks, 1); 1617 &sync_blocks, 1);
1612 else /* completed sync */ 1618 else /* completed sync */
1613 conf->fullsync = 0; 1619 conf->fullsync = 0;
1614 1620
1615 bitmap_close_sync(mddev->bitmap); 1621 bitmap_close_sync(mddev->bitmap);
1616 close_sync(conf); 1622 close_sync(conf);
1617 return 0; 1623 return 0;
1618 } 1624 }
1619 1625
1620 /* before building a request, check if we can skip these blocks.. 1626 /* before building a request, check if we can skip these blocks..
1621 * This call to bitmap_start_sync doesn't actually record anything 1627 * This call to bitmap_start_sync doesn't actually record anything
1622 */ 1628 */
1623 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 1629 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
1624 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1630 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1625 /* We can skip this block, and probably several more */ 1631 /* We can skip this block, and probably several more */
1626 *skipped = 1; 1632 *skipped = 1;
1627 return sync_blocks; 1633 return sync_blocks;
1628 } 1634 }
1629 /* 1635 /*
1630 * If there is non-resync activity waiting for a turn, 1636 * If there is non-resync activity waiting for a turn,
1631 * and resync is going fast enough, 1637 * and resync is going fast enough,
1632 * then let it through before starting on this new sync request. 1638 * then let it through before starting on this new sync request.
1633 */ 1639 */
1634 if (!go_faster && conf->nr_waiting) 1640 if (!go_faster && conf->nr_waiting)
1635 msleep_interruptible(1000); 1641 msleep_interruptible(1000);
1636 1642
1637 raise_barrier(conf); 1643 raise_barrier(conf);
1638 1644
1639 conf->next_resync = sector_nr; 1645 conf->next_resync = sector_nr;
1640 1646
1641 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); 1647 r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO);
1642 rcu_read_lock(); 1648 rcu_read_lock();
1643 /* 1649 /*
1644 * If we get a correctable read error during resync or recovery, 1650 * If we get a correctable read error during resync or recovery,
1645 * we might want to read from a different device. So we 1651 * we might want to read from a different device. So we
1646 * flag all drives that could conceivably be read from for READ, 1652 * flag all drives that could conceivably be read from for READ,
1647 * and any others (which will be non-In_sync devices) for WRITE. 1653 * and any others (which will be non-In_sync devices) for WRITE.
1648 * If a read fails, we try reading from something else for which READ 1654 * If a read fails, we try reading from something else for which READ
1649 * is OK. 1655 * is OK.
1650 */ 1656 */
1651 1657
1652 r1_bio->mddev = mddev; 1658 r1_bio->mddev = mddev;
1653 r1_bio->sector = sector_nr; 1659 r1_bio->sector = sector_nr;
1654 r1_bio->state = 0; 1660 r1_bio->state = 0;
1655 set_bit(R1BIO_IsSync, &r1_bio->state); 1661 set_bit(R1BIO_IsSync, &r1_bio->state);
1656 1662
1657 for (i=0; i < conf->raid_disks; i++) { 1663 for (i=0; i < conf->raid_disks; i++) {
1658 mdk_rdev_t *rdev; 1664 mdk_rdev_t *rdev;
1659 bio = r1_bio->bios[i]; 1665 bio = r1_bio->bios[i];
1660 1666
1661 /* take from bio_init */ 1667 /* take from bio_init */
1662 bio->bi_next = NULL; 1668 bio->bi_next = NULL;
1663 bio->bi_flags |= 1 << BIO_UPTODATE; 1669 bio->bi_flags |= 1 << BIO_UPTODATE;
1664 bio->bi_rw = 0; 1670 bio->bi_rw = 0;
1665 bio->bi_vcnt = 0; 1671 bio->bi_vcnt = 0;
1666 bio->bi_idx = 0; 1672 bio->bi_idx = 0;
1667 bio->bi_phys_segments = 0; 1673 bio->bi_phys_segments = 0;
1668 bio->bi_hw_segments = 0; 1674 bio->bi_hw_segments = 0;
1669 bio->bi_size = 0; 1675 bio->bi_size = 0;
1670 bio->bi_end_io = NULL; 1676 bio->bi_end_io = NULL;
1671 bio->bi_private = NULL; 1677 bio->bi_private = NULL;
1672 1678
1673 rdev = rcu_dereference(conf->mirrors[i].rdev); 1679 rdev = rcu_dereference(conf->mirrors[i].rdev);
1674 if (rdev == NULL || 1680 if (rdev == NULL ||
1675 test_bit(Faulty, &rdev->flags)) { 1681 test_bit(Faulty, &rdev->flags)) {
1676 still_degraded = 1; 1682 still_degraded = 1;
1677 continue; 1683 continue;
1678 } else if (!test_bit(In_sync, &rdev->flags)) { 1684 } else if (!test_bit(In_sync, &rdev->flags)) {
1679 bio->bi_rw = WRITE; 1685 bio->bi_rw = WRITE;
1680 bio->bi_end_io = end_sync_write; 1686 bio->bi_end_io = end_sync_write;
1681 write_targets ++; 1687 write_targets ++;
1682 } else { 1688 } else {
1683 /* may need to read from here */ 1689 /* may need to read from here */
1684 bio->bi_rw = READ; 1690 bio->bi_rw = READ;
1685 bio->bi_end_io = end_sync_read; 1691 bio->bi_end_io = end_sync_read;
1686 if (test_bit(WriteMostly, &rdev->flags)) { 1692 if (test_bit(WriteMostly, &rdev->flags)) {
1687 if (wonly < 0) 1693 if (wonly < 0)
1688 wonly = i; 1694 wonly = i;
1689 } else { 1695 } else {
1690 if (disk < 0) 1696 if (disk < 0)
1691 disk = i; 1697 disk = i;
1692 } 1698 }
1693 read_targets++; 1699 read_targets++;
1694 } 1700 }
1695 atomic_inc(&rdev->nr_pending); 1701 atomic_inc(&rdev->nr_pending);
1696 bio->bi_sector = sector_nr + rdev->data_offset; 1702 bio->bi_sector = sector_nr + rdev->data_offset;
1697 bio->bi_bdev = rdev->bdev; 1703 bio->bi_bdev = rdev->bdev;
1698 bio->bi_private = r1_bio; 1704 bio->bi_private = r1_bio;
1699 } 1705 }
1700 rcu_read_unlock(); 1706 rcu_read_unlock();
1701 if (disk < 0) 1707 if (disk < 0)
1702 disk = wonly; 1708 disk = wonly;
1703 r1_bio->read_disk = disk; 1709 r1_bio->read_disk = disk;
1704 1710
1705 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0) 1711 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && read_targets > 0)
1706 /* extra read targets are also write targets */ 1712 /* extra read targets are also write targets */
1707 write_targets += read_targets-1; 1713 write_targets += read_targets-1;
1708 1714
1709 if (write_targets == 0 || read_targets == 0) { 1715 if (write_targets == 0 || read_targets == 0) {
1710 /* There is nowhere to write, so all non-sync 1716 /* There is nowhere to write, so all non-sync
1711 * drives must be failed - so we are finished 1717 * drives must be failed - so we are finished
1712 */ 1718 */
1713 sector_t rv = max_sector - sector_nr; 1719 sector_t rv = max_sector - sector_nr;
1714 *skipped = 1; 1720 *skipped = 1;
1715 put_buf(r1_bio); 1721 put_buf(r1_bio);
1716 return rv; 1722 return rv;
1717 } 1723 }
1718 1724
1719 nr_sectors = 0; 1725 nr_sectors = 0;
1720 sync_blocks = 0; 1726 sync_blocks = 0;
1721 do { 1727 do {
1722 struct page *page; 1728 struct page *page;
1723 int len = PAGE_SIZE; 1729 int len = PAGE_SIZE;
1724 if (sector_nr + (len>>9) > max_sector) 1730 if (sector_nr + (len>>9) > max_sector)
1725 len = (max_sector - sector_nr) << 9; 1731 len = (max_sector - sector_nr) << 9;
1726 if (len == 0) 1732 if (len == 0)
1727 break; 1733 break;
1728 if (sync_blocks == 0) { 1734 if (sync_blocks == 0) {
1729 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 1735 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
1730 &sync_blocks, still_degraded) && 1736 &sync_blocks, still_degraded) &&
1731 !conf->fullsync && 1737 !conf->fullsync &&
1732 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 1738 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
1733 break; 1739 break;
1734 BUG_ON(sync_blocks < (PAGE_SIZE>>9)); 1740 BUG_ON(sync_blocks < (PAGE_SIZE>>9));
1735 if (len > (sync_blocks<<9)) 1741 if (len > (sync_blocks<<9))
1736 len = sync_blocks<<9; 1742 len = sync_blocks<<9;
1737 } 1743 }
1738 1744
1739 for (i=0 ; i < conf->raid_disks; i++) { 1745 for (i=0 ; i < conf->raid_disks; i++) {
1740 bio = r1_bio->bios[i]; 1746 bio = r1_bio->bios[i];
1741 if (bio->bi_end_io) { 1747 if (bio->bi_end_io) {
1742 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 1748 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
1743 if (bio_add_page(bio, page, len, 0) == 0) { 1749 if (bio_add_page(bio, page, len, 0) == 0) {
1744 /* stop here */ 1750 /* stop here */
1745 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 1751 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
1746 while (i > 0) { 1752 while (i > 0) {
1747 i--; 1753 i--;
1748 bio = r1_bio->bios[i]; 1754 bio = r1_bio->bios[i];
1749 if (bio->bi_end_io==NULL) 1755 if (bio->bi_end_io==NULL)
1750 continue; 1756 continue;
1751 /* remove last page from this bio */ 1757 /* remove last page from this bio */
1752 bio->bi_vcnt--; 1758 bio->bi_vcnt--;
1753 bio->bi_size -= len; 1759 bio->bi_size -= len;
1754 bio->bi_flags &= ~(1<< BIO_SEG_VALID); 1760 bio->bi_flags &= ~(1<< BIO_SEG_VALID);
1755 } 1761 }
1756 goto bio_full; 1762 goto bio_full;
1757 } 1763 }
1758 } 1764 }
1759 } 1765 }
1760 nr_sectors += len>>9; 1766 nr_sectors += len>>9;
1761 sector_nr += len>>9; 1767 sector_nr += len>>9;
1762 sync_blocks -= (len>>9); 1768 sync_blocks -= (len>>9);
1763 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES); 1769 } while (r1_bio->bios[disk]->bi_vcnt < RESYNC_PAGES);
1764 bio_full: 1770 bio_full:
1765 r1_bio->sectors = nr_sectors; 1771 r1_bio->sectors = nr_sectors;
1766 1772
1767 /* For a user-requested sync, we read all readable devices and do a 1773 /* For a user-requested sync, we read all readable devices and do a
1768 * compare 1774 * compare
1769 */ 1775 */
1770 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 1776 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
1771 atomic_set(&r1_bio->remaining, read_targets); 1777 atomic_set(&r1_bio->remaining, read_targets);
1772 for (i=0; i<conf->raid_disks; i++) { 1778 for (i=0; i<conf->raid_disks; i++) {
1773 bio = r1_bio->bios[i]; 1779 bio = r1_bio->bios[i];
1774 if (bio->bi_end_io == end_sync_read) { 1780 if (bio->bi_end_io == end_sync_read) {
1775 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors); 1781 md_sync_acct(conf->mirrors[i].rdev->bdev, nr_sectors);
1776 generic_make_request(bio); 1782 generic_make_request(bio);
1777 } 1783 }
1778 } 1784 }
1779 } else { 1785 } else {
1780 atomic_set(&r1_bio->remaining, 1); 1786 atomic_set(&r1_bio->remaining, 1);
1781 bio = r1_bio->bios[r1_bio->read_disk]; 1787 bio = r1_bio->bios[r1_bio->read_disk];
1782 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev, 1788 md_sync_acct(conf->mirrors[r1_bio->read_disk].rdev->bdev,
1783 nr_sectors); 1789 nr_sectors);
1784 generic_make_request(bio); 1790 generic_make_request(bio);
1785 1791
1786 } 1792 }
1787 1793
1788 return nr_sectors; 1794 return nr_sectors;
1789 } 1795 }
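
Inside the request-building loop of sync_request(), each pass tries to add one page worth of data (len starts at PAGE_SIZE) but clamps it when the resync position is within a page of the end of the device, converting between bytes and 512-byte sectors with the <<9 / >>9 shifts. A small worked example of that clamp using made-up numbers:

#include <stdio.h>

#define PAGE_SIZE 4096ULL

int main(void)
{
	unsigned long long max_sector = 1000;	/* hypothetical end of the array */
	unsigned long long sector_nr = 997;	/* resync position, 3 sectors left */
	unsigned long long len = PAGE_SIZE;	/* bytes we would like to add */

	if (sector_nr + (len >> 9) > max_sector)
		len = (max_sector - sector_nr) << 9;

	printf("adding %llu bytes (%llu sectors)\n", len, len >> 9);	/* 1536, 3 */
	return 0;
}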
1790 1796
1791 static int run(mddev_t *mddev) 1797 static int run(mddev_t *mddev)
1792 { 1798 {
1793 conf_t *conf; 1799 conf_t *conf;
1794 int i, j, disk_idx; 1800 int i, j, disk_idx;
1795 mirror_info_t *disk; 1801 mirror_info_t *disk;
1796 mdk_rdev_t *rdev; 1802 mdk_rdev_t *rdev;
1797 struct list_head *tmp; 1803 struct list_head *tmp;
1798 1804
1799 if (mddev->level != 1) { 1805 if (mddev->level != 1) {
1800 printk("raid1: %s: raid level not set to mirroring (%d)\n", 1806 printk("raid1: %s: raid level not set to mirroring (%d)\n",
1801 mdname(mddev), mddev->level); 1807 mdname(mddev), mddev->level);
1802 goto out; 1808 goto out;
1803 } 1809 }
1804 if (mddev->reshape_position != MaxSector) { 1810 if (mddev->reshape_position != MaxSector) {
1805 printk("raid1: %s: reshape_position set but not supported\n", 1811 printk("raid1: %s: reshape_position set but not supported\n",
1806 mdname(mddev)); 1812 mdname(mddev));
1807 goto out; 1813 goto out;
1808 } 1814 }
1809 /* 1815 /*
1810 * copy the already verified devices into our private RAID1 1816 * copy the already verified devices into our private RAID1
1811 * bookkeeping area. [whatever we allocate in run(), 1817 * bookkeeping area. [whatever we allocate in run(),
1812 * should be freed in stop()] 1818 * should be freed in stop()]
1813 */ 1819 */
1814 conf = kzalloc(sizeof(conf_t), GFP_KERNEL); 1820 conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1815 mddev->private = conf; 1821 mddev->private = conf;
1816 if (!conf) 1822 if (!conf)
1817 goto out_no_mem; 1823 goto out_no_mem;
1818 1824
1819 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks, 1825 conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1820 GFP_KERNEL); 1826 GFP_KERNEL);
1821 if (!conf->mirrors) 1827 if (!conf->mirrors)
1822 goto out_no_mem; 1828 goto out_no_mem;
1823 1829
1824 conf->tmppage = alloc_page(GFP_KERNEL); 1830 conf->tmppage = alloc_page(GFP_KERNEL);
1825 if (!conf->tmppage) 1831 if (!conf->tmppage)
1826 goto out_no_mem; 1832 goto out_no_mem;
1827 1833
1828 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); 1834 conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1829 if (!conf->poolinfo) 1835 if (!conf->poolinfo)
1830 goto out_no_mem; 1836 goto out_no_mem;
1831 conf->poolinfo->mddev = mddev; 1837 conf->poolinfo->mddev = mddev;
1832 conf->poolinfo->raid_disks = mddev->raid_disks; 1838 conf->poolinfo->raid_disks = mddev->raid_disks;
1833 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc, 1839 conf->r1bio_pool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
1834 r1bio_pool_free, 1840 r1bio_pool_free,
1835 conf->poolinfo); 1841 conf->poolinfo);
1836 if (!conf->r1bio_pool) 1842 if (!conf->r1bio_pool)
1837 goto out_no_mem; 1843 goto out_no_mem;
1838 1844
1839 ITERATE_RDEV(mddev, rdev, tmp) { 1845 ITERATE_RDEV(mddev, rdev, tmp) {
1840 disk_idx = rdev->raid_disk; 1846 disk_idx = rdev->raid_disk;
1841 if (disk_idx >= mddev->raid_disks 1847 if (disk_idx >= mddev->raid_disks
1842 || disk_idx < 0) 1848 || disk_idx < 0)
1843 continue; 1849 continue;
1844 disk = conf->mirrors + disk_idx; 1850 disk = conf->mirrors + disk_idx;
1845 1851
1846 disk->rdev = rdev; 1852 disk->rdev = rdev;
1847 1853
1848 blk_queue_stack_limits(mddev->queue, 1854 blk_queue_stack_limits(mddev->queue,
1849 rdev->bdev->bd_disk->queue); 1855 rdev->bdev->bd_disk->queue);
1850 /* as we don't honour merge_bvec_fn, we must never risk 1856 /* as we don't honour merge_bvec_fn, we must never risk
1851 * violating it, so limit ->max_sector to one PAGE, as 1857 * violating it, so limit ->max_sector to one PAGE, as
1852 * a one page request is never in violation. 1858 * a one page request is never in violation.
1853 */ 1859 */
1854 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1860 if (rdev->bdev->bd_disk->queue->merge_bvec_fn &&
1855 mddev->queue->max_sectors > (PAGE_SIZE>>9)) 1861 mddev->queue->max_sectors > (PAGE_SIZE>>9))
1856 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1862 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9);
1857 1863
1858 disk->head_position = 0; 1864 disk->head_position = 0;
1859 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags)) 1865 if (!test_bit(Faulty, &rdev->flags) && test_bit(In_sync, &rdev->flags))
1860 conf->working_disks++; 1866 conf->working_disks++;
1861 } 1867 }
1862 conf->raid_disks = mddev->raid_disks; 1868 conf->raid_disks = mddev->raid_disks;
1863 conf->mddev = mddev; 1869 conf->mddev = mddev;
1864 spin_lock_init(&conf->device_lock); 1870 spin_lock_init(&conf->device_lock);
1865 INIT_LIST_HEAD(&conf->retry_list); 1871 INIT_LIST_HEAD(&conf->retry_list);
1866 if (conf->working_disks == 1) 1872 if (conf->working_disks == 1)
1867 mddev->recovery_cp = MaxSector; 1873 mddev->recovery_cp = MaxSector;
1868 1874
1869 spin_lock_init(&conf->resync_lock); 1875 spin_lock_init(&conf->resync_lock);
1870 init_waitqueue_head(&conf->wait_barrier); 1876 init_waitqueue_head(&conf->wait_barrier);
1871 1877
1872 bio_list_init(&conf->pending_bio_list); 1878 bio_list_init(&conf->pending_bio_list);
1873 bio_list_init(&conf->flushing_bio_list); 1879 bio_list_init(&conf->flushing_bio_list);
1874 1880
1875 if (!conf->working_disks) { 1881 if (!conf->working_disks) {
1876 printk(KERN_ERR "raid1: no operational mirrors for %s\n", 1882 printk(KERN_ERR "raid1: no operational mirrors for %s\n",
1877 mdname(mddev)); 1883 mdname(mddev));
1878 goto out_free_conf; 1884 goto out_free_conf;
1879 } 1885 }
1880 1886
1881 mddev->degraded = 0; 1887 mddev->degraded = 0;
1882 for (i = 0; i < conf->raid_disks; i++) { 1888 for (i = 0; i < conf->raid_disks; i++) {
1883 1889
1884 disk = conf->mirrors + i; 1890 disk = conf->mirrors + i;
1885 1891
1886 if (!disk->rdev) { 1892 if (!disk->rdev) {
1887 disk->head_position = 0; 1893 disk->head_position = 0;
1888 mddev->degraded++; 1894 mddev->degraded++;
1889 } 1895 }
1890 } 1896 }
1891 1897
1892 /* 1898 /*
1893 * find the first working one and use it as a starting point 1899 * find the first working one and use it as a starting point
1894 * to read balancing. 1900 * to read balancing.
1895 */ 1901 */
1896 for (j = 0; j < conf->raid_disks && 1902 for (j = 0; j < conf->raid_disks &&
1897 (!conf->mirrors[j].rdev || 1903 (!conf->mirrors[j].rdev ||
1898 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++) 1904 !test_bit(In_sync, &conf->mirrors[j].rdev->flags)) ; j++)
1899 /* nothing */; 1905 /* nothing */;
1900 conf->last_used = j; 1906 conf->last_used = j;
1901 1907
1902 1908
1903 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1"); 1909 mddev->thread = md_register_thread(raid1d, mddev, "%s_raid1");
1904 if (!mddev->thread) { 1910 if (!mddev->thread) {
1905 printk(KERN_ERR 1911 printk(KERN_ERR
1906 "raid1: couldn't allocate thread for %s\n", 1912 "raid1: couldn't allocate thread for %s\n",
1907 mdname(mddev)); 1913 mdname(mddev));
1908 goto out_free_conf; 1914 goto out_free_conf;
1909 } 1915 }
1910 1916
1911 printk(KERN_INFO 1917 printk(KERN_INFO
1912 "raid1: raid set %s active with %d out of %d mirrors\n", 1918 "raid1: raid set %s active with %d out of %d mirrors\n",
1913 mdname(mddev), mddev->raid_disks - mddev->degraded, 1919 mdname(mddev), mddev->raid_disks - mddev->degraded,
1914 mddev->raid_disks); 1920 mddev->raid_disks);
1915 /* 1921 /*
1916 * Ok, everything is just fine now 1922 * Ok, everything is just fine now
1917 */ 1923 */
1918 mddev->array_size = mddev->size; 1924 mddev->array_size = mddev->size;
1919 1925
1920 mddev->queue->unplug_fn = raid1_unplug; 1926 mddev->queue->unplug_fn = raid1_unplug;
1921 mddev->queue->issue_flush_fn = raid1_issue_flush; 1927 mddev->queue->issue_flush_fn = raid1_issue_flush;
1922 1928
1923 return 0; 1929 return 0;
1924 1930
1925 out_no_mem: 1931 out_no_mem:
1926 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n", 1932 printk(KERN_ERR "raid1: couldn't allocate memory for %s\n",
1927 mdname(mddev)); 1933 mdname(mddev));
1928 1934
1929 out_free_conf: 1935 out_free_conf:
1930 if (conf) { 1936 if (conf) {
1931 if (conf->r1bio_pool) 1937 if (conf->r1bio_pool)
1932 mempool_destroy(conf->r1bio_pool); 1938 mempool_destroy(conf->r1bio_pool);
1933 kfree(conf->mirrors); 1939 kfree(conf->mirrors);
1934 safe_put_page(conf->tmppage); 1940 safe_put_page(conf->tmppage);
1935 kfree(conf->poolinfo); 1941 kfree(conf->poolinfo);
1936 kfree(conf); 1942 kfree(conf);
1937 mddev->private = NULL; 1943 mddev->private = NULL;
1938 } 1944 }
1939 out: 1945 out:
1940 return -EIO; 1946 return -EIO;
1941 } 1947 }
1942 1948
static int stop(mddev_t *mddev)
{
	conf_t *conf = mddev_to_conf(mddev);
	struct bitmap *bitmap = mddev->bitmap;
	int behind_wait = 0;

	/* wait for behind writes to complete */
	while (bitmap && atomic_read(&bitmap->behind_writes) > 0) {
		behind_wait++;
		printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait);
		set_current_state(TASK_UNINTERRUPTIBLE);
		schedule_timeout(HZ); /* wait a second */
		/* need to kick something here to make sure I/O goes? */
	}

	md_unregister_thread(mddev->thread);
	mddev->thread = NULL;
	blk_sync_queue(mddev->queue); /* the unplug fn references 'conf' */
	if (conf->r1bio_pool)
		mempool_destroy(conf->r1bio_pool);
	kfree(conf->mirrors);
	kfree(conf->poolinfo);
	kfree(conf);
	mddev->private = NULL;
	return 0;
}

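/*
 * raid1_resize() adjusts the exported array size. Note the mixed units:
 * 'sectors' is in 512-byte sectors while array_size and size are kept in
 * kilobytes, hence the >>1 and <<1 conversions below.
 */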
static int raid1_resize(mddev_t *mddev, sector_t sectors)
{
	/* no resync is happening, and there is enough space
	 * on all devices, so we can resize.
	 * We need to make sure resync covers any new space.
	 * If the array is shrinking we should possibly wait until
	 * any io in the removed space completes, but it hardly seems
	 * worth it.
	 */
	mddev->array_size = sectors>>1;
	set_capacity(mddev->gendisk, mddev->array_size << 1);
	mddev->changed = 1;
	if (mddev->array_size > mddev->size && mddev->recovery_cp == MaxSector) {
		mddev->recovery_cp = mddev->size << 1;
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	}
	mddev->size = mddev->array_size;
	mddev->resync_max_sectors = sectors;
	return 0;
}

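/*
 * raid1_reshape() changes the number of mirrors (used as .check_reshape
 * below). It builds a resized r1bio mempool and mirrors array up front,
 * then swaps them in under raise_barrier()/lower_barrier() so no normal
 * I/O is in flight while conf is being rewritten.
 */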
static int raid1_reshape(mddev_t *mddev)
{
	/* We need to:
	 * 1/ resize the r1bio_pool
	 * 2/ resize conf->mirrors
	 *
	 * We allocate a new r1bio_pool if we can.
	 * Then raise a device barrier and wait until all IO stops.
	 * Then resize conf->mirrors and swap in the new r1bio pool.
	 *
	 * At the same time, we "pack" the devices so that all the missing
	 * devices have the higher raid_disk numbers.
	 */
	mempool_t *newpool, *oldpool;
	struct pool_info *newpoolinfo;
	mirror_info_t *newmirrors;
	conf_t *conf = mddev_to_conf(mddev);
	int cnt, raid_disks;

	int d, d2;

	/* Cannot change chunk_size, layout, or level */
	if (mddev->chunk_size != mddev->new_chunk ||
	    mddev->layout != mddev->new_layout ||
	    mddev->level != mddev->new_level) {
		mddev->new_chunk = mddev->chunk_size;
		mddev->new_layout = mddev->layout;
		mddev->new_level = mddev->level;
		return -EINVAL;
	}

	raid_disks = mddev->raid_disks + mddev->delta_disks;

	if (raid_disks < conf->raid_disks) {
		cnt=0;
		for (d= 0; d < conf->raid_disks; d++)
			if (conf->mirrors[d].rdev)
				cnt++;
		if (cnt > raid_disks)
			return -EBUSY;
	}

	newpoolinfo = kmalloc(sizeof(*newpoolinfo), GFP_KERNEL);
	if (!newpoolinfo)
		return -ENOMEM;
	newpoolinfo->mddev = mddev;
	newpoolinfo->raid_disks = raid_disks;

	newpool = mempool_create(NR_RAID1_BIOS, r1bio_pool_alloc,
				 r1bio_pool_free, newpoolinfo);
	if (!newpool) {
		kfree(newpoolinfo);
		return -ENOMEM;
	}
	newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks, GFP_KERNEL);
	if (!newmirrors) {
		kfree(newpoolinfo);
		mempool_destroy(newpool);
		return -ENOMEM;
	}

	raise_barrier(conf);

	/* ok, everything is stopped */
	oldpool = conf->r1bio_pool;
	conf->r1bio_pool = newpool;

	for (d=d2=0; d < conf->raid_disks; d++)
		if (conf->mirrors[d].rdev) {
			conf->mirrors[d].rdev->raid_disk = d2;
			newmirrors[d2++].rdev = conf->mirrors[d].rdev;
		}
	kfree(conf->mirrors);
	conf->mirrors = newmirrors;
	kfree(conf->poolinfo);
	conf->poolinfo = newpoolinfo;

	mddev->degraded += (raid_disks - conf->raid_disks);
	conf->raid_disks = mddev->raid_disks = raid_disks;
	mddev->delta_disks = 0;

	conf->last_used = 0; /* just make sure it is in-range */
	lower_barrier(conf);

	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);

	mempool_destroy(oldpool);
	return 0;
}

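/*
 * raid1_quiesce(): the md core calls this through the .quiesce hook to
 * freeze or resume I/O on the array, conceptually:
 *
 *	pers->quiesce(mddev, 1);	suspend: raise_barrier() blocks new requests
 *	... work that needs a quiet array ...
 *	pers->quiesce(mddev, 0);	resume: lower_barrier()
 *
 * (illustrative call sequence only, not quoted from md.c)
 */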
static void raid1_quiesce(mddev_t *mddev, int state)
{
	conf_t *conf = mddev_to_conf(mddev);

	switch(state) {
	case 1:
		raise_barrier(conf);
		break;
	case 0:
		lower_barrier(conf);
		break;
	}
}


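/*
 * Personality operations table: registers raid1 as md level 1 and gives
 * the md core the hooks it dispatches array operations through.
 */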
static struct mdk_personality raid1_personality =
{
	.name		= "raid1",
	.level		= 1,
	.owner		= THIS_MODULE,
	.make_request	= make_request,
	.run		= run,
	.stop		= stop,
	.status		= status,
	.error_handler	= error,
	.hot_add_disk	= raid1_add_disk,
	.hot_remove_disk= raid1_remove_disk,
	.spare_active	= raid1_spare_active,
	.sync_request	= sync_request,
	.resize		= raid1_resize,
	.check_reshape	= raid1_reshape,
	.quiesce	= raid1_quiesce,
};

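/*
 * Module init/exit simply register and unregister the personality.
 */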
static int __init raid_init(void)
{
	return register_md_personality(&raid1_personality);
}

static void raid_exit(void)
{
	unregister_md_personality(&raid1_personality);
}

module_init(raid_init);
module_exit(raid_exit);