Commit 69335ef3bc5b766f34db2d688be1d35313138bca
1 parent
3a6de2924a
Exists in
master
and in
6 other branches
md/raid10: prepare data structures for handling replacement.
Allow each slot in the RAID10 to have 2 devices: the want_replacement device and its replacement. Also allow each slot of an r10bio to have 2 bios, and for resync/recovery allocate the second bio if there are any replacement devices. Signed-off-by: NeilBrown <neilb@suse.de>
Showing 2 changed files with 78 additions and 31 deletions Side-by-side Diff
drivers/md/raid10.c
... | ... | @@ -73,7 +73,8 @@ |
73 | 73 | struct r10conf *conf = data; |
74 | 74 | int size = offsetof(struct r10bio, devs[conf->copies]); |
75 | 75 | |
76 | - /* allocate a r10bio with room for raid_disks entries in the bios array */ | |
76 | + /* allocate a r10bio with room for raid_disks entries in the | |
77 | + * bios array */ | |
77 | 78 | return kzalloc(size, gfp_flags); |
78 | 79 | } |
79 | 80 | |
80 | 81 | |
... | ... | @@ -123,12 +124,19 @@ |
123 | 124 | if (!bio) |
124 | 125 | goto out_free_bio; |
125 | 126 | r10_bio->devs[j].bio = bio; |
127 | + if (!conf->have_replacement) | |
128 | + continue; | |
129 | + bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); | |
130 | + if (!bio) | |
131 | + goto out_free_bio; | |
132 | + r10_bio->devs[j].repl_bio = bio; | |
126 | 133 | } |
127 | 134 | /* |
128 | 135 | * Allocate RESYNC_PAGES data pages and attach them |
129 | 136 | * where needed. |
130 | 137 | */ |
131 | 138 | for (j = 0 ; j < nalloc; j++) { |
139 | + struct bio *rbio = r10_bio->devs[j].repl_bio; | |
132 | 140 | bio = r10_bio->devs[j].bio; |
133 | 141 | for (i = 0; i < RESYNC_PAGES; i++) { |
134 | 142 | if (j == 1 && !test_bit(MD_RECOVERY_SYNC, |
... | ... | @@ -143,6 +151,8 @@ |
143 | 151 | goto out_free_pages; |
144 | 152 | |
145 | 153 | bio->bi_io_vec[i].bv_page = page; |
154 | + if (rbio) | |
155 | + rbio->bi_io_vec[i].bv_page = page; | |
146 | 156 | } |
147 | 157 | } |
148 | 158 | |
149 | 159 | |
... | ... | @@ -156,8 +166,11 @@ |
156 | 166 | safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); |
157 | 167 | j = -1; |
158 | 168 | out_free_bio: |
159 | - while ( ++j < nalloc ) | |
169 | + while (++j < nalloc) { | |
160 | 170 | bio_put(r10_bio->devs[j].bio); |
171 | + if (r10_bio->devs[j].repl_bio) | |
172 | + bio_put(r10_bio->devs[j].repl_bio); | |
173 | + } | |
161 | 174 | r10bio_pool_free(r10_bio, conf); |
162 | 175 | return NULL; |
163 | 176 | } |
... | ... | @@ -178,6 +191,9 @@ |
178 | 191 | } |
179 | 192 | bio_put(bio); |
180 | 193 | } |
194 | + bio = r10bio->devs[j].repl_bio; | |
195 | + if (bio) | |
196 | + bio_put(bio); | |
181 | 197 | } |
182 | 198 | r10bio_pool_free(r10bio, conf); |
183 | 199 | } |
... | ... | @@ -191,6 +207,10 @@ |
191 | 207 | if (!BIO_SPECIAL(*bio)) |
192 | 208 | bio_put(*bio); |
193 | 209 | *bio = NULL; |
210 | + bio = &r10_bio->devs[i].repl_bio; | |
211 | + if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) | |
212 | + bio_put(*bio); | |
213 | + *bio = NULL; | |
194 | 214 | } |
195 | 215 | } |
196 | 216 | |
197 | 217 | |
198 | 218 | |
199 | 219 | |
200 | 220 | |
... | ... | @@ -275,19 +295,27 @@ |
275 | 295 | * Find the disk number which triggered given bio |
276 | 296 | */ |
277 | 297 | static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, |
278 | - struct bio *bio, int *slotp) | |
298 | + struct bio *bio, int *slotp, int *replp) | |
279 | 299 | { |
280 | 300 | int slot; |
301 | + int repl = 0; | |
281 | 302 | |
282 | - for (slot = 0; slot < conf->copies; slot++) | |
303 | + for (slot = 0; slot < conf->copies; slot++) { | |
283 | 304 | if (r10_bio->devs[slot].bio == bio) |
284 | 305 | break; |
306 | + if (r10_bio->devs[slot].repl_bio == bio) { | |
307 | + repl = 1; | |
308 | + break; | |
309 | + } | |
310 | + } | |
285 | 311 | |
286 | 312 | BUG_ON(slot == conf->copies); |
287 | 313 | update_head_pos(slot, r10_bio); |
288 | 314 | |
289 | 315 | if (slotp) |
290 | 316 | *slotp = slot; |
317 | + if (replp) | |
318 | + *replp = repl; | |
291 | 319 | return r10_bio->devs[slot].devnum; |
292 | 320 | } |
293 | 321 | |
... | ... | @@ -368,7 +396,7 @@ |
368 | 396 | struct r10conf *conf = r10_bio->mddev->private; |
369 | 397 | int slot; |
370 | 398 | |
371 | - dev = find_bio_disk(conf, r10_bio, bio, &slot); | |
399 | + dev = find_bio_disk(conf, r10_bio, bio, &slot, NULL); | |
372 | 400 | |
373 | 401 | /* |
374 | 402 | * this branch is our 'one mirror IO has finished' event handler: |
... | ... | @@ -1025,6 +1053,7 @@ |
1025 | 1053 | */ |
1026 | 1054 | plugged = mddev_check_plugged(mddev); |
1027 | 1055 | |
1056 | + r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ | |
1028 | 1057 | raid10_find_phys(conf, r10_bio); |
1029 | 1058 | retry_write: |
1030 | 1059 | blocked_rdev = NULL; |
... | ... | @@ -1431,7 +1460,7 @@ |
1431 | 1460 | struct r10conf *conf = r10_bio->mddev->private; |
1432 | 1461 | int d; |
1433 | 1462 | |
1434 | - d = find_bio_disk(conf, r10_bio, bio, NULL); | |
1463 | + d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); | |
1435 | 1464 | |
1436 | 1465 | if (test_bit(BIO_UPTODATE, &bio->bi_flags)) |
1437 | 1466 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
... | ... | @@ -1493,7 +1522,7 @@ |
1493 | 1522 | int bad_sectors; |
1494 | 1523 | int slot; |
1495 | 1524 | |
1496 | - d = find_bio_disk(conf, r10_bio, bio, &slot); | |
1525 | + d = find_bio_disk(conf, r10_bio, bio, &slot, NULL); | |
1497 | 1526 | |
1498 | 1527 | if (!uptodate) { |
1499 | 1528 | set_bit(WriteErrorSeen, &conf->mirrors[d].rdev->flags); |
1500 | 1529 | |
... | ... | @@ -2271,9 +2300,14 @@ |
2271 | 2300 | static int init_resync(struct r10conf *conf) |
2272 | 2301 | { |
2273 | 2302 | int buffs; |
2303 | + int i; | |
2274 | 2304 | |
2275 | 2305 | buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; |
2276 | 2306 | BUG_ON(conf->r10buf_pool); |
2307 | + conf->have_replacement = 0; | |
2308 | + for (i = 0; i < conf->raid_disks; i++) | |
2309 | + if (conf->mirrors[i].replacement) | |
2310 | + conf->have_replacement = 1; | |
2277 | 2311 | conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); |
2278 | 2312 | if (!conf->r10buf_pool) |
2279 | 2313 | return -ENOMEM; |
drivers/md/raid10.h
... | ... | @@ -2,7 +2,7 @@ |
2 | 2 | #define _RAID10_H |
3 | 3 | |
4 | 4 | struct mirror_info { |
5 | - struct md_rdev *rdev; | |
5 | + struct md_rdev *rdev, *replacement; | |
6 | 6 | sector_t head_position; |
7 | 7 | int recovery_disabled; /* matches |
8 | 8 | * mddev->recovery_disabled |
9 | 9 | |
... | ... | @@ -18,12 +18,13 @@ |
18 | 18 | spinlock_t device_lock; |
19 | 19 | |
20 | 20 | /* geometry */ |
21 | - int near_copies; /* number of copies laid out raid0 style */ | |
21 | + int near_copies; /* number of copies laid out | |
22 | + * raid0 style */ | |
22 | 23 | int far_copies; /* number of copies laid out |
23 | 24 | * at large strides across drives |
24 | 25 | */ |
25 | - int far_offset; /* far_copies are offset by 1 stripe | |
26 | - * instead of many | |
26 | + int far_offset; /* far_copies are offset by 1 | |
27 | + * stripe instead of many | |
27 | 28 | */ |
28 | 29 | int copies; /* near_copies * far_copies. |
29 | 30 | * must be <= raid_disks |
30 | 31 | |
... | ... | @@ -34,10 +35,11 @@ |
34 | 35 | * 1 stripe. |
35 | 36 | */ |
36 | 37 | |
37 | - sector_t dev_sectors; /* temp copy of mddev->dev_sectors */ | |
38 | + sector_t dev_sectors; /* temp copy of | |
39 | + * mddev->dev_sectors */ | |
38 | 40 | |
39 | - int chunk_shift; /* shift from chunks to sectors */ | |
40 | - sector_t chunk_mask; | |
41 | + int chunk_shift; /* shift from chunks to sectors */ | |
42 | + sector_t chunk_mask; | |
41 | 43 | |
42 | 44 | struct list_head retry_list; |
43 | 45 | /* queue pending writes and submit them on unplug */ |
44 | 46 | |
45 | 47 | |
... | ... | @@ -45,20 +47,22 @@ |
45 | 47 | int pending_count; |
46 | 48 | |
47 | 49 | spinlock_t resync_lock; |
48 | - int nr_pending; | |
49 | - int nr_waiting; | |
50 | - int nr_queued; | |
51 | - int barrier; | |
50 | + int nr_pending; | |
51 | + int nr_waiting; | |
52 | + int nr_queued; | |
53 | + int barrier; | |
52 | 54 | sector_t next_resync; |
53 | 55 | int fullsync; /* set to 1 if a full sync is needed, |
54 | 56 | * (fresh device added). |
55 | 57 | * Cleared when a sync completes. |
56 | 58 | */ |
57 | - | |
59 | + int have_replacement; /* There is at least one | |
60 | + * replacement device. | |
61 | + */ | |
58 | 62 | wait_queue_head_t wait_barrier; |
59 | 63 | |
60 | - mempool_t *r10bio_pool; | |
61 | - mempool_t *r10buf_pool; | |
64 | + mempool_t *r10bio_pool; | |
65 | + mempool_t *r10buf_pool; | |
62 | 66 | struct page *tmppage; |
63 | 67 | |
64 | 68 | /* When taking over an array from a different personality, we store |
65 | 69 | |
... | ... | @@ -98,11 +102,18 @@ |
98 | 102 | * When resyncing we also use one for each copy. |
99 | 103 | * When reconstructing, we use 2 bios, one for read, one for write. |
100 | 104 | * We choose the number when they are allocated. |
105 | + * We sometimes need an extra bio to write to the replacement. | |
101 | 106 | */ |
102 | 107 | struct { |
103 | - struct bio *bio; | |
104 | - sector_t addr; | |
105 | - int devnum; | |
108 | + struct bio *bio; | |
109 | + union { | |
110 | + struct bio *repl_bio; /* used for resync and | |
111 | + * writes */ | |
112 | + struct md_rdev *rdev; /* used for reads | |
113 | + * (read_slot >= 0) */ | |
114 | + }; | |
115 | + sector_t addr; | |
116 | + int devnum; | |
106 | 117 | } devs[0]; |
107 | 118 | }; |
108 | 119 | |
109 | 120 | |
110 | 121 | |
... | ... | @@ -121,18 +132,20 @@ |
121 | 132 | #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) |
122 | 133 | |
123 | 134 | /* bits for r10bio.state */ |
124 | -#define R10BIO_Uptodate 0 | |
125 | -#define R10BIO_IsSync 1 | |
126 | -#define R10BIO_IsRecover 2 | |
127 | -#define R10BIO_Degraded 3 | |
135 | +enum r10bio_state { | |
136 | + R10BIO_Uptodate, | |
137 | + R10BIO_IsSync, | |
138 | + R10BIO_IsRecover, | |
139 | + R10BIO_Degraded, | |
128 | 140 | /* Set ReadError on bios that experience a read error |
129 | 141 | * so that raid10d knows what to do with them. |
130 | 142 | */ |
131 | -#define R10BIO_ReadError 4 | |
143 | + R10BIO_ReadError, | |
132 | 144 | /* If a write for this request means we can clear some |
133 | 145 | * known-bad-block records, we set this flag. |
134 | 146 | */ |
135 | -#define R10BIO_MadeGood 5 | |
136 | -#define R10BIO_WriteError 6 | |
147 | + R10BIO_MadeGood, | |
148 | + R10BIO_WriteError, | |
149 | +}; | |
137 | 150 | #endif |