Commit 475901aff15841fb0a81e7546517407779a9b061

Authored by Jonathan Brassow
Committed by NeilBrown
1 parent 4c0ca26bd2

MD RAID10: Improve redundancy for 'far' and 'offset' algorithms (part 1)

The MD RAID10 'far' and 'offset' algorithms make copies of entire stripe
widths - copying them to a different location on the same devices after
shifting the stripe.  An example layout of each follows below:

	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 L    G    H    I    J    K
	            ...

		"offset" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 F    A    B    C    D    E  --> Copy of stripe0, but shifted by 1
	 G    H    I    J    K    L
	 L    G    H    I    J    K
	            ...

Redundancy for these algorithms is gained by shifting the copied stripes
one device to the right.  This patch proposes that the array be divided into
sets of adjacent devices and when the stripe copies are shifted, they wrap
on set boundaries rather than the array size boundary.  That is, for the
purposes of shifting, the copies are confined to their sets within the
array.  The sets are 'near_copies * far_copies' in size.

The above "far" algorithm example would change to:
	        "far" algorithm
	dev1 dev2 dev3 dev4 dev5 dev6
	==== ==== ==== ==== ==== ====
	 A    B    C    D    E    F
	 G    H    I    J    K    L
	            ...
	 B    A    D    C    F    E  --> Copy of stripe0, shifted 1, 2-dev sets
	 H    G    J    I    L    K      Dev sets are 1-2, 3-4, 5-6
	            ...
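
For clarity, the shift computation (mirroring the __raid10_find_phys()
change further down in the diff) reduces to the following sketch.  The
helper name 'far_copy_dev' is purely illustrative, and raid_disks is
assumed to be a multiple of far_set_size (= near_copies * far_copies):

	/* Illustrative sketch only - not the kernel code itself */
	static int far_copy_dev(int d, int near_copies, int far_set_size,
				int use_far_sets, int raid_disks)
	{
		int set = d / far_set_size;

		if (!use_far_sets)
			/* old behaviour: copies wrap on the array boundary */
			return (d + near_copies) % raid_disks;

		/* new behaviour: copies wrap on their set boundary */
		return (d + near_copies) % far_set_size + set * far_set_size;
	}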

This has the effect of improving the redundancy of the array.  We can
always sustain at least one failure, but sometimes more than one can
be handled.  In the first examples, the pairs of devices that CANNOT fail
together are:
	(1,2) (2,3) (3,4) (4,5) (5,6) (1,6)  [40% of possible pairs]
In the example where the copies are confined to sets, the pairs of
devices that cannot fail together are:
	(1,2) (3,4) (5,6)                    [20% of possible pairs]
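
These percentages can be checked with a small stand-alone program.  The
snippet below is illustrative only: it hard-codes near_copies = 1,
far_copies = 2 and 6 devices, and re-implements the shift rather than
calling into the driver.  Passing far_set_size = 6 reproduces the old
behaviour, far_set_size = 2 the new set-confined behaviour:

	#include <stdio.h>

	/* device holding the far copy of the chunk whose first copy is on d */
	static int far_dev(int d, int far_set_size)
	{
		int set = d / far_set_size;
		return (d + 1) % far_set_size + set * far_set_size;
	}

	int main(void)
	{
		int sizes[] = { 6, 2 };	/* old vs. set-confined shifting */
		int s, i, j;

		for (s = 0; s < 2; s++) {
			int fatal = 0, total = 0;
			for (i = 0; i < 6; i++)
				for (j = i + 1; j < 6; j++) {
					total++;
					/* pair (i,j) is fatal if some chunk
					 * keeps both of its copies on i and j */
					if (far_dev(i, sizes[s]) == j ||
					    far_dev(j, sizes[s]) == i)
						fatal++;
				}
			printf("far_set_size=%d: %d of %d pairs cannot fail together\n",
			       sizes[s], fatal, total);
		}
		return 0;
	}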

We cannot simply replace the old algorithms, so the 17th bit of the 'layout'
variable is used to indicate whether we use the old or new method of computing
the shift.  (This is similar to the way the 16th bit indicates whether the
"far" algorithm or the "offset" algorithm is being used.)

This patch only handles the cases where the number of total raid disks is
a multiple of 'far_copies'.  A follow-on patch addresses the condition where
this is not true.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>

Showing 2 changed files with 45 additions and 18 deletions

1 /* 1 /*
2 * raid10.c : Multiple Devices driver for Linux 2 * raid10.c : Multiple Devices driver for Linux
3 * 3 *
4 * Copyright (C) 2000-2004 Neil Brown 4 * Copyright (C) 2000-2004 Neil Brown
5 * 5 *
6 * RAID-10 support for md. 6 * RAID-10 support for md.
7 * 7 *
8 * Base on code in raid1.c. See raid1.c for further copyright information. 8 * Base on code in raid1.c. See raid1.c for further copyright information.
9 * 9 *
10 * 10 *
11 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option) 13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version. 14 * any later version.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free 17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/blkdev.h> 23 #include <linux/blkdev.h>
24 #include <linux/module.h> 24 #include <linux/module.h>
25 #include <linux/seq_file.h> 25 #include <linux/seq_file.h>
26 #include <linux/ratelimit.h> 26 #include <linux/ratelimit.h>
27 #include <linux/kthread.h> 27 #include <linux/kthread.h>
28 #include "md.h" 28 #include "md.h"
29 #include "raid10.h" 29 #include "raid10.h"
30 #include "raid0.h" 30 #include "raid0.h"
31 #include "bitmap.h" 31 #include "bitmap.h"
32 32
33 /* 33 /*
34 * RAID10 provides a combination of RAID0 and RAID1 functionality. 34 * RAID10 provides a combination of RAID0 and RAID1 functionality.
35 * The layout of data is defined by 35 * The layout of data is defined by
36 * chunk_size 36 * chunk_size
37 * raid_disks 37 * raid_disks
38 * near_copies (stored in low byte of layout) 38 * near_copies (stored in low byte of layout)
39 * far_copies (stored in second byte of layout) 39 * far_copies (stored in second byte of layout)
40 * far_offset (stored in bit 16 of layout ) 40 * far_offset (stored in bit 16 of layout )
41 * use_far_sets (stored in bit 17 of layout )
41 * 42 *
42 * The data to be stored is divided into chunks using chunksize. 43 * The data to be stored is divided into chunks using chunksize. Each device
43 * Each device is divided into far_copies sections. 44 * is divided into far_copies sections. In each section, chunks are laid out
44 * In each section, chunks are laid out in a style similar to raid0, but 45 * in a style similar to raid0, but near_copies copies of each chunk is stored
45 * near_copies copies of each chunk is stored (each on a different drive). 46 * (each on a different drive). The starting device for each section is offset
46 * The starting device for each section is offset near_copies from the starting 47 * near_copies from the starting device of the previous section. Thus there
47 * device of the previous section. 48 * are (near_copies * far_copies) of each chunk, and each is on a different
48 * Thus they are (near_copies*far_copies) of each chunk, and each is on a different 49 * drive. near_copies and far_copies must be at least one, and their product
49 * drive. 50 * is at most raid_disks.
50 * near_copies and far_copies must be at least one, and their product is at most
51 * raid_disks.
52 * 51 *
53 * If far_offset is true, then the far_copies are handled a bit differently. 52 * If far_offset is true, then the far_copies are handled a bit differently.
54 * The copies are still in different stripes, but instead of be very far apart 53 * The copies are still in different stripes, but instead of being very far
55 * on disk, there are adjacent stripes. 54 * apart on disk, there are adjacent stripes.
55 *
56 * The far and offset algorithms are handled slightly differently if
57 * 'use_far_sets' is true. In this case, the array's devices are grouped into
58 * sets that are (near_copies * far_copies) in size. The far copied stripes
59 * are still shifted by 'near_copies' devices, but this shifting stays confined
60 * to the set rather than the entire array. This is done to improve the number
61 * of device combinations that can fail without causing the array to fail.
62 * Example 'far' algorithm w/o 'use_far_sets' (each letter represents a chunk
63 * on a device):
64 * A B C D A B C D E
65 * ... ...
66 * D A B C E A B C D
67 * Example 'far' algorithm w/ 'use_far_sets' enabled (sets illustrated w/ []'s):
68 * [A B] [C D] [A B] [C D E]
69 * |...| |...| |...| | ... |
70 * [B A] [D C] [B A] [E C D]
56 */ 71 */
57 72
58 /* 73 /*
59 * Number of guaranteed r10bios in case of extreme VM load: 74 * Number of guaranteed r10bios in case of extreme VM load:
60 */ 75 */
61 #define NR_RAID10_BIOS 256 76 #define NR_RAID10_BIOS 256
62 77
63 /* when we get a read error on a read-only array, we redirect to another 78 /* when we get a read error on a read-only array, we redirect to another
64 * device without failing the first device, or trying to over-write to 79 * device without failing the first device, or trying to over-write to
65 * correct the read error. To keep track of bad blocks on a per-bio 80 * correct the read error. To keep track of bad blocks on a per-bio
66 * level, we store IO_BLOCKED in the appropriate 'bios' pointer 81 * level, we store IO_BLOCKED in the appropriate 'bios' pointer
67 */ 82 */
68 #define IO_BLOCKED ((struct bio *)1) 83 #define IO_BLOCKED ((struct bio *)1)
69 /* When we successfully write to a known bad-block, we need to remove the 84 /* When we successfully write to a known bad-block, we need to remove the
70 * bad-block marking which must be done from process context. So we record 85 * bad-block marking which must be done from process context. So we record
71 * the success by setting devs[n].bio to IO_MADE_GOOD 86 * the success by setting devs[n].bio to IO_MADE_GOOD
72 */ 87 */
73 #define IO_MADE_GOOD ((struct bio *)2) 88 #define IO_MADE_GOOD ((struct bio *)2)
74 89
75 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2) 90 #define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
76 91
77 /* When there are this many requests queued to be written by 92 /* When there are this many requests queued to be written by
78 * the raid10 thread, we become 'congested' to provide back-pressure 93 * the raid10 thread, we become 'congested' to provide back-pressure
79 * for writeback. 94 * for writeback.
80 */ 95 */
81 static int max_queued_requests = 1024; 96 static int max_queued_requests = 1024;
82 97
83 static void allow_barrier(struct r10conf *conf); 98 static void allow_barrier(struct r10conf *conf);
84 static void lower_barrier(struct r10conf *conf); 99 static void lower_barrier(struct r10conf *conf);
85 static int enough(struct r10conf *conf, int ignore); 100 static int enough(struct r10conf *conf, int ignore);
86 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 101 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
87 int *skipped); 102 int *skipped);
88 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio); 103 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio);
89 static void end_reshape_write(struct bio *bio, int error); 104 static void end_reshape_write(struct bio *bio, int error);
90 static void end_reshape(struct r10conf *conf); 105 static void end_reshape(struct r10conf *conf);
91 106
92 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 107 static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
93 { 108 {
94 struct r10conf *conf = data; 109 struct r10conf *conf = data;
95 int size = offsetof(struct r10bio, devs[conf->copies]); 110 int size = offsetof(struct r10bio, devs[conf->copies]);
96 111
97 /* allocate a r10bio with room for raid_disks entries in the 112 /* allocate a r10bio with room for raid_disks entries in the
98 * bios array */ 113 * bios array */
99 return kzalloc(size, gfp_flags); 114 return kzalloc(size, gfp_flags);
100 } 115 }
101 116
102 static void r10bio_pool_free(void *r10_bio, void *data) 117 static void r10bio_pool_free(void *r10_bio, void *data)
103 { 118 {
104 kfree(r10_bio); 119 kfree(r10_bio);
105 } 120 }
106 121
107 /* Maximum size of each resync request */ 122 /* Maximum size of each resync request */
108 #define RESYNC_BLOCK_SIZE (64*1024) 123 #define RESYNC_BLOCK_SIZE (64*1024)
109 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE) 124 #define RESYNC_PAGES ((RESYNC_BLOCK_SIZE + PAGE_SIZE-1) / PAGE_SIZE)
110 /* amount of memory to reserve for resync requests */ 125 /* amount of memory to reserve for resync requests */
111 #define RESYNC_WINDOW (1024*1024) 126 #define RESYNC_WINDOW (1024*1024)
112 /* maximum number of concurrent requests, memory permitting */ 127 /* maximum number of concurrent requests, memory permitting */
113 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE) 128 #define RESYNC_DEPTH (32*1024*1024/RESYNC_BLOCK_SIZE)
114 129
115 /* 130 /*
116 * When performing a resync, we need to read and compare, so 131 * When performing a resync, we need to read and compare, so
117 * we need as many pages are there are copies. 132 * we need as many pages are there are copies.
118 * When performing a recovery, we need 2 bios, one for read, 133 * When performing a recovery, we need 2 bios, one for read,
119 * one for write (we recover only one drive per r10buf) 134 * one for write (we recover only one drive per r10buf)
120 * 135 *
121 */ 136 */
122 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data) 137 static void * r10buf_pool_alloc(gfp_t gfp_flags, void *data)
123 { 138 {
124 struct r10conf *conf = data; 139 struct r10conf *conf = data;
125 struct page *page; 140 struct page *page;
126 struct r10bio *r10_bio; 141 struct r10bio *r10_bio;
127 struct bio *bio; 142 struct bio *bio;
128 int i, j; 143 int i, j;
129 int nalloc; 144 int nalloc;
130 145
131 r10_bio = r10bio_pool_alloc(gfp_flags, conf); 146 r10_bio = r10bio_pool_alloc(gfp_flags, conf);
132 if (!r10_bio) 147 if (!r10_bio)
133 return NULL; 148 return NULL;
134 149
135 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) || 150 if (test_bit(MD_RECOVERY_SYNC, &conf->mddev->recovery) ||
136 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery)) 151 test_bit(MD_RECOVERY_RESHAPE, &conf->mddev->recovery))
137 nalloc = conf->copies; /* resync */ 152 nalloc = conf->copies; /* resync */
138 else 153 else
139 nalloc = 2; /* recovery */ 154 nalloc = 2; /* recovery */
140 155
141 /* 156 /*
142 * Allocate bios. 157 * Allocate bios.
143 */ 158 */
144 for (j = nalloc ; j-- ; ) { 159 for (j = nalloc ; j-- ; ) {
145 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); 160 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
146 if (!bio) 161 if (!bio)
147 goto out_free_bio; 162 goto out_free_bio;
148 r10_bio->devs[j].bio = bio; 163 r10_bio->devs[j].bio = bio;
149 if (!conf->have_replacement) 164 if (!conf->have_replacement)
150 continue; 165 continue;
151 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); 166 bio = bio_kmalloc(gfp_flags, RESYNC_PAGES);
152 if (!bio) 167 if (!bio)
153 goto out_free_bio; 168 goto out_free_bio;
154 r10_bio->devs[j].repl_bio = bio; 169 r10_bio->devs[j].repl_bio = bio;
155 } 170 }
156 /* 171 /*
157 * Allocate RESYNC_PAGES data pages and attach them 172 * Allocate RESYNC_PAGES data pages and attach them
158 * where needed. 173 * where needed.
159 */ 174 */
160 for (j = 0 ; j < nalloc; j++) { 175 for (j = 0 ; j < nalloc; j++) {
161 struct bio *rbio = r10_bio->devs[j].repl_bio; 176 struct bio *rbio = r10_bio->devs[j].repl_bio;
162 bio = r10_bio->devs[j].bio; 177 bio = r10_bio->devs[j].bio;
163 for (i = 0; i < RESYNC_PAGES; i++) { 178 for (i = 0; i < RESYNC_PAGES; i++) {
164 if (j > 0 && !test_bit(MD_RECOVERY_SYNC, 179 if (j > 0 && !test_bit(MD_RECOVERY_SYNC,
165 &conf->mddev->recovery)) { 180 &conf->mddev->recovery)) {
166 /* we can share bv_page's during recovery 181 /* we can share bv_page's during recovery
167 * and reshape */ 182 * and reshape */
168 struct bio *rbio = r10_bio->devs[0].bio; 183 struct bio *rbio = r10_bio->devs[0].bio;
169 page = rbio->bi_io_vec[i].bv_page; 184 page = rbio->bi_io_vec[i].bv_page;
170 get_page(page); 185 get_page(page);
171 } else 186 } else
172 page = alloc_page(gfp_flags); 187 page = alloc_page(gfp_flags);
173 if (unlikely(!page)) 188 if (unlikely(!page))
174 goto out_free_pages; 189 goto out_free_pages;
175 190
176 bio->bi_io_vec[i].bv_page = page; 191 bio->bi_io_vec[i].bv_page = page;
177 if (rbio) 192 if (rbio)
178 rbio->bi_io_vec[i].bv_page = page; 193 rbio->bi_io_vec[i].bv_page = page;
179 } 194 }
180 } 195 }
181 196
182 return r10_bio; 197 return r10_bio;
183 198
184 out_free_pages: 199 out_free_pages:
185 for ( ; i > 0 ; i--) 200 for ( ; i > 0 ; i--)
186 safe_put_page(bio->bi_io_vec[i-1].bv_page); 201 safe_put_page(bio->bi_io_vec[i-1].bv_page);
187 while (j--) 202 while (j--)
188 for (i = 0; i < RESYNC_PAGES ; i++) 203 for (i = 0; i < RESYNC_PAGES ; i++)
189 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page); 204 safe_put_page(r10_bio->devs[j].bio->bi_io_vec[i].bv_page);
190 j = 0; 205 j = 0;
191 out_free_bio: 206 out_free_bio:
192 for ( ; j < nalloc; j++) { 207 for ( ; j < nalloc; j++) {
193 if (r10_bio->devs[j].bio) 208 if (r10_bio->devs[j].bio)
194 bio_put(r10_bio->devs[j].bio); 209 bio_put(r10_bio->devs[j].bio);
195 if (r10_bio->devs[j].repl_bio) 210 if (r10_bio->devs[j].repl_bio)
196 bio_put(r10_bio->devs[j].repl_bio); 211 bio_put(r10_bio->devs[j].repl_bio);
197 } 212 }
198 r10bio_pool_free(r10_bio, conf); 213 r10bio_pool_free(r10_bio, conf);
199 return NULL; 214 return NULL;
200 } 215 }
201 216
202 static void r10buf_pool_free(void *__r10_bio, void *data) 217 static void r10buf_pool_free(void *__r10_bio, void *data)
203 { 218 {
204 int i; 219 int i;
205 struct r10conf *conf = data; 220 struct r10conf *conf = data;
206 struct r10bio *r10bio = __r10_bio; 221 struct r10bio *r10bio = __r10_bio;
207 int j; 222 int j;
208 223
209 for (j=0; j < conf->copies; j++) { 224 for (j=0; j < conf->copies; j++) {
210 struct bio *bio = r10bio->devs[j].bio; 225 struct bio *bio = r10bio->devs[j].bio;
211 if (bio) { 226 if (bio) {
212 for (i = 0; i < RESYNC_PAGES; i++) { 227 for (i = 0; i < RESYNC_PAGES; i++) {
213 safe_put_page(bio->bi_io_vec[i].bv_page); 228 safe_put_page(bio->bi_io_vec[i].bv_page);
214 bio->bi_io_vec[i].bv_page = NULL; 229 bio->bi_io_vec[i].bv_page = NULL;
215 } 230 }
216 bio_put(bio); 231 bio_put(bio);
217 } 232 }
218 bio = r10bio->devs[j].repl_bio; 233 bio = r10bio->devs[j].repl_bio;
219 if (bio) 234 if (bio)
220 bio_put(bio); 235 bio_put(bio);
221 } 236 }
222 r10bio_pool_free(r10bio, conf); 237 r10bio_pool_free(r10bio, conf);
223 } 238 }
224 239
225 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) 240 static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
226 { 241 {
227 int i; 242 int i;
228 243
229 for (i = 0; i < conf->copies; i++) { 244 for (i = 0; i < conf->copies; i++) {
230 struct bio **bio = & r10_bio->devs[i].bio; 245 struct bio **bio = & r10_bio->devs[i].bio;
231 if (!BIO_SPECIAL(*bio)) 246 if (!BIO_SPECIAL(*bio))
232 bio_put(*bio); 247 bio_put(*bio);
233 *bio = NULL; 248 *bio = NULL;
234 bio = &r10_bio->devs[i].repl_bio; 249 bio = &r10_bio->devs[i].repl_bio;
235 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio)) 250 if (r10_bio->read_slot < 0 && !BIO_SPECIAL(*bio))
236 bio_put(*bio); 251 bio_put(*bio);
237 *bio = NULL; 252 *bio = NULL;
238 } 253 }
239 } 254 }
240 255
241 static void free_r10bio(struct r10bio *r10_bio) 256 static void free_r10bio(struct r10bio *r10_bio)
242 { 257 {
243 struct r10conf *conf = r10_bio->mddev->private; 258 struct r10conf *conf = r10_bio->mddev->private;
244 259
245 put_all_bios(conf, r10_bio); 260 put_all_bios(conf, r10_bio);
246 mempool_free(r10_bio, conf->r10bio_pool); 261 mempool_free(r10_bio, conf->r10bio_pool);
247 } 262 }
248 263
249 static void put_buf(struct r10bio *r10_bio) 264 static void put_buf(struct r10bio *r10_bio)
250 { 265 {
251 struct r10conf *conf = r10_bio->mddev->private; 266 struct r10conf *conf = r10_bio->mddev->private;
252 267
253 mempool_free(r10_bio, conf->r10buf_pool); 268 mempool_free(r10_bio, conf->r10buf_pool);
254 269
255 lower_barrier(conf); 270 lower_barrier(conf);
256 } 271 }
257 272
258 static void reschedule_retry(struct r10bio *r10_bio) 273 static void reschedule_retry(struct r10bio *r10_bio)
259 { 274 {
260 unsigned long flags; 275 unsigned long flags;
261 struct mddev *mddev = r10_bio->mddev; 276 struct mddev *mddev = r10_bio->mddev;
262 struct r10conf *conf = mddev->private; 277 struct r10conf *conf = mddev->private;
263 278
264 spin_lock_irqsave(&conf->device_lock, flags); 279 spin_lock_irqsave(&conf->device_lock, flags);
265 list_add(&r10_bio->retry_list, &conf->retry_list); 280 list_add(&r10_bio->retry_list, &conf->retry_list);
266 conf->nr_queued ++; 281 conf->nr_queued ++;
267 spin_unlock_irqrestore(&conf->device_lock, flags); 282 spin_unlock_irqrestore(&conf->device_lock, flags);
268 283
269 /* wake up frozen array... */ 284 /* wake up frozen array... */
270 wake_up(&conf->wait_barrier); 285 wake_up(&conf->wait_barrier);
271 286
272 md_wakeup_thread(mddev->thread); 287 md_wakeup_thread(mddev->thread);
273 } 288 }
274 289
275 /* 290 /*
276 * raid_end_bio_io() is called when we have finished servicing a mirrored 291 * raid_end_bio_io() is called when we have finished servicing a mirrored
277 * operation and are ready to return a success/failure code to the buffer 292 * operation and are ready to return a success/failure code to the buffer
278 * cache layer. 293 * cache layer.
279 */ 294 */
280 static void raid_end_bio_io(struct r10bio *r10_bio) 295 static void raid_end_bio_io(struct r10bio *r10_bio)
281 { 296 {
282 struct bio *bio = r10_bio->master_bio; 297 struct bio *bio = r10_bio->master_bio;
283 int done; 298 int done;
284 struct r10conf *conf = r10_bio->mddev->private; 299 struct r10conf *conf = r10_bio->mddev->private;
285 300
286 if (bio->bi_phys_segments) { 301 if (bio->bi_phys_segments) {
287 unsigned long flags; 302 unsigned long flags;
288 spin_lock_irqsave(&conf->device_lock, flags); 303 spin_lock_irqsave(&conf->device_lock, flags);
289 bio->bi_phys_segments--; 304 bio->bi_phys_segments--;
290 done = (bio->bi_phys_segments == 0); 305 done = (bio->bi_phys_segments == 0);
291 spin_unlock_irqrestore(&conf->device_lock, flags); 306 spin_unlock_irqrestore(&conf->device_lock, flags);
292 } else 307 } else
293 done = 1; 308 done = 1;
294 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 309 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
295 clear_bit(BIO_UPTODATE, &bio->bi_flags); 310 clear_bit(BIO_UPTODATE, &bio->bi_flags);
296 if (done) { 311 if (done) {
297 bio_endio(bio, 0); 312 bio_endio(bio, 0);
298 /* 313 /*
299 * Wake up any possible resync thread that waits for the device 314 * Wake up any possible resync thread that waits for the device
300 * to go idle. 315 * to go idle.
301 */ 316 */
302 allow_barrier(conf); 317 allow_barrier(conf);
303 } 318 }
304 free_r10bio(r10_bio); 319 free_r10bio(r10_bio);
305 } 320 }
306 321
307 /* 322 /*
308 * Update disk head position estimator based on IRQ completion info. 323 * Update disk head position estimator based on IRQ completion info.
309 */ 324 */
310 static inline void update_head_pos(int slot, struct r10bio *r10_bio) 325 static inline void update_head_pos(int slot, struct r10bio *r10_bio)
311 { 326 {
312 struct r10conf *conf = r10_bio->mddev->private; 327 struct r10conf *conf = r10_bio->mddev->private;
313 328
314 conf->mirrors[r10_bio->devs[slot].devnum].head_position = 329 conf->mirrors[r10_bio->devs[slot].devnum].head_position =
315 r10_bio->devs[slot].addr + (r10_bio->sectors); 330 r10_bio->devs[slot].addr + (r10_bio->sectors);
316 } 331 }
317 332
318 /* 333 /*
319 * Find the disk number which triggered given bio 334 * Find the disk number which triggered given bio
320 */ 335 */
321 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, 336 static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
322 struct bio *bio, int *slotp, int *replp) 337 struct bio *bio, int *slotp, int *replp)
323 { 338 {
324 int slot; 339 int slot;
325 int repl = 0; 340 int repl = 0;
326 341
327 for (slot = 0; slot < conf->copies; slot++) { 342 for (slot = 0; slot < conf->copies; slot++) {
328 if (r10_bio->devs[slot].bio == bio) 343 if (r10_bio->devs[slot].bio == bio)
329 break; 344 break;
330 if (r10_bio->devs[slot].repl_bio == bio) { 345 if (r10_bio->devs[slot].repl_bio == bio) {
331 repl = 1; 346 repl = 1;
332 break; 347 break;
333 } 348 }
334 } 349 }
335 350
336 BUG_ON(slot == conf->copies); 351 BUG_ON(slot == conf->copies);
337 update_head_pos(slot, r10_bio); 352 update_head_pos(slot, r10_bio);
338 353
339 if (slotp) 354 if (slotp)
340 *slotp = slot; 355 *slotp = slot;
341 if (replp) 356 if (replp)
342 *replp = repl; 357 *replp = repl;
343 return r10_bio->devs[slot].devnum; 358 return r10_bio->devs[slot].devnum;
344 } 359 }
345 360
346 static void raid10_end_read_request(struct bio *bio, int error) 361 static void raid10_end_read_request(struct bio *bio, int error)
347 { 362 {
348 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 363 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
349 struct r10bio *r10_bio = bio->bi_private; 364 struct r10bio *r10_bio = bio->bi_private;
350 int slot, dev; 365 int slot, dev;
351 struct md_rdev *rdev; 366 struct md_rdev *rdev;
352 struct r10conf *conf = r10_bio->mddev->private; 367 struct r10conf *conf = r10_bio->mddev->private;
353 368
354 369
355 slot = r10_bio->read_slot; 370 slot = r10_bio->read_slot;
356 dev = r10_bio->devs[slot].devnum; 371 dev = r10_bio->devs[slot].devnum;
357 rdev = r10_bio->devs[slot].rdev; 372 rdev = r10_bio->devs[slot].rdev;
358 /* 373 /*
359 * this branch is our 'one mirror IO has finished' event handler: 374 * this branch is our 'one mirror IO has finished' event handler:
360 */ 375 */
361 update_head_pos(slot, r10_bio); 376 update_head_pos(slot, r10_bio);
362 377
363 if (uptodate) { 378 if (uptodate) {
364 /* 379 /*
365 * Set R10BIO_Uptodate in our master bio, so that 380 * Set R10BIO_Uptodate in our master bio, so that
366 * we will return a good error code to the higher 381 * we will return a good error code to the higher
367 * levels even if IO on some other mirrored buffer fails. 382 * levels even if IO on some other mirrored buffer fails.
368 * 383 *
369 * The 'master' represents the composite IO operation to 384 * The 'master' represents the composite IO operation to
370 * user-side. So if something waits for IO, then it will 385 * user-side. So if something waits for IO, then it will
371 * wait for the 'master' bio. 386 * wait for the 'master' bio.
372 */ 387 */
373 set_bit(R10BIO_Uptodate, &r10_bio->state); 388 set_bit(R10BIO_Uptodate, &r10_bio->state);
374 } else { 389 } else {
375 /* If all other devices that store this block have 390 /* If all other devices that store this block have
376 * failed, we want to return the error upwards rather 391 * failed, we want to return the error upwards rather
377 * than fail the last device. Here we redefine 392 * than fail the last device. Here we redefine
378 * "uptodate" to mean "Don't want to retry" 393 * "uptodate" to mean "Don't want to retry"
379 */ 394 */
380 unsigned long flags; 395 unsigned long flags;
381 spin_lock_irqsave(&conf->device_lock, flags); 396 spin_lock_irqsave(&conf->device_lock, flags);
382 if (!enough(conf, rdev->raid_disk)) 397 if (!enough(conf, rdev->raid_disk))
383 uptodate = 1; 398 uptodate = 1;
384 spin_unlock_irqrestore(&conf->device_lock, flags); 399 spin_unlock_irqrestore(&conf->device_lock, flags);
385 } 400 }
386 if (uptodate) { 401 if (uptodate) {
387 raid_end_bio_io(r10_bio); 402 raid_end_bio_io(r10_bio);
388 rdev_dec_pending(rdev, conf->mddev); 403 rdev_dec_pending(rdev, conf->mddev);
389 } else { 404 } else {
390 /* 405 /*
391 * oops, read error - keep the refcount on the rdev 406 * oops, read error - keep the refcount on the rdev
392 */ 407 */
393 char b[BDEVNAME_SIZE]; 408 char b[BDEVNAME_SIZE];
394 printk_ratelimited(KERN_ERR 409 printk_ratelimited(KERN_ERR
395 "md/raid10:%s: %s: rescheduling sector %llu\n", 410 "md/raid10:%s: %s: rescheduling sector %llu\n",
396 mdname(conf->mddev), 411 mdname(conf->mddev),
397 bdevname(rdev->bdev, b), 412 bdevname(rdev->bdev, b),
398 (unsigned long long)r10_bio->sector); 413 (unsigned long long)r10_bio->sector);
399 set_bit(R10BIO_ReadError, &r10_bio->state); 414 set_bit(R10BIO_ReadError, &r10_bio->state);
400 reschedule_retry(r10_bio); 415 reschedule_retry(r10_bio);
401 } 416 }
402 } 417 }
403 418
404 static void close_write(struct r10bio *r10_bio) 419 static void close_write(struct r10bio *r10_bio)
405 { 420 {
406 /* clear the bitmap if all writes complete successfully */ 421 /* clear the bitmap if all writes complete successfully */
407 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector, 422 bitmap_endwrite(r10_bio->mddev->bitmap, r10_bio->sector,
408 r10_bio->sectors, 423 r10_bio->sectors,
409 !test_bit(R10BIO_Degraded, &r10_bio->state), 424 !test_bit(R10BIO_Degraded, &r10_bio->state),
410 0); 425 0);
411 md_write_end(r10_bio->mddev); 426 md_write_end(r10_bio->mddev);
412 } 427 }
413 428
414 static void one_write_done(struct r10bio *r10_bio) 429 static void one_write_done(struct r10bio *r10_bio)
415 { 430 {
416 if (atomic_dec_and_test(&r10_bio->remaining)) { 431 if (atomic_dec_and_test(&r10_bio->remaining)) {
417 if (test_bit(R10BIO_WriteError, &r10_bio->state)) 432 if (test_bit(R10BIO_WriteError, &r10_bio->state))
418 reschedule_retry(r10_bio); 433 reschedule_retry(r10_bio);
419 else { 434 else {
420 close_write(r10_bio); 435 close_write(r10_bio);
421 if (test_bit(R10BIO_MadeGood, &r10_bio->state)) 436 if (test_bit(R10BIO_MadeGood, &r10_bio->state))
422 reschedule_retry(r10_bio); 437 reschedule_retry(r10_bio);
423 else 438 else
424 raid_end_bio_io(r10_bio); 439 raid_end_bio_io(r10_bio);
425 } 440 }
426 } 441 }
427 } 442 }
428 443
429 static void raid10_end_write_request(struct bio *bio, int error) 444 static void raid10_end_write_request(struct bio *bio, int error)
430 { 445 {
431 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 446 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
432 struct r10bio *r10_bio = bio->bi_private; 447 struct r10bio *r10_bio = bio->bi_private;
433 int dev; 448 int dev;
434 int dec_rdev = 1; 449 int dec_rdev = 1;
435 struct r10conf *conf = r10_bio->mddev->private; 450 struct r10conf *conf = r10_bio->mddev->private;
436 int slot, repl; 451 int slot, repl;
437 struct md_rdev *rdev = NULL; 452 struct md_rdev *rdev = NULL;
438 453
439 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 454 dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
440 455
441 if (repl) 456 if (repl)
442 rdev = conf->mirrors[dev].replacement; 457 rdev = conf->mirrors[dev].replacement;
443 if (!rdev) { 458 if (!rdev) {
444 smp_rmb(); 459 smp_rmb();
445 repl = 0; 460 repl = 0;
446 rdev = conf->mirrors[dev].rdev; 461 rdev = conf->mirrors[dev].rdev;
447 } 462 }
448 /* 463 /*
449 * this branch is our 'one mirror IO has finished' event handler: 464 * this branch is our 'one mirror IO has finished' event handler:
450 */ 465 */
451 if (!uptodate) { 466 if (!uptodate) {
452 if (repl) 467 if (repl)
453 /* Never record new bad blocks to replacement, 468 /* Never record new bad blocks to replacement,
454 * just fail it. 469 * just fail it.
455 */ 470 */
456 md_error(rdev->mddev, rdev); 471 md_error(rdev->mddev, rdev);
457 else { 472 else {
458 set_bit(WriteErrorSeen, &rdev->flags); 473 set_bit(WriteErrorSeen, &rdev->flags);
459 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 474 if (!test_and_set_bit(WantReplacement, &rdev->flags))
460 set_bit(MD_RECOVERY_NEEDED, 475 set_bit(MD_RECOVERY_NEEDED,
461 &rdev->mddev->recovery); 476 &rdev->mddev->recovery);
462 set_bit(R10BIO_WriteError, &r10_bio->state); 477 set_bit(R10BIO_WriteError, &r10_bio->state);
463 dec_rdev = 0; 478 dec_rdev = 0;
464 } 479 }
465 } else { 480 } else {
466 /* 481 /*
467 * Set R10BIO_Uptodate in our master bio, so that 482 * Set R10BIO_Uptodate in our master bio, so that
468 * we will return a good error code for to the higher 483 * we will return a good error code for to the higher
469 * levels even if IO on some other mirrored buffer fails. 484 * levels even if IO on some other mirrored buffer fails.
470 * 485 *
471 * The 'master' represents the composite IO operation to 486 * The 'master' represents the composite IO operation to
472 * user-side. So if something waits for IO, then it will 487 * user-side. So if something waits for IO, then it will
473 * wait for the 'master' bio. 488 * wait for the 'master' bio.
474 */ 489 */
475 sector_t first_bad; 490 sector_t first_bad;
476 int bad_sectors; 491 int bad_sectors;
477 492
478 set_bit(R10BIO_Uptodate, &r10_bio->state); 493 set_bit(R10BIO_Uptodate, &r10_bio->state);
479 494
480 /* Maybe we can clear some bad blocks. */ 495 /* Maybe we can clear some bad blocks. */
481 if (is_badblock(rdev, 496 if (is_badblock(rdev,
482 r10_bio->devs[slot].addr, 497 r10_bio->devs[slot].addr,
483 r10_bio->sectors, 498 r10_bio->sectors,
484 &first_bad, &bad_sectors)) { 499 &first_bad, &bad_sectors)) {
485 bio_put(bio); 500 bio_put(bio);
486 if (repl) 501 if (repl)
487 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD; 502 r10_bio->devs[slot].repl_bio = IO_MADE_GOOD;
488 else 503 else
489 r10_bio->devs[slot].bio = IO_MADE_GOOD; 504 r10_bio->devs[slot].bio = IO_MADE_GOOD;
490 dec_rdev = 0; 505 dec_rdev = 0;
491 set_bit(R10BIO_MadeGood, &r10_bio->state); 506 set_bit(R10BIO_MadeGood, &r10_bio->state);
492 } 507 }
493 } 508 }
494 509
495 /* 510 /*
496 * 511 *
497 * Let's see if all mirrored write operations have finished 512 * Let's see if all mirrored write operations have finished
498 * already. 513 * already.
499 */ 514 */
500 one_write_done(r10_bio); 515 one_write_done(r10_bio);
501 if (dec_rdev) 516 if (dec_rdev)
502 rdev_dec_pending(rdev, conf->mddev); 517 rdev_dec_pending(rdev, conf->mddev);
503 } 518 }
504 519
505 /* 520 /*
506 * RAID10 layout manager 521 * RAID10 layout manager
507 * As well as the chunksize and raid_disks count, there are two 522 * As well as the chunksize and raid_disks count, there are two
508 * parameters: near_copies and far_copies. 523 * parameters: near_copies and far_copies.
509 * near_copies * far_copies must be <= raid_disks. 524 * near_copies * far_copies must be <= raid_disks.
510 * Normally one of these will be 1. 525 * Normally one of these will be 1.
511 * If both are 1, we get raid0. 526 * If both are 1, we get raid0.
512 * If near_copies == raid_disks, we get raid1. 527 * If near_copies == raid_disks, we get raid1.
513 * 528 *
514 * Chunks are laid out in raid0 style with near_copies copies of the 529 * Chunks are laid out in raid0 style with near_copies copies of the
515 * first chunk, followed by near_copies copies of the next chunk and 530 * first chunk, followed by near_copies copies of the next chunk and
516 * so on. 531 * so on.
517 * If far_copies > 1, then after 1/far_copies of the array has been assigned 532 * If far_copies > 1, then after 1/far_copies of the array has been assigned
518 * as described above, we start again with a device offset of near_copies. 533 * as described above, we start again with a device offset of near_copies.
519 * So we effectively have another copy of the whole array further down all 534 * So we effectively have another copy of the whole array further down all
520 * the drives, but with blocks on different drives. 535 * the drives, but with blocks on different drives.
521 * With this layout, and block is never stored twice on the one device. 536 * With this layout, and block is never stored twice on the one device.
522 * 537 *
523 * raid10_find_phys finds the sector offset of a given virtual sector 538 * raid10_find_phys finds the sector offset of a given virtual sector
524 * on each device that it is on. 539 * on each device that it is on.
525 * 540 *
526 * raid10_find_virt does the reverse mapping, from a device and a 541 * raid10_find_virt does the reverse mapping, from a device and a
527 * sector offset to a virtual address 542 * sector offset to a virtual address
528 */ 543 */
529 544
530 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio) 545 static void __raid10_find_phys(struct geom *geo, struct r10bio *r10bio)
531 { 546 {
532 int n,f; 547 int n,f;
533 sector_t sector; 548 sector_t sector;
534 sector_t chunk; 549 sector_t chunk;
535 sector_t stripe; 550 sector_t stripe;
536 int dev; 551 int dev;
537 int slot = 0; 552 int slot = 0;
538 553
539 /* now calculate first sector/dev */ 554 /* now calculate first sector/dev */
540 chunk = r10bio->sector >> geo->chunk_shift; 555 chunk = r10bio->sector >> geo->chunk_shift;
541 sector = r10bio->sector & geo->chunk_mask; 556 sector = r10bio->sector & geo->chunk_mask;
542 557
543 chunk *= geo->near_copies; 558 chunk *= geo->near_copies;
544 stripe = chunk; 559 stripe = chunk;
545 dev = sector_div(stripe, geo->raid_disks); 560 dev = sector_div(stripe, geo->raid_disks);
546 if (geo->far_offset) 561 if (geo->far_offset)
547 stripe *= geo->far_copies; 562 stripe *= geo->far_copies;
548 563
549 sector += stripe << geo->chunk_shift; 564 sector += stripe << geo->chunk_shift;
550 565
551 /* and calculate all the others */ 566 /* and calculate all the others */
552 for (n = 0; n < geo->near_copies; n++) { 567 for (n = 0; n < geo->near_copies; n++) {
553 int d = dev; 568 int d = dev;
569 int set;
554 sector_t s = sector; 570 sector_t s = sector;
555 r10bio->devs[slot].devnum = d; 571 r10bio->devs[slot].devnum = d;
556 r10bio->devs[slot].addr = s; 572 r10bio->devs[slot].addr = s;
557 slot++; 573 slot++;
558 574
559 for (f = 1; f < geo->far_copies; f++) { 575 for (f = 1; f < geo->far_copies; f++) {
576 set = d / geo->far_set_size;
560 d += geo->near_copies; 577 d += geo->near_copies;
561 d %= geo->raid_disks; 578 d %= geo->far_set_size;
579 d += geo->far_set_size * set;
580
562 s += geo->stride; 581 s += geo->stride;
563 r10bio->devs[slot].devnum = d; 582 r10bio->devs[slot].devnum = d;
564 r10bio->devs[slot].addr = s; 583 r10bio->devs[slot].addr = s;
565 slot++; 584 slot++;
566 } 585 }
567 dev++; 586 dev++;
568 if (dev >= geo->raid_disks) { 587 if (dev >= geo->raid_disks) {
569 dev = 0; 588 dev = 0;
570 sector += (geo->chunk_mask + 1); 589 sector += (geo->chunk_mask + 1);
571 } 590 }
572 } 591 }
573 } 592 }
574 593
575 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio) 594 static void raid10_find_phys(struct r10conf *conf, struct r10bio *r10bio)
576 { 595 {
577 struct geom *geo = &conf->geo; 596 struct geom *geo = &conf->geo;
578 597
579 if (conf->reshape_progress != MaxSector && 598 if (conf->reshape_progress != MaxSector &&
580 ((r10bio->sector >= conf->reshape_progress) != 599 ((r10bio->sector >= conf->reshape_progress) !=
581 conf->mddev->reshape_backwards)) { 600 conf->mddev->reshape_backwards)) {
582 set_bit(R10BIO_Previous, &r10bio->state); 601 set_bit(R10BIO_Previous, &r10bio->state);
583 geo = &conf->prev; 602 geo = &conf->prev;
584 } else 603 } else
585 clear_bit(R10BIO_Previous, &r10bio->state); 604 clear_bit(R10BIO_Previous, &r10bio->state);
586 605
587 __raid10_find_phys(geo, r10bio); 606 __raid10_find_phys(geo, r10bio);
588 } 607 }
589 608
590 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) 609 static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
591 { 610 {
592 sector_t offset, chunk, vchunk; 611 sector_t offset, chunk, vchunk;
593 /* Never use conf->prev as this is only called during resync 612 /* Never use conf->prev as this is only called during resync
594 * or recovery, so reshape isn't happening 613 * or recovery, so reshape isn't happening
595 */ 614 */
596 struct geom *geo = &conf->geo; 615 struct geom *geo = &conf->geo;
616 int far_set_start = (dev / geo->far_set_size) * geo->far_set_size;
617 int far_set_size = geo->far_set_size;
597 618
598 offset = sector & geo->chunk_mask; 619 offset = sector & geo->chunk_mask;
599 if (geo->far_offset) { 620 if (geo->far_offset) {
600 int fc; 621 int fc;
601 chunk = sector >> geo->chunk_shift; 622 chunk = sector >> geo->chunk_shift;
602 fc = sector_div(chunk, geo->far_copies); 623 fc = sector_div(chunk, geo->far_copies);
603 dev -= fc * geo->near_copies; 624 dev -= fc * geo->near_copies;
604 if (dev < 0) 625 if (dev < far_set_start)
605 dev += geo->raid_disks; 626 dev += far_set_size;
606 } else { 627 } else {
607 while (sector >= geo->stride) { 628 while (sector >= geo->stride) {
608 sector -= geo->stride; 629 sector -= geo->stride;
609 if (dev < geo->near_copies) 630 if (dev < (geo->near_copies + far_set_start))
610 dev += geo->raid_disks - geo->near_copies; 631 dev += far_set_size - geo->near_copies;
611 else 632 else
612 dev -= geo->near_copies; 633 dev -= geo->near_copies;
613 } 634 }
614 chunk = sector >> geo->chunk_shift; 635 chunk = sector >> geo->chunk_shift;
615 } 636 }
616 vchunk = chunk * geo->raid_disks + dev; 637 vchunk = chunk * geo->raid_disks + dev;
617 sector_div(vchunk, geo->near_copies); 638 sector_div(vchunk, geo->near_copies);
618 return (vchunk << geo->chunk_shift) + offset; 639 return (vchunk << geo->chunk_shift) + offset;
619 } 640 }
620 641
621 /** 642 /**
622 * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged 643 * raid10_mergeable_bvec -- tell bio layer if a two requests can be merged
623 * @q: request queue 644 * @q: request queue
624 * @bvm: properties of new bio 645 * @bvm: properties of new bio
625 * @biovec: the request that could be merged to it. 646 * @biovec: the request that could be merged to it.
626 * 647 *
627 * Return amount of bytes we can accept at this offset 648 * Return amount of bytes we can accept at this offset
628 * This requires checking for end-of-chunk if near_copies != raid_disks, 649 * This requires checking for end-of-chunk if near_copies != raid_disks,
629 * and for subordinate merge_bvec_fns if merge_check_needed. 650 * and for subordinate merge_bvec_fns if merge_check_needed.
630 */ 651 */
631 static int raid10_mergeable_bvec(struct request_queue *q, 652 static int raid10_mergeable_bvec(struct request_queue *q,
632 struct bvec_merge_data *bvm, 653 struct bvec_merge_data *bvm,
633 struct bio_vec *biovec) 654 struct bio_vec *biovec)
634 { 655 {
635 struct mddev *mddev = q->queuedata; 656 struct mddev *mddev = q->queuedata;
636 struct r10conf *conf = mddev->private; 657 struct r10conf *conf = mddev->private;
637 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 658 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
638 int max; 659 int max;
639 unsigned int chunk_sectors; 660 unsigned int chunk_sectors;
640 unsigned int bio_sectors = bvm->bi_size >> 9; 661 unsigned int bio_sectors = bvm->bi_size >> 9;
641 struct geom *geo = &conf->geo; 662 struct geom *geo = &conf->geo;
642 663
643 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1; 664 chunk_sectors = (conf->geo.chunk_mask & conf->prev.chunk_mask) + 1;
644 if (conf->reshape_progress != MaxSector && 665 if (conf->reshape_progress != MaxSector &&
645 ((sector >= conf->reshape_progress) != 666 ((sector >= conf->reshape_progress) !=
646 conf->mddev->reshape_backwards)) 667 conf->mddev->reshape_backwards))
647 geo = &conf->prev; 668 geo = &conf->prev;
648 669
649 if (geo->near_copies < geo->raid_disks) { 670 if (geo->near_copies < geo->raid_disks) {
650 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) 671 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
651 + bio_sectors)) << 9; 672 + bio_sectors)) << 9;
652 if (max < 0) 673 if (max < 0)
653 /* bio_add cannot handle a negative return */ 674 /* bio_add cannot handle a negative return */
654 max = 0; 675 max = 0;
655 if (max <= biovec->bv_len && bio_sectors == 0) 676 if (max <= biovec->bv_len && bio_sectors == 0)
656 return biovec->bv_len; 677 return biovec->bv_len;
657 } else 678 } else
658 max = biovec->bv_len; 679 max = biovec->bv_len;
659 680
660 if (mddev->merge_check_needed) { 681 if (mddev->merge_check_needed) {
661 struct { 682 struct {
662 struct r10bio r10_bio; 683 struct r10bio r10_bio;
663 struct r10dev devs[conf->copies]; 684 struct r10dev devs[conf->copies];
664 } on_stack; 685 } on_stack;
665 struct r10bio *r10_bio = &on_stack.r10_bio; 686 struct r10bio *r10_bio = &on_stack.r10_bio;
666 int s; 687 int s;
667 if (conf->reshape_progress != MaxSector) { 688 if (conf->reshape_progress != MaxSector) {
668 /* Cannot give any guidance during reshape */ 689 /* Cannot give any guidance during reshape */
669 if (max <= biovec->bv_len && bio_sectors == 0) 690 if (max <= biovec->bv_len && bio_sectors == 0)
670 return biovec->bv_len; 691 return biovec->bv_len;
671 return 0; 692 return 0;
672 } 693 }
673 r10_bio->sector = sector; 694 r10_bio->sector = sector;
674 raid10_find_phys(conf, r10_bio); 695 raid10_find_phys(conf, r10_bio);
675 rcu_read_lock(); 696 rcu_read_lock();
676 for (s = 0; s < conf->copies; s++) { 697 for (s = 0; s < conf->copies; s++) {
677 int disk = r10_bio->devs[s].devnum; 698 int disk = r10_bio->devs[s].devnum;
678 struct md_rdev *rdev = rcu_dereference( 699 struct md_rdev *rdev = rcu_dereference(
679 conf->mirrors[disk].rdev); 700 conf->mirrors[disk].rdev);
680 if (rdev && !test_bit(Faulty, &rdev->flags)) { 701 if (rdev && !test_bit(Faulty, &rdev->flags)) {
681 struct request_queue *q = 702 struct request_queue *q =
682 bdev_get_queue(rdev->bdev); 703 bdev_get_queue(rdev->bdev);
683 if (q->merge_bvec_fn) { 704 if (q->merge_bvec_fn) {
684 bvm->bi_sector = r10_bio->devs[s].addr 705 bvm->bi_sector = r10_bio->devs[s].addr
685 + rdev->data_offset; 706 + rdev->data_offset;
686 bvm->bi_bdev = rdev->bdev; 707 bvm->bi_bdev = rdev->bdev;
687 max = min(max, q->merge_bvec_fn( 708 max = min(max, q->merge_bvec_fn(
688 q, bvm, biovec)); 709 q, bvm, biovec));
689 } 710 }
690 } 711 }
691 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
692 if (rdev && !test_bit(Faulty, &rdev->flags)) { 713 if (rdev && !test_bit(Faulty, &rdev->flags)) {
693 struct request_queue *q = 714 struct request_queue *q =
694 bdev_get_queue(rdev->bdev); 715 bdev_get_queue(rdev->bdev);
695 if (q->merge_bvec_fn) { 716 if (q->merge_bvec_fn) {
696 bvm->bi_sector = r10_bio->devs[s].addr 717 bvm->bi_sector = r10_bio->devs[s].addr
697 + rdev->data_offset; 718 + rdev->data_offset;
698 bvm->bi_bdev = rdev->bdev; 719 bvm->bi_bdev = rdev->bdev;
699 max = min(max, q->merge_bvec_fn( 720 max = min(max, q->merge_bvec_fn(
700 q, bvm, biovec)); 721 q, bvm, biovec));
701 } 722 }
702 } 723 }
703 } 724 }
704 rcu_read_unlock(); 725 rcu_read_unlock();
705 } 726 }
706 return max; 727 return max;
707 } 728 }
708 729
709 /* 730 /*
710 * This routine returns the disk from which the requested read should 731 * This routine returns the disk from which the requested read should
711 * be done. There is a per-array 'next expected sequential IO' sector 732 * be done. There is a per-array 'next expected sequential IO' sector
712 * number - if this matches on the next IO then we use the last disk. 733 * number - if this matches on the next IO then we use the last disk.
713 * There is also a per-disk 'last know head position' sector that is 734 * There is also a per-disk 'last know head position' sector that is
714 * maintained from IRQ contexts, both the normal and the resync IO 735 * maintained from IRQ contexts, both the normal and the resync IO
715 * completion handlers update this position correctly. If there is no 736 * completion handlers update this position correctly. If there is no
716 * perfect sequential match then we pick the disk whose head is closest. 737 * perfect sequential match then we pick the disk whose head is closest.
717 * 738 *
718 * If there are 2 mirrors in the same 2 devices, performance degrades 739 * If there are 2 mirrors in the same 2 devices, performance degrades
719 * because position is mirror, not device based. 740 * because position is mirror, not device based.
720 * 741 *
721 * The rdev for the device selected will have nr_pending incremented. 742 * The rdev for the device selected will have nr_pending incremented.
722 */ 743 */
723 744
724 /* 745 /*
725 * FIXME: possibly should rethink readbalancing and do it differently 746 * FIXME: possibly should rethink readbalancing and do it differently
726 * depending on near_copies / far_copies geometry. 747 * depending on near_copies / far_copies geometry.
727 */ 748 */
728 static struct md_rdev *read_balance(struct r10conf *conf, 749 static struct md_rdev *read_balance(struct r10conf *conf,
729 struct r10bio *r10_bio, 750 struct r10bio *r10_bio,
730 int *max_sectors) 751 int *max_sectors)
731 { 752 {
732 const sector_t this_sector = r10_bio->sector; 753 const sector_t this_sector = r10_bio->sector;
733 int disk, slot; 754 int disk, slot;
734 int sectors = r10_bio->sectors; 755 int sectors = r10_bio->sectors;
735 int best_good_sectors; 756 int best_good_sectors;
736 sector_t new_distance, best_dist; 757 sector_t new_distance, best_dist;
737 struct md_rdev *best_rdev, *rdev = NULL; 758 struct md_rdev *best_rdev, *rdev = NULL;
738 int do_balance; 759 int do_balance;
739 int best_slot; 760 int best_slot;
740 struct geom *geo = &conf->geo; 761 struct geom *geo = &conf->geo;
741 762
742 raid10_find_phys(conf, r10_bio); 763 raid10_find_phys(conf, r10_bio);
743 rcu_read_lock(); 764 rcu_read_lock();
744 retry: 765 retry:
745 sectors = r10_bio->sectors; 766 sectors = r10_bio->sectors;
746 best_slot = -1; 767 best_slot = -1;
747 best_rdev = NULL; 768 best_rdev = NULL;
748 best_dist = MaxSector; 769 best_dist = MaxSector;
749 best_good_sectors = 0; 770 best_good_sectors = 0;
750 do_balance = 1; 771 do_balance = 1;
751 /* 772 /*
752 * Check if we can balance. We can balance on the whole 773 * Check if we can balance. We can balance on the whole
753 * device if no resync is going on (recovery is ok), or below 774 * device if no resync is going on (recovery is ok), or below
754 * the resync window. We take the first readable disk when 775 * the resync window. We take the first readable disk when
755 * above the resync window. 776 * above the resync window.
756 */ 777 */
757 if (conf->mddev->recovery_cp < MaxSector 778 if (conf->mddev->recovery_cp < MaxSector
758 && (this_sector + sectors >= conf->next_resync)) 779 && (this_sector + sectors >= conf->next_resync))
759 do_balance = 0; 780 do_balance = 0;
760 781
761 for (slot = 0; slot < conf->copies ; slot++) { 782 for (slot = 0; slot < conf->copies ; slot++) {
762 sector_t first_bad; 783 sector_t first_bad;
763 int bad_sectors; 784 int bad_sectors;
764 sector_t dev_sector; 785 sector_t dev_sector;
765 786
766 if (r10_bio->devs[slot].bio == IO_BLOCKED) 787 if (r10_bio->devs[slot].bio == IO_BLOCKED)
767 continue; 788 continue;
768 disk = r10_bio->devs[slot].devnum; 789 disk = r10_bio->devs[slot].devnum;
769 rdev = rcu_dereference(conf->mirrors[disk].replacement); 790 rdev = rcu_dereference(conf->mirrors[disk].replacement);
770 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 791 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
771 test_bit(Unmerged, &rdev->flags) || 792 test_bit(Unmerged, &rdev->flags) ||
772 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 793 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
773 rdev = rcu_dereference(conf->mirrors[disk].rdev); 794 rdev = rcu_dereference(conf->mirrors[disk].rdev);
774 if (rdev == NULL || 795 if (rdev == NULL ||
775 test_bit(Faulty, &rdev->flags) || 796 test_bit(Faulty, &rdev->flags) ||
776 test_bit(Unmerged, &rdev->flags)) 797 test_bit(Unmerged, &rdev->flags))
777 continue; 798 continue;
778 if (!test_bit(In_sync, &rdev->flags) && 799 if (!test_bit(In_sync, &rdev->flags) &&
779 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 800 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
780 continue; 801 continue;
781 802
782 dev_sector = r10_bio->devs[slot].addr; 803 dev_sector = r10_bio->devs[slot].addr;
783 if (is_badblock(rdev, dev_sector, sectors, 804 if (is_badblock(rdev, dev_sector, sectors,
784 &first_bad, &bad_sectors)) { 805 &first_bad, &bad_sectors)) {
785 if (best_dist < MaxSector) 806 if (best_dist < MaxSector)
786 /* Already have a better slot */ 807 /* Already have a better slot */
787 continue; 808 continue;
788 if (first_bad <= dev_sector) { 809 if (first_bad <= dev_sector) {
789 /* Cannot read here. If this is the 810 /* Cannot read here. If this is the
790 * 'primary' device, then we must not read 811 * 'primary' device, then we must not read
791 * beyond 'bad_sectors' from another device. 812 * beyond 'bad_sectors' from another device.
792 */ 813 */
793 bad_sectors -= (dev_sector - first_bad); 814 bad_sectors -= (dev_sector - first_bad);
794 if (!do_balance && sectors > bad_sectors) 815 if (!do_balance && sectors > bad_sectors)
795 sectors = bad_sectors; 816 sectors = bad_sectors;
796 if (best_good_sectors > sectors) 817 if (best_good_sectors > sectors)
797 best_good_sectors = sectors; 818 best_good_sectors = sectors;
798 } else { 819 } else {
799 sector_t good_sectors = 820 sector_t good_sectors =
800 first_bad - dev_sector; 821 first_bad - dev_sector;
801 if (good_sectors > best_good_sectors) { 822 if (good_sectors > best_good_sectors) {
802 best_good_sectors = good_sectors; 823 best_good_sectors = good_sectors;
803 best_slot = slot; 824 best_slot = slot;
804 best_rdev = rdev; 825 best_rdev = rdev;
805 } 826 }
806 if (!do_balance) 827 if (!do_balance)
807 /* Must read from here */ 828 /* Must read from here */
808 break; 829 break;
809 } 830 }
810 continue; 831 continue;
811 } else 832 } else
812 best_good_sectors = sectors; 833 best_good_sectors = sectors;
813 834
814 if (!do_balance) 835 if (!do_balance)
815 break; 836 break;
816 837
817 /* This optimisation is debatable, and completely destroys 838 /* This optimisation is debatable, and completely destroys
818 * sequential read speed for 'far copies' arrays. So only 839 * sequential read speed for 'far copies' arrays. So only
819 * keep it for 'near' arrays, and review those later. 840 * keep it for 'near' arrays, and review those later.
820 */ 841 */
821 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending)) 842 if (geo->near_copies > 1 && !atomic_read(&rdev->nr_pending))
822 break; 843 break;
823 844
824 /* for far > 1 always use the lowest address */ 845 /* for far > 1 always use the lowest address */
825 if (geo->far_copies > 1) 846 if (geo->far_copies > 1)
826 new_distance = r10_bio->devs[slot].addr; 847 new_distance = r10_bio->devs[slot].addr;
827 else 848 else
828 new_distance = abs(r10_bio->devs[slot].addr - 849 new_distance = abs(r10_bio->devs[slot].addr -
829 conf->mirrors[disk].head_position); 850 conf->mirrors[disk].head_position);
830 if (new_distance < best_dist) { 851 if (new_distance < best_dist) {
831 best_dist = new_distance; 852 best_dist = new_distance;
832 best_slot = slot; 853 best_slot = slot;
833 best_rdev = rdev; 854 best_rdev = rdev;
834 } 855 }
835 } 856 }
836 if (slot >= conf->copies) { 857 if (slot >= conf->copies) {
837 slot = best_slot; 858 slot = best_slot;
838 rdev = best_rdev; 859 rdev = best_rdev;
839 } 860 }
840 861
841 if (slot >= 0) { 862 if (slot >= 0) {
842 atomic_inc(&rdev->nr_pending); 863 atomic_inc(&rdev->nr_pending);
843 if (test_bit(Faulty, &rdev->flags)) { 864 if (test_bit(Faulty, &rdev->flags)) {
844 /* Cannot risk returning a device that failed 865 /* Cannot risk returning a device that failed
845 * before we inc'ed nr_pending 866 * before we inc'ed nr_pending
846 */ 867 */
847 rdev_dec_pending(rdev, conf->mddev); 868 rdev_dec_pending(rdev, conf->mddev);
848 goto retry; 869 goto retry;
849 } 870 }
850 r10_bio->read_slot = slot; 871 r10_bio->read_slot = slot;
851 } else 872 } else
852 rdev = NULL; 873 rdev = NULL;
853 rcu_read_unlock(); 874 rcu_read_unlock();
854 *max_sectors = best_good_sectors; 875 *max_sectors = best_good_sectors;
855 876
856 return rdev; 877 return rdev;
857 } 878 }
858 879
859 int md_raid10_congested(struct mddev *mddev, int bits) 880 int md_raid10_congested(struct mddev *mddev, int bits)
860 { 881 {
861 struct r10conf *conf = mddev->private; 882 struct r10conf *conf = mddev->private;
862 int i, ret = 0; 883 int i, ret = 0;
863 884
864 if ((bits & (1 << BDI_async_congested)) && 885 if ((bits & (1 << BDI_async_congested)) &&
865 conf->pending_count >= max_queued_requests) 886 conf->pending_count >= max_queued_requests)
866 return 1; 887 return 1;
867 888
868 rcu_read_lock(); 889 rcu_read_lock();
869 for (i = 0; 890 for (i = 0;
870 (i < conf->geo.raid_disks || i < conf->prev.raid_disks) 891 (i < conf->geo.raid_disks || i < conf->prev.raid_disks)
871 && ret == 0; 892 && ret == 0;
872 i++) { 893 i++) {
873 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 894 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
874 if (rdev && !test_bit(Faulty, &rdev->flags)) { 895 if (rdev && !test_bit(Faulty, &rdev->flags)) {
875 struct request_queue *q = bdev_get_queue(rdev->bdev); 896 struct request_queue *q = bdev_get_queue(rdev->bdev);
876 897
877 ret |= bdi_congested(&q->backing_dev_info, bits); 898 ret |= bdi_congested(&q->backing_dev_info, bits);
878 } 899 }
879 } 900 }
880 rcu_read_unlock(); 901 rcu_read_unlock();
881 return ret; 902 return ret;
882 } 903 }
883 EXPORT_SYMBOL_GPL(md_raid10_congested); 904 EXPORT_SYMBOL_GPL(md_raid10_congested);
884 905
885 static int raid10_congested(void *data, int bits) 906 static int raid10_congested(void *data, int bits)
886 { 907 {
887 struct mddev *mddev = data; 908 struct mddev *mddev = data;
888 909
889 return mddev_congested(mddev, bits) || 910 return mddev_congested(mddev, bits) ||
890 md_raid10_congested(mddev, bits); 911 md_raid10_congested(mddev, bits);
891 } 912 }
892 913
893 static void flush_pending_writes(struct r10conf *conf) 914 static void flush_pending_writes(struct r10conf *conf)
894 { 915 {
895 /* Any writes that have been queued but are awaiting 916 /* Any writes that have been queued but are awaiting
896 * bitmap updates get flushed here. 917 * bitmap updates get flushed here.
897 */ 918 */
898 spin_lock_irq(&conf->device_lock); 919 spin_lock_irq(&conf->device_lock);
899 920
900 if (conf->pending_bio_list.head) { 921 if (conf->pending_bio_list.head) {
901 struct bio *bio; 922 struct bio *bio;
902 bio = bio_list_get(&conf->pending_bio_list); 923 bio = bio_list_get(&conf->pending_bio_list);
903 conf->pending_count = 0; 924 conf->pending_count = 0;
904 spin_unlock_irq(&conf->device_lock); 925 spin_unlock_irq(&conf->device_lock);
905 /* flush any pending bitmap writes to disk 926 /* flush any pending bitmap writes to disk
906 * before proceeding w/ I/O */ 927 * before proceeding w/ I/O */
907 bitmap_unplug(conf->mddev->bitmap); 928 bitmap_unplug(conf->mddev->bitmap);
908 wake_up(&conf->wait_barrier); 929 wake_up(&conf->wait_barrier);
909 930
910 while (bio) { /* submit pending writes */ 931 while (bio) { /* submit pending writes */
911 struct bio *next = bio->bi_next; 932 struct bio *next = bio->bi_next;
912 bio->bi_next = NULL; 933 bio->bi_next = NULL;
913 if (unlikely((bio->bi_rw & REQ_DISCARD) && 934 if (unlikely((bio->bi_rw & REQ_DISCARD) &&
914 !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) 935 !blk_queue_discard(bdev_get_queue(bio->bi_bdev))))
915 /* Just ignore it */ 936 /* Just ignore it */
916 bio_endio(bio, 0); 937 bio_endio(bio, 0);
917 else 938 else
918 generic_make_request(bio); 939 generic_make_request(bio);
919 bio = next; 940 bio = next;
920 } 941 }
921 } else 942 } else
922 spin_unlock_irq(&conf->device_lock); 943 spin_unlock_irq(&conf->device_lock);
923 } 944 }
924 945
925 /* Barriers.... 946 /* Barriers....
926 * Sometimes we need to suspend IO while we do something else, 947 * Sometimes we need to suspend IO while we do something else,
927 * either some resync/recovery, or reconfigure the array. 948 * either some resync/recovery, or reconfigure the array.
928 * To do this we raise a 'barrier'. 949 * To do this we raise a 'barrier'.
929 * The 'barrier' is a counter that can be raised multiple times 950 * The 'barrier' is a counter that can be raised multiple times
930 * to count how many activities are happening which preclude 951 * to count how many activities are happening which preclude
931 * normal IO. 952 * normal IO.
932 * We can only raise the barrier if there is no pending IO. 953 * We can only raise the barrier if there is no pending IO.
933 * i.e. if nr_pending == 0. 954 * i.e. if nr_pending == 0.
934 * We choose only to raise the barrier if no-one is waiting for the 955 * We choose only to raise the barrier if no-one is waiting for the
935 * barrier to go down. This means that as soon as an IO request 956 * barrier to go down. This means that as soon as an IO request
936 * is ready, no other operations which require a barrier will start 957 * is ready, no other operations which require a barrier will start
937 * until the IO request has had a chance. 958 * until the IO request has had a chance.
938 * 959 *
939 * So: regular IO calls 'wait_barrier'. When that returns there 960 * So: regular IO calls 'wait_barrier'. When that returns there
940 * is no background IO happening. It must arrange to call 961 * is no background IO happening. It must arrange to call
941 * allow_barrier when it has finished its IO. 962 * allow_barrier when it has finished its IO.
942 * background IO calls must call raise_barrier. Once that returns 963 * background IO calls must call raise_barrier. Once that returns
943 * there is no normal IO happening. It must arrange to call 964 * there is no normal IO happening. It must arrange to call
944 * lower_barrier when the particular background IO completes. 965 * lower_barrier when the particular background IO completes.
945 */ 966 */
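From the caller's side, the contract described above pairs up as follows. This is only an illustrative sketch of how the four functions defined below are meant to be used, not an actual call site in this file:

static void normal_io_example(struct r10conf *conf)
{
	wait_barrier(conf);	/* blocks while resync/recovery holds the barrier */
	/* ... issue the request; nr_pending keeps the barrier from rising ... */
	allow_barrier(conf);	/* called once the IO has finished */
}

static void background_io_example(struct r10conf *conf)
{
	raise_barrier(conf, 0);	/* waits for pending normal IO to drain */
	/* ... do one unit of resync/recovery IO ... */
	lower_barrier(conf);	/* lets normal IO resume */
}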
946 967
947 static void raise_barrier(struct r10conf *conf, int force) 968 static void raise_barrier(struct r10conf *conf, int force)
948 { 969 {
949 BUG_ON(force && !conf->barrier); 970 BUG_ON(force && !conf->barrier);
950 spin_lock_irq(&conf->resync_lock); 971 spin_lock_irq(&conf->resync_lock);
951 972
952 /* Wait until no block IO is waiting (unless 'force') */ 973 /* Wait until no block IO is waiting (unless 'force') */
953 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting, 974 wait_event_lock_irq(conf->wait_barrier, force || !conf->nr_waiting,
954 conf->resync_lock); 975 conf->resync_lock);
955 976
956 /* block any new IO from starting */ 977 /* block any new IO from starting */
957 conf->barrier++; 978 conf->barrier++;
958 979
959 /* Now wait for all pending IO to complete */ 980 /* Now wait for all pending IO to complete */
960 wait_event_lock_irq(conf->wait_barrier, 981 wait_event_lock_irq(conf->wait_barrier,
961 !conf->nr_pending && conf->barrier < RESYNC_DEPTH, 982 !conf->nr_pending && conf->barrier < RESYNC_DEPTH,
962 conf->resync_lock); 983 conf->resync_lock);
963 984
964 spin_unlock_irq(&conf->resync_lock); 985 spin_unlock_irq(&conf->resync_lock);
965 } 986 }
966 987
967 static void lower_barrier(struct r10conf *conf) 988 static void lower_barrier(struct r10conf *conf)
968 { 989 {
969 unsigned long flags; 990 unsigned long flags;
970 spin_lock_irqsave(&conf->resync_lock, flags); 991 spin_lock_irqsave(&conf->resync_lock, flags);
971 conf->barrier--; 992 conf->barrier--;
972 spin_unlock_irqrestore(&conf->resync_lock, flags); 993 spin_unlock_irqrestore(&conf->resync_lock, flags);
973 wake_up(&conf->wait_barrier); 994 wake_up(&conf->wait_barrier);
974 } 995 }
975 996
976 static void wait_barrier(struct r10conf *conf) 997 static void wait_barrier(struct r10conf *conf)
977 { 998 {
978 spin_lock_irq(&conf->resync_lock); 999 spin_lock_irq(&conf->resync_lock);
979 if (conf->barrier) { 1000 if (conf->barrier) {
980 conf->nr_waiting++; 1001 conf->nr_waiting++;
981 /* Wait for the barrier to drop. 1002 /* Wait for the barrier to drop.
982 * However if there are already pending 1003 * However if there are already pending
983 * requests (preventing the barrier from 1004 * requests (preventing the barrier from
984 * rising completely), and the 1005 * rising completely), and the
985 * pre-process bio queue isn't empty, 1006 * pre-process bio queue isn't empty,
986 * then don't wait, as we need to empty 1007 * then don't wait, as we need to empty
987 * that queue to get the nr_pending 1008 * that queue to get the nr_pending
988 * count down. 1009 * count down.
989 */ 1010 */
990 wait_event_lock_irq(conf->wait_barrier, 1011 wait_event_lock_irq(conf->wait_barrier,
991 !conf->barrier || 1012 !conf->barrier ||
992 (conf->nr_pending && 1013 (conf->nr_pending &&
993 current->bio_list && 1014 current->bio_list &&
994 !bio_list_empty(current->bio_list)), 1015 !bio_list_empty(current->bio_list)),
995 conf->resync_lock); 1016 conf->resync_lock);
996 conf->nr_waiting--; 1017 conf->nr_waiting--;
997 } 1018 }
998 conf->nr_pending++; 1019 conf->nr_pending++;
999 spin_unlock_irq(&conf->resync_lock); 1020 spin_unlock_irq(&conf->resync_lock);
1000 } 1021 }
1001 1022
1002 static void allow_barrier(struct r10conf *conf) 1023 static void allow_barrier(struct r10conf *conf)
1003 { 1024 {
1004 unsigned long flags; 1025 unsigned long flags;
1005 spin_lock_irqsave(&conf->resync_lock, flags); 1026 spin_lock_irqsave(&conf->resync_lock, flags);
1006 conf->nr_pending--; 1027 conf->nr_pending--;
1007 spin_unlock_irqrestore(&conf->resync_lock, flags); 1028 spin_unlock_irqrestore(&conf->resync_lock, flags);
1008 wake_up(&conf->wait_barrier); 1029 wake_up(&conf->wait_barrier);
1009 } 1030 }
1010 1031
1011 static void freeze_array(struct r10conf *conf) 1032 static void freeze_array(struct r10conf *conf)
1012 { 1033 {
1013 /* stop sync IO and normal IO and wait for everything to 1034 /* stop sync IO and normal IO and wait for everything to
1014 * go quiet. 1035 * go quiet.
1015 * We increment barrier and nr_waiting, and then 1036 * We increment barrier and nr_waiting, and then
1016 * wait until nr_pending matches nr_queued+1 1037 * wait until nr_pending matches nr_queued+1
1017 * This is called in the context of one normal IO request 1038 * This is called in the context of one normal IO request
1018 * that has failed. Thus any sync request that might be pending 1039 * that has failed. Thus any sync request that might be pending
1019 * will be blocked by nr_pending, and we need to wait for 1040 * will be blocked by nr_pending, and we need to wait for
1020 * pending IO requests to complete or be queued for re-try. 1041 * pending IO requests to complete or be queued for re-try.
1021 * Thus the number queued (nr_queued) plus this request (1) 1042 * Thus the number queued (nr_queued) plus this request (1)
1022 * must match the number of pending IOs (nr_pending) before 1043 * must match the number of pending IOs (nr_pending) before
1023 * we continue. 1044 * we continue.
1024 */ 1045 */
1025 spin_lock_irq(&conf->resync_lock); 1046 spin_lock_irq(&conf->resync_lock);
1026 conf->barrier++; 1047 conf->barrier++;
1027 conf->nr_waiting++; 1048 conf->nr_waiting++;
1028 wait_event_lock_irq_cmd(conf->wait_barrier, 1049 wait_event_lock_irq_cmd(conf->wait_barrier,
1029 conf->nr_pending == conf->nr_queued+1, 1050 conf->nr_pending == conf->nr_queued+1,
1030 conf->resync_lock, 1051 conf->resync_lock,
1031 flush_pending_writes(conf)); 1052 flush_pending_writes(conf));
1032 1053
1033 spin_unlock_irq(&conf->resync_lock); 1054 spin_unlock_irq(&conf->resync_lock);
1034 } 1055 }
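As a concrete illustration of the wait condition above (the counts are hypothetical): suppose three normal requests are in flight when one of them fails and freeze_array() is entered on its behalf.

	nr_pending = 3, nr_queued = 0   ->  3 != 0 + 1, keep waiting
	one outstanding request completes:
	nr_pending = 2, nr_queued = 0   ->  2 != 0 + 1, keep waiting
	the remaining request is queued for retry:
	nr_pending = 2, nr_queued = 1   ->  2 == 1 + 1, the array is frozen

The flush_pending_writes() call passed as the command to wait_event_lock_irq_cmd() keeps queued writes moving so the counts can actually converge.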
1035 1056
1036 static void unfreeze_array(struct r10conf *conf) 1057 static void unfreeze_array(struct r10conf *conf)
1037 { 1058 {
1038 /* reverse the effect of the freeze */ 1059 /* reverse the effect of the freeze */
1039 spin_lock_irq(&conf->resync_lock); 1060 spin_lock_irq(&conf->resync_lock);
1040 conf->barrier--; 1061 conf->barrier--;
1041 conf->nr_waiting--; 1062 conf->nr_waiting--;
1042 wake_up(&conf->wait_barrier); 1063 wake_up(&conf->wait_barrier);
1043 spin_unlock_irq(&conf->resync_lock); 1064 spin_unlock_irq(&conf->resync_lock);
1044 } 1065 }
1045 1066
1046 static sector_t choose_data_offset(struct r10bio *r10_bio, 1067 static sector_t choose_data_offset(struct r10bio *r10_bio,
1047 struct md_rdev *rdev) 1068 struct md_rdev *rdev)
1048 { 1069 {
1049 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) || 1070 if (!test_bit(MD_RECOVERY_RESHAPE, &rdev->mddev->recovery) ||
1050 test_bit(R10BIO_Previous, &r10_bio->state)) 1071 test_bit(R10BIO_Previous, &r10_bio->state))
1051 return rdev->data_offset; 1072 return rdev->data_offset;
1052 else 1073 else
1053 return rdev->new_data_offset; 1074 return rdev->new_data_offset;
1054 } 1075 }
1055 1076
1056 struct raid10_plug_cb { 1077 struct raid10_plug_cb {
1057 struct blk_plug_cb cb; 1078 struct blk_plug_cb cb;
1058 struct bio_list pending; 1079 struct bio_list pending;
1059 int pending_cnt; 1080 int pending_cnt;
1060 }; 1081 };
1061 1082
1062 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) 1083 static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
1063 { 1084 {
1064 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, 1085 struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb,
1065 cb); 1086 cb);
1066 struct mddev *mddev = plug->cb.data; 1087 struct mddev *mddev = plug->cb.data;
1067 struct r10conf *conf = mddev->private; 1088 struct r10conf *conf = mddev->private;
1068 struct bio *bio; 1089 struct bio *bio;
1069 1090
1070 if (from_schedule || current->bio_list) { 1091 if (from_schedule || current->bio_list) {
1071 spin_lock_irq(&conf->device_lock); 1092 spin_lock_irq(&conf->device_lock);
1072 bio_list_merge(&conf->pending_bio_list, &plug->pending); 1093 bio_list_merge(&conf->pending_bio_list, &plug->pending);
1073 conf->pending_count += plug->pending_cnt; 1094 conf->pending_count += plug->pending_cnt;
1074 spin_unlock_irq(&conf->device_lock); 1095 spin_unlock_irq(&conf->device_lock);
1075 md_wakeup_thread(mddev->thread); 1096 md_wakeup_thread(mddev->thread);
1076 kfree(plug); 1097 kfree(plug);
1077 return; 1098 return;
1078 } 1099 }
1079 1100
1080 /* we aren't scheduling, so we can do the write-out directly. */ 1101 /* we aren't scheduling, so we can do the write-out directly. */
1081 bio = bio_list_get(&plug->pending); 1102 bio = bio_list_get(&plug->pending);
1082 bitmap_unplug(mddev->bitmap); 1103 bitmap_unplug(mddev->bitmap);
1083 wake_up(&conf->wait_barrier); 1104 wake_up(&conf->wait_barrier);
1084 1105
1085 while (bio) { /* submit pending writes */ 1106 while (bio) { /* submit pending writes */
1086 struct bio *next = bio->bi_next; 1107 struct bio *next = bio->bi_next;
1087 bio->bi_next = NULL; 1108 bio->bi_next = NULL;
1088 generic_make_request(bio); 1109 generic_make_request(bio);
1089 bio = next; 1110 bio = next;
1090 } 1111 }
1091 kfree(plug); 1112 kfree(plug);
1092 } 1113 }
1093 1114
1094 static void make_request(struct mddev *mddev, struct bio * bio) 1115 static void make_request(struct mddev *mddev, struct bio * bio)
1095 { 1116 {
1096 struct r10conf *conf = mddev->private; 1117 struct r10conf *conf = mddev->private;
1097 struct r10bio *r10_bio; 1118 struct r10bio *r10_bio;
1098 struct bio *read_bio; 1119 struct bio *read_bio;
1099 int i; 1120 int i;
1100 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask); 1121 sector_t chunk_mask = (conf->geo.chunk_mask & conf->prev.chunk_mask);
1101 int chunk_sects = chunk_mask + 1; 1122 int chunk_sects = chunk_mask + 1;
1102 const int rw = bio_data_dir(bio); 1123 const int rw = bio_data_dir(bio);
1103 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); 1124 const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
1104 const unsigned long do_fua = (bio->bi_rw & REQ_FUA); 1125 const unsigned long do_fua = (bio->bi_rw & REQ_FUA);
1105 const unsigned long do_discard = (bio->bi_rw 1126 const unsigned long do_discard = (bio->bi_rw
1106 & (REQ_DISCARD | REQ_SECURE)); 1127 & (REQ_DISCARD | REQ_SECURE));
1107 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME); 1128 const unsigned long do_same = (bio->bi_rw & REQ_WRITE_SAME);
1108 unsigned long flags; 1129 unsigned long flags;
1109 struct md_rdev *blocked_rdev; 1130 struct md_rdev *blocked_rdev;
1110 struct blk_plug_cb *cb; 1131 struct blk_plug_cb *cb;
1111 struct raid10_plug_cb *plug = NULL; 1132 struct raid10_plug_cb *plug = NULL;
1112 int sectors_handled; 1133 int sectors_handled;
1113 int max_sectors; 1134 int max_sectors;
1114 int sectors; 1135 int sectors;
1115 1136
1116 if (unlikely(bio->bi_rw & REQ_FLUSH)) { 1137 if (unlikely(bio->bi_rw & REQ_FLUSH)) {
1117 md_flush_request(mddev, bio); 1138 md_flush_request(mddev, bio);
1118 return; 1139 return;
1119 } 1140 }
1120 1141
1121 /* If this request crosses a chunk boundary, we need to 1142 /* If this request crosses a chunk boundary, we need to
1122 * split it. This will only happen for requests of 1 PAGE or less. 1143 * split it. This will only happen for requests of 1 PAGE or less.
1123 */ 1144 */
1124 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9) 1145 if (unlikely((bio->bi_sector & chunk_mask) + (bio->bi_size >> 9)
1125 > chunk_sects 1146 > chunk_sects
1126 && (conf->geo.near_copies < conf->geo.raid_disks 1147 && (conf->geo.near_copies < conf->geo.raid_disks
1127 || conf->prev.near_copies < conf->prev.raid_disks))) { 1148 || conf->prev.near_copies < conf->prev.raid_disks))) {
1128 struct bio_pair *bp; 1149 struct bio_pair *bp;
1129 /* Sanity check -- queue functions should prevent this happening */ 1150 /* Sanity check -- queue functions should prevent this happening */
1130 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || 1151 if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) ||
1131 bio->bi_idx != 0) 1152 bio->bi_idx != 0)
1132 goto bad_map; 1153 goto bad_map;
1133 /* This is a one page bio that upper layers 1154 /* This is a one page bio that upper layers
1134 * refuse to split for us, so we need to split it. 1155 * refuse to split for us, so we need to split it.
1135 */ 1156 */
1136 bp = bio_split(bio, 1157 bp = bio_split(bio,
1137 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) ); 1158 chunk_sects - (bio->bi_sector & (chunk_sects - 1)) );
1138 1159
1139 /* Each of these 'make_request' calls will call 'wait_barrier'. 1160 /* Each of these 'make_request' calls will call 'wait_barrier'.
1140 * If the first succeeds but the second blocks due to the resync 1161 * If the first succeeds but the second blocks due to the resync
1141 * thread raising the barrier, we will deadlock because the 1162 * thread raising the barrier, we will deadlock because the
1142 * IO to the underlying device will be queued in generic_make_request 1163 * IO to the underlying device will be queued in generic_make_request
1143 * and will never complete, so will never reduce nr_pending. 1164 * and will never complete, so will never reduce nr_pending.
1144 * So increment nr_waiting here so no new raise_barriers will 1165 * So increment nr_waiting here so no new raise_barriers will
1145 * succeed, and so the second wait_barrier cannot block. 1166 * succeed, and so the second wait_barrier cannot block.
1146 */ 1167 */
1147 spin_lock_irq(&conf->resync_lock); 1168 spin_lock_irq(&conf->resync_lock);
1148 conf->nr_waiting++; 1169 conf->nr_waiting++;
1149 spin_unlock_irq(&conf->resync_lock); 1170 spin_unlock_irq(&conf->resync_lock);
1150 1171
1151 make_request(mddev, &bp->bio1); 1172 make_request(mddev, &bp->bio1);
1152 make_request(mddev, &bp->bio2); 1173 make_request(mddev, &bp->bio2);
1153 1174
1154 spin_lock_irq(&conf->resync_lock); 1175 spin_lock_irq(&conf->resync_lock);
1155 conf->nr_waiting--; 1176 conf->nr_waiting--;
1156 wake_up(&conf->wait_barrier); 1177 wake_up(&conf->wait_barrier);
1157 spin_unlock_irq(&conf->resync_lock); 1178 spin_unlock_irq(&conf->resync_lock);
1158 1179
1159 bio_pair_release(bp); 1180 bio_pair_release(bp);
1160 return; 1181 return;
1161 bad_map: 1182 bad_map:
1162 printk("md/raid10:%s: make_request bug: can't convert block across chunks" 1183 printk("md/raid10:%s: make_request bug: can't convert block across chunks"
1163 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2, 1184 " or bigger than %dk %llu %d\n", mdname(mddev), chunk_sects/2,
1164 (unsigned long long)bio->bi_sector, bio->bi_size >> 10); 1185 (unsigned long long)bio->bi_sector, bio->bi_size >> 10);
1165 1186
1166 bio_io_error(bio); 1187 bio_io_error(bio);
1167 return; 1188 return;
1168 } 1189 }
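To make the split point above concrete, a worked example with hypothetical values:

	chunk_sects           = 128   (64K chunks)
	bio->bi_sector        = 1000
	offset within chunk   = 1000 & (128 - 1) = 104
	first fragment        = 128 - 104        =  24 sectors
	second fragment start = 1000 + 24        = 1024, the next chunk boundary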
1169 1190
1170 md_write_start(mddev, bio); 1191 md_write_start(mddev, bio);
1171 1192
1172 /* 1193 /*
1173 * Register the new request and wait if the reconstruction 1194 * Register the new request and wait if the reconstruction
1174 * thread has put up a bar for new requests. 1195 * thread has put up a bar for new requests.
1175 * Continue immediately if no resync is active currently. 1196 * Continue immediately if no resync is active currently.
1176 */ 1197 */
1177 wait_barrier(conf); 1198 wait_barrier(conf);
1178 1199
1179 sectors = bio->bi_size >> 9; 1200 sectors = bio->bi_size >> 9;
1180 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1201 while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1181 bio->bi_sector < conf->reshape_progress && 1202 bio->bi_sector < conf->reshape_progress &&
1182 bio->bi_sector + sectors > conf->reshape_progress) { 1203 bio->bi_sector + sectors > conf->reshape_progress) {
1183 /* IO spans the reshape position. Need to wait for 1204 /* IO spans the reshape position. Need to wait for
1184 * reshape to pass 1205 * reshape to pass
1185 */ 1206 */
1186 allow_barrier(conf); 1207 allow_barrier(conf);
1187 wait_event(conf->wait_barrier, 1208 wait_event(conf->wait_barrier,
1188 conf->reshape_progress <= bio->bi_sector || 1209 conf->reshape_progress <= bio->bi_sector ||
1189 conf->reshape_progress >= bio->bi_sector + sectors); 1210 conf->reshape_progress >= bio->bi_sector + sectors);
1190 wait_barrier(conf); 1211 wait_barrier(conf);
1191 } 1212 }
1192 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 1213 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
1193 bio_data_dir(bio) == WRITE && 1214 bio_data_dir(bio) == WRITE &&
1194 (mddev->reshape_backwards 1215 (mddev->reshape_backwards
1195 ? (bio->bi_sector < conf->reshape_safe && 1216 ? (bio->bi_sector < conf->reshape_safe &&
1196 bio->bi_sector + sectors > conf->reshape_progress) 1217 bio->bi_sector + sectors > conf->reshape_progress)
1197 : (bio->bi_sector + sectors > conf->reshape_safe && 1218 : (bio->bi_sector + sectors > conf->reshape_safe &&
1198 bio->bi_sector < conf->reshape_progress))) { 1219 bio->bi_sector < conf->reshape_progress))) {
1199 /* Need to update reshape_position in metadata */ 1220 /* Need to update reshape_position in metadata */
1200 mddev->reshape_position = conf->reshape_progress; 1221 mddev->reshape_position = conf->reshape_progress;
1201 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1222 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1202 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1223 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1203 md_wakeup_thread(mddev->thread); 1224 md_wakeup_thread(mddev->thread);
1204 wait_event(mddev->sb_wait, 1225 wait_event(mddev->sb_wait,
1205 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 1226 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
1206 1227
1207 conf->reshape_safe = mddev->reshape_position; 1228 conf->reshape_safe = mddev->reshape_position;
1208 } 1229 }
1209 1230
1210 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1231 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1211 1232
1212 r10_bio->master_bio = bio; 1233 r10_bio->master_bio = bio;
1213 r10_bio->sectors = sectors; 1234 r10_bio->sectors = sectors;
1214 1235
1215 r10_bio->mddev = mddev; 1236 r10_bio->mddev = mddev;
1216 r10_bio->sector = bio->bi_sector; 1237 r10_bio->sector = bio->bi_sector;
1217 r10_bio->state = 0; 1238 r10_bio->state = 0;
1218 1239
1219 /* We might need to issue multiple reads to different 1240 /* We might need to issue multiple reads to different
1220 * devices if there are bad blocks around, so we keep 1241 * devices if there are bad blocks around, so we keep
1221 * track of the number of reads in bio->bi_phys_segments. 1242 * track of the number of reads in bio->bi_phys_segments.
1222 * If this is 0, there is only one r10_bio and no locking 1243 * If this is 0, there is only one r10_bio and no locking
1223 * will be needed when the request completes. If it is 1244 * will be needed when the request completes. If it is
1224 * non-zero, then it is the number of not-completed requests. 1245 * non-zero, then it is the number of not-completed requests.
1225 */ 1246 */
1226 bio->bi_phys_segments = 0; 1247 bio->bi_phys_segments = 0;
1227 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 1248 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
1228 1249
1229 if (rw == READ) { 1250 if (rw == READ) {
1230 /* 1251 /*
1231 * read balancing logic: 1252 * read balancing logic:
1232 */ 1253 */
1233 struct md_rdev *rdev; 1254 struct md_rdev *rdev;
1234 int slot; 1255 int slot;
1235 1256
1236 read_again: 1257 read_again:
1237 rdev = read_balance(conf, r10_bio, &max_sectors); 1258 rdev = read_balance(conf, r10_bio, &max_sectors);
1238 if (!rdev) { 1259 if (!rdev) {
1239 raid_end_bio_io(r10_bio); 1260 raid_end_bio_io(r10_bio);
1240 return; 1261 return;
1241 } 1262 }
1242 slot = r10_bio->read_slot; 1263 slot = r10_bio->read_slot;
1243 1264
1244 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1265 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1245 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector, 1266 md_trim_bio(read_bio, r10_bio->sector - bio->bi_sector,
1246 max_sectors); 1267 max_sectors);
1247 1268
1248 r10_bio->devs[slot].bio = read_bio; 1269 r10_bio->devs[slot].bio = read_bio;
1249 r10_bio->devs[slot].rdev = rdev; 1270 r10_bio->devs[slot].rdev = rdev;
1250 1271
1251 read_bio->bi_sector = r10_bio->devs[slot].addr + 1272 read_bio->bi_sector = r10_bio->devs[slot].addr +
1252 choose_data_offset(r10_bio, rdev); 1273 choose_data_offset(r10_bio, rdev);
1253 read_bio->bi_bdev = rdev->bdev; 1274 read_bio->bi_bdev = rdev->bdev;
1254 read_bio->bi_end_io = raid10_end_read_request; 1275 read_bio->bi_end_io = raid10_end_read_request;
1255 read_bio->bi_rw = READ | do_sync; 1276 read_bio->bi_rw = READ | do_sync;
1256 read_bio->bi_private = r10_bio; 1277 read_bio->bi_private = r10_bio;
1257 1278
1258 if (max_sectors < r10_bio->sectors) { 1279 if (max_sectors < r10_bio->sectors) {
1259 /* Could not read all from this device, so we will 1280 /* Could not read all from this device, so we will
1260 * need another r10_bio. 1281 * need another r10_bio.
1261 */ 1282 */
1262 sectors_handled = (r10_bio->sectors + max_sectors 1283 sectors_handled = (r10_bio->sectors + max_sectors
1263 - bio->bi_sector); 1284 - bio->bi_sector);
1264 r10_bio->sectors = max_sectors; 1285 r10_bio->sectors = max_sectors;
1265 spin_lock_irq(&conf->device_lock); 1286 spin_lock_irq(&conf->device_lock);
1266 if (bio->bi_phys_segments == 0) 1287 if (bio->bi_phys_segments == 0)
1267 bio->bi_phys_segments = 2; 1288 bio->bi_phys_segments = 2;
1268 else 1289 else
1269 bio->bi_phys_segments++; 1290 bio->bi_phys_segments++;
1270 spin_unlock(&conf->device_lock); 1291 spin_unlock(&conf->device_lock);
1271 /* Cannot call generic_make_request directly 1292 /* Cannot call generic_make_request directly
1272 * as that will be queued in __generic_make_request 1293 * as that will be queued in __generic_make_request
1273 * and subsequent mempool_alloc might block 1294 * and subsequent mempool_alloc might block
1274 * waiting for it, so hand the bio over to raid10d. 1295 * waiting for it, so hand the bio over to raid10d.
1275 */ 1296 */
1276 reschedule_retry(r10_bio); 1297 reschedule_retry(r10_bio);
1277 1298
1278 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1299 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1279 1300
1280 r10_bio->master_bio = bio; 1301 r10_bio->master_bio = bio;
1281 r10_bio->sectors = ((bio->bi_size >> 9) 1302 r10_bio->sectors = ((bio->bi_size >> 9)
1282 - sectors_handled); 1303 - sectors_handled);
1283 r10_bio->state = 0; 1304 r10_bio->state = 0;
1284 r10_bio->mddev = mddev; 1305 r10_bio->mddev = mddev;
1285 r10_bio->sector = bio->bi_sector + sectors_handled; 1306 r10_bio->sector = bio->bi_sector + sectors_handled;
1286 goto read_again; 1307 goto read_again;
1287 } else 1308 } else
1288 generic_make_request(read_bio); 1309 generic_make_request(read_bio);
1289 return; 1310 return;
1290 } 1311 }
1291 1312
1292 /* 1313 /*
1293 * WRITE: 1314 * WRITE:
1294 */ 1315 */
1295 if (conf->pending_count >= max_queued_requests) { 1316 if (conf->pending_count >= max_queued_requests) {
1296 md_wakeup_thread(mddev->thread); 1317 md_wakeup_thread(mddev->thread);
1297 wait_event(conf->wait_barrier, 1318 wait_event(conf->wait_barrier,
1298 conf->pending_count < max_queued_requests); 1319 conf->pending_count < max_queued_requests);
1299 } 1320 }
1300 /* first select target devices under rcu_lock and 1321 /* first select target devices under rcu_lock and
1301 * inc refcount on their rdev. Record them by setting 1322 * inc refcount on their rdev. Record them by setting
1302 * bios[x] to bio 1323 * bios[x] to bio
1303 * If there are known/acknowledged bad blocks on any device 1324 * If there are known/acknowledged bad blocks on any device
1304 * on which we have seen a write error, we want to avoid 1325 * on which we have seen a write error, we want to avoid
1305 * writing to those blocks. This potentially requires several 1326 * writing to those blocks. This potentially requires several
1306 * writes to write around the bad blocks. Each set of writes 1327 * writes to write around the bad blocks. Each set of writes
1307 * gets its own r10_bio with a set of bios attached. The number 1328 * gets its own r10_bio with a set of bios attached. The number
1308 * of r10_bios is recorded in bio->bi_phys_segments just as with 1329 * of r10_bios is recorded in bio->bi_phys_segments just as with
1309 * the read case. 1330 * the read case.
1310 */ 1331 */
1311 1332
1312 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ 1333 r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
1313 raid10_find_phys(conf, r10_bio); 1334 raid10_find_phys(conf, r10_bio);
1314 retry_write: 1335 retry_write:
1315 blocked_rdev = NULL; 1336 blocked_rdev = NULL;
1316 rcu_read_lock(); 1337 rcu_read_lock();
1317 max_sectors = r10_bio->sectors; 1338 max_sectors = r10_bio->sectors;
1318 1339
1319 for (i = 0; i < conf->copies; i++) { 1340 for (i = 0; i < conf->copies; i++) {
1320 int d = r10_bio->devs[i].devnum; 1341 int d = r10_bio->devs[i].devnum;
1321 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev); 1342 struct md_rdev *rdev = rcu_dereference(conf->mirrors[d].rdev);
1322 struct md_rdev *rrdev = rcu_dereference( 1343 struct md_rdev *rrdev = rcu_dereference(
1323 conf->mirrors[d].replacement); 1344 conf->mirrors[d].replacement);
1324 if (rdev == rrdev) 1345 if (rdev == rrdev)
1325 rrdev = NULL; 1346 rrdev = NULL;
1326 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 1347 if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
1327 atomic_inc(&rdev->nr_pending); 1348 atomic_inc(&rdev->nr_pending);
1328 blocked_rdev = rdev; 1349 blocked_rdev = rdev;
1329 break; 1350 break;
1330 } 1351 }
1331 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { 1352 if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
1332 atomic_inc(&rrdev->nr_pending); 1353 atomic_inc(&rrdev->nr_pending);
1333 blocked_rdev = rrdev; 1354 blocked_rdev = rrdev;
1334 break; 1355 break;
1335 } 1356 }
1336 if (rdev && (test_bit(Faulty, &rdev->flags) 1357 if (rdev && (test_bit(Faulty, &rdev->flags)
1337 || test_bit(Unmerged, &rdev->flags))) 1358 || test_bit(Unmerged, &rdev->flags)))
1338 rdev = NULL; 1359 rdev = NULL;
1339 if (rrdev && (test_bit(Faulty, &rrdev->flags) 1360 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1340 || test_bit(Unmerged, &rrdev->flags))) 1361 || test_bit(Unmerged, &rrdev->flags)))
1341 rrdev = NULL; 1362 rrdev = NULL;
1342 1363
1343 r10_bio->devs[i].bio = NULL; 1364 r10_bio->devs[i].bio = NULL;
1344 r10_bio->devs[i].repl_bio = NULL; 1365 r10_bio->devs[i].repl_bio = NULL;
1345 1366
1346 if (!rdev && !rrdev) { 1367 if (!rdev && !rrdev) {
1347 set_bit(R10BIO_Degraded, &r10_bio->state); 1368 set_bit(R10BIO_Degraded, &r10_bio->state);
1348 continue; 1369 continue;
1349 } 1370 }
1350 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { 1371 if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
1351 sector_t first_bad; 1372 sector_t first_bad;
1352 sector_t dev_sector = r10_bio->devs[i].addr; 1373 sector_t dev_sector = r10_bio->devs[i].addr;
1353 int bad_sectors; 1374 int bad_sectors;
1354 int is_bad; 1375 int is_bad;
1355 1376
1356 is_bad = is_badblock(rdev, dev_sector, 1377 is_bad = is_badblock(rdev, dev_sector,
1357 max_sectors, 1378 max_sectors,
1358 &first_bad, &bad_sectors); 1379 &first_bad, &bad_sectors);
1359 if (is_bad < 0) { 1380 if (is_bad < 0) {
1360 /* Mustn't write here until the bad block 1381 /* Mustn't write here until the bad block
1361 * is acknowledged 1382 * is acknowledged
1362 */ 1383 */
1363 atomic_inc(&rdev->nr_pending); 1384 atomic_inc(&rdev->nr_pending);
1364 set_bit(BlockedBadBlocks, &rdev->flags); 1385 set_bit(BlockedBadBlocks, &rdev->flags);
1365 blocked_rdev = rdev; 1386 blocked_rdev = rdev;
1366 break; 1387 break;
1367 } 1388 }
1368 if (is_bad && first_bad <= dev_sector) { 1389 if (is_bad && first_bad <= dev_sector) {
1369 /* Cannot write here at all */ 1390 /* Cannot write here at all */
1370 bad_sectors -= (dev_sector - first_bad); 1391 bad_sectors -= (dev_sector - first_bad);
1371 if (bad_sectors < max_sectors) 1392 if (bad_sectors < max_sectors)
1372 /* Mustn't write more than bad_sectors 1393 /* Mustn't write more than bad_sectors
1373 * to other devices yet 1394 * to other devices yet
1374 */ 1395 */
1375 max_sectors = bad_sectors; 1396 max_sectors = bad_sectors;
1376 /* We don't set R10BIO_Degraded as that 1397 /* We don't set R10BIO_Degraded as that
1377 * only applies if the disk is missing, 1398 * only applies if the disk is missing,
1378 * so it might be re-added, and we want to 1399 * so it might be re-added, and we want to
1379 * know to recover this chunk. 1400 * know to recover this chunk.
1380 * In this case the device is here, and the 1401 * In this case the device is here, and the
1381 * fact that this chunk is not in-sync is 1402 * fact that this chunk is not in-sync is
1382 * recorded in the bad block log. 1403 * recorded in the bad block log.
1383 */ 1404 */
1384 continue; 1405 continue;
1385 } 1406 }
1386 if (is_bad) { 1407 if (is_bad) {
1387 int good_sectors = first_bad - dev_sector; 1408 int good_sectors = first_bad - dev_sector;
1388 if (good_sectors < max_sectors) 1409 if (good_sectors < max_sectors)
1389 max_sectors = good_sectors; 1410 max_sectors = good_sectors;
1390 } 1411 }
1391 } 1412 }
1392 if (rdev) { 1413 if (rdev) {
1393 r10_bio->devs[i].bio = bio; 1414 r10_bio->devs[i].bio = bio;
1394 atomic_inc(&rdev->nr_pending); 1415 atomic_inc(&rdev->nr_pending);
1395 } 1416 }
1396 if (rrdev) { 1417 if (rrdev) {
1397 r10_bio->devs[i].repl_bio = bio; 1418 r10_bio->devs[i].repl_bio = bio;
1398 atomic_inc(&rrdev->nr_pending); 1419 atomic_inc(&rrdev->nr_pending);
1399 } 1420 }
1400 } 1421 }
1401 rcu_read_unlock(); 1422 rcu_read_unlock();
1402 1423
1403 if (unlikely(blocked_rdev)) { 1424 if (unlikely(blocked_rdev)) {
1404 /* Have to wait for this device to get unblocked, then retry */ 1425 /* Have to wait for this device to get unblocked, then retry */
1405 int j; 1426 int j;
1406 int d; 1427 int d;
1407 1428
1408 for (j = 0; j < i; j++) { 1429 for (j = 0; j < i; j++) {
1409 if (r10_bio->devs[j].bio) { 1430 if (r10_bio->devs[j].bio) {
1410 d = r10_bio->devs[j].devnum; 1431 d = r10_bio->devs[j].devnum;
1411 rdev_dec_pending(conf->mirrors[d].rdev, mddev); 1432 rdev_dec_pending(conf->mirrors[d].rdev, mddev);
1412 } 1433 }
1413 if (r10_bio->devs[j].repl_bio) { 1434 if (r10_bio->devs[j].repl_bio) {
1414 struct md_rdev *rdev; 1435 struct md_rdev *rdev;
1415 d = r10_bio->devs[j].devnum; 1436 d = r10_bio->devs[j].devnum;
1416 rdev = conf->mirrors[d].replacement; 1437 rdev = conf->mirrors[d].replacement;
1417 if (!rdev) { 1438 if (!rdev) {
1418 /* Race with remove_disk */ 1439 /* Race with remove_disk */
1419 smp_mb(); 1440 smp_mb();
1420 rdev = conf->mirrors[d].rdev; 1441 rdev = conf->mirrors[d].rdev;
1421 } 1442 }
1422 rdev_dec_pending(rdev, mddev); 1443 rdev_dec_pending(rdev, mddev);
1423 } 1444 }
1424 } 1445 }
1425 allow_barrier(conf); 1446 allow_barrier(conf);
1426 md_wait_for_blocked_rdev(blocked_rdev, mddev); 1447 md_wait_for_blocked_rdev(blocked_rdev, mddev);
1427 wait_barrier(conf); 1448 wait_barrier(conf);
1428 goto retry_write; 1449 goto retry_write;
1429 } 1450 }
1430 1451
1431 if (max_sectors < r10_bio->sectors) { 1452 if (max_sectors < r10_bio->sectors) {
1432 /* We are splitting this into multiple parts, so 1453 /* We are splitting this into multiple parts, so
1433 * we need to prepare for allocating another r10_bio. 1454 * we need to prepare for allocating another r10_bio.
1434 */ 1455 */
1435 r10_bio->sectors = max_sectors; 1456 r10_bio->sectors = max_sectors;
1436 spin_lock_irq(&conf->device_lock); 1457 spin_lock_irq(&conf->device_lock);
1437 if (bio->bi_phys_segments == 0) 1458 if (bio->bi_phys_segments == 0)
1438 bio->bi_phys_segments = 2; 1459 bio->bi_phys_segments = 2;
1439 else 1460 else
1440 bio->bi_phys_segments++; 1461 bio->bi_phys_segments++;
1441 spin_unlock_irq(&conf->device_lock); 1462 spin_unlock_irq(&conf->device_lock);
1442 } 1463 }
1443 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector; 1464 sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
1444 1465
1445 atomic_set(&r10_bio->remaining, 1); 1466 atomic_set(&r10_bio->remaining, 1);
1446 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0); 1467 bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
1447 1468
1448 for (i = 0; i < conf->copies; i++) { 1469 for (i = 0; i < conf->copies; i++) {
1449 struct bio *mbio; 1470 struct bio *mbio;
1450 int d = r10_bio->devs[i].devnum; 1471 int d = r10_bio->devs[i].devnum;
1451 if (r10_bio->devs[i].bio) { 1472 if (r10_bio->devs[i].bio) {
1452 struct md_rdev *rdev = conf->mirrors[d].rdev; 1473 struct md_rdev *rdev = conf->mirrors[d].rdev;
1453 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1474 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1454 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1475 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1455 max_sectors); 1476 max_sectors);
1456 r10_bio->devs[i].bio = mbio; 1477 r10_bio->devs[i].bio = mbio;
1457 1478
1458 mbio->bi_sector = (r10_bio->devs[i].addr+ 1479 mbio->bi_sector = (r10_bio->devs[i].addr+
1459 choose_data_offset(r10_bio, 1480 choose_data_offset(r10_bio,
1460 rdev)); 1481 rdev));
1461 mbio->bi_bdev = rdev->bdev; 1482 mbio->bi_bdev = rdev->bdev;
1462 mbio->bi_end_io = raid10_end_write_request; 1483 mbio->bi_end_io = raid10_end_write_request;
1463 mbio->bi_rw = 1484 mbio->bi_rw =
1464 WRITE | do_sync | do_fua | do_discard | do_same; 1485 WRITE | do_sync | do_fua | do_discard | do_same;
1465 mbio->bi_private = r10_bio; 1486 mbio->bi_private = r10_bio;
1466 1487
1467 atomic_inc(&r10_bio->remaining); 1488 atomic_inc(&r10_bio->remaining);
1468 1489
1469 cb = blk_check_plugged(raid10_unplug, mddev, 1490 cb = blk_check_plugged(raid10_unplug, mddev,
1470 sizeof(*plug)); 1491 sizeof(*plug));
1471 if (cb) 1492 if (cb)
1472 plug = container_of(cb, struct raid10_plug_cb, 1493 plug = container_of(cb, struct raid10_plug_cb,
1473 cb); 1494 cb);
1474 else 1495 else
1475 plug = NULL; 1496 plug = NULL;
1476 spin_lock_irqsave(&conf->device_lock, flags); 1497 spin_lock_irqsave(&conf->device_lock, flags);
1477 if (plug) { 1498 if (plug) {
1478 bio_list_add(&plug->pending, mbio); 1499 bio_list_add(&plug->pending, mbio);
1479 plug->pending_cnt++; 1500 plug->pending_cnt++;
1480 } else { 1501 } else {
1481 bio_list_add(&conf->pending_bio_list, mbio); 1502 bio_list_add(&conf->pending_bio_list, mbio);
1482 conf->pending_count++; 1503 conf->pending_count++;
1483 } 1504 }
1484 spin_unlock_irqrestore(&conf->device_lock, flags); 1505 spin_unlock_irqrestore(&conf->device_lock, flags);
1485 if (!plug) 1506 if (!plug)
1486 md_wakeup_thread(mddev->thread); 1507 md_wakeup_thread(mddev->thread);
1487 } 1508 }
1488 1509
1489 if (r10_bio->devs[i].repl_bio) { 1510 if (r10_bio->devs[i].repl_bio) {
1490 struct md_rdev *rdev = conf->mirrors[d].replacement; 1511 struct md_rdev *rdev = conf->mirrors[d].replacement;
1491 if (rdev == NULL) { 1512 if (rdev == NULL) {
1492 /* Replacement just got moved to main 'rdev' */ 1513 /* Replacement just got moved to main 'rdev' */
1493 smp_mb(); 1514 smp_mb();
1494 rdev = conf->mirrors[d].rdev; 1515 rdev = conf->mirrors[d].rdev;
1495 } 1516 }
1496 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 1517 mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
1497 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector, 1518 md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
1498 max_sectors); 1519 max_sectors);
1499 r10_bio->devs[i].repl_bio = mbio; 1520 r10_bio->devs[i].repl_bio = mbio;
1500 1521
1501 mbio->bi_sector = (r10_bio->devs[i].addr + 1522 mbio->bi_sector = (r10_bio->devs[i].addr +
1502 choose_data_offset( 1523 choose_data_offset(
1503 r10_bio, rdev)); 1524 r10_bio, rdev));
1504 mbio->bi_bdev = rdev->bdev; 1525 mbio->bi_bdev = rdev->bdev;
1505 mbio->bi_end_io = raid10_end_write_request; 1526 mbio->bi_end_io = raid10_end_write_request;
1506 mbio->bi_rw = 1527 mbio->bi_rw =
1507 WRITE | do_sync | do_fua | do_discard | do_same; 1528 WRITE | do_sync | do_fua | do_discard | do_same;
1508 mbio->bi_private = r10_bio; 1529 mbio->bi_private = r10_bio;
1509 1530
1510 atomic_inc(&r10_bio->remaining); 1531 atomic_inc(&r10_bio->remaining);
1511 spin_lock_irqsave(&conf->device_lock, flags); 1532 spin_lock_irqsave(&conf->device_lock, flags);
1512 bio_list_add(&conf->pending_bio_list, mbio); 1533 bio_list_add(&conf->pending_bio_list, mbio);
1513 conf->pending_count++; 1534 conf->pending_count++;
1514 spin_unlock_irqrestore(&conf->device_lock, flags); 1535 spin_unlock_irqrestore(&conf->device_lock, flags);
1515 if (!mddev_check_plugged(mddev)) 1536 if (!mddev_check_plugged(mddev))
1516 md_wakeup_thread(mddev->thread); 1537 md_wakeup_thread(mddev->thread);
1517 } 1538 }
1518 } 1539 }
1519 1540
1520 /* Don't remove the bias on 'remaining' (one_write_done) until 1541 /* Don't remove the bias on 'remaining' (one_write_done) until
1521 * after checking if we need to go around again. 1542 * after checking if we need to go around again.
1522 */ 1543 */
1523 1544
1524 if (sectors_handled < (bio->bi_size >> 9)) { 1545 if (sectors_handled < (bio->bi_size >> 9)) {
1525 one_write_done(r10_bio); 1546 one_write_done(r10_bio);
1526 /* We need another r10_bio. It has already been counted 1547 /* We need another r10_bio. It has already been counted
1527 * in bio->bi_phys_segments. 1548 * in bio->bi_phys_segments.
1528 */ 1549 */
1529 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO); 1550 r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
1530 1551
1531 r10_bio->master_bio = bio; 1552 r10_bio->master_bio = bio;
1532 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled; 1553 r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
1533 1554
1534 r10_bio->mddev = mddev; 1555 r10_bio->mddev = mddev;
1535 r10_bio->sector = bio->bi_sector + sectors_handled; 1556 r10_bio->sector = bio->bi_sector + sectors_handled;
1536 r10_bio->state = 0; 1557 r10_bio->state = 0;
1537 goto retry_write; 1558 goto retry_write;
1538 } 1559 }
1539 one_write_done(r10_bio); 1560 one_write_done(r10_bio);
1540 1561
1541 /* In case raid10d snuck in to freeze_array */ 1562 /* In case raid10d snuck in to freeze_array */
1542 wake_up(&conf->wait_barrier); 1563 wake_up(&conf->wait_barrier);
1543 } 1564 }
1544 1565
1545 static void status(struct seq_file *seq, struct mddev *mddev) 1566 static void status(struct seq_file *seq, struct mddev *mddev)
1546 { 1567 {
1547 struct r10conf *conf = mddev->private; 1568 struct r10conf *conf = mddev->private;
1548 int i; 1569 int i;
1549 1570
1550 if (conf->geo.near_copies < conf->geo.raid_disks) 1571 if (conf->geo.near_copies < conf->geo.raid_disks)
1551 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2); 1572 seq_printf(seq, " %dK chunks", mddev->chunk_sectors / 2);
1552 if (conf->geo.near_copies > 1) 1573 if (conf->geo.near_copies > 1)
1553 seq_printf(seq, " %d near-copies", conf->geo.near_copies); 1574 seq_printf(seq, " %d near-copies", conf->geo.near_copies);
1554 if (conf->geo.far_copies > 1) { 1575 if (conf->geo.far_copies > 1) {
1555 if (conf->geo.far_offset) 1576 if (conf->geo.far_offset)
1556 seq_printf(seq, " %d offset-copies", conf->geo.far_copies); 1577 seq_printf(seq, " %d offset-copies", conf->geo.far_copies);
1557 else 1578 else
1558 seq_printf(seq, " %d far-copies", conf->geo.far_copies); 1579 seq_printf(seq, " %d far-copies", conf->geo.far_copies);
1559 } 1580 }
1560 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks, 1581 seq_printf(seq, " [%d/%d] [", conf->geo.raid_disks,
1561 conf->geo.raid_disks - mddev->degraded); 1582 conf->geo.raid_disks - mddev->degraded);
1562 for (i = 0; i < conf->geo.raid_disks; i++) 1583 for (i = 0; i < conf->geo.raid_disks; i++)
1563 seq_printf(seq, "%s", 1584 seq_printf(seq, "%s",
1564 conf->mirrors[i].rdev && 1585 conf->mirrors[i].rdev &&
1565 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_"); 1586 test_bit(In_sync, &conf->mirrors[i].rdev->flags) ? "U" : "_");
1566 seq_printf(seq, "]"); 1587 seq_printf(seq, "]");
1567 } 1588 }
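For illustration only: a healthy four-disk array with two near-copies and 512K chunks (hypothetical values) would be reported by the function above as

	512K chunks 2 near-copies [4/4] [UUUU]

with an '_' replacing the corresponding 'U' for any missing or out-of-sync member.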
1568 1589
1569 /* check if there are enough drives for 1590 /* check if there are enough drives for
1570 * every block to appear on at least one. 1591 * every block to appear on at least one.
1571 * Don't consider the device numbered 'ignore' 1592 * Don't consider the device numbered 'ignore'
1572 * as we might be about to remove it. 1593 * as we might be about to remove it.
1573 */ 1594 */
1574 static int _enough(struct r10conf *conf, struct geom *geo, int ignore) 1595 static int _enough(struct r10conf *conf, struct geom *geo, int ignore)
1575 { 1596 {
1576 int first = 0; 1597 int first = 0;
1577 1598
1578 do { 1599 do {
1579 int n = conf->copies; 1600 int n = conf->copies;
1580 int cnt = 0; 1601 int cnt = 0;
1581 int this = first; 1602 int this = first;
1582 while (n--) { 1603 while (n--) {
1583 if (conf->mirrors[this].rdev && 1604 if (conf->mirrors[this].rdev &&
1584 this != ignore) 1605 this != ignore)
1585 cnt++; 1606 cnt++;
1586 this = (this+1) % geo->raid_disks; 1607 this = (this+1) % geo->raid_disks;
1587 } 1608 }
1588 if (cnt == 0) 1609 if (cnt == 0)
1589 return 0; 1610 return 0;
1590 first = (first + geo->near_copies) % geo->raid_disks; 1611 first = (first + geo->near_copies) % geo->raid_disks;
1591 } while (first != 0); 1612 } while (first != 0);
1592 return 1; 1613 return 1;
1593 } 1614 }
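A standalone model of the loop above may make the windowing easier to follow; it is illustrative only (present[] stands in for conf->mirrors[].rdev being non-NULL, and plain parameters replace struct geom):

static int enough_model(const int *present, int raid_disks,
			int near_copies, int copies, int ignore)
{
	int first = 0;

	do {
		int n = copies, cnt = 0, this = first;

		while (n--) {
			if (present[this] && this != ignore)
				cnt++;
			this = (this + 1) % raid_disks;
		}
		if (cnt == 0)
			return 0;	/* some block would lose its last copy */
		first = (first + near_copies) % raid_disks;
	} while (first != 0);
	return 1;
}

With raid_disks = 4, near_copies = 2, copies = 2 and ignore = -1, losing devices 0 and 1 empties the {0,1} window so the model returns 0, while losing devices 0 and 2 still leaves a working device in every window and it returns 1.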
1594 1615
1595 static int enough(struct r10conf *conf, int ignore) 1616 static int enough(struct r10conf *conf, int ignore)
1596 { 1617 {
1597 return _enough(conf, &conf->geo, ignore) && 1618 return _enough(conf, &conf->geo, ignore) &&
1598 _enough(conf, &conf->prev, ignore); 1619 _enough(conf, &conf->prev, ignore);
1599 } 1620 }
1600 1621
1601 static void error(struct mddev *mddev, struct md_rdev *rdev) 1622 static void error(struct mddev *mddev, struct md_rdev *rdev)
1602 { 1623 {
1603 char b[BDEVNAME_SIZE]; 1624 char b[BDEVNAME_SIZE];
1604 struct r10conf *conf = mddev->private; 1625 struct r10conf *conf = mddev->private;
1605 1626
1606 /* 1627 /*
1607 * If it is not operational, then we have already marked it as dead 1628 * If it is not operational, then we have already marked it as dead
1608 * else if it is the last working disk, ignore the error, let the 1629 * else if it is the last working disk, ignore the error, let the
1609 * next level up know. 1630 * next level up know.
1610 * else mark the drive as failed 1631 * else mark the drive as failed
1611 */ 1632 */
1612 if (test_bit(In_sync, &rdev->flags) 1633 if (test_bit(In_sync, &rdev->flags)
1613 && !enough(conf, rdev->raid_disk)) 1634 && !enough(conf, rdev->raid_disk))
1614 /* 1635 /*
1615 * Don't fail the drive, just return an IO error. 1636 * Don't fail the drive, just return an IO error.
1616 */ 1637 */
1617 return; 1638 return;
1618 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1639 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1619 unsigned long flags; 1640 unsigned long flags;
1620 spin_lock_irqsave(&conf->device_lock, flags); 1641 spin_lock_irqsave(&conf->device_lock, flags);
1621 mddev->degraded++; 1642 mddev->degraded++;
1622 spin_unlock_irqrestore(&conf->device_lock, flags); 1643 spin_unlock_irqrestore(&conf->device_lock, flags);
1623 /* 1644 /*
1624 * if recovery is running, make sure it aborts. 1645 * if recovery is running, make sure it aborts.
1625 */ 1646 */
1626 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1647 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1627 } 1648 }
1628 set_bit(Blocked, &rdev->flags); 1649 set_bit(Blocked, &rdev->flags);
1629 set_bit(Faulty, &rdev->flags); 1650 set_bit(Faulty, &rdev->flags);
1630 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1651 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1631 printk(KERN_ALERT 1652 printk(KERN_ALERT
1632 "md/raid10:%s: Disk failure on %s, disabling device.\n" 1653 "md/raid10:%s: Disk failure on %s, disabling device.\n"
1633 "md/raid10:%s: Operation continuing on %d devices.\n", 1654 "md/raid10:%s: Operation continuing on %d devices.\n",
1634 mdname(mddev), bdevname(rdev->bdev, b), 1655 mdname(mddev), bdevname(rdev->bdev, b),
1635 mdname(mddev), conf->geo.raid_disks - mddev->degraded); 1656 mdname(mddev), conf->geo.raid_disks - mddev->degraded);
1636 } 1657 }
1637 1658
1638 static void print_conf(struct r10conf *conf) 1659 static void print_conf(struct r10conf *conf)
1639 { 1660 {
1640 int i; 1661 int i;
1641 struct raid10_info *tmp; 1662 struct raid10_info *tmp;
1642 1663
1643 printk(KERN_DEBUG "RAID10 conf printout:\n"); 1664 printk(KERN_DEBUG "RAID10 conf printout:\n");
1644 if (!conf) { 1665 if (!conf) {
1645 printk(KERN_DEBUG "(!conf)\n"); 1666 printk(KERN_DEBUG "(!conf)\n");
1646 return; 1667 return;
1647 } 1668 }
1648 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded, 1669 printk(KERN_DEBUG " --- wd:%d rd:%d\n", conf->geo.raid_disks - conf->mddev->degraded,
1649 conf->geo.raid_disks); 1670 conf->geo.raid_disks);
1650 1671
1651 for (i = 0; i < conf->geo.raid_disks; i++) { 1672 for (i = 0; i < conf->geo.raid_disks; i++) {
1652 char b[BDEVNAME_SIZE]; 1673 char b[BDEVNAME_SIZE];
1653 tmp = conf->mirrors + i; 1674 tmp = conf->mirrors + i;
1654 if (tmp->rdev) 1675 if (tmp->rdev)
1655 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n", 1676 printk(KERN_DEBUG " disk %d, wo:%d, o:%d, dev:%s\n",
1656 i, !test_bit(In_sync, &tmp->rdev->flags), 1677 i, !test_bit(In_sync, &tmp->rdev->flags),
1657 !test_bit(Faulty, &tmp->rdev->flags), 1678 !test_bit(Faulty, &tmp->rdev->flags),
1658 bdevname(tmp->rdev->bdev,b)); 1679 bdevname(tmp->rdev->bdev,b));
1659 } 1680 }
1660 } 1681 }
1661 1682
1662 static void close_sync(struct r10conf *conf) 1683 static void close_sync(struct r10conf *conf)
1663 { 1684 {
1664 wait_barrier(conf); 1685 wait_barrier(conf);
1665 allow_barrier(conf); 1686 allow_barrier(conf);
1666 1687
1667 mempool_destroy(conf->r10buf_pool); 1688 mempool_destroy(conf->r10buf_pool);
1668 conf->r10buf_pool = NULL; 1689 conf->r10buf_pool = NULL;
1669 } 1690 }
1670 1691
1671 static int raid10_spare_active(struct mddev *mddev) 1692 static int raid10_spare_active(struct mddev *mddev)
1672 { 1693 {
1673 int i; 1694 int i;
1674 struct r10conf *conf = mddev->private; 1695 struct r10conf *conf = mddev->private;
1675 struct raid10_info *tmp; 1696 struct raid10_info *tmp;
1676 int count = 0; 1697 int count = 0;
1677 unsigned long flags; 1698 unsigned long flags;
1678 1699
1679 /* 1700 /*
1680 * Find all non-in_sync disks within the RAID10 configuration 1701 * Find all non-in_sync disks within the RAID10 configuration
1681 * and mark them in_sync 1702 * and mark them in_sync
1682 */ 1703 */
1683 for (i = 0; i < conf->geo.raid_disks; i++) { 1704 for (i = 0; i < conf->geo.raid_disks; i++) {
1684 tmp = conf->mirrors + i; 1705 tmp = conf->mirrors + i;
1685 if (tmp->replacement 1706 if (tmp->replacement
1686 && tmp->replacement->recovery_offset == MaxSector 1707 && tmp->replacement->recovery_offset == MaxSector
1687 && !test_bit(Faulty, &tmp->replacement->flags) 1708 && !test_bit(Faulty, &tmp->replacement->flags)
1688 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) { 1709 && !test_and_set_bit(In_sync, &tmp->replacement->flags)) {
1689 /* Replacement has just become active */ 1710 /* Replacement has just become active */
1690 if (!tmp->rdev 1711 if (!tmp->rdev
1691 || !test_and_clear_bit(In_sync, &tmp->rdev->flags)) 1712 || !test_and_clear_bit(In_sync, &tmp->rdev->flags))
1692 count++; 1713 count++;
1693 if (tmp->rdev) { 1714 if (tmp->rdev) {
1694 /* Replaced device not technically faulty, 1715 /* Replaced device not technically faulty,
1695 * but we need to be sure it gets removed 1716 * but we need to be sure it gets removed
1696 * and never re-added. 1717 * and never re-added.
1697 */ 1718 */
1698 set_bit(Faulty, &tmp->rdev->flags); 1719 set_bit(Faulty, &tmp->rdev->flags);
1699 sysfs_notify_dirent_safe( 1720 sysfs_notify_dirent_safe(
1700 tmp->rdev->sysfs_state); 1721 tmp->rdev->sysfs_state);
1701 } 1722 }
1702 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state); 1723 sysfs_notify_dirent_safe(tmp->replacement->sysfs_state);
1703 } else if (tmp->rdev 1724 } else if (tmp->rdev
1704 && !test_bit(Faulty, &tmp->rdev->flags) 1725 && !test_bit(Faulty, &tmp->rdev->flags)
1705 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 1726 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
1706 count++; 1727 count++;
1707 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); 1728 sysfs_notify_dirent_safe(tmp->rdev->sysfs_state);
1708 } 1729 }
1709 } 1730 }
1710 spin_lock_irqsave(&conf->device_lock, flags); 1731 spin_lock_irqsave(&conf->device_lock, flags);
1711 mddev->degraded -= count; 1732 mddev->degraded -= count;
1712 spin_unlock_irqrestore(&conf->device_lock, flags); 1733 spin_unlock_irqrestore(&conf->device_lock, flags);
1713 1734
1714 print_conf(conf); 1735 print_conf(conf);
1715 return count; 1736 return count;
1716 } 1737 }
1717 1738
1718 1739
1719 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) 1740 static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1720 { 1741 {
1721 struct r10conf *conf = mddev->private; 1742 struct r10conf *conf = mddev->private;
1722 int err = -EEXIST; 1743 int err = -EEXIST;
1723 int mirror; 1744 int mirror;
1724 int first = 0; 1745 int first = 0;
1725 int last = conf->geo.raid_disks - 1; 1746 int last = conf->geo.raid_disks - 1;
1726 struct request_queue *q = bdev_get_queue(rdev->bdev); 1747 struct request_queue *q = bdev_get_queue(rdev->bdev);
1727 1748
1728 if (mddev->recovery_cp < MaxSector) 1749 if (mddev->recovery_cp < MaxSector)
1729 /* only hot-add to in-sync arrays, as recovery is 1750 /* only hot-add to in-sync arrays, as recovery is
1730 * very different from resync 1751 * very different from resync
1731 */ 1752 */
1732 return -EBUSY; 1753 return -EBUSY;
1733 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1)) 1754 if (rdev->saved_raid_disk < 0 && !_enough(conf, &conf->prev, -1))
1734 return -EINVAL; 1755 return -EINVAL;
1735 1756
1736 if (rdev->raid_disk >= 0) 1757 if (rdev->raid_disk >= 0)
1737 first = last = rdev->raid_disk; 1758 first = last = rdev->raid_disk;
1738 1759
1739 if (q->merge_bvec_fn) { 1760 if (q->merge_bvec_fn) {
1740 set_bit(Unmerged, &rdev->flags); 1761 set_bit(Unmerged, &rdev->flags);
1741 mddev->merge_check_needed = 1; 1762 mddev->merge_check_needed = 1;
1742 } 1763 }
1743 1764
1744 if (rdev->saved_raid_disk >= first && 1765 if (rdev->saved_raid_disk >= first &&
1745 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1766 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1746 mirror = rdev->saved_raid_disk; 1767 mirror = rdev->saved_raid_disk;
1747 else 1768 else
1748 mirror = first; 1769 mirror = first;
1749 for ( ; mirror <= last ; mirror++) { 1770 for ( ; mirror <= last ; mirror++) {
1750 struct raid10_info *p = &conf->mirrors[mirror]; 1771 struct raid10_info *p = &conf->mirrors[mirror];
1751 if (p->recovery_disabled == mddev->recovery_disabled) 1772 if (p->recovery_disabled == mddev->recovery_disabled)
1752 continue; 1773 continue;
1753 if (p->rdev) { 1774 if (p->rdev) {
1754 if (!test_bit(WantReplacement, &p->rdev->flags) || 1775 if (!test_bit(WantReplacement, &p->rdev->flags) ||
1755 p->replacement != NULL) 1776 p->replacement != NULL)
1756 continue; 1777 continue;
1757 clear_bit(In_sync, &rdev->flags); 1778 clear_bit(In_sync, &rdev->flags);
1758 set_bit(Replacement, &rdev->flags); 1779 set_bit(Replacement, &rdev->flags);
1759 rdev->raid_disk = mirror; 1780 rdev->raid_disk = mirror;
1760 err = 0; 1781 err = 0;
1761 disk_stack_limits(mddev->gendisk, rdev->bdev, 1782 disk_stack_limits(mddev->gendisk, rdev->bdev,
1762 rdev->data_offset << 9); 1783 rdev->data_offset << 9);
1763 conf->fullsync = 1; 1784 conf->fullsync = 1;
1764 rcu_assign_pointer(p->replacement, rdev); 1785 rcu_assign_pointer(p->replacement, rdev);
1765 break; 1786 break;
1766 } 1787 }
1767 1788
1768 disk_stack_limits(mddev->gendisk, rdev->bdev, 1789 disk_stack_limits(mddev->gendisk, rdev->bdev,
1769 rdev->data_offset << 9); 1790 rdev->data_offset << 9);
1770 1791
1771 p->head_position = 0; 1792 p->head_position = 0;
1772 p->recovery_disabled = mddev->recovery_disabled - 1; 1793 p->recovery_disabled = mddev->recovery_disabled - 1;
1773 rdev->raid_disk = mirror; 1794 rdev->raid_disk = mirror;
1774 err = 0; 1795 err = 0;
1775 if (rdev->saved_raid_disk != mirror) 1796 if (rdev->saved_raid_disk != mirror)
1776 conf->fullsync = 1; 1797 conf->fullsync = 1;
1777 rcu_assign_pointer(p->rdev, rdev); 1798 rcu_assign_pointer(p->rdev, rdev);
1778 break; 1799 break;
1779 } 1800 }
1780 if (err == 0 && test_bit(Unmerged, &rdev->flags)) { 1801 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1781 /* Some requests might not have seen this new 1802 /* Some requests might not have seen this new
1782 * merge_bvec_fn. We must wait for them to complete 1803 * merge_bvec_fn. We must wait for them to complete
1783 * before merging the device fully. 1804 * before merging the device fully.
1784 * First we make sure any code which has tested 1805 * First we make sure any code which has tested
1785 * our function has submitted the request, then 1806 * our function has submitted the request, then
1786 * we wait for all outstanding requests to complete. 1807 * we wait for all outstanding requests to complete.
1787 */ 1808 */
1788 synchronize_sched(); 1809 synchronize_sched();
1789 raise_barrier(conf, 0); 1810 raise_barrier(conf, 0);
1790 lower_barrier(conf); 1811 lower_barrier(conf);
1791 clear_bit(Unmerged, &rdev->flags); 1812 clear_bit(Unmerged, &rdev->flags);
1792 } 1813 }
1793 md_integrity_add_rdev(rdev, mddev); 1814 md_integrity_add_rdev(rdev, mddev);
1794 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev))) 1815 if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
1795 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); 1816 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
1796 1817
1797 print_conf(conf); 1818 print_conf(conf);
1798 return err; 1819 return err;
1799 } 1820 }
1800 1821
1801 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev) 1822 static int raid10_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
1802 { 1823 {
1803 struct r10conf *conf = mddev->private; 1824 struct r10conf *conf = mddev->private;
1804 int err = 0; 1825 int err = 0;
1805 int number = rdev->raid_disk; 1826 int number = rdev->raid_disk;
1806 struct md_rdev **rdevp; 1827 struct md_rdev **rdevp;
1807 struct raid10_info *p = conf->mirrors + number; 1828 struct raid10_info *p = conf->mirrors + number;
1808 1829
1809 print_conf(conf); 1830 print_conf(conf);
1810 if (rdev == p->rdev) 1831 if (rdev == p->rdev)
1811 rdevp = &p->rdev; 1832 rdevp = &p->rdev;
1812 else if (rdev == p->replacement) 1833 else if (rdev == p->replacement)
1813 rdevp = &p->replacement; 1834 rdevp = &p->replacement;
1814 else 1835 else
1815 return 0; 1836 return 0;
1816 1837
1817 if (test_bit(In_sync, &rdev->flags) || 1838 if (test_bit(In_sync, &rdev->flags) ||
1818 atomic_read(&rdev->nr_pending)) { 1839 atomic_read(&rdev->nr_pending)) {
1819 err = -EBUSY; 1840 err = -EBUSY;
1820 goto abort; 1841 goto abort;
1821 } 1842 }
1822 /* Only remove faulty devices if recovery 1843 /* Only remove faulty devices if recovery
1823 * is not possible. 1844 * is not possible.
1824 */ 1845 */
1825 if (!test_bit(Faulty, &rdev->flags) && 1846 if (!test_bit(Faulty, &rdev->flags) &&
1826 mddev->recovery_disabled != p->recovery_disabled && 1847 mddev->recovery_disabled != p->recovery_disabled &&
1827 (!p->replacement || p->replacement == rdev) && 1848 (!p->replacement || p->replacement == rdev) &&
1828 number < conf->geo.raid_disks && 1849 number < conf->geo.raid_disks &&
1829 enough(conf, -1)) { 1850 enough(conf, -1)) {
1830 err = -EBUSY; 1851 err = -EBUSY;
1831 goto abort; 1852 goto abort;
1832 } 1853 }
1833 *rdevp = NULL; 1854 *rdevp = NULL;
1834 synchronize_rcu(); 1855 synchronize_rcu();
1835 if (atomic_read(&rdev->nr_pending)) { 1856 if (atomic_read(&rdev->nr_pending)) {
1836 /* lost the race, try later */ 1857 /* lost the race, try later */
1837 err = -EBUSY; 1858 err = -EBUSY;
1838 *rdevp = rdev; 1859 *rdevp = rdev;
1839 goto abort; 1860 goto abort;
1840 } else if (p->replacement) { 1861 } else if (p->replacement) {
1841 /* We must have just cleared 'rdev' */ 1862 /* We must have just cleared 'rdev' */
1842 p->rdev = p->replacement; 1863 p->rdev = p->replacement;
1843 clear_bit(Replacement, &p->replacement->flags); 1864 clear_bit(Replacement, &p->replacement->flags);
1844 smp_mb(); /* Make sure other CPUs may see both as identical 1865 smp_mb(); /* Make sure other CPUs may see both as identical
1845 * but will never see neither -- if they are careful. 1866 * but will never see neither -- if they are careful.
1846 */ 1867 */
1847 p->replacement = NULL; 1868 p->replacement = NULL;
1848 clear_bit(WantReplacement, &rdev->flags); 1869 clear_bit(WantReplacement, &rdev->flags);
1849 } else 1870 } else
1850 /* We might have just removed the Replacement as faulty. 1871 /* We might have just removed the Replacement as faulty.
1851 * Clear the flag just in case 1872 * Clear the flag just in case
1852 */ 1873 */
1853 clear_bit(WantReplacement, &rdev->flags); 1874 clear_bit(WantReplacement, &rdev->flags);
1854 1875
1855 err = md_integrity_register(mddev); 1876 err = md_integrity_register(mddev);
1856 1877
1857 abort: 1878 abort:
1858 1879
1859 print_conf(conf); 1880 print_conf(conf);
1860 return err; 1881 return err;
1861 } 1882 }
1862 1883
1863 1884
1864 static void end_sync_read(struct bio *bio, int error) 1885 static void end_sync_read(struct bio *bio, int error)
1865 { 1886 {
1866 struct r10bio *r10_bio = bio->bi_private; 1887 struct r10bio *r10_bio = bio->bi_private;
1867 struct r10conf *conf = r10_bio->mddev->private; 1888 struct r10conf *conf = r10_bio->mddev->private;
1868 int d; 1889 int d;
1869 1890
1870 if (bio == r10_bio->master_bio) { 1891 if (bio == r10_bio->master_bio) {
1871 /* this is a reshape read */ 1892 /* this is a reshape read */
1872 d = r10_bio->read_slot; /* really the read dev */ 1893 d = r10_bio->read_slot; /* really the read dev */
1873 } else 1894 } else
1874 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL); 1895 d = find_bio_disk(conf, r10_bio, bio, NULL, NULL);
1875 1896
1876 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 1897 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
1877 set_bit(R10BIO_Uptodate, &r10_bio->state); 1898 set_bit(R10BIO_Uptodate, &r10_bio->state);
1878 else 1899 else
1879 /* The write handler will notice the lack of 1900 /* The write handler will notice the lack of
1880 * R10BIO_Uptodate and record any errors etc 1901 * R10BIO_Uptodate and record any errors etc
1881 */ 1902 */
1882 atomic_add(r10_bio->sectors, 1903 atomic_add(r10_bio->sectors,
1883 &conf->mirrors[d].rdev->corrected_errors); 1904 &conf->mirrors[d].rdev->corrected_errors);
1884 1905
1885 /* for reconstruct, we always reschedule after a read. 1906 /* for reconstruct, we always reschedule after a read.
1886 * for resync, only after all reads 1907 * for resync, only after all reads
1887 */ 1908 */
1888 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev); 1909 rdev_dec_pending(conf->mirrors[d].rdev, conf->mddev);
1889 if (test_bit(R10BIO_IsRecover, &r10_bio->state) || 1910 if (test_bit(R10BIO_IsRecover, &r10_bio->state) ||
1890 atomic_dec_and_test(&r10_bio->remaining)) { 1911 atomic_dec_and_test(&r10_bio->remaining)) {
1891 /* we have read all the blocks, 1912 /* we have read all the blocks,
1892 * do the comparison in process context in raid10d 1913 * do the comparison in process context in raid10d
1893 */ 1914 */
1894 reschedule_retry(r10_bio); 1915 reschedule_retry(r10_bio);
1895 } 1916 }
1896 } 1917 }
1897 1918
1898 static void end_sync_request(struct r10bio *r10_bio) 1919 static void end_sync_request(struct r10bio *r10_bio)
1899 { 1920 {
1900 struct mddev *mddev = r10_bio->mddev; 1921 struct mddev *mddev = r10_bio->mddev;
1901 1922
1902 while (atomic_dec_and_test(&r10_bio->remaining)) { 1923 while (atomic_dec_and_test(&r10_bio->remaining)) {
1903 if (r10_bio->master_bio == NULL) { 1924 if (r10_bio->master_bio == NULL) {
1904 /* the primary of several recovery bios */ 1925 /* the primary of several recovery bios */
1905 sector_t s = r10_bio->sectors; 1926 sector_t s = r10_bio->sectors;
1906 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1927 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1907 test_bit(R10BIO_WriteError, &r10_bio->state)) 1928 test_bit(R10BIO_WriteError, &r10_bio->state))
1908 reschedule_retry(r10_bio); 1929 reschedule_retry(r10_bio);
1909 else 1930 else
1910 put_buf(r10_bio); 1931 put_buf(r10_bio);
1911 md_done_sync(mddev, s, 1); 1932 md_done_sync(mddev, s, 1);
1912 break; 1933 break;
1913 } else { 1934 } else {
1914 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio; 1935 struct r10bio *r10_bio2 = (struct r10bio *)r10_bio->master_bio;
1915 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 1936 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
1916 test_bit(R10BIO_WriteError, &r10_bio->state)) 1937 test_bit(R10BIO_WriteError, &r10_bio->state))
1917 reschedule_retry(r10_bio); 1938 reschedule_retry(r10_bio);
1918 else 1939 else
1919 put_buf(r10_bio); 1940 put_buf(r10_bio);
1920 r10_bio = r10_bio2; 1941 r10_bio = r10_bio2;
1921 } 1942 }
1922 } 1943 }
1923 } 1944 }
1924 1945
1925 static void end_sync_write(struct bio *bio, int error) 1946 static void end_sync_write(struct bio *bio, int error)
1926 { 1947 {
1927 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 1948 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
1928 struct r10bio *r10_bio = bio->bi_private; 1949 struct r10bio *r10_bio = bio->bi_private;
1929 struct mddev *mddev = r10_bio->mddev; 1950 struct mddev *mddev = r10_bio->mddev;
1930 struct r10conf *conf = mddev->private; 1951 struct r10conf *conf = mddev->private;
1931 int d; 1952 int d;
1932 sector_t first_bad; 1953 sector_t first_bad;
1933 int bad_sectors; 1954 int bad_sectors;
1934 int slot; 1955 int slot;
1935 int repl; 1956 int repl;
1936 struct md_rdev *rdev = NULL; 1957 struct md_rdev *rdev = NULL;
1937 1958
1938 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1959 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1939 if (repl) 1960 if (repl)
1940 rdev = conf->mirrors[d].replacement; 1961 rdev = conf->mirrors[d].replacement;
1941 else 1962 else
1942 rdev = conf->mirrors[d].rdev; 1963 rdev = conf->mirrors[d].rdev;
1943 1964
1944 if (!uptodate) { 1965 if (!uptodate) {
1945 if (repl) 1966 if (repl)
1946 md_error(mddev, rdev); 1967 md_error(mddev, rdev);
1947 else { 1968 else {
1948 set_bit(WriteErrorSeen, &rdev->flags); 1969 set_bit(WriteErrorSeen, &rdev->flags);
1949 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 1970 if (!test_and_set_bit(WantReplacement, &rdev->flags))
1950 set_bit(MD_RECOVERY_NEEDED, 1971 set_bit(MD_RECOVERY_NEEDED,
1951 &rdev->mddev->recovery); 1972 &rdev->mddev->recovery);
1952 set_bit(R10BIO_WriteError, &r10_bio->state); 1973 set_bit(R10BIO_WriteError, &r10_bio->state);
1953 } 1974 }
1954 } else if (is_badblock(rdev, 1975 } else if (is_badblock(rdev,
1955 r10_bio->devs[slot].addr, 1976 r10_bio->devs[slot].addr,
1956 r10_bio->sectors, 1977 r10_bio->sectors,
1957 &first_bad, &bad_sectors)) 1978 &first_bad, &bad_sectors))
1958 set_bit(R10BIO_MadeGood, &r10_bio->state); 1979 set_bit(R10BIO_MadeGood, &r10_bio->state);
1959 1980
1960 rdev_dec_pending(rdev, mddev); 1981 rdev_dec_pending(rdev, mddev);
1961 1982
1962 end_sync_request(r10_bio); 1983 end_sync_request(r10_bio);
1963 } 1984 }
1964 1985
1965 /* 1986 /*
1966 * Note: sync and recover are handled very differently for raid10 1987 * Note: sync and recover are handled very differently for raid10
1967 * This code is for resync. 1988 * This code is for resync.
1968 * For resync, we read through virtual addresses and read all blocks. 1989 * For resync, we read through virtual addresses and read all blocks.
1969 * If there is any error, we schedule a write. The lowest numbered 1990 * If there is any error, we schedule a write. The lowest numbered
1970 * drive is authoritative. 1991 * drive is authoritative.
1971 * However, requests arrive by physical address, so we need to map. 1992 * However, requests arrive by physical address, so we need to map.
1972 * For every physical address there are raid_disks/copies virtual addresses, 1993 * For every physical address there are raid_disks/copies virtual addresses,
1973 * which is always at least one, but is not necessarily an integer. 1994 * which is always at least one, but is not necessarily an integer.
1974 * This means that a physical address can span multiple chunks, so we may 1995 * This means that a physical address can span multiple chunks, so we may
1975 * have to submit multiple io requests for a single sync request. 1996 * have to submit multiple io requests for a single sync request.
1976 */ 1997 */
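/* Editorial worked example (illustrative values, not from the source):
 * with 5 raid disks and 2 copies, each physical address corresponds to
 * 5/2 = 2.5 virtual addresses, so a physical range can straddle a chunk
 * boundary and its resync must be split into more than one request.
 */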
1977 /* 1998 /*
1978 * We check if all blocks are in-sync and only write to blocks that 1999 * We check if all blocks are in-sync and only write to blocks that
1979 * aren't in sync 2000 * aren't in sync
1980 */ 2001 */
1981 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2002 static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio)
1982 { 2003 {
1983 struct r10conf *conf = mddev->private; 2004 struct r10conf *conf = mddev->private;
1984 int i, first; 2005 int i, first;
1985 struct bio *tbio, *fbio; 2006 struct bio *tbio, *fbio;
1986 int vcnt; 2007 int vcnt;
1987 2008
1988 atomic_set(&r10_bio->remaining, 1); 2009 atomic_set(&r10_bio->remaining, 1);
1989 2010
1990 /* find the first device with a block */ 2011 /* find the first device with a block */
1991 for (i=0; i<conf->copies; i++) 2012 for (i=0; i<conf->copies; i++)
1992 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) 2013 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags))
1993 break; 2014 break;
1994 2015
1995 if (i == conf->copies) 2016 if (i == conf->copies)
1996 goto done; 2017 goto done;
1997 2018
1998 first = i; 2019 first = i;
1999 fbio = r10_bio->devs[i].bio; 2020 fbio = r10_bio->devs[i].bio;
2000 2021
2001 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9); 2022 vcnt = (r10_bio->sectors + (PAGE_SIZE >> 9) - 1) >> (PAGE_SHIFT - 9);
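/* Editorial worked example (assuming 4KiB pages, so PAGE_SIZE >> 9 == 8
 * sectors per page): r10_bio->sectors == 20 gives
 * vcnt = (20 + 8 - 1) >> (PAGE_SHIFT - 9) = 27 >> 3 = 3 bio_vec pages.
 */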
2002 /* now find blocks with errors */ 2023 /* now find blocks with errors */
2003 for (i=0 ; i < conf->copies ; i++) { 2024 for (i=0 ; i < conf->copies ; i++) {
2004 int j, d; 2025 int j, d;
2005 2026
2006 tbio = r10_bio->devs[i].bio; 2027 tbio = r10_bio->devs[i].bio;
2007 2028
2008 if (tbio->bi_end_io != end_sync_read) 2029 if (tbio->bi_end_io != end_sync_read)
2009 continue; 2030 continue;
2010 if (i == first) 2031 if (i == first)
2011 continue; 2032 continue;
2012 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) { 2033 if (test_bit(BIO_UPTODATE, &r10_bio->devs[i].bio->bi_flags)) {
2013 /* We know that the bi_io_vec layout is the same for 2034 /* We know that the bi_io_vec layout is the same for
2014 * both 'first' and 'i', so we just compare them. 2035 * both 'first' and 'i', so we just compare them.
2015 * All vec entries are PAGE_SIZE; 2036 * All vec entries are PAGE_SIZE;
2016 */ 2037 */
2017 for (j = 0; j < vcnt; j++) 2038 for (j = 0; j < vcnt; j++)
2018 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page), 2039 if (memcmp(page_address(fbio->bi_io_vec[j].bv_page),
2019 page_address(tbio->bi_io_vec[j].bv_page), 2040 page_address(tbio->bi_io_vec[j].bv_page),
2020 fbio->bi_io_vec[j].bv_len)) 2041 fbio->bi_io_vec[j].bv_len))
2021 break; 2042 break;
2022 if (j == vcnt) 2043 if (j == vcnt)
2023 continue; 2044 continue;
2024 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); 2045 atomic64_add(r10_bio->sectors, &mddev->resync_mismatches);
2025 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 2046 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
2026 /* Don't fix anything. */ 2047 /* Don't fix anything. */
2027 continue; 2048 continue;
2028 } 2049 }
2029 /* Ok, we need to write this bio, either to correct an 2050 /* Ok, we need to write this bio, either to correct an
2030 * inconsistency or to correct an unreadable block. 2051 * inconsistency or to correct an unreadable block.
2031 * First we need to fixup bv_offset, bv_len and 2052 * First we need to fixup bv_offset, bv_len and
2032 * bi_vecs, as the read request might have corrupted these 2053 * bi_vecs, as the read request might have corrupted these
2033 */ 2054 */
2034 tbio->bi_vcnt = vcnt; 2055 tbio->bi_vcnt = vcnt;
2035 tbio->bi_size = r10_bio->sectors << 9; 2056 tbio->bi_size = r10_bio->sectors << 9;
2036 tbio->bi_idx = 0; 2057 tbio->bi_idx = 0;
2037 tbio->bi_phys_segments = 0; 2058 tbio->bi_phys_segments = 0;
2038 tbio->bi_flags &= ~(BIO_POOL_MASK - 1); 2059 tbio->bi_flags &= ~(BIO_POOL_MASK - 1);
2039 tbio->bi_flags |= 1 << BIO_UPTODATE; 2060 tbio->bi_flags |= 1 << BIO_UPTODATE;
2040 tbio->bi_next = NULL; 2061 tbio->bi_next = NULL;
2041 tbio->bi_rw = WRITE; 2062 tbio->bi_rw = WRITE;
2042 tbio->bi_private = r10_bio; 2063 tbio->bi_private = r10_bio;
2043 tbio->bi_sector = r10_bio->devs[i].addr; 2064 tbio->bi_sector = r10_bio->devs[i].addr;
2044 2065
2045 for (j=0; j < vcnt ; j++) { 2066 for (j=0; j < vcnt ; j++) {
2046 tbio->bi_io_vec[j].bv_offset = 0; 2067 tbio->bi_io_vec[j].bv_offset = 0;
2047 tbio->bi_io_vec[j].bv_len = PAGE_SIZE; 2068 tbio->bi_io_vec[j].bv_len = PAGE_SIZE;
2048 2069
2049 memcpy(page_address(tbio->bi_io_vec[j].bv_page), 2070 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2050 page_address(fbio->bi_io_vec[j].bv_page), 2071 page_address(fbio->bi_io_vec[j].bv_page),
2051 PAGE_SIZE); 2072 PAGE_SIZE);
2052 } 2073 }
2053 tbio->bi_end_io = end_sync_write; 2074 tbio->bi_end_io = end_sync_write;
2054 2075
2055 d = r10_bio->devs[i].devnum; 2076 d = r10_bio->devs[i].devnum;
2056 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2077 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2057 atomic_inc(&r10_bio->remaining); 2078 atomic_inc(&r10_bio->remaining);
2058 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9); 2079 md_sync_acct(conf->mirrors[d].rdev->bdev, tbio->bi_size >> 9);
2059 2080
2060 tbio->bi_sector += conf->mirrors[d].rdev->data_offset; 2081 tbio->bi_sector += conf->mirrors[d].rdev->data_offset;
2061 tbio->bi_bdev = conf->mirrors[d].rdev->bdev; 2082 tbio->bi_bdev = conf->mirrors[d].rdev->bdev;
2062 generic_make_request(tbio); 2083 generic_make_request(tbio);
2063 } 2084 }
2064 2085
2065 /* Now write out to any replacement devices 2086 /* Now write out to any replacement devices
2066 * that are active 2087 * that are active
2067 */ 2088 */
2068 for (i = 0; i < conf->copies; i++) { 2089 for (i = 0; i < conf->copies; i++) {
2069 int j, d; 2090 int j, d;
2070 2091
2071 tbio = r10_bio->devs[i].repl_bio; 2092 tbio = r10_bio->devs[i].repl_bio;
2072 if (!tbio || !tbio->bi_end_io) 2093 if (!tbio || !tbio->bi_end_io)
2073 continue; 2094 continue;
2074 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write 2095 if (r10_bio->devs[i].bio->bi_end_io != end_sync_write
2075 && r10_bio->devs[i].bio != fbio) 2096 && r10_bio->devs[i].bio != fbio)
2076 for (j = 0; j < vcnt; j++) 2097 for (j = 0; j < vcnt; j++)
2077 memcpy(page_address(tbio->bi_io_vec[j].bv_page), 2098 memcpy(page_address(tbio->bi_io_vec[j].bv_page),
2078 page_address(fbio->bi_io_vec[j].bv_page), 2099 page_address(fbio->bi_io_vec[j].bv_page),
2079 PAGE_SIZE); 2100 PAGE_SIZE);
2080 d = r10_bio->devs[i].devnum; 2101 d = r10_bio->devs[i].devnum;
2081 atomic_inc(&r10_bio->remaining); 2102 atomic_inc(&r10_bio->remaining);
2082 md_sync_acct(conf->mirrors[d].replacement->bdev, 2103 md_sync_acct(conf->mirrors[d].replacement->bdev,
2083 tbio->bi_size >> 9); 2104 tbio->bi_size >> 9);
2084 generic_make_request(tbio); 2105 generic_make_request(tbio);
2085 } 2106 }
2086 2107
2087 done: 2108 done:
2088 if (atomic_dec_and_test(&r10_bio->remaining)) { 2109 if (atomic_dec_and_test(&r10_bio->remaining)) {
2089 md_done_sync(mddev, r10_bio->sectors, 1); 2110 md_done_sync(mddev, r10_bio->sectors, 1);
2090 put_buf(r10_bio); 2111 put_buf(r10_bio);
2091 } 2112 }
2092 } 2113 }
2093 2114
2094 /* 2115 /*
2095 * Now for the recovery code. 2116 * Now for the recovery code.
2096 * Recovery happens across physical sectors. 2117 * Recovery happens across physical sectors.
2097 * We recover all non-in_sync drives by finding the virtual address of 2118 * We recover all non-in_sync drives by finding the virtual address of
2098 * each, and then choose a working drive that also has that virt address. 2119 * each, and then choose a working drive that also has that virt address.
2099 * There is a separate r10_bio for each non-in_sync drive. 2120 * There is a separate r10_bio for each non-in_sync drive.
2100 * Only the first two slots are in use: the first for reading, 2121 * Only the first two slots are in use: the first for reading,
2101 * the second for writing. 2122 * the second for writing.
2102 * 2123 *
2103 */ 2124 */
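/* Editorial note on the function below: r10_bio->devs[0] is the read
 * slot (a working drive holding the same virtual address, 'dr') and
 * r10_bio->devs[1] is the write slot (the drive being recovered, 'dw'),
 * matching the two slots described above.
 */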
2104 static void fix_recovery_read_error(struct r10bio *r10_bio) 2125 static void fix_recovery_read_error(struct r10bio *r10_bio)
2105 { 2126 {
2106 /* We got a read error during recovery. 2127 /* We got a read error during recovery.
2107 * We repeat the read in smaller page-sized sections. 2128 * We repeat the read in smaller page-sized sections.
2108 * If a read succeeds, write it to the new device or record 2129 * If a read succeeds, write it to the new device or record
2109 * a bad block if we cannot. 2130 * a bad block if we cannot.
2110 * If a read fails, record a bad block on both old and 2131 * If a read fails, record a bad block on both old and
2111 * new devices. 2132 * new devices.
2112 */ 2133 */
2113 struct mddev *mddev = r10_bio->mddev; 2134 struct mddev *mddev = r10_bio->mddev;
2114 struct r10conf *conf = mddev->private; 2135 struct r10conf *conf = mddev->private;
2115 struct bio *bio = r10_bio->devs[0].bio; 2136 struct bio *bio = r10_bio->devs[0].bio;
2116 sector_t sect = 0; 2137 sector_t sect = 0;
2117 int sectors = r10_bio->sectors; 2138 int sectors = r10_bio->sectors;
2118 int idx = 0; 2139 int idx = 0;
2119 int dr = r10_bio->devs[0].devnum; 2140 int dr = r10_bio->devs[0].devnum;
2120 int dw = r10_bio->devs[1].devnum; 2141 int dw = r10_bio->devs[1].devnum;
2121 2142
2122 while (sectors) { 2143 while (sectors) {
2123 int s = sectors; 2144 int s = sectors;
2124 struct md_rdev *rdev; 2145 struct md_rdev *rdev;
2125 sector_t addr; 2146 sector_t addr;
2126 int ok; 2147 int ok;
2127 2148
2128 if (s > (PAGE_SIZE>>9)) 2149 if (s > (PAGE_SIZE>>9))
2129 s = PAGE_SIZE >> 9; 2150 s = PAGE_SIZE >> 9;
2130 2151
2131 rdev = conf->mirrors[dr].rdev; 2152 rdev = conf->mirrors[dr].rdev;
2132 addr = r10_bio->devs[0].addr + sect, 2153 addr = r10_bio->devs[0].addr + sect,
2133 ok = sync_page_io(rdev, 2154 ok = sync_page_io(rdev,
2134 addr, 2155 addr,
2135 s << 9, 2156 s << 9,
2136 bio->bi_io_vec[idx].bv_page, 2157 bio->bi_io_vec[idx].bv_page,
2137 READ, false); 2158 READ, false);
2138 if (ok) { 2159 if (ok) {
2139 rdev = conf->mirrors[dw].rdev; 2160 rdev = conf->mirrors[dw].rdev;
2140 addr = r10_bio->devs[1].addr + sect; 2161 addr = r10_bio->devs[1].addr + sect;
2141 ok = sync_page_io(rdev, 2162 ok = sync_page_io(rdev,
2142 addr, 2163 addr,
2143 s << 9, 2164 s << 9,
2144 bio->bi_io_vec[idx].bv_page, 2165 bio->bi_io_vec[idx].bv_page,
2145 WRITE, false); 2166 WRITE, false);
2146 if (!ok) { 2167 if (!ok) {
2147 set_bit(WriteErrorSeen, &rdev->flags); 2168 set_bit(WriteErrorSeen, &rdev->flags);
2148 if (!test_and_set_bit(WantReplacement, 2169 if (!test_and_set_bit(WantReplacement,
2149 &rdev->flags)) 2170 &rdev->flags))
2150 set_bit(MD_RECOVERY_NEEDED, 2171 set_bit(MD_RECOVERY_NEEDED,
2151 &rdev->mddev->recovery); 2172 &rdev->mddev->recovery);
2152 } 2173 }
2153 } 2174 }
2154 if (!ok) { 2175 if (!ok) {
2155 /* We don't worry if we cannot set a bad block - 2176 /* We don't worry if we cannot set a bad block -
2156 * it really is bad so there is no loss in not 2177 * it really is bad so there is no loss in not
2157 * recording it yet 2178 * recording it yet
2158 */ 2179 */
2159 rdev_set_badblocks(rdev, addr, s, 0); 2180 rdev_set_badblocks(rdev, addr, s, 0);
2160 2181
2161 if (rdev != conf->mirrors[dw].rdev) { 2182 if (rdev != conf->mirrors[dw].rdev) {
2162 /* need bad block on destination too */ 2183 /* need bad block on destination too */
2163 struct md_rdev *rdev2 = conf->mirrors[dw].rdev; 2184 struct md_rdev *rdev2 = conf->mirrors[dw].rdev;
2164 addr = r10_bio->devs[1].addr + sect; 2185 addr = r10_bio->devs[1].addr + sect;
2165 ok = rdev_set_badblocks(rdev2, addr, s, 0); 2186 ok = rdev_set_badblocks(rdev2, addr, s, 0);
2166 if (!ok) { 2187 if (!ok) {
2167 /* just abort the recovery */ 2188 /* just abort the recovery */
2168 printk(KERN_NOTICE 2189 printk(KERN_NOTICE
2169 "md/raid10:%s: recovery aborted" 2190 "md/raid10:%s: recovery aborted"
2170 " due to read error\n", 2191 " due to read error\n",
2171 mdname(mddev)); 2192 mdname(mddev));
2172 2193
2173 conf->mirrors[dw].recovery_disabled 2194 conf->mirrors[dw].recovery_disabled
2174 = mddev->recovery_disabled; 2195 = mddev->recovery_disabled;
2175 set_bit(MD_RECOVERY_INTR, 2196 set_bit(MD_RECOVERY_INTR,
2176 &mddev->recovery); 2197 &mddev->recovery);
2177 break; 2198 break;
2178 } 2199 }
2179 } 2200 }
2180 } 2201 }
2181 2202
2182 sectors -= s; 2203 sectors -= s;
2183 sect += s; 2204 sect += s;
2184 idx++; 2205 idx++;
2185 } 2206 }
2186 } 2207 }
2187 2208
2188 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio) 2209 static void recovery_request_write(struct mddev *mddev, struct r10bio *r10_bio)
2189 { 2210 {
2190 struct r10conf *conf = mddev->private; 2211 struct r10conf *conf = mddev->private;
2191 int d; 2212 int d;
2192 struct bio *wbio, *wbio2; 2213 struct bio *wbio, *wbio2;
2193 2214
2194 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) { 2215 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) {
2195 fix_recovery_read_error(r10_bio); 2216 fix_recovery_read_error(r10_bio);
2196 end_sync_request(r10_bio); 2217 end_sync_request(r10_bio);
2197 return; 2218 return;
2198 } 2219 }
2199 2220
2200 /* 2221 /*
2201 * share the pages with the first bio 2222 * share the pages with the first bio
2202 * and submit the write request 2223 * and submit the write request
2203 */ 2224 */
2204 d = r10_bio->devs[1].devnum; 2225 d = r10_bio->devs[1].devnum;
2205 wbio = r10_bio->devs[1].bio; 2226 wbio = r10_bio->devs[1].bio;
2206 wbio2 = r10_bio->devs[1].repl_bio; 2227 wbio2 = r10_bio->devs[1].repl_bio;
2207 if (wbio->bi_end_io) { 2228 if (wbio->bi_end_io) {
2208 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 2229 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
2209 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9); 2230 md_sync_acct(conf->mirrors[d].rdev->bdev, wbio->bi_size >> 9);
2210 generic_make_request(wbio); 2231 generic_make_request(wbio);
2211 } 2232 }
2212 if (wbio2 && wbio2->bi_end_io) { 2233 if (wbio2 && wbio2->bi_end_io) {
2213 atomic_inc(&conf->mirrors[d].replacement->nr_pending); 2234 atomic_inc(&conf->mirrors[d].replacement->nr_pending);
2214 md_sync_acct(conf->mirrors[d].replacement->bdev, 2235 md_sync_acct(conf->mirrors[d].replacement->bdev,
2215 wbio2->bi_size >> 9); 2236 wbio2->bi_size >> 9);
2216 generic_make_request(wbio2); 2237 generic_make_request(wbio2);
2217 } 2238 }
2218 } 2239 }
2219 2240
2220 2241
2221 /* 2242 /*
2222 * Used by fix_read_error() to decay the per rdev read_errors. 2243 * Used by fix_read_error() to decay the per rdev read_errors.
2223 * We halve the read error count for every hour that has elapsed 2244 * We halve the read error count for every hour that has elapsed
2224 * since the last recorded read error. 2245 * since the last recorded read error.
2225 * 2246 *
2226 */ 2247 */
2227 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev) 2248 static void check_decay_read_errors(struct mddev *mddev, struct md_rdev *rdev)
2228 { 2249 {
2229 struct timespec cur_time_mon; 2250 struct timespec cur_time_mon;
2230 unsigned long hours_since_last; 2251 unsigned long hours_since_last;
2231 unsigned int read_errors = atomic_read(&rdev->read_errors); 2252 unsigned int read_errors = atomic_read(&rdev->read_errors);
2232 2253
2233 ktime_get_ts(&cur_time_mon); 2254 ktime_get_ts(&cur_time_mon);
2234 2255
2235 if (rdev->last_read_error.tv_sec == 0 && 2256 if (rdev->last_read_error.tv_sec == 0 &&
2236 rdev->last_read_error.tv_nsec == 0) { 2257 rdev->last_read_error.tv_nsec == 0) {
2237 /* first time we've seen a read error */ 2258 /* first time we've seen a read error */
2238 rdev->last_read_error = cur_time_mon; 2259 rdev->last_read_error = cur_time_mon;
2239 return; 2260 return;
2240 } 2261 }
2241 2262
2242 hours_since_last = (cur_time_mon.tv_sec - 2263 hours_since_last = (cur_time_mon.tv_sec -
2243 rdev->last_read_error.tv_sec) / 3600; 2264 rdev->last_read_error.tv_sec) / 3600;
2244 2265
2245 rdev->last_read_error = cur_time_mon; 2266 rdev->last_read_error = cur_time_mon;
2246 2267
2247 /* 2268 /*
2248 * if hours_since_last is > the number of bits in read_errors 2269 * if hours_since_last is > the number of bits in read_errors
2249 * just set read errors to 0. We do this to avoid 2270 * just set read errors to 0. We do this to avoid
2250 * overflowing the shift of read_errors by hours_since_last. 2271 * overflowing the shift of read_errors by hours_since_last.
2251 */ 2272 */
2252 if (hours_since_last >= 8 * sizeof(read_errors)) 2273 if (hours_since_last >= 8 * sizeof(read_errors))
2253 atomic_set(&rdev->read_errors, 0); 2274 atomic_set(&rdev->read_errors, 0);
2254 else 2275 else
2255 atomic_set(&rdev->read_errors, read_errors >> hours_since_last); 2276 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
2256 } 2277 }
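Below is a minimal userspace sketch of the same decay rule, included here as an editorial illustration only (the helper name and the printf harness are invented for the example and are not part of the driver):

	#include <stdio.h>

	/* Halve the error count once per elapsed hour; reset it to zero when
	 * the shift would reach the bit width of the counter, mirroring the
	 * overflow check in check_decay_read_errors() above.
	 */
	static unsigned int decay_read_errors(unsigned int read_errors,
					      unsigned long hours_since_last)
	{
		if (hours_since_last >= 8 * sizeof(read_errors))
			return 0;
		return read_errors >> hours_since_last;
	}

	int main(void)
	{
		printf("%u\n", decay_read_errors(40, 3));	/* prints 5 */
		printf("%u\n", decay_read_errors(40, 40));	/* prints 0 */
		return 0;
	}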
2257 2278
2258 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector, 2279 static int r10_sync_page_io(struct md_rdev *rdev, sector_t sector,
2259 int sectors, struct page *page, int rw) 2280 int sectors, struct page *page, int rw)
2260 { 2281 {
2261 sector_t first_bad; 2282 sector_t first_bad;
2262 int bad_sectors; 2283 int bad_sectors;
2263 2284
2264 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) 2285 if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors)
2265 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) 2286 && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags)))
2266 return -1; 2287 return -1;
2267 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) 2288 if (sync_page_io(rdev, sector, sectors << 9, page, rw, false))
2268 /* success */ 2289 /* success */
2269 return 1; 2290 return 1;
2270 if (rw == WRITE) { 2291 if (rw == WRITE) {
2271 set_bit(WriteErrorSeen, &rdev->flags); 2292 set_bit(WriteErrorSeen, &rdev->flags);
2272 if (!test_and_set_bit(WantReplacement, &rdev->flags)) 2293 if (!test_and_set_bit(WantReplacement, &rdev->flags))
2273 set_bit(MD_RECOVERY_NEEDED, 2294 set_bit(MD_RECOVERY_NEEDED,
2274 &rdev->mddev->recovery); 2295 &rdev->mddev->recovery);
2275 } 2296 }
2276 /* need to record an error - either for the block or the device */ 2297 /* need to record an error - either for the block or the device */
2277 if (!rdev_set_badblocks(rdev, sector, sectors, 0)) 2298 if (!rdev_set_badblocks(rdev, sector, sectors, 0))
2278 md_error(rdev->mddev, rdev); 2299 md_error(rdev->mddev, rdev);
2279 return 0; 2300 return 0;
2280 } 2301 }
2281 2302
2282 /* 2303 /*
2283 * This is a kernel thread which: 2304 * This is a kernel thread which:
2284 * 2305 *
2285 * 1. Retries failed read operations on working mirrors. 2306 * 1. Retries failed read operations on working mirrors.
2286 * 2. Updates the raid superblock when problems are encountered. 2307 * 2. Updates the raid superblock when problems are encountered.
2287 * 3. Performs writes following reads for array synchronising. 2308 * 3. Performs writes following reads for array synchronising.
2288 */ 2309 */
2289 2310
2290 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio) 2311 static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10bio *r10_bio)
2291 { 2312 {
2292 int sect = 0; /* Offset from r10_bio->sector */ 2313 int sect = 0; /* Offset from r10_bio->sector */
2293 int sectors = r10_bio->sectors; 2314 int sectors = r10_bio->sectors;
2294 struct md_rdev*rdev; 2315 struct md_rdev*rdev;
2295 int max_read_errors = atomic_read(&mddev->max_corr_read_errors); 2316 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
2296 int d = r10_bio->devs[r10_bio->read_slot].devnum; 2317 int d = r10_bio->devs[r10_bio->read_slot].devnum;
2297 2318
2298 /* still own a reference to this rdev, so it cannot 2319 /* still own a reference to this rdev, so it cannot
2299 * have been cleared recently. 2320 * have been cleared recently.
2300 */ 2321 */
2301 rdev = conf->mirrors[d].rdev; 2322 rdev = conf->mirrors[d].rdev;
2302 2323
2303 if (test_bit(Faulty, &rdev->flags)) 2324 if (test_bit(Faulty, &rdev->flags))
2304 /* drive has already been failed, just ignore any 2325 /* drive has already been failed, just ignore any
2305 more fix_read_error() attempts */ 2326 more fix_read_error() attempts */
2306 return; 2327 return;
2307 2328
2308 check_decay_read_errors(mddev, rdev); 2329 check_decay_read_errors(mddev, rdev);
2309 atomic_inc(&rdev->read_errors); 2330 atomic_inc(&rdev->read_errors);
2310 if (atomic_read(&rdev->read_errors) > max_read_errors) { 2331 if (atomic_read(&rdev->read_errors) > max_read_errors) {
2311 char b[BDEVNAME_SIZE]; 2332 char b[BDEVNAME_SIZE];
2312 bdevname(rdev->bdev, b); 2333 bdevname(rdev->bdev, b);
2313 2334
2314 printk(KERN_NOTICE 2335 printk(KERN_NOTICE
2315 "md/raid10:%s: %s: Raid device exceeded " 2336 "md/raid10:%s: %s: Raid device exceeded "
2316 "read_error threshold [cur %d:max %d]\n", 2337 "read_error threshold [cur %d:max %d]\n",
2317 mdname(mddev), b, 2338 mdname(mddev), b,
2318 atomic_read(&rdev->read_errors), max_read_errors); 2339 atomic_read(&rdev->read_errors), max_read_errors);
2319 printk(KERN_NOTICE 2340 printk(KERN_NOTICE
2320 "md/raid10:%s: %s: Failing raid device\n", 2341 "md/raid10:%s: %s: Failing raid device\n",
2321 mdname(mddev), b); 2342 mdname(mddev), b);
2322 md_error(mddev, conf->mirrors[d].rdev); 2343 md_error(mddev, conf->mirrors[d].rdev);
2323 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; 2344 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2324 return; 2345 return;
2325 } 2346 }
2326 2347
2327 while(sectors) { 2348 while(sectors) {
2328 int s = sectors; 2349 int s = sectors;
2329 int sl = r10_bio->read_slot; 2350 int sl = r10_bio->read_slot;
2330 int success = 0; 2351 int success = 0;
2331 int start; 2352 int start;
2332 2353
2333 if (s > (PAGE_SIZE>>9)) 2354 if (s > (PAGE_SIZE>>9))
2334 s = PAGE_SIZE >> 9; 2355 s = PAGE_SIZE >> 9;
2335 2356
2336 rcu_read_lock(); 2357 rcu_read_lock();
2337 do { 2358 do {
2338 sector_t first_bad; 2359 sector_t first_bad;
2339 int bad_sectors; 2360 int bad_sectors;
2340 2361
2341 d = r10_bio->devs[sl].devnum; 2362 d = r10_bio->devs[sl].devnum;
2342 rdev = rcu_dereference(conf->mirrors[d].rdev); 2363 rdev = rcu_dereference(conf->mirrors[d].rdev);
2343 if (rdev && 2364 if (rdev &&
2344 !test_bit(Unmerged, &rdev->flags) && 2365 !test_bit(Unmerged, &rdev->flags) &&
2345 test_bit(In_sync, &rdev->flags) && 2366 test_bit(In_sync, &rdev->flags) &&
2346 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2367 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2347 &first_bad, &bad_sectors) == 0) { 2368 &first_bad, &bad_sectors) == 0) {
2348 atomic_inc(&rdev->nr_pending); 2369 atomic_inc(&rdev->nr_pending);
2349 rcu_read_unlock(); 2370 rcu_read_unlock();
2350 success = sync_page_io(rdev, 2371 success = sync_page_io(rdev,
2351 r10_bio->devs[sl].addr + 2372 r10_bio->devs[sl].addr +
2352 sect, 2373 sect,
2353 s<<9, 2374 s<<9,
2354 conf->tmppage, READ, false); 2375 conf->tmppage, READ, false);
2355 rdev_dec_pending(rdev, mddev); 2376 rdev_dec_pending(rdev, mddev);
2356 rcu_read_lock(); 2377 rcu_read_lock();
2357 if (success) 2378 if (success)
2358 break; 2379 break;
2359 } 2380 }
2360 sl++; 2381 sl++;
2361 if (sl == conf->copies) 2382 if (sl == conf->copies)
2362 sl = 0; 2383 sl = 0;
2363 } while (!success && sl != r10_bio->read_slot); 2384 } while (!success && sl != r10_bio->read_slot);
2364 rcu_read_unlock(); 2385 rcu_read_unlock();
2365 2386
2366 if (!success) { 2387 if (!success) {
2367 /* Cannot read from anywhere, just mark the block 2388 /* Cannot read from anywhere, just mark the block
2368 * as bad on the first device to discourage future 2389 * as bad on the first device to discourage future
2369 * reads. 2390 * reads.
2370 */ 2391 */
2371 int dn = r10_bio->devs[r10_bio->read_slot].devnum; 2392 int dn = r10_bio->devs[r10_bio->read_slot].devnum;
2372 rdev = conf->mirrors[dn].rdev; 2393 rdev = conf->mirrors[dn].rdev;
2373 2394
2374 if (!rdev_set_badblocks( 2395 if (!rdev_set_badblocks(
2375 rdev, 2396 rdev,
2376 r10_bio->devs[r10_bio->read_slot].addr 2397 r10_bio->devs[r10_bio->read_slot].addr
2377 + sect, 2398 + sect,
2378 s, 0)) { 2399 s, 0)) {
2379 md_error(mddev, rdev); 2400 md_error(mddev, rdev);
2380 r10_bio->devs[r10_bio->read_slot].bio 2401 r10_bio->devs[r10_bio->read_slot].bio
2381 = IO_BLOCKED; 2402 = IO_BLOCKED;
2382 } 2403 }
2383 break; 2404 break;
2384 } 2405 }
2385 2406
2386 start = sl; 2407 start = sl;
2387 /* write it back and re-read */ 2408 /* write it back and re-read */
2388 rcu_read_lock(); 2409 rcu_read_lock();
2389 while (sl != r10_bio->read_slot) { 2410 while (sl != r10_bio->read_slot) {
2390 char b[BDEVNAME_SIZE]; 2411 char b[BDEVNAME_SIZE];
2391 2412
2392 if (sl==0) 2413 if (sl==0)
2393 sl = conf->copies; 2414 sl = conf->copies;
2394 sl--; 2415 sl--;
2395 d = r10_bio->devs[sl].devnum; 2416 d = r10_bio->devs[sl].devnum;
2396 rdev = rcu_dereference(conf->mirrors[d].rdev); 2417 rdev = rcu_dereference(conf->mirrors[d].rdev);
2397 if (!rdev || 2418 if (!rdev ||
2398 test_bit(Unmerged, &rdev->flags) || 2419 test_bit(Unmerged, &rdev->flags) ||
2399 !test_bit(In_sync, &rdev->flags)) 2420 !test_bit(In_sync, &rdev->flags))
2400 continue; 2421 continue;
2401 2422
2402 atomic_inc(&rdev->nr_pending); 2423 atomic_inc(&rdev->nr_pending);
2403 rcu_read_unlock(); 2424 rcu_read_unlock();
2404 if (r10_sync_page_io(rdev, 2425 if (r10_sync_page_io(rdev,
2405 r10_bio->devs[sl].addr + 2426 r10_bio->devs[sl].addr +
2406 sect, 2427 sect,
2407 s, conf->tmppage, WRITE) 2428 s, conf->tmppage, WRITE)
2408 == 0) { 2429 == 0) {
2409 /* Well, this device is dead */ 2430 /* Well, this device is dead */
2410 printk(KERN_NOTICE 2431 printk(KERN_NOTICE
2411 "md/raid10:%s: read correction " 2432 "md/raid10:%s: read correction "
2412 "write failed" 2433 "write failed"
2413 " (%d sectors at %llu on %s)\n", 2434 " (%d sectors at %llu on %s)\n",
2414 mdname(mddev), s, 2435 mdname(mddev), s,
2415 (unsigned long long)( 2436 (unsigned long long)(
2416 sect + 2437 sect +
2417 choose_data_offset(r10_bio, 2438 choose_data_offset(r10_bio,
2418 rdev)), 2439 rdev)),
2419 bdevname(rdev->bdev, b)); 2440 bdevname(rdev->bdev, b));
2420 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2441 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2421 "drive\n", 2442 "drive\n",
2422 mdname(mddev), 2443 mdname(mddev),
2423 bdevname(rdev->bdev, b)); 2444 bdevname(rdev->bdev, b));
2424 } 2445 }
2425 rdev_dec_pending(rdev, mddev); 2446 rdev_dec_pending(rdev, mddev);
2426 rcu_read_lock(); 2447 rcu_read_lock();
2427 } 2448 }
2428 sl = start; 2449 sl = start;
2429 while (sl != r10_bio->read_slot) { 2450 while (sl != r10_bio->read_slot) {
2430 char b[BDEVNAME_SIZE]; 2451 char b[BDEVNAME_SIZE];
2431 2452
2432 if (sl==0) 2453 if (sl==0)
2433 sl = conf->copies; 2454 sl = conf->copies;
2434 sl--; 2455 sl--;
2435 d = r10_bio->devs[sl].devnum; 2456 d = r10_bio->devs[sl].devnum;
2436 rdev = rcu_dereference(conf->mirrors[d].rdev); 2457 rdev = rcu_dereference(conf->mirrors[d].rdev);
2437 if (!rdev || 2458 if (!rdev ||
2438 !test_bit(In_sync, &rdev->flags)) 2459 !test_bit(In_sync, &rdev->flags))
2439 continue; 2460 continue;
2440 2461
2441 atomic_inc(&rdev->nr_pending); 2462 atomic_inc(&rdev->nr_pending);
2442 rcu_read_unlock(); 2463 rcu_read_unlock();
2443 switch (r10_sync_page_io(rdev, 2464 switch (r10_sync_page_io(rdev,
2444 r10_bio->devs[sl].addr + 2465 r10_bio->devs[sl].addr +
2445 sect, 2466 sect,
2446 s, conf->tmppage, 2467 s, conf->tmppage,
2447 READ)) { 2468 READ)) {
2448 case 0: 2469 case 0:
2449 /* Well, this device is dead */ 2470 /* Well, this device is dead */
2450 printk(KERN_NOTICE 2471 printk(KERN_NOTICE
2451 "md/raid10:%s: unable to read back " 2472 "md/raid10:%s: unable to read back "
2452 "corrected sectors" 2473 "corrected sectors"
2453 " (%d sectors at %llu on %s)\n", 2474 " (%d sectors at %llu on %s)\n",
2454 mdname(mddev), s, 2475 mdname(mddev), s,
2455 (unsigned long long)( 2476 (unsigned long long)(
2456 sect + 2477 sect +
2457 choose_data_offset(r10_bio, rdev)), 2478 choose_data_offset(r10_bio, rdev)),
2458 bdevname(rdev->bdev, b)); 2479 bdevname(rdev->bdev, b));
2459 printk(KERN_NOTICE "md/raid10:%s: %s: failing " 2480 printk(KERN_NOTICE "md/raid10:%s: %s: failing "
2460 "drive\n", 2481 "drive\n",
2461 mdname(mddev), 2482 mdname(mddev),
2462 bdevname(rdev->bdev, b)); 2483 bdevname(rdev->bdev, b));
2463 break; 2484 break;
2464 case 1: 2485 case 1:
2465 printk(KERN_INFO 2486 printk(KERN_INFO
2466 "md/raid10:%s: read error corrected" 2487 "md/raid10:%s: read error corrected"
2467 " (%d sectors at %llu on %s)\n", 2488 " (%d sectors at %llu on %s)\n",
2468 mdname(mddev), s, 2489 mdname(mddev), s,
2469 (unsigned long long)( 2490 (unsigned long long)(
2470 sect + 2491 sect +
2471 choose_data_offset(r10_bio, rdev)), 2492 choose_data_offset(r10_bio, rdev)),
2472 bdevname(rdev->bdev, b)); 2493 bdevname(rdev->bdev, b));
2473 atomic_add(s, &rdev->corrected_errors); 2494 atomic_add(s, &rdev->corrected_errors);
2474 } 2495 }
2475 2496
2476 rdev_dec_pending(rdev, mddev); 2497 rdev_dec_pending(rdev, mddev);
2477 rcu_read_lock(); 2498 rcu_read_lock();
2478 } 2499 }
2479 rcu_read_unlock(); 2500 rcu_read_unlock();
2480 2501
2481 sectors -= s; 2502 sectors -= s;
2482 sect += s; 2503 sect += s;
2483 } 2504 }
2484 } 2505 }
2485 2506
2486 static void bi_complete(struct bio *bio, int error) 2507 static void bi_complete(struct bio *bio, int error)
2487 { 2508 {
2488 complete((struct completion *)bio->bi_private); 2509 complete((struct completion *)bio->bi_private);
2489 } 2510 }
2490 2511
2491 static int submit_bio_wait(int rw, struct bio *bio) 2512 static int submit_bio_wait(int rw, struct bio *bio)
2492 { 2513 {
2493 struct completion event; 2514 struct completion event;
2494 rw |= REQ_SYNC; 2515 rw |= REQ_SYNC;
2495 2516
2496 init_completion(&event); 2517 init_completion(&event);
2497 bio->bi_private = &event; 2518 bio->bi_private = &event;
2498 bio->bi_end_io = bi_complete; 2519 bio->bi_end_io = bi_complete;
2499 submit_bio(rw, bio); 2520 submit_bio(rw, bio);
2500 wait_for_completion(&event); 2521 wait_for_completion(&event);
2501 2522
2502 return test_bit(BIO_UPTODATE, &bio->bi_flags); 2523 return test_bit(BIO_UPTODATE, &bio->bi_flags);
2503 } 2524 }
2504 2525
2505 static int narrow_write_error(struct r10bio *r10_bio, int i) 2526 static int narrow_write_error(struct r10bio *r10_bio, int i)
2506 { 2527 {
2507 struct bio *bio = r10_bio->master_bio; 2528 struct bio *bio = r10_bio->master_bio;
2508 struct mddev *mddev = r10_bio->mddev; 2529 struct mddev *mddev = r10_bio->mddev;
2509 struct r10conf *conf = mddev->private; 2530 struct r10conf *conf = mddev->private;
2510 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev; 2531 struct md_rdev *rdev = conf->mirrors[r10_bio->devs[i].devnum].rdev;
2511 /* bio has the data to be written to slot 'i' where 2532 /* bio has the data to be written to slot 'i' where
2512 * we just recently had a write error. 2533 * we just recently had a write error.
2513 * We repeatedly clone the bio and trim down to one block, 2534 * We repeatedly clone the bio and trim down to one block,
2514 * then try the write. Where the write fails we record 2535 * then try the write. Where the write fails we record
2515 * a bad block. 2536 * a bad block.
2516 * It is conceivable that the bio doesn't exactly align with 2537 * It is conceivable that the bio doesn't exactly align with
2517 * blocks. We must handle this. 2538 * blocks. We must handle this.
2518 * 2539 *
2519 * We currently own a reference to the rdev. 2540 * We currently own a reference to the rdev.
2520 */ 2541 */
2521 2542
2522 int block_sectors; 2543 int block_sectors;
2523 sector_t sector; 2544 sector_t sector;
2524 int sectors; 2545 int sectors;
2525 int sect_to_write = r10_bio->sectors; 2546 int sect_to_write = r10_bio->sectors;
2526 int ok = 1; 2547 int ok = 1;
2527 2548
2528 if (rdev->badblocks.shift < 0) 2549 if (rdev->badblocks.shift < 0)
2529 return 0; 2550 return 0;
2530 2551
2531 block_sectors = 1 << rdev->badblocks.shift; 2552 block_sectors = 1 << rdev->badblocks.shift;
2532 sector = r10_bio->sector; 2553 sector = r10_bio->sector;
2533 sectors = ((r10_bio->sector + block_sectors) 2554 sectors = ((r10_bio->sector + block_sectors)
2534 & ~(sector_t)(block_sectors - 1)) 2555 & ~(sector_t)(block_sectors - 1))
2535 - sector; 2556 - sector;
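/* Editorial worked example (illustrative values, not from the source):
 * with rdev->badblocks.shift == 3, block_sectors is 8. For sector == 21
 * the expression above yields ((21 + 8) & ~7) - 21 = 24 - 21 = 3, so the
 * first write is trimmed to 3 sectors and ends on the 8-sector boundary
 * at 24; each later pass then writes a full block of 8 sectors (see
 * 'sectors = block_sectors' at the end of the while loop below).
 */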
2536 2557
2537 while (sect_to_write) { 2558 while (sect_to_write) {
2538 struct bio *wbio; 2559 struct bio *wbio;
2539 if (sectors > sect_to_write) 2560 if (sectors > sect_to_write)
2540 sectors = sect_to_write; 2561 sectors = sect_to_write;
2541 /* Write at 'sector' for 'sectors' */ 2562 /* Write at 'sector' for 'sectors' */
2542 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); 2563 wbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
2543 md_trim_bio(wbio, sector - bio->bi_sector, sectors); 2564 md_trim_bio(wbio, sector - bio->bi_sector, sectors);
2544 wbio->bi_sector = (r10_bio->devs[i].addr+ 2565 wbio->bi_sector = (r10_bio->devs[i].addr+
2545 choose_data_offset(r10_bio, rdev) + 2566 choose_data_offset(r10_bio, rdev) +
2546 (sector - r10_bio->sector)); 2567 (sector - r10_bio->sector));
2547 wbio->bi_bdev = rdev->bdev; 2568 wbio->bi_bdev = rdev->bdev;
2548 if (submit_bio_wait(WRITE, wbio) == 0) 2569 if (submit_bio_wait(WRITE, wbio) == 0)
2549 /* Failure! */ 2570 /* Failure! */
2550 ok = rdev_set_badblocks(rdev, sector, 2571 ok = rdev_set_badblocks(rdev, sector,
2551 sectors, 0) 2572 sectors, 0)
2552 && ok; 2573 && ok;
2553 2574
2554 bio_put(wbio); 2575 bio_put(wbio);
2555 sect_to_write -= sectors; 2576 sect_to_write -= sectors;
2556 sector += sectors; 2577 sector += sectors;
2557 sectors = block_sectors; 2578 sectors = block_sectors;
2558 } 2579 }
2559 return ok; 2580 return ok;
2560 } 2581 }
2561 2582
2562 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) 2583 static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2563 { 2584 {
2564 int slot = r10_bio->read_slot; 2585 int slot = r10_bio->read_slot;
2565 struct bio *bio; 2586 struct bio *bio;
2566 struct r10conf *conf = mddev->private; 2587 struct r10conf *conf = mddev->private;
2567 struct md_rdev *rdev = r10_bio->devs[slot].rdev; 2588 struct md_rdev *rdev = r10_bio->devs[slot].rdev;
2568 char b[BDEVNAME_SIZE]; 2589 char b[BDEVNAME_SIZE];
2569 unsigned long do_sync; 2590 unsigned long do_sync;
2570 int max_sectors; 2591 int max_sectors;
2571 2592
2572 /* we got a read error. Maybe the drive is bad. Maybe just 2593 /* we got a read error. Maybe the drive is bad. Maybe just
2573 * the block and we can fix it. 2594 * the block and we can fix it.
2574 * We freeze all other IO, and try reading the block from 2595 * We freeze all other IO, and try reading the block from
2575 * other devices. When we find one, we re-write 2596 * other devices. When we find one, we re-write
2576 * and check whether that fixes the read error. 2597 * and check whether that fixes the read error.
2577 * This is all done synchronously while the array is 2598 * This is all done synchronously while the array is
2578 * frozen. 2599 * frozen.
2579 */ 2600 */
2580 bio = r10_bio->devs[slot].bio; 2601 bio = r10_bio->devs[slot].bio;
2581 bdevname(bio->bi_bdev, b); 2602 bdevname(bio->bi_bdev, b);
2582 bio_put(bio); 2603 bio_put(bio);
2583 r10_bio->devs[slot].bio = NULL; 2604 r10_bio->devs[slot].bio = NULL;
2584 2605
2585 if (mddev->ro == 0) { 2606 if (mddev->ro == 0) {
2586 freeze_array(conf); 2607 freeze_array(conf);
2587 fix_read_error(conf, mddev, r10_bio); 2608 fix_read_error(conf, mddev, r10_bio);
2588 unfreeze_array(conf); 2609 unfreeze_array(conf);
2589 } else 2610 } else
2590 r10_bio->devs[slot].bio = IO_BLOCKED; 2611 r10_bio->devs[slot].bio = IO_BLOCKED;
2591 2612
2592 rdev_dec_pending(rdev, mddev); 2613 rdev_dec_pending(rdev, mddev);
2593 2614
2594 read_more: 2615 read_more:
2595 rdev = read_balance(conf, r10_bio, &max_sectors); 2616 rdev = read_balance(conf, r10_bio, &max_sectors);
2596 if (rdev == NULL) { 2617 if (rdev == NULL) {
2597 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O" 2618 printk(KERN_ALERT "md/raid10:%s: %s: unrecoverable I/O"
2598 " read error for block %llu\n", 2619 " read error for block %llu\n",
2599 mdname(mddev), b, 2620 mdname(mddev), b,
2600 (unsigned long long)r10_bio->sector); 2621 (unsigned long long)r10_bio->sector);
2601 raid_end_bio_io(r10_bio); 2622 raid_end_bio_io(r10_bio);
2602 return; 2623 return;
2603 } 2624 }
2604 2625
2605 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2626 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2606 slot = r10_bio->read_slot; 2627 slot = r10_bio->read_slot;
2607 printk_ratelimited( 2628 printk_ratelimited(
2608 KERN_ERR 2629 KERN_ERR
2609 "md/raid10:%s: %s: redirecting " 2630 "md/raid10:%s: %s: redirecting "
2610 "sector %llu to another mirror\n", 2631 "sector %llu to another mirror\n",
2611 mdname(mddev), 2632 mdname(mddev),
2612 bdevname(rdev->bdev, b), 2633 bdevname(rdev->bdev, b),
2613 (unsigned long long)r10_bio->sector); 2634 (unsigned long long)r10_bio->sector);
2614 bio = bio_clone_mddev(r10_bio->master_bio, 2635 bio = bio_clone_mddev(r10_bio->master_bio,
2615 GFP_NOIO, mddev); 2636 GFP_NOIO, mddev);
2616 md_trim_bio(bio, 2637 md_trim_bio(bio,
2617 r10_bio->sector - bio->bi_sector, 2638 r10_bio->sector - bio->bi_sector,
2618 max_sectors); 2639 max_sectors);
2619 r10_bio->devs[slot].bio = bio; 2640 r10_bio->devs[slot].bio = bio;
2620 r10_bio->devs[slot].rdev = rdev; 2641 r10_bio->devs[slot].rdev = rdev;
2621 bio->bi_sector = r10_bio->devs[slot].addr 2642 bio->bi_sector = r10_bio->devs[slot].addr
2622 + choose_data_offset(r10_bio, rdev); 2643 + choose_data_offset(r10_bio, rdev);
2623 bio->bi_bdev = rdev->bdev; 2644 bio->bi_bdev = rdev->bdev;
2624 bio->bi_rw = READ | do_sync; 2645 bio->bi_rw = READ | do_sync;
2625 bio->bi_private = r10_bio; 2646 bio->bi_private = r10_bio;
2626 bio->bi_end_io = raid10_end_read_request; 2647 bio->bi_end_io = raid10_end_read_request;
2627 if (max_sectors < r10_bio->sectors) { 2648 if (max_sectors < r10_bio->sectors) {
2628 /* Drat - have to split this up more */ 2649 /* Drat - have to split this up more */
2629 struct bio *mbio = r10_bio->master_bio; 2650 struct bio *mbio = r10_bio->master_bio;
2630 int sectors_handled = 2651 int sectors_handled =
2631 r10_bio->sector + max_sectors 2652 r10_bio->sector + max_sectors
2632 - mbio->bi_sector; 2653 - mbio->bi_sector;
2633 r10_bio->sectors = max_sectors; 2654 r10_bio->sectors = max_sectors;
2634 spin_lock_irq(&conf->device_lock); 2655 spin_lock_irq(&conf->device_lock);
2635 if (mbio->bi_phys_segments == 0) 2656 if (mbio->bi_phys_segments == 0)
2636 mbio->bi_phys_segments = 2; 2657 mbio->bi_phys_segments = 2;
2637 else 2658 else
2638 mbio->bi_phys_segments++; 2659 mbio->bi_phys_segments++;
2639 spin_unlock_irq(&conf->device_lock); 2660 spin_unlock_irq(&conf->device_lock);
2640 generic_make_request(bio); 2661 generic_make_request(bio);
2641 2662
2642 r10_bio = mempool_alloc(conf->r10bio_pool, 2663 r10_bio = mempool_alloc(conf->r10bio_pool,
2643 GFP_NOIO); 2664 GFP_NOIO);
2644 r10_bio->master_bio = mbio; 2665 r10_bio->master_bio = mbio;
2645 r10_bio->sectors = (mbio->bi_size >> 9) 2666 r10_bio->sectors = (mbio->bi_size >> 9)
2646 - sectors_handled; 2667 - sectors_handled;
2647 r10_bio->state = 0; 2668 r10_bio->state = 0;
2648 set_bit(R10BIO_ReadError, 2669 set_bit(R10BIO_ReadError,
2649 &r10_bio->state); 2670 &r10_bio->state);
2650 r10_bio->mddev = mddev; 2671 r10_bio->mddev = mddev;
2651 r10_bio->sector = mbio->bi_sector 2672 r10_bio->sector = mbio->bi_sector
2652 + sectors_handled; 2673 + sectors_handled;
2653 2674
2654 goto read_more; 2675 goto read_more;
2655 } else 2676 } else
2656 generic_make_request(bio); 2677 generic_make_request(bio);
2657 } 2678 }
2658 2679
2659 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) 2680 static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
2660 { 2681 {
2661 /* Some sort of write request has finished and it 2682 /* Some sort of write request has finished and it
2662 * succeeded in writing where we thought there was a 2683 * succeeded in writing where we thought there was a
2663 * bad block. So forget the bad block. 2684 * bad block. So forget the bad block.
2664 * Or possibly it failed and we need to record 2685 * Or possibly it failed and we need to record
2665 * a bad block. 2686 * a bad block.
2666 */ 2687 */
2667 int m; 2688 int m;
2668 struct md_rdev *rdev; 2689 struct md_rdev *rdev;
2669 2690
2670 if (test_bit(R10BIO_IsSync, &r10_bio->state) || 2691 if (test_bit(R10BIO_IsSync, &r10_bio->state) ||
2671 test_bit(R10BIO_IsRecover, &r10_bio->state)) { 2692 test_bit(R10BIO_IsRecover, &r10_bio->state)) {
2672 for (m = 0; m < conf->copies; m++) { 2693 for (m = 0; m < conf->copies; m++) {
2673 int dev = r10_bio->devs[m].devnum; 2694 int dev = r10_bio->devs[m].devnum;
2674 rdev = conf->mirrors[dev].rdev; 2695 rdev = conf->mirrors[dev].rdev;
2675 if (r10_bio->devs[m].bio == NULL) 2696 if (r10_bio->devs[m].bio == NULL)
2676 continue; 2697 continue;
2677 if (test_bit(BIO_UPTODATE, 2698 if (test_bit(BIO_UPTODATE,
2678 &r10_bio->devs[m].bio->bi_flags)) { 2699 &r10_bio->devs[m].bio->bi_flags)) {
2679 rdev_clear_badblocks( 2700 rdev_clear_badblocks(
2680 rdev, 2701 rdev,
2681 r10_bio->devs[m].addr, 2702 r10_bio->devs[m].addr,
2682 r10_bio->sectors, 0); 2703 r10_bio->sectors, 0);
2683 } else { 2704 } else {
2684 if (!rdev_set_badblocks( 2705 if (!rdev_set_badblocks(
2685 rdev, 2706 rdev,
2686 r10_bio->devs[m].addr, 2707 r10_bio->devs[m].addr,
2687 r10_bio->sectors, 0)) 2708 r10_bio->sectors, 0))
2688 md_error(conf->mddev, rdev); 2709 md_error(conf->mddev, rdev);
2689 } 2710 }
2690 rdev = conf->mirrors[dev].replacement; 2711 rdev = conf->mirrors[dev].replacement;
2691 if (r10_bio->devs[m].repl_bio == NULL) 2712 if (r10_bio->devs[m].repl_bio == NULL)
2692 continue; 2713 continue;
2693 if (test_bit(BIO_UPTODATE, 2714 if (test_bit(BIO_UPTODATE,
2694 &r10_bio->devs[m].repl_bio->bi_flags)) { 2715 &r10_bio->devs[m].repl_bio->bi_flags)) {
2695 rdev_clear_badblocks( 2716 rdev_clear_badblocks(
2696 rdev, 2717 rdev,
2697 r10_bio->devs[m].addr, 2718 r10_bio->devs[m].addr,
2698 r10_bio->sectors, 0); 2719 r10_bio->sectors, 0);
2699 } else { 2720 } else {
2700 if (!rdev_set_badblocks( 2721 if (!rdev_set_badblocks(
2701 rdev, 2722 rdev,
2702 r10_bio->devs[m].addr, 2723 r10_bio->devs[m].addr,
2703 r10_bio->sectors, 0)) 2724 r10_bio->sectors, 0))
2704 md_error(conf->mddev, rdev); 2725 md_error(conf->mddev, rdev);
2705 } 2726 }
2706 } 2727 }
2707 put_buf(r10_bio); 2728 put_buf(r10_bio);
2708 } else { 2729 } else {
2709 for (m = 0; m < conf->copies; m++) { 2730 for (m = 0; m < conf->copies; m++) {
2710 int dev = r10_bio->devs[m].devnum; 2731 int dev = r10_bio->devs[m].devnum;
2711 struct bio *bio = r10_bio->devs[m].bio; 2732 struct bio *bio = r10_bio->devs[m].bio;
2712 rdev = conf->mirrors[dev].rdev; 2733 rdev = conf->mirrors[dev].rdev;
2713 if (bio == IO_MADE_GOOD) { 2734 if (bio == IO_MADE_GOOD) {
2714 rdev_clear_badblocks( 2735 rdev_clear_badblocks(
2715 rdev, 2736 rdev,
2716 r10_bio->devs[m].addr, 2737 r10_bio->devs[m].addr,
2717 r10_bio->sectors, 0); 2738 r10_bio->sectors, 0);
2718 rdev_dec_pending(rdev, conf->mddev); 2739 rdev_dec_pending(rdev, conf->mddev);
2719 } else if (bio != NULL && 2740 } else if (bio != NULL &&
2720 !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 2741 !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
2721 if (!narrow_write_error(r10_bio, m)) { 2742 if (!narrow_write_error(r10_bio, m)) {
2722 md_error(conf->mddev, rdev); 2743 md_error(conf->mddev, rdev);
2723 set_bit(R10BIO_Degraded, 2744 set_bit(R10BIO_Degraded,
2724 &r10_bio->state); 2745 &r10_bio->state);
2725 } 2746 }
2726 rdev_dec_pending(rdev, conf->mddev); 2747 rdev_dec_pending(rdev, conf->mddev);
2727 } 2748 }
2728 bio = r10_bio->devs[m].repl_bio; 2749 bio = r10_bio->devs[m].repl_bio;
2729 rdev = conf->mirrors[dev].replacement; 2750 rdev = conf->mirrors[dev].replacement;
2730 if (rdev && bio == IO_MADE_GOOD) { 2751 if (rdev && bio == IO_MADE_GOOD) {
2731 rdev_clear_badblocks( 2752 rdev_clear_badblocks(
2732 rdev, 2753 rdev,
2733 r10_bio->devs[m].addr, 2754 r10_bio->devs[m].addr,
2734 r10_bio->sectors, 0); 2755 r10_bio->sectors, 0);
2735 rdev_dec_pending(rdev, conf->mddev); 2756 rdev_dec_pending(rdev, conf->mddev);
2736 } 2757 }
2737 } 2758 }
2738 if (test_bit(R10BIO_WriteError, 2759 if (test_bit(R10BIO_WriteError,
2739 &r10_bio->state)) 2760 &r10_bio->state))
2740 close_write(r10_bio); 2761 close_write(r10_bio);
2741 raid_end_bio_io(r10_bio); 2762 raid_end_bio_io(r10_bio);
2742 } 2763 }
2743 } 2764 }
2744 2765
2745 static void raid10d(struct md_thread *thread) 2766 static void raid10d(struct md_thread *thread)
2746 { 2767 {
2747 struct mddev *mddev = thread->mddev; 2768 struct mddev *mddev = thread->mddev;
2748 struct r10bio *r10_bio; 2769 struct r10bio *r10_bio;
2749 unsigned long flags; 2770 unsigned long flags;
2750 struct r10conf *conf = mddev->private; 2771 struct r10conf *conf = mddev->private;
2751 struct list_head *head = &conf->retry_list; 2772 struct list_head *head = &conf->retry_list;
2752 struct blk_plug plug; 2773 struct blk_plug plug;
2753 2774
2754 md_check_recovery(mddev); 2775 md_check_recovery(mddev);
2755 2776
2756 blk_start_plug(&plug); 2777 blk_start_plug(&plug);
2757 for (;;) { 2778 for (;;) {
2758 2779
2759 flush_pending_writes(conf); 2780 flush_pending_writes(conf);
2760 2781
2761 spin_lock_irqsave(&conf->device_lock, flags); 2782 spin_lock_irqsave(&conf->device_lock, flags);
2762 if (list_empty(head)) { 2783 if (list_empty(head)) {
2763 spin_unlock_irqrestore(&conf->device_lock, flags); 2784 spin_unlock_irqrestore(&conf->device_lock, flags);
2764 break; 2785 break;
2765 } 2786 }
2766 r10_bio = list_entry(head->prev, struct r10bio, retry_list); 2787 r10_bio = list_entry(head->prev, struct r10bio, retry_list);
2767 list_del(head->prev); 2788 list_del(head->prev);
2768 conf->nr_queued--; 2789 conf->nr_queued--;
2769 spin_unlock_irqrestore(&conf->device_lock, flags); 2790 spin_unlock_irqrestore(&conf->device_lock, flags);
2770 2791
2771 mddev = r10_bio->mddev; 2792 mddev = r10_bio->mddev;
2772 conf = mddev->private; 2793 conf = mddev->private;
2773 if (test_bit(R10BIO_MadeGood, &r10_bio->state) || 2794 if (test_bit(R10BIO_MadeGood, &r10_bio->state) ||
2774 test_bit(R10BIO_WriteError, &r10_bio->state)) 2795 test_bit(R10BIO_WriteError, &r10_bio->state))
2775 handle_write_completed(conf, r10_bio); 2796 handle_write_completed(conf, r10_bio);
2776 else if (test_bit(R10BIO_IsReshape, &r10_bio->state)) 2797 else if (test_bit(R10BIO_IsReshape, &r10_bio->state))
2777 reshape_request_write(mddev, r10_bio); 2798 reshape_request_write(mddev, r10_bio);
2778 else if (test_bit(R10BIO_IsSync, &r10_bio->state)) 2799 else if (test_bit(R10BIO_IsSync, &r10_bio->state))
2779 sync_request_write(mddev, r10_bio); 2800 sync_request_write(mddev, r10_bio);
2780 else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) 2801 else if (test_bit(R10BIO_IsRecover, &r10_bio->state))
2781 recovery_request_write(mddev, r10_bio); 2802 recovery_request_write(mddev, r10_bio);
2782 else if (test_bit(R10BIO_ReadError, &r10_bio->state)) 2803 else if (test_bit(R10BIO_ReadError, &r10_bio->state))
2783 handle_read_error(mddev, r10_bio); 2804 handle_read_error(mddev, r10_bio);
2784 else { 2805 else {
2785 /* just a partial read to be scheduled from a 2806 /* just a partial read to be scheduled from a
2786 * separate context 2807 * separate context
2787 */ 2808 */
2788 int slot = r10_bio->read_slot; 2809 int slot = r10_bio->read_slot;
2789 generic_make_request(r10_bio->devs[slot].bio); 2810 generic_make_request(r10_bio->devs[slot].bio);
2790 } 2811 }
2791 2812
2792 cond_resched(); 2813 cond_resched();
2793 if (mddev->flags & ~(1<<MD_CHANGE_PENDING)) 2814 if (mddev->flags & ~(1<<MD_CHANGE_PENDING))
2794 md_check_recovery(mddev); 2815 md_check_recovery(mddev);
2795 } 2816 }
2796 blk_finish_plug(&plug); 2817 blk_finish_plug(&plug);
2797 } 2818 }
2798 2819
2799 2820
2800 static int init_resync(struct r10conf *conf) 2821 static int init_resync(struct r10conf *conf)
2801 { 2822 {
2802 int buffs; 2823 int buffs;
2803 int i; 2824 int i;
2804 2825
2805 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE; 2826 buffs = RESYNC_WINDOW / RESYNC_BLOCK_SIZE;
2806 BUG_ON(conf->r10buf_pool); 2827 BUG_ON(conf->r10buf_pool);
2807 conf->have_replacement = 0; 2828 conf->have_replacement = 0;
2808 for (i = 0; i < conf->geo.raid_disks; i++) 2829 for (i = 0; i < conf->geo.raid_disks; i++)
2809 if (conf->mirrors[i].replacement) 2830 if (conf->mirrors[i].replacement)
2810 conf->have_replacement = 1; 2831 conf->have_replacement = 1;
2811 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf); 2832 conf->r10buf_pool = mempool_create(buffs, r10buf_pool_alloc, r10buf_pool_free, conf);
2812 if (!conf->r10buf_pool) 2833 if (!conf->r10buf_pool)
2813 return -ENOMEM; 2834 return -ENOMEM;
2814 conf->next_resync = 0; 2835 conf->next_resync = 0;
2815 return 0; 2836 return 0;
2816 } 2837 }
2817 2838
2818 /* 2839 /*
2819 * perform a "sync" on one "block" 2840 * perform a "sync" on one "block"
2820 * 2841 *
2821 * We need to make sure that no normal I/O request - particularly write 2842 * We need to make sure that no normal I/O request - particularly write
2822 * requests - conflict with active sync requests. 2843 * requests - conflict with active sync requests.
2823 * 2844 *
2824 * This is achieved by tracking pending requests and a 'barrier' concept 2845 * This is achieved by tracking pending requests and a 'barrier' concept
2825 * that can be installed to exclude normal IO requests. 2846 * that can be installed to exclude normal IO requests.
2826 * 2847 *
2827 * Resync and recovery are handled very differently. 2848 * Resync and recovery are handled very differently.
2828 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery. 2849 * We differentiate by looking at MD_RECOVERY_SYNC in mddev->recovery.
2829 * 2850 *
2830 * For resync, we iterate over virtual addresses, read all copies, 2851 * For resync, we iterate over virtual addresses, read all copies,
2831 * and update if there are differences. If only one copy is live, 2852 * and update if there are differences. If only one copy is live,
2832 * skip it. 2853 * skip it.
2833 * For recovery, we iterate over physical addresses, read a good 2854 * For recovery, we iterate over physical addresses, read a good
2834 * value for each non-in_sync drive, and over-write. 2855 * value for each non-in_sync drive, and over-write.
2835 * 2856 *
2836 * So, for recovery we may have several outstanding complex requests for a 2857 * So, for recovery we may have several outstanding complex requests for a
2837 * given address, one for each out-of-sync device. We model this by allocating 2858 * given address, one for each out-of-sync device. We model this by allocating
2838 * a number of r10_bio structures, one for each out-of-sync device. 2859 * a number of r10_bio structures, one for each out-of-sync device.
2839 * As we setup these structures, we collect all bio's together into a list 2860 * As we setup these structures, we collect all bio's together into a list
2840 * which we then process collectively to add pages, and then process again 2861 * which we then process collectively to add pages, and then process again
2841 * to pass to generic_make_request. 2862 * to pass to generic_make_request.
2842 * 2863 *
2843 * The r10_bio structures are linked using a borrowed master_bio pointer. 2864 * The r10_bio structures are linked using a borrowed master_bio pointer.
2844 * This link is counted in ->remaining. When the r10_bio that points to NULL 2865 * This link is counted in ->remaining. When the r10_bio that points to NULL
2845 * has its remaining count decremented to 0, the whole complex operation 2866 * has its remaining count decremented to 0, the whole complex operation
2846 * is complete. 2867 * is complete.
2847 * 2868 *
2848 */ 2869 */
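The chaining described in the comment above is the trickiest part of the recovery path, so here is a minimal userspace sketch of just that ownership pattern. struct fake_r10bio, chain_new() and chain_put() are made-up stand-ins for the real r10_bio handling, not kernel code; the point is only how the borrowed master_bio link plus the ->remaining count unwind the chain.

	/* Sketch of the borrowed-pointer chain: each node links to the previous
	 * r10_bio and holds one reference on it; ->remaining also counts the
	 * in-flight bios for the node itself.  Freeing the node whose link is
	 * NULL marks the whole complex operation as complete. */
	#include <stdio.h>
	#include <stdlib.h>

	struct fake_r10bio {
		struct fake_r10bio *master;	/* borrowed link to the previous r10_bio */
		int remaining;			/* links in + bios still in flight */
	};

	static struct fake_r10bio *chain_new(struct fake_r10bio *prev)
	{
		struct fake_r10bio *rb = calloc(1, sizeof(*rb));
		rb->master = prev;
		if (prev)
			prev->remaining++;	/* new node references its predecessor */
		return rb;
	}

	/* Drop one reference; when a node hits zero, free it and propagate
	 * the drop along the chain, as the comment above describes. */
	static void chain_put(struct fake_r10bio *rb)
	{
		while (rb && --rb->remaining == 0) {
			struct fake_r10bio *prev = rb->master;
			free(rb);
			if (!prev)
				printf("whole complex operation complete\n");
			rb = prev;
		}
	}

	int main(void)
	{
		struct fake_r10bio *a = chain_new(NULL);	/* primary, master == NULL */
		struct fake_r10bio *b;

		a->remaining++;		/* one recovery write outstanding for a */
		b = chain_new(a);	/* second r10_bio for another out-of-sync device */
		b->remaining++;		/* one recovery write outstanding for b */

		chain_put(b);		/* b's write completes: b freed, a still busy */
		chain_put(a);		/* a's write completes: chain fully released */
		return 0;
	}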
2849 2870
2850 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, 2871 static sector_t sync_request(struct mddev *mddev, sector_t sector_nr,
2851 int *skipped, int go_faster) 2872 int *skipped, int go_faster)
2852 { 2873 {
2853 struct r10conf *conf = mddev->private; 2874 struct r10conf *conf = mddev->private;
2854 struct r10bio *r10_bio; 2875 struct r10bio *r10_bio;
2855 struct bio *biolist = NULL, *bio; 2876 struct bio *biolist = NULL, *bio;
2856 sector_t max_sector, nr_sectors; 2877 sector_t max_sector, nr_sectors;
2857 int i; 2878 int i;
2858 int max_sync; 2879 int max_sync;
2859 sector_t sync_blocks; 2880 sector_t sync_blocks;
2860 sector_t sectors_skipped = 0; 2881 sector_t sectors_skipped = 0;
2861 int chunks_skipped = 0; 2882 int chunks_skipped = 0;
2862 sector_t chunk_mask = conf->geo.chunk_mask; 2883 sector_t chunk_mask = conf->geo.chunk_mask;
2863 2884
2864 if (!conf->r10buf_pool) 2885 if (!conf->r10buf_pool)
2865 if (init_resync(conf)) 2886 if (init_resync(conf))
2866 return 0; 2887 return 0;
2867 2888
2868 skipped: 2889 skipped:
2869 max_sector = mddev->dev_sectors; 2890 max_sector = mddev->dev_sectors;
2870 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 2891 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
2871 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2892 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2872 max_sector = mddev->resync_max_sectors; 2893 max_sector = mddev->resync_max_sectors;
2873 if (sector_nr >= max_sector) { 2894 if (sector_nr >= max_sector) {
2874 /* If we aborted, we need to abort the 2895 /* If we aborted, we need to abort the
2875 * sync on the 'current' bitmap chunks (there can 2896 * sync on the 'current' bitmap chunks (there can
2876 * be several when recovering multiple devices). 2897 * be several when recovering multiple devices).
2877 * as we may have started syncing it but not finished. 2898 * as we may have started syncing it but not finished.
2878 * We can find the current address in 2899 * We can find the current address in
2879 * mddev->curr_resync, but for recovery, 2900 * mddev->curr_resync, but for recovery,
2880 * we need to convert that to several 2901 * we need to convert that to several
2881 * virtual addresses. 2902 * virtual addresses.
2882 */ 2903 */
2883 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 2904 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
2884 end_reshape(conf); 2905 end_reshape(conf);
2885 return 0; 2906 return 0;
2886 } 2907 }
2887 2908
2888 if (mddev->curr_resync < max_sector) { /* aborted */ 2909 if (mddev->curr_resync < max_sector) { /* aborted */
2889 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 2910 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
2890 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 2911 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
2891 &sync_blocks, 1); 2912 &sync_blocks, 1);
2892 else for (i = 0; i < conf->geo.raid_disks; i++) { 2913 else for (i = 0; i < conf->geo.raid_disks; i++) {
2893 sector_t sect = 2914 sector_t sect =
2894 raid10_find_virt(conf, mddev->curr_resync, i); 2915 raid10_find_virt(conf, mddev->curr_resync, i);
2895 bitmap_end_sync(mddev->bitmap, sect, 2916 bitmap_end_sync(mddev->bitmap, sect,
2896 &sync_blocks, 1); 2917 &sync_blocks, 1);
2897 } 2918 }
2898 } else { 2919 } else {
2899 /* completed sync */ 2920 /* completed sync */
2900 if ((!mddev->bitmap || conf->fullsync) 2921 if ((!mddev->bitmap || conf->fullsync)
2901 && conf->have_replacement 2922 && conf->have_replacement
2902 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2923 && test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2903 /* Completed a full sync so the replacements 2924 /* Completed a full sync so the replacements
2904 * are now fully recovered. 2925 * are now fully recovered.
2905 */ 2926 */
2906 for (i = 0; i < conf->geo.raid_disks; i++) 2927 for (i = 0; i < conf->geo.raid_disks; i++)
2907 if (conf->mirrors[i].replacement) 2928 if (conf->mirrors[i].replacement)
2908 conf->mirrors[i].replacement 2929 conf->mirrors[i].replacement
2909 ->recovery_offset 2930 ->recovery_offset
2910 = MaxSector; 2931 = MaxSector;
2911 } 2932 }
2912 conf->fullsync = 0; 2933 conf->fullsync = 0;
2913 } 2934 }
2914 bitmap_close_sync(mddev->bitmap); 2935 bitmap_close_sync(mddev->bitmap);
2915 close_sync(conf); 2936 close_sync(conf);
2916 *skipped = 1; 2937 *skipped = 1;
2917 return sectors_skipped; 2938 return sectors_skipped;
2918 } 2939 }
2919 2940
2920 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 2941 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
2921 return reshape_request(mddev, sector_nr, skipped); 2942 return reshape_request(mddev, sector_nr, skipped);
2922 2943
2923 if (chunks_skipped >= conf->geo.raid_disks) { 2944 if (chunks_skipped >= conf->geo.raid_disks) {
2924 /* if there has been nothing to do on any drive, 2945 /* if there has been nothing to do on any drive,
2925 * then there is nothing to do at all.. 2946 * then there is nothing to do at all..
2926 */ 2947 */
2927 *skipped = 1; 2948 *skipped = 1;
2928 return (max_sector - sector_nr) + sectors_skipped; 2949 return (max_sector - sector_nr) + sectors_skipped;
2929 } 2950 }
2930 2951
2931 if (max_sector > mddev->resync_max) 2952 if (max_sector > mddev->resync_max)
2932 max_sector = mddev->resync_max; /* Don't do IO beyond here */ 2953 max_sector = mddev->resync_max; /* Don't do IO beyond here */
2933 2954
2934 /* make sure whole request will fit in a chunk - if chunks 2955 /* make sure whole request will fit in a chunk - if chunks
2935 * are meaningful 2956 * are meaningful
2936 */ 2957 */
2937 if (conf->geo.near_copies < conf->geo.raid_disks && 2958 if (conf->geo.near_copies < conf->geo.raid_disks &&
2938 max_sector > (sector_nr | chunk_mask)) 2959 max_sector > (sector_nr | chunk_mask))
2939 max_sector = (sector_nr | chunk_mask) + 1; 2960 max_sector = (sector_nr | chunk_mask) + 1;
2940 /* 2961 /*
2941 * If there is non-resync activity waiting for us then 2962 * If there is non-resync activity waiting for us then
2942 * put in a delay to throttle resync. 2963 * put in a delay to throttle resync.
2943 */ 2964 */
2944 if (!go_faster && conf->nr_waiting) 2965 if (!go_faster && conf->nr_waiting)
2945 msleep_interruptible(1000); 2966 msleep_interruptible(1000);
2946 2967
2947 /* Again, very different code for resync and recovery. 2968 /* Again, very different code for resync and recovery.
2948 * Both must result in an r10bio with a list of bios that 2969 * Both must result in an r10bio with a list of bios that
2949 * have bi_end_io, bi_sector, bi_bdev set, 2970 * have bi_end_io, bi_sector, bi_bdev set,
2950 * and bi_private set to the r10bio. 2971 * and bi_private set to the r10bio.
2951 * For recovery, we may actually create several r10bios 2972 * For recovery, we may actually create several r10bios
2952 * with 2 bios in each, that correspond to the bios in the main one. 2973 * with 2 bios in each, that correspond to the bios in the main one.
2953 * In this case, the subordinate r10bios link back through a 2974 * In this case, the subordinate r10bios link back through a
2954 * borrowed master_bio pointer, and the counter in the master 2975 * borrowed master_bio pointer, and the counter in the master
2955 * includes a ref from each subordinate. 2976 * includes a ref from each subordinate.
2956 */ 2977 */
2957 /* First, we decide what to do and set ->bi_end_io 2978 /* First, we decide what to do and set ->bi_end_io
2958 * To end_sync_read if we want to read, and 2979 * To end_sync_read if we want to read, and
2959 * end_sync_write if we will want to write. 2980 * end_sync_write if we will want to write.
2960 */ 2981 */
2961 2982
2962 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9); 2983 max_sync = RESYNC_PAGES << (PAGE_SHIFT-9);
2963 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 2984 if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
2964 /* recovery... the complicated one */ 2985 /* recovery... the complicated one */
2965 int j; 2986 int j;
2966 r10_bio = NULL; 2987 r10_bio = NULL;
2967 2988
2968 for (i = 0 ; i < conf->geo.raid_disks; i++) { 2989 for (i = 0 ; i < conf->geo.raid_disks; i++) {
2969 int still_degraded; 2990 int still_degraded;
2970 struct r10bio *rb2; 2991 struct r10bio *rb2;
2971 sector_t sect; 2992 sector_t sect;
2972 int must_sync; 2993 int must_sync;
2973 int any_working; 2994 int any_working;
2974 struct raid10_info *mirror = &conf->mirrors[i]; 2995 struct raid10_info *mirror = &conf->mirrors[i];
2975 2996
2976 if ((mirror->rdev == NULL || 2997 if ((mirror->rdev == NULL ||
2977 test_bit(In_sync, &mirror->rdev->flags)) 2998 test_bit(In_sync, &mirror->rdev->flags))
2978 && 2999 &&
2979 (mirror->replacement == NULL || 3000 (mirror->replacement == NULL ||
2980 test_bit(Faulty, 3001 test_bit(Faulty,
2981 &mirror->replacement->flags))) 3002 &mirror->replacement->flags)))
2982 continue; 3003 continue;
2983 3004
2984 still_degraded = 0; 3005 still_degraded = 0;
2985 /* want to reconstruct this device */ 3006 /* want to reconstruct this device */
2986 rb2 = r10_bio; 3007 rb2 = r10_bio;
2987 sect = raid10_find_virt(conf, sector_nr, i); 3008 sect = raid10_find_virt(conf, sector_nr, i);
2988 if (sect >= mddev->resync_max_sectors) { 3009 if (sect >= mddev->resync_max_sectors) {
2989 /* last stripe is not complete - don't 3010 /* last stripe is not complete - don't
2990 * try to recover this sector. 3011 * try to recover this sector.
2991 */ 3012 */
2992 continue; 3013 continue;
2993 } 3014 }
2994 /* Unless we are doing a full sync, or a replacement 3015 /* Unless we are doing a full sync, or a replacement
2995 * we only need to recover the block if it is set in 3016 * we only need to recover the block if it is set in
2996 * the bitmap 3017 * the bitmap
2997 */ 3018 */
2998 must_sync = bitmap_start_sync(mddev->bitmap, sect, 3019 must_sync = bitmap_start_sync(mddev->bitmap, sect,
2999 &sync_blocks, 1); 3020 &sync_blocks, 1);
3000 if (sync_blocks < max_sync) 3021 if (sync_blocks < max_sync)
3001 max_sync = sync_blocks; 3022 max_sync = sync_blocks;
3002 if (!must_sync && 3023 if (!must_sync &&
3003 mirror->replacement == NULL && 3024 mirror->replacement == NULL &&
3004 !conf->fullsync) { 3025 !conf->fullsync) {
3005 /* yep, skip the sync_blocks here, but don't assume 3026 /* yep, skip the sync_blocks here, but don't assume
3006 * that there will never be anything to do here 3027 * that there will never be anything to do here
3007 */ 3028 */
3008 chunks_skipped = -1; 3029 chunks_skipped = -1;
3009 continue; 3030 continue;
3010 } 3031 }
3011 3032
3012 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 3033 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3013 raise_barrier(conf, rb2 != NULL); 3034 raise_barrier(conf, rb2 != NULL);
3014 atomic_set(&r10_bio->remaining, 0); 3035 atomic_set(&r10_bio->remaining, 0);
3015 3036
3016 r10_bio->master_bio = (struct bio*)rb2; 3037 r10_bio->master_bio = (struct bio*)rb2;
3017 if (rb2) 3038 if (rb2)
3018 atomic_inc(&rb2->remaining); 3039 atomic_inc(&rb2->remaining);
3019 r10_bio->mddev = mddev; 3040 r10_bio->mddev = mddev;
3020 set_bit(R10BIO_IsRecover, &r10_bio->state); 3041 set_bit(R10BIO_IsRecover, &r10_bio->state);
3021 r10_bio->sector = sect; 3042 r10_bio->sector = sect;
3022 3043
3023 raid10_find_phys(conf, r10_bio); 3044 raid10_find_phys(conf, r10_bio);
3024 3045
3025 /* Need to check if the array will still be 3046 /* Need to check if the array will still be
3026 * degraded 3047 * degraded
3027 */ 3048 */
3028 for (j = 0; j < conf->geo.raid_disks; j++) 3049 for (j = 0; j < conf->geo.raid_disks; j++)
3029 if (conf->mirrors[j].rdev == NULL || 3050 if (conf->mirrors[j].rdev == NULL ||
3030 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { 3051 test_bit(Faulty, &conf->mirrors[j].rdev->flags)) {
3031 still_degraded = 1; 3052 still_degraded = 1;
3032 break; 3053 break;
3033 } 3054 }
3034 3055
3035 must_sync = bitmap_start_sync(mddev->bitmap, sect, 3056 must_sync = bitmap_start_sync(mddev->bitmap, sect,
3036 &sync_blocks, still_degraded); 3057 &sync_blocks, still_degraded);
3037 3058
3038 any_working = 0; 3059 any_working = 0;
3039 for (j=0; j<conf->copies;j++) { 3060 for (j=0; j<conf->copies;j++) {
3040 int k; 3061 int k;
3041 int d = r10_bio->devs[j].devnum; 3062 int d = r10_bio->devs[j].devnum;
3042 sector_t from_addr, to_addr; 3063 sector_t from_addr, to_addr;
3043 struct md_rdev *rdev; 3064 struct md_rdev *rdev;
3044 sector_t sector, first_bad; 3065 sector_t sector, first_bad;
3045 int bad_sectors; 3066 int bad_sectors;
3046 if (!conf->mirrors[d].rdev || 3067 if (!conf->mirrors[d].rdev ||
3047 !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) 3068 !test_bit(In_sync, &conf->mirrors[d].rdev->flags))
3048 continue; 3069 continue;
3049 /* This is where we read from */ 3070 /* This is where we read from */
3050 any_working = 1; 3071 any_working = 1;
3051 rdev = conf->mirrors[d].rdev; 3072 rdev = conf->mirrors[d].rdev;
3052 sector = r10_bio->devs[j].addr; 3073 sector = r10_bio->devs[j].addr;
3053 3074
3054 if (is_badblock(rdev, sector, max_sync, 3075 if (is_badblock(rdev, sector, max_sync,
3055 &first_bad, &bad_sectors)) { 3076 &first_bad, &bad_sectors)) {
3056 if (first_bad > sector) 3077 if (first_bad > sector)
3057 max_sync = first_bad - sector; 3078 max_sync = first_bad - sector;
3058 else { 3079 else {
3059 bad_sectors -= (sector 3080 bad_sectors -= (sector
3060 - first_bad); 3081 - first_bad);
3061 if (max_sync > bad_sectors) 3082 if (max_sync > bad_sectors)
3062 max_sync = bad_sectors; 3083 max_sync = bad_sectors;
3063 continue; 3084 continue;
3064 } 3085 }
3065 } 3086 }
3066 bio = r10_bio->devs[0].bio; 3087 bio = r10_bio->devs[0].bio;
3067 bio->bi_next = biolist; 3088 bio->bi_next = biolist;
3068 biolist = bio; 3089 biolist = bio;
3069 bio->bi_private = r10_bio; 3090 bio->bi_private = r10_bio;
3070 bio->bi_end_io = end_sync_read; 3091 bio->bi_end_io = end_sync_read;
3071 bio->bi_rw = READ; 3092 bio->bi_rw = READ;
3072 from_addr = r10_bio->devs[j].addr; 3093 from_addr = r10_bio->devs[j].addr;
3073 bio->bi_sector = from_addr + rdev->data_offset; 3094 bio->bi_sector = from_addr + rdev->data_offset;
3074 bio->bi_bdev = rdev->bdev; 3095 bio->bi_bdev = rdev->bdev;
3075 atomic_inc(&rdev->nr_pending); 3096 atomic_inc(&rdev->nr_pending);
3076 /* and we write to 'i' (if not in_sync) */ 3097 /* and we write to 'i' (if not in_sync) */
3077 3098
3078 for (k=0; k<conf->copies; k++) 3099 for (k=0; k<conf->copies; k++)
3079 if (r10_bio->devs[k].devnum == i) 3100 if (r10_bio->devs[k].devnum == i)
3080 break; 3101 break;
3081 BUG_ON(k == conf->copies); 3102 BUG_ON(k == conf->copies);
3082 to_addr = r10_bio->devs[k].addr; 3103 to_addr = r10_bio->devs[k].addr;
3083 r10_bio->devs[0].devnum = d; 3104 r10_bio->devs[0].devnum = d;
3084 r10_bio->devs[0].addr = from_addr; 3105 r10_bio->devs[0].addr = from_addr;
3085 r10_bio->devs[1].devnum = i; 3106 r10_bio->devs[1].devnum = i;
3086 r10_bio->devs[1].addr = to_addr; 3107 r10_bio->devs[1].addr = to_addr;
3087 3108
3088 rdev = mirror->rdev; 3109 rdev = mirror->rdev;
3089 if (!test_bit(In_sync, &rdev->flags)) { 3110 if (!test_bit(In_sync, &rdev->flags)) {
3090 bio = r10_bio->devs[1].bio; 3111 bio = r10_bio->devs[1].bio;
3091 bio->bi_next = biolist; 3112 bio->bi_next = biolist;
3092 biolist = bio; 3113 biolist = bio;
3093 bio->bi_private = r10_bio; 3114 bio->bi_private = r10_bio;
3094 bio->bi_end_io = end_sync_write; 3115 bio->bi_end_io = end_sync_write;
3095 bio->bi_rw = WRITE; 3116 bio->bi_rw = WRITE;
3096 bio->bi_sector = to_addr 3117 bio->bi_sector = to_addr
3097 + rdev->data_offset; 3118 + rdev->data_offset;
3098 bio->bi_bdev = rdev->bdev; 3119 bio->bi_bdev = rdev->bdev;
3099 atomic_inc(&r10_bio->remaining); 3120 atomic_inc(&r10_bio->remaining);
3100 } else 3121 } else
3101 r10_bio->devs[1].bio->bi_end_io = NULL; 3122 r10_bio->devs[1].bio->bi_end_io = NULL;
3102 3123
3103 /* and maybe write to replacement */ 3124 /* and maybe write to replacement */
3104 bio = r10_bio->devs[1].repl_bio; 3125 bio = r10_bio->devs[1].repl_bio;
3105 if (bio) 3126 if (bio)
3106 bio->bi_end_io = NULL; 3127 bio->bi_end_io = NULL;
3107 rdev = mirror->replacement; 3128 rdev = mirror->replacement;
3108 /* Note: if rdev != NULL, then bio 3129 /* Note: if rdev != NULL, then bio
3109 * cannot be NULL as r10buf_pool_alloc will 3130 * cannot be NULL as r10buf_pool_alloc will
3110 * have allocated it. 3131 * have allocated it.
3111 * So the second test here is pointless. 3132 * So the second test here is pointless.
3112 * But it keeps semantic-checkers happy, and 3133 * But it keeps semantic-checkers happy, and
3113 * this comment keeps human reviewers 3134 * this comment keeps human reviewers
3114 * happy. 3135 * happy.
3115 */ 3136 */
3116 if (rdev == NULL || bio == NULL || 3137 if (rdev == NULL || bio == NULL ||
3117 test_bit(Faulty, &rdev->flags)) 3138 test_bit(Faulty, &rdev->flags))
3118 break; 3139 break;
3119 bio->bi_next = biolist; 3140 bio->bi_next = biolist;
3120 biolist = bio; 3141 biolist = bio;
3121 bio->bi_private = r10_bio; 3142 bio->bi_private = r10_bio;
3122 bio->bi_end_io = end_sync_write; 3143 bio->bi_end_io = end_sync_write;
3123 bio->bi_rw = WRITE; 3144 bio->bi_rw = WRITE;
3124 bio->bi_sector = to_addr + rdev->data_offset; 3145 bio->bi_sector = to_addr + rdev->data_offset;
3125 bio->bi_bdev = rdev->bdev; 3146 bio->bi_bdev = rdev->bdev;
3126 atomic_inc(&r10_bio->remaining); 3147 atomic_inc(&r10_bio->remaining);
3127 break; 3148 break;
3128 } 3149 }
3129 if (j == conf->copies) { 3150 if (j == conf->copies) {
3130 /* Cannot recover, so abort the recovery or 3151 /* Cannot recover, so abort the recovery or
3131 * record a bad block */ 3152 * record a bad block */
3132 put_buf(r10_bio); 3153 put_buf(r10_bio);
3133 if (rb2) 3154 if (rb2)
3134 atomic_dec(&rb2->remaining); 3155 atomic_dec(&rb2->remaining);
3135 r10_bio = rb2; 3156 r10_bio = rb2;
3136 if (any_working) { 3157 if (any_working) {
3137 /* problem is that there are bad blocks 3158 /* problem is that there are bad blocks
3138 * on other device(s) 3159 * on other device(s)
3139 */ 3160 */
3140 int k; 3161 int k;
3141 for (k = 0; k < conf->copies; k++) 3162 for (k = 0; k < conf->copies; k++)
3142 if (r10_bio->devs[k].devnum == i) 3163 if (r10_bio->devs[k].devnum == i)
3143 break; 3164 break;
3144 if (!test_bit(In_sync, 3165 if (!test_bit(In_sync,
3145 &mirror->rdev->flags) 3166 &mirror->rdev->flags)
3146 && !rdev_set_badblocks( 3167 && !rdev_set_badblocks(
3147 mirror->rdev, 3168 mirror->rdev,
3148 r10_bio->devs[k].addr, 3169 r10_bio->devs[k].addr,
3149 max_sync, 0)) 3170 max_sync, 0))
3150 any_working = 0; 3171 any_working = 0;
3151 if (mirror->replacement && 3172 if (mirror->replacement &&
3152 !rdev_set_badblocks( 3173 !rdev_set_badblocks(
3153 mirror->replacement, 3174 mirror->replacement,
3154 r10_bio->devs[k].addr, 3175 r10_bio->devs[k].addr,
3155 max_sync, 0)) 3176 max_sync, 0))
3156 any_working = 0; 3177 any_working = 0;
3157 } 3178 }
3158 if (!any_working) { 3179 if (!any_working) {
3159 if (!test_and_set_bit(MD_RECOVERY_INTR, 3180 if (!test_and_set_bit(MD_RECOVERY_INTR,
3160 &mddev->recovery)) 3181 &mddev->recovery))
3161 printk(KERN_INFO "md/raid10:%s: insufficient " 3182 printk(KERN_INFO "md/raid10:%s: insufficient "
3162 "working devices for recovery.\n", 3183 "working devices for recovery.\n",
3163 mdname(mddev)); 3184 mdname(mddev));
3164 mirror->recovery_disabled 3185 mirror->recovery_disabled
3165 = mddev->recovery_disabled; 3186 = mddev->recovery_disabled;
3166 } 3187 }
3167 break; 3188 break;
3168 } 3189 }
3169 } 3190 }
3170 if (biolist == NULL) { 3191 if (biolist == NULL) {
3171 while (r10_bio) { 3192 while (r10_bio) {
3172 struct r10bio *rb2 = r10_bio; 3193 struct r10bio *rb2 = r10_bio;
3173 r10_bio = (struct r10bio*) rb2->master_bio; 3194 r10_bio = (struct r10bio*) rb2->master_bio;
3174 rb2->master_bio = NULL; 3195 rb2->master_bio = NULL;
3175 put_buf(rb2); 3196 put_buf(rb2);
3176 } 3197 }
3177 goto giveup; 3198 goto giveup;
3178 } 3199 }
3179 } else { 3200 } else {
3180 /* resync. Schedule a read for every block at this virt offset */ 3201 /* resync. Schedule a read for every block at this virt offset */
3181 int count = 0; 3202 int count = 0;
3182 3203
3183 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 3204 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
3184 3205
3185 if (!bitmap_start_sync(mddev->bitmap, sector_nr, 3206 if (!bitmap_start_sync(mddev->bitmap, sector_nr,
3186 &sync_blocks, mddev->degraded) && 3207 &sync_blocks, mddev->degraded) &&
3187 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, 3208 !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED,
3188 &mddev->recovery)) { 3209 &mddev->recovery)) {
3189 /* We can skip this block */ 3210 /* We can skip this block */
3190 *skipped = 1; 3211 *skipped = 1;
3191 return sync_blocks + sectors_skipped; 3212 return sync_blocks + sectors_skipped;
3192 } 3213 }
3193 if (sync_blocks < max_sync) 3214 if (sync_blocks < max_sync)
3194 max_sync = sync_blocks; 3215 max_sync = sync_blocks;
3195 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 3216 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
3196 3217
3197 r10_bio->mddev = mddev; 3218 r10_bio->mddev = mddev;
3198 atomic_set(&r10_bio->remaining, 0); 3219 atomic_set(&r10_bio->remaining, 0);
3199 raise_barrier(conf, 0); 3220 raise_barrier(conf, 0);
3200 conf->next_resync = sector_nr; 3221 conf->next_resync = sector_nr;
3201 3222
3202 r10_bio->master_bio = NULL; 3223 r10_bio->master_bio = NULL;
3203 r10_bio->sector = sector_nr; 3224 r10_bio->sector = sector_nr;
3204 set_bit(R10BIO_IsSync, &r10_bio->state); 3225 set_bit(R10BIO_IsSync, &r10_bio->state);
3205 raid10_find_phys(conf, r10_bio); 3226 raid10_find_phys(conf, r10_bio);
3206 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1; 3227 r10_bio->sectors = (sector_nr | chunk_mask) - sector_nr + 1;
3207 3228
3208 for (i = 0; i < conf->copies; i++) { 3229 for (i = 0; i < conf->copies; i++) {
3209 int d = r10_bio->devs[i].devnum; 3230 int d = r10_bio->devs[i].devnum;
3210 sector_t first_bad, sector; 3231 sector_t first_bad, sector;
3211 int bad_sectors; 3232 int bad_sectors;
3212 3233
3213 if (r10_bio->devs[i].repl_bio) 3234 if (r10_bio->devs[i].repl_bio)
3214 r10_bio->devs[i].repl_bio->bi_end_io = NULL; 3235 r10_bio->devs[i].repl_bio->bi_end_io = NULL;
3215 3236
3216 bio = r10_bio->devs[i].bio; 3237 bio = r10_bio->devs[i].bio;
3217 bio->bi_end_io = NULL; 3238 bio->bi_end_io = NULL;
3218 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3239 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3219 if (conf->mirrors[d].rdev == NULL || 3240 if (conf->mirrors[d].rdev == NULL ||
3220 test_bit(Faulty, &conf->mirrors[d].rdev->flags)) 3241 test_bit(Faulty, &conf->mirrors[d].rdev->flags))
3221 continue; 3242 continue;
3222 sector = r10_bio->devs[i].addr; 3243 sector = r10_bio->devs[i].addr;
3223 if (is_badblock(conf->mirrors[d].rdev, 3244 if (is_badblock(conf->mirrors[d].rdev,
3224 sector, max_sync, 3245 sector, max_sync,
3225 &first_bad, &bad_sectors)) { 3246 &first_bad, &bad_sectors)) {
3226 if (first_bad > sector) 3247 if (first_bad > sector)
3227 max_sync = first_bad - sector; 3248 max_sync = first_bad - sector;
3228 else { 3249 else {
3229 bad_sectors -= (sector - first_bad); 3250 bad_sectors -= (sector - first_bad);
3230 if (max_sync > bad_sectors) 3251 if (max_sync > bad_sectors)
3231 max_sync = bad_sectors; 3252 max_sync = bad_sectors;
3232 continue; 3253 continue;
3233 } 3254 }
3234 } 3255 }
3235 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 3256 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3236 atomic_inc(&r10_bio->remaining); 3257 atomic_inc(&r10_bio->remaining);
3237 bio->bi_next = biolist; 3258 bio->bi_next = biolist;
3238 biolist = bio; 3259 biolist = bio;
3239 bio->bi_private = r10_bio; 3260 bio->bi_private = r10_bio;
3240 bio->bi_end_io = end_sync_read; 3261 bio->bi_end_io = end_sync_read;
3241 bio->bi_rw = READ; 3262 bio->bi_rw = READ;
3242 bio->bi_sector = sector + 3263 bio->bi_sector = sector +
3243 conf->mirrors[d].rdev->data_offset; 3264 conf->mirrors[d].rdev->data_offset;
3244 bio->bi_bdev = conf->mirrors[d].rdev->bdev; 3265 bio->bi_bdev = conf->mirrors[d].rdev->bdev;
3245 count++; 3266 count++;
3246 3267
3247 if (conf->mirrors[d].replacement == NULL || 3268 if (conf->mirrors[d].replacement == NULL ||
3248 test_bit(Faulty, 3269 test_bit(Faulty,
3249 &conf->mirrors[d].replacement->flags)) 3270 &conf->mirrors[d].replacement->flags))
3250 continue; 3271 continue;
3251 3272
3252 /* Need to set up for writing to the replacement */ 3273 /* Need to set up for writing to the replacement */
3253 bio = r10_bio->devs[i].repl_bio; 3274 bio = r10_bio->devs[i].repl_bio;
3254 clear_bit(BIO_UPTODATE, &bio->bi_flags); 3275 clear_bit(BIO_UPTODATE, &bio->bi_flags);
3255 3276
3256 sector = r10_bio->devs[i].addr; 3277 sector = r10_bio->devs[i].addr;
3257 atomic_inc(&conf->mirrors[d].rdev->nr_pending); 3278 atomic_inc(&conf->mirrors[d].rdev->nr_pending);
3258 bio->bi_next = biolist; 3279 bio->bi_next = biolist;
3259 biolist = bio; 3280 biolist = bio;
3260 bio->bi_private = r10_bio; 3281 bio->bi_private = r10_bio;
3261 bio->bi_end_io = end_sync_write; 3282 bio->bi_end_io = end_sync_write;
3262 bio->bi_rw = WRITE; 3283 bio->bi_rw = WRITE;
3263 bio->bi_sector = sector + 3284 bio->bi_sector = sector +
3264 conf->mirrors[d].replacement->data_offset; 3285 conf->mirrors[d].replacement->data_offset;
3265 bio->bi_bdev = conf->mirrors[d].replacement->bdev; 3286 bio->bi_bdev = conf->mirrors[d].replacement->bdev;
3266 count++; 3287 count++;
3267 } 3288 }
3268 3289
3269 if (count < 2) { 3290 if (count < 2) {
3270 for (i=0; i<conf->copies; i++) { 3291 for (i=0; i<conf->copies; i++) {
3271 int d = r10_bio->devs[i].devnum; 3292 int d = r10_bio->devs[i].devnum;
3272 if (r10_bio->devs[i].bio->bi_end_io) 3293 if (r10_bio->devs[i].bio->bi_end_io)
3273 rdev_dec_pending(conf->mirrors[d].rdev, 3294 rdev_dec_pending(conf->mirrors[d].rdev,
3274 mddev); 3295 mddev);
3275 if (r10_bio->devs[i].repl_bio && 3296 if (r10_bio->devs[i].repl_bio &&
3276 r10_bio->devs[i].repl_bio->bi_end_io) 3297 r10_bio->devs[i].repl_bio->bi_end_io)
3277 rdev_dec_pending( 3298 rdev_dec_pending(
3278 conf->mirrors[d].replacement, 3299 conf->mirrors[d].replacement,
3279 mddev); 3300 mddev);
3280 } 3301 }
3281 put_buf(r10_bio); 3302 put_buf(r10_bio);
3282 biolist = NULL; 3303 biolist = NULL;
3283 goto giveup; 3304 goto giveup;
3284 } 3305 }
3285 } 3306 }
3286 3307
3287 for (bio = biolist; bio ; bio=bio->bi_next) { 3308 for (bio = biolist; bio ; bio=bio->bi_next) {
3288 3309
3289 bio->bi_flags &= ~(BIO_POOL_MASK - 1); 3310 bio->bi_flags &= ~(BIO_POOL_MASK - 1);
3290 if (bio->bi_end_io) 3311 if (bio->bi_end_io)
3291 bio->bi_flags |= 1 << BIO_UPTODATE; 3312 bio->bi_flags |= 1 << BIO_UPTODATE;
3292 bio->bi_vcnt = 0; 3313 bio->bi_vcnt = 0;
3293 bio->bi_idx = 0; 3314 bio->bi_idx = 0;
3294 bio->bi_phys_segments = 0; 3315 bio->bi_phys_segments = 0;
3295 bio->bi_size = 0; 3316 bio->bi_size = 0;
3296 } 3317 }
3297 3318
3298 nr_sectors = 0; 3319 nr_sectors = 0;
3299 if (sector_nr + max_sync < max_sector) 3320 if (sector_nr + max_sync < max_sector)
3300 max_sector = sector_nr + max_sync; 3321 max_sector = sector_nr + max_sync;
3301 do { 3322 do {
3302 struct page *page; 3323 struct page *page;
3303 int len = PAGE_SIZE; 3324 int len = PAGE_SIZE;
3304 if (sector_nr + (len>>9) > max_sector) 3325 if (sector_nr + (len>>9) > max_sector)
3305 len = (max_sector - sector_nr) << 9; 3326 len = (max_sector - sector_nr) << 9;
3306 if (len == 0) 3327 if (len == 0)
3307 break; 3328 break;
3308 for (bio= biolist ; bio ; bio=bio->bi_next) { 3329 for (bio= biolist ; bio ; bio=bio->bi_next) {
3309 struct bio *bio2; 3330 struct bio *bio2;
3310 page = bio->bi_io_vec[bio->bi_vcnt].bv_page; 3331 page = bio->bi_io_vec[bio->bi_vcnt].bv_page;
3311 if (bio_add_page(bio, page, len, 0)) 3332 if (bio_add_page(bio, page, len, 0))
3312 continue; 3333 continue;
3313 3334
3314 /* stop here */ 3335 /* stop here */
3315 bio->bi_io_vec[bio->bi_vcnt].bv_page = page; 3336 bio->bi_io_vec[bio->bi_vcnt].bv_page = page;
3316 for (bio2 = biolist; 3337 for (bio2 = biolist;
3317 bio2 && bio2 != bio; 3338 bio2 && bio2 != bio;
3318 bio2 = bio2->bi_next) { 3339 bio2 = bio2->bi_next) {
3319 /* remove last page from this bio */ 3340 /* remove last page from this bio */
3320 bio2->bi_vcnt--; 3341 bio2->bi_vcnt--;
3321 bio2->bi_size -= len; 3342 bio2->bi_size -= len;
3322 bio2->bi_flags &= ~(1<< BIO_SEG_VALID); 3343 bio2->bi_flags &= ~(1<< BIO_SEG_VALID);
3323 } 3344 }
3324 goto bio_full; 3345 goto bio_full;
3325 } 3346 }
3326 nr_sectors += len>>9; 3347 nr_sectors += len>>9;
3327 sector_nr += len>>9; 3348 sector_nr += len>>9;
3328 } while (biolist->bi_vcnt < RESYNC_PAGES); 3349 } while (biolist->bi_vcnt < RESYNC_PAGES);
3329 bio_full: 3350 bio_full:
3330 r10_bio->sectors = nr_sectors; 3351 r10_bio->sectors = nr_sectors;
3331 3352
3332 while (biolist) { 3353 while (biolist) {
3333 bio = biolist; 3354 bio = biolist;
3334 biolist = biolist->bi_next; 3355 biolist = biolist->bi_next;
3335 3356
3336 bio->bi_next = NULL; 3357 bio->bi_next = NULL;
3337 r10_bio = bio->bi_private; 3358 r10_bio = bio->bi_private;
3338 r10_bio->sectors = nr_sectors; 3359 r10_bio->sectors = nr_sectors;
3339 3360
3340 if (bio->bi_end_io == end_sync_read) { 3361 if (bio->bi_end_io == end_sync_read) {
3341 md_sync_acct(bio->bi_bdev, nr_sectors); 3362 md_sync_acct(bio->bi_bdev, nr_sectors);
3342 generic_make_request(bio); 3363 generic_make_request(bio);
3343 } 3364 }
3344 } 3365 }
3345 3366
3346 if (sectors_skipped) 3367 if (sectors_skipped)
3347 /* pretend they weren't skipped, it makes 3368 /* pretend they weren't skipped, it makes
3348 * no important difference in this case 3369 * no important difference in this case
3349 */ 3370 */
3350 md_done_sync(mddev, sectors_skipped, 1); 3371 md_done_sync(mddev, sectors_skipped, 1);
3351 3372
3352 return sectors_skipped + nr_sectors; 3373 return sectors_skipped + nr_sectors;
3353 giveup: 3374 giveup:
3354 /* There is nowhere to write, so all non-sync 3375 /* There is nowhere to write, so all non-sync
3355 * drives must be failed or in resync, all drives 3376 * drives must be failed or in resync, all drives
3356 * have a bad block, so try the next chunk... 3377 * have a bad block, so try the next chunk...
3357 */ 3378 */
3358 if (sector_nr + max_sync < max_sector) 3379 if (sector_nr + max_sync < max_sector)
3359 max_sector = sector_nr + max_sync; 3380 max_sector = sector_nr + max_sync;
3360 3381
3361 sectors_skipped += (max_sector - sector_nr); 3382 sectors_skipped += (max_sector - sector_nr);
3362 chunks_skipped ++; 3383 chunks_skipped ++;
3363 sector_nr = max_sector; 3384 sector_nr = max_sector;
3364 goto skipped; 3385 goto skipped;
3365 } 3386 }
3366 3387
3367 static sector_t 3388 static sector_t
3368 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks) 3389 raid10_size(struct mddev *mddev, sector_t sectors, int raid_disks)
3369 { 3390 {
3370 sector_t size; 3391 sector_t size;
3371 struct r10conf *conf = mddev->private; 3392 struct r10conf *conf = mddev->private;
3372 3393
3373 if (!raid_disks) 3394 if (!raid_disks)
3374 raid_disks = min(conf->geo.raid_disks, 3395 raid_disks = min(conf->geo.raid_disks,
3375 conf->prev.raid_disks); 3396 conf->prev.raid_disks);
3376 if (!sectors) 3397 if (!sectors)
3377 sectors = conf->dev_sectors; 3398 sectors = conf->dev_sectors;
3378 3399
3379 size = sectors >> conf->geo.chunk_shift; 3400 size = sectors >> conf->geo.chunk_shift;
3380 sector_div(size, conf->geo.far_copies); 3401 sector_div(size, conf->geo.far_copies);
3381 size = size * raid_disks; 3402 size = size * raid_disks;
3382 sector_div(size, conf->geo.near_copies); 3403 sector_div(size, conf->geo.near_copies);
3383 3404
3384 return size << conf->geo.chunk_shift; 3405 return size << conf->geo.chunk_shift;
3385 } 3406 }
3386 3407
3387 static void calc_sectors(struct r10conf *conf, sector_t size) 3408 static void calc_sectors(struct r10conf *conf, sector_t size)
3388 { 3409 {
3389 /* Calculate the number of sectors-per-device that will 3410 /* Calculate the number of sectors-per-device that will
3390 * actually be used, and set conf->dev_sectors and 3411 * actually be used, and set conf->dev_sectors and
3391 * conf->stride 3412 * conf->stride
3392 */ 3413 */
3393 3414
3394 size = size >> conf->geo.chunk_shift; 3415 size = size >> conf->geo.chunk_shift;
3395 sector_div(size, conf->geo.far_copies); 3416 sector_div(size, conf->geo.far_copies);
3396 size = size * conf->geo.raid_disks; 3417 size = size * conf->geo.raid_disks;
3397 sector_div(size, conf->geo.near_copies); 3418 sector_div(size, conf->geo.near_copies);
3398 /* 'size' is now the number of chunks in the array */ 3419 /* 'size' is now the number of chunks in the array */
3399 /* calculate "used chunks per device" */ 3420 /* calculate "used chunks per device" */
3400 size = size * conf->copies; 3421 size = size * conf->copies;
3401 3422
3402 /* We need to round up when dividing by raid_disks to 3423 /* We need to round up when dividing by raid_disks to
3403 * get the stride size. 3424 * get the stride size.
3404 */ 3425 */
3405 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks); 3426 size = DIV_ROUND_UP_SECTOR_T(size, conf->geo.raid_disks);
3406 3427
3407 conf->dev_sectors = size << conf->geo.chunk_shift; 3428 conf->dev_sectors = size << conf->geo.chunk_shift;
3408 3429
3409 if (conf->geo.far_offset) 3430 if (conf->geo.far_offset)
3410 conf->geo.stride = 1 << conf->geo.chunk_shift; 3431 conf->geo.stride = 1 << conf->geo.chunk_shift;
3411 else { 3432 else {
3412 sector_div(size, conf->geo.far_copies); 3433 sector_div(size, conf->geo.far_copies);
3413 conf->geo.stride = size << conf->geo.chunk_shift; 3434 conf->geo.stride = size << conf->geo.chunk_shift;
3414 } 3435 }
3415 } 3436 }
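To make the arithmetic in raid10_size() and calc_sectors() above concrete, here is a worked userspace sketch for one assumed geometry: 6 devices of 131072 sectors each, 64KiB chunks (chunk_shift = 7), near_copies = 1, far_copies = 2, classic "far" layout (far_offset = 0). Plain division stands in for sector_div(); the numbers in the comments are what those functions would compute for these inputs.

	#include <stdio.h>

	int main(void)
	{
		unsigned long long size = 131072;	/* sectors offered by each device */
		int chunk_shift = 7;			/* 64KiB chunks = 128 sectors */
		int raid_disks = 6, near_copies = 1, far_copies = 2, far_offset = 0;
		int copies = near_copies * far_copies;
		unsigned long long array_sectors, dev_sectors, stride;

		size >>= chunk_shift;			/* chunks per device:        1024 */
		size /= far_copies;			/*                            512 */
		size *= raid_disks;			/*                           3072 */
		size /= near_copies;			/* data chunks in the array: 3072 */
		array_sectors = size << chunk_shift;	/* raid10_size():          393216 */

		size *= copies;				/*                           6144 */
		size = (size + raid_disks - 1) / raid_disks;	/* used chunks/device: 1024 */
		dev_sectors = size << chunk_shift;	/* conf->dev_sectors:      131072 */

		if (far_offset)
			stride = 1ULL << chunk_shift;
		else
			stride = (size / far_copies) << chunk_shift;	/* stride: 65536 */

		printf("array=%llu dev=%llu stride=%llu\n",
		       array_sectors, dev_sectors, stride);
		return 0;
	}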
3416 3437
3417 enum geo_type {geo_new, geo_old, geo_start}; 3438 enum geo_type {geo_new, geo_old, geo_start};
3418 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new) 3439 static int setup_geo(struct geom *geo, struct mddev *mddev, enum geo_type new)
3419 { 3440 {
3420 int nc, fc, fo; 3441 int nc, fc, fo;
3421 int layout, chunk, disks; 3442 int layout, chunk, disks;
3422 switch (new) { 3443 switch (new) {
3423 case geo_old: 3444 case geo_old:
3424 layout = mddev->layout; 3445 layout = mddev->layout;
3425 chunk = mddev->chunk_sectors; 3446 chunk = mddev->chunk_sectors;
3426 disks = mddev->raid_disks - mddev->delta_disks; 3447 disks = mddev->raid_disks - mddev->delta_disks;
3427 break; 3448 break;
3428 case geo_new: 3449 case geo_new:
3429 layout = mddev->new_layout; 3450 layout = mddev->new_layout;
3430 chunk = mddev->new_chunk_sectors; 3451 chunk = mddev->new_chunk_sectors;
3431 disks = mddev->raid_disks; 3452 disks = mddev->raid_disks;
3432 break; 3453 break;
3433 default: /* avoid 'may be unused' warnings */ 3454 default: /* avoid 'may be unused' warnings */
3434 case geo_start: /* new when starting reshape - raid_disks not 3455 case geo_start: /* new when starting reshape - raid_disks not
3435 * updated yet. */ 3456 * updated yet. */
3436 layout = mddev->new_layout; 3457 layout = mddev->new_layout;
3437 chunk = mddev->new_chunk_sectors; 3458 chunk = mddev->new_chunk_sectors;
3438 disks = mddev->raid_disks + mddev->delta_disks; 3459 disks = mddev->raid_disks + mddev->delta_disks;
3439 break; 3460 break;
3440 } 3461 }
3441 if (layout >> 17) 3462 if (layout >> 18)
3442 return -1; 3463 return -1;
3443 if (chunk < (PAGE_SIZE >> 9) || 3464 if (chunk < (PAGE_SIZE >> 9) ||
3444 !is_power_of_2(chunk)) 3465 !is_power_of_2(chunk))
3445 return -2; 3466 return -2;
3446 nc = layout & 255; 3467 nc = layout & 255;
3447 fc = (layout >> 8) & 255; 3468 fc = (layout >> 8) & 255;
3448 fo = layout & (1<<16); 3469 fo = layout & (1<<16);
3449 geo->raid_disks = disks; 3470 geo->raid_disks = disks;
3450 geo->near_copies = nc; 3471 geo->near_copies = nc;
3451 geo->far_copies = fc; 3472 geo->far_copies = fc;
3452 geo->far_offset = fo; 3473 geo->far_offset = fo;
3474 geo->far_set_size = (layout & (1<<17)) ? disks / fc : disks;
3453 geo->chunk_mask = chunk - 1; 3475 geo->chunk_mask = chunk - 1;
3454 geo->chunk_shift = ffz(~chunk); 3476 geo->chunk_shift = ffz(~chunk);
3455 return nc*fc; 3477 return nc*fc;
3456 } 3478 }
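For reference, the layout word that setup_geo() above decodes packs near_copies into bits 0-7, far_copies into bits 8-15, the "offset" flag into bit 16 and, new with this patch, the "confine shifted copies to sets" flag into bit 17. The userspace sketch below mirrors that decoding for one sample layout; struct geo_example and decode_layout() are hypothetical stand-ins, not kernel code.

	#include <stdio.h>

	struct geo_example {			/* simplified stand-in for struct geom */
		int raid_disks;
		int near_copies;
		int far_copies;
		int far_offset;
		int far_set_size;
	};

	/* Decode a raid10 layout word the same way setup_geo() does. */
	static int decode_layout(int layout, int disks, struct geo_example *g)
	{
		if (layout >> 18)		/* only bits 0-17 are defined now */
			return -1;
		g->raid_disks  = disks;
		g->near_copies = layout & 255;
		g->far_copies  = (layout >> 8) & 255;
		g->far_offset  = layout & (1 << 16);
		if (g->far_copies == 0)
			return -2;		/* sketch-only guard; setup_conf()
						 * rejects copies < 2 anyway */
		/* bit 17: the shifted copies wrap inside sets of
		 * disks / far_copies devices instead of across the whole array */
		g->far_set_size = (layout & (1 << 17)) ? disks / g->far_copies : disks;
		return g->near_copies * g->far_copies;	/* total copies */
	}

	int main(void)
	{
		struct geo_example g;
		/* 6 devices, near_copies = 1, far_copies = 2, "far" (not offset),
		 * with the new bit 17 set */
		int layout = (1 << 17) | (2 << 8) | 1;

		if (decode_layout(layout, 6, &g) < 0)
			return 1;
		printf("near=%d far=%d offset=%d set_size=%d\n",
		       g.near_copies, g.far_copies, g.far_offset ? 1 : 0,
		       g.far_set_size);
		/* prints: near=1 far=2 offset=0 set_size=3 */
		return 0;
	}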
3457 3479
3458 static struct r10conf *setup_conf(struct mddev *mddev) 3480 static struct r10conf *setup_conf(struct mddev *mddev)
3459 { 3481 {
3460 struct r10conf *conf = NULL; 3482 struct r10conf *conf = NULL;
3461 int err = -EINVAL; 3483 int err = -EINVAL;
3462 struct geom geo; 3484 struct geom geo;
3463 int copies; 3485 int copies;
3464 3486
3465 copies = setup_geo(&geo, mddev, geo_new); 3487 copies = setup_geo(&geo, mddev, geo_new);
3466 3488
3467 if (copies == -2) { 3489 if (copies == -2) {
3468 printk(KERN_ERR "md/raid10:%s: chunk size must be " 3490 printk(KERN_ERR "md/raid10:%s: chunk size must be "
3469 "at least PAGE_SIZE(%ld) and be a power of 2.\n", 3491 "at least PAGE_SIZE(%ld) and be a power of 2.\n",
3470 mdname(mddev), PAGE_SIZE); 3492 mdname(mddev), PAGE_SIZE);
3471 goto out; 3493 goto out;
3472 } 3494 }
3473 3495
3474 if (copies < 2 || copies > mddev->raid_disks) { 3496 if (copies < 2 || copies > mddev->raid_disks) {
3475 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n", 3497 printk(KERN_ERR "md/raid10:%s: unsupported raid10 layout: 0x%8x\n",
3476 mdname(mddev), mddev->new_layout); 3498 mdname(mddev), mddev->new_layout);
3477 goto out; 3499 goto out;
3478 } 3500 }
3479 3501
3480 err = -ENOMEM; 3502 err = -ENOMEM;
3481 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL); 3503 conf = kzalloc(sizeof(struct r10conf), GFP_KERNEL);
3482 if (!conf) 3504 if (!conf)
3483 goto out; 3505 goto out;
3484 3506
3485 /* FIXME calc properly */ 3507 /* FIXME calc properly */
3486 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks + 3508 conf->mirrors = kzalloc(sizeof(struct raid10_info)*(mddev->raid_disks +
3487 max(0,mddev->delta_disks)), 3509 max(0,mddev->delta_disks)),
3488 GFP_KERNEL); 3510 GFP_KERNEL);
3489 if (!conf->mirrors) 3511 if (!conf->mirrors)
3490 goto out; 3512 goto out;
3491 3513
3492 conf->tmppage = alloc_page(GFP_KERNEL); 3514 conf->tmppage = alloc_page(GFP_KERNEL);
3493 if (!conf->tmppage) 3515 if (!conf->tmppage)
3494 goto out; 3516 goto out;
3495 3517
3496 conf->geo = geo; 3518 conf->geo = geo;
3497 conf->copies = copies; 3519 conf->copies = copies;
3498 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc, 3520 conf->r10bio_pool = mempool_create(NR_RAID10_BIOS, r10bio_pool_alloc,
3499 r10bio_pool_free, conf); 3521 r10bio_pool_free, conf);
3500 if (!conf->r10bio_pool) 3522 if (!conf->r10bio_pool)
3501 goto out; 3523 goto out;
3502 3524
3503 calc_sectors(conf, mddev->dev_sectors); 3525 calc_sectors(conf, mddev->dev_sectors);
3504 if (mddev->reshape_position == MaxSector) { 3526 if (mddev->reshape_position == MaxSector) {
3505 conf->prev = conf->geo; 3527 conf->prev = conf->geo;
3506 conf->reshape_progress = MaxSector; 3528 conf->reshape_progress = MaxSector;
3507 } else { 3529 } else {
3508 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) { 3530 if (setup_geo(&conf->prev, mddev, geo_old) != conf->copies) {
3509 err = -EINVAL; 3531 err = -EINVAL;
3510 goto out; 3532 goto out;
3511 } 3533 }
3512 conf->reshape_progress = mddev->reshape_position; 3534 conf->reshape_progress = mddev->reshape_position;
3513 if (conf->prev.far_offset) 3535 if (conf->prev.far_offset)
3514 conf->prev.stride = 1 << conf->prev.chunk_shift; 3536 conf->prev.stride = 1 << conf->prev.chunk_shift;
3515 else 3537 else
3516 /* far_copies must be 1 */ 3538 /* far_copies must be 1 */
3517 conf->prev.stride = conf->dev_sectors; 3539 conf->prev.stride = conf->dev_sectors;
3518 } 3540 }
3519 spin_lock_init(&conf->device_lock); 3541 spin_lock_init(&conf->device_lock);
3520 INIT_LIST_HEAD(&conf->retry_list); 3542 INIT_LIST_HEAD(&conf->retry_list);
3521 3543
3522 spin_lock_init(&conf->resync_lock); 3544 spin_lock_init(&conf->resync_lock);
3523 init_waitqueue_head(&conf->wait_barrier); 3545 init_waitqueue_head(&conf->wait_barrier);
3524 3546
3525 conf->thread = md_register_thread(raid10d, mddev, "raid10"); 3547 conf->thread = md_register_thread(raid10d, mddev, "raid10");
3526 if (!conf->thread) 3548 if (!conf->thread)
3527 goto out; 3549 goto out;
3528 3550
3529 conf->mddev = mddev; 3551 conf->mddev = mddev;
3530 return conf; 3552 return conf;
3531 3553
3532 out: 3554 out:
3533 if (err == -ENOMEM) 3555 if (err == -ENOMEM)
3534 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n", 3556 printk(KERN_ERR "md/raid10:%s: couldn't allocate memory.\n",
3535 mdname(mddev)); 3557 mdname(mddev));
3536 if (conf) { 3558 if (conf) {
3537 if (conf->r10bio_pool) 3559 if (conf->r10bio_pool)
3538 mempool_destroy(conf->r10bio_pool); 3560 mempool_destroy(conf->r10bio_pool);
3539 kfree(conf->mirrors); 3561 kfree(conf->mirrors);
3540 safe_put_page(conf->tmppage); 3562 safe_put_page(conf->tmppage);
3541 kfree(conf); 3563 kfree(conf);
3542 } 3564 }
3543 return ERR_PTR(err); 3565 return ERR_PTR(err);
3544 } 3566 }
3545 3567
3546 static int run(struct mddev *mddev) 3568 static int run(struct mddev *mddev)
3547 { 3569 {
3548 struct r10conf *conf; 3570 struct r10conf *conf;
3549 int i, disk_idx, chunk_size; 3571 int i, disk_idx, chunk_size;
3550 struct raid10_info *disk; 3572 struct raid10_info *disk;
3551 struct md_rdev *rdev; 3573 struct md_rdev *rdev;
3552 sector_t size; 3574 sector_t size;
3553 sector_t min_offset_diff = 0; 3575 sector_t min_offset_diff = 0;
3554 int first = 1; 3576 int first = 1;
3555 bool discard_supported = false; 3577 bool discard_supported = false;
3556 3578
3557 if (mddev->private == NULL) { 3579 if (mddev->private == NULL) {
3558 conf = setup_conf(mddev); 3580 conf = setup_conf(mddev);
3559 if (IS_ERR(conf)) 3581 if (IS_ERR(conf))
3560 return PTR_ERR(conf); 3582 return PTR_ERR(conf);
3561 mddev->private = conf; 3583 mddev->private = conf;
3562 } 3584 }
3563 conf = mddev->private; 3585 conf = mddev->private;
3564 if (!conf) 3586 if (!conf)
3565 goto out; 3587 goto out;
3566 3588
3567 mddev->thread = conf->thread; 3589 mddev->thread = conf->thread;
3568 conf->thread = NULL; 3590 conf->thread = NULL;
3569 3591
3570 chunk_size = mddev->chunk_sectors << 9; 3592 chunk_size = mddev->chunk_sectors << 9;
3571 if (mddev->queue) { 3593 if (mddev->queue) {
3572 blk_queue_max_discard_sectors(mddev->queue, 3594 blk_queue_max_discard_sectors(mddev->queue,
3573 mddev->chunk_sectors); 3595 mddev->chunk_sectors);
3574 blk_queue_max_write_same_sectors(mddev->queue, 3596 blk_queue_max_write_same_sectors(mddev->queue,
3575 mddev->chunk_sectors); 3597 mddev->chunk_sectors);
3576 blk_queue_io_min(mddev->queue, chunk_size); 3598 blk_queue_io_min(mddev->queue, chunk_size);
3577 if (conf->geo.raid_disks % conf->geo.near_copies) 3599 if (conf->geo.raid_disks % conf->geo.near_copies)
3578 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); 3600 blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks);
3579 else 3601 else
3580 blk_queue_io_opt(mddev->queue, chunk_size * 3602 blk_queue_io_opt(mddev->queue, chunk_size *
3581 (conf->geo.raid_disks / conf->geo.near_copies)); 3603 (conf->geo.raid_disks / conf->geo.near_copies));
3582 } 3604 }
3583 3605
3584 rdev_for_each(rdev, mddev) { 3606 rdev_for_each(rdev, mddev) {
3585 long long diff; 3607 long long diff;
3586 struct request_queue *q; 3608 struct request_queue *q;
3587 3609
3588 disk_idx = rdev->raid_disk; 3610 disk_idx = rdev->raid_disk;
3589 if (disk_idx < 0) 3611 if (disk_idx < 0)
3590 continue; 3612 continue;
3591 if (disk_idx >= conf->geo.raid_disks && 3613 if (disk_idx >= conf->geo.raid_disks &&
3592 disk_idx >= conf->prev.raid_disks) 3614 disk_idx >= conf->prev.raid_disks)
3593 continue; 3615 continue;
3594 disk = conf->mirrors + disk_idx; 3616 disk = conf->mirrors + disk_idx;
3595 3617
3596 if (test_bit(Replacement, &rdev->flags)) { 3618 if (test_bit(Replacement, &rdev->flags)) {
3597 if (disk->replacement) 3619 if (disk->replacement)
3598 goto out_free_conf; 3620 goto out_free_conf;
3599 disk->replacement = rdev; 3621 disk->replacement = rdev;
3600 } else { 3622 } else {
3601 if (disk->rdev) 3623 if (disk->rdev)
3602 goto out_free_conf; 3624 goto out_free_conf;
3603 disk->rdev = rdev; 3625 disk->rdev = rdev;
3604 } 3626 }
3605 q = bdev_get_queue(rdev->bdev); 3627 q = bdev_get_queue(rdev->bdev);
3606 if (q->merge_bvec_fn) 3628 if (q->merge_bvec_fn)
3607 mddev->merge_check_needed = 1; 3629 mddev->merge_check_needed = 1;
3608 diff = (rdev->new_data_offset - rdev->data_offset); 3630 diff = (rdev->new_data_offset - rdev->data_offset);
3609 if (!mddev->reshape_backwards) 3631 if (!mddev->reshape_backwards)
3610 diff = -diff; 3632 diff = -diff;
3611 if (diff < 0) 3633 if (diff < 0)
3612 diff = 0; 3634 diff = 0;
3613 if (first || diff < min_offset_diff) 3635 if (first || diff < min_offset_diff)
3614 min_offset_diff = diff; 3636 min_offset_diff = diff;
3615 3637
3616 if (mddev->gendisk) 3638 if (mddev->gendisk)
3617 disk_stack_limits(mddev->gendisk, rdev->bdev, 3639 disk_stack_limits(mddev->gendisk, rdev->bdev,
3618 rdev->data_offset << 9); 3640 rdev->data_offset << 9);
3619 3641
3620 disk->head_position = 0; 3642 disk->head_position = 0;
3621 3643
3622 if (blk_queue_discard(bdev_get_queue(rdev->bdev))) 3644 if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
3623 discard_supported = true; 3645 discard_supported = true;
3624 } 3646 }
3625 3647
3626 if (mddev->queue) { 3648 if (mddev->queue) {
3627 if (discard_supported) 3649 if (discard_supported)
3628 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, 3650 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD,
3629 mddev->queue); 3651 mddev->queue);
3630 else 3652 else
3631 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, 3653 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
3632 mddev->queue); 3654 mddev->queue);
3633 } 3655 }
3634 /* need to check that every block has at least one working mirror */ 3656 /* need to check that every block has at least one working mirror */
3635 if (!enough(conf, -1)) { 3657 if (!enough(conf, -1)) {
3636 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", 3658 printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n",
3637 mdname(mddev)); 3659 mdname(mddev));
3638 goto out_free_conf; 3660 goto out_free_conf;
3639 } 3661 }
3640 3662
3641 if (conf->reshape_progress != MaxSector) { 3663 if (conf->reshape_progress != MaxSector) {
3642 /* must ensure that shape change is supported */ 3664 /* must ensure that shape change is supported */
3643 if (conf->geo.far_copies != 1 && 3665 if (conf->geo.far_copies != 1 &&
3644 conf->geo.far_offset == 0) 3666 conf->geo.far_offset == 0)
3645 goto out_free_conf; 3667 goto out_free_conf;
3646 if (conf->prev.far_copies != 1 && 3668 if (conf->prev.far_copies != 1 &&
3647 conf->geo.far_offset == 0) 3669 conf->geo.far_offset == 0)
3648 goto out_free_conf; 3670 goto out_free_conf;
3649 } 3671 }
3650 3672
3651 mddev->degraded = 0; 3673 mddev->degraded = 0;
3652 for (i = 0; 3674 for (i = 0;
3653 i < conf->geo.raid_disks 3675 i < conf->geo.raid_disks
3654 || i < conf->prev.raid_disks; 3676 || i < conf->prev.raid_disks;
3655 i++) { 3677 i++) {
3656 3678
3657 disk = conf->mirrors + i; 3679 disk = conf->mirrors + i;
3658 3680
3659 if (!disk->rdev && disk->replacement) { 3681 if (!disk->rdev && disk->replacement) {
3660 /* The replacement is all we have - use it */ 3682 /* The replacement is all we have - use it */
3661 disk->rdev = disk->replacement; 3683 disk->rdev = disk->replacement;
3662 disk->replacement = NULL; 3684 disk->replacement = NULL;
3663 clear_bit(Replacement, &disk->rdev->flags); 3685 clear_bit(Replacement, &disk->rdev->flags);
3664 } 3686 }
3665 3687
3666 if (!disk->rdev || 3688 if (!disk->rdev ||
3667 !test_bit(In_sync, &disk->rdev->flags)) { 3689 !test_bit(In_sync, &disk->rdev->flags)) {
3668 disk->head_position = 0; 3690 disk->head_position = 0;
3669 mddev->degraded++; 3691 mddev->degraded++;
3670 if (disk->rdev) 3692 if (disk->rdev)
3671 conf->fullsync = 1; 3693 conf->fullsync = 1;
3672 } 3694 }
3673 disk->recovery_disabled = mddev->recovery_disabled - 1; 3695 disk->recovery_disabled = mddev->recovery_disabled - 1;
3674 } 3696 }
3675 3697
3676 if (mddev->recovery_cp != MaxSector) 3698 if (mddev->recovery_cp != MaxSector)
3677 printk(KERN_NOTICE "md/raid10:%s: not clean" 3699 printk(KERN_NOTICE "md/raid10:%s: not clean"
3678 " -- starting background reconstruction\n", 3700 " -- starting background reconstruction\n",
3679 mdname(mddev)); 3701 mdname(mddev));
3680 printk(KERN_INFO 3702 printk(KERN_INFO
3681 "md/raid10:%s: active with %d out of %d devices\n", 3703 "md/raid10:%s: active with %d out of %d devices\n",
3682 mdname(mddev), conf->geo.raid_disks - mddev->degraded, 3704 mdname(mddev), conf->geo.raid_disks - mddev->degraded,
3683 conf->geo.raid_disks); 3705 conf->geo.raid_disks);
3684 /* 3706 /*
3685 * Ok, everything is just fine now 3707 * Ok, everything is just fine now
3686 */ 3708 */
3687 mddev->dev_sectors = conf->dev_sectors; 3709 mddev->dev_sectors = conf->dev_sectors;
3688 size = raid10_size(mddev, 0, 0); 3710 size = raid10_size(mddev, 0, 0);
3689 md_set_array_sectors(mddev, size); 3711 md_set_array_sectors(mddev, size);
3690 mddev->resync_max_sectors = size; 3712 mddev->resync_max_sectors = size;
3691 3713
3692 if (mddev->queue) { 3714 if (mddev->queue) {
3693 int stripe = conf->geo.raid_disks * 3715 int stripe = conf->geo.raid_disks *
3694 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 3716 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
3695 mddev->queue->backing_dev_info.congested_fn = raid10_congested; 3717 mddev->queue->backing_dev_info.congested_fn = raid10_congested;
3696 mddev->queue->backing_dev_info.congested_data = mddev; 3718 mddev->queue->backing_dev_info.congested_data = mddev;
3697 3719
3698 /* Calculate max read-ahead size. 3720 /* Calculate max read-ahead size.
3699 * We need to readahead at least twice a whole stripe.... 3721 * We need to readahead at least twice a whole stripe....
3700 * maybe... 3722 * maybe...
3701 */ 3723 */
3702 stripe /= conf->geo.near_copies; 3724 stripe /= conf->geo.near_copies;
3703 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 3725 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
3704 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 3726 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
3705 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); 3727 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3706 } 3728 }
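	/*
	 * Worked example of the read-ahead sizing above (illustrative
	 * numbers only): 6 devices, near_copies = 2, 512KiB chunks
	 * (chunk_sectors = 1024) and PAGE_SIZE = 4096:
	 *
	 *	stripe  = 6 * ((1024 << 9) / 4096) = 6 * 128 = 768 pages
	 *	stripe /= 2                        = 384 pages
	 *
	 * so ra_pages is raised to at least 2 * 384 = 768 pages (3MiB),
	 * i.e. twice the data covered by one full stripe.
	 */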
3707 3729
3708 3730
3709 if (md_integrity_register(mddev)) 3731 if (md_integrity_register(mddev))
3710 goto out_free_conf; 3732 goto out_free_conf;
3711 3733
3712 if (conf->reshape_progress != MaxSector) { 3734 if (conf->reshape_progress != MaxSector) {
3713 unsigned long before_length, after_length; 3735 unsigned long before_length, after_length;
3714 3736
3715 before_length = ((1 << conf->prev.chunk_shift) * 3737 before_length = ((1 << conf->prev.chunk_shift) *
3716 conf->prev.far_copies); 3738 conf->prev.far_copies);
3717 after_length = ((1 << conf->geo.chunk_shift) * 3739 after_length = ((1 << conf->geo.chunk_shift) *
3718 conf->geo.far_copies); 3740 conf->geo.far_copies);
3719 3741
3720 if (max(before_length, after_length) > min_offset_diff) { 3742 if (max(before_length, after_length) > min_offset_diff) {
3721 /* This cannot work */ 3743 /* This cannot work */
3722 printk("md/raid10: offset difference not enough to continue reshape\n"); 3744 printk("md/raid10: offset difference not enough to continue reshape\n");
3723 goto out_free_conf; 3745 goto out_free_conf;
3724 } 3746 }
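	/*
	 * For illustration (hypothetical values): restarting a reshape with
	 * 512KiB chunks (1 << chunk_shift = 1024 sectors) in both geometries,
	 * an old 'offset' layout with far_copies = 2 and a new layout with
	 * far_copies = 1 gives before_length = 2048 and after_length = 1024
	 * sectors, so every device needs at least 2048 sectors (1MiB) between
	 * its old and new data_offset for the restart to be accepted.
	 */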
3725 conf->offset_diff = min_offset_diff; 3747 conf->offset_diff = min_offset_diff;
3726 3748
3727 conf->reshape_safe = conf->reshape_progress; 3749 conf->reshape_safe = conf->reshape_progress;
3728 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3750 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3729 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3751 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3730 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 3752 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
3731 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 3753 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
3732 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 3754 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
3733 "reshape"); 3755 "reshape");
3734 } 3756 }
3735 3757
3736 return 0; 3758 return 0;
3737 3759
3738 out_free_conf: 3760 out_free_conf:
3739 md_unregister_thread(&mddev->thread); 3761 md_unregister_thread(&mddev->thread);
3740 if (conf->r10bio_pool) 3762 if (conf->r10bio_pool)
3741 mempool_destroy(conf->r10bio_pool); 3763 mempool_destroy(conf->r10bio_pool);
3742 safe_put_page(conf->tmppage); 3764 safe_put_page(conf->tmppage);
3743 kfree(conf->mirrors); 3765 kfree(conf->mirrors);
3744 kfree(conf); 3766 kfree(conf);
3745 mddev->private = NULL; 3767 mddev->private = NULL;
3746 out: 3768 out:
3747 return -EIO; 3769 return -EIO;
3748 } 3770 }
3749 3771
3750 static int stop(struct mddev *mddev) 3772 static int stop(struct mddev *mddev)
3751 { 3773 {
3752 struct r10conf *conf = mddev->private; 3774 struct r10conf *conf = mddev->private;
3753 3775
3754 raise_barrier(conf, 0); 3776 raise_barrier(conf, 0);
3755 lower_barrier(conf); 3777 lower_barrier(conf);
3756 3778
3757 md_unregister_thread(&mddev->thread); 3779 md_unregister_thread(&mddev->thread);
3758 if (mddev->queue) 3780 if (mddev->queue)
3759 /* the unplug fn references 'conf'*/ 3781 /* the unplug fn references 'conf'*/
3760 blk_sync_queue(mddev->queue); 3782 blk_sync_queue(mddev->queue);
3761 3783
3762 if (conf->r10bio_pool) 3784 if (conf->r10bio_pool)
3763 mempool_destroy(conf->r10bio_pool); 3785 mempool_destroy(conf->r10bio_pool);
3764 kfree(conf->mirrors); 3786 kfree(conf->mirrors);
3765 kfree(conf); 3787 kfree(conf);
3766 mddev->private = NULL; 3788 mddev->private = NULL;
3767 return 0; 3789 return 0;
3768 } 3790 }
3769 3791
3770 static void raid10_quiesce(struct mddev *mddev, int state) 3792 static void raid10_quiesce(struct mddev *mddev, int state)
3771 { 3793 {
3772 struct r10conf *conf = mddev->private; 3794 struct r10conf *conf = mddev->private;
3773 3795
3774 switch(state) { 3796 switch(state) {
3775 case 1: 3797 case 1:
3776 raise_barrier(conf, 0); 3798 raise_barrier(conf, 0);
3777 break; 3799 break;
3778 case 0: 3800 case 0:
3779 lower_barrier(conf); 3801 lower_barrier(conf);
3780 break; 3802 break;
3781 } 3803 }
3782 } 3804 }
3783 3805
3784 static int raid10_resize(struct mddev *mddev, sector_t sectors) 3806 static int raid10_resize(struct mddev *mddev, sector_t sectors)
3785 { 3807 {
3786 /* Resize of 'far' arrays is not supported. 3808 /* Resize of 'far' arrays is not supported.
3787 * For 'near' and 'offset' arrays we can set the 3809 * For 'near' and 'offset' arrays we can set the
3788 * number of sectors used to be an appropriate multiple 3810 * number of sectors used to be an appropriate multiple
3789 * of the chunk size. 3811 * of the chunk size.
3790 * For 'offset', this is far_copies*chunksize. 3812 * For 'offset', this is far_copies*chunksize.
3791 * For 'near' the multiplier is the LCM of 3813 * For 'near' the multiplier is the LCM of
3792 * near_copies and raid_disks. 3814 * near_copies and raid_disks.
3793 * So if far_copies > 1 && !far_offset, fail. 3815 * So if far_copies > 1 && !far_offset, fail.
3794 * Else find LCM(raid_disks, near_copies)*far_copies and 3816 * Else find LCM(raid_disks, near_copies)*far_copies and
3795 * multiply by chunk_size. Then round to this number. 3817 * multiply by chunk_size. Then round to this number.
3796 * This is mostly done by raid10_size() 3818 * This is mostly done by raid10_size()
3797 */ 3819 */
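	/*
	 * Illustrative example of the rounding rule above (hypothetical
	 * geometry): raid_disks = 6, near_copies = 2, far_copies = 1 and
	 * 512KiB chunks give LCM(6, 2) * 1 = 6 chunks, so the array size is
	 * kept a multiple of 3MiB; raid10_size() and calc_sectors() below
	 * perform the actual rounding.
	 */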
3798 struct r10conf *conf = mddev->private; 3820 struct r10conf *conf = mddev->private;
3799 sector_t oldsize, size; 3821 sector_t oldsize, size;
3800 3822
3801 if (mddev->reshape_position != MaxSector) 3823 if (mddev->reshape_position != MaxSector)
3802 return -EBUSY; 3824 return -EBUSY;
3803 3825
3804 if (conf->geo.far_copies > 1 && !conf->geo.far_offset) 3826 if (conf->geo.far_copies > 1 && !conf->geo.far_offset)
3805 return -EINVAL; 3827 return -EINVAL;
3806 3828
3807 oldsize = raid10_size(mddev, 0, 0); 3829 oldsize = raid10_size(mddev, 0, 0);
3808 size = raid10_size(mddev, sectors, 0); 3830 size = raid10_size(mddev, sectors, 0);
3809 if (mddev->external_size && 3831 if (mddev->external_size &&
3810 mddev->array_sectors > size) 3832 mddev->array_sectors > size)
3811 return -EINVAL; 3833 return -EINVAL;
3812 if (mddev->bitmap) { 3834 if (mddev->bitmap) {
3813 int ret = bitmap_resize(mddev->bitmap, size, 0, 0); 3835 int ret = bitmap_resize(mddev->bitmap, size, 0, 0);
3814 if (ret) 3836 if (ret)
3815 return ret; 3837 return ret;
3816 } 3838 }
3817 md_set_array_sectors(mddev, size); 3839 md_set_array_sectors(mddev, size);
3818 set_capacity(mddev->gendisk, mddev->array_sectors); 3840 set_capacity(mddev->gendisk, mddev->array_sectors);
3819 revalidate_disk(mddev->gendisk); 3841 revalidate_disk(mddev->gendisk);
3820 if (sectors > mddev->dev_sectors && 3842 if (sectors > mddev->dev_sectors &&
3821 mddev->recovery_cp > oldsize) { 3843 mddev->recovery_cp > oldsize) {
3822 mddev->recovery_cp = oldsize; 3844 mddev->recovery_cp = oldsize;
3823 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3845 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3824 } 3846 }
3825 calc_sectors(conf, sectors); 3847 calc_sectors(conf, sectors);
3826 mddev->dev_sectors = conf->dev_sectors; 3848 mddev->dev_sectors = conf->dev_sectors;
3827 mddev->resync_max_sectors = size; 3849 mddev->resync_max_sectors = size;
3828 return 0; 3850 return 0;
3829 } 3851 }
3830 3852
3831 static void *raid10_takeover_raid0(struct mddev *mddev) 3853 static void *raid10_takeover_raid0(struct mddev *mddev)
3832 { 3854 {
3833 struct md_rdev *rdev; 3855 struct md_rdev *rdev;
3834 struct r10conf *conf; 3856 struct r10conf *conf;
3835 3857
3836 if (mddev->degraded > 0) { 3858 if (mddev->degraded > 0) {
3837 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n", 3859 printk(KERN_ERR "md/raid10:%s: Error: degraded raid0!\n",
3838 mdname(mddev)); 3860 mdname(mddev));
3839 return ERR_PTR(-EINVAL); 3861 return ERR_PTR(-EINVAL);
3840 } 3862 }
3841 3863
3842 /* Set new parameters */ 3864 /* Set new parameters */
3843 mddev->new_level = 10; 3865 mddev->new_level = 10;
3844 /* new layout: far_copies = 1, near_copies = 2 */ 3866 /* new layout: far_copies = 1, near_copies = 2 */
3845 mddev->new_layout = (1<<8) + 2; 3867 mddev->new_layout = (1<<8) + 2;
3846 mddev->new_chunk_sectors = mddev->chunk_sectors; 3868 mddev->new_chunk_sectors = mddev->chunk_sectors;
3847 mddev->delta_disks = mddev->raid_disks; 3869 mddev->delta_disks = mddev->raid_disks;
3848 mddev->raid_disks *= 2; 3870 mddev->raid_disks *= 2;
3849 /* make sure it will be not marked as dirty */ 3871 /* make sure it will be not marked as dirty */
3850 mddev->recovery_cp = MaxSector; 3872 mddev->recovery_cp = MaxSector;
3851 3873
3852 conf = setup_conf(mddev); 3874 conf = setup_conf(mddev);
3853 if (!IS_ERR(conf)) { 3875 if (!IS_ERR(conf)) {
3854 rdev_for_each(rdev, mddev) 3876 rdev_for_each(rdev, mddev)
3855 if (rdev->raid_disk >= 0) 3877 if (rdev->raid_disk >= 0)
3856 rdev->new_raid_disk = rdev->raid_disk * 2; 3878 rdev->new_raid_disk = rdev->raid_disk * 2;
3857 conf->barrier = 1; 3879 conf->barrier = 1;
3858 } 3880 }
3859 3881
3860 return conf; 3882 return conf;
3861 } 3883 }
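/*
 * Sketch only, not part of the driver: how the raid10 'layout' word used
 * above decodes, following the commit description: near copies in bits 0-7,
 * far copies in bits 8-15, bit 16 selecting the "offset" algorithm and
 * bit 17 selecting the new "wrap copies within far sets" behaviour.  For
 * the takeover value (1 << 8) + 2 this yields near = 2, far = 1.  The
 * helper name is hypothetical.
 */
static void __maybe_unused raid10_layout_example(int layout)
{
	int near_copies  = layout & 255;	/* (1<<8)+2 -> 2 */
	int far_copies   = (layout >> 8) & 255;	/* (1<<8)+2 -> 1 */
	int far_offset   = (layout >> 16) & 1;	/* "offset" vs "far" */
	int use_far_sets = (layout >> 17) & 1;	/* new in this patch series */

	pr_debug("near=%d far=%d offset=%d far_sets=%d\n",
		 near_copies, far_copies, far_offset, use_far_sets);
}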
3862 3884
3863 static void *raid10_takeover(struct mddev *mddev) 3885 static void *raid10_takeover(struct mddev *mddev)
3864 { 3886 {
3865 struct r0conf *raid0_conf; 3887 struct r0conf *raid0_conf;
3866 3888
3867 /* raid10 can take over: 3889 /* raid10 can take over:
3868 * raid0 - providing it has only two drives 3890 * raid0 - providing it has only two drives
3869 */ 3891 */
3870 if (mddev->level == 0) { 3892 if (mddev->level == 0) {
3871 /* for raid0 takeover only one zone is supported */ 3893 /* for raid0 takeover only one zone is supported */
3872 raid0_conf = mddev->private; 3894 raid0_conf = mddev->private;
3873 if (raid0_conf->nr_strip_zones > 1) { 3895 if (raid0_conf->nr_strip_zones > 1) {
3874 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0" 3896 printk(KERN_ERR "md/raid10:%s: cannot takeover raid 0"
3875 " with more than one zone.\n", 3897 " with more than one zone.\n",
3876 mdname(mddev)); 3898 mdname(mddev));
3877 return ERR_PTR(-EINVAL); 3899 return ERR_PTR(-EINVAL);
3878 } 3900 }
3879 return raid10_takeover_raid0(mddev); 3901 return raid10_takeover_raid0(mddev);
3880 } 3902 }
3881 return ERR_PTR(-EINVAL); 3903 return ERR_PTR(-EINVAL);
3882 } 3904 }
3883 3905
3884 static int raid10_check_reshape(struct mddev *mddev) 3906 static int raid10_check_reshape(struct mddev *mddev)
3885 { 3907 {
3886 /* Called when there is a request to change 3908 /* Called when there is a request to change
3887 * - layout (to ->new_layout) 3909 * - layout (to ->new_layout)
3888 * - chunk size (to ->new_chunk_sectors) 3910 * - chunk size (to ->new_chunk_sectors)
3889 * - raid_disks (by delta_disks) 3911 * - raid_disks (by delta_disks)
3890 * or when trying to restart a reshape that was ongoing. 3912 * or when trying to restart a reshape that was ongoing.
3891 * 3913 *
3892 * We need to validate the request and possibly allocate 3914 * We need to validate the request and possibly allocate
3893 * space if that might be an issue later. 3915 * space if that might be an issue later.
3894 * 3916 *
3895 * Currently we reject any reshape of a 'far' mode array, 3917 * Currently we reject any reshape of a 'far' mode array,
3896 * allow chunk size to change if new is generally acceptable, 3918 * allow chunk size to change if new is generally acceptable,
3897 * allow raid_disks to increase, and allow 3919 * allow raid_disks to increase, and allow
3898 * a switch between 'near' mode and 'offset' mode. 3920 * a switch between 'near' mode and 'offset' mode.
3899 */ 3921 */
3900 struct r10conf *conf = mddev->private; 3922 struct r10conf *conf = mddev->private;
3901 struct geom geo; 3923 struct geom geo;
3902 3924
3903 if (conf->geo.far_copies != 1 && !conf->geo.far_offset) 3925 if (conf->geo.far_copies != 1 && !conf->geo.far_offset)
3904 return -EINVAL; 3926 return -EINVAL;
3905 3927
3906 if (setup_geo(&geo, mddev, geo_start) != conf->copies) 3928 if (setup_geo(&geo, mddev, geo_start) != conf->copies)
3907 /* mustn't change number of copies */ 3929 /* mustn't change number of copies */
3908 return -EINVAL; 3930 return -EINVAL;
3909 if (geo.far_copies > 1 && !geo.far_offset) 3931 if (geo.far_copies > 1 && !geo.far_offset)
3910 /* Cannot switch to 'far' mode */ 3932 /* Cannot switch to 'far' mode */
3911 return -EINVAL; 3933 return -EINVAL;
3912 3934
3913 if (mddev->array_sectors & geo.chunk_mask) 3935 if (mddev->array_sectors & geo.chunk_mask)
3914 /* not factor of array size */ 3936 /* not factor of array size */
3915 return -EINVAL; 3937 return -EINVAL;
3916 3938
3917 if (!enough(conf, -1)) 3939 if (!enough(conf, -1))
3918 return -EINVAL; 3940 return -EINVAL;
3919 3941
3920 kfree(conf->mirrors_new); 3942 kfree(conf->mirrors_new);
3921 conf->mirrors_new = NULL; 3943 conf->mirrors_new = NULL;
3922 if (mddev->delta_disks > 0) { 3944 if (mddev->delta_disks > 0) {
3923 /* allocate new 'mirrors' list */ 3945 /* allocate new 'mirrors' list */
3924 conf->mirrors_new = kzalloc( 3946 conf->mirrors_new = kzalloc(
3925 sizeof(struct raid10_info) 3947 sizeof(struct raid10_info)
3926 *(mddev->raid_disks + 3948 *(mddev->raid_disks +
3927 mddev->delta_disks), 3949 mddev->delta_disks),
3928 GFP_KERNEL); 3950 GFP_KERNEL);
3929 if (!conf->mirrors_new) 3951 if (!conf->mirrors_new)
3930 return -ENOMEM; 3952 return -ENOMEM;
3931 } 3953 }
3932 return 0; 3954 return 0;
3933 } 3955 }
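/*
 * For example (hypothetical geometries): a 4-device near=2 array may be
 * reshaped to 6 devices, or switched to an offset=2 layout, since the
 * number of copies stays at 2; a request that would change the copy count,
 * or whose target layout is plain 'far' (far_copies > 1 with
 * far_offset == 0), is rejected with -EINVAL by the checks above.
 */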
3934 3956
3935 /* 3957 /*
3936 * Need to check if array has failed when deciding whether to: 3958 * Need to check if array has failed when deciding whether to:
3937 * - start an array 3959 * - start an array
3938 * - remove non-faulty devices 3960 * - remove non-faulty devices
3939 * - add a spare 3961 * - add a spare
3940 * - allow a reshape 3962 * - allow a reshape
3941 * This determination is simple when no reshape is happening. 3963 * This determination is simple when no reshape is happening.
3942 * However if there is a reshape, we need to carefully check 3964 * However if there is a reshape, we need to carefully check
3943 * both the before and after sections. 3965 * both the before and after sections.
3944 * This is because some failed devices may only affect one 3966 * This is because some failed devices may only affect one
3945 * of the two sections, and some non-in_sync devices may 3967 * of the two sections, and some non-in_sync devices may
3946 * be insync in the section most affected by failed devices. 3968 * be insync in the section most affected by failed devices.
3947 */ 3969 */
3948 static int calc_degraded(struct r10conf *conf) 3970 static int calc_degraded(struct r10conf *conf)
3949 { 3971 {
3950 int degraded, degraded2; 3972 int degraded, degraded2;
3951 int i; 3973 int i;
3952 3974
3953 rcu_read_lock(); 3975 rcu_read_lock();
3954 degraded = 0; 3976 degraded = 0;
3955 /* 'prev' section first */ 3977 /* 'prev' section first */
3956 for (i = 0; i < conf->prev.raid_disks; i++) { 3978 for (i = 0; i < conf->prev.raid_disks; i++) {
3957 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3979 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3958 if (!rdev || test_bit(Faulty, &rdev->flags)) 3980 if (!rdev || test_bit(Faulty, &rdev->flags))
3959 degraded++; 3981 degraded++;
3960 else if (!test_bit(In_sync, &rdev->flags)) 3982 else if (!test_bit(In_sync, &rdev->flags))
3961 /* When we can reduce the number of devices in 3983 /* When we can reduce the number of devices in
3962 * an array, this might not contribute to 3984 * an array, this might not contribute to
3963 * 'degraded'. It does now. 3985 * 'degraded'. It does now.
3964 */ 3986 */
3965 degraded++; 3987 degraded++;
3966 } 3988 }
3967 rcu_read_unlock(); 3989 rcu_read_unlock();
3968 if (conf->geo.raid_disks == conf->prev.raid_disks) 3990 if (conf->geo.raid_disks == conf->prev.raid_disks)
3969 return degraded; 3991 return degraded;
3970 rcu_read_lock(); 3992 rcu_read_lock();
3971 degraded2 = 0; 3993 degraded2 = 0;
3972 for (i = 0; i < conf->geo.raid_disks; i++) { 3994 for (i = 0; i < conf->geo.raid_disks; i++) {
3973 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 3995 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
3974 if (!rdev || test_bit(Faulty, &rdev->flags)) 3996 if (!rdev || test_bit(Faulty, &rdev->flags))
3975 degraded2++; 3997 degraded2++;
3976 else if (!test_bit(In_sync, &rdev->flags)) { 3998 else if (!test_bit(In_sync, &rdev->flags)) {
3977 /* If reshape is increasing the number of devices, 3999 /* If reshape is increasing the number of devices,
3978 * this section has already been recovered, so 4000 * this section has already been recovered, so
3979 * it doesn't contribute to degraded. 4001 * it doesn't contribute to degraded.
3980 * else it does. 4002 * else it does.
3981 */ 4003 */
3982 if (conf->geo.raid_disks <= conf->prev.raid_disks) 4004 if (conf->geo.raid_disks <= conf->prev.raid_disks)
3983 degraded2++; 4005 degraded2++;
3984 } 4006 }
3985 } 4007 }
3986 rcu_read_unlock(); 4008 rcu_read_unlock();
3987 if (degraded2 > degraded) 4009 if (degraded2 > degraded)
3988 return degraded2; 4010 return degraded2;
3989 return degraded; 4011 return degraded;
3990 } 4012 }
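/*
 * Example of the two-pass count above (hypothetical array): while growing
 * from 4 to 6 devices, a device being rebuilt into slot 2 (present but not
 * In_sync) counts against the 'prev' pass but not against the 'geo' pass,
 * because a section that is growing is treated as already recovered; the
 * larger of the two counts is what ends up in mddev->degraded.
 */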
3991 4013
3992 static int raid10_start_reshape(struct mddev *mddev) 4014 static int raid10_start_reshape(struct mddev *mddev)
3993 { 4015 {
3994 /* A 'reshape' has been requested. This commits 4016 /* A 'reshape' has been requested. This commits
3995 * the various 'new' fields and sets MD_RECOVERY_RESHAPE 4017 * the various 'new' fields and sets MD_RECOVERY_RESHAPE
3996 * This also checks if there are enough spares and adds them 4018 * This also checks if there are enough spares and adds them
3997 * to the array. 4019 * to the array.
3998 * We currently require enough spares to make the final 4020 * We currently require enough spares to make the final
3999 * array non-degraded. We also require that the difference 4021 * array non-degraded. We also require that the difference
4000 * between old and new data_offset - on each device - is 4022 * between old and new data_offset - on each device - is
4001 * enough that we never risk over-writing. 4023 * enough that we never risk over-writing.
4002 */ 4024 */
4003 4025
4004 unsigned long before_length, after_length; 4026 unsigned long before_length, after_length;
4005 sector_t min_offset_diff = 0; 4027 sector_t min_offset_diff = 0;
4006 int first = 1; 4028 int first = 1;
4007 struct geom new; 4029 struct geom new;
4008 struct r10conf *conf = mddev->private; 4030 struct r10conf *conf = mddev->private;
4009 struct md_rdev *rdev; 4031 struct md_rdev *rdev;
4010 int spares = 0; 4032 int spares = 0;
4011 int ret; 4033 int ret;
4012 4034
4013 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4035 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4014 return -EBUSY; 4036 return -EBUSY;
4015 4037
4016 if (setup_geo(&new, mddev, geo_start) != conf->copies) 4038 if (setup_geo(&new, mddev, geo_start) != conf->copies)
4017 return -EINVAL; 4039 return -EINVAL;
4018 4040
4019 before_length = ((1 << conf->prev.chunk_shift) * 4041 before_length = ((1 << conf->prev.chunk_shift) *
4020 conf->prev.far_copies); 4042 conf->prev.far_copies);
4021 after_length = ((1 << conf->geo.chunk_shift) * 4043 after_length = ((1 << conf->geo.chunk_shift) *
4022 conf->geo.far_copies); 4044 conf->geo.far_copies);
4023 4045
4024 rdev_for_each(rdev, mddev) { 4046 rdev_for_each(rdev, mddev) {
4025 if (!test_bit(In_sync, &rdev->flags) 4047 if (!test_bit(In_sync, &rdev->flags)
4026 && !test_bit(Faulty, &rdev->flags)) 4048 && !test_bit(Faulty, &rdev->flags))
4027 spares++; 4049 spares++;
4028 if (rdev->raid_disk >= 0) { 4050 if (rdev->raid_disk >= 0) {
4029 long long diff = (rdev->new_data_offset 4051 long long diff = (rdev->new_data_offset
4030 - rdev->data_offset); 4052 - rdev->data_offset);
4031 if (!mddev->reshape_backwards) 4053 if (!mddev->reshape_backwards)
4032 diff = -diff; 4054 diff = -diff;
4033 if (diff < 0) 4055 if (diff < 0)
4034 diff = 0; 4056 diff = 0;
4035 if (first || diff < min_offset_diff) 4057 if (first || diff < min_offset_diff)
4036 min_offset_diff = diff; 4058 min_offset_diff = diff;
4037 } 4059 }
4038 } 4060 }
4039 4061
4040 if (max(before_length, after_length) > min_offset_diff) 4062 if (max(before_length, after_length) > min_offset_diff)
4041 return -EINVAL; 4063 return -EINVAL;
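	/*
	 * Worked example with made-up offsets: for a forward reshape where
	 * every member moves its data start from data_offset = 8192 down to
	 * new_data_offset = 4096, diff = -(4096 - 8192) = 4096 sectors on
	 * each device, so min_offset_diff = 4096.  With 512KiB chunks
	 * (1024 sectors) and far_copies = 1 in both geometries,
	 * max(1024, 1024) <= 4096 and the reshape is allowed to start.
	 */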
4042 4064
4043 if (spares < mddev->delta_disks) 4065 if (spares < mddev->delta_disks)
4044 return -EINVAL; 4066 return -EINVAL;
4045 4067
4046 conf->offset_diff = min_offset_diff; 4068 conf->offset_diff = min_offset_diff;
4047 spin_lock_irq(&conf->device_lock); 4069 spin_lock_irq(&conf->device_lock);
4048 if (conf->mirrors_new) { 4070 if (conf->mirrors_new) {
4049 memcpy(conf->mirrors_new, conf->mirrors, 4071 memcpy(conf->mirrors_new, conf->mirrors,
4050 sizeof(struct raid10_info)*conf->prev.raid_disks); 4072 sizeof(struct raid10_info)*conf->prev.raid_disks);
4051 smp_mb(); 4073 smp_mb();
4052 kfree(conf->mirrors_old); /* FIXME and elsewhere */ 4074 kfree(conf->mirrors_old); /* FIXME and elsewhere */
4053 conf->mirrors_old = conf->mirrors; 4075 conf->mirrors_old = conf->mirrors;
4054 conf->mirrors = conf->mirrors_new; 4076 conf->mirrors = conf->mirrors_new;
4055 conf->mirrors_new = NULL; 4077 conf->mirrors_new = NULL;
4056 } 4078 }
4057 setup_geo(&conf->geo, mddev, geo_start); 4079 setup_geo(&conf->geo, mddev, geo_start);
4058 smp_mb(); 4080 smp_mb();
4059 if (mddev->reshape_backwards) { 4081 if (mddev->reshape_backwards) {
4060 sector_t size = raid10_size(mddev, 0, 0); 4082 sector_t size = raid10_size(mddev, 0, 0);
4061 if (size < mddev->array_sectors) { 4083 if (size < mddev->array_sectors) {
4062 spin_unlock_irq(&conf->device_lock); 4084 spin_unlock_irq(&conf->device_lock);
4063 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n", 4085 printk(KERN_ERR "md/raid10:%s: array size must be reduced before number of disks\n",
4064 mdname(mddev)); 4086 mdname(mddev));
4065 return -EINVAL; 4087 return -EINVAL;
4066 } 4088 }
4067 mddev->resync_max_sectors = size; 4089 mddev->resync_max_sectors = size;
4068 conf->reshape_progress = size; 4090 conf->reshape_progress = size;
4069 } else 4091 } else
4070 conf->reshape_progress = 0; 4092 conf->reshape_progress = 0;
4071 spin_unlock_irq(&conf->device_lock); 4093 spin_unlock_irq(&conf->device_lock);
4072 4094
4073 if (mddev->delta_disks && mddev->bitmap) { 4095 if (mddev->delta_disks && mddev->bitmap) {
4074 ret = bitmap_resize(mddev->bitmap, 4096 ret = bitmap_resize(mddev->bitmap,
4075 raid10_size(mddev, 0, 4097 raid10_size(mddev, 0,
4076 conf->geo.raid_disks), 4098 conf->geo.raid_disks),
4077 0, 0); 4099 0, 0);
4078 if (ret) 4100 if (ret)
4079 goto abort; 4101 goto abort;
4080 } 4102 }
4081 if (mddev->delta_disks > 0) { 4103 if (mddev->delta_disks > 0) {
4082 rdev_for_each(rdev, mddev) 4104 rdev_for_each(rdev, mddev)
4083 if (rdev->raid_disk < 0 && 4105 if (rdev->raid_disk < 0 &&
4084 !test_bit(Faulty, &rdev->flags)) { 4106 !test_bit(Faulty, &rdev->flags)) {
4085 if (raid10_add_disk(mddev, rdev) == 0) { 4107 if (raid10_add_disk(mddev, rdev) == 0) {
4086 if (rdev->raid_disk >= 4108 if (rdev->raid_disk >=
4087 conf->prev.raid_disks) 4109 conf->prev.raid_disks)
4088 set_bit(In_sync, &rdev->flags); 4110 set_bit(In_sync, &rdev->flags);
4089 else 4111 else
4090 rdev->recovery_offset = 0; 4112 rdev->recovery_offset = 0;
4091 4113
4092 if (sysfs_link_rdev(mddev, rdev)) 4114 if (sysfs_link_rdev(mddev, rdev))
4093 /* Failure here is OK */; 4115 /* Failure here is OK */;
4094 } 4116 }
4095 } else if (rdev->raid_disk >= conf->prev.raid_disks 4117 } else if (rdev->raid_disk >= conf->prev.raid_disks
4096 && !test_bit(Faulty, &rdev->flags)) { 4118 && !test_bit(Faulty, &rdev->flags)) {
4097 /* This is a spare that was manually added */ 4119 /* This is a spare that was manually added */
4098 set_bit(In_sync, &rdev->flags); 4120 set_bit(In_sync, &rdev->flags);
4099 } 4121 }
4100 } 4122 }
4101 /* When a reshape changes the number of devices, 4123 /* When a reshape changes the number of devices,
4102 * ->degraded is measured against the larger of the 4124 * ->degraded is measured against the larger of the
4103 * pre and post numbers. 4125 * pre and post numbers.
4104 */ 4126 */
4105 spin_lock_irq(&conf->device_lock); 4127 spin_lock_irq(&conf->device_lock);
4106 mddev->degraded = calc_degraded(conf); 4128 mddev->degraded = calc_degraded(conf);
4107 spin_unlock_irq(&conf->device_lock); 4129 spin_unlock_irq(&conf->device_lock);
4108 mddev->raid_disks = conf->geo.raid_disks; 4130 mddev->raid_disks = conf->geo.raid_disks;
4109 mddev->reshape_position = conf->reshape_progress; 4131 mddev->reshape_position = conf->reshape_progress;
4110 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4132 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4111 4133
4112 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4134 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4113 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4135 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4114 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4136 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4115 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4137 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4116 4138
4117 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4139 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4118 "reshape"); 4140 "reshape");
4119 if (!mddev->sync_thread) { 4141 if (!mddev->sync_thread) {
4120 ret = -EAGAIN; 4142 ret = -EAGAIN;
4121 goto abort; 4143 goto abort;
4122 } 4144 }
4123 conf->reshape_checkpoint = jiffies; 4145 conf->reshape_checkpoint = jiffies;
4124 md_wakeup_thread(mddev->sync_thread); 4146 md_wakeup_thread(mddev->sync_thread);
4125 md_new_event(mddev); 4147 md_new_event(mddev);
4126 return 0; 4148 return 0;
4127 4149
4128 abort: 4150 abort:
4129 mddev->recovery = 0; 4151 mddev->recovery = 0;
4130 spin_lock_irq(&conf->device_lock); 4152 spin_lock_irq(&conf->device_lock);
4131 conf->geo = conf->prev; 4153 conf->geo = conf->prev;
4132 mddev->raid_disks = conf->geo.raid_disks; 4154 mddev->raid_disks = conf->geo.raid_disks;
4133 rdev_for_each(rdev, mddev) 4155 rdev_for_each(rdev, mddev)
4134 rdev->new_data_offset = rdev->data_offset; 4156 rdev->new_data_offset = rdev->data_offset;
4135 smp_wmb(); 4157 smp_wmb();
4136 conf->reshape_progress = MaxSector; 4158 conf->reshape_progress = MaxSector;
4137 mddev->reshape_position = MaxSector; 4159 mddev->reshape_position = MaxSector;
4138 spin_unlock_irq(&conf->device_lock); 4160 spin_unlock_irq(&conf->device_lock);
4139 return ret; 4161 return ret;
4140 } 4162 }
4141 4163
4142 /* Calculate the last device-address that could contain 4164 /* Calculate the last device-address that could contain
4143 * any block from the chunk that includes the array-address 's' 4165 * any block from the chunk that includes the array-address 's'
4144 * and report the next address. 4166 * and report the next address.
4145 * i.e. the address returned will be chunk-aligned and after 4167 * i.e. the address returned will be chunk-aligned and after
4146 * any data that is in the chunk containing 's'. 4168 * any data that is in the chunk containing 's'.
4147 */ 4169 */
4148 static sector_t last_dev_address(sector_t s, struct geom *geo) 4170 static sector_t last_dev_address(sector_t s, struct geom *geo)
4149 { 4171 {
4150 s = (s | geo->chunk_mask) + 1; 4172 s = (s | geo->chunk_mask) + 1;
4151 s >>= geo->chunk_shift; 4173 s >>= geo->chunk_shift;
4152 s *= geo->near_copies; 4174 s *= geo->near_copies;
4153 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks); 4175 s = DIV_ROUND_UP_SECTOR_T(s, geo->raid_disks);
4154 s *= geo->far_copies; 4176 s *= geo->far_copies;
4155 s <<= geo->chunk_shift; 4177 s <<= geo->chunk_shift;
4156 return s; 4178 return s;
4157 } 4179 }
4158 4180
4159 /* Calculate the first device-address that could contain 4181 /* Calculate the first device-address that could contain
4160 * any block from the chunk that includes the array-address 's'. 4182 * any block from the chunk that includes the array-address 's'.
4161 * This too will be the start of a chunk 4183 * This too will be the start of a chunk
4162 */ 4184 */
4163 static sector_t first_dev_address(sector_t s, struct geom *geo) 4185 static sector_t first_dev_address(sector_t s, struct geom *geo)
4164 { 4186 {
4165 s >>= geo->chunk_shift; 4187 s >>= geo->chunk_shift;
4166 s *= geo->near_copies; 4188 s *= geo->near_copies;
4167 sector_div(s, geo->raid_disks); 4189 sector_div(s, geo->raid_disks);
4168 s *= geo->far_copies; 4190 s *= geo->far_copies;
4169 s <<= geo->chunk_shift; 4191 s <<= geo->chunk_shift;
4170 return s; 4192 return s;
4171 } 4193 }
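/*
 * Worked example for the two helpers above (hypothetical geometry:
 * raid_disks = 6, near_copies = 2, far_copies = 1, 512KiB chunks, so
 * chunk_shift = 10 and chunk_mask = 1023), for array address s = 5000:
 *
 *	last_dev_address:  (5000 | 1023) + 1 = 5120 -> 5 chunks
 *			   5 * 2 = 10; DIV_ROUND_UP(10, 6) = 2
 *			   2 * 1 = 2 chunks -> 2 << 10 = 2048
 *	first_dev_address: 5000 >> 10 = 4 chunks; 4 * 2 = 8
 *			   8 / 6 = 1 chunk; 1 * 1 = 1 -> 1 << 10 = 1024
 *
 * so any block of the chunk containing array sector 5000 lives between
 * device sectors 1024 and 2048 (exclusive) on some member.
 */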
4172 4194
4173 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, 4195 static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
4174 int *skipped) 4196 int *skipped)
4175 { 4197 {
4176 /* We simply copy at most one chunk (smallest of old and new) 4198 /* We simply copy at most one chunk (smallest of old and new)
4177 * at a time, possibly less if that exceeds RESYNC_PAGES, 4199 * at a time, possibly less if that exceeds RESYNC_PAGES,
4178 * or we hit a bad block or something. 4200 * or we hit a bad block or something.
4179 * This might mean we pause for normal IO in the middle of 4201 * This might mean we pause for normal IO in the middle of
4180 * a chunk, but that is not a problem as mddev->reshape_position 4202 * a chunk, but that is not a problem as mddev->reshape_position
4181 * can record any location. 4203 * can record any location.
4182 * 4204 *
4183 * If we will want to write to a location that isn't 4205 * If we will want to write to a location that isn't
4184 * yet recorded as 'safe' (i.e. in metadata on disk) then 4206 * yet recorded as 'safe' (i.e. in metadata on disk) then
4185 * we need to flush all reshape requests and update the metadata. 4207 * we need to flush all reshape requests and update the metadata.
4186 * 4208 *
4187 * When reshaping forwards (e.g. to more devices), we interpret 4209 * When reshaping forwards (e.g. to more devices), we interpret
4188 * 'safe' as the earliest block which might not have been copied 4210 * 'safe' as the earliest block which might not have been copied
4189 * down yet. We divide this by previous stripe size and multiply 4211 * down yet. We divide this by previous stripe size and multiply
4190 * by previous stripe length to get lowest device offset that we 4212 * by previous stripe length to get lowest device offset that we
4191 * cannot write to yet. 4213 * cannot write to yet.
4192 * We interpret 'sector_nr' as an address that we want to write to. 4214 * We interpret 'sector_nr' as an address that we want to write to.
4193 * From this we use last_dev_address() to find where we might 4215 * From this we use last_dev_address() to find where we might
4194 * write to, and first_dev_address() on the 'safe' position. 4216 * write to, and first_dev_address() on the 'safe' position.
4195 * If this 'next' write position is after the 'safe' position, 4217 * If this 'next' write position is after the 'safe' position,
4196 * we must update the metadata to increase the 'safe' position. 4218 * we must update the metadata to increase the 'safe' position.
4197 * 4219 *
4198 * When reshaping backwards, we round in the opposite direction 4220 * When reshaping backwards, we round in the opposite direction
4199 * and perform the reverse test: next write position must not be 4221 * and perform the reverse test: next write position must not be
4200 * less than current safe position. 4222 * less than current safe position.
4201 * 4223 *
4202 * In all this the minimum difference in data offsets 4224 * In all this the minimum difference in data offsets
4203 * (conf->offset_diff - always positive) allows a bit of slack, 4225 * (conf->offset_diff - always positive) allows a bit of slack,
4204 * so next can be after 'safe', but not by more than offset_diff 4226 * so next can be after 'safe', but not by more than offset_diff
4205 * 4227 *
4206 * We need to prepare all the bios here before we start any IO 4228 * We need to prepare all the bios here before we start any IO
4207 * to ensure the size we choose is acceptable to all devices. 4229 * to ensure the size we choose is acceptable to all devices.
4208 * That means one for each copy for write-out and an extra one for 4230 * That means one for each copy for write-out and an extra one for
4209 * read-in. 4231 * read-in.
4210 * We store the read-in bio in ->master_bio and the others in 4232 * We store the read-in bio in ->master_bio and the others in
4211 * ->devs[x].bio and ->devs[x].repl_bio. 4233 * ->devs[x].bio and ->devs[x].repl_bio.
4212 */ 4234 */
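	/*
	 * Concrete reading of the forward case, with made-up numbers: if the
	 * chunk around reshape_progress maps to a last possible device
	 * address of 2048 in the new layout ('next'), and reshape_safe maps
	 * to a first possible device address of 1024 in the old layout
	 * ('safe'), then with offset_diff = 4096 we have
	 * next <= safe + offset_diff and no metadata flush is needed; once
	 * 'next' creeps past safe + offset_diff the superblock must be
	 * written before any more data is copied.
	 */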
4213 struct r10conf *conf = mddev->private; 4235 struct r10conf *conf = mddev->private;
4214 struct r10bio *r10_bio; 4236 struct r10bio *r10_bio;
4215 sector_t next, safe, last; 4237 sector_t next, safe, last;
4216 int max_sectors; 4238 int max_sectors;
4217 int nr_sectors; 4239 int nr_sectors;
4218 int s; 4240 int s;
4219 struct md_rdev *rdev; 4241 struct md_rdev *rdev;
4220 int need_flush = 0; 4242 int need_flush = 0;
4221 struct bio *blist; 4243 struct bio *blist;
4222 struct bio *bio, *read_bio; 4244 struct bio *bio, *read_bio;
4223 int sectors_done = 0; 4245 int sectors_done = 0;
4224 4246
4225 if (sector_nr == 0) { 4247 if (sector_nr == 0) {
4226 /* If restarting in the middle, skip the initial sectors */ 4248 /* If restarting in the middle, skip the initial sectors */
4227 if (mddev->reshape_backwards && 4249 if (mddev->reshape_backwards &&
4228 conf->reshape_progress < raid10_size(mddev, 0, 0)) { 4250 conf->reshape_progress < raid10_size(mddev, 0, 0)) {
4229 sector_nr = (raid10_size(mddev, 0, 0) 4251 sector_nr = (raid10_size(mddev, 0, 0)
4230 - conf->reshape_progress); 4252 - conf->reshape_progress);
4231 } else if (!mddev->reshape_backwards && 4253 } else if (!mddev->reshape_backwards &&
4232 conf->reshape_progress > 0) 4254 conf->reshape_progress > 0)
4233 sector_nr = conf->reshape_progress; 4255 sector_nr = conf->reshape_progress;
4234 if (sector_nr) { 4256 if (sector_nr) {
4235 mddev->curr_resync_completed = sector_nr; 4257 mddev->curr_resync_completed = sector_nr;
4236 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4258 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4237 *skipped = 1; 4259 *skipped = 1;
4238 return sector_nr; 4260 return sector_nr;
4239 } 4261 }
4240 } 4262 }
4241 4263
4242 /* We don't use sector_nr to track where we are up to 4264 /* We don't use sector_nr to track where we are up to
4243 * as that doesn't work well for ->reshape_backwards. 4265 * as that doesn't work well for ->reshape_backwards.
4244 * So just use ->reshape_progress. 4266 * So just use ->reshape_progress.
4245 */ 4267 */
4246 if (mddev->reshape_backwards) { 4268 if (mddev->reshape_backwards) {
4247 /* 'next' is the earliest device address that we might 4269 /* 'next' is the earliest device address that we might
4248 * write to for this chunk in the new layout 4270 * write to for this chunk in the new layout
4249 */ 4271 */
4250 next = first_dev_address(conf->reshape_progress - 1, 4272 next = first_dev_address(conf->reshape_progress - 1,
4251 &conf->geo); 4273 &conf->geo);
4252 4274
4253 /* 'safe' is the last device address that we might read from 4275 /* 'safe' is the last device address that we might read from
4254 * in the old layout after a restart 4276 * in the old layout after a restart
4255 */ 4277 */
4256 safe = last_dev_address(conf->reshape_safe - 1, 4278 safe = last_dev_address(conf->reshape_safe - 1,
4257 &conf->prev); 4279 &conf->prev);
4258 4280
4259 if (next + conf->offset_diff < safe) 4281 if (next + conf->offset_diff < safe)
4260 need_flush = 1; 4282 need_flush = 1;
4261 4283
4262 last = conf->reshape_progress - 1; 4284 last = conf->reshape_progress - 1;
4263 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask 4285 sector_nr = last & ~(sector_t)(conf->geo.chunk_mask
4264 & conf->prev.chunk_mask); 4286 & conf->prev.chunk_mask);
4265 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last) 4287 if (sector_nr + RESYNC_BLOCK_SIZE/512 < last)
4266 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512; 4288 sector_nr = last + 1 - RESYNC_BLOCK_SIZE/512;
4267 } else { 4289 } else {
4268 /* 'next' is after the last device address that we 4290 /* 'next' is after the last device address that we
4269 * might write to for this chunk in the new layout 4291 * might write to for this chunk in the new layout
4270 */ 4292 */
4271 next = last_dev_address(conf->reshape_progress, &conf->geo); 4293 next = last_dev_address(conf->reshape_progress, &conf->geo);
4272 4294
4273 /* 'safe' is the earliest device address that we might 4295 /* 'safe' is the earliest device address that we might
4274 * read from in the old layout after a restart 4296 * read from in the old layout after a restart
4275 */ 4297 */
4276 safe = first_dev_address(conf->reshape_safe, &conf->prev); 4298 safe = first_dev_address(conf->reshape_safe, &conf->prev);
4277 4299
4278 /* Need to update metadata if 'next' might be beyond 'safe' 4300 /* Need to update metadata if 'next' might be beyond 'safe'
4279 * as that would possibly corrupt data 4301 * as that would possibly corrupt data
4280 */ 4302 */
4281 if (next > safe + conf->offset_diff) 4303 if (next > safe + conf->offset_diff)
4282 need_flush = 1; 4304 need_flush = 1;
4283 4305
4284 sector_nr = conf->reshape_progress; 4306 sector_nr = conf->reshape_progress;
4285 last = sector_nr | (conf->geo.chunk_mask 4307 last = sector_nr | (conf->geo.chunk_mask
4286 & conf->prev.chunk_mask); 4308 & conf->prev.chunk_mask);
4287 4309
4288 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last) 4310 if (sector_nr + RESYNC_BLOCK_SIZE/512 <= last)
4289 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1; 4311 last = sector_nr + RESYNC_BLOCK_SIZE/512 - 1;
4290 } 4312 }
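	/*
	 * Illustration of the copy window chosen above (hypothetical chunk
	 * sizes): with old chunks of 2048 sectors (chunk_mask 0x7ff) and new
	 * chunks of 1024 sectors (chunk_mask 0x3ff), the combined mask is
	 * 0x3ff, so each pass covers at most one 1024-sector chunk (the
	 * smaller of the two), further capped at RESYNC_BLOCK_SIZE/512
	 * sectors of buffer space.
	 */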
4291 4313
4292 if (need_flush || 4314 if (need_flush ||
4293 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4315 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4294 /* Need to update reshape_position in metadata */ 4316 /* Need to update reshape_position in metadata */
4295 wait_barrier(conf); 4317 wait_barrier(conf);
4296 mddev->reshape_position = conf->reshape_progress; 4318 mddev->reshape_position = conf->reshape_progress;
4297 if (mddev->reshape_backwards) 4319 if (mddev->reshape_backwards)
4298 mddev->curr_resync_completed = raid10_size(mddev, 0, 0) 4320 mddev->curr_resync_completed = raid10_size(mddev, 0, 0)
4299 - conf->reshape_progress; 4321 - conf->reshape_progress;
4300 else 4322 else
4301 mddev->curr_resync_completed = conf->reshape_progress; 4323 mddev->curr_resync_completed = conf->reshape_progress;
4302 conf->reshape_checkpoint = jiffies; 4324 conf->reshape_checkpoint = jiffies;
4303 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4325 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4304 md_wakeup_thread(mddev->thread); 4326 md_wakeup_thread(mddev->thread);
4305 wait_event(mddev->sb_wait, mddev->flags == 0 || 4327 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4306 kthread_should_stop()); 4328 kthread_should_stop());
4307 conf->reshape_safe = mddev->reshape_position; 4329 conf->reshape_safe = mddev->reshape_position;
4308 allow_barrier(conf); 4330 allow_barrier(conf);
4309 } 4331 }
4310 4332
4311 read_more: 4333 read_more:
4312 /* Now schedule reads for blocks from sector_nr to last */ 4334 /* Now schedule reads for blocks from sector_nr to last */
4313 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); 4335 r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO);
4314 raise_barrier(conf, sectors_done != 0); 4336 raise_barrier(conf, sectors_done != 0);
4315 atomic_set(&r10_bio->remaining, 0); 4337 atomic_set(&r10_bio->remaining, 0);
4316 r10_bio->mddev = mddev; 4338 r10_bio->mddev = mddev;
4317 r10_bio->sector = sector_nr; 4339 r10_bio->sector = sector_nr;
4318 set_bit(R10BIO_IsReshape, &r10_bio->state); 4340 set_bit(R10BIO_IsReshape, &r10_bio->state);
4319 r10_bio->sectors = last - sector_nr + 1; 4341 r10_bio->sectors = last - sector_nr + 1;
4320 rdev = read_balance(conf, r10_bio, &max_sectors); 4342 rdev = read_balance(conf, r10_bio, &max_sectors);
4321 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state)); 4343 BUG_ON(!test_bit(R10BIO_Previous, &r10_bio->state));
4322 4344
4323 if (!rdev) { 4345 if (!rdev) {
4324 /* Cannot read from here, so need to record bad blocks 4346 /* Cannot read from here, so need to record bad blocks
4325 * on all the target devices. 4347 * on all the target devices.
4326 */ 4348 */
4327 // FIXME 4349 // FIXME
4328 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4350 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4329 return sectors_done; 4351 return sectors_done;
4330 } 4352 }
4331 4353
4332 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev); 4354 read_bio = bio_alloc_mddev(GFP_KERNEL, RESYNC_PAGES, mddev);
4333 4355
4334 read_bio->bi_bdev = rdev->bdev; 4356 read_bio->bi_bdev = rdev->bdev;
4335 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr 4357 read_bio->bi_sector = (r10_bio->devs[r10_bio->read_slot].addr
4336 + rdev->data_offset); 4358 + rdev->data_offset);
4337 read_bio->bi_private = r10_bio; 4359 read_bio->bi_private = r10_bio;
4338 read_bio->bi_end_io = end_sync_read; 4360 read_bio->bi_end_io = end_sync_read;
4339 read_bio->bi_rw = READ; 4361 read_bio->bi_rw = READ;
4340 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1); 4362 read_bio->bi_flags &= ~(BIO_POOL_MASK - 1);
4341 read_bio->bi_flags |= 1 << BIO_UPTODATE; 4363 read_bio->bi_flags |= 1 << BIO_UPTODATE;
4342 read_bio->bi_vcnt = 0; 4364 read_bio->bi_vcnt = 0;
4343 read_bio->bi_idx = 0; 4365 read_bio->bi_idx = 0;
4344 read_bio->bi_size = 0; 4366 read_bio->bi_size = 0;
4345 r10_bio->master_bio = read_bio; 4367 r10_bio->master_bio = read_bio;
4346 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum; 4368 r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
4347 4369
4348 /* Now find the locations in the new layout */ 4370 /* Now find the locations in the new layout */
4349 __raid10_find_phys(&conf->geo, r10_bio); 4371 __raid10_find_phys(&conf->geo, r10_bio);
4350 4372
4351 blist = read_bio; 4373 blist = read_bio;
4352 read_bio->bi_next = NULL; 4374 read_bio->bi_next = NULL;
4353 4375
4354 for (s = 0; s < conf->copies*2; s++) { 4376 for (s = 0; s < conf->copies*2; s++) {
4355 struct bio *b; 4377 struct bio *b;
4356 int d = r10_bio->devs[s/2].devnum; 4378 int d = r10_bio->devs[s/2].devnum;
4357 struct md_rdev *rdev2; 4379 struct md_rdev *rdev2;
4358 if (s&1) { 4380 if (s&1) {
4359 rdev2 = conf->mirrors[d].replacement; 4381 rdev2 = conf->mirrors[d].replacement;
4360 b = r10_bio->devs[s/2].repl_bio; 4382 b = r10_bio->devs[s/2].repl_bio;
4361 } else { 4383 } else {
4362 rdev2 = conf->mirrors[d].rdev; 4384 rdev2 = conf->mirrors[d].rdev;
4363 b = r10_bio->devs[s/2].bio; 4385 b = r10_bio->devs[s/2].bio;
4364 } 4386 }
4365 if (!rdev2 || test_bit(Faulty, &rdev2->flags)) 4387 if (!rdev2 || test_bit(Faulty, &rdev2->flags))
4366 continue; 4388 continue;
4367 b->bi_bdev = rdev2->bdev; 4389 b->bi_bdev = rdev2->bdev;
4368 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset; 4390 b->bi_sector = r10_bio->devs[s/2].addr + rdev2->new_data_offset;
4369 b->bi_private = r10_bio; 4391 b->bi_private = r10_bio;
4370 b->bi_end_io = end_reshape_write; 4392 b->bi_end_io = end_reshape_write;
4371 b->bi_rw = WRITE; 4393 b->bi_rw = WRITE;
4372 b->bi_flags &= ~(BIO_POOL_MASK - 1); 4394 b->bi_flags &= ~(BIO_POOL_MASK - 1);
4373 b->bi_flags |= 1 << BIO_UPTODATE; 4395 b->bi_flags |= 1 << BIO_UPTODATE;
4374 b->bi_next = blist; 4396 b->bi_next = blist;
4375 b->bi_vcnt = 0; 4397 b->bi_vcnt = 0;
4376 b->bi_idx = 0; 4398 b->bi_idx = 0;
4377 b->bi_size = 0; 4399 b->bi_size = 0;
4378 blist = b; 4400 blist = b;
4379 } 4401 }
4380 4402
4381 /* Now add as many pages as possible to all of these bios. */ 4403 /* Now add as many pages as possible to all of these bios. */
4382 4404
4383 nr_sectors = 0; 4405 nr_sectors = 0;
4384 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) { 4406 for (s = 0 ; s < max_sectors; s += PAGE_SIZE >> 9) {
4385 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page; 4407 struct page *page = r10_bio->devs[0].bio->bi_io_vec[s/(PAGE_SIZE>>9)].bv_page;
4386 int len = (max_sectors - s) << 9; 4408 int len = (max_sectors - s) << 9;
4387 if (len > PAGE_SIZE) 4409 if (len > PAGE_SIZE)
4388 len = PAGE_SIZE; 4410 len = PAGE_SIZE;
4389 for (bio = blist; bio ; bio = bio->bi_next) { 4411 for (bio = blist; bio ; bio = bio->bi_next) {
4390 struct bio *bio2; 4412 struct bio *bio2;
4391 if (bio_add_page(bio, page, len, 0)) 4413 if (bio_add_page(bio, page, len, 0))
4392 continue; 4414 continue;
4393 4415
4394 /* Didn't fit, must stop */ 4416 /* Didn't fit, must stop */
4395 for (bio2 = blist; 4417 for (bio2 = blist;
4396 bio2 && bio2 != bio; 4418 bio2 && bio2 != bio;
4397 bio2 = bio2->bi_next) { 4419 bio2 = bio2->bi_next) {
4398 /* Remove last page from this bio */ 4420 /* Remove last page from this bio */
4399 bio2->bi_vcnt--; 4421 bio2->bi_vcnt--;
4400 bio2->bi_size -= len; 4422 bio2->bi_size -= len;
4401 bio2->bi_flags &= ~(1<<BIO_SEG_VALID); 4423 bio2->bi_flags &= ~(1<<BIO_SEG_VALID);
4402 } 4424 }
4403 goto bio_full; 4425 goto bio_full;
4404 } 4426 }
4405 sector_nr += len >> 9; 4427 sector_nr += len >> 9;
4406 nr_sectors += len >> 9; 4428 nr_sectors += len >> 9;
4407 } 4429 }
4408 bio_full: 4430 bio_full:
4409 r10_bio->sectors = nr_sectors; 4431 r10_bio->sectors = nr_sectors;
4410 4432
4411 /* Now submit the read */ 4433 /* Now submit the read */
4412 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors); 4434 md_sync_acct(read_bio->bi_bdev, r10_bio->sectors);
4413 atomic_inc(&r10_bio->remaining); 4435 atomic_inc(&r10_bio->remaining);
4414 read_bio->bi_next = NULL; 4436 read_bio->bi_next = NULL;
4415 generic_make_request(read_bio); 4437 generic_make_request(read_bio);
4416 sector_nr += nr_sectors; 4438 sector_nr += nr_sectors;
4417 sectors_done += nr_sectors; 4439 sectors_done += nr_sectors;
4418 if (sector_nr <= last) 4440 if (sector_nr <= last)
4419 goto read_more; 4441 goto read_more;
4420 4442
4421 /* Now that we have done the whole section we can 4443 /* Now that we have done the whole section we can
4422 * update reshape_progress 4444 * update reshape_progress
4423 */ 4445 */
4424 if (mddev->reshape_backwards) 4446 if (mddev->reshape_backwards)
4425 conf->reshape_progress -= sectors_done; 4447 conf->reshape_progress -= sectors_done;
4426 else 4448 else
4427 conf->reshape_progress += sectors_done; 4449 conf->reshape_progress += sectors_done;
4428 4450
4429 return sectors_done; 4451 return sectors_done;
4430 } 4452 }
4431 4453
4432 static void end_reshape_request(struct r10bio *r10_bio); 4454 static void end_reshape_request(struct r10bio *r10_bio);
4433 static int handle_reshape_read_error(struct mddev *mddev, 4455 static int handle_reshape_read_error(struct mddev *mddev,
4434 struct r10bio *r10_bio); 4456 struct r10bio *r10_bio);
4435 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio) 4457 static void reshape_request_write(struct mddev *mddev, struct r10bio *r10_bio)
4436 { 4458 {
4437 /* Reshape read completed. Hopefully we have a block 4459 /* Reshape read completed. Hopefully we have a block
4438 * to write out. 4460 * to write out.
4439 * If we got a read error then we do sync 1-page reads from 4461 * If we got a read error then we do sync 1-page reads from
4440 * elsewhere until we find the data - or give up. 4462 * elsewhere until we find the data - or give up.
4441 */ 4463 */
4442 struct r10conf *conf = mddev->private; 4464 struct r10conf *conf = mddev->private;
4443 int s; 4465 int s;
4444 4466
4445 if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) 4467 if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
4446 if (handle_reshape_read_error(mddev, r10_bio) < 0) { 4468 if (handle_reshape_read_error(mddev, r10_bio) < 0) {
4447 /* Reshape has been aborted */ 4469 /* Reshape has been aborted */
4448 md_done_sync(mddev, r10_bio->sectors, 0); 4470 md_done_sync(mddev, r10_bio->sectors, 0);
4449 return; 4471 return;
4450 } 4472 }
4451 4473
4452 /* We definitely have the data in the pages, schedule the 4474 /* We definitely have the data in the pages, schedule the
4453 * writes. 4475 * writes.
4454 */ 4476 */
4455 atomic_set(&r10_bio->remaining, 1); 4477 atomic_set(&r10_bio->remaining, 1);
4456 for (s = 0; s < conf->copies*2; s++) { 4478 for (s = 0; s < conf->copies*2; s++) {
4457 struct bio *b; 4479 struct bio *b;
4458 int d = r10_bio->devs[s/2].devnum; 4480 int d = r10_bio->devs[s/2].devnum;
4459 struct md_rdev *rdev; 4481 struct md_rdev *rdev;
4460 if (s&1) { 4482 if (s&1) {
4461 rdev = conf->mirrors[d].replacement; 4483 rdev = conf->mirrors[d].replacement;
4462 b = r10_bio->devs[s/2].repl_bio; 4484 b = r10_bio->devs[s/2].repl_bio;
4463 } else { 4485 } else {
4464 rdev = conf->mirrors[d].rdev; 4486 rdev = conf->mirrors[d].rdev;
4465 b = r10_bio->devs[s/2].bio; 4487 b = r10_bio->devs[s/2].bio;
4466 } 4488 }
4467 if (!rdev || test_bit(Faulty, &rdev->flags)) 4489 if (!rdev || test_bit(Faulty, &rdev->flags))
4468 continue; 4490 continue;
4469 atomic_inc(&rdev->nr_pending); 4491 atomic_inc(&rdev->nr_pending);
4470 md_sync_acct(b->bi_bdev, r10_bio->sectors); 4492 md_sync_acct(b->bi_bdev, r10_bio->sectors);
4471 atomic_inc(&r10_bio->remaining); 4493 atomic_inc(&r10_bio->remaining);
4472 b->bi_next = NULL; 4494 b->bi_next = NULL;
4473 generic_make_request(b); 4495 generic_make_request(b);
4474 } 4496 }
4475 end_reshape_request(r10_bio); 4497 end_reshape_request(r10_bio);
4476 } 4498 }
4477 4499
4478 static void end_reshape(struct r10conf *conf) 4500 static void end_reshape(struct r10conf *conf)
4479 { 4501 {
4480 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) 4502 if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery))
4481 return; 4503 return;
4482 4504
4483 spin_lock_irq(&conf->device_lock); 4505 spin_lock_irq(&conf->device_lock);
4484 conf->prev = conf->geo; 4506 conf->prev = conf->geo;
4485 md_finish_reshape(conf->mddev); 4507 md_finish_reshape(conf->mddev);
4486 smp_wmb(); 4508 smp_wmb();
4487 conf->reshape_progress = MaxSector; 4509 conf->reshape_progress = MaxSector;
4488 spin_unlock_irq(&conf->device_lock); 4510 spin_unlock_irq(&conf->device_lock);
4489 4511
4490 /* read-ahead size must cover two whole stripes, which is 4512 /* read-ahead size must cover two whole stripes, which is
4491 * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies 4513 * 2 * (datadisks) * chunksize, where 'datadisks' is raid_disks / near_copies
4492 */ 4514 */
4493 if (conf->mddev->queue) { 4515 if (conf->mddev->queue) {
4494 int stripe = conf->geo.raid_disks * 4516 int stripe = conf->geo.raid_disks *
4495 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); 4517 ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE);
4496 stripe /= conf->geo.near_copies; 4518 stripe /= conf->geo.near_copies;
4497 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4519 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4498 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4520 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4499 } 4521 }
4500 conf->fullsync = 0; 4522 conf->fullsync = 0;
4501 } 4523 }
4502 4524
4503 4525
4504 static int handle_reshape_read_error(struct mddev *mddev, 4526 static int handle_reshape_read_error(struct mddev *mddev,
4505 struct r10bio *r10_bio) 4527 struct r10bio *r10_bio)
4506 { 4528 {
4507 /* Use sync reads to get the blocks from somewhere else */ 4529 /* Use sync reads to get the blocks from somewhere else */
4508 int sectors = r10_bio->sectors; 4530 int sectors = r10_bio->sectors;
4509 struct r10conf *conf = mddev->private; 4531 struct r10conf *conf = mddev->private;
4510 struct { 4532 struct {
4511 struct r10bio r10_bio; 4533 struct r10bio r10_bio;
4512 struct r10dev devs[conf->copies]; 4534 struct r10dev devs[conf->copies];
4513 } on_stack; 4535 } on_stack;
4514 struct r10bio *r10b = &on_stack.r10_bio; 4536 struct r10bio *r10b = &on_stack.r10_bio;
4515 int slot = 0; 4537 int slot = 0;
4516 int idx = 0; 4538 int idx = 0;
4517 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec; 4539 struct bio_vec *bvec = r10_bio->master_bio->bi_io_vec;
4518 4540
4519 r10b->sector = r10_bio->sector; 4541 r10b->sector = r10_bio->sector;
4520 __raid10_find_phys(&conf->prev, r10b); 4542 __raid10_find_phys(&conf->prev, r10b);
4521 4543
4522 while (sectors) { 4544 while (sectors) {
4523 int s = sectors; 4545 int s = sectors;
4524 int success = 0; 4546 int success = 0;
4525 int first_slot = slot; 4547 int first_slot = slot;
4526 4548
4527 if (s > (PAGE_SIZE >> 9)) 4549 if (s > (PAGE_SIZE >> 9))
4528 s = PAGE_SIZE >> 9; 4550 s = PAGE_SIZE >> 9;
4529 4551
4530 while (!success) { 4552 while (!success) {
4531 int d = r10b->devs[slot].devnum; 4553 int d = r10b->devs[slot].devnum;
4532 struct md_rdev *rdev = conf->mirrors[d].rdev; 4554 struct md_rdev *rdev = conf->mirrors[d].rdev;
4533 sector_t addr; 4555 sector_t addr;
4534 if (rdev == NULL || 4556 if (rdev == NULL ||
4535 test_bit(Faulty, &rdev->flags) || 4557 test_bit(Faulty, &rdev->flags) ||
4536 !test_bit(In_sync, &rdev->flags)) 4558 !test_bit(In_sync, &rdev->flags))
4537 goto failed; 4559 goto failed;
4538 4560
4539 addr = r10b->devs[slot].addr + idx * PAGE_SIZE; 4561 addr = r10b->devs[slot].addr + idx * PAGE_SIZE;
4540 success = sync_page_io(rdev, 4562 success = sync_page_io(rdev,
4541 addr, 4563 addr,
4542 s << 9, 4564 s << 9,
4543 bvec[idx].bv_page, 4565 bvec[idx].bv_page,
4544 READ, false); 4566 READ, false);
4545 if (success) 4567 if (success)
4546 break; 4568 break;
4547 failed: 4569 failed:
4548 slot++; 4570 slot++;
4549 if (slot >= conf->copies) 4571 if (slot >= conf->copies)
4550 slot = 0; 4572 slot = 0;
4551 if (slot == first_slot) 4573 if (slot == first_slot)
4552 break; 4574 break;
4553 } 4575 }
4554 if (!success) { 4576 if (!success) {
4555 /* couldn't read this block, must give up */ 4577 /* couldn't read this block, must give up */
4556 set_bit(MD_RECOVERY_INTR, 4578 set_bit(MD_RECOVERY_INTR,
4557 &mddev->recovery); 4579 &mddev->recovery);
4558 return -EIO; 4580 return -EIO;
4559 } 4581 }
4560 sectors -= s; 4582 sectors -= s;
4561 idx++; 4583 idx++;
4562 } 4584 }
4563 return 0; 4585 return 0;
4564 } 4586 }
4565 4587
4566 static void end_reshape_write(struct bio *bio, int error) 4588 static void end_reshape_write(struct bio *bio, int error)
4567 { 4589 {
4568 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); 4590 int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
4569 struct r10bio *r10_bio = bio->bi_private; 4591 struct r10bio *r10_bio = bio->bi_private;
4570 struct mddev *mddev = r10_bio->mddev; 4592 struct mddev *mddev = r10_bio->mddev;
4571 struct r10conf *conf = mddev->private; 4593 struct r10conf *conf = mddev->private;
4572 int d; 4594 int d;
4573 int slot; 4595 int slot;
4574 int repl; 4596 int repl;
4575 struct md_rdev *rdev = NULL; 4597 struct md_rdev *rdev = NULL;
4576 4598
4577 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 4599 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
4578 if (repl) 4600 if (repl)
4579 rdev = conf->mirrors[d].replacement; 4601 rdev = conf->mirrors[d].replacement;
4580 if (!rdev) { 4602 if (!rdev) {
4581 smp_mb(); 4603 smp_mb();
4582 rdev = conf->mirrors[d].rdev; 4604 rdev = conf->mirrors[d].rdev;
4583 } 4605 }
4584 4606
4585 if (!uptodate) { 4607 if (!uptodate) {
4586 /* FIXME should record badblock */ 4608 /* FIXME should record badblock */
4587 md_error(mddev, rdev); 4609 md_error(mddev, rdev);
4588 } 4610 }
4589 4611
4590 rdev_dec_pending(rdev, mddev); 4612 rdev_dec_pending(rdev, mddev);
4591 end_reshape_request(r10_bio); 4613 end_reshape_request(r10_bio);
4592 } 4614 }
4593 4615
4594 static void end_reshape_request(struct r10bio *r10_bio) 4616 static void end_reshape_request(struct r10bio *r10_bio)
4595 { 4617 {
4596 if (!atomic_dec_and_test(&r10_bio->remaining)) 4618 if (!atomic_dec_and_test(&r10_bio->remaining))
4597 return; 4619 return;
4598 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1); 4620 md_done_sync(r10_bio->mddev, r10_bio->sectors, 1);
4599 bio_put(r10_bio->master_bio); 4621 bio_put(r10_bio->master_bio);
4600 put_buf(r10_bio); 4622 put_buf(r10_bio);
4601 } 4623 }
4602 4624
4603 static void raid10_finish_reshape(struct mddev *mddev) 4625 static void raid10_finish_reshape(struct mddev *mddev)
4604 { 4626 {
4605 struct r10conf *conf = mddev->private; 4627 struct r10conf *conf = mddev->private;
4606 4628
4607 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 4629 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
4608 return; 4630 return;
4609 4631
4610 if (mddev->delta_disks > 0) { 4632 if (mddev->delta_disks > 0) {
4611 sector_t size = raid10_size(mddev, 0, 0); 4633 sector_t size = raid10_size(mddev, 0, 0);
4612 md_set_array_sectors(mddev, size); 4634 md_set_array_sectors(mddev, size);
4613 if (mddev->recovery_cp > mddev->resync_max_sectors) { 4635 if (mddev->recovery_cp > mddev->resync_max_sectors) {
4614 mddev->recovery_cp = mddev->resync_max_sectors; 4636 mddev->recovery_cp = mddev->resync_max_sectors;
4615 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4637 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4616 } 4638 }
4617 mddev->resync_max_sectors = size; 4639 mddev->resync_max_sectors = size;
4618 set_capacity(mddev->gendisk, mddev->array_sectors); 4640 set_capacity(mddev->gendisk, mddev->array_sectors);
4619 revalidate_disk(mddev->gendisk); 4641 revalidate_disk(mddev->gendisk);
4620 } else { 4642 } else {
4621 int d; 4643 int d;
4622 for (d = conf->geo.raid_disks ; 4644 for (d = conf->geo.raid_disks ;
4623 d < conf->geo.raid_disks - mddev->delta_disks; 4645 d < conf->geo.raid_disks - mddev->delta_disks;
4624 d++) { 4646 d++) {
4625 struct md_rdev *rdev = conf->mirrors[d].rdev; 4647 struct md_rdev *rdev = conf->mirrors[d].rdev;
4626 if (rdev) 4648 if (rdev)
4627 clear_bit(In_sync, &rdev->flags); 4649 clear_bit(In_sync, &rdev->flags);
4628 rdev = conf->mirrors[d].replacement; 4650 rdev = conf->mirrors[d].replacement;
4629 if (rdev) 4651 if (rdev)
4630 clear_bit(In_sync, &rdev->flags); 4652 clear_bit(In_sync, &rdev->flags);
4631 } 4653 }
4632 } 4654 }
4633 mddev->layout = mddev->new_layout; 4655 mddev->layout = mddev->new_layout;
4634 mddev->chunk_sectors = 1 << conf->geo.chunk_shift; 4656 mddev->chunk_sectors = 1 << conf->geo.chunk_shift;
4635 mddev->reshape_position = MaxSector; 4657 mddev->reshape_position = MaxSector;
4636 mddev->delta_disks = 0; 4658 mddev->delta_disks = 0;
4637 mddev->reshape_backwards = 0; 4659 mddev->reshape_backwards = 0;
4638 } 4660 }
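
A note on the shrink branch of raid10_finish_reshape() above: mddev->delta_disks is negative there, so the loop bound conf->geo.raid_disks - mddev->delta_disks is the old, larger disk count, and the loop clears In_sync on exactly the devices that have just been dropped. As a worked example (hypothetical numbers, not from a real array): shrinking from 6 to 4 disks leaves geo.raid_disks == 4 and delta_disks == -2, so d runs over 4 and 5 and those two mirrors, plus their replacements if present, are marked out of sync.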
4639 4661
4640 static struct md_personality raid10_personality = 4662 static struct md_personality raid10_personality =
4641 { 4663 {
4642 .name = "raid10", 4664 .name = "raid10",
4643 .level = 10, 4665 .level = 10,
4644 .owner = THIS_MODULE, 4666 .owner = THIS_MODULE,
4645 .make_request = make_request, 4667 .make_request = make_request,
4646 .run = run, 4668 .run = run,
4647 .stop = stop, 4669 .stop = stop,
4648 .status = status, 4670 .status = status,
4649 .error_handler = error, 4671 .error_handler = error,
4650 .hot_add_disk = raid10_add_disk, 4672 .hot_add_disk = raid10_add_disk,
4651 .hot_remove_disk= raid10_remove_disk, 4673 .hot_remove_disk= raid10_remove_disk,
4652 .spare_active = raid10_spare_active, 4674 .spare_active = raid10_spare_active,
4653 .sync_request = sync_request, 4675 .sync_request = sync_request,
4654 .quiesce = raid10_quiesce, 4676 .quiesce = raid10_quiesce,
4655 .size = raid10_size, 4677 .size = raid10_size,
4656 .resize = raid10_resize, 4678 .resize = raid10_resize,
4657 .takeover = raid10_takeover, 4679 .takeover = raid10_takeover,
4658 .check_reshape = raid10_check_reshape, 4680 .check_reshape = raid10_check_reshape,
4659 .start_reshape = raid10_start_reshape, 4681 .start_reshape = raid10_start_reshape,
4660 .finish_reshape = raid10_finish_reshape, 4682 .finish_reshape = raid10_finish_reshape,
4661 }; 4683 };
4662 4684
4663 static int __init raid_init(void) 4685 static int __init raid_init(void)
4664 { 4686 {
4665 return register_md_personality(&raid10_personality); 4687 return register_md_personality(&raid10_personality);
4666 } 4688 }
4667 4689
4668 static void raid_exit(void) 4690 static void raid_exit(void)
4669 { 4691 {
4670 unregister_md_personality(&raid10_personality); 4692 unregister_md_personality(&raid10_personality);
4671 } 4693 }
4672 4694
4673 module_init(raid_init); 4695 module_init(raid_init);
4674 module_exit(raid_exit); 4696 module_exit(raid_exit);
4675 MODULE_LICENSE("GPL"); 4697 MODULE_LICENSE("GPL");
4676 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); 4698 MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
4677 MODULE_ALIAS("md-personality-9"); /* RAID10 */ 4699 MODULE_ALIAS("md-personality-9"); /* RAID10 */
4678 MODULE_ALIAS("md-raid10"); 4700 MODULE_ALIAS("md-raid10");
4679 MODULE_ALIAS("md-level-10"); 4701 MODULE_ALIAS("md-level-10");
4680 4702
1 #ifndef _RAID10_H 1 #ifndef _RAID10_H
2 #define _RAID10_H 2 #define _RAID10_H
3 3
4 struct raid10_info { 4 struct raid10_info {
5 struct md_rdev *rdev, *replacement; 5 struct md_rdev *rdev, *replacement;
6 sector_t head_position; 6 sector_t head_position;
7 int recovery_disabled; /* matches 7 int recovery_disabled; /* matches
8 * mddev->recovery_disabled 8 * mddev->recovery_disabled
9 * when we shouldn't try 9 * when we shouldn't try
10 * recovering this device. 10 * recovering this device.
11 */ 11 */
12 }; 12 };
13 13
14 struct r10conf { 14 struct r10conf {
15 struct mddev *mddev; 15 struct mddev *mddev;
16 struct raid10_info *mirrors; 16 struct raid10_info *mirrors;
17 struct raid10_info *mirrors_new, *mirrors_old; 17 struct raid10_info *mirrors_new, *mirrors_old;
18 spinlock_t device_lock; 18 spinlock_t device_lock;
19 19
20 /* geometry */ 20 /* geometry */
21 struct geom { 21 struct geom {
22 int raid_disks; 22 int raid_disks;
23 int near_copies; /* number of copies laid out 23 int near_copies; /* number of copies laid out
24 * raid0 style */ 24 * raid0 style */
25 int far_copies; /* number of copies laid out 25 int far_copies; /* number of copies laid out
26 * at large strides across drives 26 * at large strides across drives
27 */ 27 */
28 int far_offset; /* far_copies are offset by 1 28 int far_offset; /* far_copies are offset by 1
29 * stripe instead of many 29 * stripe instead of many
30 */ 30 */
31 sector_t stride; /* distance between far copies. 31 sector_t stride; /* distance between far copies.
32 * This is size / far_copies unless 32 * This is size / far_copies unless
33 * far_offset, in which case it is 33 * far_offset, in which case it is
34 * 1 stripe. 34 * 1 stripe.
35 */ 35 */
36 int far_set_size; /* The number of devices in a set,
37 * where a 'set' are devices that
38 * contain far/offset copies of
39 * each other.
40 */
36 int chunk_shift; /* shift from chunks to sectors */ 41 int chunk_shift; /* shift from chunks to sectors */
37 sector_t chunk_mask; 42 sector_t chunk_mask;
38 } prev, geo; 43 } prev, geo;
39 int copies; /* near_copies * far_copies. 44 int copies; /* near_copies * far_copies.
40 * must be <= raid_disks 45 * must be <= raid_disks
41 */ 46 */
42 47
43 sector_t dev_sectors; /* temp copy of 48 sector_t dev_sectors; /* temp copy of
44 * mddev->dev_sectors */ 49 * mddev->dev_sectors */
45 sector_t reshape_progress; 50 sector_t reshape_progress;
46 sector_t reshape_safe; 51 sector_t reshape_safe;
47 unsigned long reshape_checkpoint; 52 unsigned long reshape_checkpoint;
48 sector_t offset_diff; 53 sector_t offset_diff;
49 54
50 struct list_head retry_list; 55 struct list_head retry_list;
51 /* queue pending writes and submit them on unplug */ 56 /* queue pending writes and submit them on unplug */
52 struct bio_list pending_bio_list; 57 struct bio_list pending_bio_list;
53 int pending_count; 58 int pending_count;
54 59
55 spinlock_t resync_lock; 60 spinlock_t resync_lock;
56 int nr_pending; 61 int nr_pending;
57 int nr_waiting; 62 int nr_waiting;
58 int nr_queued; 63 int nr_queued;
59 int barrier; 64 int barrier;
60 sector_t next_resync; 65 sector_t next_resync;
61 int fullsync; /* set to 1 if a full sync is needed, 66 int fullsync; /* set to 1 if a full sync is needed,
62 * (fresh device added). 67 * (fresh device added).
63 * Cleared when a sync completes. 68 * Cleared when a sync completes.
64 */ 69 */
65 int have_replacement; /* There is at least one 70 int have_replacement; /* There is at least one
66 * replacement device. 71 * replacement device.
67 */ 72 */
68 wait_queue_head_t wait_barrier; 73 wait_queue_head_t wait_barrier;
69 74
70 mempool_t *r10bio_pool; 75 mempool_t *r10bio_pool;
71 mempool_t *r10buf_pool; 76 mempool_t *r10buf_pool;
72 struct page *tmppage; 77 struct page *tmppage;
73 78
74 /* When taking over an array from a different personality, we store 79 /* When taking over an array from a different personality, we store
75 * the new thread here until we fully activate the array. 80 * the new thread here until we fully activate the array.
76 */ 81 */
77 struct md_thread *thread; 82 struct md_thread *thread;
78 }; 83 };
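
The new far_set_size field above is the header-side half of this patch: it records how many adjacent devices form a set, and the shifted far/offset copies wrap inside that set rather than around the whole array (setting far_set_size equal to raid_disks recovers the old behaviour). A rough sketch of the resulting device calculation, ignoring near-copy placement and using a hypothetical helper rather than the driver's real layout code:

	/* Sketch only: which device holds far copy number 'copy' of the
	 * chunk whose primary copy lives on device 'dev'.  With
	 * far_set_size == raid_disks this degenerates to the old
	 * whole-array shift.
	 */
	static int far_copy_device(int dev, int copy, int far_set_size)
	{
		int set_start = dev - (dev % far_set_size);	/* first device of the set */
		int shifted   = (dev % far_set_size + copy) % far_set_size;

		return set_start + shifted;
	}

For example, with far_set_size == 2 the far copy of device 0's data lands on device 1 and vice versa, so each adjacent pair of devices only mirrors within itself.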
79 84
80 /* 85 /*
81 * this is our 'private' RAID10 bio. 86 * this is our 'private' RAID10 bio.
82 * 87 *
83 * it contains information about what kind of IO operations were started 88 * it contains information about what kind of IO operations were started
84 * for this RAID10 operation, and about their status: 89 * for this RAID10 operation, and about their status:
85 */ 90 */
86 91
87 struct r10bio { 92 struct r10bio {
88 atomic_t remaining; /* 'have we finished' count, 93 atomic_t remaining; /* 'have we finished' count,
89 * used from IRQ handlers 94 * used from IRQ handlers
90 */ 95 */
91 sector_t sector; /* virtual sector number */ 96 sector_t sector; /* virtual sector number */
92 int sectors; 97 int sectors;
93 unsigned long state; 98 unsigned long state;
94 struct mddev *mddev; 99 struct mddev *mddev;
95 /* 100 /*
96 * original bio going to /dev/mdx 101 * original bio going to /dev/mdx
97 */ 102 */
98 struct bio *master_bio; 103 struct bio *master_bio;
99 /* 104 /*
100 * if the IO is in READ direction, then this is where we read 105 * if the IO is in READ direction, then this is where we read
101 */ 106 */
102 int read_slot; 107 int read_slot;
103 108
104 struct list_head retry_list; 109 struct list_head retry_list;
105 /* 110 /*
106 * if the IO is in WRITE direction, then multiple bios are used, 111 * if the IO is in WRITE direction, then multiple bios are used,
107 * one for each copy. 112 * one for each copy.
108 * When resyncing we also use one for each copy. 113 * When resyncing we also use one for each copy.
109 * When reconstructing, we use 2 bios, one for read, one for write. 114 * When reconstructing, we use 2 bios, one for read, one for write.
110 * We choose the number when they are allocated. 115 * We choose the number when they are allocated.
111 * We sometimes need an extra bio to write to the replacement. 116 * We sometimes need an extra bio to write to the replacement.
112 */ 117 */
113 struct r10dev { 118 struct r10dev {
114 struct bio *bio; 119 struct bio *bio;
115 union { 120 union {
116 struct bio *repl_bio; /* used for resync and 121 struct bio *repl_bio; /* used for resync and
117 * writes */ 122 * writes */
118 struct md_rdev *rdev; /* used for reads 123 struct md_rdev *rdev; /* used for reads
119 * (read_slot >= 0) */ 124 * (read_slot >= 0) */
120 }; 125 };
121 sector_t addr; 126 sector_t addr;
122 int devnum; 127 int devnum;
123 } devs[0]; 128 } devs[0];
124 }; 129 };
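
devs[0] at the end of struct r10bio is a zero-length trailing array: the number of struct r10dev entries is fixed only when the r10bio is allocated, one per copy, and each entry carries the bio for that copy plus, via the union, either a replacement bio or the rdev used for a read. A hedged sketch of how such an object is sized (a hypothetical helper; the driver actually obtains r10bios from the r10bio_pool/r10buf_pool mempools declared above):

	/* Sketch only: size the allocation for the trailing devs[] array. */
	static struct r10bio *alloc_r10bio(struct r10conf *conf, gfp_t gfp)
	{
		return kzalloc(sizeof(struct r10bio) +
			       conf->copies * sizeof(struct r10dev),
			       gfp);
	}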
125 130
126 /* bits for r10bio.state */ 131 /* bits for r10bio.state */
127 enum r10bio_state { 132 enum r10bio_state {
128 R10BIO_Uptodate, 133 R10BIO_Uptodate,
129 R10BIO_IsSync, 134 R10BIO_IsSync,
130 R10BIO_IsRecover, 135 R10BIO_IsRecover,
131 R10BIO_IsReshape, 136 R10BIO_IsReshape,
132 R10BIO_Degraded, 137 R10BIO_Degraded,
133 /* Set ReadError on bios that experience a read error 138 /* Set ReadError on bios that experience a read error
134 * so that raid10d knows what to do with them. 139 * so that raid10d knows what to do with them.
135 */ 140 */
136 R10BIO_ReadError, 141 R10BIO_ReadError,
137 /* If a write for this request means we can clear some 142 /* If a write for this request means we can clear some
138 * known-bad-block records, we set this flag. 143 * known-bad-block records, we set this flag.
139 */ 144 */
140 R10BIO_MadeGood, 145 R10BIO_MadeGood,
141 R10BIO_WriteError, 146 R10BIO_WriteError,
142 /* During a reshape we might be performing IO on the 147 /* During a reshape we might be performing IO on the
143 * 'previous' part of the array, in which case this 148 * 'previous' part of the array, in which case this
144 * flag is set 149 * flag is set
145 */ 150 */
146 R10BIO_Previous, 151 R10BIO_Previous,
147 }; 152 };
148 153
149 extern int md_raid10_congested(struct mddev *mddev, int bits); 154 extern int md_raid10_congested(struct mddev *mddev, int bits);
150 155
151 #endif 156 #endif
152 157