Commit 5e5e3e78ed9038b8f7112835d07084eefb9daa47

Authored by NeilBrown
1 parent e4424fee18

md: Fix handling of raid5 array which is being reshaped to fewer devices.

When a raid5 (or raid6) array is being reshaped to have fewer devices,
conf->raid_disks is the latter and hence smaller number of devices.
However, sometimes we want to use a number which is the total number of
currently required devices - the larger of the 'old' and 'new' sizes.
Before we implemented reducing the number of devices, this was always
'new' i.e. ->raid_disks.
Now we need max(raid_disks, previous_raid_disks) in those places.

This particularly affects assembling an array that was shut down while
in the middle of a reshape to fewer devices.

md.c needs a similar fix when interpreting the md metadata.

Signed-off-by: NeilBrown <neilb@suse.de>
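
For context, the core idea of the patch is to size loops and allocations by the larger of the old and new device counts while a reshape to fewer devices is still in flight. A minimal sketch of that pattern follows; the struct and helper names are illustrative, not the literal raid5.c/md.c code:

```c
/* Sketch only: while an array is being reshaped to fewer devices,
 * previous_raid_disks is larger than raid_disks, and any loop or
 * allocation that must cover every currently-required device has to
 * use the larger of the two, not just conf->raid_disks.
 */
struct conf_sketch {
	int raid_disks;          /* the 'new', smaller device count */
	int previous_raid_disks; /* the 'old' count, still in use mid-reshape */
};

static inline int currently_required_disks(const struct conf_sketch *conf)
{
	return conf->raid_disks > conf->previous_raid_disks
		? conf->raid_disks : conf->previous_raid_disks;
}
```

A loop that previously iterated up to conf->raid_disks would instead iterate up to this larger value while the reshape is active, which is what lets assembly of a half-reshaped array see all of the devices it still depends on.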

Showing 2 changed files with 19 additions and 20 deletions

1 /* 1 /*
2 md.c : Multiple Devices driver for Linux 2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 4
5 completely rewritten, based on the MD driver code from Marc Zyngier 5 completely rewritten, based on the MD driver code from Marc Zyngier
6 6
7 Changes: 7 Changes:
8 8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin 13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic 17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization): 18 RAID code (such as request based resynchronization):
19 19
20 Neil Brown <neilb@cse.unsw.edu.au>. 20 Neil Brown <neilb@cse.unsw.edu.au>.
21 21
22 - persistent bitmap code 22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 24
25 This program is free software; you can redistribute it and/or modify 25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by 26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option) 27 the Free Software Foundation; either version 2, or (at your option)
28 any later version. 28 any later version.
29 29
30 You should have received a copy of the GNU General Public License 30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free 31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */ 33 */
34 34
35 #include <linux/kthread.h> 35 #include <linux/kthread.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h> 38 #include <linux/seq_file.h>
39 #include <linux/buffer_head.h> /* for invalidate_bdev */ 39 #include <linux/buffer_head.h> /* for invalidate_bdev */
40 #include <linux/poll.h> 40 #include <linux/poll.h>
41 #include <linux/ctype.h> 41 #include <linux/ctype.h>
42 #include <linux/hdreg.h> 42 #include <linux/hdreg.h>
43 #include <linux/proc_fs.h> 43 #include <linux/proc_fs.h>
44 #include <linux/random.h> 44 #include <linux/random.h>
45 #include <linux/reboot.h> 45 #include <linux/reboot.h>
46 #include <linux/file.h> 46 #include <linux/file.h>
47 #include <linux/delay.h> 47 #include <linux/delay.h>
48 #include <linux/raid/md_p.h> 48 #include <linux/raid/md_p.h>
49 #include <linux/raid/md_u.h> 49 #include <linux/raid/md_u.h>
50 #include "md.h" 50 #include "md.h"
51 #include "bitmap.h" 51 #include "bitmap.h"
52 52
53 #define DEBUG 0 53 #define DEBUG 0
54 #define dprintk(x...) ((void)(DEBUG && printk(x))) 54 #define dprintk(x...) ((void)(DEBUG && printk(x)))
55 55
56 56
57 #ifndef MODULE 57 #ifndef MODULE
58 static void autostart_arrays(int part); 58 static void autostart_arrays(int part);
59 #endif 59 #endif
60 60
61 static LIST_HEAD(pers_list); 61 static LIST_HEAD(pers_list);
62 static DEFINE_SPINLOCK(pers_lock); 62 static DEFINE_SPINLOCK(pers_lock);
63 63
64 static void md_print_devices(void); 64 static void md_print_devices(void);
65 65
66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 66 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
67 67
68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 68 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69 69
70 /* 70 /*
71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72 * is 1000 KB/sec, so the extra system load does not show up that much. 72 * is 1000 KB/sec, so the extra system load does not show up that much.
73 * Increase it if you want to have more _guaranteed_ speed. Note that 73 * Increase it if you want to have more _guaranteed_ speed. Note that
74 * the RAID driver will use the maximum available bandwidth if the IO 74 * the RAID driver will use the maximum available bandwidth if the IO
75 * subsystem is idle. There is also an 'absolute maximum' reconstruction 75 * subsystem is idle. There is also an 'absolute maximum' reconstruction
76 * speed limit - in case reconstruction slows down your system despite 76 * speed limit - in case reconstruction slows down your system despite
77 * idle IO detection. 77 * idle IO detection.
78 * 78 *
79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 79 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
80 * or /sys/block/mdX/md/sync_speed_{min,max} 80 * or /sys/block/mdX/md/sync_speed_{min,max}
81 */ 81 */
82 82
83 static int sysctl_speed_limit_min = 1000; 83 static int sysctl_speed_limit_min = 1000;
84 static int sysctl_speed_limit_max = 200000; 84 static int sysctl_speed_limit_max = 200000;
85 static inline int speed_min(mddev_t *mddev) 85 static inline int speed_min(mddev_t *mddev)
86 { 86 {
87 return mddev->sync_speed_min ? 87 return mddev->sync_speed_min ?
88 mddev->sync_speed_min : sysctl_speed_limit_min; 88 mddev->sync_speed_min : sysctl_speed_limit_min;
89 } 89 }
90 90
91 static inline int speed_max(mddev_t *mddev) 91 static inline int speed_max(mddev_t *mddev)
92 { 92 {
93 return mddev->sync_speed_max ? 93 return mddev->sync_speed_max ?
94 mddev->sync_speed_max : sysctl_speed_limit_max; 94 mddev->sync_speed_max : sysctl_speed_limit_max;
95 } 95 }
96 96
97 static struct ctl_table_header *raid_table_header; 97 static struct ctl_table_header *raid_table_header;
98 98
99 static ctl_table raid_table[] = { 99 static ctl_table raid_table[] = {
100 { 100 {
101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN, 101 .ctl_name = DEV_RAID_SPEED_LIMIT_MIN,
102 .procname = "speed_limit_min", 102 .procname = "speed_limit_min",
103 .data = &sysctl_speed_limit_min, 103 .data = &sysctl_speed_limit_min,
104 .maxlen = sizeof(int), 104 .maxlen = sizeof(int),
105 .mode = S_IRUGO|S_IWUSR, 105 .mode = S_IRUGO|S_IWUSR,
106 .proc_handler = &proc_dointvec, 106 .proc_handler = &proc_dointvec,
107 }, 107 },
108 { 108 {
109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX, 109 .ctl_name = DEV_RAID_SPEED_LIMIT_MAX,
110 .procname = "speed_limit_max", 110 .procname = "speed_limit_max",
111 .data = &sysctl_speed_limit_max, 111 .data = &sysctl_speed_limit_max,
112 .maxlen = sizeof(int), 112 .maxlen = sizeof(int),
113 .mode = S_IRUGO|S_IWUSR, 113 .mode = S_IRUGO|S_IWUSR,
114 .proc_handler = &proc_dointvec, 114 .proc_handler = &proc_dointvec,
115 }, 115 },
116 { .ctl_name = 0 } 116 { .ctl_name = 0 }
117 }; 117 };
118 118
119 static ctl_table raid_dir_table[] = { 119 static ctl_table raid_dir_table[] = {
120 { 120 {
121 .ctl_name = DEV_RAID, 121 .ctl_name = DEV_RAID,
122 .procname = "raid", 122 .procname = "raid",
123 .maxlen = 0, 123 .maxlen = 0,
124 .mode = S_IRUGO|S_IXUGO, 124 .mode = S_IRUGO|S_IXUGO,
125 .child = raid_table, 125 .child = raid_table,
126 }, 126 },
127 { .ctl_name = 0 } 127 { .ctl_name = 0 }
128 }; 128 };
129 129
130 static ctl_table raid_root_table[] = { 130 static ctl_table raid_root_table[] = {
131 { 131 {
132 .ctl_name = CTL_DEV, 132 .ctl_name = CTL_DEV,
133 .procname = "dev", 133 .procname = "dev",
134 .maxlen = 0, 134 .maxlen = 0,
135 .mode = 0555, 135 .mode = 0555,
136 .child = raid_dir_table, 136 .child = raid_dir_table,
137 }, 137 },
138 { .ctl_name = 0 } 138 { .ctl_name = 0 }
139 }; 139 };
140 140
141 static const struct block_device_operations md_fops; 141 static const struct block_device_operations md_fops;
142 142
143 static int start_readonly; 143 static int start_readonly;
144 144
145 /* 145 /*
146 * We have a system wide 'event count' that is incremented 146 * We have a system wide 'event count' that is incremented
147 * on any 'interesting' event, and readers of /proc/mdstat 147 * on any 'interesting' event, and readers of /proc/mdstat
148 * can use 'poll' or 'select' to find out when the event 148 * can use 'poll' or 'select' to find out when the event
149 * count increases. 149 * count increases.
150 * 150 *
151 * Events are: 151 * Events are:
152 * start array, stop array, error, add device, remove device, 152 * start array, stop array, error, add device, remove device,
153 * start build, activate spare 153 * start build, activate spare
154 */ 154 */
155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 155 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
156 static atomic_t md_event_count; 156 static atomic_t md_event_count;
157 void md_new_event(mddev_t *mddev) 157 void md_new_event(mddev_t *mddev)
158 { 158 {
159 atomic_inc(&md_event_count); 159 atomic_inc(&md_event_count);
160 wake_up(&md_event_waiters); 160 wake_up(&md_event_waiters);
161 } 161 }
162 EXPORT_SYMBOL_GPL(md_new_event); 162 EXPORT_SYMBOL_GPL(md_new_event);
163 163
164 /* Alternate version that can be called from interrupts 164 /* Alternate version that can be called from interrupts
165 * when calling sysfs_notify isn't needed. 165 * when calling sysfs_notify isn't needed.
166 */ 166 */
167 static void md_new_event_inintr(mddev_t *mddev) 167 static void md_new_event_inintr(mddev_t *mddev)
168 { 168 {
169 atomic_inc(&md_event_count); 169 atomic_inc(&md_event_count);
170 wake_up(&md_event_waiters); 170 wake_up(&md_event_waiters);
171 } 171 }
172 172
173 /* 173 /*
174 * Enables to iterate over all existing md arrays 174 * Enables to iterate over all existing md arrays
175 * all_mddevs_lock protects this list. 175 * all_mddevs_lock protects this list.
176 */ 176 */
177 static LIST_HEAD(all_mddevs); 177 static LIST_HEAD(all_mddevs);
178 static DEFINE_SPINLOCK(all_mddevs_lock); 178 static DEFINE_SPINLOCK(all_mddevs_lock);
179 179
180 180
181 /* 181 /*
182 * iterates through all used mddevs in the system. 182 * iterates through all used mddevs in the system.
183 * We take care to grab the all_mddevs_lock whenever navigating 183 * We take care to grab the all_mddevs_lock whenever navigating
184 * the list, and to always hold a refcount when unlocked. 184 * the list, and to always hold a refcount when unlocked.
185 * Any code which breaks out of this loop while own 185 * Any code which breaks out of this loop while own
186 * a reference to the current mddev and must mddev_put it. 186 * a reference to the current mddev and must mddev_put it.
187 */ 187 */
188 #define for_each_mddev(mddev,tmp) \ 188 #define for_each_mddev(mddev,tmp) \
189 \ 189 \
190 for (({ spin_lock(&all_mddevs_lock); \ 190 for (({ spin_lock(&all_mddevs_lock); \
191 tmp = all_mddevs.next; \ 191 tmp = all_mddevs.next; \
192 mddev = NULL;}); \ 192 mddev = NULL;}); \
193 ({ if (tmp != &all_mddevs) \ 193 ({ if (tmp != &all_mddevs) \
194 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\ 194 mddev_get(list_entry(tmp, mddev_t, all_mddevs));\
195 spin_unlock(&all_mddevs_lock); \ 195 spin_unlock(&all_mddevs_lock); \
196 if (mddev) mddev_put(mddev); \ 196 if (mddev) mddev_put(mddev); \
197 mddev = list_entry(tmp, mddev_t, all_mddevs); \ 197 mddev = list_entry(tmp, mddev_t, all_mddevs); \
198 tmp != &all_mddevs;}); \ 198 tmp != &all_mddevs;}); \
199 ({ spin_lock(&all_mddevs_lock); \ 199 ({ spin_lock(&all_mddevs_lock); \
200 tmp = tmp->next;}) \ 200 tmp = tmp->next;}) \
201 ) 201 )
202 202
203 203
204 /* Rather than calling directly into the personality make_request function, 204 /* Rather than calling directly into the personality make_request function,
205 * IO requests come here first so that we can check if the device is 205 * IO requests come here first so that we can check if the device is
206 * being suspended pending a reconfiguration. 206 * being suspended pending a reconfiguration.
207 * We hold a refcount over the call to ->make_request. By the time that 207 * We hold a refcount over the call to ->make_request. By the time that
208 * call has finished, the bio has been linked into some internal structure 208 * call has finished, the bio has been linked into some internal structure
209 * and so is visible to ->quiesce(), so we don't need the refcount any more. 209 * and so is visible to ->quiesce(), so we don't need the refcount any more.
210 */ 210 */
211 static int md_make_request(struct request_queue *q, struct bio *bio) 211 static int md_make_request(struct request_queue *q, struct bio *bio)
212 { 212 {
213 mddev_t *mddev = q->queuedata; 213 mddev_t *mddev = q->queuedata;
214 int rv; 214 int rv;
215 if (mddev == NULL || mddev->pers == NULL) { 215 if (mddev == NULL || mddev->pers == NULL) {
216 bio_io_error(bio); 216 bio_io_error(bio);
217 return 0; 217 return 0;
218 } 218 }
219 rcu_read_lock(); 219 rcu_read_lock();
220 if (mddev->suspended) { 220 if (mddev->suspended) {
221 DEFINE_WAIT(__wait); 221 DEFINE_WAIT(__wait);
222 for (;;) { 222 for (;;) {
223 prepare_to_wait(&mddev->sb_wait, &__wait, 223 prepare_to_wait(&mddev->sb_wait, &__wait,
224 TASK_UNINTERRUPTIBLE); 224 TASK_UNINTERRUPTIBLE);
225 if (!mddev->suspended) 225 if (!mddev->suspended)
226 break; 226 break;
227 rcu_read_unlock(); 227 rcu_read_unlock();
228 schedule(); 228 schedule();
229 rcu_read_lock(); 229 rcu_read_lock();
230 } 230 }
231 finish_wait(&mddev->sb_wait, &__wait); 231 finish_wait(&mddev->sb_wait, &__wait);
232 } 232 }
233 atomic_inc(&mddev->active_io); 233 atomic_inc(&mddev->active_io);
234 rcu_read_unlock(); 234 rcu_read_unlock();
235 rv = mddev->pers->make_request(q, bio); 235 rv = mddev->pers->make_request(q, bio);
236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 236 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
237 wake_up(&mddev->sb_wait); 237 wake_up(&mddev->sb_wait);
238 238
239 return rv; 239 return rv;
240 } 240 }
241 241
242 static void mddev_suspend(mddev_t *mddev) 242 static void mddev_suspend(mddev_t *mddev)
243 { 243 {
244 BUG_ON(mddev->suspended); 244 BUG_ON(mddev->suspended);
245 mddev->suspended = 1; 245 mddev->suspended = 1;
246 synchronize_rcu(); 246 synchronize_rcu();
247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 247 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
248 mddev->pers->quiesce(mddev, 1); 248 mddev->pers->quiesce(mddev, 1);
249 md_unregister_thread(mddev->thread); 249 md_unregister_thread(mddev->thread);
250 mddev->thread = NULL; 250 mddev->thread = NULL;
251 /* we now know that no code is executing in the personality module, 251 /* we now know that no code is executing in the personality module,
252 * except possibly the tail end of a ->bi_end_io function, but that 252 * except possibly the tail end of a ->bi_end_io function, but that
253 * is certain to complete before the module has a chance to get 253 * is certain to complete before the module has a chance to get
254 * unloaded 254 * unloaded
255 */ 255 */
256 } 256 }
257 257
258 static void mddev_resume(mddev_t *mddev) 258 static void mddev_resume(mddev_t *mddev)
259 { 259 {
260 mddev->suspended = 0; 260 mddev->suspended = 0;
261 wake_up(&mddev->sb_wait); 261 wake_up(&mddev->sb_wait);
262 mddev->pers->quiesce(mddev, 0); 262 mddev->pers->quiesce(mddev, 0);
263 } 263 }
264 264
265 int mddev_congested(mddev_t *mddev, int bits) 265 int mddev_congested(mddev_t *mddev, int bits)
266 { 266 {
267 return mddev->suspended; 267 return mddev->suspended;
268 } 268 }
269 EXPORT_SYMBOL(mddev_congested); 269 EXPORT_SYMBOL(mddev_congested);
270 270
271 271
272 static inline mddev_t *mddev_get(mddev_t *mddev) 272 static inline mddev_t *mddev_get(mddev_t *mddev)
273 { 273 {
274 atomic_inc(&mddev->active); 274 atomic_inc(&mddev->active);
275 return mddev; 275 return mddev;
276 } 276 }
277 277
278 static void mddev_delayed_delete(struct work_struct *ws); 278 static void mddev_delayed_delete(struct work_struct *ws);
279 279
280 static void mddev_put(mddev_t *mddev) 280 static void mddev_put(mddev_t *mddev)
281 { 281 {
282 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 282 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
283 return; 283 return;
284 if (!mddev->raid_disks && list_empty(&mddev->disks) && 284 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
285 !mddev->hold_active) { 285 !mddev->hold_active) {
286 list_del(&mddev->all_mddevs); 286 list_del(&mddev->all_mddevs);
287 if (mddev->gendisk) { 287 if (mddev->gendisk) {
288 /* we did a probe so need to clean up. 288 /* we did a probe so need to clean up.
289 * Call schedule_work inside the spinlock 289 * Call schedule_work inside the spinlock
290 * so that flush_scheduled_work() after 290 * so that flush_scheduled_work() after
291 * mddev_find will succeed in waiting for the 291 * mddev_find will succeed in waiting for the
292 * work to be done. 292 * work to be done.
293 */ 293 */
294 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 294 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
295 schedule_work(&mddev->del_work); 295 schedule_work(&mddev->del_work);
296 } else 296 } else
297 kfree(mddev); 297 kfree(mddev);
298 } 298 }
299 spin_unlock(&all_mddevs_lock); 299 spin_unlock(&all_mddevs_lock);
300 } 300 }
301 301
302 static mddev_t * mddev_find(dev_t unit) 302 static mddev_t * mddev_find(dev_t unit)
303 { 303 {
304 mddev_t *mddev, *new = NULL; 304 mddev_t *mddev, *new = NULL;
305 305
306 retry: 306 retry:
307 spin_lock(&all_mddevs_lock); 307 spin_lock(&all_mddevs_lock);
308 308
309 if (unit) { 309 if (unit) {
310 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 310 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
311 if (mddev->unit == unit) { 311 if (mddev->unit == unit) {
312 mddev_get(mddev); 312 mddev_get(mddev);
313 spin_unlock(&all_mddevs_lock); 313 spin_unlock(&all_mddevs_lock);
314 kfree(new); 314 kfree(new);
315 return mddev; 315 return mddev;
316 } 316 }
317 317
318 if (new) { 318 if (new) {
319 list_add(&new->all_mddevs, &all_mddevs); 319 list_add(&new->all_mddevs, &all_mddevs);
320 spin_unlock(&all_mddevs_lock); 320 spin_unlock(&all_mddevs_lock);
321 new->hold_active = UNTIL_IOCTL; 321 new->hold_active = UNTIL_IOCTL;
322 return new; 322 return new;
323 } 323 }
324 } else if (new) { 324 } else if (new) {
325 /* find an unused unit number */ 325 /* find an unused unit number */
326 static int next_minor = 512; 326 static int next_minor = 512;
327 int start = next_minor; 327 int start = next_minor;
328 int is_free = 0; 328 int is_free = 0;
329 int dev = 0; 329 int dev = 0;
330 while (!is_free) { 330 while (!is_free) {
331 dev = MKDEV(MD_MAJOR, next_minor); 331 dev = MKDEV(MD_MAJOR, next_minor);
332 next_minor++; 332 next_minor++;
333 if (next_minor > MINORMASK) 333 if (next_minor > MINORMASK)
334 next_minor = 0; 334 next_minor = 0;
335 if (next_minor == start) { 335 if (next_minor == start) {
336 /* Oh dear, all in use. */ 336 /* Oh dear, all in use. */
337 spin_unlock(&all_mddevs_lock); 337 spin_unlock(&all_mddevs_lock);
338 kfree(new); 338 kfree(new);
339 return NULL; 339 return NULL;
340 } 340 }
341 341
342 is_free = 1; 342 is_free = 1;
343 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 343 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
344 if (mddev->unit == dev) { 344 if (mddev->unit == dev) {
345 is_free = 0; 345 is_free = 0;
346 break; 346 break;
347 } 347 }
348 } 348 }
349 new->unit = dev; 349 new->unit = dev;
350 new->md_minor = MINOR(dev); 350 new->md_minor = MINOR(dev);
351 new->hold_active = UNTIL_STOP; 351 new->hold_active = UNTIL_STOP;
352 list_add(&new->all_mddevs, &all_mddevs); 352 list_add(&new->all_mddevs, &all_mddevs);
353 spin_unlock(&all_mddevs_lock); 353 spin_unlock(&all_mddevs_lock);
354 return new; 354 return new;
355 } 355 }
356 spin_unlock(&all_mddevs_lock); 356 spin_unlock(&all_mddevs_lock);
357 357
358 new = kzalloc(sizeof(*new), GFP_KERNEL); 358 new = kzalloc(sizeof(*new), GFP_KERNEL);
359 if (!new) 359 if (!new)
360 return NULL; 360 return NULL;
361 361
362 new->unit = unit; 362 new->unit = unit;
363 if (MAJOR(unit) == MD_MAJOR) 363 if (MAJOR(unit) == MD_MAJOR)
364 new->md_minor = MINOR(unit); 364 new->md_minor = MINOR(unit);
365 else 365 else
366 new->md_minor = MINOR(unit) >> MdpMinorShift; 366 new->md_minor = MINOR(unit) >> MdpMinorShift;
367 367
368 mutex_init(&new->open_mutex); 368 mutex_init(&new->open_mutex);
369 mutex_init(&new->reconfig_mutex); 369 mutex_init(&new->reconfig_mutex);
370 INIT_LIST_HEAD(&new->disks); 370 INIT_LIST_HEAD(&new->disks);
371 INIT_LIST_HEAD(&new->all_mddevs); 371 INIT_LIST_HEAD(&new->all_mddevs);
372 init_timer(&new->safemode_timer); 372 init_timer(&new->safemode_timer);
373 atomic_set(&new->active, 1); 373 atomic_set(&new->active, 1);
374 atomic_set(&new->openers, 0); 374 atomic_set(&new->openers, 0);
375 atomic_set(&new->active_io, 0); 375 atomic_set(&new->active_io, 0);
376 spin_lock_init(&new->write_lock); 376 spin_lock_init(&new->write_lock);
377 init_waitqueue_head(&new->sb_wait); 377 init_waitqueue_head(&new->sb_wait);
378 init_waitqueue_head(&new->recovery_wait); 378 init_waitqueue_head(&new->recovery_wait);
379 new->reshape_position = MaxSector; 379 new->reshape_position = MaxSector;
380 new->resync_min = 0; 380 new->resync_min = 0;
381 new->resync_max = MaxSector; 381 new->resync_max = MaxSector;
382 new->level = LEVEL_NONE; 382 new->level = LEVEL_NONE;
383 383
384 goto retry; 384 goto retry;
385 } 385 }
386 386
387 static inline int mddev_lock(mddev_t * mddev) 387 static inline int mddev_lock(mddev_t * mddev)
388 { 388 {
389 return mutex_lock_interruptible(&mddev->reconfig_mutex); 389 return mutex_lock_interruptible(&mddev->reconfig_mutex);
390 } 390 }
391 391
392 static inline int mddev_is_locked(mddev_t *mddev) 392 static inline int mddev_is_locked(mddev_t *mddev)
393 { 393 {
394 return mutex_is_locked(&mddev->reconfig_mutex); 394 return mutex_is_locked(&mddev->reconfig_mutex);
395 } 395 }
396 396
397 static inline int mddev_trylock(mddev_t * mddev) 397 static inline int mddev_trylock(mddev_t * mddev)
398 { 398 {
399 return mutex_trylock(&mddev->reconfig_mutex); 399 return mutex_trylock(&mddev->reconfig_mutex);
400 } 400 }
401 401
402 static inline void mddev_unlock(mddev_t * mddev) 402 static inline void mddev_unlock(mddev_t * mddev)
403 { 403 {
404 mutex_unlock(&mddev->reconfig_mutex); 404 mutex_unlock(&mddev->reconfig_mutex);
405 405
406 md_wakeup_thread(mddev->thread); 406 md_wakeup_thread(mddev->thread);
407 } 407 }
408 408
409 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr) 409 static mdk_rdev_t * find_rdev_nr(mddev_t *mddev, int nr)
410 { 410 {
411 mdk_rdev_t *rdev; 411 mdk_rdev_t *rdev;
412 412
413 list_for_each_entry(rdev, &mddev->disks, same_set) 413 list_for_each_entry(rdev, &mddev->disks, same_set)
414 if (rdev->desc_nr == nr) 414 if (rdev->desc_nr == nr)
415 return rdev; 415 return rdev;
416 416
417 return NULL; 417 return NULL;
418 } 418 }
419 419
420 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev) 420 static mdk_rdev_t * find_rdev(mddev_t * mddev, dev_t dev)
421 { 421 {
422 mdk_rdev_t *rdev; 422 mdk_rdev_t *rdev;
423 423
424 list_for_each_entry(rdev, &mddev->disks, same_set) 424 list_for_each_entry(rdev, &mddev->disks, same_set)
425 if (rdev->bdev->bd_dev == dev) 425 if (rdev->bdev->bd_dev == dev)
426 return rdev; 426 return rdev;
427 427
428 return NULL; 428 return NULL;
429 } 429 }
430 430
431 static struct mdk_personality *find_pers(int level, char *clevel) 431 static struct mdk_personality *find_pers(int level, char *clevel)
432 { 432 {
433 struct mdk_personality *pers; 433 struct mdk_personality *pers;
434 list_for_each_entry(pers, &pers_list, list) { 434 list_for_each_entry(pers, &pers_list, list) {
435 if (level != LEVEL_NONE && pers->level == level) 435 if (level != LEVEL_NONE && pers->level == level)
436 return pers; 436 return pers;
437 if (strcmp(pers->name, clevel)==0) 437 if (strcmp(pers->name, clevel)==0)
438 return pers; 438 return pers;
439 } 439 }
440 return NULL; 440 return NULL;
441 } 441 }
442 442
443 /* return the offset of the super block in 512byte sectors */ 443 /* return the offset of the super block in 512byte sectors */
444 static inline sector_t calc_dev_sboffset(struct block_device *bdev) 444 static inline sector_t calc_dev_sboffset(struct block_device *bdev)
445 { 445 {
446 sector_t num_sectors = bdev->bd_inode->i_size / 512; 446 sector_t num_sectors = bdev->bd_inode->i_size / 512;
447 return MD_NEW_SIZE_SECTORS(num_sectors); 447 return MD_NEW_SIZE_SECTORS(num_sectors);
448 } 448 }
449 449
450 static int alloc_disk_sb(mdk_rdev_t * rdev) 450 static int alloc_disk_sb(mdk_rdev_t * rdev)
451 { 451 {
452 if (rdev->sb_page) 452 if (rdev->sb_page)
453 MD_BUG(); 453 MD_BUG();
454 454
455 rdev->sb_page = alloc_page(GFP_KERNEL); 455 rdev->sb_page = alloc_page(GFP_KERNEL);
456 if (!rdev->sb_page) { 456 if (!rdev->sb_page) {
457 printk(KERN_ALERT "md: out of memory.\n"); 457 printk(KERN_ALERT "md: out of memory.\n");
458 return -ENOMEM; 458 return -ENOMEM;
459 } 459 }
460 460
461 return 0; 461 return 0;
462 } 462 }
463 463
464 static void free_disk_sb(mdk_rdev_t * rdev) 464 static void free_disk_sb(mdk_rdev_t * rdev)
465 { 465 {
466 if (rdev->sb_page) { 466 if (rdev->sb_page) {
467 put_page(rdev->sb_page); 467 put_page(rdev->sb_page);
468 rdev->sb_loaded = 0; 468 rdev->sb_loaded = 0;
469 rdev->sb_page = NULL; 469 rdev->sb_page = NULL;
470 rdev->sb_start = 0; 470 rdev->sb_start = 0;
471 rdev->sectors = 0; 471 rdev->sectors = 0;
472 } 472 }
473 } 473 }
474 474
475 475
476 static void super_written(struct bio *bio, int error) 476 static void super_written(struct bio *bio, int error)
477 { 477 {
478 mdk_rdev_t *rdev = bio->bi_private; 478 mdk_rdev_t *rdev = bio->bi_private;
479 mddev_t *mddev = rdev->mddev; 479 mddev_t *mddev = rdev->mddev;
480 480
481 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 481 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
482 printk("md: super_written gets error=%d, uptodate=%d\n", 482 printk("md: super_written gets error=%d, uptodate=%d\n",
483 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 483 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
484 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 484 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
485 md_error(mddev, rdev); 485 md_error(mddev, rdev);
486 } 486 }
487 487
488 if (atomic_dec_and_test(&mddev->pending_writes)) 488 if (atomic_dec_and_test(&mddev->pending_writes))
489 wake_up(&mddev->sb_wait); 489 wake_up(&mddev->sb_wait);
490 bio_put(bio); 490 bio_put(bio);
491 } 491 }
492 492
493 static void super_written_barrier(struct bio *bio, int error) 493 static void super_written_barrier(struct bio *bio, int error)
494 { 494 {
495 struct bio *bio2 = bio->bi_private; 495 struct bio *bio2 = bio->bi_private;
496 mdk_rdev_t *rdev = bio2->bi_private; 496 mdk_rdev_t *rdev = bio2->bi_private;
497 mddev_t *mddev = rdev->mddev; 497 mddev_t *mddev = rdev->mddev;
498 498
499 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) && 499 if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
500 error == -EOPNOTSUPP) { 500 error == -EOPNOTSUPP) {
501 unsigned long flags; 501 unsigned long flags;
502 /* barriers don't appear to be supported :-( */ 502 /* barriers don't appear to be supported :-( */
503 set_bit(BarriersNotsupp, &rdev->flags); 503 set_bit(BarriersNotsupp, &rdev->flags);
504 mddev->barriers_work = 0; 504 mddev->barriers_work = 0;
505 spin_lock_irqsave(&mddev->write_lock, flags); 505 spin_lock_irqsave(&mddev->write_lock, flags);
506 bio2->bi_next = mddev->biolist; 506 bio2->bi_next = mddev->biolist;
507 mddev->biolist = bio2; 507 mddev->biolist = bio2;
508 spin_unlock_irqrestore(&mddev->write_lock, flags); 508 spin_unlock_irqrestore(&mddev->write_lock, flags);
509 wake_up(&mddev->sb_wait); 509 wake_up(&mddev->sb_wait);
510 bio_put(bio); 510 bio_put(bio);
511 } else { 511 } else {
512 bio_put(bio2); 512 bio_put(bio2);
513 bio->bi_private = rdev; 513 bio->bi_private = rdev;
514 super_written(bio, error); 514 super_written(bio, error);
515 } 515 }
516 } 516 }
517 517
518 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev, 518 void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
519 sector_t sector, int size, struct page *page) 519 sector_t sector, int size, struct page *page)
520 { 520 {
521 /* write first size bytes of page to sector of rdev 521 /* write first size bytes of page to sector of rdev
522 * Increment mddev->pending_writes before returning 522 * Increment mddev->pending_writes before returning
523 * and decrement it on completion, waking up sb_wait 523 * and decrement it on completion, waking up sb_wait
524 * if zero is reached. 524 * if zero is reached.
525 * If an error occurred, call md_error 525 * If an error occurred, call md_error
526 * 526 *
527 * As we might need to resubmit the request if BIO_RW_BARRIER 527 * As we might need to resubmit the request if BIO_RW_BARRIER
528 * causes ENOTSUPP, we allocate a spare bio... 528 * causes ENOTSUPP, we allocate a spare bio...
529 */ 529 */
530 struct bio *bio = bio_alloc(GFP_NOIO, 1); 530 struct bio *bio = bio_alloc(GFP_NOIO, 1);
531 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG); 531 int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
532 532
533 bio->bi_bdev = rdev->bdev; 533 bio->bi_bdev = rdev->bdev;
534 bio->bi_sector = sector; 534 bio->bi_sector = sector;
535 bio_add_page(bio, page, size, 0); 535 bio_add_page(bio, page, size, 0);
536 bio->bi_private = rdev; 536 bio->bi_private = rdev;
537 bio->bi_end_io = super_written; 537 bio->bi_end_io = super_written;
538 bio->bi_rw = rw; 538 bio->bi_rw = rw;
539 539
540 atomic_inc(&mddev->pending_writes); 540 atomic_inc(&mddev->pending_writes);
541 if (!test_bit(BarriersNotsupp, &rdev->flags)) { 541 if (!test_bit(BarriersNotsupp, &rdev->flags)) {
542 struct bio *rbio; 542 struct bio *rbio;
543 rw |= (1<<BIO_RW_BARRIER); 543 rw |= (1<<BIO_RW_BARRIER);
544 rbio = bio_clone(bio, GFP_NOIO); 544 rbio = bio_clone(bio, GFP_NOIO);
545 rbio->bi_private = bio; 545 rbio->bi_private = bio;
546 rbio->bi_end_io = super_written_barrier; 546 rbio->bi_end_io = super_written_barrier;
547 submit_bio(rw, rbio); 547 submit_bio(rw, rbio);
548 } else 548 } else
549 submit_bio(rw, bio); 549 submit_bio(rw, bio);
550 } 550 }
551 551
552 void md_super_wait(mddev_t *mddev) 552 void md_super_wait(mddev_t *mddev)
553 { 553 {
554 /* wait for all superblock writes that were scheduled to complete. 554 /* wait for all superblock writes that were scheduled to complete.
555 * if any had to be retried (due to BARRIER problems), retry them 555 * if any had to be retried (due to BARRIER problems), retry them
556 */ 556 */
557 DEFINE_WAIT(wq); 557 DEFINE_WAIT(wq);
558 for(;;) { 558 for(;;) {
559 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 559 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
560 if (atomic_read(&mddev->pending_writes)==0) 560 if (atomic_read(&mddev->pending_writes)==0)
561 break; 561 break;
562 while (mddev->biolist) { 562 while (mddev->biolist) {
563 struct bio *bio; 563 struct bio *bio;
564 spin_lock_irq(&mddev->write_lock); 564 spin_lock_irq(&mddev->write_lock);
565 bio = mddev->biolist; 565 bio = mddev->biolist;
566 mddev->biolist = bio->bi_next ; 566 mddev->biolist = bio->bi_next ;
567 bio->bi_next = NULL; 567 bio->bi_next = NULL;
568 spin_unlock_irq(&mddev->write_lock); 568 spin_unlock_irq(&mddev->write_lock);
569 submit_bio(bio->bi_rw, bio); 569 submit_bio(bio->bi_rw, bio);
570 } 570 }
571 schedule(); 571 schedule();
572 } 572 }
573 finish_wait(&mddev->sb_wait, &wq); 573 finish_wait(&mddev->sb_wait, &wq);
574 } 574 }
575 575
576 static void bi_complete(struct bio *bio, int error) 576 static void bi_complete(struct bio *bio, int error)
577 { 577 {
578 complete((struct completion*)bio->bi_private); 578 complete((struct completion*)bio->bi_private);
579 } 579 }
580 580
581 int sync_page_io(struct block_device *bdev, sector_t sector, int size, 581 int sync_page_io(struct block_device *bdev, sector_t sector, int size,
582 struct page *page, int rw) 582 struct page *page, int rw)
583 { 583 {
584 struct bio *bio = bio_alloc(GFP_NOIO, 1); 584 struct bio *bio = bio_alloc(GFP_NOIO, 1);
585 struct completion event; 585 struct completion event;
586 int ret; 586 int ret;
587 587
588 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG); 588 rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
589 589
590 bio->bi_bdev = bdev; 590 bio->bi_bdev = bdev;
591 bio->bi_sector = sector; 591 bio->bi_sector = sector;
592 bio_add_page(bio, page, size, 0); 592 bio_add_page(bio, page, size, 0);
593 init_completion(&event); 593 init_completion(&event);
594 bio->bi_private = &event; 594 bio->bi_private = &event;
595 bio->bi_end_io = bi_complete; 595 bio->bi_end_io = bi_complete;
596 submit_bio(rw, bio); 596 submit_bio(rw, bio);
597 wait_for_completion(&event); 597 wait_for_completion(&event);
598 598
599 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 599 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
600 bio_put(bio); 600 bio_put(bio);
601 return ret; 601 return ret;
602 } 602 }
603 EXPORT_SYMBOL_GPL(sync_page_io); 603 EXPORT_SYMBOL_GPL(sync_page_io);
604 604
605 static int read_disk_sb(mdk_rdev_t * rdev, int size) 605 static int read_disk_sb(mdk_rdev_t * rdev, int size)
606 { 606 {
607 char b[BDEVNAME_SIZE]; 607 char b[BDEVNAME_SIZE];
608 if (!rdev->sb_page) { 608 if (!rdev->sb_page) {
609 MD_BUG(); 609 MD_BUG();
610 return -EINVAL; 610 return -EINVAL;
611 } 611 }
612 if (rdev->sb_loaded) 612 if (rdev->sb_loaded)
613 return 0; 613 return 0;
614 614
615 615
616 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ)) 616 if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
617 goto fail; 617 goto fail;
618 rdev->sb_loaded = 1; 618 rdev->sb_loaded = 1;
619 return 0; 619 return 0;
620 620
621 fail: 621 fail:
622 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 622 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
623 bdevname(rdev->bdev,b)); 623 bdevname(rdev->bdev,b));
624 return -EINVAL; 624 return -EINVAL;
625 } 625 }
626 626
627 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 627 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
628 { 628 {
629 return sb1->set_uuid0 == sb2->set_uuid0 && 629 return sb1->set_uuid0 == sb2->set_uuid0 &&
630 sb1->set_uuid1 == sb2->set_uuid1 && 630 sb1->set_uuid1 == sb2->set_uuid1 &&
631 sb1->set_uuid2 == sb2->set_uuid2 && 631 sb1->set_uuid2 == sb2->set_uuid2 &&
632 sb1->set_uuid3 == sb2->set_uuid3; 632 sb1->set_uuid3 == sb2->set_uuid3;
633 } 633 }
634 634
635 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 635 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
636 { 636 {
637 int ret; 637 int ret;
638 mdp_super_t *tmp1, *tmp2; 638 mdp_super_t *tmp1, *tmp2;
639 639
640 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 640 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
641 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 641 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
642 642
643 if (!tmp1 || !tmp2) { 643 if (!tmp1 || !tmp2) {
644 ret = 0; 644 ret = 0;
645 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 645 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
646 goto abort; 646 goto abort;
647 } 647 }
648 648
649 *tmp1 = *sb1; 649 *tmp1 = *sb1;
650 *tmp2 = *sb2; 650 *tmp2 = *sb2;
651 651
652 /* 652 /*
653 * nr_disks is not constant 653 * nr_disks is not constant
654 */ 654 */
655 tmp1->nr_disks = 0; 655 tmp1->nr_disks = 0;
656 tmp2->nr_disks = 0; 656 tmp2->nr_disks = 0;
657 657
658 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 658 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
659 abort: 659 abort:
660 kfree(tmp1); 660 kfree(tmp1);
661 kfree(tmp2); 661 kfree(tmp2);
662 return ret; 662 return ret;
663 } 663 }
664 664
665 665
666 static u32 md_csum_fold(u32 csum) 666 static u32 md_csum_fold(u32 csum)
667 { 667 {
668 csum = (csum & 0xffff) + (csum >> 16); 668 csum = (csum & 0xffff) + (csum >> 16);
669 return (csum & 0xffff) + (csum >> 16); 669 return (csum & 0xffff) + (csum >> 16);
670 } 670 }
671 671
672 static unsigned int calc_sb_csum(mdp_super_t * sb) 672 static unsigned int calc_sb_csum(mdp_super_t * sb)
673 { 673 {
674 u64 newcsum = 0; 674 u64 newcsum = 0;
675 u32 *sb32 = (u32*)sb; 675 u32 *sb32 = (u32*)sb;
676 int i; 676 int i;
677 unsigned int disk_csum, csum; 677 unsigned int disk_csum, csum;
678 678
679 disk_csum = sb->sb_csum; 679 disk_csum = sb->sb_csum;
680 sb->sb_csum = 0; 680 sb->sb_csum = 0;
681 681
682 for (i = 0; i < MD_SB_BYTES/4 ; i++) 682 for (i = 0; i < MD_SB_BYTES/4 ; i++)
683 newcsum += sb32[i]; 683 newcsum += sb32[i];
684 csum = (newcsum & 0xffffffff) + (newcsum>>32); 684 csum = (newcsum & 0xffffffff) + (newcsum>>32);
685 685
686 686
687 #ifdef CONFIG_ALPHA 687 #ifdef CONFIG_ALPHA
688 /* This used to use csum_partial, which was wrong for several 688 /* This used to use csum_partial, which was wrong for several
689 * reasons including that different results are returned on 689 * reasons including that different results are returned on
690 * different architectures. It isn't critical that we get exactly 690 * different architectures. It isn't critical that we get exactly
691 * the same return value as before (we always csum_fold before 691 * the same return value as before (we always csum_fold before
692 * testing, and that removes any differences). However as we 692 * testing, and that removes any differences). However as we
693 * know that csum_partial always returned a 16bit value on 693 * know that csum_partial always returned a 16bit value on
694 * alphas, do a fold to maximise conformity to previous behaviour. 694 * alphas, do a fold to maximise conformity to previous behaviour.
695 */ 695 */
696 sb->sb_csum = md_csum_fold(disk_csum); 696 sb->sb_csum = md_csum_fold(disk_csum);
697 #else 697 #else
698 sb->sb_csum = disk_csum; 698 sb->sb_csum = disk_csum;
699 #endif 699 #endif
700 return csum; 700 return csum;
701 } 701 }
702 702
703 703
704 /* 704 /*
705 * Handle superblock details. 705 * Handle superblock details.
706 * We want to be able to handle multiple superblock formats 706 * We want to be able to handle multiple superblock formats
707 * so we have a common interface to them all, and an array of 707 * so we have a common interface to them all, and an array of
708 * different handlers. 708 * different handlers.
709 * We rely on user-space to write the initial superblock, and support 709 * We rely on user-space to write the initial superblock, and support
710 * reading and updating of superblocks. 710 * reading and updating of superblocks.
711 * Interface methods are: 711 * Interface methods are:
712 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version) 712 * int load_super(mdk_rdev_t *dev, mdk_rdev_t *refdev, int minor_version)
713 * loads and validates a superblock on dev. 713 * loads and validates a superblock on dev.
714 * if refdev != NULL, compare superblocks on both devices 714 * if refdev != NULL, compare superblocks on both devices
715 * Return: 715 * Return:
716 * 0 - dev has a superblock that is compatible with refdev 716 * 0 - dev has a superblock that is compatible with refdev
717 * 1 - dev has a superblock that is compatible and newer than refdev 717 * 1 - dev has a superblock that is compatible and newer than refdev
718 * so dev should be used as the refdev in future 718 * so dev should be used as the refdev in future
719 * -EINVAL superblock incompatible or invalid 719 * -EINVAL superblock incompatible or invalid
720 * -othererror e.g. -EIO 720 * -othererror e.g. -EIO
721 * 721 *
722 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev) 722 * int validate_super(mddev_t *mddev, mdk_rdev_t *dev)
723 * Verify that dev is acceptable into mddev. 723 * Verify that dev is acceptable into mddev.
724 * The first time, mddev->raid_disks will be 0, and data from 724 * The first time, mddev->raid_disks will be 0, and data from
725 * dev should be merged in. Subsequent calls check that dev 725 * dev should be merged in. Subsequent calls check that dev
726 * is new enough. Return 0 or -EINVAL 726 * is new enough. Return 0 or -EINVAL
727 * 727 *
728 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev) 728 * void sync_super(mddev_t *mddev, mdk_rdev_t *dev)
729 * Update the superblock for rdev with data in mddev 729 * Update the superblock for rdev with data in mddev
730 * This does not write to disc. 730 * This does not write to disc.
731 * 731 *
732 */ 732 */
733 733
734 struct super_type { 734 struct super_type {
735 char *name; 735 char *name;
736 struct module *owner; 736 struct module *owner;
737 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev, 737 int (*load_super)(mdk_rdev_t *rdev, mdk_rdev_t *refdev,
738 int minor_version); 738 int minor_version);
739 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev); 739 int (*validate_super)(mddev_t *mddev, mdk_rdev_t *rdev);
740 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev); 740 void (*sync_super)(mddev_t *mddev, mdk_rdev_t *rdev);
741 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev, 741 unsigned long long (*rdev_size_change)(mdk_rdev_t *rdev,
742 sector_t num_sectors); 742 sector_t num_sectors);
743 }; 743 };
744 744
745 /* 745 /*
746 * Check that the given mddev has no bitmap. 746 * Check that the given mddev has no bitmap.
747 * 747 *
748 * This function is called from the run method of all personalities that do not 748 * This function is called from the run method of all personalities that do not
749 * support bitmaps. It prints an error message and returns non-zero if mddev 749 * support bitmaps. It prints an error message and returns non-zero if mddev
750 * has a bitmap. Otherwise, it returns 0. 750 * has a bitmap. Otherwise, it returns 0.
751 * 751 *
752 */ 752 */
753 int md_check_no_bitmap(mddev_t *mddev) 753 int md_check_no_bitmap(mddev_t *mddev)
754 { 754 {
755 if (!mddev->bitmap_file && !mddev->bitmap_offset) 755 if (!mddev->bitmap_file && !mddev->bitmap_offset)
756 return 0; 756 return 0;
757 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 757 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
758 mdname(mddev), mddev->pers->name); 758 mdname(mddev), mddev->pers->name);
759 return 1; 759 return 1;
760 } 760 }
761 EXPORT_SYMBOL(md_check_no_bitmap); 761 EXPORT_SYMBOL(md_check_no_bitmap);
762 762
763 /* 763 /*
764 * load_super for 0.90.0 764 * load_super for 0.90.0
765 */ 765 */
766 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 766 static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
767 { 767 {
768 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 768 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
769 mdp_super_t *sb; 769 mdp_super_t *sb;
770 int ret; 770 int ret;
771 771
772 /* 772 /*
773 * Calculate the position of the superblock (512byte sectors), 773 * Calculate the position of the superblock (512byte sectors),
774 * it's at the end of the disk. 774 * it's at the end of the disk.
775 * 775 *
776 * It also happens to be a multiple of 4Kb. 776 * It also happens to be a multiple of 4Kb.
777 */ 777 */
778 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 778 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
779 779
780 ret = read_disk_sb(rdev, MD_SB_BYTES); 780 ret = read_disk_sb(rdev, MD_SB_BYTES);
781 if (ret) return ret; 781 if (ret) return ret;
782 782
783 ret = -EINVAL; 783 ret = -EINVAL;
784 784
785 bdevname(rdev->bdev, b); 785 bdevname(rdev->bdev, b);
786 sb = (mdp_super_t*)page_address(rdev->sb_page); 786 sb = (mdp_super_t*)page_address(rdev->sb_page);
787 787
788 if (sb->md_magic != MD_SB_MAGIC) { 788 if (sb->md_magic != MD_SB_MAGIC) {
789 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 789 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
790 b); 790 b);
791 goto abort; 791 goto abort;
792 } 792 }
793 793
794 if (sb->major_version != 0 || 794 if (sb->major_version != 0 ||
795 sb->minor_version < 90 || 795 sb->minor_version < 90 ||
796 sb->minor_version > 91) { 796 sb->minor_version > 91) {
797 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 797 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
798 sb->major_version, sb->minor_version, 798 sb->major_version, sb->minor_version,
799 b); 799 b);
800 goto abort; 800 goto abort;
801 } 801 }
802 802
803 if (sb->raid_disks <= 0) 803 if (sb->raid_disks <= 0)
804 goto abort; 804 goto abort;
805 805
806 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 806 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
807 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 807 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
808 b); 808 b);
809 goto abort; 809 goto abort;
810 } 810 }
811 811
812 rdev->preferred_minor = sb->md_minor; 812 rdev->preferred_minor = sb->md_minor;
813 rdev->data_offset = 0; 813 rdev->data_offset = 0;
814 rdev->sb_size = MD_SB_BYTES; 814 rdev->sb_size = MD_SB_BYTES;
815 815
816 if (sb->level == LEVEL_MULTIPATH) 816 if (sb->level == LEVEL_MULTIPATH)
817 rdev->desc_nr = -1; 817 rdev->desc_nr = -1;
818 else 818 else
819 rdev->desc_nr = sb->this_disk.number; 819 rdev->desc_nr = sb->this_disk.number;
820 820
821 if (!refdev) { 821 if (!refdev) {
822 ret = 1; 822 ret = 1;
823 } else { 823 } else {
824 __u64 ev1, ev2; 824 __u64 ev1, ev2;
825 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 825 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page);
826 if (!uuid_equal(refsb, sb)) { 826 if (!uuid_equal(refsb, sb)) {
827 printk(KERN_WARNING "md: %s has different UUID to %s\n", 827 printk(KERN_WARNING "md: %s has different UUID to %s\n",
828 b, bdevname(refdev->bdev,b2)); 828 b, bdevname(refdev->bdev,b2));
829 goto abort; 829 goto abort;
830 } 830 }
831 if (!sb_equal(refsb, sb)) { 831 if (!sb_equal(refsb, sb)) {
832 printk(KERN_WARNING "md: %s has same UUID" 832 printk(KERN_WARNING "md: %s has same UUID"
833 " but different superblock to %s\n", 833 " but different superblock to %s\n",
834 b, bdevname(refdev->bdev, b2)); 834 b, bdevname(refdev->bdev, b2));
835 goto abort; 835 goto abort;
836 } 836 }
837 ev1 = md_event(sb); 837 ev1 = md_event(sb);
838 ev2 = md_event(refsb); 838 ev2 = md_event(refsb);
839 if (ev1 > ev2) 839 if (ev1 > ev2)
840 ret = 1; 840 ret = 1;
841 else 841 else
842 ret = 0; 842 ret = 0;
843 } 843 }
844 rdev->sectors = rdev->sb_start; 844 rdev->sectors = rdev->sb_start;
845 845
846 if (rdev->sectors < sb->size * 2 && sb->level > 1) 846 if (rdev->sectors < sb->size * 2 && sb->level > 1)
847 /* "this cannot possibly happen" ... */ 847 /* "this cannot possibly happen" ... */
848 ret = -EINVAL; 848 ret = -EINVAL;
849 849
850 abort: 850 abort:
851 return ret; 851 return ret;
852 } 852 }
853 853
854 /* 854 /*
855 * validate_super for 0.90.0 855 * validate_super for 0.90.0
856 */ 856 */
857 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 857 static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
858 { 858 {
859 mdp_disk_t *desc; 859 mdp_disk_t *desc;
860 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 860 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page);
861 __u64 ev1 = md_event(sb); 861 __u64 ev1 = md_event(sb);
862 862
863 rdev->raid_disk = -1; 863 rdev->raid_disk = -1;
864 clear_bit(Faulty, &rdev->flags); 864 clear_bit(Faulty, &rdev->flags);
865 clear_bit(In_sync, &rdev->flags); 865 clear_bit(In_sync, &rdev->flags);
866 clear_bit(WriteMostly, &rdev->flags); 866 clear_bit(WriteMostly, &rdev->flags);
867 clear_bit(BarriersNotsupp, &rdev->flags); 867 clear_bit(BarriersNotsupp, &rdev->flags);
868 868
869 if (mddev->raid_disks == 0) { 869 if (mddev->raid_disks == 0) {
870 mddev->major_version = 0; 870 mddev->major_version = 0;
871 mddev->minor_version = sb->minor_version; 871 mddev->minor_version = sb->minor_version;
872 mddev->patch_version = sb->patch_version; 872 mddev->patch_version = sb->patch_version;
873 mddev->external = 0; 873 mddev->external = 0;
874 mddev->chunk_sectors = sb->chunk_size >> 9; 874 mddev->chunk_sectors = sb->chunk_size >> 9;
875 mddev->ctime = sb->ctime; 875 mddev->ctime = sb->ctime;
876 mddev->utime = sb->utime; 876 mddev->utime = sb->utime;
877 mddev->level = sb->level; 877 mddev->level = sb->level;
878 mddev->clevel[0] = 0; 878 mddev->clevel[0] = 0;
879 mddev->layout = sb->layout; 879 mddev->layout = sb->layout;
880 mddev->raid_disks = sb->raid_disks; 880 mddev->raid_disks = sb->raid_disks;
881 mddev->dev_sectors = sb->size * 2; 881 mddev->dev_sectors = sb->size * 2;
882 mddev->events = ev1; 882 mddev->events = ev1;
883 mddev->bitmap_offset = 0; 883 mddev->bitmap_offset = 0;
884 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 884 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
885 885
886 if (mddev->minor_version >= 91) { 886 if (mddev->minor_version >= 91) {
887 mddev->reshape_position = sb->reshape_position; 887 mddev->reshape_position = sb->reshape_position;
888 mddev->delta_disks = sb->delta_disks; 888 mddev->delta_disks = sb->delta_disks;
889 mddev->new_level = sb->new_level; 889 mddev->new_level = sb->new_level;
890 mddev->new_layout = sb->new_layout; 890 mddev->new_layout = sb->new_layout;
891 mddev->new_chunk_sectors = sb->new_chunk >> 9; 891 mddev->new_chunk_sectors = sb->new_chunk >> 9;
892 } else { 892 } else {
893 mddev->reshape_position = MaxSector; 893 mddev->reshape_position = MaxSector;
894 mddev->delta_disks = 0; 894 mddev->delta_disks = 0;
895 mddev->new_level = mddev->level; 895 mddev->new_level = mddev->level;
896 mddev->new_layout = mddev->layout; 896 mddev->new_layout = mddev->layout;
897 mddev->new_chunk_sectors = mddev->chunk_sectors; 897 mddev->new_chunk_sectors = mddev->chunk_sectors;
898 } 898 }
899 899
900 if (sb->state & (1<<MD_SB_CLEAN)) 900 if (sb->state & (1<<MD_SB_CLEAN))
901 mddev->recovery_cp = MaxSector; 901 mddev->recovery_cp = MaxSector;
902 else { 902 else {
903 if (sb->events_hi == sb->cp_events_hi && 903 if (sb->events_hi == sb->cp_events_hi &&
904 sb->events_lo == sb->cp_events_lo) { 904 sb->events_lo == sb->cp_events_lo) {
905 mddev->recovery_cp = sb->recovery_cp; 905 mddev->recovery_cp = sb->recovery_cp;
906 } else 906 } else
907 mddev->recovery_cp = 0; 907 mddev->recovery_cp = 0;
908 } 908 }
909 909
910 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 910 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
911 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 911 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
912 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 912 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
913 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 913 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
914 914
915 mddev->max_disks = MD_SB_DISKS; 915 mddev->max_disks = MD_SB_DISKS;
916 916
917 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 917 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
918 mddev->bitmap_file == NULL) 918 mddev->bitmap_file == NULL)
919 mddev->bitmap_offset = mddev->default_bitmap_offset; 919 mddev->bitmap_offset = mddev->default_bitmap_offset;
920 920
921 } else if (mddev->pers == NULL) { 921 } else if (mddev->pers == NULL) {
922 /* Insist on good event counter while assembling */ 922 /* Insist on good event counter while assembling */
923 ++ev1; 923 ++ev1;
924 if (ev1 < mddev->events) 924 if (ev1 < mddev->events)
925 return -EINVAL; 925 return -EINVAL;
926 } else if (mddev->bitmap) { 926 } else if (mddev->bitmap) {
927 /* if adding to array with a bitmap, then we can accept an 927 /* if adding to array with a bitmap, then we can accept an
928 * older device ... but not too old. 928 * older device ... but not too old.
929 */ 929 */
930 if (ev1 < mddev->bitmap->events_cleared) 930 if (ev1 < mddev->bitmap->events_cleared)
931 return 0; 931 return 0;
932 } else { 932 } else {
933 if (ev1 < mddev->events) 933 if (ev1 < mddev->events)
934 /* just a hot-add of a new device, leave raid_disk at -1 */ 934 /* just a hot-add of a new device, leave raid_disk at -1 */
935 return 0; 935 return 0;
936 } 936 }
937 937
938 if (mddev->level != LEVEL_MULTIPATH) { 938 if (mddev->level != LEVEL_MULTIPATH) {
939 desc = sb->disks + rdev->desc_nr; 939 desc = sb->disks + rdev->desc_nr;
940 940
941 if (desc->state & (1<<MD_DISK_FAULTY)) 941 if (desc->state & (1<<MD_DISK_FAULTY))
942 set_bit(Faulty, &rdev->flags); 942 set_bit(Faulty, &rdev->flags);
943 else if (desc->state & (1<<MD_DISK_SYNC) /* && 943 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
944 desc->raid_disk < mddev->raid_disks */) { 944 desc->raid_disk < mddev->raid_disks */) {
945 set_bit(In_sync, &rdev->flags); 945 set_bit(In_sync, &rdev->flags);
946 rdev->raid_disk = desc->raid_disk; 946 rdev->raid_disk = desc->raid_disk;
947 } 947 }
948 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 948 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
949 set_bit(WriteMostly, &rdev->flags); 949 set_bit(WriteMostly, &rdev->flags);
950 } else /* MULTIPATH are always insync */ 950 } else /* MULTIPATH are always insync */
951 set_bit(In_sync, &rdev->flags); 951 set_bit(In_sync, &rdev->flags);
952 return 0; 952 return 0;
953 } 953 }
954 954
955 /* 955 /*
956 * sync_super for 0.90.0 956 * sync_super for 0.90.0
957 */ 957 */
958 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) 958 static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
959 { 959 {
960 mdp_super_t *sb; 960 mdp_super_t *sb;
961 mdk_rdev_t *rdev2; 961 mdk_rdev_t *rdev2;
962 int next_spare = mddev->raid_disks; 962 int next_spare = mddev->raid_disks;
963 963
964 964
965 /* make rdev->sb match mddev data.. 965 /* make rdev->sb match mddev data..
966 * 966 *
967 * 1/ zero out disks 967 * 1/ zero out disks
968 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 968 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
969 * 3/ any empty disks < next_spare become removed 969 * 3/ any empty disks < next_spare become removed
970 * 970 *
971 * disks[0] gets initialised to REMOVED because 971 * disks[0] gets initialised to REMOVED because
972 * we cannot be sure from other fields if it has 972 * we cannot be sure from other fields if it has
973 * been initialised or not. 973 * been initialised or not.
974 */ 974 */
975 int i; 975 int i;
976 int active=0, working=0,failed=0,spare=0,nr_disks=0; 976 int active=0, working=0,failed=0,spare=0,nr_disks=0;
977 977
978 rdev->sb_size = MD_SB_BYTES; 978 rdev->sb_size = MD_SB_BYTES;
979 979
980 sb = (mdp_super_t*)page_address(rdev->sb_page); 980 sb = (mdp_super_t*)page_address(rdev->sb_page);
981 981
982 memset(sb, 0, sizeof(*sb)); 982 memset(sb, 0, sizeof(*sb));
983 983
984 sb->md_magic = MD_SB_MAGIC; 984 sb->md_magic = MD_SB_MAGIC;
985 sb->major_version = mddev->major_version; 985 sb->major_version = mddev->major_version;
986 sb->patch_version = mddev->patch_version; 986 sb->patch_version = mddev->patch_version;
987 sb->gvalid_words = 0; /* ignored */ 987 sb->gvalid_words = 0; /* ignored */
988 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 988 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
989 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 989 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
990 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 990 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
991 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 991 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
992 992
993 sb->ctime = mddev->ctime; 993 sb->ctime = mddev->ctime;
994 sb->level = mddev->level; 994 sb->level = mddev->level;
995 sb->size = mddev->dev_sectors / 2; 995 sb->size = mddev->dev_sectors / 2;
996 sb->raid_disks = mddev->raid_disks; 996 sb->raid_disks = mddev->raid_disks;
997 sb->md_minor = mddev->md_minor; 997 sb->md_minor = mddev->md_minor;
998 sb->not_persistent = 0; 998 sb->not_persistent = 0;
999 sb->utime = mddev->utime; 999 sb->utime = mddev->utime;
1000 sb->state = 0; 1000 sb->state = 0;
1001 sb->events_hi = (mddev->events>>32); 1001 sb->events_hi = (mddev->events>>32);
1002 sb->events_lo = (u32)mddev->events; 1002 sb->events_lo = (u32)mddev->events;
1003 1003
1004 if (mddev->reshape_position == MaxSector) 1004 if (mddev->reshape_position == MaxSector)
1005 sb->minor_version = 90; 1005 sb->minor_version = 90;
1006 else { 1006 else {
1007 sb->minor_version = 91; 1007 sb->minor_version = 91;
1008 sb->reshape_position = mddev->reshape_position; 1008 sb->reshape_position = mddev->reshape_position;
1009 sb->new_level = mddev->new_level; 1009 sb->new_level = mddev->new_level;
1010 sb->delta_disks = mddev->delta_disks; 1010 sb->delta_disks = mddev->delta_disks;
1011 sb->new_layout = mddev->new_layout; 1011 sb->new_layout = mddev->new_layout;
1012 sb->new_chunk = mddev->new_chunk_sectors << 9; 1012 sb->new_chunk = mddev->new_chunk_sectors << 9;
1013 } 1013 }
1014 mddev->minor_version = sb->minor_version; 1014 mddev->minor_version = sb->minor_version;
1015 if (mddev->in_sync) 1015 if (mddev->in_sync)
1016 { 1016 {
1017 sb->recovery_cp = mddev->recovery_cp; 1017 sb->recovery_cp = mddev->recovery_cp;
1018 sb->cp_events_hi = (mddev->events>>32); 1018 sb->cp_events_hi = (mddev->events>>32);
1019 sb->cp_events_lo = (u32)mddev->events; 1019 sb->cp_events_lo = (u32)mddev->events;
1020 if (mddev->recovery_cp == MaxSector) 1020 if (mddev->recovery_cp == MaxSector)
1021 sb->state = (1<< MD_SB_CLEAN); 1021 sb->state = (1<< MD_SB_CLEAN);
1022 } else 1022 } else
1023 sb->recovery_cp = 0; 1023 sb->recovery_cp = 0;
1024 1024
1025 sb->layout = mddev->layout; 1025 sb->layout = mddev->layout;
1026 sb->chunk_size = mddev->chunk_sectors << 9; 1026 sb->chunk_size = mddev->chunk_sectors << 9;
1027 1027
1028 if (mddev->bitmap && mddev->bitmap_file == NULL) 1028 if (mddev->bitmap && mddev->bitmap_file == NULL)
1029 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1029 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1030 1030
1031 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1031 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1032 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1032 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1033 mdp_disk_t *d; 1033 mdp_disk_t *d;
1034 int desc_nr; 1034 int desc_nr;
1035 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1035 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1036 && !test_bit(Faulty, &rdev2->flags)) 1036 && !test_bit(Faulty, &rdev2->flags))
1037 desc_nr = rdev2->raid_disk; 1037 desc_nr = rdev2->raid_disk;
1038 else 1038 else
1039 desc_nr = next_spare++; 1039 desc_nr = next_spare++;
1040 rdev2->desc_nr = desc_nr; 1040 rdev2->desc_nr = desc_nr;
1041 d = &sb->disks[rdev2->desc_nr]; 1041 d = &sb->disks[rdev2->desc_nr];
1042 nr_disks++; 1042 nr_disks++;
1043 d->number = rdev2->desc_nr; 1043 d->number = rdev2->desc_nr;
1044 d->major = MAJOR(rdev2->bdev->bd_dev); 1044 d->major = MAJOR(rdev2->bdev->bd_dev);
1045 d->minor = MINOR(rdev2->bdev->bd_dev); 1045 d->minor = MINOR(rdev2->bdev->bd_dev);
1046 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags) 1046 if (rdev2->raid_disk >= 0 && test_bit(In_sync, &rdev2->flags)
1047 && !test_bit(Faulty, &rdev2->flags)) 1047 && !test_bit(Faulty, &rdev2->flags))
1048 d->raid_disk = rdev2->raid_disk; 1048 d->raid_disk = rdev2->raid_disk;
1049 else 1049 else
1050 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1050 d->raid_disk = rdev2->desc_nr; /* compatibility */
1051 if (test_bit(Faulty, &rdev2->flags)) 1051 if (test_bit(Faulty, &rdev2->flags))
1052 d->state = (1<<MD_DISK_FAULTY); 1052 d->state = (1<<MD_DISK_FAULTY);
1053 else if (test_bit(In_sync, &rdev2->flags)) { 1053 else if (test_bit(In_sync, &rdev2->flags)) {
1054 d->state = (1<<MD_DISK_ACTIVE); 1054 d->state = (1<<MD_DISK_ACTIVE);
1055 d->state |= (1<<MD_DISK_SYNC); 1055 d->state |= (1<<MD_DISK_SYNC);
1056 active++; 1056 active++;
1057 working++; 1057 working++;
1058 } else { 1058 } else {
1059 d->state = 0; 1059 d->state = 0;
1060 spare++; 1060 spare++;
1061 working++; 1061 working++;
1062 } 1062 }
1063 if (test_bit(WriteMostly, &rdev2->flags)) 1063 if (test_bit(WriteMostly, &rdev2->flags))
1064 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1064 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1065 } 1065 }
1066 /* now set the "removed" and "faulty" bits on any missing devices */ 1066 /* now set the "removed" and "faulty" bits on any missing devices */
1067 for (i=0 ; i < mddev->raid_disks ; i++) { 1067 for (i=0 ; i < mddev->raid_disks ; i++) {
1068 mdp_disk_t *d = &sb->disks[i]; 1068 mdp_disk_t *d = &sb->disks[i];
1069 if (d->state == 0 && d->number == 0) { 1069 if (d->state == 0 && d->number == 0) {
1070 d->number = i; 1070 d->number = i;
1071 d->raid_disk = i; 1071 d->raid_disk = i;
1072 d->state = (1<<MD_DISK_REMOVED); 1072 d->state = (1<<MD_DISK_REMOVED);
1073 d->state |= (1<<MD_DISK_FAULTY); 1073 d->state |= (1<<MD_DISK_FAULTY);
1074 failed++; 1074 failed++;
1075 } 1075 }
1076 } 1076 }
1077 sb->nr_disks = nr_disks; 1077 sb->nr_disks = nr_disks;
1078 sb->active_disks = active; 1078 sb->active_disks = active;
1079 sb->working_disks = working; 1079 sb->working_disks = working;
1080 sb->failed_disks = failed; 1080 sb->failed_disks = failed;
1081 sb->spare_disks = spare; 1081 sb->spare_disks = spare;
1082 1082
1083 sb->this_disk = sb->disks[rdev->desc_nr]; 1083 sb->this_disk = sb->disks[rdev->desc_nr];
1084 sb->sb_csum = calc_sb_csum(sb); 1084 sb->sb_csum = calc_sb_csum(sb);
1085 } 1085 }
1086 1086
1087 /* 1087 /*
1088 * rdev_size_change for 0.90.0 1088 * rdev_size_change for 0.90.0
1089 */ 1089 */
1090 static unsigned long long 1090 static unsigned long long
1091 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1091 super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1092 { 1092 {
1093 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1093 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1094 return 0; /* component must fit device */ 1094 return 0; /* component must fit device */
1095 if (rdev->mddev->bitmap_offset) 1095 if (rdev->mddev->bitmap_offset)
1096 return 0; /* can't move bitmap */ 1096 return 0; /* can't move bitmap */
1097 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 1097 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
1098 if (!num_sectors || num_sectors > rdev->sb_start) 1098 if (!num_sectors || num_sectors > rdev->sb_start)
1099 num_sectors = rdev->sb_start; 1099 num_sectors = rdev->sb_start;
1100 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1100 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1101 rdev->sb_page); 1101 rdev->sb_page);
1102 md_super_wait(rdev->mddev); 1102 md_super_wait(rdev->mddev);
1103 return num_sectors / 2; /* kB for sysfs */ 1103 return num_sectors / 2; /* kB for sysfs */
1104 } 1104 }
1105 1105
1106 1106
1107 /* 1107 /*
1108 * version 1 superblock 1108 * version 1 superblock
1109 */ 1109 */
1110 1110
1111 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1111 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1112 { 1112 {
1113 __le32 disk_csum; 1113 __le32 disk_csum;
1114 u32 csum; 1114 u32 csum;
1115 unsigned long long newcsum; 1115 unsigned long long newcsum;
1116 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1116 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1117 __le32 *isuper = (__le32*)sb; 1117 __le32 *isuper = (__le32*)sb;
1118 int i; 1118 int i;
1119 1119
1120 disk_csum = sb->sb_csum; 1120 disk_csum = sb->sb_csum;
1121 sb->sb_csum = 0; 1121 sb->sb_csum = 0;
1122 newcsum = 0; 1122 newcsum = 0;
1123 for (i=0; size>=4; size -= 4 ) 1123 for (i=0; size>=4; size -= 4 )
1124 newcsum += le32_to_cpu(*isuper++); 1124 newcsum += le32_to_cpu(*isuper++);
1125 1125
1126 if (size == 2) 1126 if (size == 2)
1127 newcsum += le16_to_cpu(*(__le16*) isuper); 1127 newcsum += le16_to_cpu(*(__le16*) isuper);
1128 1128
1129 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1129 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1130 sb->sb_csum = disk_csum; 1130 sb->sb_csum = disk_csum;
1131 return cpu_to_le32(csum); 1131 return cpu_to_le32(csum);
1132 } 1132 }
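
The version-1 checksum above walks the superblock as little-endian 32-bit words (with sb_csum temporarily zeroed), accumulates them into a 64-bit total plus a trailing 16-bit word when max_dev leaves an odd tail, and then folds the carries back into 32 bits. A minimal standalone sketch of that folding idea, not taken from md.c and assuming a little-endian host, is:

#include <stdint.h>
#include <stddef.h>

/* Illustrative only: sum a buffer as 32-bit words into a 64-bit
 * accumulator, add a trailing 16-bit word if two bytes remain, then
 * fold the high half back into the low half - the same scheme
 * calc_sb_1_csum() applies to struct mdp_superblock_1.
 */
static uint32_t fold_csum32(const void *buf, size_t size)
{
	const uint32_t *p = buf;
	uint64_t sum = 0;

	for (; size >= 4; size -= 4)
		sum += *p++;		/* the kernel uses le32_to_cpu() here */
	if (size == 2)
		sum += *(const uint16_t *)p;
	return (uint32_t)((sum & 0xffffffffULL) + (sum >> 32));
}
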
1133 1133
1134 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1134 static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1135 { 1135 {
1136 struct mdp_superblock_1 *sb; 1136 struct mdp_superblock_1 *sb;
1137 int ret; 1137 int ret;
1138 sector_t sb_start; 1138 sector_t sb_start;
1139 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1139 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1140 int bmask; 1140 int bmask;
1141 1141
1142 /* 1142 /*
1143 * Calculate the position of the superblock in 512-byte sectors. 1143 * Calculate the position of the superblock in 512-byte sectors.
1144 * It is always aligned to a 4K boundary and 1144 * It is always aligned to a 4K boundary and
1145 * depending on minor_version, it can be: 1145 * depending on minor_version, it can be:
1146 * 0: At least 8K, but less than 12K, from end of device 1146 * 0: At least 8K, but less than 12K, from end of device
1147 * 1: At start of device 1147 * 1: At start of device
1148 * 2: 4K from start of device. 1148 * 2: 4K from start of device.
1149 */ 1149 */
1150 switch(minor_version) { 1150 switch(minor_version) {
1151 case 0: 1151 case 0:
1152 sb_start = rdev->bdev->bd_inode->i_size >> 9; 1152 sb_start = rdev->bdev->bd_inode->i_size >> 9;
1153 sb_start -= 8*2; 1153 sb_start -= 8*2;
1154 sb_start &= ~(sector_t)(4*2-1); 1154 sb_start &= ~(sector_t)(4*2-1);
1155 break; 1155 break;
1156 case 1: 1156 case 1:
1157 sb_start = 0; 1157 sb_start = 0;
1158 break; 1158 break;
1159 case 2: 1159 case 2:
1160 sb_start = 8; 1160 sb_start = 8;
1161 break; 1161 break;
1162 default: 1162 default:
1163 return -EINVAL; 1163 return -EINVAL;
1164 } 1164 }
1165 rdev->sb_start = sb_start; 1165 rdev->sb_start = sb_start;
1166 1166
1167 /* superblock is rarely larger than 1K, but it can be larger, 1167 /* superblock is rarely larger than 1K, but it can be larger,
1168 * and it is safe to read 4k, so we do that 1168 * and it is safe to read 4k, so we do that
1169 */ 1169 */
1170 ret = read_disk_sb(rdev, 4096); 1170 ret = read_disk_sb(rdev, 4096);
1171 if (ret) return ret; 1171 if (ret) return ret;
1172 1172
1173 1173
1174 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1174 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1175 1175
1176 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1176 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1177 sb->major_version != cpu_to_le32(1) || 1177 sb->major_version != cpu_to_le32(1) ||
1178 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1178 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1179 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1179 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1180 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1180 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1181 return -EINVAL; 1181 return -EINVAL;
1182 1182
1183 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1183 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1184 printk("md: invalid superblock checksum on %s\n", 1184 printk("md: invalid superblock checksum on %s\n",
1185 bdevname(rdev->bdev,b)); 1185 bdevname(rdev->bdev,b));
1186 return -EINVAL; 1186 return -EINVAL;
1187 } 1187 }
1188 if (le64_to_cpu(sb->data_size) < 10) { 1188 if (le64_to_cpu(sb->data_size) < 10) {
1189 printk("md: data_size too small on %s\n", 1189 printk("md: data_size too small on %s\n",
1190 bdevname(rdev->bdev,b)); 1190 bdevname(rdev->bdev,b));
1191 return -EINVAL; 1191 return -EINVAL;
1192 } 1192 }
1193 1193
1194 rdev->preferred_minor = 0xffff; 1194 rdev->preferred_minor = 0xffff;
1195 rdev->data_offset = le64_to_cpu(sb->data_offset); 1195 rdev->data_offset = le64_to_cpu(sb->data_offset);
1196 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1196 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1197 1197
1198 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1198 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1199 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1199 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1200 if (rdev->sb_size & bmask) 1200 if (rdev->sb_size & bmask)
1201 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1201 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1202 1202
1203 if (minor_version 1203 if (minor_version
1204 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1204 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1205 return -EINVAL; 1205 return -EINVAL;
1206 1206
1207 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1207 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1208 rdev->desc_nr = -1; 1208 rdev->desc_nr = -1;
1209 else 1209 else
1210 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1210 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1211 1211
1212 if (!refdev) { 1212 if (!refdev) {
1213 ret = 1; 1213 ret = 1;
1214 } else { 1214 } else {
1215 __u64 ev1, ev2; 1215 __u64 ev1, ev2;
1216 struct mdp_superblock_1 *refsb = 1216 struct mdp_superblock_1 *refsb =
1217 (struct mdp_superblock_1*)page_address(refdev->sb_page); 1217 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1218 1218
1219 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1219 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1220 sb->level != refsb->level || 1220 sb->level != refsb->level ||
1221 sb->layout != refsb->layout || 1221 sb->layout != refsb->layout ||
1222 sb->chunksize != refsb->chunksize) { 1222 sb->chunksize != refsb->chunksize) {
1223 printk(KERN_WARNING "md: %s has strangely different" 1223 printk(KERN_WARNING "md: %s has strangely different"
1224 " superblock to %s\n", 1224 " superblock to %s\n",
1225 bdevname(rdev->bdev,b), 1225 bdevname(rdev->bdev,b),
1226 bdevname(refdev->bdev,b2)); 1226 bdevname(refdev->bdev,b2));
1227 return -EINVAL; 1227 return -EINVAL;
1228 } 1228 }
1229 ev1 = le64_to_cpu(sb->events); 1229 ev1 = le64_to_cpu(sb->events);
1230 ev2 = le64_to_cpu(refsb->events); 1230 ev2 = le64_to_cpu(refsb->events);
1231 1231
1232 if (ev1 > ev2) 1232 if (ev1 > ev2)
1233 ret = 1; 1233 ret = 1;
1234 else 1234 else
1235 ret = 0; 1235 ret = 0;
1236 } 1236 }
1237 if (minor_version) 1237 if (minor_version)
1238 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) - 1238 rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
1239 le64_to_cpu(sb->data_offset); 1239 le64_to_cpu(sb->data_offset);
1240 else 1240 else
1241 rdev->sectors = rdev->sb_start; 1241 rdev->sectors = rdev->sb_start;
1242 if (rdev->sectors < le64_to_cpu(sb->data_size)) 1242 if (rdev->sectors < le64_to_cpu(sb->data_size))
1243 return -EINVAL; 1243 return -EINVAL;
1244 rdev->sectors = le64_to_cpu(sb->data_size); 1244 rdev->sectors = le64_to_cpu(sb->data_size);
1245 if (le64_to_cpu(sb->size) > rdev->sectors) 1245 if (le64_to_cpu(sb->size) > rdev->sectors)
1246 return -EINVAL; 1246 return -EINVAL;
1247 return ret; 1247 return ret;
1248 } 1248 }
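
The switch near the top of super_1_load() is the whole story of where a v1.x superblock sits: minor version 1 puts it at sector 0, version 2 at 4K, and version 0 near the end of the device. A rough standalone illustration of the version-0 placement (not from md.c; the device size below is made up) shows why the comment says "at least 8K, but less than 12K, from end of device":

#include <stdio.h>
#include <stdint.h>

/* Illustrative sketch of v1.0 superblock placement as computed in
 * super_1_load(): back off 8K (16 sectors) from the end of the device,
 * then round down to a 4K (8-sector) boundary.  The result is at least
 * 8K and less than 12K from the end.
 */
static uint64_t sb_start_v1_0(uint64_t dev_sectors)
{
	uint64_t sb_start = dev_sectors;

	sb_start -= 8 * 2;			/* 8K expressed in 512-byte sectors */
	sb_start &= ~(uint64_t)(4 * 2 - 1);	/* align down to 4K */
	return sb_start;
}

int main(void)
{
	uint64_t sectors = 2147483648ULL;	/* hypothetical 1 TiB device */

	printf("superblock starts at sector %llu of %llu\n",
	       (unsigned long long)sb_start_v1_0(sectors),
	       (unsigned long long)sectors);
	return 0;
}
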
1249 1249
1250 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1250 static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1251 { 1251 {
1252 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1252 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1253 __u64 ev1 = le64_to_cpu(sb->events); 1253 __u64 ev1 = le64_to_cpu(sb->events);
1254 1254
1255 rdev->raid_disk = -1; 1255 rdev->raid_disk = -1;
1256 clear_bit(Faulty, &rdev->flags); 1256 clear_bit(Faulty, &rdev->flags);
1257 clear_bit(In_sync, &rdev->flags); 1257 clear_bit(In_sync, &rdev->flags);
1258 clear_bit(WriteMostly, &rdev->flags); 1258 clear_bit(WriteMostly, &rdev->flags);
1259 clear_bit(BarriersNotsupp, &rdev->flags); 1259 clear_bit(BarriersNotsupp, &rdev->flags);
1260 1260
1261 if (mddev->raid_disks == 0) { 1261 if (mddev->raid_disks == 0) {
1262 mddev->major_version = 1; 1262 mddev->major_version = 1;
1263 mddev->patch_version = 0; 1263 mddev->patch_version = 0;
1264 mddev->external = 0; 1264 mddev->external = 0;
1265 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1265 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1266 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1266 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1267 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1267 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1268 mddev->level = le32_to_cpu(sb->level); 1268 mddev->level = le32_to_cpu(sb->level);
1269 mddev->clevel[0] = 0; 1269 mddev->clevel[0] = 0;
1270 mddev->layout = le32_to_cpu(sb->layout); 1270 mddev->layout = le32_to_cpu(sb->layout);
1271 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1271 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1272 mddev->dev_sectors = le64_to_cpu(sb->size); 1272 mddev->dev_sectors = le64_to_cpu(sb->size);
1273 mddev->events = ev1; 1273 mddev->events = ev1;
1274 mddev->bitmap_offset = 0; 1274 mddev->bitmap_offset = 0;
1275 mddev->default_bitmap_offset = 1024 >> 9; 1275 mddev->default_bitmap_offset = 1024 >> 9;
1276 1276
1277 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1277 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1278 memcpy(mddev->uuid, sb->set_uuid, 16); 1278 memcpy(mddev->uuid, sb->set_uuid, 16);
1279 1279
1280 mddev->max_disks = (4096-256)/2; 1280 mddev->max_disks = (4096-256)/2;
1281 1281
1282 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1282 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1283 mddev->bitmap_file == NULL ) 1283 mddev->bitmap_file == NULL )
1284 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset); 1284 mddev->bitmap_offset = (__s32)le32_to_cpu(sb->bitmap_offset);
1285 1285
1286 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1286 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1287 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1287 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1288 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1288 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1289 mddev->new_level = le32_to_cpu(sb->new_level); 1289 mddev->new_level = le32_to_cpu(sb->new_level);
1290 mddev->new_layout = le32_to_cpu(sb->new_layout); 1290 mddev->new_layout = le32_to_cpu(sb->new_layout);
1291 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1291 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1292 } else { 1292 } else {
1293 mddev->reshape_position = MaxSector; 1293 mddev->reshape_position = MaxSector;
1294 mddev->delta_disks = 0; 1294 mddev->delta_disks = 0;
1295 mddev->new_level = mddev->level; 1295 mddev->new_level = mddev->level;
1296 mddev->new_layout = mddev->layout; 1296 mddev->new_layout = mddev->layout;
1297 mddev->new_chunk_sectors = mddev->chunk_sectors; 1297 mddev->new_chunk_sectors = mddev->chunk_sectors;
1298 } 1298 }
1299 1299
1300 } else if (mddev->pers == NULL) { 1300 } else if (mddev->pers == NULL) {
1301 /* Insist on a good event counter while assembling */ 1301 /* Insist on a good event counter while assembling */
1302 ++ev1; 1302 ++ev1;
1303 if (ev1 < mddev->events) 1303 if (ev1 < mddev->events)
1304 return -EINVAL; 1304 return -EINVAL;
1305 } else if (mddev->bitmap) { 1305 } else if (mddev->bitmap) {
1306 /* If adding to array with a bitmap, then we can accept an 1306 /* If adding to array with a bitmap, then we can accept an
1307 * older device, but not too old. 1307 * older device, but not too old.
1308 */ 1308 */
1309 if (ev1 < mddev->bitmap->events_cleared) 1309 if (ev1 < mddev->bitmap->events_cleared)
1310 return 0; 1310 return 0;
1311 } else { 1311 } else {
1312 if (ev1 < mddev->events) 1312 if (ev1 < mddev->events)
1313 /* just a hot-add of a new device, leave raid_disk at -1 */ 1313 /* just a hot-add of a new device, leave raid_disk at -1 */
1314 return 0; 1314 return 0;
1315 } 1315 }
1316 if (mddev->level != LEVEL_MULTIPATH) { 1316 if (mddev->level != LEVEL_MULTIPATH) {
1317 int role; 1317 int role;
1318 if (rdev->desc_nr < 0 || 1318 if (rdev->desc_nr < 0 ||
1319 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1319 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1320 role = 0xffff; 1320 role = 0xffff;
1321 rdev->desc_nr = -1; 1321 rdev->desc_nr = -1;
1322 } else 1322 } else
1323 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1323 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1324 switch(role) { 1324 switch(role) {
1325 case 0xffff: /* spare */ 1325 case 0xffff: /* spare */
1326 break; 1326 break;
1327 case 0xfffe: /* faulty */ 1327 case 0xfffe: /* faulty */
1328 set_bit(Faulty, &rdev->flags); 1328 set_bit(Faulty, &rdev->flags);
1329 break; 1329 break;
1330 default: 1330 default:
1331 if ((le32_to_cpu(sb->feature_map) & 1331 if ((le32_to_cpu(sb->feature_map) &
1332 MD_FEATURE_RECOVERY_OFFSET)) 1332 MD_FEATURE_RECOVERY_OFFSET))
1333 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1333 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1334 else 1334 else
1335 set_bit(In_sync, &rdev->flags); 1335 set_bit(In_sync, &rdev->flags);
1336 rdev->raid_disk = role; 1336 rdev->raid_disk = role;
1337 break; 1337 break;
1338 } 1338 }
1339 if (sb->devflags & WriteMostly1) 1339 if (sb->devflags & WriteMostly1)
1340 set_bit(WriteMostly, &rdev->flags); 1340 set_bit(WriteMostly, &rdev->flags);
1341 } else /* MULTIPATH devices are always in_sync */ 1341 } else /* MULTIPATH devices are always in_sync */
1342 set_bit(In_sync, &rdev->flags); 1342 set_bit(In_sync, &rdev->flags);
1343 1343
1344 return 0; 1344 return 0;
1345 } 1345 }
1346 1346
1347 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) 1347 static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1348 { 1348 {
1349 struct mdp_superblock_1 *sb; 1349 struct mdp_superblock_1 *sb;
1350 mdk_rdev_t *rdev2; 1350 mdk_rdev_t *rdev2;
1351 int max_dev, i; 1351 int max_dev, i;
1352 /* make rdev->sb match mddev and rdev data. */ 1352 /* make rdev->sb match mddev and rdev data. */
1353 1353
1354 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1354 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page);
1355 1355
1356 sb->feature_map = 0; 1356 sb->feature_map = 0;
1357 sb->pad0 = 0; 1357 sb->pad0 = 0;
1358 sb->recovery_offset = cpu_to_le64(0); 1358 sb->recovery_offset = cpu_to_le64(0);
1359 memset(sb->pad1, 0, sizeof(sb->pad1)); 1359 memset(sb->pad1, 0, sizeof(sb->pad1));
1360 memset(sb->pad2, 0, sizeof(sb->pad2)); 1360 memset(sb->pad2, 0, sizeof(sb->pad2));
1361 memset(sb->pad3, 0, sizeof(sb->pad3)); 1361 memset(sb->pad3, 0, sizeof(sb->pad3));
1362 1362
1363 sb->utime = cpu_to_le64((__u64)mddev->utime); 1363 sb->utime = cpu_to_le64((__u64)mddev->utime);
1364 sb->events = cpu_to_le64(mddev->events); 1364 sb->events = cpu_to_le64(mddev->events);
1365 if (mddev->in_sync) 1365 if (mddev->in_sync)
1366 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1366 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1367 else 1367 else
1368 sb->resync_offset = cpu_to_le64(0); 1368 sb->resync_offset = cpu_to_le64(0);
1369 1369
1370 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1370 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1371 1371
1372 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1372 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1373 sb->size = cpu_to_le64(mddev->dev_sectors); 1373 sb->size = cpu_to_le64(mddev->dev_sectors);
1374 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1374 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1375 sb->level = cpu_to_le32(mddev->level); 1375 sb->level = cpu_to_le32(mddev->level);
1376 sb->layout = cpu_to_le32(mddev->layout); 1376 sb->layout = cpu_to_le32(mddev->layout);
1377 1377
1378 if (mddev->bitmap && mddev->bitmap_file == NULL) { 1378 if (mddev->bitmap && mddev->bitmap_file == NULL) {
1379 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset); 1379 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_offset);
1380 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1380 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1381 } 1381 }
1382 1382
1383 if (rdev->raid_disk >= 0 && 1383 if (rdev->raid_disk >= 0 &&
1384 !test_bit(In_sync, &rdev->flags)) { 1384 !test_bit(In_sync, &rdev->flags)) {
1385 if (mddev->curr_resync_completed > rdev->recovery_offset) 1385 if (mddev->curr_resync_completed > rdev->recovery_offset)
1386 rdev->recovery_offset = mddev->curr_resync_completed; 1386 rdev->recovery_offset = mddev->curr_resync_completed;
1387 if (rdev->recovery_offset > 0) { 1387 if (rdev->recovery_offset > 0) {
1388 sb->feature_map |= 1388 sb->feature_map |=
1389 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1389 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1390 sb->recovery_offset = 1390 sb->recovery_offset =
1391 cpu_to_le64(rdev->recovery_offset); 1391 cpu_to_le64(rdev->recovery_offset);
1392 } 1392 }
1393 } 1393 }
1394 1394
1395 if (mddev->reshape_position != MaxSector) { 1395 if (mddev->reshape_position != MaxSector) {
1396 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1396 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1397 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1397 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1398 sb->new_layout = cpu_to_le32(mddev->new_layout); 1398 sb->new_layout = cpu_to_le32(mddev->new_layout);
1399 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1399 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1400 sb->new_level = cpu_to_le32(mddev->new_level); 1400 sb->new_level = cpu_to_le32(mddev->new_level);
1401 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1401 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1402 } 1402 }
1403 1403
1404 max_dev = 0; 1404 max_dev = 0;
1405 list_for_each_entry(rdev2, &mddev->disks, same_set) 1405 list_for_each_entry(rdev2, &mddev->disks, same_set)
1406 if (rdev2->desc_nr+1 > max_dev) 1406 if (rdev2->desc_nr+1 > max_dev)
1407 max_dev = rdev2->desc_nr+1; 1407 max_dev = rdev2->desc_nr+1;
1408 1408
1409 if (max_dev > le32_to_cpu(sb->max_dev)) { 1409 if (max_dev > le32_to_cpu(sb->max_dev)) {
1410 int bmask; 1410 int bmask;
1411 sb->max_dev = cpu_to_le32(max_dev); 1411 sb->max_dev = cpu_to_le32(max_dev);
1412 rdev->sb_size = max_dev * 2 + 256; 1412 rdev->sb_size = max_dev * 2 + 256;
1413 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1413 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1414 if (rdev->sb_size & bmask) 1414 if (rdev->sb_size & bmask)
1415 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1415 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1416 } 1416 }
1417 for (i=0; i<max_dev;i++) 1417 for (i=0; i<max_dev;i++)
1418 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1418 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1419 1419
1420 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1420 list_for_each_entry(rdev2, &mddev->disks, same_set) {
1421 i = rdev2->desc_nr; 1421 i = rdev2->desc_nr;
1422 if (test_bit(Faulty, &rdev2->flags)) 1422 if (test_bit(Faulty, &rdev2->flags))
1423 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1423 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1424 else if (test_bit(In_sync, &rdev2->flags)) 1424 else if (test_bit(In_sync, &rdev2->flags))
1425 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1425 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1426 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0) 1426 else if (rdev2->raid_disk >= 0 && rdev2->recovery_offset > 0)
1427 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1427 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1428 else 1428 else
1429 sb->dev_roles[i] = cpu_to_le16(0xffff); 1429 sb->dev_roles[i] = cpu_to_le16(0xffff);
1430 } 1430 }
1431 1431
1432 sb->sb_csum = calc_sb_1_csum(sb); 1432 sb->sb_csum = calc_sb_1_csum(sb);
1433 } 1433 }
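
Both super_1_validate() and super_1_sync() lean on the dev_roles[] convention of the v1 superblock: 0xffff marks a spare slot, 0xfffe a faulty one, and any other value is the raid_disk the device occupies (possibly still recovering). A small standalone decoder, sketched here only to make that mapping explicit (the enum names are invented, not md's):

#include <stdint.h>

/* Illustrative decoder for v1 superblock dev_roles[] entries as
 * written by super_1_sync(): 0xffff = spare, 0xfffe = faulty,
 * anything else is the raid_disk slot the device holds.
 */
enum md_role { MD_ROLE_SPARE, MD_ROLE_FAULTY, MD_ROLE_ACTIVE };

static enum md_role decode_role(uint16_t role, int *raid_disk)
{
	switch (role) {
	case 0xffff:
		return MD_ROLE_SPARE;
	case 0xfffe:
		return MD_ROLE_FAULTY;
	default:
		*raid_disk = role;
		return MD_ROLE_ACTIVE;
	}
}
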
1434 1434
1435 static unsigned long long 1435 static unsigned long long
1436 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) 1436 super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1437 { 1437 {
1438 struct mdp_superblock_1 *sb; 1438 struct mdp_superblock_1 *sb;
1439 sector_t max_sectors; 1439 sector_t max_sectors;
1440 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1440 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1441 return 0; /* component must fit device */ 1441 return 0; /* component must fit device */
1442 if (rdev->sb_start < rdev->data_offset) { 1442 if (rdev->sb_start < rdev->data_offset) {
1443 /* minor versions 1 and 2; superblock before data */ 1443 /* minor versions 1 and 2; superblock before data */
1444 max_sectors = rdev->bdev->bd_inode->i_size >> 9; 1444 max_sectors = rdev->bdev->bd_inode->i_size >> 9;
1445 max_sectors -= rdev->data_offset; 1445 max_sectors -= rdev->data_offset;
1446 if (!num_sectors || num_sectors > max_sectors) 1446 if (!num_sectors || num_sectors > max_sectors)
1447 num_sectors = max_sectors; 1447 num_sectors = max_sectors;
1448 } else if (rdev->mddev->bitmap_offset) { 1448 } else if (rdev->mddev->bitmap_offset) {
1449 /* minor version 0 with bitmap we can't move */ 1449 /* minor version 0 with bitmap we can't move */
1450 return 0; 1450 return 0;
1451 } else { 1451 } else {
1452 /* minor version 0; superblock after data */ 1452 /* minor version 0; superblock after data */
1453 sector_t sb_start; 1453 sector_t sb_start;
1454 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2; 1454 sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
1455 sb_start &= ~(sector_t)(4*2 - 1); 1455 sb_start &= ~(sector_t)(4*2 - 1);
1456 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1456 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1457 if (!num_sectors || num_sectors > max_sectors) 1457 if (!num_sectors || num_sectors > max_sectors)
1458 num_sectors = max_sectors; 1458 num_sectors = max_sectors;
1459 rdev->sb_start = sb_start; 1459 rdev->sb_start = sb_start;
1460 } 1460 }
1461 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1461 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page);
1462 sb->data_size = cpu_to_le64(num_sectors); 1462 sb->data_size = cpu_to_le64(num_sectors);
1463 sb->super_offset = rdev->sb_start; 1463 sb->super_offset = rdev->sb_start;
1464 sb->sb_csum = calc_sb_1_csum(sb); 1464 sb->sb_csum = calc_sb_1_csum(sb);
1465 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1465 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1466 rdev->sb_page); 1466 rdev->sb_page);
1467 md_super_wait(rdev->mddev); 1467 md_super_wait(rdev->mddev);
1468 return num_sectors / 2; /* kB for sysfs */ 1468 return num_sectors / 2; /* kB for sysfs */
1469 } 1469 }
1470 1470
1471 static struct super_type super_types[] = { 1471 static struct super_type super_types[] = {
1472 [0] = { 1472 [0] = {
1473 .name = "0.90.0", 1473 .name = "0.90.0",
1474 .owner = THIS_MODULE, 1474 .owner = THIS_MODULE,
1475 .load_super = super_90_load, 1475 .load_super = super_90_load,
1476 .validate_super = super_90_validate, 1476 .validate_super = super_90_validate,
1477 .sync_super = super_90_sync, 1477 .sync_super = super_90_sync,
1478 .rdev_size_change = super_90_rdev_size_change, 1478 .rdev_size_change = super_90_rdev_size_change,
1479 }, 1479 },
1480 [1] = { 1480 [1] = {
1481 .name = "md-1", 1481 .name = "md-1",
1482 .owner = THIS_MODULE, 1482 .owner = THIS_MODULE,
1483 .load_super = super_1_load, 1483 .load_super = super_1_load,
1484 .validate_super = super_1_validate, 1484 .validate_super = super_1_validate,
1485 .sync_super = super_1_sync, 1485 .sync_super = super_1_sync,
1486 .rdev_size_change = super_1_rdev_size_change, 1486 .rdev_size_change = super_1_rdev_size_change,
1487 }, 1487 },
1488 }; 1488 };
1489 1489
1490 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) 1490 static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2)
1491 { 1491 {
1492 mdk_rdev_t *rdev, *rdev2; 1492 mdk_rdev_t *rdev, *rdev2;
1493 1493
1494 rcu_read_lock(); 1494 rcu_read_lock();
1495 rdev_for_each_rcu(rdev, mddev1) 1495 rdev_for_each_rcu(rdev, mddev1)
1496 rdev_for_each_rcu(rdev2, mddev2) 1496 rdev_for_each_rcu(rdev2, mddev2)
1497 if (rdev->bdev->bd_contains == 1497 if (rdev->bdev->bd_contains ==
1498 rdev2->bdev->bd_contains) { 1498 rdev2->bdev->bd_contains) {
1499 rcu_read_unlock(); 1499 rcu_read_unlock();
1500 return 1; 1500 return 1;
1501 } 1501 }
1502 rcu_read_unlock(); 1502 rcu_read_unlock();
1503 return 0; 1503 return 0;
1504 } 1504 }
1505 1505
1506 static LIST_HEAD(pending_raid_disks); 1506 static LIST_HEAD(pending_raid_disks);
1507 1507
1508 /* 1508 /*
1509 * Try to register data integrity profile for an mddev 1509 * Try to register data integrity profile for an mddev
1510 * 1510 *
1511 * This is called when an array is started and after a disk has been kicked 1511 * This is called when an array is started and after a disk has been kicked
1512 * from the array. It only succeeds if all working and active component devices 1512 * from the array. It only succeeds if all working and active component devices
1513 * are integrity capable with matching profiles. 1513 * are integrity capable with matching profiles.
1514 */ 1514 */
1515 int md_integrity_register(mddev_t *mddev) 1515 int md_integrity_register(mddev_t *mddev)
1516 { 1516 {
1517 mdk_rdev_t *rdev, *reference = NULL; 1517 mdk_rdev_t *rdev, *reference = NULL;
1518 1518
1519 if (list_empty(&mddev->disks)) 1519 if (list_empty(&mddev->disks))
1520 return 0; /* nothing to do */ 1520 return 0; /* nothing to do */
1521 if (blk_get_integrity(mddev->gendisk)) 1521 if (blk_get_integrity(mddev->gendisk))
1522 return 0; /* already registered */ 1522 return 0; /* already registered */
1523 list_for_each_entry(rdev, &mddev->disks, same_set) { 1523 list_for_each_entry(rdev, &mddev->disks, same_set) {
1524 /* skip spares and non-functional disks */ 1524 /* skip spares and non-functional disks */
1525 if (test_bit(Faulty, &rdev->flags)) 1525 if (test_bit(Faulty, &rdev->flags))
1526 continue; 1526 continue;
1527 if (rdev->raid_disk < 0) 1527 if (rdev->raid_disk < 0)
1528 continue; 1528 continue;
1529 /* 1529 /*
1530 * If at least one rdev is not integrity capable, we can not 1530 * If at least one rdev is not integrity capable, we can not
1531 * enable data integrity for the md device. 1531 * enable data integrity for the md device.
1532 */ 1532 */
1533 if (!bdev_get_integrity(rdev->bdev)) 1533 if (!bdev_get_integrity(rdev->bdev))
1534 return -EINVAL; 1534 return -EINVAL;
1535 if (!reference) { 1535 if (!reference) {
1536 /* Use the first rdev as the reference */ 1536 /* Use the first rdev as the reference */
1537 reference = rdev; 1537 reference = rdev;
1538 continue; 1538 continue;
1539 } 1539 }
1540 /* does this rdev's profile match the reference profile? */ 1540 /* does this rdev's profile match the reference profile? */
1541 if (blk_integrity_compare(reference->bdev->bd_disk, 1541 if (blk_integrity_compare(reference->bdev->bd_disk,
1542 rdev->bdev->bd_disk) < 0) 1542 rdev->bdev->bd_disk) < 0)
1543 return -EINVAL; 1543 return -EINVAL;
1544 } 1544 }
1545 /* 1545 /*
1546 * All component devices are integrity capable and have matching 1546 * All component devices are integrity capable and have matching
1547 * profiles, register the common profile for the md device. 1547 * profiles, register the common profile for the md device.
1548 */ 1548 */
1549 if (blk_integrity_register(mddev->gendisk, 1549 if (blk_integrity_register(mddev->gendisk,
1550 bdev_get_integrity(reference->bdev)) != 0) { 1550 bdev_get_integrity(reference->bdev)) != 0) {
1551 printk(KERN_ERR "md: failed to register integrity for %s\n", 1551 printk(KERN_ERR "md: failed to register integrity for %s\n",
1552 mdname(mddev)); 1552 mdname(mddev));
1553 return -EINVAL; 1553 return -EINVAL;
1554 } 1554 }
1555 printk(KERN_NOTICE "md: data integrity on %s enabled\n", 1555 printk(KERN_NOTICE "md: data integrity on %s enabled\n",
1556 mdname(mddev)); 1556 mdname(mddev));
1557 return 0; 1557 return 0;
1558 } 1558 }
1559 EXPORT_SYMBOL(md_integrity_register); 1559 EXPORT_SYMBOL(md_integrity_register);
1560 1560
1561 /* Disable data integrity if non-capable/non-matching disk is being added */ 1561 /* Disable data integrity if non-capable/non-matching disk is being added */
1562 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 1562 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
1563 { 1563 {
1564 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); 1564 struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev);
1565 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); 1565 struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk);
1566 1566
1567 if (!bi_mddev) /* nothing to do */ 1567 if (!bi_mddev) /* nothing to do */
1568 return; 1568 return;
1569 if (rdev->raid_disk < 0) /* skip spares */ 1569 if (rdev->raid_disk < 0) /* skip spares */
1570 return; 1570 return;
1571 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 1571 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
1572 rdev->bdev->bd_disk) >= 0) 1572 rdev->bdev->bd_disk) >= 0)
1573 return; 1573 return;
1574 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 1574 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
1575 blk_integrity_unregister(mddev->gendisk); 1575 blk_integrity_unregister(mddev->gendisk);
1576 } 1576 }
1577 EXPORT_SYMBOL(md_integrity_add_rdev); 1577 EXPORT_SYMBOL(md_integrity_add_rdev);
1578 1578
1579 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) 1579 static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1580 { 1580 {
1581 char b[BDEVNAME_SIZE]; 1581 char b[BDEVNAME_SIZE];
1582 struct kobject *ko; 1582 struct kobject *ko;
1583 char *s; 1583 char *s;
1584 int err; 1584 int err;
1585 1585
1586 if (rdev->mddev) { 1586 if (rdev->mddev) {
1587 MD_BUG(); 1587 MD_BUG();
1588 return -EINVAL; 1588 return -EINVAL;
1589 } 1589 }
1590 1590
1591 /* prevent duplicates */ 1591 /* prevent duplicates */
1592 if (find_rdev(mddev, rdev->bdev->bd_dev)) 1592 if (find_rdev(mddev, rdev->bdev->bd_dev))
1593 return -EEXIST; 1593 return -EEXIST;
1594 1594
1595 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 1595 /* make sure rdev->sectors exceeds mddev->dev_sectors */
1596 if (rdev->sectors && (mddev->dev_sectors == 0 || 1596 if (rdev->sectors && (mddev->dev_sectors == 0 ||
1597 rdev->sectors < mddev->dev_sectors)) { 1597 rdev->sectors < mddev->dev_sectors)) {
1598 if (mddev->pers) { 1598 if (mddev->pers) {
1599 /* Cannot change size, so fail 1599 /* Cannot change size, so fail
1600 * If mddev->level <= 0, then we don't care 1600 * If mddev->level <= 0, then we don't care
1601 * about aligning sizes (e.g. linear) 1601 * about aligning sizes (e.g. linear)
1602 */ 1602 */
1603 if (mddev->level > 0) 1603 if (mddev->level > 0)
1604 return -ENOSPC; 1604 return -ENOSPC;
1605 } else 1605 } else
1606 mddev->dev_sectors = rdev->sectors; 1606 mddev->dev_sectors = rdev->sectors;
1607 } 1607 }
1608 1608
1609 /* Verify rdev->desc_nr is unique. 1609 /* Verify rdev->desc_nr is unique.
1610 * If it is -1, assign a free number, else 1610 * If it is -1, assign a free number, else
1611 * check number is not in use 1611 * check number is not in use
1612 */ 1612 */
1613 if (rdev->desc_nr < 0) { 1613 if (rdev->desc_nr < 0) {
1614 int choice = 0; 1614 int choice = 0;
1615 if (mddev->pers) choice = mddev->raid_disks; 1615 if (mddev->pers) choice = mddev->raid_disks;
1616 while (find_rdev_nr(mddev, choice)) 1616 while (find_rdev_nr(mddev, choice))
1617 choice++; 1617 choice++;
1618 rdev->desc_nr = choice; 1618 rdev->desc_nr = choice;
1619 } else { 1619 } else {
1620 if (find_rdev_nr(mddev, rdev->desc_nr)) 1620 if (find_rdev_nr(mddev, rdev->desc_nr))
1621 return -EBUSY; 1621 return -EBUSY;
1622 } 1622 }
1623 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 1623 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
1624 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 1624 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
1625 mdname(mddev), mddev->max_disks); 1625 mdname(mddev), mddev->max_disks);
1626 return -EBUSY; 1626 return -EBUSY;
1627 } 1627 }
1628 bdevname(rdev->bdev,b); 1628 bdevname(rdev->bdev,b);
1629 while ( (s=strchr(b, '/')) != NULL) 1629 while ( (s=strchr(b, '/')) != NULL)
1630 *s = '!'; 1630 *s = '!';
1631 1631
1632 rdev->mddev = mddev; 1632 rdev->mddev = mddev;
1633 printk(KERN_INFO "md: bind<%s>\n", b); 1633 printk(KERN_INFO "md: bind<%s>\n", b);
1634 1634
1635 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 1635 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
1636 goto fail; 1636 goto fail;
1637 1637
1638 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 1638 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
1639 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) { 1639 if ((err = sysfs_create_link(&rdev->kobj, ko, "block"))) {
1640 kobject_del(&rdev->kobj); 1640 kobject_del(&rdev->kobj);
1641 goto fail; 1641 goto fail;
1642 } 1642 }
1643 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state"); 1643 rdev->sysfs_state = sysfs_get_dirent(rdev->kobj.sd, "state");
1644 1644
1645 list_add_rcu(&rdev->same_set, &mddev->disks); 1645 list_add_rcu(&rdev->same_set, &mddev->disks);
1646 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk); 1646 bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
1647 1647
1648 /* May as well allow recovery to be retried once */ 1648 /* May as well allow recovery to be retried once */
1649 mddev->recovery_disabled = 0; 1649 mddev->recovery_disabled = 0;
1650 1650
1651 return 0; 1651 return 0;
1652 1652
1653 fail: 1653 fail:
1654 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 1654 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
1655 b, mdname(mddev)); 1655 b, mdname(mddev));
1656 return err; 1656 return err;
1657 } 1657 }
1658 1658
1659 static void md_delayed_delete(struct work_struct *ws) 1659 static void md_delayed_delete(struct work_struct *ws)
1660 { 1660 {
1661 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work); 1661 mdk_rdev_t *rdev = container_of(ws, mdk_rdev_t, del_work);
1662 kobject_del(&rdev->kobj); 1662 kobject_del(&rdev->kobj);
1663 kobject_put(&rdev->kobj); 1663 kobject_put(&rdev->kobj);
1664 } 1664 }
1665 1665
1666 static void unbind_rdev_from_array(mdk_rdev_t * rdev) 1666 static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1667 { 1667 {
1668 char b[BDEVNAME_SIZE]; 1668 char b[BDEVNAME_SIZE];
1669 if (!rdev->mddev) { 1669 if (!rdev->mddev) {
1670 MD_BUG(); 1670 MD_BUG();
1671 return; 1671 return;
1672 } 1672 }
1673 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk); 1673 bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
1674 list_del_rcu(&rdev->same_set); 1674 list_del_rcu(&rdev->same_set);
1675 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 1675 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
1676 rdev->mddev = NULL; 1676 rdev->mddev = NULL;
1677 sysfs_remove_link(&rdev->kobj, "block"); 1677 sysfs_remove_link(&rdev->kobj, "block");
1678 sysfs_put(rdev->sysfs_state); 1678 sysfs_put(rdev->sysfs_state);
1679 rdev->sysfs_state = NULL; 1679 rdev->sysfs_state = NULL;
1680 /* We need to delay this, otherwise we can deadlock when 1680 /* We need to delay this, otherwise we can deadlock when
1681 * writing 'remove' to "dev/state". We also need 1681 * writing 'remove' to "dev/state". We also need
1682 * to delay it due to rcu usage. 1682 * to delay it due to rcu usage.
1683 */ 1683 */
1684 synchronize_rcu(); 1684 synchronize_rcu();
1685 INIT_WORK(&rdev->del_work, md_delayed_delete); 1685 INIT_WORK(&rdev->del_work, md_delayed_delete);
1686 kobject_get(&rdev->kobj); 1686 kobject_get(&rdev->kobj);
1687 schedule_work(&rdev->del_work); 1687 schedule_work(&rdev->del_work);
1688 } 1688 }
1689 1689
1690 /* 1690 /*
1691 * prevent the device from being mounted, repartitioned or 1691 * prevent the device from being mounted, repartitioned or
1692 * otherwise reused by a RAID array (or any other kernel 1692 * otherwise reused by a RAID array (or any other kernel
1693 * subsystem), by bd_claiming the device. 1693 * subsystem), by bd_claiming the device.
1694 */ 1694 */
1695 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared) 1695 static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
1696 { 1696 {
1697 int err = 0; 1697 int err = 0;
1698 struct block_device *bdev; 1698 struct block_device *bdev;
1699 char b[BDEVNAME_SIZE]; 1699 char b[BDEVNAME_SIZE];
1700 1700
1701 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE); 1701 bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
1702 if (IS_ERR(bdev)) { 1702 if (IS_ERR(bdev)) {
1703 printk(KERN_ERR "md: could not open %s.\n", 1703 printk(KERN_ERR "md: could not open %s.\n",
1704 __bdevname(dev, b)); 1704 __bdevname(dev, b));
1705 return PTR_ERR(bdev); 1705 return PTR_ERR(bdev);
1706 } 1706 }
1707 err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev); 1707 err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
1708 if (err) { 1708 if (err) {
1709 printk(KERN_ERR "md: could not bd_claim %s.\n", 1709 printk(KERN_ERR "md: could not bd_claim %s.\n",
1710 bdevname(bdev, b)); 1710 bdevname(bdev, b));
1711 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1711 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1712 return err; 1712 return err;
1713 } 1713 }
1714 if (!shared) 1714 if (!shared)
1715 set_bit(AllReserved, &rdev->flags); 1715 set_bit(AllReserved, &rdev->flags);
1716 rdev->bdev = bdev; 1716 rdev->bdev = bdev;
1717 return err; 1717 return err;
1718 } 1718 }
1719 1719
1720 static void unlock_rdev(mdk_rdev_t *rdev) 1720 static void unlock_rdev(mdk_rdev_t *rdev)
1721 { 1721 {
1722 struct block_device *bdev = rdev->bdev; 1722 struct block_device *bdev = rdev->bdev;
1723 rdev->bdev = NULL; 1723 rdev->bdev = NULL;
1724 if (!bdev) 1724 if (!bdev)
1725 MD_BUG(); 1725 MD_BUG();
1726 bd_release(bdev); 1726 bd_release(bdev);
1727 blkdev_put(bdev, FMODE_READ|FMODE_WRITE); 1727 blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
1728 } 1728 }
1729 1729
1730 void md_autodetect_dev(dev_t dev); 1730 void md_autodetect_dev(dev_t dev);
1731 1731
1732 static void export_rdev(mdk_rdev_t * rdev) 1732 static void export_rdev(mdk_rdev_t * rdev)
1733 { 1733 {
1734 char b[BDEVNAME_SIZE]; 1734 char b[BDEVNAME_SIZE];
1735 printk(KERN_INFO "md: export_rdev(%s)\n", 1735 printk(KERN_INFO "md: export_rdev(%s)\n",
1736 bdevname(rdev->bdev,b)); 1736 bdevname(rdev->bdev,b));
1737 if (rdev->mddev) 1737 if (rdev->mddev)
1738 MD_BUG(); 1738 MD_BUG();
1739 free_disk_sb(rdev); 1739 free_disk_sb(rdev);
1740 #ifndef MODULE 1740 #ifndef MODULE
1741 if (test_bit(AutoDetected, &rdev->flags)) 1741 if (test_bit(AutoDetected, &rdev->flags))
1742 md_autodetect_dev(rdev->bdev->bd_dev); 1742 md_autodetect_dev(rdev->bdev->bd_dev);
1743 #endif 1743 #endif
1744 unlock_rdev(rdev); 1744 unlock_rdev(rdev);
1745 kobject_put(&rdev->kobj); 1745 kobject_put(&rdev->kobj);
1746 } 1746 }
1747 1747
1748 static void kick_rdev_from_array(mdk_rdev_t * rdev) 1748 static void kick_rdev_from_array(mdk_rdev_t * rdev)
1749 { 1749 {
1750 unbind_rdev_from_array(rdev); 1750 unbind_rdev_from_array(rdev);
1751 export_rdev(rdev); 1751 export_rdev(rdev);
1752 } 1752 }
1753 1753
1754 static void export_array(mddev_t *mddev) 1754 static void export_array(mddev_t *mddev)
1755 { 1755 {
1756 mdk_rdev_t *rdev, *tmp; 1756 mdk_rdev_t *rdev, *tmp;
1757 1757
1758 rdev_for_each(rdev, tmp, mddev) { 1758 rdev_for_each(rdev, tmp, mddev) {
1759 if (!rdev->mddev) { 1759 if (!rdev->mddev) {
1760 MD_BUG(); 1760 MD_BUG();
1761 continue; 1761 continue;
1762 } 1762 }
1763 kick_rdev_from_array(rdev); 1763 kick_rdev_from_array(rdev);
1764 } 1764 }
1765 if (!list_empty(&mddev->disks)) 1765 if (!list_empty(&mddev->disks))
1766 MD_BUG(); 1766 MD_BUG();
1767 mddev->raid_disks = 0; 1767 mddev->raid_disks = 0;
1768 mddev->major_version = 0; 1768 mddev->major_version = 0;
1769 } 1769 }
1770 1770
1771 static void print_desc(mdp_disk_t *desc) 1771 static void print_desc(mdp_disk_t *desc)
1772 { 1772 {
1773 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 1773 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
1774 desc->major,desc->minor,desc->raid_disk,desc->state); 1774 desc->major,desc->minor,desc->raid_disk,desc->state);
1775 } 1775 }
1776 1776
1777 static void print_sb_90(mdp_super_t *sb) 1777 static void print_sb_90(mdp_super_t *sb)
1778 { 1778 {
1779 int i; 1779 int i;
1780 1780
1781 printk(KERN_INFO 1781 printk(KERN_INFO
1782 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 1782 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
1783 sb->major_version, sb->minor_version, sb->patch_version, 1783 sb->major_version, sb->minor_version, sb->patch_version,
1784 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 1784 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
1785 sb->ctime); 1785 sb->ctime);
1786 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 1786 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
1787 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 1787 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
1788 sb->md_minor, sb->layout, sb->chunk_size); 1788 sb->md_minor, sb->layout, sb->chunk_size);
1789 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 1789 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
1790 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 1790 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
1791 sb->utime, sb->state, sb->active_disks, sb->working_disks, 1791 sb->utime, sb->state, sb->active_disks, sb->working_disks,
1792 sb->failed_disks, sb->spare_disks, 1792 sb->failed_disks, sb->spare_disks,
1793 sb->sb_csum, (unsigned long)sb->events_lo); 1793 sb->sb_csum, (unsigned long)sb->events_lo);
1794 1794
1795 printk(KERN_INFO); 1795 printk(KERN_INFO);
1796 for (i = 0; i < MD_SB_DISKS; i++) { 1796 for (i = 0; i < MD_SB_DISKS; i++) {
1797 mdp_disk_t *desc; 1797 mdp_disk_t *desc;
1798 1798
1799 desc = sb->disks + i; 1799 desc = sb->disks + i;
1800 if (desc->number || desc->major || desc->minor || 1800 if (desc->number || desc->major || desc->minor ||
1801 desc->raid_disk || (desc->state && (desc->state != 4))) { 1801 desc->raid_disk || (desc->state && (desc->state != 4))) {
1802 printk(" D %2d: ", i); 1802 printk(" D %2d: ", i);
1803 print_desc(desc); 1803 print_desc(desc);
1804 } 1804 }
1805 } 1805 }
1806 printk(KERN_INFO "md: THIS: "); 1806 printk(KERN_INFO "md: THIS: ");
1807 print_desc(&sb->this_disk); 1807 print_desc(&sb->this_disk);
1808 } 1808 }
1809 1809
1810 static void print_sb_1(struct mdp_superblock_1 *sb) 1810 static void print_sb_1(struct mdp_superblock_1 *sb)
1811 { 1811 {
1812 __u8 *uuid; 1812 __u8 *uuid;
1813 1813
1814 uuid = sb->set_uuid; 1814 uuid = sb->set_uuid;
1815 printk(KERN_INFO 1815 printk(KERN_INFO
1816 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x" 1816 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%02x%02x%02x%02x"
1817 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n" 1817 ":%02x%02x:%02x%02x:%02x%02x:%02x%02x%02x%02x%02x%02x>\n"
1818 "md: Name: \"%s\" CT:%llu\n", 1818 "md: Name: \"%s\" CT:%llu\n",
1819 le32_to_cpu(sb->major_version), 1819 le32_to_cpu(sb->major_version),
1820 le32_to_cpu(sb->feature_map), 1820 le32_to_cpu(sb->feature_map),
1821 uuid[0], uuid[1], uuid[2], uuid[3], 1821 uuid[0], uuid[1], uuid[2], uuid[3],
1822 uuid[4], uuid[5], uuid[6], uuid[7], 1822 uuid[4], uuid[5], uuid[6], uuid[7],
1823 uuid[8], uuid[9], uuid[10], uuid[11], 1823 uuid[8], uuid[9], uuid[10], uuid[11],
1824 uuid[12], uuid[13], uuid[14], uuid[15], 1824 uuid[12], uuid[13], uuid[14], uuid[15],
1825 sb->set_name, 1825 sb->set_name,
1826 (unsigned long long)le64_to_cpu(sb->ctime) 1826 (unsigned long long)le64_to_cpu(sb->ctime)
1827 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 1827 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
1828 1828
1829 uuid = sb->device_uuid; 1829 uuid = sb->device_uuid;
1830 printk(KERN_INFO 1830 printk(KERN_INFO
1831 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 1831 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
1832 " RO:%llu\n" 1832 " RO:%llu\n"
1833 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x" 1833 "md: Dev:%08x UUID: %02x%02x%02x%02x:%02x%02x:%02x%02x:%02x%02x"
1834 ":%02x%02x%02x%02x%02x%02x\n" 1834 ":%02x%02x%02x%02x%02x%02x\n"
1835 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 1835 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
1836 "md: (MaxDev:%u) \n", 1836 "md: (MaxDev:%u) \n",
1837 le32_to_cpu(sb->level), 1837 le32_to_cpu(sb->level),
1838 (unsigned long long)le64_to_cpu(sb->size), 1838 (unsigned long long)le64_to_cpu(sb->size),
1839 le32_to_cpu(sb->raid_disks), 1839 le32_to_cpu(sb->raid_disks),
1840 le32_to_cpu(sb->layout), 1840 le32_to_cpu(sb->layout),
1841 le32_to_cpu(sb->chunksize), 1841 le32_to_cpu(sb->chunksize),
1842 (unsigned long long)le64_to_cpu(sb->data_offset), 1842 (unsigned long long)le64_to_cpu(sb->data_offset),
1843 (unsigned long long)le64_to_cpu(sb->data_size), 1843 (unsigned long long)le64_to_cpu(sb->data_size),
1844 (unsigned long long)le64_to_cpu(sb->super_offset), 1844 (unsigned long long)le64_to_cpu(sb->super_offset),
1845 (unsigned long long)le64_to_cpu(sb->recovery_offset), 1845 (unsigned long long)le64_to_cpu(sb->recovery_offset),
1846 le32_to_cpu(sb->dev_number), 1846 le32_to_cpu(sb->dev_number),
1847 uuid[0], uuid[1], uuid[2], uuid[3], 1847 uuid[0], uuid[1], uuid[2], uuid[3],
1848 uuid[4], uuid[5], uuid[6], uuid[7], 1848 uuid[4], uuid[5], uuid[6], uuid[7],
1849 uuid[8], uuid[9], uuid[10], uuid[11], 1849 uuid[8], uuid[9], uuid[10], uuid[11],
1850 uuid[12], uuid[13], uuid[14], uuid[15], 1850 uuid[12], uuid[13], uuid[14], uuid[15],
1851 sb->devflags, 1851 sb->devflags,
1852 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 1852 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
1853 (unsigned long long)le64_to_cpu(sb->events), 1853 (unsigned long long)le64_to_cpu(sb->events),
1854 (unsigned long long)le64_to_cpu(sb->resync_offset), 1854 (unsigned long long)le64_to_cpu(sb->resync_offset),
1855 le32_to_cpu(sb->sb_csum), 1855 le32_to_cpu(sb->sb_csum),
1856 le32_to_cpu(sb->max_dev) 1856 le32_to_cpu(sb->max_dev)
1857 ); 1857 );
1858 } 1858 }
1859 1859
1860 static void print_rdev(mdk_rdev_t *rdev, int major_version) 1860 static void print_rdev(mdk_rdev_t *rdev, int major_version)
1861 { 1861 {
1862 char b[BDEVNAME_SIZE]; 1862 char b[BDEVNAME_SIZE];
1863 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 1863 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
1864 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 1864 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
1865 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 1865 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
1866 rdev->desc_nr); 1866 rdev->desc_nr);
1867 if (rdev->sb_loaded) { 1867 if (rdev->sb_loaded) {
1868 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 1868 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
1869 switch (major_version) { 1869 switch (major_version) {
1870 case 0: 1870 case 0:
1871 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 1871 print_sb_90((mdp_super_t*)page_address(rdev->sb_page));
1872 break; 1872 break;
1873 case 1: 1873 case 1:
1874 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 1874 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page));
1875 break; 1875 break;
1876 } 1876 }
1877 } else 1877 } else
1878 printk(KERN_INFO "md: no rdev superblock!\n"); 1878 printk(KERN_INFO "md: no rdev superblock!\n");
1879 } 1879 }
1880 1880
1881 static void md_print_devices(void) 1881 static void md_print_devices(void)
1882 { 1882 {
1883 struct list_head *tmp; 1883 struct list_head *tmp;
1884 mdk_rdev_t *rdev; 1884 mdk_rdev_t *rdev;
1885 mddev_t *mddev; 1885 mddev_t *mddev;
1886 char b[BDEVNAME_SIZE]; 1886 char b[BDEVNAME_SIZE];
1887 1887
1888 printk("\n"); 1888 printk("\n");
1889 printk("md: **********************************\n"); 1889 printk("md: **********************************\n");
1890 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 1890 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
1891 printk("md: **********************************\n"); 1891 printk("md: **********************************\n");
1892 for_each_mddev(mddev, tmp) { 1892 for_each_mddev(mddev, tmp) {
1893 1893
1894 if (mddev->bitmap) 1894 if (mddev->bitmap)
1895 bitmap_print_sb(mddev->bitmap); 1895 bitmap_print_sb(mddev->bitmap);
1896 else 1896 else
1897 printk("%s: ", mdname(mddev)); 1897 printk("%s: ", mdname(mddev));
1898 list_for_each_entry(rdev, &mddev->disks, same_set) 1898 list_for_each_entry(rdev, &mddev->disks, same_set)
1899 printk("<%s>", bdevname(rdev->bdev,b)); 1899 printk("<%s>", bdevname(rdev->bdev,b));
1900 printk("\n"); 1900 printk("\n");
1901 1901
1902 list_for_each_entry(rdev, &mddev->disks, same_set) 1902 list_for_each_entry(rdev, &mddev->disks, same_set)
1903 print_rdev(rdev, mddev->major_version); 1903 print_rdev(rdev, mddev->major_version);
1904 } 1904 }
1905 printk("md: **********************************\n"); 1905 printk("md: **********************************\n");
1906 printk("\n"); 1906 printk("\n");
1907 } 1907 }
1908 1908
1909 1909
1910 static void sync_sbs(mddev_t * mddev, int nospares) 1910 static void sync_sbs(mddev_t * mddev, int nospares)
1911 { 1911 {
1912 /* Update each superblock (in-memory image), but 1912 /* Update each superblock (in-memory image), but
1913 * if we are allowed to, skip spares which already 1913 * if we are allowed to, skip spares which already
1914 * have the right event counter, or have one earlier 1914 * have the right event counter, or have one earlier
1915 * (which would mean they aren't being marked as dirty 1915 * (which would mean they aren't being marked as dirty
1916 * with the rest of the array) 1916 * with the rest of the array)
1917 */ 1917 */
1918 mdk_rdev_t *rdev; 1918 mdk_rdev_t *rdev;
1919 1919
1920 list_for_each_entry(rdev, &mddev->disks, same_set) { 1920 list_for_each_entry(rdev, &mddev->disks, same_set) {
1921 if (rdev->sb_events == mddev->events || 1921 if (rdev->sb_events == mddev->events ||
1922 (nospares && 1922 (nospares &&
1923 rdev->raid_disk < 0 && 1923 rdev->raid_disk < 0 &&
1924 (rdev->sb_events&1)==0 && 1924 (rdev->sb_events&1)==0 &&
1925 rdev->sb_events+1 == mddev->events)) { 1925 rdev->sb_events+1 == mddev->events)) {
1926 /* Don't update this superblock */ 1926 /* Don't update this superblock */
1927 rdev->sb_loaded = 2; 1927 rdev->sb_loaded = 2;
1928 } else { 1928 } else {
1929 super_types[mddev->major_version]. 1929 super_types[mddev->major_version].
1930 sync_super(mddev, rdev); 1930 sync_super(mddev, rdev);
1931 rdev->sb_loaded = 1; 1931 rdev->sb_loaded = 1;
1932 } 1932 }
1933 } 1933 }
1934 } 1934 }
1935 1935
1936 static void md_update_sb(mddev_t * mddev, int force_change) 1936 static void md_update_sb(mddev_t * mddev, int force_change)
1937 { 1937 {
1938 mdk_rdev_t *rdev; 1938 mdk_rdev_t *rdev;
1939 int sync_req; 1939 int sync_req;
1940 int nospares = 0; 1940 int nospares = 0;
1941 1941
1942 mddev->utime = get_seconds(); 1942 mddev->utime = get_seconds();
1943 if (mddev->external) 1943 if (mddev->external)
1944 return; 1944 return;
1945 repeat: 1945 repeat:
1946 spin_lock_irq(&mddev->write_lock); 1946 spin_lock_irq(&mddev->write_lock);
1947 1947
1948 set_bit(MD_CHANGE_PENDING, &mddev->flags); 1948 set_bit(MD_CHANGE_PENDING, &mddev->flags);
1949 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 1949 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
1950 force_change = 1; 1950 force_change = 1;
1951 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 1951 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
1952 /* just a clean <-> dirty transition, possibly leave spares alone, 1952 /* just a clean <-> dirty transition, possibly leave spares alone,
1953 * though if the events count isn't the right even/odd, we will have to 1953 * though if the events count isn't the right even/odd, we will have to
1954 * update the spares after all 1954 * update the spares after all
1955 */ 1955 */
1956 nospares = 1; 1956 nospares = 1;
1957 if (force_change) 1957 if (force_change)
1958 nospares = 0; 1958 nospares = 0;
1959 if (mddev->degraded) 1959 if (mddev->degraded)
1960 /* If the array is degraded, then skipping spares is both 1960 /* If the array is degraded, then skipping spares is both
1961 * dangerous and fairly pointless. 1961 * dangerous and fairly pointless.
1962 * Dangerous because a device that was removed from the array 1962 * Dangerous because a device that was removed from the array
1963 * might have an event_count that still looks up-to-date, 1963 * might have an event_count that still looks up-to-date,
1964 * so it can be re-added without a resync. 1964 * so it can be re-added without a resync.
1965 * Pointless because if there are any spares to skip, 1965 * Pointless because if there are any spares to skip,
1966 * then a recovery will happen and soon that array won't 1966 * then a recovery will happen and soon that array won't
1967 * be degraded any more and the spare can go back to sleep then. 1967 * be degraded any more and the spare can go back to sleep then.
1968 */ 1968 */
1969 nospares = 0; 1969 nospares = 0;
1970 1970
1971 sync_req = mddev->in_sync; 1971 sync_req = mddev->in_sync;
1972 1972
1973 /* If this is just a dirty<->clean transition, and the array is clean 1973 /* If this is just a dirty<->clean transition, and the array is clean
1974 * and 'events' is odd, we can roll back to the previous clean state */ 1974 * and 'events' is odd, we can roll back to the previous clean state */
1975 if (nospares 1975 if (nospares
1976 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 1976 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
1977 && (mddev->events & 1) 1977 && (mddev->events & 1)
1978 && mddev->events != 1) 1978 && mddev->events != 1)
1979 mddev->events--; 1979 mddev->events--;
1980 else { 1980 else {
1981 /* otherwise we have to go forward and ... */ 1981 /* otherwise we have to go forward and ... */
1982 mddev->events ++; 1982 mddev->events ++;
1983 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */ 1983 if (!mddev->in_sync || mddev->recovery_cp != MaxSector) { /* not clean */
1984 /* .. if the array isn't clean, an 'even' event must also go 1984 /* .. if the array isn't clean, an 'even' event must also go
1985 * to spares. */ 1985 * to spares. */
1986 if ((mddev->events&1)==0) 1986 if ((mddev->events&1)==0)
1987 nospares = 0; 1987 nospares = 0;
1988 } else { 1988 } else {
1989 /* otherwise an 'odd' event must go to spares */ 1989 /* otherwise an 'odd' event must go to spares */
1990 if ((mddev->events&1)) 1990 if ((mddev->events&1))
1991 nospares = 0; 1991 nospares = 0;
1992 } 1992 }
1993 } 1993 }
1994 1994
1995 if (!mddev->events) { 1995 if (!mddev->events) {
1996 /* 1996 /*
1997 * oops, this 64-bit counter should never wrap. 1997 * oops, this 64-bit counter should never wrap.
1998 * Either we are in around ~1 trillion A.C., assuming 1998 * Either we are in around ~1 trillion A.C., assuming
1999 * 1 reboot per second, or we have a bug: 1999 * 1 reboot per second, or we have a bug:
2000 */ 2000 */
2001 MD_BUG(); 2001 MD_BUG();
2002 mddev->events --; 2002 mddev->events --;
2003 } 2003 }
2004 2004
2005 /* 2005 /*
2006 * do not write anything to disk if using 2006 * do not write anything to disk if using
2007 * nonpersistent superblocks 2007 * nonpersistent superblocks
2008 */ 2008 */
2009 if (!mddev->persistent) { 2009 if (!mddev->persistent) {
2010 if (!mddev->external) 2010 if (!mddev->external)
2011 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2011 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2012 2012
2013 spin_unlock_irq(&mddev->write_lock); 2013 spin_unlock_irq(&mddev->write_lock);
2014 wake_up(&mddev->sb_wait); 2014 wake_up(&mddev->sb_wait);
2015 return; 2015 return;
2016 } 2016 }
2017 sync_sbs(mddev, nospares); 2017 sync_sbs(mddev, nospares);
2018 spin_unlock_irq(&mddev->write_lock); 2018 spin_unlock_irq(&mddev->write_lock);
2019 2019
2020 dprintk(KERN_INFO 2020 dprintk(KERN_INFO
2021 "md: updating %s RAID superblock on device (in sync %d)\n", 2021 "md: updating %s RAID superblock on device (in sync %d)\n",
2022 mdname(mddev),mddev->in_sync); 2022 mdname(mddev),mddev->in_sync);
2023 2023
2024 bitmap_update_sb(mddev->bitmap); 2024 bitmap_update_sb(mddev->bitmap);
2025 list_for_each_entry(rdev, &mddev->disks, same_set) { 2025 list_for_each_entry(rdev, &mddev->disks, same_set) {
2026 char b[BDEVNAME_SIZE]; 2026 char b[BDEVNAME_SIZE];
2027 dprintk(KERN_INFO "md: "); 2027 dprintk(KERN_INFO "md: ");
2028 if (rdev->sb_loaded != 1) 2028 if (rdev->sb_loaded != 1)
2029 continue; /* no noise on spare devices */ 2029 continue; /* no noise on spare devices */
2030 if (test_bit(Faulty, &rdev->flags)) 2030 if (test_bit(Faulty, &rdev->flags))
2031 dprintk("(skipping faulty "); 2031 dprintk("(skipping faulty ");
2032 2032
2033 dprintk("%s ", bdevname(rdev->bdev,b)); 2033 dprintk("%s ", bdevname(rdev->bdev,b));
2034 if (!test_bit(Faulty, &rdev->flags)) { 2034 if (!test_bit(Faulty, &rdev->flags)) {
2035 md_super_write(mddev,rdev, 2035 md_super_write(mddev,rdev,
2036 rdev->sb_start, rdev->sb_size, 2036 rdev->sb_start, rdev->sb_size,
2037 rdev->sb_page); 2037 rdev->sb_page);
2038 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n", 2038 dprintk(KERN_INFO "(write) %s's sb offset: %llu\n",
2039 bdevname(rdev->bdev,b), 2039 bdevname(rdev->bdev,b),
2040 (unsigned long long)rdev->sb_start); 2040 (unsigned long long)rdev->sb_start);
2041 rdev->sb_events = mddev->events; 2041 rdev->sb_events = mddev->events;
2042 2042
2043 } else 2043 } else
2044 dprintk(")\n"); 2044 dprintk(")\n");
2045 if (mddev->level == LEVEL_MULTIPATH) 2045 if (mddev->level == LEVEL_MULTIPATH)
2046 /* only need to write one superblock... */ 2046 /* only need to write one superblock... */
2047 break; 2047 break;
2048 } 2048 }
2049 md_super_wait(mddev); 2049 md_super_wait(mddev);
2050 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2050 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2051 2051
2052 spin_lock_irq(&mddev->write_lock); 2052 spin_lock_irq(&mddev->write_lock);
2053 if (mddev->in_sync != sync_req || 2053 if (mddev->in_sync != sync_req ||
2054 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2054 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2055 /* have to write it out again */ 2055 /* have to write it out again */
2056 spin_unlock_irq(&mddev->write_lock); 2056 spin_unlock_irq(&mddev->write_lock);
2057 goto repeat; 2057 goto repeat;
2058 } 2058 }
2059 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2059 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2060 spin_unlock_irq(&mddev->write_lock); 2060 spin_unlock_irq(&mddev->write_lock);
2061 wake_up(&mddev->sb_wait); 2061 wake_up(&mddev->sb_wait);
2062 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2062 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2063 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2063 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2064 2064
2065 } 2065 }
2066 2066
2067 /* words written to sysfs files may, or may not, be \n terminated. 2067 /* words written to sysfs files may, or may not, be \n terminated.
2068 * We want to accept either case. For this we use cmd_match. 2068 * We want to accept either case. For this we use cmd_match.
2069 */ 2069 */
2070 static int cmd_match(const char *cmd, const char *str) 2070 static int cmd_match(const char *cmd, const char *str)
2071 { 2071 {
2072 /* See if cmd, written into a sysfs file, matches 2072 /* See if cmd, written into a sysfs file, matches
2073 * str. They must either be the same, or cmd can 2073 * str. They must either be the same, or cmd can
2074 * have a trailing newline 2074 * have a trailing newline
2075 */ 2075 */
2076 while (*cmd && *str && *cmd == *str) { 2076 while (*cmd && *str && *cmd == *str) {
2077 cmd++; 2077 cmd++;
2078 str++; 2078 str++;
2079 } 2079 }
2080 if (*cmd == '\n') 2080 if (*cmd == '\n')
2081 cmd++; 2081 cmd++;
2082 if (*str || *cmd) 2082 if (*str || *cmd)
2083 return 0; 2083 return 0;
2084 return 1; 2084 return 1;
2085 } 2085 }
2086 2086
2087 struct rdev_sysfs_entry { 2087 struct rdev_sysfs_entry {
2088 struct attribute attr; 2088 struct attribute attr;
2089 ssize_t (*show)(mdk_rdev_t *, char *); 2089 ssize_t (*show)(mdk_rdev_t *, char *);
2090 ssize_t (*store)(mdk_rdev_t *, const char *, size_t); 2090 ssize_t (*store)(mdk_rdev_t *, const char *, size_t);
2091 }; 2091 };
2092 2092
2093 static ssize_t 2093 static ssize_t
2094 state_show(mdk_rdev_t *rdev, char *page) 2094 state_show(mdk_rdev_t *rdev, char *page)
2095 { 2095 {
2096 char *sep = ""; 2096 char *sep = "";
2097 size_t len = 0; 2097 size_t len = 0;
2098 2098
2099 if (test_bit(Faulty, &rdev->flags)) { 2099 if (test_bit(Faulty, &rdev->flags)) {
2100 len+= sprintf(page+len, "%sfaulty",sep); 2100 len+= sprintf(page+len, "%sfaulty",sep);
2101 sep = ","; 2101 sep = ",";
2102 } 2102 }
2103 if (test_bit(In_sync, &rdev->flags)) { 2103 if (test_bit(In_sync, &rdev->flags)) {
2104 len += sprintf(page+len, "%sin_sync",sep); 2104 len += sprintf(page+len, "%sin_sync",sep);
2105 sep = ","; 2105 sep = ",";
2106 } 2106 }
2107 if (test_bit(WriteMostly, &rdev->flags)) { 2107 if (test_bit(WriteMostly, &rdev->flags)) {
2108 len += sprintf(page+len, "%swrite_mostly",sep); 2108 len += sprintf(page+len, "%swrite_mostly",sep);
2109 sep = ","; 2109 sep = ",";
2110 } 2110 }
2111 if (test_bit(Blocked, &rdev->flags)) { 2111 if (test_bit(Blocked, &rdev->flags)) {
2112 len += sprintf(page+len, "%sblocked", sep); 2112 len += sprintf(page+len, "%sblocked", sep);
2113 sep = ","; 2113 sep = ",";
2114 } 2114 }
2115 if (!test_bit(Faulty, &rdev->flags) && 2115 if (!test_bit(Faulty, &rdev->flags) &&
2116 !test_bit(In_sync, &rdev->flags)) { 2116 !test_bit(In_sync, &rdev->flags)) {
2117 len += sprintf(page+len, "%sspare", sep); 2117 len += sprintf(page+len, "%sspare", sep);
2118 sep = ","; 2118 sep = ",";
2119 } 2119 }
2120 return len+sprintf(page+len, "\n"); 2120 return len+sprintf(page+len, "\n");
2121 } 2121 }
2122 2122
2123 static ssize_t 2123 static ssize_t
2124 state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2124 state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2125 { 2125 {
2126 /* can write 2126 /* can write
2127 * faulty - simulates an error 2127 * faulty - simulates an error
2128 * remove - disconnects the device 2128 * remove - disconnects the device
2129 * writemostly - sets write_mostly 2129 * writemostly - sets write_mostly
2130 * -writemostly - clears write_mostly 2130 * -writemostly - clears write_mostly
2131 * blocked - sets the Blocked flag 2131 * blocked - sets the Blocked flag
2132 * -blocked - clears the Blocked flag 2132 * -blocked - clears the Blocked flag
2133 * insync - sets Insync provided the device isn't active 2133 * insync - sets Insync provided the device isn't active
2134 */ 2134 */
2135 int err = -EINVAL; 2135 int err = -EINVAL;
2136 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2136 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2137 md_error(rdev->mddev, rdev); 2137 md_error(rdev->mddev, rdev);
2138 err = 0; 2138 err = 0;
2139 } else if (cmd_match(buf, "remove")) { 2139 } else if (cmd_match(buf, "remove")) {
2140 if (rdev->raid_disk >= 0) 2140 if (rdev->raid_disk >= 0)
2141 err = -EBUSY; 2141 err = -EBUSY;
2142 else { 2142 else {
2143 mddev_t *mddev = rdev->mddev; 2143 mddev_t *mddev = rdev->mddev;
2144 kick_rdev_from_array(rdev); 2144 kick_rdev_from_array(rdev);
2145 if (mddev->pers) 2145 if (mddev->pers)
2146 md_update_sb(mddev, 1); 2146 md_update_sb(mddev, 1);
2147 md_new_event(mddev); 2147 md_new_event(mddev);
2148 err = 0; 2148 err = 0;
2149 } 2149 }
2150 } else if (cmd_match(buf, "writemostly")) { 2150 } else if (cmd_match(buf, "writemostly")) {
2151 set_bit(WriteMostly, &rdev->flags); 2151 set_bit(WriteMostly, &rdev->flags);
2152 err = 0; 2152 err = 0;
2153 } else if (cmd_match(buf, "-writemostly")) { 2153 } else if (cmd_match(buf, "-writemostly")) {
2154 clear_bit(WriteMostly, &rdev->flags); 2154 clear_bit(WriteMostly, &rdev->flags);
2155 err = 0; 2155 err = 0;
2156 } else if (cmd_match(buf, "blocked")) { 2156 } else if (cmd_match(buf, "blocked")) {
2157 set_bit(Blocked, &rdev->flags); 2157 set_bit(Blocked, &rdev->flags);
2158 err = 0; 2158 err = 0;
2159 } else if (cmd_match(buf, "-blocked")) { 2159 } else if (cmd_match(buf, "-blocked")) {
2160 clear_bit(Blocked, &rdev->flags); 2160 clear_bit(Blocked, &rdev->flags);
2161 wake_up(&rdev->blocked_wait); 2161 wake_up(&rdev->blocked_wait);
2162 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2162 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2163 md_wakeup_thread(rdev->mddev->thread); 2163 md_wakeup_thread(rdev->mddev->thread);
2164 2164
2165 err = 0; 2165 err = 0;
2166 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2166 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2167 set_bit(In_sync, &rdev->flags); 2167 set_bit(In_sync, &rdev->flags);
2168 err = 0; 2168 err = 0;
2169 } 2169 }
2170 if (!err && rdev->sysfs_state) 2170 if (!err && rdev->sysfs_state)
2171 sysfs_notify_dirent(rdev->sysfs_state); 2171 sysfs_notify_dirent(rdev->sysfs_state);
2172 return err ? err : len; 2172 return err ? err : len;
2173 } 2173 }
2174 static struct rdev_sysfs_entry rdev_state = 2174 static struct rdev_sysfs_entry rdev_state =
2175 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2175 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2176 2176
2177 static ssize_t 2177 static ssize_t
2178 errors_show(mdk_rdev_t *rdev, char *page) 2178 errors_show(mdk_rdev_t *rdev, char *page)
2179 { 2179 {
2180 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2180 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2181 } 2181 }
2182 2182
2183 static ssize_t 2183 static ssize_t
2184 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2184 errors_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2185 { 2185 {
2186 char *e; 2186 char *e;
2187 unsigned long n = simple_strtoul(buf, &e, 10); 2187 unsigned long n = simple_strtoul(buf, &e, 10);
2188 if (*buf && (*e == 0 || *e == '\n')) { 2188 if (*buf && (*e == 0 || *e == '\n')) {
2189 atomic_set(&rdev->corrected_errors, n); 2189 atomic_set(&rdev->corrected_errors, n);
2190 return len; 2190 return len;
2191 } 2191 }
2192 return -EINVAL; 2192 return -EINVAL;
2193 } 2193 }
2194 static struct rdev_sysfs_entry rdev_errors = 2194 static struct rdev_sysfs_entry rdev_errors =
2195 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2195 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2196 2196
2197 static ssize_t 2197 static ssize_t
2198 slot_show(mdk_rdev_t *rdev, char *page) 2198 slot_show(mdk_rdev_t *rdev, char *page)
2199 { 2199 {
2200 if (rdev->raid_disk < 0) 2200 if (rdev->raid_disk < 0)
2201 return sprintf(page, "none\n"); 2201 return sprintf(page, "none\n");
2202 else 2202 else
2203 return sprintf(page, "%d\n", rdev->raid_disk); 2203 return sprintf(page, "%d\n", rdev->raid_disk);
2204 } 2204 }
2205 2205
2206 static ssize_t 2206 static ssize_t
2207 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2207 slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2208 { 2208 {
2209 char *e; 2209 char *e;
2210 int err; 2210 int err;
2211 char nm[20]; 2211 char nm[20];
2212 int slot = simple_strtoul(buf, &e, 10); 2212 int slot = simple_strtoul(buf, &e, 10);
2213 if (strncmp(buf, "none", 4)==0) 2213 if (strncmp(buf, "none", 4)==0)
2214 slot = -1; 2214 slot = -1;
2215 else if (e==buf || (*e && *e!= '\n')) 2215 else if (e==buf || (*e && *e!= '\n'))
2216 return -EINVAL; 2216 return -EINVAL;
2217 if (rdev->mddev->pers && slot == -1) { 2217 if (rdev->mddev->pers && slot == -1) {
2218 /* Setting 'slot' on an active array requires also 2218 /* Setting 'slot' on an active array requires also
2219 * updating the 'rd%d' link, and communicating 2219 * updating the 'rd%d' link, and communicating
2220 * with the personality with ->hot_*_disk. 2220 * with the personality with ->hot_*_disk.
2221 * For now we only support removing 2221 * For now we only support removing
2222 * failed/spare devices. This normally happens automatically, 2222 * failed/spare devices. This normally happens automatically,
2223 * but not when the metadata is externally managed. 2223 * but not when the metadata is externally managed.
2224 */ 2224 */
2225 if (rdev->raid_disk == -1) 2225 if (rdev->raid_disk == -1)
2226 return -EEXIST; 2226 return -EEXIST;
2227 /* personality does all needed checks */ 2227 /* personality does all needed checks */
2228 if (rdev->mddev->pers->hot_add_disk == NULL) 2228 if (rdev->mddev->pers->hot_add_disk == NULL)
2229 return -EINVAL; 2229 return -EINVAL;
2230 err = rdev->mddev->pers-> 2230 err = rdev->mddev->pers->
2231 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2231 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2232 if (err) 2232 if (err)
2233 return err; 2233 return err;
2234 sprintf(nm, "rd%d", rdev->raid_disk); 2234 sprintf(nm, "rd%d", rdev->raid_disk);
2235 sysfs_remove_link(&rdev->mddev->kobj, nm); 2235 sysfs_remove_link(&rdev->mddev->kobj, nm);
2236 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2236 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2237 md_wakeup_thread(rdev->mddev->thread); 2237 md_wakeup_thread(rdev->mddev->thread);
2238 } else if (rdev->mddev->pers) { 2238 } else if (rdev->mddev->pers) {
2239 mdk_rdev_t *rdev2; 2239 mdk_rdev_t *rdev2;
2240 /* Activating a spare .. or possibly reactivating 2240 /* Activating a spare .. or possibly reactivating
2241 * if we ever get bitmaps working here. 2241 * if we ever get bitmaps working here.
2242 */ 2242 */
2243 2243
2244 if (rdev->raid_disk != -1) 2244 if (rdev->raid_disk != -1)
2245 return -EBUSY; 2245 return -EBUSY;
2246 2246
2247 if (rdev->mddev->pers->hot_add_disk == NULL) 2247 if (rdev->mddev->pers->hot_add_disk == NULL)
2248 return -EINVAL; 2248 return -EINVAL;
2249 2249
2250 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set) 2250 list_for_each_entry(rdev2, &rdev->mddev->disks, same_set)
2251 if (rdev2->raid_disk == slot) 2251 if (rdev2->raid_disk == slot)
2252 return -EEXIST; 2252 return -EEXIST;
2253 2253
2254 rdev->raid_disk = slot; 2254 rdev->raid_disk = slot;
2255 if (test_bit(In_sync, &rdev->flags)) 2255 if (test_bit(In_sync, &rdev->flags))
2256 rdev->saved_raid_disk = slot; 2256 rdev->saved_raid_disk = slot;
2257 else 2257 else
2258 rdev->saved_raid_disk = -1; 2258 rdev->saved_raid_disk = -1;
2259 err = rdev->mddev->pers-> 2259 err = rdev->mddev->pers->
2260 hot_add_disk(rdev->mddev, rdev); 2260 hot_add_disk(rdev->mddev, rdev);
2261 if (err) { 2261 if (err) {
2262 rdev->raid_disk = -1; 2262 rdev->raid_disk = -1;
2263 return err; 2263 return err;
2264 } else 2264 } else
2265 sysfs_notify_dirent(rdev->sysfs_state); 2265 sysfs_notify_dirent(rdev->sysfs_state);
2266 sprintf(nm, "rd%d", rdev->raid_disk); 2266 sprintf(nm, "rd%d", rdev->raid_disk);
2267 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) 2267 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2268 printk(KERN_WARNING 2268 printk(KERN_WARNING
2269 "md: cannot register " 2269 "md: cannot register "
2270 "%s for %s\n", 2270 "%s for %s\n",
2271 nm, mdname(rdev->mddev)); 2271 nm, mdname(rdev->mddev));
2272 2272
2273 /* don't wakeup anyone, leave that to userspace. */ 2273 /* don't wakeup anyone, leave that to userspace. */
2274 } else { 2274 } else {
2275 if (slot >= rdev->mddev->raid_disks) 2275 if (slot >= rdev->mddev->raid_disks)
2276 return -ENOSPC; 2276 return -ENOSPC;
2277 rdev->raid_disk = slot; 2277 rdev->raid_disk = slot;
2278 /* assume it is working */ 2278 /* assume it is working */
2279 clear_bit(Faulty, &rdev->flags); 2279 clear_bit(Faulty, &rdev->flags);
2280 clear_bit(WriteMostly, &rdev->flags); 2280 clear_bit(WriteMostly, &rdev->flags);
2281 set_bit(In_sync, &rdev->flags); 2281 set_bit(In_sync, &rdev->flags);
2282 sysfs_notify_dirent(rdev->sysfs_state); 2282 sysfs_notify_dirent(rdev->sysfs_state);
2283 } 2283 }
2284 return len; 2284 return len;
2285 } 2285 }
2286 2286
2287 2287
2288 static struct rdev_sysfs_entry rdev_slot = 2288 static struct rdev_sysfs_entry rdev_slot =
2289 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2289 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2290 2290
2291 static ssize_t 2291 static ssize_t
2292 offset_show(mdk_rdev_t *rdev, char *page) 2292 offset_show(mdk_rdev_t *rdev, char *page)
2293 { 2293 {
2294 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2294 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2295 } 2295 }
2296 2296
2297 static ssize_t 2297 static ssize_t
2298 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2298 offset_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2299 { 2299 {
2300 char *e; 2300 char *e;
2301 unsigned long long offset = simple_strtoull(buf, &e, 10); 2301 unsigned long long offset = simple_strtoull(buf, &e, 10);
2302 if (e==buf || (*e && *e != '\n')) 2302 if (e==buf || (*e && *e != '\n'))
2303 return -EINVAL; 2303 return -EINVAL;
2304 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2304 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2305 return -EBUSY; 2305 return -EBUSY;
2306 if (rdev->sectors && rdev->mddev->external) 2306 if (rdev->sectors && rdev->mddev->external)
2307 /* Must set offset before size, so overlap checks 2307 /* Must set offset before size, so overlap checks
2308 * can be sane */ 2308 * can be sane */
2309 return -EBUSY; 2309 return -EBUSY;
2310 rdev->data_offset = offset; 2310 rdev->data_offset = offset;
2311 return len; 2311 return len;
2312 } 2312 }
2313 2313
2314 static struct rdev_sysfs_entry rdev_offset = 2314 static struct rdev_sysfs_entry rdev_offset =
2315 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2315 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2316 2316
2317 static ssize_t 2317 static ssize_t
2318 rdev_size_show(mdk_rdev_t *rdev, char *page) 2318 rdev_size_show(mdk_rdev_t *rdev, char *page)
2319 { 2319 {
2320 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2320 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2321 } 2321 }
2322 2322
2323 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2323 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2324 { 2324 {
2325 /* check if two start/length pairs overlap */ 2325 /* check if two start/length pairs overlap */
2326 if (s1+l1 <= s2) 2326 if (s1+l1 <= s2)
2327 return 0; 2327 return 0;
2328 if (s2+l2 <= s1) 2328 if (s2+l2 <= s1)
2329 return 0; 2329 return 0;
2330 return 1; 2330 return 1;
2331 } 2331 }
2332 2332
2333 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2333 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2334 { 2334 {
2335 unsigned long long blocks; 2335 unsigned long long blocks;
2336 sector_t new; 2336 sector_t new;
2337 2337
2338 if (strict_strtoull(buf, 10, &blocks) < 0) 2338 if (strict_strtoull(buf, 10, &blocks) < 0)
2339 return -EINVAL; 2339 return -EINVAL;
2340 2340
2341 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2341 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2342 return -EINVAL; /* sector conversion overflow */ 2342 return -EINVAL; /* sector conversion overflow */
2343 2343
2344 new = blocks * 2; 2344 new = blocks * 2;
2345 if (new != blocks * 2) 2345 if (new != blocks * 2)
2346 return -EINVAL; /* unsigned long long to sector_t overflow */ 2346 return -EINVAL; /* unsigned long long to sector_t overflow */
2347 2347
2348 *sectors = new; 2348 *sectors = new;
2349 return 0; 2349 return 0;
2350 } 2350 }
2351 2351
2352 static ssize_t 2352 static ssize_t
2353 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2353 rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2354 { 2354 {
2355 mddev_t *my_mddev = rdev->mddev; 2355 mddev_t *my_mddev = rdev->mddev;
2356 sector_t oldsectors = rdev->sectors; 2356 sector_t oldsectors = rdev->sectors;
2357 sector_t sectors; 2357 sector_t sectors;
2358 2358
2359 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2359 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2360 return -EINVAL; 2360 return -EINVAL;
2361 if (my_mddev->pers && rdev->raid_disk >= 0) { 2361 if (my_mddev->pers && rdev->raid_disk >= 0) {
2362 if (my_mddev->persistent) { 2362 if (my_mddev->persistent) {
2363 sectors = super_types[my_mddev->major_version]. 2363 sectors = super_types[my_mddev->major_version].
2364 rdev_size_change(rdev, sectors); 2364 rdev_size_change(rdev, sectors);
2365 if (!sectors) 2365 if (!sectors)
2366 return -EBUSY; 2366 return -EBUSY;
2367 } else if (!sectors) 2367 } else if (!sectors)
2368 sectors = (rdev->bdev->bd_inode->i_size >> 9) - 2368 sectors = (rdev->bdev->bd_inode->i_size >> 9) -
2369 rdev->data_offset; 2369 rdev->data_offset;
2370 } 2370 }
2371 if (sectors < my_mddev->dev_sectors) 2371 if (sectors < my_mddev->dev_sectors)
2372 return -EINVAL; /* component must fit device */ 2372 return -EINVAL; /* component must fit device */
2373 2373
2374 rdev->sectors = sectors; 2374 rdev->sectors = sectors;
2375 if (sectors > oldsectors && my_mddev->external) { 2375 if (sectors > oldsectors && my_mddev->external) {
2376 /* need to check that all other rdevs with the same ->bdev 2376 /* need to check that all other rdevs with the same ->bdev
2377 * do not overlap. We need to unlock the mddev to avoid 2377 * do not overlap. We need to unlock the mddev to avoid
2378 * a deadlock. We have already changed rdev->sectors, and if 2378 * a deadlock. We have already changed rdev->sectors, and if
2379 * we have to change it back, we will have the lock again. 2379 * we have to change it back, we will have the lock again.
2380 */ 2380 */
2381 mddev_t *mddev; 2381 mddev_t *mddev;
2382 int overlap = 0; 2382 int overlap = 0;
2383 struct list_head *tmp; 2383 struct list_head *tmp;
2384 2384
2385 mddev_unlock(my_mddev); 2385 mddev_unlock(my_mddev);
2386 for_each_mddev(mddev, tmp) { 2386 for_each_mddev(mddev, tmp) {
2387 mdk_rdev_t *rdev2; 2387 mdk_rdev_t *rdev2;
2388 2388
2389 mddev_lock(mddev); 2389 mddev_lock(mddev);
2390 list_for_each_entry(rdev2, &mddev->disks, same_set) 2390 list_for_each_entry(rdev2, &mddev->disks, same_set)
2391 if (test_bit(AllReserved, &rdev2->flags) || 2391 if (test_bit(AllReserved, &rdev2->flags) ||
2392 (rdev->bdev == rdev2->bdev && 2392 (rdev->bdev == rdev2->bdev &&
2393 rdev != rdev2 && 2393 rdev != rdev2 &&
2394 overlaps(rdev->data_offset, rdev->sectors, 2394 overlaps(rdev->data_offset, rdev->sectors,
2395 rdev2->data_offset, 2395 rdev2->data_offset,
2396 rdev2->sectors))) { 2396 rdev2->sectors))) {
2397 overlap = 1; 2397 overlap = 1;
2398 break; 2398 break;
2399 } 2399 }
2400 mddev_unlock(mddev); 2400 mddev_unlock(mddev);
2401 if (overlap) { 2401 if (overlap) {
2402 mddev_put(mddev); 2402 mddev_put(mddev);
2403 break; 2403 break;
2404 } 2404 }
2405 } 2405 }
2406 mddev_lock(my_mddev); 2406 mddev_lock(my_mddev);
2407 if (overlap) { 2407 if (overlap) {
2408 /* Someone else could have slipped in a size 2408 /* Someone else could have slipped in a size
2409 * change here, but doing so is just silly. 2409 * change here, but doing so is just silly.
2410 * We put oldsectors back because we *know* it is 2410 * We put oldsectors back because we *know* it is
2411 * safe, and trust userspace not to race with 2411 * safe, and trust userspace not to race with
2412 * itself 2412 * itself
2413 */ 2413 */
2414 rdev->sectors = oldsectors; 2414 rdev->sectors = oldsectors;
2415 return -EBUSY; 2415 return -EBUSY;
2416 } 2416 }
2417 } 2417 }
2418 return len; 2418 return len;
2419 } 2419 }
2420 2420
2421 static struct rdev_sysfs_entry rdev_size = 2421 static struct rdev_sysfs_entry rdev_size =
2422 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 2422 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
2423 2423
2424 static struct attribute *rdev_default_attrs[] = { 2424 static struct attribute *rdev_default_attrs[] = {
2425 &rdev_state.attr, 2425 &rdev_state.attr,
2426 &rdev_errors.attr, 2426 &rdev_errors.attr,
2427 &rdev_slot.attr, 2427 &rdev_slot.attr,
2428 &rdev_offset.attr, 2428 &rdev_offset.attr,
2429 &rdev_size.attr, 2429 &rdev_size.attr,
2430 NULL, 2430 NULL,
2431 }; 2431 };
2432 static ssize_t 2432 static ssize_t
2433 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 2433 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
2434 { 2434 {
2435 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2435 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2436 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2436 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2437 mddev_t *mddev = rdev->mddev; 2437 mddev_t *mddev = rdev->mddev;
2438 ssize_t rv; 2438 ssize_t rv;
2439 2439
2440 if (!entry->show) 2440 if (!entry->show)
2441 return -EIO; 2441 return -EIO;
2442 2442
2443 rv = mddev ? mddev_lock(mddev) : -EBUSY; 2443 rv = mddev ? mddev_lock(mddev) : -EBUSY;
2444 if (!rv) { 2444 if (!rv) {
2445 if (rdev->mddev == NULL) 2445 if (rdev->mddev == NULL)
2446 rv = -EBUSY; 2446 rv = -EBUSY;
2447 else 2447 else
2448 rv = entry->show(rdev, page); 2448 rv = entry->show(rdev, page);
2449 mddev_unlock(mddev); 2449 mddev_unlock(mddev);
2450 } 2450 }
2451 return rv; 2451 return rv;
2452 } 2452 }
2453 2453
2454 static ssize_t 2454 static ssize_t
2455 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 2455 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
2456 const char *page, size_t length) 2456 const char *page, size_t length)
2457 { 2457 {
2458 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 2458 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
2459 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj); 2459 mdk_rdev_t *rdev = container_of(kobj, mdk_rdev_t, kobj);
2460 ssize_t rv; 2460 ssize_t rv;
2461 mddev_t *mddev = rdev->mddev; 2461 mddev_t *mddev = rdev->mddev;
2462 2462
2463 if (!entry->store) 2463 if (!entry->store)
2464 return -EIO; 2464 return -EIO;
2465 if (!capable(CAP_SYS_ADMIN)) 2465 if (!capable(CAP_SYS_ADMIN))
2466 return -EACCES; 2466 return -EACCES;
2467 rv = mddev ? mddev_lock(mddev): -EBUSY; 2467 rv = mddev ? mddev_lock(mddev): -EBUSY;
2468 if (!rv) { 2468 if (!rv) {
2469 if (rdev->mddev == NULL) 2469 if (rdev->mddev == NULL)
2470 rv = -EBUSY; 2470 rv = -EBUSY;
2471 else 2471 else
2472 rv = entry->store(rdev, page, length); 2472 rv = entry->store(rdev, page, length);
2473 mddev_unlock(mddev); 2473 mddev_unlock(mddev);
2474 } 2474 }
2475 return rv; 2475 return rv;
2476 } 2476 }
2477 2477
2478 static void rdev_free(struct kobject *ko) 2478 static void rdev_free(struct kobject *ko)
2479 { 2479 {
2480 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj); 2480 mdk_rdev_t *rdev = container_of(ko, mdk_rdev_t, kobj);
2481 kfree(rdev); 2481 kfree(rdev);
2482 } 2482 }
2483 static struct sysfs_ops rdev_sysfs_ops = { 2483 static struct sysfs_ops rdev_sysfs_ops = {
2484 .show = rdev_attr_show, 2484 .show = rdev_attr_show,
2485 .store = rdev_attr_store, 2485 .store = rdev_attr_store,
2486 }; 2486 };
2487 static struct kobj_type rdev_ktype = { 2487 static struct kobj_type rdev_ktype = {
2488 .release = rdev_free, 2488 .release = rdev_free,
2489 .sysfs_ops = &rdev_sysfs_ops, 2489 .sysfs_ops = &rdev_sysfs_ops,
2490 .default_attrs = rdev_default_attrs, 2490 .default_attrs = rdev_default_attrs,
2491 }; 2491 };
2492 2492
2493 /* 2493 /*
2494 * Import a device. If 'super_format' >= 0, then sanity check the superblock 2494 * Import a device. If 'super_format' >= 0, then sanity check the superblock
2495 * 2495 *
2496 * mark the device faulty if: 2496 * mark the device faulty if:
2497 * 2497 *
2498 * - the device is nonexistent (zero size) 2498 * - the device is nonexistent (zero size)
2499 * - the device has no valid superblock 2499 * - the device has no valid superblock
2500 * 2500 *
2501 * a faulty rdev _never_ has rdev->sb set. 2501 * a faulty rdev _never_ has rdev->sb set.
2502 */ 2502 */
2503 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor) 2503 static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_minor)
2504 { 2504 {
2505 char b[BDEVNAME_SIZE]; 2505 char b[BDEVNAME_SIZE];
2506 int err; 2506 int err;
2507 mdk_rdev_t *rdev; 2507 mdk_rdev_t *rdev;
2508 sector_t size; 2508 sector_t size;
2509 2509
2510 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 2510 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
2511 if (!rdev) { 2511 if (!rdev) {
2512 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 2512 printk(KERN_ERR "md: could not alloc mem for new device!\n");
2513 return ERR_PTR(-ENOMEM); 2513 return ERR_PTR(-ENOMEM);
2514 } 2514 }
2515 2515
2516 if ((err = alloc_disk_sb(rdev))) 2516 if ((err = alloc_disk_sb(rdev)))
2517 goto abort_free; 2517 goto abort_free;
2518 2518
2519 err = lock_rdev(rdev, newdev, super_format == -2); 2519 err = lock_rdev(rdev, newdev, super_format == -2);
2520 if (err) 2520 if (err)
2521 goto abort_free; 2521 goto abort_free;
2522 2522
2523 kobject_init(&rdev->kobj, &rdev_ktype); 2523 kobject_init(&rdev->kobj, &rdev_ktype);
2524 2524
2525 rdev->desc_nr = -1; 2525 rdev->desc_nr = -1;
2526 rdev->saved_raid_disk = -1; 2526 rdev->saved_raid_disk = -1;
2527 rdev->raid_disk = -1; 2527 rdev->raid_disk = -1;
2528 rdev->flags = 0; 2528 rdev->flags = 0;
2529 rdev->data_offset = 0; 2529 rdev->data_offset = 0;
2530 rdev->sb_events = 0; 2530 rdev->sb_events = 0;
2531 atomic_set(&rdev->nr_pending, 0); 2531 atomic_set(&rdev->nr_pending, 0);
2532 atomic_set(&rdev->read_errors, 0); 2532 atomic_set(&rdev->read_errors, 0);
2533 atomic_set(&rdev->corrected_errors, 0); 2533 atomic_set(&rdev->corrected_errors, 0);
2534 2534
2535 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS; 2535 size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
2536 if (!size) { 2536 if (!size) {
2537 printk(KERN_WARNING 2537 printk(KERN_WARNING
2538 "md: %s has zero or unknown size, marking faulty!\n", 2538 "md: %s has zero or unknown size, marking faulty!\n",
2539 bdevname(rdev->bdev,b)); 2539 bdevname(rdev->bdev,b));
2540 err = -EINVAL; 2540 err = -EINVAL;
2541 goto abort_free; 2541 goto abort_free;
2542 } 2542 }
2543 2543
2544 if (super_format >= 0) { 2544 if (super_format >= 0) {
2545 err = super_types[super_format]. 2545 err = super_types[super_format].
2546 load_super(rdev, NULL, super_minor); 2546 load_super(rdev, NULL, super_minor);
2547 if (err == -EINVAL) { 2547 if (err == -EINVAL) {
2548 printk(KERN_WARNING 2548 printk(KERN_WARNING
2549 "md: %s does not have a valid v%d.%d " 2549 "md: %s does not have a valid v%d.%d "
2550 "superblock, not importing!\n", 2550 "superblock, not importing!\n",
2551 bdevname(rdev->bdev,b), 2551 bdevname(rdev->bdev,b),
2552 super_format, super_minor); 2552 super_format, super_minor);
2553 goto abort_free; 2553 goto abort_free;
2554 } 2554 }
2555 if (err < 0) { 2555 if (err < 0) {
2556 printk(KERN_WARNING 2556 printk(KERN_WARNING
2557 "md: could not read %s's sb, not importing!\n", 2557 "md: could not read %s's sb, not importing!\n",
2558 bdevname(rdev->bdev,b)); 2558 bdevname(rdev->bdev,b));
2559 goto abort_free; 2559 goto abort_free;
2560 } 2560 }
2561 } 2561 }
2562 2562
2563 INIT_LIST_HEAD(&rdev->same_set); 2563 INIT_LIST_HEAD(&rdev->same_set);
2564 init_waitqueue_head(&rdev->blocked_wait); 2564 init_waitqueue_head(&rdev->blocked_wait);
2565 2565
2566 return rdev; 2566 return rdev;
2567 2567
2568 abort_free: 2568 abort_free:
2569 if (rdev->sb_page) { 2569 if (rdev->sb_page) {
2570 if (rdev->bdev) 2570 if (rdev->bdev)
2571 unlock_rdev(rdev); 2571 unlock_rdev(rdev);
2572 free_disk_sb(rdev); 2572 free_disk_sb(rdev);
2573 } 2573 }
2574 kfree(rdev); 2574 kfree(rdev);
2575 return ERR_PTR(err); 2575 return ERR_PTR(err);
2576 } 2576 }
2577 2577
2578 /* 2578 /*
2579 * Check a full RAID array for plausibility 2579 * Check a full RAID array for plausibility
2580 */ 2580 */
2581 2581
2582 2582
2583 static void analyze_sbs(mddev_t * mddev) 2583 static void analyze_sbs(mddev_t * mddev)
2584 { 2584 {
2585 int i; 2585 int i;
2586 mdk_rdev_t *rdev, *freshest, *tmp; 2586 mdk_rdev_t *rdev, *freshest, *tmp;
2587 char b[BDEVNAME_SIZE]; 2587 char b[BDEVNAME_SIZE];
2588 2588
2589 freshest = NULL; 2589 freshest = NULL;
2590 rdev_for_each(rdev, tmp, mddev) 2590 rdev_for_each(rdev, tmp, mddev)
2591 switch (super_types[mddev->major_version]. 2591 switch (super_types[mddev->major_version].
2592 load_super(rdev, freshest, mddev->minor_version)) { 2592 load_super(rdev, freshest, mddev->minor_version)) {
2593 case 1: 2593 case 1:
2594 freshest = rdev; 2594 freshest = rdev;
2595 break; 2595 break;
2596 case 0: 2596 case 0:
2597 break; 2597 break;
2598 default: 2598 default:
2599 printk( KERN_ERR \ 2599 printk( KERN_ERR \
2600 "md: fatal superblock inconsistency in %s" 2600 "md: fatal superblock inconsistency in %s"
2601 " -- removing from array\n", 2601 " -- removing from array\n",
2602 bdevname(rdev->bdev,b)); 2602 bdevname(rdev->bdev,b));
2603 kick_rdev_from_array(rdev); 2603 kick_rdev_from_array(rdev);
2604 } 2604 }
2605 2605
2606 2606
2607 super_types[mddev->major_version]. 2607 super_types[mddev->major_version].
2608 validate_super(mddev, freshest); 2608 validate_super(mddev, freshest);
2609 2609
2610 i = 0; 2610 i = 0;
2611 rdev_for_each(rdev, tmp, mddev) { 2611 rdev_for_each(rdev, tmp, mddev) {
2612 if (rdev->desc_nr >= mddev->max_disks || 2612 if (rdev->desc_nr >= mddev->max_disks ||
2613 i > mddev->max_disks) { 2613 i > mddev->max_disks) {
2614 printk(KERN_WARNING 2614 printk(KERN_WARNING
2615 "md: %s: %s: only %d devices permitted\n", 2615 "md: %s: %s: only %d devices permitted\n",
2616 mdname(mddev), bdevname(rdev->bdev, b), 2616 mdname(mddev), bdevname(rdev->bdev, b),
2617 mddev->max_disks); 2617 mddev->max_disks);
2618 kick_rdev_from_array(rdev); 2618 kick_rdev_from_array(rdev);
2619 continue; 2619 continue;
2620 } 2620 }
2621 if (rdev != freshest) 2621 if (rdev != freshest)
2622 if (super_types[mddev->major_version]. 2622 if (super_types[mddev->major_version].
2623 validate_super(mddev, rdev)) { 2623 validate_super(mddev, rdev)) {
2624 printk(KERN_WARNING "md: kicking non-fresh %s" 2624 printk(KERN_WARNING "md: kicking non-fresh %s"
2625 " from array!\n", 2625 " from array!\n",
2626 bdevname(rdev->bdev,b)); 2626 bdevname(rdev->bdev,b));
2627 kick_rdev_from_array(rdev); 2627 kick_rdev_from_array(rdev);
2628 continue; 2628 continue;
2629 } 2629 }
2630 if (mddev->level == LEVEL_MULTIPATH) { 2630 if (mddev->level == LEVEL_MULTIPATH) {
2631 rdev->desc_nr = i++; 2631 rdev->desc_nr = i++;
2632 rdev->raid_disk = rdev->desc_nr; 2632 rdev->raid_disk = rdev->desc_nr;
2633 set_bit(In_sync, &rdev->flags); 2633 set_bit(In_sync, &rdev->flags);
2634 } else if (rdev->raid_disk >= mddev->raid_disks) { 2634 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
2635 rdev->raid_disk = -1; 2635 rdev->raid_disk = -1;
2636 clear_bit(In_sync, &rdev->flags); 2636 clear_bit(In_sync, &rdev->flags);
2637 } 2637 }
2638 } 2638 }
2639 } 2639 }
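A note on the bound used in analyze_sbs() above: the updated test against mddev->raid_disks - min(0, mddev->delta_disks) evaluates to the larger of the current and previous device counts, because delta_disks is negative while an array is being reshaped to fewer devices. The standalone sketch below walks through that arithmetic with hypothetical values (a 6-device array shrinking to 4); the min_int() helper and the numbers are illustrative stand-ins, not kernel code.

#include <stdio.h>

/* Illustrative stand-in for the kernel's min() macro. */
static int min_int(int a, int b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical reshape of a raid5 from 6 devices down to 4:
	 * raid_disks already holds the new (smaller) count and
	 * delta_disks = new - old = 4 - 6 = -2.
	 */
	int raid_disks = 4;
	int delta_disks = -2;

	/* 4 - min(0, -2) = 4 + 2 = 6: the previous, larger device count,
	 * so slots 4 and 5 are still accepted until the reshape finishes.
	 */
	printf("limit while shrinking: %d\n",
	       raid_disks - min_int(0, delta_disks));

	/* With no reshape (delta_disks == 0) or while growing (delta_disks > 0),
	 * min(0, delta_disks) is 0 and the limit is simply raid_disks.
	 */
	delta_disks = 0;
	printf("limit otherwise:       %d\n",
	       raid_disks - min_int(0, delta_disks));
	return 0;
}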
2640 2640
2641 static void md_safemode_timeout(unsigned long data); 2641 static void md_safemode_timeout(unsigned long data);
2642 2642
2643 static ssize_t 2643 static ssize_t
2644 safe_delay_show(mddev_t *mddev, char *page) 2644 safe_delay_show(mddev_t *mddev, char *page)
2645 { 2645 {
2646 int msec = (mddev->safemode_delay*1000)/HZ; 2646 int msec = (mddev->safemode_delay*1000)/HZ;
2647 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 2647 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
2648 } 2648 }
2649 static ssize_t 2649 static ssize_t
2650 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len) 2650 safe_delay_store(mddev_t *mddev, const char *cbuf, size_t len)
2651 { 2651 {
2652 int scale=1; 2652 int scale=1;
2653 int dot=0; 2653 int dot=0;
2654 int i; 2654 int i;
2655 unsigned long msec; 2655 unsigned long msec;
2656 char buf[30]; 2656 char buf[30];
2657 2657
2658 /* remove a period, and count digits after it */ 2658 /* remove a period, and count digits after it */
2659 if (len >= sizeof(buf)) 2659 if (len >= sizeof(buf))
2660 return -EINVAL; 2660 return -EINVAL;
2661 strlcpy(buf, cbuf, sizeof(buf)); 2661 strlcpy(buf, cbuf, sizeof(buf));
2662 for (i=0; i<len; i++) { 2662 for (i=0; i<len; i++) {
2663 if (dot) { 2663 if (dot) {
2664 if (isdigit(buf[i])) { 2664 if (isdigit(buf[i])) {
2665 buf[i-1] = buf[i]; 2665 buf[i-1] = buf[i];
2666 scale *= 10; 2666 scale *= 10;
2667 } 2667 }
2668 buf[i] = 0; 2668 buf[i] = 0;
2669 } else if (buf[i] == '.') { 2669 } else if (buf[i] == '.') {
2670 dot=1; 2670 dot=1;
2671 buf[i] = 0; 2671 buf[i] = 0;
2672 } 2672 }
2673 } 2673 }
2674 if (strict_strtoul(buf, 10, &msec) < 0) 2674 if (strict_strtoul(buf, 10, &msec) < 0)
2675 return -EINVAL; 2675 return -EINVAL;
2676 msec = (msec * 1000) / scale; 2676 msec = (msec * 1000) / scale;
2677 if (msec == 0) 2677 if (msec == 0)
2678 mddev->safemode_delay = 0; 2678 mddev->safemode_delay = 0;
2679 else { 2679 else {
2680 unsigned long old_delay = mddev->safemode_delay; 2680 unsigned long old_delay = mddev->safemode_delay;
2681 mddev->safemode_delay = (msec*HZ)/1000; 2681 mddev->safemode_delay = (msec*HZ)/1000;
2682 if (mddev->safemode_delay == 0) 2682 if (mddev->safemode_delay == 0)
2683 mddev->safemode_delay = 1; 2683 mddev->safemode_delay = 1;
2684 if (mddev->safemode_delay < old_delay) 2684 if (mddev->safemode_delay < old_delay)
2685 md_safemode_timeout((unsigned long)mddev); 2685 md_safemode_timeout((unsigned long)mddev);
2686 } 2686 }
2687 return len; 2687 return len;
2688 } 2688 }
2689 static struct md_sysfs_entry md_safe_delay = 2689 static struct md_sysfs_entry md_safe_delay =
2690 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 2690 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
2691 2691
2692 static ssize_t 2692 static ssize_t
2693 level_show(mddev_t *mddev, char *page) 2693 level_show(mddev_t *mddev, char *page)
2694 { 2694 {
2695 struct mdk_personality *p = mddev->pers; 2695 struct mdk_personality *p = mddev->pers;
2696 if (p) 2696 if (p)
2697 return sprintf(page, "%s\n", p->name); 2697 return sprintf(page, "%s\n", p->name);
2698 else if (mddev->clevel[0]) 2698 else if (mddev->clevel[0])
2699 return sprintf(page, "%s\n", mddev->clevel); 2699 return sprintf(page, "%s\n", mddev->clevel);
2700 else if (mddev->level != LEVEL_NONE) 2700 else if (mddev->level != LEVEL_NONE)
2701 return sprintf(page, "%d\n", mddev->level); 2701 return sprintf(page, "%d\n", mddev->level);
2702 else 2702 else
2703 return 0; 2703 return 0;
2704 } 2704 }
2705 2705
2706 static ssize_t 2706 static ssize_t
2707 level_store(mddev_t *mddev, const char *buf, size_t len) 2707 level_store(mddev_t *mddev, const char *buf, size_t len)
2708 { 2708 {
2709 char level[16]; 2709 char level[16];
2710 ssize_t rv = len; 2710 ssize_t rv = len;
2711 struct mdk_personality *pers; 2711 struct mdk_personality *pers;
2712 void *priv; 2712 void *priv;
2713 mdk_rdev_t *rdev; 2713 mdk_rdev_t *rdev;
2714 2714
2715 if (mddev->pers == NULL) { 2715 if (mddev->pers == NULL) {
2716 if (len == 0) 2716 if (len == 0)
2717 return 0; 2717 return 0;
2718 if (len >= sizeof(mddev->clevel)) 2718 if (len >= sizeof(mddev->clevel))
2719 return -ENOSPC; 2719 return -ENOSPC;
2720 strncpy(mddev->clevel, buf, len); 2720 strncpy(mddev->clevel, buf, len);
2721 if (mddev->clevel[len-1] == '\n') 2721 if (mddev->clevel[len-1] == '\n')
2722 len--; 2722 len--;
2723 mddev->clevel[len] = 0; 2723 mddev->clevel[len] = 0;
2724 mddev->level = LEVEL_NONE; 2724 mddev->level = LEVEL_NONE;
2725 return rv; 2725 return rv;
2726 } 2726 }
2727 2727
2728 /* request to change the personality. Need to ensure: 2728 /* request to change the personality. Need to ensure:
2729 * - array is not engaged in resync/recovery/reshape 2729 * - array is not engaged in resync/recovery/reshape
2730 * - old personality can be suspended 2730 * - old personality can be suspended
2731 * - new personality can take over the array. 2731 * - new personality can take over the array.
2732 */ 2732 */
2733 2733
2734 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 2734 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
2735 return -EBUSY; 2735 return -EBUSY;
2736 2736
2737 if (!mddev->pers->quiesce) { 2737 if (!mddev->pers->quiesce) {
2738 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 2738 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
2739 mdname(mddev), mddev->pers->name); 2739 mdname(mddev), mddev->pers->name);
2740 return -EINVAL; 2740 return -EINVAL;
2741 } 2741 }
2742 2742
2743 /* Now find the new personality */ 2743 /* Now find the new personality */
2744 if (len == 0 || len >= sizeof(level)) 2744 if (len == 0 || len >= sizeof(level))
2745 return -EINVAL; 2745 return -EINVAL;
2746 strncpy(level, buf, len); 2746 strncpy(level, buf, len);
2747 if (level[len-1] == '\n') 2747 if (level[len-1] == '\n')
2748 len--; 2748 len--;
2749 level[len] = 0; 2749 level[len] = 0;
2750 2750
2751 request_module("md-%s", level); 2751 request_module("md-%s", level);
2752 spin_lock(&pers_lock); 2752 spin_lock(&pers_lock);
2753 pers = find_pers(LEVEL_NONE, level); 2753 pers = find_pers(LEVEL_NONE, level);
2754 if (!pers || !try_module_get(pers->owner)) { 2754 if (!pers || !try_module_get(pers->owner)) {
2755 spin_unlock(&pers_lock); 2755 spin_unlock(&pers_lock);
2756 printk(KERN_WARNING "md: personality %s not loaded\n", level); 2756 printk(KERN_WARNING "md: personality %s not loaded\n", level);
2757 return -EINVAL; 2757 return -EINVAL;
2758 } 2758 }
2759 spin_unlock(&pers_lock); 2759 spin_unlock(&pers_lock);
2760 2760
2761 if (pers == mddev->pers) { 2761 if (pers == mddev->pers) {
2762 /* Nothing to do! */ 2762 /* Nothing to do! */
2763 module_put(pers->owner); 2763 module_put(pers->owner);
2764 return rv; 2764 return rv;
2765 } 2765 }
2766 if (!pers->takeover) { 2766 if (!pers->takeover) {
2767 module_put(pers->owner); 2767 module_put(pers->owner);
2768 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 2768 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
2769 mdname(mddev), level); 2769 mdname(mddev), level);
2770 return -EINVAL; 2770 return -EINVAL;
2771 } 2771 }
2772 2772
2773 /* ->takeover must set new_* and/or delta_disks 2773 /* ->takeover must set new_* and/or delta_disks
2774 * if it succeeds, and may set them when it fails. 2774 * if it succeeds, and may set them when it fails.
2775 */ 2775 */
2776 priv = pers->takeover(mddev); 2776 priv = pers->takeover(mddev);
2777 if (IS_ERR(priv)) { 2777 if (IS_ERR(priv)) {
2778 mddev->new_level = mddev->level; 2778 mddev->new_level = mddev->level;
2779 mddev->new_layout = mddev->layout; 2779 mddev->new_layout = mddev->layout;
2780 mddev->new_chunk_sectors = mddev->chunk_sectors; 2780 mddev->new_chunk_sectors = mddev->chunk_sectors;
2781 mddev->raid_disks -= mddev->delta_disks; 2781 mddev->raid_disks -= mddev->delta_disks;
2782 mddev->delta_disks = 0; 2782 mddev->delta_disks = 0;
2783 module_put(pers->owner); 2783 module_put(pers->owner);
2784 printk(KERN_WARNING "md: %s: %s would not accept array\n", 2784 printk(KERN_WARNING "md: %s: %s would not accept array\n",
2785 mdname(mddev), level); 2785 mdname(mddev), level);
2786 return PTR_ERR(priv); 2786 return PTR_ERR(priv);
2787 } 2787 }
2788 2788
2789 /* Looks like we have a winner */ 2789 /* Looks like we have a winner */
2790 mddev_suspend(mddev); 2790 mddev_suspend(mddev);
2791 mddev->pers->stop(mddev); 2791 mddev->pers->stop(mddev);
2792 module_put(mddev->pers->owner); 2792 module_put(mddev->pers->owner);
2793 /* Invalidate devices that are now superfluous */ 2793 /* Invalidate devices that are now superfluous */
2794 list_for_each_entry(rdev, &mddev->disks, same_set) 2794 list_for_each_entry(rdev, &mddev->disks, same_set)
2795 if (rdev->raid_disk >= mddev->raid_disks) { 2795 if (rdev->raid_disk >= mddev->raid_disks) {
2796 rdev->raid_disk = -1; 2796 rdev->raid_disk = -1;
2797 clear_bit(In_sync, &rdev->flags); 2797 clear_bit(In_sync, &rdev->flags);
2798 } 2798 }
2799 mddev->pers = pers; 2799 mddev->pers = pers;
2800 mddev->private = priv; 2800 mddev->private = priv;
2801 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 2801 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
2802 mddev->level = mddev->new_level; 2802 mddev->level = mddev->new_level;
2803 mddev->layout = mddev->new_layout; 2803 mddev->layout = mddev->new_layout;
2804 mddev->chunk_sectors = mddev->new_chunk_sectors; 2804 mddev->chunk_sectors = mddev->new_chunk_sectors;
2805 mddev->delta_disks = 0; 2805 mddev->delta_disks = 0;
2806 pers->run(mddev); 2806 pers->run(mddev);
2807 mddev_resume(mddev); 2807 mddev_resume(mddev);
2808 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2808 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2809 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 2809 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
2810 md_wakeup_thread(mddev->thread); 2810 md_wakeup_thread(mddev->thread);
2811 return rv; 2811 return rv;
2812 } 2812 }
2813 2813
2814 static struct md_sysfs_entry md_level = 2814 static struct md_sysfs_entry md_level =
2815 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 2815 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
2816 2816
2817 2817
2818 static ssize_t 2818 static ssize_t
2819 layout_show(mddev_t *mddev, char *page) 2819 layout_show(mddev_t *mddev, char *page)
2820 { 2820 {
2821 /* just a number, not meaningful for all levels */ 2821 /* just a number, not meaningful for all levels */
2822 if (mddev->reshape_position != MaxSector && 2822 if (mddev->reshape_position != MaxSector &&
2823 mddev->layout != mddev->new_layout) 2823 mddev->layout != mddev->new_layout)
2824 return sprintf(page, "%d (%d)\n", 2824 return sprintf(page, "%d (%d)\n",
2825 mddev->new_layout, mddev->layout); 2825 mddev->new_layout, mddev->layout);
2826 return sprintf(page, "%d\n", mddev->layout); 2826 return sprintf(page, "%d\n", mddev->layout);
2827 } 2827 }
2828 2828
2829 static ssize_t 2829 static ssize_t
2830 layout_store(mddev_t *mddev, const char *buf, size_t len) 2830 layout_store(mddev_t *mddev, const char *buf, size_t len)
2831 { 2831 {
2832 char *e; 2832 char *e;
2833 unsigned long n = simple_strtoul(buf, &e, 10); 2833 unsigned long n = simple_strtoul(buf, &e, 10);
2834 2834
2835 if (!*buf || (*e && *e != '\n')) 2835 if (!*buf || (*e && *e != '\n'))
2836 return -EINVAL; 2836 return -EINVAL;
2837 2837
2838 if (mddev->pers) { 2838 if (mddev->pers) {
2839 int err; 2839 int err;
2840 if (mddev->pers->check_reshape == NULL) 2840 if (mddev->pers->check_reshape == NULL)
2841 return -EBUSY; 2841 return -EBUSY;
2842 mddev->new_layout = n; 2842 mddev->new_layout = n;
2843 err = mddev->pers->check_reshape(mddev); 2843 err = mddev->pers->check_reshape(mddev);
2844 if (err) { 2844 if (err) {
2845 mddev->new_layout = mddev->layout; 2845 mddev->new_layout = mddev->layout;
2846 return err; 2846 return err;
2847 } 2847 }
2848 } else { 2848 } else {
2849 mddev->new_layout = n; 2849 mddev->new_layout = n;
2850 if (mddev->reshape_position == MaxSector) 2850 if (mddev->reshape_position == MaxSector)
2851 mddev->layout = n; 2851 mddev->layout = n;
2852 } 2852 }
2853 return len; 2853 return len;
2854 } 2854 }
2855 static struct md_sysfs_entry md_layout = 2855 static struct md_sysfs_entry md_layout =
2856 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 2856 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
2857 2857
2858 2858
2859 static ssize_t 2859 static ssize_t
2860 raid_disks_show(mddev_t *mddev, char *page) 2860 raid_disks_show(mddev_t *mddev, char *page)
2861 { 2861 {
2862 if (mddev->raid_disks == 0) 2862 if (mddev->raid_disks == 0)
2863 return 0; 2863 return 0;
2864 if (mddev->reshape_position != MaxSector && 2864 if (mddev->reshape_position != MaxSector &&
2865 mddev->delta_disks != 0) 2865 mddev->delta_disks != 0)
2866 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 2866 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
2867 mddev->raid_disks - mddev->delta_disks); 2867 mddev->raid_disks - mddev->delta_disks);
2868 return sprintf(page, "%d\n", mddev->raid_disks); 2868 return sprintf(page, "%d\n", mddev->raid_disks);
2869 } 2869 }
2870 2870
2871 static int update_raid_disks(mddev_t *mddev, int raid_disks); 2871 static int update_raid_disks(mddev_t *mddev, int raid_disks);
2872 2872
2873 static ssize_t 2873 static ssize_t
2874 raid_disks_store(mddev_t *mddev, const char *buf, size_t len) 2874 raid_disks_store(mddev_t *mddev, const char *buf, size_t len)
2875 { 2875 {
2876 char *e; 2876 char *e;
2877 int rv = 0; 2877 int rv = 0;
2878 unsigned long n = simple_strtoul(buf, &e, 10); 2878 unsigned long n = simple_strtoul(buf, &e, 10);
2879 2879
2880 if (!*buf || (*e && *e != '\n')) 2880 if (!*buf || (*e && *e != '\n'))
2881 return -EINVAL; 2881 return -EINVAL;
2882 2882
2883 if (mddev->pers) 2883 if (mddev->pers)
2884 rv = update_raid_disks(mddev, n); 2884 rv = update_raid_disks(mddev, n);
2885 else if (mddev->reshape_position != MaxSector) { 2885 else if (mddev->reshape_position != MaxSector) {
2886 int olddisks = mddev->raid_disks - mddev->delta_disks; 2886 int olddisks = mddev->raid_disks - mddev->delta_disks;
2887 mddev->delta_disks = n - olddisks; 2887 mddev->delta_disks = n - olddisks;
2888 mddev->raid_disks = n; 2888 mddev->raid_disks = n;
2889 } else 2889 } else
2890 mddev->raid_disks = n; 2890 mddev->raid_disks = n;
2891 return rv ? rv : len; 2891 return rv ? rv : len;
2892 } 2892 }
2893 static struct md_sysfs_entry md_raid_disks = 2893 static struct md_sysfs_entry md_raid_disks =
2894 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 2894 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
2895 2895
2896 static ssize_t 2896 static ssize_t
2897 chunk_size_show(mddev_t *mddev, char *page) 2897 chunk_size_show(mddev_t *mddev, char *page)
2898 { 2898 {
2899 if (mddev->reshape_position != MaxSector && 2899 if (mddev->reshape_position != MaxSector &&
2900 mddev->chunk_sectors != mddev->new_chunk_sectors) 2900 mddev->chunk_sectors != mddev->new_chunk_sectors)
2901 return sprintf(page, "%d (%d)\n", 2901 return sprintf(page, "%d (%d)\n",
2902 mddev->new_chunk_sectors << 9, 2902 mddev->new_chunk_sectors << 9,
2903 mddev->chunk_sectors << 9); 2903 mddev->chunk_sectors << 9);
2904 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 2904 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
2905 } 2905 }
2906 2906
2907 static ssize_t 2907 static ssize_t
2908 chunk_size_store(mddev_t *mddev, const char *buf, size_t len) 2908 chunk_size_store(mddev_t *mddev, const char *buf, size_t len)
2909 { 2909 {
2910 char *e; 2910 char *e;
2911 unsigned long n = simple_strtoul(buf, &e, 10); 2911 unsigned long n = simple_strtoul(buf, &e, 10);
2912 2912
2913 if (!*buf || (*e && *e != '\n')) 2913 if (!*buf || (*e && *e != '\n'))
2914 return -EINVAL; 2914 return -EINVAL;
2915 2915
2916 if (mddev->pers) { 2916 if (mddev->pers) {
2917 int err; 2917 int err;
2918 if (mddev->pers->check_reshape == NULL) 2918 if (mddev->pers->check_reshape == NULL)
2919 return -EBUSY; 2919 return -EBUSY;
2920 mddev->new_chunk_sectors = n >> 9; 2920 mddev->new_chunk_sectors = n >> 9;
2921 err = mddev->pers->check_reshape(mddev); 2921 err = mddev->pers->check_reshape(mddev);
2922 if (err) { 2922 if (err) {
2923 mddev->new_chunk_sectors = mddev->chunk_sectors; 2923 mddev->new_chunk_sectors = mddev->chunk_sectors;
2924 return err; 2924 return err;
2925 } 2925 }
2926 } else { 2926 } else {
2927 mddev->new_chunk_sectors = n >> 9; 2927 mddev->new_chunk_sectors = n >> 9;
2928 if (mddev->reshape_position == MaxSector) 2928 if (mddev->reshape_position == MaxSector)
2929 mddev->chunk_sectors = n >> 9; 2929 mddev->chunk_sectors = n >> 9;
2930 } 2930 }
2931 return len; 2931 return len;
2932 } 2932 }
2933 static struct md_sysfs_entry md_chunk_size = 2933 static struct md_sysfs_entry md_chunk_size =
2934 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 2934 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
2935 2935
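Note that the chunk_size attribute is read and written in bytes, while md stores chunk sizes internally in 512-byte sectors: chunk_size_store() shifts the value right by 9 and chunk_size_show() shifts it back left. A minimal sketch of that conversion, using a hypothetical 512 KiB chunk:

    /* Hypothetical example: userspace writes 524288 (bytes) to chunk_size. */
    unsigned long bytes = 524288;
    unsigned int new_chunk_sectors = bytes >> 9;   /* 1024 sectors kept internally */
    unsigned long shown = new_chunk_sectors << 9;  /* 524288, printed by chunk_size_show() */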
2936 static ssize_t 2936 static ssize_t
2937 resync_start_show(mddev_t *mddev, char *page) 2937 resync_start_show(mddev_t *mddev, char *page)
2938 { 2938 {
2939 if (mddev->recovery_cp == MaxSector) 2939 if (mddev->recovery_cp == MaxSector)
2940 return sprintf(page, "none\n"); 2940 return sprintf(page, "none\n");
2941 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 2941 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
2942 } 2942 }
2943 2943
2944 static ssize_t 2944 static ssize_t
2945 resync_start_store(mddev_t *mddev, const char *buf, size_t len) 2945 resync_start_store(mddev_t *mddev, const char *buf, size_t len)
2946 { 2946 {
2947 char *e; 2947 char *e;
2948 unsigned long long n = simple_strtoull(buf, &e, 10); 2948 unsigned long long n = simple_strtoull(buf, &e, 10);
2949 2949
2950 if (mddev->pers) 2950 if (mddev->pers)
2951 return -EBUSY; 2951 return -EBUSY;
2952 if (!*buf || (*e && *e != '\n')) 2952 if (!*buf || (*e && *e != '\n'))
2953 return -EINVAL; 2953 return -EINVAL;
2954 2954
2955 mddev->recovery_cp = n; 2955 mddev->recovery_cp = n;
2956 return len; 2956 return len;
2957 } 2957 }
2958 static struct md_sysfs_entry md_resync_start = 2958 static struct md_sysfs_entry md_resync_start =
2959 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 2959 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
2960 2960
2961 /* 2961 /*
2962 * The array state can be: 2962 * The array state can be:
2963 * 2963 *
2964 * clear 2964 * clear
2965 * No devices, no size, no level 2965 * No devices, no size, no level
2966 * Equivalent to STOP_ARRAY ioctl 2966 * Equivalent to STOP_ARRAY ioctl
2967 * inactive 2967 * inactive
2968 * May have some settings, but array is not active 2968 * May have some settings, but array is not active
2969 * all IO results in error 2969 * all IO results in error
2970 * When written, doesn't tear down array, but just stops it 2970 * When written, doesn't tear down array, but just stops it
2971 * suspended (not supported yet) 2971 * suspended (not supported yet)
2972 * All IO requests will block. The array can be reconfigured. 2972 * All IO requests will block. The array can be reconfigured.
2973 * Writing this, if accepted, will block until array is quiescent 2973 * Writing this, if accepted, will block until array is quiescent
2974 * readonly 2974 * readonly
2975 * no resync can happen. no superblocks get written. 2975 * no resync can happen. no superblocks get written.
2976 * write requests fail 2976 * write requests fail
2977 * read-auto 2977 * read-auto
2978 * like readonly, but behaves like 'clean' on a write request. 2978 * like readonly, but behaves like 'clean' on a write request.
2979 * 2979 *
2980 * clean - no pending writes, but otherwise active. 2980 * clean - no pending writes, but otherwise active.
2981 * When written to inactive array, starts without resync 2981 * When written to inactive array, starts without resync
2982 * If a write request arrives then 2982 * If a write request arrives then
2983 * if metadata is known, mark 'dirty' and switch to 'active'. 2983 * if metadata is known, mark 'dirty' and switch to 'active'.
2984 * if not known, block and switch to write-pending 2984 * if not known, block and switch to write-pending
2985 * If written to an active array that has pending writes, then fails. 2985 * If written to an active array that has pending writes, then fails.
2986 * active 2986 * active
2987 * fully active: IO and resync can be happening. 2987 * fully active: IO and resync can be happening.
2988 * When written to inactive array, starts with resync 2988 * When written to inactive array, starts with resync
2989 * 2989 *
2990 * write-pending 2990 * write-pending
2991 * clean, but writes are blocked waiting for 'active' to be written. 2991 * clean, but writes are blocked waiting for 'active' to be written.
2992 * 2992 *
2993 * active-idle 2993 * active-idle
2994 * like active, but no writes have been seen for a while (100msec). 2994 * like active, but no writes have been seen for a while (100msec).
2995 * 2995 *
2996 */ 2996 */
2997 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 2997 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
2998 write_pending, active_idle, bad_word}; 2998 write_pending, active_idle, bad_word};
2999 static char *array_states[] = { 2999 static char *array_states[] = {
3000 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3000 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3001 "write-pending", "active-idle", NULL }; 3001 "write-pending", "active-idle", NULL };
3002 3002
3003 static int match_word(const char *word, char **list) 3003 static int match_word(const char *word, char **list)
3004 { 3004 {
3005 int n; 3005 int n;
3006 for (n=0; list[n]; n++) 3006 for (n=0; list[n]; n++)
3007 if (cmd_match(word, list[n])) 3007 if (cmd_match(word, list[n]))
3008 break; 3008 break;
3009 return n; 3009 return n;
3010 } 3010 }
3011 3011
3012 static ssize_t 3012 static ssize_t
3013 array_state_show(mddev_t *mddev, char *page) 3013 array_state_show(mddev_t *mddev, char *page)
3014 { 3014 {
3015 enum array_state st = inactive; 3015 enum array_state st = inactive;
3016 3016
3017 if (mddev->pers) 3017 if (mddev->pers)
3018 switch(mddev->ro) { 3018 switch(mddev->ro) {
3019 case 1: 3019 case 1:
3020 st = readonly; 3020 st = readonly;
3021 break; 3021 break;
3022 case 2: 3022 case 2:
3023 st = read_auto; 3023 st = read_auto;
3024 break; 3024 break;
3025 case 0: 3025 case 0:
3026 if (mddev->in_sync) 3026 if (mddev->in_sync)
3027 st = clean; 3027 st = clean;
3028 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 3028 else if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
3029 st = write_pending; 3029 st = write_pending;
3030 else if (mddev->safemode) 3030 else if (mddev->safemode)
3031 st = active_idle; 3031 st = active_idle;
3032 else 3032 else
3033 st = active; 3033 st = active;
3034 } 3034 }
3035 else { 3035 else {
3036 if (list_empty(&mddev->disks) && 3036 if (list_empty(&mddev->disks) &&
3037 mddev->raid_disks == 0 && 3037 mddev->raid_disks == 0 &&
3038 mddev->dev_sectors == 0) 3038 mddev->dev_sectors == 0)
3039 st = clear; 3039 st = clear;
3040 else 3040 else
3041 st = inactive; 3041 st = inactive;
3042 } 3042 }
3043 return sprintf(page, "%s\n", array_states[st]); 3043 return sprintf(page, "%s\n", array_states[st]);
3044 } 3044 }
3045 3045
3046 static int do_md_stop(mddev_t * mddev, int ro, int is_open); 3046 static int do_md_stop(mddev_t * mddev, int ro, int is_open);
3047 static int do_md_run(mddev_t * mddev); 3047 static int do_md_run(mddev_t * mddev);
3048 static int restart_array(mddev_t *mddev); 3048 static int restart_array(mddev_t *mddev);
3049 3049
3050 static ssize_t 3050 static ssize_t
3051 array_state_store(mddev_t *mddev, const char *buf, size_t len) 3051 array_state_store(mddev_t *mddev, const char *buf, size_t len)
3052 { 3052 {
3053 int err = -EINVAL; 3053 int err = -EINVAL;
3054 enum array_state st = match_word(buf, array_states); 3054 enum array_state st = match_word(buf, array_states);
3055 switch(st) { 3055 switch(st) {
3056 case bad_word: 3056 case bad_word:
3057 break; 3057 break;
3058 case clear: 3058 case clear:
3059 /* stopping an active array */ 3059 /* stopping an active array */
3060 if (atomic_read(&mddev->openers) > 0) 3060 if (atomic_read(&mddev->openers) > 0)
3061 return -EBUSY; 3061 return -EBUSY;
3062 err = do_md_stop(mddev, 0, 0); 3062 err = do_md_stop(mddev, 0, 0);
3063 break; 3063 break;
3064 case inactive: 3064 case inactive:
3065 /* stopping an active array */ 3065 /* stopping an active array */
3066 if (mddev->pers) { 3066 if (mddev->pers) {
3067 if (atomic_read(&mddev->openers) > 0) 3067 if (atomic_read(&mddev->openers) > 0)
3068 return -EBUSY; 3068 return -EBUSY;
3069 err = do_md_stop(mddev, 2, 0); 3069 err = do_md_stop(mddev, 2, 0);
3070 } else 3070 } else
3071 err = 0; /* already inactive */ 3071 err = 0; /* already inactive */
3072 break; 3072 break;
3073 case suspended: 3073 case suspended:
3074 break; /* not supported yet */ 3074 break; /* not supported yet */
3075 case readonly: 3075 case readonly:
3076 if (mddev->pers) 3076 if (mddev->pers)
3077 err = do_md_stop(mddev, 1, 0); 3077 err = do_md_stop(mddev, 1, 0);
3078 else { 3078 else {
3079 mddev->ro = 1; 3079 mddev->ro = 1;
3080 set_disk_ro(mddev->gendisk, 1); 3080 set_disk_ro(mddev->gendisk, 1);
3081 err = do_md_run(mddev); 3081 err = do_md_run(mddev);
3082 } 3082 }
3083 break; 3083 break;
3084 case read_auto: 3084 case read_auto:
3085 if (mddev->pers) { 3085 if (mddev->pers) {
3086 if (mddev->ro == 0) 3086 if (mddev->ro == 0)
3087 err = do_md_stop(mddev, 1, 0); 3087 err = do_md_stop(mddev, 1, 0);
3088 else if (mddev->ro == 1) 3088 else if (mddev->ro == 1)
3089 err = restart_array(mddev); 3089 err = restart_array(mddev);
3090 if (err == 0) { 3090 if (err == 0) {
3091 mddev->ro = 2; 3091 mddev->ro = 2;
3092 set_disk_ro(mddev->gendisk, 0); 3092 set_disk_ro(mddev->gendisk, 0);
3093 } 3093 }
3094 } else { 3094 } else {
3095 mddev->ro = 2; 3095 mddev->ro = 2;
3096 err = do_md_run(mddev); 3096 err = do_md_run(mddev);
3097 } 3097 }
3098 break; 3098 break;
3099 case clean: 3099 case clean:
3100 if (mddev->pers) { 3100 if (mddev->pers) {
3101 restart_array(mddev); 3101 restart_array(mddev);
3102 spin_lock_irq(&mddev->write_lock); 3102 spin_lock_irq(&mddev->write_lock);
3103 if (atomic_read(&mddev->writes_pending) == 0) { 3103 if (atomic_read(&mddev->writes_pending) == 0) {
3104 if (mddev->in_sync == 0) { 3104 if (mddev->in_sync == 0) {
3105 mddev->in_sync = 1; 3105 mddev->in_sync = 1;
3106 if (mddev->safemode == 1) 3106 if (mddev->safemode == 1)
3107 mddev->safemode = 0; 3107 mddev->safemode = 0;
3108 if (mddev->persistent) 3108 if (mddev->persistent)
3109 set_bit(MD_CHANGE_CLEAN, 3109 set_bit(MD_CHANGE_CLEAN,
3110 &mddev->flags); 3110 &mddev->flags);
3111 } 3111 }
3112 err = 0; 3112 err = 0;
3113 } else 3113 } else
3114 err = -EBUSY; 3114 err = -EBUSY;
3115 spin_unlock_irq(&mddev->write_lock); 3115 spin_unlock_irq(&mddev->write_lock);
3116 } else 3116 } else
3117 err = -EINVAL; 3117 err = -EINVAL;
3118 break; 3118 break;
3119 case active: 3119 case active:
3120 if (mddev->pers) { 3120 if (mddev->pers) {
3121 restart_array(mddev); 3121 restart_array(mddev);
3122 if (mddev->external) 3122 if (mddev->external)
3123 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 3123 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
3124 wake_up(&mddev->sb_wait); 3124 wake_up(&mddev->sb_wait);
3125 err = 0; 3125 err = 0;
3126 } else { 3126 } else {
3127 mddev->ro = 0; 3127 mddev->ro = 0;
3128 set_disk_ro(mddev->gendisk, 0); 3128 set_disk_ro(mddev->gendisk, 0);
3129 err = do_md_run(mddev); 3129 err = do_md_run(mddev);
3130 } 3130 }
3131 break; 3131 break;
3132 case write_pending: 3132 case write_pending:
3133 case active_idle: 3133 case active_idle:
3134 /* these cannot be set */ 3134 /* these cannot be set */
3135 break; 3135 break;
3136 } 3136 }
3137 if (err) 3137 if (err)
3138 return err; 3138 return err;
3139 else { 3139 else {
3140 sysfs_notify_dirent(mddev->sysfs_state); 3140 sysfs_notify_dirent(mddev->sysfs_state);
3141 return len; 3141 return len;
3142 } 3142 }
3143 } 3143 }
3144 static struct md_sysfs_entry md_array_state = 3144 static struct md_sysfs_entry md_array_state =
3145 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3145 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3146 3146
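As documented in the comment above array_state_show(), the array_state attribute both reports and drives state transitions. A hedged userspace sketch, assuming the array is md0 and the attribute sits at the usual /sys/block/md0/md/array_state path:

    #include <fcntl.h>
    #include <unistd.h>

    /* Write "clean" to array_state; array_state_store() only accepts this
     * on a running array when no writes are pending, otherwise it
     * returns -EBUSY.
     */
    static int mark_md0_clean(void)
    {
            int fd = open("/sys/block/md0/md/array_state", O_WRONLY);
            ssize_t n;

            if (fd < 0)
                    return -1;
            n = write(fd, "clean", 5);
            close(fd);
            return n == 5 ? 0 : -1;
    }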
3147 static ssize_t 3147 static ssize_t
3148 null_show(mddev_t *mddev, char *page) 3148 null_show(mddev_t *mddev, char *page)
3149 { 3149 {
3150 return -EINVAL; 3150 return -EINVAL;
3151 } 3151 }
3152 3152
3153 static ssize_t 3153 static ssize_t
3154 new_dev_store(mddev_t *mddev, const char *buf, size_t len) 3154 new_dev_store(mddev_t *mddev, const char *buf, size_t len)
3155 { 3155 {
3156 /* buf must be %d:%d\n? giving major and minor numbers */ 3156 /* buf must be %d:%d\n? giving major and minor numbers */
3157 /* The new device is added to the array. 3157 /* The new device is added to the array.
3158 * If the array has a persistent superblock, we read the 3158 * If the array has a persistent superblock, we read the
3159 * superblock to initialise info and check validity. 3159 * superblock to initialise info and check validity.
3160 * Otherwise, only checking done is that in bind_rdev_to_array, 3160 * Otherwise, only checking done is that in bind_rdev_to_array,
3161 * which mainly checks size. 3161 * which mainly checks size.
3162 */ 3162 */
3163 char *e; 3163 char *e;
3164 int major = simple_strtoul(buf, &e, 10); 3164 int major = simple_strtoul(buf, &e, 10);
3165 int minor; 3165 int minor;
3166 dev_t dev; 3166 dev_t dev;
3167 mdk_rdev_t *rdev; 3167 mdk_rdev_t *rdev;
3168 int err; 3168 int err;
3169 3169
3170 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3170 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
3171 return -EINVAL; 3171 return -EINVAL;
3172 minor = simple_strtoul(e+1, &e, 10); 3172 minor = simple_strtoul(e+1, &e, 10);
3173 if (*e && *e != '\n') 3173 if (*e && *e != '\n')
3174 return -EINVAL; 3174 return -EINVAL;
3175 dev = MKDEV(major, minor); 3175 dev = MKDEV(major, minor);
3176 if (major != MAJOR(dev) || 3176 if (major != MAJOR(dev) ||
3177 minor != MINOR(dev)) 3177 minor != MINOR(dev))
3178 return -EOVERFLOW; 3178 return -EOVERFLOW;
3179 3179
3180 3180
3181 if (mddev->persistent) { 3181 if (mddev->persistent) {
3182 rdev = md_import_device(dev, mddev->major_version, 3182 rdev = md_import_device(dev, mddev->major_version,
3183 mddev->minor_version); 3183 mddev->minor_version);
3184 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 3184 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
3185 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 3185 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
3186 mdk_rdev_t, same_set); 3186 mdk_rdev_t, same_set);
3187 err = super_types[mddev->major_version] 3187 err = super_types[mddev->major_version]
3188 .load_super(rdev, rdev0, mddev->minor_version); 3188 .load_super(rdev, rdev0, mddev->minor_version);
3189 if (err < 0) 3189 if (err < 0)
3190 goto out; 3190 goto out;
3191 } 3191 }
3192 } else if (mddev->external) 3192 } else if (mddev->external)
3193 rdev = md_import_device(dev, -2, -1); 3193 rdev = md_import_device(dev, -2, -1);
3194 else 3194 else
3195 rdev = md_import_device(dev, -1, -1); 3195 rdev = md_import_device(dev, -1, -1);
3196 3196
3197 if (IS_ERR(rdev)) 3197 if (IS_ERR(rdev))
3198 return PTR_ERR(rdev); 3198 return PTR_ERR(rdev);
3199 err = bind_rdev_to_array(rdev, mddev); 3199 err = bind_rdev_to_array(rdev, mddev);
3200 out: 3200 out:
3201 if (err) 3201 if (err)
3202 export_rdev(rdev); 3202 export_rdev(rdev);
3203 return err ? err : len; 3203 return err ? err : len;
3204 } 3204 }
3205 3205
3206 static struct md_sysfs_entry md_new_device = 3206 static struct md_sysfs_entry md_new_device =
3207 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 3207 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
3208 3208
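new_dev_store() expects "major:minor", optionally newline-terminated. For illustration only, a hypothetical device numbered 8:17 would be added with a payload like:

    /* Illustrative new_dev payload for a device with major 8, minor 17. */
    static const char example_new_dev[] = "8:17\n";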
3209 static ssize_t 3209 static ssize_t
3210 bitmap_store(mddev_t *mddev, const char *buf, size_t len) 3210 bitmap_store(mddev_t *mddev, const char *buf, size_t len)
3211 { 3211 {
3212 char *end; 3212 char *end;
3213 unsigned long chunk, end_chunk; 3213 unsigned long chunk, end_chunk;
3214 3214
3215 if (!mddev->bitmap) 3215 if (!mddev->bitmap)
3216 goto out; 3216 goto out;
3217 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 3217 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
3218 while (*buf) { 3218 while (*buf) {
3219 chunk = end_chunk = simple_strtoul(buf, &end, 0); 3219 chunk = end_chunk = simple_strtoul(buf, &end, 0);
3220 if (buf == end) break; 3220 if (buf == end) break;
3221 if (*end == '-') { /* range */ 3221 if (*end == '-') { /* range */
3222 buf = end + 1; 3222 buf = end + 1;
3223 end_chunk = simple_strtoul(buf, &end, 0); 3223 end_chunk = simple_strtoul(buf, &end, 0);
3224 if (buf == end) break; 3224 if (buf == end) break;
3225 } 3225 }
3226 if (*end && !isspace(*end)) break; 3226 if (*end && !isspace(*end)) break;
3227 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 3227 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
3228 buf = end; 3228 buf = end;
3229 while (isspace(*buf)) buf++; 3229 while (isspace(*buf)) buf++;
3230 } 3230 }
3231 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 3231 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
3232 out: 3232 out:
3233 return len; 3233 return len;
3234 } 3234 }
3235 3235
3236 static struct md_sysfs_entry md_bitmap = 3236 static struct md_sysfs_entry md_bitmap =
3237 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 3237 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
3238 3238
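bitmap_store() parses a whitespace-separated list of chunk numbers or inclusive chunk ranges, so a write such as the following (values invented for illustration) marks those bitmap regions dirty so they will be resynced:

    /* Illustrative input for bitmap_set_bits; each entry is handed to
     * bitmap_dirty_bits(), and bitmap_unplug() then flushes the bits to disk.
     */
    static const char example_bits[] = "64 128-255 1024\n";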
3239 static ssize_t 3239 static ssize_t
3240 size_show(mddev_t *mddev, char *page) 3240 size_show(mddev_t *mddev, char *page)
3241 { 3241 {
3242 return sprintf(page, "%llu\n", 3242 return sprintf(page, "%llu\n",
3243 (unsigned long long)mddev->dev_sectors / 2); 3243 (unsigned long long)mddev->dev_sectors / 2);
3244 } 3244 }
3245 3245
3246 static int update_size(mddev_t *mddev, sector_t num_sectors); 3246 static int update_size(mddev_t *mddev, sector_t num_sectors);
3247 3247
3248 static ssize_t 3248 static ssize_t
3249 size_store(mddev_t *mddev, const char *buf, size_t len) 3249 size_store(mddev_t *mddev, const char *buf, size_t len)
3250 { 3250 {
3251 /* If array is inactive, we can reduce the component size, but 3251 /* If array is inactive, we can reduce the component size, but
3252 * not increase it (except from 0). 3252 * not increase it (except from 0).
3253 * If array is active, we can try an on-line resize 3253 * If array is active, we can try an on-line resize
3254 */ 3254 */
3255 sector_t sectors; 3255 sector_t sectors;
3256 int err = strict_blocks_to_sectors(buf, &sectors); 3256 int err = strict_blocks_to_sectors(buf, &sectors);
3257 3257
3258 if (err < 0) 3258 if (err < 0)
3259 return err; 3259 return err;
3260 if (mddev->pers) { 3260 if (mddev->pers) {
3261 err = update_size(mddev, sectors); 3261 err = update_size(mddev, sectors);
3262 md_update_sb(mddev, 1); 3262 md_update_sb(mddev, 1);
3263 } else { 3263 } else {
3264 if (mddev->dev_sectors == 0 || 3264 if (mddev->dev_sectors == 0 ||
3265 mddev->dev_sectors > sectors) 3265 mddev->dev_sectors > sectors)
3266 mddev->dev_sectors = sectors; 3266 mddev->dev_sectors = sectors;
3267 else 3267 else
3268 err = -ENOSPC; 3268 err = -ENOSPC;
3269 } 3269 }
3270 return err ? err : len; 3270 return err ? err : len;
3271 } 3271 }
3272 3272
3273 static struct md_sysfs_entry md_size = 3273 static struct md_sysfs_entry md_size =
3274 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 3274 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
3275 3275
3276 3276
3277 /* Metadata version. 3277 /* Metadata version.
3278 * This is one of 3278 * This is one of
3279 * 'none' for arrays with no metadata (good luck...) 3279 * 'none' for arrays with no metadata (good luck...)
3280 * 'external' for arrays with externally managed metadata, 3280 * 'external' for arrays with externally managed metadata,
3281 * or N.M for internally known formats 3281 * or N.M for internally known formats
3282 */ 3282 */
3283 static ssize_t 3283 static ssize_t
3284 metadata_show(mddev_t *mddev, char *page) 3284 metadata_show(mddev_t *mddev, char *page)
3285 { 3285 {
3286 if (mddev->persistent) 3286 if (mddev->persistent)
3287 return sprintf(page, "%d.%d\n", 3287 return sprintf(page, "%d.%d\n",
3288 mddev->major_version, mddev->minor_version); 3288 mddev->major_version, mddev->minor_version);
3289 else if (mddev->external) 3289 else if (mddev->external)
3290 return sprintf(page, "external:%s\n", mddev->metadata_type); 3290 return sprintf(page, "external:%s\n", mddev->metadata_type);
3291 else 3291 else
3292 return sprintf(page, "none\n"); 3292 return sprintf(page, "none\n");
3293 } 3293 }
3294 3294
3295 static ssize_t 3295 static ssize_t
3296 metadata_store(mddev_t *mddev, const char *buf, size_t len) 3296 metadata_store(mddev_t *mddev, const char *buf, size_t len)
3297 { 3297 {
3298 int major, minor; 3298 int major, minor;
3299 char *e; 3299 char *e;
3300 /* Changing the details of 'external' metadata is 3300 /* Changing the details of 'external' metadata is
3301 * always permitted. Otherwise there must be 3301 * always permitted. Otherwise there must be
3302 * no devices attached to the array. 3302 * no devices attached to the array.
3303 */ 3303 */
3304 if (mddev->external && strncmp(buf, "external:", 9) == 0) 3304 if (mddev->external && strncmp(buf, "external:", 9) == 0)
3305 ; 3305 ;
3306 else if (!list_empty(&mddev->disks)) 3306 else if (!list_empty(&mddev->disks))
3307 return -EBUSY; 3307 return -EBUSY;
3308 3308
3309 if (cmd_match(buf, "none")) { 3309 if (cmd_match(buf, "none")) {
3310 mddev->persistent = 0; 3310 mddev->persistent = 0;
3311 mddev->external = 0; 3311 mddev->external = 0;
3312 mddev->major_version = 0; 3312 mddev->major_version = 0;
3313 mddev->minor_version = 90; 3313 mddev->minor_version = 90;
3314 return len; 3314 return len;
3315 } 3315 }
3316 if (strncmp(buf, "external:", 9) == 0) { 3316 if (strncmp(buf, "external:", 9) == 0) {
3317 size_t namelen = len-9; 3317 size_t namelen = len-9;
3318 if (namelen >= sizeof(mddev->metadata_type)) 3318 if (namelen >= sizeof(mddev->metadata_type))
3319 namelen = sizeof(mddev->metadata_type)-1; 3319 namelen = sizeof(mddev->metadata_type)-1;
3320 strncpy(mddev->metadata_type, buf+9, namelen); 3320 strncpy(mddev->metadata_type, buf+9, namelen);
3321 mddev->metadata_type[namelen] = 0; 3321 mddev->metadata_type[namelen] = 0;
3322 if (namelen && mddev->metadata_type[namelen-1] == '\n') 3322 if (namelen && mddev->metadata_type[namelen-1] == '\n')
3323 mddev->metadata_type[--namelen] = 0; 3323 mddev->metadata_type[--namelen] = 0;
3324 mddev->persistent = 0; 3324 mddev->persistent = 0;
3325 mddev->external = 1; 3325 mddev->external = 1;
3326 mddev->major_version = 0; 3326 mddev->major_version = 0;
3327 mddev->minor_version = 90; 3327 mddev->minor_version = 90;
3328 return len; 3328 return len;
3329 } 3329 }
3330 major = simple_strtoul(buf, &e, 10); 3330 major = simple_strtoul(buf, &e, 10);
3331 if (e==buf || *e != '.') 3331 if (e==buf || *e != '.')
3332 return -EINVAL; 3332 return -EINVAL;
3333 buf = e+1; 3333 buf = e+1;
3334 minor = simple_strtoul(buf, &e, 10); 3334 minor = simple_strtoul(buf, &e, 10);
3335 if (e==buf || (*e && *e != '\n') ) 3335 if (e==buf || (*e && *e != '\n') )
3336 return -EINVAL; 3336 return -EINVAL;
3337 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 3337 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
3338 return -ENOENT; 3338 return -ENOENT;
3339 mddev->major_version = major; 3339 mddev->major_version = major;
3340 mddev->minor_version = minor; 3340 mddev->minor_version = minor;
3341 mddev->persistent = 1; 3341 mddev->persistent = 1;
3342 mddev->external = 0; 3342 mddev->external = 0;
3343 return len; 3343 return len;
3344 } 3344 }
3345 3345
3346 static struct md_sysfs_entry md_metadata = 3346 static struct md_sysfs_entry md_metadata =
3347 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 3347 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
3348 3348
3349 static ssize_t 3349 static ssize_t
3350 action_show(mddev_t *mddev, char *page) 3350 action_show(mddev_t *mddev, char *page)
3351 { 3351 {
3352 char *type = "idle"; 3352 char *type = "idle";
3353 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3353 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3354 type = "frozen"; 3354 type = "frozen";
3355 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3355 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3356 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 3356 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
3357 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 3357 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
3358 type = "reshape"; 3358 type = "reshape";
3359 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 3359 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
3360 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 3360 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
3361 type = "resync"; 3361 type = "resync";
3362 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 3362 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
3363 type = "check"; 3363 type = "check";
3364 else 3364 else
3365 type = "repair"; 3365 type = "repair";
3366 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 3366 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
3367 type = "recover"; 3367 type = "recover";
3368 } 3368 }
3369 return sprintf(page, "%s\n", type); 3369 return sprintf(page, "%s\n", type);
3370 } 3370 }
3371 3371
3372 static ssize_t 3372 static ssize_t
3373 action_store(mddev_t *mddev, const char *page, size_t len) 3373 action_store(mddev_t *mddev, const char *page, size_t len)
3374 { 3374 {
3375 if (!mddev->pers || !mddev->pers->sync_request) 3375 if (!mddev->pers || !mddev->pers->sync_request)
3376 return -EINVAL; 3376 return -EINVAL;
3377 3377
3378 if (cmd_match(page, "frozen")) 3378 if (cmd_match(page, "frozen"))
3379 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3379 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3380 else 3380 else
3381 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 3381 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
3382 3382
3383 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 3383 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
3384 if (mddev->sync_thread) { 3384 if (mddev->sync_thread) {
3385 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 3385 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
3386 md_unregister_thread(mddev->sync_thread); 3386 md_unregister_thread(mddev->sync_thread);
3387 mddev->sync_thread = NULL; 3387 mddev->sync_thread = NULL;
3388 mddev->recovery = 0; 3388 mddev->recovery = 0;
3389 } 3389 }
3390 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 3390 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
3391 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 3391 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
3392 return -EBUSY; 3392 return -EBUSY;
3393 else if (cmd_match(page, "resync")) 3393 else if (cmd_match(page, "resync"))
3394 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3394 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3395 else if (cmd_match(page, "recover")) { 3395 else if (cmd_match(page, "recover")) {
3396 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 3396 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
3397 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3397 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3398 } else if (cmd_match(page, "reshape")) { 3398 } else if (cmd_match(page, "reshape")) {
3399 int err; 3399 int err;
3400 if (mddev->pers->start_reshape == NULL) 3400 if (mddev->pers->start_reshape == NULL)
3401 return -EINVAL; 3401 return -EINVAL;
3402 err = mddev->pers->start_reshape(mddev); 3402 err = mddev->pers->start_reshape(mddev);
3403 if (err) 3403 if (err)
3404 return err; 3404 return err;
3405 sysfs_notify(&mddev->kobj, NULL, "degraded"); 3405 sysfs_notify(&mddev->kobj, NULL, "degraded");
3406 } else { 3406 } else {
3407 if (cmd_match(page, "check")) 3407 if (cmd_match(page, "check"))
3408 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 3408 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
3409 else if (!cmd_match(page, "repair")) 3409 else if (!cmd_match(page, "repair"))
3410 return -EINVAL; 3410 return -EINVAL;
3411 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 3411 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
3412 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 3412 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
3413 } 3413 }
3414 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 3414 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3415 md_wakeup_thread(mddev->thread); 3415 md_wakeup_thread(mddev->thread);
3416 sysfs_notify_dirent(mddev->sysfs_action); 3416 sysfs_notify_dirent(mddev->sysfs_action);
3417 return len; 3417 return len;
3418 } 3418 }
3419 3419
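For reference, a sketch collecting the keywords action_store() recognises; writing one of them to sync_action starts the matching operation, while "idle" and "frozen" interrupt whatever is currently running:

    /* Keywords accepted by action_store(), per the checks above. */
    static const char * const sync_action_words[] = {
            "idle", "frozen", "resync", "recover", "reshape", "check", "repair",
    };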
3420 static ssize_t 3420 static ssize_t
3421 mismatch_cnt_show(mddev_t *mddev, char *page) 3421 mismatch_cnt_show(mddev_t *mddev, char *page)
3422 { 3422 {
3423 return sprintf(page, "%llu\n", 3423 return sprintf(page, "%llu\n",
3424 (unsigned long long) mddev->resync_mismatches); 3424 (unsigned long long) mddev->resync_mismatches);
3425 } 3425 }
3426 3426
3427 static struct md_sysfs_entry md_scan_mode = 3427 static struct md_sysfs_entry md_scan_mode =
3428 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 3428 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
3429 3429
3430 3430
3431 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 3431 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
3432 3432
3433 static ssize_t 3433 static ssize_t
3434 sync_min_show(mddev_t *mddev, char *page) 3434 sync_min_show(mddev_t *mddev, char *page)
3435 { 3435 {
3436 return sprintf(page, "%d (%s)\n", speed_min(mddev), 3436 return sprintf(page, "%d (%s)\n", speed_min(mddev),
3437 mddev->sync_speed_min ? "local": "system"); 3437 mddev->sync_speed_min ? "local": "system");
3438 } 3438 }
3439 3439
3440 static ssize_t 3440 static ssize_t
3441 sync_min_store(mddev_t *mddev, const char *buf, size_t len) 3441 sync_min_store(mddev_t *mddev, const char *buf, size_t len)
3442 { 3442 {
3443 int min; 3443 int min;
3444 char *e; 3444 char *e;
3445 if (strncmp(buf, "system", 6)==0) { 3445 if (strncmp(buf, "system", 6)==0) {
3446 mddev->sync_speed_min = 0; 3446 mddev->sync_speed_min = 0;
3447 return len; 3447 return len;
3448 } 3448 }
3449 min = simple_strtoul(buf, &e, 10); 3449 min = simple_strtoul(buf, &e, 10);
3450 if (buf == e || (*e && *e != '\n') || min <= 0) 3450 if (buf == e || (*e && *e != '\n') || min <= 0)
3451 return -EINVAL; 3451 return -EINVAL;
3452 mddev->sync_speed_min = min; 3452 mddev->sync_speed_min = min;
3453 return len; 3453 return len;
3454 } 3454 }
3455 3455
3456 static struct md_sysfs_entry md_sync_min = 3456 static struct md_sysfs_entry md_sync_min =
3457 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 3457 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
3458 3458
3459 static ssize_t 3459 static ssize_t
3460 sync_max_show(mddev_t *mddev, char *page) 3460 sync_max_show(mddev_t *mddev, char *page)
3461 { 3461 {
3462 return sprintf(page, "%d (%s)\n", speed_max(mddev), 3462 return sprintf(page, "%d (%s)\n", speed_max(mddev),
3463 mddev->sync_speed_max ? "local": "system"); 3463 mddev->sync_speed_max ? "local": "system");
3464 } 3464 }
3465 3465
3466 static ssize_t 3466 static ssize_t
3467 sync_max_store(mddev_t *mddev, const char *buf, size_t len) 3467 sync_max_store(mddev_t *mddev, const char *buf, size_t len)
3468 { 3468 {
3469 int max; 3469 int max;
3470 char *e; 3470 char *e;
3471 if (strncmp(buf, "system", 6)==0) { 3471 if (strncmp(buf, "system", 6)==0) {
3472 mddev->sync_speed_max = 0; 3472 mddev->sync_speed_max = 0;
3473 return len; 3473 return len;
3474 } 3474 }
3475 max = simple_strtoul(buf, &e, 10); 3475 max = simple_strtoul(buf, &e, 10);
3476 if (buf == e || (*e && *e != '\n') || max <= 0) 3476 if (buf == e || (*e && *e != '\n') || max <= 0)
3477 return -EINVAL; 3477 return -EINVAL;
3478 mddev->sync_speed_max = max; 3478 mddev->sync_speed_max = max;
3479 return len; 3479 return len;
3480 } 3480 }
3481 3481
3482 static struct md_sysfs_entry md_sync_max = 3482 static struct md_sysfs_entry md_sync_max =
3483 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 3483 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
3484 3484
3485 static ssize_t 3485 static ssize_t
3486 degraded_show(mddev_t *mddev, char *page) 3486 degraded_show(mddev_t *mddev, char *page)
3487 { 3487 {
3488 return sprintf(page, "%d\n", mddev->degraded); 3488 return sprintf(page, "%d\n", mddev->degraded);
3489 } 3489 }
3490 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 3490 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
3491 3491
3492 static ssize_t 3492 static ssize_t
3493 sync_force_parallel_show(mddev_t *mddev, char *page) 3493 sync_force_parallel_show(mddev_t *mddev, char *page)
3494 { 3494 {
3495 return sprintf(page, "%d\n", mddev->parallel_resync); 3495 return sprintf(page, "%d\n", mddev->parallel_resync);
3496 } 3496 }
3497 3497
3498 static ssize_t 3498 static ssize_t
3499 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len) 3499 sync_force_parallel_store(mddev_t *mddev, const char *buf, size_t len)
3500 { 3500 {
3501 long n; 3501 long n;
3502 3502
3503 if (strict_strtol(buf, 10, &n)) 3503 if (strict_strtol(buf, 10, &n))
3504 return -EINVAL; 3504 return -EINVAL;
3505 3505
3506 if (n != 0 && n != 1) 3506 if (n != 0 && n != 1)
3507 return -EINVAL; 3507 return -EINVAL;
3508 3508
3509 mddev->parallel_resync = n; 3509 mddev->parallel_resync = n;
3510 3510
3511 if (mddev->sync_thread) 3511 if (mddev->sync_thread)
3512 wake_up(&resync_wait); 3512 wake_up(&resync_wait);
3513 3513
3514 return len; 3514 return len;
3515 } 3515 }
3516 3516
3517 /* force parallel resync, even with shared block devices */ 3517 /* force parallel resync, even with shared block devices */
3518 static struct md_sysfs_entry md_sync_force_parallel = 3518 static struct md_sysfs_entry md_sync_force_parallel =
3519 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 3519 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
3520 sync_force_parallel_show, sync_force_parallel_store); 3520 sync_force_parallel_show, sync_force_parallel_store);
3521 3521
3522 static ssize_t 3522 static ssize_t
3523 sync_speed_show(mddev_t *mddev, char *page) 3523 sync_speed_show(mddev_t *mddev, char *page)
3524 { 3524 {
3525 unsigned long resync, dt, db; 3525 unsigned long resync, dt, db;
3526 if (mddev->curr_resync == 0) 3526 if (mddev->curr_resync == 0)
3527 return sprintf(page, "none\n"); 3527 return sprintf(page, "none\n");
3528 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 3528 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
3529 dt = (jiffies - mddev->resync_mark) / HZ; 3529 dt = (jiffies - mddev->resync_mark) / HZ;
3530 if (!dt) dt++; 3530 if (!dt) dt++;
3531 db = resync - mddev->resync_mark_cnt; 3531 db = resync - mddev->resync_mark_cnt;
3532 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 3532 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
3533 } 3533 }
3534 3534
3535 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 3535 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
3536 3536
3537 static ssize_t 3537 static ssize_t
3538 sync_completed_show(mddev_t *mddev, char *page) 3538 sync_completed_show(mddev_t *mddev, char *page)
3539 { 3539 {
3540 unsigned long max_sectors, resync; 3540 unsigned long max_sectors, resync;
3541 3541
3542 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3542 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3543 return sprintf(page, "none\n"); 3543 return sprintf(page, "none\n");
3544 3544
3545 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 3545 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
3546 max_sectors = mddev->resync_max_sectors; 3546 max_sectors = mddev->resync_max_sectors;
3547 else 3547 else
3548 max_sectors = mddev->dev_sectors; 3548 max_sectors = mddev->dev_sectors;
3549 3549
3550 resync = mddev->curr_resync_completed; 3550 resync = mddev->curr_resync_completed;
3551 return sprintf(page, "%lu / %lu\n", resync, max_sectors); 3551 return sprintf(page, "%lu / %lu\n", resync, max_sectors);
3552 } 3552 }
3553 3553
3554 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 3554 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
3555 3555
3556 static ssize_t 3556 static ssize_t
3557 min_sync_show(mddev_t *mddev, char *page) 3557 min_sync_show(mddev_t *mddev, char *page)
3558 { 3558 {
3559 return sprintf(page, "%llu\n", 3559 return sprintf(page, "%llu\n",
3560 (unsigned long long)mddev->resync_min); 3560 (unsigned long long)mddev->resync_min);
3561 } 3561 }
3562 static ssize_t 3562 static ssize_t
3563 min_sync_store(mddev_t *mddev, const char *buf, size_t len) 3563 min_sync_store(mddev_t *mddev, const char *buf, size_t len)
3564 { 3564 {
3565 unsigned long long min; 3565 unsigned long long min;
3566 if (strict_strtoull(buf, 10, &min)) 3566 if (strict_strtoull(buf, 10, &min))
3567 return -EINVAL; 3567 return -EINVAL;
3568 if (min > mddev->resync_max) 3568 if (min > mddev->resync_max)
3569 return -EINVAL; 3569 return -EINVAL;
3570 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3570 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3571 return -EBUSY; 3571 return -EBUSY;
3572 3572
3573 /* Must be a multiple of chunk_size */ 3573 /* Must be a multiple of chunk_size */
3574 if (mddev->chunk_sectors) { 3574 if (mddev->chunk_sectors) {
3575 sector_t temp = min; 3575 sector_t temp = min;
3576 if (sector_div(temp, mddev->chunk_sectors)) 3576 if (sector_div(temp, mddev->chunk_sectors))
3577 return -EINVAL; 3577 return -EINVAL;
3578 } 3578 }
3579 mddev->resync_min = min; 3579 mddev->resync_min = min;
3580 3580
3581 return len; 3581 return len;
3582 } 3582 }
3583 3583
3584 static struct md_sysfs_entry md_min_sync = 3584 static struct md_sysfs_entry md_min_sync =
3585 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 3585 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
3586 3586
3587 static ssize_t 3587 static ssize_t
3588 max_sync_show(mddev_t *mddev, char *page) 3588 max_sync_show(mddev_t *mddev, char *page)
3589 { 3589 {
3590 if (mddev->resync_max == MaxSector) 3590 if (mddev->resync_max == MaxSector)
3591 return sprintf(page, "max\n"); 3591 return sprintf(page, "max\n");
3592 else 3592 else
3593 return sprintf(page, "%llu\n", 3593 return sprintf(page, "%llu\n",
3594 (unsigned long long)mddev->resync_max); 3594 (unsigned long long)mddev->resync_max);
3595 } 3595 }
3596 static ssize_t 3596 static ssize_t
3597 max_sync_store(mddev_t *mddev, const char *buf, size_t len) 3597 max_sync_store(mddev_t *mddev, const char *buf, size_t len)
3598 { 3598 {
3599 if (strncmp(buf, "max", 3) == 0) 3599 if (strncmp(buf, "max", 3) == 0)
3600 mddev->resync_max = MaxSector; 3600 mddev->resync_max = MaxSector;
3601 else { 3601 else {
3602 unsigned long long max; 3602 unsigned long long max;
3603 if (strict_strtoull(buf, 10, &max)) 3603 if (strict_strtoull(buf, 10, &max))
3604 return -EINVAL; 3604 return -EINVAL;
3605 if (max < mddev->resync_min) 3605 if (max < mddev->resync_min)
3606 return -EINVAL; 3606 return -EINVAL;
3607 if (max < mddev->resync_max && 3607 if (max < mddev->resync_max &&
3608 mddev->ro == 0 && 3608 mddev->ro == 0 &&
3609 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 3609 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
3610 return -EBUSY; 3610 return -EBUSY;
3611 3611
3612 /* Must be a multiple of chunk_size */ 3612 /* Must be a multiple of chunk_size */
3613 if (mddev->chunk_sectors) { 3613 if (mddev->chunk_sectors) {
3614 sector_t temp = max; 3614 sector_t temp = max;
3615 if (sector_div(temp, mddev->chunk_sectors)) 3615 if (sector_div(temp, mddev->chunk_sectors))
3616 return -EINVAL; 3616 return -EINVAL;
3617 } 3617 }
3618 mddev->resync_max = max; 3618 mddev->resync_max = max;
3619 } 3619 }
3620 wake_up(&mddev->recovery_wait); 3620 wake_up(&mddev->recovery_wait);
3621 return len; 3621 return len;
3622 } 3622 }
3623 3623
3624 static struct md_sysfs_entry md_max_sync = 3624 static struct md_sysfs_entry md_max_sync =
3625 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 3625 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
3626 3626
3627 static ssize_t 3627 static ssize_t
3628 suspend_lo_show(mddev_t *mddev, char *page) 3628 suspend_lo_show(mddev_t *mddev, char *page)
3629 { 3629 {
3630 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 3630 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
3631 } 3631 }
3632 3632
3633 static ssize_t 3633 static ssize_t
3634 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len) 3634 suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
3635 { 3635 {
3636 char *e; 3636 char *e;
3637 unsigned long long new = simple_strtoull(buf, &e, 10); 3637 unsigned long long new = simple_strtoull(buf, &e, 10);
3638 3638
3639 if (mddev->pers == NULL || 3639 if (mddev->pers == NULL ||
3640 mddev->pers->quiesce == NULL) 3640 mddev->pers->quiesce == NULL)
3641 return -EINVAL; 3641 return -EINVAL;
3642 if (buf == e || (*e && *e != '\n')) 3642 if (buf == e || (*e && *e != '\n'))
3643 return -EINVAL; 3643 return -EINVAL;
3644 if (new >= mddev->suspend_hi || 3644 if (new >= mddev->suspend_hi ||
3645 (new > mddev->suspend_lo && new < mddev->suspend_hi)) { 3645 (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
3646 mddev->suspend_lo = new; 3646 mddev->suspend_lo = new;
3647 mddev->pers->quiesce(mddev, 2); 3647 mddev->pers->quiesce(mddev, 2);
3648 return len; 3648 return len;
3649 } else 3649 } else
3650 return -EINVAL; 3650 return -EINVAL;
3651 } 3651 }
3652 static struct md_sysfs_entry md_suspend_lo = 3652 static struct md_sysfs_entry md_suspend_lo =
3653 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 3653 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
3654 3654
3655 3655
3656 static ssize_t 3656 static ssize_t
3657 suspend_hi_show(mddev_t *mddev, char *page) 3657 suspend_hi_show(mddev_t *mddev, char *page)
3658 { 3658 {
3659 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 3659 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
3660 } 3660 }
3661 3661
3662 static ssize_t 3662 static ssize_t
3663 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len) 3663 suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
3664 { 3664 {
3665 char *e; 3665 char *e;
3666 unsigned long long new = simple_strtoull(buf, &e, 10); 3666 unsigned long long new = simple_strtoull(buf, &e, 10);
3667 3667
3668 if (mddev->pers == NULL || 3668 if (mddev->pers == NULL ||
3669 mddev->pers->quiesce == NULL) 3669 mddev->pers->quiesce == NULL)
3670 return -EINVAL; 3670 return -EINVAL;
3671 if (buf == e || (*e && *e != '\n')) 3671 if (buf == e || (*e && *e != '\n'))
3672 return -EINVAL; 3672 return -EINVAL;
3673 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) || 3673 if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
3674 (new > mddev->suspend_lo && new > mddev->suspend_hi)) { 3674 (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
3675 mddev->suspend_hi = new; 3675 mddev->suspend_hi = new;
3676 mddev->pers->quiesce(mddev, 1); 3676 mddev->pers->quiesce(mddev, 1);
3677 mddev->pers->quiesce(mddev, 0); 3677 mddev->pers->quiesce(mddev, 0);
3678 return len; 3678 return len;
3679 } else 3679 } else
3680 return -EINVAL; 3680 return -EINVAL;
3681 } 3681 }
3682 static struct md_sysfs_entry md_suspend_hi = 3682 static struct md_sysfs_entry md_suspend_hi =
3683 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 3683 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
3684 3684
3685 static ssize_t 3685 static ssize_t
3686 reshape_position_show(mddev_t *mddev, char *page) 3686 reshape_position_show(mddev_t *mddev, char *page)
3687 { 3687 {
3688 if (mddev->reshape_position != MaxSector) 3688 if (mddev->reshape_position != MaxSector)
3689 return sprintf(page, "%llu\n", 3689 return sprintf(page, "%llu\n",
3690 (unsigned long long)mddev->reshape_position); 3690 (unsigned long long)mddev->reshape_position);
3691 strcpy(page, "none\n"); 3691 strcpy(page, "none\n");
3692 return 5; 3692 return 5;
3693 } 3693 }
3694 3694
3695 static ssize_t 3695 static ssize_t
3696 reshape_position_store(mddev_t *mddev, const char *buf, size_t len) 3696 reshape_position_store(mddev_t *mddev, const char *buf, size_t len)
3697 { 3697 {
3698 char *e; 3698 char *e;
3699 unsigned long long new = simple_strtoull(buf, &e, 10); 3699 unsigned long long new = simple_strtoull(buf, &e, 10);
3700 if (mddev->pers) 3700 if (mddev->pers)
3701 return -EBUSY; 3701 return -EBUSY;
3702 if (buf == e || (*e && *e != '\n')) 3702 if (buf == e || (*e && *e != '\n'))
3703 return -EINVAL; 3703 return -EINVAL;
3704 mddev->reshape_position = new; 3704 mddev->reshape_position = new;
3705 mddev->delta_disks = 0; 3705 mddev->delta_disks = 0;
3706 mddev->new_level = mddev->level; 3706 mddev->new_level = mddev->level;
3707 mddev->new_layout = mddev->layout; 3707 mddev->new_layout = mddev->layout;
3708 mddev->new_chunk_sectors = mddev->chunk_sectors; 3708 mddev->new_chunk_sectors = mddev->chunk_sectors;
3709 return len; 3709 return len;
3710 } 3710 }
3711 3711
3712 static struct md_sysfs_entry md_reshape_position = 3712 static struct md_sysfs_entry md_reshape_position =
3713 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 3713 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
3714 reshape_position_store); 3714 reshape_position_store);
3715 3715
3716 static ssize_t 3716 static ssize_t
3717 array_size_show(mddev_t *mddev, char *page) 3717 array_size_show(mddev_t *mddev, char *page)
3718 { 3718 {
3719 if (mddev->external_size) 3719 if (mddev->external_size)
3720 return sprintf(page, "%llu\n", 3720 return sprintf(page, "%llu\n",
3721 (unsigned long long)mddev->array_sectors/2); 3721 (unsigned long long)mddev->array_sectors/2);
3722 else 3722 else
3723 return sprintf(page, "default\n"); 3723 return sprintf(page, "default\n");
3724 } 3724 }
3725 3725
3726 static ssize_t 3726 static ssize_t
3727 array_size_store(mddev_t *mddev, const char *buf, size_t len) 3727 array_size_store(mddev_t *mddev, const char *buf, size_t len)
3728 { 3728 {
3729 sector_t sectors; 3729 sector_t sectors;
3730 3730
3731 if (strncmp(buf, "default", 7) == 0) { 3731 if (strncmp(buf, "default", 7) == 0) {
3732 if (mddev->pers) 3732 if (mddev->pers)
3733 sectors = mddev->pers->size(mddev, 0, 0); 3733 sectors = mddev->pers->size(mddev, 0, 0);
3734 else 3734 else
3735 sectors = mddev->array_sectors; 3735 sectors = mddev->array_sectors;
3736 3736
3737 mddev->external_size = 0; 3737 mddev->external_size = 0;
3738 } else { 3738 } else {
3739 if (strict_blocks_to_sectors(buf, &sectors) < 0) 3739 if (strict_blocks_to_sectors(buf, &sectors) < 0)
3740 return -EINVAL; 3740 return -EINVAL;
3741 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 3741 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
3742 return -E2BIG; 3742 return -E2BIG;
3743 3743
3744 mddev->external_size = 1; 3744 mddev->external_size = 1;
3745 } 3745 }
3746 3746
3747 mddev->array_sectors = sectors; 3747 mddev->array_sectors = sectors;
3748 set_capacity(mddev->gendisk, mddev->array_sectors); 3748 set_capacity(mddev->gendisk, mddev->array_sectors);
3749 if (mddev->pers) 3749 if (mddev->pers)
3750 revalidate_disk(mddev->gendisk); 3750 revalidate_disk(mddev->gendisk);
3751 3751
3752 return len; 3752 return len;
3753 } 3753 }
3754 3754
3755 static struct md_sysfs_entry md_array_size = 3755 static struct md_sysfs_entry md_array_size =
3756 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 3756 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
3757 array_size_store); 3757 array_size_store);
3758 3758
3759 static struct attribute *md_default_attrs[] = { 3759 static struct attribute *md_default_attrs[] = {
3760 &md_level.attr, 3760 &md_level.attr,
3761 &md_layout.attr, 3761 &md_layout.attr,
3762 &md_raid_disks.attr, 3762 &md_raid_disks.attr,
3763 &md_chunk_size.attr, 3763 &md_chunk_size.attr,
3764 &md_size.attr, 3764 &md_size.attr,
3765 &md_resync_start.attr, 3765 &md_resync_start.attr,
3766 &md_metadata.attr, 3766 &md_metadata.attr,
3767 &md_new_device.attr, 3767 &md_new_device.attr,
3768 &md_safe_delay.attr, 3768 &md_safe_delay.attr,
3769 &md_array_state.attr, 3769 &md_array_state.attr,
3770 &md_reshape_position.attr, 3770 &md_reshape_position.attr,
3771 &md_array_size.attr, 3771 &md_array_size.attr,
3772 NULL, 3772 NULL,
3773 }; 3773 };
3774 3774
3775 static struct attribute *md_redundancy_attrs[] = { 3775 static struct attribute *md_redundancy_attrs[] = {
3776 &md_scan_mode.attr, 3776 &md_scan_mode.attr,
3777 &md_mismatches.attr, 3777 &md_mismatches.attr,
3778 &md_sync_min.attr, 3778 &md_sync_min.attr,
3779 &md_sync_max.attr, 3779 &md_sync_max.attr,
3780 &md_sync_speed.attr, 3780 &md_sync_speed.attr,
3781 &md_sync_force_parallel.attr, 3781 &md_sync_force_parallel.attr,
3782 &md_sync_completed.attr, 3782 &md_sync_completed.attr,
3783 &md_min_sync.attr, 3783 &md_min_sync.attr,
3784 &md_max_sync.attr, 3784 &md_max_sync.attr,
3785 &md_suspend_lo.attr, 3785 &md_suspend_lo.attr,
3786 &md_suspend_hi.attr, 3786 &md_suspend_hi.attr,
3787 &md_bitmap.attr, 3787 &md_bitmap.attr,
3788 &md_degraded.attr, 3788 &md_degraded.attr,
3789 NULL, 3789 NULL,
3790 }; 3790 };
3791 static struct attribute_group md_redundancy_group = { 3791 static struct attribute_group md_redundancy_group = {
3792 .name = NULL, 3792 .name = NULL,
3793 .attrs = md_redundancy_attrs, 3793 .attrs = md_redundancy_attrs,
3794 }; 3794 };
3795 3795
3796 3796
3797 static ssize_t 3797 static ssize_t
3798 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3798 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3799 { 3799 {
3800 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3800 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3801 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3801 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3802 ssize_t rv; 3802 ssize_t rv;
3803 3803
3804 if (!entry->show) 3804 if (!entry->show)
3805 return -EIO; 3805 return -EIO;
3806 rv = mddev_lock(mddev); 3806 rv = mddev_lock(mddev);
3807 if (!rv) { 3807 if (!rv) {
3808 rv = entry->show(mddev, page); 3808 rv = entry->show(mddev, page);
3809 mddev_unlock(mddev); 3809 mddev_unlock(mddev);
3810 } 3810 }
3811 return rv; 3811 return rv;
3812 } 3812 }
3813 3813
3814 static ssize_t 3814 static ssize_t
3815 md_attr_store(struct kobject *kobj, struct attribute *attr, 3815 md_attr_store(struct kobject *kobj, struct attribute *attr,
3816 const char *page, size_t length) 3816 const char *page, size_t length)
3817 { 3817 {
3818 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 3818 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
3819 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj); 3819 mddev_t *mddev = container_of(kobj, struct mddev_s, kobj);
3820 ssize_t rv; 3820 ssize_t rv;
3821 3821
3822 if (!entry->store) 3822 if (!entry->store)
3823 return -EIO; 3823 return -EIO;
3824 if (!capable(CAP_SYS_ADMIN)) 3824 if (!capable(CAP_SYS_ADMIN))
3825 return -EACCES; 3825 return -EACCES;
3826 rv = mddev_lock(mddev); 3826 rv = mddev_lock(mddev);
3827 if (mddev->hold_active == UNTIL_IOCTL) 3827 if (mddev->hold_active == UNTIL_IOCTL)
3828 mddev->hold_active = 0; 3828 mddev->hold_active = 0;
3829 if (!rv) { 3829 if (!rv) {
3830 rv = entry->store(mddev, page, length); 3830 rv = entry->store(mddev, page, length);
3831 mddev_unlock(mddev); 3831 mddev_unlock(mddev);
3832 } 3832 }
3833 return rv; 3833 return rv;
3834 } 3834 }
3835 3835
3836 static void md_free(struct kobject *ko) 3836 static void md_free(struct kobject *ko)
3837 { 3837 {
3838 mddev_t *mddev = container_of(ko, mddev_t, kobj); 3838 mddev_t *mddev = container_of(ko, mddev_t, kobj);
3839 3839
3840 if (mddev->sysfs_state) 3840 if (mddev->sysfs_state)
3841 sysfs_put(mddev->sysfs_state); 3841 sysfs_put(mddev->sysfs_state);
3842 3842
3843 if (mddev->gendisk) { 3843 if (mddev->gendisk) {
3844 del_gendisk(mddev->gendisk); 3844 del_gendisk(mddev->gendisk);
3845 put_disk(mddev->gendisk); 3845 put_disk(mddev->gendisk);
3846 } 3846 }
3847 if (mddev->queue) 3847 if (mddev->queue)
3848 blk_cleanup_queue(mddev->queue); 3848 blk_cleanup_queue(mddev->queue);
3849 3849
3850 kfree(mddev); 3850 kfree(mddev);
3851 } 3851 }
3852 3852
3853 static struct sysfs_ops md_sysfs_ops = { 3853 static struct sysfs_ops md_sysfs_ops = {
3854 .show = md_attr_show, 3854 .show = md_attr_show,
3855 .store = md_attr_store, 3855 .store = md_attr_store,
3856 }; 3856 };
3857 static struct kobj_type md_ktype = { 3857 static struct kobj_type md_ktype = {
3858 .release = md_free, 3858 .release = md_free,
3859 .sysfs_ops = &md_sysfs_ops, 3859 .sysfs_ops = &md_sysfs_ops,
3860 .default_attrs = md_default_attrs, 3860 .default_attrs = md_default_attrs,
3861 }; 3861 };
3862 3862
3863 int mdp_major = 0; 3863 int mdp_major = 0;
3864 3864
3865 static void mddev_delayed_delete(struct work_struct *ws) 3865 static void mddev_delayed_delete(struct work_struct *ws)
3866 { 3866 {
3867 mddev_t *mddev = container_of(ws, mddev_t, del_work); 3867 mddev_t *mddev = container_of(ws, mddev_t, del_work);
3868 3868
3869 if (mddev->private == &md_redundancy_group) { 3869 if (mddev->private == &md_redundancy_group) {
3870 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 3870 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
3871 if (mddev->sysfs_action) 3871 if (mddev->sysfs_action)
3872 sysfs_put(mddev->sysfs_action); 3872 sysfs_put(mddev->sysfs_action);
3873 mddev->sysfs_action = NULL; 3873 mddev->sysfs_action = NULL;
3874 mddev->private = NULL; 3874 mddev->private = NULL;
3875 } 3875 }
3876 kobject_del(&mddev->kobj); 3876 kobject_del(&mddev->kobj);
3877 kobject_put(&mddev->kobj); 3877 kobject_put(&mddev->kobj);
3878 } 3878 }
3879 3879
3880 static int md_alloc(dev_t dev, char *name) 3880 static int md_alloc(dev_t dev, char *name)
3881 { 3881 {
3882 static DEFINE_MUTEX(disks_mutex); 3882 static DEFINE_MUTEX(disks_mutex);
3883 mddev_t *mddev = mddev_find(dev); 3883 mddev_t *mddev = mddev_find(dev);
3884 struct gendisk *disk; 3884 struct gendisk *disk;
3885 int partitioned; 3885 int partitioned;
3886 int shift; 3886 int shift;
3887 int unit; 3887 int unit;
3888 int error; 3888 int error;
3889 3889
3890 if (!mddev) 3890 if (!mddev)
3891 return -ENODEV; 3891 return -ENODEV;
3892 3892
3893 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 3893 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
3894 shift = partitioned ? MdpMinorShift : 0; 3894 shift = partitioned ? MdpMinorShift : 0;
3895 unit = MINOR(mddev->unit) >> shift; 3895 unit = MINOR(mddev->unit) >> shift;
3896 3896
3897 /* wait for any previous instance of this device 3897 /* wait for any previous instance of this device
3898 * to be completely removed (mddev_delayed_delete). 3898 * to be completely removed (mddev_delayed_delete).
3899 */ 3899 */
3900 flush_scheduled_work(); 3900 flush_scheduled_work();
3901 3901
3902 mutex_lock(&disks_mutex); 3902 mutex_lock(&disks_mutex);
3903 error = -EEXIST; 3903 error = -EEXIST;
3904 if (mddev->gendisk) 3904 if (mddev->gendisk)
3905 goto abort; 3905 goto abort;
3906 3906
3907 if (name) { 3907 if (name) {
3908 /* Need to ensure that 'name' is not a duplicate. 3908 /* Need to ensure that 'name' is not a duplicate.
3909 */ 3909 */
3910 mddev_t *mddev2; 3910 mddev_t *mddev2;
3911 spin_lock(&all_mddevs_lock); 3911 spin_lock(&all_mddevs_lock);
3912 3912
3913 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 3913 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
3914 if (mddev2->gendisk && 3914 if (mddev2->gendisk &&
3915 strcmp(mddev2->gendisk->disk_name, name) == 0) { 3915 strcmp(mddev2->gendisk->disk_name, name) == 0) {
3916 spin_unlock(&all_mddevs_lock); 3916 spin_unlock(&all_mddevs_lock);
3917 goto abort; 3917 goto abort;
3918 } 3918 }
3919 spin_unlock(&all_mddevs_lock); 3919 spin_unlock(&all_mddevs_lock);
3920 } 3920 }
3921 3921
3922 error = -ENOMEM; 3922 error = -ENOMEM;
3923 mddev->queue = blk_alloc_queue(GFP_KERNEL); 3923 mddev->queue = blk_alloc_queue(GFP_KERNEL);
3924 if (!mddev->queue) 3924 if (!mddev->queue)
3925 goto abort; 3925 goto abort;
3926 mddev->queue->queuedata = mddev; 3926 mddev->queue->queuedata = mddev;
3927 3927
3928 /* Can be unlocked because the queue is new: no concurrency */ 3928 /* Can be unlocked because the queue is new: no concurrency */
3929 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue); 3929 queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
3930 3930
3931 blk_queue_make_request(mddev->queue, md_make_request); 3931 blk_queue_make_request(mddev->queue, md_make_request);
3932 3932
3933 disk = alloc_disk(1 << shift); 3933 disk = alloc_disk(1 << shift);
3934 if (!disk) { 3934 if (!disk) {
3935 blk_cleanup_queue(mddev->queue); 3935 blk_cleanup_queue(mddev->queue);
3936 mddev->queue = NULL; 3936 mddev->queue = NULL;
3937 goto abort; 3937 goto abort;
3938 } 3938 }
3939 disk->major = MAJOR(mddev->unit); 3939 disk->major = MAJOR(mddev->unit);
3940 disk->first_minor = unit << shift; 3940 disk->first_minor = unit << shift;
3941 if (name) 3941 if (name)
3942 strcpy(disk->disk_name, name); 3942 strcpy(disk->disk_name, name);
3943 else if (partitioned) 3943 else if (partitioned)
3944 sprintf(disk->disk_name, "md_d%d", unit); 3944 sprintf(disk->disk_name, "md_d%d", unit);
3945 else 3945 else
3946 sprintf(disk->disk_name, "md%d", unit); 3946 sprintf(disk->disk_name, "md%d", unit);
3947 disk->fops = &md_fops; 3947 disk->fops = &md_fops;
3948 disk->private_data = mddev; 3948 disk->private_data = mddev;
3949 disk->queue = mddev->queue; 3949 disk->queue = mddev->queue;
3950 /* Allow extended partitions. This makes the 3950 /* Allow extended partitions. This makes the
3951 * 'mdp' device redundant, but we can't really 3951 * 'mdp' device redundant, but we can't really
3952 * remove it now. 3952 * remove it now.
3953 */ 3953 */
3954 disk->flags |= GENHD_FL_EXT_DEVT; 3954 disk->flags |= GENHD_FL_EXT_DEVT;
3955 add_disk(disk); 3955 add_disk(disk);
3956 mddev->gendisk = disk; 3956 mddev->gendisk = disk;
3957 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 3957 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
3958 &disk_to_dev(disk)->kobj, "%s", "md"); 3958 &disk_to_dev(disk)->kobj, "%s", "md");
3959 if (error) { 3959 if (error) {
3960 /* This isn't possible, but as kobject_init_and_add is marked 3960 /* This isn't possible, but as kobject_init_and_add is marked
3961 * __must_check, we must do something with the result 3961 * __must_check, we must do something with the result
3962 */ 3962 */
3963 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 3963 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
3964 disk->disk_name); 3964 disk->disk_name);
3965 error = 0; 3965 error = 0;
3966 } 3966 }
3967 abort: 3967 abort:
3968 mutex_unlock(&disks_mutex); 3968 mutex_unlock(&disks_mutex);
3969 if (!error) { 3969 if (!error) {
3970 kobject_uevent(&mddev->kobj, KOBJ_ADD); 3970 kobject_uevent(&mddev->kobj, KOBJ_ADD);
3971 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state"); 3971 mddev->sysfs_state = sysfs_get_dirent(mddev->kobj.sd, "array_state");
3972 } 3972 }
3973 mddev_put(mddev); 3973 mddev_put(mddev);
3974 return error; 3974 return error;
3975 } 3975 }
3976 3976
3977 static struct kobject *md_probe(dev_t dev, int *part, void *data) 3977 static struct kobject *md_probe(dev_t dev, int *part, void *data)
3978 { 3978 {
3979 md_alloc(dev, NULL); 3979 md_alloc(dev, NULL);
3980 return NULL; 3980 return NULL;
3981 } 3981 }
3982 3982
3983 static int add_named_array(const char *val, struct kernel_param *kp) 3983 static int add_named_array(const char *val, struct kernel_param *kp)
3984 { 3984 {
3985 /* val must be "md_*" where * is not all digits. 3985 /* val must be "md_*" where * is not all digits.
3986 * We allocate an array with a large free minor number, and 3986 * We allocate an array with a large free minor number, and
3987 * set the name to val. val must not already be an active name. 3987 * set the name to val. val must not already be an active name.
3988 */ 3988 */
3989 int len = strlen(val); 3989 int len = strlen(val);
3990 char buf[DISK_NAME_LEN]; 3990 char buf[DISK_NAME_LEN];
3991 3991
3992 while (len && val[len-1] == '\n') 3992 while (len && val[len-1] == '\n')
3993 len--; 3993 len--;
3994 if (len >= DISK_NAME_LEN) 3994 if (len >= DISK_NAME_LEN)
3995 return -E2BIG; 3995 return -E2BIG;
3996 strlcpy(buf, val, len+1); 3996 strlcpy(buf, val, len+1);
3997 if (strncmp(buf, "md_", 3) != 0) 3997 if (strncmp(buf, "md_", 3) != 0)
3998 return -EINVAL; 3998 return -EINVAL;
3999 return md_alloc(0, buf); 3999 return md_alloc(0, buf);
4000 } 4000 }
4001 4001
4002 static void md_safemode_timeout(unsigned long data) 4002 static void md_safemode_timeout(unsigned long data)
4003 { 4003 {
4004 mddev_t *mddev = (mddev_t *) data; 4004 mddev_t *mddev = (mddev_t *) data;
4005 4005
4006 if (!atomic_read(&mddev->writes_pending)) { 4006 if (!atomic_read(&mddev->writes_pending)) {
4007 mddev->safemode = 1; 4007 mddev->safemode = 1;
4008 if (mddev->external) 4008 if (mddev->external)
4009 sysfs_notify_dirent(mddev->sysfs_state); 4009 sysfs_notify_dirent(mddev->sysfs_state);
4010 } 4010 }
4011 md_wakeup_thread(mddev->thread); 4011 md_wakeup_thread(mddev->thread);
4012 } 4012 }
4013 4013
4014 static int start_dirty_degraded; 4014 static int start_dirty_degraded;
4015 4015
4016 static int do_md_run(mddev_t * mddev) 4016 static int do_md_run(mddev_t * mddev)
4017 { 4017 {
4018 int err; 4018 int err;
4019 mdk_rdev_t *rdev; 4019 mdk_rdev_t *rdev;
4020 struct gendisk *disk; 4020 struct gendisk *disk;
4021 struct mdk_personality *pers; 4021 struct mdk_personality *pers;
4022 4022
4023 if (list_empty(&mddev->disks)) 4023 if (list_empty(&mddev->disks))
4024 /* cannot run an array with no devices.. */ 4024 /* cannot run an array with no devices.. */
4025 return -EINVAL; 4025 return -EINVAL;
4026 4026
4027 if (mddev->pers) 4027 if (mddev->pers)
4028 return -EBUSY; 4028 return -EBUSY;
4029 4029
4030 /* 4030 /*
4031 * Analyze all RAID superblock(s) 4031 * Analyze all RAID superblock(s)
4032 */ 4032 */
4033 if (!mddev->raid_disks) { 4033 if (!mddev->raid_disks) {
4034 if (!mddev->persistent) 4034 if (!mddev->persistent)
4035 return -EINVAL; 4035 return -EINVAL;
4036 analyze_sbs(mddev); 4036 analyze_sbs(mddev);
4037 } 4037 }
4038 4038
4039 if (mddev->level != LEVEL_NONE) 4039 if (mddev->level != LEVEL_NONE)
4040 request_module("md-level-%d", mddev->level); 4040 request_module("md-level-%d", mddev->level);
4041 else if (mddev->clevel[0]) 4041 else if (mddev->clevel[0])
4042 request_module("md-%s", mddev->clevel); 4042 request_module("md-%s", mddev->clevel);
4043 4043
4044 /* 4044 /*
4045 * Drop all container device buffers, from now on 4045 * Drop all container device buffers, from now on
4046 * the only valid external interface is through the md 4046 * the only valid external interface is through the md
4047 * device. 4047 * device.
4048 */ 4048 */
4049 list_for_each_entry(rdev, &mddev->disks, same_set) { 4049 list_for_each_entry(rdev, &mddev->disks, same_set) {
4050 if (test_bit(Faulty, &rdev->flags)) 4050 if (test_bit(Faulty, &rdev->flags))
4051 continue; 4051 continue;
4052 sync_blockdev(rdev->bdev); 4052 sync_blockdev(rdev->bdev);
4053 invalidate_bdev(rdev->bdev); 4053 invalidate_bdev(rdev->bdev);
4054 4054
4055 /* perform some consistency tests on the device. 4055 /* perform some consistency tests on the device.
4056 * We don't want the data to overlap the metadata; 4056 * We don't want the data to overlap the metadata;
4057 * internal bitmap issues have been handled elsewhere. 4057 * internal bitmap issues have been handled elsewhere.
4058 */ 4058 */
4059 if (rdev->data_offset < rdev->sb_start) { 4059 if (rdev->data_offset < rdev->sb_start) {
4060 if (mddev->dev_sectors && 4060 if (mddev->dev_sectors &&
4061 rdev->data_offset + mddev->dev_sectors 4061 rdev->data_offset + mddev->dev_sectors
4062 > rdev->sb_start) { 4062 > rdev->sb_start) {
4063 printk("md: %s: data overlaps metadata\n", 4063 printk("md: %s: data overlaps metadata\n",
4064 mdname(mddev)); 4064 mdname(mddev));
4065 return -EINVAL; 4065 return -EINVAL;
4066 } 4066 }
4067 } else { 4067 } else {
4068 if (rdev->sb_start + rdev->sb_size/512 4068 if (rdev->sb_start + rdev->sb_size/512
4069 > rdev->data_offset) { 4069 > rdev->data_offset) {
4070 printk("md: %s: metadata overlaps data\n", 4070 printk("md: %s: metadata overlaps data\n",
4071 mdname(mddev)); 4071 mdname(mddev));
4072 return -EINVAL; 4072 return -EINVAL;
4073 } 4073 }
4074 } 4074 }
4075 sysfs_notify_dirent(rdev->sysfs_state); 4075 sysfs_notify_dirent(rdev->sysfs_state);
4076 } 4076 }
4077 4077
4078 md_probe(mddev->unit, NULL, NULL); 4078 md_probe(mddev->unit, NULL, NULL);
4079 disk = mddev->gendisk; 4079 disk = mddev->gendisk;
4080 if (!disk) 4080 if (!disk)
4081 return -ENOMEM; 4081 return -ENOMEM;
4082 4082
4083 spin_lock(&pers_lock); 4083 spin_lock(&pers_lock);
4084 pers = find_pers(mddev->level, mddev->clevel); 4084 pers = find_pers(mddev->level, mddev->clevel);
4085 if (!pers || !try_module_get(pers->owner)) { 4085 if (!pers || !try_module_get(pers->owner)) {
4086 spin_unlock(&pers_lock); 4086 spin_unlock(&pers_lock);
4087 if (mddev->level != LEVEL_NONE) 4087 if (mddev->level != LEVEL_NONE)
4088 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 4088 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
4089 mddev->level); 4089 mddev->level);
4090 else 4090 else
4091 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 4091 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
4092 mddev->clevel); 4092 mddev->clevel);
4093 return -EINVAL; 4093 return -EINVAL;
4094 } 4094 }
4095 mddev->pers = pers; 4095 mddev->pers = pers;
4096 spin_unlock(&pers_lock); 4096 spin_unlock(&pers_lock);
4097 if (mddev->level != pers->level) { 4097 if (mddev->level != pers->level) {
4098 mddev->level = pers->level; 4098 mddev->level = pers->level;
4099 mddev->new_level = pers->level; 4099 mddev->new_level = pers->level;
4100 } 4100 }
4101 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 4101 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
4102 4102
4103 if (mddev->reshape_position != MaxSector && 4103 if (mddev->reshape_position != MaxSector &&
4104 pers->start_reshape == NULL) { 4104 pers->start_reshape == NULL) {
4105 /* This personality cannot handle reshaping... */ 4105 /* This personality cannot handle reshaping... */
4106 mddev->pers = NULL; 4106 mddev->pers = NULL;
4107 module_put(pers->owner); 4107 module_put(pers->owner);
4108 return -EINVAL; 4108 return -EINVAL;
4109 } 4109 }
4110 4110
4111 if (pers->sync_request) { 4111 if (pers->sync_request) {
4112 /* Warn if this is a potentially silly 4112 /* Warn if this is a potentially silly
4113 * configuration. 4113 * configuration.
4114 */ 4114 */
4115 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4115 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4116 mdk_rdev_t *rdev2; 4116 mdk_rdev_t *rdev2;
4117 int warned = 0; 4117 int warned = 0;
4118 4118
4119 list_for_each_entry(rdev, &mddev->disks, same_set) 4119 list_for_each_entry(rdev, &mddev->disks, same_set)
4120 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4120 list_for_each_entry(rdev2, &mddev->disks, same_set) {
4121 if (rdev < rdev2 && 4121 if (rdev < rdev2 &&
4122 rdev->bdev->bd_contains == 4122 rdev->bdev->bd_contains ==
4123 rdev2->bdev->bd_contains) { 4123 rdev2->bdev->bd_contains) {
4124 printk(KERN_WARNING 4124 printk(KERN_WARNING
4125 "%s: WARNING: %s appears to be" 4125 "%s: WARNING: %s appears to be"
4126 " on the same physical disk as" 4126 " on the same physical disk as"
4127 " %s.\n", 4127 " %s.\n",
4128 mdname(mddev), 4128 mdname(mddev),
4129 bdevname(rdev->bdev,b), 4129 bdevname(rdev->bdev,b),
4130 bdevname(rdev2->bdev,b2)); 4130 bdevname(rdev2->bdev,b2));
4131 warned = 1; 4131 warned = 1;
4132 } 4132 }
4133 } 4133 }
4134 4134
4135 if (warned) 4135 if (warned)
4136 printk(KERN_WARNING 4136 printk(KERN_WARNING
4137 "True protection against single-disk" 4137 "True protection against single-disk"
4138 " failure might be compromised.\n"); 4138 " failure might be compromised.\n");
4139 } 4139 }
4140 4140
4141 mddev->recovery = 0; 4141 mddev->recovery = 0;
4142 /* may be overridden by personality */ 4142 /* may be overridden by personality */
4143 mddev->resync_max_sectors = mddev->dev_sectors; 4143 mddev->resync_max_sectors = mddev->dev_sectors;
4144 4144
4145 mddev->barriers_work = 1; 4145 mddev->barriers_work = 1;
4146 mddev->ok_start_degraded = start_dirty_degraded; 4146 mddev->ok_start_degraded = start_dirty_degraded;
4147 4147
4148 if (start_readonly) 4148 if (start_readonly)
4149 mddev->ro = 2; /* read-only, but switch on first write */ 4149 mddev->ro = 2; /* read-only, but switch on first write */
4150 4150
4151 err = mddev->pers->run(mddev); 4151 err = mddev->pers->run(mddev);
4152 if (err) 4152 if (err)
4153 printk(KERN_ERR "md: pers->run() failed ...\n"); 4153 printk(KERN_ERR "md: pers->run() failed ...\n");
4154 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 4154 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
4155 WARN_ONCE(!mddev->external_size, "%s: default size too small," 4155 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
4156 " but 'external_size' not in effect?\n", __func__); 4156 " but 'external_size' not in effect?\n", __func__);
4157 printk(KERN_ERR 4157 printk(KERN_ERR
4158 "md: invalid array_size %llu > default size %llu\n", 4158 "md: invalid array_size %llu > default size %llu\n",
4159 (unsigned long long)mddev->array_sectors / 2, 4159 (unsigned long long)mddev->array_sectors / 2,
4160 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 4160 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
4161 err = -EINVAL; 4161 err = -EINVAL;
4162 mddev->pers->stop(mddev); 4162 mddev->pers->stop(mddev);
4163 } 4163 }
4164 if (err == 0 && mddev->pers->sync_request) { 4164 if (err == 0 && mddev->pers->sync_request) {
4165 err = bitmap_create(mddev); 4165 err = bitmap_create(mddev);
4166 if (err) { 4166 if (err) {
4167 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 4167 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
4168 mdname(mddev), err); 4168 mdname(mddev), err);
4169 mddev->pers->stop(mddev); 4169 mddev->pers->stop(mddev);
4170 } 4170 }
4171 } 4171 }
4172 if (err) { 4172 if (err) {
4173 module_put(mddev->pers->owner); 4173 module_put(mddev->pers->owner);
4174 mddev->pers = NULL; 4174 mddev->pers = NULL;
4175 bitmap_destroy(mddev); 4175 bitmap_destroy(mddev);
4176 return err; 4176 return err;
4177 } 4177 }
4178 if (mddev->pers->sync_request) { 4178 if (mddev->pers->sync_request) {
4179 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 4179 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
4180 printk(KERN_WARNING 4180 printk(KERN_WARNING
4181 "md: cannot register extra attributes for %s\n", 4181 "md: cannot register extra attributes for %s\n",
4182 mdname(mddev)); 4182 mdname(mddev));
4183 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 4183 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
4184 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 4184 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
4185 mddev->ro = 0; 4185 mddev->ro = 0;
4186 4186
4187 atomic_set(&mddev->writes_pending,0); 4187 atomic_set(&mddev->writes_pending,0);
4188 mddev->safemode = 0; 4188 mddev->safemode = 0;
4189 mddev->safemode_timer.function = md_safemode_timeout; 4189 mddev->safemode_timer.function = md_safemode_timeout;
4190 mddev->safemode_timer.data = (unsigned long) mddev; 4190 mddev->safemode_timer.data = (unsigned long) mddev;
4191 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 4191 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
4192 mddev->in_sync = 1; 4192 mddev->in_sync = 1;
4193 4193
4194 list_for_each_entry(rdev, &mddev->disks, same_set) 4194 list_for_each_entry(rdev, &mddev->disks, same_set)
4195 if (rdev->raid_disk >= 0) { 4195 if (rdev->raid_disk >= 0) {
4196 char nm[20]; 4196 char nm[20];
4197 sprintf(nm, "rd%d", rdev->raid_disk); 4197 sprintf(nm, "rd%d", rdev->raid_disk);
4198 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 4198 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4199 printk("md: cannot register %s for %s\n", 4199 printk("md: cannot register %s for %s\n",
4200 nm, mdname(mddev)); 4200 nm, mdname(mddev));
4201 } 4201 }
4202 4202
4203 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4203 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4204 4204
4205 if (mddev->flags) 4205 if (mddev->flags)
4206 md_update_sb(mddev, 0); 4206 md_update_sb(mddev, 0);
4207 4207
4208 set_capacity(disk, mddev->array_sectors); 4208 set_capacity(disk, mddev->array_sectors);
4209 4209
4210 /* If there is a partially-recovered drive we need to 4210 /* If there is a partially-recovered drive we need to
4211 * start recovery here. If we leave it to md_check_recovery, 4211 * start recovery here. If we leave it to md_check_recovery,
4212 * it will remove the drives and not do the right thing 4212 * it will remove the drives and not do the right thing
4213 */ 4213 */
4214 if (mddev->degraded && !mddev->sync_thread) { 4214 if (mddev->degraded && !mddev->sync_thread) {
4215 int spares = 0; 4215 int spares = 0;
4216 list_for_each_entry(rdev, &mddev->disks, same_set) 4216 list_for_each_entry(rdev, &mddev->disks, same_set)
4217 if (rdev->raid_disk >= 0 && 4217 if (rdev->raid_disk >= 0 &&
4218 !test_bit(In_sync, &rdev->flags) && 4218 !test_bit(In_sync, &rdev->flags) &&
4219 !test_bit(Faulty, &rdev->flags)) 4219 !test_bit(Faulty, &rdev->flags))
4220 /* complete an interrupted recovery */ 4220 /* complete an interrupted recovery */
4221 spares++; 4221 spares++;
4222 if (spares && mddev->pers->sync_request) { 4222 if (spares && mddev->pers->sync_request) {
4223 mddev->recovery = 0; 4223 mddev->recovery = 0;
4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4224 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4225 mddev->sync_thread = md_register_thread(md_do_sync, 4225 mddev->sync_thread = md_register_thread(md_do_sync,
4226 mddev, 4226 mddev,
4227 "resync"); 4227 "resync");
4228 if (!mddev->sync_thread) { 4228 if (!mddev->sync_thread) {
4229 printk(KERN_ERR "%s: could not start resync" 4229 printk(KERN_ERR "%s: could not start resync"
4230 " thread...\n", 4230 " thread...\n",
4231 mdname(mddev)); 4231 mdname(mddev));
4232 /* leave the spares where they are, it shouldn't hurt */ 4232 /* leave the spares where they are, it shouldn't hurt */
4233 mddev->recovery = 0; 4233 mddev->recovery = 0;
4234 } 4234 }
4235 } 4235 }
4236 } 4236 }
4237 md_wakeup_thread(mddev->thread); 4237 md_wakeup_thread(mddev->thread);
4238 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 4238 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
4239 4239
4240 revalidate_disk(mddev->gendisk); 4240 revalidate_disk(mddev->gendisk);
4241 mddev->changed = 1; 4241 mddev->changed = 1;
4242 md_new_event(mddev); 4242 md_new_event(mddev);
4243 sysfs_notify_dirent(mddev->sysfs_state); 4243 sysfs_notify_dirent(mddev->sysfs_state);
4244 if (mddev->sysfs_action) 4244 if (mddev->sysfs_action)
4245 sysfs_notify_dirent(mddev->sysfs_action); 4245 sysfs_notify_dirent(mddev->sysfs_action);
4246 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4246 sysfs_notify(&mddev->kobj, NULL, "degraded");
4247 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4247 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4248 return 0; 4248 return 0;
4249 } 4249 }
4250 4250
4251 static int restart_array(mddev_t *mddev) 4251 static int restart_array(mddev_t *mddev)
4252 { 4252 {
4253 struct gendisk *disk = mddev->gendisk; 4253 struct gendisk *disk = mddev->gendisk;
4254 4254
4255 /* Complain if it has no devices */ 4255 /* Complain if it has no devices */
4256 if (list_empty(&mddev->disks)) 4256 if (list_empty(&mddev->disks))
4257 return -ENXIO; 4257 return -ENXIO;
4258 if (!mddev->pers) 4258 if (!mddev->pers)
4259 return -EINVAL; 4259 return -EINVAL;
4260 if (!mddev->ro) 4260 if (!mddev->ro)
4261 return -EBUSY; 4261 return -EBUSY;
4262 mddev->safemode = 0; 4262 mddev->safemode = 0;
4263 mddev->ro = 0; 4263 mddev->ro = 0;
4264 set_disk_ro(disk, 0); 4264 set_disk_ro(disk, 0);
4265 printk(KERN_INFO "md: %s switched to read-write mode.\n", 4265 printk(KERN_INFO "md: %s switched to read-write mode.\n",
4266 mdname(mddev)); 4266 mdname(mddev));
4267 /* Kick recovery or resync if necessary */ 4267 /* Kick recovery or resync if necessary */
4268 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4268 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4269 md_wakeup_thread(mddev->thread); 4269 md_wakeup_thread(mddev->thread);
4270 md_wakeup_thread(mddev->sync_thread); 4270 md_wakeup_thread(mddev->sync_thread);
4271 sysfs_notify_dirent(mddev->sysfs_state); 4271 sysfs_notify_dirent(mddev->sysfs_state);
4272 return 0; 4272 return 0;
4273 } 4273 }
4274 4274
4275 /* similar to deny_write_access, but accounts for our holding a reference 4275 /* similar to deny_write_access, but accounts for our holding a reference
4276 * to the file ourselves */ 4276 * to the file ourselves */
4277 static int deny_bitmap_write_access(struct file * file) 4277 static int deny_bitmap_write_access(struct file * file)
4278 { 4278 {
4279 struct inode *inode = file->f_mapping->host; 4279 struct inode *inode = file->f_mapping->host;
4280 4280
4281 spin_lock(&inode->i_lock); 4281 spin_lock(&inode->i_lock);
4282 if (atomic_read(&inode->i_writecount) > 1) { 4282 if (atomic_read(&inode->i_writecount) > 1) {
4283 spin_unlock(&inode->i_lock); 4283 spin_unlock(&inode->i_lock);
4284 return -ETXTBSY; 4284 return -ETXTBSY;
4285 } 4285 }
4286 atomic_set(&inode->i_writecount, -1); 4286 atomic_set(&inode->i_writecount, -1);
4287 spin_unlock(&inode->i_lock); 4287 spin_unlock(&inode->i_lock);
4288 4288
4289 return 0; 4289 return 0;
4290 } 4290 }
4291 4291
4292 static void restore_bitmap_write_access(struct file *file) 4292 static void restore_bitmap_write_access(struct file *file)
4293 { 4293 {
4294 struct inode *inode = file->f_mapping->host; 4294 struct inode *inode = file->f_mapping->host;
4295 4295
4296 spin_lock(&inode->i_lock); 4296 spin_lock(&inode->i_lock);
4297 atomic_set(&inode->i_writecount, 1); 4297 atomic_set(&inode->i_writecount, 1);
4298 spin_unlock(&inode->i_lock); 4298 spin_unlock(&inode->i_lock);
4299 } 4299 }
4300 4300
4301 /* mode: 4301 /* mode:
4302 * 0 - completely stop and dis-assemble array 4302 * 0 - completely stop and dis-assemble array
4303 * 1 - switch to readonly 4303 * 1 - switch to readonly
4304 * 2 - stop but do not disassemble array 4304 * 2 - stop but do not disassemble array
4305 */ 4305 */
4306 static int do_md_stop(mddev_t * mddev, int mode, int is_open) 4306 static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4307 { 4307 {
4308 int err = 0; 4308 int err = 0;
4309 struct gendisk *disk = mddev->gendisk; 4309 struct gendisk *disk = mddev->gendisk;
4310 mdk_rdev_t *rdev; 4310 mdk_rdev_t *rdev;
4311 4311
4312 mutex_lock(&mddev->open_mutex); 4312 mutex_lock(&mddev->open_mutex);
4313 if (atomic_read(&mddev->openers) > is_open) { 4313 if (atomic_read(&mddev->openers) > is_open) {
4314 printk("md: %s still in use.\n",mdname(mddev)); 4314 printk("md: %s still in use.\n",mdname(mddev));
4315 err = -EBUSY; 4315 err = -EBUSY;
4316 } else if (mddev->pers) { 4316 } else if (mddev->pers) {
4317 4317
4318 if (mddev->sync_thread) { 4318 if (mddev->sync_thread) {
4319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4319 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4320 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4320 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4321 md_unregister_thread(mddev->sync_thread); 4321 md_unregister_thread(mddev->sync_thread);
4322 mddev->sync_thread = NULL; 4322 mddev->sync_thread = NULL;
4323 } 4323 }
4324 4324
4325 del_timer_sync(&mddev->safemode_timer); 4325 del_timer_sync(&mddev->safemode_timer);
4326 4326
4327 switch(mode) { 4327 switch(mode) {
4328 case 1: /* readonly */ 4328 case 1: /* readonly */
4329 err = -ENXIO; 4329 err = -ENXIO;
4330 if (mddev->ro==1) 4330 if (mddev->ro==1)
4331 goto out; 4331 goto out;
4332 mddev->ro = 1; 4332 mddev->ro = 1;
4333 break; 4333 break;
4334 case 0: /* disassemble */ 4334 case 0: /* disassemble */
4335 case 2: /* stop */ 4335 case 2: /* stop */
4336 bitmap_flush(mddev); 4336 bitmap_flush(mddev);
4337 md_super_wait(mddev); 4337 md_super_wait(mddev);
4338 if (mddev->ro) 4338 if (mddev->ro)
4339 set_disk_ro(disk, 0); 4339 set_disk_ro(disk, 0);
4340 4340
4341 mddev->pers->stop(mddev); 4341 mddev->pers->stop(mddev);
4342 mddev->queue->merge_bvec_fn = NULL; 4342 mddev->queue->merge_bvec_fn = NULL;
4343 mddev->queue->unplug_fn = NULL; 4343 mddev->queue->unplug_fn = NULL;
4344 mddev->queue->backing_dev_info.congested_fn = NULL; 4344 mddev->queue->backing_dev_info.congested_fn = NULL;
4345 module_put(mddev->pers->owner); 4345 module_put(mddev->pers->owner);
4346 if (mddev->pers->sync_request) 4346 if (mddev->pers->sync_request)
4347 mddev->private = &md_redundancy_group; 4347 mddev->private = &md_redundancy_group;
4348 mddev->pers = NULL; 4348 mddev->pers = NULL;
4349 /* tell userspace to handle 'inactive' */ 4349 /* tell userspace to handle 'inactive' */
4350 sysfs_notify_dirent(mddev->sysfs_state); 4350 sysfs_notify_dirent(mddev->sysfs_state);
4351 4351
4352 list_for_each_entry(rdev, &mddev->disks, same_set) 4352 list_for_each_entry(rdev, &mddev->disks, same_set)
4353 if (rdev->raid_disk >= 0) { 4353 if (rdev->raid_disk >= 0) {
4354 char nm[20]; 4354 char nm[20];
4355 sprintf(nm, "rd%d", rdev->raid_disk); 4355 sprintf(nm, "rd%d", rdev->raid_disk);
4356 sysfs_remove_link(&mddev->kobj, nm); 4356 sysfs_remove_link(&mddev->kobj, nm);
4357 } 4357 }
4358 4358
4359 set_capacity(disk, 0); 4359 set_capacity(disk, 0);
4360 mddev->changed = 1; 4360 mddev->changed = 1;
4361 4361
4362 if (mddev->ro) 4362 if (mddev->ro)
4363 mddev->ro = 0; 4363 mddev->ro = 0;
4364 } 4364 }
4365 if (!mddev->in_sync || mddev->flags) { 4365 if (!mddev->in_sync || mddev->flags) {
4366 /* mark array as shut down cleanly */ 4366 /* mark array as shut down cleanly */
4367 mddev->in_sync = 1; 4367 mddev->in_sync = 1;
4368 md_update_sb(mddev, 1); 4368 md_update_sb(mddev, 1);
4369 } 4369 }
4370 if (mode == 1) 4370 if (mode == 1)
4371 set_disk_ro(disk, 1); 4371 set_disk_ro(disk, 1);
4372 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4372 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4373 err = 0; 4373 err = 0;
4374 } 4374 }
4375 out: 4375 out:
4376 mutex_unlock(&mddev->open_mutex); 4376 mutex_unlock(&mddev->open_mutex);
4377 if (err) 4377 if (err)
4378 return err; 4378 return err;
4379 /* 4379 /*
4380 * Free resources if final stop 4380 * Free resources if final stop
4381 */ 4381 */
4382 if (mode == 0) { 4382 if (mode == 0) {
4383 4383
4384 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 4384 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
4385 4385
4386 bitmap_destroy(mddev); 4386 bitmap_destroy(mddev);
4387 if (mddev->bitmap_file) { 4387 if (mddev->bitmap_file) {
4388 restore_bitmap_write_access(mddev->bitmap_file); 4388 restore_bitmap_write_access(mddev->bitmap_file);
4389 fput(mddev->bitmap_file); 4389 fput(mddev->bitmap_file);
4390 mddev->bitmap_file = NULL; 4390 mddev->bitmap_file = NULL;
4391 } 4391 }
4392 mddev->bitmap_offset = 0; 4392 mddev->bitmap_offset = 0;
4393 4393
4394 /* make sure all mddev_delayed_delete calls have finished */ 4394 /* make sure all mddev_delayed_delete calls have finished */
4395 flush_scheduled_work(); 4395 flush_scheduled_work();
4396 4396
4397 export_array(mddev); 4397 export_array(mddev);
4398 4398
4399 mddev->array_sectors = 0; 4399 mddev->array_sectors = 0;
4400 mddev->external_size = 0; 4400 mddev->external_size = 0;
4401 mddev->dev_sectors = 0; 4401 mddev->dev_sectors = 0;
4402 mddev->raid_disks = 0; 4402 mddev->raid_disks = 0;
4403 mddev->recovery_cp = 0; 4403 mddev->recovery_cp = 0;
4404 mddev->resync_min = 0; 4404 mddev->resync_min = 0;
4405 mddev->resync_max = MaxSector; 4405 mddev->resync_max = MaxSector;
4406 mddev->reshape_position = MaxSector; 4406 mddev->reshape_position = MaxSector;
4407 mddev->external = 0; 4407 mddev->external = 0;
4408 mddev->persistent = 0; 4408 mddev->persistent = 0;
4409 mddev->level = LEVEL_NONE; 4409 mddev->level = LEVEL_NONE;
4410 mddev->clevel[0] = 0; 4410 mddev->clevel[0] = 0;
4411 mddev->flags = 0; 4411 mddev->flags = 0;
4412 mddev->ro = 0; 4412 mddev->ro = 0;
4413 mddev->metadata_type[0] = 0; 4413 mddev->metadata_type[0] = 0;
4414 mddev->chunk_sectors = 0; 4414 mddev->chunk_sectors = 0;
4415 mddev->ctime = mddev->utime = 0; 4415 mddev->ctime = mddev->utime = 0;
4416 mddev->layout = 0; 4416 mddev->layout = 0;
4417 mddev->max_disks = 0; 4417 mddev->max_disks = 0;
4418 mddev->events = 0; 4418 mddev->events = 0;
4419 mddev->delta_disks = 0; 4419 mddev->delta_disks = 0;
4420 mddev->new_level = LEVEL_NONE; 4420 mddev->new_level = LEVEL_NONE;
4421 mddev->new_layout = 0; 4421 mddev->new_layout = 0;
4422 mddev->new_chunk_sectors = 0; 4422 mddev->new_chunk_sectors = 0;
4423 mddev->curr_resync = 0; 4423 mddev->curr_resync = 0;
4424 mddev->resync_mismatches = 0; 4424 mddev->resync_mismatches = 0;
4425 mddev->suspend_lo = mddev->suspend_hi = 0; 4425 mddev->suspend_lo = mddev->suspend_hi = 0;
4426 mddev->sync_speed_min = mddev->sync_speed_max = 0; 4426 mddev->sync_speed_min = mddev->sync_speed_max = 0;
4427 mddev->recovery = 0; 4427 mddev->recovery = 0;
4428 mddev->in_sync = 0; 4428 mddev->in_sync = 0;
4429 mddev->changed = 0; 4429 mddev->changed = 0;
4430 mddev->degraded = 0; 4430 mddev->degraded = 0;
4431 mddev->barriers_work = 0; 4431 mddev->barriers_work = 0;
4432 mddev->safemode = 0; 4432 mddev->safemode = 0;
4433 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 4433 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
4434 if (mddev->hold_active == UNTIL_STOP) 4434 if (mddev->hold_active == UNTIL_STOP)
4435 mddev->hold_active = 0; 4435 mddev->hold_active = 0;
4436 4436
4437 } else if (mddev->pers) 4437 } else if (mddev->pers)
4438 printk(KERN_INFO "md: %s switched to read-only mode.\n", 4438 printk(KERN_INFO "md: %s switched to read-only mode.\n",
4439 mdname(mddev)); 4439 mdname(mddev));
4440 err = 0; 4440 err = 0;
4441 blk_integrity_unregister(disk); 4441 blk_integrity_unregister(disk);
4442 md_new_event(mddev); 4442 md_new_event(mddev);
4443 sysfs_notify_dirent(mddev->sysfs_state); 4443 sysfs_notify_dirent(mddev->sysfs_state);
4444 return err; 4444 return err;
4445 } 4445 }
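
For orientation, a minimal userspace sketch (hypothetical, not part of this file) of how an administrator's tool might request the two stop flavours handled above; the STOP_ARRAY and STOP_ARRAY_RO ioctls from md_u.h are routed by md_ioctl() into do_md_stop(), and the device path and error handling below are illustrative only:

/* Hypothetical example: stop /dev/md0, or switch it to read-only. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>	/* MD_MAJOR, used by the ioctl macros */
#include <linux/raid/md_u.h>	/* STOP_ARRAY, STOP_ARRAY_RO */

int main(int argc, char **argv)
{
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0)
		return 1;
	/* STOP_ARRAY fully stops and disassembles the array;
	 * STOP_ARRAY_RO only switches it to read-only.
	 */
	if (ioctl(fd, (argc > 1) ? STOP_ARRAY_RO : STOP_ARRAY, NULL) < 0)
		perror("ioctl");
	close(fd);
	return 0;
}
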
4446 4446
4447 #ifndef MODULE 4447 #ifndef MODULE
4448 static void autorun_array(mddev_t *mddev) 4448 static void autorun_array(mddev_t *mddev)
4449 { 4449 {
4450 mdk_rdev_t *rdev; 4450 mdk_rdev_t *rdev;
4451 int err; 4451 int err;
4452 4452
4453 if (list_empty(&mddev->disks)) 4453 if (list_empty(&mddev->disks))
4454 return; 4454 return;
4455 4455
4456 printk(KERN_INFO "md: running: "); 4456 printk(KERN_INFO "md: running: ");
4457 4457
4458 list_for_each_entry(rdev, &mddev->disks, same_set) { 4458 list_for_each_entry(rdev, &mddev->disks, same_set) {
4459 char b[BDEVNAME_SIZE]; 4459 char b[BDEVNAME_SIZE];
4460 printk("<%s>", bdevname(rdev->bdev,b)); 4460 printk("<%s>", bdevname(rdev->bdev,b));
4461 } 4461 }
4462 printk("\n"); 4462 printk("\n");
4463 4463
4464 err = do_md_run(mddev); 4464 err = do_md_run(mddev);
4465 if (err) { 4465 if (err) {
4466 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 4466 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
4467 do_md_stop(mddev, 0, 0); 4467 do_md_stop(mddev, 0, 0);
4468 } 4468 }
4469 } 4469 }
4470 4470
4471 /* 4471 /*
4472 * let's try to run arrays based on all disks that have arrived 4472 * let's try to run arrays based on all disks that have arrived
4473 * until now. (those are in pending_raid_disks) 4473 * until now. (those are in pending_raid_disks)
4474 * 4474 *
4475 * the method: pick the first pending disk, collect all disks with 4475 * the method: pick the first pending disk, collect all disks with
4476 * the same UUID, remove all from the pending list and put them into 4476 * the same UUID, remove all from the pending list and put them into
4477 * the 'same_array' list. Then order this list based on superblock 4477 * the 'same_array' list. Then order this list based on superblock
4478 * update time (freshest comes first), kick out 'old' disks and 4478 * update time (freshest comes first), kick out 'old' disks and
4479 * compare superblocks. If everything's fine then run it. 4479 * compare superblocks. If everything's fine then run it.
4480 * 4480 *
4481 * If "unit" is allocated, then bump its reference count 4481 * If "unit" is allocated, then bump its reference count
4482 */ 4482 */
4483 static void autorun_devices(int part) 4483 static void autorun_devices(int part)
4484 { 4484 {
4485 mdk_rdev_t *rdev0, *rdev, *tmp; 4485 mdk_rdev_t *rdev0, *rdev, *tmp;
4486 mddev_t *mddev; 4486 mddev_t *mddev;
4487 char b[BDEVNAME_SIZE]; 4487 char b[BDEVNAME_SIZE];
4488 4488
4489 printk(KERN_INFO "md: autorun ...\n"); 4489 printk(KERN_INFO "md: autorun ...\n");
4490 while (!list_empty(&pending_raid_disks)) { 4490 while (!list_empty(&pending_raid_disks)) {
4491 int unit; 4491 int unit;
4492 dev_t dev; 4492 dev_t dev;
4493 LIST_HEAD(candidates); 4493 LIST_HEAD(candidates);
4494 rdev0 = list_entry(pending_raid_disks.next, 4494 rdev0 = list_entry(pending_raid_disks.next,
4495 mdk_rdev_t, same_set); 4495 mdk_rdev_t, same_set);
4496 4496
4497 printk(KERN_INFO "md: considering %s ...\n", 4497 printk(KERN_INFO "md: considering %s ...\n",
4498 bdevname(rdev0->bdev,b)); 4498 bdevname(rdev0->bdev,b));
4499 INIT_LIST_HEAD(&candidates); 4499 INIT_LIST_HEAD(&candidates);
4500 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 4500 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
4501 if (super_90_load(rdev, rdev0, 0) >= 0) { 4501 if (super_90_load(rdev, rdev0, 0) >= 0) {
4502 printk(KERN_INFO "md: adding %s ...\n", 4502 printk(KERN_INFO "md: adding %s ...\n",
4503 bdevname(rdev->bdev,b)); 4503 bdevname(rdev->bdev,b));
4504 list_move(&rdev->same_set, &candidates); 4504 list_move(&rdev->same_set, &candidates);
4505 } 4505 }
4506 /* 4506 /*
4507 * now we have a set of devices, with all of them having 4507 * now we have a set of devices, with all of them having
4508 * mostly sane superblocks. It's time to allocate the 4508 * mostly sane superblocks. It's time to allocate the
4509 * mddev. 4509 * mddev.
4510 */ 4510 */
4511 if (part) { 4511 if (part) {
4512 dev = MKDEV(mdp_major, 4512 dev = MKDEV(mdp_major,
4513 rdev0->preferred_minor << MdpMinorShift); 4513 rdev0->preferred_minor << MdpMinorShift);
4514 unit = MINOR(dev) >> MdpMinorShift; 4514 unit = MINOR(dev) >> MdpMinorShift;
4515 } else { 4515 } else {
4516 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 4516 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
4517 unit = MINOR(dev); 4517 unit = MINOR(dev);
4518 } 4518 }
4519 if (rdev0->preferred_minor != unit) { 4519 if (rdev0->preferred_minor != unit) {
4520 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 4520 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
4521 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 4521 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
4522 break; 4522 break;
4523 } 4523 }
4524 4524
4525 md_probe(dev, NULL, NULL); 4525 md_probe(dev, NULL, NULL);
4526 mddev = mddev_find(dev); 4526 mddev = mddev_find(dev);
4527 if (!mddev || !mddev->gendisk) { 4527 if (!mddev || !mddev->gendisk) {
4528 if (mddev) 4528 if (mddev)
4529 mddev_put(mddev); 4529 mddev_put(mddev);
4530 printk(KERN_ERR 4530 printk(KERN_ERR
4531 "md: cannot allocate memory for md drive.\n"); 4531 "md: cannot allocate memory for md drive.\n");
4532 break; 4532 break;
4533 } 4533 }
4534 if (mddev_lock(mddev)) 4534 if (mddev_lock(mddev))
4535 printk(KERN_WARNING "md: %s locked, cannot run\n", 4535 printk(KERN_WARNING "md: %s locked, cannot run\n",
4536 mdname(mddev)); 4536 mdname(mddev));
4537 else if (mddev->raid_disks || mddev->major_version 4537 else if (mddev->raid_disks || mddev->major_version
4538 || !list_empty(&mddev->disks)) { 4538 || !list_empty(&mddev->disks)) {
4539 printk(KERN_WARNING 4539 printk(KERN_WARNING
4540 "md: %s already running, cannot run %s\n", 4540 "md: %s already running, cannot run %s\n",
4541 mdname(mddev), bdevname(rdev0->bdev,b)); 4541 mdname(mddev), bdevname(rdev0->bdev,b));
4542 mddev_unlock(mddev); 4542 mddev_unlock(mddev);
4543 } else { 4543 } else {
4544 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 4544 printk(KERN_INFO "md: created %s\n", mdname(mddev));
4545 mddev->persistent = 1; 4545 mddev->persistent = 1;
4546 rdev_for_each_list(rdev, tmp, &candidates) { 4546 rdev_for_each_list(rdev, tmp, &candidates) {
4547 list_del_init(&rdev->same_set); 4547 list_del_init(&rdev->same_set);
4548 if (bind_rdev_to_array(rdev, mddev)) 4548 if (bind_rdev_to_array(rdev, mddev))
4549 export_rdev(rdev); 4549 export_rdev(rdev);
4550 } 4550 }
4551 autorun_array(mddev); 4551 autorun_array(mddev);
4552 mddev_unlock(mddev); 4552 mddev_unlock(mddev);
4553 } 4553 }
4554 /* on success, candidates will be empty; on error 4554 /* on success, candidates will be empty; on error
4555 * it won't be... 4555 * it won't be...
4556 */ 4556 */
4557 rdev_for_each_list(rdev, tmp, &candidates) { 4557 rdev_for_each_list(rdev, tmp, &candidates) {
4558 list_del_init(&rdev->same_set); 4558 list_del_init(&rdev->same_set);
4559 export_rdev(rdev); 4559 export_rdev(rdev);
4560 } 4560 }
4561 mddev_put(mddev); 4561 mddev_put(mddev);
4562 } 4562 }
4563 printk(KERN_INFO "md: ... autorun DONE.\n"); 4563 printk(KERN_INFO "md: ... autorun DONE.\n");
4564 } 4564 }
4565 #endif /* !MODULE */ 4565 #endif /* !MODULE */
4566 4566
4567 static int get_version(void __user * arg) 4567 static int get_version(void __user * arg)
4568 { 4568 {
4569 mdu_version_t ver; 4569 mdu_version_t ver;
4570 4570
4571 ver.major = MD_MAJOR_VERSION; 4571 ver.major = MD_MAJOR_VERSION;
4572 ver.minor = MD_MINOR_VERSION; 4572 ver.minor = MD_MINOR_VERSION;
4573 ver.patchlevel = MD_PATCHLEVEL_VERSION; 4573 ver.patchlevel = MD_PATCHLEVEL_VERSION;
4574 4574
4575 if (copy_to_user(arg, &ver, sizeof(ver))) 4575 if (copy_to_user(arg, &ver, sizeof(ver)))
4576 return -EFAULT; 4576 return -EFAULT;
4577 4577
4578 return 0; 4578 return 0;
4579 } 4579 }
4580 4580
4581 static int get_array_info(mddev_t * mddev, void __user * arg) 4581 static int get_array_info(mddev_t * mddev, void __user * arg)
4582 { 4582 {
4583 mdu_array_info_t info; 4583 mdu_array_info_t info;
4584 int nr,working,insync,failed,spare; 4584 int nr,working,insync,failed,spare;
4585 mdk_rdev_t *rdev; 4585 mdk_rdev_t *rdev;
4586 4586
4587 nr=working=insync=failed=spare=0; 4587 nr=working=insync=failed=spare=0;
4588 list_for_each_entry(rdev, &mddev->disks, same_set) { 4588 list_for_each_entry(rdev, &mddev->disks, same_set) {
4589 nr++; 4589 nr++;
4590 if (test_bit(Faulty, &rdev->flags)) 4590 if (test_bit(Faulty, &rdev->flags))
4591 failed++; 4591 failed++;
4592 else { 4592 else {
4593 working++; 4593 working++;
4594 if (test_bit(In_sync, &rdev->flags)) 4594 if (test_bit(In_sync, &rdev->flags))
4595 insync++; 4595 insync++;
4596 else 4596 else
4597 spare++; 4597 spare++;
4598 } 4598 }
4599 } 4599 }
4600 4600
4601 info.major_version = mddev->major_version; 4601 info.major_version = mddev->major_version;
4602 info.minor_version = mddev->minor_version; 4602 info.minor_version = mddev->minor_version;
4603 info.patch_version = MD_PATCHLEVEL_VERSION; 4603 info.patch_version = MD_PATCHLEVEL_VERSION;
4604 info.ctime = mddev->ctime; 4604 info.ctime = mddev->ctime;
4605 info.level = mddev->level; 4605 info.level = mddev->level;
4606 info.size = mddev->dev_sectors / 2; 4606 info.size = mddev->dev_sectors / 2;
4607 if (info.size != mddev->dev_sectors / 2) /* overflow */ 4607 if (info.size != mddev->dev_sectors / 2) /* overflow */
4608 info.size = -1; 4608 info.size = -1;
4609 info.nr_disks = nr; 4609 info.nr_disks = nr;
4610 info.raid_disks = mddev->raid_disks; 4610 info.raid_disks = mddev->raid_disks;
4611 info.md_minor = mddev->md_minor; 4611 info.md_minor = mddev->md_minor;
4612 info.not_persistent= !mddev->persistent; 4612 info.not_persistent= !mddev->persistent;
4613 4613
4614 info.utime = mddev->utime; 4614 info.utime = mddev->utime;
4615 info.state = 0; 4615 info.state = 0;
4616 if (mddev->in_sync) 4616 if (mddev->in_sync)
4617 info.state = (1<<MD_SB_CLEAN); 4617 info.state = (1<<MD_SB_CLEAN);
4618 if (mddev->bitmap && mddev->bitmap_offset) 4618 if (mddev->bitmap && mddev->bitmap_offset)
4619 info.state = (1<<MD_SB_BITMAP_PRESENT); 4619 info.state = (1<<MD_SB_BITMAP_PRESENT);
4620 info.active_disks = insync; 4620 info.active_disks = insync;
4621 info.working_disks = working; 4621 info.working_disks = working;
4622 info.failed_disks = failed; 4622 info.failed_disks = failed;
4623 info.spare_disks = spare; 4623 info.spare_disks = spare;
4624 4624
4625 info.layout = mddev->layout; 4625 info.layout = mddev->layout;
4626 info.chunk_size = mddev->chunk_sectors << 9; 4626 info.chunk_size = mddev->chunk_sectors << 9;
4627 4627
4628 if (copy_to_user(arg, &info, sizeof(info))) 4628 if (copy_to_user(arg, &info, sizeof(info)))
4629 return -EFAULT; 4629 return -EFAULT;
4630 4630
4631 return 0; 4631 return 0;
4632 } 4632 }
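
For context, a minimal userspace sketch (hypothetical, not part of this driver) of how a management tool might consume the structure that get_array_info() fills in via the GET_ARRAY_INFO ioctl; the device path and the fields printed are illustrative only:

/* Hypothetical example: query array info for /dev/md0. */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/major.h>	/* MD_MAJOR, used by the ioctl macros */
#include <linux/raid/md_u.h>	/* mdu_array_info_t, GET_ARRAY_INFO */

int main(void)
{
	mdu_array_info_t info;
	int fd = open("/dev/md0", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
		printf("level %d, raid_disks %d, active %d, failed %d\n",
		       info.level, info.raid_disks,
		       info.active_disks, info.failed_disks);
	close(fd);
	return 0;
}
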
4633 4633
4634 static int get_bitmap_file(mddev_t * mddev, void __user * arg) 4634 static int get_bitmap_file(mddev_t * mddev, void __user * arg)
4635 { 4635 {
4636 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 4636 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
4637 char *ptr, *buf = NULL; 4637 char *ptr, *buf = NULL;
4638 int err = -ENOMEM; 4638 int err = -ENOMEM;
4639 4639
4640 if (md_allow_write(mddev)) 4640 if (md_allow_write(mddev))
4641 file = kmalloc(sizeof(*file), GFP_NOIO); 4641 file = kmalloc(sizeof(*file), GFP_NOIO);
4642 else 4642 else
4643 file = kmalloc(sizeof(*file), GFP_KERNEL); 4643 file = kmalloc(sizeof(*file), GFP_KERNEL);
4644 4644
4645 if (!file) 4645 if (!file)
4646 goto out; 4646 goto out;
4647 4647
4648 /* bitmap disabled, zero the first byte and copy out */ 4648 /* bitmap disabled, zero the first byte and copy out */
4649 if (!mddev->bitmap || !mddev->bitmap->file) { 4649 if (!mddev->bitmap || !mddev->bitmap->file) {
4650 file->pathname[0] = '\0'; 4650 file->pathname[0] = '\0';
4651 goto copy_out; 4651 goto copy_out;
4652 } 4652 }
4653 4653
4654 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 4654 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
4655 if (!buf) 4655 if (!buf)
4656 goto out; 4656 goto out;
4657 4657
4658 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname)); 4658 ptr = d_path(&mddev->bitmap->file->f_path, buf, sizeof(file->pathname));
4659 if (IS_ERR(ptr)) 4659 if (IS_ERR(ptr))
4660 goto out; 4660 goto out;
4661 4661
4662 strcpy(file->pathname, ptr); 4662 strcpy(file->pathname, ptr);
4663 4663
4664 copy_out: 4664 copy_out:
4665 err = 0; 4665 err = 0;
4666 if (copy_to_user(arg, file, sizeof(*file))) 4666 if (copy_to_user(arg, file, sizeof(*file)))
4667 err = -EFAULT; 4667 err = -EFAULT;
4668 out: 4668 out:
4669 kfree(buf); 4669 kfree(buf);
4670 kfree(file); 4670 kfree(file);
4671 return err; 4671 return err;
4672 } 4672 }
4673 4673
4674 static int get_disk_info(mddev_t * mddev, void __user * arg) 4674 static int get_disk_info(mddev_t * mddev, void __user * arg)
4675 { 4675 {
4676 mdu_disk_info_t info; 4676 mdu_disk_info_t info;
4677 mdk_rdev_t *rdev; 4677 mdk_rdev_t *rdev;
4678 4678
4679 if (copy_from_user(&info, arg, sizeof(info))) 4679 if (copy_from_user(&info, arg, sizeof(info)))
4680 return -EFAULT; 4680 return -EFAULT;
4681 4681
4682 rdev = find_rdev_nr(mddev, info.number); 4682 rdev = find_rdev_nr(mddev, info.number);
4683 if (rdev) { 4683 if (rdev) {
4684 info.major = MAJOR(rdev->bdev->bd_dev); 4684 info.major = MAJOR(rdev->bdev->bd_dev);
4685 info.minor = MINOR(rdev->bdev->bd_dev); 4685 info.minor = MINOR(rdev->bdev->bd_dev);
4686 info.raid_disk = rdev->raid_disk; 4686 info.raid_disk = rdev->raid_disk;
4687 info.state = 0; 4687 info.state = 0;
4688 if (test_bit(Faulty, &rdev->flags)) 4688 if (test_bit(Faulty, &rdev->flags))
4689 info.state |= (1<<MD_DISK_FAULTY); 4689 info.state |= (1<<MD_DISK_FAULTY);
4690 else if (test_bit(In_sync, &rdev->flags)) { 4690 else if (test_bit(In_sync, &rdev->flags)) {
4691 info.state |= (1<<MD_DISK_ACTIVE); 4691 info.state |= (1<<MD_DISK_ACTIVE);
4692 info.state |= (1<<MD_DISK_SYNC); 4692 info.state |= (1<<MD_DISK_SYNC);
4693 } 4693 }
4694 if (test_bit(WriteMostly, &rdev->flags)) 4694 if (test_bit(WriteMostly, &rdev->flags))
4695 info.state |= (1<<MD_DISK_WRITEMOSTLY); 4695 info.state |= (1<<MD_DISK_WRITEMOSTLY);
4696 } else { 4696 } else {
4697 info.major = info.minor = 0; 4697 info.major = info.minor = 0;
4698 info.raid_disk = -1; 4698 info.raid_disk = -1;
4699 info.state = (1<<MD_DISK_REMOVED); 4699 info.state = (1<<MD_DISK_REMOVED);
4700 } 4700 }
4701 4701
4702 if (copy_to_user(arg, &info, sizeof(info))) 4702 if (copy_to_user(arg, &info, sizeof(info)))
4703 return -EFAULT; 4703 return -EFAULT;
4704 4704
4705 return 0; 4705 return 0;
4706 } 4706 }
4707 4707
4708 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info) 4708 static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
4709 { 4709 {
4710 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 4710 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
4711 mdk_rdev_t *rdev; 4711 mdk_rdev_t *rdev;
4712 dev_t dev = MKDEV(info->major,info->minor); 4712 dev_t dev = MKDEV(info->major,info->minor);
4713 4713
4714 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 4714 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
4715 return -EOVERFLOW; 4715 return -EOVERFLOW;
4716 4716
4717 if (!mddev->raid_disks) { 4717 if (!mddev->raid_disks) {
4718 int err; 4718 int err;
4719 /* expecting a device which has a superblock */ 4719 /* expecting a device which has a superblock */
4720 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 4720 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
4721 if (IS_ERR(rdev)) { 4721 if (IS_ERR(rdev)) {
4722 printk(KERN_WARNING 4722 printk(KERN_WARNING
4723 "md: md_import_device returned %ld\n", 4723 "md: md_import_device returned %ld\n",
4724 PTR_ERR(rdev)); 4724 PTR_ERR(rdev));
4725 return PTR_ERR(rdev); 4725 return PTR_ERR(rdev);
4726 } 4726 }
4727 if (!list_empty(&mddev->disks)) { 4727 if (!list_empty(&mddev->disks)) {
4728 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next, 4728 mdk_rdev_t *rdev0 = list_entry(mddev->disks.next,
4729 mdk_rdev_t, same_set); 4729 mdk_rdev_t, same_set);
4730 err = super_types[mddev->major_version] 4730 err = super_types[mddev->major_version]
4731 .load_super(rdev, rdev0, mddev->minor_version); 4731 .load_super(rdev, rdev0, mddev->minor_version);
4732 if (err < 0) { 4732 if (err < 0) {
4733 printk(KERN_WARNING 4733 printk(KERN_WARNING
4734 "md: %s has different UUID to %s\n", 4734 "md: %s has different UUID to %s\n",
4735 bdevname(rdev->bdev,b), 4735 bdevname(rdev->bdev,b),
4736 bdevname(rdev0->bdev,b2)); 4736 bdevname(rdev0->bdev,b2));
4737 export_rdev(rdev); 4737 export_rdev(rdev);
4738 return -EINVAL; 4738 return -EINVAL;
4739 } 4739 }
4740 } 4740 }
4741 err = bind_rdev_to_array(rdev, mddev); 4741 err = bind_rdev_to_array(rdev, mddev);
4742 if (err) 4742 if (err)
4743 export_rdev(rdev); 4743 export_rdev(rdev);
4744 return err; 4744 return err;
4745 } 4745 }
4746 4746
4747 /* 4747 /*
4748 * add_new_disk can be used once the array is assembled 4748 * add_new_disk can be used once the array is assembled
4749 * to add "hot spares". They must already have a superblock 4749 * to add "hot spares". They must already have a superblock
4750 * written 4750 * written
4751 */ 4751 */
4752 if (mddev->pers) { 4752 if (mddev->pers) {
4753 int err; 4753 int err;
4754 if (!mddev->pers->hot_add_disk) { 4754 if (!mddev->pers->hot_add_disk) {
4755 printk(KERN_WARNING 4755 printk(KERN_WARNING
4756 "%s: personality does not support diskops!\n", 4756 "%s: personality does not support diskops!\n",
4757 mdname(mddev)); 4757 mdname(mddev));
4758 return -EINVAL; 4758 return -EINVAL;
4759 } 4759 }
4760 if (mddev->persistent) 4760 if (mddev->persistent)
4761 rdev = md_import_device(dev, mddev->major_version, 4761 rdev = md_import_device(dev, mddev->major_version,
4762 mddev->minor_version); 4762 mddev->minor_version);
4763 else 4763 else
4764 rdev = md_import_device(dev, -1, -1); 4764 rdev = md_import_device(dev, -1, -1);
4765 if (IS_ERR(rdev)) { 4765 if (IS_ERR(rdev)) {
4766 printk(KERN_WARNING 4766 printk(KERN_WARNING
4767 "md: md_import_device returned %ld\n", 4767 "md: md_import_device returned %ld\n",
4768 PTR_ERR(rdev)); 4768 PTR_ERR(rdev));
4769 return PTR_ERR(rdev); 4769 return PTR_ERR(rdev);
4770 } 4770 }
4771 /* set save_raid_disk if appropriate */ 4771 /* set save_raid_disk if appropriate */
4772 if (!mddev->persistent) { 4772 if (!mddev->persistent) {
4773 if (info->state & (1<<MD_DISK_SYNC) && 4773 if (info->state & (1<<MD_DISK_SYNC) &&
4774 info->raid_disk < mddev->raid_disks) 4774 info->raid_disk < mddev->raid_disks)
4775 rdev->raid_disk = info->raid_disk; 4775 rdev->raid_disk = info->raid_disk;
4776 else 4776 else
4777 rdev->raid_disk = -1; 4777 rdev->raid_disk = -1;
4778 } else 4778 } else
4779 super_types[mddev->major_version]. 4779 super_types[mddev->major_version].
4780 validate_super(mddev, rdev); 4780 validate_super(mddev, rdev);
4781 rdev->saved_raid_disk = rdev->raid_disk; 4781 rdev->saved_raid_disk = rdev->raid_disk;
4782 4782
4783 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 4783 clear_bit(In_sync, &rdev->flags); /* just to be sure */
4784 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4784 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4785 set_bit(WriteMostly, &rdev->flags); 4785 set_bit(WriteMostly, &rdev->flags);
4786 else 4786 else
4787 clear_bit(WriteMostly, &rdev->flags); 4787 clear_bit(WriteMostly, &rdev->flags);
4788 4788
4789 rdev->raid_disk = -1; 4789 rdev->raid_disk = -1;
4790 err = bind_rdev_to_array(rdev, mddev); 4790 err = bind_rdev_to_array(rdev, mddev);
4791 if (!err && !mddev->pers->hot_remove_disk) { 4791 if (!err && !mddev->pers->hot_remove_disk) {
4792 /* If there is hot_add_disk but no hot_remove_disk 4792 /* If there is hot_add_disk but no hot_remove_disk
4793 * then added disks are for geometry changes, 4793 * then added disks are for geometry changes,
4794 * and should be added immediately. 4794 * and should be added immediately.
4795 */ 4795 */
4796 super_types[mddev->major_version]. 4796 super_types[mddev->major_version].
4797 validate_super(mddev, rdev); 4797 validate_super(mddev, rdev);
4798 err = mddev->pers->hot_add_disk(mddev, rdev); 4798 err = mddev->pers->hot_add_disk(mddev, rdev);
4799 if (err) 4799 if (err)
4800 unbind_rdev_from_array(rdev); 4800 unbind_rdev_from_array(rdev);
4801 } 4801 }
4802 if (err) 4802 if (err)
4803 export_rdev(rdev); 4803 export_rdev(rdev);
4804 else 4804 else
4805 sysfs_notify_dirent(rdev->sysfs_state); 4805 sysfs_notify_dirent(rdev->sysfs_state);
4806 4806
4807 md_update_sb(mddev, 1); 4807 md_update_sb(mddev, 1);
4808 if (mddev->degraded) 4808 if (mddev->degraded)
4809 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4809 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4810 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4810 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4811 md_wakeup_thread(mddev->thread); 4811 md_wakeup_thread(mddev->thread);
4812 return err; 4812 return err;
4813 } 4813 }
4814 4814
4815 /* otherwise, add_new_disk is only allowed 4815 /* otherwise, add_new_disk is only allowed
4816 * for major_version==0 superblocks 4816 * for major_version==0 superblocks
4817 */ 4817 */
4818 if (mddev->major_version != 0) { 4818 if (mddev->major_version != 0) {
4819 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 4819 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
4820 mdname(mddev)); 4820 mdname(mddev));
4821 return -EINVAL; 4821 return -EINVAL;
4822 } 4822 }
4823 4823
4824 if (!(info->state & (1<<MD_DISK_FAULTY))) { 4824 if (!(info->state & (1<<MD_DISK_FAULTY))) {
4825 int err; 4825 int err;
4826 rdev = md_import_device(dev, -1, 0); 4826 rdev = md_import_device(dev, -1, 0);
4827 if (IS_ERR(rdev)) { 4827 if (IS_ERR(rdev)) {
4828 printk(KERN_WARNING 4828 printk(KERN_WARNING
4829 "md: error, md_import_device() returned %ld\n", 4829 "md: error, md_import_device() returned %ld\n",
4830 PTR_ERR(rdev)); 4830 PTR_ERR(rdev));
4831 return PTR_ERR(rdev); 4831 return PTR_ERR(rdev);
4832 } 4832 }
4833 rdev->desc_nr = info->number; 4833 rdev->desc_nr = info->number;
4834 if (info->raid_disk < mddev->raid_disks) 4834 if (info->raid_disk < mddev->raid_disks)
4835 rdev->raid_disk = info->raid_disk; 4835 rdev->raid_disk = info->raid_disk;
4836 else 4836 else
4837 rdev->raid_disk = -1; 4837 rdev->raid_disk = -1;
4838 4838
4839 if (rdev->raid_disk < mddev->raid_disks) 4839 if (rdev->raid_disk < mddev->raid_disks)
4840 if (info->state & (1<<MD_DISK_SYNC)) 4840 if (info->state & (1<<MD_DISK_SYNC))
4841 set_bit(In_sync, &rdev->flags); 4841 set_bit(In_sync, &rdev->flags);
4842 4842
4843 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 4843 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
4844 set_bit(WriteMostly, &rdev->flags); 4844 set_bit(WriteMostly, &rdev->flags);
4845 4845
4846 if (!mddev->persistent) { 4846 if (!mddev->persistent) {
4847 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 4847 printk(KERN_INFO "md: nonpersistent superblock ...\n");
4848 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4848 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4849 } else 4849 } else
4850 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4850 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4851 rdev->sectors = rdev->sb_start; 4851 rdev->sectors = rdev->sb_start;
4852 4852
4853 err = bind_rdev_to_array(rdev, mddev); 4853 err = bind_rdev_to_array(rdev, mddev);
4854 if (err) { 4854 if (err) {
4855 export_rdev(rdev); 4855 export_rdev(rdev);
4856 return err; 4856 return err;
4857 } 4857 }
4858 } 4858 }
4859 4859
4860 return 0; 4860 return 0;
4861 } 4861 }
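The branch just above is the legacy build path: the array is not yet started, uses v0.90 metadata, and userspace spells out each component itself. A hypothetical calling-side sketch follows; it assumes only the mdu_disk_info_t layout and ADD_NEW_DISK ioctl from <linux/raid/md_u.h> plus the MD_DISK_* bits from <linux/raid/md_p.h>, and the device path and slot number are invented for illustration.

/* Hypothetical userspace sketch: describe /dev/sdc as slot 2 of an
 * array being built with v0.90 metadata (values are illustrative). */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>

int main(void)
{
        mdu_disk_info_t dinfo;
        struct stat st;
        int fd = open("/dev/md0", O_RDWR);

        if (fd < 0 || stat("/dev/sdc", &st) < 0)
                return 1;

        memset(&dinfo, 0, sizeof(dinfo));
        dinfo.number = 2;                  /* becomes rdev->desc_nr */
        dinfo.raid_disk = 2;               /* < raid_disks, so an active slot */
        dinfo.major = major(st.st_rdev);
        dinfo.minor = minor(st.st_rdev);
        dinfo.state = 1 << MD_DISK_SYNC;   /* claim the data is already in sync */

        if (ioctl(fd, ADD_NEW_DISK, &dinfo) < 0)
                perror("ADD_NEW_DISK");
        return 0;
}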
4862 4862
4863 static int hot_remove_disk(mddev_t * mddev, dev_t dev) 4863 static int hot_remove_disk(mddev_t * mddev, dev_t dev)
4864 { 4864 {
4865 char b[BDEVNAME_SIZE]; 4865 char b[BDEVNAME_SIZE];
4866 mdk_rdev_t *rdev; 4866 mdk_rdev_t *rdev;
4867 4867
4868 rdev = find_rdev(mddev, dev); 4868 rdev = find_rdev(mddev, dev);
4869 if (!rdev) 4869 if (!rdev)
4870 return -ENXIO; 4870 return -ENXIO;
4871 4871
4872 if (rdev->raid_disk >= 0) 4872 if (rdev->raid_disk >= 0)
4873 goto busy; 4873 goto busy;
4874 4874
4875 kick_rdev_from_array(rdev); 4875 kick_rdev_from_array(rdev);
4876 md_update_sb(mddev, 1); 4876 md_update_sb(mddev, 1);
4877 md_new_event(mddev); 4877 md_new_event(mddev);
4878 4878
4879 return 0; 4879 return 0;
4880 busy: 4880 busy:
4881 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 4881 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
4882 bdevname(rdev->bdev,b), mdname(mddev)); 4882 bdevname(rdev->bdev,b), mdname(mddev));
4883 return -EBUSY; 4883 return -EBUSY;
4884 } 4884 }
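Because hot_remove_disk() bails out with -EBUSY while the device still owns a slot (rdev->raid_disk >= 0), removal from userspace is normally a two-step fail-then-remove sequence. The sketch below is hypothetical and assumes only the SET_DISK_FAULTY and HOT_REMOVE_DISK ioctls from <linux/raid/md_u.h>; the retry loop is there because the slot is released by the recovery path asynchronously.

/* Hypothetical sketch: fail a component, then remove it once the
 * personality has let go of its slot. */
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <linux/raid/md_u.h>

static int fail_then_remove(int md_fd, const char *component)
{
        struct stat st;
        int i;

        if (stat(component, &st) < 0)
                return -1;

        if (ioctl(md_fd, SET_DISK_FAULTY, st.st_rdev) < 0)
                return -1;

        /* rdev->raid_disk is cleared by the md thread, not synchronously,
         * so -EBUSY here can simply mean "not detached yet". */
        for (i = 0; i < 50; i++) {
                if (ioctl(md_fd, HOT_REMOVE_DISK, st.st_rdev) == 0)
                        return 0;
                usleep(100000);
        }
        return -1;
}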
4885 4885
4886 static int hot_add_disk(mddev_t * mddev, dev_t dev) 4886 static int hot_add_disk(mddev_t * mddev, dev_t dev)
4887 { 4887 {
4888 char b[BDEVNAME_SIZE]; 4888 char b[BDEVNAME_SIZE];
4889 int err; 4889 int err;
4890 mdk_rdev_t *rdev; 4890 mdk_rdev_t *rdev;
4891 4891
4892 if (!mddev->pers) 4892 if (!mddev->pers)
4893 return -ENODEV; 4893 return -ENODEV;
4894 4894
4895 if (mddev->major_version != 0) { 4895 if (mddev->major_version != 0) {
4896 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 4896 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
4897 " version-0 superblocks.\n", 4897 " version-0 superblocks.\n",
4898 mdname(mddev)); 4898 mdname(mddev));
4899 return -EINVAL; 4899 return -EINVAL;
4900 } 4900 }
4901 if (!mddev->pers->hot_add_disk) { 4901 if (!mddev->pers->hot_add_disk) {
4902 printk(KERN_WARNING 4902 printk(KERN_WARNING
4903 "%s: personality does not support diskops!\n", 4903 "%s: personality does not support diskops!\n",
4904 mdname(mddev)); 4904 mdname(mddev));
4905 return -EINVAL; 4905 return -EINVAL;
4906 } 4906 }
4907 4907
4908 rdev = md_import_device(dev, -1, 0); 4908 rdev = md_import_device(dev, -1, 0);
4909 if (IS_ERR(rdev)) { 4909 if (IS_ERR(rdev)) {
4910 printk(KERN_WARNING 4910 printk(KERN_WARNING
4911 "md: error, md_import_device() returned %ld\n", 4911 "md: error, md_import_device() returned %ld\n",
4912 PTR_ERR(rdev)); 4912 PTR_ERR(rdev));
4913 return -EINVAL; 4913 return -EINVAL;
4914 } 4914 }
4915 4915
4916 if (mddev->persistent) 4916 if (mddev->persistent)
4917 rdev->sb_start = calc_dev_sboffset(rdev->bdev); 4917 rdev->sb_start = calc_dev_sboffset(rdev->bdev);
4918 else 4918 else
4919 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512; 4919 rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
4920 4920
4921 rdev->sectors = rdev->sb_start; 4921 rdev->sectors = rdev->sb_start;
4922 4922
4923 if (test_bit(Faulty, &rdev->flags)) { 4923 if (test_bit(Faulty, &rdev->flags)) {
4924 printk(KERN_WARNING 4924 printk(KERN_WARNING
4925 "md: can not hot-add faulty %s disk to %s!\n", 4925 "md: can not hot-add faulty %s disk to %s!\n",
4926 bdevname(rdev->bdev,b), mdname(mddev)); 4926 bdevname(rdev->bdev,b), mdname(mddev));
4927 err = -EINVAL; 4927 err = -EINVAL;
4928 goto abort_export; 4928 goto abort_export;
4929 } 4929 }
4930 clear_bit(In_sync, &rdev->flags); 4930 clear_bit(In_sync, &rdev->flags);
4931 rdev->desc_nr = -1; 4931 rdev->desc_nr = -1;
4932 rdev->saved_raid_disk = -1; 4932 rdev->saved_raid_disk = -1;
4933 err = bind_rdev_to_array(rdev, mddev); 4933 err = bind_rdev_to_array(rdev, mddev);
4934 if (err) 4934 if (err)
4935 goto abort_export; 4935 goto abort_export;
4936 4936
4937 /* 4937 /*
4938 * The rest should better be atomic, we can have disk failures 4938 * The rest should better be atomic, we can have disk failures
4939 * noticed in interrupt contexts ... 4939 * noticed in interrupt contexts ...
4940 */ 4940 */
4941 4941
4942 rdev->raid_disk = -1; 4942 rdev->raid_disk = -1;
4943 4943
4944 md_update_sb(mddev, 1); 4944 md_update_sb(mddev, 1);
4945 4945
4946 /* 4946 /*
4947 * Kick recovery, maybe this spare has to be added to the 4947 * Kick recovery, maybe this spare has to be added to the
4948 * array immediately. 4948 * array immediately.
4949 */ 4949 */
4950 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4950 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4951 md_wakeup_thread(mddev->thread); 4951 md_wakeup_thread(mddev->thread);
4952 md_new_event(mddev); 4952 md_new_event(mddev);
4953 return 0; 4953 return 0;
4954 4954
4955 abort_export: 4955 abort_export:
4956 export_rdev(rdev); 4956 export_rdev(rdev);
4957 return err; 4957 return err;
4958 } 4958 }
4959 4959
4960 static int set_bitmap_file(mddev_t *mddev, int fd) 4960 static int set_bitmap_file(mddev_t *mddev, int fd)
4961 { 4961 {
4962 int err; 4962 int err;
4963 4963
4964 if (mddev->pers) { 4964 if (mddev->pers) {
4965 if (!mddev->pers->quiesce) 4965 if (!mddev->pers->quiesce)
4966 return -EBUSY; 4966 return -EBUSY;
4967 if (mddev->recovery || mddev->sync_thread) 4967 if (mddev->recovery || mddev->sync_thread)
4968 return -EBUSY; 4968 return -EBUSY;
4969 /* we should be able to change the bitmap.. */ 4969 /* we should be able to change the bitmap.. */
4970 } 4970 }
4971 4971
4972 4972
4973 if (fd >= 0) { 4973 if (fd >= 0) {
4974 if (mddev->bitmap) 4974 if (mddev->bitmap)
4975 return -EEXIST; /* cannot add when bitmap is present */ 4975 return -EEXIST; /* cannot add when bitmap is present */
4976 mddev->bitmap_file = fget(fd); 4976 mddev->bitmap_file = fget(fd);
4977 4977
4978 if (mddev->bitmap_file == NULL) { 4978 if (mddev->bitmap_file == NULL) {
4979 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 4979 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
4980 mdname(mddev)); 4980 mdname(mddev));
4981 return -EBADF; 4981 return -EBADF;
4982 } 4982 }
4983 4983
4984 err = deny_bitmap_write_access(mddev->bitmap_file); 4984 err = deny_bitmap_write_access(mddev->bitmap_file);
4985 if (err) { 4985 if (err) {
4986 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 4986 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
4987 mdname(mddev)); 4987 mdname(mddev));
4988 fput(mddev->bitmap_file); 4988 fput(mddev->bitmap_file);
4989 mddev->bitmap_file = NULL; 4989 mddev->bitmap_file = NULL;
4990 return err; 4990 return err;
4991 } 4991 }
4992 mddev->bitmap_offset = 0; /* file overrides offset */ 4992 mddev->bitmap_offset = 0; /* file overrides offset */
4993 } else if (mddev->bitmap == NULL) 4993 } else if (mddev->bitmap == NULL)
4994 return -ENOENT; /* cannot remove what isn't there */ 4994 return -ENOENT; /* cannot remove what isn't there */
4995 err = 0; 4995 err = 0;
4996 if (mddev->pers) { 4996 if (mddev->pers) {
4997 mddev->pers->quiesce(mddev, 1); 4997 mddev->pers->quiesce(mddev, 1);
4998 if (fd >= 0) 4998 if (fd >= 0)
4999 err = bitmap_create(mddev); 4999 err = bitmap_create(mddev);
5000 if (fd < 0 || err) { 5000 if (fd < 0 || err) {
5001 bitmap_destroy(mddev); 5001 bitmap_destroy(mddev);
5002 fd = -1; /* make sure to put the file */ 5002 fd = -1; /* make sure to put the file */
5003 } 5003 }
5004 mddev->pers->quiesce(mddev, 0); 5004 mddev->pers->quiesce(mddev, 0);
5005 } 5005 }
5006 if (fd < 0) { 5006 if (fd < 0) {
5007 if (mddev->bitmap_file) { 5007 if (mddev->bitmap_file) {
5008 restore_bitmap_write_access(mddev->bitmap_file); 5008 restore_bitmap_write_access(mddev->bitmap_file);
5009 fput(mddev->bitmap_file); 5009 fput(mddev->bitmap_file);
5010 } 5010 }
5011 mddev->bitmap_file = NULL; 5011 mddev->bitmap_file = NULL;
5012 } 5012 }
5013 5013
5014 return err; 5014 return err;
5015 } 5015 }
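set_bitmap_file() takes a plain file descriptor: a non-negative fd attaches an external bitmap (the kernel keeps its own reference via fget(), so the caller may close its copy), while -1 detaches the current one; either way the array must have no resync or recovery in flight. A hypothetical calling-side sketch, assuming only the SET_BITMAP_FILE ioctl from <linux/raid/md_u.h>; preparing the bitmap file's contents beforehand, as mdadm does, is omitted.

/* Hypothetical sketch of attaching / detaching an external bitmap. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int attach_bitmap(int md_fd, const char *path)
{
        int bfd = open(path, O_RDWR);   /* file must already exist and be prepared */

        if (bfd < 0)
                return -1;
        /* the kernel fget()s the descriptor, so ours can be closed afterwards */
        return ioctl(md_fd, SET_BITMAP_FILE, bfd);
}

static int detach_bitmap(int md_fd)
{
        return ioctl(md_fd, SET_BITMAP_FILE, -1);
}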
5016 5016
5017 /* 5017 /*
5018 * set_array_info is used two different ways 5018 * set_array_info is used two different ways
5019 * The original usage is when creating a new array. 5019 * The original usage is when creating a new array.
5020 * In this usage, raid_disks is > 0 and it together with 5020 * In this usage, raid_disks is > 0 and it together with
5021 * level, size, not_persistent,layout,chunksize determine the 5021 * level, size, not_persistent,layout,chunksize determine the
5022 * shape of the array. 5022 * shape of the array.
5023 * This will always create an array with a type-0.90.0 superblock. 5023 * This will always create an array with a type-0.90.0 superblock.
5024 * The newer usage is when assembling an array. 5024 * The newer usage is when assembling an array.
5025 * In this case raid_disks will be 0, and the major_version field is 5025 * In this case raid_disks will be 0, and the major_version field is
5026 * used to determine which style super-blocks are to be found on the devices. 5026 * used to determine which style super-blocks are to be found on the devices.
5027 * The minor and patch _version numbers are also kept in case the 5027 * The minor and patch _version numbers are also kept in case the
5028 * super_block handler wishes to interpret them. 5028 * super_block handler wishes to interpret them.
5029 */ 5029 */
5030 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info) 5030 static int set_array_info(mddev_t * mddev, mdu_array_info_t *info)
5031 { 5031 {
5032 5032
5033 if (info->raid_disks == 0) { 5033 if (info->raid_disks == 0) {
5034 /* just setting version number for superblock loading */ 5034 /* just setting version number for superblock loading */
5035 if (info->major_version < 0 || 5035 if (info->major_version < 0 ||
5036 info->major_version >= ARRAY_SIZE(super_types) || 5036 info->major_version >= ARRAY_SIZE(super_types) ||
5037 super_types[info->major_version].name == NULL) { 5037 super_types[info->major_version].name == NULL) {
5038 /* maybe try to auto-load a module? */ 5038 /* maybe try to auto-load a module? */
5039 printk(KERN_INFO 5039 printk(KERN_INFO
5040 "md: superblock version %d not known\n", 5040 "md: superblock version %d not known\n",
5041 info->major_version); 5041 info->major_version);
5042 return -EINVAL; 5042 return -EINVAL;
5043 } 5043 }
5044 mddev->major_version = info->major_version; 5044 mddev->major_version = info->major_version;
5045 mddev->minor_version = info->minor_version; 5045 mddev->minor_version = info->minor_version;
5046 mddev->patch_version = info->patch_version; 5046 mddev->patch_version = info->patch_version;
5047 mddev->persistent = !info->not_persistent; 5047 mddev->persistent = !info->not_persistent;
5048 return 0; 5048 return 0;
5049 } 5049 }
5050 mddev->major_version = MD_MAJOR_VERSION; 5050 mddev->major_version = MD_MAJOR_VERSION;
5051 mddev->minor_version = MD_MINOR_VERSION; 5051 mddev->minor_version = MD_MINOR_VERSION;
5052 mddev->patch_version = MD_PATCHLEVEL_VERSION; 5052 mddev->patch_version = MD_PATCHLEVEL_VERSION;
5053 mddev->ctime = get_seconds(); 5053 mddev->ctime = get_seconds();
5054 5054
5055 mddev->level = info->level; 5055 mddev->level = info->level;
5056 mddev->clevel[0] = 0; 5056 mddev->clevel[0] = 0;
5057 mddev->dev_sectors = 2 * (sector_t)info->size; 5057 mddev->dev_sectors = 2 * (sector_t)info->size;
5058 mddev->raid_disks = info->raid_disks; 5058 mddev->raid_disks = info->raid_disks;
5059 /* don't set md_minor, it is determined by which /dev/md* was 5059 /* don't set md_minor, it is determined by which /dev/md* was
5060 * opened 5060 * opened
5061 */ 5061 */
5062 if (info->state & (1<<MD_SB_CLEAN)) 5062 if (info->state & (1<<MD_SB_CLEAN))
5063 mddev->recovery_cp = MaxSector; 5063 mddev->recovery_cp = MaxSector;
5064 else 5064 else
5065 mddev->recovery_cp = 0; 5065 mddev->recovery_cp = 0;
5066 mddev->persistent = ! info->not_persistent; 5066 mddev->persistent = ! info->not_persistent;
5067 mddev->external = 0; 5067 mddev->external = 0;
5068 5068
5069 mddev->layout = info->layout; 5069 mddev->layout = info->layout;
5070 mddev->chunk_sectors = info->chunk_size >> 9; 5070 mddev->chunk_sectors = info->chunk_size >> 9;
5071 5071
5072 mddev->max_disks = MD_SB_DISKS; 5072 mddev->max_disks = MD_SB_DISKS;
5073 5073
5074 if (mddev->persistent) 5074 if (mddev->persistent)
5075 mddev->flags = 0; 5075 mddev->flags = 0;
5076 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5076 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5077 5077
5078 mddev->default_bitmap_offset = MD_SB_BYTES >> 9; 5078 mddev->default_bitmap_offset = MD_SB_BYTES >> 9;
5079 mddev->bitmap_offset = 0; 5079 mddev->bitmap_offset = 0;
5080 5080
5081 mddev->reshape_position = MaxSector; 5081 mddev->reshape_position = MaxSector;
5082 5082
5083 /* 5083 /*
5084 * Generate a 128 bit UUID 5084 * Generate a 128 bit UUID
5085 */ 5085 */
5086 get_random_bytes(mddev->uuid, 16); 5086 get_random_bytes(mddev->uuid, 16);
5087 5087
5088 mddev->new_level = mddev->level; 5088 mddev->new_level = mddev->level;
5089 mddev->new_chunk_sectors = mddev->chunk_sectors; 5089 mddev->new_chunk_sectors = mddev->chunk_sectors;
5090 mddev->new_layout = mddev->layout; 5090 mddev->new_layout = mddev->layout;
5091 mddev->delta_disks = 0; 5091 mddev->delta_disks = 0;
5092 5092
5093 return 0; 5093 return 0;
5094 } 5094 }
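set_array_info() therefore has two modes of its own, selected by raid_disks: zero means "just record which superblock format to go looking for" during assembly, anything else describes a brand new 0.90 array in full. A hypothetical calling-side sketch of both, assuming only the mdu_array_info_t layout and SET_ARRAY_INFO ioctl from <linux/raid/md_u.h>; every field value is illustrative.

/* Hypothetical sketch of the two SET_ARRAY_INFO uses. */
#include <string.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

/* Assembly: only tell md which metadata format the members carry. */
static int prepare_assembly(int md_fd, int major_version, int minor_version)
{
        mdu_array_info_t info;

        memset(&info, 0, sizeof(info));
        info.major_version = major_version;   /* raid_disks stays 0 */
        info.minor_version = minor_version;
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}

/* Creation: describe the full shape; this always yields a 0.90 array. */
static int create_raid5(int md_fd, int disks, int chunk_bytes, int size_kb)
{
        mdu_array_info_t info;

        memset(&info, 0, sizeof(info));
        info.level = 5;
        info.raid_disks = disks;
        info.size = size_kb;            /* per-device size in KiB */
        info.chunk_size = chunk_bytes;  /* in bytes; the kernel stores it >> 9 */
        info.layout = 2;                /* left-symmetric, the usual raid5 layout */
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}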
5095 5095
5096 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors) 5096 void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors)
5097 { 5097 {
5098 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 5098 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
5099 5099
5100 if (mddev->external_size) 5100 if (mddev->external_size)
5101 return; 5101 return;
5102 5102
5103 mddev->array_sectors = array_sectors; 5103 mddev->array_sectors = array_sectors;
5104 } 5104 }
5105 EXPORT_SYMBOL(md_set_array_sectors); 5105 EXPORT_SYMBOL(md_set_array_sectors);
5106 5106
5107 static int update_size(mddev_t *mddev, sector_t num_sectors) 5107 static int update_size(mddev_t *mddev, sector_t num_sectors)
5108 { 5108 {
5109 mdk_rdev_t *rdev; 5109 mdk_rdev_t *rdev;
5110 int rv; 5110 int rv;
5111 int fit = (num_sectors == 0); 5111 int fit = (num_sectors == 0);
5112 5112
5113 if (mddev->pers->resize == NULL) 5113 if (mddev->pers->resize == NULL)
5114 return -EINVAL; 5114 return -EINVAL;
5115 /* The "num_sectors" is the number of sectors of each device that 5115 /* The "num_sectors" is the number of sectors of each device that
5116 * is used. This can only make sense for arrays with redundancy. 5116 * is used. This can only make sense for arrays with redundancy.
5117 * linear and raid0 always use whatever space is available. We can only 5117 * linear and raid0 always use whatever space is available. We can only
5118 * consider changing this number if no resync or reconstruction is 5118 * consider changing this number if no resync or reconstruction is
5119 * happening, and if the new size is acceptable. It must fit before the 5119 * happening, and if the new size is acceptable. It must fit before the
5120 * sb_start or, if that is <data_offset, it must fit before the size 5120 * sb_start or, if that is <data_offset, it must fit before the size
5121 * of each device. If num_sectors is zero, we find the largest size 5121 * of each device. If num_sectors is zero, we find the largest size
5122 * that fits. 5122 * that fits.
5123 5123
5124 */ 5124 */
5125 if (mddev->sync_thread) 5125 if (mddev->sync_thread)
5126 return -EBUSY; 5126 return -EBUSY;
5127 if (mddev->bitmap) 5127 if (mddev->bitmap)
5128 /* Sorry, cannot grow a bitmap yet, just remove it, 5128 /* Sorry, cannot grow a bitmap yet, just remove it,
5129 * grow, and re-add. 5129 * grow, and re-add.
5130 */ 5130 */
5131 return -EBUSY; 5131 return -EBUSY;
5132 list_for_each_entry(rdev, &mddev->disks, same_set) { 5132 list_for_each_entry(rdev, &mddev->disks, same_set) {
5133 sector_t avail = rdev->sectors; 5133 sector_t avail = rdev->sectors;
5134 5134
5135 if (fit && (num_sectors == 0 || num_sectors > avail)) 5135 if (fit && (num_sectors == 0 || num_sectors > avail))
5136 num_sectors = avail; 5136 num_sectors = avail;
5137 if (avail < num_sectors) 5137 if (avail < num_sectors)
5138 return -ENOSPC; 5138 return -ENOSPC;
5139 } 5139 }
5140 rv = mddev->pers->resize(mddev, num_sectors); 5140 rv = mddev->pers->resize(mddev, num_sectors);
5141 if (!rv) 5141 if (!rv)
5142 revalidate_disk(mddev->gendisk); 5142 revalidate_disk(mddev->gendisk);
5143 return rv; 5143 return rv;
5144 } 5144 }
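The fit loop in update_size() is compact enough to misread, so here is a short trace with invented member sizes; nothing beyond the code above is assumed.

/* Worked trace of the "fit" case (num_sectors == 0) with members whose
 * usable sizes are 1000, 900 and 1100 sectors:
 *
 *   start:             num_sectors = 0, fit = 1
 *   rdev avail = 1000: num_sectors == 0     -> num_sectors = 1000
 *   rdev avail = 900:  num_sectors > avail  -> num_sectors = 900
 *   rdev avail = 1100: no change, 1100 >= 900
 *
 * so "fit" converges on the smallest member.  With an explicit
 * num_sectors (fit == 0) the same loop instead returns -ENOSPC as soon
 * as any member is too small. */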
5145 5145
5146 static int update_raid_disks(mddev_t *mddev, int raid_disks) 5146 static int update_raid_disks(mddev_t *mddev, int raid_disks)
5147 { 5147 {
5148 int rv; 5148 int rv;
5149 /* change the number of raid disks */ 5149 /* change the number of raid disks */
5150 if (mddev->pers->check_reshape == NULL) 5150 if (mddev->pers->check_reshape == NULL)
5151 return -EINVAL; 5151 return -EINVAL;
5152 if (raid_disks <= 0 || 5152 if (raid_disks <= 0 ||
5153 raid_disks >= mddev->max_disks) 5153 raid_disks >= mddev->max_disks)
5154 return -EINVAL; 5154 return -EINVAL;
5155 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 5155 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
5156 return -EBUSY; 5156 return -EBUSY;
5157 mddev->delta_disks = raid_disks - mddev->raid_disks; 5157 mddev->delta_disks = raid_disks - mddev->raid_disks;
5158 5158
5159 rv = mddev->pers->check_reshape(mddev); 5159 rv = mddev->pers->check_reshape(mddev);
5160 return rv; 5160 return rv;
5161 } 5161 }
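A short worked example of the bookkeeping in update_raid_disks(), since the sign of delta_disks is easy to get backwards; the numbers are invented.

/* Worked example:
 *
 *   mddev->raid_disks = 5, requested raid_disks = 4
 *     => mddev->delta_disks = 4 - 5 = -1   (a shrink)
 *
 *   mddev->raid_disks = 4, requested raid_disks = 6
 *     => mddev->delta_disks = 6 - 4 = +2   (a grow)
 *
 * Either way the personality's check_reshape() decides whether the new
 * geometry is workable; update_raid_disks() itself only rejects
 * out-of-range counts and arrays with a reshape already in flight. */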
5162 5162
5163 5163
5164 /* 5164 /*
5165 * update_array_info is used to change the configuration of an 5165 * update_array_info is used to change the configuration of an
5166 * on-line array. 5166 * on-line array.
5167 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 5167 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
5168 * fields in the info are checked against the array. 5168 * fields in the info are checked against the array.
5169 * Any differences that cannot be handled will cause an error. 5169 * Any differences that cannot be handled will cause an error.
5170 * Normally, only one change can be managed at a time. 5170 * Normally, only one change can be managed at a time.
5171 */ 5171 */
5172 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info) 5172 static int update_array_info(mddev_t *mddev, mdu_array_info_t *info)
5173 { 5173 {
5174 int rv = 0; 5174 int rv = 0;
5175 int cnt = 0; 5175 int cnt = 0;
5176 int state = 0; 5176 int state = 0;
5177 5177
5178 /* calculate expected state, ignoring low bits */ 5178 /* calculate expected state, ignoring low bits */
5179 if (mddev->bitmap && mddev->bitmap_offset) 5179 if (mddev->bitmap && mddev->bitmap_offset)
5180 state |= (1 << MD_SB_BITMAP_PRESENT); 5180 state |= (1 << MD_SB_BITMAP_PRESENT);
5181 5181
5182 if (mddev->major_version != info->major_version || 5182 if (mddev->major_version != info->major_version ||
5183 mddev->minor_version != info->minor_version || 5183 mddev->minor_version != info->minor_version ||
5184 /* mddev->patch_version != info->patch_version || */ 5184 /* mddev->patch_version != info->patch_version || */
5185 mddev->ctime != info->ctime || 5185 mddev->ctime != info->ctime ||
5186 mddev->level != info->level || 5186 mddev->level != info->level ||
5187 /* mddev->layout != info->layout || */ 5187 /* mddev->layout != info->layout || */
5188 !mddev->persistent != info->not_persistent|| 5188 !mddev->persistent != info->not_persistent||
5189 mddev->chunk_sectors != info->chunk_size >> 9 || 5189 mddev->chunk_sectors != info->chunk_size >> 9 ||
5190 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 5190 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
5191 ((state^info->state) & 0xfffffe00) 5191 ((state^info->state) & 0xfffffe00)
5192 ) 5192 )
5193 return -EINVAL; 5193 return -EINVAL;
5194 /* Check there is only one change */ 5194 /* Check there is only one change */
5195 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5195 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5196 cnt++; 5196 cnt++;
5197 if (mddev->raid_disks != info->raid_disks) 5197 if (mddev->raid_disks != info->raid_disks)
5198 cnt++; 5198 cnt++;
5199 if (mddev->layout != info->layout) 5199 if (mddev->layout != info->layout)
5200 cnt++; 5200 cnt++;
5201 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 5201 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
5202 cnt++; 5202 cnt++;
5203 if (cnt == 0) 5203 if (cnt == 0)
5204 return 0; 5204 return 0;
5205 if (cnt > 1) 5205 if (cnt > 1)
5206 return -EINVAL; 5206 return -EINVAL;
5207 5207
5208 if (mddev->layout != info->layout) { 5208 if (mddev->layout != info->layout) {
5209 /* Change layout 5209 /* Change layout
5210 * we don't need to do anything at the md level, the 5210 * we don't need to do anything at the md level, the
5211 * personality will take care of it all. 5211 * personality will take care of it all.
5212 */ 5212 */
5213 if (mddev->pers->check_reshape == NULL) 5213 if (mddev->pers->check_reshape == NULL)
5214 return -EINVAL; 5214 return -EINVAL;
5215 else { 5215 else {
5216 mddev->new_layout = info->layout; 5216 mddev->new_layout = info->layout;
5217 rv = mddev->pers->check_reshape(mddev); 5217 rv = mddev->pers->check_reshape(mddev);
5218 if (rv) 5218 if (rv)
5219 mddev->new_layout = mddev->layout; 5219 mddev->new_layout = mddev->layout;
5220 return rv; 5220 return rv;
5221 } 5221 }
5222 } 5222 }
5223 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 5223 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
5224 rv = update_size(mddev, (sector_t)info->size * 2); 5224 rv = update_size(mddev, (sector_t)info->size * 2);
5225 5225
5226 if (mddev->raid_disks != info->raid_disks) 5226 if (mddev->raid_disks != info->raid_disks)
5227 rv = update_raid_disks(mddev, info->raid_disks); 5227 rv = update_raid_disks(mddev, info->raid_disks);
5228 5228
5229 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 5229 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
5230 if (mddev->pers->quiesce == NULL) 5230 if (mddev->pers->quiesce == NULL)
5231 return -EINVAL; 5231 return -EINVAL;
5232 if (mddev->recovery || mddev->sync_thread) 5232 if (mddev->recovery || mddev->sync_thread)
5233 return -EBUSY; 5233 return -EBUSY;
5234 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 5234 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
5235 /* add the bitmap */ 5235 /* add the bitmap */
5236 if (mddev->bitmap) 5236 if (mddev->bitmap)
5237 return -EEXIST; 5237 return -EEXIST;
5238 if (mddev->default_bitmap_offset == 0) 5238 if (mddev->default_bitmap_offset == 0)
5239 return -EINVAL; 5239 return -EINVAL;
5240 mddev->bitmap_offset = mddev->default_bitmap_offset; 5240 mddev->bitmap_offset = mddev->default_bitmap_offset;
5241 mddev->pers->quiesce(mddev, 1); 5241 mddev->pers->quiesce(mddev, 1);
5242 rv = bitmap_create(mddev); 5242 rv = bitmap_create(mddev);
5243 if (rv) 5243 if (rv)
5244 bitmap_destroy(mddev); 5244 bitmap_destroy(mddev);
5245 mddev->pers->quiesce(mddev, 0); 5245 mddev->pers->quiesce(mddev, 0);
5246 } else { 5246 } else {
5247 /* remove the bitmap */ 5247 /* remove the bitmap */
5248 if (!mddev->bitmap) 5248 if (!mddev->bitmap)
5249 return -ENOENT; 5249 return -ENOENT;
5250 if (mddev->bitmap->file) 5250 if (mddev->bitmap->file)
5251 return -EINVAL; 5251 return -EINVAL;
5252 mddev->pers->quiesce(mddev, 1); 5252 mddev->pers->quiesce(mddev, 1);
5253 bitmap_destroy(mddev); 5253 bitmap_destroy(mddev);
5254 mddev->pers->quiesce(mddev, 0); 5254 mddev->pers->quiesce(mddev, 0);
5255 mddev->bitmap_offset = 0; 5255 mddev->bitmap_offset = 0;
5256 } 5256 }
5257 } 5257 }
5258 md_update_sb(mddev, 1); 5258 md_update_sb(mddev, 1);
5259 return rv; 5259 return rv;
5260 } 5260 }
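The cnt counter above enforces one change per call, so userspace typically reads the current description back, edits a single field and resubmits it. A hypothetical sketch, assuming only GET_ARRAY_INFO and SET_ARRAY_INFO from <linux/raid/md_u.h>.

/* Hypothetical sketch of the one-change-at-a-time contract. */
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int change_raid_disks(int md_fd, int new_raid_disks)
{
        mdu_array_info_t info;

        if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
                return -1;

        /* Touching raid_disks and, say, layout in the same call would be
         * rejected with -EINVAL because cnt would exceed 1. */
        info.raid_disks = new_raid_disks;
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}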
5261 5261
5262 static int set_disk_faulty(mddev_t *mddev, dev_t dev) 5262 static int set_disk_faulty(mddev_t *mddev, dev_t dev)
5263 { 5263 {
5264 mdk_rdev_t *rdev; 5264 mdk_rdev_t *rdev;
5265 5265
5266 if (mddev->pers == NULL) 5266 if (mddev->pers == NULL)
5267 return -ENODEV; 5267 return -ENODEV;
5268 5268
5269 rdev = find_rdev(mddev, dev); 5269 rdev = find_rdev(mddev, dev);
5270 if (!rdev) 5270 if (!rdev)
5271 return -ENODEV; 5271 return -ENODEV;
5272 5272
5273 md_error(mddev, rdev); 5273 md_error(mddev, rdev);
5274 return 0; 5274 return 0;
5275 } 5275 }
5276 5276
5277 /* 5277 /*
5278 * We have a problem here : there is no easy way to give a CHS 5278 * We have a problem here : there is no easy way to give a CHS
5279 * virtual geometry. We currently pretend that we have a 2 heads 5279 * virtual geometry. We currently pretend that we have a 2 heads
5280 * 4 sectors (with a BIG number of cylinders...). This drives 5280 * 4 sectors (with a BIG number of cylinders...). This drives
5281 * dosfs just mad... ;-) 5281 * dosfs just mad... ;-)
5282 */ 5282 */
5283 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 5283 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
5284 { 5284 {
5285 mddev_t *mddev = bdev->bd_disk->private_data; 5285 mddev_t *mddev = bdev->bd_disk->private_data;
5286 5286
5287 geo->heads = 2; 5287 geo->heads = 2;
5288 geo->sectors = 4; 5288 geo->sectors = 4;
5289 geo->cylinders = get_capacity(mddev->gendisk) / 8; 5289 geo->cylinders = get_capacity(mddev->gendisk) / 8;
5290 return 0; 5290 return 0;
5291 } 5291 }
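A quick worked example of the synthetic geometry, since the divisor 8 is simply heads times sectors; the capacity figure is invented.

/* With 2 heads and 4 sectors per track, one cylinder spans 2 * 4 = 8
 * sectors.  For an array of 2147483648 sectors (1 TiB):
 *
 *   cylinders = 2147483648 / 8 = 268435456
 *
 * which is the "BIG number of cylinders" the comment above warns about,
 * far beyond anything legacy CHS-based tools expect. */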
5292 5292
5293 static int md_ioctl(struct block_device *bdev, fmode_t mode, 5293 static int md_ioctl(struct block_device *bdev, fmode_t mode,
5294 unsigned int cmd, unsigned long arg) 5294 unsigned int cmd, unsigned long arg)
5295 { 5295 {
5296 int err = 0; 5296 int err = 0;
5297 void __user *argp = (void __user *)arg; 5297 void __user *argp = (void __user *)arg;
5298 mddev_t *mddev = NULL; 5298 mddev_t *mddev = NULL;
5299 5299
5300 if (!capable(CAP_SYS_ADMIN)) 5300 if (!capable(CAP_SYS_ADMIN))
5301 return -EACCES; 5301 return -EACCES;
5302 5302
5303 /* 5303 /*
5304 * Commands dealing with the RAID driver but not any 5304 * Commands dealing with the RAID driver but not any
5305 * particular array: 5305 * particular array:
5306 */ 5306 */
5307 switch (cmd) 5307 switch (cmd)
5308 { 5308 {
5309 case RAID_VERSION: 5309 case RAID_VERSION:
5310 err = get_version(argp); 5310 err = get_version(argp);
5311 goto done; 5311 goto done;
5312 5312
5313 case PRINT_RAID_DEBUG: 5313 case PRINT_RAID_DEBUG:
5314 err = 0; 5314 err = 0;
5315 md_print_devices(); 5315 md_print_devices();
5316 goto done; 5316 goto done;
5317 5317
5318 #ifndef MODULE 5318 #ifndef MODULE
5319 case RAID_AUTORUN: 5319 case RAID_AUTORUN:
5320 err = 0; 5320 err = 0;
5321 autostart_arrays(arg); 5321 autostart_arrays(arg);
5322 goto done; 5322 goto done;
5323 #endif 5323 #endif
5324 default:; 5324 default:;
5325 } 5325 }
5326 5326
5327 /* 5327 /*
5328 * Commands creating/starting a new array: 5328 * Commands creating/starting a new array:
5329 */ 5329 */
5330 5330
5331 mddev = bdev->bd_disk->private_data; 5331 mddev = bdev->bd_disk->private_data;
5332 5332
5333 if (!mddev) { 5333 if (!mddev) {
5334 BUG(); 5334 BUG();
5335 goto abort; 5335 goto abort;
5336 } 5336 }
5337 5337
5338 err = mddev_lock(mddev); 5338 err = mddev_lock(mddev);
5339 if (err) { 5339 if (err) {
5340 printk(KERN_INFO 5340 printk(KERN_INFO
5341 "md: ioctl lock interrupted, reason %d, cmd %d\n", 5341 "md: ioctl lock interrupted, reason %d, cmd %d\n",
5342 err, cmd); 5342 err, cmd);
5343 goto abort; 5343 goto abort;
5344 } 5344 }
5345 5345
5346 switch (cmd) 5346 switch (cmd)
5347 { 5347 {
5348 case SET_ARRAY_INFO: 5348 case SET_ARRAY_INFO:
5349 { 5349 {
5350 mdu_array_info_t info; 5350 mdu_array_info_t info;
5351 if (!arg) 5351 if (!arg)
5352 memset(&info, 0, sizeof(info)); 5352 memset(&info, 0, sizeof(info));
5353 else if (copy_from_user(&info, argp, sizeof(info))) { 5353 else if (copy_from_user(&info, argp, sizeof(info))) {
5354 err = -EFAULT; 5354 err = -EFAULT;
5355 goto abort_unlock; 5355 goto abort_unlock;
5356 } 5356 }
5357 if (mddev->pers) { 5357 if (mddev->pers) {
5358 err = update_array_info(mddev, &info); 5358 err = update_array_info(mddev, &info);
5359 if (err) { 5359 if (err) {
5360 printk(KERN_WARNING "md: couldn't update" 5360 printk(KERN_WARNING "md: couldn't update"
5361 " array info. %d\n", err); 5361 " array info. %d\n", err);
5362 goto abort_unlock; 5362 goto abort_unlock;
5363 } 5363 }
5364 goto done_unlock; 5364 goto done_unlock;
5365 } 5365 }
5366 if (!list_empty(&mddev->disks)) { 5366 if (!list_empty(&mddev->disks)) {
5367 printk(KERN_WARNING 5367 printk(KERN_WARNING
5368 "md: array %s already has disks!\n", 5368 "md: array %s already has disks!\n",
5369 mdname(mddev)); 5369 mdname(mddev));
5370 err = -EBUSY; 5370 err = -EBUSY;
5371 goto abort_unlock; 5371 goto abort_unlock;
5372 } 5372 }
5373 if (mddev->raid_disks) { 5373 if (mddev->raid_disks) {
5374 printk(KERN_WARNING 5374 printk(KERN_WARNING
5375 "md: array %s already initialised!\n", 5375 "md: array %s already initialised!\n",
5376 mdname(mddev)); 5376 mdname(mddev));
5377 err = -EBUSY; 5377 err = -EBUSY;
5378 goto abort_unlock; 5378 goto abort_unlock;
5379 } 5379 }
5380 err = set_array_info(mddev, &info); 5380 err = set_array_info(mddev, &info);
5381 if (err) { 5381 if (err) {
5382 printk(KERN_WARNING "md: couldn't set" 5382 printk(KERN_WARNING "md: couldn't set"
5383 " array info. %d\n", err); 5383 " array info. %d\n", err);
5384 goto abort_unlock; 5384 goto abort_unlock;
5385 } 5385 }
5386 } 5386 }
5387 goto done_unlock; 5387 goto done_unlock;
5388 5388
5389 default:; 5389 default:;
5390 } 5390 }
5391 5391
5392 /* 5392 /*
5393 * Commands querying/configuring an existing array: 5393 * Commands querying/configuring an existing array:
5394 */ 5394 */
5395 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 5395 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
5396 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 5396 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
5397 if ((!mddev->raid_disks && !mddev->external) 5397 if ((!mddev->raid_disks && !mddev->external)
5398 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 5398 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
5399 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 5399 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
5400 && cmd != GET_BITMAP_FILE) { 5400 && cmd != GET_BITMAP_FILE) {
5401 err = -ENODEV; 5401 err = -ENODEV;
5402 goto abort_unlock; 5402 goto abort_unlock;
5403 } 5403 }
5404 5404
5405 /* 5405 /*
5406 * Commands even a read-only array can execute: 5406 * Commands even a read-only array can execute:
5407 */ 5407 */
5408 switch (cmd) 5408 switch (cmd)
5409 { 5409 {
5410 case GET_ARRAY_INFO: 5410 case GET_ARRAY_INFO:
5411 err = get_array_info(mddev, argp); 5411 err = get_array_info(mddev, argp);
5412 goto done_unlock; 5412 goto done_unlock;
5413 5413
5414 case GET_BITMAP_FILE: 5414 case GET_BITMAP_FILE:
5415 err = get_bitmap_file(mddev, argp); 5415 err = get_bitmap_file(mddev, argp);
5416 goto done_unlock; 5416 goto done_unlock;
5417 5417
5418 case GET_DISK_INFO: 5418 case GET_DISK_INFO:
5419 err = get_disk_info(mddev, argp); 5419 err = get_disk_info(mddev, argp);
5420 goto done_unlock; 5420 goto done_unlock;
5421 5421
5422 case RESTART_ARRAY_RW: 5422 case RESTART_ARRAY_RW:
5423 err = restart_array(mddev); 5423 err = restart_array(mddev);
5424 goto done_unlock; 5424 goto done_unlock;
5425 5425
5426 case STOP_ARRAY: 5426 case STOP_ARRAY:
5427 err = do_md_stop(mddev, 0, 1); 5427 err = do_md_stop(mddev, 0, 1);
5428 goto done_unlock; 5428 goto done_unlock;
5429 5429
5430 case STOP_ARRAY_RO: 5430 case STOP_ARRAY_RO:
5431 err = do_md_stop(mddev, 1, 1); 5431 err = do_md_stop(mddev, 1, 1);
5432 goto done_unlock; 5432 goto done_unlock;
5433 5433
5434 } 5434 }
5435 5435
5436 /* 5436 /*
5437 * The remaining ioctls are changing the state of the 5437 * The remaining ioctls are changing the state of the
5438 * superblock, so we do not allow them on read-only arrays. 5438 * superblock, so we do not allow them on read-only arrays.
5439 * However non-MD ioctls (e.g. get-size) will still come through 5439 * However non-MD ioctls (e.g. get-size) will still come through
5440 * here and hit the 'default' below, so only disallow 5440 * here and hit the 'default' below, so only disallow
5441 * 'md' ioctls, and switch to rw mode if started auto-readonly. 5441 * 'md' ioctls, and switch to rw mode if started auto-readonly.
5442 */ 5442 */
5443 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 5443 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
5444 if (mddev->ro == 2) { 5444 if (mddev->ro == 2) {
5445 mddev->ro = 0; 5445 mddev->ro = 0;
5446 sysfs_notify_dirent(mddev->sysfs_state); 5446 sysfs_notify_dirent(mddev->sysfs_state);
5447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5447 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5448 md_wakeup_thread(mddev->thread); 5448 md_wakeup_thread(mddev->thread);
5449 } else { 5449 } else {
5450 err = -EROFS; 5450 err = -EROFS;
5451 goto abort_unlock; 5451 goto abort_unlock;
5452 } 5452 }
5453 } 5453 }
5454 5454
5455 switch (cmd) 5455 switch (cmd)
5456 { 5456 {
5457 case ADD_NEW_DISK: 5457 case ADD_NEW_DISK:
5458 { 5458 {
5459 mdu_disk_info_t info; 5459 mdu_disk_info_t info;
5460 if (copy_from_user(&info, argp, sizeof(info))) 5460 if (copy_from_user(&info, argp, sizeof(info)))
5461 err = -EFAULT; 5461 err = -EFAULT;
5462 else 5462 else
5463 err = add_new_disk(mddev, &info); 5463 err = add_new_disk(mddev, &info);
5464 goto done_unlock; 5464 goto done_unlock;
5465 } 5465 }
5466 5466
5467 case HOT_REMOVE_DISK: 5467 case HOT_REMOVE_DISK:
5468 err = hot_remove_disk(mddev, new_decode_dev(arg)); 5468 err = hot_remove_disk(mddev, new_decode_dev(arg));
5469 goto done_unlock; 5469 goto done_unlock;
5470 5470
5471 case HOT_ADD_DISK: 5471 case HOT_ADD_DISK:
5472 err = hot_add_disk(mddev, new_decode_dev(arg)); 5472 err = hot_add_disk(mddev, new_decode_dev(arg));
5473 goto done_unlock; 5473 goto done_unlock;
5474 5474
5475 case SET_DISK_FAULTY: 5475 case SET_DISK_FAULTY:
5476 err = set_disk_faulty(mddev, new_decode_dev(arg)); 5476 err = set_disk_faulty(mddev, new_decode_dev(arg));
5477 goto done_unlock; 5477 goto done_unlock;
5478 5478
5479 case RUN_ARRAY: 5479 case RUN_ARRAY:
5480 err = do_md_run(mddev); 5480 err = do_md_run(mddev);
5481 goto done_unlock; 5481 goto done_unlock;
5482 5482
5483 case SET_BITMAP_FILE: 5483 case SET_BITMAP_FILE:
5484 err = set_bitmap_file(mddev, (int)arg); 5484 err = set_bitmap_file(mddev, (int)arg);
5485 goto done_unlock; 5485 goto done_unlock;
5486 5486
5487 default: 5487 default:
5488 err = -EINVAL; 5488 err = -EINVAL;
5489 goto abort_unlock; 5489 goto abort_unlock;
5490 } 5490 }
5491 5491
5492 done_unlock: 5492 done_unlock:
5493 abort_unlock: 5493 abort_unlock:
5494 if (mddev->hold_active == UNTIL_IOCTL && 5494 if (mddev->hold_active == UNTIL_IOCTL &&
5495 err != -EINVAL) 5495 err != -EINVAL)
5496 mddev->hold_active = 0; 5496 mddev->hold_active = 0;
5497 mddev_unlock(mddev); 5497 mddev_unlock(mddev);
5498 5498
5499 return err; 5499 return err;
5500 done: 5500 done:
5501 if (err) 5501 if (err)
5502 MD_BUG(); 5502 MD_BUG();
5503 abort: 5503 abort:
5504 return err; 5504 return err;
5505 } 5505 }
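The read-only-safe command class above (GET_ARRAY_INFO, GET_DISK_INFO, GET_BITMAP_FILE and friends) is what monitoring tools rely on. A hypothetical sketch that walks the disk table slot by slot; it assumes only the UAPI definitions in <linux/raid/md_u.h> and <linux/raid/md_p.h> (MD_SB_DISKS), and it treats a slot that comes back with major and minor both zero as empty.

/* Hypothetical sketch: list the component devices of a running array. */
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/raid/md_p.h>
#include <linux/raid/md_u.h>

static void dump_disks(int md_fd)
{
        mdu_disk_info_t disk;
        int i;

        for (i = 0; i < MD_SB_DISKS; i++) {
                disk.number = i;            /* slot to query */
                if (ioctl(md_fd, GET_DISK_INFO, &disk) < 0)
                        continue;
                if (disk.major == 0 && disk.minor == 0)
                        continue;           /* empty slot */
                printf("slot %d: dev %d:%d raid_disk %d state %#x\n",
                       i, disk.major, disk.minor, disk.raid_disk,
                       (unsigned)disk.state);
        }
}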
5506 5506
5507 static int md_open(struct block_device *bdev, fmode_t mode) 5507 static int md_open(struct block_device *bdev, fmode_t mode)
5508 { 5508 {
5509 /* 5509 /*
5510 * Succeed if we can lock the mddev, which confirms that 5510 * Succeed if we can lock the mddev, which confirms that
5511 * it isn't being stopped right now. 5511 * it isn't being stopped right now.
5512 */ 5512 */
5513 mddev_t *mddev = mddev_find(bdev->bd_dev); 5513 mddev_t *mddev = mddev_find(bdev->bd_dev);
5514 int err; 5514 int err;
5515 5515
5516 if (mddev->gendisk != bdev->bd_disk) { 5516 if (mddev->gendisk != bdev->bd_disk) {
5517 /* we are racing with mddev_put which is discarding this 5517 /* we are racing with mddev_put which is discarding this
5518 * bd_disk. 5518 * bd_disk.
5519 */ 5519 */
5520 mddev_put(mddev); 5520 mddev_put(mddev);
5521 /* Wait until bdev->bd_disk is definitely gone */ 5521 /* Wait until bdev->bd_disk is definitely gone */
5522 flush_scheduled_work(); 5522 flush_scheduled_work();
5523 /* Then retry the open from the top */ 5523 /* Then retry the open from the top */
5524 return -ERESTARTSYS; 5524 return -ERESTARTSYS;
5525 } 5525 }
5526 BUG_ON(mddev != bdev->bd_disk->private_data); 5526 BUG_ON(mddev != bdev->bd_disk->private_data);
5527 5527
5528 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 5528 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
5529 goto out; 5529 goto out;
5530 5530
5531 err = 0; 5531 err = 0;
5532 atomic_inc(&mddev->openers); 5532 atomic_inc(&mddev->openers);
5533 mutex_unlock(&mddev->open_mutex); 5533 mutex_unlock(&mddev->open_mutex);
5534 5534
5535 check_disk_change(bdev); 5535 check_disk_change(bdev);
5536 out: 5536 out:
5537 return err; 5537 return err;
5538 } 5538 }
5539 5539
5540 static int md_release(struct gendisk *disk, fmode_t mode) 5540 static int md_release(struct gendisk *disk, fmode_t mode)
5541 { 5541 {
5542 mddev_t *mddev = disk->private_data; 5542 mddev_t *mddev = disk->private_data;
5543 5543
5544 BUG_ON(!mddev); 5544 BUG_ON(!mddev);
5545 atomic_dec(&mddev->openers); 5545 atomic_dec(&mddev->openers);
5546 mddev_put(mddev); 5546 mddev_put(mddev);
5547 5547
5548 return 0; 5548 return 0;
5549 } 5549 }
5550 5550
5551 static int md_media_changed(struct gendisk *disk) 5551 static int md_media_changed(struct gendisk *disk)
5552 { 5552 {
5553 mddev_t *mddev = disk->private_data; 5553 mddev_t *mddev = disk->private_data;
5554 5554
5555 return mddev->changed; 5555 return mddev->changed;
5556 } 5556 }
5557 5557
5558 static int md_revalidate(struct gendisk *disk) 5558 static int md_revalidate(struct gendisk *disk)
5559 { 5559 {
5560 mddev_t *mddev = disk->private_data; 5560 mddev_t *mddev = disk->private_data;
5561 5561
5562 mddev->changed = 0; 5562 mddev->changed = 0;
5563 return 0; 5563 return 0;
5564 } 5564 }
5565 static const struct block_device_operations md_fops = 5565 static const struct block_device_operations md_fops =
5566 { 5566 {
5567 .owner = THIS_MODULE, 5567 .owner = THIS_MODULE,
5568 .open = md_open, 5568 .open = md_open,
5569 .release = md_release, 5569 .release = md_release,
5570 .ioctl = md_ioctl, 5570 .ioctl = md_ioctl,
5571 .getgeo = md_getgeo, 5571 .getgeo = md_getgeo,
5572 .media_changed = md_media_changed, 5572 .media_changed = md_media_changed,
5573 .revalidate_disk= md_revalidate, 5573 .revalidate_disk= md_revalidate,
5574 }; 5574 };
5575 5575
5576 static int md_thread(void * arg) 5576 static int md_thread(void * arg)
5577 { 5577 {
5578 mdk_thread_t *thread = arg; 5578 mdk_thread_t *thread = arg;
5579 5579
5580 /* 5580 /*
5581 * md_thread is a 'system-thread', its priority should be very 5581 * md_thread is a 'system-thread', its priority should be very
5582 * high. We avoid resource deadlocks individually in each 5582 * high. We avoid resource deadlocks individually in each
5583 * raid personality. (RAID5 does preallocation) We also use RR and 5583 * raid personality. (RAID5 does preallocation) We also use RR and
5584 * the very same RT priority as kswapd, thus we will never get 5584 * the very same RT priority as kswapd, thus we will never get
5585 * into a priority inversion deadlock. 5585 * into a priority inversion deadlock.
5586 * 5586 *
5587 * we definitely have to have equal or higher priority than 5587 * we definitely have to have equal or higher priority than
5588 * bdflush, otherwise bdflush will deadlock if there are too 5588 * bdflush, otherwise bdflush will deadlock if there are too
5589 * many dirty RAID5 blocks. 5589 * many dirty RAID5 blocks.
5590 */ 5590 */
5591 5591
5592 allow_signal(SIGKILL); 5592 allow_signal(SIGKILL);
5593 while (!kthread_should_stop()) { 5593 while (!kthread_should_stop()) {
5594 5594
5595 /* We need to wait INTERRUPTIBLE so that 5595 /* We need to wait INTERRUPTIBLE so that
5596 * we don't add to the load-average. 5596 * we don't add to the load-average.
5597 * That means we need to be sure no signals are 5597 * That means we need to be sure no signals are
5598 * pending 5598 * pending
5599 */ 5599 */
5600 if (signal_pending(current)) 5600 if (signal_pending(current))
5601 flush_signals(current); 5601 flush_signals(current);
5602 5602
5603 wait_event_interruptible_timeout 5603 wait_event_interruptible_timeout
5604 (thread->wqueue, 5604 (thread->wqueue,
5605 test_bit(THREAD_WAKEUP, &thread->flags) 5605 test_bit(THREAD_WAKEUP, &thread->flags)
5606 || kthread_should_stop(), 5606 || kthread_should_stop(),
5607 thread->timeout); 5607 thread->timeout);
5608 5608
5609 clear_bit(THREAD_WAKEUP, &thread->flags); 5609 clear_bit(THREAD_WAKEUP, &thread->flags);
5610 5610
5611 thread->run(thread->mddev); 5611 thread->run(thread->mddev);
5612 } 5612 }
5613 5613
5614 return 0; 5614 return 0;
5615 } 5615 }
5616 5616
5617 void md_wakeup_thread(mdk_thread_t *thread) 5617 void md_wakeup_thread(mdk_thread_t *thread)
5618 { 5618 {
5619 if (thread) { 5619 if (thread) {
5620 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm); 5620 dprintk("md: waking up MD thread %s.\n", thread->tsk->comm);
5621 set_bit(THREAD_WAKEUP, &thread->flags); 5621 set_bit(THREAD_WAKEUP, &thread->flags);
5622 wake_up(&thread->wqueue); 5622 wake_up(&thread->wqueue);
5623 } 5623 }
5624 } 5624 }
5625 5625
5626 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev, 5626 mdk_thread_t *md_register_thread(void (*run) (mddev_t *), mddev_t *mddev,
5627 const char *name) 5627 const char *name)
5628 { 5628 {
5629 mdk_thread_t *thread; 5629 mdk_thread_t *thread;
5630 5630
5631 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL); 5631 thread = kzalloc(sizeof(mdk_thread_t), GFP_KERNEL);
5632 if (!thread) 5632 if (!thread)
5633 return NULL; 5633 return NULL;
5634 5634
5635 init_waitqueue_head(&thread->wqueue); 5635 init_waitqueue_head(&thread->wqueue);
5636 5636
5637 thread->run = run; 5637 thread->run = run;
5638 thread->mddev = mddev; 5638 thread->mddev = mddev;
5639 thread->timeout = MAX_SCHEDULE_TIMEOUT; 5639 thread->timeout = MAX_SCHEDULE_TIMEOUT;
5640 thread->tsk = kthread_run(md_thread, thread, 5640 thread->tsk = kthread_run(md_thread, thread,
5641 "%s_%s", 5641 "%s_%s",
5642 mdname(thread->mddev), 5642 mdname(thread->mddev),
5643 name ?: mddev->pers->name); 5643 name ?: mddev->pers->name);
5644 if (IS_ERR(thread->tsk)) { 5644 if (IS_ERR(thread->tsk)) {
5645 kfree(thread); 5645 kfree(thread);
5646 return NULL; 5646 return NULL;
5647 } 5647 }
5648 return thread; 5648 return thread;
5649 } 5649 }
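For context on how the thread helpers above are consumed, here is a minimal kernel-side sketch of a personality wiring up its main loop; the names are invented, it assumes only the declarations from this file's md.h, and real personalities (raid5d and friends) of course do real work in the callback.

/* Hypothetical personality glue (illustrative names only). */
static void demo_pers_daemon(mddev_t *mddev)
{
        /* Runs whenever md_wakeup_thread() sets THREAD_WAKEUP on the
         * registered thread, or when thread->timeout expires. */
}

static int demo_pers_run(mddev_t *mddev)
{
        mddev->thread = md_register_thread(demo_pers_daemon, mddev, NULL);
        return mddev->thread ? 0 : -ENOMEM;
}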
5650 5650
5651 void md_unregister_thread(mdk_thread_t *thread) 5651 void md_unregister_thread(mdk_thread_t *thread)
5652 { 5652 {
5653 if (!thread) 5653 if (!thread)
5654 return; 5654 return;
5655 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 5655 dprintk("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
5656 5656
5657 kthread_stop(thread->tsk); 5657 kthread_stop(thread->tsk);
5658 kfree(thread); 5658 kfree(thread);
5659 } 5659 }
5660 5660
5661 void md_error(mddev_t *mddev, mdk_rdev_t *rdev) 5661 void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
5662 { 5662 {
5663 if (!mddev) { 5663 if (!mddev) {
5664 MD_BUG(); 5664 MD_BUG();
5665 return; 5665 return;
5666 } 5666 }
5667 5667
5668 if (!rdev || test_bit(Faulty, &rdev->flags)) 5668 if (!rdev || test_bit(Faulty, &rdev->flags))
5669 return; 5669 return;
5670 5670
5671 if (mddev->external) 5671 if (mddev->external)
5672 set_bit(Blocked, &rdev->flags); 5672 set_bit(Blocked, &rdev->flags);
5673 /* 5673 /*
5674 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", 5674 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
5675 mdname(mddev), 5675 mdname(mddev),
5676 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), 5676 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
5677 __builtin_return_address(0),__builtin_return_address(1), 5677 __builtin_return_address(0),__builtin_return_address(1),
5678 __builtin_return_address(2),__builtin_return_address(3)); 5678 __builtin_return_address(2),__builtin_return_address(3));
5679 */ 5679 */
5680 if (!mddev->pers) 5680 if (!mddev->pers)
5681 return; 5681 return;
5682 if (!mddev->pers->error_handler) 5682 if (!mddev->pers->error_handler)
5683 return; 5683 return;
5684 mddev->pers->error_handler(mddev,rdev); 5684 mddev->pers->error_handler(mddev,rdev);
5685 if (mddev->degraded) 5685 if (mddev->degraded)
5686 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5686 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5687 set_bit(StateChanged, &rdev->flags); 5687 set_bit(StateChanged, &rdev->flags);
5688 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5688 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5689 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5689 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5690 md_wakeup_thread(mddev->thread); 5690 md_wakeup_thread(mddev->thread);
5691 md_new_event_inintr(mddev); 5691 md_new_event_inintr(mddev);
5692 } 5692 }
5693 5693
5694 /* seq_file implementation /proc/mdstat */ 5694 /* seq_file implementation /proc/mdstat */
5695 5695
5696 static void status_unused(struct seq_file *seq) 5696 static void status_unused(struct seq_file *seq)
5697 { 5697 {
5698 int i = 0; 5698 int i = 0;
5699 mdk_rdev_t *rdev; 5699 mdk_rdev_t *rdev;
5700 5700
5701 seq_printf(seq, "unused devices: "); 5701 seq_printf(seq, "unused devices: ");
5702 5702
5703 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 5703 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
5704 char b[BDEVNAME_SIZE]; 5704 char b[BDEVNAME_SIZE];
5705 i++; 5705 i++;
5706 seq_printf(seq, "%s ", 5706 seq_printf(seq, "%s ",
5707 bdevname(rdev->bdev,b)); 5707 bdevname(rdev->bdev,b));
5708 } 5708 }
5709 if (!i) 5709 if (!i)
5710 seq_printf(seq, "<none>"); 5710 seq_printf(seq, "<none>");
5711 5711
5712 seq_printf(seq, "\n"); 5712 seq_printf(seq, "\n");
5713 } 5713 }
5714 5714
5715 5715
5716 static void status_resync(struct seq_file *seq, mddev_t * mddev) 5716 static void status_resync(struct seq_file *seq, mddev_t * mddev)
5717 { 5717 {
5718 sector_t max_sectors, resync, res; 5718 sector_t max_sectors, resync, res;
5719 unsigned long dt, db; 5719 unsigned long dt, db;
5720 sector_t rt; 5720 sector_t rt;
5721 int scale; 5721 int scale;
5722 unsigned int per_milli; 5722 unsigned int per_milli;
5723 5723
5724 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active); 5724 resync = mddev->curr_resync - atomic_read(&mddev->recovery_active);
5725 5725
5726 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 5726 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
5727 max_sectors = mddev->resync_max_sectors; 5727 max_sectors = mddev->resync_max_sectors;
5728 else 5728 else
5729 max_sectors = mddev->dev_sectors; 5729 max_sectors = mddev->dev_sectors;
5730 5730
5731 /* 5731 /*
5732 * Should not happen. 5732 * Should not happen.
5733 */ 5733 */
5734 if (!max_sectors) { 5734 if (!max_sectors) {
5735 MD_BUG(); 5735 MD_BUG();
5736 return; 5736 return;
5737 } 5737 }
5738 /* Pick 'scale' such that (resync>>scale)*1000 will fit 5738 /* Pick 'scale' such that (resync>>scale)*1000 will fit
5739 * in a sector_t, and (max_sectors>>scale) will fit in a 5739 * in a sector_t, and (max_sectors>>scale) will fit in a
5740 * u32, as those are the requirements for sector_div. 5740 * u32, as those are the requirements for sector_div.
5741 * Thus 'scale' must be at least 10 5741 * Thus 'scale' must be at least 10
5742 */ 5742 */
5743 scale = 10; 5743 scale = 10;
5744 if (sizeof(sector_t) > sizeof(unsigned long)) { 5744 if (sizeof(sector_t) > sizeof(unsigned long)) {
5745 while ( max_sectors/2 > (1ULL<<(scale+32))) 5745 while ( max_sectors/2 > (1ULL<<(scale+32)))
5746 scale++; 5746 scale++;
5747 } 5747 }
5748 res = (resync>>scale)*1000; 5748 res = (resync>>scale)*1000;
5749 sector_div(res, (u32)((max_sectors>>scale)+1)); 5749 sector_div(res, (u32)((max_sectors>>scale)+1));
5750 5750
5751 per_milli = res; 5751 per_milli = res;
5752 { 5752 {
5753 int i, x = per_milli/50, y = 20-x; 5753 int i, x = per_milli/50, y = 20-x;
5754 seq_printf(seq, "["); 5754 seq_printf(seq, "[");
5755 for (i = 0; i < x; i++) 5755 for (i = 0; i < x; i++)
5756 seq_printf(seq, "="); 5756 seq_printf(seq, "=");
5757 seq_printf(seq, ">"); 5757 seq_printf(seq, ">");
5758 for (i = 0; i < y; i++) 5758 for (i = 0; i < y; i++)
5759 seq_printf(seq, "."); 5759 seq_printf(seq, ".");
5760 seq_printf(seq, "] "); 5760 seq_printf(seq, "] ");
5761 } 5761 }
5762 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 5762 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
5763 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 5763 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
5764 "reshape" : 5764 "reshape" :
5765 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 5765 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
5766 "check" : 5766 "check" :
5767 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 5767 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
5768 "resync" : "recovery"))), 5768 "resync" : "recovery"))),
5769 per_milli/10, per_milli % 10, 5769 per_milli/10, per_milli % 10,
5770 (unsigned long long) resync/2, 5770 (unsigned long long) resync/2,
5771 (unsigned long long) max_sectors/2); 5771 (unsigned long long) max_sectors/2);
5772 5772
5773 /* 5773 /*
5774 * dt: time from mark until now 5774 * dt: time from mark until now
5775 * db: blocks written from mark until now 5775 * db: blocks written from mark until now
5776 * rt: remaining time 5776 * rt: remaining time
5777 * 5777 *
5778 * rt is a sector_t, so could be 32bit or 64bit. 5778 * rt is a sector_t, so could be 32bit or 64bit.
5779 * So we divide before multiply in case it is 32bit and close 5779 * So we divide before multiply in case it is 32bit and close
5780 * to the limit. 5780 * to the limit.
5781 * We scale the divisor (db) by 32 to avoid losing precision 5781 * We scale the divisor (db) by 32 to avoid losing precision
5782 * near the end of resync when the number of remaining sectors 5782 * near the end of resync when the number of remaining sectors
5783 * is close to 'db'. 5783 * is close to 'db'.
5784 * We then divide rt by 32 after multiplying by db to compensate. 5784 * We then divide rt by 32 after multiplying by db to compensate.
5785 * The '+1' avoids division by zero if db is very small. 5785 * The '+1' avoids division by zero if db is very small.
5786 */ 5786 */
5787 dt = ((jiffies - mddev->resync_mark) / HZ); 5787 dt = ((jiffies - mddev->resync_mark) / HZ);
5788 if (!dt) dt++; 5788 if (!dt) dt++;
5789 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 5789 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
5790 - mddev->resync_mark_cnt; 5790 - mddev->resync_mark_cnt;
5791 5791
5792 rt = max_sectors - resync; /* number of remaining sectors */ 5792 rt = max_sectors - resync; /* number of remaining sectors */
5793 sector_div(rt, db/32+1); 5793 sector_div(rt, db/32+1);
5794 rt *= dt; 5794 rt *= dt;
5795 rt >>= 5; 5795 rt >>= 5;
5796 5796
5797 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 5797 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
5798 ((unsigned long)rt % 60)/6); 5798 ((unsigned long)rt % 60)/6);
5799 5799
5800 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 5800 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
5801 } 5801 }
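The fixed-point arithmetic in status_resync() is easiest to check with concrete numbers, so below is a stand-alone sketch that mirrors the same steps; all values are invented, and scale is pinned at its minimum of 10, whereas the real code raises it only when max_sectors would otherwise break the sector_div() requirements described above.

/* Worked example of the /proc/mdstat progress and ETA math.
 * Expected output: "=24.9% finish=122.0min speed=51200K/sec" */
#include <stdio.h>

int main(void)
{
        unsigned long long max_sectors = 1000000000ULL; /* per-device total */
        unsigned long long resync      = 250000000ULL;  /* sectors done so far */
        unsigned long dt = 60;                          /* seconds since the mark */
        unsigned long db = 6144000;                     /* sectors done since the mark */
        int scale = 10;                                 /* max_sectors >> 10 fits in a u32 */
        unsigned long long res, rt;
        unsigned int per_milli;

        res = (resync >> scale) * 1000;
        res /= (unsigned int)((max_sectors >> scale) + 1);
        per_milli = res;                                /* 249, printed as 24.9% */

        rt = max_sectors - resync;                      /* 750000000 sectors remaining */
        rt /= db / 32 + 1;                              /* divide before multiplying by dt */
        rt *= dt;
        rt >>= 5;                                       /* 7323 seconds left */

        printf("=%u.%u%% finish=%llu.%llumin speed=%luK/sec\n",
               per_milli / 10, per_milli % 10,
               rt / 60, (rt % 60) / 6, db / 2 / dt);
        return 0;
}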
5802 5802
5803 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 5803 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
5804 { 5804 {
5805 struct list_head *tmp; 5805 struct list_head *tmp;
5806 loff_t l = *pos; 5806 loff_t l = *pos;
5807 mddev_t *mddev; 5807 mddev_t *mddev;
5808 5808
5809 if (l >= 0x10000) 5809 if (l >= 0x10000)
5810 return NULL; 5810 return NULL;
5811 if (!l--) 5811 if (!l--)
5812 /* header */ 5812 /* header */
5813 return (void*)1; 5813 return (void*)1;
5814 5814
5815 spin_lock(&all_mddevs_lock); 5815 spin_lock(&all_mddevs_lock);
5816 list_for_each(tmp,&all_mddevs) 5816 list_for_each(tmp,&all_mddevs)
5817 if (!l--) { 5817 if (!l--) {
5818 mddev = list_entry(tmp, mddev_t, all_mddevs); 5818 mddev = list_entry(tmp, mddev_t, all_mddevs);
5819 mddev_get(mddev); 5819 mddev_get(mddev);
5820 spin_unlock(&all_mddevs_lock); 5820 spin_unlock(&all_mddevs_lock);
5821 return mddev; 5821 return mddev;
5822 } 5822 }
5823 spin_unlock(&all_mddevs_lock); 5823 spin_unlock(&all_mddevs_lock);
5824 if (!l--) 5824 if (!l--)
5825 return (void*)2;/* tail */ 5825 return (void*)2;/* tail */
5826 return NULL; 5826 return NULL;
5827 } 5827 }
5828 5828
5829 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 5829 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
5830 { 5830 {
5831 struct list_head *tmp; 5831 struct list_head *tmp;
5832 mddev_t *next_mddev, *mddev = v; 5832 mddev_t *next_mddev, *mddev = v;
5833 5833
5834 ++*pos; 5834 ++*pos;
5835 if (v == (void*)2) 5835 if (v == (void*)2)
5836 return NULL; 5836 return NULL;
5837 5837
5838 spin_lock(&all_mddevs_lock); 5838 spin_lock(&all_mddevs_lock);
5839 if (v == (void*)1) 5839 if (v == (void*)1)
5840 tmp = all_mddevs.next; 5840 tmp = all_mddevs.next;
5841 else 5841 else
5842 tmp = mddev->all_mddevs.next; 5842 tmp = mddev->all_mddevs.next;
5843 if (tmp != &all_mddevs) 5843 if (tmp != &all_mddevs)
5844 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs)); 5844 next_mddev = mddev_get(list_entry(tmp,mddev_t,all_mddevs));
5845 else { 5845 else {
5846 next_mddev = (void*)2; 5846 next_mddev = (void*)2;
5847 *pos = 0x10000; 5847 *pos = 0x10000;
5848 } 5848 }
5849 spin_unlock(&all_mddevs_lock); 5849 spin_unlock(&all_mddevs_lock);
5850 5850
5851 if (v != (void*)1) 5851 if (v != (void*)1)
5852 mddev_put(mddev); 5852 mddev_put(mddev);
5853 return next_mddev; 5853 return next_mddev;
5854 5854
5855 } 5855 }
5856 5856
5857 static void md_seq_stop(struct seq_file *seq, void *v) 5857 static void md_seq_stop(struct seq_file *seq, void *v)
5858 { 5858 {
5859 mddev_t *mddev = v; 5859 mddev_t *mddev = v;
5860 5860
5861 if (mddev && v != (void*)1 && v != (void*)2) 5861 if (mddev && v != (void*)1 && v != (void*)2)
5862 mddev_put(mddev); 5862 mddev_put(mddev);
5863 } 5863 }
5864 5864
5865 struct mdstat_info { 5865 struct mdstat_info {
5866 int event; 5866 int event;
5867 }; 5867 };
5868 5868
5869 static int md_seq_show(struct seq_file *seq, void *v) 5869 static int md_seq_show(struct seq_file *seq, void *v)
5870 { 5870 {
5871 mddev_t *mddev = v; 5871 mddev_t *mddev = v;
5872 sector_t sectors; 5872 sector_t sectors;
5873 mdk_rdev_t *rdev; 5873 mdk_rdev_t *rdev;
5874 struct mdstat_info *mi = seq->private; 5874 struct mdstat_info *mi = seq->private;
5875 struct bitmap *bitmap; 5875 struct bitmap *bitmap;
5876 5876
5877 if (v == (void*)1) { 5877 if (v == (void*)1) {
5878 struct mdk_personality *pers; 5878 struct mdk_personality *pers;
5879 seq_printf(seq, "Personalities : "); 5879 seq_printf(seq, "Personalities : ");
5880 spin_lock(&pers_lock); 5880 spin_lock(&pers_lock);
5881 list_for_each_entry(pers, &pers_list, list) 5881 list_for_each_entry(pers, &pers_list, list)
5882 seq_printf(seq, "[%s] ", pers->name); 5882 seq_printf(seq, "[%s] ", pers->name);
5883 5883
5884 spin_unlock(&pers_lock); 5884 spin_unlock(&pers_lock);
5885 seq_printf(seq, "\n"); 5885 seq_printf(seq, "\n");
5886 mi->event = atomic_read(&md_event_count); 5886 mi->event = atomic_read(&md_event_count);
5887 return 0; 5887 return 0;
5888 } 5888 }
5889 if (v == (void*)2) { 5889 if (v == (void*)2) {
5890 status_unused(seq); 5890 status_unused(seq);
5891 return 0; 5891 return 0;
5892 } 5892 }
5893 5893
5894 if (mddev_lock(mddev) < 0) 5894 if (mddev_lock(mddev) < 0)
5895 return -EINTR; 5895 return -EINTR;
5896 5896
5897 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 5897 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
5898 seq_printf(seq, "%s : %sactive", mdname(mddev), 5898 seq_printf(seq, "%s : %sactive", mdname(mddev),
5899 mddev->pers ? "" : "in"); 5899 mddev->pers ? "" : "in");
5900 if (mddev->pers) { 5900 if (mddev->pers) {
5901 if (mddev->ro==1) 5901 if (mddev->ro==1)
5902 seq_printf(seq, " (read-only)"); 5902 seq_printf(seq, " (read-only)");
5903 if (mddev->ro==2) 5903 if (mddev->ro==2)
5904 seq_printf(seq, " (auto-read-only)"); 5904 seq_printf(seq, " (auto-read-only)");
5905 seq_printf(seq, " %s", mddev->pers->name); 5905 seq_printf(seq, " %s", mddev->pers->name);
5906 } 5906 }
5907 5907
5908 sectors = 0; 5908 sectors = 0;
5909 list_for_each_entry(rdev, &mddev->disks, same_set) { 5909 list_for_each_entry(rdev, &mddev->disks, same_set) {
5910 char b[BDEVNAME_SIZE]; 5910 char b[BDEVNAME_SIZE];
5911 seq_printf(seq, " %s[%d]", 5911 seq_printf(seq, " %s[%d]",
5912 bdevname(rdev->bdev,b), rdev->desc_nr); 5912 bdevname(rdev->bdev,b), rdev->desc_nr);
5913 if (test_bit(WriteMostly, &rdev->flags)) 5913 if (test_bit(WriteMostly, &rdev->flags))
5914 seq_printf(seq, "(W)"); 5914 seq_printf(seq, "(W)");
5915 if (test_bit(Faulty, &rdev->flags)) { 5915 if (test_bit(Faulty, &rdev->flags)) {
5916 seq_printf(seq, "(F)"); 5916 seq_printf(seq, "(F)");
5917 continue; 5917 continue;
5918 } else if (rdev->raid_disk < 0) 5918 } else if (rdev->raid_disk < 0)
5919 seq_printf(seq, "(S)"); /* spare */ 5919 seq_printf(seq, "(S)"); /* spare */
5920 sectors += rdev->sectors; 5920 sectors += rdev->sectors;
5921 } 5921 }
5922 5922
5923 if (!list_empty(&mddev->disks)) { 5923 if (!list_empty(&mddev->disks)) {
5924 if (mddev->pers) 5924 if (mddev->pers)
5925 seq_printf(seq, "\n %llu blocks", 5925 seq_printf(seq, "\n %llu blocks",
5926 (unsigned long long) 5926 (unsigned long long)
5927 mddev->array_sectors / 2); 5927 mddev->array_sectors / 2);
5928 else 5928 else
5929 seq_printf(seq, "\n %llu blocks", 5929 seq_printf(seq, "\n %llu blocks",
5930 (unsigned long long)sectors / 2); 5930 (unsigned long long)sectors / 2);
5931 } 5931 }
5932 if (mddev->persistent) { 5932 if (mddev->persistent) {
5933 if (mddev->major_version != 0 || 5933 if (mddev->major_version != 0 ||
5934 mddev->minor_version != 90) { 5934 mddev->minor_version != 90) {
5935 seq_printf(seq," super %d.%d", 5935 seq_printf(seq," super %d.%d",
5936 mddev->major_version, 5936 mddev->major_version,
5937 mddev->minor_version); 5937 mddev->minor_version);
5938 } 5938 }
5939 } else if (mddev->external) 5939 } else if (mddev->external)
5940 seq_printf(seq, " super external:%s", 5940 seq_printf(seq, " super external:%s",
5941 mddev->metadata_type); 5941 mddev->metadata_type);
5942 else 5942 else
5943 seq_printf(seq, " super non-persistent"); 5943 seq_printf(seq, " super non-persistent");
5944 5944
5945 if (mddev->pers) { 5945 if (mddev->pers) {
5946 mddev->pers->status(seq, mddev); 5946 mddev->pers->status(seq, mddev);
5947 seq_printf(seq, "\n "); 5947 seq_printf(seq, "\n ");
5948 if (mddev->pers->sync_request) { 5948 if (mddev->pers->sync_request) {
5949 if (mddev->curr_resync > 2) { 5949 if (mddev->curr_resync > 2) {
5950 status_resync(seq, mddev); 5950 status_resync(seq, mddev);
5951 seq_printf(seq, "\n "); 5951 seq_printf(seq, "\n ");
5952 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2) 5952 } else if (mddev->curr_resync == 1 || mddev->curr_resync == 2)
5953 seq_printf(seq, "\tresync=DELAYED\n "); 5953 seq_printf(seq, "\tresync=DELAYED\n ");
5954 else if (mddev->recovery_cp < MaxSector) 5954 else if (mddev->recovery_cp < MaxSector)
5955 seq_printf(seq, "\tresync=PENDING\n "); 5955 seq_printf(seq, "\tresync=PENDING\n ");
5956 } 5956 }
5957 } else 5957 } else
5958 seq_printf(seq, "\n "); 5958 seq_printf(seq, "\n ");
5959 5959
5960 if ((bitmap = mddev->bitmap)) { 5960 if ((bitmap = mddev->bitmap)) {
5961 unsigned long chunk_kb; 5961 unsigned long chunk_kb;
5962 unsigned long flags; 5962 unsigned long flags;
5963 spin_lock_irqsave(&bitmap->lock, flags); 5963 spin_lock_irqsave(&bitmap->lock, flags);
5964 chunk_kb = bitmap->chunksize >> 10; 5964 chunk_kb = bitmap->chunksize >> 10;
5965 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " 5965 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
5966 "%lu%s chunk", 5966 "%lu%s chunk",
5967 bitmap->pages - bitmap->missing_pages, 5967 bitmap->pages - bitmap->missing_pages,
5968 bitmap->pages, 5968 bitmap->pages,
5969 (bitmap->pages - bitmap->missing_pages) 5969 (bitmap->pages - bitmap->missing_pages)
5970 << (PAGE_SHIFT - 10), 5970 << (PAGE_SHIFT - 10),
5971 chunk_kb ? chunk_kb : bitmap->chunksize, 5971 chunk_kb ? chunk_kb : bitmap->chunksize,
5972 chunk_kb ? "KB" : "B"); 5972 chunk_kb ? "KB" : "B");
5973 if (bitmap->file) { 5973 if (bitmap->file) {
5974 seq_printf(seq, ", file: "); 5974 seq_printf(seq, ", file: ");
5975 seq_path(seq, &bitmap->file->f_path, " \t\n"); 5975 seq_path(seq, &bitmap->file->f_path, " \t\n");
5976 } 5976 }
5977 5977
5978 seq_printf(seq, "\n"); 5978 seq_printf(seq, "\n");
5979 spin_unlock_irqrestore(&bitmap->lock, flags); 5979 spin_unlock_irqrestore(&bitmap->lock, flags);
5980 } 5980 }
5981 5981
5982 seq_printf(seq, "\n"); 5982 seq_printf(seq, "\n");
5983 } 5983 }
5984 mddev_unlock(mddev); 5984 mddev_unlock(mddev);
5985 5985
5986 return 0; 5986 return 0;
5987 } 5987 }
5988 5988
5989 static const struct seq_operations md_seq_ops = { 5989 static const struct seq_operations md_seq_ops = {
5990 .start = md_seq_start, 5990 .start = md_seq_start,
5991 .next = md_seq_next, 5991 .next = md_seq_next,
5992 .stop = md_seq_stop, 5992 .stop = md_seq_stop,
5993 .show = md_seq_show, 5993 .show = md_seq_show,
5994 }; 5994 };
5995 5995
5996 static int md_seq_open(struct inode *inode, struct file *file) 5996 static int md_seq_open(struct inode *inode, struct file *file)
5997 { 5997 {
5998 int error; 5998 int error;
5999 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL); 5999 struct mdstat_info *mi = kmalloc(sizeof(*mi), GFP_KERNEL);
6000 if (mi == NULL) 6000 if (mi == NULL)
6001 return -ENOMEM; 6001 return -ENOMEM;
6002 6002
6003 error = seq_open(file, &md_seq_ops); 6003 error = seq_open(file, &md_seq_ops);
6004 if (error) 6004 if (error)
6005 kfree(mi); 6005 kfree(mi);
6006 else { 6006 else {
6007 struct seq_file *p = file->private_data; 6007 struct seq_file *p = file->private_data;
6008 p->private = mi; 6008 p->private = mi;
6009 mi->event = atomic_read(&md_event_count); 6009 mi->event = atomic_read(&md_event_count);
6010 } 6010 }
6011 return error; 6011 return error;
6012 } 6012 }
6013 6013
6014 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 6014 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
6015 { 6015 {
6016 struct seq_file *m = filp->private_data; 6016 struct seq_file *m = filp->private_data;
6017 struct mdstat_info *mi = m->private; 6017 struct mdstat_info *mi = m->private;
6018 int mask; 6018 int mask;
6019 6019
6020 poll_wait(filp, &md_event_waiters, wait); 6020 poll_wait(filp, &md_event_waiters, wait);
6021 6021
6022 /* always allow read */ 6022 /* always allow read */
6023 mask = POLLIN | POLLRDNORM; 6023 mask = POLLIN | POLLRDNORM;
6024 6024
6025 if (mi->event != atomic_read(&md_event_count)) 6025 if (mi->event != atomic_read(&md_event_count))
6026 mask |= POLLERR | POLLPRI; 6026 mask |= POLLERR | POLLPRI;
6027 return mask; 6027 return mask;
6028 } 6028 }
6029 6029
6030 static const struct file_operations md_seq_fops = { 6030 static const struct file_operations md_seq_fops = {
6031 .owner = THIS_MODULE, 6031 .owner = THIS_MODULE,
6032 .open = md_seq_open, 6032 .open = md_seq_open,
6033 .read = seq_read, 6033 .read = seq_read,
6034 .llseek = seq_lseek, 6034 .llseek = seq_lseek,
6035 .release = seq_release_private, 6035 .release = seq_release_private,
6036 .poll = mdstat_poll, 6036 .poll = mdstat_poll,
6037 }; 6037 };
6038 6038
6039 int register_md_personality(struct mdk_personality *p) 6039 int register_md_personality(struct mdk_personality *p)
6040 { 6040 {
6041 spin_lock(&pers_lock); 6041 spin_lock(&pers_lock);
6042 list_add_tail(&p->list, &pers_list); 6042 list_add_tail(&p->list, &pers_list);
6043 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 6043 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
6044 spin_unlock(&pers_lock); 6044 spin_unlock(&pers_lock);
6045 return 0; 6045 return 0;
6046 } 6046 }
6047 6047
6048 int unregister_md_personality(struct mdk_personality *p) 6048 int unregister_md_personality(struct mdk_personality *p)
6049 { 6049 {
6050 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 6050 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
6051 spin_lock(&pers_lock); 6051 spin_lock(&pers_lock);
6052 list_del_init(&p->list); 6052 list_del_init(&p->list);
6053 spin_unlock(&pers_lock); 6053 spin_unlock(&pers_lock);
6054 return 0; 6054 return 0;
6055 } 6055 }
6056 6056
6057 static int is_mddev_idle(mddev_t *mddev, int init) 6057 static int is_mddev_idle(mddev_t *mddev, int init)
6058 { 6058 {
6059 mdk_rdev_t * rdev; 6059 mdk_rdev_t * rdev;
6060 int idle; 6060 int idle;
6061 int curr_events; 6061 int curr_events;
6062 6062
6063 idle = 1; 6063 idle = 1;
6064 rcu_read_lock(); 6064 rcu_read_lock();
6065 rdev_for_each_rcu(rdev, mddev) { 6065 rdev_for_each_rcu(rdev, mddev) {
6066 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 6066 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
6067 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 6067 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
6068 (int)part_stat_read(&disk->part0, sectors[1]) - 6068 (int)part_stat_read(&disk->part0, sectors[1]) -
6069 atomic_read(&disk->sync_io); 6069 atomic_read(&disk->sync_io);
6070 /* sync IO will cause sync_io to increase before the disk_stats 6070 /* sync IO will cause sync_io to increase before the disk_stats
6071 * as sync_io is counted when a request starts, and 6071 * as sync_io is counted when a request starts, and
6072 * disk_stats is counted when it completes. 6072 * disk_stats is counted when it completes.
6073 * So resync activity will cause curr_events to be smaller than 6073 * So resync activity will cause curr_events to be smaller than
6074 * when there was no such activity. 6074 * when there was no such activity.
6075 	 * non-sync IO will cause disk_stats to increase without 6075 	 * non-sync IO will cause disk_stats to increase without
6076 * increasing sync_io so curr_events will (eventually) 6076 * increasing sync_io so curr_events will (eventually)
6077 * be larger than it was before. Once it becomes 6077 * be larger than it was before. Once it becomes
6078 * substantially larger, the test below will cause 6078 * substantially larger, the test below will cause
6079 * the array to appear non-idle, and resync will slow 6079 * the array to appear non-idle, and resync will slow
6080 * down. 6080 * down.
6081 * If there is a lot of outstanding resync activity when 6081 * If there is a lot of outstanding resync activity when
6082 	 * we set last_events to curr_events, then all that activity 6082 	 * we set last_events to curr_events, then all that activity
6083 * completing might cause the array to appear non-idle 6083 * completing might cause the array to appear non-idle
6084 * and resync will be slowed down even though there might 6084 * and resync will be slowed down even though there might
6085 * not have been non-resync activity. This will only 6085 * not have been non-resync activity. This will only
6086 * happen once though. 'last_events' will soon reflect 6086 * happen once though. 'last_events' will soon reflect
6087 * the state where there is little or no outstanding 6087 * the state where there is little or no outstanding
6088 * resync requests, and further resync activity will 6088 * resync requests, and further resync activity will
6089 * always make curr_events less than last_events. 6089 * always make curr_events less than last_events.
6090 * 6090 *
6091 */ 6091 */
6092 if (init || curr_events - rdev->last_events > 64) { 6092 if (init || curr_events - rdev->last_events > 64) {
6093 rdev->last_events = curr_events; 6093 rdev->last_events = curr_events;
6094 idle = 0; 6094 idle = 0;
6095 } 6095 }
6096 } 6096 }
6097 rcu_read_unlock(); 6097 rcu_read_unlock();
6098 return idle; 6098 return idle;
6099 } 6099 }
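Editor's note: a stand-alone illustration of the idle test above, with made-up numbers (nothing here is measured from this commit). Resync completions are cancelled out by sync_io, so only non-resync I/O can push curr_events more than 64 sectors past the value latched on the previous call:

	#include <stdio.h>

	int main(void)
	{
		int last_events  = 10000;	/* latched by the previous is_mddev_idle() call */
		int disk_sectors = 11000;	/* total sectors completed (disk_stats) */
		int sync_io      = 980;		/* sectors issued by resync so far */

		int curr_events = disk_sectors - sync_io;	/* 10020 */
		int busy = curr_events - last_events > 64;	/* gap of 20 -> still idle */

		printf("busy = %d\n", busy);
		return 0;
	}

Had an extra hundred sectors of ordinary filesystem I/O completed in the same interval, the gap would have exceeded 64 and the resync would start throttling itself back towards speed_min.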
6100 6100
6101 void md_done_sync(mddev_t *mddev, int blocks, int ok) 6101 void md_done_sync(mddev_t *mddev, int blocks, int ok)
6102 { 6102 {
6103 	/* another "blocks" (512-byte) blocks have been synced */ 6103 	/* another "blocks" (512-byte) blocks have been synced */
6104 atomic_sub(blocks, &mddev->recovery_active); 6104 atomic_sub(blocks, &mddev->recovery_active);
6105 wake_up(&mddev->recovery_wait); 6105 wake_up(&mddev->recovery_wait);
6106 if (!ok) { 6106 if (!ok) {
6107 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6107 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6108 md_wakeup_thread(mddev->thread); 6108 md_wakeup_thread(mddev->thread);
6109 // stop recovery, signal do_sync .... 6109 // stop recovery, signal do_sync ....
6110 } 6110 }
6111 } 6111 }
6112 6112
6113 6113
6114 /* md_write_start(mddev, bi) 6114 /* md_write_start(mddev, bi)
6115 * If we need to update some array metadata (e.g. 'active' flag 6115 * If we need to update some array metadata (e.g. 'active' flag
6116 * in superblock) before writing, schedule a superblock update 6116 * in superblock) before writing, schedule a superblock update
6117 * and wait for it to complete. 6117 * and wait for it to complete.
6118 */ 6118 */
6119 void md_write_start(mddev_t *mddev, struct bio *bi) 6119 void md_write_start(mddev_t *mddev, struct bio *bi)
6120 { 6120 {
6121 int did_change = 0; 6121 int did_change = 0;
6122 if (bio_data_dir(bi) != WRITE) 6122 if (bio_data_dir(bi) != WRITE)
6123 return; 6123 return;
6124 6124
6125 BUG_ON(mddev->ro == 1); 6125 BUG_ON(mddev->ro == 1);
6126 if (mddev->ro == 2) { 6126 if (mddev->ro == 2) {
6127 /* need to switch to read/write */ 6127 /* need to switch to read/write */
6128 mddev->ro = 0; 6128 mddev->ro = 0;
6129 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6129 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6130 md_wakeup_thread(mddev->thread); 6130 md_wakeup_thread(mddev->thread);
6131 md_wakeup_thread(mddev->sync_thread); 6131 md_wakeup_thread(mddev->sync_thread);
6132 did_change = 1; 6132 did_change = 1;
6133 } 6133 }
6134 atomic_inc(&mddev->writes_pending); 6134 atomic_inc(&mddev->writes_pending);
6135 if (mddev->safemode == 1) 6135 if (mddev->safemode == 1)
6136 mddev->safemode = 0; 6136 mddev->safemode = 0;
6137 if (mddev->in_sync) { 6137 if (mddev->in_sync) {
6138 spin_lock_irq(&mddev->write_lock); 6138 spin_lock_irq(&mddev->write_lock);
6139 if (mddev->in_sync) { 6139 if (mddev->in_sync) {
6140 mddev->in_sync = 0; 6140 mddev->in_sync = 0;
6141 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6141 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6142 md_wakeup_thread(mddev->thread); 6142 md_wakeup_thread(mddev->thread);
6143 did_change = 1; 6143 did_change = 1;
6144 } 6144 }
6145 spin_unlock_irq(&mddev->write_lock); 6145 spin_unlock_irq(&mddev->write_lock);
6146 } 6146 }
6147 if (did_change) 6147 if (did_change)
6148 sysfs_notify_dirent(mddev->sysfs_state); 6148 sysfs_notify_dirent(mddev->sysfs_state);
6149 wait_event(mddev->sb_wait, 6149 wait_event(mddev->sb_wait,
6150 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) && 6150 !test_bit(MD_CHANGE_CLEAN, &mddev->flags) &&
6151 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6151 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6152 } 6152 }
6153 6153
6154 void md_write_end(mddev_t *mddev) 6154 void md_write_end(mddev_t *mddev)
6155 { 6155 {
6156 if (atomic_dec_and_test(&mddev->writes_pending)) { 6156 if (atomic_dec_and_test(&mddev->writes_pending)) {
6157 if (mddev->safemode == 2) 6157 if (mddev->safemode == 2)
6158 md_wakeup_thread(mddev->thread); 6158 md_wakeup_thread(mddev->thread);
6159 else if (mddev->safemode_delay) 6159 else if (mddev->safemode_delay)
6160 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 6160 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
6161 } 6161 }
6162 } 6162 }
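Editor's note: a schematic of how a personality is expected to bracket array writes with the two helpers above; this is an illustrative sketch, not an excerpt from any personality in this commit:

	/* Hypothetical personality write path (sketch only). */
	static int example_make_request(mddev_t *mddev, struct bio *bio)
	{
		/* marks the array active and waits for the superblock update */
		md_write_start(mddev, bio);

		/* ... map the bio onto member devices and submit the writes ... */

		return 0;
	}

	static void example_write_done(mddev_t *mddev)
	{
		/* last writer re-arms the safemode timer (or wakes the md thread) */
		md_write_end(mddev);
	}

This pairing is what drives the clean/active superblock state: the array is marked dirty before the first write reaches a member device, and can drop back to clean only after writes_pending has returned to zero and the safemode delay has expired.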
6163 6163
6164 /* md_allow_write(mddev) 6164 /* md_allow_write(mddev)
6165 * Calling this ensures that the array is marked 'active' so that writes 6165 * Calling this ensures that the array is marked 'active' so that writes
6166 * may proceed without blocking. It is important to call this before 6166 * may proceed without blocking. It is important to call this before
6167 * attempting a GFP_KERNEL allocation while holding the mddev lock. 6167 * attempting a GFP_KERNEL allocation while holding the mddev lock.
6168 * Must be called with mddev_lock held. 6168 * Must be called with mddev_lock held.
6169 * 6169 *
6170 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 6170 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
6171 * is dropped, so return -EAGAIN after notifying userspace. 6171 * is dropped, so return -EAGAIN after notifying userspace.
6172 */ 6172 */
6173 int md_allow_write(mddev_t *mddev) 6173 int md_allow_write(mddev_t *mddev)
6174 { 6174 {
6175 if (!mddev->pers) 6175 if (!mddev->pers)
6176 return 0; 6176 return 0;
6177 if (mddev->ro) 6177 if (mddev->ro)
6178 return 0; 6178 return 0;
6179 if (!mddev->pers->sync_request) 6179 if (!mddev->pers->sync_request)
6180 return 0; 6180 return 0;
6181 6181
6182 spin_lock_irq(&mddev->write_lock); 6182 spin_lock_irq(&mddev->write_lock);
6183 if (mddev->in_sync) { 6183 if (mddev->in_sync) {
6184 mddev->in_sync = 0; 6184 mddev->in_sync = 0;
6185 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6185 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6186 if (mddev->safemode_delay && 6186 if (mddev->safemode_delay &&
6187 mddev->safemode == 0) 6187 mddev->safemode == 0)
6188 mddev->safemode = 1; 6188 mddev->safemode = 1;
6189 spin_unlock_irq(&mddev->write_lock); 6189 spin_unlock_irq(&mddev->write_lock);
6190 md_update_sb(mddev, 0); 6190 md_update_sb(mddev, 0);
6191 sysfs_notify_dirent(mddev->sysfs_state); 6191 sysfs_notify_dirent(mddev->sysfs_state);
6192 } else 6192 } else
6193 spin_unlock_irq(&mddev->write_lock); 6193 spin_unlock_irq(&mddev->write_lock);
6194 6194
6195 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags)) 6195 if (test_bit(MD_CHANGE_CLEAN, &mddev->flags))
6196 return -EAGAIN; 6196 return -EAGAIN;
6197 else 6197 else
6198 return 0; 6198 return 0;
6199 } 6199 }
6200 EXPORT_SYMBOL_GPL(md_allow_write); 6200 EXPORT_SYMBOL_GPL(md_allow_write);
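Editor's note: the comment above spells out the calling convention; a hypothetical caller (illustrative only, assumes the usual slab helpers) that needs a GFP_KERNEL allocation while holding the mddev lock would follow roughly this shape:

	/* Sketch of a caller honouring the md_allow_write() contract. */
	static int example_grow_cache(mddev_t *mddev, size_t bytes)
	{
		void *buf;
		int err;

		err = md_allow_write(mddev);	/* mddev lock already held */
		if (err)			/* -EAGAIN for external metadata: retry later */
			return err;

		buf = kmalloc(bytes, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
		/* ... install the new buffer ... */
		kfree(buf);
		return 0;
	}

Calling md_allow_write() first ensures the array is already marked active, so any writeback the allocation triggers does not block waiting on a metadata update that cannot complete while this thread holds the lock.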
6201 6201
6202 #define SYNC_MARKS 10 6202 #define SYNC_MARKS 10
6203 #define SYNC_MARK_STEP (3*HZ) 6203 #define SYNC_MARK_STEP (3*HZ)
6204 void md_do_sync(mddev_t *mddev) 6204 void md_do_sync(mddev_t *mddev)
6205 { 6205 {
6206 mddev_t *mddev2; 6206 mddev_t *mddev2;
6207 unsigned int currspeed = 0, 6207 unsigned int currspeed = 0,
6208 window; 6208 window;
6209 sector_t max_sectors,j, io_sectors; 6209 sector_t max_sectors,j, io_sectors;
6210 unsigned long mark[SYNC_MARKS]; 6210 unsigned long mark[SYNC_MARKS];
6211 sector_t mark_cnt[SYNC_MARKS]; 6211 sector_t mark_cnt[SYNC_MARKS];
6212 int last_mark,m; 6212 int last_mark,m;
6213 struct list_head *tmp; 6213 struct list_head *tmp;
6214 sector_t last_check; 6214 sector_t last_check;
6215 int skipped = 0; 6215 int skipped = 0;
6216 mdk_rdev_t *rdev; 6216 mdk_rdev_t *rdev;
6217 char *desc; 6217 char *desc;
6218 6218
6219 	/* just in case thread restarts... */ 6219 	/* just in case thread restarts... */
6220 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 6220 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
6221 return; 6221 return;
6222 if (mddev->ro) /* never try to sync a read-only array */ 6222 if (mddev->ro) /* never try to sync a read-only array */
6223 return; 6223 return;
6224 6224
6225 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6225 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6226 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 6226 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
6227 desc = "data-check"; 6227 desc = "data-check";
6228 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6228 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6229 desc = "requested-resync"; 6229 desc = "requested-resync";
6230 else 6230 else
6231 desc = "resync"; 6231 desc = "resync";
6232 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6232 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6233 desc = "reshape"; 6233 desc = "reshape";
6234 else 6234 else
6235 desc = "recovery"; 6235 desc = "recovery";
6236 6236
6237 /* we overload curr_resync somewhat here. 6237 /* we overload curr_resync somewhat here.
6238 * 0 == not engaged in resync at all 6238 * 0 == not engaged in resync at all
6239 * 2 == checking that there is no conflict with another sync 6239 * 2 == checking that there is no conflict with another sync
6240 * 1 == like 2, but have yielded to allow conflicting resync to 6240 * 1 == like 2, but have yielded to allow conflicting resync to
6241 	 * commence 6241 	 * commence
6242 * other == active in resync - this many blocks 6242 * other == active in resync - this many blocks
6243 * 6243 *
6244 * Before starting a resync we must have set curr_resync to 6244 * Before starting a resync we must have set curr_resync to
6245 * 2, and then checked that every "conflicting" array has curr_resync 6245 * 2, and then checked that every "conflicting" array has curr_resync
6246 * less than ours. When we find one that is the same or higher 6246 * less than ours. When we find one that is the same or higher
6247 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 6247 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
6248 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 6248 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
6249 * This will mean we have to start checking from the beginning again. 6249 * This will mean we have to start checking from the beginning again.
6250 * 6250 *
6251 */ 6251 */
6252 6252
6253 do { 6253 do {
6254 mddev->curr_resync = 2; 6254 mddev->curr_resync = 2;
6255 6255
6256 try_again: 6256 try_again:
6257 if (kthread_should_stop()) { 6257 if (kthread_should_stop()) {
6258 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6258 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6259 goto skip; 6259 goto skip;
6260 } 6260 }
6261 for_each_mddev(mddev2, tmp) { 6261 for_each_mddev(mddev2, tmp) {
6262 if (mddev2 == mddev) 6262 if (mddev2 == mddev)
6263 continue; 6263 continue;
6264 if (!mddev->parallel_resync 6264 if (!mddev->parallel_resync
6265 && mddev2->curr_resync 6265 && mddev2->curr_resync
6266 && match_mddev_units(mddev, mddev2)) { 6266 && match_mddev_units(mddev, mddev2)) {
6267 DEFINE_WAIT(wq); 6267 DEFINE_WAIT(wq);
6268 if (mddev < mddev2 && mddev->curr_resync == 2) { 6268 if (mddev < mddev2 && mddev->curr_resync == 2) {
6269 /* arbitrarily yield */ 6269 /* arbitrarily yield */
6270 mddev->curr_resync = 1; 6270 mddev->curr_resync = 1;
6271 wake_up(&resync_wait); 6271 wake_up(&resync_wait);
6272 } 6272 }
6273 if (mddev > mddev2 && mddev->curr_resync == 1) 6273 if (mddev > mddev2 && mddev->curr_resync == 1)
6274 /* no need to wait here, we can wait the next 6274 /* no need to wait here, we can wait the next
6275 * time 'round when curr_resync == 2 6275 * time 'round when curr_resync == 2
6276 */ 6276 */
6277 continue; 6277 continue;
6278 /* We need to wait 'interruptible' so as not to 6278 /* We need to wait 'interruptible' so as not to
6279 * contribute to the load average, and not to 6279 * contribute to the load average, and not to
6280 * be caught by 'softlockup' 6280 * be caught by 'softlockup'
6281 */ 6281 */
6282 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 6282 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
6283 if (!kthread_should_stop() && 6283 if (!kthread_should_stop() &&
6284 mddev2->curr_resync >= mddev->curr_resync) { 6284 mddev2->curr_resync >= mddev->curr_resync) {
6285 printk(KERN_INFO "md: delaying %s of %s" 6285 printk(KERN_INFO "md: delaying %s of %s"
6286 " until %s has finished (they" 6286 " until %s has finished (they"
6287 " share one or more physical units)\n", 6287 " share one or more physical units)\n",
6288 desc, mdname(mddev), mdname(mddev2)); 6288 desc, mdname(mddev), mdname(mddev2));
6289 mddev_put(mddev2); 6289 mddev_put(mddev2);
6290 if (signal_pending(current)) 6290 if (signal_pending(current))
6291 flush_signals(current); 6291 flush_signals(current);
6292 schedule(); 6292 schedule();
6293 finish_wait(&resync_wait, &wq); 6293 finish_wait(&resync_wait, &wq);
6294 goto try_again; 6294 goto try_again;
6295 } 6295 }
6296 finish_wait(&resync_wait, &wq); 6296 finish_wait(&resync_wait, &wq);
6297 } 6297 }
6298 } 6298 }
6299 } while (mddev->curr_resync < 2); 6299 } while (mddev->curr_resync < 2);
6300 6300
6301 j = 0; 6301 j = 0;
6302 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6302 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6303 /* resync follows the size requested by the personality, 6303 /* resync follows the size requested by the personality,
6304 * which defaults to physical size, but can be virtual size 6304 * which defaults to physical size, but can be virtual size
6305 */ 6305 */
6306 max_sectors = mddev->resync_max_sectors; 6306 max_sectors = mddev->resync_max_sectors;
6307 mddev->resync_mismatches = 0; 6307 mddev->resync_mismatches = 0;
6308 /* we don't use the checkpoint if there's a bitmap */ 6308 /* we don't use the checkpoint if there's a bitmap */
6309 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 6309 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
6310 j = mddev->resync_min; 6310 j = mddev->resync_min;
6311 else if (!mddev->bitmap) 6311 else if (!mddev->bitmap)
6312 j = mddev->recovery_cp; 6312 j = mddev->recovery_cp;
6313 6313
6314 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6314 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6315 max_sectors = mddev->dev_sectors; 6315 max_sectors = mddev->dev_sectors;
6316 else { 6316 else {
6317 /* recovery follows the physical size of devices */ 6317 /* recovery follows the physical size of devices */
6318 max_sectors = mddev->dev_sectors; 6318 max_sectors = mddev->dev_sectors;
6319 j = MaxSector; 6319 j = MaxSector;
6320 list_for_each_entry(rdev, &mddev->disks, same_set) 6320 list_for_each_entry(rdev, &mddev->disks, same_set)
6321 if (rdev->raid_disk >= 0 && 6321 if (rdev->raid_disk >= 0 &&
6322 !test_bit(Faulty, &rdev->flags) && 6322 !test_bit(Faulty, &rdev->flags) &&
6323 !test_bit(In_sync, &rdev->flags) && 6323 !test_bit(In_sync, &rdev->flags) &&
6324 rdev->recovery_offset < j) 6324 rdev->recovery_offset < j)
6325 j = rdev->recovery_offset; 6325 j = rdev->recovery_offset;
6326 } 6326 }
6327 6327
6328 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 6328 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
6329 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 6329 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
6330 " %d KB/sec/disk.\n", speed_min(mddev)); 6330 " %d KB/sec/disk.\n", speed_min(mddev));
6331 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 6331 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
6332 "(but not more than %d KB/sec) for %s.\n", 6332 "(but not more than %d KB/sec) for %s.\n",
6333 speed_max(mddev), desc); 6333 speed_max(mddev), desc);
6334 6334
6335 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 6335 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
6336 6336
6337 io_sectors = 0; 6337 io_sectors = 0;
6338 for (m = 0; m < SYNC_MARKS; m++) { 6338 for (m = 0; m < SYNC_MARKS; m++) {
6339 mark[m] = jiffies; 6339 mark[m] = jiffies;
6340 mark_cnt[m] = io_sectors; 6340 mark_cnt[m] = io_sectors;
6341 } 6341 }
6342 last_mark = 0; 6342 last_mark = 0;
6343 mddev->resync_mark = mark[last_mark]; 6343 mddev->resync_mark = mark[last_mark];
6344 mddev->resync_mark_cnt = mark_cnt[last_mark]; 6344 mddev->resync_mark_cnt = mark_cnt[last_mark];
6345 6345
6346 /* 6346 /*
6347 * Tune reconstruction: 6347 * Tune reconstruction:
6348 */ 6348 */
6349 window = 32*(PAGE_SIZE/512); 6349 window = 32*(PAGE_SIZE/512);
6350 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n", 6350 printk(KERN_INFO "md: using %dk window, over a total of %llu blocks.\n",
6351 window/2,(unsigned long long) max_sectors/2); 6351 window/2,(unsigned long long) max_sectors/2);
6352 6352
6353 atomic_set(&mddev->recovery_active, 0); 6353 atomic_set(&mddev->recovery_active, 0);
6354 last_check = 0; 6354 last_check = 0;
6355 6355
6356 if (j>2) { 6356 if (j>2) {
6357 printk(KERN_INFO 6357 printk(KERN_INFO
6358 "md: resuming %s of %s from checkpoint.\n", 6358 "md: resuming %s of %s from checkpoint.\n",
6359 desc, mdname(mddev)); 6359 desc, mdname(mddev));
6360 mddev->curr_resync = j; 6360 mddev->curr_resync = j;
6361 } 6361 }
6362 6362
6363 while (j < max_sectors) { 6363 while (j < max_sectors) {
6364 sector_t sectors; 6364 sector_t sectors;
6365 6365
6366 skipped = 0; 6366 skipped = 0;
6367 6367
6368 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 6368 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6369 ((mddev->curr_resync > mddev->curr_resync_completed && 6369 ((mddev->curr_resync > mddev->curr_resync_completed &&
6370 (mddev->curr_resync - mddev->curr_resync_completed) 6370 (mddev->curr_resync - mddev->curr_resync_completed)
6371 > (max_sectors >> 4)) || 6371 > (max_sectors >> 4)) ||
6372 (j - mddev->curr_resync_completed)*2 6372 (j - mddev->curr_resync_completed)*2
6373 >= mddev->resync_max - mddev->curr_resync_completed 6373 >= mddev->resync_max - mddev->curr_resync_completed
6374 )) { 6374 )) {
6375 /* time to update curr_resync_completed */ 6375 /* time to update curr_resync_completed */
6376 blk_unplug(mddev->queue); 6376 blk_unplug(mddev->queue);
6377 wait_event(mddev->recovery_wait, 6377 wait_event(mddev->recovery_wait,
6378 atomic_read(&mddev->recovery_active) == 0); 6378 atomic_read(&mddev->recovery_active) == 0);
6379 mddev->curr_resync_completed = 6379 mddev->curr_resync_completed =
6380 mddev->curr_resync; 6380 mddev->curr_resync;
6381 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6381 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6382 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6382 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6383 } 6383 }
6384 6384
6385 while (j >= mddev->resync_max && !kthread_should_stop()) { 6385 while (j >= mddev->resync_max && !kthread_should_stop()) {
6386 /* As this condition is controlled by user-space, 6386 /* As this condition is controlled by user-space,
6387 * we can block indefinitely, so use '_interruptible' 6387 * we can block indefinitely, so use '_interruptible'
6388 * to avoid triggering warnings. 6388 * to avoid triggering warnings.
6389 */ 6389 */
6390 flush_signals(current); /* just in case */ 6390 flush_signals(current); /* just in case */
6391 wait_event_interruptible(mddev->recovery_wait, 6391 wait_event_interruptible(mddev->recovery_wait,
6392 mddev->resync_max > j 6392 mddev->resync_max > j
6393 || kthread_should_stop()); 6393 || kthread_should_stop());
6394 } 6394 }
6395 6395
6396 if (kthread_should_stop()) 6396 if (kthread_should_stop())
6397 goto interrupted; 6397 goto interrupted;
6398 6398
6399 sectors = mddev->pers->sync_request(mddev, j, &skipped, 6399 sectors = mddev->pers->sync_request(mddev, j, &skipped,
6400 currspeed < speed_min(mddev)); 6400 currspeed < speed_min(mddev));
6401 if (sectors == 0) { 6401 if (sectors == 0) {
6402 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6402 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6403 goto out; 6403 goto out;
6404 } 6404 }
6405 6405
6406 if (!skipped) { /* actual IO requested */ 6406 if (!skipped) { /* actual IO requested */
6407 io_sectors += sectors; 6407 io_sectors += sectors;
6408 atomic_add(sectors, &mddev->recovery_active); 6408 atomic_add(sectors, &mddev->recovery_active);
6409 } 6409 }
6410 6410
6411 j += sectors; 6411 j += sectors;
6412 if (j>1) mddev->curr_resync = j; 6412 if (j>1) mddev->curr_resync = j;
6413 mddev->curr_mark_cnt = io_sectors; 6413 mddev->curr_mark_cnt = io_sectors;
6414 if (last_check == 0) 6414 if (last_check == 0)
6415 			/* this is the earliest that rebuild will be 6415 			/* this is the earliest that rebuild will be
6416 * visible in /proc/mdstat 6416 * visible in /proc/mdstat
6417 */ 6417 */
6418 md_new_event(mddev); 6418 md_new_event(mddev);
6419 6419
6420 if (last_check + window > io_sectors || j == max_sectors) 6420 if (last_check + window > io_sectors || j == max_sectors)
6421 continue; 6421 continue;
6422 6422
6423 last_check = io_sectors; 6423 last_check = io_sectors;
6424 6424
6425 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6425 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6426 break; 6426 break;
6427 6427
6428 repeat: 6428 repeat:
6429 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 6429 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6430 /* step marks */ 6430 /* step marks */
6431 int next = (last_mark+1) % SYNC_MARKS; 6431 int next = (last_mark+1) % SYNC_MARKS;
6432 6432
6433 mddev->resync_mark = mark[next]; 6433 mddev->resync_mark = mark[next];
6434 mddev->resync_mark_cnt = mark_cnt[next]; 6434 mddev->resync_mark_cnt = mark_cnt[next];
6435 mark[next] = jiffies; 6435 mark[next] = jiffies;
6436 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 6436 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
6437 last_mark = next; 6437 last_mark = next;
6438 } 6438 }
6439 6439
6440 6440
6441 if (kthread_should_stop()) 6441 if (kthread_should_stop())
6442 goto interrupted; 6442 goto interrupted;
6443 6443
6444 6444
6445 /* 6445 /*
6446 		 * this loop exits only when either we are slower than 6446 		 * this loop exits only when either we are slower than
6447 * the 'hard' speed limit, or the system was IO-idle for 6447 * the 'hard' speed limit, or the system was IO-idle for
6448 * a jiffy. 6448 * a jiffy.
6449 * the system might be non-idle CPU-wise, but we only care 6449 * the system might be non-idle CPU-wise, but we only care
6450 * about not overloading the IO subsystem. (things like an 6450 * about not overloading the IO subsystem. (things like an
6451 * e2fsck being done on the RAID array should execute fast) 6451 * e2fsck being done on the RAID array should execute fast)
6452 */ 6452 */
6453 blk_unplug(mddev->queue); 6453 blk_unplug(mddev->queue);
6454 cond_resched(); 6454 cond_resched();
6455 6455
6456 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 6456 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
6457 /((jiffies-mddev->resync_mark)/HZ +1) +1; 6457 /((jiffies-mddev->resync_mark)/HZ +1) +1;
6458 6458
6459 if (currspeed > speed_min(mddev)) { 6459 if (currspeed > speed_min(mddev)) {
6460 if ((currspeed > speed_max(mddev)) || 6460 if ((currspeed > speed_max(mddev)) ||
6461 !is_mddev_idle(mddev, 0)) { 6461 !is_mddev_idle(mddev, 0)) {
6462 msleep(500); 6462 msleep(500);
6463 goto repeat; 6463 goto repeat;
6464 } 6464 }
6465 } 6465 }
6466 } 6466 }
6467 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 6467 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
6468 /* 6468 /*
6469 * this also signals 'finished resyncing' to md_stop 6469 * this also signals 'finished resyncing' to md_stop
6470 */ 6470 */
6471 out: 6471 out:
6472 blk_unplug(mddev->queue); 6472 blk_unplug(mddev->queue);
6473 6473
6474 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 6474 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
6475 6475
6476 /* tell personality that we are finished */ 6476 /* tell personality that we are finished */
6477 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 6477 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
6478 6478
6479 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 6479 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
6480 mddev->curr_resync > 2) { 6480 mddev->curr_resync > 2) {
6481 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 6481 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
6482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 6482 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
6483 if (mddev->curr_resync >= mddev->recovery_cp) { 6483 if (mddev->curr_resync >= mddev->recovery_cp) {
6484 printk(KERN_INFO 6484 printk(KERN_INFO
6485 "md: checkpointing %s of %s.\n", 6485 "md: checkpointing %s of %s.\n",
6486 desc, mdname(mddev)); 6486 desc, mdname(mddev));
6487 mddev->recovery_cp = mddev->curr_resync; 6487 mddev->recovery_cp = mddev->curr_resync;
6488 } 6488 }
6489 } else 6489 } else
6490 mddev->recovery_cp = MaxSector; 6490 mddev->recovery_cp = MaxSector;
6491 } else { 6491 } else {
6492 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 6492 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6493 mddev->curr_resync = MaxSector; 6493 mddev->curr_resync = MaxSector;
6494 list_for_each_entry(rdev, &mddev->disks, same_set) 6494 list_for_each_entry(rdev, &mddev->disks, same_set)
6495 if (rdev->raid_disk >= 0 && 6495 if (rdev->raid_disk >= 0 &&
6496 !test_bit(Faulty, &rdev->flags) && 6496 !test_bit(Faulty, &rdev->flags) &&
6497 !test_bit(In_sync, &rdev->flags) && 6497 !test_bit(In_sync, &rdev->flags) &&
6498 rdev->recovery_offset < mddev->curr_resync) 6498 rdev->recovery_offset < mddev->curr_resync)
6499 rdev->recovery_offset = mddev->curr_resync; 6499 rdev->recovery_offset = mddev->curr_resync;
6500 } 6500 }
6501 } 6501 }
6502 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6502 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6503 6503
6504 skip: 6504 skip:
6505 mddev->curr_resync = 0; 6505 mddev->curr_resync = 0;
6506 mddev->curr_resync_completed = 0; 6506 mddev->curr_resync_completed = 0;
6507 mddev->resync_min = 0; 6507 mddev->resync_min = 0;
6508 mddev->resync_max = MaxSector; 6508 mddev->resync_max = MaxSector;
6509 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 6509 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
6510 wake_up(&resync_wait); 6510 wake_up(&resync_wait);
6511 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 6511 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
6512 md_wakeup_thread(mddev->thread); 6512 md_wakeup_thread(mddev->thread);
6513 return; 6513 return;
6514 6514
6515 interrupted: 6515 interrupted:
6516 /* 6516 /*
6517 * got a signal, exit. 6517 * got a signal, exit.
6518 */ 6518 */
6519 printk(KERN_INFO 6519 printk(KERN_INFO
6520 "md: md_do_sync() got signal ... exiting\n"); 6520 "md: md_do_sync() got signal ... exiting\n");
6521 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6521 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6522 goto out; 6522 goto out;
6523 6523
6524 } 6524 }
6525 EXPORT_SYMBOL_GPL(md_do_sync); 6525 EXPORT_SYMBOL_GPL(md_do_sync);
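Editor's note: a stand-alone worked example (made-up numbers) of the currspeed throttle arithmetic near the end of the main loop in md_do_sync() above. Sectors completed since the current mark are halved into KB and divided by the elapsed seconds, with +1 fudge terms to avoid a zero divisor and a zero rate:

	#include <stdio.h>

	int main(void)
	{
		/* 40960 sectors (20 MB) resynced in the 9 seconds since the mark */
		unsigned long io_sectors      = 1000000 + 40960;
		unsigned long resync_mark_cnt = 1000000;
		unsigned long elapsed_seconds = 9;	/* (jiffies - resync_mark) / HZ */

		/* mirrors: ((io_sectors - resync_mark_cnt)/2) / (elapsed + 1) + 1 */
		unsigned long currspeed =
			(io_sectors - resync_mark_cnt) / 2 / (elapsed_seconds + 1) + 1;

		printf("currspeed = %lu KB/sec\n", currspeed);	/* 2049 */
		return 0;
	}

While this figure stays below speed_min the resync presses on regardless of other I/O; once it exceeds speed_min, the loop sleeps 500 ms and retries whenever it is above speed_max or the array is not idle.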
6526 6526
6527 6527
6528 static int remove_and_add_spares(mddev_t *mddev) 6528 static int remove_and_add_spares(mddev_t *mddev)
6529 { 6529 {
6530 mdk_rdev_t *rdev; 6530 mdk_rdev_t *rdev;
6531 int spares = 0; 6531 int spares = 0;
6532 6532
6533 mddev->curr_resync_completed = 0; 6533 mddev->curr_resync_completed = 0;
6534 6534
6535 list_for_each_entry(rdev, &mddev->disks, same_set) 6535 list_for_each_entry(rdev, &mddev->disks, same_set)
6536 if (rdev->raid_disk >= 0 && 6536 if (rdev->raid_disk >= 0 &&
6537 !test_bit(Blocked, &rdev->flags) && 6537 !test_bit(Blocked, &rdev->flags) &&
6538 (test_bit(Faulty, &rdev->flags) || 6538 (test_bit(Faulty, &rdev->flags) ||
6539 ! test_bit(In_sync, &rdev->flags)) && 6539 ! test_bit(In_sync, &rdev->flags)) &&
6540 atomic_read(&rdev->nr_pending)==0) { 6540 atomic_read(&rdev->nr_pending)==0) {
6541 if (mddev->pers->hot_remove_disk( 6541 if (mddev->pers->hot_remove_disk(
6542 mddev, rdev->raid_disk)==0) { 6542 mddev, rdev->raid_disk)==0) {
6543 char nm[20]; 6543 char nm[20];
6544 sprintf(nm,"rd%d", rdev->raid_disk); 6544 sprintf(nm,"rd%d", rdev->raid_disk);
6545 sysfs_remove_link(&mddev->kobj, nm); 6545 sysfs_remove_link(&mddev->kobj, nm);
6546 rdev->raid_disk = -1; 6546 rdev->raid_disk = -1;
6547 } 6547 }
6548 } 6548 }
6549 6549
6550 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) { 6550 if (mddev->degraded && ! mddev->ro && !mddev->recovery_disabled) {
6551 list_for_each_entry(rdev, &mddev->disks, same_set) { 6551 list_for_each_entry(rdev, &mddev->disks, same_set) {
6552 if (rdev->raid_disk >= 0 && 6552 if (rdev->raid_disk >= 0 &&
6553 !test_bit(In_sync, &rdev->flags) && 6553 !test_bit(In_sync, &rdev->flags) &&
6554 !test_bit(Blocked, &rdev->flags)) 6554 !test_bit(Blocked, &rdev->flags))
6555 spares++; 6555 spares++;
6556 if (rdev->raid_disk < 0 6556 if (rdev->raid_disk < 0
6557 && !test_bit(Faulty, &rdev->flags)) { 6557 && !test_bit(Faulty, &rdev->flags)) {
6558 rdev->recovery_offset = 0; 6558 rdev->recovery_offset = 0;
6559 if (mddev->pers-> 6559 if (mddev->pers->
6560 hot_add_disk(mddev, rdev) == 0) { 6560 hot_add_disk(mddev, rdev) == 0) {
6561 char nm[20]; 6561 char nm[20];
6562 sprintf(nm, "rd%d", rdev->raid_disk); 6562 sprintf(nm, "rd%d", rdev->raid_disk);
6563 if (sysfs_create_link(&mddev->kobj, 6563 if (sysfs_create_link(&mddev->kobj,
6564 &rdev->kobj, nm)) 6564 &rdev->kobj, nm))
6565 printk(KERN_WARNING 6565 printk(KERN_WARNING
6566 "md: cannot register " 6566 "md: cannot register "
6567 "%s for %s\n", 6567 "%s for %s\n",
6568 nm, mdname(mddev)); 6568 nm, mdname(mddev));
6569 spares++; 6569 spares++;
6570 md_new_event(mddev); 6570 md_new_event(mddev);
6571 } else 6571 } else
6572 break; 6572 break;
6573 } 6573 }
6574 } 6574 }
6575 } 6575 }
6576 return spares; 6576 return spares;
6577 } 6577 }
6578 /* 6578 /*
6579 * This routine is regularly called by all per-raid-array threads to 6579 * This routine is regularly called by all per-raid-array threads to
6580 * deal with generic issues like resync and super-block update. 6580 * deal with generic issues like resync and super-block update.
6581 * Raid personalities that don't have a thread (linear/raid0) do not 6581 * Raid personalities that don't have a thread (linear/raid0) do not
6582 * need this as they never do any recovery or update the superblock. 6582 * need this as they never do any recovery or update the superblock.
6583 * 6583 *
6584 * It does not do any resync itself, but rather "forks" off other threads 6584 * It does not do any resync itself, but rather "forks" off other threads
6585 * to do that as needed. 6585 * to do that as needed.
6586 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 6586 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
6587 * "->recovery" and create a thread at ->sync_thread. 6587 * "->recovery" and create a thread at ->sync_thread.
6588 * When the thread finishes it sets MD_RECOVERY_DONE 6588 * When the thread finishes it sets MD_RECOVERY_DONE
6589 * and wakes up this thread which will reap the thread and finish up. 6589 * and wakes up this thread which will reap the thread and finish up.
6590 * This thread also removes any faulty devices (with nr_pending == 0). 6590 * This thread also removes any faulty devices (with nr_pending == 0).
6591 * 6591 *
6592 * The overall approach is: 6592 * The overall approach is:
6593 * 1/ if the superblock needs updating, update it. 6593 * 1/ if the superblock needs updating, update it.
6594 * 2/ If a recovery thread is running, don't do anything else. 6594 * 2/ If a recovery thread is running, don't do anything else.
6595 * 3/ If recovery has finished, clean up, possibly marking spares active. 6595 * 3/ If recovery has finished, clean up, possibly marking spares active.
6596 * 4/ If there are any faulty devices, remove them. 6596 * 4/ If there are any faulty devices, remove them.
6597 * 5/ If array is degraded, try to add spare devices 6597 * 5/ If array is degraded, try to add spare devices
6598 * 6/ If array has spares or is not in-sync, start a resync thread. 6598 * 6/ If array has spares or is not in-sync, start a resync thread.
6599 */ 6599 */
6600 void md_check_recovery(mddev_t *mddev) 6600 void md_check_recovery(mddev_t *mddev)
6601 { 6601 {
6602 mdk_rdev_t *rdev; 6602 mdk_rdev_t *rdev;
6603 6603
6604 6604
6605 if (mddev->bitmap) 6605 if (mddev->bitmap)
6606 bitmap_daemon_work(mddev->bitmap); 6606 bitmap_daemon_work(mddev->bitmap);
6607 6607
6608 if (mddev->ro) 6608 if (mddev->ro)
6609 return; 6609 return;
6610 6610
6611 if (signal_pending(current)) { 6611 if (signal_pending(current)) {
6612 if (mddev->pers->sync_request && !mddev->external) { 6612 if (mddev->pers->sync_request && !mddev->external) {
6613 printk(KERN_INFO "md: %s in immediate safe mode\n", 6613 printk(KERN_INFO "md: %s in immediate safe mode\n",
6614 mdname(mddev)); 6614 mdname(mddev));
6615 mddev->safemode = 2; 6615 mddev->safemode = 2;
6616 } 6616 }
6617 flush_signals(current); 6617 flush_signals(current);
6618 } 6618 }
6619 6619
6620 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 6620 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
6621 return; 6621 return;
6622 if ( ! ( 6622 if ( ! (
6623 (mddev->flags && !mddev->external) || 6623 (mddev->flags && !mddev->external) ||
6624 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 6624 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
6625 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 6625 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
6626 (mddev->external == 0 && mddev->safemode == 1) || 6626 (mddev->external == 0 && mddev->safemode == 1) ||
6627 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 6627 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
6628 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 6628 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
6629 )) 6629 ))
6630 return; 6630 return;
6631 6631
6632 if (mddev_trylock(mddev)) { 6632 if (mddev_trylock(mddev)) {
6633 int spares = 0; 6633 int spares = 0;
6634 6634
6635 if (mddev->ro) { 6635 if (mddev->ro) {
6636 /* Only thing we do on a ro array is remove 6636 /* Only thing we do on a ro array is remove
6637 * failed devices. 6637 * failed devices.
6638 */ 6638 */
6639 remove_and_add_spares(mddev); 6639 remove_and_add_spares(mddev);
6640 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6640 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6641 goto unlock; 6641 goto unlock;
6642 } 6642 }
6643 6643
6644 if (!mddev->external) { 6644 if (!mddev->external) {
6645 int did_change = 0; 6645 int did_change = 0;
6646 spin_lock_irq(&mddev->write_lock); 6646 spin_lock_irq(&mddev->write_lock);
6647 if (mddev->safemode && 6647 if (mddev->safemode &&
6648 !atomic_read(&mddev->writes_pending) && 6648 !atomic_read(&mddev->writes_pending) &&
6649 !mddev->in_sync && 6649 !mddev->in_sync &&
6650 mddev->recovery_cp == MaxSector) { 6650 mddev->recovery_cp == MaxSector) {
6651 mddev->in_sync = 1; 6651 mddev->in_sync = 1;
6652 did_change = 1; 6652 did_change = 1;
6653 if (mddev->persistent) 6653 if (mddev->persistent)
6654 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 6654 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
6655 } 6655 }
6656 if (mddev->safemode == 1) 6656 if (mddev->safemode == 1)
6657 mddev->safemode = 0; 6657 mddev->safemode = 0;
6658 spin_unlock_irq(&mddev->write_lock); 6658 spin_unlock_irq(&mddev->write_lock);
6659 if (did_change) 6659 if (did_change)
6660 sysfs_notify_dirent(mddev->sysfs_state); 6660 sysfs_notify_dirent(mddev->sysfs_state);
6661 } 6661 }
6662 6662
6663 if (mddev->flags) 6663 if (mddev->flags)
6664 md_update_sb(mddev, 0); 6664 md_update_sb(mddev, 0);
6665 6665
6666 list_for_each_entry(rdev, &mddev->disks, same_set) 6666 list_for_each_entry(rdev, &mddev->disks, same_set)
6667 if (test_and_clear_bit(StateChanged, &rdev->flags)) 6667 if (test_and_clear_bit(StateChanged, &rdev->flags))
6668 sysfs_notify_dirent(rdev->sysfs_state); 6668 sysfs_notify_dirent(rdev->sysfs_state);
6669 6669
6670 6670
6671 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 6671 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
6672 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 6672 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
6673 /* resync/recovery still happening */ 6673 /* resync/recovery still happening */
6674 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6674 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6675 goto unlock; 6675 goto unlock;
6676 } 6676 }
6677 if (mddev->sync_thread) { 6677 if (mddev->sync_thread) {
6678 /* resync has finished, collect result */ 6678 /* resync has finished, collect result */
6679 md_unregister_thread(mddev->sync_thread); 6679 md_unregister_thread(mddev->sync_thread);
6680 mddev->sync_thread = NULL; 6680 mddev->sync_thread = NULL;
6681 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 6681 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
6682 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 6682 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
6683 /* success...*/ 6683 /* success...*/
6684 /* activate any spares */ 6684 /* activate any spares */
6685 if (mddev->pers->spare_active(mddev)) 6685 if (mddev->pers->spare_active(mddev))
6686 sysfs_notify(&mddev->kobj, NULL, 6686 sysfs_notify(&mddev->kobj, NULL,
6687 "degraded"); 6687 "degraded");
6688 } 6688 }
6689 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 6689 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
6690 mddev->pers->finish_reshape) 6690 mddev->pers->finish_reshape)
6691 mddev->pers->finish_reshape(mddev); 6691 mddev->pers->finish_reshape(mddev);
6692 md_update_sb(mddev, 1); 6692 md_update_sb(mddev, 1);
6693 6693
6694 /* if array is no-longer degraded, then any saved_raid_disk 6694 /* if array is no-longer degraded, then any saved_raid_disk
6695 * information must be scrapped 6695 * information must be scrapped
6696 */ 6696 */
6697 if (!mddev->degraded) 6697 if (!mddev->degraded)
6698 list_for_each_entry(rdev, &mddev->disks, same_set) 6698 list_for_each_entry(rdev, &mddev->disks, same_set)
6699 rdev->saved_raid_disk = -1; 6699 rdev->saved_raid_disk = -1;
6700 6700
6701 mddev->recovery = 0; 6701 mddev->recovery = 0;
6702 /* flag recovery needed just to double check */ 6702 /* flag recovery needed just to double check */
6703 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6703 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6704 sysfs_notify_dirent(mddev->sysfs_action); 6704 sysfs_notify_dirent(mddev->sysfs_action);
6705 md_new_event(mddev); 6705 md_new_event(mddev);
6706 goto unlock; 6706 goto unlock;
6707 } 6707 }
6708 /* Set RUNNING before clearing NEEDED to avoid 6708 /* Set RUNNING before clearing NEEDED to avoid
6709 * any transients in the value of "sync_action". 6709 * any transients in the value of "sync_action".
6710 */ 6710 */
6711 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6711 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6712 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6712 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6713 /* Clear some bits that don't mean anything, but 6713 /* Clear some bits that don't mean anything, but
6714 * might be left set 6714 * might be left set
6715 */ 6715 */
6716 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 6716 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
6717 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 6717 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
6718 6718
6719 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 6719 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
6720 goto unlock; 6720 goto unlock;
6721 /* no recovery is running. 6721 /* no recovery is running.
6722 * remove any failed drives, then 6722 * remove any failed drives, then
6723 * add spares if possible. 6723 * add spares if possible.
6724 		 * Spares are also removed and re-added, to allow 6724 		 * Spares are also removed and re-added, to allow
6725 * the personality to fail the re-add. 6725 * the personality to fail the re-add.
6726 */ 6726 */
6727 6727
6728 if (mddev->reshape_position != MaxSector) { 6728 if (mddev->reshape_position != MaxSector) {
6729 if (mddev->pers->check_reshape == NULL || 6729 if (mddev->pers->check_reshape == NULL ||
6730 mddev->pers->check_reshape(mddev) != 0) 6730 mddev->pers->check_reshape(mddev) != 0)
6731 /* Cannot proceed */ 6731 /* Cannot proceed */
6732 goto unlock; 6732 goto unlock;
6733 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 6733 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
6734 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6734 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6735 } else if ((spares = remove_and_add_spares(mddev))) { 6735 } else if ((spares = remove_and_add_spares(mddev))) {
6736 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6736 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6737 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 6737 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
6738 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 6738 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
6739 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6739 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6740 } else if (mddev->recovery_cp < MaxSector) { 6740 } else if (mddev->recovery_cp < MaxSector) {
6741 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 6741 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
6742 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6742 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6743 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 6743 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
6744 /* nothing to be done ... */ 6744 /* nothing to be done ... */
6745 goto unlock; 6745 goto unlock;
6746 6746
6747 if (mddev->pers->sync_request) { 6747 if (mddev->pers->sync_request) {
6748 if (spares && mddev->bitmap && ! mddev->bitmap->file) { 6748 if (spares && mddev->bitmap && ! mddev->bitmap->file) {
6749 /* We are adding a device or devices to an array 6749 /* We are adding a device or devices to an array
6750 * which has the bitmap stored on all devices. 6750 * which has the bitmap stored on all devices.
6751 * So make sure all bitmap pages get written 6751 * So make sure all bitmap pages get written
6752 */ 6752 */
6753 bitmap_write_all(mddev->bitmap); 6753 bitmap_write_all(mddev->bitmap);
6754 } 6754 }
6755 mddev->sync_thread = md_register_thread(md_do_sync, 6755 mddev->sync_thread = md_register_thread(md_do_sync,
6756 mddev, 6756 mddev,
6757 "resync"); 6757 "resync");
6758 if (!mddev->sync_thread) { 6758 if (!mddev->sync_thread) {
6759 printk(KERN_ERR "%s: could not start resync" 6759 printk(KERN_ERR "%s: could not start resync"
6760 " thread...\n", 6760 " thread...\n",
6761 mdname(mddev)); 6761 mdname(mddev));
6762 /* leave the spares where they are, it shouldn't hurt */ 6762 /* leave the spares where they are, it shouldn't hurt */
6763 mddev->recovery = 0; 6763 mddev->recovery = 0;
6764 } else 6764 } else
6765 md_wakeup_thread(mddev->sync_thread); 6765 md_wakeup_thread(mddev->sync_thread);
6766 sysfs_notify_dirent(mddev->sysfs_action); 6766 sysfs_notify_dirent(mddev->sysfs_action);
6767 md_new_event(mddev); 6767 md_new_event(mddev);
6768 } 6768 }
6769 unlock: 6769 unlock:
6770 if (!mddev->sync_thread) { 6770 if (!mddev->sync_thread) {
6771 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 6771 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
6772 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 6772 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
6773 &mddev->recovery)) 6773 &mddev->recovery))
6774 if (mddev->sysfs_action) 6774 if (mddev->sysfs_action)
6775 sysfs_notify_dirent(mddev->sysfs_action); 6775 sysfs_notify_dirent(mddev->sysfs_action);
6776 } 6776 }
6777 mddev_unlock(mddev); 6777 mddev_unlock(mddev);
6778 } 6778 }
6779 } 6779 }
6780 6780
6781 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) 6781 void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
6782 { 6782 {
6783 sysfs_notify_dirent(rdev->sysfs_state); 6783 sysfs_notify_dirent(rdev->sysfs_state);
6784 wait_event_timeout(rdev->blocked_wait, 6784 wait_event_timeout(rdev->blocked_wait,
6785 !test_bit(Blocked, &rdev->flags), 6785 !test_bit(Blocked, &rdev->flags),
6786 msecs_to_jiffies(5000)); 6786 msecs_to_jiffies(5000));
6787 rdev_dec_pending(rdev, mddev); 6787 rdev_dec_pending(rdev, mddev);
6788 } 6788 }
6789 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 6789 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
6790 6790
6791 static int md_notify_reboot(struct notifier_block *this, 6791 static int md_notify_reboot(struct notifier_block *this,
6792 unsigned long code, void *x) 6792 unsigned long code, void *x)
6793 { 6793 {
6794 struct list_head *tmp; 6794 struct list_head *tmp;
6795 mddev_t *mddev; 6795 mddev_t *mddev;
6796 6796
6797 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 6797 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
6798 6798
6799 printk(KERN_INFO "md: stopping all md devices.\n"); 6799 printk(KERN_INFO "md: stopping all md devices.\n");
6800 6800
6801 for_each_mddev(mddev, tmp) 6801 for_each_mddev(mddev, tmp)
6802 if (mddev_trylock(mddev)) { 6802 if (mddev_trylock(mddev)) {
6803 			/* Force a switch to readonly even if the array 6803 			/* Force a switch to readonly even if the array
6804 			 * appears to still be in use. Hence 6804 			 * appears to still be in use. Hence
6805 * the '100'. 6805 * the '100'.
6806 */ 6806 */
6807 do_md_stop(mddev, 1, 100); 6807 do_md_stop(mddev, 1, 100);
6808 mddev_unlock(mddev); 6808 mddev_unlock(mddev);
6809 } 6809 }
6810 /* 6810 /*
6811 * certain more exotic SCSI devices are known to be 6811 * certain more exotic SCSI devices are known to be
6812 		 * volatile with respect to overly-early system reboots. While the 6812 		 * volatile with respect to overly-early system reboots. While the
6813 * right place to handle this issue is the given 6813 * right place to handle this issue is the given
6814 * driver, we do want to have a safe RAID driver ... 6814 * driver, we do want to have a safe RAID driver ...
6815 */ 6815 */
6816 mdelay(1000*1); 6816 mdelay(1000*1);
6817 } 6817 }
6818 return NOTIFY_DONE; 6818 return NOTIFY_DONE;
6819 } 6819 }
6820 6820
6821 static struct notifier_block md_notifier = { 6821 static struct notifier_block md_notifier = {
6822 .notifier_call = md_notify_reboot, 6822 .notifier_call = md_notify_reboot,
6823 .next = NULL, 6823 .next = NULL,
6824 .priority = INT_MAX, /* before any real devices */ 6824 .priority = INT_MAX, /* before any real devices */
6825 }; 6825 };
6826 6826
6827 static void md_geninit(void) 6827 static void md_geninit(void)
6828 { 6828 {
6829 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 6829 dprintk("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
6830 6830
6831 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 6831 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
6832 } 6832 }
6833 6833
6834 static int __init md_init(void) 6834 static int __init md_init(void)
6835 { 6835 {
6836 if (register_blkdev(MD_MAJOR, "md")) 6836 if (register_blkdev(MD_MAJOR, "md"))
6837 return -1; 6837 return -1;
6838 if ((mdp_major=register_blkdev(0, "mdp"))<=0) { 6838 if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
6839 unregister_blkdev(MD_MAJOR, "md"); 6839 unregister_blkdev(MD_MAJOR, "md");
6840 return -1; 6840 return -1;
6841 } 6841 }
6842 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 6842 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
6843 md_probe, NULL, NULL); 6843 md_probe, NULL, NULL);
6844 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 6844 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
6845 md_probe, NULL, NULL); 6845 md_probe, NULL, NULL);
6846 6846
6847 register_reboot_notifier(&md_notifier); 6847 register_reboot_notifier(&md_notifier);
6848 raid_table_header = register_sysctl_table(raid_root_table); 6848 raid_table_header = register_sysctl_table(raid_root_table);
6849 6849
6850 md_geninit(); 6850 md_geninit();
6851 return 0; 6851 return 0;
6852 } 6852 }
6853 6853
6854 6854
6855 #ifndef MODULE 6855 #ifndef MODULE
6856 6856
6857 /* 6857 /*
6858 * Searches all registered partitions for autorun RAID arrays 6858 * Searches all registered partitions for autorun RAID arrays
6859 * at boot time. 6859 * at boot time.
6860 */ 6860 */
6861 6861
6862 static LIST_HEAD(all_detected_devices); 6862 static LIST_HEAD(all_detected_devices);
6863 struct detected_devices_node { 6863 struct detected_devices_node {
6864 struct list_head list; 6864 struct list_head list;
6865 dev_t dev; 6865 dev_t dev;
6866 }; 6866 };
6867 6867
6868 void md_autodetect_dev(dev_t dev) 6868 void md_autodetect_dev(dev_t dev)
6869 { 6869 {
6870 struct detected_devices_node *node_detected_dev; 6870 struct detected_devices_node *node_detected_dev;
6871 6871
6872 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 6872 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
6873 if (node_detected_dev) { 6873 if (node_detected_dev) {
6874 node_detected_dev->dev = dev; 6874 node_detected_dev->dev = dev;
6875 list_add_tail(&node_detected_dev->list, &all_detected_devices); 6875 list_add_tail(&node_detected_dev->list, &all_detected_devices);
6876 } else { 6876 } else {
6877 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 6877 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
6878 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 6878 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
6879 } 6879 }
6880 } 6880 }
6881 6881
6882 6882
6883 static void autostart_arrays(int part) 6883 static void autostart_arrays(int part)
6884 { 6884 {
6885 mdk_rdev_t *rdev; 6885 mdk_rdev_t *rdev;
6886 struct detected_devices_node *node_detected_dev; 6886 struct detected_devices_node *node_detected_dev;
6887 dev_t dev; 6887 dev_t dev;
6888 int i_scanned, i_passed; 6888 int i_scanned, i_passed;
6889 6889
6890 i_scanned = 0; 6890 i_scanned = 0;
6891 i_passed = 0; 6891 i_passed = 0;
6892 6892
6893 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 6893 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
6894 6894
6895 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 6895 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
6896 i_scanned++; 6896 i_scanned++;
6897 node_detected_dev = list_entry(all_detected_devices.next, 6897 node_detected_dev = list_entry(all_detected_devices.next,
6898 struct detected_devices_node, list); 6898 struct detected_devices_node, list);
6899 list_del(&node_detected_dev->list); 6899 list_del(&node_detected_dev->list);
6900 dev = node_detected_dev->dev; 6900 dev = node_detected_dev->dev;
6901 kfree(node_detected_dev); 6901 kfree(node_detected_dev);
6902 rdev = md_import_device(dev,0, 90); 6902 rdev = md_import_device(dev,0, 90);
6903 if (IS_ERR(rdev)) 6903 if (IS_ERR(rdev))
6904 continue; 6904 continue;
6905 6905
6906 if (test_bit(Faulty, &rdev->flags)) { 6906 if (test_bit(Faulty, &rdev->flags)) {
6907 MD_BUG(); 6907 MD_BUG();
6908 continue; 6908 continue;
6909 } 6909 }
6910 set_bit(AutoDetected, &rdev->flags); 6910 set_bit(AutoDetected, &rdev->flags);
6911 list_add(&rdev->same_set, &pending_raid_disks); 6911 list_add(&rdev->same_set, &pending_raid_disks);
6912 i_passed++; 6912 i_passed++;
6913 } 6913 }
6914 6914
6915 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 6915 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
6916 i_scanned, i_passed); 6916 i_scanned, i_passed);
6917 6917
6918 autorun_devices(part); 6918 autorun_devices(part);
6919 } 6919 }
6920 6920
6921 #endif /* !MODULE */ 6921 #endif /* !MODULE */
6922 6922
6923 static __exit void md_exit(void) 6923 static __exit void md_exit(void)
6924 { 6924 {
6925 mddev_t *mddev; 6925 mddev_t *mddev;
6926 struct list_head *tmp; 6926 struct list_head *tmp;
6927 6927
6928 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 6928 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
6929 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 6929 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
6930 6930
6931 unregister_blkdev(MD_MAJOR,"md"); 6931 unregister_blkdev(MD_MAJOR,"md");
6932 unregister_blkdev(mdp_major, "mdp"); 6932 unregister_blkdev(mdp_major, "mdp");
6933 unregister_reboot_notifier(&md_notifier); 6933 unregister_reboot_notifier(&md_notifier);
6934 unregister_sysctl_table(raid_table_header); 6934 unregister_sysctl_table(raid_table_header);
6935 remove_proc_entry("mdstat", NULL); 6935 remove_proc_entry("mdstat", NULL);
6936 for_each_mddev(mddev, tmp) { 6936 for_each_mddev(mddev, tmp) {
6937 export_array(mddev); 6937 export_array(mddev);
6938 mddev->hold_active = 0; 6938 mddev->hold_active = 0;
6939 } 6939 }
6940 } 6940 }
6941 6941
6942 subsys_initcall(md_init); 6942 subsys_initcall(md_init);
6943 module_exit(md_exit) 6943 module_exit(md_exit)
6944 6944
6945 static int get_ro(char *buffer, struct kernel_param *kp) 6945 static int get_ro(char *buffer, struct kernel_param *kp)
6946 { 6946 {
6947 return sprintf(buffer, "%d", start_readonly); 6947 return sprintf(buffer, "%d", start_readonly);
6948 } 6948 }
6949 static int set_ro(const char *val, struct kernel_param *kp) 6949 static int set_ro(const char *val, struct kernel_param *kp)
6950 { 6950 {
6951 char *e; 6951 char *e;
6952 int num = simple_strtoul(val, &e, 10); 6952 int num = simple_strtoul(val, &e, 10);
6953 if (*val && (*e == '\0' || *e == '\n')) { 6953 if (*val && (*e == '\0' || *e == '\n')) {
6954 start_readonly = num; 6954 start_readonly = num;
6955 return 0; 6955 return 0;
6956 } 6956 }
6957 return -EINVAL; 6957 return -EINVAL;
6958 } 6958 }
6959 6959
6960 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 6960 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
6961 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 6961 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
6962 6962
6963 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 6963 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
6964 6964
6965 EXPORT_SYMBOL(register_md_personality); 6965 EXPORT_SYMBOL(register_md_personality);
6966 EXPORT_SYMBOL(unregister_md_personality); 6966 EXPORT_SYMBOL(unregister_md_personality);
6967 EXPORT_SYMBOL(md_error); 6967 EXPORT_SYMBOL(md_error);
6968 EXPORT_SYMBOL(md_done_sync); 6968 EXPORT_SYMBOL(md_done_sync);
6969 EXPORT_SYMBOL(md_write_start); 6969 EXPORT_SYMBOL(md_write_start);
6970 EXPORT_SYMBOL(md_write_end); 6970 EXPORT_SYMBOL(md_write_end);
6971 EXPORT_SYMBOL(md_register_thread); 6971 EXPORT_SYMBOL(md_register_thread);
6972 EXPORT_SYMBOL(md_unregister_thread); 6972 EXPORT_SYMBOL(md_unregister_thread);
6973 EXPORT_SYMBOL(md_wakeup_thread); 6973 EXPORT_SYMBOL(md_wakeup_thread);
6974 EXPORT_SYMBOL(md_check_recovery); 6974 EXPORT_SYMBOL(md_check_recovery);
6975 MODULE_LICENSE("GPL"); 6975 MODULE_LICENSE("GPL");
6976 MODULE_ALIAS("md"); 6976 MODULE_ALIAS("md");
6977 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 6977 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
6978 6978
1 /* 1 /*
2 * raid5.c : Multiple Devices driver for Linux 2 * raid5.c : Multiple Devices driver for Linux
3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman 3 * Copyright (C) 1996, 1997 Ingo Molnar, Miguel de Icaza, Gadi Oxman
4 * Copyright (C) 1999, 2000 Ingo Molnar 4 * Copyright (C) 1999, 2000 Ingo Molnar
5 * Copyright (C) 2002, 2003 H. Peter Anvin 5 * Copyright (C) 2002, 2003 H. Peter Anvin
6 * 6 *
7 * RAID-4/5/6 management functions. 7 * RAID-4/5/6 management functions.
8 * Thanks to Penguin Computing for making the RAID-6 development possible 8 * Thanks to Penguin Computing for making the RAID-6 development possible
9 * by donating a test server! 9 * by donating a test server!
10 * 10 *
11 * This program is free software; you can redistribute it and/or modify 11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by 12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2, or (at your option) 13 * the Free Software Foundation; either version 2, or (at your option)
14 * any later version. 14 * any later version.
15 * 15 *
16 * You should have received a copy of the GNU General Public License 16 * You should have received a copy of the GNU General Public License
17 * (for example /usr/src/linux/COPYING); if not, write to the Free 17 * (for example /usr/src/linux/COPYING); if not, write to the Free
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21 /* 21 /*
22 * BITMAP UNPLUGGING: 22 * BITMAP UNPLUGGING:
23 * 23 *
24 * The sequencing for updating the bitmap reliably is a little 24 * The sequencing for updating the bitmap reliably is a little
25 * subtle (and I got it wrong the first time) so it deserves some 25 * subtle (and I got it wrong the first time) so it deserves some
26 * explanation. 26 * explanation.
27 * 27 *
28 * We group bitmap updates into batches. Each batch has a number. 28 * We group bitmap updates into batches. Each batch has a number.
29 * We may write out several batches at once, but that isn't very important. 29 * We may write out several batches at once, but that isn't very important.
30 * conf->bm_write is the number of the last batch successfully written. 30 * conf->bm_write is the number of the last batch successfully written.
31 * conf->bm_flush is the number of the last batch that was closed to 31 * conf->bm_flush is the number of the last batch that was closed to
32 * new additions. 32 * new additions.
33 * When we discover that we will need to write to any block in a stripe 33 * When we discover that we will need to write to any block in a stripe
34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq 34 * (in add_stripe_bio) we update the in-memory bitmap and record in sh->bm_seq
35 * the number of the batch it will be in. This is bm_flush+1. 35 * the number of the batch it will be in. This is bm_flush+1.
36 * When we are ready to do a write, if that batch hasn't been written yet, 36 * When we are ready to do a write, if that batch hasn't been written yet,
37 * we plug the array and queue the stripe for later. 37 * we plug the array and queue the stripe for later.
38 * When an unplug happens, we increment bm_flush, thus closing the current 38 * When an unplug happens, we increment bm_flush, thus closing the current
39 * batch. 39 * batch.
40 * When we notice that bm_flush > bm_write, we write out all pending updates 40 * When we notice that bm_flush > bm_write, we write out all pending updates
41 * to the bitmap, and advance bm_write to where bm_flush was. 41 * to the bitmap, and advance bm_write to where bm_flush was.
42 * This may occasionally write a bit out twice, but is sure never to 42 * This may occasionally write a bit out twice, but is sure never to
43 * miss any bits. 43 * miss any bits.
44 */ 44 */
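
To make the batch arithmetic above easier to follow, here is a minimal userspace sketch of the same rule: bm_write, bm_flush and the per-stripe bm_seq are plain counters, batch_close() plays the role of an unplug, and stripe_can_write() is the deferral test. The names and the main() sequence are illustrative only, not the driver's API.

#include <stdio.h>

static unsigned bm_write;   /* last batch fully written to the bitmap */
static unsigned bm_flush;   /* last batch closed to new additions     */

/* a stripe touched while batch bm_flush is still open lands in bm_flush + 1 */
static unsigned record_stripe(void) { return bm_flush + 1; }

/* unplug: close the current batch */
static void batch_close(void) { bm_flush++; }

/* the deferral test: the stripe's batch must already be on disk */
static int stripe_can_write(unsigned bm_seq) { return bm_seq <= bm_write; }

/* once bm_flush has moved past bm_write, flush pending bitmap updates */
static void maybe_write_bitmap(void)
{
        if (bm_flush > bm_write) {
                printf("writing bitmap batches %u..%u\n", bm_write + 1, bm_flush);
                bm_write = bm_flush;
        }
}

int main(void)
{
        unsigned seq = record_stripe();          /* stripe joins batch 1 */
        printf("can write yet? %d\n", stripe_can_write(seq)); /* 0: deferred */
        batch_close();                           /* unplug closes batch 1 */
        maybe_write_bitmap();                    /* bitmap written, bm_write = 1 */
        printf("can write now? %d\n", stripe_can_write(seq)); /* 1 */
        return 0;
}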
45 45
46 #include <linux/blkdev.h> 46 #include <linux/blkdev.h>
47 #include <linux/kthread.h> 47 #include <linux/kthread.h>
48 #include <linux/raid/pq.h> 48 #include <linux/raid/pq.h>
49 #include <linux/async_tx.h> 49 #include <linux/async_tx.h>
50 #include <linux/async.h> 50 #include <linux/async.h>
51 #include <linux/seq_file.h> 51 #include <linux/seq_file.h>
52 #include <linux/cpu.h> 52 #include <linux/cpu.h>
53 #include "md.h" 53 #include "md.h"
54 #include "raid5.h" 54 #include "raid5.h"
55 #include "bitmap.h" 55 #include "bitmap.h"
56 56
57 /* 57 /*
58 * Stripe cache 58 * Stripe cache
59 */ 59 */
60 60
61 #define NR_STRIPES 256 61 #define NR_STRIPES 256
62 #define STRIPE_SIZE PAGE_SIZE 62 #define STRIPE_SIZE PAGE_SIZE
63 #define STRIPE_SHIFT (PAGE_SHIFT - 9) 63 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
64 #define STRIPE_SECTORS (STRIPE_SIZE>>9) 64 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
65 #define IO_THRESHOLD 1 65 #define IO_THRESHOLD 1
66 #define BYPASS_THRESHOLD 1 66 #define BYPASS_THRESHOLD 1
67 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) 67 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
68 #define HASH_MASK (NR_HASH - 1) 68 #define HASH_MASK (NR_HASH - 1)
69 69
70 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK])) 70 #define stripe_hash(conf, sect) (&((conf)->stripe_hashtbl[((sect) >> STRIPE_SHIFT) & HASH_MASK]))
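
The stripe-cache constants above all derive from the page size. As a rough worked example, assuming a 4K page (PAGE_SHIFT of 12, as on x86) and modelling hlist_head as a single pointer, this standalone program reproduces the arithmetic, including the bucket computation done by stripe_hash(); the sector value is made up.

#include <stdio.h>

/* assuming a 4K page (PAGE_SHIFT = 12); other architectures differ */
#define PAGE_SHIFT      12
#define PAGE_SIZE       (1UL << PAGE_SHIFT)

#define STRIPE_SIZE     PAGE_SIZE
#define STRIPE_SHIFT    (PAGE_SHIFT - 9)                /* 512-byte sectors per page, as a shift */
#define STRIPE_SECTORS  (STRIPE_SIZE >> 9)              /* 8 sectors per stripe unit */
#define NR_HASH         (PAGE_SIZE / sizeof(void *))    /* hlist_head modelled as one pointer */
#define HASH_MASK       (NR_HASH - 1)

int main(void)
{
        unsigned long long sect = 12345;

        printf("STRIPE_SECTORS = %lu, NR_HASH = %lu\n",
               (unsigned long)STRIPE_SECTORS, (unsigned long)NR_HASH);
        /* same arithmetic as the stripe_hash() macro: sector -> bucket index */
        printf("sector %llu hashes to bucket %llu\n",
               sect, (sect >> STRIPE_SHIFT) & HASH_MASK);
        return 0;
}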
71 71
72 /* bio's attached to a stripe+device for I/O are linked together in bi_sector 72 /* bio's attached to a stripe+device for I/O are linked together in bi_sector
73 * order without overlap. There may be several bio's per stripe+device, and 73 * order without overlap. There may be several bio's per stripe+device, and
74 * a bio could span several devices. 74 * a bio could span several devices.
75 * When walking this list for a particular stripe+device, we must never proceed 75 * When walking this list for a particular stripe+device, we must never proceed
76 * beyond a bio that extends past this device, as the next bio might no longer 76 * beyond a bio that extends past this device, as the next bio might no longer
77 * be valid. 77 * be valid.
78 * This macro is used to determine the 'next' bio in the list, given the sector 78 * This macro is used to determine the 'next' bio in the list, given the sector
79 * of the current stripe+device 79 * of the current stripe+device
80 */ 80 */
81 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL) 81 #define r5_next_bio(bio, sect) ( ( (bio)->bi_sector + ((bio)->bi_size>>9) < sect + STRIPE_SECTORS) ? (bio)->bi_next : NULL)
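
A small standalone sketch of the walk that r5_next_bio() bounds: toy_bio carries only the three fields the macro touches, dev_sector marks where this stripe+device window starts, and STRIPE_SECTORS is taken as 8 (4K pages). The values are invented; the point is that the walk stops as soon as a bio no longer ends inside the window.

#include <stdio.h>
#include <stddef.h>

#define STRIPE_SECTORS 8        /* assuming 4K pages, as above */

/* toy stand-in for the handful of struct bio fields the macro touches */
struct toy_bio {
        unsigned long long bi_sector;   /* start sector */
        unsigned int bi_size;           /* size in bytes */
        struct toy_bio *bi_next;
};

/* same test as r5_next_bio(): only follow ->bi_next while the current bio
 * ends inside this stripe+device's STRIPE_SECTORS window */
static struct toy_bio *next_bio(struct toy_bio *bio, unsigned long long sect)
{
        if (bio->bi_sector + (bio->bi_size >> 9) < sect + STRIPE_SECTORS)
                return bio->bi_next;
        return NULL;
}

int main(void)
{
        struct toy_bio c = { 16, 4096, NULL };  /* starts beyond the window */
        struct toy_bio b = { 12, 2048, &c };
        struct toy_bio a = { 10, 1024, &b };
        unsigned long long dev_sector = 8;      /* this stripe+device starts here */

        for (struct toy_bio *bio = &a; bio; bio = next_bio(bio, dev_sector))
                printf("bio at sector %llu, %u bytes\n", bio->bi_sector, bio->bi_size);
        return 0;
}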
82 /* 82 /*
83 * The following can be used to debug the driver 83 * The following can be used to debug the driver
84 */ 84 */
85 #define RAID5_PARANOIA 1 85 #define RAID5_PARANOIA 1
86 #if RAID5_PARANOIA && defined(CONFIG_SMP) 86 #if RAID5_PARANOIA && defined(CONFIG_SMP)
87 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock) 87 # define CHECK_DEVLOCK() assert_spin_locked(&conf->device_lock)
88 #else 88 #else
89 # define CHECK_DEVLOCK() 89 # define CHECK_DEVLOCK()
90 #endif 90 #endif
91 91
92 #ifdef DEBUG 92 #ifdef DEBUG
93 #define inline 93 #define inline
94 #define __inline__ 94 #define __inline__
95 #endif 95 #endif
96 96
97 #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args))) 97 #define printk_rl(args...) ((void) (printk_ratelimit() && printk(args)))
98 98
99 /* 99 /*
100 * We maintain a biased count of active stripes in the bottom 16 bits of 100 * We maintain a biased count of active stripes in the bottom 16 bits of
101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits 101 * bi_phys_segments, and a count of processed stripes in the upper 16 bits
102 */ 102 */
103 static inline int raid5_bi_phys_segments(struct bio *bio) 103 static inline int raid5_bi_phys_segments(struct bio *bio)
104 { 104 {
105 return bio->bi_phys_segments & 0xffff; 105 return bio->bi_phys_segments & 0xffff;
106 } 106 }
107 107
108 static inline int raid5_bi_hw_segments(struct bio *bio) 108 static inline int raid5_bi_hw_segments(struct bio *bio)
109 { 109 {
110 return (bio->bi_phys_segments >> 16) & 0xffff; 110 return (bio->bi_phys_segments >> 16) & 0xffff;
111 } 111 }
112 112
113 static inline int raid5_dec_bi_phys_segments(struct bio *bio) 113 static inline int raid5_dec_bi_phys_segments(struct bio *bio)
114 { 114 {
115 --bio->bi_phys_segments; 115 --bio->bi_phys_segments;
116 return raid5_bi_phys_segments(bio); 116 return raid5_bi_phys_segments(bio);
117 } 117 }
118 118
119 static inline int raid5_dec_bi_hw_segments(struct bio *bio) 119 static inline int raid5_dec_bi_hw_segments(struct bio *bio)
120 { 120 {
121 unsigned short val = raid5_bi_hw_segments(bio); 121 unsigned short val = raid5_bi_hw_segments(bio);
122 122
123 --val; 123 --val;
124 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio); 124 bio->bi_phys_segments = (val << 16) | raid5_bi_phys_segments(bio);
125 return val; 125 return val;
126 } 126 }
127 127
128 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt) 128 static inline void raid5_set_bi_hw_segments(struct bio *bio, unsigned int cnt)
129 { 129 {
130 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16); 130 bio->bi_phys_segments = raid5_bi_phys_segments(bio) | (cnt << 16);
131 } 131 }
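
The 16/16 packing described above can be exercised on its own. This sketch uses a plain unsigned int in place of bi_phys_segments; the helper names mirror the ones above but are illustrative only.

#include <stdio.h>

/* packed counter: active-stripe count in bits 0..15, processed count in bits 16..31 */
static unsigned int phys(unsigned int v) { return v & 0xffff; }
static unsigned int hw(unsigned int v)   { return (v >> 16) & 0xffff; }
static unsigned int set_hw(unsigned int v, unsigned int cnt)
{
        return phys(v) | (cnt << 16);
}

int main(void)
{
        unsigned int segs = 3;          /* three active stripes, none processed */

        segs = set_hw(segs, 5);         /* record five processed stripes */
        printf("active=%u processed=%u\n", phys(segs), hw(segs)); /* 3 and 5 */
        segs--;                         /* one stripe completed, as the dec helper does */
        printf("active=%u processed=%u\n", phys(segs), hw(segs)); /* 2 and 5 */
        return 0;
}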
132 132
133 /* Find first data disk in a raid6 stripe */ 133 /* Find first data disk in a raid6 stripe */
134 static inline int raid6_d0(struct stripe_head *sh) 134 static inline int raid6_d0(struct stripe_head *sh)
135 { 135 {
136 if (sh->ddf_layout) 136 if (sh->ddf_layout)
137 /* ddf always starts from the first device */ 137 /* ddf always starts from the first device */
138 return 0; 138 return 0;
139 /* md starts just after Q block */ 139 /* md starts just after Q block */
140 if (sh->qd_idx == sh->disks - 1) 140 if (sh->qd_idx == sh->disks - 1)
141 return 0; 141 return 0;
142 else 142 else
143 return sh->qd_idx + 1; 143 return sh->qd_idx + 1;
144 } 144 }
145 static inline int raid6_next_disk(int disk, int raid_disks) 145 static inline int raid6_next_disk(int disk, int raid_disks)
146 { 146 {
147 disk++; 147 disk++;
148 return (disk < raid_disks) ? disk : 0; 148 return (disk < raid_disks) ? disk : 0;
149 } 149 }
150 150
151 /* When walking through the disks in a raid5, starting at raid6_d0, 151 /* When walking through the disks in a raid5, starting at raid6_d0,
152 * we need to map each disk to a 'slot', where the data disks are slot 152 * we need to map each disk to a 'slot', where the data disks are slot
153 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk 153 * 0 .. raid_disks-3, the parity disk is raid_disks-2 and the Q disk
154 * is raid_disks-1. This helper does that mapping. 154 * is raid_disks-1. This helper does that mapping.
155 */ 155 */
156 static int raid6_idx_to_slot(int idx, struct stripe_head *sh, 156 static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
157 int *count, int syndrome_disks) 157 int *count, int syndrome_disks)
158 { 158 {
159 int slot; 159 int slot;
160 160
161 if (sh->ddf_layout) 161 if (sh->ddf_layout)
162 slot = (*count)++; 162 slot = (*count)++;
163 if (idx == sh->pd_idx) 163 if (idx == sh->pd_idx)
164 return syndrome_disks; 164 return syndrome_disks;
165 if (idx == sh->qd_idx) 165 if (idx == sh->qd_idx)
166 return syndrome_disks + 1; 166 return syndrome_disks + 1;
167 if (!sh->ddf_layout) 167 if (!sh->ddf_layout)
168 slot = (*count)++; 168 slot = (*count)++;
169 return slot; 169 return slot;
170 } 170 }
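
A worked example of that mapping, for an assumed 5-device array in the md (non-DDF) layout with P on device 3 and Q on device 4. The three helpers mirror raid6_d0(), raid6_next_disk() and the non-DDF branch of raid6_idx_to_slot(); the device count and parity positions are made up for illustration.

#include <stdio.h>

/* toy view of a stripe: 5 devices, md (non-DDF) layout, P on 3, Q on 4 */
static const int disks = 5, pd_idx = 3, qd_idx = 4;

static int d0(void)                     /* first data disk: just after Q */
{
        return (qd_idx == disks - 1) ? 0 : qd_idx + 1;
}

static int next_disk(int d) { return (d + 1 < disks) ? d + 1 : 0; }

static int idx_to_slot(int idx, int *count, int syndrome_disks)
{
        if (idx == pd_idx)
                return syndrome_disks;          /* P slot */
        if (idx == qd_idx)
                return syndrome_disks + 1;      /* Q slot */
        return (*count)++;                      /* data disks fill 0..syndrome_disks-1 */
}

int main(void)
{
        int syndrome_disks = disks - 2, count = 0, i = d0();

        do {
                printf("device %d -> slot %d\n", i,
                       idx_to_slot(i, &count, syndrome_disks));
                i = next_disk(i);
        } while (i != d0());
        return 0;
}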
171 171
172 static void return_io(struct bio *return_bi) 172 static void return_io(struct bio *return_bi)
173 { 173 {
174 struct bio *bi = return_bi; 174 struct bio *bi = return_bi;
175 while (bi) { 175 while (bi) {
176 176
177 return_bi = bi->bi_next; 177 return_bi = bi->bi_next;
178 bi->bi_next = NULL; 178 bi->bi_next = NULL;
179 bi->bi_size = 0; 179 bi->bi_size = 0;
180 bio_endio(bi, 0); 180 bio_endio(bi, 0);
181 bi = return_bi; 181 bi = return_bi;
182 } 182 }
183 } 183 }
184 184
185 static void print_raid5_conf (raid5_conf_t *conf); 185 static void print_raid5_conf (raid5_conf_t *conf);
186 186
187 static int stripe_operations_active(struct stripe_head *sh) 187 static int stripe_operations_active(struct stripe_head *sh)
188 { 188 {
189 return sh->check_state || sh->reconstruct_state || 189 return sh->check_state || sh->reconstruct_state ||
190 test_bit(STRIPE_BIOFILL_RUN, &sh->state) || 190 test_bit(STRIPE_BIOFILL_RUN, &sh->state) ||
191 test_bit(STRIPE_COMPUTE_RUN, &sh->state); 191 test_bit(STRIPE_COMPUTE_RUN, &sh->state);
192 } 192 }
193 193
194 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh) 194 static void __release_stripe(raid5_conf_t *conf, struct stripe_head *sh)
195 { 195 {
196 if (atomic_dec_and_test(&sh->count)) { 196 if (atomic_dec_and_test(&sh->count)) {
197 BUG_ON(!list_empty(&sh->lru)); 197 BUG_ON(!list_empty(&sh->lru));
198 BUG_ON(atomic_read(&conf->active_stripes)==0); 198 BUG_ON(atomic_read(&conf->active_stripes)==0);
199 if (test_bit(STRIPE_HANDLE, &sh->state)) { 199 if (test_bit(STRIPE_HANDLE, &sh->state)) {
200 if (test_bit(STRIPE_DELAYED, &sh->state)) { 200 if (test_bit(STRIPE_DELAYED, &sh->state)) {
201 list_add_tail(&sh->lru, &conf->delayed_list); 201 list_add_tail(&sh->lru, &conf->delayed_list);
202 blk_plug_device(conf->mddev->queue); 202 blk_plug_device(conf->mddev->queue);
203 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) && 203 } else if (test_bit(STRIPE_BIT_DELAY, &sh->state) &&
204 sh->bm_seq - conf->seq_write > 0) { 204 sh->bm_seq - conf->seq_write > 0) {
205 list_add_tail(&sh->lru, &conf->bitmap_list); 205 list_add_tail(&sh->lru, &conf->bitmap_list);
206 blk_plug_device(conf->mddev->queue); 206 blk_plug_device(conf->mddev->queue);
207 } else { 207 } else {
208 clear_bit(STRIPE_BIT_DELAY, &sh->state); 208 clear_bit(STRIPE_BIT_DELAY, &sh->state);
209 list_add_tail(&sh->lru, &conf->handle_list); 209 list_add_tail(&sh->lru, &conf->handle_list);
210 } 210 }
211 md_wakeup_thread(conf->mddev->thread); 211 md_wakeup_thread(conf->mddev->thread);
212 } else { 212 } else {
213 BUG_ON(stripe_operations_active(sh)); 213 BUG_ON(stripe_operations_active(sh));
214 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 214 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
215 atomic_dec(&conf->preread_active_stripes); 215 atomic_dec(&conf->preread_active_stripes);
216 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 216 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD)
217 md_wakeup_thread(conf->mddev->thread); 217 md_wakeup_thread(conf->mddev->thread);
218 } 218 }
219 atomic_dec(&conf->active_stripes); 219 atomic_dec(&conf->active_stripes);
220 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 220 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
221 list_add_tail(&sh->lru, &conf->inactive_list); 221 list_add_tail(&sh->lru, &conf->inactive_list);
222 wake_up(&conf->wait_for_stripe); 222 wake_up(&conf->wait_for_stripe);
223 if (conf->retry_read_aligned) 223 if (conf->retry_read_aligned)
224 md_wakeup_thread(conf->mddev->thread); 224 md_wakeup_thread(conf->mddev->thread);
225 } 225 }
226 } 226 }
227 } 227 }
228 } 228 }
229 229
230 static void release_stripe(struct stripe_head *sh) 230 static void release_stripe(struct stripe_head *sh)
231 { 231 {
232 raid5_conf_t *conf = sh->raid_conf; 232 raid5_conf_t *conf = sh->raid_conf;
233 unsigned long flags; 233 unsigned long flags;
234 234
235 spin_lock_irqsave(&conf->device_lock, flags); 235 spin_lock_irqsave(&conf->device_lock, flags);
236 __release_stripe(conf, sh); 236 __release_stripe(conf, sh);
237 spin_unlock_irqrestore(&conf->device_lock, flags); 237 spin_unlock_irqrestore(&conf->device_lock, flags);
238 } 238 }
239 239
240 static inline void remove_hash(struct stripe_head *sh) 240 static inline void remove_hash(struct stripe_head *sh)
241 { 241 {
242 pr_debug("remove_hash(), stripe %llu\n", 242 pr_debug("remove_hash(), stripe %llu\n",
243 (unsigned long long)sh->sector); 243 (unsigned long long)sh->sector);
244 244
245 hlist_del_init(&sh->hash); 245 hlist_del_init(&sh->hash);
246 } 246 }
247 247
248 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh) 248 static inline void insert_hash(raid5_conf_t *conf, struct stripe_head *sh)
249 { 249 {
250 struct hlist_head *hp = stripe_hash(conf, sh->sector); 250 struct hlist_head *hp = stripe_hash(conf, sh->sector);
251 251
252 pr_debug("insert_hash(), stripe %llu\n", 252 pr_debug("insert_hash(), stripe %llu\n",
253 (unsigned long long)sh->sector); 253 (unsigned long long)sh->sector);
254 254
255 CHECK_DEVLOCK(); 255 CHECK_DEVLOCK();
256 hlist_add_head(&sh->hash, hp); 256 hlist_add_head(&sh->hash, hp);
257 } 257 }
258 258
259 259
260 /* find an idle stripe, make sure it is unhashed, and return it. */ 260 /* find an idle stripe, make sure it is unhashed, and return it. */
261 static struct stripe_head *get_free_stripe(raid5_conf_t *conf) 261 static struct stripe_head *get_free_stripe(raid5_conf_t *conf)
262 { 262 {
263 struct stripe_head *sh = NULL; 263 struct stripe_head *sh = NULL;
264 struct list_head *first; 264 struct list_head *first;
265 265
266 CHECK_DEVLOCK(); 266 CHECK_DEVLOCK();
267 if (list_empty(&conf->inactive_list)) 267 if (list_empty(&conf->inactive_list))
268 goto out; 268 goto out;
269 first = conf->inactive_list.next; 269 first = conf->inactive_list.next;
270 sh = list_entry(first, struct stripe_head, lru); 270 sh = list_entry(first, struct stripe_head, lru);
271 list_del_init(first); 271 list_del_init(first);
272 remove_hash(sh); 272 remove_hash(sh);
273 atomic_inc(&conf->active_stripes); 273 atomic_inc(&conf->active_stripes);
274 out: 274 out:
275 return sh; 275 return sh;
276 } 276 }
277 277
278 static void shrink_buffers(struct stripe_head *sh, int num) 278 static void shrink_buffers(struct stripe_head *sh, int num)
279 { 279 {
280 struct page *p; 280 struct page *p;
281 int i; 281 int i;
282 282
283 for (i=0; i<num ; i++) { 283 for (i=0; i<num ; i++) {
284 p = sh->dev[i].page; 284 p = sh->dev[i].page;
285 if (!p) 285 if (!p)
286 continue; 286 continue;
287 sh->dev[i].page = NULL; 287 sh->dev[i].page = NULL;
288 put_page(p); 288 put_page(p);
289 } 289 }
290 } 290 }
291 291
292 static int grow_buffers(struct stripe_head *sh, int num) 292 static int grow_buffers(struct stripe_head *sh, int num)
293 { 293 {
294 int i; 294 int i;
295 295
296 for (i=0; i<num; i++) { 296 for (i=0; i<num; i++) {
297 struct page *page; 297 struct page *page;
298 298
299 if (!(page = alloc_page(GFP_KERNEL))) { 299 if (!(page = alloc_page(GFP_KERNEL))) {
300 return 1; 300 return 1;
301 } 301 }
302 sh->dev[i].page = page; 302 sh->dev[i].page = page;
303 } 303 }
304 return 0; 304 return 0;
305 } 305 }
306 306
307 static void raid5_build_block(struct stripe_head *sh, int i, int previous); 307 static void raid5_build_block(struct stripe_head *sh, int i, int previous);
308 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 308 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
309 struct stripe_head *sh); 309 struct stripe_head *sh);
310 310
311 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) 311 static void init_stripe(struct stripe_head *sh, sector_t sector, int previous)
312 { 312 {
313 raid5_conf_t *conf = sh->raid_conf; 313 raid5_conf_t *conf = sh->raid_conf;
314 int i; 314 int i;
315 315
316 BUG_ON(atomic_read(&sh->count) != 0); 316 BUG_ON(atomic_read(&sh->count) != 0);
317 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state)); 317 BUG_ON(test_bit(STRIPE_HANDLE, &sh->state));
318 BUG_ON(stripe_operations_active(sh)); 318 BUG_ON(stripe_operations_active(sh));
319 319
320 CHECK_DEVLOCK(); 320 CHECK_DEVLOCK();
321 pr_debug("init_stripe called, stripe %llu\n", 321 pr_debug("init_stripe called, stripe %llu\n",
322 (unsigned long long)sh->sector); 322 (unsigned long long)sh->sector);
323 323
324 remove_hash(sh); 324 remove_hash(sh);
325 325
326 sh->generation = conf->generation - previous; 326 sh->generation = conf->generation - previous;
327 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks; 327 sh->disks = previous ? conf->previous_raid_disks : conf->raid_disks;
328 sh->sector = sector; 328 sh->sector = sector;
329 stripe_set_idx(sector, conf, previous, sh); 329 stripe_set_idx(sector, conf, previous, sh);
330 sh->state = 0; 330 sh->state = 0;
331 331
332 332
333 for (i = sh->disks; i--; ) { 333 for (i = sh->disks; i--; ) {
334 struct r5dev *dev = &sh->dev[i]; 334 struct r5dev *dev = &sh->dev[i];
335 335
336 if (dev->toread || dev->read || dev->towrite || dev->written || 336 if (dev->toread || dev->read || dev->towrite || dev->written ||
337 test_bit(R5_LOCKED, &dev->flags)) { 337 test_bit(R5_LOCKED, &dev->flags)) {
338 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n", 338 printk(KERN_ERR "sector=%llx i=%d %p %p %p %p %d\n",
339 (unsigned long long)sh->sector, i, dev->toread, 339 (unsigned long long)sh->sector, i, dev->toread,
340 dev->read, dev->towrite, dev->written, 340 dev->read, dev->towrite, dev->written,
341 test_bit(R5_LOCKED, &dev->flags)); 341 test_bit(R5_LOCKED, &dev->flags));
342 BUG(); 342 BUG();
343 } 343 }
344 dev->flags = 0; 344 dev->flags = 0;
345 raid5_build_block(sh, i, previous); 345 raid5_build_block(sh, i, previous);
346 } 346 }
347 insert_hash(conf, sh); 347 insert_hash(conf, sh);
348 } 348 }
349 349
350 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector, 350 static struct stripe_head *__find_stripe(raid5_conf_t *conf, sector_t sector,
351 short generation) 351 short generation)
352 { 352 {
353 struct stripe_head *sh; 353 struct stripe_head *sh;
354 struct hlist_node *hn; 354 struct hlist_node *hn;
355 355
356 CHECK_DEVLOCK(); 356 CHECK_DEVLOCK();
357 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector); 357 pr_debug("__find_stripe, sector %llu\n", (unsigned long long)sector);
358 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash) 358 hlist_for_each_entry(sh, hn, stripe_hash(conf, sector), hash)
359 if (sh->sector == sector && sh->generation == generation) 359 if (sh->sector == sector && sh->generation == generation)
360 return sh; 360 return sh;
361 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector); 361 pr_debug("__stripe %llu not in cache\n", (unsigned long long)sector);
362 return NULL; 362 return NULL;
363 } 363 }
364 364
365 static void unplug_slaves(mddev_t *mddev); 365 static void unplug_slaves(mddev_t *mddev);
366 static void raid5_unplug_device(struct request_queue *q); 366 static void raid5_unplug_device(struct request_queue *q);
367 367
368 static struct stripe_head * 368 static struct stripe_head *
369 get_active_stripe(raid5_conf_t *conf, sector_t sector, 369 get_active_stripe(raid5_conf_t *conf, sector_t sector,
370 int previous, int noblock, int noquiesce) 370 int previous, int noblock, int noquiesce)
371 { 371 {
372 struct stripe_head *sh; 372 struct stripe_head *sh;
373 373
374 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); 374 pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector);
375 375
376 spin_lock_irq(&conf->device_lock); 376 spin_lock_irq(&conf->device_lock);
377 377
378 do { 378 do {
379 wait_event_lock_irq(conf->wait_for_stripe, 379 wait_event_lock_irq(conf->wait_for_stripe,
380 conf->quiesce == 0 || noquiesce, 380 conf->quiesce == 0 || noquiesce,
381 conf->device_lock, /* nothing */); 381 conf->device_lock, /* nothing */);
382 sh = __find_stripe(conf, sector, conf->generation - previous); 382 sh = __find_stripe(conf, sector, conf->generation - previous);
383 if (!sh) { 383 if (!sh) {
384 if (!conf->inactive_blocked) 384 if (!conf->inactive_blocked)
385 sh = get_free_stripe(conf); 385 sh = get_free_stripe(conf);
386 if (noblock && sh == NULL) 386 if (noblock && sh == NULL)
387 break; 387 break;
388 if (!sh) { 388 if (!sh) {
389 conf->inactive_blocked = 1; 389 conf->inactive_blocked = 1;
390 wait_event_lock_irq(conf->wait_for_stripe, 390 wait_event_lock_irq(conf->wait_for_stripe,
391 !list_empty(&conf->inactive_list) && 391 !list_empty(&conf->inactive_list) &&
392 (atomic_read(&conf->active_stripes) 392 (atomic_read(&conf->active_stripes)
393 < (conf->max_nr_stripes *3/4) 393 < (conf->max_nr_stripes *3/4)
394 || !conf->inactive_blocked), 394 || !conf->inactive_blocked),
395 conf->device_lock, 395 conf->device_lock,
396 raid5_unplug_device(conf->mddev->queue) 396 raid5_unplug_device(conf->mddev->queue)
397 ); 397 );
398 conf->inactive_blocked = 0; 398 conf->inactive_blocked = 0;
399 } else 399 } else
400 init_stripe(sh, sector, previous); 400 init_stripe(sh, sector, previous);
401 } else { 401 } else {
402 if (atomic_read(&sh->count)) { 402 if (atomic_read(&sh->count)) {
403 BUG_ON(!list_empty(&sh->lru) 403 BUG_ON(!list_empty(&sh->lru)
404 && !test_bit(STRIPE_EXPANDING, &sh->state)); 404 && !test_bit(STRIPE_EXPANDING, &sh->state));
405 } else { 405 } else {
406 if (!test_bit(STRIPE_HANDLE, &sh->state)) 406 if (!test_bit(STRIPE_HANDLE, &sh->state))
407 atomic_inc(&conf->active_stripes); 407 atomic_inc(&conf->active_stripes);
408 if (list_empty(&sh->lru) && 408 if (list_empty(&sh->lru) &&
409 !test_bit(STRIPE_EXPANDING, &sh->state)) 409 !test_bit(STRIPE_EXPANDING, &sh->state))
410 BUG(); 410 BUG();
411 list_del_init(&sh->lru); 411 list_del_init(&sh->lru);
412 } 412 }
413 } 413 }
414 } while (sh == NULL); 414 } while (sh == NULL);
415 415
416 if (sh) 416 if (sh)
417 atomic_inc(&sh->count); 417 atomic_inc(&sh->count);
418 418
419 spin_unlock_irq(&conf->device_lock); 419 spin_unlock_irq(&conf->device_lock);
420 return sh; 420 return sh;
421 } 421 }
422 422
423 static void 423 static void
424 raid5_end_read_request(struct bio *bi, int error); 424 raid5_end_read_request(struct bio *bi, int error);
425 static void 425 static void
426 raid5_end_write_request(struct bio *bi, int error); 426 raid5_end_write_request(struct bio *bi, int error);
427 427
428 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) 428 static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s)
429 { 429 {
430 raid5_conf_t *conf = sh->raid_conf; 430 raid5_conf_t *conf = sh->raid_conf;
431 int i, disks = sh->disks; 431 int i, disks = sh->disks;
432 432
433 might_sleep(); 433 might_sleep();
434 434
435 for (i = disks; i--; ) { 435 for (i = disks; i--; ) {
436 int rw; 436 int rw;
437 struct bio *bi; 437 struct bio *bi;
438 mdk_rdev_t *rdev; 438 mdk_rdev_t *rdev;
439 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags)) 439 if (test_and_clear_bit(R5_Wantwrite, &sh->dev[i].flags))
440 rw = WRITE; 440 rw = WRITE;
441 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags)) 441 else if (test_and_clear_bit(R5_Wantread, &sh->dev[i].flags))
442 rw = READ; 442 rw = READ;
443 else 443 else
444 continue; 444 continue;
445 445
446 bi = &sh->dev[i].req; 446 bi = &sh->dev[i].req;
447 447
448 bi->bi_rw = rw; 448 bi->bi_rw = rw;
449 if (rw == WRITE) 449 if (rw == WRITE)
450 bi->bi_end_io = raid5_end_write_request; 450 bi->bi_end_io = raid5_end_write_request;
451 else 451 else
452 bi->bi_end_io = raid5_end_read_request; 452 bi->bi_end_io = raid5_end_read_request;
453 453
454 rcu_read_lock(); 454 rcu_read_lock();
455 rdev = rcu_dereference(conf->disks[i].rdev); 455 rdev = rcu_dereference(conf->disks[i].rdev);
456 if (rdev && test_bit(Faulty, &rdev->flags)) 456 if (rdev && test_bit(Faulty, &rdev->flags))
457 rdev = NULL; 457 rdev = NULL;
458 if (rdev) 458 if (rdev)
459 atomic_inc(&rdev->nr_pending); 459 atomic_inc(&rdev->nr_pending);
460 rcu_read_unlock(); 460 rcu_read_unlock();
461 461
462 if (rdev) { 462 if (rdev) {
463 if (s->syncing || s->expanding || s->expanded) 463 if (s->syncing || s->expanding || s->expanded)
464 md_sync_acct(rdev->bdev, STRIPE_SECTORS); 464 md_sync_acct(rdev->bdev, STRIPE_SECTORS);
465 465
466 set_bit(STRIPE_IO_STARTED, &sh->state); 466 set_bit(STRIPE_IO_STARTED, &sh->state);
467 467
468 bi->bi_bdev = rdev->bdev; 468 bi->bi_bdev = rdev->bdev;
469 pr_debug("%s: for %llu schedule op %ld on disc %d\n", 469 pr_debug("%s: for %llu schedule op %ld on disc %d\n",
470 __func__, (unsigned long long)sh->sector, 470 __func__, (unsigned long long)sh->sector,
471 bi->bi_rw, i); 471 bi->bi_rw, i);
472 atomic_inc(&sh->count); 472 atomic_inc(&sh->count);
473 bi->bi_sector = sh->sector + rdev->data_offset; 473 bi->bi_sector = sh->sector + rdev->data_offset;
474 bi->bi_flags = 1 << BIO_UPTODATE; 474 bi->bi_flags = 1 << BIO_UPTODATE;
475 bi->bi_vcnt = 1; 475 bi->bi_vcnt = 1;
476 bi->bi_max_vecs = 1; 476 bi->bi_max_vecs = 1;
477 bi->bi_idx = 0; 477 bi->bi_idx = 0;
478 bi->bi_io_vec = &sh->dev[i].vec; 478 bi->bi_io_vec = &sh->dev[i].vec;
479 bi->bi_io_vec[0].bv_len = STRIPE_SIZE; 479 bi->bi_io_vec[0].bv_len = STRIPE_SIZE;
480 bi->bi_io_vec[0].bv_offset = 0; 480 bi->bi_io_vec[0].bv_offset = 0;
481 bi->bi_size = STRIPE_SIZE; 481 bi->bi_size = STRIPE_SIZE;
482 bi->bi_next = NULL; 482 bi->bi_next = NULL;
483 if (rw == WRITE && 483 if (rw == WRITE &&
484 test_bit(R5_ReWrite, &sh->dev[i].flags)) 484 test_bit(R5_ReWrite, &sh->dev[i].flags))
485 atomic_add(STRIPE_SECTORS, 485 atomic_add(STRIPE_SECTORS,
486 &rdev->corrected_errors); 486 &rdev->corrected_errors);
487 generic_make_request(bi); 487 generic_make_request(bi);
488 } else { 488 } else {
489 if (rw == WRITE) 489 if (rw == WRITE)
490 set_bit(STRIPE_DEGRADED, &sh->state); 490 set_bit(STRIPE_DEGRADED, &sh->state);
491 pr_debug("skip op %ld on disc %d for sector %llu\n", 491 pr_debug("skip op %ld on disc %d for sector %llu\n",
492 bi->bi_rw, i, (unsigned long long)sh->sector); 492 bi->bi_rw, i, (unsigned long long)sh->sector);
493 clear_bit(R5_LOCKED, &sh->dev[i].flags); 493 clear_bit(R5_LOCKED, &sh->dev[i].flags);
494 set_bit(STRIPE_HANDLE, &sh->state); 494 set_bit(STRIPE_HANDLE, &sh->state);
495 } 495 }
496 } 496 }
497 } 497 }
498 498
499 static struct dma_async_tx_descriptor * 499 static struct dma_async_tx_descriptor *
500 async_copy_data(int frombio, struct bio *bio, struct page *page, 500 async_copy_data(int frombio, struct bio *bio, struct page *page,
501 sector_t sector, struct dma_async_tx_descriptor *tx) 501 sector_t sector, struct dma_async_tx_descriptor *tx)
502 { 502 {
503 struct bio_vec *bvl; 503 struct bio_vec *bvl;
504 struct page *bio_page; 504 struct page *bio_page;
505 int i; 505 int i;
506 int page_offset; 506 int page_offset;
507 struct async_submit_ctl submit; 507 struct async_submit_ctl submit;
508 enum async_tx_flags flags = 0; 508 enum async_tx_flags flags = 0;
509 509
510 if (bio->bi_sector >= sector) 510 if (bio->bi_sector >= sector)
511 page_offset = (signed)(bio->bi_sector - sector) * 512; 511 page_offset = (signed)(bio->bi_sector - sector) * 512;
512 else 512 else
513 page_offset = (signed)(sector - bio->bi_sector) * -512; 513 page_offset = (signed)(sector - bio->bi_sector) * -512;
514 514
515 if (frombio) 515 if (frombio)
516 flags |= ASYNC_TX_FENCE; 516 flags |= ASYNC_TX_FENCE;
517 init_async_submit(&submit, flags, tx, NULL, NULL, NULL); 517 init_async_submit(&submit, flags, tx, NULL, NULL, NULL);
518 518
519 bio_for_each_segment(bvl, bio, i) { 519 bio_for_each_segment(bvl, bio, i) {
520 int len = bio_iovec_idx(bio, i)->bv_len; 520 int len = bio_iovec_idx(bio, i)->bv_len;
521 int clen; 521 int clen;
522 int b_offset = 0; 522 int b_offset = 0;
523 523
524 if (page_offset < 0) { 524 if (page_offset < 0) {
525 b_offset = -page_offset; 525 b_offset = -page_offset;
526 page_offset += b_offset; 526 page_offset += b_offset;
527 len -= b_offset; 527 len -= b_offset;
528 } 528 }
529 529
530 if (len > 0 && page_offset + len > STRIPE_SIZE) 530 if (len > 0 && page_offset + len > STRIPE_SIZE)
531 clen = STRIPE_SIZE - page_offset; 531 clen = STRIPE_SIZE - page_offset;
532 else 532 else
533 clen = len; 533 clen = len;
534 534
535 if (clen > 0) { 535 if (clen > 0) {
536 b_offset += bio_iovec_idx(bio, i)->bv_offset; 536 b_offset += bio_iovec_idx(bio, i)->bv_offset;
537 bio_page = bio_iovec_idx(bio, i)->bv_page; 537 bio_page = bio_iovec_idx(bio, i)->bv_page;
538 if (frombio) 538 if (frombio)
539 tx = async_memcpy(page, bio_page, page_offset, 539 tx = async_memcpy(page, bio_page, page_offset,
540 b_offset, clen, &submit); 540 b_offset, clen, &submit);
541 else 541 else
542 tx = async_memcpy(bio_page, page, b_offset, 542 tx = async_memcpy(bio_page, page, b_offset,
543 page_offset, clen, &submit); 543 page_offset, clen, &submit);
544 } 544 }
545 /* chain the operations */ 545 /* chain the operations */
546 submit.depend_tx = tx; 546 submit.depend_tx = tx;
547 547
548 if (clen < len) /* hit end of page */ 548 if (clen < len) /* hit end of page */
549 break; 549 break;
550 page_offset += len; 550 page_offset += len;
551 } 551 }
552 552
553 return tx; 553 return tx;
554 } 554 }
555 555
556 static void ops_complete_biofill(void *stripe_head_ref) 556 static void ops_complete_biofill(void *stripe_head_ref)
557 { 557 {
558 struct stripe_head *sh = stripe_head_ref; 558 struct stripe_head *sh = stripe_head_ref;
559 struct bio *return_bi = NULL; 559 struct bio *return_bi = NULL;
560 raid5_conf_t *conf = sh->raid_conf; 560 raid5_conf_t *conf = sh->raid_conf;
561 int i; 561 int i;
562 562
563 pr_debug("%s: stripe %llu\n", __func__, 563 pr_debug("%s: stripe %llu\n", __func__,
564 (unsigned long long)sh->sector); 564 (unsigned long long)sh->sector);
565 565
566 /* clear completed biofills */ 566 /* clear completed biofills */
567 spin_lock_irq(&conf->device_lock); 567 spin_lock_irq(&conf->device_lock);
568 for (i = sh->disks; i--; ) { 568 for (i = sh->disks; i--; ) {
569 struct r5dev *dev = &sh->dev[i]; 569 struct r5dev *dev = &sh->dev[i];
570 570
571 /* acknowledge completion of a biofill operation */ 571 /* acknowledge completion of a biofill operation */
572 /* and check if we need to reply to a read request, 572 /* and check if we need to reply to a read request,
573 * new R5_Wantfill requests are held off until 573 * new R5_Wantfill requests are held off until
574 * !STRIPE_BIOFILL_RUN 574 * !STRIPE_BIOFILL_RUN
575 */ 575 */
576 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) { 576 if (test_and_clear_bit(R5_Wantfill, &dev->flags)) {
577 struct bio *rbi, *rbi2; 577 struct bio *rbi, *rbi2;
578 578
579 BUG_ON(!dev->read); 579 BUG_ON(!dev->read);
580 rbi = dev->read; 580 rbi = dev->read;
581 dev->read = NULL; 581 dev->read = NULL;
582 while (rbi && rbi->bi_sector < 582 while (rbi && rbi->bi_sector <
583 dev->sector + STRIPE_SECTORS) { 583 dev->sector + STRIPE_SECTORS) {
584 rbi2 = r5_next_bio(rbi, dev->sector); 584 rbi2 = r5_next_bio(rbi, dev->sector);
585 if (!raid5_dec_bi_phys_segments(rbi)) { 585 if (!raid5_dec_bi_phys_segments(rbi)) {
586 rbi->bi_next = return_bi; 586 rbi->bi_next = return_bi;
587 return_bi = rbi; 587 return_bi = rbi;
588 } 588 }
589 rbi = rbi2; 589 rbi = rbi2;
590 } 590 }
591 } 591 }
592 } 592 }
593 spin_unlock_irq(&conf->device_lock); 593 spin_unlock_irq(&conf->device_lock);
594 clear_bit(STRIPE_BIOFILL_RUN, &sh->state); 594 clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
595 595
596 return_io(return_bi); 596 return_io(return_bi);
597 597
598 set_bit(STRIPE_HANDLE, &sh->state); 598 set_bit(STRIPE_HANDLE, &sh->state);
599 release_stripe(sh); 599 release_stripe(sh);
600 } 600 }
601 601
602 static void ops_run_biofill(struct stripe_head *sh) 602 static void ops_run_biofill(struct stripe_head *sh)
603 { 603 {
604 struct dma_async_tx_descriptor *tx = NULL; 604 struct dma_async_tx_descriptor *tx = NULL;
605 raid5_conf_t *conf = sh->raid_conf; 605 raid5_conf_t *conf = sh->raid_conf;
606 struct async_submit_ctl submit; 606 struct async_submit_ctl submit;
607 int i; 607 int i;
608 608
609 pr_debug("%s: stripe %llu\n", __func__, 609 pr_debug("%s: stripe %llu\n", __func__,
610 (unsigned long long)sh->sector); 610 (unsigned long long)sh->sector);
611 611
612 for (i = sh->disks; i--; ) { 612 for (i = sh->disks; i--; ) {
613 struct r5dev *dev = &sh->dev[i]; 613 struct r5dev *dev = &sh->dev[i];
614 if (test_bit(R5_Wantfill, &dev->flags)) { 614 if (test_bit(R5_Wantfill, &dev->flags)) {
615 struct bio *rbi; 615 struct bio *rbi;
616 spin_lock_irq(&conf->device_lock); 616 spin_lock_irq(&conf->device_lock);
617 dev->read = rbi = dev->toread; 617 dev->read = rbi = dev->toread;
618 dev->toread = NULL; 618 dev->toread = NULL;
619 spin_unlock_irq(&conf->device_lock); 619 spin_unlock_irq(&conf->device_lock);
620 while (rbi && rbi->bi_sector < 620 while (rbi && rbi->bi_sector <
621 dev->sector + STRIPE_SECTORS) { 621 dev->sector + STRIPE_SECTORS) {
622 tx = async_copy_data(0, rbi, dev->page, 622 tx = async_copy_data(0, rbi, dev->page,
623 dev->sector, tx); 623 dev->sector, tx);
624 rbi = r5_next_bio(rbi, dev->sector); 624 rbi = r5_next_bio(rbi, dev->sector);
625 } 625 }
626 } 626 }
627 } 627 }
628 628
629 atomic_inc(&sh->count); 629 atomic_inc(&sh->count);
630 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL); 630 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_biofill, sh, NULL);
631 async_trigger_callback(&submit); 631 async_trigger_callback(&submit);
632 } 632 }
633 633
634 static void mark_target_uptodate(struct stripe_head *sh, int target) 634 static void mark_target_uptodate(struct stripe_head *sh, int target)
635 { 635 {
636 struct r5dev *tgt; 636 struct r5dev *tgt;
637 637
638 if (target < 0) 638 if (target < 0)
639 return; 639 return;
640 640
641 tgt = &sh->dev[target]; 641 tgt = &sh->dev[target];
642 set_bit(R5_UPTODATE, &tgt->flags); 642 set_bit(R5_UPTODATE, &tgt->flags);
643 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 643 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
644 clear_bit(R5_Wantcompute, &tgt->flags); 644 clear_bit(R5_Wantcompute, &tgt->flags);
645 } 645 }
646 646
647 static void ops_complete_compute(void *stripe_head_ref) 647 static void ops_complete_compute(void *stripe_head_ref)
648 { 648 {
649 struct stripe_head *sh = stripe_head_ref; 649 struct stripe_head *sh = stripe_head_ref;
650 650
651 pr_debug("%s: stripe %llu\n", __func__, 651 pr_debug("%s: stripe %llu\n", __func__,
652 (unsigned long long)sh->sector); 652 (unsigned long long)sh->sector);
653 653
654 /* mark the computed target(s) as uptodate */ 654 /* mark the computed target(s) as uptodate */
655 mark_target_uptodate(sh, sh->ops.target); 655 mark_target_uptodate(sh, sh->ops.target);
656 mark_target_uptodate(sh, sh->ops.target2); 656 mark_target_uptodate(sh, sh->ops.target2);
657 657
658 clear_bit(STRIPE_COMPUTE_RUN, &sh->state); 658 clear_bit(STRIPE_COMPUTE_RUN, &sh->state);
659 if (sh->check_state == check_state_compute_run) 659 if (sh->check_state == check_state_compute_run)
660 sh->check_state = check_state_compute_result; 660 sh->check_state = check_state_compute_result;
661 set_bit(STRIPE_HANDLE, &sh->state); 661 set_bit(STRIPE_HANDLE, &sh->state);
662 release_stripe(sh); 662 release_stripe(sh);
663 } 663 }
664 664
665 /* return a pointer to the address conversion region of the scribble buffer */ 665 /* return a pointer to the address conversion region of the scribble buffer */
666 static addr_conv_t *to_addr_conv(struct stripe_head *sh, 666 static addr_conv_t *to_addr_conv(struct stripe_head *sh,
667 struct raid5_percpu *percpu) 667 struct raid5_percpu *percpu)
668 { 668 {
669 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2); 669 return percpu->scribble + sizeof(struct page *) * (sh->disks + 2);
670 } 670 }
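
The scribble buffer is one per-CPU allocation doing double duty: an array of (disks + 2) page pointers at the front (the srcs/blocks arrays used below), with the addr_conv_t scratch area needed by the async_tx layer behind it, which is what to_addr_conv() points at. A userspace sketch of that split, with a toy addr_conv_t and an assumed disk count, just to show the pointer arithmetic:

#include <stdio.h>
#include <stdlib.h>

typedef struct { void *space; } addr_conv_t;   /* stand-in for the async_tx type */

int main(void)
{
        int disks = 5;
        /* assumed layout: (disks + 2) page pointers, then (disks + 2) addr_conv_t */
        size_t len = sizeof(void *) * (disks + 2) + sizeof(addr_conv_t) * (disks + 2);
        void *scribble = malloc(len);

        void **pages = scribble;                /* srcs/blocks array lives at the front */
        addr_conv_t *conv = (addr_conv_t *)((char *)scribble +
                                            sizeof(void *) * (disks + 2)); /* to_addr_conv() */

        printf("scribble=%p pages=%p addr_conv=%p\n", scribble, (void *)pages, (void *)conv);
        free(scribble);
        return 0;
}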
671 671
672 static struct dma_async_tx_descriptor * 672 static struct dma_async_tx_descriptor *
673 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) 673 ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu)
674 { 674 {
675 int disks = sh->disks; 675 int disks = sh->disks;
676 struct page **xor_srcs = percpu->scribble; 676 struct page **xor_srcs = percpu->scribble;
677 int target = sh->ops.target; 677 int target = sh->ops.target;
678 struct r5dev *tgt = &sh->dev[target]; 678 struct r5dev *tgt = &sh->dev[target];
679 struct page *xor_dest = tgt->page; 679 struct page *xor_dest = tgt->page;
680 int count = 0; 680 int count = 0;
681 struct dma_async_tx_descriptor *tx; 681 struct dma_async_tx_descriptor *tx;
682 struct async_submit_ctl submit; 682 struct async_submit_ctl submit;
683 int i; 683 int i;
684 684
685 pr_debug("%s: stripe %llu block: %d\n", 685 pr_debug("%s: stripe %llu block: %d\n",
686 __func__, (unsigned long long)sh->sector, target); 686 __func__, (unsigned long long)sh->sector, target);
687 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 687 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
688 688
689 for (i = disks; i--; ) 689 for (i = disks; i--; )
690 if (i != target) 690 if (i != target)
691 xor_srcs[count++] = sh->dev[i].page; 691 xor_srcs[count++] = sh->dev[i].page;
692 692
693 atomic_inc(&sh->count); 693 atomic_inc(&sh->count);
694 694
695 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 695 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL,
696 ops_complete_compute, sh, to_addr_conv(sh, percpu)); 696 ops_complete_compute, sh, to_addr_conv(sh, percpu));
697 if (unlikely(count == 1)) 697 if (unlikely(count == 1))
698 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 698 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
699 else 699 else
700 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 700 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
701 701
702 return tx; 702 return tx;
703 } 703 }
704 704
705 /* set_syndrome_sources - populate source buffers for gen_syndrome 705 /* set_syndrome_sources - populate source buffers for gen_syndrome
706 * @srcs - (struct page *) array of size sh->disks 706 * @srcs - (struct page *) array of size sh->disks
707 * @sh - stripe_head to parse 707 * @sh - stripe_head to parse
708 * 708 *
709 * Populates srcs in proper layout order for the stripe and returns the 709 * Populates srcs in proper layout order for the stripe and returns the
710 * 'count' of sources to be used in a call to async_gen_syndrome. The P 710 * 'count' of sources to be used in a call to async_gen_syndrome. The P
711 * destination buffer is recorded in srcs[count] and the Q destination 711 * destination buffer is recorded in srcs[count] and the Q destination
712 * is recorded in srcs[count+1]. 712 * is recorded in srcs[count+1].
713 */ 713 */
714 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh) 714 static int set_syndrome_sources(struct page **srcs, struct stripe_head *sh)
715 { 715 {
716 int disks = sh->disks; 716 int disks = sh->disks;
717 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2); 717 int syndrome_disks = sh->ddf_layout ? disks : (disks - 2);
718 int d0_idx = raid6_d0(sh); 718 int d0_idx = raid6_d0(sh);
719 int count; 719 int count;
720 int i; 720 int i;
721 721
722 for (i = 0; i < disks; i++) 722 for (i = 0; i < disks; i++)
723 srcs[i] = (void *)raid6_empty_zero_page; 723 srcs[i] = (void *)raid6_empty_zero_page;
724 724
725 count = 0; 725 count = 0;
726 i = d0_idx; 726 i = d0_idx;
727 do { 727 do {
728 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 728 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
729 729
730 srcs[slot] = sh->dev[i].page; 730 srcs[slot] = sh->dev[i].page;
731 i = raid6_next_disk(i, disks); 731 i = raid6_next_disk(i, disks);
732 } while (i != d0_idx); 732 } while (i != d0_idx);
733 733
734 return syndrome_disks; 734 return syndrome_disks;
735 } 735 }
736 736
737 static struct dma_async_tx_descriptor * 737 static struct dma_async_tx_descriptor *
738 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) 738 ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu)
739 { 739 {
740 int disks = sh->disks; 740 int disks = sh->disks;
741 struct page **blocks = percpu->scribble; 741 struct page **blocks = percpu->scribble;
742 int target; 742 int target;
743 int qd_idx = sh->qd_idx; 743 int qd_idx = sh->qd_idx;
744 struct dma_async_tx_descriptor *tx; 744 struct dma_async_tx_descriptor *tx;
745 struct async_submit_ctl submit; 745 struct async_submit_ctl submit;
746 struct r5dev *tgt; 746 struct r5dev *tgt;
747 struct page *dest; 747 struct page *dest;
748 int i; 748 int i;
749 int count; 749 int count;
750 750
751 if (sh->ops.target < 0) 751 if (sh->ops.target < 0)
752 target = sh->ops.target2; 752 target = sh->ops.target2;
753 else if (sh->ops.target2 < 0) 753 else if (sh->ops.target2 < 0)
754 target = sh->ops.target; 754 target = sh->ops.target;
755 else 755 else
756 /* we should only have one valid target */ 756 /* we should only have one valid target */
757 BUG(); 757 BUG();
758 BUG_ON(target < 0); 758 BUG_ON(target < 0);
759 pr_debug("%s: stripe %llu block: %d\n", 759 pr_debug("%s: stripe %llu block: %d\n",
760 __func__, (unsigned long long)sh->sector, target); 760 __func__, (unsigned long long)sh->sector, target);
761 761
762 tgt = &sh->dev[target]; 762 tgt = &sh->dev[target];
763 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 763 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
764 dest = tgt->page; 764 dest = tgt->page;
765 765
766 atomic_inc(&sh->count); 766 atomic_inc(&sh->count);
767 767
768 if (target == qd_idx) { 768 if (target == qd_idx) {
769 count = set_syndrome_sources(blocks, sh); 769 count = set_syndrome_sources(blocks, sh);
770 blocks[count] = NULL; /* regenerating p is not necessary */ 770 blocks[count] = NULL; /* regenerating p is not necessary */
771 BUG_ON(blocks[count+1] != dest); /* q should already be set */ 771 BUG_ON(blocks[count+1] != dest); /* q should already be set */
772 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 772 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
773 ops_complete_compute, sh, 773 ops_complete_compute, sh,
774 to_addr_conv(sh, percpu)); 774 to_addr_conv(sh, percpu));
775 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 775 tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
776 } else { 776 } else {
777 /* Compute any data- or p-drive using XOR */ 777 /* Compute any data- or p-drive using XOR */
778 count = 0; 778 count = 0;
779 for (i = disks; i-- ; ) { 779 for (i = disks; i-- ; ) {
780 if (i == target || i == qd_idx) 780 if (i == target || i == qd_idx)
781 continue; 781 continue;
782 blocks[count++] = sh->dev[i].page; 782 blocks[count++] = sh->dev[i].page;
783 } 783 }
784 784
785 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 785 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
786 NULL, ops_complete_compute, sh, 786 NULL, ops_complete_compute, sh,
787 to_addr_conv(sh, percpu)); 787 to_addr_conv(sh, percpu));
788 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); 788 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit);
789 } 789 }
790 790
791 return tx; 791 return tx;
792 } 792 }
793 793
794 static struct dma_async_tx_descriptor * 794 static struct dma_async_tx_descriptor *
795 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) 795 ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu)
796 { 796 {
797 int i, count, disks = sh->disks; 797 int i, count, disks = sh->disks;
798 int syndrome_disks = sh->ddf_layout ? disks : disks-2; 798 int syndrome_disks = sh->ddf_layout ? disks : disks-2;
799 int d0_idx = raid6_d0(sh); 799 int d0_idx = raid6_d0(sh);
800 int faila = -1, failb = -1; 800 int faila = -1, failb = -1;
801 int target = sh->ops.target; 801 int target = sh->ops.target;
802 int target2 = sh->ops.target2; 802 int target2 = sh->ops.target2;
803 struct r5dev *tgt = &sh->dev[target]; 803 struct r5dev *tgt = &sh->dev[target];
804 struct r5dev *tgt2 = &sh->dev[target2]; 804 struct r5dev *tgt2 = &sh->dev[target2];
805 struct dma_async_tx_descriptor *tx; 805 struct dma_async_tx_descriptor *tx;
806 struct page **blocks = percpu->scribble; 806 struct page **blocks = percpu->scribble;
807 struct async_submit_ctl submit; 807 struct async_submit_ctl submit;
808 808
809 pr_debug("%s: stripe %llu block1: %d block2: %d\n", 809 pr_debug("%s: stripe %llu block1: %d block2: %d\n",
810 __func__, (unsigned long long)sh->sector, target, target2); 810 __func__, (unsigned long long)sh->sector, target, target2);
811 BUG_ON(target < 0 || target2 < 0); 811 BUG_ON(target < 0 || target2 < 0);
812 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags)); 812 BUG_ON(!test_bit(R5_Wantcompute, &tgt->flags));
813 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags)); 813 BUG_ON(!test_bit(R5_Wantcompute, &tgt2->flags));
814 814
815 /* we need to open-code set_syndrome_sources to handle the 815 /* we need to open-code set_syndrome_sources to handle the
816 * slot number conversion for 'faila' and 'failb' 816 * slot number conversion for 'faila' and 'failb'
817 */ 817 */
818 for (i = 0; i < disks ; i++) 818 for (i = 0; i < disks ; i++)
819 blocks[i] = (void *)raid6_empty_zero_page; 819 blocks[i] = (void *)raid6_empty_zero_page;
820 count = 0; 820 count = 0;
821 i = d0_idx; 821 i = d0_idx;
822 do { 822 do {
823 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks); 823 int slot = raid6_idx_to_slot(i, sh, &count, syndrome_disks);
824 824
825 blocks[slot] = sh->dev[i].page; 825 blocks[slot] = sh->dev[i].page;
826 826
827 if (i == target) 827 if (i == target)
828 faila = slot; 828 faila = slot;
829 if (i == target2) 829 if (i == target2)
830 failb = slot; 830 failb = slot;
831 i = raid6_next_disk(i, disks); 831 i = raid6_next_disk(i, disks);
832 } while (i != d0_idx); 832 } while (i != d0_idx);
833 833
834 BUG_ON(faila == failb); 834 BUG_ON(faila == failb);
835 if (failb < faila) 835 if (failb < faila)
836 swap(faila, failb); 836 swap(faila, failb);
837 pr_debug("%s: stripe: %llu faila: %d failb: %d\n", 837 pr_debug("%s: stripe: %llu faila: %d failb: %d\n",
838 __func__, (unsigned long long)sh->sector, faila, failb); 838 __func__, (unsigned long long)sh->sector, faila, failb);
839 839
840 atomic_inc(&sh->count); 840 atomic_inc(&sh->count);
841 841
842 if (failb == syndrome_disks+1) { 842 if (failb == syndrome_disks+1) {
843 /* Q disk is one of the missing disks */ 843 /* Q disk is one of the missing disks */
844 if (faila == syndrome_disks) { 844 if (faila == syndrome_disks) {
845 /* Missing P+Q, just recompute */ 845 /* Missing P+Q, just recompute */
846 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 846 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
847 ops_complete_compute, sh, 847 ops_complete_compute, sh,
848 to_addr_conv(sh, percpu)); 848 to_addr_conv(sh, percpu));
849 return async_gen_syndrome(blocks, 0, syndrome_disks+2, 849 return async_gen_syndrome(blocks, 0, syndrome_disks+2,
850 STRIPE_SIZE, &submit); 850 STRIPE_SIZE, &submit);
851 } else { 851 } else {
852 struct page *dest; 852 struct page *dest;
853 int data_target; 853 int data_target;
854 int qd_idx = sh->qd_idx; 854 int qd_idx = sh->qd_idx;
855 855
856 /* Missing D+Q: recompute D from P, then recompute Q */ 856 /* Missing D+Q: recompute D from P, then recompute Q */
857 if (target == qd_idx) 857 if (target == qd_idx)
858 data_target = target2; 858 data_target = target2;
859 else 859 else
860 data_target = target; 860 data_target = target;
861 861
862 count = 0; 862 count = 0;
863 for (i = disks; i-- ; ) { 863 for (i = disks; i-- ; ) {
864 if (i == data_target || i == qd_idx) 864 if (i == data_target || i == qd_idx)
865 continue; 865 continue;
866 blocks[count++] = sh->dev[i].page; 866 blocks[count++] = sh->dev[i].page;
867 } 867 }
868 dest = sh->dev[data_target].page; 868 dest = sh->dev[data_target].page;
869 init_async_submit(&submit, 869 init_async_submit(&submit,
870 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, 870 ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST,
871 NULL, NULL, NULL, 871 NULL, NULL, NULL,
872 to_addr_conv(sh, percpu)); 872 to_addr_conv(sh, percpu));
873 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, 873 tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE,
874 &submit); 874 &submit);
875 875
876 count = set_syndrome_sources(blocks, sh); 876 count = set_syndrome_sources(blocks, sh);
877 init_async_submit(&submit, ASYNC_TX_FENCE, tx, 877 init_async_submit(&submit, ASYNC_TX_FENCE, tx,
878 ops_complete_compute, sh, 878 ops_complete_compute, sh,
879 to_addr_conv(sh, percpu)); 879 to_addr_conv(sh, percpu));
880 return async_gen_syndrome(blocks, 0, count+2, 880 return async_gen_syndrome(blocks, 0, count+2,
881 STRIPE_SIZE, &submit); 881 STRIPE_SIZE, &submit);
882 } 882 }
883 } else { 883 } else {
884 init_async_submit(&submit, ASYNC_TX_FENCE, NULL, 884 init_async_submit(&submit, ASYNC_TX_FENCE, NULL,
885 ops_complete_compute, sh, 885 ops_complete_compute, sh,
886 to_addr_conv(sh, percpu)); 886 to_addr_conv(sh, percpu));
887 if (failb == syndrome_disks) { 887 if (failb == syndrome_disks) {
888 /* We're missing D+P. */ 888 /* We're missing D+P. */
889 return async_raid6_datap_recov(syndrome_disks+2, 889 return async_raid6_datap_recov(syndrome_disks+2,
890 STRIPE_SIZE, faila, 890 STRIPE_SIZE, faila,
891 blocks, &submit); 891 blocks, &submit);
892 } else { 892 } else {
893 /* We're missing D+D. */ 893 /* We're missing D+D. */
894 return async_raid6_2data_recov(syndrome_disks+2, 894 return async_raid6_2data_recov(syndrome_disks+2,
895 STRIPE_SIZE, faila, failb, 895 STRIPE_SIZE, faila, failb,
896 blocks, &submit); 896 blocks, &submit);
897 } 897 }
898 } 898 }
899 } 899 }
900 900
901 901
902 static void ops_complete_prexor(void *stripe_head_ref) 902 static void ops_complete_prexor(void *stripe_head_ref)
903 { 903 {
904 struct stripe_head *sh = stripe_head_ref; 904 struct stripe_head *sh = stripe_head_ref;
905 905
906 pr_debug("%s: stripe %llu\n", __func__, 906 pr_debug("%s: stripe %llu\n", __func__,
907 (unsigned long long)sh->sector); 907 (unsigned long long)sh->sector);
908 } 908 }
909 909
910 static struct dma_async_tx_descriptor * 910 static struct dma_async_tx_descriptor *
911 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu, 911 ops_run_prexor(struct stripe_head *sh, struct raid5_percpu *percpu,
912 struct dma_async_tx_descriptor *tx) 912 struct dma_async_tx_descriptor *tx)
913 { 913 {
914 int disks = sh->disks; 914 int disks = sh->disks;
915 struct page **xor_srcs = percpu->scribble; 915 struct page **xor_srcs = percpu->scribble;
916 int count = 0, pd_idx = sh->pd_idx, i; 916 int count = 0, pd_idx = sh->pd_idx, i;
917 struct async_submit_ctl submit; 917 struct async_submit_ctl submit;
918 918
919 /* existing parity data subtracted */ 919 /* existing parity data subtracted */
920 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 920 struct page *xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
921 921
922 pr_debug("%s: stripe %llu\n", __func__, 922 pr_debug("%s: stripe %llu\n", __func__,
923 (unsigned long long)sh->sector); 923 (unsigned long long)sh->sector);
924 924
925 for (i = disks; i--; ) { 925 for (i = disks; i--; ) {
926 struct r5dev *dev = &sh->dev[i]; 926 struct r5dev *dev = &sh->dev[i];
927 /* Only process blocks that are known to be uptodate */ 927 /* Only process blocks that are known to be uptodate */
928 if (test_bit(R5_Wantdrain, &dev->flags)) 928 if (test_bit(R5_Wantdrain, &dev->flags))
929 xor_srcs[count++] = dev->page; 929 xor_srcs[count++] = dev->page;
930 } 930 }
931 931
932 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, 932 init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx,
933 ops_complete_prexor, sh, to_addr_conv(sh, percpu)); 933 ops_complete_prexor, sh, to_addr_conv(sh, percpu));
934 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 934 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
935 935
936 return tx; 936 return tx;
937 } 937 }
938 938
939 static struct dma_async_tx_descriptor * 939 static struct dma_async_tx_descriptor *
940 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx) 940 ops_run_biodrain(struct stripe_head *sh, struct dma_async_tx_descriptor *tx)
941 { 941 {
942 int disks = sh->disks; 942 int disks = sh->disks;
943 int i; 943 int i;
944 944
945 pr_debug("%s: stripe %llu\n", __func__, 945 pr_debug("%s: stripe %llu\n", __func__,
946 (unsigned long long)sh->sector); 946 (unsigned long long)sh->sector);
947 947
948 for (i = disks; i--; ) { 948 for (i = disks; i--; ) {
949 struct r5dev *dev = &sh->dev[i]; 949 struct r5dev *dev = &sh->dev[i];
950 struct bio *chosen; 950 struct bio *chosen;
951 951
952 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) { 952 if (test_and_clear_bit(R5_Wantdrain, &dev->flags)) {
953 struct bio *wbi; 953 struct bio *wbi;
954 954
955 spin_lock(&sh->lock); 955 spin_lock(&sh->lock);
956 chosen = dev->towrite; 956 chosen = dev->towrite;
957 dev->towrite = NULL; 957 dev->towrite = NULL;
958 BUG_ON(dev->written); 958 BUG_ON(dev->written);
959 wbi = dev->written = chosen; 959 wbi = dev->written = chosen;
960 spin_unlock(&sh->lock); 960 spin_unlock(&sh->lock);
961 961
962 while (wbi && wbi->bi_sector < 962 while (wbi && wbi->bi_sector <
963 dev->sector + STRIPE_SECTORS) { 963 dev->sector + STRIPE_SECTORS) {
964 tx = async_copy_data(1, wbi, dev->page, 964 tx = async_copy_data(1, wbi, dev->page,
965 dev->sector, tx); 965 dev->sector, tx);
966 wbi = r5_next_bio(wbi, dev->sector); 966 wbi = r5_next_bio(wbi, dev->sector);
967 } 967 }
968 } 968 }
969 } 969 }
970 970
971 return tx; 971 return tx;
972 } 972 }
973 973
974 static void ops_complete_reconstruct(void *stripe_head_ref) 974 static void ops_complete_reconstruct(void *stripe_head_ref)
975 { 975 {
976 struct stripe_head *sh = stripe_head_ref; 976 struct stripe_head *sh = stripe_head_ref;
977 int disks = sh->disks; 977 int disks = sh->disks;
978 int pd_idx = sh->pd_idx; 978 int pd_idx = sh->pd_idx;
979 int qd_idx = sh->qd_idx; 979 int qd_idx = sh->qd_idx;
980 int i; 980 int i;
981 981
982 pr_debug("%s: stripe %llu\n", __func__, 982 pr_debug("%s: stripe %llu\n", __func__,
983 (unsigned long long)sh->sector); 983 (unsigned long long)sh->sector);
984 984
985 for (i = disks; i--; ) { 985 for (i = disks; i--; ) {
986 struct r5dev *dev = &sh->dev[i]; 986 struct r5dev *dev = &sh->dev[i];
987 987
988 if (dev->written || i == pd_idx || i == qd_idx) 988 if (dev->written || i == pd_idx || i == qd_idx)
989 set_bit(R5_UPTODATE, &dev->flags); 989 set_bit(R5_UPTODATE, &dev->flags);
990 } 990 }
991 991
992 if (sh->reconstruct_state == reconstruct_state_drain_run) 992 if (sh->reconstruct_state == reconstruct_state_drain_run)
993 sh->reconstruct_state = reconstruct_state_drain_result; 993 sh->reconstruct_state = reconstruct_state_drain_result;
994 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) 994 else if (sh->reconstruct_state == reconstruct_state_prexor_drain_run)
995 sh->reconstruct_state = reconstruct_state_prexor_drain_result; 995 sh->reconstruct_state = reconstruct_state_prexor_drain_result;
996 else { 996 else {
997 BUG_ON(sh->reconstruct_state != reconstruct_state_run); 997 BUG_ON(sh->reconstruct_state != reconstruct_state_run);
998 sh->reconstruct_state = reconstruct_state_result; 998 sh->reconstruct_state = reconstruct_state_result;
999 } 999 }
1000 1000
1001 set_bit(STRIPE_HANDLE, &sh->state); 1001 set_bit(STRIPE_HANDLE, &sh->state);
1002 release_stripe(sh); 1002 release_stripe(sh);
1003 } 1003 }
1004 1004
1005 static void 1005 static void
1006 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu, 1006 ops_run_reconstruct5(struct stripe_head *sh, struct raid5_percpu *percpu,
1007 struct dma_async_tx_descriptor *tx) 1007 struct dma_async_tx_descriptor *tx)
1008 { 1008 {
1009 int disks = sh->disks; 1009 int disks = sh->disks;
1010 struct page **xor_srcs = percpu->scribble; 1010 struct page **xor_srcs = percpu->scribble;
1011 struct async_submit_ctl submit; 1011 struct async_submit_ctl submit;
1012 int count = 0, pd_idx = sh->pd_idx, i; 1012 int count = 0, pd_idx = sh->pd_idx, i;
1013 struct page *xor_dest; 1013 struct page *xor_dest;
1014 int prexor = 0; 1014 int prexor = 0;
1015 unsigned long flags; 1015 unsigned long flags;
1016 1016
1017 pr_debug("%s: stripe %llu\n", __func__, 1017 pr_debug("%s: stripe %llu\n", __func__,
1018 (unsigned long long)sh->sector); 1018 (unsigned long long)sh->sector);
1019 1019
1020 /* check if prexor is active which means only process blocks 1020 /* check if prexor is active which means only process blocks
1021 * that are part of a read-modify-write (written) 1021 * that are part of a read-modify-write (written)
1022 */ 1022 */
1023 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) { 1023 if (sh->reconstruct_state == reconstruct_state_prexor_drain_run) {
1024 prexor = 1; 1024 prexor = 1;
1025 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page; 1025 xor_dest = xor_srcs[count++] = sh->dev[pd_idx].page;
1026 for (i = disks; i--; ) { 1026 for (i = disks; i--; ) {
1027 struct r5dev *dev = &sh->dev[i]; 1027 struct r5dev *dev = &sh->dev[i];
1028 if (dev->written) 1028 if (dev->written)
1029 xor_srcs[count++] = dev->page; 1029 xor_srcs[count++] = dev->page;
1030 } 1030 }
1031 } else { 1031 } else {
1032 xor_dest = sh->dev[pd_idx].page; 1032 xor_dest = sh->dev[pd_idx].page;
1033 for (i = disks; i--; ) { 1033 for (i = disks; i--; ) {
1034 struct r5dev *dev = &sh->dev[i]; 1034 struct r5dev *dev = &sh->dev[i];
1035 if (i != pd_idx) 1035 if (i != pd_idx)
1036 xor_srcs[count++] = dev->page; 1036 xor_srcs[count++] = dev->page;
1037 } 1037 }
1038 } 1038 }
1039 1039
1040 /* 1/ if we prexor'd then the dest is reused as a source 1040 /* 1/ if we prexor'd then the dest is reused as a source
1041 * 2/ if we did not prexor then we are redoing the parity 1041 * 2/ if we did not prexor then we are redoing the parity
1042 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST 1042 * set ASYNC_TX_XOR_DROP_DST and ASYNC_TX_XOR_ZERO_DST
1043 * for the synchronous xor case 1043 * for the synchronous xor case
1044 */ 1044 */
1045 flags = ASYNC_TX_ACK | 1045 flags = ASYNC_TX_ACK |
1046 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST); 1046 (prexor ? ASYNC_TX_XOR_DROP_DST : ASYNC_TX_XOR_ZERO_DST);
1047 1047
1048 atomic_inc(&sh->count); 1048 atomic_inc(&sh->count);
1049 1049
1050 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh, 1050 init_async_submit(&submit, flags, tx, ops_complete_reconstruct, sh,
1051 to_addr_conv(sh, percpu)); 1051 to_addr_conv(sh, percpu));
1052 if (unlikely(count == 1)) 1052 if (unlikely(count == 1))
1053 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); 1053 tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit);
1054 else 1054 else
1055 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); 1055 tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit);
1056 } 1056 }
1057 1057
1058 static void 1058 static void
1059 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu, 1059 ops_run_reconstruct6(struct stripe_head *sh, struct raid5_percpu *percpu,
1060 struct dma_async_tx_descriptor *tx) 1060 struct dma_async_tx_descriptor *tx)
1061 { 1061 {
1062 struct async_submit_ctl submit; 1062 struct async_submit_ctl submit;
1063 struct page **blocks = percpu->scribble; 1063 struct page **blocks = percpu->scribble;
1064 int count; 1064 int count;
1065 1065
1066 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); 1066 pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector);
1067 1067
1068 count = set_syndrome_sources(blocks, sh); 1068 count = set_syndrome_sources(blocks, sh);
1069 1069
1070 atomic_inc(&sh->count); 1070 atomic_inc(&sh->count);
1071 1071
1072 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct, 1072 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_reconstruct,
1073 sh, to_addr_conv(sh, percpu)); 1073 sh, to_addr_conv(sh, percpu));
1074 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); 1074 async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit);
1075 } 1075 }
1076 1076
1077 static void ops_complete_check(void *stripe_head_ref) 1077 static void ops_complete_check(void *stripe_head_ref)
1078 { 1078 {
1079 struct stripe_head *sh = stripe_head_ref; 1079 struct stripe_head *sh = stripe_head_ref;
1080 1080
1081 pr_debug("%s: stripe %llu\n", __func__, 1081 pr_debug("%s: stripe %llu\n", __func__,
1082 (unsigned long long)sh->sector); 1082 (unsigned long long)sh->sector);
1083 1083
1084 sh->check_state = check_state_check_result; 1084 sh->check_state = check_state_check_result;
1085 set_bit(STRIPE_HANDLE, &sh->state); 1085 set_bit(STRIPE_HANDLE, &sh->state);
1086 release_stripe(sh); 1086 release_stripe(sh);
1087 } 1087 }
1088 1088
1089 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) 1089 static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu)
1090 { 1090 {
1091 int disks = sh->disks; 1091 int disks = sh->disks;
1092 int pd_idx = sh->pd_idx; 1092 int pd_idx = sh->pd_idx;
1093 int qd_idx = sh->qd_idx; 1093 int qd_idx = sh->qd_idx;
1094 struct page *xor_dest; 1094 struct page *xor_dest;
1095 struct page **xor_srcs = percpu->scribble; 1095 struct page **xor_srcs = percpu->scribble;
1096 struct dma_async_tx_descriptor *tx; 1096 struct dma_async_tx_descriptor *tx;
1097 struct async_submit_ctl submit; 1097 struct async_submit_ctl submit;
1098 int count; 1098 int count;
1099 int i; 1099 int i;
1100 1100
1101 pr_debug("%s: stripe %llu\n", __func__, 1101 pr_debug("%s: stripe %llu\n", __func__,
1102 (unsigned long long)sh->sector); 1102 (unsigned long long)sh->sector);
1103 1103
1104 count = 0; 1104 count = 0;
1105 xor_dest = sh->dev[pd_idx].page; 1105 xor_dest = sh->dev[pd_idx].page;
1106 xor_srcs[count++] = xor_dest; 1106 xor_srcs[count++] = xor_dest;
1107 for (i = disks; i--; ) { 1107 for (i = disks; i--; ) {
1108 if (i == pd_idx || i == qd_idx) 1108 if (i == pd_idx || i == qd_idx)
1109 continue; 1109 continue;
1110 xor_srcs[count++] = sh->dev[i].page; 1110 xor_srcs[count++] = sh->dev[i].page;
1111 } 1111 }
1112 1112
1113 init_async_submit(&submit, 0, NULL, NULL, NULL, 1113 init_async_submit(&submit, 0, NULL, NULL, NULL,
1114 to_addr_conv(sh, percpu)); 1114 to_addr_conv(sh, percpu));
1115 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, 1115 tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE,
1116 &sh->ops.zero_sum_result, &submit); 1116 &sh->ops.zero_sum_result, &submit);
1117 1117
1118 atomic_inc(&sh->count); 1118 atomic_inc(&sh->count);
1119 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL); 1119 init_async_submit(&submit, ASYNC_TX_ACK, tx, ops_complete_check, sh, NULL);
1120 tx = async_trigger_callback(&submit); 1120 tx = async_trigger_callback(&submit);
1121 } 1121 }
1122 1122
1123 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp) 1123 static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu, int checkp)
1124 { 1124 {
1125 struct page **srcs = percpu->scribble; 1125 struct page **srcs = percpu->scribble;
1126 struct async_submit_ctl submit; 1126 struct async_submit_ctl submit;
1127 int count; 1127 int count;
1128 1128
1129 pr_debug("%s: stripe %llu checkp: %d\n", __func__, 1129 pr_debug("%s: stripe %llu checkp: %d\n", __func__,
1130 (unsigned long long)sh->sector, checkp); 1130 (unsigned long long)sh->sector, checkp);
1131 1131
1132 count = set_syndrome_sources(srcs, sh); 1132 count = set_syndrome_sources(srcs, sh);
1133 if (!checkp) 1133 if (!checkp)
1134 srcs[count] = NULL; 1134 srcs[count] = NULL;
1135 1135
1136 atomic_inc(&sh->count); 1136 atomic_inc(&sh->count);
1137 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, 1137 init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check,
1138 sh, to_addr_conv(sh, percpu)); 1138 sh, to_addr_conv(sh, percpu));
1139 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, 1139 async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE,
1140 &sh->ops.zero_sum_result, percpu->spare_page, &submit); 1140 &sh->ops.zero_sum_result, percpu->spare_page, &submit);
1141 } 1141 }
1142 1142
1143 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1143 static void __raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1144 { 1144 {
1145 int overlap_clear = 0, i, disks = sh->disks; 1145 int overlap_clear = 0, i, disks = sh->disks;
1146 struct dma_async_tx_descriptor *tx = NULL; 1146 struct dma_async_tx_descriptor *tx = NULL;
1147 raid5_conf_t *conf = sh->raid_conf; 1147 raid5_conf_t *conf = sh->raid_conf;
1148 int level = conf->level; 1148 int level = conf->level;
1149 struct raid5_percpu *percpu; 1149 struct raid5_percpu *percpu;
1150 unsigned long cpu; 1150 unsigned long cpu;
1151 1151
1152 cpu = get_cpu(); 1152 cpu = get_cpu();
1153 percpu = per_cpu_ptr(conf->percpu, cpu); 1153 percpu = per_cpu_ptr(conf->percpu, cpu);
1154 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { 1154 if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) {
1155 ops_run_biofill(sh); 1155 ops_run_biofill(sh);
1156 overlap_clear++; 1156 overlap_clear++;
1157 } 1157 }
1158 1158
1159 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) { 1159 if (test_bit(STRIPE_OP_COMPUTE_BLK, &ops_request)) {
1160 if (level < 6) 1160 if (level < 6)
1161 tx = ops_run_compute5(sh, percpu); 1161 tx = ops_run_compute5(sh, percpu);
1162 else { 1162 else {
1163 if (sh->ops.target2 < 0 || sh->ops.target < 0) 1163 if (sh->ops.target2 < 0 || sh->ops.target < 0)
1164 tx = ops_run_compute6_1(sh, percpu); 1164 tx = ops_run_compute6_1(sh, percpu);
1165 else 1165 else
1166 tx = ops_run_compute6_2(sh, percpu); 1166 tx = ops_run_compute6_2(sh, percpu);
1167 } 1167 }
1168 /* terminate the chain if reconstruct is not set to be run */ 1168 /* terminate the chain if reconstruct is not set to be run */
1169 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) 1169 if (tx && !test_bit(STRIPE_OP_RECONSTRUCT, &ops_request))
1170 async_tx_ack(tx); 1170 async_tx_ack(tx);
1171 } 1171 }
1172 1172
1173 if (test_bit(STRIPE_OP_PREXOR, &ops_request)) 1173 if (test_bit(STRIPE_OP_PREXOR, &ops_request))
1174 tx = ops_run_prexor(sh, percpu, tx); 1174 tx = ops_run_prexor(sh, percpu, tx);
1175 1175
1176 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) { 1176 if (test_bit(STRIPE_OP_BIODRAIN, &ops_request)) {
1177 tx = ops_run_biodrain(sh, tx); 1177 tx = ops_run_biodrain(sh, tx);
1178 overlap_clear++; 1178 overlap_clear++;
1179 } 1179 }
1180 1180
1181 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) { 1181 if (test_bit(STRIPE_OP_RECONSTRUCT, &ops_request)) {
1182 if (level < 6) 1182 if (level < 6)
1183 ops_run_reconstruct5(sh, percpu, tx); 1183 ops_run_reconstruct5(sh, percpu, tx);
1184 else 1184 else
1185 ops_run_reconstruct6(sh, percpu, tx); 1185 ops_run_reconstruct6(sh, percpu, tx);
1186 } 1186 }
1187 1187
1188 if (test_bit(STRIPE_OP_CHECK, &ops_request)) { 1188 if (test_bit(STRIPE_OP_CHECK, &ops_request)) {
1189 if (sh->check_state == check_state_run) 1189 if (sh->check_state == check_state_run)
1190 ops_run_check_p(sh, percpu); 1190 ops_run_check_p(sh, percpu);
1191 else if (sh->check_state == check_state_run_q) 1191 else if (sh->check_state == check_state_run_q)
1192 ops_run_check_pq(sh, percpu, 0); 1192 ops_run_check_pq(sh, percpu, 0);
1193 else if (sh->check_state == check_state_run_pq) 1193 else if (sh->check_state == check_state_run_pq)
1194 ops_run_check_pq(sh, percpu, 1); 1194 ops_run_check_pq(sh, percpu, 1);
1195 else 1195 else
1196 BUG(); 1196 BUG();
1197 } 1197 }
1198 1198
1199 if (overlap_clear) 1199 if (overlap_clear)
1200 for (i = disks; i--; ) { 1200 for (i = disks; i--; ) {
1201 struct r5dev *dev = &sh->dev[i]; 1201 struct r5dev *dev = &sh->dev[i];
1202 if (test_and_clear_bit(R5_Overlap, &dev->flags)) 1202 if (test_and_clear_bit(R5_Overlap, &dev->flags))
1203 wake_up(&sh->raid_conf->wait_for_overlap); 1203 wake_up(&sh->raid_conf->wait_for_overlap);
1204 } 1204 }
1205 put_cpu(); 1205 put_cpu();
1206 } 1206 }
1207 1207
1208 #ifdef CONFIG_MULTICORE_RAID456 1208 #ifdef CONFIG_MULTICORE_RAID456
1209 static void async_run_ops(void *param, async_cookie_t cookie) 1209 static void async_run_ops(void *param, async_cookie_t cookie)
1210 { 1210 {
1211 struct stripe_head *sh = param; 1211 struct stripe_head *sh = param;
1212 unsigned long ops_request = sh->ops.request; 1212 unsigned long ops_request = sh->ops.request;
1213 1213
1214 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state); 1214 clear_bit_unlock(STRIPE_OPS_REQ_PENDING, &sh->state);
1215 wake_up(&sh->ops.wait_for_ops); 1215 wake_up(&sh->ops.wait_for_ops);
1216 1216
1217 __raid_run_ops(sh, ops_request); 1217 __raid_run_ops(sh, ops_request);
1218 release_stripe(sh); 1218 release_stripe(sh);
1219 } 1219 }
1220 1220
1221 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) 1221 static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request)
1222 { 1222 {
1223 /* since handle_stripe can be called outside of raid5d context 1223 /* since handle_stripe can be called outside of raid5d context
1224 * we need to ensure sh->ops.request is de-staged before another 1224 * we need to ensure sh->ops.request is de-staged before another
1225 * request arrives 1225 * request arrives
1226 */ 1226 */
1227 wait_event(sh->ops.wait_for_ops, 1227 wait_event(sh->ops.wait_for_ops,
1228 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state)); 1228 !test_and_set_bit_lock(STRIPE_OPS_REQ_PENDING, &sh->state));
1229 sh->ops.request = ops_request; 1229 sh->ops.request = ops_request;
1230 1230
1231 atomic_inc(&sh->count); 1231 atomic_inc(&sh->count);
1232 async_schedule(async_run_ops, sh); 1232 async_schedule(async_run_ops, sh);
1233 } 1233 }
1234 #else 1234 #else
1235 #define raid_run_ops __raid_run_ops 1235 #define raid_run_ops __raid_run_ops
1236 #endif 1236 #endif
1237 1237
1238 static int grow_one_stripe(raid5_conf_t *conf) 1238 static int grow_one_stripe(raid5_conf_t *conf)
1239 { 1239 {
1240 struct stripe_head *sh; 1240 struct stripe_head *sh;
1241 int disks = max(conf->raid_disks, conf->previous_raid_disks);
1241 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL); 1242 sh = kmem_cache_alloc(conf->slab_cache, GFP_KERNEL);
1242 if (!sh) 1243 if (!sh)
1243 return 0; 1244 return 0;
1244 memset(sh, 0, sizeof(*sh) + (conf->raid_disks-1)*sizeof(struct r5dev)); 1245 memset(sh, 0, sizeof(*sh) + (disks-1)*sizeof(struct r5dev));
1245 sh->raid_conf = conf; 1246 sh->raid_conf = conf;
1246 spin_lock_init(&sh->lock); 1247 spin_lock_init(&sh->lock);
1247 #ifdef CONFIG_MULTICORE_RAID456 1248 #ifdef CONFIG_MULTICORE_RAID456
1248 init_waitqueue_head(&sh->ops.wait_for_ops); 1249 init_waitqueue_head(&sh->ops.wait_for_ops);
1249 #endif 1250 #endif
1250 1251
1251 if (grow_buffers(sh, conf->raid_disks)) { 1252 if (grow_buffers(sh, disks)) {
1252 shrink_buffers(sh, conf->raid_disks); 1253 shrink_buffers(sh, disks);
1253 kmem_cache_free(conf->slab_cache, sh); 1254 kmem_cache_free(conf->slab_cache, sh);
1254 return 0; 1255 return 0;
1255 } 1256 }
1256 sh->disks = conf->raid_disks;
1257 /* we just created an active stripe so... */ 1257 /* we just created an active stripe so... */
1258 atomic_set(&sh->count, 1); 1258 atomic_set(&sh->count, 1);
1259 atomic_inc(&conf->active_stripes); 1259 atomic_inc(&conf->active_stripes);
1260 INIT_LIST_HEAD(&sh->lru); 1260 INIT_LIST_HEAD(&sh->lru);
1261 release_stripe(sh); 1261 release_stripe(sh);
1262 return 1; 1262 return 1;
1263 } 1263 }
1264 1264
1265 static int grow_stripes(raid5_conf_t *conf, int num) 1265 static int grow_stripes(raid5_conf_t *conf, int num)
1266 { 1266 {
1267 struct kmem_cache *sc; 1267 struct kmem_cache *sc;
1268 int devs = conf->raid_disks; 1268 int devs = max(conf->raid_disks, conf->previous_raid_disks);
1269 1269
1270 sprintf(conf->cache_name[0], 1270 sprintf(conf->cache_name[0],
1271 "raid%d-%s", conf->level, mdname(conf->mddev)); 1271 "raid%d-%s", conf->level, mdname(conf->mddev));
1272 sprintf(conf->cache_name[1], 1272 sprintf(conf->cache_name[1],
1273 "raid%d-%s-alt", conf->level, mdname(conf->mddev)); 1273 "raid%d-%s-alt", conf->level, mdname(conf->mddev));
1274 conf->active_name = 0; 1274 conf->active_name = 0;
1275 sc = kmem_cache_create(conf->cache_name[conf->active_name], 1275 sc = kmem_cache_create(conf->cache_name[conf->active_name],
1276 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev), 1276 sizeof(struct stripe_head)+(devs-1)*sizeof(struct r5dev),
1277 0, 0, NULL); 1277 0, 0, NULL);
1278 if (!sc) 1278 if (!sc)
1279 return 1; 1279 return 1;
1280 conf->slab_cache = sc; 1280 conf->slab_cache = sc;
1281 conf->pool_size = devs; 1281 conf->pool_size = devs;
1282 while (num--) 1282 while (num--)
1283 if (!grow_one_stripe(conf)) 1283 if (!grow_one_stripe(conf))
1284 return 1; 1284 return 1;
1285 return 0; 1285 return 0;
1286 } 1286 }
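grow_one_stripe() and grow_stripes() above size every stripe_head for the larger of conf->raid_disks and conf->previous_raid_disks. A minimal stand-alone sketch of that sizing arithmetic follows (the struct layout is simplified and the device counts are hypothetical, not taken from the driver):

    #include <stdio.h>
    #include <stddef.h>

    /* simplified stand-ins for struct r5dev / struct stripe_head */
    struct r5dev_demo { void *page; };
    struct stripe_head_demo {
            int disks;
            struct r5dev_demo dev[1];       /* tail array, grown per device as in the driver */
    };

    static size_t stripe_head_size(int raid_disks, int previous_raid_disks)
    {
            /* use the larger of the old and new device counts */
            int disks = raid_disks > previous_raid_disks ?
                    raid_disks : previous_raid_disks;

            return sizeof(struct stripe_head_demo) +
                    (disks - 1) * sizeof(struct r5dev_demo);
    }

    int main(void)
    {
            /* e.g. 4 current devices, 6 previous devices */
            printf("%zu bytes per stripe_head\n", stripe_head_size(4, 6));
            return 0;
    }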
1287 1287
1288 /** 1288 /**
1289 * scribble_len - return the required size of the scribble region 1289 * scribble_len - return the required size of the scribble region
1290 * @num - total number of disks in the array 1290 * @num - total number of disks in the array
1291 * 1291 *
1292 * The size must be enough to contain: 1292 * The size must be enough to contain:
1293 * 1/ a struct page pointer for each device in the array +2 1293 * 1/ a struct page pointer for each device in the array +2
1294 * 2/ room to convert each entry in (1) to its corresponding dma 1294 * 2/ room to convert each entry in (1) to its corresponding dma
1295 * (dma_map_page()) or page (page_address()) address. 1295 * (dma_map_page()) or page (page_address()) address.
1296 * 1296 *
1297 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we 1297 * Note: the +2 is for the destination buffers of the ddf/raid6 case where we
1298 * calculate over all devices (not just the data blocks), using zeros in place 1298 * calculate over all devices (not just the data blocks), using zeros in place
1299 * of the P and Q blocks. 1299 * of the P and Q blocks.
1300 */ 1300 */
1301 static size_t scribble_len(int num) 1301 static size_t scribble_len(int num)
1302 { 1302 {
1303 size_t len; 1303 size_t len;
1304 1304
1305 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2); 1305 len = sizeof(struct page *) * (num+2) + sizeof(addr_conv_t) * (num+2);
1306 1306
1307 return len; 1307 return len;
1308 } 1308 }
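A quick stand-alone check of the arithmetic in scribble_len() (addr_conv_t is replaced by a placeholder typedef here; in the driver it comes from the async_tx headers):

    #include <stdio.h>
    #include <stddef.h>

    typedef void *addr_conv_demo_t;         /* placeholder for the driver's addr_conv_t */

    static size_t scribble_len_demo(int num)
    {
            /* (num+2) page pointers plus (num+2) address-conversion slots */
            return sizeof(void *) * (num + 2) + sizeof(addr_conv_demo_t) * (num + 2);
    }

    int main(void)
    {
            /* e.g. a 6-device raid6 array */
            printf("%zu bytes of scribble space\n", scribble_len_demo(6));
            return 0;
    }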
1309 1309
1310 static int resize_stripes(raid5_conf_t *conf, int newsize) 1310 static int resize_stripes(raid5_conf_t *conf, int newsize)
1311 { 1311 {
1312 /* Make all the stripes able to hold 'newsize' devices. 1312 /* Make all the stripes able to hold 'newsize' devices.
1313 * New slots in each stripe get 'page' set to a new page. 1313 * New slots in each stripe get 'page' set to a new page.
1314 * 1314 *
1315 * This happens in stages: 1315 * This happens in stages:
1316 * 1/ create a new kmem_cache and allocate the required number of 1316 * 1/ create a new kmem_cache and allocate the required number of
1317 * stripe_heads. 1317 * stripe_heads.
1318 * 2/ gather all the old stripe_heads and transfer the pages across 1318 * 2/ gather all the old stripe_heads and transfer the pages across
1319 * to the new stripe_heads. This will have the side effect of 1319 * to the new stripe_heads. This will have the side effect of
1320 * freezing the array as once all stripe_heads have been collected, 1320 * freezing the array as once all stripe_heads have been collected,
1321 * no IO will be possible. Old stripe heads are freed once their 1321 * no IO will be possible. Old stripe heads are freed once their
1322 * pages have been transferred over, and the old kmem_cache is 1322 * pages have been transferred over, and the old kmem_cache is
1323 * freed when all stripes are done. 1323 * freed when all stripes are done.
1324 * 3/ reallocate conf->disks to be suitably bigger. If this fails, 1324 * 3/ reallocate conf->disks to be suitably bigger. If this fails,
1325 * we simply return a failure status - no need to clean anything up. 1325 * we simply return a failure status - no need to clean anything up.
1326 * 4/ allocate new pages for the new slots in the new stripe_heads. 1326 * 4/ allocate new pages for the new slots in the new stripe_heads.
1327 * If this fails, we don't bother trying to shrink the 1327 * If this fails, we don't bother trying to shrink the
1328 * stripe_heads down again, we just leave them as they are. 1328 * stripe_heads down again, we just leave them as they are.
1329 * As each stripe_head is processed the new one is released into 1329 * As each stripe_head is processed the new one is released into
1330 * active service. 1330 * active service.
1331 * 1331 *
1332 * Once step2 is started, we cannot afford to wait for a write, 1332 * Once step2 is started, we cannot afford to wait for a write,
1333 * so we use GFP_NOIO allocations. 1333 * so we use GFP_NOIO allocations.
1334 */ 1334 */
1335 struct stripe_head *osh, *nsh; 1335 struct stripe_head *osh, *nsh;
1336 LIST_HEAD(newstripes); 1336 LIST_HEAD(newstripes);
1337 struct disk_info *ndisks; 1337 struct disk_info *ndisks;
1338 unsigned long cpu; 1338 unsigned long cpu;
1339 int err; 1339 int err;
1340 struct kmem_cache *sc; 1340 struct kmem_cache *sc;
1341 int i; 1341 int i;
1342 1342
1343 if (newsize <= conf->pool_size) 1343 if (newsize <= conf->pool_size)
1344 return 0; /* never bother to shrink */ 1344 return 0; /* never bother to shrink */
1345 1345
1346 err = md_allow_write(conf->mddev); 1346 err = md_allow_write(conf->mddev);
1347 if (err) 1347 if (err)
1348 return err; 1348 return err;
1349 1349
1350 /* Step 1 */ 1350 /* Step 1 */
1351 sc = kmem_cache_create(conf->cache_name[1-conf->active_name], 1351 sc = kmem_cache_create(conf->cache_name[1-conf->active_name],
1352 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev), 1352 sizeof(struct stripe_head)+(newsize-1)*sizeof(struct r5dev),
1353 0, 0, NULL); 1353 0, 0, NULL);
1354 if (!sc) 1354 if (!sc)
1355 return -ENOMEM; 1355 return -ENOMEM;
1356 1356
1357 for (i = conf->max_nr_stripes; i; i--) { 1357 for (i = conf->max_nr_stripes; i; i--) {
1358 nsh = kmem_cache_alloc(sc, GFP_KERNEL); 1358 nsh = kmem_cache_alloc(sc, GFP_KERNEL);
1359 if (!nsh) 1359 if (!nsh)
1360 break; 1360 break;
1361 1361
1362 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev)); 1362 memset(nsh, 0, sizeof(*nsh) + (newsize-1)*sizeof(struct r5dev));
1363 1363
1364 nsh->raid_conf = conf; 1364 nsh->raid_conf = conf;
1365 spin_lock_init(&nsh->lock); 1365 spin_lock_init(&nsh->lock);
1366 #ifdef CONFIG_MULTICORE_RAID456 1366 #ifdef CONFIG_MULTICORE_RAID456
1367 init_waitqueue_head(&nsh->ops.wait_for_ops); 1367 init_waitqueue_head(&nsh->ops.wait_for_ops);
1368 #endif 1368 #endif
1369 1369
1370 list_add(&nsh->lru, &newstripes); 1370 list_add(&nsh->lru, &newstripes);
1371 } 1371 }
1372 if (i) { 1372 if (i) {
1373 /* didn't get enough, give up */ 1373 /* didn't get enough, give up */
1374 while (!list_empty(&newstripes)) { 1374 while (!list_empty(&newstripes)) {
1375 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1375 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1376 list_del(&nsh->lru); 1376 list_del(&nsh->lru);
1377 kmem_cache_free(sc, nsh); 1377 kmem_cache_free(sc, nsh);
1378 } 1378 }
1379 kmem_cache_destroy(sc); 1379 kmem_cache_destroy(sc);
1380 return -ENOMEM; 1380 return -ENOMEM;
1381 } 1381 }
1382 /* Step 2 - Must use GFP_NOIO now. 1382 /* Step 2 - Must use GFP_NOIO now.
1383 * OK, we have enough stripes, start collecting inactive 1383 * OK, we have enough stripes, start collecting inactive
1384 * stripes and copying them over 1384 * stripes and copying them over
1385 */ 1385 */
1386 list_for_each_entry(nsh, &newstripes, lru) { 1386 list_for_each_entry(nsh, &newstripes, lru) {
1387 spin_lock_irq(&conf->device_lock); 1387 spin_lock_irq(&conf->device_lock);
1388 wait_event_lock_irq(conf->wait_for_stripe, 1388 wait_event_lock_irq(conf->wait_for_stripe,
1389 !list_empty(&conf->inactive_list), 1389 !list_empty(&conf->inactive_list),
1390 conf->device_lock, 1390 conf->device_lock,
1391 unplug_slaves(conf->mddev) 1391 unplug_slaves(conf->mddev)
1392 ); 1392 );
1393 osh = get_free_stripe(conf); 1393 osh = get_free_stripe(conf);
1394 spin_unlock_irq(&conf->device_lock); 1394 spin_unlock_irq(&conf->device_lock);
1395 atomic_set(&nsh->count, 1); 1395 atomic_set(&nsh->count, 1);
1396 for(i=0; i<conf->pool_size; i++) 1396 for(i=0; i<conf->pool_size; i++)
1397 nsh->dev[i].page = osh->dev[i].page; 1397 nsh->dev[i].page = osh->dev[i].page;
1398 for( ; i<newsize; i++) 1398 for( ; i<newsize; i++)
1399 nsh->dev[i].page = NULL; 1399 nsh->dev[i].page = NULL;
1400 kmem_cache_free(conf->slab_cache, osh); 1400 kmem_cache_free(conf->slab_cache, osh);
1401 } 1401 }
1402 kmem_cache_destroy(conf->slab_cache); 1402 kmem_cache_destroy(conf->slab_cache);
1403 1403
1404 /* Step 3. 1404 /* Step 3.
1405 * At this point, we are holding all the stripes so the array 1405 * At this point, we are holding all the stripes so the array
1406 * is completely stalled, so now is a good time to resize 1406 * is completely stalled, so now is a good time to resize
1407 * conf->disks and the scribble region 1407 * conf->disks and the scribble region
1408 */ 1408 */
1409 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO); 1409 ndisks = kzalloc(newsize * sizeof(struct disk_info), GFP_NOIO);
1410 if (ndisks) { 1410 if (ndisks) {
1411 for (i=0; i<conf->raid_disks; i++) 1411 for (i=0; i<conf->raid_disks; i++)
1412 ndisks[i] = conf->disks[i]; 1412 ndisks[i] = conf->disks[i];
1413 kfree(conf->disks); 1413 kfree(conf->disks);
1414 conf->disks = ndisks; 1414 conf->disks = ndisks;
1415 } else 1415 } else
1416 err = -ENOMEM; 1416 err = -ENOMEM;
1417 1417
1418 get_online_cpus(); 1418 get_online_cpus();
1419 conf->scribble_len = scribble_len(newsize); 1419 conf->scribble_len = scribble_len(newsize);
1420 for_each_present_cpu(cpu) { 1420 for_each_present_cpu(cpu) {
1421 struct raid5_percpu *percpu; 1421 struct raid5_percpu *percpu;
1422 void *scribble; 1422 void *scribble;
1423 1423
1424 percpu = per_cpu_ptr(conf->percpu, cpu); 1424 percpu = per_cpu_ptr(conf->percpu, cpu);
1425 scribble = kmalloc(conf->scribble_len, GFP_NOIO); 1425 scribble = kmalloc(conf->scribble_len, GFP_NOIO);
1426 1426
1427 if (scribble) { 1427 if (scribble) {
1428 kfree(percpu->scribble); 1428 kfree(percpu->scribble);
1429 percpu->scribble = scribble; 1429 percpu->scribble = scribble;
1430 } else { 1430 } else {
1431 err = -ENOMEM; 1431 err = -ENOMEM;
1432 break; 1432 break;
1433 } 1433 }
1434 } 1434 }
1435 put_online_cpus(); 1435 put_online_cpus();
1436 1436
1437 /* Step 4, return new stripes to service */ 1437 /* Step 4, return new stripes to service */
1438 while(!list_empty(&newstripes)) { 1438 while(!list_empty(&newstripes)) {
1439 nsh = list_entry(newstripes.next, struct stripe_head, lru); 1439 nsh = list_entry(newstripes.next, struct stripe_head, lru);
1440 list_del_init(&nsh->lru); 1440 list_del_init(&nsh->lru);
1441 1441
1442 for (i=conf->raid_disks; i < newsize; i++) 1442 for (i=conf->raid_disks; i < newsize; i++)
1443 if (nsh->dev[i].page == NULL) { 1443 if (nsh->dev[i].page == NULL) {
1444 struct page *p = alloc_page(GFP_NOIO); 1444 struct page *p = alloc_page(GFP_NOIO);
1445 nsh->dev[i].page = p; 1445 nsh->dev[i].page = p;
1446 if (!p) 1446 if (!p)
1447 err = -ENOMEM; 1447 err = -ENOMEM;
1448 } 1448 }
1449 release_stripe(nsh); 1449 release_stripe(nsh);
1450 } 1450 }
1451 /* critical section passed, GFP_NOIO no longer needed */ 1451 /* critical section passed, GFP_NOIO no longer needed */
1452 1452
1453 conf->slab_cache = sc; 1453 conf->slab_cache = sc;
1454 conf->active_name = 1-conf->active_name; 1454 conf->active_name = 1-conf->active_name;
1455 conf->pool_size = newsize; 1455 conf->pool_size = newsize;
1456 return err; 1456 return err;
1457 } 1457 }
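The page hand-off that steps 2/ and 4/ of the comment above describe can be shown in isolation: a new stripe_head takes over the pages of an old one for the first pool_size slots, and only the extra tail slots need fresh allocations. A simplified sketch with hypothetical types (locking, the kmem_cache churn and full error handling are omitted):

    #include <stdlib.h>

    struct dev_demo { void *page; };
    struct sh_demo { struct dev_demo dev[16]; };    /* assumes newsize <= 16 */

    /* reuse the old pages, then allocate pages for the new slots */
    static int transfer_pages(struct sh_demo *osh, struct sh_demo *nsh,
                              int pool_size, int newsize)
    {
            int i;

            for (i = 0; i < pool_size; i++)
                    nsh->dev[i].page = osh->dev[i].page;    /* step 2: move pages across */
            for (; i < newsize; i++) {
                    nsh->dev[i].page = malloc(4096);        /* step 4: stand-in for alloc_page() */
                    if (!nsh->dev[i].page)
                            return -1;
            }
            return 0;
    }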
1458 1458
1459 static int drop_one_stripe(raid5_conf_t *conf) 1459 static int drop_one_stripe(raid5_conf_t *conf)
1460 { 1460 {
1461 struct stripe_head *sh; 1461 struct stripe_head *sh;
1462 1462
1463 spin_lock_irq(&conf->device_lock); 1463 spin_lock_irq(&conf->device_lock);
1464 sh = get_free_stripe(conf); 1464 sh = get_free_stripe(conf);
1465 spin_unlock_irq(&conf->device_lock); 1465 spin_unlock_irq(&conf->device_lock);
1466 if (!sh) 1466 if (!sh)
1467 return 0; 1467 return 0;
1468 BUG_ON(atomic_read(&sh->count)); 1468 BUG_ON(atomic_read(&sh->count));
1469 shrink_buffers(sh, conf->pool_size); 1469 shrink_buffers(sh, conf->pool_size);
1470 kmem_cache_free(conf->slab_cache, sh); 1470 kmem_cache_free(conf->slab_cache, sh);
1471 atomic_dec(&conf->active_stripes); 1471 atomic_dec(&conf->active_stripes);
1472 return 1; 1472 return 1;
1473 } 1473 }
1474 1474
1475 static void shrink_stripes(raid5_conf_t *conf) 1475 static void shrink_stripes(raid5_conf_t *conf)
1476 { 1476 {
1477 while (drop_one_stripe(conf)) 1477 while (drop_one_stripe(conf))
1478 ; 1478 ;
1479 1479
1480 if (conf->slab_cache) 1480 if (conf->slab_cache)
1481 kmem_cache_destroy(conf->slab_cache); 1481 kmem_cache_destroy(conf->slab_cache);
1482 conf->slab_cache = NULL; 1482 conf->slab_cache = NULL;
1483 } 1483 }
1484 1484
1485 static void raid5_end_read_request(struct bio * bi, int error) 1485 static void raid5_end_read_request(struct bio * bi, int error)
1486 { 1486 {
1487 struct stripe_head *sh = bi->bi_private; 1487 struct stripe_head *sh = bi->bi_private;
1488 raid5_conf_t *conf = sh->raid_conf; 1488 raid5_conf_t *conf = sh->raid_conf;
1489 int disks = sh->disks, i; 1489 int disks = sh->disks, i;
1490 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1490 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1491 char b[BDEVNAME_SIZE]; 1491 char b[BDEVNAME_SIZE];
1492 mdk_rdev_t *rdev; 1492 mdk_rdev_t *rdev;
1493 1493
1494 1494
1495 for (i=0 ; i<disks; i++) 1495 for (i=0 ; i<disks; i++)
1496 if (bi == &sh->dev[i].req) 1496 if (bi == &sh->dev[i].req)
1497 break; 1497 break;
1498 1498
1499 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n", 1499 pr_debug("end_read_request %llu/%d, count: %d, uptodate %d.\n",
1500 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1500 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1501 uptodate); 1501 uptodate);
1502 if (i == disks) { 1502 if (i == disks) {
1503 BUG(); 1503 BUG();
1504 return; 1504 return;
1505 } 1505 }
1506 1506
1507 if (uptodate) { 1507 if (uptodate) {
1508 set_bit(R5_UPTODATE, &sh->dev[i].flags); 1508 set_bit(R5_UPTODATE, &sh->dev[i].flags);
1509 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 1509 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
1510 rdev = conf->disks[i].rdev; 1510 rdev = conf->disks[i].rdev;
1511 printk_rl(KERN_INFO "raid5:%s: read error corrected" 1511 printk_rl(KERN_INFO "raid5:%s: read error corrected"
1512 " (%lu sectors at %llu on %s)\n", 1512 " (%lu sectors at %llu on %s)\n",
1513 mdname(conf->mddev), STRIPE_SECTORS, 1513 mdname(conf->mddev), STRIPE_SECTORS,
1514 (unsigned long long)(sh->sector 1514 (unsigned long long)(sh->sector
1515 + rdev->data_offset), 1515 + rdev->data_offset),
1516 bdevname(rdev->bdev, b)); 1516 bdevname(rdev->bdev, b));
1517 clear_bit(R5_ReadError, &sh->dev[i].flags); 1517 clear_bit(R5_ReadError, &sh->dev[i].flags);
1518 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1518 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1519 } 1519 }
1520 if (atomic_read(&conf->disks[i].rdev->read_errors)) 1520 if (atomic_read(&conf->disks[i].rdev->read_errors))
1521 atomic_set(&conf->disks[i].rdev->read_errors, 0); 1521 atomic_set(&conf->disks[i].rdev->read_errors, 0);
1522 } else { 1522 } else {
1523 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b); 1523 const char *bdn = bdevname(conf->disks[i].rdev->bdev, b);
1524 int retry = 0; 1524 int retry = 0;
1525 rdev = conf->disks[i].rdev; 1525 rdev = conf->disks[i].rdev;
1526 1526
1527 clear_bit(R5_UPTODATE, &sh->dev[i].flags); 1527 clear_bit(R5_UPTODATE, &sh->dev[i].flags);
1528 atomic_inc(&rdev->read_errors); 1528 atomic_inc(&rdev->read_errors);
1529 if (conf->mddev->degraded) 1529 if (conf->mddev->degraded)
1530 printk_rl(KERN_WARNING 1530 printk_rl(KERN_WARNING
1531 "raid5:%s: read error not correctable " 1531 "raid5:%s: read error not correctable "
1532 "(sector %llu on %s).\n", 1532 "(sector %llu on %s).\n",
1533 mdname(conf->mddev), 1533 mdname(conf->mddev),
1534 (unsigned long long)(sh->sector 1534 (unsigned long long)(sh->sector
1535 + rdev->data_offset), 1535 + rdev->data_offset),
1536 bdn); 1536 bdn);
1537 else if (test_bit(R5_ReWrite, &sh->dev[i].flags)) 1537 else if (test_bit(R5_ReWrite, &sh->dev[i].flags))
1538 /* Oh, no!!! */ 1538 /* Oh, no!!! */
1539 printk_rl(KERN_WARNING 1539 printk_rl(KERN_WARNING
1540 "raid5:%s: read error NOT corrected!! " 1540 "raid5:%s: read error NOT corrected!! "
1541 "(sector %llu on %s).\n", 1541 "(sector %llu on %s).\n",
1542 mdname(conf->mddev), 1542 mdname(conf->mddev),
1543 (unsigned long long)(sh->sector 1543 (unsigned long long)(sh->sector
1544 + rdev->data_offset), 1544 + rdev->data_offset),
1545 bdn); 1545 bdn);
1546 else if (atomic_read(&rdev->read_errors) 1546 else if (atomic_read(&rdev->read_errors)
1547 > conf->max_nr_stripes) 1547 > conf->max_nr_stripes)
1548 printk(KERN_WARNING 1548 printk(KERN_WARNING
1549 "raid5:%s: Too many read errors, failing device %s.\n", 1549 "raid5:%s: Too many read errors, failing device %s.\n",
1550 mdname(conf->mddev), bdn); 1550 mdname(conf->mddev), bdn);
1551 else 1551 else
1552 retry = 1; 1552 retry = 1;
1553 if (retry) 1553 if (retry)
1554 set_bit(R5_ReadError, &sh->dev[i].flags); 1554 set_bit(R5_ReadError, &sh->dev[i].flags);
1555 else { 1555 else {
1556 clear_bit(R5_ReadError, &sh->dev[i].flags); 1556 clear_bit(R5_ReadError, &sh->dev[i].flags);
1557 clear_bit(R5_ReWrite, &sh->dev[i].flags); 1557 clear_bit(R5_ReWrite, &sh->dev[i].flags);
1558 md_error(conf->mddev, rdev); 1558 md_error(conf->mddev, rdev);
1559 } 1559 }
1560 } 1560 }
1561 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1561 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1562 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1562 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1563 set_bit(STRIPE_HANDLE, &sh->state); 1563 set_bit(STRIPE_HANDLE, &sh->state);
1564 release_stripe(sh); 1564 release_stripe(sh);
1565 } 1565 }
1566 1566
1567 static void raid5_end_write_request(struct bio *bi, int error) 1567 static void raid5_end_write_request(struct bio *bi, int error)
1568 { 1568 {
1569 struct stripe_head *sh = bi->bi_private; 1569 struct stripe_head *sh = bi->bi_private;
1570 raid5_conf_t *conf = sh->raid_conf; 1570 raid5_conf_t *conf = sh->raid_conf;
1571 int disks = sh->disks, i; 1571 int disks = sh->disks, i;
1572 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 1572 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
1573 1573
1574 for (i=0 ; i<disks; i++) 1574 for (i=0 ; i<disks; i++)
1575 if (bi == &sh->dev[i].req) 1575 if (bi == &sh->dev[i].req)
1576 break; 1576 break;
1577 1577
1578 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n", 1578 pr_debug("end_write_request %llu/%d, count %d, uptodate: %d.\n",
1579 (unsigned long long)sh->sector, i, atomic_read(&sh->count), 1579 (unsigned long long)sh->sector, i, atomic_read(&sh->count),
1580 uptodate); 1580 uptodate);
1581 if (i == disks) { 1581 if (i == disks) {
1582 BUG(); 1582 BUG();
1583 return; 1583 return;
1584 } 1584 }
1585 1585
1586 if (!uptodate) 1586 if (!uptodate)
1587 md_error(conf->mddev, conf->disks[i].rdev); 1587 md_error(conf->mddev, conf->disks[i].rdev);
1588 1588
1589 rdev_dec_pending(conf->disks[i].rdev, conf->mddev); 1589 rdev_dec_pending(conf->disks[i].rdev, conf->mddev);
1590 1590
1591 clear_bit(R5_LOCKED, &sh->dev[i].flags); 1591 clear_bit(R5_LOCKED, &sh->dev[i].flags);
1592 set_bit(STRIPE_HANDLE, &sh->state); 1592 set_bit(STRIPE_HANDLE, &sh->state);
1593 release_stripe(sh); 1593 release_stripe(sh);
1594 } 1594 }
1595 1595
1596 1596
1597 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous); 1597 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous);
1598 1598
1599 static void raid5_build_block(struct stripe_head *sh, int i, int previous) 1599 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
1600 { 1600 {
1601 struct r5dev *dev = &sh->dev[i]; 1601 struct r5dev *dev = &sh->dev[i];
1602 1602
1603 bio_init(&dev->req); 1603 bio_init(&dev->req);
1604 dev->req.bi_io_vec = &dev->vec; 1604 dev->req.bi_io_vec = &dev->vec;
1605 dev->req.bi_vcnt++; 1605 dev->req.bi_vcnt++;
1606 dev->req.bi_max_vecs++; 1606 dev->req.bi_max_vecs++;
1607 dev->vec.bv_page = dev->page; 1607 dev->vec.bv_page = dev->page;
1608 dev->vec.bv_len = STRIPE_SIZE; 1608 dev->vec.bv_len = STRIPE_SIZE;
1609 dev->vec.bv_offset = 0; 1609 dev->vec.bv_offset = 0;
1610 1610
1611 dev->req.bi_sector = sh->sector; 1611 dev->req.bi_sector = sh->sector;
1612 dev->req.bi_private = sh; 1612 dev->req.bi_private = sh;
1613 1613
1614 dev->flags = 0; 1614 dev->flags = 0;
1615 dev->sector = compute_blocknr(sh, i, previous); 1615 dev->sector = compute_blocknr(sh, i, previous);
1616 } 1616 }
1617 1617
1618 static void error(mddev_t *mddev, mdk_rdev_t *rdev) 1618 static void error(mddev_t *mddev, mdk_rdev_t *rdev)
1619 { 1619 {
1620 char b[BDEVNAME_SIZE]; 1620 char b[BDEVNAME_SIZE];
1621 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 1621 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
1622 pr_debug("raid5: error called\n"); 1622 pr_debug("raid5: error called\n");
1623 1623
1624 if (!test_bit(Faulty, &rdev->flags)) { 1624 if (!test_bit(Faulty, &rdev->flags)) {
1625 set_bit(MD_CHANGE_DEVS, &mddev->flags); 1625 set_bit(MD_CHANGE_DEVS, &mddev->flags);
1626 if (test_and_clear_bit(In_sync, &rdev->flags)) { 1626 if (test_and_clear_bit(In_sync, &rdev->flags)) {
1627 unsigned long flags; 1627 unsigned long flags;
1628 spin_lock_irqsave(&conf->device_lock, flags); 1628 spin_lock_irqsave(&conf->device_lock, flags);
1629 mddev->degraded++; 1629 mddev->degraded++;
1630 spin_unlock_irqrestore(&conf->device_lock, flags); 1630 spin_unlock_irqrestore(&conf->device_lock, flags);
1631 /* 1631 /*
1632 * if recovery was running, make sure it aborts. 1632 * if recovery was running, make sure it aborts.
1633 */ 1633 */
1634 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 1634 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
1635 } 1635 }
1636 set_bit(Faulty, &rdev->flags); 1636 set_bit(Faulty, &rdev->flags);
1637 printk(KERN_ALERT 1637 printk(KERN_ALERT
1638 "raid5: Disk failure on %s, disabling device.\n" 1638 "raid5: Disk failure on %s, disabling device.\n"
1639 "raid5: Operation continuing on %d devices.\n", 1639 "raid5: Operation continuing on %d devices.\n",
1640 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded); 1640 bdevname(rdev->bdev,b), conf->raid_disks - mddev->degraded);
1641 } 1641 }
1642 } 1642 }
1643 1643
1644 /* 1644 /*
1645 * Input: a 'big' sector number, 1645 * Input: a 'big' sector number,
1646 * Output: index of the data and parity disk, and the sector # in them. 1646 * Output: index of the data and parity disk, and the sector # in them.
1647 */ 1647 */
1648 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector, 1648 static sector_t raid5_compute_sector(raid5_conf_t *conf, sector_t r_sector,
1649 int previous, int *dd_idx, 1649 int previous, int *dd_idx,
1650 struct stripe_head *sh) 1650 struct stripe_head *sh)
1651 { 1651 {
1652 long stripe; 1652 long stripe;
1653 unsigned long chunk_number; 1653 unsigned long chunk_number;
1654 unsigned int chunk_offset; 1654 unsigned int chunk_offset;
1655 int pd_idx, qd_idx; 1655 int pd_idx, qd_idx;
1656 int ddf_layout = 0; 1656 int ddf_layout = 0;
1657 sector_t new_sector; 1657 sector_t new_sector;
1658 int algorithm = previous ? conf->prev_algo 1658 int algorithm = previous ? conf->prev_algo
1659 : conf->algorithm; 1659 : conf->algorithm;
1660 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1660 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1661 : conf->chunk_sectors; 1661 : conf->chunk_sectors;
1662 int raid_disks = previous ? conf->previous_raid_disks 1662 int raid_disks = previous ? conf->previous_raid_disks
1663 : conf->raid_disks; 1663 : conf->raid_disks;
1664 int data_disks = raid_disks - conf->max_degraded; 1664 int data_disks = raid_disks - conf->max_degraded;
1665 1665
1666 /* First compute the information on this sector */ 1666 /* First compute the information on this sector */
1667 1667
1668 /* 1668 /*
1669 * Compute the chunk number and the sector offset inside the chunk 1669 * Compute the chunk number and the sector offset inside the chunk
1670 */ 1670 */
1671 chunk_offset = sector_div(r_sector, sectors_per_chunk); 1671 chunk_offset = sector_div(r_sector, sectors_per_chunk);
1672 chunk_number = r_sector; 1672 chunk_number = r_sector;
1673 BUG_ON(r_sector != chunk_number); 1673 BUG_ON(r_sector != chunk_number);
1674 1674
1675 /* 1675 /*
1676 * Compute the stripe number 1676 * Compute the stripe number
1677 */ 1677 */
1678 stripe = chunk_number / data_disks; 1678 stripe = chunk_number / data_disks;
1679 1679
1680 /* 1680 /*
1681 * Compute the data disk and parity disk indexes inside the stripe 1681 * Compute the data disk and parity disk indexes inside the stripe
1682 */ 1682 */
1683 *dd_idx = chunk_number % data_disks; 1683 *dd_idx = chunk_number % data_disks;
1684 1684
1685 /* 1685 /*
1686 * Select the parity disk based on the user selected algorithm. 1686 * Select the parity disk based on the user selected algorithm.
1687 */ 1687 */
1688 pd_idx = qd_idx = ~0; 1688 pd_idx = qd_idx = ~0;
1689 switch(conf->level) { 1689 switch(conf->level) {
1690 case 4: 1690 case 4:
1691 pd_idx = data_disks; 1691 pd_idx = data_disks;
1692 break; 1692 break;
1693 case 5: 1693 case 5:
1694 switch (algorithm) { 1694 switch (algorithm) {
1695 case ALGORITHM_LEFT_ASYMMETRIC: 1695 case ALGORITHM_LEFT_ASYMMETRIC:
1696 pd_idx = data_disks - stripe % raid_disks; 1696 pd_idx = data_disks - stripe % raid_disks;
1697 if (*dd_idx >= pd_idx) 1697 if (*dd_idx >= pd_idx)
1698 (*dd_idx)++; 1698 (*dd_idx)++;
1699 break; 1699 break;
1700 case ALGORITHM_RIGHT_ASYMMETRIC: 1700 case ALGORITHM_RIGHT_ASYMMETRIC:
1701 pd_idx = stripe % raid_disks; 1701 pd_idx = stripe % raid_disks;
1702 if (*dd_idx >= pd_idx) 1702 if (*dd_idx >= pd_idx)
1703 (*dd_idx)++; 1703 (*dd_idx)++;
1704 break; 1704 break;
1705 case ALGORITHM_LEFT_SYMMETRIC: 1705 case ALGORITHM_LEFT_SYMMETRIC:
1706 pd_idx = data_disks - stripe % raid_disks; 1706 pd_idx = data_disks - stripe % raid_disks;
1707 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1707 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1708 break; 1708 break;
1709 case ALGORITHM_RIGHT_SYMMETRIC: 1709 case ALGORITHM_RIGHT_SYMMETRIC:
1710 pd_idx = stripe % raid_disks; 1710 pd_idx = stripe % raid_disks;
1711 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1711 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1712 break; 1712 break;
1713 case ALGORITHM_PARITY_0: 1713 case ALGORITHM_PARITY_0:
1714 pd_idx = 0; 1714 pd_idx = 0;
1715 (*dd_idx)++; 1715 (*dd_idx)++;
1716 break; 1716 break;
1717 case ALGORITHM_PARITY_N: 1717 case ALGORITHM_PARITY_N:
1718 pd_idx = data_disks; 1718 pd_idx = data_disks;
1719 break; 1719 break;
1720 default: 1720 default:
1721 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1721 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1722 algorithm); 1722 algorithm);
1723 BUG(); 1723 BUG();
1724 } 1724 }
1725 break; 1725 break;
1726 case 6: 1726 case 6:
1727 1727
1728 switch (algorithm) { 1728 switch (algorithm) {
1729 case ALGORITHM_LEFT_ASYMMETRIC: 1729 case ALGORITHM_LEFT_ASYMMETRIC:
1730 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1730 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1731 qd_idx = pd_idx + 1; 1731 qd_idx = pd_idx + 1;
1732 if (pd_idx == raid_disks-1) { 1732 if (pd_idx == raid_disks-1) {
1733 (*dd_idx)++; /* Q D D D P */ 1733 (*dd_idx)++; /* Q D D D P */
1734 qd_idx = 0; 1734 qd_idx = 0;
1735 } else if (*dd_idx >= pd_idx) 1735 } else if (*dd_idx >= pd_idx)
1736 (*dd_idx) += 2; /* D D P Q D */ 1736 (*dd_idx) += 2; /* D D P Q D */
1737 break; 1737 break;
1738 case ALGORITHM_RIGHT_ASYMMETRIC: 1738 case ALGORITHM_RIGHT_ASYMMETRIC:
1739 pd_idx = stripe % raid_disks; 1739 pd_idx = stripe % raid_disks;
1740 qd_idx = pd_idx + 1; 1740 qd_idx = pd_idx + 1;
1741 if (pd_idx == raid_disks-1) { 1741 if (pd_idx == raid_disks-1) {
1742 (*dd_idx)++; /* Q D D D P */ 1742 (*dd_idx)++; /* Q D D D P */
1743 qd_idx = 0; 1743 qd_idx = 0;
1744 } else if (*dd_idx >= pd_idx) 1744 } else if (*dd_idx >= pd_idx)
1745 (*dd_idx) += 2; /* D D P Q D */ 1745 (*dd_idx) += 2; /* D D P Q D */
1746 break; 1746 break;
1747 case ALGORITHM_LEFT_SYMMETRIC: 1747 case ALGORITHM_LEFT_SYMMETRIC:
1748 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1748 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1749 qd_idx = (pd_idx + 1) % raid_disks; 1749 qd_idx = (pd_idx + 1) % raid_disks;
1750 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1750 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1751 break; 1751 break;
1752 case ALGORITHM_RIGHT_SYMMETRIC: 1752 case ALGORITHM_RIGHT_SYMMETRIC:
1753 pd_idx = stripe % raid_disks; 1753 pd_idx = stripe % raid_disks;
1754 qd_idx = (pd_idx + 1) % raid_disks; 1754 qd_idx = (pd_idx + 1) % raid_disks;
1755 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks; 1755 *dd_idx = (pd_idx + 2 + *dd_idx) % raid_disks;
1756 break; 1756 break;
1757 1757
1758 case ALGORITHM_PARITY_0: 1758 case ALGORITHM_PARITY_0:
1759 pd_idx = 0; 1759 pd_idx = 0;
1760 qd_idx = 1; 1760 qd_idx = 1;
1761 (*dd_idx) += 2; 1761 (*dd_idx) += 2;
1762 break; 1762 break;
1763 case ALGORITHM_PARITY_N: 1763 case ALGORITHM_PARITY_N:
1764 pd_idx = data_disks; 1764 pd_idx = data_disks;
1765 qd_idx = data_disks + 1; 1765 qd_idx = data_disks + 1;
1766 break; 1766 break;
1767 1767
1768 case ALGORITHM_ROTATING_ZERO_RESTART: 1768 case ALGORITHM_ROTATING_ZERO_RESTART:
1769 /* Exactly the same as RIGHT_ASYMMETRIC, but the order 1769 /* Exactly the same as RIGHT_ASYMMETRIC, but the order
1770 * of blocks for computing Q is different. 1770 * of blocks for computing Q is different.
1771 */ 1771 */
1772 pd_idx = stripe % raid_disks; 1772 pd_idx = stripe % raid_disks;
1773 qd_idx = pd_idx + 1; 1773 qd_idx = pd_idx + 1;
1774 if (pd_idx == raid_disks-1) { 1774 if (pd_idx == raid_disks-1) {
1775 (*dd_idx)++; /* Q D D D P */ 1775 (*dd_idx)++; /* Q D D D P */
1776 qd_idx = 0; 1776 qd_idx = 0;
1777 } else if (*dd_idx >= pd_idx) 1777 } else if (*dd_idx >= pd_idx)
1778 (*dd_idx) += 2; /* D D P Q D */ 1778 (*dd_idx) += 2; /* D D P Q D */
1779 ddf_layout = 1; 1779 ddf_layout = 1;
1780 break; 1780 break;
1781 1781
1782 case ALGORITHM_ROTATING_N_RESTART: 1782 case ALGORITHM_ROTATING_N_RESTART:
1783 /* Same as left_asymmetric, but the first stripe is 1783 /* Same as left_asymmetric, but the first stripe is
1784 * D D D P Q rather than 1784 * D D D P Q rather than
1785 * Q D D D P 1785 * Q D D D P
1786 */ 1786 */
1787 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks); 1787 pd_idx = raid_disks - 1 - ((stripe + 1) % raid_disks);
1788 qd_idx = pd_idx + 1; 1788 qd_idx = pd_idx + 1;
1789 if (pd_idx == raid_disks-1) { 1789 if (pd_idx == raid_disks-1) {
1790 (*dd_idx)++; /* Q D D D P */ 1790 (*dd_idx)++; /* Q D D D P */
1791 qd_idx = 0; 1791 qd_idx = 0;
1792 } else if (*dd_idx >= pd_idx) 1792 } else if (*dd_idx >= pd_idx)
1793 (*dd_idx) += 2; /* D D P Q D */ 1793 (*dd_idx) += 2; /* D D P Q D */
1794 ddf_layout = 1; 1794 ddf_layout = 1;
1795 break; 1795 break;
1796 1796
1797 case ALGORITHM_ROTATING_N_CONTINUE: 1797 case ALGORITHM_ROTATING_N_CONTINUE:
1798 /* Same as left_symmetric but Q is before P */ 1798 /* Same as left_symmetric but Q is before P */
1799 pd_idx = raid_disks - 1 - (stripe % raid_disks); 1799 pd_idx = raid_disks - 1 - (stripe % raid_disks);
1800 qd_idx = (pd_idx + raid_disks - 1) % raid_disks; 1800 qd_idx = (pd_idx + raid_disks - 1) % raid_disks;
1801 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks; 1801 *dd_idx = (pd_idx + 1 + *dd_idx) % raid_disks;
1802 ddf_layout = 1; 1802 ddf_layout = 1;
1803 break; 1803 break;
1804 1804
1805 case ALGORITHM_LEFT_ASYMMETRIC_6: 1805 case ALGORITHM_LEFT_ASYMMETRIC_6:
1806 /* RAID5 left_asymmetric, with Q on last device */ 1806 /* RAID5 left_asymmetric, with Q on last device */
1807 pd_idx = data_disks - stripe % (raid_disks-1); 1807 pd_idx = data_disks - stripe % (raid_disks-1);
1808 if (*dd_idx >= pd_idx) 1808 if (*dd_idx >= pd_idx)
1809 (*dd_idx)++; 1809 (*dd_idx)++;
1810 qd_idx = raid_disks - 1; 1810 qd_idx = raid_disks - 1;
1811 break; 1811 break;
1812 1812
1813 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1813 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1814 pd_idx = stripe % (raid_disks-1); 1814 pd_idx = stripe % (raid_disks-1);
1815 if (*dd_idx >= pd_idx) 1815 if (*dd_idx >= pd_idx)
1816 (*dd_idx)++; 1816 (*dd_idx)++;
1817 qd_idx = raid_disks - 1; 1817 qd_idx = raid_disks - 1;
1818 break; 1818 break;
1819 1819
1820 case ALGORITHM_LEFT_SYMMETRIC_6: 1820 case ALGORITHM_LEFT_SYMMETRIC_6:
1821 pd_idx = data_disks - stripe % (raid_disks-1); 1821 pd_idx = data_disks - stripe % (raid_disks-1);
1822 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1822 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1823 qd_idx = raid_disks - 1; 1823 qd_idx = raid_disks - 1;
1824 break; 1824 break;
1825 1825
1826 case ALGORITHM_RIGHT_SYMMETRIC_6: 1826 case ALGORITHM_RIGHT_SYMMETRIC_6:
1827 pd_idx = stripe % (raid_disks-1); 1827 pd_idx = stripe % (raid_disks-1);
1828 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1); 1828 *dd_idx = (pd_idx + 1 + *dd_idx) % (raid_disks-1);
1829 qd_idx = raid_disks - 1; 1829 qd_idx = raid_disks - 1;
1830 break; 1830 break;
1831 1831
1832 case ALGORITHM_PARITY_0_6: 1832 case ALGORITHM_PARITY_0_6:
1833 pd_idx = 0; 1833 pd_idx = 0;
1834 (*dd_idx)++; 1834 (*dd_idx)++;
1835 qd_idx = raid_disks - 1; 1835 qd_idx = raid_disks - 1;
1836 break; 1836 break;
1837 1837
1838 1838
1839 default: 1839 default:
1840 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1840 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1841 algorithm); 1841 algorithm);
1842 BUG(); 1842 BUG();
1843 } 1843 }
1844 break; 1844 break;
1845 } 1845 }
1846 1846
1847 if (sh) { 1847 if (sh) {
1848 sh->pd_idx = pd_idx; 1848 sh->pd_idx = pd_idx;
1849 sh->qd_idx = qd_idx; 1849 sh->qd_idx = qd_idx;
1850 sh->ddf_layout = ddf_layout; 1850 sh->ddf_layout = ddf_layout;
1851 } 1851 }
1852 /* 1852 /*
1853 * Finally, compute the new sector number 1853 * Finally, compute the new sector number
1854 */ 1854 */
1855 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset; 1855 new_sector = (sector_t)stripe * sectors_per_chunk + chunk_offset;
1856 return new_sector; 1856 return new_sector;
1857 } 1857 }
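
A minimal user-space sketch of how the left-symmetric RAID5 layout handled above rotates the parity device and remaps data devices stripe by stripe; the 5-device geometry is only an assumed example, and this is illustration, not kernel code.

/* Illustrative sketch (not kernel code): the ALGORITHM_LEFT_SYMMETRIC
 * formulas from raid5_compute_sector(), applied to a hypothetical
 * 5-device RAID5 array. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 5;                 /* assumed example geometry */
	int data_disks = raid_disks - 1;    /* RAID5: one parity block per stripe */

	for (long stripe = 0; stripe < 6; stripe++) {
		int pd_idx = data_disks - stripe % raid_disks;

		printf("stripe %ld: parity on device %d, data devices:", stripe, pd_idx);
		for (int dd = 0; dd < data_disks; dd++)
			/* same remap as the ALGORITHM_LEFT_SYMMETRIC case above */
			printf(" %d", (pd_idx + 1 + dd) % raid_disks);
		printf("\n");
	}
	return 0;
}

Running it shows the parity device walking from the last device down to device 0 and then wrapping, which is exactly the rotation the switch above encodes.
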
1858 1858
1859 1859
1860 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous) 1860 static sector_t compute_blocknr(struct stripe_head *sh, int i, int previous)
1861 { 1861 {
1862 raid5_conf_t *conf = sh->raid_conf; 1862 raid5_conf_t *conf = sh->raid_conf;
1863 int raid_disks = sh->disks; 1863 int raid_disks = sh->disks;
1864 int data_disks = raid_disks - conf->max_degraded; 1864 int data_disks = raid_disks - conf->max_degraded;
1865 sector_t new_sector = sh->sector, check; 1865 sector_t new_sector = sh->sector, check;
1866 int sectors_per_chunk = previous ? conf->prev_chunk_sectors 1866 int sectors_per_chunk = previous ? conf->prev_chunk_sectors
1867 : conf->chunk_sectors; 1867 : conf->chunk_sectors;
1868 int algorithm = previous ? conf->prev_algo 1868 int algorithm = previous ? conf->prev_algo
1869 : conf->algorithm; 1869 : conf->algorithm;
1870 sector_t stripe; 1870 sector_t stripe;
1871 int chunk_offset; 1871 int chunk_offset;
1872 int chunk_number, dummy1, dd_idx = i; 1872 int chunk_number, dummy1, dd_idx = i;
1873 sector_t r_sector; 1873 sector_t r_sector;
1874 struct stripe_head sh2; 1874 struct stripe_head sh2;
1875 1875
1876 1876
1877 chunk_offset = sector_div(new_sector, sectors_per_chunk); 1877 chunk_offset = sector_div(new_sector, sectors_per_chunk);
1878 stripe = new_sector; 1878 stripe = new_sector;
1879 BUG_ON(new_sector != stripe); 1879 BUG_ON(new_sector != stripe);
1880 1880
1881 if (i == sh->pd_idx) 1881 if (i == sh->pd_idx)
1882 return 0; 1882 return 0;
1883 switch(conf->level) { 1883 switch(conf->level) {
1884 case 4: break; 1884 case 4: break;
1885 case 5: 1885 case 5:
1886 switch (algorithm) { 1886 switch (algorithm) {
1887 case ALGORITHM_LEFT_ASYMMETRIC: 1887 case ALGORITHM_LEFT_ASYMMETRIC:
1888 case ALGORITHM_RIGHT_ASYMMETRIC: 1888 case ALGORITHM_RIGHT_ASYMMETRIC:
1889 if (i > sh->pd_idx) 1889 if (i > sh->pd_idx)
1890 i--; 1890 i--;
1891 break; 1891 break;
1892 case ALGORITHM_LEFT_SYMMETRIC: 1892 case ALGORITHM_LEFT_SYMMETRIC:
1893 case ALGORITHM_RIGHT_SYMMETRIC: 1893 case ALGORITHM_RIGHT_SYMMETRIC:
1894 if (i < sh->pd_idx) 1894 if (i < sh->pd_idx)
1895 i += raid_disks; 1895 i += raid_disks;
1896 i -= (sh->pd_idx + 1); 1896 i -= (sh->pd_idx + 1);
1897 break; 1897 break;
1898 case ALGORITHM_PARITY_0: 1898 case ALGORITHM_PARITY_0:
1899 i -= 1; 1899 i -= 1;
1900 break; 1900 break;
1901 case ALGORITHM_PARITY_N: 1901 case ALGORITHM_PARITY_N:
1902 break; 1902 break;
1903 default: 1903 default:
1904 printk(KERN_ERR "raid5: unsupported algorithm %d\n", 1904 printk(KERN_ERR "raid5: unsupported algorithm %d\n",
1905 algorithm); 1905 algorithm);
1906 BUG(); 1906 BUG();
1907 } 1907 }
1908 break; 1908 break;
1909 case 6: 1909 case 6:
1910 if (i == sh->qd_idx) 1910 if (i == sh->qd_idx)
1911 return 0; /* It is the Q disk */ 1911 return 0; /* It is the Q disk */
1912 switch (algorithm) { 1912 switch (algorithm) {
1913 case ALGORITHM_LEFT_ASYMMETRIC: 1913 case ALGORITHM_LEFT_ASYMMETRIC:
1914 case ALGORITHM_RIGHT_ASYMMETRIC: 1914 case ALGORITHM_RIGHT_ASYMMETRIC:
1915 case ALGORITHM_ROTATING_ZERO_RESTART: 1915 case ALGORITHM_ROTATING_ZERO_RESTART:
1916 case ALGORITHM_ROTATING_N_RESTART: 1916 case ALGORITHM_ROTATING_N_RESTART:
1917 if (sh->pd_idx == raid_disks-1) 1917 if (sh->pd_idx == raid_disks-1)
1918 i--; /* Q D D D P */ 1918 i--; /* Q D D D P */
1919 else if (i > sh->pd_idx) 1919 else if (i > sh->pd_idx)
1920 i -= 2; /* D D P Q D */ 1920 i -= 2; /* D D P Q D */
1921 break; 1921 break;
1922 case ALGORITHM_LEFT_SYMMETRIC: 1922 case ALGORITHM_LEFT_SYMMETRIC:
1923 case ALGORITHM_RIGHT_SYMMETRIC: 1923 case ALGORITHM_RIGHT_SYMMETRIC:
1924 if (sh->pd_idx == raid_disks-1) 1924 if (sh->pd_idx == raid_disks-1)
1925 i--; /* Q D D D P */ 1925 i--; /* Q D D D P */
1926 else { 1926 else {
1927 /* D D P Q D */ 1927 /* D D P Q D */
1928 if (i < sh->pd_idx) 1928 if (i < sh->pd_idx)
1929 i += raid_disks; 1929 i += raid_disks;
1930 i -= (sh->pd_idx + 2); 1930 i -= (sh->pd_idx + 2);
1931 } 1931 }
1932 break; 1932 break;
1933 case ALGORITHM_PARITY_0: 1933 case ALGORITHM_PARITY_0:
1934 i -= 2; 1934 i -= 2;
1935 break; 1935 break;
1936 case ALGORITHM_PARITY_N: 1936 case ALGORITHM_PARITY_N:
1937 break; 1937 break;
1938 case ALGORITHM_ROTATING_N_CONTINUE: 1938 case ALGORITHM_ROTATING_N_CONTINUE:
1939 /* Like left_symmetric, but P is before Q */ 1939 /* Like left_symmetric, but P is before Q */
1940 if (sh->pd_idx == 0) 1940 if (sh->pd_idx == 0)
1941 i--; /* P D D D Q */ 1941 i--; /* P D D D Q */
1942 else { 1942 else {
1943 /* D D Q P D */ 1943 /* D D Q P D */
1944 if (i < sh->pd_idx) 1944 if (i < sh->pd_idx)
1945 i += raid_disks; 1945 i += raid_disks;
1946 i -= (sh->pd_idx + 1); 1946 i -= (sh->pd_idx + 1);
1947 } 1947 }
1948 break; 1948 break;
1949 case ALGORITHM_LEFT_ASYMMETRIC_6: 1949 case ALGORITHM_LEFT_ASYMMETRIC_6:
1950 case ALGORITHM_RIGHT_ASYMMETRIC_6: 1950 case ALGORITHM_RIGHT_ASYMMETRIC_6:
1951 if (i > sh->pd_idx) 1951 if (i > sh->pd_idx)
1952 i--; 1952 i--;
1953 break; 1953 break;
1954 case ALGORITHM_LEFT_SYMMETRIC_6: 1954 case ALGORITHM_LEFT_SYMMETRIC_6:
1955 case ALGORITHM_RIGHT_SYMMETRIC_6: 1955 case ALGORITHM_RIGHT_SYMMETRIC_6:
1956 if (i < sh->pd_idx) 1956 if (i < sh->pd_idx)
1957 i += data_disks + 1; 1957 i += data_disks + 1;
1958 i -= (sh->pd_idx + 1); 1958 i -= (sh->pd_idx + 1);
1959 break; 1959 break;
1960 case ALGORITHM_PARITY_0_6: 1960 case ALGORITHM_PARITY_0_6:
1961 i -= 1; 1961 i -= 1;
1962 break; 1962 break;
1963 default: 1963 default:
1964 printk(KERN_CRIT "raid6: unsupported algorithm %d\n", 1964 printk(KERN_CRIT "raid6: unsupported algorithm %d\n",
1965 algorithm); 1965 algorithm);
1966 BUG(); 1966 BUG();
1967 } 1967 }
1968 break; 1968 break;
1969 } 1969 }
1970 1970
1971 chunk_number = stripe * data_disks + i; 1971 chunk_number = stripe * data_disks + i;
1972 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset; 1972 r_sector = (sector_t)chunk_number * sectors_per_chunk + chunk_offset;
1973 1973
1974 check = raid5_compute_sector(conf, r_sector, 1974 check = raid5_compute_sector(conf, r_sector,
1975 previous, &dummy1, &sh2); 1975 previous, &dummy1, &sh2);
1976 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx 1976 if (check != sh->sector || dummy1 != dd_idx || sh2.pd_idx != sh->pd_idx
1977 || sh2.qd_idx != sh->qd_idx) { 1977 || sh2.qd_idx != sh->qd_idx) {
1978 printk(KERN_ERR "compute_blocknr: map not correct\n"); 1978 printk(KERN_ERR "compute_blocknr: map not correct\n");
1979 return 0; 1979 return 0;
1980 } 1980 }
1981 return r_sector; 1981 return r_sector;
1982 } 1982 }
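
compute_blocknr() inverts that mapping and then re-runs raid5_compute_sector() to verify the round trip. Below is a self-contained sketch of the same forward/inverse consistency check, reusing the hypothetical left-symmetric geometry from the previous example.

/* Sketch of the forward/inverse round-trip that compute_blocknr()
 * uses as a consistency check; geometry and layout are the assumed
 * example from above, not taken from a real array. */
#include <assert.h>
#include <stdio.h>

#define RAID_DISKS 5
#define DATA_DISKS (RAID_DISKS - 1)

/* forward: logical chunk -> (stripe, device index), left symmetric */
static void map_chunk(long chunk, long *stripe, int *dd_idx, int *pd_idx)
{
	*stripe = chunk / DATA_DISKS;
	*pd_idx = DATA_DISKS - (int)(*stripe % RAID_DISKS);
	*dd_idx = (*pd_idx + 1 + (int)(chunk % DATA_DISKS)) % RAID_DISKS;
}

/* inverse: (stripe, device index) -> logical chunk, mirroring the
 * ALGORITHM_LEFT_SYMMETRIC case of compute_blocknr() above */
static long unmap_chunk(long stripe, int dev, int pd_idx)
{
	int i = dev;

	if (i < pd_idx)
		i += RAID_DISKS;
	i -= pd_idx + 1;		/* undo the left-symmetric rotation */
	return stripe * DATA_DISKS + i;
}

int main(void)
{
	for (long chunk = 0; chunk < 1000; chunk++) {
		long stripe;
		int dd_idx, pd_idx;

		map_chunk(chunk, &stripe, &dd_idx, &pd_idx);
		/* the round trip must return the chunk we started from */
		assert(unmap_chunk(stripe, dd_idx, pd_idx) == chunk);
	}
	printf("forward/inverse mapping is self-consistent\n");
	return 0;
}
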
1983 1983
1984 1984
1985 static void 1985 static void
1986 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, 1986 schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s,
1987 int rcw, int expand) 1987 int rcw, int expand)
1988 { 1988 {
1989 int i, pd_idx = sh->pd_idx, disks = sh->disks; 1989 int i, pd_idx = sh->pd_idx, disks = sh->disks;
1990 raid5_conf_t *conf = sh->raid_conf; 1990 raid5_conf_t *conf = sh->raid_conf;
1991 int level = conf->level; 1991 int level = conf->level;
1992 1992
1993 if (rcw) { 1993 if (rcw) {
1994 /* if we are not expanding this is a proper write request, and 1994 /* if we are not expanding this is a proper write request, and
1995 * there will be bios with new data to be drained into the 1995 * there will be bios with new data to be drained into the
1996 * stripe cache 1996 * stripe cache
1997 */ 1997 */
1998 if (!expand) { 1998 if (!expand) {
1999 sh->reconstruct_state = reconstruct_state_drain_run; 1999 sh->reconstruct_state = reconstruct_state_drain_run;
2000 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2000 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2001 } else 2001 } else
2002 sh->reconstruct_state = reconstruct_state_run; 2002 sh->reconstruct_state = reconstruct_state_run;
2003 2003
2004 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2004 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2005 2005
2006 for (i = disks; i--; ) { 2006 for (i = disks; i--; ) {
2007 struct r5dev *dev = &sh->dev[i]; 2007 struct r5dev *dev = &sh->dev[i];
2008 2008
2009 if (dev->towrite) { 2009 if (dev->towrite) {
2010 set_bit(R5_LOCKED, &dev->flags); 2010 set_bit(R5_LOCKED, &dev->flags);
2011 set_bit(R5_Wantdrain, &dev->flags); 2011 set_bit(R5_Wantdrain, &dev->flags);
2012 if (!expand) 2012 if (!expand)
2013 clear_bit(R5_UPTODATE, &dev->flags); 2013 clear_bit(R5_UPTODATE, &dev->flags);
2014 s->locked++; 2014 s->locked++;
2015 } 2015 }
2016 } 2016 }
2017 if (s->locked + conf->max_degraded == disks) 2017 if (s->locked + conf->max_degraded == disks)
2018 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state)) 2018 if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
2019 atomic_inc(&conf->pending_full_writes); 2019 atomic_inc(&conf->pending_full_writes);
2020 } else { 2020 } else {
2021 BUG_ON(level == 6); 2021 BUG_ON(level == 6);
2022 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) || 2022 BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
2023 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags))); 2023 test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
2024 2024
2025 sh->reconstruct_state = reconstruct_state_prexor_drain_run; 2025 sh->reconstruct_state = reconstruct_state_prexor_drain_run;
2026 set_bit(STRIPE_OP_PREXOR, &s->ops_request); 2026 set_bit(STRIPE_OP_PREXOR, &s->ops_request);
2027 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request); 2027 set_bit(STRIPE_OP_BIODRAIN, &s->ops_request);
2028 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request); 2028 set_bit(STRIPE_OP_RECONSTRUCT, &s->ops_request);
2029 2029
2030 for (i = disks; i--; ) { 2030 for (i = disks; i--; ) {
2031 struct r5dev *dev = &sh->dev[i]; 2031 struct r5dev *dev = &sh->dev[i];
2032 if (i == pd_idx) 2032 if (i == pd_idx)
2033 continue; 2033 continue;
2034 2034
2035 if (dev->towrite && 2035 if (dev->towrite &&
2036 (test_bit(R5_UPTODATE, &dev->flags) || 2036 (test_bit(R5_UPTODATE, &dev->flags) ||
2037 test_bit(R5_Wantcompute, &dev->flags))) { 2037 test_bit(R5_Wantcompute, &dev->flags))) {
2038 set_bit(R5_Wantdrain, &dev->flags); 2038 set_bit(R5_Wantdrain, &dev->flags);
2039 set_bit(R5_LOCKED, &dev->flags); 2039 set_bit(R5_LOCKED, &dev->flags);
2040 clear_bit(R5_UPTODATE, &dev->flags); 2040 clear_bit(R5_UPTODATE, &dev->flags);
2041 s->locked++; 2041 s->locked++;
2042 } 2042 }
2043 } 2043 }
2044 } 2044 }
2045 2045
2046 /* keep the parity disk(s) locked while asynchronous operations 2046 /* keep the parity disk(s) locked while asynchronous operations
2047 * are in flight 2047 * are in flight
2048 */ 2048 */
2049 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags); 2049 set_bit(R5_LOCKED, &sh->dev[pd_idx].flags);
2050 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2050 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2051 s->locked++; 2051 s->locked++;
2052 2052
2053 if (level == 6) { 2053 if (level == 6) {
2054 int qd_idx = sh->qd_idx; 2054 int qd_idx = sh->qd_idx;
2055 struct r5dev *dev = &sh->dev[qd_idx]; 2055 struct r5dev *dev = &sh->dev[qd_idx];
2056 2056
2057 set_bit(R5_LOCKED, &dev->flags); 2057 set_bit(R5_LOCKED, &dev->flags);
2058 clear_bit(R5_UPTODATE, &dev->flags); 2058 clear_bit(R5_UPTODATE, &dev->flags);
2059 s->locked++; 2059 s->locked++;
2060 } 2060 }
2061 2061
2062 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n", 2062 pr_debug("%s: stripe %llu locked: %d ops_request: %lx\n",
2063 __func__, (unsigned long long)sh->sector, 2063 __func__, (unsigned long long)sh->sector,
2064 s->locked, s->ops_request); 2064 s->locked, s->ops_request);
2065 } 2065 }
2066 2066
2067 /* 2067 /*
2068 * Each stripe/dev can have one or more bion attached. 2068 * Each stripe/dev can have one or more bion attached.
2069 * toread/towrite point to the first in a chain. 2069 * toread/towrite point to the first in a chain.
2070 * The bi_next chain must be in order. 2070 * The bi_next chain must be in order.
2071 */ 2071 */
2072 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite) 2072 static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, int forwrite)
2073 { 2073 {
2074 struct bio **bip; 2074 struct bio **bip;
2075 raid5_conf_t *conf = sh->raid_conf; 2075 raid5_conf_t *conf = sh->raid_conf;
2076 int firstwrite=0; 2076 int firstwrite=0;
2077 2077
2078 pr_debug("adding bh b#%llu to stripe s#%llu\n", 2078 pr_debug("adding bh b#%llu to stripe s#%llu\n",
2079 (unsigned long long)bi->bi_sector, 2079 (unsigned long long)bi->bi_sector,
2080 (unsigned long long)sh->sector); 2080 (unsigned long long)sh->sector);
2081 2081
2082 2082
2083 spin_lock(&sh->lock); 2083 spin_lock(&sh->lock);
2084 spin_lock_irq(&conf->device_lock); 2084 spin_lock_irq(&conf->device_lock);
2085 if (forwrite) { 2085 if (forwrite) {
2086 bip = &sh->dev[dd_idx].towrite; 2086 bip = &sh->dev[dd_idx].towrite;
2087 if (*bip == NULL && sh->dev[dd_idx].written == NULL) 2087 if (*bip == NULL && sh->dev[dd_idx].written == NULL)
2088 firstwrite = 1; 2088 firstwrite = 1;
2089 } else 2089 } else
2090 bip = &sh->dev[dd_idx].toread; 2090 bip = &sh->dev[dd_idx].toread;
2091 while (*bip && (*bip)->bi_sector < bi->bi_sector) { 2091 while (*bip && (*bip)->bi_sector < bi->bi_sector) {
2092 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector) 2092 if ((*bip)->bi_sector + ((*bip)->bi_size >> 9) > bi->bi_sector)
2093 goto overlap; 2093 goto overlap;
2094 bip = & (*bip)->bi_next; 2094 bip = & (*bip)->bi_next;
2095 } 2095 }
2096 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9)) 2096 if (*bip && (*bip)->bi_sector < bi->bi_sector + ((bi->bi_size)>>9))
2097 goto overlap; 2097 goto overlap;
2098 2098
2099 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next); 2099 BUG_ON(*bip && bi->bi_next && (*bip) != bi->bi_next);
2100 if (*bip) 2100 if (*bip)
2101 bi->bi_next = *bip; 2101 bi->bi_next = *bip;
2102 *bip = bi; 2102 *bip = bi;
2103 bi->bi_phys_segments++; 2103 bi->bi_phys_segments++;
2104 spin_unlock_irq(&conf->device_lock); 2104 spin_unlock_irq(&conf->device_lock);
2105 spin_unlock(&sh->lock); 2105 spin_unlock(&sh->lock);
2106 2106
2107 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n", 2107 pr_debug("added bi b#%llu to stripe s#%llu, disk %d.\n",
2108 (unsigned long long)bi->bi_sector, 2108 (unsigned long long)bi->bi_sector,
2109 (unsigned long long)sh->sector, dd_idx); 2109 (unsigned long long)sh->sector, dd_idx);
2110 2110
2111 if (conf->mddev->bitmap && firstwrite) { 2111 if (conf->mddev->bitmap && firstwrite) {
2112 bitmap_startwrite(conf->mddev->bitmap, sh->sector, 2112 bitmap_startwrite(conf->mddev->bitmap, sh->sector,
2113 STRIPE_SECTORS, 0); 2113 STRIPE_SECTORS, 0);
2114 sh->bm_seq = conf->seq_flush+1; 2114 sh->bm_seq = conf->seq_flush+1;
2115 set_bit(STRIPE_BIT_DELAY, &sh->state); 2115 set_bit(STRIPE_BIT_DELAY, &sh->state);
2116 } 2116 }
2117 2117
2118 if (forwrite) { 2118 if (forwrite) {
2119 /* check if page is covered */ 2119 /* check if page is covered */
2120 sector_t sector = sh->dev[dd_idx].sector; 2120 sector_t sector = sh->dev[dd_idx].sector;
2121 for (bi=sh->dev[dd_idx].towrite; 2121 for (bi=sh->dev[dd_idx].towrite;
2122 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && 2122 sector < sh->dev[dd_idx].sector + STRIPE_SECTORS &&
2123 bi && bi->bi_sector <= sector; 2123 bi && bi->bi_sector <= sector;
2124 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { 2124 bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) {
2125 if (bi->bi_sector + (bi->bi_size>>9) >= sector) 2125 if (bi->bi_sector + (bi->bi_size>>9) >= sector)
2126 sector = bi->bi_sector + (bi->bi_size>>9); 2126 sector = bi->bi_sector + (bi->bi_size>>9);
2127 } 2127 }
2128 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) 2128 if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS)
2129 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags); 2129 set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags);
2130 } 2130 }
2131 return 1; 2131 return 1;
2132 2132
2133 overlap: 2133 overlap:
2134 set_bit(R5_Overlap, &sh->dev[dd_idx].flags); 2134 set_bit(R5_Overlap, &sh->dev[dd_idx].flags);
2135 spin_unlock_irq(&conf->device_lock); 2135 spin_unlock_irq(&conf->device_lock);
2136 spin_unlock(&sh->lock); 2136 spin_unlock(&sh->lock);
2137 return 0; 2137 return 0;
2138 } 2138 }
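
The ordering rule described in the comment above add_stripe_bio() — each dev keeps its pending requests in a bi_next chain sorted by start sector, with overlapping requests rejected — can be illustrated with a small stand-alone list insert. The struct req type and sector ranges here are hypothetical.

/* Simplified, self-contained analogue of the sorted-chain insert in
 * add_stripe_bio() above: requests are kept in a singly linked list
 * ordered by start sector, and an insert that would overlap an
 * existing request is rejected. */
#include <stdbool.h>
#include <stdio.h>

struct req {
	unsigned long start;	/* first sector covered */
	unsigned long sectors;	/* number of sectors covered */
	struct req *next;
};

static bool insert_sorted(struct req **head, struct req *r)
{
	struct req **pos = head;

	/* walk to the first element that starts at or after r */
	while (*pos && (*pos)->start < r->start) {
		if ((*pos)->start + (*pos)->sectors > r->start)
			return false;	/* overlaps the request before us */
		pos = &(*pos)->next;
	}
	if (*pos && (*pos)->start < r->start + r->sectors)
		return false;		/* overlaps the request after us */

	r->next = *pos;			/* splice in, keeping the chain sorted */
	*pos = r;
	return true;
}

int main(void)
{
	struct req a = { 0, 8, NULL }, b = { 16, 8, NULL }, c = { 4, 8, NULL };
	struct req *head = NULL;

	printf("%d %d %d\n", insert_sorted(&head, &a),
	       insert_sorted(&head, &b), insert_sorted(&head, &c));
	/* prints "1 1 0": the third request overlaps sectors 4-7 */
	return 0;
}
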
2139 2139
2140 static void end_reshape(raid5_conf_t *conf); 2140 static void end_reshape(raid5_conf_t *conf);
2141 2141
2142 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous, 2142 static void stripe_set_idx(sector_t stripe, raid5_conf_t *conf, int previous,
2143 struct stripe_head *sh) 2143 struct stripe_head *sh)
2144 { 2144 {
2145 int sectors_per_chunk = 2145 int sectors_per_chunk =
2146 previous ? conf->prev_chunk_sectors : conf->chunk_sectors; 2146 previous ? conf->prev_chunk_sectors : conf->chunk_sectors;
2147 int dd_idx; 2147 int dd_idx;
2148 int chunk_offset = sector_div(stripe, sectors_per_chunk); 2148 int chunk_offset = sector_div(stripe, sectors_per_chunk);
2149 int disks = previous ? conf->previous_raid_disks : conf->raid_disks; 2149 int disks = previous ? conf->previous_raid_disks : conf->raid_disks;
2150 2150
2151 raid5_compute_sector(conf, 2151 raid5_compute_sector(conf,
2152 stripe * (disks - conf->max_degraded) 2152 stripe * (disks - conf->max_degraded)
2153 *sectors_per_chunk + chunk_offset, 2153 *sectors_per_chunk + chunk_offset,
2154 previous, 2154 previous,
2155 &dd_idx, sh); 2155 &dd_idx, sh);
2156 } 2156 }
2157 2157
2158 static void 2158 static void
2159 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh, 2159 handle_failed_stripe(raid5_conf_t *conf, struct stripe_head *sh,
2160 struct stripe_head_state *s, int disks, 2160 struct stripe_head_state *s, int disks,
2161 struct bio **return_bi) 2161 struct bio **return_bi)
2162 { 2162 {
2163 int i; 2163 int i;
2164 for (i = disks; i--; ) { 2164 for (i = disks; i--; ) {
2165 struct bio *bi; 2165 struct bio *bi;
2166 int bitmap_end = 0; 2166 int bitmap_end = 0;
2167 2167
2168 if (test_bit(R5_ReadError, &sh->dev[i].flags)) { 2168 if (test_bit(R5_ReadError, &sh->dev[i].flags)) {
2169 mdk_rdev_t *rdev; 2169 mdk_rdev_t *rdev;
2170 rcu_read_lock(); 2170 rcu_read_lock();
2171 rdev = rcu_dereference(conf->disks[i].rdev); 2171 rdev = rcu_dereference(conf->disks[i].rdev);
2172 if (rdev && test_bit(In_sync, &rdev->flags)) 2172 if (rdev && test_bit(In_sync, &rdev->flags))
2173 /* multiple read failures in one stripe */ 2173 /* multiple read failures in one stripe */
2174 md_error(conf->mddev, rdev); 2174 md_error(conf->mddev, rdev);
2175 rcu_read_unlock(); 2175 rcu_read_unlock();
2176 } 2176 }
2177 spin_lock_irq(&conf->device_lock); 2177 spin_lock_irq(&conf->device_lock);
2178 /* fail all writes first */ 2178 /* fail all writes first */
2179 bi = sh->dev[i].towrite; 2179 bi = sh->dev[i].towrite;
2180 sh->dev[i].towrite = NULL; 2180 sh->dev[i].towrite = NULL;
2181 if (bi) { 2181 if (bi) {
2182 s->to_write--; 2182 s->to_write--;
2183 bitmap_end = 1; 2183 bitmap_end = 1;
2184 } 2184 }
2185 2185
2186 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2186 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2187 wake_up(&conf->wait_for_overlap); 2187 wake_up(&conf->wait_for_overlap);
2188 2188
2189 while (bi && bi->bi_sector < 2189 while (bi && bi->bi_sector <
2190 sh->dev[i].sector + STRIPE_SECTORS) { 2190 sh->dev[i].sector + STRIPE_SECTORS) {
2191 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); 2191 struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector);
2192 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2192 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2193 if (!raid5_dec_bi_phys_segments(bi)) { 2193 if (!raid5_dec_bi_phys_segments(bi)) {
2194 md_write_end(conf->mddev); 2194 md_write_end(conf->mddev);
2195 bi->bi_next = *return_bi; 2195 bi->bi_next = *return_bi;
2196 *return_bi = bi; 2196 *return_bi = bi;
2197 } 2197 }
2198 bi = nextbi; 2198 bi = nextbi;
2199 } 2199 }
2200 /* and fail all 'written' */ 2200 /* and fail all 'written' */
2201 bi = sh->dev[i].written; 2201 bi = sh->dev[i].written;
2202 sh->dev[i].written = NULL; 2202 sh->dev[i].written = NULL;
2203 if (bi) bitmap_end = 1; 2203 if (bi) bitmap_end = 1;
2204 while (bi && bi->bi_sector < 2204 while (bi && bi->bi_sector <
2205 sh->dev[i].sector + STRIPE_SECTORS) { 2205 sh->dev[i].sector + STRIPE_SECTORS) {
2206 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); 2206 struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector);
2207 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2207 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2208 if (!raid5_dec_bi_phys_segments(bi)) { 2208 if (!raid5_dec_bi_phys_segments(bi)) {
2209 md_write_end(conf->mddev); 2209 md_write_end(conf->mddev);
2210 bi->bi_next = *return_bi; 2210 bi->bi_next = *return_bi;
2211 *return_bi = bi; 2211 *return_bi = bi;
2212 } 2212 }
2213 bi = bi2; 2213 bi = bi2;
2214 } 2214 }
2215 2215
2216 /* fail any reads if this device is non-operational and 2216 /* fail any reads if this device is non-operational and
2217 * the data has not reached the cache yet. 2217 * the data has not reached the cache yet.
2218 */ 2218 */
2219 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) && 2219 if (!test_bit(R5_Wantfill, &sh->dev[i].flags) &&
2220 (!test_bit(R5_Insync, &sh->dev[i].flags) || 2220 (!test_bit(R5_Insync, &sh->dev[i].flags) ||
2221 test_bit(R5_ReadError, &sh->dev[i].flags))) { 2221 test_bit(R5_ReadError, &sh->dev[i].flags))) {
2222 bi = sh->dev[i].toread; 2222 bi = sh->dev[i].toread;
2223 sh->dev[i].toread = NULL; 2223 sh->dev[i].toread = NULL;
2224 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags)) 2224 if (test_and_clear_bit(R5_Overlap, &sh->dev[i].flags))
2225 wake_up(&conf->wait_for_overlap); 2225 wake_up(&conf->wait_for_overlap);
2226 if (bi) s->to_read--; 2226 if (bi) s->to_read--;
2227 while (bi && bi->bi_sector < 2227 while (bi && bi->bi_sector <
2228 sh->dev[i].sector + STRIPE_SECTORS) { 2228 sh->dev[i].sector + STRIPE_SECTORS) {
2229 struct bio *nextbi = 2229 struct bio *nextbi =
2230 r5_next_bio(bi, sh->dev[i].sector); 2230 r5_next_bio(bi, sh->dev[i].sector);
2231 clear_bit(BIO_UPTODATE, &bi->bi_flags); 2231 clear_bit(BIO_UPTODATE, &bi->bi_flags);
2232 if (!raid5_dec_bi_phys_segments(bi)) { 2232 if (!raid5_dec_bi_phys_segments(bi)) {
2233 bi->bi_next = *return_bi; 2233 bi->bi_next = *return_bi;
2234 *return_bi = bi; 2234 *return_bi = bi;
2235 } 2235 }
2236 bi = nextbi; 2236 bi = nextbi;
2237 } 2237 }
2238 } 2238 }
2239 spin_unlock_irq(&conf->device_lock); 2239 spin_unlock_irq(&conf->device_lock);
2240 if (bitmap_end) 2240 if (bitmap_end)
2241 bitmap_endwrite(conf->mddev->bitmap, sh->sector, 2241 bitmap_endwrite(conf->mddev->bitmap, sh->sector,
2242 STRIPE_SECTORS, 0, 0); 2242 STRIPE_SECTORS, 0, 0);
2243 } 2243 }
2244 2244
2245 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2245 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2246 if (atomic_dec_and_test(&conf->pending_full_writes)) 2246 if (atomic_dec_and_test(&conf->pending_full_writes))
2247 md_wakeup_thread(conf->mddev->thread); 2247 md_wakeup_thread(conf->mddev->thread);
2248 } 2248 }
2249 2249
2250 /* fetch_block5 - checks the given member device to see if its data needs 2250 /* fetch_block5 - checks the given member device to see if its data needs
2251 * to be read or computed to satisfy a request. 2251 * to be read or computed to satisfy a request.
2252 * 2252 *
2253 * Returns 1 when no more member devices need to be checked, otherwise returns 2253 * Returns 1 when no more member devices need to be checked, otherwise returns
2254 * 0 to tell the loop in handle_stripe_fill5 to continue 2254 * 0 to tell the loop in handle_stripe_fill5 to continue
2255 */ 2255 */
2256 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s, 2256 static int fetch_block5(struct stripe_head *sh, struct stripe_head_state *s,
2257 int disk_idx, int disks) 2257 int disk_idx, int disks)
2258 { 2258 {
2259 struct r5dev *dev = &sh->dev[disk_idx]; 2259 struct r5dev *dev = &sh->dev[disk_idx];
2260 struct r5dev *failed_dev = &sh->dev[s->failed_num]; 2260 struct r5dev *failed_dev = &sh->dev[s->failed_num];
2261 2261
2262 /* is the data in this block needed, and can we get it? */ 2262 /* is the data in this block needed, and can we get it? */
2263 if (!test_bit(R5_LOCKED, &dev->flags) && 2263 if (!test_bit(R5_LOCKED, &dev->flags) &&
2264 !test_bit(R5_UPTODATE, &dev->flags) && 2264 !test_bit(R5_UPTODATE, &dev->flags) &&
2265 (dev->toread || 2265 (dev->toread ||
2266 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2266 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2267 s->syncing || s->expanding || 2267 s->syncing || s->expanding ||
2268 (s->failed && 2268 (s->failed &&
2269 (failed_dev->toread || 2269 (failed_dev->toread ||
2270 (failed_dev->towrite && 2270 (failed_dev->towrite &&
2271 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) { 2271 !test_bit(R5_OVERWRITE, &failed_dev->flags)))))) {
2272 /* We would like to get this block, possibly by computing it, 2272 /* We would like to get this block, possibly by computing it,
2273 * otherwise read it if the backing disk is insync 2273 * otherwise read it if the backing disk is insync
2274 */ 2274 */
2275 if ((s->uptodate == disks - 1) && 2275 if ((s->uptodate == disks - 1) &&
2276 (s->failed && disk_idx == s->failed_num)) { 2276 (s->failed && disk_idx == s->failed_num)) {
2277 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2277 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2278 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2278 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2279 set_bit(R5_Wantcompute, &dev->flags); 2279 set_bit(R5_Wantcompute, &dev->flags);
2280 sh->ops.target = disk_idx; 2280 sh->ops.target = disk_idx;
2281 sh->ops.target2 = -1; 2281 sh->ops.target2 = -1;
2282 s->req_compute = 1; 2282 s->req_compute = 1;
2283 /* Careful: from this point on 'uptodate' is in the eye 2283 /* Careful: from this point on 'uptodate' is in the eye
2284 * of raid_run_ops which services 'compute' operations 2284 * of raid_run_ops which services 'compute' operations
2285 * before writes. R5_Wantcompute flags a block that will 2285 * before writes. R5_Wantcompute flags a block that will
2286 * be R5_UPTODATE by the time it is needed for a 2286 * be R5_UPTODATE by the time it is needed for a
2287 * subsequent operation. 2287 * subsequent operation.
2288 */ 2288 */
2289 s->uptodate++; 2289 s->uptodate++;
2290 return 1; /* uptodate + compute == disks */ 2290 return 1; /* uptodate + compute == disks */
2291 } else if (test_bit(R5_Insync, &dev->flags)) { 2291 } else if (test_bit(R5_Insync, &dev->flags)) {
2292 set_bit(R5_LOCKED, &dev->flags); 2292 set_bit(R5_LOCKED, &dev->flags);
2293 set_bit(R5_Wantread, &dev->flags); 2293 set_bit(R5_Wantread, &dev->flags);
2294 s->locked++; 2294 s->locked++;
2295 pr_debug("Reading block %d (sync=%d)\n", disk_idx, 2295 pr_debug("Reading block %d (sync=%d)\n", disk_idx,
2296 s->syncing); 2296 s->syncing);
2297 } 2297 }
2298 } 2298 }
2299 2299
2300 return 0; 2300 return 0;
2301 } 2301 }
2302 2302
2303 /** 2303 /**
2304 * handle_stripe_fill5 - read or compute data to satisfy pending requests. 2304 * handle_stripe_fill5 - read or compute data to satisfy pending requests.
2305 */ 2305 */
2306 static void handle_stripe_fill5(struct stripe_head *sh, 2306 static void handle_stripe_fill5(struct stripe_head *sh,
2307 struct stripe_head_state *s, int disks) 2307 struct stripe_head_state *s, int disks)
2308 { 2308 {
2309 int i; 2309 int i;
2310 2310
2311 /* look for blocks to read/compute, skip this if a compute 2311 /* look for blocks to read/compute, skip this if a compute
2312 * is already in flight, or if the stripe contents are in the 2312 * is already in flight, or if the stripe contents are in the
2313 * midst of changing due to a write 2313 * midst of changing due to a write
2314 */ 2314 */
2315 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2315 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2316 !sh->reconstruct_state) 2316 !sh->reconstruct_state)
2317 for (i = disks; i--; ) 2317 for (i = disks; i--; )
2318 if (fetch_block5(sh, s, i, disks)) 2318 if (fetch_block5(sh, s, i, disks))
2319 break; 2319 break;
2320 set_bit(STRIPE_HANDLE, &sh->state); 2320 set_bit(STRIPE_HANDLE, &sh->state);
2321 } 2321 }
2322 2322
2323 /* fetch_block6 - checks the given member device to see if its data needs 2323 /* fetch_block6 - checks the given member device to see if its data needs
2324 * to be read or computed to satisfy a request. 2324 * to be read or computed to satisfy a request.
2325 * 2325 *
2326 * Returns 1 when no more member devices need to be checked, otherwise returns 2326 * Returns 1 when no more member devices need to be checked, otherwise returns
2327 * 0 to tell the loop in handle_stripe_fill6 to continue 2327 * 0 to tell the loop in handle_stripe_fill6 to continue
2328 */ 2328 */
2329 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s, 2329 static int fetch_block6(struct stripe_head *sh, struct stripe_head_state *s,
2330 struct r6_state *r6s, int disk_idx, int disks) 2330 struct r6_state *r6s, int disk_idx, int disks)
2331 { 2331 {
2332 struct r5dev *dev = &sh->dev[disk_idx]; 2332 struct r5dev *dev = &sh->dev[disk_idx];
2333 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]], 2333 struct r5dev *fdev[2] = { &sh->dev[r6s->failed_num[0]],
2334 &sh->dev[r6s->failed_num[1]] }; 2334 &sh->dev[r6s->failed_num[1]] };
2335 2335
2336 if (!test_bit(R5_LOCKED, &dev->flags) && 2336 if (!test_bit(R5_LOCKED, &dev->flags) &&
2337 !test_bit(R5_UPTODATE, &dev->flags) && 2337 !test_bit(R5_UPTODATE, &dev->flags) &&
2338 (dev->toread || 2338 (dev->toread ||
2339 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) || 2339 (dev->towrite && !test_bit(R5_OVERWRITE, &dev->flags)) ||
2340 s->syncing || s->expanding || 2340 s->syncing || s->expanding ||
2341 (s->failed >= 1 && 2341 (s->failed >= 1 &&
2342 (fdev[0]->toread || s->to_write)) || 2342 (fdev[0]->toread || s->to_write)) ||
2343 (s->failed >= 2 && 2343 (s->failed >= 2 &&
2344 (fdev[1]->toread || s->to_write)))) { 2344 (fdev[1]->toread || s->to_write)))) {
2345 /* we would like to get this block, possibly by computing it, 2345 /* we would like to get this block, possibly by computing it,
2346 * otherwise read it if the backing disk is insync 2346 * otherwise read it if the backing disk is insync
2347 */ 2347 */
2348 BUG_ON(test_bit(R5_Wantcompute, &dev->flags)); 2348 BUG_ON(test_bit(R5_Wantcompute, &dev->flags));
2349 BUG_ON(test_bit(R5_Wantread, &dev->flags)); 2349 BUG_ON(test_bit(R5_Wantread, &dev->flags));
2350 if ((s->uptodate == disks - 1) && 2350 if ((s->uptodate == disks - 1) &&
2351 (s->failed && (disk_idx == r6s->failed_num[0] || 2351 (s->failed && (disk_idx == r6s->failed_num[0] ||
2352 disk_idx == r6s->failed_num[1]))) { 2352 disk_idx == r6s->failed_num[1]))) {
2353 /* the disk has failed and we've been asked to fetch it, 2353 /* the disk has failed and we've been asked to fetch it,
2354 * so compute it 2354 * so compute it

2355 */ 2355 */
2356 pr_debug("Computing stripe %llu block %d\n", 2356 pr_debug("Computing stripe %llu block %d\n",
2357 (unsigned long long)sh->sector, disk_idx); 2357 (unsigned long long)sh->sector, disk_idx);
2358 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2358 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2359 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2359 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2360 set_bit(R5_Wantcompute, &dev->flags); 2360 set_bit(R5_Wantcompute, &dev->flags);
2361 sh->ops.target = disk_idx; 2361 sh->ops.target = disk_idx;
2362 sh->ops.target2 = -1; /* no 2nd target */ 2362 sh->ops.target2 = -1; /* no 2nd target */
2363 s->req_compute = 1; 2363 s->req_compute = 1;
2364 s->uptodate++; 2364 s->uptodate++;
2365 return 1; 2365 return 1;
2366 } else if (s->uptodate == disks-2 && s->failed >= 2) { 2366 } else if (s->uptodate == disks-2 && s->failed >= 2) {
2367 /* Computing 2-failure is *very* expensive; only 2367 /* Computing 2-failure is *very* expensive; only
2368 * do it if failed >= 2 2368 * do it if failed >= 2
2369 */ 2369 */
2370 int other; 2370 int other;
2371 for (other = disks; other--; ) { 2371 for (other = disks; other--; ) {
2372 if (other == disk_idx) 2372 if (other == disk_idx)
2373 continue; 2373 continue;
2374 if (!test_bit(R5_UPTODATE, 2374 if (!test_bit(R5_UPTODATE,
2375 &sh->dev[other].flags)) 2375 &sh->dev[other].flags))
2376 break; 2376 break;
2377 } 2377 }
2378 BUG_ON(other < 0); 2378 BUG_ON(other < 0);
2379 pr_debug("Computing stripe %llu blocks %d,%d\n", 2379 pr_debug("Computing stripe %llu blocks %d,%d\n",
2380 (unsigned long long)sh->sector, 2380 (unsigned long long)sh->sector,
2381 disk_idx, other); 2381 disk_idx, other);
2382 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2382 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2383 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2383 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2384 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags); 2384 set_bit(R5_Wantcompute, &sh->dev[disk_idx].flags);
2385 set_bit(R5_Wantcompute, &sh->dev[other].flags); 2385 set_bit(R5_Wantcompute, &sh->dev[other].flags);
2386 sh->ops.target = disk_idx; 2386 sh->ops.target = disk_idx;
2387 sh->ops.target2 = other; 2387 sh->ops.target2 = other;
2388 s->uptodate += 2; 2388 s->uptodate += 2;
2389 s->req_compute = 1; 2389 s->req_compute = 1;
2390 return 1; 2390 return 1;
2391 } else if (test_bit(R5_Insync, &dev->flags)) { 2391 } else if (test_bit(R5_Insync, &dev->flags)) {
2392 set_bit(R5_LOCKED, &dev->flags); 2392 set_bit(R5_LOCKED, &dev->flags);
2393 set_bit(R5_Wantread, &dev->flags); 2393 set_bit(R5_Wantread, &dev->flags);
2394 s->locked++; 2394 s->locked++;
2395 pr_debug("Reading block %d (sync=%d)\n", 2395 pr_debug("Reading block %d (sync=%d)\n",
2396 disk_idx, s->syncing); 2396 disk_idx, s->syncing);
2397 } 2397 }
2398 } 2398 }
2399 2399
2400 return 0; 2400 return 0;
2401 } 2401 }
2402 2402
2403 /** 2403 /**
2404 * handle_stripe_fill6 - read or compute data to satisfy pending requests. 2404 * handle_stripe_fill6 - read or compute data to satisfy pending requests.
2405 */ 2405 */
2406 static void handle_stripe_fill6(struct stripe_head *sh, 2406 static void handle_stripe_fill6(struct stripe_head *sh,
2407 struct stripe_head_state *s, struct r6_state *r6s, 2407 struct stripe_head_state *s, struct r6_state *r6s,
2408 int disks) 2408 int disks)
2409 { 2409 {
2410 int i; 2410 int i;
2411 2411
2412 /* look for blocks to read/compute, skip this if a compute 2412 /* look for blocks to read/compute, skip this if a compute
2413 * is already in flight, or if the stripe contents are in the 2413 * is already in flight, or if the stripe contents are in the
2414 * midst of changing due to a write 2414 * midst of changing due to a write
2415 */ 2415 */
2416 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && 2416 if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state &&
2417 !sh->reconstruct_state) 2417 !sh->reconstruct_state)
2418 for (i = disks; i--; ) 2418 for (i = disks; i--; )
2419 if (fetch_block6(sh, s, r6s, i, disks)) 2419 if (fetch_block6(sh, s, r6s, i, disks))
2420 break; 2420 break;
2421 set_bit(STRIPE_HANDLE, &sh->state); 2421 set_bit(STRIPE_HANDLE, &sh->state);
2422 } 2422 }
2423 2423
2424 2424
2425 /* handle_stripe_clean_event 2425 /* handle_stripe_clean_event
2426 * any written block on an uptodate or failed drive can be returned. 2426 * any written block on an uptodate or failed drive can be returned.
2427 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but 2427 * Note that if we 'wrote' to a failed drive, it will be UPTODATE, but
2428 * never LOCKED, so we don't need to test 'failed' directly. 2428 * never LOCKED, so we don't need to test 'failed' directly.
2429 */ 2429 */
2430 static void handle_stripe_clean_event(raid5_conf_t *conf, 2430 static void handle_stripe_clean_event(raid5_conf_t *conf,
2431 struct stripe_head *sh, int disks, struct bio **return_bi) 2431 struct stripe_head *sh, int disks, struct bio **return_bi)
2432 { 2432 {
2433 int i; 2433 int i;
2434 struct r5dev *dev; 2434 struct r5dev *dev;
2435 2435
2436 for (i = disks; i--; ) 2436 for (i = disks; i--; )
2437 if (sh->dev[i].written) { 2437 if (sh->dev[i].written) {
2438 dev = &sh->dev[i]; 2438 dev = &sh->dev[i];
2439 if (!test_bit(R5_LOCKED, &dev->flags) && 2439 if (!test_bit(R5_LOCKED, &dev->flags) &&
2440 test_bit(R5_UPTODATE, &dev->flags)) { 2440 test_bit(R5_UPTODATE, &dev->flags)) {
2441 /* We can return any write requests */ 2441 /* We can return any write requests */
2442 struct bio *wbi, *wbi2; 2442 struct bio *wbi, *wbi2;
2443 int bitmap_end = 0; 2443 int bitmap_end = 0;
2444 pr_debug("Return write for disc %d\n", i); 2444 pr_debug("Return write for disc %d\n", i);
2445 spin_lock_irq(&conf->device_lock); 2445 spin_lock_irq(&conf->device_lock);
2446 wbi = dev->written; 2446 wbi = dev->written;
2447 dev->written = NULL; 2447 dev->written = NULL;
2448 while (wbi && wbi->bi_sector < 2448 while (wbi && wbi->bi_sector <
2449 dev->sector + STRIPE_SECTORS) { 2449 dev->sector + STRIPE_SECTORS) {
2450 wbi2 = r5_next_bio(wbi, dev->sector); 2450 wbi2 = r5_next_bio(wbi, dev->sector);
2451 if (!raid5_dec_bi_phys_segments(wbi)) { 2451 if (!raid5_dec_bi_phys_segments(wbi)) {
2452 md_write_end(conf->mddev); 2452 md_write_end(conf->mddev);
2453 wbi->bi_next = *return_bi; 2453 wbi->bi_next = *return_bi;
2454 *return_bi = wbi; 2454 *return_bi = wbi;
2455 } 2455 }
2456 wbi = wbi2; 2456 wbi = wbi2;
2457 } 2457 }
2458 if (dev->towrite == NULL) 2458 if (dev->towrite == NULL)
2459 bitmap_end = 1; 2459 bitmap_end = 1;
2460 spin_unlock_irq(&conf->device_lock); 2460 spin_unlock_irq(&conf->device_lock);
2461 if (bitmap_end) 2461 if (bitmap_end)
2462 bitmap_endwrite(conf->mddev->bitmap, 2462 bitmap_endwrite(conf->mddev->bitmap,
2463 sh->sector, 2463 sh->sector,
2464 STRIPE_SECTORS, 2464 STRIPE_SECTORS,
2465 !test_bit(STRIPE_DEGRADED, &sh->state), 2465 !test_bit(STRIPE_DEGRADED, &sh->state),
2466 0); 2466 0);
2467 } 2467 }
2468 } 2468 }
2469 2469
2470 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state)) 2470 if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
2471 if (atomic_dec_and_test(&conf->pending_full_writes)) 2471 if (atomic_dec_and_test(&conf->pending_full_writes))
2472 md_wakeup_thread(conf->mddev->thread); 2472 md_wakeup_thread(conf->mddev->thread);
2473 } 2473 }
2474 2474
2475 static void handle_stripe_dirtying5(raid5_conf_t *conf, 2475 static void handle_stripe_dirtying5(raid5_conf_t *conf,
2476 struct stripe_head *sh, struct stripe_head_state *s, int disks) 2476 struct stripe_head *sh, struct stripe_head_state *s, int disks)
2477 { 2477 {
2478 int rmw = 0, rcw = 0, i; 2478 int rmw = 0, rcw = 0, i;
2479 for (i = disks; i--; ) { 2479 for (i = disks; i--; ) {
2480 /* would I have to read this buffer for read_modify_write */ 2480 /* would I have to read this buffer for read_modify_write */
2481 struct r5dev *dev = &sh->dev[i]; 2481 struct r5dev *dev = &sh->dev[i];
2482 if ((dev->towrite || i == sh->pd_idx) && 2482 if ((dev->towrite || i == sh->pd_idx) &&
2483 !test_bit(R5_LOCKED, &dev->flags) && 2483 !test_bit(R5_LOCKED, &dev->flags) &&
2484 !(test_bit(R5_UPTODATE, &dev->flags) || 2484 !(test_bit(R5_UPTODATE, &dev->flags) ||
2485 test_bit(R5_Wantcompute, &dev->flags))) { 2485 test_bit(R5_Wantcompute, &dev->flags))) {
2486 if (test_bit(R5_Insync, &dev->flags)) 2486 if (test_bit(R5_Insync, &dev->flags))
2487 rmw++; 2487 rmw++;
2488 else 2488 else
2489 rmw += 2*disks; /* cannot read it */ 2489 rmw += 2*disks; /* cannot read it */
2490 } 2490 }
2491 /* Would I have to read this buffer for reconstruct_write */ 2491 /* Would I have to read this buffer for reconstruct_write */
2492 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx && 2492 if (!test_bit(R5_OVERWRITE, &dev->flags) && i != sh->pd_idx &&
2493 !test_bit(R5_LOCKED, &dev->flags) && 2493 !test_bit(R5_LOCKED, &dev->flags) &&
2494 !(test_bit(R5_UPTODATE, &dev->flags) || 2494 !(test_bit(R5_UPTODATE, &dev->flags) ||
2495 test_bit(R5_Wantcompute, &dev->flags))) { 2495 test_bit(R5_Wantcompute, &dev->flags))) {
2496 if (test_bit(R5_Insync, &dev->flags)) rcw++; 2496 if (test_bit(R5_Insync, &dev->flags)) rcw++;
2497 else 2497 else
2498 rcw += 2*disks; 2498 rcw += 2*disks;
2499 } 2499 }
2500 } 2500 }
2501 pr_debug("for sector %llu, rmw=%d rcw=%d\n", 2501 pr_debug("for sector %llu, rmw=%d rcw=%d\n",
2502 (unsigned long long)sh->sector, rmw, rcw); 2502 (unsigned long long)sh->sector, rmw, rcw);
2503 set_bit(STRIPE_HANDLE, &sh->state); 2503 set_bit(STRIPE_HANDLE, &sh->state);
2504 if (rmw < rcw && rmw > 0) 2504 if (rmw < rcw && rmw > 0)
2505 /* prefer read-modify-write, but need to get some data */ 2505 /* prefer read-modify-write, but need to get some data */
2506 for (i = disks; i--; ) { 2506 for (i = disks; i--; ) {
2507 struct r5dev *dev = &sh->dev[i]; 2507 struct r5dev *dev = &sh->dev[i];
2508 if ((dev->towrite || i == sh->pd_idx) && 2508 if ((dev->towrite || i == sh->pd_idx) &&
2509 !test_bit(R5_LOCKED, &dev->flags) && 2509 !test_bit(R5_LOCKED, &dev->flags) &&
2510 !(test_bit(R5_UPTODATE, &dev->flags) || 2510 !(test_bit(R5_UPTODATE, &dev->flags) ||
2511 test_bit(R5_Wantcompute, &dev->flags)) && 2511 test_bit(R5_Wantcompute, &dev->flags)) &&
2512 test_bit(R5_Insync, &dev->flags)) { 2512 test_bit(R5_Insync, &dev->flags)) {
2513 if ( 2513 if (
2514 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2514 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2515 pr_debug("Read_old block " 2515 pr_debug("Read_old block "
2516 "%d for r-m-w\n", i); 2516 "%d for r-m-w\n", i);
2517 set_bit(R5_LOCKED, &dev->flags); 2517 set_bit(R5_LOCKED, &dev->flags);
2518 set_bit(R5_Wantread, &dev->flags); 2518 set_bit(R5_Wantread, &dev->flags);
2519 s->locked++; 2519 s->locked++;
2520 } else { 2520 } else {
2521 set_bit(STRIPE_DELAYED, &sh->state); 2521 set_bit(STRIPE_DELAYED, &sh->state);
2522 set_bit(STRIPE_HANDLE, &sh->state); 2522 set_bit(STRIPE_HANDLE, &sh->state);
2523 } 2523 }
2524 } 2524 }
2525 } 2525 }
2526 if (rcw <= rmw && rcw > 0) 2526 if (rcw <= rmw && rcw > 0)
2527 /* want reconstruct write, but need to get some data */ 2527 /* want reconstruct write, but need to get some data */
2528 for (i = disks; i--; ) { 2528 for (i = disks; i--; ) {
2529 struct r5dev *dev = &sh->dev[i]; 2529 struct r5dev *dev = &sh->dev[i];
2530 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2530 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2531 i != sh->pd_idx && 2531 i != sh->pd_idx &&
2532 !test_bit(R5_LOCKED, &dev->flags) && 2532 !test_bit(R5_LOCKED, &dev->flags) &&
2533 !(test_bit(R5_UPTODATE, &dev->flags) || 2533 !(test_bit(R5_UPTODATE, &dev->flags) ||
2534 test_bit(R5_Wantcompute, &dev->flags)) && 2534 test_bit(R5_Wantcompute, &dev->flags)) &&
2535 test_bit(R5_Insync, &dev->flags)) { 2535 test_bit(R5_Insync, &dev->flags)) {
2536 if ( 2536 if (
2537 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2537 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2538 pr_debug("Read_old block " 2538 pr_debug("Read_old block "
2539 "%d for Reconstruct\n", i); 2539 "%d for Reconstruct\n", i);
2540 set_bit(R5_LOCKED, &dev->flags); 2540 set_bit(R5_LOCKED, &dev->flags);
2541 set_bit(R5_Wantread, &dev->flags); 2541 set_bit(R5_Wantread, &dev->flags);
2542 s->locked++; 2542 s->locked++;
2543 } else { 2543 } else {
2544 set_bit(STRIPE_DELAYED, &sh->state); 2544 set_bit(STRIPE_DELAYED, &sh->state);
2545 set_bit(STRIPE_HANDLE, &sh->state); 2545 set_bit(STRIPE_HANDLE, &sh->state);
2546 } 2546 }
2547 } 2547 }
2548 } 2548 }
2549 /* now if nothing is locked, and if we have enough data, 2549 /* now if nothing is locked, and if we have enough data,
2550 * we can start a write request 2550 * we can start a write request
2551 */ 2551 */
2552 /* since handle_stripe can be called at any time we need to handle the 2552 /* since handle_stripe can be called at any time we need to handle the
2553 * case where a compute block operation has been submitted and then a 2553 * case where a compute block operation has been submitted and then a
2554 * subsequent call wants to start a write request. raid_run_ops only 2554 * subsequent call wants to start a write request. raid_run_ops only
2555 * handles the case where compute block and reconstruct are requested 2555 * handles the case where compute block and reconstruct are requested
2556 * simultaneously. If this is not the case then new writes need to be 2556 * simultaneously. If this is not the case then new writes need to be
2557 * held off until the compute completes. 2557 * held off until the compute completes.
2558 */ 2558 */
2559 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2559 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2560 (s->locked == 0 && (rcw == 0 || rmw == 0) && 2560 (s->locked == 0 && (rcw == 0 || rmw == 0) &&
2561 !test_bit(STRIPE_BIT_DELAY, &sh->state))) 2561 !test_bit(STRIPE_BIT_DELAY, &sh->state)))
2562 schedule_reconstruction(sh, s, rcw == 0, 0); 2562 schedule_reconstruction(sh, s, rcw == 0, 0);
2563 } 2563 }
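
handle_stripe_dirtying5() picks between read-modify-write and reconstruct-write by counting how many missing blocks each strategy would have to read first, then taking the cheaper one. Below is a toy illustration of that cost comparison with a made-up stripe state; it ignores locked and failed devices and treats "being written" as fully overwritten.

/* Toy illustration of the rmw-vs-rcw choice made in
 * handle_stripe_dirtying5() above; it only counts missing reads. */
#include <stdbool.h>
#include <stdio.h>

struct blk {
	bool want_write;	/* new data queued for this block */
	bool uptodate;		/* current contents already in the cache */
};

int main(void)
{
	/* hypothetical 5-device stripe, device 4 holds parity */
	struct blk dev[5] = {
		{ true,  false },	/* data 0: being written, not cached */
		{ false, false },	/* data 1: untouched, not cached */
		{ false, false },	/* data 2: untouched, not cached */
		{ false, true  },	/* data 3: untouched, cached */
		{ false, false },	/* parity: not cached */
	};
	int pd_idx = 4, rmw = 0, rcw = 0;

	for (int i = 0; i < 5; i++) {
		/* rmw needs old copies of the written blocks and of parity */
		if ((dev[i].want_write || i == pd_idx) && !dev[i].uptodate)
			rmw++;
		/* rcw needs every data block that is not fully overwritten */
		if (i != pd_idx && !dev[i].want_write && !dev[i].uptodate)
			rcw++;
	}
	printf("rmw=%d rcw=%d -> %s\n", rmw, rcw,
	       rmw < rcw ? "read-modify-write" : "reconstruct-write");
	return 0;
}
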
2564 2564
2565 static void handle_stripe_dirtying6(raid5_conf_t *conf, 2565 static void handle_stripe_dirtying6(raid5_conf_t *conf,
2566 struct stripe_head *sh, struct stripe_head_state *s, 2566 struct stripe_head *sh, struct stripe_head_state *s,
2567 struct r6_state *r6s, int disks) 2567 struct r6_state *r6s, int disks)
2568 { 2568 {
2569 int rcw = 0, pd_idx = sh->pd_idx, i; 2569 int rcw = 0, pd_idx = sh->pd_idx, i;
2570 int qd_idx = sh->qd_idx; 2570 int qd_idx = sh->qd_idx;
2571 2571
2572 set_bit(STRIPE_HANDLE, &sh->state); 2572 set_bit(STRIPE_HANDLE, &sh->state);
2573 for (i = disks; i--; ) { 2573 for (i = disks; i--; ) {
2574 struct r5dev *dev = &sh->dev[i]; 2574 struct r5dev *dev = &sh->dev[i];
2575 /* check if we don't have enough data */ 2575 /* check if we don't have enough data */
2576 if (!test_bit(R5_OVERWRITE, &dev->flags) && 2576 if (!test_bit(R5_OVERWRITE, &dev->flags) &&
2577 i != pd_idx && i != qd_idx && 2577 i != pd_idx && i != qd_idx &&
2578 !test_bit(R5_LOCKED, &dev->flags) && 2578 !test_bit(R5_LOCKED, &dev->flags) &&
2579 !(test_bit(R5_UPTODATE, &dev->flags) || 2579 !(test_bit(R5_UPTODATE, &dev->flags) ||
2580 test_bit(R5_Wantcompute, &dev->flags))) { 2580 test_bit(R5_Wantcompute, &dev->flags))) {
2581 rcw++; 2581 rcw++;
2582 if (!test_bit(R5_Insync, &dev->flags)) 2582 if (!test_bit(R5_Insync, &dev->flags))
2583 continue; /* it's a failed drive */ 2583 continue; /* it's a failed drive */
2584 2584
2585 if ( 2585 if (
2586 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 2586 test_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
2587 pr_debug("Read_old stripe %llu " 2587 pr_debug("Read_old stripe %llu "
2588 "block %d for Reconstruct\n", 2588 "block %d for Reconstruct\n",
2589 (unsigned long long)sh->sector, i); 2589 (unsigned long long)sh->sector, i);
2590 set_bit(R5_LOCKED, &dev->flags); 2590 set_bit(R5_LOCKED, &dev->flags);
2591 set_bit(R5_Wantread, &dev->flags); 2591 set_bit(R5_Wantread, &dev->flags);
2592 s->locked++; 2592 s->locked++;
2593 } else { 2593 } else {
2594 pr_debug("Request delayed stripe %llu " 2594 pr_debug("Request delayed stripe %llu "
2595 "block %d for Reconstruct\n", 2595 "block %d for Reconstruct\n",
2596 (unsigned long long)sh->sector, i); 2596 (unsigned long long)sh->sector, i);
2597 set_bit(STRIPE_DELAYED, &sh->state); 2597 set_bit(STRIPE_DELAYED, &sh->state);
2598 set_bit(STRIPE_HANDLE, &sh->state); 2598 set_bit(STRIPE_HANDLE, &sh->state);
2599 } 2599 }
2600 } 2600 }
2601 } 2601 }
2602 /* now if nothing is locked, and if we have enough data, we can start a 2602 /* now if nothing is locked, and if we have enough data, we can start a
2603 * write request 2603 * write request
2604 */ 2604 */
2605 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) && 2605 if ((s->req_compute || !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) &&
2606 s->locked == 0 && rcw == 0 && 2606 s->locked == 0 && rcw == 0 &&
2607 !test_bit(STRIPE_BIT_DELAY, &sh->state)) { 2607 !test_bit(STRIPE_BIT_DELAY, &sh->state)) {
2608 schedule_reconstruction(sh, s, 1, 0); 2608 schedule_reconstruction(sh, s, 1, 0);
2609 } 2609 }
2610 } 2610 }
2611 2611
2612 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh, 2612 static void handle_parity_checks5(raid5_conf_t *conf, struct stripe_head *sh,
2613 struct stripe_head_state *s, int disks) 2613 struct stripe_head_state *s, int disks)
2614 { 2614 {
2615 struct r5dev *dev = NULL; 2615 struct r5dev *dev = NULL;
2616 2616
2617 set_bit(STRIPE_HANDLE, &sh->state); 2617 set_bit(STRIPE_HANDLE, &sh->state);
2618 2618
2619 switch (sh->check_state) { 2619 switch (sh->check_state) {
2620 case check_state_idle: 2620 case check_state_idle:
2621 /* start a new check operation if there are no failures */ 2621 /* start a new check operation if there are no failures */
2622 if (s->failed == 0) { 2622 if (s->failed == 0) {
2623 BUG_ON(s->uptodate != disks); 2623 BUG_ON(s->uptodate != disks);
2624 sh->check_state = check_state_run; 2624 sh->check_state = check_state_run;
2625 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2625 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2626 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags); 2626 clear_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags);
2627 s->uptodate--; 2627 s->uptodate--;
2628 break; 2628 break;
2629 } 2629 }
2630 dev = &sh->dev[s->failed_num]; 2630 dev = &sh->dev[s->failed_num];
2631 /* fall through */ 2631 /* fall through */
2632 case check_state_compute_result: 2632 case check_state_compute_result:
2633 sh->check_state = check_state_idle; 2633 sh->check_state = check_state_idle;
2634 if (!dev) 2634 if (!dev)
2635 dev = &sh->dev[sh->pd_idx]; 2635 dev = &sh->dev[sh->pd_idx];
2636 2636
2637 /* check that a write has not made the stripe insync */ 2637 /* check that a write has not made the stripe insync */
2638 if (test_bit(STRIPE_INSYNC, &sh->state)) 2638 if (test_bit(STRIPE_INSYNC, &sh->state))
2639 break; 2639 break;
2640 2640
2641 /* either failed parity check, or recovery is happening */ 2641 /* either failed parity check, or recovery is happening */
2642 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 2642 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
2643 BUG_ON(s->uptodate != disks); 2643 BUG_ON(s->uptodate != disks);
2644 2644
2645 set_bit(R5_LOCKED, &dev->flags); 2645 set_bit(R5_LOCKED, &dev->flags);
2646 s->locked++; 2646 s->locked++;
2647 set_bit(R5_Wantwrite, &dev->flags); 2647 set_bit(R5_Wantwrite, &dev->flags);
2648 2648
2649 clear_bit(STRIPE_DEGRADED, &sh->state); 2649 clear_bit(STRIPE_DEGRADED, &sh->state);
2650 set_bit(STRIPE_INSYNC, &sh->state); 2650 set_bit(STRIPE_INSYNC, &sh->state);
2651 break; 2651 break;
2652 case check_state_run: 2652 case check_state_run:
2653 break; /* we will be called again upon completion */ 2653 break; /* we will be called again upon completion */
2654 case check_state_check_result: 2654 case check_state_check_result:
2655 sh->check_state = check_state_idle; 2655 sh->check_state = check_state_idle;
2656 2656
2657 /* if a failure occurred during the check operation, leave 2657 /* if a failure occurred during the check operation, leave
2658 * STRIPE_INSYNC not set and let the stripe be handled again 2658 * STRIPE_INSYNC not set and let the stripe be handled again
2659 */ 2659 */
2660 if (s->failed) 2660 if (s->failed)
2661 break; 2661 break;
2662 2662
2663 /* handle a successful check operation, if parity is correct 2663 /* handle a successful check operation, if parity is correct
2664 * we are done. Otherwise update the mismatch count and repair 2664 * we are done. Otherwise update the mismatch count and repair
2665 * parity if !MD_RECOVERY_CHECK 2665 * parity if !MD_RECOVERY_CHECK
2666 */ 2666 */
2667 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0) 2667 if ((sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) == 0)
2668 /* parity is correct (on disc, 2668 /* parity is correct (on disc,
2669 * not in buffer any more) 2669 * not in buffer any more)
2670 */ 2670 */
2671 set_bit(STRIPE_INSYNC, &sh->state); 2671 set_bit(STRIPE_INSYNC, &sh->state);
2672 else { 2672 else {
2673 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2673 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2674 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2674 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2675 /* don't try to repair!! */ 2675 /* don't try to repair!! */
2676 set_bit(STRIPE_INSYNC, &sh->state); 2676 set_bit(STRIPE_INSYNC, &sh->state);
2677 else { 2677 else {
2678 sh->check_state = check_state_compute_run; 2678 sh->check_state = check_state_compute_run;
2679 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2679 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2680 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2680 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2681 set_bit(R5_Wantcompute, 2681 set_bit(R5_Wantcompute,
2682 &sh->dev[sh->pd_idx].flags); 2682 &sh->dev[sh->pd_idx].flags);
2683 sh->ops.target = sh->pd_idx; 2683 sh->ops.target = sh->pd_idx;
2684 sh->ops.target2 = -1; 2684 sh->ops.target2 = -1;
2685 s->uptodate++; 2685 s->uptodate++;
2686 } 2686 }
2687 } 2687 }
2688 break; 2688 break;
2689 case check_state_compute_run: 2689 case check_state_compute_run:
2690 break; 2690 break;
2691 default: 2691 default:
2692 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2692 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2693 __func__, sh->check_state, 2693 __func__, sh->check_state,
2694 (unsigned long long) sh->sector); 2694 (unsigned long long) sh->sector);
2695 BUG(); 2695 BUG();
2696 } 2696 }
2697 } 2697 }
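The repair decision that check_state_check_result acts on above reduces to an XOR zero-sum test over the stripe: if the stored parity XORs to zero against the data blocks the stripe is in sync; otherwise the mismatch is counted and, unless the array is running a check-only pass, the parity block is recomputed and scheduled for writeback. Below is a minimal userspace sketch of that decision; the block count, block size, data[] contents and the repair_mode flag are illustrative stand-ins, not the driver's own structures.

/* Zero-sum parity check sketch: XOR the data blocks into the stored
 * parity; any non-zero byte means a mismatch (the SUM_CHECK_P_RESULT
 * analogue), which repair mode fixes by recomputing the parity.
 */
#include <stdio.h>
#include <string.h>

#define NDATA 4          /* data blocks per stripe (illustrative) */
#define BLK   16         /* bytes per block (illustrative) */

static int parity_mismatch(unsigned char data[NDATA][BLK],
                           const unsigned char *parity)
{
    unsigned char sum[BLK];
    memcpy(sum, parity, BLK);
    for (int d = 0; d < NDATA; d++)
        for (int b = 0; b < BLK; b++)
            sum[b] ^= data[d][b];
    for (int b = 0; b < BLK; b++)
        if (sum[b])
            return 1;   /* parity inconsistent with the data */
    return 0;
}

int main(void)
{
    unsigned char data[NDATA][BLK] = { { 1 }, { 2 }, { 4 }, { 8 } };
    unsigned char parity[BLK] = { 0 };
    int repair_mode = 1;        /* 0 behaves like MD_RECOVERY_CHECK */

    /* correct parity is the XOR of all data blocks */
    for (int d = 0; d < NDATA; d++)
        for (int b = 0; b < BLK; b++)
            parity[b] ^= data[d][b];

    parity[0] ^= 0xff;          /* corrupt it to force a mismatch */

    if (!parity_mismatch(data, parity)) {
        puts("parity correct: stripe would be marked in-sync");
    } else if (!repair_mode) {
        puts("mismatch counted only (check, no repair)");
    } else {
        for (int b = 0; b < BLK; b++) {       /* recompute parity */
            parity[b] = 0;
            for (int d = 0; d < NDATA; d++)
                parity[b] ^= data[d][b];
        }
        puts("mismatch counted and parity recomputed for writeback");
    }
    return 0;
}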
2698 2698
2699 2699
2700 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh, 2700 static void handle_parity_checks6(raid5_conf_t *conf, struct stripe_head *sh,
2701 struct stripe_head_state *s, 2701 struct stripe_head_state *s,
2702 struct r6_state *r6s, int disks) 2702 struct r6_state *r6s, int disks)
2703 { 2703 {
2704 int pd_idx = sh->pd_idx; 2704 int pd_idx = sh->pd_idx;
2705 int qd_idx = sh->qd_idx; 2705 int qd_idx = sh->qd_idx;
2706 struct r5dev *dev; 2706 struct r5dev *dev;
2707 2707
2708 set_bit(STRIPE_HANDLE, &sh->state); 2708 set_bit(STRIPE_HANDLE, &sh->state);
2709 2709
2710 BUG_ON(s->failed > 2); 2710 BUG_ON(s->failed > 2);
2711 2711
2712 /* Want to check and possibly repair P and Q. 2712 /* Want to check and possibly repair P and Q.
2713 * However there could be one 'failed' device, in which 2713 * However there could be one 'failed' device, in which
2714 * case we can only check one of them, possibly using the 2714 * case we can only check one of them, possibly using the
2715 * other to generate missing data 2715 * other to generate missing data
2716 */ 2716 */
2717 2717
2718 switch (sh->check_state) { 2718 switch (sh->check_state) {
2719 case check_state_idle: 2719 case check_state_idle:
2720 /* start a new check operation if there are < 2 failures */ 2720 /* start a new check operation if there are < 2 failures */
2721 if (s->failed == r6s->q_failed) { 2721 if (s->failed == r6s->q_failed) {
2722 /* The only possible failed device holds Q, so it 2722 /* The only possible failed device holds Q, so it
2723 * makes sense to check P (If anything else were failed, 2723 * makes sense to check P (If anything else were failed,
2724 * we would have used P to recreate it). 2724 * we would have used P to recreate it).
2725 */ 2725 */
2726 sh->check_state = check_state_run; 2726 sh->check_state = check_state_run;
2727 } 2727 }
2728 if (!r6s->q_failed && s->failed < 2) { 2728 if (!r6s->q_failed && s->failed < 2) {
2729 /* Q is not failed, and we didn't use it to generate 2729 /* Q is not failed, and we didn't use it to generate
2730 * anything, so it makes sense to check it 2730 * anything, so it makes sense to check it
2731 */ 2731 */
2732 if (sh->check_state == check_state_run) 2732 if (sh->check_state == check_state_run)
2733 sh->check_state = check_state_run_pq; 2733 sh->check_state = check_state_run_pq;
2734 else 2734 else
2735 sh->check_state = check_state_run_q; 2735 sh->check_state = check_state_run_q;
2736 } 2736 }
2737 2737
2738 /* discard potentially stale zero_sum_result */ 2738 /* discard potentially stale zero_sum_result */
2739 sh->ops.zero_sum_result = 0; 2739 sh->ops.zero_sum_result = 0;
2740 2740
2741 if (sh->check_state == check_state_run) { 2741 if (sh->check_state == check_state_run) {
2742 /* async_xor_zero_sum destroys the contents of P */ 2742 /* async_xor_zero_sum destroys the contents of P */
2743 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags); 2743 clear_bit(R5_UPTODATE, &sh->dev[pd_idx].flags);
2744 s->uptodate--; 2744 s->uptodate--;
2745 } 2745 }
2746 if (sh->check_state >= check_state_run && 2746 if (sh->check_state >= check_state_run &&
2747 sh->check_state <= check_state_run_pq) { 2747 sh->check_state <= check_state_run_pq) {
2748 /* async_syndrome_zero_sum preserves P and Q, so 2748 /* async_syndrome_zero_sum preserves P and Q, so
2749 * no need to mark them !uptodate here 2749 * no need to mark them !uptodate here
2750 */ 2750 */
2751 set_bit(STRIPE_OP_CHECK, &s->ops_request); 2751 set_bit(STRIPE_OP_CHECK, &s->ops_request);
2752 break; 2752 break;
2753 } 2753 }
2754 2754
2755 /* we have 2-disk failure */ 2755 /* we have 2-disk failure */
2756 BUG_ON(s->failed != 2); 2756 BUG_ON(s->failed != 2);
2757 /* fall through */ 2757 /* fall through */
2758 case check_state_compute_result: 2758 case check_state_compute_result:
2759 sh->check_state = check_state_idle; 2759 sh->check_state = check_state_idle;
2760 2760
2761 /* check that a write has not made the stripe insync */ 2761 /* check that a write has not made the stripe insync */
2762 if (test_bit(STRIPE_INSYNC, &sh->state)) 2762 if (test_bit(STRIPE_INSYNC, &sh->state))
2763 break; 2763 break;
2764 2764
2765 /* now write out any block on a failed drive, 2765 /* now write out any block on a failed drive,
2766 * or P or Q if they were recomputed 2766 * or P or Q if they were recomputed
2767 */ 2767 */
2768 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */ 2768 BUG_ON(s->uptodate < disks - 1); /* We don't need Q to recover */
2769 if (s->failed == 2) { 2769 if (s->failed == 2) {
2770 dev = &sh->dev[r6s->failed_num[1]]; 2770 dev = &sh->dev[r6s->failed_num[1]];
2771 s->locked++; 2771 s->locked++;
2772 set_bit(R5_LOCKED, &dev->flags); 2772 set_bit(R5_LOCKED, &dev->flags);
2773 set_bit(R5_Wantwrite, &dev->flags); 2773 set_bit(R5_Wantwrite, &dev->flags);
2774 } 2774 }
2775 if (s->failed >= 1) { 2775 if (s->failed >= 1) {
2776 dev = &sh->dev[r6s->failed_num[0]]; 2776 dev = &sh->dev[r6s->failed_num[0]];
2777 s->locked++; 2777 s->locked++;
2778 set_bit(R5_LOCKED, &dev->flags); 2778 set_bit(R5_LOCKED, &dev->flags);
2779 set_bit(R5_Wantwrite, &dev->flags); 2779 set_bit(R5_Wantwrite, &dev->flags);
2780 } 2780 }
2781 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2781 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2782 dev = &sh->dev[pd_idx]; 2782 dev = &sh->dev[pd_idx];
2783 s->locked++; 2783 s->locked++;
2784 set_bit(R5_LOCKED, &dev->flags); 2784 set_bit(R5_LOCKED, &dev->flags);
2785 set_bit(R5_Wantwrite, &dev->flags); 2785 set_bit(R5_Wantwrite, &dev->flags);
2786 } 2786 }
2787 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2787 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2788 dev = &sh->dev[qd_idx]; 2788 dev = &sh->dev[qd_idx];
2789 s->locked++; 2789 s->locked++;
2790 set_bit(R5_LOCKED, &dev->flags); 2790 set_bit(R5_LOCKED, &dev->flags);
2791 set_bit(R5_Wantwrite, &dev->flags); 2791 set_bit(R5_Wantwrite, &dev->flags);
2792 } 2792 }
2793 clear_bit(STRIPE_DEGRADED, &sh->state); 2793 clear_bit(STRIPE_DEGRADED, &sh->state);
2794 2794
2795 set_bit(STRIPE_INSYNC, &sh->state); 2795 set_bit(STRIPE_INSYNC, &sh->state);
2796 break; 2796 break;
2797 case check_state_run: 2797 case check_state_run:
2798 case check_state_run_q: 2798 case check_state_run_q:
2799 case check_state_run_pq: 2799 case check_state_run_pq:
2800 break; /* we will be called again upon completion */ 2800 break; /* we will be called again upon completion */
2801 case check_state_check_result: 2801 case check_state_check_result:
2802 sh->check_state = check_state_idle; 2802 sh->check_state = check_state_idle;
2803 2803
2804 /* handle a successful check operation, if parity is correct 2804 /* handle a successful check operation, if parity is correct
2805 * we are done. Otherwise update the mismatch count and repair 2805 * we are done. Otherwise update the mismatch count and repair
2806 * parity if !MD_RECOVERY_CHECK 2806 * parity if !MD_RECOVERY_CHECK
2807 */ 2807 */
2808 if (sh->ops.zero_sum_result == 0) { 2808 if (sh->ops.zero_sum_result == 0) {
2809 /* both parities are correct */ 2809 /* both parities are correct */
2810 if (!s->failed) 2810 if (!s->failed)
2811 set_bit(STRIPE_INSYNC, &sh->state); 2811 set_bit(STRIPE_INSYNC, &sh->state);
2812 else { 2812 else {
2813 /* in contrast to the raid5 case we can validate 2813 /* in contrast to the raid5 case we can validate
2814 * parity, but still have a failure to write 2814 * parity, but still have a failure to write
2815 * back 2815 * back
2816 */ 2816 */
2817 sh->check_state = check_state_compute_result; 2817 sh->check_state = check_state_compute_result;
2818 /* Returning at this point means that we may go 2818 /* Returning at this point means that we may go
2819 * off and bring p and/or q uptodate again so 2819 * off and bring p and/or q uptodate again so
2820 * we make sure to check zero_sum_result again 2820 * we make sure to check zero_sum_result again
2821 * to verify if p or q need writeback 2821 * to verify if p or q need writeback
2822 */ 2822 */
2823 } 2823 }
2824 } else { 2824 } else {
2825 conf->mddev->resync_mismatches += STRIPE_SECTORS; 2825 conf->mddev->resync_mismatches += STRIPE_SECTORS;
2826 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) 2826 if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery))
2827 /* don't try to repair!! */ 2827 /* don't try to repair!! */
2828 set_bit(STRIPE_INSYNC, &sh->state); 2828 set_bit(STRIPE_INSYNC, &sh->state);
2829 else { 2829 else {
2830 int *target = &sh->ops.target; 2830 int *target = &sh->ops.target;
2831 2831
2832 sh->ops.target = -1; 2832 sh->ops.target = -1;
2833 sh->ops.target2 = -1; 2833 sh->ops.target2 = -1;
2834 sh->check_state = check_state_compute_run; 2834 sh->check_state = check_state_compute_run;
2835 set_bit(STRIPE_COMPUTE_RUN, &sh->state); 2835 set_bit(STRIPE_COMPUTE_RUN, &sh->state);
2836 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request); 2836 set_bit(STRIPE_OP_COMPUTE_BLK, &s->ops_request);
2837 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) { 2837 if (sh->ops.zero_sum_result & SUM_CHECK_P_RESULT) {
2838 set_bit(R5_Wantcompute, 2838 set_bit(R5_Wantcompute,
2839 &sh->dev[pd_idx].flags); 2839 &sh->dev[pd_idx].flags);
2840 *target = pd_idx; 2840 *target = pd_idx;
2841 target = &sh->ops.target2; 2841 target = &sh->ops.target2;
2842 s->uptodate++; 2842 s->uptodate++;
2843 } 2843 }
2844 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) { 2844 if (sh->ops.zero_sum_result & SUM_CHECK_Q_RESULT) {
2845 set_bit(R5_Wantcompute, 2845 set_bit(R5_Wantcompute,
2846 &sh->dev[qd_idx].flags); 2846 &sh->dev[qd_idx].flags);
2847 *target = qd_idx; 2847 *target = qd_idx;
2848 s->uptodate++; 2848 s->uptodate++;
2849 } 2849 }
2850 } 2850 }
2851 } 2851 }
2852 break; 2852 break;
2853 case check_state_compute_run: 2853 case check_state_compute_run:
2854 break; 2854 break;
2855 default: 2855 default:
2856 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n", 2856 printk(KERN_ERR "%s: unknown check_state: %d sector: %llu\n",
2857 __func__, sh->check_state, 2857 __func__, sh->check_state,
2858 (unsigned long long) sh->sector); 2858 (unsigned long long) sh->sector);
2859 BUG(); 2859 BUG();
2860 } 2860 }
2861 } 2861 }
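The idle case at the top of handle_parity_checks6 decides which parity blocks a RAID-6 check pass should verify: P is checkable whenever the only possible failed device is Q itself, and Q is added whenever it is healthy and was not already consumed to regenerate data, yielding the run, run_q and run_pq variants. A small standalone sketch of that selection follows; the enum and the failed/q_failed inputs are stand-ins for the driver's state, chosen only to make the decision visible.

/* Parity-check selection sketch for a two-parity stripe. */
#include <stdio.h>

enum check_run { RUN_NONE, RUN_P, RUN_Q, RUN_PQ };

static enum check_run pick_parity_check(int failed, int q_failed)
{
    enum check_run run = RUN_NONE;

    if (failed == q_failed)      /* only possible failure is Q: P is checkable */
        run = RUN_P;
    if (!q_failed && failed < 2) /* Q healthy and not used for recovery */
        run = (run == RUN_P) ? RUN_PQ : RUN_Q;
    return run;
}

int main(void)
{
    printf("no failures     -> %d (expect RUN_PQ=%d)\n",
           pick_parity_check(0, 0), RUN_PQ);
    printf("Q failed        -> %d (expect RUN_P=%d)\n",
           pick_parity_check(1, 1), RUN_P);
    printf("one data failed -> %d (expect RUN_Q=%d)\n",
           pick_parity_check(1, 0), RUN_Q);
    return 0;
}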
2862 2862
2863 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh, 2863 static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
2864 struct r6_state *r6s) 2864 struct r6_state *r6s)
2865 { 2865 {
2866 int i; 2866 int i;
2867 2867
2868 /* We have read all the blocks in this stripe and now we need to 2868 /* We have read all the blocks in this stripe and now we need to
2869 * copy some of them into a target stripe for expand. 2869 * copy some of them into a target stripe for expand.
2870 */ 2870 */
2871 struct dma_async_tx_descriptor *tx = NULL; 2871 struct dma_async_tx_descriptor *tx = NULL;
2872 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2872 clear_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2873 for (i = 0; i < sh->disks; i++) 2873 for (i = 0; i < sh->disks; i++)
2874 if (i != sh->pd_idx && i != sh->qd_idx) { 2874 if (i != sh->pd_idx && i != sh->qd_idx) {
2875 int dd_idx, j; 2875 int dd_idx, j;
2876 struct stripe_head *sh2; 2876 struct stripe_head *sh2;
2877 struct async_submit_ctl submit; 2877 struct async_submit_ctl submit;
2878 2878
2879 sector_t bn = compute_blocknr(sh, i, 1); 2879 sector_t bn = compute_blocknr(sh, i, 1);
2880 sector_t s = raid5_compute_sector(conf, bn, 0, 2880 sector_t s = raid5_compute_sector(conf, bn, 0,
2881 &dd_idx, NULL); 2881 &dd_idx, NULL);
2882 sh2 = get_active_stripe(conf, s, 0, 1, 1); 2882 sh2 = get_active_stripe(conf, s, 0, 1, 1);
2883 if (sh2 == NULL) 2883 if (sh2 == NULL)
2884 /* so far only the early blocks of this stripe 2884 /* so far only the early blocks of this stripe
2885 * have been requested. When later blocks 2885 * have been requested. When later blocks
2886 * get requested, we will try again 2886 * get requested, we will try again
2887 */ 2887 */
2888 continue; 2888 continue;
2889 if (!test_bit(STRIPE_EXPANDING, &sh2->state) || 2889 if (!test_bit(STRIPE_EXPANDING, &sh2->state) ||
2890 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) { 2890 test_bit(R5_Expanded, &sh2->dev[dd_idx].flags)) {
2891 /* must have already done this block */ 2891 /* must have already done this block */
2892 release_stripe(sh2); 2892 release_stripe(sh2);
2893 continue; 2893 continue;
2894 } 2894 }
2895 2895
2896 /* place all the copies on one channel */ 2896 /* place all the copies on one channel */
2897 init_async_submit(&submit, 0, tx, NULL, NULL, NULL); 2897 init_async_submit(&submit, 0, tx, NULL, NULL, NULL);
2898 tx = async_memcpy(sh2->dev[dd_idx].page, 2898 tx = async_memcpy(sh2->dev[dd_idx].page,
2899 sh->dev[i].page, 0, 0, STRIPE_SIZE, 2899 sh->dev[i].page, 0, 0, STRIPE_SIZE,
2900 &submit); 2900 &submit);
2901 2901
2902 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); 2902 set_bit(R5_Expanded, &sh2->dev[dd_idx].flags);
2903 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags); 2903 set_bit(R5_UPTODATE, &sh2->dev[dd_idx].flags);
2904 for (j = 0; j < conf->raid_disks; j++) 2904 for (j = 0; j < conf->raid_disks; j++)
2905 if (j != sh2->pd_idx && 2905 if (j != sh2->pd_idx &&
2906 (!r6s || j != sh2->qd_idx) && 2906 (!r6s || j != sh2->qd_idx) &&
2907 !test_bit(R5_Expanded, &sh2->dev[j].flags)) 2907 !test_bit(R5_Expanded, &sh2->dev[j].flags))
2908 break; 2908 break;
2909 if (j == conf->raid_disks) { 2909 if (j == conf->raid_disks) {
2910 set_bit(STRIPE_EXPAND_READY, &sh2->state); 2910 set_bit(STRIPE_EXPAND_READY, &sh2->state);
2911 set_bit(STRIPE_HANDLE, &sh2->state); 2911 set_bit(STRIPE_HANDLE, &sh2->state);
2912 } 2912 }
2913 release_stripe(sh2); 2913 release_stripe(sh2);
2914 2914
2915 } 2915 }
2916 /* done submitting copies, wait for them to complete */ 2916 /* done submitting copies, wait for them to complete */
2917 if (tx) { 2917 if (tx) {
2918 async_tx_ack(tx); 2918 async_tx_ack(tx);
2919 dma_wait_for_async_tx(tx); 2919 dma_wait_for_async_tx(tx);
2920 } 2920 }
2921 } 2921 }
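A destination stripe touched by the copy loop above is only promoted to EXPAND_READY once every non-parity slot has received its block; the parity slots are regenerated later rather than copied. Here is a brief sketch of that readiness test, with an expanded[] array standing in for the per-device R5_Expanded flags; the disk count and index values are made up for the example, and qd_idx of -1 models the raid5 case where there is no Q slot to skip.

/* Readiness test sketch: every slot that is not a parity slot must
 * have been copied before the target stripe can be scheduled.
 */
#include <stdio.h>
#include <stdbool.h>

#define RAID_DISKS 5

static bool stripe_fully_expanded(const bool expanded[RAID_DISKS],
                                  int pd_idx, int qd_idx)
{
    for (int j = 0; j < RAID_DISKS; j++) {
        if (j == pd_idx || j == qd_idx)
            continue;           /* parity slots are regenerated, not copied */
        if (!expanded[j])
            return false;       /* still waiting for this data block */
    }
    return true;
}

int main(void)
{
    bool expanded[RAID_DISKS] = { true, false, true, true, true };
    int pd_idx = 2, qd_idx = -1;

    printf("ready: %d\n", stripe_fully_expanded(expanded, pd_idx, qd_idx));
    expanded[1] = true;         /* last missing data block arrives */
    printf("ready: %d\n", stripe_fully_expanded(expanded, pd_idx, qd_idx));
    return 0;
}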
2922 2922
2923 2923
2924 /* 2924 /*
2925 * handle_stripe - do things to a stripe. 2925 * handle_stripe - do things to a stripe.
2926 * 2926 *
2927 * We lock the stripe and then examine the state of various bits 2927 * We lock the stripe and then examine the state of various bits
2928 * to see what needs to be done. 2928 * to see what needs to be done.
2929 * Possible results: 2929 * Possible results:
2930 * return some read requests which now have data 2930 * return some read requests which now have data
2931 * return some write requests which are safely on disc 2931 * return some write requests which are safely on disc
2932 * schedule a read on some buffers 2932 * schedule a read on some buffers
2933 * schedule a write of some buffers 2933 * schedule a write of some buffers
2934 * return confirmation of parity correctness 2934 * return confirmation of parity correctness
2935 * 2935 *
2936 * buffers are taken off read_list or write_list, and bh_cache buffers 2936 * buffers are taken off read_list or write_list, and bh_cache buffers
2937 * get BH_Lock set before the stripe lock is released. 2937 * get BH_Lock set before the stripe lock is released.
2938 * 2938 *
2939 */ 2939 */
2940 2940
2941 static void handle_stripe5(struct stripe_head *sh) 2941 static void handle_stripe5(struct stripe_head *sh)
2942 { 2942 {
2943 raid5_conf_t *conf = sh->raid_conf; 2943 raid5_conf_t *conf = sh->raid_conf;
2944 int disks = sh->disks, i; 2944 int disks = sh->disks, i;
2945 struct bio *return_bi = NULL; 2945 struct bio *return_bi = NULL;
2946 struct stripe_head_state s; 2946 struct stripe_head_state s;
2947 struct r5dev *dev; 2947 struct r5dev *dev;
2948 mdk_rdev_t *blocked_rdev = NULL; 2948 mdk_rdev_t *blocked_rdev = NULL;
2949 int prexor; 2949 int prexor;
2950 2950
2951 memset(&s, 0, sizeof(s)); 2951 memset(&s, 0, sizeof(s));
2952 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d " 2952 pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d check:%d "
2953 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state, 2953 "reconstruct:%d\n", (unsigned long long)sh->sector, sh->state,
2954 atomic_read(&sh->count), sh->pd_idx, sh->check_state, 2954 atomic_read(&sh->count), sh->pd_idx, sh->check_state,
2955 sh->reconstruct_state); 2955 sh->reconstruct_state);
2956 2956
2957 spin_lock(&sh->lock); 2957 spin_lock(&sh->lock);
2958 clear_bit(STRIPE_HANDLE, &sh->state); 2958 clear_bit(STRIPE_HANDLE, &sh->state);
2959 clear_bit(STRIPE_DELAYED, &sh->state); 2959 clear_bit(STRIPE_DELAYED, &sh->state);
2960 2960
2961 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 2961 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
2962 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 2962 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
2963 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 2963 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
2964 2964
2965 /* Now to look around and see what can be done */ 2965 /* Now to look around and see what can be done */
2966 rcu_read_lock(); 2966 rcu_read_lock();
2967 for (i=disks; i--; ) { 2967 for (i=disks; i--; ) {
2968 mdk_rdev_t *rdev; 2968 mdk_rdev_t *rdev;
2969 2969
2970 dev = &sh->dev[i]; 2970 dev = &sh->dev[i];
2971 clear_bit(R5_Insync, &dev->flags); 2971 clear_bit(R5_Insync, &dev->flags);
2972 2972
2973 pr_debug("check %d: state 0x%lx toread %p read %p write %p " 2973 pr_debug("check %d: state 0x%lx toread %p read %p write %p "
2974 "written %p\n", i, dev->flags, dev->toread, dev->read, 2974 "written %p\n", i, dev->flags, dev->toread, dev->read,
2975 dev->towrite, dev->written); 2975 dev->towrite, dev->written);
2976 2976
2977 /* maybe we can request a biofill operation 2977 /* maybe we can request a biofill operation
2978 * 2978 *
2979 * new wantfill requests are only permitted while 2979 * new wantfill requests are only permitted while
2980 * ops_complete_biofill is guaranteed to be inactive 2980 * ops_complete_biofill is guaranteed to be inactive
2981 */ 2981 */
2982 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 2982 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
2983 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 2983 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
2984 set_bit(R5_Wantfill, &dev->flags); 2984 set_bit(R5_Wantfill, &dev->flags);
2985 2985
2986 /* now count some things */ 2986 /* now count some things */
2987 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 2987 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
2988 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 2988 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
2989 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++; 2989 if (test_bit(R5_Wantcompute, &dev->flags)) s.compute++;
2990 2990
2991 if (test_bit(R5_Wantfill, &dev->flags)) 2991 if (test_bit(R5_Wantfill, &dev->flags))
2992 s.to_fill++; 2992 s.to_fill++;
2993 else if (dev->toread) 2993 else if (dev->toread)
2994 s.to_read++; 2994 s.to_read++;
2995 if (dev->towrite) { 2995 if (dev->towrite) {
2996 s.to_write++; 2996 s.to_write++;
2997 if (!test_bit(R5_OVERWRITE, &dev->flags)) 2997 if (!test_bit(R5_OVERWRITE, &dev->flags))
2998 s.non_overwrite++; 2998 s.non_overwrite++;
2999 } 2999 }
3000 if (dev->written) 3000 if (dev->written)
3001 s.written++; 3001 s.written++;
3002 rdev = rcu_dereference(conf->disks[i].rdev); 3002 rdev = rcu_dereference(conf->disks[i].rdev);
3003 if (blocked_rdev == NULL && 3003 if (blocked_rdev == NULL &&
3004 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3004 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3005 blocked_rdev = rdev; 3005 blocked_rdev = rdev;
3006 atomic_inc(&rdev->nr_pending); 3006 atomic_inc(&rdev->nr_pending);
3007 } 3007 }
3008 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3008 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
3009 /* The ReadError flag will just be confusing now */ 3009 /* The ReadError flag will just be confusing now */
3010 clear_bit(R5_ReadError, &dev->flags); 3010 clear_bit(R5_ReadError, &dev->flags);
3011 clear_bit(R5_ReWrite, &dev->flags); 3011 clear_bit(R5_ReWrite, &dev->flags);
3012 } 3012 }
3013 if (!rdev || !test_bit(In_sync, &rdev->flags) 3013 if (!rdev || !test_bit(In_sync, &rdev->flags)
3014 || test_bit(R5_ReadError, &dev->flags)) { 3014 || test_bit(R5_ReadError, &dev->flags)) {
3015 s.failed++; 3015 s.failed++;
3016 s.failed_num = i; 3016 s.failed_num = i;
3017 } else 3017 } else
3018 set_bit(R5_Insync, &dev->flags); 3018 set_bit(R5_Insync, &dev->flags);
3019 } 3019 }
3020 rcu_read_unlock(); 3020 rcu_read_unlock();
3021 3021
3022 if (unlikely(blocked_rdev)) { 3022 if (unlikely(blocked_rdev)) {
3023 if (s.syncing || s.expanding || s.expanded || 3023 if (s.syncing || s.expanding || s.expanded ||
3024 s.to_write || s.written) { 3024 s.to_write || s.written) {
3025 set_bit(STRIPE_HANDLE, &sh->state); 3025 set_bit(STRIPE_HANDLE, &sh->state);
3026 goto unlock; 3026 goto unlock;
3027 } 3027 }
3028 /* There is nothing for the blocked_rdev to block */ 3028 /* There is nothing for the blocked_rdev to block */
3029 rdev_dec_pending(blocked_rdev, conf->mddev); 3029 rdev_dec_pending(blocked_rdev, conf->mddev);
3030 blocked_rdev = NULL; 3030 blocked_rdev = NULL;
3031 } 3031 }
3032 3032
3033 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3033 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3034 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3034 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3035 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3035 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3036 } 3036 }
3037 3037
3038 pr_debug("locked=%d uptodate=%d to_read=%d" 3038 pr_debug("locked=%d uptodate=%d to_read=%d"
3039 " to_write=%d failed=%d failed_num=%d\n", 3039 " to_write=%d failed=%d failed_num=%d\n",
3040 s.locked, s.uptodate, s.to_read, s.to_write, 3040 s.locked, s.uptodate, s.to_read, s.to_write,
3041 s.failed, s.failed_num); 3041 s.failed, s.failed_num);
3042 /* check if the array has lost two devices and, if so, some requests might 3042 /* check if the array has lost two devices and, if so, some requests might
3043 * need to be failed 3043 * need to be failed
3044 */ 3044 */
3045 if (s.failed > 1 && s.to_read+s.to_write+s.written) 3045 if (s.failed > 1 && s.to_read+s.to_write+s.written)
3046 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3046 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3047 if (s.failed > 1 && s.syncing) { 3047 if (s.failed > 1 && s.syncing) {
3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3048 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3049 clear_bit(STRIPE_SYNCING, &sh->state); 3049 clear_bit(STRIPE_SYNCING, &sh->state);
3050 s.syncing = 0; 3050 s.syncing = 0;
3051 } 3051 }
3052 3052
3053 /* might be able to return some write requests if the parity block 3053 /* might be able to return some write requests if the parity block
3054 * is safe, or on a failed drive 3054 * is safe, or on a failed drive
3055 */ 3055 */
3056 dev = &sh->dev[sh->pd_idx]; 3056 dev = &sh->dev[sh->pd_idx];
3057 if ( s.written && 3057 if ( s.written &&
3058 ((test_bit(R5_Insync, &dev->flags) && 3058 ((test_bit(R5_Insync, &dev->flags) &&
3059 !test_bit(R5_LOCKED, &dev->flags) && 3059 !test_bit(R5_LOCKED, &dev->flags) &&
3060 test_bit(R5_UPTODATE, &dev->flags)) || 3060 test_bit(R5_UPTODATE, &dev->flags)) ||
3061 (s.failed == 1 && s.failed_num == sh->pd_idx))) 3061 (s.failed == 1 && s.failed_num == sh->pd_idx)))
3062 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3062 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3063 3063
3064 /* Now we might consider reading some blocks, either to check/generate 3064 /* Now we might consider reading some blocks, either to check/generate
3065 * parity, or to satisfy requests 3065 * parity, or to satisfy requests
3066 * or to load a block that is being partially written. 3066 * or to load a block that is being partially written.
3067 */ 3067 */
3068 if (s.to_read || s.non_overwrite || 3068 if (s.to_read || s.non_overwrite ||
3069 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3069 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3070 handle_stripe_fill5(sh, &s, disks); 3070 handle_stripe_fill5(sh, &s, disks);
3071 3071
3072 /* Now we check to see if any write operations have recently 3072 /* Now we check to see if any write operations have recently
3073 * completed 3073 * completed
3074 */ 3074 */
3075 prexor = 0; 3075 prexor = 0;
3076 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result) 3076 if (sh->reconstruct_state == reconstruct_state_prexor_drain_result)
3077 prexor = 1; 3077 prexor = 1;
3078 if (sh->reconstruct_state == reconstruct_state_drain_result || 3078 if (sh->reconstruct_state == reconstruct_state_drain_result ||
3079 sh->reconstruct_state == reconstruct_state_prexor_drain_result) { 3079 sh->reconstruct_state == reconstruct_state_prexor_drain_result) {
3080 sh->reconstruct_state = reconstruct_state_idle; 3080 sh->reconstruct_state = reconstruct_state_idle;
3081 3081
3082 /* All the 'written' buffers and the parity block are ready to 3082 /* All the 'written' buffers and the parity block are ready to
3083 * be written back to disk 3083 * be written back to disk
3084 */ 3084 */
3085 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3085 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3086 for (i = disks; i--; ) { 3086 for (i = disks; i--; ) {
3087 dev = &sh->dev[i]; 3087 dev = &sh->dev[i];
3088 if (test_bit(R5_LOCKED, &dev->flags) && 3088 if (test_bit(R5_LOCKED, &dev->flags) &&
3089 (i == sh->pd_idx || dev->written)) { 3089 (i == sh->pd_idx || dev->written)) {
3090 pr_debug("Writing block %d\n", i); 3090 pr_debug("Writing block %d\n", i);
3091 set_bit(R5_Wantwrite, &dev->flags); 3091 set_bit(R5_Wantwrite, &dev->flags);
3092 if (prexor) 3092 if (prexor)
3093 continue; 3093 continue;
3094 if (!test_bit(R5_Insync, &dev->flags) || 3094 if (!test_bit(R5_Insync, &dev->flags) ||
3095 (i == sh->pd_idx && s.failed == 0)) 3095 (i == sh->pd_idx && s.failed == 0))
3096 set_bit(STRIPE_INSYNC, &sh->state); 3096 set_bit(STRIPE_INSYNC, &sh->state);
3097 } 3097 }
3098 } 3098 }
3099 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3099 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3100 atomic_dec(&conf->preread_active_stripes); 3100 atomic_dec(&conf->preread_active_stripes);
3101 if (atomic_read(&conf->preread_active_stripes) < 3101 if (atomic_read(&conf->preread_active_stripes) <
3102 IO_THRESHOLD) 3102 IO_THRESHOLD)
3103 md_wakeup_thread(conf->mddev->thread); 3103 md_wakeup_thread(conf->mddev->thread);
3104 } 3104 }
3105 } 3105 }
3106 3106
3107 /* Now to consider new write requests and what else, if anything 3107 /* Now to consider new write requests and what else, if anything
3108 * should be read. We do not handle new writes when: 3108 * should be read. We do not handle new writes when:
3109 * 1/ A 'write' operation (copy+xor) is already in flight. 3109 * 1/ A 'write' operation (copy+xor) is already in flight.
3110 * 2/ A 'check' operation is in flight, as it may clobber the parity 3110 * 2/ A 'check' operation is in flight, as it may clobber the parity
3111 * block. 3111 * block.
3112 */ 3112 */
3113 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3113 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3114 handle_stripe_dirtying5(conf, sh, &s, disks); 3114 handle_stripe_dirtying5(conf, sh, &s, disks);
3115 3115
3116 /* maybe we need to check and possibly fix the parity for this stripe 3116 /* maybe we need to check and possibly fix the parity for this stripe
3117 * Any reads will already have been scheduled, so we just see if enough 3117 * Any reads will already have been scheduled, so we just see if enough
3118 * data is available. The parity check is held off while parity 3118 * data is available. The parity check is held off while parity
3119 * dependent operations are in flight. 3119 * dependent operations are in flight.
3120 */ 3120 */
3121 if (sh->check_state || 3121 if (sh->check_state ||
3122 (s.syncing && s.locked == 0 && 3122 (s.syncing && s.locked == 0 &&
3123 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3123 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3124 !test_bit(STRIPE_INSYNC, &sh->state))) 3124 !test_bit(STRIPE_INSYNC, &sh->state)))
3125 handle_parity_checks5(conf, sh, &s, disks); 3125 handle_parity_checks5(conf, sh, &s, disks);
3126 3126
3127 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3127 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3128 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3128 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3129 clear_bit(STRIPE_SYNCING, &sh->state); 3129 clear_bit(STRIPE_SYNCING, &sh->state);
3130 } 3130 }
3131 3131
3132 /* If the failed drive is just a ReadError, then we might need to progress 3132 /* If the failed drive is just a ReadError, then we might need to progress
3133 * the repair/check process 3133 * the repair/check process
3134 */ 3134 */
3135 if (s.failed == 1 && !conf->mddev->ro && 3135 if (s.failed == 1 && !conf->mddev->ro &&
3136 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags) 3136 test_bit(R5_ReadError, &sh->dev[s.failed_num].flags)
3137 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags) 3137 && !test_bit(R5_LOCKED, &sh->dev[s.failed_num].flags)
3138 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags) 3138 && test_bit(R5_UPTODATE, &sh->dev[s.failed_num].flags)
3139 ) { 3139 ) {
3140 dev = &sh->dev[s.failed_num]; 3140 dev = &sh->dev[s.failed_num];
3141 if (!test_bit(R5_ReWrite, &dev->flags)) { 3141 if (!test_bit(R5_ReWrite, &dev->flags)) {
3142 set_bit(R5_Wantwrite, &dev->flags); 3142 set_bit(R5_Wantwrite, &dev->flags);
3143 set_bit(R5_ReWrite, &dev->flags); 3143 set_bit(R5_ReWrite, &dev->flags);
3144 set_bit(R5_LOCKED, &dev->flags); 3144 set_bit(R5_LOCKED, &dev->flags);
3145 s.locked++; 3145 s.locked++;
3146 } else { 3146 } else {
3147 /* let's read it back */ 3147 /* let's read it back */
3148 set_bit(R5_Wantread, &dev->flags); 3148 set_bit(R5_Wantread, &dev->flags);
3149 set_bit(R5_LOCKED, &dev->flags); 3149 set_bit(R5_LOCKED, &dev->flags);
3150 s.locked++; 3150 s.locked++;
3151 } 3151 }
3152 } 3152 }
3153 3153
3154 /* Finish reconstruct operations initiated by the expansion process */ 3154 /* Finish reconstruct operations initiated by the expansion process */
3155 if (sh->reconstruct_state == reconstruct_state_result) { 3155 if (sh->reconstruct_state == reconstruct_state_result) {
3156 struct stripe_head *sh2 3156 struct stripe_head *sh2
3157 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3157 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3158 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3158 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3159 /* sh cannot be written until sh2 has been read. 3159 /* sh cannot be written until sh2 has been read.
3160 * so arrange for sh to be delayed a little 3160 * so arrange for sh to be delayed a little
3161 */ 3161 */
3162 set_bit(STRIPE_DELAYED, &sh->state); 3162 set_bit(STRIPE_DELAYED, &sh->state);
3163 set_bit(STRIPE_HANDLE, &sh->state); 3163 set_bit(STRIPE_HANDLE, &sh->state);
3164 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3164 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3165 &sh2->state)) 3165 &sh2->state))
3166 atomic_inc(&conf->preread_active_stripes); 3166 atomic_inc(&conf->preread_active_stripes);
3167 release_stripe(sh2); 3167 release_stripe(sh2);
3168 goto unlock; 3168 goto unlock;
3169 } 3169 }
3170 if (sh2) 3170 if (sh2)
3171 release_stripe(sh2); 3171 release_stripe(sh2);
3172 3172
3173 sh->reconstruct_state = reconstruct_state_idle; 3173 sh->reconstruct_state = reconstruct_state_idle;
3174 clear_bit(STRIPE_EXPANDING, &sh->state); 3174 clear_bit(STRIPE_EXPANDING, &sh->state);
3175 for (i = conf->raid_disks; i--; ) { 3175 for (i = conf->raid_disks; i--; ) {
3176 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3176 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3177 set_bit(R5_LOCKED, &sh->dev[i].flags); 3177 set_bit(R5_LOCKED, &sh->dev[i].flags);
3178 s.locked++; 3178 s.locked++;
3179 } 3179 }
3180 } 3180 }
3181 3181
3182 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3182 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3183 !sh->reconstruct_state) { 3183 !sh->reconstruct_state) {
3184 /* Need to write out all blocks after computing parity */ 3184 /* Need to write out all blocks after computing parity */
3185 sh->disks = conf->raid_disks; 3185 sh->disks = conf->raid_disks;
3186 stripe_set_idx(sh->sector, conf, 0, sh); 3186 stripe_set_idx(sh->sector, conf, 0, sh);
3187 schedule_reconstruction(sh, &s, 1, 1); 3187 schedule_reconstruction(sh, &s, 1, 1);
3188 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3188 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3189 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3189 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3190 atomic_dec(&conf->reshape_stripes); 3190 atomic_dec(&conf->reshape_stripes);
3191 wake_up(&conf->wait_for_overlap); 3191 wake_up(&conf->wait_for_overlap);
3192 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3192 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3193 } 3193 }
3194 3194
3195 if (s.expanding && s.locked == 0 && 3195 if (s.expanding && s.locked == 0 &&
3196 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3196 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3197 handle_stripe_expansion(conf, sh, NULL); 3197 handle_stripe_expansion(conf, sh, NULL);
3198 3198
3199 unlock: 3199 unlock:
3200 spin_unlock(&sh->lock); 3200 spin_unlock(&sh->lock);
3201 3201
3202 /* wait for this device to become unblocked */ 3202 /* wait for this device to become unblocked */
3203 if (unlikely(blocked_rdev)) 3203 if (unlikely(blocked_rdev))
3204 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3204 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3205 3205
3206 if (s.ops_request) 3206 if (s.ops_request)
3207 raid_run_ops(sh, s.ops_request); 3207 raid_run_ops(sh, s.ops_request);
3208 3208
3209 ops_run_io(sh, &s); 3209 ops_run_io(sh, &s);
3210 3210
3211 return_io(return_bi); 3211 return_io(return_bi);
3212 } 3212 }
3213 3213
3214 static void handle_stripe6(struct stripe_head *sh) 3214 static void handle_stripe6(struct stripe_head *sh)
3215 { 3215 {
3216 raid5_conf_t *conf = sh->raid_conf; 3216 raid5_conf_t *conf = sh->raid_conf;
3217 int disks = sh->disks; 3217 int disks = sh->disks;
3218 struct bio *return_bi = NULL; 3218 struct bio *return_bi = NULL;
3219 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx; 3219 int i, pd_idx = sh->pd_idx, qd_idx = sh->qd_idx;
3220 struct stripe_head_state s; 3220 struct stripe_head_state s;
3221 struct r6_state r6s; 3221 struct r6_state r6s;
3222 struct r5dev *dev, *pdev, *qdev; 3222 struct r5dev *dev, *pdev, *qdev;
3223 mdk_rdev_t *blocked_rdev = NULL; 3223 mdk_rdev_t *blocked_rdev = NULL;
3224 3224
3225 pr_debug("handling stripe %llu, state=%#lx cnt=%d, " 3225 pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
3226 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n", 3226 "pd_idx=%d, qd_idx=%d\n, check:%d, reconstruct:%d\n",
3227 (unsigned long long)sh->sector, sh->state, 3227 (unsigned long long)sh->sector, sh->state,
3228 atomic_read(&sh->count), pd_idx, qd_idx, 3228 atomic_read(&sh->count), pd_idx, qd_idx,
3229 sh->check_state, sh->reconstruct_state); 3229 sh->check_state, sh->reconstruct_state);
3230 memset(&s, 0, sizeof(s)); 3230 memset(&s, 0, sizeof(s));
3231 3231
3232 spin_lock(&sh->lock); 3232 spin_lock(&sh->lock);
3233 clear_bit(STRIPE_HANDLE, &sh->state); 3233 clear_bit(STRIPE_HANDLE, &sh->state);
3234 clear_bit(STRIPE_DELAYED, &sh->state); 3234 clear_bit(STRIPE_DELAYED, &sh->state);
3235 3235
3236 s.syncing = test_bit(STRIPE_SYNCING, &sh->state); 3236 s.syncing = test_bit(STRIPE_SYNCING, &sh->state);
3237 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state); 3237 s.expanding = test_bit(STRIPE_EXPAND_SOURCE, &sh->state);
3238 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state); 3238 s.expanded = test_bit(STRIPE_EXPAND_READY, &sh->state);
3239 /* Now to look around and see what can be done */ 3239 /* Now to look around and see what can be done */
3240 3240
3241 rcu_read_lock(); 3241 rcu_read_lock();
3242 for (i=disks; i--; ) { 3242 for (i=disks; i--; ) {
3243 mdk_rdev_t *rdev; 3243 mdk_rdev_t *rdev;
3244 dev = &sh->dev[i]; 3244 dev = &sh->dev[i];
3245 clear_bit(R5_Insync, &dev->flags); 3245 clear_bit(R5_Insync, &dev->flags);
3246 3246
3247 pr_debug("check %d: state 0x%lx read %p write %p written %p\n", 3247 pr_debug("check %d: state 0x%lx read %p write %p written %p\n",
3248 i, dev->flags, dev->toread, dev->towrite, dev->written); 3248 i, dev->flags, dev->toread, dev->towrite, dev->written);
3249 /* maybe we can reply to a read 3249 /* maybe we can reply to a read
3250 * 3250 *
3251 * new wantfill requests are only permitted while 3251 * new wantfill requests are only permitted while
3252 * ops_complete_biofill is guaranteed to be inactive 3252 * ops_complete_biofill is guaranteed to be inactive
3253 */ 3253 */
3254 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread && 3254 if (test_bit(R5_UPTODATE, &dev->flags) && dev->toread &&
3255 !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) 3255 !test_bit(STRIPE_BIOFILL_RUN, &sh->state))
3256 set_bit(R5_Wantfill, &dev->flags); 3256 set_bit(R5_Wantfill, &dev->flags);
3257 3257
3258 /* now count some things */ 3258 /* now count some things */
3259 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++; 3259 if (test_bit(R5_LOCKED, &dev->flags)) s.locked++;
3260 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++; 3260 if (test_bit(R5_UPTODATE, &dev->flags)) s.uptodate++;
3261 if (test_bit(R5_Wantcompute, &dev->flags)) { 3261 if (test_bit(R5_Wantcompute, &dev->flags)) {
3262 s.compute++; 3262 s.compute++;
3263 BUG_ON(s.compute > 2); 3263 BUG_ON(s.compute > 2);
3264 } 3264 }
3265 3265
3266 if (test_bit(R5_Wantfill, &dev->flags)) { 3266 if (test_bit(R5_Wantfill, &dev->flags)) {
3267 s.to_fill++; 3267 s.to_fill++;
3268 } else if (dev->toread) 3268 } else if (dev->toread)
3269 s.to_read++; 3269 s.to_read++;
3270 if (dev->towrite) { 3270 if (dev->towrite) {
3271 s.to_write++; 3271 s.to_write++;
3272 if (!test_bit(R5_OVERWRITE, &dev->flags)) 3272 if (!test_bit(R5_OVERWRITE, &dev->flags))
3273 s.non_overwrite++; 3273 s.non_overwrite++;
3274 } 3274 }
3275 if (dev->written) 3275 if (dev->written)
3276 s.written++; 3276 s.written++;
3277 rdev = rcu_dereference(conf->disks[i].rdev); 3277 rdev = rcu_dereference(conf->disks[i].rdev);
3278 if (blocked_rdev == NULL && 3278 if (blocked_rdev == NULL &&
3279 rdev && unlikely(test_bit(Blocked, &rdev->flags))) { 3279 rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
3280 blocked_rdev = rdev; 3280 blocked_rdev = rdev;
3281 atomic_inc(&rdev->nr_pending); 3281 atomic_inc(&rdev->nr_pending);
3282 } 3282 }
3283 if (!rdev || !test_bit(In_sync, &rdev->flags)) { 3283 if (!rdev || !test_bit(In_sync, &rdev->flags)) {
3284 /* The ReadError flag will just be confusing now */ 3284 /* The ReadError flag will just be confusing now */
3285 clear_bit(R5_ReadError, &dev->flags); 3285 clear_bit(R5_ReadError, &dev->flags);
3286 clear_bit(R5_ReWrite, &dev->flags); 3286 clear_bit(R5_ReWrite, &dev->flags);
3287 } 3287 }
3288 if (!rdev || !test_bit(In_sync, &rdev->flags) 3288 if (!rdev || !test_bit(In_sync, &rdev->flags)
3289 || test_bit(R5_ReadError, &dev->flags)) { 3289 || test_bit(R5_ReadError, &dev->flags)) {
3290 if (s.failed < 2) 3290 if (s.failed < 2)
3291 r6s.failed_num[s.failed] = i; 3291 r6s.failed_num[s.failed] = i;
3292 s.failed++; 3292 s.failed++;
3293 } else 3293 } else
3294 set_bit(R5_Insync, &dev->flags); 3294 set_bit(R5_Insync, &dev->flags);
3295 } 3295 }
3296 rcu_read_unlock(); 3296 rcu_read_unlock();
3297 3297
3298 if (unlikely(blocked_rdev)) { 3298 if (unlikely(blocked_rdev)) {
3299 if (s.syncing || s.expanding || s.expanded || 3299 if (s.syncing || s.expanding || s.expanded ||
3300 s.to_write || s.written) { 3300 s.to_write || s.written) {
3301 set_bit(STRIPE_HANDLE, &sh->state); 3301 set_bit(STRIPE_HANDLE, &sh->state);
3302 goto unlock; 3302 goto unlock;
3303 } 3303 }
3304 /* There is nothing for the blocked_rdev to block */ 3304 /* There is nothing for the blocked_rdev to block */
3305 rdev_dec_pending(blocked_rdev, conf->mddev); 3305 rdev_dec_pending(blocked_rdev, conf->mddev);
3306 blocked_rdev = NULL; 3306 blocked_rdev = NULL;
3307 } 3307 }
3308 3308
3309 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) { 3309 if (s.to_fill && !test_bit(STRIPE_BIOFILL_RUN, &sh->state)) {
3310 set_bit(STRIPE_OP_BIOFILL, &s.ops_request); 3310 set_bit(STRIPE_OP_BIOFILL, &s.ops_request);
3311 set_bit(STRIPE_BIOFILL_RUN, &sh->state); 3311 set_bit(STRIPE_BIOFILL_RUN, &sh->state);
3312 } 3312 }
3313 3313
3314 pr_debug("locked=%d uptodate=%d to_read=%d" 3314 pr_debug("locked=%d uptodate=%d to_read=%d"
3315 " to_write=%d failed=%d failed_num=%d,%d\n", 3315 " to_write=%d failed=%d failed_num=%d,%d\n",
3316 s.locked, s.uptodate, s.to_read, s.to_write, s.failed, 3316 s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
3317 r6s.failed_num[0], r6s.failed_num[1]); 3317 r6s.failed_num[0], r6s.failed_num[1]);
3318 /* check if the array has lost >2 devices and, if so, some requests 3318 /* check if the array has lost >2 devices and, if so, some requests
3319 * might need to be failed 3319 * might need to be failed
3320 */ 3320 */
3321 if (s.failed > 2 && s.to_read+s.to_write+s.written) 3321 if (s.failed > 2 && s.to_read+s.to_write+s.written)
3322 handle_failed_stripe(conf, sh, &s, disks, &return_bi); 3322 handle_failed_stripe(conf, sh, &s, disks, &return_bi);
3323 if (s.failed > 2 && s.syncing) { 3323 if (s.failed > 2 && s.syncing) {
3324 md_done_sync(conf->mddev, STRIPE_SECTORS,0); 3324 md_done_sync(conf->mddev, STRIPE_SECTORS,0);
3325 clear_bit(STRIPE_SYNCING, &sh->state); 3325 clear_bit(STRIPE_SYNCING, &sh->state);
3326 s.syncing = 0; 3326 s.syncing = 0;
3327 } 3327 }
3328 3328
3329 /* 3329 /*
3330 * might be able to return some write requests if the parity blocks 3330 * might be able to return some write requests if the parity blocks
3331 * are safe, or on a failed drive 3331 * are safe, or on a failed drive
3332 */ 3332 */
3333 pdev = &sh->dev[pd_idx]; 3333 pdev = &sh->dev[pd_idx];
3334 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx) 3334 r6s.p_failed = (s.failed >= 1 && r6s.failed_num[0] == pd_idx)
3335 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx); 3335 || (s.failed >= 2 && r6s.failed_num[1] == pd_idx);
3336 qdev = &sh->dev[qd_idx]; 3336 qdev = &sh->dev[qd_idx];
3337 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx) 3337 r6s.q_failed = (s.failed >= 1 && r6s.failed_num[0] == qd_idx)
3338 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx); 3338 || (s.failed >= 2 && r6s.failed_num[1] == qd_idx);
3339 3339
3340 if ( s.written && 3340 if ( s.written &&
3341 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags) 3341 ( r6s.p_failed || ((test_bit(R5_Insync, &pdev->flags)
3342 && !test_bit(R5_LOCKED, &pdev->flags) 3342 && !test_bit(R5_LOCKED, &pdev->flags)
3343 && test_bit(R5_UPTODATE, &pdev->flags)))) && 3343 && test_bit(R5_UPTODATE, &pdev->flags)))) &&
3344 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags) 3344 ( r6s.q_failed || ((test_bit(R5_Insync, &qdev->flags)
3345 && !test_bit(R5_LOCKED, &qdev->flags) 3345 && !test_bit(R5_LOCKED, &qdev->flags)
3346 && test_bit(R5_UPTODATE, &qdev->flags))))) 3346 && test_bit(R5_UPTODATE, &qdev->flags)))))
3347 handle_stripe_clean_event(conf, sh, disks, &return_bi); 3347 handle_stripe_clean_event(conf, sh, disks, &return_bi);
3348 3348
3349 /* Now we might consider reading some blocks, either to check/generate 3349 /* Now we might consider reading some blocks, either to check/generate
3350 * parity, or to satisfy requests 3350 * parity, or to satisfy requests
3351 * or to load a block that is being partially written. 3351 * or to load a block that is being partially written.
3352 */ 3352 */
3353 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) || 3353 if (s.to_read || s.non_overwrite || (s.to_write && s.failed) ||
3354 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding) 3354 (s.syncing && (s.uptodate + s.compute < disks)) || s.expanding)
3355 handle_stripe_fill6(sh, &s, &r6s, disks); 3355 handle_stripe_fill6(sh, &s, &r6s, disks);
3356 3356
3357 /* Now we check to see if any write operations have recently 3357 /* Now we check to see if any write operations have recently
3358 * completed 3358 * completed
3359 */ 3359 */
3360 if (sh->reconstruct_state == reconstruct_state_drain_result) { 3360 if (sh->reconstruct_state == reconstruct_state_drain_result) {
3361 int qd_idx = sh->qd_idx; 3361 int qd_idx = sh->qd_idx;
3362 3362
3363 sh->reconstruct_state = reconstruct_state_idle; 3363 sh->reconstruct_state = reconstruct_state_idle;
3364 /* All the 'written' buffers and the parity blocks are ready to 3364 /* All the 'written' buffers and the parity blocks are ready to
3365 * be written back to disk 3365 * be written back to disk
3366 */ 3366 */
3367 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags)); 3367 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[sh->pd_idx].flags));
3368 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags)); 3368 BUG_ON(!test_bit(R5_UPTODATE, &sh->dev[qd_idx].flags));
3369 for (i = disks; i--; ) { 3369 for (i = disks; i--; ) {
3370 dev = &sh->dev[i]; 3370 dev = &sh->dev[i];
3371 if (test_bit(R5_LOCKED, &dev->flags) && 3371 if (test_bit(R5_LOCKED, &dev->flags) &&
3372 (i == sh->pd_idx || i == qd_idx || 3372 (i == sh->pd_idx || i == qd_idx ||
3373 dev->written)) { 3373 dev->written)) {
3374 pr_debug("Writing block %d\n", i); 3374 pr_debug("Writing block %d\n", i);
3375 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags)); 3375 BUG_ON(!test_bit(R5_UPTODATE, &dev->flags));
3376 set_bit(R5_Wantwrite, &dev->flags); 3376 set_bit(R5_Wantwrite, &dev->flags);
3377 if (!test_bit(R5_Insync, &dev->flags) || 3377 if (!test_bit(R5_Insync, &dev->flags) ||
3378 ((i == sh->pd_idx || i == qd_idx) && 3378 ((i == sh->pd_idx || i == qd_idx) &&
3379 s.failed == 0)) 3379 s.failed == 0))
3380 set_bit(STRIPE_INSYNC, &sh->state); 3380 set_bit(STRIPE_INSYNC, &sh->state);
3381 } 3381 }
3382 } 3382 }
3383 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 3383 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) {
3384 atomic_dec(&conf->preread_active_stripes); 3384 atomic_dec(&conf->preread_active_stripes);
3385 if (atomic_read(&conf->preread_active_stripes) < 3385 if (atomic_read(&conf->preread_active_stripes) <
3386 IO_THRESHOLD) 3386 IO_THRESHOLD)
3387 md_wakeup_thread(conf->mddev->thread); 3387 md_wakeup_thread(conf->mddev->thread);
3388 } 3388 }
3389 } 3389 }
3390 3390
3391 /* Now to consider new write requests and what else, if anything 3391 /* Now to consider new write requests and what else, if anything
3392 * should be read. We do not handle new writes when: 3392 * should be read. We do not handle new writes when:
3393 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight. 3393 * 1/ A 'write' operation (copy+gen_syndrome) is already in flight.
3394 * 2/ A 'check' operation is in flight, as it may clobber the parity 3394 * 2/ A 'check' operation is in flight, as it may clobber the parity
3395 * block. 3395 * block.
3396 */ 3396 */
3397 if (s.to_write && !sh->reconstruct_state && !sh->check_state) 3397 if (s.to_write && !sh->reconstruct_state && !sh->check_state)
3398 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks); 3398 handle_stripe_dirtying6(conf, sh, &s, &r6s, disks);
3399 3399
3400 /* maybe we need to check and possibly fix the parity for this stripe 3400 /* maybe we need to check and possibly fix the parity for this stripe
3401 * Any reads will already have been scheduled, so we just see if enough 3401 * Any reads will already have been scheduled, so we just see if enough
3402 * data is available. The parity check is held off while parity 3402 * data is available. The parity check is held off while parity
3403 * dependent operations are in flight. 3403 * dependent operations are in flight.
3404 */ 3404 */
3405 if (sh->check_state || 3405 if (sh->check_state ||
3406 (s.syncing && s.locked == 0 && 3406 (s.syncing && s.locked == 0 &&
3407 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && 3407 !test_bit(STRIPE_COMPUTE_RUN, &sh->state) &&
3408 !test_bit(STRIPE_INSYNC, &sh->state))) 3408 !test_bit(STRIPE_INSYNC, &sh->state)))
3409 handle_parity_checks6(conf, sh, &s, &r6s, disks); 3409 handle_parity_checks6(conf, sh, &s, &r6s, disks);
3410 3410
3411 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) { 3411 if (s.syncing && s.locked == 0 && test_bit(STRIPE_INSYNC, &sh->state)) {
3412 md_done_sync(conf->mddev, STRIPE_SECTORS,1); 3412 md_done_sync(conf->mddev, STRIPE_SECTORS,1);
3413 clear_bit(STRIPE_SYNCING, &sh->state); 3413 clear_bit(STRIPE_SYNCING, &sh->state);
3414 } 3414 }
3415 3415
3416 /* If the failed drives are just a ReadError, then we might need 3416 /* If the failed drives are just a ReadError, then we might need
3417 * to progress the repair/check process 3417 * to progress the repair/check process
3418 */ 3418 */
3419 if (s.failed <= 2 && !conf->mddev->ro) 3419 if (s.failed <= 2 && !conf->mddev->ro)
3420 for (i = 0; i < s.failed; i++) { 3420 for (i = 0; i < s.failed; i++) {
3421 dev = &sh->dev[r6s.failed_num[i]]; 3421 dev = &sh->dev[r6s.failed_num[i]];
3422 if (test_bit(R5_ReadError, &dev->flags) 3422 if (test_bit(R5_ReadError, &dev->flags)
3423 && !test_bit(R5_LOCKED, &dev->flags) 3423 && !test_bit(R5_LOCKED, &dev->flags)
3424 && test_bit(R5_UPTODATE, &dev->flags) 3424 && test_bit(R5_UPTODATE, &dev->flags)
3425 ) { 3425 ) {
3426 if (!test_bit(R5_ReWrite, &dev->flags)) { 3426 if (!test_bit(R5_ReWrite, &dev->flags)) {
3427 set_bit(R5_Wantwrite, &dev->flags); 3427 set_bit(R5_Wantwrite, &dev->flags);
3428 set_bit(R5_ReWrite, &dev->flags); 3428 set_bit(R5_ReWrite, &dev->flags);
3429 set_bit(R5_LOCKED, &dev->flags); 3429 set_bit(R5_LOCKED, &dev->flags);
3430 s.locked++; 3430 s.locked++;
3431 } else { 3431 } else {
3432 /* let's read it back */ 3432 /* let's read it back */
3433 set_bit(R5_Wantread, &dev->flags); 3433 set_bit(R5_Wantread, &dev->flags);
3434 set_bit(R5_LOCKED, &dev->flags); 3434 set_bit(R5_LOCKED, &dev->flags);
3435 s.locked++; 3435 s.locked++;
3436 } 3436 }
3437 } 3437 }
3438 } 3438 }
3439 3439
3440 /* Finish reconstruct operations initiated by the expansion process */ 3440 /* Finish reconstruct operations initiated by the expansion process */
3441 if (sh->reconstruct_state == reconstruct_state_result) { 3441 if (sh->reconstruct_state == reconstruct_state_result) {
3442 sh->reconstruct_state = reconstruct_state_idle; 3442 sh->reconstruct_state = reconstruct_state_idle;
3443 clear_bit(STRIPE_EXPANDING, &sh->state); 3443 clear_bit(STRIPE_EXPANDING, &sh->state);
3444 for (i = conf->raid_disks; i--; ) { 3444 for (i = conf->raid_disks; i--; ) {
3445 set_bit(R5_Wantwrite, &sh->dev[i].flags); 3445 set_bit(R5_Wantwrite, &sh->dev[i].flags);
3446 set_bit(R5_LOCKED, &sh->dev[i].flags); 3446 set_bit(R5_LOCKED, &sh->dev[i].flags);
3447 s.locked++; 3447 s.locked++;
3448 } 3448 }
3449 } 3449 }
3450 3450
3451 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) && 3451 if (s.expanded && test_bit(STRIPE_EXPANDING, &sh->state) &&
3452 !sh->reconstruct_state) { 3452 !sh->reconstruct_state) {
3453 struct stripe_head *sh2 3453 struct stripe_head *sh2
3454 = get_active_stripe(conf, sh->sector, 1, 1, 1); 3454 = get_active_stripe(conf, sh->sector, 1, 1, 1);
3455 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) { 3455 if (sh2 && test_bit(STRIPE_EXPAND_SOURCE, &sh2->state)) {
3456 /* sh cannot be written until sh2 has been read. 3456 /* sh cannot be written until sh2 has been read.
3457 * so arrange for sh to be delayed a little 3457 * so arrange for sh to be delayed a little
3458 */ 3458 */
3459 set_bit(STRIPE_DELAYED, &sh->state); 3459 set_bit(STRIPE_DELAYED, &sh->state);
3460 set_bit(STRIPE_HANDLE, &sh->state); 3460 set_bit(STRIPE_HANDLE, &sh->state);
3461 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, 3461 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE,
3462 &sh2->state)) 3462 &sh2->state))
3463 atomic_inc(&conf->preread_active_stripes); 3463 atomic_inc(&conf->preread_active_stripes);
3464 release_stripe(sh2); 3464 release_stripe(sh2);
3465 goto unlock; 3465 goto unlock;
3466 } 3466 }
3467 if (sh2) 3467 if (sh2)
3468 release_stripe(sh2); 3468 release_stripe(sh2);
3469 3469
3470 /* Need to write out all blocks after computing P&Q */ 3470 /* Need to write out all blocks after computing P&Q */
3471 sh->disks = conf->raid_disks; 3471 sh->disks = conf->raid_disks;
3472 stripe_set_idx(sh->sector, conf, 0, sh); 3472 stripe_set_idx(sh->sector, conf, 0, sh);
3473 schedule_reconstruction(sh, &s, 1, 1); 3473 schedule_reconstruction(sh, &s, 1, 1);
3474 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) { 3474 } else if (s.expanded && !sh->reconstruct_state && s.locked == 0) {
3475 clear_bit(STRIPE_EXPAND_READY, &sh->state); 3475 clear_bit(STRIPE_EXPAND_READY, &sh->state);
3476 atomic_dec(&conf->reshape_stripes); 3476 atomic_dec(&conf->reshape_stripes);
3477 wake_up(&conf->wait_for_overlap); 3477 wake_up(&conf->wait_for_overlap);
3478 md_done_sync(conf->mddev, STRIPE_SECTORS, 1); 3478 md_done_sync(conf->mddev, STRIPE_SECTORS, 1);
3479 } 3479 }
3480 3480
3481 if (s.expanding && s.locked == 0 && 3481 if (s.expanding && s.locked == 0 &&
3482 !test_bit(STRIPE_COMPUTE_RUN, &sh->state)) 3482 !test_bit(STRIPE_COMPUTE_RUN, &sh->state))
3483 handle_stripe_expansion(conf, sh, &r6s); 3483 handle_stripe_expansion(conf, sh, &r6s);
3484 3484
3485 unlock: 3485 unlock:
3486 spin_unlock(&sh->lock); 3486 spin_unlock(&sh->lock);
3487 3487
3488 /* wait for this device to become unblocked */ 3488 /* wait for this device to become unblocked */
3489 if (unlikely(blocked_rdev)) 3489 if (unlikely(blocked_rdev))
3490 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev); 3490 md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);
3491 3491
3492 if (s.ops_request) 3492 if (s.ops_request)
3493 raid_run_ops(sh, s.ops_request); 3493 raid_run_ops(sh, s.ops_request);
3494 3494
3495 ops_run_io(sh, &s); 3495 ops_run_io(sh, &s);
3496 3496
3497 return_io(return_bi); 3497 return_io(return_bi);
3498 } 3498 }
3499 3499
3500 static void handle_stripe(struct stripe_head *sh) 3500 static void handle_stripe(struct stripe_head *sh)
3501 { 3501 {
3502 if (sh->raid_conf->level == 6) 3502 if (sh->raid_conf->level == 6)
3503 handle_stripe6(sh); 3503 handle_stripe6(sh);
3504 else 3504 else
3505 handle_stripe5(sh); 3505 handle_stripe5(sh);
3506 } 3506 }
3507 3507
3508 static void raid5_activate_delayed(raid5_conf_t *conf) 3508 static void raid5_activate_delayed(raid5_conf_t *conf)
3509 { 3509 {
3510 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) { 3510 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) {
3511 while (!list_empty(&conf->delayed_list)) { 3511 while (!list_empty(&conf->delayed_list)) {
3512 struct list_head *l = conf->delayed_list.next; 3512 struct list_head *l = conf->delayed_list.next;
3513 struct stripe_head *sh; 3513 struct stripe_head *sh;
3514 sh = list_entry(l, struct stripe_head, lru); 3514 sh = list_entry(l, struct stripe_head, lru);
3515 list_del_init(l); 3515 list_del_init(l);
3516 clear_bit(STRIPE_DELAYED, &sh->state); 3516 clear_bit(STRIPE_DELAYED, &sh->state);
3517 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) 3517 if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
3518 atomic_inc(&conf->preread_active_stripes); 3518 atomic_inc(&conf->preread_active_stripes);
3519 list_add_tail(&sh->lru, &conf->hold_list); 3519 list_add_tail(&sh->lru, &conf->hold_list);
3520 } 3520 }
3521 } else 3521 } else
3522 blk_plug_device(conf->mddev->queue); 3522 blk_plug_device(conf->mddev->queue);
3523 } 3523 }
3524 3524
3525 static void activate_bit_delay(raid5_conf_t *conf) 3525 static void activate_bit_delay(raid5_conf_t *conf)
3526 { 3526 {
3527 /* device_lock is held */ 3527 /* device_lock is held */
3528 struct list_head head; 3528 struct list_head head;
3529 list_add(&head, &conf->bitmap_list); 3529 list_add(&head, &conf->bitmap_list);
3530 list_del_init(&conf->bitmap_list); 3530 list_del_init(&conf->bitmap_list);
3531 while (!list_empty(&head)) { 3531 while (!list_empty(&head)) {
3532 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru); 3532 struct stripe_head *sh = list_entry(head.next, struct stripe_head, lru);
3533 list_del_init(&sh->lru); 3533 list_del_init(&sh->lru);
3534 atomic_inc(&sh->count); 3534 atomic_inc(&sh->count);
3535 __release_stripe(conf, sh); 3535 __release_stripe(conf, sh);
3536 } 3536 }
3537 } 3537 }
3538 3538
3539 static void unplug_slaves(mddev_t *mddev) 3539 static void unplug_slaves(mddev_t *mddev)
3540 { 3540 {
3541 raid5_conf_t *conf = mddev->private; 3541 raid5_conf_t *conf = mddev->private;
3542 int i; 3542 int i;
3543 int devs = max(conf->raid_disks, conf->previous_raid_disks);
3543 3544
3544 rcu_read_lock(); 3545 rcu_read_lock();
3545 for (i = 0; i < conf->raid_disks; i++) { 3546 for (i = 0; i < devs; i++) {
3546 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev); 3547 mdk_rdev_t *rdev = rcu_dereference(conf->disks[i].rdev);
3547 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { 3548 if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) {
3548 struct request_queue *r_queue = bdev_get_queue(rdev->bdev); 3549 struct request_queue *r_queue = bdev_get_queue(rdev->bdev);
3549 3550
3550 atomic_inc(&rdev->nr_pending); 3551 atomic_inc(&rdev->nr_pending);
3551 rcu_read_unlock(); 3552 rcu_read_unlock();
3552 3553
3553 blk_unplug(r_queue); 3554 blk_unplug(r_queue);
3554 3555
3555 rdev_dec_pending(rdev, mddev); 3556 rdev_dec_pending(rdev, mddev);
3556 rcu_read_lock(); 3557 rcu_read_lock();
3557 } 3558 }
3558 } 3559 }
3559 rcu_read_unlock(); 3560 rcu_read_unlock();
3560 } 3561 }
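unplug_slaves() is one of the loops the commit message is about: while a reshape to fewer devices is in flight, conf->raid_disks already holds the smaller "new" count, yet the members beyond it still carry live data and pending I/O, so per-device loops have to run to max(raid_disks, previous_raid_disks). A minimal userspace sketch of that bound (values are made up; this is not the kernel code):

#include <stdio.h>

/* Pick the number of member devices a per-device loop must cover while a
 * reshape is in progress: the larger of the old and new device counts. */
static int working_disks(int raid_disks, int previous_raid_disks)
{
        return raid_disks > previous_raid_disks ? raid_disks
                                                : previous_raid_disks;
}

int main(void)
{
        int raid_disks = 4;             /* shrinking 6 -> 4 devices */
        int previous_raid_disks = 6;
        int devs = working_disks(raid_disks, previous_raid_disks);
        int i;

        /* Stopping at raid_disks (4) would skip members 4 and 5, which
         * still hold data until the reshape finishes. */
        for (i = 0; i < devs; i++)
                printf("unplug member %d\n", i);
        return 0;
}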
3561 3562
3562 static void raid5_unplug_device(struct request_queue *q) 3563 static void raid5_unplug_device(struct request_queue *q)
3563 { 3564 {
3564 mddev_t *mddev = q->queuedata; 3565 mddev_t *mddev = q->queuedata;
3565 raid5_conf_t *conf = mddev->private; 3566 raid5_conf_t *conf = mddev->private;
3566 unsigned long flags; 3567 unsigned long flags;
3567 3568
3568 spin_lock_irqsave(&conf->device_lock, flags); 3569 spin_lock_irqsave(&conf->device_lock, flags);
3569 3570
3570 if (blk_remove_plug(q)) { 3571 if (blk_remove_plug(q)) {
3571 conf->seq_flush++; 3572 conf->seq_flush++;
3572 raid5_activate_delayed(conf); 3573 raid5_activate_delayed(conf);
3573 } 3574 }
3574 md_wakeup_thread(mddev->thread); 3575 md_wakeup_thread(mddev->thread);
3575 3576
3576 spin_unlock_irqrestore(&conf->device_lock, flags); 3577 spin_unlock_irqrestore(&conf->device_lock, flags);
3577 3578
3578 unplug_slaves(mddev); 3579 unplug_slaves(mddev);
3579 } 3580 }
3580 3581
3581 static int raid5_congested(void *data, int bits) 3582 static int raid5_congested(void *data, int bits)
3582 { 3583 {
3583 mddev_t *mddev = data; 3584 mddev_t *mddev = data;
3584 raid5_conf_t *conf = mddev->private; 3585 raid5_conf_t *conf = mddev->private;
3585 3586
3586 /* No difference between reads and writes. Just check 3587 /* No difference between reads and writes. Just check
3587 * how busy the stripe_cache is 3588 * how busy the stripe_cache is
3588 */ 3589 */
3589 3590
3590 if (mddev_congested(mddev, bits)) 3591 if (mddev_congested(mddev, bits))
3591 return 1; 3592 return 1;
3592 if (conf->inactive_blocked) 3593 if (conf->inactive_blocked)
3593 return 1; 3594 return 1;
3594 if (conf->quiesce) 3595 if (conf->quiesce)
3595 return 1; 3596 return 1;
3596 if (list_empty_careful(&conf->inactive_list)) 3597 if (list_empty_careful(&conf->inactive_list))
3597 return 1; 3598 return 1;
3598 3599
3599 return 0; 3600 return 0;
3600 } 3601 }
3601 3602
3602 /* We want read requests to align with chunks where possible, 3603 /* We want read requests to align with chunks where possible,
3603 * but write requests don't need to. 3604 * but write requests don't need to.
3604 */ 3605 */
3605 static int raid5_mergeable_bvec(struct request_queue *q, 3606 static int raid5_mergeable_bvec(struct request_queue *q,
3606 struct bvec_merge_data *bvm, 3607 struct bvec_merge_data *bvm,
3607 struct bio_vec *biovec) 3608 struct bio_vec *biovec)
3608 { 3609 {
3609 mddev_t *mddev = q->queuedata; 3610 mddev_t *mddev = q->queuedata;
3610 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 3611 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
3611 int max; 3612 int max;
3612 unsigned int chunk_sectors = mddev->chunk_sectors; 3613 unsigned int chunk_sectors = mddev->chunk_sectors;
3613 unsigned int bio_sectors = bvm->bi_size >> 9; 3614 unsigned int bio_sectors = bvm->bi_size >> 9;
3614 3615
3615 if ((bvm->bi_rw & 1) == WRITE) 3616 if ((bvm->bi_rw & 1) == WRITE)
3616 return biovec->bv_len; /* always allow writes to be mergeable */ 3617 return biovec->bv_len; /* always allow writes to be mergeable */
3617 3618
3618 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3619 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3619 chunk_sectors = mddev->new_chunk_sectors; 3620 chunk_sectors = mddev->new_chunk_sectors;
3620 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 3621 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
3621 if (max < 0) max = 0; 3622 if (max < 0) max = 0;
3622 if (max <= biovec->bv_len && bio_sectors == 0) 3623 if (max <= biovec->bv_len && bio_sectors == 0)
3623 return biovec->bv_len; 3624 return biovec->bv_len;
3624 else 3625 else
3625 return max; 3626 return max;
3626 } 3627 }
3627 3628
3628 3629
3629 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio) 3630 static int in_chunk_boundary(mddev_t *mddev, struct bio *bio)
3630 { 3631 {
3631 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev); 3632 sector_t sector = bio->bi_sector + get_start_sect(bio->bi_bdev);
3632 unsigned int chunk_sectors = mddev->chunk_sectors; 3633 unsigned int chunk_sectors = mddev->chunk_sectors;
3633 unsigned int bio_sectors = bio->bi_size >> 9; 3634 unsigned int bio_sectors = bio->bi_size >> 9;
3634 3635
3635 if (mddev->new_chunk_sectors < mddev->chunk_sectors) 3636 if (mddev->new_chunk_sectors < mddev->chunk_sectors)
3636 chunk_sectors = mddev->new_chunk_sectors; 3637 chunk_sectors = mddev->new_chunk_sectors;
3637 return chunk_sectors >= 3638 return chunk_sectors >=
3638 ((sector & (chunk_sectors - 1)) + bio_sectors); 3639 ((sector & (chunk_sectors - 1)) + bio_sectors);
3639 } 3640 }
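Both raid5_mergeable_bvec() and in_chunk_boundary() reduce to the same power-of-two arithmetic: take the offset of the start sector within a chunk and check whether the request would run past the chunk's end, using the smaller of the old and new chunk sizes while a reshape is pending. A small standalone sketch of the check (assumed values, not the kernel code):

#include <stdio.h>

/* Does a request of bio_sectors starting at 'sector' stay inside a single
 * chunk?  chunk_sectors is a power of two, so '& (chunk_sectors - 1)' is
 * the offset within the chunk. */
static int fits_one_chunk(unsigned long long sector,
                          unsigned int bio_sectors,
                          unsigned int chunk_sectors)
{
        unsigned int offset = sector & (chunk_sectors - 1);

        return chunk_sectors >= offset + bio_sectors;
}

int main(void)
{
        /* 128-sector (64KiB) chunks: a 16-sector read at offset 120
         * crosses the boundary, the same read at offset 64 does not. */
        printf("%d\n", fits_one_chunk(120, 16, 128));   /* prints 0 */
        printf("%d\n", fits_one_chunk(64, 16, 128));    /* prints 1 */
        return 0;
}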
3640 3641
3641 /* 3642 /*
3642 * add bio to the retry LIFO ( in O(1) ... we are in interrupt ) 3643 * add bio to the retry LIFO ( in O(1) ... we are in interrupt )
3643 * later sampled by raid5d. 3644 * later sampled by raid5d.
3644 */ 3645 */
3645 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf) 3646 static void add_bio_to_retry(struct bio *bi,raid5_conf_t *conf)
3646 { 3647 {
3647 unsigned long flags; 3648 unsigned long flags;
3648 3649
3649 spin_lock_irqsave(&conf->device_lock, flags); 3650 spin_lock_irqsave(&conf->device_lock, flags);
3650 3651
3651 bi->bi_next = conf->retry_read_aligned_list; 3652 bi->bi_next = conf->retry_read_aligned_list;
3652 conf->retry_read_aligned_list = bi; 3653 conf->retry_read_aligned_list = bi;
3653 3654
3654 spin_unlock_irqrestore(&conf->device_lock, flags); 3655 spin_unlock_irqrestore(&conf->device_lock, flags);
3655 md_wakeup_thread(conf->mddev->thread); 3656 md_wakeup_thread(conf->mddev->thread);
3656 } 3657 }
3657 3658
3658 3659
3659 static struct bio *remove_bio_from_retry(raid5_conf_t *conf) 3660 static struct bio *remove_bio_from_retry(raid5_conf_t *conf)
3660 { 3661 {
3661 struct bio *bi; 3662 struct bio *bi;
3662 3663
3663 bi = conf->retry_read_aligned; 3664 bi = conf->retry_read_aligned;
3664 if (bi) { 3665 if (bi) {
3665 conf->retry_read_aligned = NULL; 3666 conf->retry_read_aligned = NULL;
3666 return bi; 3667 return bi;
3667 } 3668 }
3668 bi = conf->retry_read_aligned_list; 3669 bi = conf->retry_read_aligned_list;
3669 if(bi) { 3670 if(bi) {
3670 conf->retry_read_aligned_list = bi->bi_next; 3671 conf->retry_read_aligned_list = bi->bi_next;
3671 bi->bi_next = NULL; 3672 bi->bi_next = NULL;
3672 /* 3673 /*
3673 * this sets the active stripe count to 1 and the processed 3674 * this sets the active stripe count to 1 and the processed
3674 * stripe count to zero (upper 8 bits) 3675 * stripe count to zero (upper 8 bits)
3675 */ 3676 */
3676 bi->bi_phys_segments = 1; /* biased count of active stripes */ 3677 bi->bi_phys_segments = 1; /* biased count of active stripes */
3677 } 3678 }
3678 3679
3679 return bi; 3680 return bi;
3680 } 3681 }
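The comment above describes ->bi_phys_segments being overloaded into two packed counters: a biased count of active stripes in the low bits and, for retried aligned reads, a count of stripes already processed in the upper bits. A toy model of that packing (the field widths and helper names here are assumptions for illustration, not the driver's actual helpers):

#include <stdio.h>

/* Toy packing: low half counts active stripes (biased by one for the
 * submitter's reference), high half counts stripes already processed. */
static unsigned int active_stripes(unsigned int v)  { return v & 0xffff; }
static unsigned int done_stripes(unsigned int v)    { return v >> 16; }
static unsigned int set_done(unsigned int v, unsigned int n)
{
        return (v & 0xffff) | (n << 16);
}

int main(void)
{
        unsigned int segs = 1;          /* bias: bio is still referenced */

        segs = set_done(segs, 3);       /* three stripes handled so far */
        printf("active=%u done=%u\n", active_stripes(segs),
               done_stripes(segs));
        return 0;
}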
3681 3682
3682 3683
3683 /* 3684 /*
3684 * The "raid5_align_endio" should check if the read succeeded and if it 3685 * The "raid5_align_endio" should check if the read succeeded and if it
3685 * did, call bio_endio on the original bio (having bio_put the new bio 3686 * did, call bio_endio on the original bio (having bio_put the new bio
3686 * first). 3687 * first).
3687 * If the read failed.. 3688 * If the read failed..
3688 */ 3689 */
3689 static void raid5_align_endio(struct bio *bi, int error) 3690 static void raid5_align_endio(struct bio *bi, int error)
3690 { 3691 {
3691 struct bio* raid_bi = bi->bi_private; 3692 struct bio* raid_bi = bi->bi_private;
3692 mddev_t *mddev; 3693 mddev_t *mddev;
3693 raid5_conf_t *conf; 3694 raid5_conf_t *conf;
3694 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags); 3695 int uptodate = test_bit(BIO_UPTODATE, &bi->bi_flags);
3695 mdk_rdev_t *rdev; 3696 mdk_rdev_t *rdev;
3696 3697
3697 bio_put(bi); 3698 bio_put(bi);
3698 3699
3699 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata; 3700 mddev = raid_bi->bi_bdev->bd_disk->queue->queuedata;
3700 conf = mddev->private; 3701 conf = mddev->private;
3701 rdev = (void*)raid_bi->bi_next; 3702 rdev = (void*)raid_bi->bi_next;
3702 raid_bi->bi_next = NULL; 3703 raid_bi->bi_next = NULL;
3703 3704
3704 rdev_dec_pending(rdev, conf->mddev); 3705 rdev_dec_pending(rdev, conf->mddev);
3705 3706
3706 if (!error && uptodate) { 3707 if (!error && uptodate) {
3707 bio_endio(raid_bi, 0); 3708 bio_endio(raid_bi, 0);
3708 if (atomic_dec_and_test(&conf->active_aligned_reads)) 3709 if (atomic_dec_and_test(&conf->active_aligned_reads))
3709 wake_up(&conf->wait_for_stripe); 3710 wake_up(&conf->wait_for_stripe);
3710 return; 3711 return;
3711 } 3712 }
3712 3713
3713 3714
3714 pr_debug("raid5_align_endio : io error...handing IO for a retry\n"); 3715 pr_debug("raid5_align_endio : io error...handing IO for a retry\n");
3715 3716
3716 add_bio_to_retry(raid_bi, conf); 3717 add_bio_to_retry(raid_bi, conf);
3717 } 3718 }
3718 3719
3719 static int bio_fits_rdev(struct bio *bi) 3720 static int bio_fits_rdev(struct bio *bi)
3720 { 3721 {
3721 struct request_queue *q = bdev_get_queue(bi->bi_bdev); 3722 struct request_queue *q = bdev_get_queue(bi->bi_bdev);
3722 3723
3723 if ((bi->bi_size>>9) > queue_max_sectors(q)) 3724 if ((bi->bi_size>>9) > queue_max_sectors(q))
3724 return 0; 3725 return 0;
3725 blk_recount_segments(q, bi); 3726 blk_recount_segments(q, bi);
3726 if (bi->bi_phys_segments > queue_max_phys_segments(q)) 3727 if (bi->bi_phys_segments > queue_max_phys_segments(q))
3727 return 0; 3728 return 0;
3728 3729
3729 if (q->merge_bvec_fn) 3730 if (q->merge_bvec_fn)
3730 /* it's too hard to apply the merge_bvec_fn at this stage, 3731 /* it's too hard to apply the merge_bvec_fn at this stage,
3731 * just give up 3732 * just give up
3732 */ 3733 */
3733 return 0; 3734 return 0;
3734 3735
3735 return 1; 3736 return 1;
3736 } 3737 }
3737 3738
3738 3739
3739 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio) 3740 static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
3740 { 3741 {
3741 mddev_t *mddev = q->queuedata; 3742 mddev_t *mddev = q->queuedata;
3742 raid5_conf_t *conf = mddev->private; 3743 raid5_conf_t *conf = mddev->private;
3743 unsigned int dd_idx; 3744 unsigned int dd_idx;
3744 struct bio* align_bi; 3745 struct bio* align_bi;
3745 mdk_rdev_t *rdev; 3746 mdk_rdev_t *rdev;
3746 3747
3747 if (!in_chunk_boundary(mddev, raid_bio)) { 3748 if (!in_chunk_boundary(mddev, raid_bio)) {
3748 pr_debug("chunk_aligned_read : non aligned\n"); 3749 pr_debug("chunk_aligned_read : non aligned\n");
3749 return 0; 3750 return 0;
3750 } 3751 }
3751 /* 3752 /*
3752 * use bio_clone to make a copy of the bio 3753 * use bio_clone to make a copy of the bio
3753 */ 3754 */
3754 align_bi = bio_clone(raid_bio, GFP_NOIO); 3755 align_bi = bio_clone(raid_bio, GFP_NOIO);
3755 if (!align_bi) 3756 if (!align_bi)
3756 return 0; 3757 return 0;
3757 /* 3758 /*
3758 * set bi_end_io to a new function, and set bi_private to the 3759 * set bi_end_io to a new function, and set bi_private to the
3759 * original bio. 3760 * original bio.
3760 */ 3761 */
3761 align_bi->bi_end_io = raid5_align_endio; 3762 align_bi->bi_end_io = raid5_align_endio;
3762 align_bi->bi_private = raid_bio; 3763 align_bi->bi_private = raid_bio;
3763 /* 3764 /*
3764 * compute position 3765 * compute position
3765 */ 3766 */
3766 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector, 3767 align_bi->bi_sector = raid5_compute_sector(conf, raid_bio->bi_sector,
3767 0, 3768 0,
3768 &dd_idx, NULL); 3769 &dd_idx, NULL);
3769 3770
3770 rcu_read_lock(); 3771 rcu_read_lock();
3771 rdev = rcu_dereference(conf->disks[dd_idx].rdev); 3772 rdev = rcu_dereference(conf->disks[dd_idx].rdev);
3772 if (rdev && test_bit(In_sync, &rdev->flags)) { 3773 if (rdev && test_bit(In_sync, &rdev->flags)) {
3773 atomic_inc(&rdev->nr_pending); 3774 atomic_inc(&rdev->nr_pending);
3774 rcu_read_unlock(); 3775 rcu_read_unlock();
3775 raid_bio->bi_next = (void*)rdev; 3776 raid_bio->bi_next = (void*)rdev;
3776 align_bi->bi_bdev = rdev->bdev; 3777 align_bi->bi_bdev = rdev->bdev;
3777 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID); 3778 align_bi->bi_flags &= ~(1 << BIO_SEG_VALID);
3778 align_bi->bi_sector += rdev->data_offset; 3779 align_bi->bi_sector += rdev->data_offset;
3779 3780
3780 if (!bio_fits_rdev(align_bi)) { 3781 if (!bio_fits_rdev(align_bi)) {
3781 /* too big in some way */ 3782 /* too big in some way */
3782 bio_put(align_bi); 3783 bio_put(align_bi);
3783 rdev_dec_pending(rdev, mddev); 3784 rdev_dec_pending(rdev, mddev);
3784 return 0; 3785 return 0;
3785 } 3786 }
3786 3787
3787 spin_lock_irq(&conf->device_lock); 3788 spin_lock_irq(&conf->device_lock);
3788 wait_event_lock_irq(conf->wait_for_stripe, 3789 wait_event_lock_irq(conf->wait_for_stripe,
3789 conf->quiesce == 0, 3790 conf->quiesce == 0,
3790 conf->device_lock, /* nothing */); 3791 conf->device_lock, /* nothing */);
3791 atomic_inc(&conf->active_aligned_reads); 3792 atomic_inc(&conf->active_aligned_reads);
3792 spin_unlock_irq(&conf->device_lock); 3793 spin_unlock_irq(&conf->device_lock);
3793 3794
3794 generic_make_request(align_bi); 3795 generic_make_request(align_bi);
3795 return 1; 3796 return 1;
3796 } else { 3797 } else {
3797 rcu_read_unlock(); 3798 rcu_read_unlock();
3798 bio_put(align_bi); 3799 bio_put(align_bi);
3799 return 0; 3800 return 0;
3800 } 3801 }
3801 } 3802 }
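chunk_aligned_read() only works because raid5_compute_sector() can turn an array-logical sector into a member device index plus a device-local sector, so a chunk-aligned read can be handed straight to one disk, bypassing the stripe cache. A deliberately simplified model of such a mapping, with one simple rotating-parity scheme assumed purely for illustration (the real function supports several layouts, RAID-6 and in-flight reshapes):

#include <stdio.h>

/* Map an array-logical sector to (data device index, device sector) for a
 * toy RAID-5 with parity rotating backwards across stripes.  Not md's
 * code, just the idea. */
static void map_sector(unsigned long long logical, unsigned int chunk_sectors,
                       int raid_disks, int *dd_idx,
                       unsigned long long *dev_sector)
{
        int data_disks = raid_disks - 1;
        unsigned long long chunk = logical / chunk_sectors;
        unsigned int offset = logical % chunk_sectors;
        unsigned long long stripe = chunk / data_disks;
        int idx = chunk % data_disks;
        int pd_idx = raid_disks - 1 - (int)(stripe % raid_disks);

        *dd_idx = idx >= pd_idx ? idx + 1 : idx;  /* step over the parity disk */
        *dev_sector = stripe * chunk_sectors + offset;
}

int main(void)
{
        int dd_idx;
        unsigned long long dev_sector;

        map_sector(1000, 128, 4, &dd_idx, &dev_sector);
        printf("disk %d, sector %llu\n", dd_idx, dev_sector);
        return 0;
}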
3802 3803
3803 /* __get_priority_stripe - get the next stripe to process 3804 /* __get_priority_stripe - get the next stripe to process
3804 * 3805 *
3805 * Full stripe writes are allowed to pass preread active stripes up until 3806 * Full stripe writes are allowed to pass preread active stripes up until
3806 * the bypass_threshold is exceeded. In general the bypass_count 3807 * the bypass_threshold is exceeded. In general the bypass_count
3807 * increments when the handle_list is handled before the hold_list; however, it 3808 * increments when the handle_list is handled before the hold_list; however, it
3808 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a 3809 * will not be incremented when STRIPE_IO_STARTED is sampled as set, signifying a
3809 * stripe with in flight i/o. The bypass_count will be reset when the 3810 * stripe with in flight i/o. The bypass_count will be reset when the
3810 * head of the hold_list has changed, i.e. the head was promoted to the 3811 * head of the hold_list has changed, i.e. the head was promoted to the
3811 * handle_list. 3812 * handle_list.
3812 */ 3813 */
3813 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf) 3814 static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
3814 { 3815 {
3815 struct stripe_head *sh; 3816 struct stripe_head *sh;
3816 3817
3817 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", 3818 pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
3818 __func__, 3819 __func__,
3819 list_empty(&conf->handle_list) ? "empty" : "busy", 3820 list_empty(&conf->handle_list) ? "empty" : "busy",
3820 list_empty(&conf->hold_list) ? "empty" : "busy", 3821 list_empty(&conf->hold_list) ? "empty" : "busy",
3821 atomic_read(&conf->pending_full_writes), conf->bypass_count); 3822 atomic_read(&conf->pending_full_writes), conf->bypass_count);
3822 3823
3823 if (!list_empty(&conf->handle_list)) { 3824 if (!list_empty(&conf->handle_list)) {
3824 sh = list_entry(conf->handle_list.next, typeof(*sh), lru); 3825 sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
3825 3826
3826 if (list_empty(&conf->hold_list)) 3827 if (list_empty(&conf->hold_list))
3827 conf->bypass_count = 0; 3828 conf->bypass_count = 0;
3828 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) { 3829 else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
3829 if (conf->hold_list.next == conf->last_hold) 3830 if (conf->hold_list.next == conf->last_hold)
3830 conf->bypass_count++; 3831 conf->bypass_count++;
3831 else { 3832 else {
3832 conf->last_hold = conf->hold_list.next; 3833 conf->last_hold = conf->hold_list.next;
3833 conf->bypass_count -= conf->bypass_threshold; 3834 conf->bypass_count -= conf->bypass_threshold;
3834 if (conf->bypass_count < 0) 3835 if (conf->bypass_count < 0)
3835 conf->bypass_count = 0; 3836 conf->bypass_count = 0;
3836 } 3837 }
3837 } 3838 }
3838 } else if (!list_empty(&conf->hold_list) && 3839 } else if (!list_empty(&conf->hold_list) &&
3839 ((conf->bypass_threshold && 3840 ((conf->bypass_threshold &&
3840 conf->bypass_count > conf->bypass_threshold) || 3841 conf->bypass_count > conf->bypass_threshold) ||
3841 atomic_read(&conf->pending_full_writes) == 0)) { 3842 atomic_read(&conf->pending_full_writes) == 0)) {
3842 sh = list_entry(conf->hold_list.next, 3843 sh = list_entry(conf->hold_list.next,
3843 typeof(*sh), lru); 3844 typeof(*sh), lru);
3844 conf->bypass_count -= conf->bypass_threshold; 3845 conf->bypass_count -= conf->bypass_threshold;
3845 if (conf->bypass_count < 0) 3846 if (conf->bypass_count < 0)
3846 conf->bypass_count = 0; 3847 conf->bypass_count = 0;
3847 } else 3848 } else
3848 return NULL; 3849 return NULL;
3849 3850
3850 list_del_init(&sh->lru); 3851 list_del_init(&sh->lru);
3851 atomic_inc(&sh->count); 3852 atomic_inc(&sh->count);
3852 BUG_ON(atomic_read(&sh->count) != 1); 3853 BUG_ON(atomic_read(&sh->count) != 1);
3853 return sh; 3854 return sh;
3854 } 3855 }
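Stripped of the list handling, the decision the comment block above describes is small: service handle_list first, and let full-stripe writes on hold_list overtake it only while bypass_count has not exceeded bypass_threshold, or while there are no pending full-stripe writes at all. A reduced model of that choice (plain flags instead of lists, purely illustrative):

/* Which list should raid5d take its next stripe from? */
enum pick { PICK_NONE, PICK_HANDLE, PICK_HOLD };

static enum pick pick_list(int handle_empty, int hold_empty,
                           int bypass_count, int bypass_threshold,
                           int pending_full_writes)
{
        if (!handle_empty)
                return PICK_HANDLE;
        if (!hold_empty &&
            ((bypass_threshold && bypass_count > bypass_threshold) ||
             pending_full_writes == 0))
                return PICK_HOLD;
        return PICK_NONE;
}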
3855 3856
3856 static int make_request(struct request_queue *q, struct bio * bi) 3857 static int make_request(struct request_queue *q, struct bio * bi)
3857 { 3858 {
3858 mddev_t *mddev = q->queuedata; 3859 mddev_t *mddev = q->queuedata;
3859 raid5_conf_t *conf = mddev->private; 3860 raid5_conf_t *conf = mddev->private;
3860 int dd_idx; 3861 int dd_idx;
3861 sector_t new_sector; 3862 sector_t new_sector;
3862 sector_t logical_sector, last_sector; 3863 sector_t logical_sector, last_sector;
3863 struct stripe_head *sh; 3864 struct stripe_head *sh;
3864 const int rw = bio_data_dir(bi); 3865 const int rw = bio_data_dir(bi);
3865 int cpu, remaining; 3866 int cpu, remaining;
3866 3867
3867 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) { 3868 if (unlikely(bio_rw_flagged(bi, BIO_RW_BARRIER))) {
3868 bio_endio(bi, -EOPNOTSUPP); 3869 bio_endio(bi, -EOPNOTSUPP);
3869 return 0; 3870 return 0;
3870 } 3871 }
3871 3872
3872 md_write_start(mddev, bi); 3873 md_write_start(mddev, bi);
3873 3874
3874 cpu = part_stat_lock(); 3875 cpu = part_stat_lock();
3875 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 3876 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
3876 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], 3877 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw],
3877 bio_sectors(bi)); 3878 bio_sectors(bi));
3878 part_stat_unlock(); 3879 part_stat_unlock();
3879 3880
3880 if (rw == READ && 3881 if (rw == READ &&
3881 mddev->reshape_position == MaxSector && 3882 mddev->reshape_position == MaxSector &&
3882 chunk_aligned_read(q,bi)) 3883 chunk_aligned_read(q,bi))
3883 return 0; 3884 return 0;
3884 3885
3885 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 3886 logical_sector = bi->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
3886 last_sector = bi->bi_sector + (bi->bi_size>>9); 3887 last_sector = bi->bi_sector + (bi->bi_size>>9);
3887 bi->bi_next = NULL; 3888 bi->bi_next = NULL;
3888 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */ 3889 bi->bi_phys_segments = 1; /* over-loaded to count active stripes */
3889 3890
3890 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { 3891 for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) {
3891 DEFINE_WAIT(w); 3892 DEFINE_WAIT(w);
3892 int disks, data_disks; 3893 int disks, data_disks;
3893 int previous; 3894 int previous;
3894 3895
3895 retry: 3896 retry:
3896 previous = 0; 3897 previous = 0;
3897 disks = conf->raid_disks; 3898 disks = conf->raid_disks;
3898 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); 3899 prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE);
3899 if (unlikely(conf->reshape_progress != MaxSector)) { 3900 if (unlikely(conf->reshape_progress != MaxSector)) {
3900 /* spinlock is needed as reshape_progress may be 3901 /* spinlock is needed as reshape_progress may be
3901 * 64bit on a 32bit platform, and so it might be 3902 * 64bit on a 32bit platform, and so it might be
3902 * possible to see a half-updated value 3903 * possible to see a half-updated value
3903 * Of course reshape_progress could change after 3904 * Of course reshape_progress could change after
3904 * the lock is dropped, so once we get a reference 3905 * the lock is dropped, so once we get a reference
3905 * to the stripe that we think it is, we will have 3906 * to the stripe that we think it is, we will have
3906 * to check again. 3907 * to check again.
3907 */ 3908 */
3908 spin_lock_irq(&conf->device_lock); 3909 spin_lock_irq(&conf->device_lock);
3909 if (mddev->delta_disks < 0 3910 if (mddev->delta_disks < 0
3910 ? logical_sector < conf->reshape_progress 3911 ? logical_sector < conf->reshape_progress
3911 : logical_sector >= conf->reshape_progress) { 3912 : logical_sector >= conf->reshape_progress) {
3912 disks = conf->previous_raid_disks; 3913 disks = conf->previous_raid_disks;
3913 previous = 1; 3914 previous = 1;
3914 } else { 3915 } else {
3915 if (mddev->delta_disks < 0 3916 if (mddev->delta_disks < 0
3916 ? logical_sector < conf->reshape_safe 3917 ? logical_sector < conf->reshape_safe
3917 : logical_sector >= conf->reshape_safe) { 3918 : logical_sector >= conf->reshape_safe) {
3918 spin_unlock_irq(&conf->device_lock); 3919 spin_unlock_irq(&conf->device_lock);
3919 schedule(); 3920 schedule();
3920 goto retry; 3921 goto retry;
3921 } 3922 }
3922 } 3923 }
3923 spin_unlock_irq(&conf->device_lock); 3924 spin_unlock_irq(&conf->device_lock);
3924 } 3925 }
3925 data_disks = disks - conf->max_degraded; 3926 data_disks = disks - conf->max_degraded;
3926 3927
3927 new_sector = raid5_compute_sector(conf, logical_sector, 3928 new_sector = raid5_compute_sector(conf, logical_sector,
3928 previous, 3929 previous,
3929 &dd_idx, NULL); 3930 &dd_idx, NULL);
3930 pr_debug("raid5: make_request, sector %llu logical %llu\n", 3931 pr_debug("raid5: make_request, sector %llu logical %llu\n",
3931 (unsigned long long)new_sector, 3932 (unsigned long long)new_sector,
3932 (unsigned long long)logical_sector); 3933 (unsigned long long)logical_sector);
3933 3934
3934 sh = get_active_stripe(conf, new_sector, previous, 3935 sh = get_active_stripe(conf, new_sector, previous,
3935 (bi->bi_rw&RWA_MASK), 0); 3936 (bi->bi_rw&RWA_MASK), 0);
3936 if (sh) { 3937 if (sh) {
3937 if (unlikely(previous)) { 3938 if (unlikely(previous)) {
3938 /* expansion might have moved on while waiting for a 3939 /* expansion might have moved on while waiting for a
3939 * stripe, so we must do the range check again. 3940 * stripe, so we must do the range check again.
3940 * Expansion could still move past after this 3941 * Expansion could still move past after this
3941 * test, but as we are holding a reference to 3942 * test, but as we are holding a reference to
3942 * 'sh', we know that if that happens, 3943 * 'sh', we know that if that happens,
3943 * STRIPE_EXPANDING will get set and the expansion 3944 * STRIPE_EXPANDING will get set and the expansion
3944 * won't proceed until we finish with the stripe. 3945 * won't proceed until we finish with the stripe.
3945 */ 3946 */
3946 int must_retry = 0; 3947 int must_retry = 0;
3947 spin_lock_irq(&conf->device_lock); 3948 spin_lock_irq(&conf->device_lock);
3948 if (mddev->delta_disks < 0 3949 if (mddev->delta_disks < 0
3949 ? logical_sector >= conf->reshape_progress 3950 ? logical_sector >= conf->reshape_progress
3950 : logical_sector < conf->reshape_progress) 3951 : logical_sector < conf->reshape_progress)
3951 /* mismatch, need to try again */ 3952 /* mismatch, need to try again */
3952 must_retry = 1; 3953 must_retry = 1;
3953 spin_unlock_irq(&conf->device_lock); 3954 spin_unlock_irq(&conf->device_lock);
3954 if (must_retry) { 3955 if (must_retry) {
3955 release_stripe(sh); 3956 release_stripe(sh);
3956 schedule(); 3957 schedule();
3957 goto retry; 3958 goto retry;
3958 } 3959 }
3959 } 3960 }
3960 3961
3961 if (bio_data_dir(bi) == WRITE && 3962 if (bio_data_dir(bi) == WRITE &&
3962 logical_sector >= mddev->suspend_lo && 3963 logical_sector >= mddev->suspend_lo &&
3963 logical_sector < mddev->suspend_hi) { 3964 logical_sector < mddev->suspend_hi) {
3964 release_stripe(sh); 3965 release_stripe(sh);
3965 /* As the suspend_* range is controlled by 3966 /* As the suspend_* range is controlled by
3966 * userspace, we want an interruptible 3967 * userspace, we want an interruptible
3967 * wait. 3968 * wait.
3968 */ 3969 */
3969 flush_signals(current); 3970 flush_signals(current);
3970 prepare_to_wait(&conf->wait_for_overlap, 3971 prepare_to_wait(&conf->wait_for_overlap,
3971 &w, TASK_INTERRUPTIBLE); 3972 &w, TASK_INTERRUPTIBLE);
3972 if (logical_sector >= mddev->suspend_lo && 3973 if (logical_sector >= mddev->suspend_lo &&
3973 logical_sector < mddev->suspend_hi) 3974 logical_sector < mddev->suspend_hi)
3974 schedule(); 3975 schedule();
3975 goto retry; 3976 goto retry;
3976 } 3977 }
3977 3978
3978 if (test_bit(STRIPE_EXPANDING, &sh->state) || 3979 if (test_bit(STRIPE_EXPANDING, &sh->state) ||
3979 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) { 3980 !add_stripe_bio(sh, bi, dd_idx, (bi->bi_rw&RW_MASK))) {
3980 /* Stripe is busy expanding or 3981 /* Stripe is busy expanding or
3981 * add failed due to overlap. Flush everything 3982 * add failed due to overlap. Flush everything
3982 * and wait a while 3983 * and wait a while
3983 */ 3984 */
3984 raid5_unplug_device(mddev->queue); 3985 raid5_unplug_device(mddev->queue);
3985 release_stripe(sh); 3986 release_stripe(sh);
3986 schedule(); 3987 schedule();
3987 goto retry; 3988 goto retry;
3988 } 3989 }
3989 finish_wait(&conf->wait_for_overlap, &w); 3990 finish_wait(&conf->wait_for_overlap, &w);
3990 set_bit(STRIPE_HANDLE, &sh->state); 3991 set_bit(STRIPE_HANDLE, &sh->state);
3991 clear_bit(STRIPE_DELAYED, &sh->state); 3992 clear_bit(STRIPE_DELAYED, &sh->state);
3992 release_stripe(sh); 3993 release_stripe(sh);
3993 } else { 3994 } else {
3994 /* cannot get stripe for read-ahead, just give-up */ 3995 /* cannot get stripe for read-ahead, just give-up */
3995 clear_bit(BIO_UPTODATE, &bi->bi_flags); 3996 clear_bit(BIO_UPTODATE, &bi->bi_flags);
3996 finish_wait(&conf->wait_for_overlap, &w); 3997 finish_wait(&conf->wait_for_overlap, &w);
3997 break; 3998 break;
3998 } 3999 }
3999 4000
4000 } 4001 }
4001 spin_lock_irq(&conf->device_lock); 4002 spin_lock_irq(&conf->device_lock);
4002 remaining = raid5_dec_bi_phys_segments(bi); 4003 remaining = raid5_dec_bi_phys_segments(bi);
4003 spin_unlock_irq(&conf->device_lock); 4004 spin_unlock_irq(&conf->device_lock);
4004 if (remaining == 0) { 4005 if (remaining == 0) {
4005 4006
4006 if ( rw == WRITE ) 4007 if ( rw == WRITE )
4007 md_write_end(mddev); 4008 md_write_end(mddev);
4008 4009
4009 bio_endio(bi, 0); 4010 bio_endio(bi, 0);
4010 } 4011 }
4011 return 0; 4012 return 0;
4012 } 4013 }
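The core of the reshape handling in make_request() is choosing a geometry per request: while conf->reshape_progress is moving, a sector is mapped with the previous device count if it lies in the not-yet-reshaped region, which is below reshape_progress when shrinking and at or above it when growing. A compact restatement of that test (toy types, not the kernel structures):

/* Should 'logical' still be mapped with the old geometry?  Mirrors the
 * delta_disks test in make_request(); reshape_progress is the boundary. */
static int use_previous_geometry(long long logical,
                                 long long reshape_progress,
                                 int delta_disks)
{
        if (delta_disks < 0)                    /* shrinking: reshape runs   */
                return logical < reshape_progress; /* from the end backwards */
        return logical >= reshape_progress;     /* growing: runs forwards    */
}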
4013 4014
4014 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks); 4015 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);
4015 4016
4016 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped) 4017 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
4017 { 4018 {
4018 /* reshaping is quite different to recovery/resync so it is 4019 /* reshaping is quite different to recovery/resync so it is
4019 * handled quite separately ... here. 4020 * handled quite separately ... here.
4020 * 4021 *
4021 * On each call to sync_request, we gather one chunk worth of 4022 * On each call to sync_request, we gather one chunk worth of
4022 * destination stripes and flag them as expanding. 4023 * destination stripes and flag them as expanding.
4023 * Then we find all the source stripes and request reads. 4024 * Then we find all the source stripes and request reads.
4024 * As the reads complete, handle_stripe will copy the data 4025 * As the reads complete, handle_stripe will copy the data
4025 * into the destination stripe and release that stripe. 4026 * into the destination stripe and release that stripe.
4026 */ 4027 */
4027 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4028 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4028 struct stripe_head *sh; 4029 struct stripe_head *sh;
4029 sector_t first_sector, last_sector; 4030 sector_t first_sector, last_sector;
4030 int raid_disks = conf->previous_raid_disks; 4031 int raid_disks = conf->previous_raid_disks;
4031 int data_disks = raid_disks - conf->max_degraded; 4032 int data_disks = raid_disks - conf->max_degraded;
4032 int new_data_disks = conf->raid_disks - conf->max_degraded; 4033 int new_data_disks = conf->raid_disks - conf->max_degraded;
4033 int i; 4034 int i;
4034 int dd_idx; 4035 int dd_idx;
4035 sector_t writepos, readpos, safepos; 4036 sector_t writepos, readpos, safepos;
4036 sector_t stripe_addr; 4037 sector_t stripe_addr;
4037 int reshape_sectors; 4038 int reshape_sectors;
4038 struct list_head stripes; 4039 struct list_head stripes;
4039 4040
4040 if (sector_nr == 0) { 4041 if (sector_nr == 0) {
4041 /* If restarting in the middle, skip the initial sectors */ 4042 /* If restarting in the middle, skip the initial sectors */
4042 if (mddev->delta_disks < 0 && 4043 if (mddev->delta_disks < 0 &&
4043 conf->reshape_progress < raid5_size(mddev, 0, 0)) { 4044 conf->reshape_progress < raid5_size(mddev, 0, 0)) {
4044 sector_nr = raid5_size(mddev, 0, 0) 4045 sector_nr = raid5_size(mddev, 0, 0)
4045 - conf->reshape_progress; 4046 - conf->reshape_progress;
4046 } else if (mddev->delta_disks >= 0 && 4047 } else if (mddev->delta_disks >= 0 &&
4047 conf->reshape_progress > 0) 4048 conf->reshape_progress > 0)
4048 sector_nr = conf->reshape_progress; 4049 sector_nr = conf->reshape_progress;
4049 sector_div(sector_nr, new_data_disks); 4050 sector_div(sector_nr, new_data_disks);
4050 if (sector_nr) { 4051 if (sector_nr) {
4051 *skipped = 1; 4052 *skipped = 1;
4052 return sector_nr; 4053 return sector_nr;
4053 } 4054 }
4054 } 4055 }
4055 4056
4056 /* We need to process a full chunk at a time. 4057 /* We need to process a full chunk at a time.
4057 * If old and new chunk sizes differ, we need to process the 4058 * If old and new chunk sizes differ, we need to process the
4058 * largest of these 4059 * largest of these
4059 */ 4060 */
4060 if (mddev->new_chunk_sectors > mddev->chunk_sectors) 4061 if (mddev->new_chunk_sectors > mddev->chunk_sectors)
4061 reshape_sectors = mddev->new_chunk_sectors; 4062 reshape_sectors = mddev->new_chunk_sectors;
4062 else 4063 else
4063 reshape_sectors = mddev->chunk_sectors; 4064 reshape_sectors = mddev->chunk_sectors;
4064 4065
4065 /* we update the metadata when there is more than 3Meg 4066 /* we update the metadata when there is more than 3Meg
4066 * in the block range (that is rather arbitrary, should 4067 * in the block range (that is rather arbitrary, should
4067 * probably be time based) or when the data about to be 4068 * probably be time based) or when the data about to be
4068 * copied would over-write the source of the data at 4069 * copied would over-write the source of the data at
4069 * the front of the range. 4070 * the front of the range.
4070 * i.e. one new_stripe along from reshape_progress new_maps 4071 * i.e. one new_stripe along from reshape_progress new_maps
4071 * to after where reshape_safe old_maps to 4072 * to after where reshape_safe old_maps to
4072 */ 4073 */
4073 writepos = conf->reshape_progress; 4074 writepos = conf->reshape_progress;
4074 sector_div(writepos, new_data_disks); 4075 sector_div(writepos, new_data_disks);
4075 readpos = conf->reshape_progress; 4076 readpos = conf->reshape_progress;
4076 sector_div(readpos, data_disks); 4077 sector_div(readpos, data_disks);
4077 safepos = conf->reshape_safe; 4078 safepos = conf->reshape_safe;
4078 sector_div(safepos, data_disks); 4079 sector_div(safepos, data_disks);
4079 if (mddev->delta_disks < 0) { 4080 if (mddev->delta_disks < 0) {
4080 writepos -= min_t(sector_t, reshape_sectors, writepos); 4081 writepos -= min_t(sector_t, reshape_sectors, writepos);
4081 readpos += reshape_sectors; 4082 readpos += reshape_sectors;
4082 safepos += reshape_sectors; 4083 safepos += reshape_sectors;
4083 } else { 4084 } else {
4084 writepos += reshape_sectors; 4085 writepos += reshape_sectors;
4085 readpos -= min_t(sector_t, reshape_sectors, readpos); 4086 readpos -= min_t(sector_t, reshape_sectors, readpos);
4086 safepos -= min_t(sector_t, reshape_sectors, safepos); 4087 safepos -= min_t(sector_t, reshape_sectors, safepos);
4087 } 4088 }
4088 4089
4089 /* 'writepos' is the most advanced device address we might write. 4090 /* 'writepos' is the most advanced device address we might write.
4090 * 'readpos' is the least advanced device address we might read. 4091 * 'readpos' is the least advanced device address we might read.
4091 * 'safepos' is the least address recorded in the metadata as having 4092 * 'safepos' is the least address recorded in the metadata as having
4092 * been reshaped. 4093 * been reshaped.
4093 * If 'readpos' is behind 'writepos', then there is no way that we can 4094 * If 'readpos' is behind 'writepos', then there is no way that we can
4094 * ensure safety in the face of a crash - that must be done by userspace 4095 * ensure safety in the face of a crash - that must be done by userspace
4095 * making a backup of the data. So in that case there is no particular 4096 * making a backup of the data. So in that case there is no particular
4096 * rush to update metadata. 4097 * rush to update metadata.
4097 * Otherwise if 'safepos' is behind 'writepos', then we really need to 4098 * Otherwise if 'safepos' is behind 'writepos', then we really need to
4098 * update the metadata to advance 'safepos' to match 'readpos' so that 4099 * update the metadata to advance 'safepos' to match 'readpos' so that
4099 * we can be safe in the event of a crash. 4100 * we can be safe in the event of a crash.
4100 * So we insist on updating metadata if safepos is behind writepos and 4101 * So we insist on updating metadata if safepos is behind writepos and
4101 * readpos is beyond writepos. 4102 * readpos is beyond writepos.
4102 * In any case, update the metadata every 10 seconds. 4103 * In any case, update the metadata every 10 seconds.
4103 * Maybe that number should be configurable, but I'm not sure it is 4104 * Maybe that number should be configurable, but I'm not sure it is
4104 * worth it.... maybe it could be a multiple of safemode_delay??? 4105 * worth it.... maybe it could be a multiple of safemode_delay???
4105 */ 4106 */
4106 if ((mddev->delta_disks < 0 4107 if ((mddev->delta_disks < 0
4107 ? (safepos > writepos && readpos < writepos) 4108 ? (safepos > writepos && readpos < writepos)
4108 : (safepos < writepos && readpos > writepos)) || 4109 : (safepos < writepos && readpos > writepos)) ||
4109 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) { 4110 time_after(jiffies, conf->reshape_checkpoint + 10*HZ)) {
4110 /* Cannot proceed until we've updated the superblock... */ 4111 /* Cannot proceed until we've updated the superblock... */
4111 wait_event(conf->wait_for_overlap, 4112 wait_event(conf->wait_for_overlap,
4112 atomic_read(&conf->reshape_stripes)==0); 4113 atomic_read(&conf->reshape_stripes)==0);
4113 mddev->reshape_position = conf->reshape_progress; 4114 mddev->reshape_position = conf->reshape_progress;
4114 mddev->curr_resync_completed = mddev->curr_resync; 4115 mddev->curr_resync_completed = mddev->curr_resync;
4115 conf->reshape_checkpoint = jiffies; 4116 conf->reshape_checkpoint = jiffies;
4116 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4117 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4117 md_wakeup_thread(mddev->thread); 4118 md_wakeup_thread(mddev->thread);
4118 wait_event(mddev->sb_wait, mddev->flags == 0 || 4119 wait_event(mddev->sb_wait, mddev->flags == 0 ||
4119 kthread_should_stop()); 4120 kthread_should_stop());
4120 spin_lock_irq(&conf->device_lock); 4121 spin_lock_irq(&conf->device_lock);
4121 conf->reshape_safe = mddev->reshape_position; 4122 conf->reshape_safe = mddev->reshape_position;
4122 spin_unlock_irq(&conf->device_lock); 4123 spin_unlock_irq(&conf->device_lock);
4123 wake_up(&conf->wait_for_overlap); 4124 wake_up(&conf->wait_for_overlap);
4124 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4125 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4125 } 4126 }
4126 4127
4127 if (mddev->delta_disks < 0) { 4128 if (mddev->delta_disks < 0) {
4128 BUG_ON(conf->reshape_progress == 0); 4129 BUG_ON(conf->reshape_progress == 0);
4129 stripe_addr = writepos; 4130 stripe_addr = writepos;
4130 BUG_ON((mddev->dev_sectors & 4131 BUG_ON((mddev->dev_sectors &
4131 ~((sector_t)reshape_sectors - 1)) 4132 ~((sector_t)reshape_sectors - 1))
4132 - reshape_sectors - stripe_addr 4133 - reshape_sectors - stripe_addr
4133 != sector_nr); 4134 != sector_nr);
4134 } else { 4135 } else {
4135 BUG_ON(writepos != sector_nr + reshape_sectors); 4136 BUG_ON(writepos != sector_nr + reshape_sectors);
4136 stripe_addr = sector_nr; 4137 stripe_addr = sector_nr;
4137 } 4138 }
4138 INIT_LIST_HEAD(&stripes); 4139 INIT_LIST_HEAD(&stripes);
4139 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) { 4140 for (i = 0; i < reshape_sectors; i += STRIPE_SECTORS) {
4140 int j; 4141 int j;
4141 int skipped_disk = 0; 4142 int skipped_disk = 0;
4142 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1); 4143 sh = get_active_stripe(conf, stripe_addr+i, 0, 0, 1);
4143 set_bit(STRIPE_EXPANDING, &sh->state); 4144 set_bit(STRIPE_EXPANDING, &sh->state);
4144 atomic_inc(&conf->reshape_stripes); 4145 atomic_inc(&conf->reshape_stripes);
4145 /* If any of this stripe is beyond the end of the old 4146 /* If any of this stripe is beyond the end of the old
4146 * array, then we need to zero those blocks 4147 * array, then we need to zero those blocks
4147 */ 4148 */
4148 for (j=sh->disks; j--;) { 4149 for (j=sh->disks; j--;) {
4149 sector_t s; 4150 sector_t s;
4150 if (j == sh->pd_idx) 4151 if (j == sh->pd_idx)
4151 continue; 4152 continue;
4152 if (conf->level == 6 && 4153 if (conf->level == 6 &&
4153 j == sh->qd_idx) 4154 j == sh->qd_idx)
4154 continue; 4155 continue;
4155 s = compute_blocknr(sh, j, 0); 4156 s = compute_blocknr(sh, j, 0);
4156 if (s < raid5_size(mddev, 0, 0)) { 4157 if (s < raid5_size(mddev, 0, 0)) {
4157 skipped_disk = 1; 4158 skipped_disk = 1;
4158 continue; 4159 continue;
4159 } 4160 }
4160 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); 4161 memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE);
4161 set_bit(R5_Expanded, &sh->dev[j].flags); 4162 set_bit(R5_Expanded, &sh->dev[j].flags);
4162 set_bit(R5_UPTODATE, &sh->dev[j].flags); 4163 set_bit(R5_UPTODATE, &sh->dev[j].flags);
4163 } 4164 }
4164 if (!skipped_disk) { 4165 if (!skipped_disk) {
4165 set_bit(STRIPE_EXPAND_READY, &sh->state); 4166 set_bit(STRIPE_EXPAND_READY, &sh->state);
4166 set_bit(STRIPE_HANDLE, &sh->state); 4167 set_bit(STRIPE_HANDLE, &sh->state);
4167 } 4168 }
4168 list_add(&sh->lru, &stripes); 4169 list_add(&sh->lru, &stripes);
4169 } 4170 }
4170 spin_lock_irq(&conf->device_lock); 4171 spin_lock_irq(&conf->device_lock);
4171 if (mddev->delta_disks < 0) 4172 if (mddev->delta_disks < 0)
4172 conf->reshape_progress -= reshape_sectors * new_data_disks; 4173 conf->reshape_progress -= reshape_sectors * new_data_disks;
4173 else 4174 else
4174 conf->reshape_progress += reshape_sectors * new_data_disks; 4175 conf->reshape_progress += reshape_sectors * new_data_disks;
4175 spin_unlock_irq(&conf->device_lock); 4176 spin_unlock_irq(&conf->device_lock);
4176 /* Ok, those stripes are ready. We can start scheduling 4177 /* Ok, those stripes are ready. We can start scheduling
4177 * reads on the source stripes. 4178 * reads on the source stripes.
4178 * The source stripes are determined by mapping the first and last 4179 * The source stripes are determined by mapping the first and last
4179 * block on the destination stripes. 4180 * block on the destination stripes.
4180 */ 4181 */
4181 first_sector = 4182 first_sector =
4182 raid5_compute_sector(conf, stripe_addr*(new_data_disks), 4183 raid5_compute_sector(conf, stripe_addr*(new_data_disks),
4183 1, &dd_idx, NULL); 4184 1, &dd_idx, NULL);
4184 last_sector = 4185 last_sector =
4185 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors) 4186 raid5_compute_sector(conf, ((stripe_addr+reshape_sectors)
4186 * new_data_disks - 1), 4187 * new_data_disks - 1),
4187 1, &dd_idx, NULL); 4188 1, &dd_idx, NULL);
4188 if (last_sector >= mddev->dev_sectors) 4189 if (last_sector >= mddev->dev_sectors)
4189 last_sector = mddev->dev_sectors - 1; 4190 last_sector = mddev->dev_sectors - 1;
4190 while (first_sector <= last_sector) { 4191 while (first_sector <= last_sector) {
4191 sh = get_active_stripe(conf, first_sector, 1, 0, 1); 4192 sh = get_active_stripe(conf, first_sector, 1, 0, 1);
4192 set_bit(STRIPE_EXPAND_SOURCE, &sh->state); 4193 set_bit(STRIPE_EXPAND_SOURCE, &sh->state);
4193 set_bit(STRIPE_HANDLE, &sh->state); 4194 set_bit(STRIPE_HANDLE, &sh->state);
4194 release_stripe(sh); 4195 release_stripe(sh);
4195 first_sector += STRIPE_SECTORS; 4196 first_sector += STRIPE_SECTORS;
4196 } 4197 }
4197 /* Now that the sources are clearly marked, we can release 4198 /* Now that the sources are clearly marked, we can release
4198 * the destination stripes 4199 * the destination stripes
4199 */ 4200 */
4200 while (!list_empty(&stripes)) { 4201 while (!list_empty(&stripes)) {
4201 sh = list_entry(stripes.next, struct stripe_head, lru); 4202 sh = list_entry(stripes.next, struct stripe_head, lru);
4202 list_del_init(&sh->lru); 4203 list_del_init(&sh->lru);
4203 release_stripe(sh); 4204 release_stripe(sh);
4204 } 4205 }
4205 /* If this takes us to the resync_max point where we have to pause, 4206 /* If this takes us to the resync_max point where we have to pause,
4206 * then we need to write out the superblock. 4207 * then we need to write out the superblock.
4207 */ 4208 */
4208 sector_nr += reshape_sectors; 4209 sector_nr += reshape_sectors;
4209 if ((sector_nr - mddev->curr_resync_completed) * 2 4210 if ((sector_nr - mddev->curr_resync_completed) * 2
4210 >= mddev->resync_max - mddev->curr_resync_completed) { 4211 >= mddev->resync_max - mddev->curr_resync_completed) {
4211 /* Cannot proceed until we've updated the superblock... */ 4212 /* Cannot proceed until we've updated the superblock... */
4212 wait_event(conf->wait_for_overlap, 4213 wait_event(conf->wait_for_overlap,
4213 atomic_read(&conf->reshape_stripes) == 0); 4214 atomic_read(&conf->reshape_stripes) == 0);
4214 mddev->reshape_position = conf->reshape_progress; 4215 mddev->reshape_position = conf->reshape_progress;
4215 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors; 4216 mddev->curr_resync_completed = mddev->curr_resync + reshape_sectors;
4216 conf->reshape_checkpoint = jiffies; 4217 conf->reshape_checkpoint = jiffies;
4217 set_bit(MD_CHANGE_DEVS, &mddev->flags); 4218 set_bit(MD_CHANGE_DEVS, &mddev->flags);
4218 md_wakeup_thread(mddev->thread); 4219 md_wakeup_thread(mddev->thread);
4219 wait_event(mddev->sb_wait, 4220 wait_event(mddev->sb_wait,
4220 !test_bit(MD_CHANGE_DEVS, &mddev->flags) 4221 !test_bit(MD_CHANGE_DEVS, &mddev->flags)
4221 || kthread_should_stop()); 4222 || kthread_should_stop());
4222 spin_lock_irq(&conf->device_lock); 4223 spin_lock_irq(&conf->device_lock);
4223 conf->reshape_safe = mddev->reshape_position; 4224 conf->reshape_safe = mddev->reshape_position;
4224 spin_unlock_irq(&conf->device_lock); 4225 spin_unlock_irq(&conf->device_lock);
4225 wake_up(&conf->wait_for_overlap); 4226 wake_up(&conf->wait_for_overlap);
4226 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 4227 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
4227 } 4228 }
4228 return reshape_sectors; 4229 return reshape_sectors;
4229 } 4230 }
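The long comment in the middle of reshape_request() reduces to one predicate: write the superblock before proceeding when safepos has fallen behind writepos while readpos is on the far side of writepos, with every comparison mirrored for a shrinking reshape (and, independently, at least every ten seconds). A standalone restatement of that rule under the same names (illustrative only):

/* Must the reshape checkpoint (superblock) be written before proceeding? */
static int must_checkpoint(long long writepos, long long readpos,
                           long long safepos, int delta_disks)
{
        if (delta_disks < 0)
                return safepos > writepos && readpos < writepos;
        return safepos < writepos && readpos > writepos;
}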
4230 4231
4231 /* FIXME go_faster isn't used */ 4232 /* FIXME go_faster isn't used */
4232 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) 4233 static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster)
4233 { 4234 {
4234 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 4235 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
4235 struct stripe_head *sh; 4236 struct stripe_head *sh;
4236 sector_t max_sector = mddev->dev_sectors; 4237 sector_t max_sector = mddev->dev_sectors;
4237 int sync_blocks; 4238 int sync_blocks;
4238 int still_degraded = 0; 4239 int still_degraded = 0;
4239 int i; 4240 int i;
4240 4241
4241 if (sector_nr >= max_sector) { 4242 if (sector_nr >= max_sector) {
4242 /* just being told to finish up .. nothing much to do */ 4243 /* just being told to finish up .. nothing much to do */
4243 unplug_slaves(mddev); 4244 unplug_slaves(mddev);
4244 4245
4245 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) { 4246 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
4246 end_reshape(conf); 4247 end_reshape(conf);
4247 return 0; 4248 return 0;
4248 } 4249 }
4249 4250
4250 if (mddev->curr_resync < max_sector) /* aborted */ 4251 if (mddev->curr_resync < max_sector) /* aborted */
4251 bitmap_end_sync(mddev->bitmap, mddev->curr_resync, 4252 bitmap_end_sync(mddev->bitmap, mddev->curr_resync,
4252 &sync_blocks, 1); 4253 &sync_blocks, 1);
4253 else /* completed sync */ 4254 else /* completed sync */
4254 conf->fullsync = 0; 4255 conf->fullsync = 0;
4255 bitmap_close_sync(mddev->bitmap); 4256 bitmap_close_sync(mddev->bitmap);
4256 4257
4257 return 0; 4258 return 0;
4258 } 4259 }
4259 4260
4260 /* Allow raid5_quiesce to complete */ 4261 /* Allow raid5_quiesce to complete */
4261 wait_event(conf->wait_for_overlap, conf->quiesce != 2); 4262 wait_event(conf->wait_for_overlap, conf->quiesce != 2);
4262 4263
4263 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4264 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4264 return reshape_request(mddev, sector_nr, skipped); 4265 return reshape_request(mddev, sector_nr, skipped);
4265 4266
4266 /* No need to check resync_max as we never do more than one 4267 /* No need to check resync_max as we never do more than one
4267 * stripe, and as resync_max will always be on a chunk boundary, 4268 * stripe, and as resync_max will always be on a chunk boundary,
4268 * if the check in md_do_sync didn't fire, there is no chance 4269 * if the check in md_do_sync didn't fire, there is no chance
4269 * of overstepping resync_max here 4270 * of overstepping resync_max here
4270 */ 4271 */
4271 4272
4272 /* if there are too many failed drives and we are trying 4273 /* if there are too many failed drives and we are trying
4273 * to resync, then assert that we are finished, because there is 4274 * to resync, then assert that we are finished, because there is
4274 * nothing we can do. 4275 * nothing we can do.
4275 */ 4276 */
4276 if (mddev->degraded >= conf->max_degraded && 4277 if (mddev->degraded >= conf->max_degraded &&
4277 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4278 test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4278 sector_t rv = mddev->dev_sectors - sector_nr; 4279 sector_t rv = mddev->dev_sectors - sector_nr;
4279 *skipped = 1; 4280 *skipped = 1;
4280 return rv; 4281 return rv;
4281 } 4282 }
4282 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && 4283 if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) &&
4283 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && 4284 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) &&
4284 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) { 4285 !conf->fullsync && sync_blocks >= STRIPE_SECTORS) {
4285 /* we can skip this block, and probably more */ 4286 /* we can skip this block, and probably more */
4286 sync_blocks /= STRIPE_SECTORS; 4287 sync_blocks /= STRIPE_SECTORS;
4287 *skipped = 1; 4288 *skipped = 1;
4288 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ 4289 return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */
4289 } 4290 }
4290 4291
4291 4292
4292 bitmap_cond_end_sync(mddev->bitmap, sector_nr); 4293 bitmap_cond_end_sync(mddev->bitmap, sector_nr);
4293 4294
4294 sh = get_active_stripe(conf, sector_nr, 0, 1, 0); 4295 sh = get_active_stripe(conf, sector_nr, 0, 1, 0);
4295 if (sh == NULL) { 4296 if (sh == NULL) {
4296 sh = get_active_stripe(conf, sector_nr, 0, 0, 0); 4297 sh = get_active_stripe(conf, sector_nr, 0, 0, 0);
4297 /* make sure we don't swamp the stripe cache if someone else 4298 /* make sure we don't swamp the stripe cache if someone else
4298 * is trying to get access 4299 * is trying to get access
4299 */ 4300 */
4300 schedule_timeout_uninterruptible(1); 4301 schedule_timeout_uninterruptible(1);
4301 } 4302 }
4302 /* Need to check if array will still be degraded after recovery/resync 4303 /* Need to check if array will still be degraded after recovery/resync
4303 * We don't need to check the 'failed' flag as when that gets set, 4304 * We don't need to check the 'failed' flag as when that gets set,
4304 * recovery aborts. 4305 * recovery aborts.
4305 */ 4306 */
4306 for (i = 0; i < conf->raid_disks; i++) 4307 for (i = 0; i < conf->raid_disks; i++)
4307 if (conf->disks[i].rdev == NULL) 4308 if (conf->disks[i].rdev == NULL)
4308 still_degraded = 1; 4309 still_degraded = 1;
4309 4310
4310 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); 4311 bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded);
4311 4312
4312 spin_lock(&sh->lock); 4313 spin_lock(&sh->lock);
4313 set_bit(STRIPE_SYNCING, &sh->state); 4314 set_bit(STRIPE_SYNCING, &sh->state);
4314 clear_bit(STRIPE_INSYNC, &sh->state); 4315 clear_bit(STRIPE_INSYNC, &sh->state);
4315 spin_unlock(&sh->lock); 4316 spin_unlock(&sh->lock);
4316 4317
4317 handle_stripe(sh); 4318 handle_stripe(sh);
4318 release_stripe(sh); 4319 release_stripe(sh);
4319 4320
4320 return STRIPE_SECTORS; 4321 return STRIPE_SECTORS;
4321 } 4322 }
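When the bitmap reports a region already in sync, sync_request() skips it but rounds the skip down to whole stripes so the bookkeeping stays stripe-aligned. The rounding itself is trivial; a one-function sketch (8 sectors per stripe assumed, i.e. 4KiB pages as in this driver):

#define TOY_STRIPE_SECTORS 8    /* PAGE_SIZE >> 9 with 4KiB pages (assumed) */

/* Round a skippable sector count down to whole stripes, as sync_request()
 * does before returning a bitmap-driven skip. */
static unsigned long skip_whole_stripes(unsigned long sync_blocks)
{
        return (sync_blocks / TOY_STRIPE_SECTORS) * TOY_STRIPE_SECTORS;
}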
4322 4323
4323 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio) 4324 static int retry_aligned_read(raid5_conf_t *conf, struct bio *raid_bio)
4324 { 4325 {
4325 /* We may not be able to submit a whole bio at once as there 4326 /* We may not be able to submit a whole bio at once as there
4326 * may not be enough stripe_heads available. 4327 * may not be enough stripe_heads available.
4327 * We cannot pre-allocate enough stripe_heads as we may need 4328 * We cannot pre-allocate enough stripe_heads as we may need
4328 * more than exist in the cache (if we allow ever larger chunks). 4329 * more than exist in the cache (if we allow ever larger chunks).
4329 * So we do one stripe head at a time and record in 4330 * So we do one stripe head at a time and record in
4330 * ->bi_hw_segments how many have been done. 4331 * ->bi_hw_segments how many have been done.
4331 * 4332 *
4332 * We *know* that this entire raid_bio is in one chunk, so 4333 * We *know* that this entire raid_bio is in one chunk, so
4333 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector. 4334 * it will be only one 'dd_idx' and only need one call to raid5_compute_sector.
4334 */ 4335 */
4335 struct stripe_head *sh; 4336 struct stripe_head *sh;
4336 int dd_idx; 4337 int dd_idx;
4337 sector_t sector, logical_sector, last_sector; 4338 sector_t sector, logical_sector, last_sector;
4338 int scnt = 0; 4339 int scnt = 0;
4339 int remaining; 4340 int remaining;
4340 int handled = 0; 4341 int handled = 0;
4341 4342
4342 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1); 4343 logical_sector = raid_bio->bi_sector & ~((sector_t)STRIPE_SECTORS-1);
4343 sector = raid5_compute_sector(conf, logical_sector, 4344 sector = raid5_compute_sector(conf, logical_sector,
4344 0, &dd_idx, NULL); 4345 0, &dd_idx, NULL);
4345 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9); 4346 last_sector = raid_bio->bi_sector + (raid_bio->bi_size>>9);
4346 4347
4347 for (; logical_sector < last_sector; 4348 for (; logical_sector < last_sector;
4348 logical_sector += STRIPE_SECTORS, 4349 logical_sector += STRIPE_SECTORS,
4349 sector += STRIPE_SECTORS, 4350 sector += STRIPE_SECTORS,
4350 scnt++) { 4351 scnt++) {
4351 4352
4352 if (scnt < raid5_bi_hw_segments(raid_bio)) 4353 if (scnt < raid5_bi_hw_segments(raid_bio))
4353 /* already done this stripe */ 4354 /* already done this stripe */
4354 continue; 4355 continue;
4355 4356
4356 sh = get_active_stripe(conf, sector, 0, 1, 0); 4357 sh = get_active_stripe(conf, sector, 0, 1, 0);
4357 4358
4358 if (!sh) { 4359 if (!sh) {
4359 /* failed to get a stripe - must wait */ 4360 /* failed to get a stripe - must wait */
4360 raid5_set_bi_hw_segments(raid_bio, scnt); 4361 raid5_set_bi_hw_segments(raid_bio, scnt);
4361 conf->retry_read_aligned = raid_bio; 4362 conf->retry_read_aligned = raid_bio;
4362 return handled; 4363 return handled;
4363 } 4364 }
4364 4365
4365 set_bit(R5_ReadError, &sh->dev[dd_idx].flags); 4366 set_bit(R5_ReadError, &sh->dev[dd_idx].flags);
4366 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) { 4367 if (!add_stripe_bio(sh, raid_bio, dd_idx, 0)) {
4367 release_stripe(sh); 4368 release_stripe(sh);
4368 raid5_set_bi_hw_segments(raid_bio, scnt); 4369 raid5_set_bi_hw_segments(raid_bio, scnt);
4369 conf->retry_read_aligned = raid_bio; 4370 conf->retry_read_aligned = raid_bio;
4370 return handled; 4371 return handled;
4371 } 4372 }
4372 4373
4373 handle_stripe(sh); 4374 handle_stripe(sh);
4374 release_stripe(sh); 4375 release_stripe(sh);
4375 handled++; 4376 handled++;
4376 } 4377 }
4377 spin_lock_irq(&conf->device_lock); 4378 spin_lock_irq(&conf->device_lock);
4378 remaining = raid5_dec_bi_phys_segments(raid_bio); 4379 remaining = raid5_dec_bi_phys_segments(raid_bio);
4379 spin_unlock_irq(&conf->device_lock); 4380 spin_unlock_irq(&conf->device_lock);
4380 if (remaining == 0) 4381 if (remaining == 0)
4381 bio_endio(raid_bio, 0); 4382 bio_endio(raid_bio, 0);
4382 if (atomic_dec_and_test(&conf->active_aligned_reads)) 4383 if (atomic_dec_and_test(&conf->active_aligned_reads))
4383 wake_up(&conf->wait_for_stripe); 4384 wake_up(&conf->wait_for_stripe);
4384 return handled; 4385 return handled;
4385 } 4386 }
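
The comment at the top of retry_aligned_read() describes its strategy: handle one stripe per pass and record how many stripes are already done so a later retry can skip straight past them. A minimal user-space sketch of that resume pattern follows; it is not kernel code, and the structure, helper and failure model are purely illustrative stand-ins for ->bi_hw_segments bookkeeping.

/* Sketch only: resume a chunk-aligned read one stripe at a time,
 * recording progress the way retry_aligned_read() does. */
#include <stdio.h>

#define STRIPE_SECTORS 8ULL                      /* 4K stripes, 512-byte sectors */

struct fake_bio {
	unsigned long long sector;               /* start of the request */
	unsigned int sectors;                    /* length of the request */
	unsigned int done;                       /* stripes already handled */
};

static int try_handle_stripe(unsigned long long sector)
{
	return (sector / STRIPE_SECTORS) != 3;   /* pretend stripe 3 has no stripe_head free */
}

static int retry_read(struct fake_bio *bio)
{
	unsigned long long s = bio->sector & ~(STRIPE_SECTORS - 1);
	unsigned long long last = bio->sector + bio->sectors;
	unsigned int scnt = 0;
	int handled = 0;

	for (; s < last; s += STRIPE_SECTORS, scnt++) {
		if (scnt < bio->done)
			continue;                /* already done this stripe */
		if (!try_handle_stripe(s)) {
			bio->done = scnt;        /* remember where to resume */
			return handled;
		}
		handled++;
	}
	bio->done = scnt;
	return handled;
}

int main(void)
{
	struct fake_bio bio = { .sector = 0, .sectors = 64, .done = 0 };
	int handled = retry_read(&bio);

	printf("first pass handled %d stripes, resume at stripe %u\n",
	       handled, bio.done);
	return 0;
}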
4386 4387
4387 4388
4388 /* 4389 /*
4389 * This is our raid5 kernel thread. 4390 * This is our raid5 kernel thread.
4390 * 4391 *
4391 * We scan the hash table for stripes which can be handled now. 4392 * We scan the hash table for stripes which can be handled now.
4392 * During the scan, completed stripes are saved for us by the interrupt 4393 * During the scan, completed stripes are saved for us by the interrupt
4393 * handler, so that they will not have to wait for our next wakeup. 4394 * handler, so that they will not have to wait for our next wakeup.
4394 */ 4395 */
4395 static void raid5d(mddev_t *mddev) 4396 static void raid5d(mddev_t *mddev)
4396 { 4397 {
4397 struct stripe_head *sh; 4398 struct stripe_head *sh;
4398 raid5_conf_t *conf = mddev->private; 4399 raid5_conf_t *conf = mddev->private;
4399 int handled; 4400 int handled;
4400 4401
4401 pr_debug("+++ raid5d active\n"); 4402 pr_debug("+++ raid5d active\n");
4402 4403
4403 md_check_recovery(mddev); 4404 md_check_recovery(mddev);
4404 4405
4405 handled = 0; 4406 handled = 0;
4406 spin_lock_irq(&conf->device_lock); 4407 spin_lock_irq(&conf->device_lock);
4407 while (1) { 4408 while (1) {
4408 struct bio *bio; 4409 struct bio *bio;
4409 4410
4410 if (conf->seq_flush != conf->seq_write) { 4411 if (conf->seq_flush != conf->seq_write) {
4411 int seq = conf->seq_flush; 4412 int seq = conf->seq_flush;
4412 spin_unlock_irq(&conf->device_lock); 4413 spin_unlock_irq(&conf->device_lock);
4413 bitmap_unplug(mddev->bitmap); 4414 bitmap_unplug(mddev->bitmap);
4414 spin_lock_irq(&conf->device_lock); 4415 spin_lock_irq(&conf->device_lock);
4415 conf->seq_write = seq; 4416 conf->seq_write = seq;
4416 activate_bit_delay(conf); 4417 activate_bit_delay(conf);
4417 } 4418 }
4418 4419
4419 while ((bio = remove_bio_from_retry(conf))) { 4420 while ((bio = remove_bio_from_retry(conf))) {
4420 int ok; 4421 int ok;
4421 spin_unlock_irq(&conf->device_lock); 4422 spin_unlock_irq(&conf->device_lock);
4422 ok = retry_aligned_read(conf, bio); 4423 ok = retry_aligned_read(conf, bio);
4423 spin_lock_irq(&conf->device_lock); 4424 spin_lock_irq(&conf->device_lock);
4424 if (!ok) 4425 if (!ok)
4425 break; 4426 break;
4426 handled++; 4427 handled++;
4427 } 4428 }
4428 4429
4429 sh = __get_priority_stripe(conf); 4430 sh = __get_priority_stripe(conf);
4430 4431
4431 if (!sh) 4432 if (!sh)
4432 break; 4433 break;
4433 spin_unlock_irq(&conf->device_lock); 4434 spin_unlock_irq(&conf->device_lock);
4434 4435
4435 handled++; 4436 handled++;
4436 handle_stripe(sh); 4437 handle_stripe(sh);
4437 release_stripe(sh); 4438 release_stripe(sh);
4438 cond_resched(); 4439 cond_resched();
4439 4440
4440 spin_lock_irq(&conf->device_lock); 4441 spin_lock_irq(&conf->device_lock);
4441 } 4442 }
4442 pr_debug("%d stripes handled\n", handled); 4443 pr_debug("%d stripes handled\n", handled);
4443 4444
4444 spin_unlock_irq(&conf->device_lock); 4445 spin_unlock_irq(&conf->device_lock);
4445 4446
4446 async_tx_issue_pending_all(); 4447 async_tx_issue_pending_all();
4447 unplug_slaves(mddev); 4448 unplug_slaves(mddev);
4448 4449
4449 pr_debug("--- raid5d inactive\n"); 4450 pr_debug("--- raid5d inactive\n");
4450 } 4451 }
4451 4452
4452 static ssize_t 4453 static ssize_t
4453 raid5_show_stripe_cache_size(mddev_t *mddev, char *page) 4454 raid5_show_stripe_cache_size(mddev_t *mddev, char *page)
4454 { 4455 {
4455 raid5_conf_t *conf = mddev->private; 4456 raid5_conf_t *conf = mddev->private;
4456 if (conf) 4457 if (conf)
4457 return sprintf(page, "%d\n", conf->max_nr_stripes); 4458 return sprintf(page, "%d\n", conf->max_nr_stripes);
4458 else 4459 else
4459 return 0; 4460 return 0;
4460 } 4461 }
4461 4462
4462 static ssize_t 4463 static ssize_t
4463 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len) 4464 raid5_store_stripe_cache_size(mddev_t *mddev, const char *page, size_t len)
4464 { 4465 {
4465 raid5_conf_t *conf = mddev->private; 4466 raid5_conf_t *conf = mddev->private;
4466 unsigned long new; 4467 unsigned long new;
4467 int err; 4468 int err;
4468 4469
4469 if (len >= PAGE_SIZE) 4470 if (len >= PAGE_SIZE)
4470 return -EINVAL; 4471 return -EINVAL;
4471 if (!conf) 4472 if (!conf)
4472 return -ENODEV; 4473 return -ENODEV;
4473 4474
4474 if (strict_strtoul(page, 10, &new)) 4475 if (strict_strtoul(page, 10, &new))
4475 return -EINVAL; 4476 return -EINVAL;
4476 if (new <= 16 || new > 32768) 4477 if (new <= 16 || new > 32768)
4477 return -EINVAL; 4478 return -EINVAL;
4478 while (new < conf->max_nr_stripes) { 4479 while (new < conf->max_nr_stripes) {
4479 if (drop_one_stripe(conf)) 4480 if (drop_one_stripe(conf))
4480 conf->max_nr_stripes--; 4481 conf->max_nr_stripes--;
4481 else 4482 else
4482 break; 4483 break;
4483 } 4484 }
4484 err = md_allow_write(mddev); 4485 err = md_allow_write(mddev);
4485 if (err) 4486 if (err)
4486 return err; 4487 return err;
4487 while (new > conf->max_nr_stripes) { 4488 while (new > conf->max_nr_stripes) {
4488 if (grow_one_stripe(conf)) 4489 if (grow_one_stripe(conf))
4489 conf->max_nr_stripes++; 4490 conf->max_nr_stripes++;
4490 else break; 4491 else break;
4491 } 4492 }
4492 return len; 4493 return len;
4493 } 4494 }
4494 4495
4495 static struct md_sysfs_entry 4496 static struct md_sysfs_entry
4496 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR, 4497 raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
4497 raid5_show_stripe_cache_size, 4498 raid5_show_stripe_cache_size,
4498 raid5_store_stripe_cache_size); 4499 raid5_store_stripe_cache_size);
4499 4500
4500 static ssize_t 4501 static ssize_t
4501 raid5_show_preread_threshold(mddev_t *mddev, char *page) 4502 raid5_show_preread_threshold(mddev_t *mddev, char *page)
4502 { 4503 {
4503 raid5_conf_t *conf = mddev->private; 4504 raid5_conf_t *conf = mddev->private;
4504 if (conf) 4505 if (conf)
4505 return sprintf(page, "%d\n", conf->bypass_threshold); 4506 return sprintf(page, "%d\n", conf->bypass_threshold);
4506 else 4507 else
4507 return 0; 4508 return 0;
4508 } 4509 }
4509 4510
4510 static ssize_t 4511 static ssize_t
4511 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len) 4512 raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
4512 { 4513 {
4513 raid5_conf_t *conf = mddev->private; 4514 raid5_conf_t *conf = mddev->private;
4514 unsigned long new; 4515 unsigned long new;
4515 if (len >= PAGE_SIZE) 4516 if (len >= PAGE_SIZE)
4516 return -EINVAL; 4517 return -EINVAL;
4517 if (!conf) 4518 if (!conf)
4518 return -ENODEV; 4519 return -ENODEV;
4519 4520
4520 if (strict_strtoul(page, 10, &new)) 4521 if (strict_strtoul(page, 10, &new))
4521 return -EINVAL; 4522 return -EINVAL;
4522 if (new > conf->max_nr_stripes) 4523 if (new > conf->max_nr_stripes)
4523 return -EINVAL; 4524 return -EINVAL;
4524 conf->bypass_threshold = new; 4525 conf->bypass_threshold = new;
4525 return len; 4526 return len;
4526 } 4527 }
4527 4528
4528 static struct md_sysfs_entry 4529 static struct md_sysfs_entry
4529 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold, 4530 raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
4530 S_IRUGO | S_IWUSR, 4531 S_IRUGO | S_IWUSR,
4531 raid5_show_preread_threshold, 4532 raid5_show_preread_threshold,
4532 raid5_store_preread_threshold); 4533 raid5_store_preread_threshold);
4533 4534
4534 static ssize_t 4535 static ssize_t
4535 stripe_cache_active_show(mddev_t *mddev, char *page) 4536 stripe_cache_active_show(mddev_t *mddev, char *page)
4536 { 4537 {
4537 raid5_conf_t *conf = mddev->private; 4538 raid5_conf_t *conf = mddev->private;
4538 if (conf) 4539 if (conf)
4539 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes)); 4540 return sprintf(page, "%d\n", atomic_read(&conf->active_stripes));
4540 else 4541 else
4541 return 0; 4542 return 0;
4542 } 4543 }
4543 4544
4544 static struct md_sysfs_entry 4545 static struct md_sysfs_entry
4545 raid5_stripecache_active = __ATTR_RO(stripe_cache_active); 4546 raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
4546 4547
4547 static struct attribute *raid5_attrs[] = { 4548 static struct attribute *raid5_attrs[] = {
4548 &raid5_stripecache_size.attr, 4549 &raid5_stripecache_size.attr,
4549 &raid5_stripecache_active.attr, 4550 &raid5_stripecache_active.attr,
4550 &raid5_preread_bypass_threshold.attr, 4551 &raid5_preread_bypass_threshold.attr,
4551 NULL, 4552 NULL,
4552 }; 4553 };
4553 static struct attribute_group raid5_attrs_group = { 4554 static struct attribute_group raid5_attrs_group = {
4554 .name = NULL, 4555 .name = NULL,
4555 .attrs = raid5_attrs, 4556 .attrs = raid5_attrs,
4556 }; 4557 };
4557 4558
4558 static sector_t 4559 static sector_t
4559 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) 4560 raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks)
4560 { 4561 {
4561 raid5_conf_t *conf = mddev->private; 4562 raid5_conf_t *conf = mddev->private;
4562 4563
4563 if (!sectors) 4564 if (!sectors)
4564 sectors = mddev->dev_sectors; 4565 sectors = mddev->dev_sectors;
4565 if (!raid_disks) { 4566 if (!raid_disks)
4566 /* size is defined by the smallest of previous and new size */ 4567 /* size is defined by the smallest of previous and new size */
4567 if (conf->raid_disks < conf->previous_raid_disks) 4568 raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
4568 raid_disks = conf->raid_disks;
4569 else
4570 raid_disks = conf->previous_raid_disks;
4571 }
4572 4569
4573 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 4570 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
4574 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1); 4571 sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
4575 return sectors * (raid_disks - conf->max_degraded); 4572 return sectors * (raid_disks - conf->max_degraded);
4576 } 4573 }
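
When raid5_size() is asked for the current capacity during a reshape, it now takes the smaller of the old and new disk counts, since the exported size can never exceed what the smaller geometry provides. A hedged stand-alone sketch of the arithmetic follows; the per-device size, chunk size and disk counts are made-up example values, not taken from the patch.

/* Sketch only: array capacity during a raid5 reshape from 6 to 4 devices. */
#include <stdio.h>

static unsigned long long usable_sectors(unsigned long long dev_sectors,
					 unsigned long long chunk_sectors,
					 int raid_disks, int max_degraded)
{
	dev_sectors &= ~(chunk_sectors - 1);     /* whole chunks only */
	return dev_sectors * (raid_disks - max_degraded);
}

int main(void)
{
	unsigned long long dev = 1953525168ULL;  /* per-device size in sectors */
	unsigned long long chunk = 1024;         /* 512K chunks */
	int prev_disks = 6, new_disks = 4, max_degraded = 1;

	int smaller = new_disks < prev_disks ? new_disks : prev_disks;
	int larger  = new_disks > prev_disks ? new_disks : prev_disks;

	printf("capacity while reshaping (min): %llu sectors\n",
	       usable_sectors(dev, chunk, smaller, max_degraded));
	printf("working structures sized for max(%d,%d) = %d devices\n",
	       prev_disks, new_disks, larger);
	return 0;
}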
4577 4574
4578 static void raid5_free_percpu(raid5_conf_t *conf) 4575 static void raid5_free_percpu(raid5_conf_t *conf)
4579 { 4576 {
4580 struct raid5_percpu *percpu; 4577 struct raid5_percpu *percpu;
4581 unsigned long cpu; 4578 unsigned long cpu;
4582 4579
4583 if (!conf->percpu) 4580 if (!conf->percpu)
4584 return; 4581 return;
4585 4582
4586 get_online_cpus(); 4583 get_online_cpus();
4587 for_each_possible_cpu(cpu) { 4584 for_each_possible_cpu(cpu) {
4588 percpu = per_cpu_ptr(conf->percpu, cpu); 4585 percpu = per_cpu_ptr(conf->percpu, cpu);
4589 safe_put_page(percpu->spare_page); 4586 safe_put_page(percpu->spare_page);
4590 kfree(percpu->scribble); 4587 kfree(percpu->scribble);
4591 } 4588 }
4592 #ifdef CONFIG_HOTPLUG_CPU 4589 #ifdef CONFIG_HOTPLUG_CPU
4593 unregister_cpu_notifier(&conf->cpu_notify); 4590 unregister_cpu_notifier(&conf->cpu_notify);
4594 #endif 4591 #endif
4595 put_online_cpus(); 4592 put_online_cpus();
4596 4593
4597 free_percpu(conf->percpu); 4594 free_percpu(conf->percpu);
4598 } 4595 }
4599 4596
4600 static void free_conf(raid5_conf_t *conf) 4597 static void free_conf(raid5_conf_t *conf)
4601 { 4598 {
4602 shrink_stripes(conf); 4599 shrink_stripes(conf);
4603 raid5_free_percpu(conf); 4600 raid5_free_percpu(conf);
4604 kfree(conf->disks); 4601 kfree(conf->disks);
4605 kfree(conf->stripe_hashtbl); 4602 kfree(conf->stripe_hashtbl);
4606 kfree(conf); 4603 kfree(conf);
4607 } 4604 }
4608 4605
4609 #ifdef CONFIG_HOTPLUG_CPU 4606 #ifdef CONFIG_HOTPLUG_CPU
4610 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action, 4607 static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
4611 void *hcpu) 4608 void *hcpu)
4612 { 4609 {
4613 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify); 4610 raid5_conf_t *conf = container_of(nfb, raid5_conf_t, cpu_notify);
4614 long cpu = (long)hcpu; 4611 long cpu = (long)hcpu;
4615 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu); 4612 struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
4616 4613
4617 switch (action) { 4614 switch (action) {
4618 case CPU_UP_PREPARE: 4615 case CPU_UP_PREPARE:
4619 case CPU_UP_PREPARE_FROZEN: 4616 case CPU_UP_PREPARE_FROZEN:
4620 if (conf->level == 6 && !percpu->spare_page) 4617 if (conf->level == 6 && !percpu->spare_page)
4621 percpu->spare_page = alloc_page(GFP_KERNEL); 4618 percpu->spare_page = alloc_page(GFP_KERNEL);
4622 if (!percpu->scribble) 4619 if (!percpu->scribble)
4623 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL); 4620 percpu->scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4624 4621
4625 if (!percpu->scribble || 4622 if (!percpu->scribble ||
4626 (conf->level == 6 && !percpu->spare_page)) { 4623 (conf->level == 6 && !percpu->spare_page)) {
4627 safe_put_page(percpu->spare_page); 4624 safe_put_page(percpu->spare_page);
4628 kfree(percpu->scribble); 4625 kfree(percpu->scribble);
4629 pr_err("%s: failed memory allocation for cpu%ld\n", 4626 pr_err("%s: failed memory allocation for cpu%ld\n",
4630 __func__, cpu); 4627 __func__, cpu);
4631 return NOTIFY_BAD; 4628 return NOTIFY_BAD;
4632 } 4629 }
4633 break; 4630 break;
4634 case CPU_DEAD: 4631 case CPU_DEAD:
4635 case CPU_DEAD_FROZEN: 4632 case CPU_DEAD_FROZEN:
4636 safe_put_page(percpu->spare_page); 4633 safe_put_page(percpu->spare_page);
4637 kfree(percpu->scribble); 4634 kfree(percpu->scribble);
4638 percpu->spare_page = NULL; 4635 percpu->spare_page = NULL;
4639 percpu->scribble = NULL; 4636 percpu->scribble = NULL;
4640 break; 4637 break;
4641 default: 4638 default:
4642 break; 4639 break;
4643 } 4640 }
4644 return NOTIFY_OK; 4641 return NOTIFY_OK;
4645 } 4642 }
4646 #endif 4643 #endif
4647 4644
4648 static int raid5_alloc_percpu(raid5_conf_t *conf) 4645 static int raid5_alloc_percpu(raid5_conf_t *conf)
4649 { 4646 {
4650 unsigned long cpu; 4647 unsigned long cpu;
4651 struct page *spare_page; 4648 struct page *spare_page;
4652 struct raid5_percpu *allcpus; 4649 struct raid5_percpu *allcpus;
4653 void *scribble; 4650 void *scribble;
4654 int err; 4651 int err;
4655 4652
4656 allcpus = alloc_percpu(struct raid5_percpu); 4653 allcpus = alloc_percpu(struct raid5_percpu);
4657 if (!allcpus) 4654 if (!allcpus)
4658 return -ENOMEM; 4655 return -ENOMEM;
4659 conf->percpu = allcpus; 4656 conf->percpu = allcpus;
4660 4657
4661 get_online_cpus(); 4658 get_online_cpus();
4662 err = 0; 4659 err = 0;
4663 for_each_present_cpu(cpu) { 4660 for_each_present_cpu(cpu) {
4664 if (conf->level == 6) { 4661 if (conf->level == 6) {
4665 spare_page = alloc_page(GFP_KERNEL); 4662 spare_page = alloc_page(GFP_KERNEL);
4666 if (!spare_page) { 4663 if (!spare_page) {
4667 err = -ENOMEM; 4664 err = -ENOMEM;
4668 break; 4665 break;
4669 } 4666 }
4670 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page; 4667 per_cpu_ptr(conf->percpu, cpu)->spare_page = spare_page;
4671 } 4668 }
4672 scribble = kmalloc(scribble_len(conf->raid_disks), GFP_KERNEL); 4669 scribble = kmalloc(conf->scribble_len, GFP_KERNEL);
4673 if (!scribble) { 4670 if (!scribble) {
4674 err = -ENOMEM; 4671 err = -ENOMEM;
4675 break; 4672 break;
4676 } 4673 }
4677 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble; 4674 per_cpu_ptr(conf->percpu, cpu)->scribble = scribble;
4678 } 4675 }
4679 #ifdef CONFIG_HOTPLUG_CPU 4676 #ifdef CONFIG_HOTPLUG_CPU
4680 conf->cpu_notify.notifier_call = raid456_cpu_notify; 4677 conf->cpu_notify.notifier_call = raid456_cpu_notify;
4681 conf->cpu_notify.priority = 0; 4678 conf->cpu_notify.priority = 0;
4682 if (err == 0) 4679 if (err == 0)
4683 err = register_cpu_notifier(&conf->cpu_notify); 4680 err = register_cpu_notifier(&conf->cpu_notify);
4684 #endif 4681 #endif
4685 put_online_cpus(); 4682 put_online_cpus();
4686 4683
4687 return err; 4684 return err;
4688 } 4685 }
4689 4686
4690 static raid5_conf_t *setup_conf(mddev_t *mddev) 4687 static raid5_conf_t *setup_conf(mddev_t *mddev)
4691 { 4688 {
4692 raid5_conf_t *conf; 4689 raid5_conf_t *conf;
4693 int raid_disk, memory; 4690 int raid_disk, memory, max_disks;
4694 mdk_rdev_t *rdev; 4691 mdk_rdev_t *rdev;
4695 struct disk_info *disk; 4692 struct disk_info *disk;
4696 4693
4697 if (mddev->new_level != 5 4694 if (mddev->new_level != 5
4698 && mddev->new_level != 4 4695 && mddev->new_level != 4
4699 && mddev->new_level != 6) { 4696 && mddev->new_level != 6) {
4700 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n", 4697 printk(KERN_ERR "raid5: %s: raid level not set to 4/5/6 (%d)\n",
4701 mdname(mddev), mddev->new_level); 4698 mdname(mddev), mddev->new_level);
4702 return ERR_PTR(-EIO); 4699 return ERR_PTR(-EIO);
4703 } 4700 }
4704 if ((mddev->new_level == 5 4701 if ((mddev->new_level == 5
4705 && !algorithm_valid_raid5(mddev->new_layout)) || 4702 && !algorithm_valid_raid5(mddev->new_layout)) ||
4706 (mddev->new_level == 6 4703 (mddev->new_level == 6
4707 && !algorithm_valid_raid6(mddev->new_layout))) { 4704 && !algorithm_valid_raid6(mddev->new_layout))) {
4708 printk(KERN_ERR "raid5: %s: layout %d not supported\n", 4705 printk(KERN_ERR "raid5: %s: layout %d not supported\n",
4709 mdname(mddev), mddev->new_layout); 4706 mdname(mddev), mddev->new_layout);
4710 return ERR_PTR(-EIO); 4707 return ERR_PTR(-EIO);
4711 } 4708 }
4712 if (mddev->new_level == 6 && mddev->raid_disks < 4) { 4709 if (mddev->new_level == 6 && mddev->raid_disks < 4) {
4713 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n", 4710 printk(KERN_ERR "raid6: not enough configured devices for %s (%d, minimum 4)\n",
4714 mdname(mddev), mddev->raid_disks); 4711 mdname(mddev), mddev->raid_disks);
4715 return ERR_PTR(-EINVAL); 4712 return ERR_PTR(-EINVAL);
4716 } 4713 }
4717 4714
4718 if (!mddev->new_chunk_sectors || 4715 if (!mddev->new_chunk_sectors ||
4719 (mddev->new_chunk_sectors << 9) % PAGE_SIZE || 4716 (mddev->new_chunk_sectors << 9) % PAGE_SIZE ||
4720 !is_power_of_2(mddev->new_chunk_sectors)) { 4717 !is_power_of_2(mddev->new_chunk_sectors)) {
4721 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n", 4718 printk(KERN_ERR "raid5: invalid chunk size %d for %s\n",
4722 mddev->new_chunk_sectors << 9, mdname(mddev)); 4719 mddev->new_chunk_sectors << 9, mdname(mddev));
4723 return ERR_PTR(-EINVAL); 4720 return ERR_PTR(-EINVAL);
4724 } 4721 }
4725 4722
4726 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL); 4723 conf = kzalloc(sizeof(raid5_conf_t), GFP_KERNEL);
4727 if (conf == NULL) 4724 if (conf == NULL)
4728 goto abort; 4725 goto abort;
4729 spin_lock_init(&conf->device_lock); 4726 spin_lock_init(&conf->device_lock);
4730 init_waitqueue_head(&conf->wait_for_stripe); 4727 init_waitqueue_head(&conf->wait_for_stripe);
4731 init_waitqueue_head(&conf->wait_for_overlap); 4728 init_waitqueue_head(&conf->wait_for_overlap);
4732 INIT_LIST_HEAD(&conf->handle_list); 4729 INIT_LIST_HEAD(&conf->handle_list);
4733 INIT_LIST_HEAD(&conf->hold_list); 4730 INIT_LIST_HEAD(&conf->hold_list);
4734 INIT_LIST_HEAD(&conf->delayed_list); 4731 INIT_LIST_HEAD(&conf->delayed_list);
4735 INIT_LIST_HEAD(&conf->bitmap_list); 4732 INIT_LIST_HEAD(&conf->bitmap_list);
4736 INIT_LIST_HEAD(&conf->inactive_list); 4733 INIT_LIST_HEAD(&conf->inactive_list);
4737 atomic_set(&conf->active_stripes, 0); 4734 atomic_set(&conf->active_stripes, 0);
4738 atomic_set(&conf->preread_active_stripes, 0); 4735 atomic_set(&conf->preread_active_stripes, 0);
4739 atomic_set(&conf->active_aligned_reads, 0); 4736 atomic_set(&conf->active_aligned_reads, 0);
4740 conf->bypass_threshold = BYPASS_THRESHOLD; 4737 conf->bypass_threshold = BYPASS_THRESHOLD;
4741 4738
4742 conf->raid_disks = mddev->raid_disks; 4739 conf->raid_disks = mddev->raid_disks;
4743 conf->scribble_len = scribble_len(conf->raid_disks);
4744 if (mddev->reshape_position == MaxSector) 4740 if (mddev->reshape_position == MaxSector)
4745 conf->previous_raid_disks = mddev->raid_disks; 4741 conf->previous_raid_disks = mddev->raid_disks;
4746 else 4742 else
4747 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks; 4743 conf->previous_raid_disks = mddev->raid_disks - mddev->delta_disks;
4744 max_disks = max(conf->raid_disks, conf->previous_raid_disks);
4745 conf->scribble_len = scribble_len(max_disks);
4748 4746
4749 conf->disks = kzalloc(conf->raid_disks * sizeof(struct disk_info), 4747 conf->disks = kzalloc(max_disks * sizeof(struct disk_info),
4750 GFP_KERNEL); 4748 GFP_KERNEL);
4751 if (!conf->disks) 4749 if (!conf->disks)
4752 goto abort; 4750 goto abort;
4753 4751
4754 conf->mddev = mddev; 4752 conf->mddev = mddev;
4755 4753
4756 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL) 4754 if ((conf->stripe_hashtbl = kzalloc(PAGE_SIZE, GFP_KERNEL)) == NULL)
4757 goto abort; 4755 goto abort;
4758 4756
4759 conf->level = mddev->new_level; 4757 conf->level = mddev->new_level;
4760 if (raid5_alloc_percpu(conf) != 0) 4758 if (raid5_alloc_percpu(conf) != 0)
4761 goto abort; 4759 goto abort;
4762 4760
4763 pr_debug("raid5: run(%s) called.\n", mdname(mddev)); 4761 pr_debug("raid5: run(%s) called.\n", mdname(mddev));
4764 4762
4765 list_for_each_entry(rdev, &mddev->disks, same_set) { 4763 list_for_each_entry(rdev, &mddev->disks, same_set) {
4766 raid_disk = rdev->raid_disk; 4764 raid_disk = rdev->raid_disk;
4767 if (raid_disk >= conf->raid_disks 4765 if (raid_disk >= max_disks
4768 || raid_disk < 0) 4766 || raid_disk < 0)
4769 continue; 4767 continue;
4770 disk = conf->disks + raid_disk; 4768 disk = conf->disks + raid_disk;
4771 4769
4772 disk->rdev = rdev; 4770 disk->rdev = rdev;
4773 4771
4774 if (test_bit(In_sync, &rdev->flags)) { 4772 if (test_bit(In_sync, &rdev->flags)) {
4775 char b[BDEVNAME_SIZE]; 4773 char b[BDEVNAME_SIZE];
4776 printk(KERN_INFO "raid5: device %s operational as raid" 4774 printk(KERN_INFO "raid5: device %s operational as raid"
4777 " disk %d\n", bdevname(rdev->bdev,b), 4775 " disk %d\n", bdevname(rdev->bdev,b),
4778 raid_disk); 4776 raid_disk);
4779 } else 4777 } else
4780 /* Cannot rely on bitmap to complete recovery */ 4778 /* Cannot rely on bitmap to complete recovery */
4781 conf->fullsync = 1; 4779 conf->fullsync = 1;
4782 } 4780 }
4783 4781
4784 conf->chunk_sectors = mddev->new_chunk_sectors; 4782 conf->chunk_sectors = mddev->new_chunk_sectors;
4785 conf->level = mddev->new_level; 4783 conf->level = mddev->new_level;
4786 if (conf->level == 6) 4784 if (conf->level == 6)
4787 conf->max_degraded = 2; 4785 conf->max_degraded = 2;
4788 else 4786 else
4789 conf->max_degraded = 1; 4787 conf->max_degraded = 1;
4790 conf->algorithm = mddev->new_layout; 4788 conf->algorithm = mddev->new_layout;
4791 conf->max_nr_stripes = NR_STRIPES; 4789 conf->max_nr_stripes = NR_STRIPES;
4792 conf->reshape_progress = mddev->reshape_position; 4790 conf->reshape_progress = mddev->reshape_position;
4793 if (conf->reshape_progress != MaxSector) { 4791 if (conf->reshape_progress != MaxSector) {
4794 conf->prev_chunk_sectors = mddev->chunk_sectors; 4792 conf->prev_chunk_sectors = mddev->chunk_sectors;
4795 conf->prev_algo = mddev->layout; 4793 conf->prev_algo = mddev->layout;
4796 } 4794 }
4797 4795
4798 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) + 4796 memory = conf->max_nr_stripes * (sizeof(struct stripe_head) +
4799 conf->raid_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024; 4797 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
4800 if (grow_stripes(conf, conf->max_nr_stripes)) { 4798 if (grow_stripes(conf, conf->max_nr_stripes)) {
4801 printk(KERN_ERR 4799 printk(KERN_ERR
4802 "raid5: couldn't allocate %dkB for buffers\n", memory); 4800 "raid5: couldn't allocate %dkB for buffers\n", memory);
4803 goto abort; 4801 goto abort;
4804 } else 4802 } else
4805 printk(KERN_INFO "raid5: allocated %dkB for %s\n", 4803 printk(KERN_INFO "raid5: allocated %dkB for %s\n",
4806 memory, mdname(mddev)); 4804 memory, mdname(mddev));
4807 4805
4808 conf->thread = md_register_thread(raid5d, mddev, NULL); 4806 conf->thread = md_register_thread(raid5d, mddev, NULL);
4809 if (!conf->thread) { 4807 if (!conf->thread) {
4810 printk(KERN_ERR 4808 printk(KERN_ERR
4811 "raid5: couldn't allocate thread for %s\n", 4809 "raid5: couldn't allocate thread for %s\n",
4812 mdname(mddev)); 4810 mdname(mddev));
4813 goto abort; 4811 goto abort;
4814 } 4812 }
4815 4813
4816 return conf; 4814 return conf;
4817 4815
4818 abort: 4816 abort:
4819 if (conf) { 4817 if (conf) {
4820 free_conf(conf); 4818 free_conf(conf);
4821 return ERR_PTR(-EIO); 4819 return ERR_PTR(-EIO);
4822 } else 4820 } else
4823 return ERR_PTR(-ENOMEM); 4821 return ERR_PTR(-ENOMEM);
4824 } 4822 }
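
The change in setup_conf() above sizes conf->disks and the scribble buffers for max_disks rather than conf->raid_disks, so an array that was shut down mid-shrink can still address every member of the old, larger geometry when it is re-assembled. A small sketch of how the two counts relate; the numbers are illustrative and reshape_active simply stands in for the reshape_position != MaxSector test.

/* Sketch only: the two disk counts setup_conf() must cover on assembly. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 4;        /* target geometry */
	int delta_disks = -2;      /* shrinking from 6 to 4 */
	int reshape_active = 1;    /* reshape_position != MaxSector */

	int previous = reshape_active ? raid_disks - delta_disks : raid_disks;
	int max_disks = raid_disks > previous ? raid_disks : previous;

	/* conf->disks[] and scribble_len must cover max_disks so stripes
	 * laid out in the old geometry can still be handled. */
	printf("raid_disks=%d previous=%d -> allocate for %d devices\n",
	       raid_disks, previous, max_disks);
	return 0;
}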
4825 4823
4826 static int run(mddev_t *mddev) 4824 static int run(mddev_t *mddev)
4827 { 4825 {
4828 raid5_conf_t *conf; 4826 raid5_conf_t *conf;
4829 int working_disks = 0, chunk_size; 4827 int working_disks = 0, chunk_size;
4830 mdk_rdev_t *rdev; 4828 mdk_rdev_t *rdev;
4831 4829
4832 if (mddev->recovery_cp != MaxSector) 4830 if (mddev->recovery_cp != MaxSector)
4833 printk(KERN_NOTICE "raid5: %s is not clean" 4831 printk(KERN_NOTICE "raid5: %s is not clean"
4834 " -- starting background reconstruction\n", 4832 " -- starting background reconstruction\n",
4835 mdname(mddev)); 4833 mdname(mddev));
4836 if (mddev->reshape_position != MaxSector) { 4834 if (mddev->reshape_position != MaxSector) {
4837 /* Check that we can continue the reshape. 4835 /* Check that we can continue the reshape.
4838 * Currently only disks can change, it must 4836 * Currently only disks can change, it must
4839 * increase, and we must be past the point where 4837 * increase, and we must be past the point where
4840 * a stripe over-writes itself 4838 * a stripe over-writes itself
4841 */ 4839 */
4842 sector_t here_new, here_old; 4840 sector_t here_new, here_old;
4843 int old_disks; 4841 int old_disks;
4844 int max_degraded = (mddev->level == 6 ? 2 : 1); 4842 int max_degraded = (mddev->level == 6 ? 2 : 1);
4845 4843
4846 if (mddev->new_level != mddev->level) { 4844 if (mddev->new_level != mddev->level) {
4847 printk(KERN_ERR "raid5: %s: unsupported reshape " 4845 printk(KERN_ERR "raid5: %s: unsupported reshape "
4848 "required - aborting.\n", 4846 "required - aborting.\n",
4849 mdname(mddev)); 4847 mdname(mddev));
4850 return -EINVAL; 4848 return -EINVAL;
4851 } 4849 }
4852 old_disks = mddev->raid_disks - mddev->delta_disks; 4850 old_disks = mddev->raid_disks - mddev->delta_disks;
4853 /* reshape_position must be on a new-stripe boundary, and one 4851 /* reshape_position must be on a new-stripe boundary, and one
4854 * further up in new geometry must map after here in old 4852 * further up in new geometry must map after here in old
4855 * geometry. 4853 * geometry.
4856 */ 4854 */
4857 here_new = mddev->reshape_position; 4855 here_new = mddev->reshape_position;
4858 if (sector_div(here_new, mddev->new_chunk_sectors * 4856 if (sector_div(here_new, mddev->new_chunk_sectors *
4859 (mddev->raid_disks - max_degraded))) { 4857 (mddev->raid_disks - max_degraded))) {
4860 printk(KERN_ERR "raid5: reshape_position not " 4858 printk(KERN_ERR "raid5: reshape_position not "
4861 "on a stripe boundary\n"); 4859 "on a stripe boundary\n");
4862 return -EINVAL; 4860 return -EINVAL;
4863 } 4861 }
4864 /* here_new is the stripe we will write to */ 4862 /* here_new is the stripe we will write to */
4865 here_old = mddev->reshape_position; 4863 here_old = mddev->reshape_position;
4866 sector_div(here_old, mddev->chunk_sectors * 4864 sector_div(here_old, mddev->chunk_sectors *
4867 (old_disks-max_degraded)); 4865 (old_disks-max_degraded));
4868 /* here_old is the first stripe that we might need to read 4866 /* here_old is the first stripe that we might need to read
4869 * from */ 4867 * from */
4870 if (mddev->delta_disks == 0) { 4868 if (mddev->delta_disks == 0) {
4871 /* We cannot be sure it is safe to start an in-place 4869 /* We cannot be sure it is safe to start an in-place
4872 * reshape. It is only safe if user-space is monitoring 4870 * reshape. It is only safe if user-space is monitoring
4873 * and taking constant backups. 4871 * and taking constant backups.
4874 * mdadm always starts a situation like this in 4872 * mdadm always starts a situation like this in
4875 * readonly mode so it can take control before 4873 * readonly mode so it can take control before
4876 * allowing any writes. So just check for that. 4874 * allowing any writes. So just check for that.
4877 */ 4875 */
4878 if ((here_new * mddev->new_chunk_sectors != 4876 if ((here_new * mddev->new_chunk_sectors !=
4879 here_old * mddev->chunk_sectors) || 4877 here_old * mddev->chunk_sectors) ||
4880 mddev->ro == 0) { 4878 mddev->ro == 0) {
4881 printk(KERN_ERR "raid5: in-place reshape must be started" 4879 printk(KERN_ERR "raid5: in-place reshape must be started"
4882 " in read-only mode - aborting\n"); 4880 " in read-only mode - aborting\n");
4883 return -EINVAL; 4881 return -EINVAL;
4884 } 4882 }
4885 } else if (mddev->delta_disks < 0 4883 } else if (mddev->delta_disks < 0
4886 ? (here_new * mddev->new_chunk_sectors <= 4884 ? (here_new * mddev->new_chunk_sectors <=
4887 here_old * mddev->chunk_sectors) 4885 here_old * mddev->chunk_sectors)
4888 : (here_new * mddev->new_chunk_sectors >= 4886 : (here_new * mddev->new_chunk_sectors >=
4889 here_old * mddev->chunk_sectors)) { 4887 here_old * mddev->chunk_sectors)) {
4890 /* Reading from the same stripe as writing to - bad */ 4888 /* Reading from the same stripe as writing to - bad */
4891 printk(KERN_ERR "raid5: reshape_position too early for " 4889 printk(KERN_ERR "raid5: reshape_position too early for "
4892 "auto-recovery - aborting.\n"); 4890 "auto-recovery - aborting.\n");
4893 return -EINVAL; 4891 return -EINVAL;
4894 } 4892 }
4895 printk(KERN_INFO "raid5: reshape will continue\n"); 4893 printk(KERN_INFO "raid5: reshape will continue\n");
4896 /* OK, we should be able to continue; */ 4894 /* OK, we should be able to continue; */
4897 } else { 4895 } else {
4898 BUG_ON(mddev->level != mddev->new_level); 4896 BUG_ON(mddev->level != mddev->new_level);
4899 BUG_ON(mddev->layout != mddev->new_layout); 4897 BUG_ON(mddev->layout != mddev->new_layout);
4900 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors); 4898 BUG_ON(mddev->chunk_sectors != mddev->new_chunk_sectors);
4901 BUG_ON(mddev->delta_disks != 0); 4899 BUG_ON(mddev->delta_disks != 0);
4902 } 4900 }
4903 4901
4904 if (mddev->private == NULL) 4902 if (mddev->private == NULL)
4905 conf = setup_conf(mddev); 4903 conf = setup_conf(mddev);
4906 else 4904 else
4907 conf = mddev->private; 4905 conf = mddev->private;
4908 4906
4909 if (IS_ERR(conf)) 4907 if (IS_ERR(conf))
4910 return PTR_ERR(conf); 4908 return PTR_ERR(conf);
4911 4909
4912 mddev->thread = conf->thread; 4910 mddev->thread = conf->thread;
4913 conf->thread = NULL; 4911 conf->thread = NULL;
4914 mddev->private = conf; 4912 mddev->private = conf;
4915 4913
4916 /* 4914 /*
4917 * 0 for a fully functional array, 1 or 2 for a degraded array. 4915 * 0 for a fully functional array, 1 or 2 for a degraded array.
4918 */ 4916 */
4919 list_for_each_entry(rdev, &mddev->disks, same_set) 4917 list_for_each_entry(rdev, &mddev->disks, same_set)
4920 if (rdev->raid_disk >= 0 && 4918 if (rdev->raid_disk >= 0 &&
4921 test_bit(In_sync, &rdev->flags)) 4919 test_bit(In_sync, &rdev->flags))
4922 working_disks++; 4920 working_disks++;
4923 4921
4924 mddev->degraded = conf->raid_disks - working_disks; 4922 mddev->degraded = (max(conf->raid_disks, conf->previous_raid_disks)
4923 - working_disks);
4925 4924
4926 if (mddev->degraded > conf->max_degraded) { 4925 if (mddev->degraded > conf->max_degraded) {
4927 printk(KERN_ERR "raid5: not enough operational devices for %s" 4926 printk(KERN_ERR "raid5: not enough operational devices for %s"
4928 " (%d/%d failed)\n", 4927 " (%d/%d failed)\n",
4929 mdname(mddev), mddev->degraded, conf->raid_disks); 4928 mdname(mddev), mddev->degraded, conf->raid_disks);
4930 goto abort; 4929 goto abort;
4931 } 4930 }
4932 4931
4933 /* device size must be a multiple of chunk size */ 4932 /* device size must be a multiple of chunk size */
4934 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1); 4933 mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
4935 mddev->resync_max_sectors = mddev->dev_sectors; 4934 mddev->resync_max_sectors = mddev->dev_sectors;
4936 4935
4937 if (mddev->degraded > 0 && 4936 if (mddev->degraded > 0 &&
4938 mddev->recovery_cp != MaxSector) { 4937 mddev->recovery_cp != MaxSector) {
4939 if (mddev->ok_start_degraded) 4938 if (mddev->ok_start_degraded)
4940 printk(KERN_WARNING 4939 printk(KERN_WARNING
4941 "raid5: starting dirty degraded array: %s" 4940 "raid5: starting dirty degraded array: %s"
4942 "- data corruption possible.\n", 4941 "- data corruption possible.\n",
4943 mdname(mddev)); 4942 mdname(mddev));
4944 else { 4943 else {
4945 printk(KERN_ERR 4944 printk(KERN_ERR
4946 "raid5: cannot start dirty degraded array for %s\n", 4945 "raid5: cannot start dirty degraded array for %s\n",
4947 mdname(mddev)); 4946 mdname(mddev));
4948 goto abort; 4947 goto abort;
4949 } 4948 }
4950 } 4949 }
4951 4950
4952 if (mddev->degraded == 0) 4951 if (mddev->degraded == 0)
4953 printk("raid5: raid level %d set %s active with %d out of %d" 4952 printk("raid5: raid level %d set %s active with %d out of %d"
4954 " devices, algorithm %d\n", conf->level, mdname(mddev), 4953 " devices, algorithm %d\n", conf->level, mdname(mddev),
4955 mddev->raid_disks-mddev->degraded, mddev->raid_disks, 4954 mddev->raid_disks-mddev->degraded, mddev->raid_disks,
4956 mddev->new_layout); 4955 mddev->new_layout);
4957 else 4956 else
4958 printk(KERN_ALERT "raid5: raid level %d set %s active with %d" 4957 printk(KERN_ALERT "raid5: raid level %d set %s active with %d"
4959 " out of %d devices, algorithm %d\n", conf->level, 4958 " out of %d devices, algorithm %d\n", conf->level,
4960 mdname(mddev), mddev->raid_disks - mddev->degraded, 4959 mdname(mddev), mddev->raid_disks - mddev->degraded,
4961 mddev->raid_disks, mddev->new_layout); 4960 mddev->raid_disks, mddev->new_layout);
4962 4961
4963 print_raid5_conf(conf); 4962 print_raid5_conf(conf);
4964 4963
4965 if (conf->reshape_progress != MaxSector) { 4964 if (conf->reshape_progress != MaxSector) {
4966 printk("...ok start reshape thread\n"); 4965 printk("...ok start reshape thread\n");
4967 conf->reshape_safe = conf->reshape_progress; 4966 conf->reshape_safe = conf->reshape_progress;
4968 atomic_set(&conf->reshape_stripes, 0); 4967 atomic_set(&conf->reshape_stripes, 0);
4969 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4968 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4970 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4969 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4971 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 4970 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
4972 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 4971 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
4973 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 4972 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
4974 "reshape"); 4973 "reshape");
4975 } 4974 }
4976 4975
4977 /* read-ahead size must cover two whole stripes, which is 4976 /* read-ahead size must cover two whole stripes, which is
4978 * 2 * (datadisks) * chunksize, where datadisks is the number of data devices 4977 * 2 * (datadisks) * chunksize, where datadisks is the number of data devices
4979 */ 4978 */
4980 { 4979 {
4981 int data_disks = conf->previous_raid_disks - conf->max_degraded; 4980 int data_disks = conf->previous_raid_disks - conf->max_degraded;
4982 int stripe = data_disks * 4981 int stripe = data_disks *
4983 ((mddev->chunk_sectors << 9) / PAGE_SIZE); 4982 ((mddev->chunk_sectors << 9) / PAGE_SIZE);
4984 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 4983 if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
4985 mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 4984 mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
4986 } 4985 }
4987 4986
4988 /* Ok, everything is just fine now */ 4987 /* Ok, everything is just fine now */
4989 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group)) 4988 if (sysfs_create_group(&mddev->kobj, &raid5_attrs_group))
4990 printk(KERN_WARNING 4989 printk(KERN_WARNING
4991 "raid5: failed to create sysfs attributes for %s\n", 4990 "raid5: failed to create sysfs attributes for %s\n",
4992 mdname(mddev)); 4991 mdname(mddev));
4993 4992
4994 mddev->queue->queue_lock = &conf->device_lock; 4993 mddev->queue->queue_lock = &conf->device_lock;
4995 4994
4996 mddev->queue->unplug_fn = raid5_unplug_device; 4995 mddev->queue->unplug_fn = raid5_unplug_device;
4997 mddev->queue->backing_dev_info.congested_data = mddev; 4996 mddev->queue->backing_dev_info.congested_data = mddev;
4998 mddev->queue->backing_dev_info.congested_fn = raid5_congested; 4997 mddev->queue->backing_dev_info.congested_fn = raid5_congested;
4999 4998
5000 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 4999 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5001 5000
5002 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec); 5001 blk_queue_merge_bvec(mddev->queue, raid5_mergeable_bvec);
5003 chunk_size = mddev->chunk_sectors << 9; 5002 chunk_size = mddev->chunk_sectors << 9;
5004 blk_queue_io_min(mddev->queue, chunk_size); 5003 blk_queue_io_min(mddev->queue, chunk_size);
5005 blk_queue_io_opt(mddev->queue, chunk_size * 5004 blk_queue_io_opt(mddev->queue, chunk_size *
5006 (conf->raid_disks - conf->max_degraded)); 5005 (conf->raid_disks - conf->max_degraded));
5007 5006
5008 list_for_each_entry(rdev, &mddev->disks, same_set) 5007 list_for_each_entry(rdev, &mddev->disks, same_set)
5009 disk_stack_limits(mddev->gendisk, rdev->bdev, 5008 disk_stack_limits(mddev->gendisk, rdev->bdev,
5010 rdev->data_offset << 9); 5009 rdev->data_offset << 9);
5011 5010
5012 return 0; 5011 return 0;
5013 abort: 5012 abort:
5014 md_unregister_thread(mddev->thread); 5013 md_unregister_thread(mddev->thread);
5015 mddev->thread = NULL; 5014 mddev->thread = NULL;
5016 if (conf) { 5015 if (conf) {
5017 print_raid5_conf(conf); 5016 print_raid5_conf(conf);
5018 free_conf(conf); 5017 free_conf(conf);
5019 } 5018 }
5020 mddev->private = NULL; 5019 mddev->private = NULL;
5021 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); 5020 printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev));
5022 return -EIO; 5021 return -EIO;
5023 } 5022 }
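
run() now measures mddev->degraded against the larger of the two disk counts: while a shrink is incomplete, every member of the old geometry is still required, so a missing old device leaves the array degraded even though the target raid_disks is smaller. A hedged sketch with example numbers, not kernel code:

/* Sketch only: degraded accounting for a raid5 interrupted mid-shrink. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 4, previous_raid_disks = 6;   /* 6 -> 4 reshape */
	int working_disks = 5;                         /* one old member missing */
	int max_degraded = 1;                          /* raid5 */

	int required = raid_disks > previous_raid_disks ?
		       raid_disks : previous_raid_disks;
	int degraded = required - working_disks;

	printf("degraded=%d -> %s\n", degraded,
	       degraded > max_degraded ? "refuse to start" : "array can run");
	return 0;
}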
5024 5023
5025 5024
5026 5025
5027 static int stop(mddev_t *mddev) 5026 static int stop(mddev_t *mddev)
5028 { 5027 {
5029 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5028 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
5030 5029
5031 md_unregister_thread(mddev->thread); 5030 md_unregister_thread(mddev->thread);
5032 mddev->thread = NULL; 5031 mddev->thread = NULL;
5033 mddev->queue->backing_dev_info.congested_fn = NULL; 5032 mddev->queue->backing_dev_info.congested_fn = NULL;
5034 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ 5033 blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/
5035 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); 5034 sysfs_remove_group(&mddev->kobj, &raid5_attrs_group);
5036 free_conf(conf); 5035 free_conf(conf);
5037 mddev->private = NULL; 5036 mddev->private = NULL;
5038 return 0; 5037 return 0;
5039 } 5038 }
5040 5039
5041 #ifdef DEBUG 5040 #ifdef DEBUG
5042 static void print_sh(struct seq_file *seq, struct stripe_head *sh) 5041 static void print_sh(struct seq_file *seq, struct stripe_head *sh)
5043 { 5042 {
5044 int i; 5043 int i;
5045 5044
5046 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n", 5045 seq_printf(seq, "sh %llu, pd_idx %d, state %ld.\n",
5047 (unsigned long long)sh->sector, sh->pd_idx, sh->state); 5046 (unsigned long long)sh->sector, sh->pd_idx, sh->state);
5048 seq_printf(seq, "sh %llu, count %d.\n", 5047 seq_printf(seq, "sh %llu, count %d.\n",
5049 (unsigned long long)sh->sector, atomic_read(&sh->count)); 5048 (unsigned long long)sh->sector, atomic_read(&sh->count));
5050 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector); 5049 seq_printf(seq, "sh %llu, ", (unsigned long long)sh->sector);
5051 for (i = 0; i < sh->disks; i++) { 5050 for (i = 0; i < sh->disks; i++) {
5052 seq_printf(seq, "(cache%d: %p %ld) ", 5051 seq_printf(seq, "(cache%d: %p %ld) ",
5053 i, sh->dev[i].page, sh->dev[i].flags); 5052 i, sh->dev[i].page, sh->dev[i].flags);
5054 } 5053 }
5055 seq_printf(seq, "\n"); 5054 seq_printf(seq, "\n");
5056 } 5055 }
5057 5056
5058 static void printall(struct seq_file *seq, raid5_conf_t *conf) 5057 static void printall(struct seq_file *seq, raid5_conf_t *conf)
5059 { 5058 {
5060 struct stripe_head *sh; 5059 struct stripe_head *sh;
5061 struct hlist_node *hn; 5060 struct hlist_node *hn;
5062 int i; 5061 int i;
5063 5062
5064 spin_lock_irq(&conf->device_lock); 5063 spin_lock_irq(&conf->device_lock);
5065 for (i = 0; i < NR_HASH; i++) { 5064 for (i = 0; i < NR_HASH; i++) {
5066 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) { 5065 hlist_for_each_entry(sh, hn, &conf->stripe_hashtbl[i], hash) {
5067 if (sh->raid_conf != conf) 5066 if (sh->raid_conf != conf)
5068 continue; 5067 continue;
5069 print_sh(seq, sh); 5068 print_sh(seq, sh);
5070 } 5069 }
5071 } 5070 }
5072 spin_unlock_irq(&conf->device_lock); 5071 spin_unlock_irq(&conf->device_lock);
5073 } 5072 }
5074 #endif 5073 #endif
5075 5074
5076 static void status(struct seq_file *seq, mddev_t *mddev) 5075 static void status(struct seq_file *seq, mddev_t *mddev)
5077 { 5076 {
5078 raid5_conf_t *conf = (raid5_conf_t *) mddev->private; 5077 raid5_conf_t *conf = (raid5_conf_t *) mddev->private;
5079 int i; 5078 int i;
5080 5079
5081 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level, 5080 seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
5082 mddev->chunk_sectors / 2, mddev->layout); 5081 mddev->chunk_sectors / 2, mddev->layout);
5083 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded); 5082 seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
5084 for (i = 0; i < conf->raid_disks; i++) 5083 for (i = 0; i < conf->raid_disks; i++)
5085 seq_printf (seq, "%s", 5084 seq_printf (seq, "%s",
5086 conf->disks[i].rdev && 5085 conf->disks[i].rdev &&
5087 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_"); 5086 test_bit(In_sync, &conf->disks[i].rdev->flags) ? "U" : "_");
5088 seq_printf (seq, "]"); 5087 seq_printf (seq, "]");
5089 #ifdef DEBUG 5088 #ifdef DEBUG
5090 seq_printf (seq, "\n"); 5089 seq_printf (seq, "\n");
5091 printall(seq, conf); 5090 printall(seq, conf);
5092 #endif 5091 #endif
5093 } 5092 }
5094 5093
5095 static void print_raid5_conf (raid5_conf_t *conf) 5094 static void print_raid5_conf (raid5_conf_t *conf)
5096 { 5095 {
5097 int i; 5096 int i;
5098 struct disk_info *tmp; 5097 struct disk_info *tmp;
5099 5098
5100 printk("RAID5 conf printout:\n"); 5099 printk("RAID5 conf printout:\n");
5101 if (!conf) { 5100 if (!conf) {
5102 printk("(conf==NULL)\n"); 5101 printk("(conf==NULL)\n");
5103 return; 5102 return;
5104 } 5103 }
5105 printk(" --- rd:%d wd:%d\n", conf->raid_disks, 5104 printk(" --- rd:%d wd:%d\n", conf->raid_disks,
5106 conf->raid_disks - conf->mddev->degraded); 5105 conf->raid_disks - conf->mddev->degraded);
5107 5106
5108 for (i = 0; i < conf->raid_disks; i++) { 5107 for (i = 0; i < conf->raid_disks; i++) {
5109 char b[BDEVNAME_SIZE]; 5108 char b[BDEVNAME_SIZE];
5110 tmp = conf->disks + i; 5109 tmp = conf->disks + i;
5111 if (tmp->rdev) 5110 if (tmp->rdev)
5112 printk(" disk %d, o:%d, dev:%s\n", 5111 printk(" disk %d, o:%d, dev:%s\n",
5113 i, !test_bit(Faulty, &tmp->rdev->flags), 5112 i, !test_bit(Faulty, &tmp->rdev->flags),
5114 bdevname(tmp->rdev->bdev,b)); 5113 bdevname(tmp->rdev->bdev,b));
5115 } 5114 }
5116 } 5115 }
5117 5116
5118 static int raid5_spare_active(mddev_t *mddev) 5117 static int raid5_spare_active(mddev_t *mddev)
5119 { 5118 {
5120 int i; 5119 int i;
5121 raid5_conf_t *conf = mddev->private; 5120 raid5_conf_t *conf = mddev->private;
5122 struct disk_info *tmp; 5121 struct disk_info *tmp;
5123 5122
5124 for (i = 0; i < conf->raid_disks; i++) { 5123 for (i = 0; i < conf->raid_disks; i++) {
5125 tmp = conf->disks + i; 5124 tmp = conf->disks + i;
5126 if (tmp->rdev 5125 if (tmp->rdev
5127 && !test_bit(Faulty, &tmp->rdev->flags) 5126 && !test_bit(Faulty, &tmp->rdev->flags)
5128 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { 5127 && !test_and_set_bit(In_sync, &tmp->rdev->flags)) {
5129 unsigned long flags; 5128 unsigned long flags;
5130 spin_lock_irqsave(&conf->device_lock, flags); 5129 spin_lock_irqsave(&conf->device_lock, flags);
5131 mddev->degraded--; 5130 mddev->degraded--;
5132 spin_unlock_irqrestore(&conf->device_lock, flags); 5131 spin_unlock_irqrestore(&conf->device_lock, flags);
5133 } 5132 }
5134 } 5133 }
5135 print_raid5_conf(conf); 5134 print_raid5_conf(conf);
5136 return 0; 5135 return 0;
5137 } 5136 }
5138 5137
5139 static int raid5_remove_disk(mddev_t *mddev, int number) 5138 static int raid5_remove_disk(mddev_t *mddev, int number)
5140 { 5139 {
5141 raid5_conf_t *conf = mddev->private; 5140 raid5_conf_t *conf = mddev->private;
5142 int err = 0; 5141 int err = 0;
5143 mdk_rdev_t *rdev; 5142 mdk_rdev_t *rdev;
5144 struct disk_info *p = conf->disks + number; 5143 struct disk_info *p = conf->disks + number;
5145 5144
5146 print_raid5_conf(conf); 5145 print_raid5_conf(conf);
5147 rdev = p->rdev; 5146 rdev = p->rdev;
5148 if (rdev) { 5147 if (rdev) {
5149 if (number >= conf->raid_disks && 5148 if (number >= conf->raid_disks &&
5150 conf->reshape_progress == MaxSector) 5149 conf->reshape_progress == MaxSector)
5151 clear_bit(In_sync, &rdev->flags); 5150 clear_bit(In_sync, &rdev->flags);
5152 5151
5153 if (test_bit(In_sync, &rdev->flags) || 5152 if (test_bit(In_sync, &rdev->flags) ||
5154 atomic_read(&rdev->nr_pending)) { 5153 atomic_read(&rdev->nr_pending)) {
5155 err = -EBUSY; 5154 err = -EBUSY;
5156 goto abort; 5155 goto abort;
5157 } 5156 }
5158 /* Only remove non-faulty devices if recovery 5157 /* Only remove non-faulty devices if recovery
5159 * isn't possible. 5158 * isn't possible.
5160 */ 5159 */
5161 if (!test_bit(Faulty, &rdev->flags) && 5160 if (!test_bit(Faulty, &rdev->flags) &&
5162 mddev->degraded <= conf->max_degraded && 5161 mddev->degraded <= conf->max_degraded &&
5163 number < conf->raid_disks) { 5162 number < conf->raid_disks) {
5164 err = -EBUSY; 5163 err = -EBUSY;
5165 goto abort; 5164 goto abort;
5166 } 5165 }
5167 p->rdev = NULL; 5166 p->rdev = NULL;
5168 synchronize_rcu(); 5167 synchronize_rcu();
5169 if (atomic_read(&rdev->nr_pending)) { 5168 if (atomic_read(&rdev->nr_pending)) {
5170 /* lost the race, try later */ 5169 /* lost the race, try later */
5171 err = -EBUSY; 5170 err = -EBUSY;
5172 p->rdev = rdev; 5171 p->rdev = rdev;
5173 } 5172 }
5174 } 5173 }
5175 abort: 5174 abort:
5176 5175
5177 print_raid5_conf(conf); 5176 print_raid5_conf(conf);
5178 return err; 5177 return err;
5179 } 5178 }
5180 5179
5181 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) 5180 static int raid5_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
5182 { 5181 {
5183 raid5_conf_t *conf = mddev->private; 5182 raid5_conf_t *conf = mddev->private;
5184 int err = -EEXIST; 5183 int err = -EEXIST;
5185 int disk; 5184 int disk;
5186 struct disk_info *p; 5185 struct disk_info *p;
5187 int first = 0; 5186 int first = 0;
5188 int last = conf->raid_disks - 1; 5187 int last = conf->raid_disks - 1;
5189 5188
5190 if (mddev->degraded > conf->max_degraded) 5189 if (mddev->degraded > conf->max_degraded)
5191 /* no point adding a device */ 5190 /* no point adding a device */
5192 return -EINVAL; 5191 return -EINVAL;
5193 5192
5194 if (rdev->raid_disk >= 0) 5193 if (rdev->raid_disk >= 0)
5195 first = last = rdev->raid_disk; 5194 first = last = rdev->raid_disk;
5196 5195
5197 /* 5196 /*
5198 * find the disk ... but prefer rdev->saved_raid_disk 5197 * find the disk ... but prefer rdev->saved_raid_disk
5199 * if possible. 5198 * if possible.
5200 */ 5199 */
5201 if (rdev->saved_raid_disk >= 0 && 5200 if (rdev->saved_raid_disk >= 0 &&
5202 rdev->saved_raid_disk >= first && 5201 rdev->saved_raid_disk >= first &&
5203 conf->disks[rdev->saved_raid_disk].rdev == NULL) 5202 conf->disks[rdev->saved_raid_disk].rdev == NULL)
5204 disk = rdev->saved_raid_disk; 5203 disk = rdev->saved_raid_disk;
5205 else 5204 else
5206 disk = first; 5205 disk = first;
5207 for ( ; disk <= last ; disk++) 5206 for ( ; disk <= last ; disk++)
5208 if ((p=conf->disks + disk)->rdev == NULL) { 5207 if ((p=conf->disks + disk)->rdev == NULL) {
5209 clear_bit(In_sync, &rdev->flags); 5208 clear_bit(In_sync, &rdev->flags);
5210 rdev->raid_disk = disk; 5209 rdev->raid_disk = disk;
5211 err = 0; 5210 err = 0;
5212 if (rdev->saved_raid_disk != disk) 5211 if (rdev->saved_raid_disk != disk)
5213 conf->fullsync = 1; 5212 conf->fullsync = 1;
5214 rcu_assign_pointer(p->rdev, rdev); 5213 rcu_assign_pointer(p->rdev, rdev);
5215 break; 5214 break;
5216 } 5215 }
5217 print_raid5_conf(conf); 5216 print_raid5_conf(conf);
5218 return err; 5217 return err;
5219 } 5218 }
5220 5219
5221 static int raid5_resize(mddev_t *mddev, sector_t sectors) 5220 static int raid5_resize(mddev_t *mddev, sector_t sectors)
5222 { 5221 {
5223 /* no resync is happening, and there is enough space 5222 /* no resync is happening, and there is enough space
5224 * on all devices, so we can resize. 5223 * on all devices, so we can resize.
5225 * We need to make sure resync covers any new space. 5224 * We need to make sure resync covers any new space.
5226 * If the array is shrinking we should possibly wait until 5225 * If the array is shrinking we should possibly wait until
5227 * any io in the removed space completes, but it hardly seems 5226 * any io in the removed space completes, but it hardly seems
5228 * worth it. 5227 * worth it.
5229 */ 5228 */
5230 sectors &= ~((sector_t)mddev->chunk_sectors - 1); 5229 sectors &= ~((sector_t)mddev->chunk_sectors - 1);
5231 md_set_array_sectors(mddev, raid5_size(mddev, sectors, 5230 md_set_array_sectors(mddev, raid5_size(mddev, sectors,
5232 mddev->raid_disks)); 5231 mddev->raid_disks));
5233 if (mddev->array_sectors > 5232 if (mddev->array_sectors >
5234 raid5_size(mddev, sectors, mddev->raid_disks)) 5233 raid5_size(mddev, sectors, mddev->raid_disks))
5235 return -EINVAL; 5234 return -EINVAL;
5236 set_capacity(mddev->gendisk, mddev->array_sectors); 5235 set_capacity(mddev->gendisk, mddev->array_sectors);
5237 mddev->changed = 1; 5236 mddev->changed = 1;
5238 revalidate_disk(mddev->gendisk); 5237 revalidate_disk(mddev->gendisk);
5239 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { 5238 if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) {
5240 mddev->recovery_cp = mddev->dev_sectors; 5239 mddev->recovery_cp = mddev->dev_sectors;
5241 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5240 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5242 } 5241 }
5243 mddev->dev_sectors = sectors; 5242 mddev->dev_sectors = sectors;
5244 mddev->resync_max_sectors = sectors; 5243 mddev->resync_max_sectors = sectors;
5245 return 0; 5244 return 0;
5246 } 5245 }
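
raid5_resize() above accepts only component sizes that are a whole number of chunks; the masking it uses assumes chunk_sectors is a power of two. A minimal sketch of that rounding with made-up values:

/* Sketch only: rounding a requested component size down to whole chunks. */
#include <stdio.h>

int main(void)
{
	unsigned long long sectors = 1953525168ULL;   /* requested per-device size */
	unsigned long long chunk_sectors = 1024;      /* 512K chunk, power of two */

	unsigned long long rounded = sectors & ~(chunk_sectors - 1);

	printf("%llu sectors -> %llu sectors (%llu full chunks)\n",
	       sectors, rounded, rounded / chunk_sectors);
	return 0;
}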
5247 5246
5248 static int check_stripe_cache(mddev_t *mddev) 5247 static int check_stripe_cache(mddev_t *mddev)
5249 { 5248 {
5250 /* Can only proceed if there are plenty of stripe_heads. 5249 /* Can only proceed if there are plenty of stripe_heads.
5251 * We need a minimum of one full stripe, and for sensible progress 5250 * We need a minimum of one full stripe, and for sensible progress
5252 * it is best to have about 4 times that. 5251 * it is best to have about 4 times that.
5253 * If we require 4 times, then the default 256 4K stripe_heads will 5252 * If we require 4 times, then the default 256 4K stripe_heads will
5254 * allow for chunk sizes up to 256K, which is probably OK. 5253 * allow for chunk sizes up to 256K, which is probably OK.
5255 * If the chunk size is greater, user-space should request more 5254 * If the chunk size is greater, user-space should request more
5256 * stripe_heads first. 5255 * stripe_heads first.
5257 */ 5256 */
5258 raid5_conf_t *conf = mddev->private; 5257 raid5_conf_t *conf = mddev->private;
5259 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 5258 if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4
5260 > conf->max_nr_stripes || 5259 > conf->max_nr_stripes ||
5261 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 5260 ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4
5262 > conf->max_nr_stripes) { 5261 > conf->max_nr_stripes) {
5263 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n", 5262 printk(KERN_WARNING "raid5: reshape: not enough stripes. Needed %lu\n",
5264 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) 5263 ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9)
5265 / STRIPE_SIZE)*4); 5264 / STRIPE_SIZE)*4);
5266 return 0; 5265 return 0;
5267 } 5266 }
5268 return 1; 5267 return 1;
5269 } 5268 }
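
check_stripe_cache() wants roughly four full stripes' worth of stripe_heads for whichever chunk size, old or new, is larger. A stand-alone sketch of the same arithmetic; the chunk sizes and cache size are example values only.

/* Sketch only: the stripe-cache headroom test performed before a reshape. */
#include <stdio.h>

#define STRIPE_SIZE 4096UL

int main(void)
{
	unsigned long chunk_sectors = 1024;       /* current: 512K chunks */
	unsigned long new_chunk_sectors = 2048;   /* target: 1M chunks */
	unsigned long max_nr_stripes = 256;       /* default cache size */

	unsigned long biggest = chunk_sectors > new_chunk_sectors ?
				chunk_sectors : new_chunk_sectors;
	unsigned long needed = (biggest << 9) / STRIPE_SIZE * 4;

	printf("need %lu stripe_heads, have %lu -> %s\n", needed, max_nr_stripes,
	       needed > max_nr_stripes ? "raise stripe_cache_size first" : "ok");
	return 0;
}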
5270 5269
5271 static int check_reshape(mddev_t *mddev) 5270 static int check_reshape(mddev_t *mddev)
5272 { 5271 {
5273 raid5_conf_t *conf = mddev->private; 5272 raid5_conf_t *conf = mddev->private;
5274 5273
5275 if (mddev->delta_disks == 0 && 5274 if (mddev->delta_disks == 0 &&
5276 mddev->new_layout == mddev->layout && 5275 mddev->new_layout == mddev->layout &&
5277 mddev->new_chunk_sectors == mddev->chunk_sectors) 5276 mddev->new_chunk_sectors == mddev->chunk_sectors)
5278 return 0; /* nothing to do */ 5277 return 0; /* nothing to do */
5279 if (mddev->bitmap) 5278 if (mddev->bitmap)
5280 /* Cannot grow a bitmap yet */ 5279 /* Cannot grow a bitmap yet */
5281 return -EBUSY; 5280 return -EBUSY;
5282 if (mddev->degraded > conf->max_degraded) 5281 if (mddev->degraded > conf->max_degraded)
5283 return -EINVAL; 5282 return -EINVAL;
5284 if (mddev->delta_disks < 0) { 5283 if (mddev->delta_disks < 0) {
5285 /* We might be able to shrink, but the devices must 5284 /* We might be able to shrink, but the devices must
5286 * be made bigger first. 5285 * be made bigger first.
5287 * For raid6, 4 is the minimum size. 5286 * For raid6, 4 is the minimum size.
5288 * Otherwise 2 is the minimum 5287 * Otherwise 2 is the minimum
5289 */ 5288 */
5290 int min = 2; 5289 int min = 2;
5291 if (mddev->level == 6) 5290 if (mddev->level == 6)
5292 min = 4; 5291 min = 4;
5293 if (mddev->raid_disks + mddev->delta_disks < min) 5292 if (mddev->raid_disks + mddev->delta_disks < min)
5294 return -EINVAL; 5293 return -EINVAL;
5295 } 5294 }
5296 5295
5297 if (!check_stripe_cache(mddev)) 5296 if (!check_stripe_cache(mddev))
5298 return -ENOSPC; 5297 return -ENOSPC;
5299 5298
5300 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks); 5299 return resize_stripes(conf, conf->raid_disks + mddev->delta_disks);
5301 } 5300 }
5302 5301
5303 static int raid5_start_reshape(mddev_t *mddev) 5302 static int raid5_start_reshape(mddev_t *mddev)
5304 { 5303 {
5305 raid5_conf_t *conf = mddev->private; 5304 raid5_conf_t *conf = mddev->private;
5306 mdk_rdev_t *rdev; 5305 mdk_rdev_t *rdev;
5307 int spares = 0; 5306 int spares = 0;
5308 int added_devices = 0; 5307 int added_devices = 0;
5309 unsigned long flags; 5308 unsigned long flags;
5310 5309
5311 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 5310 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
5312 return -EBUSY; 5311 return -EBUSY;
5313 5312
5314 if (!check_stripe_cache(mddev)) 5313 if (!check_stripe_cache(mddev))
5315 return -ENOSPC; 5314 return -ENOSPC;
5316 5315
5317 list_for_each_entry(rdev, &mddev->disks, same_set) 5316 list_for_each_entry(rdev, &mddev->disks, same_set)
5318 if (rdev->raid_disk < 0 && 5317 if (rdev->raid_disk < 0 &&
5319 !test_bit(Faulty, &rdev->flags)) 5318 !test_bit(Faulty, &rdev->flags))
5320 spares++; 5319 spares++;
5321 5320
5322 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded) 5321 if (spares - mddev->degraded < mddev->delta_disks - conf->max_degraded)
5323 /* Not enough devices even to make a degraded array 5322 /* Not enough devices even to make a degraded array
5324 * of that size 5323 * of that size
5325 */ 5324 */
5326 return -EINVAL; 5325 return -EINVAL;
5327 5326
5328 /* Refuse to reduce size of the array. Any reductions in 5327 /* Refuse to reduce size of the array. Any reductions in
5329 * array size must be through explicit setting of array_size 5328 * array size must be through explicit setting of array_size
5330 * attribute. 5329 * attribute.
5331 */ 5330 */
5332 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks) 5331 if (raid5_size(mddev, 0, conf->raid_disks + mddev->delta_disks)
5333 < mddev->array_sectors) { 5332 < mddev->array_sectors) {
5334 printk(KERN_ERR "md: %s: array size must be reduced " 5333 printk(KERN_ERR "md: %s: array size must be reduced "
5335 "before number of disks\n", mdname(mddev)); 5334 "before number of disks\n", mdname(mddev));
5336 return -EINVAL; 5335 return -EINVAL;
5337 } 5336 }
5338 5337
5339 atomic_set(&conf->reshape_stripes, 0); 5338 atomic_set(&conf->reshape_stripes, 0);
5340 spin_lock_irq(&conf->device_lock); 5339 spin_lock_irq(&conf->device_lock);
5341 conf->previous_raid_disks = conf->raid_disks; 5340 conf->previous_raid_disks = conf->raid_disks;
5342 conf->raid_disks += mddev->delta_disks; 5341 conf->raid_disks += mddev->delta_disks;
5343 conf->prev_chunk_sectors = conf->chunk_sectors; 5342 conf->prev_chunk_sectors = conf->chunk_sectors;
5344 conf->chunk_sectors = mddev->new_chunk_sectors; 5343 conf->chunk_sectors = mddev->new_chunk_sectors;
5345 conf->prev_algo = conf->algorithm; 5344 conf->prev_algo = conf->algorithm;
5346 conf->algorithm = mddev->new_layout; 5345 conf->algorithm = mddev->new_layout;
5347 if (mddev->delta_disks < 0) 5346 if (mddev->delta_disks < 0)
5348 conf->reshape_progress = raid5_size(mddev, 0, 0); 5347 conf->reshape_progress = raid5_size(mddev, 0, 0);
5349 else 5348 else
5350 conf->reshape_progress = 0; 5349 conf->reshape_progress = 0;
5351 conf->reshape_safe = conf->reshape_progress; 5350 conf->reshape_safe = conf->reshape_progress;
5352 conf->generation++; 5351 conf->generation++;
5353 spin_unlock_irq(&conf->device_lock); 5352 spin_unlock_irq(&conf->device_lock);
5354 5353
5355 /* Add some new drives, as many as will fit. 5354 /* Add some new drives, as many as will fit.
5356 * We know there are enough to make the newly sized array work. 5355 * We know there are enough to make the newly sized array work.
5357 */ 5356 */
5358 list_for_each_entry(rdev, &mddev->disks, same_set) 5357 list_for_each_entry(rdev, &mddev->disks, same_set)
5359 if (rdev->raid_disk < 0 && 5358 if (rdev->raid_disk < 0 &&
5360 !test_bit(Faulty, &rdev->flags)) { 5359 !test_bit(Faulty, &rdev->flags)) {
5361 if (raid5_add_disk(mddev, rdev) == 0) { 5360 if (raid5_add_disk(mddev, rdev) == 0) {
5362 char nm[20]; 5361 char nm[20];
5363 set_bit(In_sync, &rdev->flags); 5362 set_bit(In_sync, &rdev->flags);
5364 added_devices++; 5363 added_devices++;
5365 rdev->recovery_offset = 0; 5364 rdev->recovery_offset = 0;
5366 sprintf(nm, "rd%d", rdev->raid_disk); 5365 sprintf(nm, "rd%d", rdev->raid_disk);
5367 if (sysfs_create_link(&mddev->kobj, 5366 if (sysfs_create_link(&mddev->kobj,
5368 &rdev->kobj, nm)) 5367 &rdev->kobj, nm))
5369 printk(KERN_WARNING 5368 printk(KERN_WARNING
5370 "raid5: failed to create " 5369 "raid5: failed to create "
5371 " link %s for %s\n", 5370 " link %s for %s\n",
5372 nm, mdname(mddev)); 5371 nm, mdname(mddev));
5373 } else 5372 } else
5374 break; 5373 break;
5375 } 5374 }
5376 5375
5377 if (mddev->delta_disks > 0) { 5376 if (mddev->delta_disks > 0) {
5378 spin_lock_irqsave(&conf->device_lock, flags); 5377 spin_lock_irqsave(&conf->device_lock, flags);
5379 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks) 5378 mddev->degraded = (conf->raid_disks - conf->previous_raid_disks)
5380 - added_devices; 5379 - added_devices;
5381 spin_unlock_irqrestore(&conf->device_lock, flags); 5380 spin_unlock_irqrestore(&conf->device_lock, flags);
5382 } 5381 }
5383 mddev->raid_disks = conf->raid_disks; 5382 mddev->raid_disks = conf->raid_disks;
5384 mddev->reshape_position = conf->reshape_progress; 5383 mddev->reshape_position = conf->reshape_progress;
5385 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5384 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5386 5385
5387 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 5386 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
5388 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 5387 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
5389 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 5388 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
5390 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 5389 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
5391 mddev->sync_thread = md_register_thread(md_do_sync, mddev, 5390 mddev->sync_thread = md_register_thread(md_do_sync, mddev,
5392 "reshape"); 5391 "reshape");
5393 if (!mddev->sync_thread) { 5392 if (!mddev->sync_thread) {
5394 mddev->recovery = 0; 5393 mddev->recovery = 0;
5395 spin_lock_irq(&conf->device_lock); 5394 spin_lock_irq(&conf->device_lock);
5396 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5395 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5397 conf->reshape_progress = MaxSector; 5396 conf->reshape_progress = MaxSector;
5398 spin_unlock_irq(&conf->device_lock); 5397 spin_unlock_irq(&conf->device_lock);
5399 return -EAGAIN; 5398 return -EAGAIN;
5400 } 5399 }
5401 conf->reshape_checkpoint = jiffies; 5400 conf->reshape_checkpoint = jiffies;
5402 md_wakeup_thread(mddev->sync_thread); 5401 md_wakeup_thread(mddev->sync_thread);
5403 md_new_event(mddev); 5402 md_new_event(mddev);
5404 return 0; 5403 return 0;
5405 } 5404 }
5406 5405
5407 /* This is called from the reshape thread and should make any 5406 /* This is called from the reshape thread and should make any
5408 * changes needed in 'conf' 5407 * changes needed in 'conf'
5409 */ 5408 */
5410 static void end_reshape(raid5_conf_t *conf) 5409 static void end_reshape(raid5_conf_t *conf)
5411 { 5410 {
5412 5411
5413 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) { 5412 if (!test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
5414 5413
5415 spin_lock_irq(&conf->device_lock); 5414 spin_lock_irq(&conf->device_lock);
5416 conf->previous_raid_disks = conf->raid_disks; 5415 conf->previous_raid_disks = conf->raid_disks;
5417 conf->reshape_progress = MaxSector; 5416 conf->reshape_progress = MaxSector;
5418 spin_unlock_irq(&conf->device_lock); 5417 spin_unlock_irq(&conf->device_lock);
5419 wake_up(&conf->wait_for_overlap); 5418 wake_up(&conf->wait_for_overlap);
5420 5419
5421 /* read-ahead size must cover two whole stripes, which is 5420 /* read-ahead size must cover two whole stripes, which is
5422 * 2 * (datadisks) * chunksize, where 'datadisks' is the number of data devices 5421 * 2 * (datadisks) * chunksize, where 'datadisks' is the number of data devices
5423 */ 5422 */
5424 { 5423 {
5425 int data_disks = conf->raid_disks - conf->max_degraded; 5424 int data_disks = conf->raid_disks - conf->max_degraded;
5426 int stripe = data_disks * ((conf->chunk_sectors << 9) 5425 int stripe = data_disks * ((conf->chunk_sectors << 9)
5427 / PAGE_SIZE); 5426 / PAGE_SIZE);
5428 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) 5427 if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe)
5429 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; 5428 conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe;
5430 } 5429 }
5431 } 5430 }
5432 } 5431 }
5433 5432
5434 /* This is called from the raid5d thread with mddev_lock held. 5433 /* This is called from the raid5d thread with mddev_lock held.
5435 * It makes config changes to the device. 5434 * It makes config changes to the device.
5436 */ 5435 */
5437 static void raid5_finish_reshape(mddev_t *mddev) 5436 static void raid5_finish_reshape(mddev_t *mddev)
5438 { 5437 {
5439 raid5_conf_t *conf = mddev->private; 5438 raid5_conf_t *conf = mddev->private;
5440 5439
5441 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 5440 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
5442 5441
5443 if (mddev->delta_disks > 0) { 5442 if (mddev->delta_disks > 0) {
5444 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); 5443 md_set_array_sectors(mddev, raid5_size(mddev, 0, 0));
5445 set_capacity(mddev->gendisk, mddev->array_sectors); 5444 set_capacity(mddev->gendisk, mddev->array_sectors);
5446 mddev->changed = 1; 5445 mddev->changed = 1;
5447 revalidate_disk(mddev->gendisk); 5446 revalidate_disk(mddev->gendisk);
5448 } else { 5447 } else {
5449 int d; 5448 int d;
5450 mddev->degraded = conf->raid_disks; 5449 mddev->degraded = conf->raid_disks;
5451 for (d = 0; d < conf->raid_disks; d++) 5450 for (d = 0; d < conf->raid_disks; d++)
5452 if (conf->disks[d].rdev && 5451 if (conf->disks[d].rdev &&
5453 test_bit(In_sync, 5452 test_bit(In_sync,
5454 &conf->disks[d].rdev->flags)) 5453 &conf->disks[d].rdev->flags))
5455 mddev->degraded--; 5454 mddev->degraded--;
5456 for (d = conf->raid_disks; 5455 for (d = conf->raid_disks;
5457 d < conf->raid_disks - mddev->delta_disks; 5456 d < conf->raid_disks - mddev->delta_disks;
5458 d++) { 5457 d++) {
5459 mdk_rdev_t *rdev = conf->disks[d].rdev; 5458 mdk_rdev_t *rdev = conf->disks[d].rdev;
5460 if (rdev && raid5_remove_disk(mddev, d) == 0) { 5459 if (rdev && raid5_remove_disk(mddev, d) == 0) {
5461 char nm[20]; 5460 char nm[20];
5462 sprintf(nm, "rd%d", rdev->raid_disk); 5461 sprintf(nm, "rd%d", rdev->raid_disk);
5463 sysfs_remove_link(&mddev->kobj, nm); 5462 sysfs_remove_link(&mddev->kobj, nm);
5464 rdev->raid_disk = -1; 5463 rdev->raid_disk = -1;
5465 } 5464 }
5466 } 5465 }
5467 } 5466 }
5468 mddev->layout = conf->algorithm; 5467 mddev->layout = conf->algorithm;
5469 mddev->chunk_sectors = conf->chunk_sectors; 5468 mddev->chunk_sectors = conf->chunk_sectors;
5470 mddev->reshape_position = MaxSector; 5469 mddev->reshape_position = MaxSector;
5471 mddev->delta_disks = 0; 5470 mddev->delta_disks = 0;
5472 } 5471 }
5473 } 5472 }
5474 5473
5475 static void raid5_quiesce(mddev_t *mddev, int state) 5474 static void raid5_quiesce(mddev_t *mddev, int state)
5476 { 5475 {
5477 raid5_conf_t *conf = mddev->private; 5476 raid5_conf_t *conf = mddev->private;
5478 5477
5479 switch (state) { 5478 switch (state) {
5480 case 2: /* resume for a suspend */ 5479 case 2: /* resume for a suspend */
5481 wake_up(&conf->wait_for_overlap); 5480 wake_up(&conf->wait_for_overlap);
5482 break; 5481 break;
5483 5482
5484 case 1: /* stop all writes */ 5483 case 1: /* stop all writes */
5485 spin_lock_irq(&conf->device_lock); 5484 spin_lock_irq(&conf->device_lock);
5486 /* '2' tells resync/reshape to pause so that all 5485 /* '2' tells resync/reshape to pause so that all
5487 * active stripes can drain 5486 * active stripes can drain
5488 */ 5487 */
5489 conf->quiesce = 2; 5488 conf->quiesce = 2;
5490 wait_event_lock_irq(conf->wait_for_stripe, 5489 wait_event_lock_irq(conf->wait_for_stripe,
5491 atomic_read(&conf->active_stripes) == 0 && 5490 atomic_read(&conf->active_stripes) == 0 &&
5492 atomic_read(&conf->active_aligned_reads) == 0, 5491 atomic_read(&conf->active_aligned_reads) == 0,
5493 conf->device_lock, /* nothing */); 5492 conf->device_lock, /* nothing */);
5494 conf->quiesce = 1; 5493 conf->quiesce = 1;
5495 spin_unlock_irq(&conf->device_lock); 5494 spin_unlock_irq(&conf->device_lock);
5496 /* allow reshape to continue */ 5495 /* allow reshape to continue */
5497 wake_up(&conf->wait_for_overlap); 5496 wake_up(&conf->wait_for_overlap);
5498 break; 5497 break;
5499 5498
5500 case 0: /* re-enable writes */ 5499 case 0: /* re-enable writes */
5501 spin_lock_irq(&conf->device_lock); 5500 spin_lock_irq(&conf->device_lock);
5502 conf->quiesce = 0; 5501 conf->quiesce = 0;
5503 wake_up(&conf->wait_for_stripe); 5502 wake_up(&conf->wait_for_stripe);
5504 wake_up(&conf->wait_for_overlap); 5503 wake_up(&conf->wait_for_overlap);
5505 spin_unlock_irq(&conf->device_lock); 5504 spin_unlock_irq(&conf->device_lock);
5506 break; 5505 break;
5507 } 5506 }
5508 } 5507 }
5509 5508
5510 5509
5511 static void *raid5_takeover_raid1(mddev_t *mddev) 5510 static void *raid5_takeover_raid1(mddev_t *mddev)
5512 { 5511 {
5513 int chunksect; 5512 int chunksect;
5514 5513
5515 if (mddev->raid_disks != 2 || 5514 if (mddev->raid_disks != 2 ||
5516 mddev->degraded > 1) 5515 mddev->degraded > 1)
5517 return ERR_PTR(-EINVAL); 5516 return ERR_PTR(-EINVAL);
5518 5517
5519 /* Should check if there are write-behind devices? */ 5518 /* Should check if there are write-behind devices? */
5520 5519
5521 chunksect = 64*2; /* 64K by default */ 5520 chunksect = 64*2; /* 64K by default */
5522 5521
5523 /* The array must be an exact multiple of chunksize */ 5522 /* The array must be an exact multiple of chunksize */
5524 while (chunksect && (mddev->array_sectors & (chunksect-1))) 5523 while (chunksect && (mddev->array_sectors & (chunksect-1)))
5525 chunksect >>= 1; 5524 chunksect >>= 1;
5526 5525
5527 if ((chunksect<<9) < STRIPE_SIZE) 5526 if ((chunksect<<9) < STRIPE_SIZE)
5528 /* array size does not allow a suitable chunk size */ 5527 /* array size does not allow a suitable chunk size */
5529 return ERR_PTR(-EINVAL); 5528 return ERR_PTR(-EINVAL);
5530 5529
5531 mddev->new_level = 5; 5530 mddev->new_level = 5;
5532 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC; 5531 mddev->new_layout = ALGORITHM_LEFT_SYMMETRIC;
5533 mddev->new_chunk_sectors = chunksect; 5532 mddev->new_chunk_sectors = chunksect;
5534 5533
5535 return setup_conf(mddev); 5534 return setup_conf(mddev);
5536 } 5535 }
5537 5536
5538 static void *raid5_takeover_raid6(mddev_t *mddev) 5537 static void *raid5_takeover_raid6(mddev_t *mddev)
5539 { 5538 {
5540 int new_layout; 5539 int new_layout;
5541 5540
5542 switch (mddev->layout) { 5541 switch (mddev->layout) {
5543 case ALGORITHM_LEFT_ASYMMETRIC_6: 5542 case ALGORITHM_LEFT_ASYMMETRIC_6:
5544 new_layout = ALGORITHM_LEFT_ASYMMETRIC; 5543 new_layout = ALGORITHM_LEFT_ASYMMETRIC;
5545 break; 5544 break;
5546 case ALGORITHM_RIGHT_ASYMMETRIC_6: 5545 case ALGORITHM_RIGHT_ASYMMETRIC_6:
5547 new_layout = ALGORITHM_RIGHT_ASYMMETRIC; 5546 new_layout = ALGORITHM_RIGHT_ASYMMETRIC;
5548 break; 5547 break;
5549 case ALGORITHM_LEFT_SYMMETRIC_6: 5548 case ALGORITHM_LEFT_SYMMETRIC_6:
5550 new_layout = ALGORITHM_LEFT_SYMMETRIC; 5549 new_layout = ALGORITHM_LEFT_SYMMETRIC;
5551 break; 5550 break;
5552 case ALGORITHM_RIGHT_SYMMETRIC_6: 5551 case ALGORITHM_RIGHT_SYMMETRIC_6:
5553 new_layout = ALGORITHM_RIGHT_SYMMETRIC; 5552 new_layout = ALGORITHM_RIGHT_SYMMETRIC;
5554 break; 5553 break;
5555 case ALGORITHM_PARITY_0_6: 5554 case ALGORITHM_PARITY_0_6:
5556 new_layout = ALGORITHM_PARITY_0; 5555 new_layout = ALGORITHM_PARITY_0;
5557 break; 5556 break;
5558 case ALGORITHM_PARITY_N: 5557 case ALGORITHM_PARITY_N:
5559 new_layout = ALGORITHM_PARITY_N; 5558 new_layout = ALGORITHM_PARITY_N;
5560 break; 5559 break;
5561 default: 5560 default:
5562 return ERR_PTR(-EINVAL); 5561 return ERR_PTR(-EINVAL);
5563 } 5562 }
5564 mddev->new_level = 5; 5563 mddev->new_level = 5;
5565 mddev->new_layout = new_layout; 5564 mddev->new_layout = new_layout;
5566 mddev->delta_disks = -1; 5565 mddev->delta_disks = -1;
5567 mddev->raid_disks -= 1; 5566 mddev->raid_disks -= 1;
5568 return setup_conf(mddev); 5567 return setup_conf(mddev);
5569 } 5568 }
5570 5569
5571 5570
5572 static int raid5_check_reshape(mddev_t *mddev) 5571 static int raid5_check_reshape(mddev_t *mddev)
5573 { 5572 {
5574 /* For a 2-drive array, the layout and chunk size can be changed 5573 /* For a 2-drive array, the layout and chunk size can be changed
5575 * immediately as no restriping is needed. 5574 * immediately as no restriping is needed.
5576 * For larger arrays we record the new value - after validation 5575 * For larger arrays we record the new value - after validation
5577 * to be used by a reshape pass. 5576 * to be used by a reshape pass.
5578 */ 5577 */
5579 raid5_conf_t *conf = mddev->private; 5578 raid5_conf_t *conf = mddev->private;
5580 int new_chunk = mddev->new_chunk_sectors; 5579 int new_chunk = mddev->new_chunk_sectors;
5581 5580
5582 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout)) 5581 if (mddev->new_layout >= 0 && !algorithm_valid_raid5(mddev->new_layout))
5583 return -EINVAL; 5582 return -EINVAL;
5584 if (new_chunk > 0) { 5583 if (new_chunk > 0) {
5585 if (!is_power_of_2(new_chunk)) 5584 if (!is_power_of_2(new_chunk))
5586 return -EINVAL; 5585 return -EINVAL;
5587 if (new_chunk < (PAGE_SIZE>>9)) 5586 if (new_chunk < (PAGE_SIZE>>9))
5588 return -EINVAL; 5587 return -EINVAL;
5589 if (mddev->array_sectors & (new_chunk-1)) 5588 if (mddev->array_sectors & (new_chunk-1))
5590 /* not factor of array size */ 5589 /* not factor of array size */
5591 return -EINVAL; 5590 return -EINVAL;
5592 } 5591 }
5593 5592
5594 /* They look valid */ 5593 /* They look valid */
5595 5594
5596 if (mddev->raid_disks == 2) { 5595 if (mddev->raid_disks == 2) {
5597 /* can make the change immediately */ 5596 /* can make the change immediately */
5598 if (mddev->new_layout >= 0) { 5597 if (mddev->new_layout >= 0) {
5599 conf->algorithm = mddev->new_layout; 5598 conf->algorithm = mddev->new_layout;
5600 mddev->layout = mddev->new_layout; 5599 mddev->layout = mddev->new_layout;
5601 } 5600 }
5602 if (new_chunk > 0) { 5601 if (new_chunk > 0) {
5603 conf->chunk_sectors = new_chunk; 5602 conf->chunk_sectors = new_chunk;
5604 mddev->chunk_sectors = new_chunk; 5603 mddev->chunk_sectors = new_chunk;
5605 } 5604 }
5606 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5605 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5607 md_wakeup_thread(mddev->thread); 5606 md_wakeup_thread(mddev->thread);
5608 } 5607 }
5609 return check_reshape(mddev); 5608 return check_reshape(mddev);
5610 } 5609 }
5611 5610
5612 static int raid6_check_reshape(mddev_t *mddev) 5611 static int raid6_check_reshape(mddev_t *mddev)
5613 { 5612 {
5614 int new_chunk = mddev->new_chunk_sectors; 5613 int new_chunk = mddev->new_chunk_sectors;
5615 5614
5616 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout)) 5615 if (mddev->new_layout >= 0 && !algorithm_valid_raid6(mddev->new_layout))
5617 return -EINVAL; 5616 return -EINVAL;
5618 if (new_chunk > 0) { 5617 if (new_chunk > 0) {
5619 if (!is_power_of_2(new_chunk)) 5618 if (!is_power_of_2(new_chunk))
5620 return -EINVAL; 5619 return -EINVAL;
5621 if (new_chunk < (PAGE_SIZE >> 9)) 5620 if (new_chunk < (PAGE_SIZE >> 9))
5622 return -EINVAL; 5621 return -EINVAL;
5623 if (mddev->array_sectors & (new_chunk-1)) 5622 if (mddev->array_sectors & (new_chunk-1))
5624 /* not factor of array size */ 5623 /* not factor of array size */
5625 return -EINVAL; 5624 return -EINVAL;
5626 } 5625 }
5627 5626
5628 /* They look valid */ 5627 /* They look valid */
5629 return check_reshape(mddev); 5628 return check_reshape(mddev);
5630 } 5629 }
5631 5630
5632 static void *raid5_takeover(mddev_t *mddev) 5631 static void *raid5_takeover(mddev_t *mddev)
5633 { 5632 {
5634 /* raid5 can take over: 5633 /* raid5 can take over:
5635 * raid0 - if all devices are the same - make it a raid4 layout 5634 * raid0 - if all devices are the same - make it a raid4 layout
5636 * raid1 - if there are two drives. We need to know the chunk size 5635 * raid1 - if there are two drives. We need to know the chunk size
5637 * raid4 - trivial - just use a raid4 layout. 5636 * raid4 - trivial - just use a raid4 layout.
5638 * raid6 - Provided it is a *_6 layout 5637 * raid6 - Provided it is a *_6 layout
5639 */ 5638 */
5640 5639
5641 if (mddev->level == 1) 5640 if (mddev->level == 1)
5642 return raid5_takeover_raid1(mddev); 5641 return raid5_takeover_raid1(mddev);
5643 if (mddev->level == 4) { 5642 if (mddev->level == 4) {
5644 mddev->new_layout = ALGORITHM_PARITY_N; 5643 mddev->new_layout = ALGORITHM_PARITY_N;
5645 mddev->new_level = 5; 5644 mddev->new_level = 5;
5646 return setup_conf(mddev); 5645 return setup_conf(mddev);
5647 } 5646 }
5648 if (mddev->level == 6) 5647 if (mddev->level == 6)
5649 return raid5_takeover_raid6(mddev); 5648 return raid5_takeover_raid6(mddev);
5650 5649
5651 return ERR_PTR(-EINVAL); 5650 return ERR_PTR(-EINVAL);
5652 } 5651 }
5653 5652
5654 5653
5655 static struct mdk_personality raid5_personality; 5654 static struct mdk_personality raid5_personality;
5656 5655
5657 static void *raid6_takeover(mddev_t *mddev) 5656 static void *raid6_takeover(mddev_t *mddev)
5658 { 5657 {
5659 /* Currently can only take over a raid5. We map the 5658 /* Currently can only take over a raid5. We map the
5660 * personality to an equivalent raid6 personality 5659 * personality to an equivalent raid6 personality
5661 * with the Q block at the end. 5660 * with the Q block at the end.
5662 */ 5661 */
5663 int new_layout; 5662 int new_layout;
5664 5663
5665 if (mddev->pers != &raid5_personality) 5664 if (mddev->pers != &raid5_personality)
5666 return ERR_PTR(-EINVAL); 5665 return ERR_PTR(-EINVAL);
5667 if (mddev->degraded > 1) 5666 if (mddev->degraded > 1)
5668 return ERR_PTR(-EINVAL); 5667 return ERR_PTR(-EINVAL);
5669 if (mddev->raid_disks > 253) 5668 if (mddev->raid_disks > 253)
5670 return ERR_PTR(-EINVAL); 5669 return ERR_PTR(-EINVAL);
5671 if (mddev->raid_disks < 3) 5670 if (mddev->raid_disks < 3)
5672 return ERR_PTR(-EINVAL); 5671 return ERR_PTR(-EINVAL);
5673 5672
5674 switch (mddev->layout) { 5673 switch (mddev->layout) {
5675 case ALGORITHM_LEFT_ASYMMETRIC: 5674 case ALGORITHM_LEFT_ASYMMETRIC:
5676 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6; 5675 new_layout = ALGORITHM_LEFT_ASYMMETRIC_6;
5677 break; 5676 break;
5678 case ALGORITHM_RIGHT_ASYMMETRIC: 5677 case ALGORITHM_RIGHT_ASYMMETRIC:
5679 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6; 5678 new_layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
5680 break; 5679 break;
5681 case ALGORITHM_LEFT_SYMMETRIC: 5680 case ALGORITHM_LEFT_SYMMETRIC:
5682 new_layout = ALGORITHM_LEFT_SYMMETRIC_6; 5681 new_layout = ALGORITHM_LEFT_SYMMETRIC_6;
5683 break; 5682 break;
5684 case ALGORITHM_RIGHT_SYMMETRIC: 5683 case ALGORITHM_RIGHT_SYMMETRIC:
5685 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6; 5684 new_layout = ALGORITHM_RIGHT_SYMMETRIC_6;
5686 break; 5685 break;
5687 case ALGORITHM_PARITY_0: 5686 case ALGORITHM_PARITY_0:
5688 new_layout = ALGORITHM_PARITY_0_6; 5687 new_layout = ALGORITHM_PARITY_0_6;
5689 break; 5688 break;
5690 case ALGORITHM_PARITY_N: 5689 case ALGORITHM_PARITY_N:
5691 new_layout = ALGORITHM_PARITY_N; 5690 new_layout = ALGORITHM_PARITY_N;
5692 break; 5691 break;
5693 default: 5692 default:
5694 return ERR_PTR(-EINVAL); 5693 return ERR_PTR(-EINVAL);
5695 } 5694 }
5696 mddev->new_level = 6; 5695 mddev->new_level = 6;
5697 mddev->new_layout = new_layout; 5696 mddev->new_layout = new_layout;
5698 mddev->delta_disks = 1; 5697 mddev->delta_disks = 1;
5699 mddev->raid_disks += 1; 5698 mddev->raid_disks += 1;
5700 return setup_conf(mddev); 5699 return setup_conf(mddev);
5701 } 5700 }
5702 5701
5703 5702
5704 static struct mdk_personality raid6_personality = 5703 static struct mdk_personality raid6_personality =
5705 { 5704 {
5706 .name = "raid6", 5705 .name = "raid6",
5707 .level = 6, 5706 .level = 6,
5708 .owner = THIS_MODULE, 5707 .owner = THIS_MODULE,
5709 .make_request = make_request, 5708 .make_request = make_request,
5710 .run = run, 5709 .run = run,
5711 .stop = stop, 5710 .stop = stop,
5712 .status = status, 5711 .status = status,
5713 .error_handler = error, 5712 .error_handler = error,
5714 .hot_add_disk = raid5_add_disk, 5713 .hot_add_disk = raid5_add_disk,
5715 .hot_remove_disk = raid5_remove_disk, 5714 .hot_remove_disk = raid5_remove_disk,
5716 .spare_active = raid5_spare_active, 5715 .spare_active = raid5_spare_active,
5717 .sync_request = sync_request, 5716 .sync_request = sync_request,
5718 .resize = raid5_resize, 5717 .resize = raid5_resize,
5719 .size = raid5_size, 5718 .size = raid5_size,
5720 .check_reshape = raid6_check_reshape, 5719 .check_reshape = raid6_check_reshape,
5721 .start_reshape = raid5_start_reshape, 5720 .start_reshape = raid5_start_reshape,
5722 .finish_reshape = raid5_finish_reshape, 5721 .finish_reshape = raid5_finish_reshape,
5723 .quiesce = raid5_quiesce, 5722 .quiesce = raid5_quiesce,
5724 .takeover = raid6_takeover, 5723 .takeover = raid6_takeover,
5725 }; 5724 };
5726 static struct mdk_personality raid5_personality = 5725 static struct mdk_personality raid5_personality =
5727 { 5726 {
5728 .name = "raid5", 5727 .name = "raid5",
5729 .level = 5, 5728 .level = 5,
5730 .owner = THIS_MODULE, 5729 .owner = THIS_MODULE,
5731 .make_request = make_request, 5730 .make_request = make_request,
5732 .run = run, 5731 .run = run,
5733 .stop = stop, 5732 .stop = stop,
5734 .status = status, 5733 .status = status,
5735 .error_handler = error, 5734 .error_handler = error,
5736 .hot_add_disk = raid5_add_disk, 5735 .hot_add_disk = raid5_add_disk,
5737 .hot_remove_disk = raid5_remove_disk, 5736 .hot_remove_disk = raid5_remove_disk,
5738 .spare_active = raid5_spare_active, 5737 .spare_active = raid5_spare_active,
5739 .sync_request = sync_request, 5738 .sync_request = sync_request,
5740 .resize = raid5_resize, 5739 .resize = raid5_resize,
5741 .size = raid5_size, 5740 .size = raid5_size,
5742 .check_reshape = raid5_check_reshape, 5741 .check_reshape = raid5_check_reshape,
5743 .start_reshape = raid5_start_reshape, 5742 .start_reshape = raid5_start_reshape,
5744 .finish_reshape = raid5_finish_reshape, 5743 .finish_reshape = raid5_finish_reshape,
5745 .quiesce = raid5_quiesce, 5744 .quiesce = raid5_quiesce,
5746 .takeover = raid5_takeover, 5745 .takeover = raid5_takeover,
5747 }; 5746 };
5748 5747
5749 static struct mdk_personality raid4_personality = 5748 static struct mdk_personality raid4_personality =
5750 { 5749 {
5751 .name = "raid4", 5750 .name = "raid4",
5752 .level = 4, 5751 .level = 4,
5753 .owner = THIS_MODULE, 5752 .owner = THIS_MODULE,
5754 .make_request = make_request, 5753 .make_request = make_request,
5755 .run = run, 5754 .run = run,
5756 .stop = stop, 5755 .stop = stop,
5757 .status = status, 5756 .status = status,
5758 .error_handler = error, 5757 .error_handler = error,
5759 .hot_add_disk = raid5_add_disk, 5758 .hot_add_disk = raid5_add_disk,
5760 .hot_remove_disk = raid5_remove_disk, 5759 .hot_remove_disk = raid5_remove_disk,
5761 .spare_active = raid5_spare_active, 5760 .spare_active = raid5_spare_active,
5762 .sync_request = sync_request, 5761 .sync_request = sync_request,
5763 .resize = raid5_resize, 5762 .resize = raid5_resize,
5764 .size = raid5_size, 5763 .size = raid5_size,
5765 .check_reshape = raid5_check_reshape, 5764 .check_reshape = raid5_check_reshape,
5766 .start_reshape = raid5_start_reshape, 5765 .start_reshape = raid5_start_reshape,
5767 .finish_reshape = raid5_finish_reshape, 5766 .finish_reshape = raid5_finish_reshape,
5768 .quiesce = raid5_quiesce, 5767 .quiesce = raid5_quiesce,
5769 }; 5768 };
5770 5769
5771 static int __init raid5_init(void) 5770 static int __init raid5_init(void)
5772 { 5771 {
5773 register_md_personality(&raid6_personality); 5772 register_md_personality(&raid6_personality);
5774 register_md_personality(&raid5_personality); 5773 register_md_personality(&raid5_personality);
5775 register_md_personality(&raid4_personality); 5774 register_md_personality(&raid4_personality);
5776 return 0; 5775 return 0;
5777 } 5776 }
5778 5777
5779 static void raid5_exit(void) 5778 static void raid5_exit(void)
5780 { 5779 {
5781 unregister_md_personality(&raid6_personality); 5780 unregister_md_personality(&raid6_personality);
5782 unregister_md_personality(&raid5_personality); 5781 unregister_md_personality(&raid5_personality);
5783 unregister_md_personality(&raid4_personality); 5782 unregister_md_personality(&raid4_personality);
5784 } 5783 }
5785 5784
5786 module_init(raid5_init); 5785 module_init(raid5_init);
5787 module_exit(raid5_exit); 5786 module_exit(raid5_exit);
5788 MODULE_LICENSE("GPL"); 5787 MODULE_LICENSE("GPL");
5789 MODULE_ALIAS("md-personality-4"); /* RAID5 */ 5788 MODULE_ALIAS("md-personality-4"); /* RAID5 */
5790 MODULE_ALIAS("md-raid5"); 5789 MODULE_ALIAS("md-raid5");
5791 MODULE_ALIAS("md-raid4"); 5790 MODULE_ALIAS("md-raid4");
5792 MODULE_ALIAS("md-level-5"); 5791 MODULE_ALIAS("md-level-5");
5793 MODULE_ALIAS("md-level-4"); 5792 MODULE_ALIAS("md-level-4");
5794 MODULE_ALIAS("md-personality-8"); /* RAID6 */ 5793 MODULE_ALIAS("md-personality-8"); /* RAID6 */
5795 MODULE_ALIAS("md-raid6"); 5794 MODULE_ALIAS("md-raid6");
5796 MODULE_ALIAS("md-level-6"); 5795 MODULE_ALIAS("md-level-6");