Commit a91d5ac04841ca1be340e8610e6d899fc8b419b5

Authored by Jonathan Brassow
Committed by NeilBrown
1 parent b6d428c669

MD: Export 'md_reap_sync_thread' function

Make 'md_reap_sync_thread' available to other files, specifically dm-raid.c.
- rename reap_sync_thread to md_reap_sync_thread
- move the fn after md_check_recovery to match md.h declaration placement
- export md_reap_sync_thread

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.de>
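
For readers coming from dm-raid, here is a minimal sketch of what this change amounts to, assuming the usual md conventions. The exact declaration and call site live in the full diff (only the unchanged top of md.c is reproduced below); the helper name comes from the commit message, while the signature, the export macro, and the dm-raid.c caller shown here are illustrative assumptions, not lines from this patch.

    /* md.h -- declaration made visible to other md components (assumed signature) */
    extern void md_reap_sync_thread(struct mddev *mddev);

    /* md.c -- the symbol is exported so modules such as dm-raid can link against it */
    EXPORT_SYMBOL(md_reap_sync_thread);

    /* dm-raid.c (hypothetical caller): after asking the sync thread to stop,
     * reap it so the recovery state is cleaned up under the usual locking.
     */
    static void example_stop_sync(struct mddev *mddev)
    {
            set_bit(MD_RECOVERY_INTR, &mddev->recovery);
            if (mddev->sync_thread)
                    md_reap_sync_thread(mddev);
    }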

Showing 2 changed files with 50 additions and 50 deletions

1 /* 1 /*
2 md.c : Multiple Devices driver for Linux 2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 4
5 completely rewritten, based on the MD driver code from Marc Zyngier 5 completely rewritten, based on the MD driver code from Marc Zyngier
6 6
7 Changes: 7 Changes:
8 8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin 13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic 17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization): 18 RAID code (such as request based resynchronization):
19 19
20 Neil Brown <neilb@cse.unsw.edu.au>. 20 Neil Brown <neilb@cse.unsw.edu.au>.
21 21
22 - persistent bitmap code 22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 24
25 This program is free software; you can redistribute it and/or modify 25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by 26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option) 27 the Free Software Foundation; either version 2, or (at your option)
28 any later version. 28 any later version.
29 29
30 You should have received a copy of the GNU General Public License 30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free 31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */ 33 */
34 34
35 #include <linux/kthread.h> 35 #include <linux/kthread.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h> 38 #include <linux/seq_file.h>
39 #include <linux/fs.h> 39 #include <linux/fs.h>
40 #include <linux/poll.h> 40 #include <linux/poll.h>
41 #include <linux/ctype.h> 41 #include <linux/ctype.h>
42 #include <linux/string.h> 42 #include <linux/string.h>
43 #include <linux/hdreg.h> 43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h> 44 #include <linux/proc_fs.h>
45 #include <linux/random.h> 45 #include <linux/random.h>
46 #include <linux/module.h> 46 #include <linux/module.h>
47 #include <linux/reboot.h> 47 #include <linux/reboot.h>
48 #include <linux/file.h> 48 #include <linux/file.h>
49 #include <linux/compat.h> 49 #include <linux/compat.h>
50 #include <linux/delay.h> 50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h> 51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h> 52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h> 53 #include <linux/slab.h>
54 #include "md.h" 54 #include "md.h"
55 #include "bitmap.h" 55 #include "bitmap.h"
56 56
57 #ifndef MODULE 57 #ifndef MODULE
58 static void autostart_arrays(int part); 58 static void autostart_arrays(int part);
59 #endif 59 #endif
60 60
61 /* pers_list is a list of registered personalities protected 61 /* pers_list is a list of registered personalities protected
62 * by pers_lock. 62 * by pers_lock.
63 * pers_lock does extra service to protect accesses to 63 * pers_lock does extra service to protect accesses to
64 * mddev->thread when the mutex cannot be held. 64 * mddev->thread when the mutex cannot be held.
65 */ 65 */
66 static LIST_HEAD(pers_list); 66 static LIST_HEAD(pers_list);
67 static DEFINE_SPINLOCK(pers_lock); 67 static DEFINE_SPINLOCK(pers_lock);
68 68
69 static void md_print_devices(void); 69 static void md_print_devices(void);
70 70
71 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 71 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
72 static struct workqueue_struct *md_wq; 72 static struct workqueue_struct *md_wq;
73 static struct workqueue_struct *md_misc_wq; 73 static struct workqueue_struct *md_misc_wq;
74 74
75 static int remove_and_add_spares(struct mddev *mddev, 75 static int remove_and_add_spares(struct mddev *mddev,
76 struct md_rdev *this); 76 struct md_rdev *this);
77 77
78 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 78 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
79 79
80 /* 80 /*
81 * Default number of read corrections we'll attempt on an rdev 81 * Default number of read corrections we'll attempt on an rdev
82 * before ejecting it from the array. We divide the read error 82 * before ejecting it from the array. We divide the read error
83 * count by 2 for every hour elapsed between read errors. 83 * count by 2 for every hour elapsed between read errors.
84 */ 84 */
85 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 85 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
86 /* 86 /*
87 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 87 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
88 * is 1000 KB/sec, so the extra system load does not show up that much. 88 * is 1000 KB/sec, so the extra system load does not show up that much.
89 * Increase it if you want to have more _guaranteed_ speed. Note that 89 * Increase it if you want to have more _guaranteed_ speed. Note that
90 * the RAID driver will use the maximum available bandwidth if the IO 90 * the RAID driver will use the maximum available bandwidth if the IO
91 * subsystem is idle. There is also an 'absolute maximum' reconstruction 91 * subsystem is idle. There is also an 'absolute maximum' reconstruction
92 * speed limit - in case reconstruction slows down your system despite 92 * speed limit - in case reconstruction slows down your system despite
93 * idle IO detection. 93 * idle IO detection.
94 * 94 *
95 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 95 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
96 * or /sys/block/mdX/md/sync_speed_{min,max} 96 * or /sys/block/mdX/md/sync_speed_{min,max}
97 */ 97 */
98 98
99 static int sysctl_speed_limit_min = 1000; 99 static int sysctl_speed_limit_min = 1000;
100 static int sysctl_speed_limit_max = 200000; 100 static int sysctl_speed_limit_max = 200000;
101 static inline int speed_min(struct mddev *mddev) 101 static inline int speed_min(struct mddev *mddev)
102 { 102 {
103 return mddev->sync_speed_min ? 103 return mddev->sync_speed_min ?
104 mddev->sync_speed_min : sysctl_speed_limit_min; 104 mddev->sync_speed_min : sysctl_speed_limit_min;
105 } 105 }
106 106
107 static inline int speed_max(struct mddev *mddev) 107 static inline int speed_max(struct mddev *mddev)
108 { 108 {
109 return mddev->sync_speed_max ? 109 return mddev->sync_speed_max ?
110 mddev->sync_speed_max : sysctl_speed_limit_max; 110 mddev->sync_speed_max : sysctl_speed_limit_max;
111 } 111 }
112 112
113 static struct ctl_table_header *raid_table_header; 113 static struct ctl_table_header *raid_table_header;
114 114
115 static ctl_table raid_table[] = { 115 static ctl_table raid_table[] = {
116 { 116 {
117 .procname = "speed_limit_min", 117 .procname = "speed_limit_min",
118 .data = &sysctl_speed_limit_min, 118 .data = &sysctl_speed_limit_min,
119 .maxlen = sizeof(int), 119 .maxlen = sizeof(int),
120 .mode = S_IRUGO|S_IWUSR, 120 .mode = S_IRUGO|S_IWUSR,
121 .proc_handler = proc_dointvec, 121 .proc_handler = proc_dointvec,
122 }, 122 },
123 { 123 {
124 .procname = "speed_limit_max", 124 .procname = "speed_limit_max",
125 .data = &sysctl_speed_limit_max, 125 .data = &sysctl_speed_limit_max,
126 .maxlen = sizeof(int), 126 .maxlen = sizeof(int),
127 .mode = S_IRUGO|S_IWUSR, 127 .mode = S_IRUGO|S_IWUSR,
128 .proc_handler = proc_dointvec, 128 .proc_handler = proc_dointvec,
129 }, 129 },
130 { } 130 { }
131 }; 131 };
132 132
133 static ctl_table raid_dir_table[] = { 133 static ctl_table raid_dir_table[] = {
134 { 134 {
135 .procname = "raid", 135 .procname = "raid",
136 .maxlen = 0, 136 .maxlen = 0,
137 .mode = S_IRUGO|S_IXUGO, 137 .mode = S_IRUGO|S_IXUGO,
138 .child = raid_table, 138 .child = raid_table,
139 }, 139 },
140 { } 140 { }
141 }; 141 };
142 142
143 static ctl_table raid_root_table[] = { 143 static ctl_table raid_root_table[] = {
144 { 144 {
145 .procname = "dev", 145 .procname = "dev",
146 .maxlen = 0, 146 .maxlen = 0,
147 .mode = 0555, 147 .mode = 0555,
148 .child = raid_dir_table, 148 .child = raid_dir_table,
149 }, 149 },
150 { } 150 { }
151 }; 151 };
152 152
153 static const struct block_device_operations md_fops; 153 static const struct block_device_operations md_fops;
154 154
155 static int start_readonly; 155 static int start_readonly;
156 156
157 /* bio_clone_mddev 157 /* bio_clone_mddev
158 * like bio_clone, but with a local bio set 158 * like bio_clone, but with a local bio set
159 */ 159 */
160 160
161 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 161 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
162 struct mddev *mddev) 162 struct mddev *mddev)
163 { 163 {
164 struct bio *b; 164 struct bio *b;
165 165
166 if (!mddev || !mddev->bio_set) 166 if (!mddev || !mddev->bio_set)
167 return bio_alloc(gfp_mask, nr_iovecs); 167 return bio_alloc(gfp_mask, nr_iovecs);
168 168
169 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); 169 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
170 if (!b) 170 if (!b)
171 return NULL; 171 return NULL;
172 return b; 172 return b;
173 } 173 }
174 EXPORT_SYMBOL_GPL(bio_alloc_mddev); 174 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
175 175
176 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 176 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
177 struct mddev *mddev) 177 struct mddev *mddev)
178 { 178 {
179 if (!mddev || !mddev->bio_set) 179 if (!mddev || !mddev->bio_set)
180 return bio_clone(bio, gfp_mask); 180 return bio_clone(bio, gfp_mask);
181 181
182 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); 182 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
183 } 183 }
184 EXPORT_SYMBOL_GPL(bio_clone_mddev); 184 EXPORT_SYMBOL_GPL(bio_clone_mddev);
185 185
186 void md_trim_bio(struct bio *bio, int offset, int size) 186 void md_trim_bio(struct bio *bio, int offset, int size)
187 { 187 {
188 /* 'bio' is a cloned bio which we need to trim to match 188 /* 'bio' is a cloned bio which we need to trim to match
189 * the given offset and size. 189 * the given offset and size.
190 * This requires adjusting bi_sector, bi_size, and bi_io_vec 190 * This requires adjusting bi_sector, bi_size, and bi_io_vec
191 */ 191 */
192 int i; 192 int i;
193 struct bio_vec *bvec; 193 struct bio_vec *bvec;
194 int sofar = 0; 194 int sofar = 0;
195 195
196 size <<= 9; 196 size <<= 9;
197 if (offset == 0 && size == bio->bi_size) 197 if (offset == 0 && size == bio->bi_size)
198 return; 198 return;
199 199
200 bio->bi_sector += offset; 200 bio->bi_sector += offset;
201 bio->bi_size = size; 201 bio->bi_size = size;
202 offset <<= 9; 202 offset <<= 9;
203 clear_bit(BIO_SEG_VALID, &bio->bi_flags); 203 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
204 204
205 while (bio->bi_idx < bio->bi_vcnt && 205 while (bio->bi_idx < bio->bi_vcnt &&
206 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { 206 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
207 /* remove this whole bio_vec */ 207 /* remove this whole bio_vec */
208 offset -= bio->bi_io_vec[bio->bi_idx].bv_len; 208 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
209 bio->bi_idx++; 209 bio->bi_idx++;
210 } 210 }
211 if (bio->bi_idx < bio->bi_vcnt) { 211 if (bio->bi_idx < bio->bi_vcnt) {
212 bio->bi_io_vec[bio->bi_idx].bv_offset += offset; 212 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
213 bio->bi_io_vec[bio->bi_idx].bv_len -= offset; 213 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
214 } 214 }
215 /* avoid any complications with bi_idx being non-zero*/ 215 /* avoid any complications with bi_idx being non-zero*/
216 if (bio->bi_idx) { 216 if (bio->bi_idx) {
217 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, 217 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
218 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); 218 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
219 bio->bi_vcnt -= bio->bi_idx; 219 bio->bi_vcnt -= bio->bi_idx;
220 bio->bi_idx = 0; 220 bio->bi_idx = 0;
221 } 221 }
222 /* Make sure vcnt and last bv are not too big */ 222 /* Make sure vcnt and last bv are not too big */
223 bio_for_each_segment(bvec, bio, i) { 223 bio_for_each_segment(bvec, bio, i) {
224 if (sofar + bvec->bv_len > size) 224 if (sofar + bvec->bv_len > size)
225 bvec->bv_len = size - sofar; 225 bvec->bv_len = size - sofar;
226 if (bvec->bv_len == 0) { 226 if (bvec->bv_len == 0) {
227 bio->bi_vcnt = i; 227 bio->bi_vcnt = i;
228 break; 228 break;
229 } 229 }
230 sofar += bvec->bv_len; 230 sofar += bvec->bv_len;
231 } 231 }
232 } 232 }
233 EXPORT_SYMBOL_GPL(md_trim_bio); 233 EXPORT_SYMBOL_GPL(md_trim_bio);
234 234
235 /* 235 /*
236 * We have a system wide 'event count' that is incremented 236 * We have a system wide 'event count' that is incremented
237 * on any 'interesting' event, and readers of /proc/mdstat 237 * on any 'interesting' event, and readers of /proc/mdstat
238 * can use 'poll' or 'select' to find out when the event 238 * can use 'poll' or 'select' to find out when the event
239 * count increases. 239 * count increases.
240 * 240 *
241 * Events are: 241 * Events are:
242 * start array, stop array, error, add device, remove device, 242 * start array, stop array, error, add device, remove device,
243 * start build, activate spare 243 * start build, activate spare
244 */ 244 */
245 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 245 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
246 static atomic_t md_event_count; 246 static atomic_t md_event_count;
247 void md_new_event(struct mddev *mddev) 247 void md_new_event(struct mddev *mddev)
248 { 248 {
249 atomic_inc(&md_event_count); 249 atomic_inc(&md_event_count);
250 wake_up(&md_event_waiters); 250 wake_up(&md_event_waiters);
251 } 251 }
252 EXPORT_SYMBOL_GPL(md_new_event); 252 EXPORT_SYMBOL_GPL(md_new_event);
253 253
254 /* Alternate version that can be called from interrupts 254 /* Alternate version that can be called from interrupts
255 * when calling sysfs_notify isn't needed. 255 * when calling sysfs_notify isn't needed.
256 */ 256 */
257 static void md_new_event_inintr(struct mddev *mddev) 257 static void md_new_event_inintr(struct mddev *mddev)
258 { 258 {
259 atomic_inc(&md_event_count); 259 atomic_inc(&md_event_count);
260 wake_up(&md_event_waiters); 260 wake_up(&md_event_waiters);
261 } 261 }
262 262
263 /* 263 /*
264 * Enables to iterate over all existing md arrays 264 * Enables to iterate over all existing md arrays
265 * all_mddevs_lock protects this list. 265 * all_mddevs_lock protects this list.
266 */ 266 */
267 static LIST_HEAD(all_mddevs); 267 static LIST_HEAD(all_mddevs);
268 static DEFINE_SPINLOCK(all_mddevs_lock); 268 static DEFINE_SPINLOCK(all_mddevs_lock);
269 269
270 270
271 /* 271 /*
272 * iterates through all used mddevs in the system. 272 * iterates through all used mddevs in the system.
273 * We take care to grab the all_mddevs_lock whenever navigating 273 * We take care to grab the all_mddevs_lock whenever navigating
274 * the list, and to always hold a refcount when unlocked. 274 * the list, and to always hold a refcount when unlocked.
275 * Any code which breaks out of this loop while own 275 * Any code which breaks out of this loop while own
276 * a reference to the current mddev and must mddev_put it. 276 * a reference to the current mddev and must mddev_put it.
277 */ 277 */
278 #define for_each_mddev(_mddev,_tmp) \ 278 #define for_each_mddev(_mddev,_tmp) \
279 \ 279 \
280 for (({ spin_lock(&all_mddevs_lock); \ 280 for (({ spin_lock(&all_mddevs_lock); \
281 _tmp = all_mddevs.next; \ 281 _tmp = all_mddevs.next; \
282 _mddev = NULL;}); \ 282 _mddev = NULL;}); \
283 ({ if (_tmp != &all_mddevs) \ 283 ({ if (_tmp != &all_mddevs) \
284 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 284 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
285 spin_unlock(&all_mddevs_lock); \ 285 spin_unlock(&all_mddevs_lock); \
286 if (_mddev) mddev_put(_mddev); \ 286 if (_mddev) mddev_put(_mddev); \
287 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 287 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
288 _tmp != &all_mddevs;}); \ 288 _tmp != &all_mddevs;}); \
289 ({ spin_lock(&all_mddevs_lock); \ 289 ({ spin_lock(&all_mddevs_lock); \
290 _tmp = _tmp->next;}) \ 290 _tmp = _tmp->next;}) \
291 ) 291 )
292 292
293 293
294 /* Rather than calling directly into the personality make_request function, 294 /* Rather than calling directly into the personality make_request function,
295 * IO requests come here first so that we can check if the device is 295 * IO requests come here first so that we can check if the device is
296 * being suspended pending a reconfiguration. 296 * being suspended pending a reconfiguration.
297 * We hold a refcount over the call to ->make_request. By the time that 297 * We hold a refcount over the call to ->make_request. By the time that
298 * call has finished, the bio has been linked into some internal structure 298 * call has finished, the bio has been linked into some internal structure
299 * and so is visible to ->quiesce(), so we don't need the refcount any more. 299 * and so is visible to ->quiesce(), so we don't need the refcount any more.
300 */ 300 */
301 static void md_make_request(struct request_queue *q, struct bio *bio) 301 static void md_make_request(struct request_queue *q, struct bio *bio)
302 { 302 {
303 const int rw = bio_data_dir(bio); 303 const int rw = bio_data_dir(bio);
304 struct mddev *mddev = q->queuedata; 304 struct mddev *mddev = q->queuedata;
305 int cpu; 305 int cpu;
306 unsigned int sectors; 306 unsigned int sectors;
307 307
308 if (mddev == NULL || mddev->pers == NULL 308 if (mddev == NULL || mddev->pers == NULL
309 || !mddev->ready) { 309 || !mddev->ready) {
310 bio_io_error(bio); 310 bio_io_error(bio);
311 return; 311 return;
312 } 312 }
313 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 313 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
314 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); 314 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
315 return; 315 return;
316 } 316 }
317 smp_rmb(); /* Ensure implications of 'active' are visible */ 317 smp_rmb(); /* Ensure implications of 'active' are visible */
318 rcu_read_lock(); 318 rcu_read_lock();
319 if (mddev->suspended) { 319 if (mddev->suspended) {
320 DEFINE_WAIT(__wait); 320 DEFINE_WAIT(__wait);
321 for (;;) { 321 for (;;) {
322 prepare_to_wait(&mddev->sb_wait, &__wait, 322 prepare_to_wait(&mddev->sb_wait, &__wait,
323 TASK_UNINTERRUPTIBLE); 323 TASK_UNINTERRUPTIBLE);
324 if (!mddev->suspended) 324 if (!mddev->suspended)
325 break; 325 break;
326 rcu_read_unlock(); 326 rcu_read_unlock();
327 schedule(); 327 schedule();
328 rcu_read_lock(); 328 rcu_read_lock();
329 } 329 }
330 finish_wait(&mddev->sb_wait, &__wait); 330 finish_wait(&mddev->sb_wait, &__wait);
331 } 331 }
332 atomic_inc(&mddev->active_io); 332 atomic_inc(&mddev->active_io);
333 rcu_read_unlock(); 333 rcu_read_unlock();
334 334
335 /* 335 /*
336 * save the sectors now since our bio can 336 * save the sectors now since our bio can
337 * go away inside make_request 337 * go away inside make_request
338 */ 338 */
339 sectors = bio_sectors(bio); 339 sectors = bio_sectors(bio);
340 mddev->pers->make_request(mddev, bio); 340 mddev->pers->make_request(mddev, bio);
341 341
342 cpu = part_stat_lock(); 342 cpu = part_stat_lock();
343 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 343 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
344 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); 344 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
345 part_stat_unlock(); 345 part_stat_unlock();
346 346
347 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 347 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
348 wake_up(&mddev->sb_wait); 348 wake_up(&mddev->sb_wait);
349 } 349 }
350 350
351 /* mddev_suspend makes sure no new requests are submitted 351 /* mddev_suspend makes sure no new requests are submitted
352 * to the device, and that any requests that have been submitted 352 * to the device, and that any requests that have been submitted
353 * are completely handled. 353 * are completely handled.
354 * Once ->stop is called and completes, the module will be completely 354 * Once ->stop is called and completes, the module will be completely
355 * unused. 355 * unused.
356 */ 356 */
357 void mddev_suspend(struct mddev *mddev) 357 void mddev_suspend(struct mddev *mddev)
358 { 358 {
359 BUG_ON(mddev->suspended); 359 BUG_ON(mddev->suspended);
360 mddev->suspended = 1; 360 mddev->suspended = 1;
361 synchronize_rcu(); 361 synchronize_rcu();
362 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 362 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
363 mddev->pers->quiesce(mddev, 1); 363 mddev->pers->quiesce(mddev, 1);
364 364
365 del_timer_sync(&mddev->safemode_timer); 365 del_timer_sync(&mddev->safemode_timer);
366 } 366 }
367 EXPORT_SYMBOL_GPL(mddev_suspend); 367 EXPORT_SYMBOL_GPL(mddev_suspend);
368 368
369 void mddev_resume(struct mddev *mddev) 369 void mddev_resume(struct mddev *mddev)
370 { 370 {
371 mddev->suspended = 0; 371 mddev->suspended = 0;
372 wake_up(&mddev->sb_wait); 372 wake_up(&mddev->sb_wait);
373 mddev->pers->quiesce(mddev, 0); 373 mddev->pers->quiesce(mddev, 0);
374 374
375 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 375 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
376 md_wakeup_thread(mddev->thread); 376 md_wakeup_thread(mddev->thread);
377 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 377 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
378 } 378 }
379 EXPORT_SYMBOL_GPL(mddev_resume); 379 EXPORT_SYMBOL_GPL(mddev_resume);
380 380
381 int mddev_congested(struct mddev *mddev, int bits) 381 int mddev_congested(struct mddev *mddev, int bits)
382 { 382 {
383 return mddev->suspended; 383 return mddev->suspended;
384 } 384 }
385 EXPORT_SYMBOL(mddev_congested); 385 EXPORT_SYMBOL(mddev_congested);
386 386
387 /* 387 /*
388 * Generic flush handling for md 388 * Generic flush handling for md
389 */ 389 */
390 390
391 static void md_end_flush(struct bio *bio, int err) 391 static void md_end_flush(struct bio *bio, int err)
392 { 392 {
393 struct md_rdev *rdev = bio->bi_private; 393 struct md_rdev *rdev = bio->bi_private;
394 struct mddev *mddev = rdev->mddev; 394 struct mddev *mddev = rdev->mddev;
395 395
396 rdev_dec_pending(rdev, mddev); 396 rdev_dec_pending(rdev, mddev);
397 397
398 if (atomic_dec_and_test(&mddev->flush_pending)) { 398 if (atomic_dec_and_test(&mddev->flush_pending)) {
399 /* The pre-request flush has finished */ 399 /* The pre-request flush has finished */
400 queue_work(md_wq, &mddev->flush_work); 400 queue_work(md_wq, &mddev->flush_work);
401 } 401 }
402 bio_put(bio); 402 bio_put(bio);
403 } 403 }
404 404
405 static void md_submit_flush_data(struct work_struct *ws); 405 static void md_submit_flush_data(struct work_struct *ws);
406 406
407 static void submit_flushes(struct work_struct *ws) 407 static void submit_flushes(struct work_struct *ws)
408 { 408 {
409 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 409 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
410 struct md_rdev *rdev; 410 struct md_rdev *rdev;
411 411
412 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 412 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
413 atomic_set(&mddev->flush_pending, 1); 413 atomic_set(&mddev->flush_pending, 1);
414 rcu_read_lock(); 414 rcu_read_lock();
415 rdev_for_each_rcu(rdev, mddev) 415 rdev_for_each_rcu(rdev, mddev)
416 if (rdev->raid_disk >= 0 && 416 if (rdev->raid_disk >= 0 &&
417 !test_bit(Faulty, &rdev->flags)) { 417 !test_bit(Faulty, &rdev->flags)) {
418 /* Take two references, one is dropped 418 /* Take two references, one is dropped
419 * when request finishes, one after 419 * when request finishes, one after
420 * we reclaim rcu_read_lock 420 * we reclaim rcu_read_lock
421 */ 421 */
422 struct bio *bi; 422 struct bio *bi;
423 atomic_inc(&rdev->nr_pending); 423 atomic_inc(&rdev->nr_pending);
424 atomic_inc(&rdev->nr_pending); 424 atomic_inc(&rdev->nr_pending);
425 rcu_read_unlock(); 425 rcu_read_unlock();
426 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 426 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
427 bi->bi_end_io = md_end_flush; 427 bi->bi_end_io = md_end_flush;
428 bi->bi_private = rdev; 428 bi->bi_private = rdev;
429 bi->bi_bdev = rdev->bdev; 429 bi->bi_bdev = rdev->bdev;
430 atomic_inc(&mddev->flush_pending); 430 atomic_inc(&mddev->flush_pending);
431 submit_bio(WRITE_FLUSH, bi); 431 submit_bio(WRITE_FLUSH, bi);
432 rcu_read_lock(); 432 rcu_read_lock();
433 rdev_dec_pending(rdev, mddev); 433 rdev_dec_pending(rdev, mddev);
434 } 434 }
435 rcu_read_unlock(); 435 rcu_read_unlock();
436 if (atomic_dec_and_test(&mddev->flush_pending)) 436 if (atomic_dec_and_test(&mddev->flush_pending))
437 queue_work(md_wq, &mddev->flush_work); 437 queue_work(md_wq, &mddev->flush_work);
438 } 438 }
439 439
440 static void md_submit_flush_data(struct work_struct *ws) 440 static void md_submit_flush_data(struct work_struct *ws)
441 { 441 {
442 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 442 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
443 struct bio *bio = mddev->flush_bio; 443 struct bio *bio = mddev->flush_bio;
444 444
445 if (bio->bi_size == 0) 445 if (bio->bi_size == 0)
446 /* an empty barrier - all done */ 446 /* an empty barrier - all done */
447 bio_endio(bio, 0); 447 bio_endio(bio, 0);
448 else { 448 else {
449 bio->bi_rw &= ~REQ_FLUSH; 449 bio->bi_rw &= ~REQ_FLUSH;
450 mddev->pers->make_request(mddev, bio); 450 mddev->pers->make_request(mddev, bio);
451 } 451 }
452 452
453 mddev->flush_bio = NULL; 453 mddev->flush_bio = NULL;
454 wake_up(&mddev->sb_wait); 454 wake_up(&mddev->sb_wait);
455 } 455 }
456 456
457 void md_flush_request(struct mddev *mddev, struct bio *bio) 457 void md_flush_request(struct mddev *mddev, struct bio *bio)
458 { 458 {
459 spin_lock_irq(&mddev->write_lock); 459 spin_lock_irq(&mddev->write_lock);
460 wait_event_lock_irq(mddev->sb_wait, 460 wait_event_lock_irq(mddev->sb_wait,
461 !mddev->flush_bio, 461 !mddev->flush_bio,
462 mddev->write_lock); 462 mddev->write_lock);
463 mddev->flush_bio = bio; 463 mddev->flush_bio = bio;
464 spin_unlock_irq(&mddev->write_lock); 464 spin_unlock_irq(&mddev->write_lock);
465 465
466 INIT_WORK(&mddev->flush_work, submit_flushes); 466 INIT_WORK(&mddev->flush_work, submit_flushes);
467 queue_work(md_wq, &mddev->flush_work); 467 queue_work(md_wq, &mddev->flush_work);
468 } 468 }
469 EXPORT_SYMBOL(md_flush_request); 469 EXPORT_SYMBOL(md_flush_request);
470 470
471 void md_unplug(struct blk_plug_cb *cb, bool from_schedule) 471 void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
472 { 472 {
473 struct mddev *mddev = cb->data; 473 struct mddev *mddev = cb->data;
474 md_wakeup_thread(mddev->thread); 474 md_wakeup_thread(mddev->thread);
475 kfree(cb); 475 kfree(cb);
476 } 476 }
477 EXPORT_SYMBOL(md_unplug); 477 EXPORT_SYMBOL(md_unplug);
478 478
479 static inline struct mddev *mddev_get(struct mddev *mddev) 479 static inline struct mddev *mddev_get(struct mddev *mddev)
480 { 480 {
481 atomic_inc(&mddev->active); 481 atomic_inc(&mddev->active);
482 return mddev; 482 return mddev;
483 } 483 }
484 484
485 static void mddev_delayed_delete(struct work_struct *ws); 485 static void mddev_delayed_delete(struct work_struct *ws);
486 486
487 static void mddev_put(struct mddev *mddev) 487 static void mddev_put(struct mddev *mddev)
488 { 488 {
489 struct bio_set *bs = NULL; 489 struct bio_set *bs = NULL;
490 490
491 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 491 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
492 return; 492 return;
493 if (!mddev->raid_disks && list_empty(&mddev->disks) && 493 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
494 mddev->ctime == 0 && !mddev->hold_active) { 494 mddev->ctime == 0 && !mddev->hold_active) {
495 /* Array is not configured at all, and not held active, 495 /* Array is not configured at all, and not held active,
496 * so destroy it */ 496 * so destroy it */
497 list_del_init(&mddev->all_mddevs); 497 list_del_init(&mddev->all_mddevs);
498 bs = mddev->bio_set; 498 bs = mddev->bio_set;
499 mddev->bio_set = NULL; 499 mddev->bio_set = NULL;
500 if (mddev->gendisk) { 500 if (mddev->gendisk) {
501 /* We did a probe so need to clean up. Call 501 /* We did a probe so need to clean up. Call
502 * queue_work inside the spinlock so that 502 * queue_work inside the spinlock so that
503 * flush_workqueue() after mddev_find will 503 * flush_workqueue() after mddev_find will
504 * succeed in waiting for the work to be done. 504 * succeed in waiting for the work to be done.
505 */ 505 */
506 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 506 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
507 queue_work(md_misc_wq, &mddev->del_work); 507 queue_work(md_misc_wq, &mddev->del_work);
508 } else 508 } else
509 kfree(mddev); 509 kfree(mddev);
510 } 510 }
511 spin_unlock(&all_mddevs_lock); 511 spin_unlock(&all_mddevs_lock);
512 if (bs) 512 if (bs)
513 bioset_free(bs); 513 bioset_free(bs);
514 } 514 }
515 515
516 void mddev_init(struct mddev *mddev) 516 void mddev_init(struct mddev *mddev)
517 { 517 {
518 mutex_init(&mddev->open_mutex); 518 mutex_init(&mddev->open_mutex);
519 mutex_init(&mddev->reconfig_mutex); 519 mutex_init(&mddev->reconfig_mutex);
520 mutex_init(&mddev->bitmap_info.mutex); 520 mutex_init(&mddev->bitmap_info.mutex);
521 INIT_LIST_HEAD(&mddev->disks); 521 INIT_LIST_HEAD(&mddev->disks);
522 INIT_LIST_HEAD(&mddev->all_mddevs); 522 INIT_LIST_HEAD(&mddev->all_mddevs);
523 init_timer(&mddev->safemode_timer); 523 init_timer(&mddev->safemode_timer);
524 atomic_set(&mddev->active, 1); 524 atomic_set(&mddev->active, 1);
525 atomic_set(&mddev->openers, 0); 525 atomic_set(&mddev->openers, 0);
526 atomic_set(&mddev->active_io, 0); 526 atomic_set(&mddev->active_io, 0);
527 spin_lock_init(&mddev->write_lock); 527 spin_lock_init(&mddev->write_lock);
528 atomic_set(&mddev->flush_pending, 0); 528 atomic_set(&mddev->flush_pending, 0);
529 init_waitqueue_head(&mddev->sb_wait); 529 init_waitqueue_head(&mddev->sb_wait);
530 init_waitqueue_head(&mddev->recovery_wait); 530 init_waitqueue_head(&mddev->recovery_wait);
531 mddev->reshape_position = MaxSector; 531 mddev->reshape_position = MaxSector;
532 mddev->reshape_backwards = 0; 532 mddev->reshape_backwards = 0;
533 mddev->resync_min = 0; 533 mddev->resync_min = 0;
534 mddev->resync_max = MaxSector; 534 mddev->resync_max = MaxSector;
535 mddev->level = LEVEL_NONE; 535 mddev->level = LEVEL_NONE;
536 } 536 }
537 EXPORT_SYMBOL_GPL(mddev_init); 537 EXPORT_SYMBOL_GPL(mddev_init);
538 538
539 static struct mddev * mddev_find(dev_t unit) 539 static struct mddev * mddev_find(dev_t unit)
540 { 540 {
541 struct mddev *mddev, *new = NULL; 541 struct mddev *mddev, *new = NULL;
542 542
543 if (unit && MAJOR(unit) != MD_MAJOR) 543 if (unit && MAJOR(unit) != MD_MAJOR)
544 unit &= ~((1<<MdpMinorShift)-1); 544 unit &= ~((1<<MdpMinorShift)-1);
545 545
546 retry: 546 retry:
547 spin_lock(&all_mddevs_lock); 547 spin_lock(&all_mddevs_lock);
548 548
549 if (unit) { 549 if (unit) {
550 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 550 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
551 if (mddev->unit == unit) { 551 if (mddev->unit == unit) {
552 mddev_get(mddev); 552 mddev_get(mddev);
553 spin_unlock(&all_mddevs_lock); 553 spin_unlock(&all_mddevs_lock);
554 kfree(new); 554 kfree(new);
555 return mddev; 555 return mddev;
556 } 556 }
557 557
558 if (new) { 558 if (new) {
559 list_add(&new->all_mddevs, &all_mddevs); 559 list_add(&new->all_mddevs, &all_mddevs);
560 spin_unlock(&all_mddevs_lock); 560 spin_unlock(&all_mddevs_lock);
561 new->hold_active = UNTIL_IOCTL; 561 new->hold_active = UNTIL_IOCTL;
562 return new; 562 return new;
563 } 563 }
564 } else if (new) { 564 } else if (new) {
565 /* find an unused unit number */ 565 /* find an unused unit number */
566 static int next_minor = 512; 566 static int next_minor = 512;
567 int start = next_minor; 567 int start = next_minor;
568 int is_free = 0; 568 int is_free = 0;
569 int dev = 0; 569 int dev = 0;
570 while (!is_free) { 570 while (!is_free) {
571 dev = MKDEV(MD_MAJOR, next_minor); 571 dev = MKDEV(MD_MAJOR, next_minor);
572 next_minor++; 572 next_minor++;
573 if (next_minor > MINORMASK) 573 if (next_minor > MINORMASK)
574 next_minor = 0; 574 next_minor = 0;
575 if (next_minor == start) { 575 if (next_minor == start) {
576 /* Oh dear, all in use. */ 576 /* Oh dear, all in use. */
577 spin_unlock(&all_mddevs_lock); 577 spin_unlock(&all_mddevs_lock);
578 kfree(new); 578 kfree(new);
579 return NULL; 579 return NULL;
580 } 580 }
581 581
582 is_free = 1; 582 is_free = 1;
583 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 583 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
584 if (mddev->unit == dev) { 584 if (mddev->unit == dev) {
585 is_free = 0; 585 is_free = 0;
586 break; 586 break;
587 } 587 }
588 } 588 }
589 new->unit = dev; 589 new->unit = dev;
590 new->md_minor = MINOR(dev); 590 new->md_minor = MINOR(dev);
591 new->hold_active = UNTIL_STOP; 591 new->hold_active = UNTIL_STOP;
592 list_add(&new->all_mddevs, &all_mddevs); 592 list_add(&new->all_mddevs, &all_mddevs);
593 spin_unlock(&all_mddevs_lock); 593 spin_unlock(&all_mddevs_lock);
594 return new; 594 return new;
595 } 595 }
596 spin_unlock(&all_mddevs_lock); 596 spin_unlock(&all_mddevs_lock);
597 597
598 new = kzalloc(sizeof(*new), GFP_KERNEL); 598 new = kzalloc(sizeof(*new), GFP_KERNEL);
599 if (!new) 599 if (!new)
600 return NULL; 600 return NULL;
601 601
602 new->unit = unit; 602 new->unit = unit;
603 if (MAJOR(unit) == MD_MAJOR) 603 if (MAJOR(unit) == MD_MAJOR)
604 new->md_minor = MINOR(unit); 604 new->md_minor = MINOR(unit);
605 else 605 else
606 new->md_minor = MINOR(unit) >> MdpMinorShift; 606 new->md_minor = MINOR(unit) >> MdpMinorShift;
607 607
608 mddev_init(new); 608 mddev_init(new);
609 609
610 goto retry; 610 goto retry;
611 } 611 }
612 612
613 static inline int mddev_lock(struct mddev * mddev) 613 static inline int mddev_lock(struct mddev * mddev)
614 { 614 {
615 return mutex_lock_interruptible(&mddev->reconfig_mutex); 615 return mutex_lock_interruptible(&mddev->reconfig_mutex);
616 } 616 }
617 617
618 static inline int mddev_is_locked(struct mddev *mddev) 618 static inline int mddev_is_locked(struct mddev *mddev)
619 { 619 {
620 return mutex_is_locked(&mddev->reconfig_mutex); 620 return mutex_is_locked(&mddev->reconfig_mutex);
621 } 621 }
622 622
623 static inline int mddev_trylock(struct mddev * mddev) 623 static inline int mddev_trylock(struct mddev * mddev)
624 { 624 {
625 return mutex_trylock(&mddev->reconfig_mutex); 625 return mutex_trylock(&mddev->reconfig_mutex);
626 } 626 }
627 627
628 static struct attribute_group md_redundancy_group; 628 static struct attribute_group md_redundancy_group;
629 629
630 static void mddev_unlock(struct mddev * mddev) 630 static void mddev_unlock(struct mddev * mddev)
631 { 631 {
632 if (mddev->to_remove) { 632 if (mddev->to_remove) {
633 /* These cannot be removed under reconfig_mutex as 633 /* These cannot be removed under reconfig_mutex as
634 * an access to the files will try to take reconfig_mutex 634 * an access to the files will try to take reconfig_mutex
635 * while holding the file unremovable, which leads to 635 * while holding the file unremovable, which leads to
636 * a deadlock. 636 * a deadlock.
637 * So hold set sysfs_active while the remove in happeing, 637 * So hold set sysfs_active while the remove in happeing,
638 * and anything else which might set ->to_remove or my 638 * and anything else which might set ->to_remove or my
639 * otherwise change the sysfs namespace will fail with 639 * otherwise change the sysfs namespace will fail with
640 * -EBUSY if sysfs_active is still set. 640 * -EBUSY if sysfs_active is still set.
641 * We set sysfs_active under reconfig_mutex and elsewhere 641 * We set sysfs_active under reconfig_mutex and elsewhere
642 * test it under the same mutex to ensure its correct value 642 * test it under the same mutex to ensure its correct value
643 * is seen. 643 * is seen.
644 */ 644 */
645 struct attribute_group *to_remove = mddev->to_remove; 645 struct attribute_group *to_remove = mddev->to_remove;
646 mddev->to_remove = NULL; 646 mddev->to_remove = NULL;
647 mddev->sysfs_active = 1; 647 mddev->sysfs_active = 1;
648 mutex_unlock(&mddev->reconfig_mutex); 648 mutex_unlock(&mddev->reconfig_mutex);
649 649
650 if (mddev->kobj.sd) { 650 if (mddev->kobj.sd) {
651 if (to_remove != &md_redundancy_group) 651 if (to_remove != &md_redundancy_group)
652 sysfs_remove_group(&mddev->kobj, to_remove); 652 sysfs_remove_group(&mddev->kobj, to_remove);
653 if (mddev->pers == NULL || 653 if (mddev->pers == NULL ||
654 mddev->pers->sync_request == NULL) { 654 mddev->pers->sync_request == NULL) {
655 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 655 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
656 if (mddev->sysfs_action) 656 if (mddev->sysfs_action)
657 sysfs_put(mddev->sysfs_action); 657 sysfs_put(mddev->sysfs_action);
658 mddev->sysfs_action = NULL; 658 mddev->sysfs_action = NULL;
659 } 659 }
660 } 660 }
661 mddev->sysfs_active = 0; 661 mddev->sysfs_active = 0;
662 } else 662 } else
663 mutex_unlock(&mddev->reconfig_mutex); 663 mutex_unlock(&mddev->reconfig_mutex);
664 664
665 /* As we've dropped the mutex we need a spinlock to 665 /* As we've dropped the mutex we need a spinlock to
666 * make sure the thread doesn't disappear 666 * make sure the thread doesn't disappear
667 */ 667 */
668 spin_lock(&pers_lock); 668 spin_lock(&pers_lock);
669 md_wakeup_thread(mddev->thread); 669 md_wakeup_thread(mddev->thread);
670 spin_unlock(&pers_lock); 670 spin_unlock(&pers_lock);
671 } 671 }
672 672
673 static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) 673 static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
674 { 674 {
675 struct md_rdev *rdev; 675 struct md_rdev *rdev;
676 676
677 rdev_for_each(rdev, mddev) 677 rdev_for_each(rdev, mddev)
678 if (rdev->desc_nr == nr) 678 if (rdev->desc_nr == nr)
679 return rdev; 679 return rdev;
680 680
681 return NULL; 681 return NULL;
682 } 682 }
683 683
684 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 684 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
685 { 685 {
686 struct md_rdev *rdev; 686 struct md_rdev *rdev;
687 687
688 rdev_for_each_rcu(rdev, mddev) 688 rdev_for_each_rcu(rdev, mddev)
689 if (rdev->desc_nr == nr) 689 if (rdev->desc_nr == nr)
690 return rdev; 690 return rdev;
691 691
692 return NULL; 692 return NULL;
693 } 693 }
694 694
695 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 695 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
696 { 696 {
697 struct md_rdev *rdev; 697 struct md_rdev *rdev;
698 698
699 rdev_for_each(rdev, mddev) 699 rdev_for_each(rdev, mddev)
700 if (rdev->bdev->bd_dev == dev) 700 if (rdev->bdev->bd_dev == dev)
701 return rdev; 701 return rdev;
702 702
703 return NULL; 703 return NULL;
704 } 704 }
705 705
706 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) 706 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
707 { 707 {
708 struct md_rdev *rdev; 708 struct md_rdev *rdev;
709 709
710 rdev_for_each_rcu(rdev, mddev) 710 rdev_for_each_rcu(rdev, mddev)
711 if (rdev->bdev->bd_dev == dev) 711 if (rdev->bdev->bd_dev == dev)
712 return rdev; 712 return rdev;
713 713
714 return NULL; 714 return NULL;
715 } 715 }
716 716
717 static struct md_personality *find_pers(int level, char *clevel) 717 static struct md_personality *find_pers(int level, char *clevel)
718 { 718 {
719 struct md_personality *pers; 719 struct md_personality *pers;
720 list_for_each_entry(pers, &pers_list, list) { 720 list_for_each_entry(pers, &pers_list, list) {
721 if (level != LEVEL_NONE && pers->level == level) 721 if (level != LEVEL_NONE && pers->level == level)
722 return pers; 722 return pers;
723 if (strcmp(pers->name, clevel)==0) 723 if (strcmp(pers->name, clevel)==0)
724 return pers; 724 return pers;
725 } 725 }
726 return NULL; 726 return NULL;
727 } 727 }
728 728
729 /* return the offset of the super block in 512byte sectors */ 729 /* return the offset of the super block in 512byte sectors */
730 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 730 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
731 { 731 {
732 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 732 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
733 return MD_NEW_SIZE_SECTORS(num_sectors); 733 return MD_NEW_SIZE_SECTORS(num_sectors);
734 } 734 }
735 735
736 static int alloc_disk_sb(struct md_rdev * rdev) 736 static int alloc_disk_sb(struct md_rdev * rdev)
737 { 737 {
738 if (rdev->sb_page) 738 if (rdev->sb_page)
739 MD_BUG(); 739 MD_BUG();
740 740
741 rdev->sb_page = alloc_page(GFP_KERNEL); 741 rdev->sb_page = alloc_page(GFP_KERNEL);
742 if (!rdev->sb_page) { 742 if (!rdev->sb_page) {
743 printk(KERN_ALERT "md: out of memory.\n"); 743 printk(KERN_ALERT "md: out of memory.\n");
744 return -ENOMEM; 744 return -ENOMEM;
745 } 745 }
746 746
747 return 0; 747 return 0;
748 } 748 }
749 749
750 void md_rdev_clear(struct md_rdev *rdev) 750 void md_rdev_clear(struct md_rdev *rdev)
751 { 751 {
752 if (rdev->sb_page) { 752 if (rdev->sb_page) {
753 put_page(rdev->sb_page); 753 put_page(rdev->sb_page);
754 rdev->sb_loaded = 0; 754 rdev->sb_loaded = 0;
755 rdev->sb_page = NULL; 755 rdev->sb_page = NULL;
756 rdev->sb_start = 0; 756 rdev->sb_start = 0;
757 rdev->sectors = 0; 757 rdev->sectors = 0;
758 } 758 }
759 if (rdev->bb_page) { 759 if (rdev->bb_page) {
760 put_page(rdev->bb_page); 760 put_page(rdev->bb_page);
761 rdev->bb_page = NULL; 761 rdev->bb_page = NULL;
762 } 762 }
763 kfree(rdev->badblocks.page); 763 kfree(rdev->badblocks.page);
764 rdev->badblocks.page = NULL; 764 rdev->badblocks.page = NULL;
765 } 765 }
766 EXPORT_SYMBOL_GPL(md_rdev_clear); 766 EXPORT_SYMBOL_GPL(md_rdev_clear);
767 767
768 static void super_written(struct bio *bio, int error) 768 static void super_written(struct bio *bio, int error)
769 { 769 {
770 struct md_rdev *rdev = bio->bi_private; 770 struct md_rdev *rdev = bio->bi_private;
771 struct mddev *mddev = rdev->mddev; 771 struct mddev *mddev = rdev->mddev;
772 772
773 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 773 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
774 printk("md: super_written gets error=%d, uptodate=%d\n", 774 printk("md: super_written gets error=%d, uptodate=%d\n",
775 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 775 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
776 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 776 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
777 md_error(mddev, rdev); 777 md_error(mddev, rdev);
778 } 778 }
779 779
780 if (atomic_dec_and_test(&mddev->pending_writes)) 780 if (atomic_dec_and_test(&mddev->pending_writes))
781 wake_up(&mddev->sb_wait); 781 wake_up(&mddev->sb_wait);
782 bio_put(bio); 782 bio_put(bio);
783 } 783 }
784 784
785 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 785 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
786 sector_t sector, int size, struct page *page) 786 sector_t sector, int size, struct page *page)
787 { 787 {
788 /* write first size bytes of page to sector of rdev 788 /* write first size bytes of page to sector of rdev
789 * Increment mddev->pending_writes before returning 789 * Increment mddev->pending_writes before returning
790 * and decrement it on completion, waking up sb_wait 790 * and decrement it on completion, waking up sb_wait
791 * if zero is reached. 791 * if zero is reached.
792 * If an error occurred, call md_error 792 * If an error occurred, call md_error
793 */ 793 */
794 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 794 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
795 795
796 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 796 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
797 bio->bi_sector = sector; 797 bio->bi_sector = sector;
798 bio_add_page(bio, page, size, 0); 798 bio_add_page(bio, page, size, 0);
799 bio->bi_private = rdev; 799 bio->bi_private = rdev;
800 bio->bi_end_io = super_written; 800 bio->bi_end_io = super_written;
801 801
802 atomic_inc(&mddev->pending_writes); 802 atomic_inc(&mddev->pending_writes);
803 submit_bio(WRITE_FLUSH_FUA, bio); 803 submit_bio(WRITE_FLUSH_FUA, bio);
804 } 804 }
805 805
806 void md_super_wait(struct mddev *mddev) 806 void md_super_wait(struct mddev *mddev)
807 { 807 {
808 /* wait for all superblock writes that were scheduled to complete */ 808 /* wait for all superblock writes that were scheduled to complete */
809 DEFINE_WAIT(wq); 809 DEFINE_WAIT(wq);
810 for(;;) { 810 for(;;) {
811 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 811 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
812 if (atomic_read(&mddev->pending_writes)==0) 812 if (atomic_read(&mddev->pending_writes)==0)
813 break; 813 break;
814 schedule(); 814 schedule();
815 } 815 }
816 finish_wait(&mddev->sb_wait, &wq); 816 finish_wait(&mddev->sb_wait, &wq);
817 } 817 }
818 818
819 static void bi_complete(struct bio *bio, int error) 819 static void bi_complete(struct bio *bio, int error)
820 { 820 {
821 complete((struct completion*)bio->bi_private); 821 complete((struct completion*)bio->bi_private);
822 } 822 }
823 823
824 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 824 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
825 struct page *page, int rw, bool metadata_op) 825 struct page *page, int rw, bool metadata_op)
826 { 826 {
827 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 827 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
828 struct completion event; 828 struct completion event;
829 int ret; 829 int ret;
830 830
831 rw |= REQ_SYNC; 831 rw |= REQ_SYNC;
832 832
833 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 833 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
834 rdev->meta_bdev : rdev->bdev; 834 rdev->meta_bdev : rdev->bdev;
835 if (metadata_op) 835 if (metadata_op)
836 bio->bi_sector = sector + rdev->sb_start; 836 bio->bi_sector = sector + rdev->sb_start;
837 else if (rdev->mddev->reshape_position != MaxSector && 837 else if (rdev->mddev->reshape_position != MaxSector &&
838 (rdev->mddev->reshape_backwards == 838 (rdev->mddev->reshape_backwards ==
839 (sector >= rdev->mddev->reshape_position))) 839 (sector >= rdev->mddev->reshape_position)))
840 bio->bi_sector = sector + rdev->new_data_offset; 840 bio->bi_sector = sector + rdev->new_data_offset;
841 else 841 else
842 bio->bi_sector = sector + rdev->data_offset; 842 bio->bi_sector = sector + rdev->data_offset;
843 bio_add_page(bio, page, size, 0); 843 bio_add_page(bio, page, size, 0);
844 init_completion(&event); 844 init_completion(&event);
845 bio->bi_private = &event; 845 bio->bi_private = &event;
846 bio->bi_end_io = bi_complete; 846 bio->bi_end_io = bi_complete;
847 submit_bio(rw, bio); 847 submit_bio(rw, bio);
848 wait_for_completion(&event); 848 wait_for_completion(&event);
849 849
850 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 850 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
851 bio_put(bio); 851 bio_put(bio);
852 return ret; 852 return ret;
853 } 853 }
854 EXPORT_SYMBOL_GPL(sync_page_io); 854 EXPORT_SYMBOL_GPL(sync_page_io);
855 855
856 static int read_disk_sb(struct md_rdev * rdev, int size) 856 static int read_disk_sb(struct md_rdev * rdev, int size)
857 { 857 {
858 char b[BDEVNAME_SIZE]; 858 char b[BDEVNAME_SIZE];
859 if (!rdev->sb_page) { 859 if (!rdev->sb_page) {
860 MD_BUG(); 860 MD_BUG();
861 return -EINVAL; 861 return -EINVAL;
862 } 862 }
863 if (rdev->sb_loaded) 863 if (rdev->sb_loaded)
864 return 0; 864 return 0;
865 865
866 866
867 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) 867 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
868 goto fail; 868 goto fail;
869 rdev->sb_loaded = 1; 869 rdev->sb_loaded = 1;
870 return 0; 870 return 0;
871 871
872 fail: 872 fail:
873 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 873 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
874 bdevname(rdev->bdev,b)); 874 bdevname(rdev->bdev,b));
875 return -EINVAL; 875 return -EINVAL;
876 } 876 }
877 877
878 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 878 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
879 { 879 {
880 return sb1->set_uuid0 == sb2->set_uuid0 && 880 return sb1->set_uuid0 == sb2->set_uuid0 &&
881 sb1->set_uuid1 == sb2->set_uuid1 && 881 sb1->set_uuid1 == sb2->set_uuid1 &&
882 sb1->set_uuid2 == sb2->set_uuid2 && 882 sb1->set_uuid2 == sb2->set_uuid2 &&
883 sb1->set_uuid3 == sb2->set_uuid3; 883 sb1->set_uuid3 == sb2->set_uuid3;
884 } 884 }
885 885
886 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 886 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
887 { 887 {
888 int ret; 888 int ret;
889 mdp_super_t *tmp1, *tmp2; 889 mdp_super_t *tmp1, *tmp2;
890 890
891 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 891 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
892 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 892 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
893 893
894 if (!tmp1 || !tmp2) { 894 if (!tmp1 || !tmp2) {
895 ret = 0; 895 ret = 0;
896 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 896 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
897 goto abort; 897 goto abort;
898 } 898 }
899 899
900 *tmp1 = *sb1; 900 *tmp1 = *sb1;
901 *tmp2 = *sb2; 901 *tmp2 = *sb2;
902 902
903 /* 903 /*
904 * nr_disks is not constant 904 * nr_disks is not constant
905 */ 905 */
906 tmp1->nr_disks = 0; 906 tmp1->nr_disks = 0;
907 tmp2->nr_disks = 0; 907 tmp2->nr_disks = 0;
908 908
909 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 909 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
910 abort: 910 abort:
911 kfree(tmp1); 911 kfree(tmp1);
912 kfree(tmp2); 912 kfree(tmp2);
913 return ret; 913 return ret;
914 } 914 }
915 915
916 916
917 static u32 md_csum_fold(u32 csum) 917 static u32 md_csum_fold(u32 csum)
918 { 918 {
919 csum = (csum & 0xffff) + (csum >> 16); 919 csum = (csum & 0xffff) + (csum >> 16);
920 return (csum & 0xffff) + (csum >> 16); 920 return (csum & 0xffff) + (csum >> 16);
921 } 921 }
922 922
923 static unsigned int calc_sb_csum(mdp_super_t * sb) 923 static unsigned int calc_sb_csum(mdp_super_t * sb)
924 { 924 {
925 u64 newcsum = 0; 925 u64 newcsum = 0;
926 u32 *sb32 = (u32*)sb; 926 u32 *sb32 = (u32*)sb;
927 int i; 927 int i;
928 unsigned int disk_csum, csum; 928 unsigned int disk_csum, csum;
929 929
930 disk_csum = sb->sb_csum; 930 disk_csum = sb->sb_csum;
931 sb->sb_csum = 0; 931 sb->sb_csum = 0;
932 932
933 for (i = 0; i < MD_SB_BYTES/4 ; i++) 933 for (i = 0; i < MD_SB_BYTES/4 ; i++)
934 newcsum += sb32[i]; 934 newcsum += sb32[i];
935 csum = (newcsum & 0xffffffff) + (newcsum>>32); 935 csum = (newcsum & 0xffffffff) + (newcsum>>32);
936 936
937 937
938 #ifdef CONFIG_ALPHA 938 #ifdef CONFIG_ALPHA
939 /* This used to use csum_partial, which was wrong for several 939 /* This used to use csum_partial, which was wrong for several
940 * reasons including that different results are returned on 940 * reasons including that different results are returned on
941 * different architectures. It isn't critical that we get exactly 941 * different architectures. It isn't critical that we get exactly
942 * the same return value as before (we always csum_fold before 942 * the same return value as before (we always csum_fold before
943 * testing, and that removes any differences). However as we 943 * testing, and that removes any differences). However as we
944 * know that csum_partial always returned a 16bit value on 944 * know that csum_partial always returned a 16bit value on
945 * alphas, do a fold to maximise conformity to previous behaviour. 945 * alphas, do a fold to maximise conformity to previous behaviour.
946 */ 946 */
947 sb->sb_csum = md_csum_fold(disk_csum); 947 sb->sb_csum = md_csum_fold(disk_csum);
948 #else 948 #else
949 sb->sb_csum = disk_csum; 949 sb->sb_csum = disk_csum;
950 #endif 950 #endif
951 return csum; 951 return csum;
952 } 952 }
953 953
954 954
955 /* 955 /*
956 * Handle superblock details. 956 * Handle superblock details.
957 * We want to be able to handle multiple superblock formats 957 * We want to be able to handle multiple superblock formats
958 * so we have a common interface to them all, and an array of 958 * so we have a common interface to them all, and an array of
959 * different handlers. 959 * different handlers.
960 * We rely on user-space to write the initial superblock, and support 960 * We rely on user-space to write the initial superblock, and support
961 * reading and updating of superblocks. 961 * reading and updating of superblocks.
962 * Interface methods are: 962 * Interface methods are:
963 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 963 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
964 * loads and validates a superblock on dev. 964 * loads and validates a superblock on dev.
965 * if refdev != NULL, compare superblocks on both devices 965 * if refdev != NULL, compare superblocks on both devices
966 * Return: 966 * Return:
967 * 0 - dev has a superblock that is compatible with refdev 967 * 0 - dev has a superblock that is compatible with refdev
968 * 1 - dev has a superblock that is compatible and newer than refdev 968 * 1 - dev has a superblock that is compatible and newer than refdev
969 * so dev should be used as the refdev in future 969 * so dev should be used as the refdev in future
970 * -EINVAL superblock incompatible or invalid 970 * -EINVAL superblock incompatible or invalid
971 * -othererror e.g. -EIO 971 * -othererror e.g. -EIO
972 * 972 *
973 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 973 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
974 * Verify that dev is acceptable into mddev. 974 * Verify that dev is acceptable into mddev.
975 * The first time, mddev->raid_disks will be 0, and data from 975 * The first time, mddev->raid_disks will be 0, and data from
976 * dev should be merged in. Subsequent calls check that dev 976 * dev should be merged in. Subsequent calls check that dev
977 * is new enough. Return 0 or -EINVAL 977 * is new enough. Return 0 or -EINVAL
978 * 978 *
979 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 979 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
980 * Update the superblock for rdev with data in mddev 980 * Update the superblock for rdev with data in mddev
981 * This does not write to disc. 981 * This does not write to disc.
982 * 982 *
983 */ 983 */
984 984
985 struct super_type { 985 struct super_type {
986 char *name; 986 char *name;
987 struct module *owner; 987 struct module *owner;
988 int (*load_super)(struct md_rdev *rdev, 988 int (*load_super)(struct md_rdev *rdev,
989 struct md_rdev *refdev, 989 struct md_rdev *refdev,
990 int minor_version); 990 int minor_version);
991 int (*validate_super)(struct mddev *mddev, 991 int (*validate_super)(struct mddev *mddev,
992 struct md_rdev *rdev); 992 struct md_rdev *rdev);
993 void (*sync_super)(struct mddev *mddev, 993 void (*sync_super)(struct mddev *mddev,
994 struct md_rdev *rdev); 994 struct md_rdev *rdev);
995 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 995 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
996 sector_t num_sectors); 996 sector_t num_sectors);
997 int (*allow_new_offset)(struct md_rdev *rdev, 997 int (*allow_new_offset)(struct md_rdev *rdev,
998 unsigned long long new_offset); 998 unsigned long long new_offset);
999 }; 999 };
1000 1000
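/*
 * Editor's note (hedged sketch): the two on-disk formats handled below
 * plug into this interface through a table of super_types indexed by
 * mddev->major_version.  The real table appears further down in md.c;
 * it looks essentially like this:
 */
static struct super_type super_types[] = {
	[0] = {
		.name		    = "0.90.0",
		.owner		    = THIS_MODULE,
		.load_super	    = super_90_load,
		.validate_super	    = super_90_validate,
		.sync_super	    = super_90_sync,
		.rdev_size_change   = super_90_rdev_size_change,
		.allow_new_offset   = super_90_allow_new_offset,
	},
	[1] = {
		.name		    = "md-1",
		.owner		    = THIS_MODULE,
		.load_super	    = super_1_load,
		.validate_super	    = super_1_validate,
		.sync_super	    = super_1_sync,
		.rdev_size_change   = super_1_rdev_size_change,
		.allow_new_offset   = super_1_allow_new_offset,
	},
};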
1001 /* 1001 /*
1002 * Check that the given mddev has no bitmap. 1002 * Check that the given mddev has no bitmap.
1003 * 1003 *
1004 * This function is called from the run method of all personalities that do not 1004 * This function is called from the run method of all personalities that do not
1005 * support bitmaps. It prints an error message and returns non-zero if mddev 1005 * support bitmaps. It prints an error message and returns non-zero if mddev
1006 * has a bitmap. Otherwise, it returns 0. 1006 * has a bitmap. Otherwise, it returns 0.
1007 * 1007 *
1008 */ 1008 */
1009 int md_check_no_bitmap(struct mddev *mddev) 1009 int md_check_no_bitmap(struct mddev *mddev)
1010 { 1010 {
1011 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 1011 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
1012 return 0; 1012 return 0;
1013 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 1013 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
1014 mdname(mddev), mddev->pers->name); 1014 mdname(mddev), mddev->pers->name);
1015 return 1; 1015 return 1;
1016 } 1016 }
1017 EXPORT_SYMBOL(md_check_no_bitmap); 1017 EXPORT_SYMBOL(md_check_no_bitmap);
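/*
 * Editor's note (hedged usage sketch): personalities that cannot use a
 * write-intent bitmap call this from their ->run() method; raid0 and
 * linear do roughly the following (function name is illustrative only):
 */
static int example_personality_run(struct mddev *mddev)
{
	if (md_check_no_bitmap(mddev))
		return -EINVAL;
	/* ... personality-specific setup continues here ... */
	return 0;
}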
1018 1018
1019 /* 1019 /*
1020 * load_super for 0.90.0 1020 * load_super for 0.90.0
1021 */ 1021 */
1022 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1022 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1023 { 1023 {
1024 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1024 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1025 mdp_super_t *sb; 1025 mdp_super_t *sb;
1026 int ret; 1026 int ret;
1027 1027
1028 /* 1028 /*
1029 * Calculate the position of the superblock (512byte sectors), 1029 * Calculate the position of the superblock (512byte sectors),
1030 * it's at the end of the disk. 1030 * it's at the end of the disk.
1031 * 1031 *
1032 * It also happens to be a multiple of 4Kb. 1032 * It also happens to be a multiple of 4Kb.
1033 */ 1033 */
1034 rdev->sb_start = calc_dev_sboffset(rdev); 1034 rdev->sb_start = calc_dev_sboffset(rdev);
1035 1035
1036 ret = read_disk_sb(rdev, MD_SB_BYTES); 1036 ret = read_disk_sb(rdev, MD_SB_BYTES);
1037 if (ret) return ret; 1037 if (ret) return ret;
1038 1038
1039 ret = -EINVAL; 1039 ret = -EINVAL;
1040 1040
1041 bdevname(rdev->bdev, b); 1041 bdevname(rdev->bdev, b);
1042 sb = page_address(rdev->sb_page); 1042 sb = page_address(rdev->sb_page);
1043 1043
1044 if (sb->md_magic != MD_SB_MAGIC) { 1044 if (sb->md_magic != MD_SB_MAGIC) {
1045 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 1045 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
1046 b); 1046 b);
1047 goto abort; 1047 goto abort;
1048 } 1048 }
1049 1049
1050 if (sb->major_version != 0 || 1050 if (sb->major_version != 0 ||
1051 sb->minor_version < 90 || 1051 sb->minor_version < 90 ||
1052 sb->minor_version > 91) { 1052 sb->minor_version > 91) {
1053 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 1053 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
1054 sb->major_version, sb->minor_version, 1054 sb->major_version, sb->minor_version,
1055 b); 1055 b);
1056 goto abort; 1056 goto abort;
1057 } 1057 }
1058 1058
1059 if (sb->raid_disks <= 0) 1059 if (sb->raid_disks <= 0)
1060 goto abort; 1060 goto abort;
1061 1061
1062 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1062 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1063 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 1063 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
1064 b); 1064 b);
1065 goto abort; 1065 goto abort;
1066 } 1066 }
1067 1067
1068 rdev->preferred_minor = sb->md_minor; 1068 rdev->preferred_minor = sb->md_minor;
1069 rdev->data_offset = 0; 1069 rdev->data_offset = 0;
1070 rdev->new_data_offset = 0; 1070 rdev->new_data_offset = 0;
1071 rdev->sb_size = MD_SB_BYTES; 1071 rdev->sb_size = MD_SB_BYTES;
1072 rdev->badblocks.shift = -1; 1072 rdev->badblocks.shift = -1;
1073 1073
1074 if (sb->level == LEVEL_MULTIPATH) 1074 if (sb->level == LEVEL_MULTIPATH)
1075 rdev->desc_nr = -1; 1075 rdev->desc_nr = -1;
1076 else 1076 else
1077 rdev->desc_nr = sb->this_disk.number; 1077 rdev->desc_nr = sb->this_disk.number;
1078 1078
1079 if (!refdev) { 1079 if (!refdev) {
1080 ret = 1; 1080 ret = 1;
1081 } else { 1081 } else {
1082 __u64 ev1, ev2; 1082 __u64 ev1, ev2;
1083 mdp_super_t *refsb = page_address(refdev->sb_page); 1083 mdp_super_t *refsb = page_address(refdev->sb_page);
1084 if (!uuid_equal(refsb, sb)) { 1084 if (!uuid_equal(refsb, sb)) {
1085 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1085 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1086 b, bdevname(refdev->bdev,b2)); 1086 b, bdevname(refdev->bdev,b2));
1087 goto abort; 1087 goto abort;
1088 } 1088 }
1089 if (!sb_equal(refsb, sb)) { 1089 if (!sb_equal(refsb, sb)) {
1090 printk(KERN_WARNING "md: %s has same UUID" 1090 printk(KERN_WARNING "md: %s has same UUID"
1091 " but different superblock to %s\n", 1091 " but different superblock to %s\n",
1092 b, bdevname(refdev->bdev, b2)); 1092 b, bdevname(refdev->bdev, b2));
1093 goto abort; 1093 goto abort;
1094 } 1094 }
1095 ev1 = md_event(sb); 1095 ev1 = md_event(sb);
1096 ev2 = md_event(refsb); 1096 ev2 = md_event(refsb);
1097 if (ev1 > ev2) 1097 if (ev1 > ev2)
1098 ret = 1; 1098 ret = 1;
1099 else 1099 else
1100 ret = 0; 1100 ret = 0;
1101 } 1101 }
1102 rdev->sectors = rdev->sb_start; 1102 rdev->sectors = rdev->sb_start;
1103 /* Limit to 4TB as metadata cannot record more than that. 1103 /* Limit to 4TB as metadata cannot record more than that.
1104 * (not needed for Linear and RAID0 as metadata doesn't 1104 * (not needed for Linear and RAID0 as metadata doesn't
1105 * record this size) 1105 * record this size)
1106 */ 1106 */
1107 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1107 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1108 rdev->sectors = (2ULL << 32) - 2; 1108 rdev->sectors = (2ULL << 32) - 2;
1109 1109
1110 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1110 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1111 /* "this cannot possibly happen" ... */ 1111 /* "this cannot possibly happen" ... */
1112 ret = -EINVAL; 1112 ret = -EINVAL;
1113 1113
1114 abort: 1114 abort:
1115 return ret; 1115 return ret;
1116 } 1116 }
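/*
 * Editor's note (hedged sketch): calc_dev_sboffset(), used at the top
 * of super_90_load(), places the 0.90 superblock 64K-aligned and
 * between 64K and 128K from the end of the device.  It is defined
 * earlier in md.c roughly as below, using MD_NEW_SIZE_SECTORS() and
 * MD_RESERVED_SECTORS (64K worth of 512-byte sectors) from the md
 * headers (the "_sketch" suffix marks this as an illustration):
 */
static inline sector_t calc_dev_sboffset_sketch(struct md_rdev *rdev)
{
	sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;

	/* round the size down to a 64K boundary, then back off one 64K block */
	return MD_NEW_SIZE_SECTORS(num_sectors);
}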
1117 1117
1118 /* 1118 /*
1119 * validate_super for 0.90.0 1119 * validate_super for 0.90.0
1120 */ 1120 */
1121 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1121 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1122 { 1122 {
1123 mdp_disk_t *desc; 1123 mdp_disk_t *desc;
1124 mdp_super_t *sb = page_address(rdev->sb_page); 1124 mdp_super_t *sb = page_address(rdev->sb_page);
1125 __u64 ev1 = md_event(sb); 1125 __u64 ev1 = md_event(sb);
1126 1126
1127 rdev->raid_disk = -1; 1127 rdev->raid_disk = -1;
1128 clear_bit(Faulty, &rdev->flags); 1128 clear_bit(Faulty, &rdev->flags);
1129 clear_bit(In_sync, &rdev->flags); 1129 clear_bit(In_sync, &rdev->flags);
1130 clear_bit(WriteMostly, &rdev->flags); 1130 clear_bit(WriteMostly, &rdev->flags);
1131 1131
1132 if (mddev->raid_disks == 0) { 1132 if (mddev->raid_disks == 0) {
1133 mddev->major_version = 0; 1133 mddev->major_version = 0;
1134 mddev->minor_version = sb->minor_version; 1134 mddev->minor_version = sb->minor_version;
1135 mddev->patch_version = sb->patch_version; 1135 mddev->patch_version = sb->patch_version;
1136 mddev->external = 0; 1136 mddev->external = 0;
1137 mddev->chunk_sectors = sb->chunk_size >> 9; 1137 mddev->chunk_sectors = sb->chunk_size >> 9;
1138 mddev->ctime = sb->ctime; 1138 mddev->ctime = sb->ctime;
1139 mddev->utime = sb->utime; 1139 mddev->utime = sb->utime;
1140 mddev->level = sb->level; 1140 mddev->level = sb->level;
1141 mddev->clevel[0] = 0; 1141 mddev->clevel[0] = 0;
1142 mddev->layout = sb->layout; 1142 mddev->layout = sb->layout;
1143 mddev->raid_disks = sb->raid_disks; 1143 mddev->raid_disks = sb->raid_disks;
1144 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1144 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1145 mddev->events = ev1; 1145 mddev->events = ev1;
1146 mddev->bitmap_info.offset = 0; 1146 mddev->bitmap_info.offset = 0;
1147 mddev->bitmap_info.space = 0; 1147 mddev->bitmap_info.space = 0;
1148 /* bitmap can use 60 K after the 4K superblocks */ 1148 /* bitmap can use 60 K after the 4K superblocks */
1149 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1149 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1150 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1150 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1151 mddev->reshape_backwards = 0; 1151 mddev->reshape_backwards = 0;
1152 1152
1153 if (mddev->minor_version >= 91) { 1153 if (mddev->minor_version >= 91) {
1154 mddev->reshape_position = sb->reshape_position; 1154 mddev->reshape_position = sb->reshape_position;
1155 mddev->delta_disks = sb->delta_disks; 1155 mddev->delta_disks = sb->delta_disks;
1156 mddev->new_level = sb->new_level; 1156 mddev->new_level = sb->new_level;
1157 mddev->new_layout = sb->new_layout; 1157 mddev->new_layout = sb->new_layout;
1158 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1158 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1159 if (mddev->delta_disks < 0) 1159 if (mddev->delta_disks < 0)
1160 mddev->reshape_backwards = 1; 1160 mddev->reshape_backwards = 1;
1161 } else { 1161 } else {
1162 mddev->reshape_position = MaxSector; 1162 mddev->reshape_position = MaxSector;
1163 mddev->delta_disks = 0; 1163 mddev->delta_disks = 0;
1164 mddev->new_level = mddev->level; 1164 mddev->new_level = mddev->level;
1165 mddev->new_layout = mddev->layout; 1165 mddev->new_layout = mddev->layout;
1166 mddev->new_chunk_sectors = mddev->chunk_sectors; 1166 mddev->new_chunk_sectors = mddev->chunk_sectors;
1167 } 1167 }
1168 1168
1169 if (sb->state & (1<<MD_SB_CLEAN)) 1169 if (sb->state & (1<<MD_SB_CLEAN))
1170 mddev->recovery_cp = MaxSector; 1170 mddev->recovery_cp = MaxSector;
1171 else { 1171 else {
1172 if (sb->events_hi == sb->cp_events_hi && 1172 if (sb->events_hi == sb->cp_events_hi &&
1173 sb->events_lo == sb->cp_events_lo) { 1173 sb->events_lo == sb->cp_events_lo) {
1174 mddev->recovery_cp = sb->recovery_cp; 1174 mddev->recovery_cp = sb->recovery_cp;
1175 } else 1175 } else
1176 mddev->recovery_cp = 0; 1176 mddev->recovery_cp = 0;
1177 } 1177 }
1178 1178
1179 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1179 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1180 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1180 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1181 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1181 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1182 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1182 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1183 1183
1184 mddev->max_disks = MD_SB_DISKS; 1184 mddev->max_disks = MD_SB_DISKS;
1185 1185
1186 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1186 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1187 mddev->bitmap_info.file == NULL) { 1187 mddev->bitmap_info.file == NULL) {
1188 mddev->bitmap_info.offset = 1188 mddev->bitmap_info.offset =
1189 mddev->bitmap_info.default_offset; 1189 mddev->bitmap_info.default_offset;
1190 mddev->bitmap_info.space = 1190 mddev->bitmap_info.space =
1191 mddev->bitmap_info.space; 1191 mddev->bitmap_info.space;
1192 } 1192 }
1193 1193
1194 } else if (mddev->pers == NULL) { 1194 } else if (mddev->pers == NULL) {
1195 /* Insist on good event counter while assembling, except 1195 /* Insist on good event counter while assembling, except
1196 * for spares (which don't need an event count) */ 1196 * for spares (which don't need an event count) */
1197 ++ev1; 1197 ++ev1;
1198 if (sb->disks[rdev->desc_nr].state & ( 1198 if (sb->disks[rdev->desc_nr].state & (
1199 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1199 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1200 if (ev1 < mddev->events) 1200 if (ev1 < mddev->events)
1201 return -EINVAL; 1201 return -EINVAL;
1202 } else if (mddev->bitmap) { 1202 } else if (mddev->bitmap) {
1203 /* if adding to array with a bitmap, then we can accept an 1203 /* if adding to array with a bitmap, then we can accept an
1204 * older device ... but not too old. 1204 * older device ... but not too old.
1205 */ 1205 */
1206 if (ev1 < mddev->bitmap->events_cleared) 1206 if (ev1 < mddev->bitmap->events_cleared)
1207 return 0; 1207 return 0;
1208 } else { 1208 } else {
1209 if (ev1 < mddev->events) 1209 if (ev1 < mddev->events)
1210 /* just a hot-add of a new device, leave raid_disk at -1 */ 1210 /* just a hot-add of a new device, leave raid_disk at -1 */
1211 return 0; 1211 return 0;
1212 } 1212 }
1213 1213
1214 if (mddev->level != LEVEL_MULTIPATH) { 1214 if (mddev->level != LEVEL_MULTIPATH) {
1215 desc = sb->disks + rdev->desc_nr; 1215 desc = sb->disks + rdev->desc_nr;
1216 1216
1217 if (desc->state & (1<<MD_DISK_FAULTY)) 1217 if (desc->state & (1<<MD_DISK_FAULTY))
1218 set_bit(Faulty, &rdev->flags); 1218 set_bit(Faulty, &rdev->flags);
1219 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1219 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1220 desc->raid_disk < mddev->raid_disks */) { 1220 desc->raid_disk < mddev->raid_disks */) {
1221 set_bit(In_sync, &rdev->flags); 1221 set_bit(In_sync, &rdev->flags);
1222 rdev->raid_disk = desc->raid_disk; 1222 rdev->raid_disk = desc->raid_disk;
1223 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1223 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1224 /* active but not in sync implies recovery up to 1224 /* active but not in sync implies recovery up to
1225 * reshape position. We don't know exactly where 1225 * reshape position. We don't know exactly where
1226 * that is, so set to zero for now */ 1226 * that is, so set to zero for now */
1227 if (mddev->minor_version >= 91) { 1227 if (mddev->minor_version >= 91) {
1228 rdev->recovery_offset = 0; 1228 rdev->recovery_offset = 0;
1229 rdev->raid_disk = desc->raid_disk; 1229 rdev->raid_disk = desc->raid_disk;
1230 } 1230 }
1231 } 1231 }
1232 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1232 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1233 set_bit(WriteMostly, &rdev->flags); 1233 set_bit(WriteMostly, &rdev->flags);
1234 } else /* MULTIPATH are always insync */ 1234 } else /* MULTIPATH are always insync */
1235 set_bit(In_sync, &rdev->flags); 1235 set_bit(In_sync, &rdev->flags);
1236 return 0; 1236 return 0;
1237 } 1237 }
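/*
 * Editor's note (hedged sketch): the ev1/ev2 comparisons in
 * super_90_load() and super_90_validate() use md_event(), defined
 * earlier in md.c, to rebuild the 64-bit event counter from the two
 * 32-bit halves stored in the 0.90 superblock:
 */
static u64 md_event_sketch(mdp_super_t *sb)
{
	u64 ev = sb->events_hi;

	return (ev << 32) | sb->events_lo;
}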
1238 1238
1239 /* 1239 /*
1240 * sync_super for 0.90.0 1240 * sync_super for 0.90.0
1241 */ 1241 */
1242 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1242 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1243 { 1243 {
1244 mdp_super_t *sb; 1244 mdp_super_t *sb;
1245 struct md_rdev *rdev2; 1245 struct md_rdev *rdev2;
1246 int next_spare = mddev->raid_disks; 1246 int next_spare = mddev->raid_disks;
1247 1247
1248 1248
1249 /* make rdev->sb match mddev data.. 1249 /* make rdev->sb match mddev data..
1250 * 1250 *
1251 * 1/ zero out disks 1251 * 1/ zero out disks
1252 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1252 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1253 * 3/ any empty disks < next_spare become removed 1253 * 3/ any empty disks < next_spare become removed
1254 * 1254 *
1255 * disks[0] gets initialised to REMOVED because 1255 * disks[0] gets initialised to REMOVED because
1256 * we cannot be sure from other fields if it has 1256 * we cannot be sure from other fields if it has
1257 * been initialised or not. 1257 * been initialised or not.
1258 */ 1258 */
1259 int i; 1259 int i;
1260 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1260 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1261 1261
1262 rdev->sb_size = MD_SB_BYTES; 1262 rdev->sb_size = MD_SB_BYTES;
1263 1263
1264 sb = page_address(rdev->sb_page); 1264 sb = page_address(rdev->sb_page);
1265 1265
1266 memset(sb, 0, sizeof(*sb)); 1266 memset(sb, 0, sizeof(*sb));
1267 1267
1268 sb->md_magic = MD_SB_MAGIC; 1268 sb->md_magic = MD_SB_MAGIC;
1269 sb->major_version = mddev->major_version; 1269 sb->major_version = mddev->major_version;
1270 sb->patch_version = mddev->patch_version; 1270 sb->patch_version = mddev->patch_version;
1271 sb->gvalid_words = 0; /* ignored */ 1271 sb->gvalid_words = 0; /* ignored */
1272 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1272 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1273 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1273 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1274 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1274 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1275 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1275 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1276 1276
1277 sb->ctime = mddev->ctime; 1277 sb->ctime = mddev->ctime;
1278 sb->level = mddev->level; 1278 sb->level = mddev->level;
1279 sb->size = mddev->dev_sectors / 2; 1279 sb->size = mddev->dev_sectors / 2;
1280 sb->raid_disks = mddev->raid_disks; 1280 sb->raid_disks = mddev->raid_disks;
1281 sb->md_minor = mddev->md_minor; 1281 sb->md_minor = mddev->md_minor;
1282 sb->not_persistent = 0; 1282 sb->not_persistent = 0;
1283 sb->utime = mddev->utime; 1283 sb->utime = mddev->utime;
1284 sb->state = 0; 1284 sb->state = 0;
1285 sb->events_hi = (mddev->events>>32); 1285 sb->events_hi = (mddev->events>>32);
1286 sb->events_lo = (u32)mddev->events; 1286 sb->events_lo = (u32)mddev->events;
1287 1287
1288 if (mddev->reshape_position == MaxSector) 1288 if (mddev->reshape_position == MaxSector)
1289 sb->minor_version = 90; 1289 sb->minor_version = 90;
1290 else { 1290 else {
1291 sb->minor_version = 91; 1291 sb->minor_version = 91;
1292 sb->reshape_position = mddev->reshape_position; 1292 sb->reshape_position = mddev->reshape_position;
1293 sb->new_level = mddev->new_level; 1293 sb->new_level = mddev->new_level;
1294 sb->delta_disks = mddev->delta_disks; 1294 sb->delta_disks = mddev->delta_disks;
1295 sb->new_layout = mddev->new_layout; 1295 sb->new_layout = mddev->new_layout;
1296 sb->new_chunk = mddev->new_chunk_sectors << 9; 1296 sb->new_chunk = mddev->new_chunk_sectors << 9;
1297 } 1297 }
1298 mddev->minor_version = sb->minor_version; 1298 mddev->minor_version = sb->minor_version;
1299 if (mddev->in_sync) 1299 if (mddev->in_sync)
1300 { 1300 {
1301 sb->recovery_cp = mddev->recovery_cp; 1301 sb->recovery_cp = mddev->recovery_cp;
1302 sb->cp_events_hi = (mddev->events>>32); 1302 sb->cp_events_hi = (mddev->events>>32);
1303 sb->cp_events_lo = (u32)mddev->events; 1303 sb->cp_events_lo = (u32)mddev->events;
1304 if (mddev->recovery_cp == MaxSector) 1304 if (mddev->recovery_cp == MaxSector)
1305 sb->state = (1<< MD_SB_CLEAN); 1305 sb->state = (1<< MD_SB_CLEAN);
1306 } else 1306 } else
1307 sb->recovery_cp = 0; 1307 sb->recovery_cp = 0;
1308 1308
1309 sb->layout = mddev->layout; 1309 sb->layout = mddev->layout;
1310 sb->chunk_size = mddev->chunk_sectors << 9; 1310 sb->chunk_size = mddev->chunk_sectors << 9;
1311 1311
1312 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1312 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1313 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1313 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1314 1314
1315 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1315 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1316 rdev_for_each(rdev2, mddev) { 1316 rdev_for_each(rdev2, mddev) {
1317 mdp_disk_t *d; 1317 mdp_disk_t *d;
1318 int desc_nr; 1318 int desc_nr;
1319 int is_active = test_bit(In_sync, &rdev2->flags); 1319 int is_active = test_bit(In_sync, &rdev2->flags);
1320 1320
1321 if (rdev2->raid_disk >= 0 && 1321 if (rdev2->raid_disk >= 0 &&
1322 sb->minor_version >= 91) 1322 sb->minor_version >= 91)
1323 /* we have nowhere to store the recovery_offset, 1323 /* we have nowhere to store the recovery_offset,
1324 * but if it is not below the reshape_position, 1324 * but if it is not below the reshape_position,
1325 * we can piggy-back on that. 1325 * we can piggy-back on that.
1326 */ 1326 */
1327 is_active = 1; 1327 is_active = 1;
1328 if (rdev2->raid_disk < 0 || 1328 if (rdev2->raid_disk < 0 ||
1329 test_bit(Faulty, &rdev2->flags)) 1329 test_bit(Faulty, &rdev2->flags))
1330 is_active = 0; 1330 is_active = 0;
1331 if (is_active) 1331 if (is_active)
1332 desc_nr = rdev2->raid_disk; 1332 desc_nr = rdev2->raid_disk;
1333 else 1333 else
1334 desc_nr = next_spare++; 1334 desc_nr = next_spare++;
1335 rdev2->desc_nr = desc_nr; 1335 rdev2->desc_nr = desc_nr;
1336 d = &sb->disks[rdev2->desc_nr]; 1336 d = &sb->disks[rdev2->desc_nr];
1337 nr_disks++; 1337 nr_disks++;
1338 d->number = rdev2->desc_nr; 1338 d->number = rdev2->desc_nr;
1339 d->major = MAJOR(rdev2->bdev->bd_dev); 1339 d->major = MAJOR(rdev2->bdev->bd_dev);
1340 d->minor = MINOR(rdev2->bdev->bd_dev); 1340 d->minor = MINOR(rdev2->bdev->bd_dev);
1341 if (is_active) 1341 if (is_active)
1342 d->raid_disk = rdev2->raid_disk; 1342 d->raid_disk = rdev2->raid_disk;
1343 else 1343 else
1344 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1344 d->raid_disk = rdev2->desc_nr; /* compatibility */
1345 if (test_bit(Faulty, &rdev2->flags)) 1345 if (test_bit(Faulty, &rdev2->flags))
1346 d->state = (1<<MD_DISK_FAULTY); 1346 d->state = (1<<MD_DISK_FAULTY);
1347 else if (is_active) { 1347 else if (is_active) {
1348 d->state = (1<<MD_DISK_ACTIVE); 1348 d->state = (1<<MD_DISK_ACTIVE);
1349 if (test_bit(In_sync, &rdev2->flags)) 1349 if (test_bit(In_sync, &rdev2->flags))
1350 d->state |= (1<<MD_DISK_SYNC); 1350 d->state |= (1<<MD_DISK_SYNC);
1351 active++; 1351 active++;
1352 working++; 1352 working++;
1353 } else { 1353 } else {
1354 d->state = 0; 1354 d->state = 0;
1355 spare++; 1355 spare++;
1356 working++; 1356 working++;
1357 } 1357 }
1358 if (test_bit(WriteMostly, &rdev2->flags)) 1358 if (test_bit(WriteMostly, &rdev2->flags))
1359 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1359 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1360 } 1360 }
1361 /* now set the "removed" and "faulty" bits on any missing devices */ 1361 /* now set the "removed" and "faulty" bits on any missing devices */
1362 for (i=0 ; i < mddev->raid_disks ; i++) { 1362 for (i=0 ; i < mddev->raid_disks ; i++) {
1363 mdp_disk_t *d = &sb->disks[i]; 1363 mdp_disk_t *d = &sb->disks[i];
1364 if (d->state == 0 && d->number == 0) { 1364 if (d->state == 0 && d->number == 0) {
1365 d->number = i; 1365 d->number = i;
1366 d->raid_disk = i; 1366 d->raid_disk = i;
1367 d->state = (1<<MD_DISK_REMOVED); 1367 d->state = (1<<MD_DISK_REMOVED);
1368 d->state |= (1<<MD_DISK_FAULTY); 1368 d->state |= (1<<MD_DISK_FAULTY);
1369 failed++; 1369 failed++;
1370 } 1370 }
1371 } 1371 }
1372 sb->nr_disks = nr_disks; 1372 sb->nr_disks = nr_disks;
1373 sb->active_disks = active; 1373 sb->active_disks = active;
1374 sb->working_disks = working; 1374 sb->working_disks = working;
1375 sb->failed_disks = failed; 1375 sb->failed_disks = failed;
1376 sb->spare_disks = spare; 1376 sb->spare_disks = spare;
1377 1377
1378 sb->this_disk = sb->disks[rdev->desc_nr]; 1378 sb->this_disk = sb->disks[rdev->desc_nr];
1379 sb->sb_csum = calc_sb_csum(sb); 1379 sb->sb_csum = calc_sb_csum(sb);
1380 } 1380 }
1381 1381
1382 /* 1382 /*
1383 * rdev_size_change for 0.90.0 1383 * rdev_size_change for 0.90.0
1384 */ 1384 */
1385 static unsigned long long 1385 static unsigned long long
1386 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1386 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1387 { 1387 {
1388 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1388 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1389 return 0; /* component must fit device */ 1389 return 0; /* component must fit device */
1390 if (rdev->mddev->bitmap_info.offset) 1390 if (rdev->mddev->bitmap_info.offset)
1391 return 0; /* can't move bitmap */ 1391 return 0; /* can't move bitmap */
1392 rdev->sb_start = calc_dev_sboffset(rdev); 1392 rdev->sb_start = calc_dev_sboffset(rdev);
1393 if (!num_sectors || num_sectors > rdev->sb_start) 1393 if (!num_sectors || num_sectors > rdev->sb_start)
1394 num_sectors = rdev->sb_start; 1394 num_sectors = rdev->sb_start;
1395 /* Limit to 4TB as metadata cannot record more than that. 1395 /* Limit to 4TB as metadata cannot record more than that.
1396 * 4TB == 2^32 KB, or 2*2^32 sectors. 1396 * 4TB == 2^32 KB, or 2*2^32 sectors.
1397 */ 1397 */
1398 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1398 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1399 num_sectors = (2ULL << 32) - 2; 1399 num_sectors = (2ULL << 32) - 2;
1400 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1400 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1401 rdev->sb_page); 1401 rdev->sb_page);
1402 md_super_wait(rdev->mddev); 1402 md_super_wait(rdev->mddev);
1403 return num_sectors; 1403 return num_sectors;
1404 } 1404 }
1405 1405
1406 static int 1406 static int
1407 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1407 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1408 { 1408 {
1409 /* non-zero offset changes not possible with v0.90 */ 1409 /* non-zero offset changes not possible with v0.90 */
1410 return new_offset == 0; 1410 return new_offset == 0;
1411 } 1411 }
1412 1412
1413 /* 1413 /*
1414 * version 1 superblock 1414 * version 1 superblock
1415 */ 1415 */
1416 1416
1417 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1417 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1418 { 1418 {
1419 __le32 disk_csum; 1419 __le32 disk_csum;
1420 u32 csum; 1420 u32 csum;
1421 unsigned long long newcsum; 1421 unsigned long long newcsum;
1422 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1422 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1423 __le32 *isuper = (__le32*)sb; 1423 __le32 *isuper = (__le32*)sb;
1424 1424
1425 disk_csum = sb->sb_csum; 1425 disk_csum = sb->sb_csum;
1426 sb->sb_csum = 0; 1426 sb->sb_csum = 0;
1427 newcsum = 0; 1427 newcsum = 0;
1428 for (; size >= 4; size -= 4) 1428 for (; size >= 4; size -= 4)
1429 newcsum += le32_to_cpu(*isuper++); 1429 newcsum += le32_to_cpu(*isuper++);
1430 1430
1431 if (size == 2) 1431 if (size == 2)
1432 newcsum += le16_to_cpu(*(__le16*) isuper); 1432 newcsum += le16_to_cpu(*(__le16*) isuper);
1433 1433
1434 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1434 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1435 sb->sb_csum = disk_csum; 1435 sb->sb_csum = disk_csum;
1436 return cpu_to_le32(csum); 1436 return cpu_to_le32(csum);
1437 } 1437 }
1438 1438
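/*
 * Editor's note (hedged sketch, userspace): a cross-check of the v1
 * checksum above.  The caller reads the superblock into a buffer,
 * zeroes the 4-byte sb_csum field, and passes size = 256 + max_dev * 2
 * bytes, mirroring calc_sb_1_csum().
 */
#include <stdint.h>
#include <string.h>
#include <endian.h>

static uint32_t sb1_csum_sketch(const void *buf, size_t size)
{
	const uint8_t *p = buf;
	uint64_t sum = 0;

	for (; size >= 4; size -= 4, p += 4) {
		uint32_t w;

		memcpy(&w, p, 4);
		sum += le32toh(w);
	}
	if (size == 2) {
		uint16_t h;

		memcpy(&h, p, 2);
		sum += le16toh(h);
	}
	/* fold the 64-bit accumulator into 32 bits, as the kernel does */
	return (uint32_t)((sum & 0xffffffff) + (sum >> 32));
}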
1439 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 1439 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1440 int acknowledged); 1440 int acknowledged);
1441 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1441 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1442 { 1442 {
1443 struct mdp_superblock_1 *sb; 1443 struct mdp_superblock_1 *sb;
1444 int ret; 1444 int ret;
1445 sector_t sb_start; 1445 sector_t sb_start;
1446 sector_t sectors; 1446 sector_t sectors;
1447 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1447 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1448 int bmask; 1448 int bmask;
1449 1449
1450 /* 1450 /*
1451 * Calculate the position of the superblock in 512byte sectors. 1451 * Calculate the position of the superblock in 512byte sectors.
1452 * It is always aligned to a 4K boundary and 1452 * It is always aligned to a 4K boundary and
1453 * depending on minor_version, it can be: 1453 * depending on minor_version, it can be:
1454 * 0: At least 8K, but less than 12K, from end of device 1454 * 0: At least 8K, but less than 12K, from end of device
1455 * 1: At start of device 1455 * 1: At start of device
1456 * 2: 4K from start of device. 1456 * 2: 4K from start of device.
1457 */ 1457 */
1458 switch(minor_version) { 1458 switch(minor_version) {
1459 case 0: 1459 case 0:
1460 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; 1460 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1461 sb_start -= 8*2; 1461 sb_start -= 8*2;
1462 sb_start &= ~(sector_t)(4*2-1); 1462 sb_start &= ~(sector_t)(4*2-1);
1463 break; 1463 break;
1464 case 1: 1464 case 1:
1465 sb_start = 0; 1465 sb_start = 0;
1466 break; 1466 break;
1467 case 2: 1467 case 2:
1468 sb_start = 8; 1468 sb_start = 8;
1469 break; 1469 break;
1470 default: 1470 default:
1471 return -EINVAL; 1471 return -EINVAL;
1472 } 1472 }
1473 rdev->sb_start = sb_start; 1473 rdev->sb_start = sb_start;
1474 1474
1475 /* superblock is rarely larger than 1K, but it can be larger, 1475 /* superblock is rarely larger than 1K, but it can be larger,
1476 * and it is safe to read 4k, so we do that 1476 * and it is safe to read 4k, so we do that
1477 */ 1477 */
1478 ret = read_disk_sb(rdev, 4096); 1478 ret = read_disk_sb(rdev, 4096);
1479 if (ret) return ret; 1479 if (ret) return ret;
1480 1480
1481 1481
1482 sb = page_address(rdev->sb_page); 1482 sb = page_address(rdev->sb_page);
1483 1483
1484 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1484 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1485 sb->major_version != cpu_to_le32(1) || 1485 sb->major_version != cpu_to_le32(1) ||
1486 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1486 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1487 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1487 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1488 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1488 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1489 return -EINVAL; 1489 return -EINVAL;
1490 1490
1491 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1491 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1492 printk("md: invalid superblock checksum on %s\n", 1492 printk("md: invalid superblock checksum on %s\n",
1493 bdevname(rdev->bdev,b)); 1493 bdevname(rdev->bdev,b));
1494 return -EINVAL; 1494 return -EINVAL;
1495 } 1495 }
1496 if (le64_to_cpu(sb->data_size) < 10) { 1496 if (le64_to_cpu(sb->data_size) < 10) {
1497 printk("md: data_size too small on %s\n", 1497 printk("md: data_size too small on %s\n",
1498 bdevname(rdev->bdev,b)); 1498 bdevname(rdev->bdev,b));
1499 return -EINVAL; 1499 return -EINVAL;
1500 } 1500 }
1501 if (sb->pad0 || 1501 if (sb->pad0 ||
1502 sb->pad3[0] || 1502 sb->pad3[0] ||
1503 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1503 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1504 /* Some padding is non-zero, might be a new feature */ 1504 /* Some padding is non-zero, might be a new feature */
1505 return -EINVAL; 1505 return -EINVAL;
1506 1506
1507 rdev->preferred_minor = 0xffff; 1507 rdev->preferred_minor = 0xffff;
1508 rdev->data_offset = le64_to_cpu(sb->data_offset); 1508 rdev->data_offset = le64_to_cpu(sb->data_offset);
1509 rdev->new_data_offset = rdev->data_offset; 1509 rdev->new_data_offset = rdev->data_offset;
1510 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1510 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1511 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1511 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1512 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1512 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1513 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1513 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1514 1514
1515 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1515 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1516 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1516 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1517 if (rdev->sb_size & bmask) 1517 if (rdev->sb_size & bmask)
1518 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1518 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1519 1519
1520 if (minor_version 1520 if (minor_version
1521 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1521 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1522 return -EINVAL; 1522 return -EINVAL;
1523 if (minor_version 1523 if (minor_version
1524 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1524 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1525 return -EINVAL; 1525 return -EINVAL;
1526 1526
1527 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1527 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1528 rdev->desc_nr = -1; 1528 rdev->desc_nr = -1;
1529 else 1529 else
1530 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1530 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1531 1531
1532 if (!rdev->bb_page) { 1532 if (!rdev->bb_page) {
1533 rdev->bb_page = alloc_page(GFP_KERNEL); 1533 rdev->bb_page = alloc_page(GFP_KERNEL);
1534 if (!rdev->bb_page) 1534 if (!rdev->bb_page)
1535 return -ENOMEM; 1535 return -ENOMEM;
1536 } 1536 }
1537 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1537 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538 rdev->badblocks.count == 0) { 1538 rdev->badblocks.count == 0) {
1539 /* need to load the bad block list. 1539 /* need to load the bad block list.
1540 * Currently we limit it to one page. 1540 * Currently we limit it to one page.
1541 */ 1541 */
1542 s32 offset; 1542 s32 offset;
1543 sector_t bb_sector; 1543 sector_t bb_sector;
1544 u64 *bbp; 1544 u64 *bbp;
1545 int i; 1545 int i;
1546 int sectors = le16_to_cpu(sb->bblog_size); 1546 int sectors = le16_to_cpu(sb->bblog_size);
1547 if (sectors > (PAGE_SIZE / 512)) 1547 if (sectors > (PAGE_SIZE / 512))
1548 return -EINVAL; 1548 return -EINVAL;
1549 offset = le32_to_cpu(sb->bblog_offset); 1549 offset = le32_to_cpu(sb->bblog_offset);
1550 if (offset == 0) 1550 if (offset == 0)
1551 return -EINVAL; 1551 return -EINVAL;
1552 bb_sector = (long long)offset; 1552 bb_sector = (long long)offset;
1553 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1553 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554 rdev->bb_page, READ, true)) 1554 rdev->bb_page, READ, true))
1555 return -EIO; 1555 return -EIO;
1556 bbp = (u64 *)page_address(rdev->bb_page); 1556 bbp = (u64 *)page_address(rdev->bb_page);
1557 rdev->badblocks.shift = sb->bblog_shift; 1557 rdev->badblocks.shift = sb->bblog_shift;
1558 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1558 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559 u64 bb = le64_to_cpu(*bbp); 1559 u64 bb = le64_to_cpu(*bbp);
1560 int count = bb & (0x3ff); 1560 int count = bb & (0x3ff);
1561 u64 sector = bb >> 10; 1561 u64 sector = bb >> 10;
1562 sector <<= sb->bblog_shift; 1562 sector <<= sb->bblog_shift;
1563 count <<= sb->bblog_shift; 1563 count <<= sb->bblog_shift;
1564 if (bb + 1 == 0) 1564 if (bb + 1 == 0)
1565 break; 1565 break;
1566 if (md_set_badblocks(&rdev->badblocks, 1566 if (md_set_badblocks(&rdev->badblocks,
1567 sector, count, 1) == 0) 1567 sector, count, 1) == 0)
1568 return -EINVAL; 1568 return -EINVAL;
1569 } 1569 }
1570 } else if (sb->bblog_offset == 0) 1570 } else if (sb->bblog_offset == 0)
1571 rdev->badblocks.shift = -1; 1571 rdev->badblocks.shift = -1;
1572 1572
1573 if (!refdev) { 1573 if (!refdev) {
1574 ret = 1; 1574 ret = 1;
1575 } else { 1575 } else {
1576 __u64 ev1, ev2; 1576 __u64 ev1, ev2;
1577 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1577 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1578 1578
1579 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1579 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1580 sb->level != refsb->level || 1580 sb->level != refsb->level ||
1581 sb->layout != refsb->layout || 1581 sb->layout != refsb->layout ||
1582 sb->chunksize != refsb->chunksize) { 1582 sb->chunksize != refsb->chunksize) {
1583 printk(KERN_WARNING "md: %s has strangely different" 1583 printk(KERN_WARNING "md: %s has strangely different"
1584 " superblock to %s\n", 1584 " superblock to %s\n",
1585 bdevname(rdev->bdev,b), 1585 bdevname(rdev->bdev,b),
1586 bdevname(refdev->bdev,b2)); 1586 bdevname(refdev->bdev,b2));
1587 return -EINVAL; 1587 return -EINVAL;
1588 } 1588 }
1589 ev1 = le64_to_cpu(sb->events); 1589 ev1 = le64_to_cpu(sb->events);
1590 ev2 = le64_to_cpu(refsb->events); 1590 ev2 = le64_to_cpu(refsb->events);
1591 1591
1592 if (ev1 > ev2) 1592 if (ev1 > ev2)
1593 ret = 1; 1593 ret = 1;
1594 else 1594 else
1595 ret = 0; 1595 ret = 0;
1596 } 1596 }
1597 if (minor_version) { 1597 if (minor_version) {
1598 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1598 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1599 sectors -= rdev->data_offset; 1599 sectors -= rdev->data_offset;
1600 } else 1600 } else
1601 sectors = rdev->sb_start; 1601 sectors = rdev->sb_start;
1602 if (sectors < le64_to_cpu(sb->data_size)) 1602 if (sectors < le64_to_cpu(sb->data_size))
1603 return -EINVAL; 1603 return -EINVAL;
1604 rdev->sectors = le64_to_cpu(sb->data_size); 1604 rdev->sectors = le64_to_cpu(sb->data_size);
1605 return ret; 1605 return ret;
1606 } 1606 }
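/*
 * Editor's note (hedged sketch): on-disk bad-block log entries, as
 * decoded in super_1_load() above and written back in super_1_sync()
 * below, are 64-bit little-endian words: bits 63..10 hold the start
 * sector and bits 9..0 the length, both in units of 2^bblog_shift
 * sectors; an all-ones word terminates the log.  Converted to plain
 * sectors (helper names invented for illustration):
 */
static inline u64 bb_entry_sector(u64 entry, int bblog_shift)
{
	return (entry >> 10) << bblog_shift;
}

static inline unsigned int bb_entry_len(u64 entry, int bblog_shift)
{
	return (entry & 0x3ff) << bblog_shift;
}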
1607 1607
1608 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1608 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1609 { 1609 {
1610 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1610 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1611 __u64 ev1 = le64_to_cpu(sb->events); 1611 __u64 ev1 = le64_to_cpu(sb->events);
1612 1612
1613 rdev->raid_disk = -1; 1613 rdev->raid_disk = -1;
1614 clear_bit(Faulty, &rdev->flags); 1614 clear_bit(Faulty, &rdev->flags);
1615 clear_bit(In_sync, &rdev->flags); 1615 clear_bit(In_sync, &rdev->flags);
1616 clear_bit(WriteMostly, &rdev->flags); 1616 clear_bit(WriteMostly, &rdev->flags);
1617 1617
1618 if (mddev->raid_disks == 0) { 1618 if (mddev->raid_disks == 0) {
1619 mddev->major_version = 1; 1619 mddev->major_version = 1;
1620 mddev->patch_version = 0; 1620 mddev->patch_version = 0;
1621 mddev->external = 0; 1621 mddev->external = 0;
1622 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1622 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1623 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1623 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1624 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1624 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1625 mddev->level = le32_to_cpu(sb->level); 1625 mddev->level = le32_to_cpu(sb->level);
1626 mddev->clevel[0] = 0; 1626 mddev->clevel[0] = 0;
1627 mddev->layout = le32_to_cpu(sb->layout); 1627 mddev->layout = le32_to_cpu(sb->layout);
1628 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1628 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1629 mddev->dev_sectors = le64_to_cpu(sb->size); 1629 mddev->dev_sectors = le64_to_cpu(sb->size);
1630 mddev->events = ev1; 1630 mddev->events = ev1;
1631 mddev->bitmap_info.offset = 0; 1631 mddev->bitmap_info.offset = 0;
1632 mddev->bitmap_info.space = 0; 1632 mddev->bitmap_info.space = 0;
1633 /* Default location for bitmap is 1K after superblock 1633 /* Default location for bitmap is 1K after superblock
1634 * using 3K - total of 4K 1634 * using 3K - total of 4K
1635 */ 1635 */
1636 mddev->bitmap_info.default_offset = 1024 >> 9; 1636 mddev->bitmap_info.default_offset = 1024 >> 9;
1637 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1637 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1638 mddev->reshape_backwards = 0; 1638 mddev->reshape_backwards = 0;
1639 1639
1640 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1640 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1641 memcpy(mddev->uuid, sb->set_uuid, 16); 1641 memcpy(mddev->uuid, sb->set_uuid, 16);
1642 1642
1643 mddev->max_disks = (4096-256)/2; 1643 mddev->max_disks = (4096-256)/2;
1644 1644
1645 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1645 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1646 mddev->bitmap_info.file == NULL) { 1646 mddev->bitmap_info.file == NULL) {
1647 mddev->bitmap_info.offset = 1647 mddev->bitmap_info.offset =
1648 (__s32)le32_to_cpu(sb->bitmap_offset); 1648 (__s32)le32_to_cpu(sb->bitmap_offset);
1649 /* Metadata doesn't record how much space is available. 1649 /* Metadata doesn't record how much space is available.
1650 * For 1.0, we assume we can use up to the superblock 1650 * For 1.0, we assume we can use up to the superblock
1651 * if before, else to 4K beyond superblock. 1651 * if before, else to 4K beyond superblock.
1652 * For others, assume no change is possible. 1652 * For others, assume no change is possible.
1653 */ 1653 */
1654 if (mddev->minor_version > 0) 1654 if (mddev->minor_version > 0)
1655 mddev->bitmap_info.space = 0; 1655 mddev->bitmap_info.space = 0;
1656 else if (mddev->bitmap_info.offset > 0) 1656 else if (mddev->bitmap_info.offset > 0)
1657 mddev->bitmap_info.space = 1657 mddev->bitmap_info.space =
1658 8 - mddev->bitmap_info.offset; 1658 8 - mddev->bitmap_info.offset;
1659 else 1659 else
1660 mddev->bitmap_info.space = 1660 mddev->bitmap_info.space =
1661 -mddev->bitmap_info.offset; 1661 -mddev->bitmap_info.offset;
1662 } 1662 }
1663 1663
1664 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1664 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1665 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1665 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1666 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1666 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1667 mddev->new_level = le32_to_cpu(sb->new_level); 1667 mddev->new_level = le32_to_cpu(sb->new_level);
1668 mddev->new_layout = le32_to_cpu(sb->new_layout); 1668 mddev->new_layout = le32_to_cpu(sb->new_layout);
1669 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1669 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1670 if (mddev->delta_disks < 0 || 1670 if (mddev->delta_disks < 0 ||
1671 (mddev->delta_disks == 0 && 1671 (mddev->delta_disks == 0 &&
1672 (le32_to_cpu(sb->feature_map) 1672 (le32_to_cpu(sb->feature_map)
1673 & MD_FEATURE_RESHAPE_BACKWARDS))) 1673 & MD_FEATURE_RESHAPE_BACKWARDS)))
1674 mddev->reshape_backwards = 1; 1674 mddev->reshape_backwards = 1;
1675 } else { 1675 } else {
1676 mddev->reshape_position = MaxSector; 1676 mddev->reshape_position = MaxSector;
1677 mddev->delta_disks = 0; 1677 mddev->delta_disks = 0;
1678 mddev->new_level = mddev->level; 1678 mddev->new_level = mddev->level;
1679 mddev->new_layout = mddev->layout; 1679 mddev->new_layout = mddev->layout;
1680 mddev->new_chunk_sectors = mddev->chunk_sectors; 1680 mddev->new_chunk_sectors = mddev->chunk_sectors;
1681 } 1681 }
1682 1682
1683 } else if (mddev->pers == NULL) { 1683 } else if (mddev->pers == NULL) {
1684 /* Insist on good event counter while assembling, except for 1684 /* Insist on good event counter while assembling, except for
1685 * spares (which don't need an event count) */ 1685 * spares (which don't need an event count) */
1686 ++ev1; 1686 ++ev1;
1687 if (rdev->desc_nr >= 0 && 1687 if (rdev->desc_nr >= 0 &&
1688 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1688 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1689 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) 1689 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1690 if (ev1 < mddev->events) 1690 if (ev1 < mddev->events)
1691 return -EINVAL; 1691 return -EINVAL;
1692 } else if (mddev->bitmap) { 1692 } else if (mddev->bitmap) {
1693 /* If adding to array with a bitmap, then we can accept an 1693 /* If adding to array with a bitmap, then we can accept an
1694 * older device, but not too old. 1694 * older device, but not too old.
1695 */ 1695 */
1696 if (ev1 < mddev->bitmap->events_cleared) 1696 if (ev1 < mddev->bitmap->events_cleared)
1697 return 0; 1697 return 0;
1698 } else { 1698 } else {
1699 if (ev1 < mddev->events) 1699 if (ev1 < mddev->events)
1700 /* just a hot-add of a new device, leave raid_disk at -1 */ 1700 /* just a hot-add of a new device, leave raid_disk at -1 */
1701 return 0; 1701 return 0;
1702 } 1702 }
1703 if (mddev->level != LEVEL_MULTIPATH) { 1703 if (mddev->level != LEVEL_MULTIPATH) {
1704 int role; 1704 int role;
1705 if (rdev->desc_nr < 0 || 1705 if (rdev->desc_nr < 0 ||
1706 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1706 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1707 role = 0xffff; 1707 role = 0xffff;
1708 rdev->desc_nr = -1; 1708 rdev->desc_nr = -1;
1709 } else 1709 } else
1710 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1710 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1711 switch(role) { 1711 switch(role) {
1712 case 0xffff: /* spare */ 1712 case 0xffff: /* spare */
1713 break; 1713 break;
1714 case 0xfffe: /* faulty */ 1714 case 0xfffe: /* faulty */
1715 set_bit(Faulty, &rdev->flags); 1715 set_bit(Faulty, &rdev->flags);
1716 break; 1716 break;
1717 default: 1717 default:
1718 if ((le32_to_cpu(sb->feature_map) & 1718 if ((le32_to_cpu(sb->feature_map) &
1719 MD_FEATURE_RECOVERY_OFFSET)) 1719 MD_FEATURE_RECOVERY_OFFSET))
1720 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1720 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1721 else 1721 else
1722 set_bit(In_sync, &rdev->flags); 1722 set_bit(In_sync, &rdev->flags);
1723 rdev->raid_disk = role; 1723 rdev->raid_disk = role;
1724 break; 1724 break;
1725 } 1725 }
1726 if (sb->devflags & WriteMostly1) 1726 if (sb->devflags & WriteMostly1)
1727 set_bit(WriteMostly, &rdev->flags); 1727 set_bit(WriteMostly, &rdev->flags);
1728 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1728 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1729 set_bit(Replacement, &rdev->flags); 1729 set_bit(Replacement, &rdev->flags);
1730 } else /* MULTIPATH are always insync */ 1730 } else /* MULTIPATH are always insync */
1731 set_bit(In_sync, &rdev->flags); 1731 set_bit(In_sync, &rdev->flags);
1732 1732
1733 return 0; 1733 return 0;
1734 } 1734 }
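/*
 * Editor's note (hedged sketch): the dev_roles[] convention used by
 * super_1_validate() above and super_1_sync() below.  Each 16-bit
 * little-endian entry is either a magic value or a raid slot number
 * (enum names invented for illustration):
 */
enum {
	SB1_ROLE_SPARE	= 0xffff,	/* device present but not in the array */
	SB1_ROLE_FAULTY	= 0xfffe,	/* device has failed */
	/* any other value: the raid_disk slot this device occupies */
};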
1735 1735
1736 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1736 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1737 { 1737 {
1738 struct mdp_superblock_1 *sb; 1738 struct mdp_superblock_1 *sb;
1739 struct md_rdev *rdev2; 1739 struct md_rdev *rdev2;
1740 int max_dev, i; 1740 int max_dev, i;
1741 /* make rdev->sb match mddev and rdev data. */ 1741 /* make rdev->sb match mddev and rdev data. */
1742 1742
1743 sb = page_address(rdev->sb_page); 1743 sb = page_address(rdev->sb_page);
1744 1744
1745 sb->feature_map = 0; 1745 sb->feature_map = 0;
1746 sb->pad0 = 0; 1746 sb->pad0 = 0;
1747 sb->recovery_offset = cpu_to_le64(0); 1747 sb->recovery_offset = cpu_to_le64(0);
1748 memset(sb->pad3, 0, sizeof(sb->pad3)); 1748 memset(sb->pad3, 0, sizeof(sb->pad3));
1749 1749
1750 sb->utime = cpu_to_le64((__u64)mddev->utime); 1750 sb->utime = cpu_to_le64((__u64)mddev->utime);
1751 sb->events = cpu_to_le64(mddev->events); 1751 sb->events = cpu_to_le64(mddev->events);
1752 if (mddev->in_sync) 1752 if (mddev->in_sync)
1753 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1753 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1754 else 1754 else
1755 sb->resync_offset = cpu_to_le64(0); 1755 sb->resync_offset = cpu_to_le64(0);
1756 1756
1757 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1757 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1758 1758
1759 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1759 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1760 sb->size = cpu_to_le64(mddev->dev_sectors); 1760 sb->size = cpu_to_le64(mddev->dev_sectors);
1761 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1761 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1762 sb->level = cpu_to_le32(mddev->level); 1762 sb->level = cpu_to_le32(mddev->level);
1763 sb->layout = cpu_to_le32(mddev->layout); 1763 sb->layout = cpu_to_le32(mddev->layout);
1764 1764
1765 if (test_bit(WriteMostly, &rdev->flags)) 1765 if (test_bit(WriteMostly, &rdev->flags))
1766 sb->devflags |= WriteMostly1; 1766 sb->devflags |= WriteMostly1;
1767 else 1767 else
1768 sb->devflags &= ~WriteMostly1; 1768 sb->devflags &= ~WriteMostly1;
1769 sb->data_offset = cpu_to_le64(rdev->data_offset); 1769 sb->data_offset = cpu_to_le64(rdev->data_offset);
1770 sb->data_size = cpu_to_le64(rdev->sectors); 1770 sb->data_size = cpu_to_le64(rdev->sectors);
1771 1771
1772 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1772 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1773 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1773 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1774 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1774 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1775 } 1775 }
1776 1776
1777 if (rdev->raid_disk >= 0 && 1777 if (rdev->raid_disk >= 0 &&
1778 !test_bit(In_sync, &rdev->flags)) { 1778 !test_bit(In_sync, &rdev->flags)) {
1779 sb->feature_map |= 1779 sb->feature_map |=
1780 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1780 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1781 sb->recovery_offset = 1781 sb->recovery_offset =
1782 cpu_to_le64(rdev->recovery_offset); 1782 cpu_to_le64(rdev->recovery_offset);
1783 } 1783 }
1784 if (test_bit(Replacement, &rdev->flags)) 1784 if (test_bit(Replacement, &rdev->flags))
1785 sb->feature_map |= 1785 sb->feature_map |=
1786 cpu_to_le32(MD_FEATURE_REPLACEMENT); 1786 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1787 1787
1788 if (mddev->reshape_position != MaxSector) { 1788 if (mddev->reshape_position != MaxSector) {
1789 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1789 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1790 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1790 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1791 sb->new_layout = cpu_to_le32(mddev->new_layout); 1791 sb->new_layout = cpu_to_le32(mddev->new_layout);
1792 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1792 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1793 sb->new_level = cpu_to_le32(mddev->new_level); 1793 sb->new_level = cpu_to_le32(mddev->new_level);
1794 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1794 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1795 if (mddev->delta_disks == 0 && 1795 if (mddev->delta_disks == 0 &&
1796 mddev->reshape_backwards) 1796 mddev->reshape_backwards)
1797 sb->feature_map 1797 sb->feature_map
1798 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 1798 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1799 if (rdev->new_data_offset != rdev->data_offset) { 1799 if (rdev->new_data_offset != rdev->data_offset) {
1800 sb->feature_map 1800 sb->feature_map
1801 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 1801 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1802 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 1802 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1803 - rdev->data_offset)); 1803 - rdev->data_offset));
1804 } 1804 }
1805 } 1805 }
1806 1806
1807 if (rdev->badblocks.count == 0) 1807 if (rdev->badblocks.count == 0)
1808 /* Nothing to do for bad blocks*/ ; 1808 /* Nothing to do for bad blocks*/ ;
1809 else if (sb->bblog_offset == 0) 1809 else if (sb->bblog_offset == 0)
1810 /* Cannot record bad blocks on this device */ 1810 /* Cannot record bad blocks on this device */
1811 md_error(mddev, rdev); 1811 md_error(mddev, rdev);
1812 else { 1812 else {
1813 struct badblocks *bb = &rdev->badblocks; 1813 struct badblocks *bb = &rdev->badblocks;
1814 u64 *bbp = (u64 *)page_address(rdev->bb_page); 1814 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1815 u64 *p = bb->page; 1815 u64 *p = bb->page;
1816 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 1816 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1817 if (bb->changed) { 1817 if (bb->changed) {
1818 unsigned seq; 1818 unsigned seq;
1819 1819
1820 retry: 1820 retry:
1821 seq = read_seqbegin(&bb->lock); 1821 seq = read_seqbegin(&bb->lock);
1822 1822
1823 memset(bbp, 0xff, PAGE_SIZE); 1823 memset(bbp, 0xff, PAGE_SIZE);
1824 1824
1825 for (i = 0 ; i < bb->count ; i++) { 1825 for (i = 0 ; i < bb->count ; i++) {
1826 u64 internal_bb = p[i]; 1826 u64 internal_bb = p[i];
1827 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1827 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1828 | BB_LEN(internal_bb)); 1828 | BB_LEN(internal_bb));
1829 bbp[i] = cpu_to_le64(store_bb); 1829 bbp[i] = cpu_to_le64(store_bb);
1830 } 1830 }
1831 bb->changed = 0; 1831 bb->changed = 0;
1832 if (read_seqretry(&bb->lock, seq)) 1832 if (read_seqretry(&bb->lock, seq))
1833 goto retry; 1833 goto retry;
1834 1834
1835 bb->sector = (rdev->sb_start + 1835 bb->sector = (rdev->sb_start +
1836 (int)le32_to_cpu(sb->bblog_offset)); 1836 (int)le32_to_cpu(sb->bblog_offset));
1837 bb->size = le16_to_cpu(sb->bblog_size); 1837 bb->size = le16_to_cpu(sb->bblog_size);
1838 } 1838 }
1839 } 1839 }
1840 1840
1841 max_dev = 0; 1841 max_dev = 0;
1842 rdev_for_each(rdev2, mddev) 1842 rdev_for_each(rdev2, mddev)
1843 if (rdev2->desc_nr+1 > max_dev) 1843 if (rdev2->desc_nr+1 > max_dev)
1844 max_dev = rdev2->desc_nr+1; 1844 max_dev = rdev2->desc_nr+1;
1845 1845
1846 if (max_dev > le32_to_cpu(sb->max_dev)) { 1846 if (max_dev > le32_to_cpu(sb->max_dev)) {
1847 int bmask; 1847 int bmask;
1848 sb->max_dev = cpu_to_le32(max_dev); 1848 sb->max_dev = cpu_to_le32(max_dev);
1849 rdev->sb_size = max_dev * 2 + 256; 1849 rdev->sb_size = max_dev * 2 + 256;
1850 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1850 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1851 if (rdev->sb_size & bmask) 1851 if (rdev->sb_size & bmask)
1852 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1852 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1853 } else 1853 } else
1854 max_dev = le32_to_cpu(sb->max_dev); 1854 max_dev = le32_to_cpu(sb->max_dev);
1855 1855
1856 for (i=0; i<max_dev;i++) 1856 for (i=0; i<max_dev;i++)
1857 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1857 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1858 1858
1859 rdev_for_each(rdev2, mddev) { 1859 rdev_for_each(rdev2, mddev) {
1860 i = rdev2->desc_nr; 1860 i = rdev2->desc_nr;
1861 if (test_bit(Faulty, &rdev2->flags)) 1861 if (test_bit(Faulty, &rdev2->flags))
1862 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1862 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1863 else if (test_bit(In_sync, &rdev2->flags)) 1863 else if (test_bit(In_sync, &rdev2->flags))
1864 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1864 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1865 else if (rdev2->raid_disk >= 0) 1865 else if (rdev2->raid_disk >= 0)
1866 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1866 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1867 else 1867 else
1868 sb->dev_roles[i] = cpu_to_le16(0xffff); 1868 sb->dev_roles[i] = cpu_to_le16(0xffff);
1869 } 1869 }
1870 1870
1871 sb->sb_csum = calc_sb_1_csum(sb); 1871 sb->sb_csum = calc_sb_1_csum(sb);
1872 } 1872 }
1873 1873
1874 static unsigned long long 1874 static unsigned long long
1875 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1875 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1876 { 1876 {
1877 struct mdp_superblock_1 *sb; 1877 struct mdp_superblock_1 *sb;
1878 sector_t max_sectors; 1878 sector_t max_sectors;
1879 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1879 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1880 return 0; /* component must fit device */ 1880 return 0; /* component must fit device */
1881 if (rdev->data_offset != rdev->new_data_offset) 1881 if (rdev->data_offset != rdev->new_data_offset)
1882 return 0; /* too confusing */ 1882 return 0; /* too confusing */
1883 if (rdev->sb_start < rdev->data_offset) { 1883 if (rdev->sb_start < rdev->data_offset) {
1884 /* minor versions 1 and 2; superblock before data */ 1884 /* minor versions 1 and 2; superblock before data */
1885 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1885 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1886 max_sectors -= rdev->data_offset; 1886 max_sectors -= rdev->data_offset;
1887 if (!num_sectors || num_sectors > max_sectors) 1887 if (!num_sectors || num_sectors > max_sectors)
1888 num_sectors = max_sectors; 1888 num_sectors = max_sectors;
1889 } else if (rdev->mddev->bitmap_info.offset) { 1889 } else if (rdev->mddev->bitmap_info.offset) {
1890 /* minor version 0 with bitmap we can't move */ 1890 /* minor version 0 with bitmap we can't move */
1891 return 0; 1891 return 0;
1892 } else { 1892 } else {
1893 /* minor version 0; superblock after data */ 1893 /* minor version 0; superblock after data */
1894 sector_t sb_start; 1894 sector_t sb_start;
1895 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; 1895 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1896 sb_start &= ~(sector_t)(4*2 - 1); 1896 sb_start &= ~(sector_t)(4*2 - 1);
1897 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1897 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1898 if (!num_sectors || num_sectors > max_sectors) 1898 if (!num_sectors || num_sectors > max_sectors)
1899 num_sectors = max_sectors; 1899 num_sectors = max_sectors;
1900 rdev->sb_start = sb_start; 1900 rdev->sb_start = sb_start;
1901 } 1901 }
1902 sb = page_address(rdev->sb_page); 1902 sb = page_address(rdev->sb_page);
1903 sb->data_size = cpu_to_le64(num_sectors); 1903 sb->data_size = cpu_to_le64(num_sectors);
1904 sb->super_offset = rdev->sb_start; 1904 sb->super_offset = rdev->sb_start;
1905 sb->sb_csum = calc_sb_1_csum(sb); 1905 sb->sb_csum = calc_sb_1_csum(sb);
1906 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1906 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1907 rdev->sb_page); 1907 rdev->sb_page);
1908 md_super_wait(rdev->mddev); 1908 md_super_wait(rdev->mddev);
1909 return num_sectors; 1909 return num_sectors;
1910 1910
1911 } 1911 }
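
Two alignment idioms appear above: rdev->sb_size is rounded up to the queue's logical block size with a mask in super_1_sync(), and the 1.0-layout sb_start is rounded down to an 8-sector boundary here. A small standalone sketch of both roundings (userspace C; the helper names are hypothetical):

#include <stdio.h>

/* Matches the "(size | bmask) + 1" idiom: bmask is (block_size - 1),
 * and sizes already on a boundary are left alone. */
static unsigned long round_up_mask(unsigned long size, unsigned long bmask)
{
	if (size & bmask)
		size = (size | bmask) + 1;
	return size;
}

/* Matches "sb_start &= ~(sector_t)(4*2 - 1)": drop the superblock start
 * onto an 8-sector (4KiB) boundary. */
static unsigned long long align_down8(unsigned long long sector)
{
	return sector & ~(unsigned long long)(4*2 - 1);
}

int main(void)
{
	printf("%lu\n", round_up_mask(1000, 511));	/* -> 1024 */
	printf("%lu\n", round_up_mask(1024, 511));	/* already aligned -> 1024 */
	printf("%llu\n", align_down8(1000005ULL));	/* -> 1000000 */
	return 0;
}
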
1912 1912
1913 static int 1913 static int
1914 super_1_allow_new_offset(struct md_rdev *rdev, 1914 super_1_allow_new_offset(struct md_rdev *rdev,
1915 unsigned long long new_offset) 1915 unsigned long long new_offset)
1916 { 1916 {
1917 /* All necessary checks on new >= old have been done */ 1917 /* All necessary checks on new >= old have been done */
1918 struct bitmap *bitmap; 1918 struct bitmap *bitmap;
1919 if (new_offset >= rdev->data_offset) 1919 if (new_offset >= rdev->data_offset)
1920 return 1; 1920 return 1;
1921 1921
1922 /* with 1.0 metadata, there is no metadata to tread on 1922 /* with 1.0 metadata, there is no metadata to tread on
1923 * so we can always move back */ 1923 * so we can always move back */
1924 if (rdev->mddev->minor_version == 0) 1924 if (rdev->mddev->minor_version == 0)
1925 return 1; 1925 return 1;
1926 1926
1927 /* otherwise we must be sure not to step on 1927 /* otherwise we must be sure not to step on
1928 * any metadata, so stay: 1928 * any metadata, so stay:
1929 * 36K beyond start of superblock 1929 * 36K beyond start of superblock
1930 * beyond end of badblocks 1930 * beyond end of badblocks
1931 * beyond write-intent bitmap 1931 * beyond write-intent bitmap
1932 */ 1932 */
1933 if (rdev->sb_start + (32+4)*2 > new_offset) 1933 if (rdev->sb_start + (32+4)*2 > new_offset)
1934 return 0; 1934 return 0;
1935 bitmap = rdev->mddev->bitmap; 1935 bitmap = rdev->mddev->bitmap;
1936 if (bitmap && !rdev->mddev->bitmap_info.file && 1936 if (bitmap && !rdev->mddev->bitmap_info.file &&
1937 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1937 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1938 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1938 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1939 return 0; 1939 return 0;
1940 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1940 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1941 return 0; 1941 return 0;
1942 1942
1943 return 1; 1943 return 1;
1944 } 1944 }
1945 1945
1946 static struct super_type super_types[] = { 1946 static struct super_type super_types[] = {
1947 [0] = { 1947 [0] = {
1948 .name = "0.90.0", 1948 .name = "0.90.0",
1949 .owner = THIS_MODULE, 1949 .owner = THIS_MODULE,
1950 .load_super = super_90_load, 1950 .load_super = super_90_load,
1951 .validate_super = super_90_validate, 1951 .validate_super = super_90_validate,
1952 .sync_super = super_90_sync, 1952 .sync_super = super_90_sync,
1953 .rdev_size_change = super_90_rdev_size_change, 1953 .rdev_size_change = super_90_rdev_size_change,
1954 .allow_new_offset = super_90_allow_new_offset, 1954 .allow_new_offset = super_90_allow_new_offset,
1955 }, 1955 },
1956 [1] = { 1956 [1] = {
1957 .name = "md-1", 1957 .name = "md-1",
1958 .owner = THIS_MODULE, 1958 .owner = THIS_MODULE,
1959 .load_super = super_1_load, 1959 .load_super = super_1_load,
1960 .validate_super = super_1_validate, 1960 .validate_super = super_1_validate,
1961 .sync_super = super_1_sync, 1961 .sync_super = super_1_sync,
1962 .rdev_size_change = super_1_rdev_size_change, 1962 .rdev_size_change = super_1_rdev_size_change,
1963 .allow_new_offset = super_1_allow_new_offset, 1963 .allow_new_offset = super_1_allow_new_offset,
1964 }, 1964 },
1965 }; 1965 };
1966 1966
1967 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1967 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1968 { 1968 {
1969 if (mddev->sync_super) { 1969 if (mddev->sync_super) {
1970 mddev->sync_super(mddev, rdev); 1970 mddev->sync_super(mddev, rdev);
1971 return; 1971 return;
1972 } 1972 }
1973 1973
1974 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1974 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1975 1975
1976 super_types[mddev->major_version].sync_super(mddev, rdev); 1976 super_types[mddev->major_version].sync_super(mddev, rdev);
1977 } 1977 }
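
super_types[] and sync_super() above follow a versioned operations-table pattern: index the table by major_version unless a per-array override is installed. A compact standalone sketch of that dispatch shape (userspace C; struct and function names are illustrative, not the md API):

#include <stdio.h>
#include <stddef.h>

struct fmt_ops {
	const char *name;
	void (*sync)(int dev);
};

static void sync_v0(int dev) { printf("v0 sync of dev %d\n", dev); }
static void sync_v1(int dev) { printf("v1 sync of dev %d\n", dev); }

static const struct fmt_ops fmt_table[] = {
	[0] = { .name = "0.90.0", .sync = sync_v0 },
	[1] = { .name = "md-1",   .sync = sync_v1 },
};

struct array {
	unsigned int major_version;
	void (*sync_override)(int dev);		/* plays the role of mddev->sync_super */
};

static void do_sync(const struct array *a, int dev)
{
	if (a->sync_override) {			/* per-array hook wins */
		a->sync_override(dev);
		return;
	}
	fmt_table[a->major_version].sync(dev);	/* otherwise dispatch by version */
}

int main(void)
{
	struct array a = { .major_version = 1, .sync_override = NULL };
	do_sync(&a, 3);				/* prints "v1 sync of dev 3" */
	return 0;
}
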
1978 1978
1979 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1979 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1980 { 1980 {
1981 struct md_rdev *rdev, *rdev2; 1981 struct md_rdev *rdev, *rdev2;
1982 1982
1983 rcu_read_lock(); 1983 rcu_read_lock();
1984 rdev_for_each_rcu(rdev, mddev1) 1984 rdev_for_each_rcu(rdev, mddev1)
1985 rdev_for_each_rcu(rdev2, mddev2) 1985 rdev_for_each_rcu(rdev2, mddev2)
1986 if (rdev->bdev->bd_contains == 1986 if (rdev->bdev->bd_contains ==
1987 rdev2->bdev->bd_contains) { 1987 rdev2->bdev->bd_contains) {
1988 rcu_read_unlock(); 1988 rcu_read_unlock();
1989 return 1; 1989 return 1;
1990 } 1990 }
1991 rcu_read_unlock(); 1991 rcu_read_unlock();
1992 return 0; 1992 return 0;
1993 } 1993 }
1994 1994
1995 static LIST_HEAD(pending_raid_disks); 1995 static LIST_HEAD(pending_raid_disks);
1996 1996
1997 /* 1997 /*
1998 * Try to register data integrity profile for an mddev 1998 * Try to register data integrity profile for an mddev
1999 * 1999 *
2000 * This is called when an array is started and after a disk has been kicked 2000 * This is called when an array is started and after a disk has been kicked
2001 * from the array. It only succeeds if all working and active component devices 2001 * from the array. It only succeeds if all working and active component devices
2002 * are integrity capable with matching profiles. 2002 * are integrity capable with matching profiles.
2003 */ 2003 */
2004 int md_integrity_register(struct mddev *mddev) 2004 int md_integrity_register(struct mddev *mddev)
2005 { 2005 {
2006 struct md_rdev *rdev, *reference = NULL; 2006 struct md_rdev *rdev, *reference = NULL;
2007 2007
2008 if (list_empty(&mddev->disks)) 2008 if (list_empty(&mddev->disks))
2009 return 0; /* nothing to do */ 2009 return 0; /* nothing to do */
2010 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 2010 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
2011 return 0; /* shouldn't register, or already is */ 2011 return 0; /* shouldn't register, or already is */
2012 rdev_for_each(rdev, mddev) { 2012 rdev_for_each(rdev, mddev) {
2013 /* skip spares and non-functional disks */ 2013 /* skip spares and non-functional disks */
2014 if (test_bit(Faulty, &rdev->flags)) 2014 if (test_bit(Faulty, &rdev->flags))
2015 continue; 2015 continue;
2016 if (rdev->raid_disk < 0) 2016 if (rdev->raid_disk < 0)
2017 continue; 2017 continue;
2018 if (!reference) { 2018 if (!reference) {
2019 /* Use the first rdev as the reference */ 2019 /* Use the first rdev as the reference */
2020 reference = rdev; 2020 reference = rdev;
2021 continue; 2021 continue;
2022 } 2022 }
2023 /* does this rdev's profile match the reference profile? */ 2023 /* does this rdev's profile match the reference profile? */
2024 if (blk_integrity_compare(reference->bdev->bd_disk, 2024 if (blk_integrity_compare(reference->bdev->bd_disk,
2025 rdev->bdev->bd_disk) < 0) 2025 rdev->bdev->bd_disk) < 0)
2026 return -EINVAL; 2026 return -EINVAL;
2027 } 2027 }
2028 if (!reference || !bdev_get_integrity(reference->bdev)) 2028 if (!reference || !bdev_get_integrity(reference->bdev))
2029 return 0; 2029 return 0;
2030 /* 2030 /*
2031 * All component devices are integrity capable and have matching 2031 * All component devices are integrity capable and have matching
2032 * profiles, register the common profile for the md device. 2032 * profiles, register the common profile for the md device.
2033 */ 2033 */
2034 if (blk_integrity_register(mddev->gendisk, 2034 if (blk_integrity_register(mddev->gendisk,
2035 bdev_get_integrity(reference->bdev)) != 0) { 2035 bdev_get_integrity(reference->bdev)) != 0) {
2036 printk(KERN_ERR "md: failed to register integrity for %s\n", 2036 printk(KERN_ERR "md: failed to register integrity for %s\n",
2037 mdname(mddev)); 2037 mdname(mddev));
2038 return -EINVAL; 2038 return -EINVAL;
2039 } 2039 }
2040 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2040 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
2041 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2041 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2042 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2042 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
2043 mdname(mddev)); 2043 mdname(mddev));
2044 return -EINVAL; 2044 return -EINVAL;
2045 } 2045 }
2046 return 0; 2046 return 0;
2047 } 2047 }
2048 EXPORT_SYMBOL(md_integrity_register); 2048 EXPORT_SYMBOL(md_integrity_register);
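
md_integrity_register() above takes the first active member as the reference profile and requires every other active member to match it. A simplified standalone sketch of that reference check (userspace C; the profile strings and helper name are illustrative, and the spare-skipping detail is omitted):

#include <stdio.h>
#include <string.h>

struct member {
	const char *name;
	const char *profile;	/* NULL: device has no integrity support */
	int faulty;
};

static int register_common_profile(const struct member *m, int n)
{
	const struct member *ref = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (m[i].faulty)
			continue;			/* skip failed members */
		if (!ref) {
			ref = &m[i];			/* first usable member is the reference */
			continue;
		}
		if (!ref->profile || !m[i].profile ||
		    strcmp(ref->profile, m[i].profile) != 0)
			return -1;			/* non-capable or mismatched: refuse */
	}
	if (!ref || !ref->profile)
		return 0;				/* nothing to register */
	printf("registering common profile: %s\n", ref->profile);
	return 0;
}

int main(void)
{
	const struct member disks[] = {
		{ "sda1", "T10-DIF-TYPE1-CRC", 0 },
		{ "sdb1", "T10-DIF-TYPE1-CRC", 0 },
		{ "sdc1", "T10-DIF-TYPE1-CRC", 1 },	/* faulty: ignored */
	};
	return register_common_profile(disks, 3) ? 1 : 0;
}
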
2049 2049
2050 /* Disable data integrity if non-capable/non-matching disk is being added */ 2050 /* Disable data integrity if non-capable/non-matching disk is being added */
2051 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2051 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2052 { 2052 {
2053 struct blk_integrity *bi_rdev; 2053 struct blk_integrity *bi_rdev;
2054 struct blk_integrity *bi_mddev; 2054 struct blk_integrity *bi_mddev;
2055 2055
2056 if (!mddev->gendisk) 2056 if (!mddev->gendisk)
2057 return; 2057 return;
2058 2058
2059 bi_rdev = bdev_get_integrity(rdev->bdev); 2059 bi_rdev = bdev_get_integrity(rdev->bdev);
2060 bi_mddev = blk_get_integrity(mddev->gendisk); 2060 bi_mddev = blk_get_integrity(mddev->gendisk);
2061 2061
2062 if (!bi_mddev) /* nothing to do */ 2062 if (!bi_mddev) /* nothing to do */
2063 return; 2063 return;
2064 if (rdev->raid_disk < 0) /* skip spares */ 2064 if (rdev->raid_disk < 0) /* skip spares */
2065 return; 2065 return;
2066 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 2066 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2067 rdev->bdev->bd_disk) >= 0) 2067 rdev->bdev->bd_disk) >= 0)
2068 return; 2068 return;
2069 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 2069 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2070 blk_integrity_unregister(mddev->gendisk); 2070 blk_integrity_unregister(mddev->gendisk);
2071 } 2071 }
2072 EXPORT_SYMBOL(md_integrity_add_rdev); 2072 EXPORT_SYMBOL(md_integrity_add_rdev);
2073 2073
2074 static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) 2074 static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
2075 { 2075 {
2076 char b[BDEVNAME_SIZE]; 2076 char b[BDEVNAME_SIZE];
2077 struct kobject *ko; 2077 struct kobject *ko;
2078 char *s; 2078 char *s;
2079 int err; 2079 int err;
2080 2080
2081 if (rdev->mddev) { 2081 if (rdev->mddev) {
2082 MD_BUG(); 2082 MD_BUG();
2083 return -EINVAL; 2083 return -EINVAL;
2084 } 2084 }
2085 2085
2086 /* prevent duplicates */ 2086 /* prevent duplicates */
2087 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2087 if (find_rdev(mddev, rdev->bdev->bd_dev))
2088 return -EEXIST; 2088 return -EEXIST;
2089 2089
2090 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2090 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2091 if (rdev->sectors && (mddev->dev_sectors == 0 || 2091 if (rdev->sectors && (mddev->dev_sectors == 0 ||
2092 rdev->sectors < mddev->dev_sectors)) { 2092 rdev->sectors < mddev->dev_sectors)) {
2093 if (mddev->pers) { 2093 if (mddev->pers) {
2094 /* Cannot change size, so fail 2094 /* Cannot change size, so fail
2095 * If mddev->level <= 0, then we don't care 2095 * If mddev->level <= 0, then we don't care
2096 * about aligning sizes (e.g. linear) 2096 * about aligning sizes (e.g. linear)
2097 */ 2097 */
2098 if (mddev->level > 0) 2098 if (mddev->level > 0)
2099 return -ENOSPC; 2099 return -ENOSPC;
2100 } else 2100 } else
2101 mddev->dev_sectors = rdev->sectors; 2101 mddev->dev_sectors = rdev->sectors;
2102 } 2102 }
2103 2103
2104 /* Verify rdev->desc_nr is unique. 2104 /* Verify rdev->desc_nr is unique.
2105 * If it is -1, assign a free number, else 2105 * If it is -1, assign a free number, else
2106 * check number is not in use 2106 * check number is not in use
2107 */ 2107 */
2108 if (rdev->desc_nr < 0) { 2108 if (rdev->desc_nr < 0) {
2109 int choice = 0; 2109 int choice = 0;
2110 if (mddev->pers) choice = mddev->raid_disks; 2110 if (mddev->pers) choice = mddev->raid_disks;
2111 while (find_rdev_nr(mddev, choice)) 2111 while (find_rdev_nr(mddev, choice))
2112 choice++; 2112 choice++;
2113 rdev->desc_nr = choice; 2113 rdev->desc_nr = choice;
2114 } else { 2114 } else {
2115 if (find_rdev_nr(mddev, rdev->desc_nr)) 2115 if (find_rdev_nr(mddev, rdev->desc_nr))
2116 return -EBUSY; 2116 return -EBUSY;
2117 } 2117 }
2118 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2118 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2119 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2119 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2120 mdname(mddev), mddev->max_disks); 2120 mdname(mddev), mddev->max_disks);
2121 return -EBUSY; 2121 return -EBUSY;
2122 } 2122 }
2123 bdevname(rdev->bdev,b); 2123 bdevname(rdev->bdev,b);
2124 while ( (s=strchr(b, '/')) != NULL) 2124 while ( (s=strchr(b, '/')) != NULL)
2125 *s = '!'; 2125 *s = '!';
2126 2126
2127 rdev->mddev = mddev; 2127 rdev->mddev = mddev;
2128 printk(KERN_INFO "md: bind<%s>\n", b); 2128 printk(KERN_INFO "md: bind<%s>\n", b);
2129 2129
2130 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2130 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2131 goto fail; 2131 goto fail;
2132 2132
2133 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2133 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2134 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2134 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2135 /* failure here is OK */; 2135 /* failure here is OK */;
2136 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2136 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2137 2137
2138 list_add_rcu(&rdev->same_set, &mddev->disks); 2138 list_add_rcu(&rdev->same_set, &mddev->disks);
2139 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2139 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2140 2140
2141 /* May as well allow recovery to be retried once */ 2141 /* May as well allow recovery to be retried once */
2142 mddev->recovery_disabled++; 2142 mddev->recovery_disabled++;
2143 2143
2144 return 0; 2144 return 0;
2145 2145
2146 fail: 2146 fail:
2147 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2147 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2148 b, mdname(mddev)); 2148 b, mdname(mddev));
2149 return err; 2149 return err;
2150 } 2150 }
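
bind_rdev_to_array() above assigns rdev->desc_nr by walking upward from a starting point until a free number is found, or rejects a caller-supplied number that is already taken. A small standalone sketch of that slot-assignment logic (userspace C; the fixed-size table is an assumption made only to keep the example self-contained):

#include <stdio.h>

#define MAX_SLOTS 16

static int in_use[MAX_SLOTS];		/* 1 if some member already owns the number */

static int assign_desc_nr(int wanted, int start)
{
	int nr;

	if (wanted < 0) {		/* "-1" means: pick a free number for me */
		for (nr = start; nr < MAX_SLOTS && in_use[nr]; nr++)
			;
		if (nr == MAX_SLOTS)
			return -1;	/* table full (only a limit of this sketch) */
	} else {
		if (in_use[wanted])
			return -1;	/* like the -EBUSY case above */
		nr = wanted;
	}
	in_use[nr] = 1;
	return nr;
}

int main(void)
{
	printf("%d\n", assign_desc_nr(-1, 0));	/* 0 */
	printf("%d\n", assign_desc_nr(-1, 0));	/* 1 */
	printf("%d\n", assign_desc_nr(1, 0));	/* -1: already taken */
	printf("%d\n", assign_desc_nr(5, 0));	/* 5 */
	return 0;
}
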
2151 2151
2152 static void md_delayed_delete(struct work_struct *ws) 2152 static void md_delayed_delete(struct work_struct *ws)
2153 { 2153 {
2154 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2154 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2155 kobject_del(&rdev->kobj); 2155 kobject_del(&rdev->kobj);
2156 kobject_put(&rdev->kobj); 2156 kobject_put(&rdev->kobj);
2157 } 2157 }
2158 2158
2159 static void unbind_rdev_from_array(struct md_rdev * rdev) 2159 static void unbind_rdev_from_array(struct md_rdev * rdev)
2160 { 2160 {
2161 char b[BDEVNAME_SIZE]; 2161 char b[BDEVNAME_SIZE];
2162 if (!rdev->mddev) { 2162 if (!rdev->mddev) {
2163 MD_BUG(); 2163 MD_BUG();
2164 return; 2164 return;
2165 } 2165 }
2166 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2166 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2167 list_del_rcu(&rdev->same_set); 2167 list_del_rcu(&rdev->same_set);
2168 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2168 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2169 rdev->mddev = NULL; 2169 rdev->mddev = NULL;
2170 sysfs_remove_link(&rdev->kobj, "block"); 2170 sysfs_remove_link(&rdev->kobj, "block");
2171 sysfs_put(rdev->sysfs_state); 2171 sysfs_put(rdev->sysfs_state);
2172 rdev->sysfs_state = NULL; 2172 rdev->sysfs_state = NULL;
2173 rdev->badblocks.count = 0; 2173 rdev->badblocks.count = 0;
2174 /* We need to delay this, otherwise we can deadlock when 2174 /* We need to delay this, otherwise we can deadlock when
2175 * writing 'remove' to "dev/state". We also need 2175 * writing 'remove' to "dev/state". We also need
2176 * to delay it due to rcu usage. 2176 * to delay it due to rcu usage.
2177 */ 2177 */
2178 synchronize_rcu(); 2178 synchronize_rcu();
2179 INIT_WORK(&rdev->del_work, md_delayed_delete); 2179 INIT_WORK(&rdev->del_work, md_delayed_delete);
2180 kobject_get(&rdev->kobj); 2180 kobject_get(&rdev->kobj);
2181 queue_work(md_misc_wq, &rdev->del_work); 2181 queue_work(md_misc_wq, &rdev->del_work);
2182 } 2182 }
2183 2183
2184 /* 2184 /*
2185 * prevent the device from being mounted, repartitioned or 2185 * prevent the device from being mounted, repartitioned or
2186 * otherwise reused by a RAID array (or any other kernel 2186 * otherwise reused by a RAID array (or any other kernel
2187 * subsystem), by bd_claiming the device. 2187 * subsystem), by bd_claiming the device.
2188 */ 2188 */
2189 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2189 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2190 { 2190 {
2191 int err = 0; 2191 int err = 0;
2192 struct block_device *bdev; 2192 struct block_device *bdev;
2193 char b[BDEVNAME_SIZE]; 2193 char b[BDEVNAME_SIZE];
2194 2194
2195 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2195 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2196 shared ? (struct md_rdev *)lock_rdev : rdev); 2196 shared ? (struct md_rdev *)lock_rdev : rdev);
2197 if (IS_ERR(bdev)) { 2197 if (IS_ERR(bdev)) {
2198 printk(KERN_ERR "md: could not open %s.\n", 2198 printk(KERN_ERR "md: could not open %s.\n",
2199 __bdevname(dev, b)); 2199 __bdevname(dev, b));
2200 return PTR_ERR(bdev); 2200 return PTR_ERR(bdev);
2201 } 2201 }
2202 rdev->bdev = bdev; 2202 rdev->bdev = bdev;
2203 return err; 2203 return err;
2204 } 2204 }
2205 2205
2206 static void unlock_rdev(struct md_rdev *rdev) 2206 static void unlock_rdev(struct md_rdev *rdev)
2207 { 2207 {
2208 struct block_device *bdev = rdev->bdev; 2208 struct block_device *bdev = rdev->bdev;
2209 rdev->bdev = NULL; 2209 rdev->bdev = NULL;
2210 if (!bdev) 2210 if (!bdev)
2211 MD_BUG(); 2211 MD_BUG();
2212 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2212 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2213 } 2213 }
2214 2214
2215 void md_autodetect_dev(dev_t dev); 2215 void md_autodetect_dev(dev_t dev);
2216 2216
2217 static void export_rdev(struct md_rdev * rdev) 2217 static void export_rdev(struct md_rdev * rdev)
2218 { 2218 {
2219 char b[BDEVNAME_SIZE]; 2219 char b[BDEVNAME_SIZE];
2220 printk(KERN_INFO "md: export_rdev(%s)\n", 2220 printk(KERN_INFO "md: export_rdev(%s)\n",
2221 bdevname(rdev->bdev,b)); 2221 bdevname(rdev->bdev,b));
2222 if (rdev->mddev) 2222 if (rdev->mddev)
2223 MD_BUG(); 2223 MD_BUG();
2224 md_rdev_clear(rdev); 2224 md_rdev_clear(rdev);
2225 #ifndef MODULE 2225 #ifndef MODULE
2226 if (test_bit(AutoDetected, &rdev->flags)) 2226 if (test_bit(AutoDetected, &rdev->flags))
2227 md_autodetect_dev(rdev->bdev->bd_dev); 2227 md_autodetect_dev(rdev->bdev->bd_dev);
2228 #endif 2228 #endif
2229 unlock_rdev(rdev); 2229 unlock_rdev(rdev);
2230 kobject_put(&rdev->kobj); 2230 kobject_put(&rdev->kobj);
2231 } 2231 }
2232 2232
2233 static void kick_rdev_from_array(struct md_rdev * rdev) 2233 static void kick_rdev_from_array(struct md_rdev * rdev)
2234 { 2234 {
2235 unbind_rdev_from_array(rdev); 2235 unbind_rdev_from_array(rdev);
2236 export_rdev(rdev); 2236 export_rdev(rdev);
2237 } 2237 }
2238 2238
2239 static void export_array(struct mddev *mddev) 2239 static void export_array(struct mddev *mddev)
2240 { 2240 {
2241 struct md_rdev *rdev, *tmp; 2241 struct md_rdev *rdev, *tmp;
2242 2242
2243 rdev_for_each_safe(rdev, tmp, mddev) { 2243 rdev_for_each_safe(rdev, tmp, mddev) {
2244 if (!rdev->mddev) { 2244 if (!rdev->mddev) {
2245 MD_BUG(); 2245 MD_BUG();
2246 continue; 2246 continue;
2247 } 2247 }
2248 kick_rdev_from_array(rdev); 2248 kick_rdev_from_array(rdev);
2249 } 2249 }
2250 if (!list_empty(&mddev->disks)) 2250 if (!list_empty(&mddev->disks))
2251 MD_BUG(); 2251 MD_BUG();
2252 mddev->raid_disks = 0; 2252 mddev->raid_disks = 0;
2253 mddev->major_version = 0; 2253 mddev->major_version = 0;
2254 } 2254 }
2255 2255
2256 static void print_desc(mdp_disk_t *desc) 2256 static void print_desc(mdp_disk_t *desc)
2257 { 2257 {
2258 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 2258 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
2259 desc->major,desc->minor,desc->raid_disk,desc->state); 2259 desc->major,desc->minor,desc->raid_disk,desc->state);
2260 } 2260 }
2261 2261
2262 static void print_sb_90(mdp_super_t *sb) 2262 static void print_sb_90(mdp_super_t *sb)
2263 { 2263 {
2264 int i; 2264 int i;
2265 2265
2266 printk(KERN_INFO 2266 printk(KERN_INFO
2267 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 2267 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2268 sb->major_version, sb->minor_version, sb->patch_version, 2268 sb->major_version, sb->minor_version, sb->patch_version,
2269 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 2269 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2270 sb->ctime); 2270 sb->ctime);
2271 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 2271 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
2272 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 2272 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
2273 sb->md_minor, sb->layout, sb->chunk_size); 2273 sb->md_minor, sb->layout, sb->chunk_size);
2274 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 2274 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
2275 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 2275 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
2276 sb->utime, sb->state, sb->active_disks, sb->working_disks, 2276 sb->utime, sb->state, sb->active_disks, sb->working_disks,
2277 sb->failed_disks, sb->spare_disks, 2277 sb->failed_disks, sb->spare_disks,
2278 sb->sb_csum, (unsigned long)sb->events_lo); 2278 sb->sb_csum, (unsigned long)sb->events_lo);
2279 2279
2280 printk(KERN_INFO); 2280 printk(KERN_INFO);
2281 for (i = 0; i < MD_SB_DISKS; i++) { 2281 for (i = 0; i < MD_SB_DISKS; i++) {
2282 mdp_disk_t *desc; 2282 mdp_disk_t *desc;
2283 2283
2284 desc = sb->disks + i; 2284 desc = sb->disks + i;
2285 if (desc->number || desc->major || desc->minor || 2285 if (desc->number || desc->major || desc->minor ||
2286 desc->raid_disk || (desc->state && (desc->state != 4))) { 2286 desc->raid_disk || (desc->state && (desc->state != 4))) {
2287 printk(" D %2d: ", i); 2287 printk(" D %2d: ", i);
2288 print_desc(desc); 2288 print_desc(desc);
2289 } 2289 }
2290 } 2290 }
2291 printk(KERN_INFO "md: THIS: "); 2291 printk(KERN_INFO "md: THIS: ");
2292 print_desc(&sb->this_disk); 2292 print_desc(&sb->this_disk);
2293 } 2293 }
2294 2294
2295 static void print_sb_1(struct mdp_superblock_1 *sb) 2295 static void print_sb_1(struct mdp_superblock_1 *sb)
2296 { 2296 {
2297 __u8 *uuid; 2297 __u8 *uuid;
2298 2298
2299 uuid = sb->set_uuid; 2299 uuid = sb->set_uuid;
2300 printk(KERN_INFO 2300 printk(KERN_INFO
2301 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" 2301 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
2302 "md: Name: \"%s\" CT:%llu\n", 2302 "md: Name: \"%s\" CT:%llu\n",
2303 le32_to_cpu(sb->major_version), 2303 le32_to_cpu(sb->major_version),
2304 le32_to_cpu(sb->feature_map), 2304 le32_to_cpu(sb->feature_map),
2305 uuid, 2305 uuid,
2306 sb->set_name, 2306 sb->set_name,
2307 (unsigned long long)le64_to_cpu(sb->ctime) 2307 (unsigned long long)le64_to_cpu(sb->ctime)
2308 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 2308 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
2309 2309
2310 uuid = sb->device_uuid; 2310 uuid = sb->device_uuid;
2311 printk(KERN_INFO 2311 printk(KERN_INFO
2312 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 2312 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
2313 " RO:%llu\n" 2313 " RO:%llu\n"
2314 "md: Dev:%08x UUID: %pU\n" 2314 "md: Dev:%08x UUID: %pU\n"
2315 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 2315 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
2316 "md: (MaxDev:%u) \n", 2316 "md: (MaxDev:%u) \n",
2317 le32_to_cpu(sb->level), 2317 le32_to_cpu(sb->level),
2318 (unsigned long long)le64_to_cpu(sb->size), 2318 (unsigned long long)le64_to_cpu(sb->size),
2319 le32_to_cpu(sb->raid_disks), 2319 le32_to_cpu(sb->raid_disks),
2320 le32_to_cpu(sb->layout), 2320 le32_to_cpu(sb->layout),
2321 le32_to_cpu(sb->chunksize), 2321 le32_to_cpu(sb->chunksize),
2322 (unsigned long long)le64_to_cpu(sb->data_offset), 2322 (unsigned long long)le64_to_cpu(sb->data_offset),
2323 (unsigned long long)le64_to_cpu(sb->data_size), 2323 (unsigned long long)le64_to_cpu(sb->data_size),
2324 (unsigned long long)le64_to_cpu(sb->super_offset), 2324 (unsigned long long)le64_to_cpu(sb->super_offset),
2325 (unsigned long long)le64_to_cpu(sb->recovery_offset), 2325 (unsigned long long)le64_to_cpu(sb->recovery_offset),
2326 le32_to_cpu(sb->dev_number), 2326 le32_to_cpu(sb->dev_number),
2327 uuid, 2327 uuid,
2328 sb->devflags, 2328 sb->devflags,
2329 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 2329 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
2330 (unsigned long long)le64_to_cpu(sb->events), 2330 (unsigned long long)le64_to_cpu(sb->events),
2331 (unsigned long long)le64_to_cpu(sb->resync_offset), 2331 (unsigned long long)le64_to_cpu(sb->resync_offset),
2332 le32_to_cpu(sb->sb_csum), 2332 le32_to_cpu(sb->sb_csum),
2333 le32_to_cpu(sb->max_dev) 2333 le32_to_cpu(sb->max_dev)
2334 ); 2334 );
2335 } 2335 }
2336 2336
2337 static void print_rdev(struct md_rdev *rdev, int major_version) 2337 static void print_rdev(struct md_rdev *rdev, int major_version)
2338 { 2338 {
2339 char b[BDEVNAME_SIZE]; 2339 char b[BDEVNAME_SIZE];
2340 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 2340 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
2341 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 2341 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
2342 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 2342 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
2343 rdev->desc_nr); 2343 rdev->desc_nr);
2344 if (rdev->sb_loaded) { 2344 if (rdev->sb_loaded) {
2345 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2345 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2346 switch (major_version) { 2346 switch (major_version) {
2347 case 0: 2347 case 0:
2348 print_sb_90(page_address(rdev->sb_page)); 2348 print_sb_90(page_address(rdev->sb_page));
2349 break; 2349 break;
2350 case 1: 2350 case 1:
2351 print_sb_1(page_address(rdev->sb_page)); 2351 print_sb_1(page_address(rdev->sb_page));
2352 break; 2352 break;
2353 } 2353 }
2354 } else 2354 } else
2355 printk(KERN_INFO "md: no rdev superblock!\n"); 2355 printk(KERN_INFO "md: no rdev superblock!\n");
2356 } 2356 }
2357 2357
2358 static void md_print_devices(void) 2358 static void md_print_devices(void)
2359 { 2359 {
2360 struct list_head *tmp; 2360 struct list_head *tmp;
2361 struct md_rdev *rdev; 2361 struct md_rdev *rdev;
2362 struct mddev *mddev; 2362 struct mddev *mddev;
2363 char b[BDEVNAME_SIZE]; 2363 char b[BDEVNAME_SIZE];
2364 2364
2365 printk("\n"); 2365 printk("\n");
2366 printk("md: **********************************\n"); 2366 printk("md: **********************************\n");
2367 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 2367 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
2368 printk("md: **********************************\n"); 2368 printk("md: **********************************\n");
2369 for_each_mddev(mddev, tmp) { 2369 for_each_mddev(mddev, tmp) {
2370 2370
2371 if (mddev->bitmap) 2371 if (mddev->bitmap)
2372 bitmap_print_sb(mddev->bitmap); 2372 bitmap_print_sb(mddev->bitmap);
2373 else 2373 else
2374 printk("%s: ", mdname(mddev)); 2374 printk("%s: ", mdname(mddev));
2375 rdev_for_each(rdev, mddev) 2375 rdev_for_each(rdev, mddev)
2376 printk("<%s>", bdevname(rdev->bdev,b)); 2376 printk("<%s>", bdevname(rdev->bdev,b));
2377 printk("\n"); 2377 printk("\n");
2378 2378
2379 rdev_for_each(rdev, mddev) 2379 rdev_for_each(rdev, mddev)
2380 print_rdev(rdev, mddev->major_version); 2380 print_rdev(rdev, mddev->major_version);
2381 } 2381 }
2382 printk("md: **********************************\n"); 2382 printk("md: **********************************\n");
2383 printk("\n"); 2383 printk("\n");
2384 } 2384 }
2385 2385
2386 2386
2387 static void sync_sbs(struct mddev * mddev, int nospares) 2387 static void sync_sbs(struct mddev * mddev, int nospares)
2388 { 2388 {
2389 /* Update each superblock (in-memory image), but 2389 /* Update each superblock (in-memory image), but
2390 * if we are allowed to, skip spares which already 2390 * if we are allowed to, skip spares which already
2391 * have the right event counter, or have one earlier 2391 * have the right event counter, or have one earlier
2392 * (which would mean they aren't being marked as dirty 2392 * (which would mean they aren't being marked as dirty
2393 * with the rest of the array) 2393 * with the rest of the array)
2394 */ 2394 */
2395 struct md_rdev *rdev; 2395 struct md_rdev *rdev;
2396 rdev_for_each(rdev, mddev) { 2396 rdev_for_each(rdev, mddev) {
2397 if (rdev->sb_events == mddev->events || 2397 if (rdev->sb_events == mddev->events ||
2398 (nospares && 2398 (nospares &&
2399 rdev->raid_disk < 0 && 2399 rdev->raid_disk < 0 &&
2400 rdev->sb_events+1 == mddev->events)) { 2400 rdev->sb_events+1 == mddev->events)) {
2401 /* Don't update this superblock */ 2401 /* Don't update this superblock */
2402 rdev->sb_loaded = 2; 2402 rdev->sb_loaded = 2;
2403 } else { 2403 } else {
2404 sync_super(mddev, rdev); 2404 sync_super(mddev, rdev);
2405 rdev->sb_loaded = 1; 2405 rdev->sb_loaded = 1;
2406 } 2406 }
2407 } 2407 }
2408 } 2408 }
2409 2409
2410 static void md_update_sb(struct mddev * mddev, int force_change) 2410 static void md_update_sb(struct mddev * mddev, int force_change)
2411 { 2411 {
2412 struct md_rdev *rdev; 2412 struct md_rdev *rdev;
2413 int sync_req; 2413 int sync_req;
2414 int nospares = 0; 2414 int nospares = 0;
2415 int any_badblocks_changed = 0; 2415 int any_badblocks_changed = 0;
2416 2416
2417 if (mddev->ro) { 2417 if (mddev->ro) {
2418 if (force_change) 2418 if (force_change)
2419 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2419 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2420 return; 2420 return;
2421 } 2421 }
2422 repeat: 2422 repeat:
2423 /* First make sure individual recovery_offsets are correct */ 2423 /* First make sure individual recovery_offsets are correct */
2424 rdev_for_each(rdev, mddev) { 2424 rdev_for_each(rdev, mddev) {
2425 if (rdev->raid_disk >= 0 && 2425 if (rdev->raid_disk >= 0 &&
2426 mddev->delta_disks >= 0 && 2426 mddev->delta_disks >= 0 &&
2427 !test_bit(In_sync, &rdev->flags) && 2427 !test_bit(In_sync, &rdev->flags) &&
2428 mddev->curr_resync_completed > rdev->recovery_offset) 2428 mddev->curr_resync_completed > rdev->recovery_offset)
2429 rdev->recovery_offset = mddev->curr_resync_completed; 2429 rdev->recovery_offset = mddev->curr_resync_completed;
2430 2430
2431 } 2431 }
2432 if (!mddev->persistent) { 2432 if (!mddev->persistent) {
2433 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2433 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2434 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2434 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2435 if (!mddev->external) { 2435 if (!mddev->external) {
2436 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2436 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2437 rdev_for_each(rdev, mddev) { 2437 rdev_for_each(rdev, mddev) {
2438 if (rdev->badblocks.changed) { 2438 if (rdev->badblocks.changed) {
2439 rdev->badblocks.changed = 0; 2439 rdev->badblocks.changed = 0;
2440 md_ack_all_badblocks(&rdev->badblocks); 2440 md_ack_all_badblocks(&rdev->badblocks);
2441 md_error(mddev, rdev); 2441 md_error(mddev, rdev);
2442 } 2442 }
2443 clear_bit(Blocked, &rdev->flags); 2443 clear_bit(Blocked, &rdev->flags);
2444 clear_bit(BlockedBadBlocks, &rdev->flags); 2444 clear_bit(BlockedBadBlocks, &rdev->flags);
2445 wake_up(&rdev->blocked_wait); 2445 wake_up(&rdev->blocked_wait);
2446 } 2446 }
2447 } 2447 }
2448 wake_up(&mddev->sb_wait); 2448 wake_up(&mddev->sb_wait);
2449 return; 2449 return;
2450 } 2450 }
2451 2451
2452 spin_lock_irq(&mddev->write_lock); 2452 spin_lock_irq(&mddev->write_lock);
2453 2453
2454 mddev->utime = get_seconds(); 2454 mddev->utime = get_seconds();
2455 2455
2456 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2456 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2457 force_change = 1; 2457 force_change = 1;
2458 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2458 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2459 /* just a clean<-> dirty transition, possibly leave spares alone, 2459 /* just a clean<-> dirty transition, possibly leave spares alone,
2460 * though if events isn't the right even/odd, we will have to do 2460 * though if events isn't the right even/odd, we will have to do
2461 * spares after all 2461 * spares after all
2462 */ 2462 */
2463 nospares = 1; 2463 nospares = 1;
2464 if (force_change) 2464 if (force_change)
2465 nospares = 0; 2465 nospares = 0;
2466 if (mddev->degraded) 2466 if (mddev->degraded)
2467 /* If the array is degraded, then skipping spares is both 2467 /* If the array is degraded, then skipping spares is both
2468 * dangerous and fairly pointless. 2468 * dangerous and fairly pointless.
2469 * Dangerous because a device that was removed from the array 2469 * Dangerous because a device that was removed from the array
2470 * might have an event_count that still looks up-to-date, 2470 * might have an event_count that still looks up-to-date,
2471 * so it can be re-added without a resync. 2471 * so it can be re-added without a resync.
2472 * Pointless because if there are any spares to skip, 2472 * Pointless because if there are any spares to skip,
2473 * then a recovery will happen and soon that array won't 2473 * then a recovery will happen and soon that array won't
2474 * be degraded any more and the spare can go back to sleep then. 2474 * be degraded any more and the spare can go back to sleep then.
2475 */ 2475 */
2476 nospares = 0; 2476 nospares = 0;
2477 2477
2478 sync_req = mddev->in_sync; 2478 sync_req = mddev->in_sync;
2479 2479
2480 /* If this is just a dirty<->clean transition, and the array is clean 2480 /* If this is just a dirty<->clean transition, and the array is clean
2481 * and 'events' is odd, we can roll back to the previous clean state */ 2481 * and 'events' is odd, we can roll back to the previous clean state */
2482 if (nospares 2482 if (nospares
2483 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2483 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2484 && mddev->can_decrease_events 2484 && mddev->can_decrease_events
2485 && mddev->events != 1) { 2485 && mddev->events != 1) {
2486 mddev->events--; 2486 mddev->events--;
2487 mddev->can_decrease_events = 0; 2487 mddev->can_decrease_events = 0;
2488 } else { 2488 } else {
2489 /* otherwise we have to go forward and ... */ 2489 /* otherwise we have to go forward and ... */
2490 mddev->events ++; 2490 mddev->events ++;
2491 mddev->can_decrease_events = nospares; 2491 mddev->can_decrease_events = nospares;
2492 } 2492 }
2493 2493
2494 if (!mddev->events) { 2494 if (!mddev->events) {
2495 /* 2495 /*
2496 * oops, this 64-bit counter should never wrap. 2496 * oops, this 64-bit counter should never wrap.
2497 * Either we are in around ~1 trillion A.C., assuming 2497 * Either we are in around ~1 trillion A.C., assuming
2498 * 1 reboot per second, or we have a bug: 2498 * 1 reboot per second, or we have a bug:
2499 */ 2499 */
2500 MD_BUG(); 2500 MD_BUG();
2501 mddev->events --; 2501 mddev->events --;
2502 } 2502 }
2503 2503
2504 rdev_for_each(rdev, mddev) { 2504 rdev_for_each(rdev, mddev) {
2505 if (rdev->badblocks.changed) 2505 if (rdev->badblocks.changed)
2506 any_badblocks_changed++; 2506 any_badblocks_changed++;
2507 if (test_bit(Faulty, &rdev->flags)) 2507 if (test_bit(Faulty, &rdev->flags))
2508 set_bit(FaultRecorded, &rdev->flags); 2508 set_bit(FaultRecorded, &rdev->flags);
2509 } 2509 }
2510 2510
2511 sync_sbs(mddev, nospares); 2511 sync_sbs(mddev, nospares);
2512 spin_unlock_irq(&mddev->write_lock); 2512 spin_unlock_irq(&mddev->write_lock);
2513 2513
2514 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2514 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2515 mdname(mddev), mddev->in_sync); 2515 mdname(mddev), mddev->in_sync);
2516 2516
2517 bitmap_update_sb(mddev->bitmap); 2517 bitmap_update_sb(mddev->bitmap);
2518 rdev_for_each(rdev, mddev) { 2518 rdev_for_each(rdev, mddev) {
2519 char b[BDEVNAME_SIZE]; 2519 char b[BDEVNAME_SIZE];
2520 2520
2521 if (rdev->sb_loaded != 1) 2521 if (rdev->sb_loaded != 1)
2522 continue; /* no noise on spare devices */ 2522 continue; /* no noise on spare devices */
2523 2523
2524 if (!test_bit(Faulty, &rdev->flags) && 2524 if (!test_bit(Faulty, &rdev->flags) &&
2525 rdev->saved_raid_disk == -1) { 2525 rdev->saved_raid_disk == -1) {
2526 md_super_write(mddev,rdev, 2526 md_super_write(mddev,rdev,
2527 rdev->sb_start, rdev->sb_size, 2527 rdev->sb_start, rdev->sb_size,
2528 rdev->sb_page); 2528 rdev->sb_page);
2529 pr_debug("md: (write) %s's sb offset: %llu\n", 2529 pr_debug("md: (write) %s's sb offset: %llu\n",
2530 bdevname(rdev->bdev, b), 2530 bdevname(rdev->bdev, b),
2531 (unsigned long long)rdev->sb_start); 2531 (unsigned long long)rdev->sb_start);
2532 rdev->sb_events = mddev->events; 2532 rdev->sb_events = mddev->events;
2533 if (rdev->badblocks.size) { 2533 if (rdev->badblocks.size) {
2534 md_super_write(mddev, rdev, 2534 md_super_write(mddev, rdev,
2535 rdev->badblocks.sector, 2535 rdev->badblocks.sector,
2536 rdev->badblocks.size << 9, 2536 rdev->badblocks.size << 9,
2537 rdev->bb_page); 2537 rdev->bb_page);
2538 rdev->badblocks.size = 0; 2538 rdev->badblocks.size = 0;
2539 } 2539 }
2540 2540
2541 } else if (test_bit(Faulty, &rdev->flags)) 2541 } else if (test_bit(Faulty, &rdev->flags))
2542 pr_debug("md: %s (skipping faulty)\n", 2542 pr_debug("md: %s (skipping faulty)\n",
2543 bdevname(rdev->bdev, b)); 2543 bdevname(rdev->bdev, b));
2544 else 2544 else
2545 pr_debug("(skipping incremental s/r "); 2545 pr_debug("(skipping incremental s/r ");
2546 2546
2547 if (mddev->level == LEVEL_MULTIPATH) 2547 if (mddev->level == LEVEL_MULTIPATH)
2548 /* only need to write one superblock... */ 2548 /* only need to write one superblock... */
2549 break; 2549 break;
2550 } 2550 }
2551 md_super_wait(mddev); 2551 md_super_wait(mddev);
2552 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2552 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2553 2553
2554 spin_lock_irq(&mddev->write_lock); 2554 spin_lock_irq(&mddev->write_lock);
2555 if (mddev->in_sync != sync_req || 2555 if (mddev->in_sync != sync_req ||
2556 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2556 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2557 /* have to write it out again */ 2557 /* have to write it out again */
2558 spin_unlock_irq(&mddev->write_lock); 2558 spin_unlock_irq(&mddev->write_lock);
2559 goto repeat; 2559 goto repeat;
2560 } 2560 }
2561 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2561 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2562 spin_unlock_irq(&mddev->write_lock); 2562 spin_unlock_irq(&mddev->write_lock);
2563 wake_up(&mddev->sb_wait); 2563 wake_up(&mddev->sb_wait);
2564 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2564 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2565 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2565 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2566 2566
2567 rdev_for_each(rdev, mddev) { 2567 rdev_for_each(rdev, mddev) {
2568 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2568 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2569 clear_bit(Blocked, &rdev->flags); 2569 clear_bit(Blocked, &rdev->flags);
2570 2570
2571 if (any_badblocks_changed) 2571 if (any_badblocks_changed)
2572 md_ack_all_badblocks(&rdev->badblocks); 2572 md_ack_all_badblocks(&rdev->badblocks);
2573 clear_bit(BlockedBadBlocks, &rdev->flags); 2573 clear_bit(BlockedBadBlocks, &rdev->flags);
2574 wake_up(&rdev->blocked_wait); 2574 wake_up(&rdev->blocked_wait);
2575 } 2575 }
2576 } 2576 }
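
md_update_sb() above bumps mddev->events on every superblock write, except that a pure clean<->dirty flip may step the counter back by one so spares that skipped the previous bump stay consistent. A standalone sketch of just that counter rule (userspace C; field names are illustrative):

#include <stdio.h>

struct counter {
	unsigned long long events;
	int can_decrease;	/* set only after a spare-skipping update */
};

static void bump(struct counter *c, int nospares, int clean_and_synced)
{
	if (nospares && clean_and_synced && c->can_decrease && c->events != 1) {
		c->events--;		/* roll back to the previous clean state */
		c->can_decrease = 0;
	} else {
		c->events++;		/* anything else must move forward */
		c->can_decrease = nospares;
	}
}

int main(void)
{
	struct counter c = { .events = 10, .can_decrease = 0 };

	bump(&c, 1, 1);
	printf("%llu\n", c.events);	/* 11: first flip goes forward */
	bump(&c, 1, 1);
	printf("%llu\n", c.events);	/* 10: second flip rolls back */
	return 0;
}
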
2577 2577
2578 /* words written to sysfs files may, or may not, be \n terminated. 2578 /* words written to sysfs files may, or may not, be \n terminated.
2579 * We want to accept either form. For this we use cmd_match. 2579 * We want to accept either form. For this we use cmd_match.
2580 */ 2580 */
2581 static int cmd_match(const char *cmd, const char *str) 2581 static int cmd_match(const char *cmd, const char *str)
2582 { 2582 {
2583 /* See if cmd, written into a sysfs file, matches 2583 /* See if cmd, written into a sysfs file, matches
2584 * str. They must either be the same, or cmd can 2584 * str. They must either be the same, or cmd can
2585 * have a trailing newline 2585 * have a trailing newline
2586 */ 2586 */
2587 while (*cmd && *str && *cmd == *str) { 2587 while (*cmd && *str && *cmd == *str) {
2588 cmd++; 2588 cmd++;
2589 str++; 2589 str++;
2590 } 2590 }
2591 if (*cmd == '\n') 2591 if (*cmd == '\n')
2592 cmd++; 2592 cmd++;
2593 if (*str || *cmd) 2593 if (*str || *cmd)
2594 return 0; 2594 return 0;
2595 return 1; 2595 return 1;
2596 } 2596 }
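
cmd_match() above is self-contained enough to exercise directly; the sketch below copies it into a userspace test to show that exactly one trailing newline is tolerated and anything else must match byte for byte:

#include <stdio.h>

static int cmd_match(const char *cmd, const char *str)
{
	while (*cmd && *str && *cmd == *str) {
		cmd++;
		str++;
	}
	if (*cmd == '\n')
		cmd++;
	if (*str || *cmd)
		return 0;
	return 1;
}

int main(void)
{
	printf("%d\n", cmd_match("faulty\n", "faulty"));	/* 1: newline tolerated */
	printf("%d\n", cmd_match("faulty", "faulty"));		/* 1: exact match */
	printf("%d\n", cmd_match("fault", "faulty"));		/* 0: prefix only */
	printf("%d\n", cmd_match("faulty!", "faulty"));		/* 0: trailing junk */
	return 0;
}
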
2597 2597
2598 struct rdev_sysfs_entry { 2598 struct rdev_sysfs_entry {
2599 struct attribute attr; 2599 struct attribute attr;
2600 ssize_t (*show)(struct md_rdev *, char *); 2600 ssize_t (*show)(struct md_rdev *, char *);
2601 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2601 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2602 }; 2602 };
2603 2603
2604 static ssize_t 2604 static ssize_t
2605 state_show(struct md_rdev *rdev, char *page) 2605 state_show(struct md_rdev *rdev, char *page)
2606 { 2606 {
2607 char *sep = ""; 2607 char *sep = "";
2608 size_t len = 0; 2608 size_t len = 0;
2609 2609
2610 if (test_bit(Faulty, &rdev->flags) || 2610 if (test_bit(Faulty, &rdev->flags) ||
2611 rdev->badblocks.unacked_exist) { 2611 rdev->badblocks.unacked_exist) {
2612 len+= sprintf(page+len, "%sfaulty",sep); 2612 len+= sprintf(page+len, "%sfaulty",sep);
2613 sep = ","; 2613 sep = ",";
2614 } 2614 }
2615 if (test_bit(In_sync, &rdev->flags)) { 2615 if (test_bit(In_sync, &rdev->flags)) {
2616 len += sprintf(page+len, "%sin_sync",sep); 2616 len += sprintf(page+len, "%sin_sync",sep);
2617 sep = ","; 2617 sep = ",";
2618 } 2618 }
2619 if (test_bit(WriteMostly, &rdev->flags)) { 2619 if (test_bit(WriteMostly, &rdev->flags)) {
2620 len += sprintf(page+len, "%swrite_mostly",sep); 2620 len += sprintf(page+len, "%swrite_mostly",sep);
2621 sep = ","; 2621 sep = ",";
2622 } 2622 }
2623 if (test_bit(Blocked, &rdev->flags) || 2623 if (test_bit(Blocked, &rdev->flags) ||
2624 (rdev->badblocks.unacked_exist 2624 (rdev->badblocks.unacked_exist
2625 && !test_bit(Faulty, &rdev->flags))) { 2625 && !test_bit(Faulty, &rdev->flags))) {
2626 len += sprintf(page+len, "%sblocked", sep); 2626 len += sprintf(page+len, "%sblocked", sep);
2627 sep = ","; 2627 sep = ",";
2628 } 2628 }
2629 if (!test_bit(Faulty, &rdev->flags) && 2629 if (!test_bit(Faulty, &rdev->flags) &&
2630 !test_bit(In_sync, &rdev->flags)) { 2630 !test_bit(In_sync, &rdev->flags)) {
2631 len += sprintf(page+len, "%sspare", sep); 2631 len += sprintf(page+len, "%sspare", sep);
2632 sep = ","; 2632 sep = ",";
2633 } 2633 }
2634 if (test_bit(WriteErrorSeen, &rdev->flags)) { 2634 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2635 len += sprintf(page+len, "%swrite_error", sep); 2635 len += sprintf(page+len, "%swrite_error", sep);
2636 sep = ","; 2636 sep = ",";
2637 } 2637 }
2638 if (test_bit(WantReplacement, &rdev->flags)) { 2638 if (test_bit(WantReplacement, &rdev->flags)) {
2639 len += sprintf(page+len, "%swant_replacement", sep); 2639 len += sprintf(page+len, "%swant_replacement", sep);
2640 sep = ","; 2640 sep = ",";
2641 } 2641 }
2642 if (test_bit(Replacement, &rdev->flags)) { 2642 if (test_bit(Replacement, &rdev->flags)) {
2643 len += sprintf(page+len, "%sreplacement", sep); 2643 len += sprintf(page+len, "%sreplacement", sep);
2644 sep = ","; 2644 sep = ",";
2645 } 2645 }
2646 2646
2647 return len+sprintf(page+len, "\n"); 2647 return len+sprintf(page+len, "\n");
2648 } 2648 }
2649 2649
2650 static ssize_t 2650 static ssize_t
2651 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2651 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2652 { 2652 {
2653 /* can write 2653 /* can write
2654 * faulty - simulates an error 2654 * faulty - simulates an error
2655 * remove - disconnects the device 2655 * remove - disconnects the device
2656 * writemostly - sets write_mostly 2656 * writemostly - sets write_mostly
2657 * -writemostly - clears write_mostly 2657 * -writemostly - clears write_mostly
2658 * blocked - sets the Blocked flags 2658 * blocked - sets the Blocked flags
2659 * -blocked - clears the Blocked and possibly simulates an error 2659 * -blocked - clears the Blocked and possibly simulates an error
2660 * insync - sets Insync providing device isn't active 2660 * insync - sets Insync providing device isn't active
2661 * write_error - sets WriteErrorSeen 2661 * write_error - sets WriteErrorSeen
2662 * -write_error - clears WriteErrorSeen 2662 * -write_error - clears WriteErrorSeen
2663 */ 2663 */
2664 int err = -EINVAL; 2664 int err = -EINVAL;
2665 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2665 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2666 md_error(rdev->mddev, rdev); 2666 md_error(rdev->mddev, rdev);
2667 if (test_bit(Faulty, &rdev->flags)) 2667 if (test_bit(Faulty, &rdev->flags))
2668 err = 0; 2668 err = 0;
2669 else 2669 else
2670 err = -EBUSY; 2670 err = -EBUSY;
2671 } else if (cmd_match(buf, "remove")) { 2671 } else if (cmd_match(buf, "remove")) {
2672 if (rdev->raid_disk >= 0) 2672 if (rdev->raid_disk >= 0)
2673 err = -EBUSY; 2673 err = -EBUSY;
2674 else { 2674 else {
2675 struct mddev *mddev = rdev->mddev; 2675 struct mddev *mddev = rdev->mddev;
2676 kick_rdev_from_array(rdev); 2676 kick_rdev_from_array(rdev);
2677 if (mddev->pers) 2677 if (mddev->pers)
2678 md_update_sb(mddev, 1); 2678 md_update_sb(mddev, 1);
2679 md_new_event(mddev); 2679 md_new_event(mddev);
2680 err = 0; 2680 err = 0;
2681 } 2681 }
2682 } else if (cmd_match(buf, "writemostly")) { 2682 } else if (cmd_match(buf, "writemostly")) {
2683 set_bit(WriteMostly, &rdev->flags); 2683 set_bit(WriteMostly, &rdev->flags);
2684 err = 0; 2684 err = 0;
2685 } else if (cmd_match(buf, "-writemostly")) { 2685 } else if (cmd_match(buf, "-writemostly")) {
2686 clear_bit(WriteMostly, &rdev->flags); 2686 clear_bit(WriteMostly, &rdev->flags);
2687 err = 0; 2687 err = 0;
2688 } else if (cmd_match(buf, "blocked")) { 2688 } else if (cmd_match(buf, "blocked")) {
2689 set_bit(Blocked, &rdev->flags); 2689 set_bit(Blocked, &rdev->flags);
2690 err = 0; 2690 err = 0;
2691 } else if (cmd_match(buf, "-blocked")) { 2691 } else if (cmd_match(buf, "-blocked")) {
2692 if (!test_bit(Faulty, &rdev->flags) && 2692 if (!test_bit(Faulty, &rdev->flags) &&
2693 rdev->badblocks.unacked_exist) { 2693 rdev->badblocks.unacked_exist) {
2694 /* metadata handler doesn't understand badblocks, 2694 /* metadata handler doesn't understand badblocks,
2695 * so we need to fail the device 2695 * so we need to fail the device
2696 */ 2696 */
2697 md_error(rdev->mddev, rdev); 2697 md_error(rdev->mddev, rdev);
2698 } 2698 }
2699 clear_bit(Blocked, &rdev->flags); 2699 clear_bit(Blocked, &rdev->flags);
2700 clear_bit(BlockedBadBlocks, &rdev->flags); 2700 clear_bit(BlockedBadBlocks, &rdev->flags);
2701 wake_up(&rdev->blocked_wait); 2701 wake_up(&rdev->blocked_wait);
2702 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2702 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2703 md_wakeup_thread(rdev->mddev->thread); 2703 md_wakeup_thread(rdev->mddev->thread);
2704 2704
2705 err = 0; 2705 err = 0;
2706 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2706 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2707 set_bit(In_sync, &rdev->flags); 2707 set_bit(In_sync, &rdev->flags);
2708 err = 0; 2708 err = 0;
2709 } else if (cmd_match(buf, "write_error")) { 2709 } else if (cmd_match(buf, "write_error")) {
2710 set_bit(WriteErrorSeen, &rdev->flags); 2710 set_bit(WriteErrorSeen, &rdev->flags);
2711 err = 0; 2711 err = 0;
2712 } else if (cmd_match(buf, "-write_error")) { 2712 } else if (cmd_match(buf, "-write_error")) {
2713 clear_bit(WriteErrorSeen, &rdev->flags); 2713 clear_bit(WriteErrorSeen, &rdev->flags);
2714 err = 0; 2714 err = 0;
2715 } else if (cmd_match(buf, "want_replacement")) { 2715 } else if (cmd_match(buf, "want_replacement")) {
2716 /* Any non-spare device that is not a replacement can 2716 /* Any non-spare device that is not a replacement can
2717 * become want_replacement at any time, but we then need to 2717 * become want_replacement at any time, but we then need to
2718 * check if recovery is needed. 2718 * check if recovery is needed.
2719 */ 2719 */
2720 if (rdev->raid_disk >= 0 && 2720 if (rdev->raid_disk >= 0 &&
2721 !test_bit(Replacement, &rdev->flags)) 2721 !test_bit(Replacement, &rdev->flags))
2722 set_bit(WantReplacement, &rdev->flags); 2722 set_bit(WantReplacement, &rdev->flags);
2723 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2723 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2724 md_wakeup_thread(rdev->mddev->thread); 2724 md_wakeup_thread(rdev->mddev->thread);
2725 err = 0; 2725 err = 0;
2726 } else if (cmd_match(buf, "-want_replacement")) { 2726 } else if (cmd_match(buf, "-want_replacement")) {
2727 /* Clearing 'want_replacement' is always allowed. 2727 /* Clearing 'want_replacement' is always allowed.
2728 * Once replacement starts it is too late though. 2728 * Once replacement starts it is too late though.
2729 */ 2729 */
2730 err = 0; 2730 err = 0;
2731 clear_bit(WantReplacement, &rdev->flags); 2731 clear_bit(WantReplacement, &rdev->flags);
2732 } else if (cmd_match(buf, "replacement")) { 2732 } else if (cmd_match(buf, "replacement")) {
2733 /* Can only set a device as a replacement when array has not 2733 /* Can only set a device as a replacement when array has not
2734 * yet been started. Once running, replacement is automatic 2734 * yet been started. Once running, replacement is automatic
2735 * from spares, or by assigning 'slot'. 2735 * from spares, or by assigning 'slot'.
2736 */ 2736 */
2737 if (rdev->mddev->pers) 2737 if (rdev->mddev->pers)
2738 err = -EBUSY; 2738 err = -EBUSY;
2739 else { 2739 else {
2740 set_bit(Replacement, &rdev->flags); 2740 set_bit(Replacement, &rdev->flags);
2741 err = 0; 2741 err = 0;
2742 } 2742 }
2743 } else if (cmd_match(buf, "-replacement")) { 2743 } else if (cmd_match(buf, "-replacement")) {
2744 /* Similarly, can only clear Replacement before start */ 2744 /* Similarly, can only clear Replacement before start */
2745 if (rdev->mddev->pers) 2745 if (rdev->mddev->pers)
2746 err = -EBUSY; 2746 err = -EBUSY;
2747 else { 2747 else {
2748 clear_bit(Replacement, &rdev->flags); 2748 clear_bit(Replacement, &rdev->flags);
2749 err = 0; 2749 err = 0;
2750 } 2750 }
2751 } 2751 }
2752 if (!err) 2752 if (!err)
2753 sysfs_notify_dirent_safe(rdev->sysfs_state); 2753 sysfs_notify_dirent_safe(rdev->sysfs_state);
2754 return err ? err : len; 2754 return err ? err : len;
2755 } 2755 }
2756 static struct rdev_sysfs_entry rdev_state = 2756 static struct rdev_sysfs_entry rdev_state =
2757 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2757 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
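
state_store() above is reached through the per-rdev sysfs "state" attribute. A minimal userspace sketch of driving it (the device path below is illustrative and depends on the array and member names; any of the keywords listed in the comment above can be written):

#include <stdio.h>
#include <string.h>
#include <errno.h>

int main(void)
{
	/* Assumed path: adjust to the actual md array and member device. */
	const char *path = "/sys/block/md0/md/dev-sdb1/state";
	const char *cmd = "-blocked\n";		/* clear Blocked, wake waiters */
	FILE *f = fopen(path, "w");

	if (!f) {
		fprintf(stderr, "open %s: %s\n", path, strerror(errno));
		return 1;
	}
	if (fwrite(cmd, 1, strlen(cmd), f) != strlen(cmd))
		fprintf(stderr, "write failed\n");
	fclose(f);
	return 0;
}
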
2758 2758
2759 static ssize_t 2759 static ssize_t
2760 errors_show(struct md_rdev *rdev, char *page) 2760 errors_show(struct md_rdev *rdev, char *page)
2761 { 2761 {
2762 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2762 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2763 } 2763 }
2764 2764
2765 static ssize_t 2765 static ssize_t
2766 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2766 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2767 { 2767 {
2768 char *e; 2768 char *e;
2769 unsigned long n = simple_strtoul(buf, &e, 10); 2769 unsigned long n = simple_strtoul(buf, &e, 10);
2770 if (*buf && (*e == 0 || *e == '\n')) { 2770 if (*buf && (*e == 0 || *e == '\n')) {
2771 atomic_set(&rdev->corrected_errors, n); 2771 atomic_set(&rdev->corrected_errors, n);
2772 return len; 2772 return len;
2773 } 2773 }
2774 return -EINVAL; 2774 return -EINVAL;
2775 } 2775 }
2776 static struct rdev_sysfs_entry rdev_errors = 2776 static struct rdev_sysfs_entry rdev_errors =
2777 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2777 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2778 2778
2779 static ssize_t 2779 static ssize_t
2780 slot_show(struct md_rdev *rdev, char *page) 2780 slot_show(struct md_rdev *rdev, char *page)
2781 { 2781 {
2782 if (rdev->raid_disk < 0) 2782 if (rdev->raid_disk < 0)
2783 return sprintf(page, "none\n"); 2783 return sprintf(page, "none\n");
2784 else 2784 else
2785 return sprintf(page, "%d\n", rdev->raid_disk); 2785 return sprintf(page, "%d\n", rdev->raid_disk);
2786 } 2786 }
2787 2787
2788 static ssize_t 2788 static ssize_t
2789 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2789 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2790 { 2790 {
2791 char *e; 2791 char *e;
2792 int err; 2792 int err;
2793 int slot = simple_strtoul(buf, &e, 10); 2793 int slot = simple_strtoul(buf, &e, 10);
2794 if (strncmp(buf, "none", 4)==0) 2794 if (strncmp(buf, "none", 4)==0)
2795 slot = -1; 2795 slot = -1;
2796 else if (e==buf || (*e && *e!= '\n')) 2796 else if (e==buf || (*e && *e!= '\n'))
2797 return -EINVAL; 2797 return -EINVAL;
2798 if (rdev->mddev->pers && slot == -1) { 2798 if (rdev->mddev->pers && slot == -1) {
2799 /* Setting 'slot' on an active array requires also 2799 /* Setting 'slot' on an active array requires also
2800 * updating the 'rd%d' link, and communicating 2800 * updating the 'rd%d' link, and communicating
2801 * with the personality with ->hot_*_disk. 2801 * with the personality with ->hot_*_disk.
2802 * For now we only support removing 2802 * For now we only support removing
2803 * failed/spare devices. This normally happens automatically, 2803 * failed/spare devices. This normally happens automatically,
2804 * but not when the metadata is externally managed. 2804 * but not when the metadata is externally managed.
2805 */ 2805 */
2806 if (rdev->raid_disk == -1) 2806 if (rdev->raid_disk == -1)
2807 return -EEXIST; 2807 return -EEXIST;
2808 /* personality does all needed checks */ 2808 /* personality does all needed checks */
2809 if (rdev->mddev->pers->hot_remove_disk == NULL) 2809 if (rdev->mddev->pers->hot_remove_disk == NULL)
2810 return -EINVAL; 2810 return -EINVAL;
2811 clear_bit(Blocked, &rdev->flags); 2811 clear_bit(Blocked, &rdev->flags);
2812 remove_and_add_spares(rdev->mddev, rdev); 2812 remove_and_add_spares(rdev->mddev, rdev);
2813 if (rdev->raid_disk >= 0) 2813 if (rdev->raid_disk >= 0)
2814 return -EBUSY; 2814 return -EBUSY;
2815 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2815 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2816 md_wakeup_thread(rdev->mddev->thread); 2816 md_wakeup_thread(rdev->mddev->thread);
2817 } else if (rdev->mddev->pers) { 2817 } else if (rdev->mddev->pers) {
2818 /* Activating a spare .. or possibly reactivating 2818 /* Activating a spare .. or possibly reactivating
2819 * if we ever get bitmaps working here. 2819 * if we ever get bitmaps working here.
2820 */ 2820 */
2821 2821
2822 if (rdev->raid_disk != -1) 2822 if (rdev->raid_disk != -1)
2823 return -EBUSY; 2823 return -EBUSY;
2824 2824
2825 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2825 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2826 return -EBUSY; 2826 return -EBUSY;
2827 2827
2828 if (rdev->mddev->pers->hot_add_disk == NULL) 2828 if (rdev->mddev->pers->hot_add_disk == NULL)
2829 return -EINVAL; 2829 return -EINVAL;
2830 2830
2831 if (slot >= rdev->mddev->raid_disks && 2831 if (slot >= rdev->mddev->raid_disks &&
2832 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2832 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2833 return -ENOSPC; 2833 return -ENOSPC;
2834 2834
2835 rdev->raid_disk = slot; 2835 rdev->raid_disk = slot;
2836 if (test_bit(In_sync, &rdev->flags)) 2836 if (test_bit(In_sync, &rdev->flags))
2837 rdev->saved_raid_disk = slot; 2837 rdev->saved_raid_disk = slot;
2838 else 2838 else
2839 rdev->saved_raid_disk = -1; 2839 rdev->saved_raid_disk = -1;
2840 clear_bit(In_sync, &rdev->flags); 2840 clear_bit(In_sync, &rdev->flags);
2841 err = rdev->mddev->pers-> 2841 err = rdev->mddev->pers->
2842 hot_add_disk(rdev->mddev, rdev); 2842 hot_add_disk(rdev->mddev, rdev);
2843 if (err) { 2843 if (err) {
2844 rdev->raid_disk = -1; 2844 rdev->raid_disk = -1;
2845 return err; 2845 return err;
2846 } else 2846 } else
2847 sysfs_notify_dirent_safe(rdev->sysfs_state); 2847 sysfs_notify_dirent_safe(rdev->sysfs_state);
2848 if (sysfs_link_rdev(rdev->mddev, rdev)) 2848 if (sysfs_link_rdev(rdev->mddev, rdev))
2849 /* failure here is OK */; 2849 /* failure here is OK */;
2850 /* don't wakeup anyone, leave that to userspace. */ 2850 /* don't wakeup anyone, leave that to userspace. */
2851 } else { 2851 } else {
2852 if (slot >= rdev->mddev->raid_disks && 2852 if (slot >= rdev->mddev->raid_disks &&
2853 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2853 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2854 return -ENOSPC; 2854 return -ENOSPC;
2855 rdev->raid_disk = slot; 2855 rdev->raid_disk = slot;
2856 /* assume it is working */ 2856 /* assume it is working */
2857 clear_bit(Faulty, &rdev->flags); 2857 clear_bit(Faulty, &rdev->flags);
2858 clear_bit(WriteMostly, &rdev->flags); 2858 clear_bit(WriteMostly, &rdev->flags);
2859 set_bit(In_sync, &rdev->flags); 2859 set_bit(In_sync, &rdev->flags);
2860 sysfs_notify_dirent_safe(rdev->sysfs_state); 2860 sysfs_notify_dirent_safe(rdev->sysfs_state);
2861 } 2861 }
2862 return len; 2862 return len;
2863 } 2863 }
2864 2864
2865 2865
2866 static struct rdev_sysfs_entry rdev_slot = 2866 static struct rdev_sysfs_entry rdev_slot =
2867 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2867 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
2868 2868
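Editor's note: slot_store() above takes three distinct paths. On an active array, writing "none" (slot = -1) removes a failed or spare device through remove_and_add_spares() provided the personality implements hot_remove_disk(); writing a number hot-adds the device via hot_add_disk(); on an inactive array the slot is simply recorded and the device marked In_sync. A minimal userspace sketch of driving the attribute is below; it is not part of this commit and the sysfs path is illustrative only.

/* Hypothetical userspace sketch: ask md to drop an rdev from its slot by
 * writing "none" to the per-device attribute handled by slot_store() above.
 * The path below assumes the usual /sys/block/mdX/md/dev-YYY layout. */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/block/md0/md/dev-sda1/slot";	/* illustrative path */
	int fd = open(attr, O_WRONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* slot_store() matches the literal string "none" and maps it to slot = -1 */
	if (write(fd, "none", strlen("none")) < 0)
		perror("write");
	close(fd);
	return 0;
}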
2869 static ssize_t 2869 static ssize_t
2870 offset_show(struct md_rdev *rdev, char *page) 2870 offset_show(struct md_rdev *rdev, char *page)
2871 { 2871 {
2872 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2872 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2873 } 2873 }
2874 2874
2875 static ssize_t 2875 static ssize_t
2876 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2876 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2877 { 2877 {
2878 unsigned long long offset; 2878 unsigned long long offset;
2879 if (strict_strtoull(buf, 10, &offset) < 0) 2879 if (strict_strtoull(buf, 10, &offset) < 0)
2880 return -EINVAL; 2880 return -EINVAL;
2881 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2881 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2882 return -EBUSY; 2882 return -EBUSY;
2883 if (rdev->sectors && rdev->mddev->external) 2883 if (rdev->sectors && rdev->mddev->external)
2884 /* Must set offset before size, so overlap checks 2884 /* Must set offset before size, so overlap checks
2885 * can be sane */ 2885 * can be sane */
2886 return -EBUSY; 2886 return -EBUSY;
2887 rdev->data_offset = offset; 2887 rdev->data_offset = offset;
2888 rdev->new_data_offset = offset; 2888 rdev->new_data_offset = offset;
2889 return len; 2889 return len;
2890 } 2890 }
2891 2891
2892 static struct rdev_sysfs_entry rdev_offset = 2892 static struct rdev_sysfs_entry rdev_offset =
2893 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2893 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2894 2894
2895 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2895 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2896 { 2896 {
2897 return sprintf(page, "%llu\n", 2897 return sprintf(page, "%llu\n",
2898 (unsigned long long)rdev->new_data_offset); 2898 (unsigned long long)rdev->new_data_offset);
2899 } 2899 }
2900 2900
2901 static ssize_t new_offset_store(struct md_rdev *rdev, 2901 static ssize_t new_offset_store(struct md_rdev *rdev,
2902 const char *buf, size_t len) 2902 const char *buf, size_t len)
2903 { 2903 {
2904 unsigned long long new_offset; 2904 unsigned long long new_offset;
2905 struct mddev *mddev = rdev->mddev; 2905 struct mddev *mddev = rdev->mddev;
2906 2906
2907 if (strict_strtoull(buf, 10, &new_offset) < 0) 2907 if (strict_strtoull(buf, 10, &new_offset) < 0)
2908 return -EINVAL; 2908 return -EINVAL;
2909 2909
2910 if (mddev->sync_thread) 2910 if (mddev->sync_thread)
2911 return -EBUSY; 2911 return -EBUSY;
2912 if (new_offset == rdev->data_offset) 2912 if (new_offset == rdev->data_offset)
2913 /* reset is always permitted */ 2913 /* reset is always permitted */
2914 ; 2914 ;
2915 else if (new_offset > rdev->data_offset) { 2915 else if (new_offset > rdev->data_offset) {
2916 /* must not push array size beyond rdev_sectors */ 2916 /* must not push array size beyond rdev_sectors */
2917 if (new_offset - rdev->data_offset 2917 if (new_offset - rdev->data_offset
2918 + mddev->dev_sectors > rdev->sectors) 2918 + mddev->dev_sectors > rdev->sectors)
2919 return -E2BIG; 2919 return -E2BIG;
2920 } 2920 }
2921 /* Metadata worries about other space details. */ 2921 /* Metadata worries about other space details. */
2922 2922
2923 /* decreasing the offset is inconsistent with a backwards 2923 /* decreasing the offset is inconsistent with a backwards
2924 * reshape. 2924 * reshape.
2925 */ 2925 */
2926 if (new_offset < rdev->data_offset && 2926 if (new_offset < rdev->data_offset &&
2927 mddev->reshape_backwards) 2927 mddev->reshape_backwards)
2928 return -EINVAL; 2928 return -EINVAL;
2929 /* Increasing offset is inconsistent with forwards 2929 /* Increasing offset is inconsistent with forwards
2930 * reshape. reshape_direction should be set to 2930 * reshape. reshape_direction should be set to
2931 * 'backwards' first. 2931 * 'backwards' first.
2932 */ 2932 */
2933 if (new_offset > rdev->data_offset && 2933 if (new_offset > rdev->data_offset &&
2934 !mddev->reshape_backwards) 2934 !mddev->reshape_backwards)
2935 return -EINVAL; 2935 return -EINVAL;
2936 2936
2937 if (mddev->pers && mddev->persistent && 2937 if (mddev->pers && mddev->persistent &&
2938 !super_types[mddev->major_version] 2938 !super_types[mddev->major_version]
2939 .allow_new_offset(rdev, new_offset)) 2939 .allow_new_offset(rdev, new_offset))
2940 return -E2BIG; 2940 return -E2BIG;
2941 rdev->new_data_offset = new_offset; 2941 rdev->new_data_offset = new_offset;
2942 if (new_offset > rdev->data_offset) 2942 if (new_offset > rdev->data_offset)
2943 mddev->reshape_backwards = 1; 2943 mddev->reshape_backwards = 1;
2944 else if (new_offset < rdev->data_offset) 2944 else if (new_offset < rdev->data_offset)
2945 mddev->reshape_backwards = 0; 2945 mddev->reshape_backwards = 0;
2946 2946
2947 return len; 2947 return len;
2948 } 2948 }
2949 static struct rdev_sysfs_entry rdev_new_offset = 2949 static struct rdev_sysfs_entry rdev_new_offset =
2950 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 2950 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2951 2951
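Editor's note: new_offset_store() only accepts an offset that is consistent with the reshape direction it implies: a larger new_offset must not push the in-use area past rdev->sectors and forces mddev->reshape_backwards to 1, while a smaller one forces it to 0. The standalone sketch below models just those two checks (it omits the comparison against an already-recorded direction and the superblock's allow_new_offset() hook); it is a simplification, not the kernel code itself.

/* Simplified userspace model of two checks in new_offset_store(): the new
 * offset must fit within rdev->sectors, and its direction is recorded. */
#include <stdio.h>

typedef unsigned long long sector_t;

static int check_new_offset(sector_t data_offset, sector_t new_offset,
			    sector_t dev_sectors, sector_t rdev_sectors,
			    int *reshape_backwards)
{
	if (new_offset > data_offset &&
	    new_offset - data_offset + dev_sectors > rdev_sectors)
		return -1;			/* -E2BIG in the kernel */
	if (new_offset > data_offset)
		*reshape_backwards = 1;		/* data moves to higher device offsets */
	else if (new_offset < data_offset)
		*reshape_backwards = 0;
	return 0;
}

int main(void)
{
	int backwards = 0;

	/* grow the offset by 2048 sectors; the component still fits, so accept */
	if (check_new_offset(2048, 4096, 1000000, 1002048, &backwards) == 0)
		printf("accepted, reshape_backwards=%d\n", backwards);
	return 0;
}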
2952 static ssize_t 2952 static ssize_t
2953 rdev_size_show(struct md_rdev *rdev, char *page) 2953 rdev_size_show(struct md_rdev *rdev, char *page)
2954 { 2954 {
2955 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2955 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2956 } 2956 }
2957 2957
2958 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2958 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2959 { 2959 {
2960 /* check if two start/length pairs overlap */ 2960 /* check if two start/length pairs overlap */
2961 if (s1+l1 <= s2) 2961 if (s1+l1 <= s2)
2962 return 0; 2962 return 0;
2963 if (s2+l2 <= s1) 2963 if (s2+l2 <= s1)
2964 return 0; 2964 return 0;
2965 return 1; 2965 return 1;
2966 } 2966 }
2967 2967
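Editor's note: overlaps() above is the usual half-open interval intersection test: the ranges [s1, s1+l1) and [s2, s2+l2) are disjoint exactly when one ends at or before the other starts. A tiny standalone check of the cases rdev_size_store() cares about, for illustration only:

#include <assert.h>

typedef unsigned long long sector_t;

/* Same predicate as overlaps() above: half-open ranges [s, s+l) intersect
 * unless one ends before the other begins. */
static int ranges_overlap(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
{
	return !(s1 + l1 <= s2 || s2 + l2 <= s1);
}

int main(void)
{
	assert(ranges_overlap(0, 100, 50, 100));	/* partial overlap */
	assert(!ranges_overlap(0, 100, 100, 50));	/* touching ends do not overlap */
	assert(!ranges_overlap(200, 50, 0, 100));	/* fully disjoint */
	return 0;
}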
2968 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2968 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2969 { 2969 {
2970 unsigned long long blocks; 2970 unsigned long long blocks;
2971 sector_t new; 2971 sector_t new;
2972 2972
2973 if (strict_strtoull(buf, 10, &blocks) < 0) 2973 if (strict_strtoull(buf, 10, &blocks) < 0)
2974 return -EINVAL; 2974 return -EINVAL;
2975 2975
2976 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2976 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2977 return -EINVAL; /* sector conversion overflow */ 2977 return -EINVAL; /* sector conversion overflow */
2978 2978
2979 new = blocks * 2; 2979 new = blocks * 2;
2980 if (new != blocks * 2) 2980 if (new != blocks * 2)
2981 return -EINVAL; /* unsigned long long to sector_t overflow */ 2981 return -EINVAL; /* unsigned long long to sector_t overflow */
2982 2982
2983 *sectors = new; 2983 *sectors = new;
2984 return 0; 2984 return 0;
2985 } 2985 }
2986 2986
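Editor's note: strict_blocks_to_sectors() converts the 1 KiB "blocks" unit used by these sysfs files into 512-byte sectors. It rejects values whose top bit is set (doubling them would overflow unsigned long long) and values that no longer fit in sector_t after the multiply, which matters when sector_t is 32 bits. A short worked illustration, as a sketch:

/* Userspace illustration of the conversion rule: one 1 KiB block is two
 * 512-byte sectors, and a value with the top bit set cannot be doubled. */
#include <stdio.h>

int main(void)
{
	unsigned long long blocks = 1048576ULL;		/* 1 GiB worth of 1 KiB blocks */
	unsigned long long top = 1ULL << 63;

	printf("%llu blocks -> %llu sectors\n", blocks, blocks * 2);
	printf("0x%llx has the top bit set and would overflow when doubled\n", top);
	return 0;
}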
2987 static ssize_t 2987 static ssize_t
2988 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2988 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2989 { 2989 {
2990 struct mddev *my_mddev = rdev->mddev; 2990 struct mddev *my_mddev = rdev->mddev;
2991 sector_t oldsectors = rdev->sectors; 2991 sector_t oldsectors = rdev->sectors;
2992 sector_t sectors; 2992 sector_t sectors;
2993 2993
2994 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2994 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2995 return -EINVAL; 2995 return -EINVAL;
2996 if (rdev->data_offset != rdev->new_data_offset) 2996 if (rdev->data_offset != rdev->new_data_offset)
2997 return -EINVAL; /* too confusing */ 2997 return -EINVAL; /* too confusing */
2998 if (my_mddev->pers && rdev->raid_disk >= 0) { 2998 if (my_mddev->pers && rdev->raid_disk >= 0) {
2999 if (my_mddev->persistent) { 2999 if (my_mddev->persistent) {
3000 sectors = super_types[my_mddev->major_version]. 3000 sectors = super_types[my_mddev->major_version].
3001 rdev_size_change(rdev, sectors); 3001 rdev_size_change(rdev, sectors);
3002 if (!sectors) 3002 if (!sectors)
3003 return -EBUSY; 3003 return -EBUSY;
3004 } else if (!sectors) 3004 } else if (!sectors)
3005 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 3005 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
3006 rdev->data_offset; 3006 rdev->data_offset;
3007 if (!my_mddev->pers->resize) 3007 if (!my_mddev->pers->resize)
3008 /* Cannot change size for RAID0 or Linear etc */ 3008 /* Cannot change size for RAID0 or Linear etc */
3009 return -EINVAL; 3009 return -EINVAL;
3010 } 3010 }
3011 if (sectors < my_mddev->dev_sectors) 3011 if (sectors < my_mddev->dev_sectors)
3012 return -EINVAL; /* component must fit device */ 3012 return -EINVAL; /* component must fit device */
3013 3013
3014 rdev->sectors = sectors; 3014 rdev->sectors = sectors;
3015 if (sectors > oldsectors && my_mddev->external) { 3015 if (sectors > oldsectors && my_mddev->external) {
3016 /* need to check that all other rdevs with the same ->bdev 3016 /* need to check that all other rdevs with the same ->bdev
3017 * do not overlap. We need to unlock the mddev to avoid 3017 * do not overlap. We need to unlock the mddev to avoid
3018 * a deadlock. We have already changed rdev->sectors, and if 3018 * a deadlock. We have already changed rdev->sectors, and if
3019 * we have to change it back, we will have the lock again. 3019 * we have to change it back, we will have the lock again.
3020 */ 3020 */
3021 struct mddev *mddev; 3021 struct mddev *mddev;
3022 int overlap = 0; 3022 int overlap = 0;
3023 struct list_head *tmp; 3023 struct list_head *tmp;
3024 3024
3025 mddev_unlock(my_mddev); 3025 mddev_unlock(my_mddev);
3026 for_each_mddev(mddev, tmp) { 3026 for_each_mddev(mddev, tmp) {
3027 struct md_rdev *rdev2; 3027 struct md_rdev *rdev2;
3028 3028
3029 mddev_lock(mddev); 3029 mddev_lock(mddev);
3030 rdev_for_each(rdev2, mddev) 3030 rdev_for_each(rdev2, mddev)
3031 if (rdev->bdev == rdev2->bdev && 3031 if (rdev->bdev == rdev2->bdev &&
3032 rdev != rdev2 && 3032 rdev != rdev2 &&
3033 overlaps(rdev->data_offset, rdev->sectors, 3033 overlaps(rdev->data_offset, rdev->sectors,
3034 rdev2->data_offset, 3034 rdev2->data_offset,
3035 rdev2->sectors)) { 3035 rdev2->sectors)) {
3036 overlap = 1; 3036 overlap = 1;
3037 break; 3037 break;
3038 } 3038 }
3039 mddev_unlock(mddev); 3039 mddev_unlock(mddev);
3040 if (overlap) { 3040 if (overlap) {
3041 mddev_put(mddev); 3041 mddev_put(mddev);
3042 break; 3042 break;
3043 } 3043 }
3044 } 3044 }
3045 mddev_lock(my_mddev); 3045 mddev_lock(my_mddev);
3046 if (overlap) { 3046 if (overlap) {
3047 /* Someone else could have slipped in a size 3047 /* Someone else could have slipped in a size
3048 * change here, but doing so is just silly. 3048 * change here, but doing so is just silly.
3049 * We put oldsectors back because we *know* it is 3049 * We put oldsectors back because we *know* it is
3050 * safe, and trust userspace not to race with 3050 * safe, and trust userspace not to race with
3051 * itself 3051 * itself
3052 */ 3052 */
3053 rdev->sectors = oldsectors; 3053 rdev->sectors = oldsectors;
3054 return -EBUSY; 3054 return -EBUSY;
3055 } 3055 }
3056 } 3056 }
3057 return len; 3057 return len;
3058 } 3058 }
3059 3059
3060 static struct rdev_sysfs_entry rdev_size = 3060 static struct rdev_sysfs_entry rdev_size =
3061 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3061 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3062 3062
3063 3063
3064 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3064 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3065 { 3065 {
3066 unsigned long long recovery_start = rdev->recovery_offset; 3066 unsigned long long recovery_start = rdev->recovery_offset;
3067 3067
3068 if (test_bit(In_sync, &rdev->flags) || 3068 if (test_bit(In_sync, &rdev->flags) ||
3069 recovery_start == MaxSector) 3069 recovery_start == MaxSector)
3070 return sprintf(page, "none\n"); 3070 return sprintf(page, "none\n");
3071 3071
3072 return sprintf(page, "%llu\n", recovery_start); 3072 return sprintf(page, "%llu\n", recovery_start);
3073 } 3073 }
3074 3074
3075 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3075 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3076 { 3076 {
3077 unsigned long long recovery_start; 3077 unsigned long long recovery_start;
3078 3078
3079 if (cmd_match(buf, "none")) 3079 if (cmd_match(buf, "none"))
3080 recovery_start = MaxSector; 3080 recovery_start = MaxSector;
3081 else if (strict_strtoull(buf, 10, &recovery_start)) 3081 else if (strict_strtoull(buf, 10, &recovery_start))
3082 return -EINVAL; 3082 return -EINVAL;
3083 3083
3084 if (rdev->mddev->pers && 3084 if (rdev->mddev->pers &&
3085 rdev->raid_disk >= 0) 3085 rdev->raid_disk >= 0)
3086 return -EBUSY; 3086 return -EBUSY;
3087 3087
3088 rdev->recovery_offset = recovery_start; 3088 rdev->recovery_offset = recovery_start;
3089 if (recovery_start == MaxSector) 3089 if (recovery_start == MaxSector)
3090 set_bit(In_sync, &rdev->flags); 3090 set_bit(In_sync, &rdev->flags);
3091 else 3091 else
3092 clear_bit(In_sync, &rdev->flags); 3092 clear_bit(In_sync, &rdev->flags);
3093 return len; 3093 return len;
3094 } 3094 }
3095 3095
3096 static struct rdev_sysfs_entry rdev_recovery_start = 3096 static struct rdev_sysfs_entry rdev_recovery_start =
3097 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3097 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3098 3098
3099 3099
3100 static ssize_t 3100 static ssize_t
3101 badblocks_show(struct badblocks *bb, char *page, int unack); 3101 badblocks_show(struct badblocks *bb, char *page, int unack);
3102 static ssize_t 3102 static ssize_t
3103 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 3103 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3104 3104
3105 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3105 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3106 { 3106 {
3107 return badblocks_show(&rdev->badblocks, page, 0); 3107 return badblocks_show(&rdev->badblocks, page, 0);
3108 } 3108 }
3109 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3109 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3110 { 3110 {
3111 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3111 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3112 /* Maybe that ack was all we needed */ 3112 /* Maybe that ack was all we needed */
3113 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3113 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3114 wake_up(&rdev->blocked_wait); 3114 wake_up(&rdev->blocked_wait);
3115 return rv; 3115 return rv;
3116 } 3116 }
3117 static struct rdev_sysfs_entry rdev_bad_blocks = 3117 static struct rdev_sysfs_entry rdev_bad_blocks =
3118 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3118 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3119 3119
3120 3120
3121 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3121 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3122 { 3122 {
3123 return badblocks_show(&rdev->badblocks, page, 1); 3123 return badblocks_show(&rdev->badblocks, page, 1);
3124 } 3124 }
3125 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3125 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3126 { 3126 {
3127 return badblocks_store(&rdev->badblocks, page, len, 1); 3127 return badblocks_store(&rdev->badblocks, page, len, 1);
3128 } 3128 }
3129 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3129 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3130 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3130 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3131 3131
3132 static struct attribute *rdev_default_attrs[] = { 3132 static struct attribute *rdev_default_attrs[] = {
3133 &rdev_state.attr, 3133 &rdev_state.attr,
3134 &rdev_errors.attr, 3134 &rdev_errors.attr,
3135 &rdev_slot.attr, 3135 &rdev_slot.attr,
3136 &rdev_offset.attr, 3136 &rdev_offset.attr,
3137 &rdev_new_offset.attr, 3137 &rdev_new_offset.attr,
3138 &rdev_size.attr, 3138 &rdev_size.attr,
3139 &rdev_recovery_start.attr, 3139 &rdev_recovery_start.attr,
3140 &rdev_bad_blocks.attr, 3140 &rdev_bad_blocks.attr,
3141 &rdev_unack_bad_blocks.attr, 3141 &rdev_unack_bad_blocks.attr,
3142 NULL, 3142 NULL,
3143 }; 3143 };
3144 static ssize_t 3144 static ssize_t
3145 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3145 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3146 { 3146 {
3147 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3147 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3148 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3148 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3149 struct mddev *mddev = rdev->mddev; 3149 struct mddev *mddev = rdev->mddev;
3150 ssize_t rv; 3150 ssize_t rv;
3151 3151
3152 if (!entry->show) 3152 if (!entry->show)
3153 return -EIO; 3153 return -EIO;
3154 3154
3155 rv = mddev ? mddev_lock(mddev) : -EBUSY; 3155 rv = mddev ? mddev_lock(mddev) : -EBUSY;
3156 if (!rv) { 3156 if (!rv) {
3157 if (rdev->mddev == NULL) 3157 if (rdev->mddev == NULL)
3158 rv = -EBUSY; 3158 rv = -EBUSY;
3159 else 3159 else
3160 rv = entry->show(rdev, page); 3160 rv = entry->show(rdev, page);
3161 mddev_unlock(mddev); 3161 mddev_unlock(mddev);
3162 } 3162 }
3163 return rv; 3163 return rv;
3164 } 3164 }
3165 3165
3166 static ssize_t 3166 static ssize_t
3167 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3167 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3168 const char *page, size_t length) 3168 const char *page, size_t length)
3169 { 3169 {
3170 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3170 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3171 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3171 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3172 ssize_t rv; 3172 ssize_t rv;
3173 struct mddev *mddev = rdev->mddev; 3173 struct mddev *mddev = rdev->mddev;
3174 3174
3175 if (!entry->store) 3175 if (!entry->store)
3176 return -EIO; 3176 return -EIO;
3177 if (!capable(CAP_SYS_ADMIN)) 3177 if (!capable(CAP_SYS_ADMIN))
3178 return -EACCES; 3178 return -EACCES;
3179 rv = mddev ? mddev_lock(mddev): -EBUSY; 3179 rv = mddev ? mddev_lock(mddev): -EBUSY;
3180 if (!rv) { 3180 if (!rv) {
3181 if (rdev->mddev == NULL) 3181 if (rdev->mddev == NULL)
3182 rv = -EBUSY; 3182 rv = -EBUSY;
3183 else 3183 else
3184 rv = entry->store(rdev, page, length); 3184 rv = entry->store(rdev, page, length);
3185 mddev_unlock(mddev); 3185 mddev_unlock(mddev);
3186 } 3186 }
3187 return rv; 3187 return rv;
3188 } 3188 }
3189 3189
3190 static void rdev_free(struct kobject *ko) 3190 static void rdev_free(struct kobject *ko)
3191 { 3191 {
3192 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3192 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3193 kfree(rdev); 3193 kfree(rdev);
3194 } 3194 }
3195 static const struct sysfs_ops rdev_sysfs_ops = { 3195 static const struct sysfs_ops rdev_sysfs_ops = {
3196 .show = rdev_attr_show, 3196 .show = rdev_attr_show,
3197 .store = rdev_attr_store, 3197 .store = rdev_attr_store,
3198 }; 3198 };
3199 static struct kobj_type rdev_ktype = { 3199 static struct kobj_type rdev_ktype = {
3200 .release = rdev_free, 3200 .release = rdev_free,
3201 .sysfs_ops = &rdev_sysfs_ops, 3201 .sysfs_ops = &rdev_sysfs_ops,
3202 .default_attrs = rdev_default_attrs, 3202 .default_attrs = rdev_default_attrs,
3203 }; 3203 };
3204 3204
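Editor's note: rdev_attr_show() and rdev_attr_store() recover the md_rdev and the specific rdev_sysfs_entry from the generic kobject and attribute pointers with container_of(), then dispatch to that entry's show/store callback under the mddev lock. The self-contained sketch below illustrates the same container_of dispatch pattern in plain userspace C; the structure names are invented for the example and are not the kernel's.

/* Minimal userspace model of the dispatch done by rdev_attr_show(): embed a
 * generic handle in a larger object, recover the object with container_of(),
 * and call a per-attribute callback. */
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct attribute { const char *name; };

struct rdev_like {
	int errors;
	struct attribute kattr;		/* embedded generic handle */
};

struct entry {
	struct attribute attr;
	int (*show)(struct rdev_like *r, char *page);
};

static int errors_show_example(struct rdev_like *r, char *page)
{
	return sprintf(page, "%d\n", r->errors);
}

int main(void)
{
	struct rdev_like rdev = { .errors = 7, .kattr = { "errors" } };
	struct entry e = { .attr = { "errors" }, .show = errors_show_example };
	char page[32];

	/* given only &rdev.kattr, recover the containing rdev_like */
	struct rdev_like *r = container_of(&rdev.kattr, struct rdev_like, kattr);

	e.show(r, page);
	printf("%s = %s", e.attr.name, page);	/* prints: errors = 7 */
	return 0;
}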
3205 int md_rdev_init(struct md_rdev *rdev) 3205 int md_rdev_init(struct md_rdev *rdev)
3206 { 3206 {
3207 rdev->desc_nr = -1; 3207 rdev->desc_nr = -1;
3208 rdev->saved_raid_disk = -1; 3208 rdev->saved_raid_disk = -1;
3209 rdev->raid_disk = -1; 3209 rdev->raid_disk = -1;
3210 rdev->flags = 0; 3210 rdev->flags = 0;
3211 rdev->data_offset = 0; 3211 rdev->data_offset = 0;
3212 rdev->new_data_offset = 0; 3212 rdev->new_data_offset = 0;
3213 rdev->sb_events = 0; 3213 rdev->sb_events = 0;
3214 rdev->last_read_error.tv_sec = 0; 3214 rdev->last_read_error.tv_sec = 0;
3215 rdev->last_read_error.tv_nsec = 0; 3215 rdev->last_read_error.tv_nsec = 0;
3216 rdev->sb_loaded = 0; 3216 rdev->sb_loaded = 0;
3217 rdev->bb_page = NULL; 3217 rdev->bb_page = NULL;
3218 atomic_set(&rdev->nr_pending, 0); 3218 atomic_set(&rdev->nr_pending, 0);
3219 atomic_set(&rdev->read_errors, 0); 3219 atomic_set(&rdev->read_errors, 0);
3220 atomic_set(&rdev->corrected_errors, 0); 3220 atomic_set(&rdev->corrected_errors, 0);
3221 3221
3222 INIT_LIST_HEAD(&rdev->same_set); 3222 INIT_LIST_HEAD(&rdev->same_set);
3223 init_waitqueue_head(&rdev->blocked_wait); 3223 init_waitqueue_head(&rdev->blocked_wait);
3224 3224
3225 /* Add space to store bad block list. 3225 /* Add space to store bad block list.
3226 * This reserves the space even on arrays where it cannot 3226 * This reserves the space even on arrays where it cannot
3227 * be used - I wonder if that matters 3227 * be used - I wonder if that matters
3228 */ 3228 */
3229 rdev->badblocks.count = 0; 3229 rdev->badblocks.count = 0;
3230 rdev->badblocks.shift = 0; 3230 rdev->badblocks.shift = 0;
3231 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); 3231 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3232 seqlock_init(&rdev->badblocks.lock); 3232 seqlock_init(&rdev->badblocks.lock);
3233 if (rdev->badblocks.page == NULL) 3233 if (rdev->badblocks.page == NULL)
3234 return -ENOMEM; 3234 return -ENOMEM;
3235 3235
3236 return 0; 3236 return 0;
3237 } 3237 }
3238 EXPORT_SYMBOL_GPL(md_rdev_init); 3238 EXPORT_SYMBOL_GPL(md_rdev_init);
3239 /* 3239 /*
3240 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3240 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3241 * 3241 *
3242 * mark the device faulty if: 3242 * mark the device faulty if:
3243 * 3243 *
3244 * - the device is nonexistent (zero size) 3244 * - the device is nonexistent (zero size)
3245 * - the device has no valid superblock 3245 * - the device has no valid superblock
3246 * 3246 *
3247 * a faulty rdev _never_ has rdev->sb set. 3247 * a faulty rdev _never_ has rdev->sb set.
3248 */ 3248 */
3249 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3249 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3250 { 3250 {
3251 char b[BDEVNAME_SIZE]; 3251 char b[BDEVNAME_SIZE];
3252 int err; 3252 int err;
3253 struct md_rdev *rdev; 3253 struct md_rdev *rdev;
3254 sector_t size; 3254 sector_t size;
3255 3255
3256 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3256 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3257 if (!rdev) { 3257 if (!rdev) {
3258 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3258 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3259 return ERR_PTR(-ENOMEM); 3259 return ERR_PTR(-ENOMEM);
3260 } 3260 }
3261 3261
3262 err = md_rdev_init(rdev); 3262 err = md_rdev_init(rdev);
3263 if (err) 3263 if (err)
3264 goto abort_free; 3264 goto abort_free;
3265 err = alloc_disk_sb(rdev); 3265 err = alloc_disk_sb(rdev);
3266 if (err) 3266 if (err)
3267 goto abort_free; 3267 goto abort_free;
3268 3268
3269 err = lock_rdev(rdev, newdev, super_format == -2); 3269 err = lock_rdev(rdev, newdev, super_format == -2);
3270 if (err) 3270 if (err)
3271 goto abort_free; 3271 goto abort_free;
3272 3272
3273 kobject_init(&rdev->kobj, &rdev_ktype); 3273 kobject_init(&rdev->kobj, &rdev_ktype);
3274 3274
3275 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3275 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3276 if (!size) { 3276 if (!size) {
3277 printk(KERN_WARNING 3277 printk(KERN_WARNING
3278 "md: %s has zero or unknown size, marking faulty!\n", 3278 "md: %s has zero or unknown size, marking faulty!\n",
3279 bdevname(rdev->bdev,b)); 3279 bdevname(rdev->bdev,b));
3280 err = -EINVAL; 3280 err = -EINVAL;
3281 goto abort_free; 3281 goto abort_free;
3282 } 3282 }
3283 3283
3284 if (super_format >= 0) { 3284 if (super_format >= 0) {
3285 err = super_types[super_format]. 3285 err = super_types[super_format].
3286 load_super(rdev, NULL, super_minor); 3286 load_super(rdev, NULL, super_minor);
3287 if (err == -EINVAL) { 3287 if (err == -EINVAL) {
3288 printk(KERN_WARNING 3288 printk(KERN_WARNING
3289 "md: %s does not have a valid v%d.%d " 3289 "md: %s does not have a valid v%d.%d "
3290 "superblock, not importing!\n", 3290 "superblock, not importing!\n",
3291 bdevname(rdev->bdev,b), 3291 bdevname(rdev->bdev,b),
3292 super_format, super_minor); 3292 super_format, super_minor);
3293 goto abort_free; 3293 goto abort_free;
3294 } 3294 }
3295 if (err < 0) { 3295 if (err < 0) {
3296 printk(KERN_WARNING 3296 printk(KERN_WARNING
3297 "md: could not read %s's sb, not importing!\n", 3297 "md: could not read %s's sb, not importing!\n",
3298 bdevname(rdev->bdev,b)); 3298 bdevname(rdev->bdev,b));
3299 goto abort_free; 3299 goto abort_free;
3300 } 3300 }
3301 } 3301 }
3302 if (super_format == -1) 3302 if (super_format == -1)
3303 /* hot-add for 0.90, or non-persistent: so no badblocks */ 3303 /* hot-add for 0.90, or non-persistent: so no badblocks */
3304 rdev->badblocks.shift = -1; 3304 rdev->badblocks.shift = -1;
3305 3305
3306 return rdev; 3306 return rdev;
3307 3307
3308 abort_free: 3308 abort_free:
3309 if (rdev->bdev) 3309 if (rdev->bdev)
3310 unlock_rdev(rdev); 3310 unlock_rdev(rdev);
3311 md_rdev_clear(rdev); 3311 md_rdev_clear(rdev);
3312 kfree(rdev); 3312 kfree(rdev);
3313 return ERR_PTR(err); 3313 return ERR_PTR(err);
3314 } 3314 }
3315 3315
3316 /* 3316 /*
3317 * Check a full RAID array for plausibility 3317 * Check a full RAID array for plausibility
3318 */ 3318 */
3319 3319
3320 3320
3321 static void analyze_sbs(struct mddev * mddev) 3321 static void analyze_sbs(struct mddev * mddev)
3322 { 3322 {
3323 int i; 3323 int i;
3324 struct md_rdev *rdev, *freshest, *tmp; 3324 struct md_rdev *rdev, *freshest, *tmp;
3325 char b[BDEVNAME_SIZE]; 3325 char b[BDEVNAME_SIZE];
3326 3326
3327 freshest = NULL; 3327 freshest = NULL;
3328 rdev_for_each_safe(rdev, tmp, mddev) 3328 rdev_for_each_safe(rdev, tmp, mddev)
3329 switch (super_types[mddev->major_version]. 3329 switch (super_types[mddev->major_version].
3330 load_super(rdev, freshest, mddev->minor_version)) { 3330 load_super(rdev, freshest, mddev->minor_version)) {
3331 case 1: 3331 case 1:
3332 freshest = rdev; 3332 freshest = rdev;
3333 break; 3333 break;
3334 case 0: 3334 case 0:
3335 break; 3335 break;
3336 default: 3336 default:
3337 printk( KERN_ERR \ 3337 printk( KERN_ERR \
3338 "md: fatal superblock inconsistency in %s" 3338 "md: fatal superblock inconsistency in %s"
3339 " -- removing from array\n", 3339 " -- removing from array\n",
3340 bdevname(rdev->bdev,b)); 3340 bdevname(rdev->bdev,b));
3341 kick_rdev_from_array(rdev); 3341 kick_rdev_from_array(rdev);
3342 } 3342 }
3343 3343
3344 3344
3345 super_types[mddev->major_version]. 3345 super_types[mddev->major_version].
3346 validate_super(mddev, freshest); 3346 validate_super(mddev, freshest);
3347 3347
3348 i = 0; 3348 i = 0;
3349 rdev_for_each_safe(rdev, tmp, mddev) { 3349 rdev_for_each_safe(rdev, tmp, mddev) {
3350 if (mddev->max_disks && 3350 if (mddev->max_disks &&
3351 (rdev->desc_nr >= mddev->max_disks || 3351 (rdev->desc_nr >= mddev->max_disks ||
3352 i > mddev->max_disks)) { 3352 i > mddev->max_disks)) {
3353 printk(KERN_WARNING 3353 printk(KERN_WARNING
3354 "md: %s: %s: only %d devices permitted\n", 3354 "md: %s: %s: only %d devices permitted\n",
3355 mdname(mddev), bdevname(rdev->bdev, b), 3355 mdname(mddev), bdevname(rdev->bdev, b),
3356 mddev->max_disks); 3356 mddev->max_disks);
3357 kick_rdev_from_array(rdev); 3357 kick_rdev_from_array(rdev);
3358 continue; 3358 continue;
3359 } 3359 }
3360 if (rdev != freshest) 3360 if (rdev != freshest)
3361 if (super_types[mddev->major_version]. 3361 if (super_types[mddev->major_version].
3362 validate_super(mddev, rdev)) { 3362 validate_super(mddev, rdev)) {
3363 printk(KERN_WARNING "md: kicking non-fresh %s" 3363 printk(KERN_WARNING "md: kicking non-fresh %s"
3364 " from array!\n", 3364 " from array!\n",
3365 bdevname(rdev->bdev,b)); 3365 bdevname(rdev->bdev,b));
3366 kick_rdev_from_array(rdev); 3366 kick_rdev_from_array(rdev);
3367 continue; 3367 continue;
3368 } 3368 }
3369 if (mddev->level == LEVEL_MULTIPATH) { 3369 if (mddev->level == LEVEL_MULTIPATH) {
3370 rdev->desc_nr = i++; 3370 rdev->desc_nr = i++;
3371 rdev->raid_disk = rdev->desc_nr; 3371 rdev->raid_disk = rdev->desc_nr;
3372 set_bit(In_sync, &rdev->flags); 3372 set_bit(In_sync, &rdev->flags);
3373 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { 3373 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3374 rdev->raid_disk = -1; 3374 rdev->raid_disk = -1;
3375 clear_bit(In_sync, &rdev->flags); 3375 clear_bit(In_sync, &rdev->flags);
3376 } 3376 }
3377 } 3377 }
3378 } 3378 }
3379 3379
3380 /* Read a fixed-point number. 3380 /* Read a fixed-point number.
3381 * Numbers in sysfs attributes should be in "standard" units where 3381 * Numbers in sysfs attributes should be in "standard" units where
3382 * possible, so time should be in seconds. 3382 * possible, so time should be in seconds.
3383 * However we internally use a much smaller unit such as 3383 * However we internally use a much smaller unit such as
3384 * milliseconds or jiffies. 3384 * milliseconds or jiffies.
3385 * This function takes a decimal number with a possible fractional 3385 * This function takes a decimal number with a possible fractional
3386 * component, and produces an integer which is the result of 3386 * component, and produces an integer which is the result of
3387 * multiplying that number by 10^'scale', 3387 * multiplying that number by 10^'scale',
3388 * all without any floating-point arithmetic. 3388 * all without any floating-point arithmetic.
3389 */ 3389 */
3390 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3390 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3391 { 3391 {
3392 unsigned long result = 0; 3392 unsigned long result = 0;
3393 long decimals = -1; 3393 long decimals = -1;
3394 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3394 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3395 if (*cp == '.') 3395 if (*cp == '.')
3396 decimals = 0; 3396 decimals = 0;
3397 else if (decimals < scale) { 3397 else if (decimals < scale) {
3398 unsigned int value; 3398 unsigned int value;
3399 value = *cp - '0'; 3399 value = *cp - '0';
3400 result = result * 10 + value; 3400 result = result * 10 + value;
3401 if (decimals >= 0) 3401 if (decimals >= 0)
3402 decimals++; 3402 decimals++;
3403 } 3403 }
3404 cp++; 3404 cp++;
3405 } 3405 }
3406 if (*cp == '\n') 3406 if (*cp == '\n')
3407 cp++; 3407 cp++;
3408 if (*cp) 3408 if (*cp)
3409 return -EINVAL; 3409 return -EINVAL;
3410 if (decimals < 0) 3410 if (decimals < 0)
3411 decimals = 0; 3411 decimals = 0;
3412 while (decimals < scale) { 3412 while (decimals < scale) {
3413 result *= 10; 3413 result *= 10;
3414 decimals ++; 3414 decimals ++;
3415 } 3415 }
3416 *res = result; 3416 *res = result;
3417 return 0; 3417 return 0;
3418 } 3418 }
3419 3419
3420 3420
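Editor's note: strict_strtoul_scaled() parses a decimal string with an optional fractional part and returns the value multiplied by 10^scale, entirely in integer arithmetic: "0.253" with scale 3 becomes 253, "2" becomes 2000, "1.5" becomes 1500. The sketch below re-derives those results with the same digit-accumulation loop; it drops the kernel function's rejection of stray characters, so it is an illustration rather than a drop-in copy.

/* Sketch of what strict_strtoul_scaled() computes for scale == 3: digits are
 * accumulated as an integer and the result is padded with zeros until
 * 'scale' fractional digits have been accounted for. Error handling omitted. */
#include <stdio.h>

static unsigned long scaled(const char *s, int scale)
{
	unsigned long result = 0;
	long decimals = -1;

	for (; *s && *s != '\n'; s++) {
		if (*s == '.') {
			decimals = 0;
		} else if (decimals < scale) {
			result = result * 10 + (*s - '0');
			if (decimals >= 0)
				decimals++;
		}
	}
	if (decimals < 0)
		decimals = 0;
	while (decimals++ < scale)
		result *= 10;
	return result;
}

int main(void)
{
	printf("%lu %lu %lu\n", scaled("0.253", 3), scaled("2", 3), scaled("1.5", 3));
	/* prints: 253 2000 1500 */
	return 0;
}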
3421 static void md_safemode_timeout(unsigned long data); 3421 static void md_safemode_timeout(unsigned long data);
3422 3422
3423 static ssize_t 3423 static ssize_t
3424 safe_delay_show(struct mddev *mddev, char *page) 3424 safe_delay_show(struct mddev *mddev, char *page)
3425 { 3425 {
3426 int msec = (mddev->safemode_delay*1000)/HZ; 3426 int msec = (mddev->safemode_delay*1000)/HZ;
3427 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3427 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3428 } 3428 }
3429 static ssize_t 3429 static ssize_t
3430 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3430 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3431 { 3431 {
3432 unsigned long msec; 3432 unsigned long msec;
3433 3433
3434 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3434 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3435 return -EINVAL; 3435 return -EINVAL;
3436 if (msec == 0) 3436 if (msec == 0)
3437 mddev->safemode_delay = 0; 3437 mddev->safemode_delay = 0;
3438 else { 3438 else {
3439 unsigned long old_delay = mddev->safemode_delay; 3439 unsigned long old_delay = mddev->safemode_delay;
3440 mddev->safemode_delay = (msec*HZ)/1000; 3440 mddev->safemode_delay = (msec*HZ)/1000;
3441 if (mddev->safemode_delay == 0) 3441 if (mddev->safemode_delay == 0)
3442 mddev->safemode_delay = 1; 3442 mddev->safemode_delay = 1;
3443 if (mddev->safemode_delay < old_delay) 3443 if (mddev->safemode_delay < old_delay)
3444 md_safemode_timeout((unsigned long)mddev); 3444 md_safemode_timeout((unsigned long)mddev);
3445 } 3445 }
3446 return len; 3446 return len;
3447 } 3447 }
3448 static struct md_sysfs_entry md_safe_delay = 3448 static struct md_sysfs_entry md_safe_delay =
3449 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3449 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
3450 3450
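Editor's note: safe_mode_delay is therefore written in seconds with millisecond resolution. safe_delay_store() parses the string with scale 3 into milliseconds and converts to jiffies as (msec*HZ)/1000, clamped to at least one jiffy so a small non-zero delay never rounds to "never". The arithmetic in isolation, with an illustrative HZ value:

/* The conversion performed by safe_delay_store(): milliseconds (parsed with
 * scale 3) -> jiffies, never rounding a non-zero request down to 0. */
#include <stdio.h>

#define HZ 250		/* illustrative; the kernel's HZ is a build-time constant */

int main(void)
{
	unsigned long msec = 253;			/* "0.253" parsed with scale 3 */
	unsigned long delay = (msec * HZ) / 1000;	/* 63 jiffies at HZ=250 */

	if (delay == 0)
		delay = 1;				/* same clamp as safe_delay_store() */
	printf("%lu ms -> %lu jiffies at HZ=%d\n", msec, delay, HZ);
	return 0;
}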
3451 static ssize_t 3451 static ssize_t
3452 level_show(struct mddev *mddev, char *page) 3452 level_show(struct mddev *mddev, char *page)
3453 { 3453 {
3454 struct md_personality *p = mddev->pers; 3454 struct md_personality *p = mddev->pers;
3455 if (p) 3455 if (p)
3456 return sprintf(page, "%s\n", p->name); 3456 return sprintf(page, "%s\n", p->name);
3457 else if (mddev->clevel[0]) 3457 else if (mddev->clevel[0])
3458 return sprintf(page, "%s\n", mddev->clevel); 3458 return sprintf(page, "%s\n", mddev->clevel);
3459 else if (mddev->level != LEVEL_NONE) 3459 else if (mddev->level != LEVEL_NONE)
3460 return sprintf(page, "%d\n", mddev->level); 3460 return sprintf(page, "%d\n", mddev->level);
3461 else 3461 else
3462 return 0; 3462 return 0;
3463 } 3463 }
3464 3464
3465 static ssize_t 3465 static ssize_t
3466 level_store(struct mddev *mddev, const char *buf, size_t len) 3466 level_store(struct mddev *mddev, const char *buf, size_t len)
3467 { 3467 {
3468 char clevel[16]; 3468 char clevel[16];
3469 ssize_t rv = len; 3469 ssize_t rv = len;
3470 struct md_personality *pers; 3470 struct md_personality *pers;
3471 long level; 3471 long level;
3472 void *priv; 3472 void *priv;
3473 struct md_rdev *rdev; 3473 struct md_rdev *rdev;
3474 3474
3475 if (mddev->pers == NULL) { 3475 if (mddev->pers == NULL) {
3476 if (len == 0) 3476 if (len == 0)
3477 return 0; 3477 return 0;
3478 if (len >= sizeof(mddev->clevel)) 3478 if (len >= sizeof(mddev->clevel))
3479 return -ENOSPC; 3479 return -ENOSPC;
3480 strncpy(mddev->clevel, buf, len); 3480 strncpy(mddev->clevel, buf, len);
3481 if (mddev->clevel[len-1] == '\n') 3481 if (mddev->clevel[len-1] == '\n')
3482 len--; 3482 len--;
3483 mddev->clevel[len] = 0; 3483 mddev->clevel[len] = 0;
3484 mddev->level = LEVEL_NONE; 3484 mddev->level = LEVEL_NONE;
3485 return rv; 3485 return rv;
3486 } 3486 }
3487 3487
3488 /* request to change the personality. Need to ensure: 3488 /* request to change the personality. Need to ensure:
3489 * - array is not engaged in resync/recovery/reshape 3489 * - array is not engaged in resync/recovery/reshape
3490 * - old personality can be suspended 3490 * - old personality can be suspended
3491 * - new personality will access other array. 3491 * - new personality will access other array.
3492 */ 3492 */
3493 3493
3494 if (mddev->sync_thread || 3494 if (mddev->sync_thread ||
3495 mddev->reshape_position != MaxSector || 3495 mddev->reshape_position != MaxSector ||
3496 mddev->sysfs_active) 3496 mddev->sysfs_active)
3497 return -EBUSY; 3497 return -EBUSY;
3498 3498
3499 if (!mddev->pers->quiesce) { 3499 if (!mddev->pers->quiesce) {
3500 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3500 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3501 mdname(mddev), mddev->pers->name); 3501 mdname(mddev), mddev->pers->name);
3502 return -EINVAL; 3502 return -EINVAL;
3503 } 3503 }
3504 3504
3505 /* Now find the new personality */ 3505 /* Now find the new personality */
3506 if (len == 0 || len >= sizeof(clevel)) 3506 if (len == 0 || len >= sizeof(clevel))
3507 return -EINVAL; 3507 return -EINVAL;
3508 strncpy(clevel, buf, len); 3508 strncpy(clevel, buf, len);
3509 if (clevel[len-1] == '\n') 3509 if (clevel[len-1] == '\n')
3510 len--; 3510 len--;
3511 clevel[len] = 0; 3511 clevel[len] = 0;
3512 if (strict_strtol(clevel, 10, &level)) 3512 if (strict_strtol(clevel, 10, &level))
3513 level = LEVEL_NONE; 3513 level = LEVEL_NONE;
3514 3514
3515 if (request_module("md-%s", clevel) != 0) 3515 if (request_module("md-%s", clevel) != 0)
3516 request_module("md-level-%s", clevel); 3516 request_module("md-level-%s", clevel);
3517 spin_lock(&pers_lock); 3517 spin_lock(&pers_lock);
3518 pers = find_pers(level, clevel); 3518 pers = find_pers(level, clevel);
3519 if (!pers || !try_module_get(pers->owner)) { 3519 if (!pers || !try_module_get(pers->owner)) {
3520 spin_unlock(&pers_lock); 3520 spin_unlock(&pers_lock);
3521 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3521 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3522 return -EINVAL; 3522 return -EINVAL;
3523 } 3523 }
3524 spin_unlock(&pers_lock); 3524 spin_unlock(&pers_lock);
3525 3525
3526 if (pers == mddev->pers) { 3526 if (pers == mddev->pers) {
3527 /* Nothing to do! */ 3527 /* Nothing to do! */
3528 module_put(pers->owner); 3528 module_put(pers->owner);
3529 return rv; 3529 return rv;
3530 } 3530 }
3531 if (!pers->takeover) { 3531 if (!pers->takeover) {
3532 module_put(pers->owner); 3532 module_put(pers->owner);
3533 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3533 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3534 mdname(mddev), clevel); 3534 mdname(mddev), clevel);
3535 return -EINVAL; 3535 return -EINVAL;
3536 } 3536 }
3537 3537
3538 rdev_for_each(rdev, mddev) 3538 rdev_for_each(rdev, mddev)
3539 rdev->new_raid_disk = rdev->raid_disk; 3539 rdev->new_raid_disk = rdev->raid_disk;
3540 3540
3541 /* ->takeover must set new_* and/or delta_disks 3541 /* ->takeover must set new_* and/or delta_disks
3542 * if it succeeds, and may set them when it fails. 3542 * if it succeeds, and may set them when it fails.
3543 */ 3543 */
3544 priv = pers->takeover(mddev); 3544 priv = pers->takeover(mddev);
3545 if (IS_ERR(priv)) { 3545 if (IS_ERR(priv)) {
3546 mddev->new_level = mddev->level; 3546 mddev->new_level = mddev->level;
3547 mddev->new_layout = mddev->layout; 3547 mddev->new_layout = mddev->layout;
3548 mddev->new_chunk_sectors = mddev->chunk_sectors; 3548 mddev->new_chunk_sectors = mddev->chunk_sectors;
3549 mddev->raid_disks -= mddev->delta_disks; 3549 mddev->raid_disks -= mddev->delta_disks;
3550 mddev->delta_disks = 0; 3550 mddev->delta_disks = 0;
3551 mddev->reshape_backwards = 0; 3551 mddev->reshape_backwards = 0;
3552 module_put(pers->owner); 3552 module_put(pers->owner);
3553 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3553 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3554 mdname(mddev), clevel); 3554 mdname(mddev), clevel);
3555 return PTR_ERR(priv); 3555 return PTR_ERR(priv);
3556 } 3556 }
3557 3557
3558 /* Looks like we have a winner */ 3558 /* Looks like we have a winner */
3559 mddev_suspend(mddev); 3559 mddev_suspend(mddev);
3560 mddev->pers->stop(mddev); 3560 mddev->pers->stop(mddev);
3561 3561
3562 if (mddev->pers->sync_request == NULL && 3562 if (mddev->pers->sync_request == NULL &&
3563 pers->sync_request != NULL) { 3563 pers->sync_request != NULL) {
3564 /* need to add the md_redundancy_group */ 3564 /* need to add the md_redundancy_group */
3565 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3565 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3566 printk(KERN_WARNING 3566 printk(KERN_WARNING
3567 "md: cannot register extra attributes for %s\n", 3567 "md: cannot register extra attributes for %s\n",
3568 mdname(mddev)); 3568 mdname(mddev));
3569 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action"); 3569 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, NULL, "sync_action");
3570 } 3570 }
3571 if (mddev->pers->sync_request != NULL && 3571 if (mddev->pers->sync_request != NULL &&
3572 pers->sync_request == NULL) { 3572 pers->sync_request == NULL) {
3573 /* need to remove the md_redundancy_group */ 3573 /* need to remove the md_redundancy_group */
3574 if (mddev->to_remove == NULL) 3574 if (mddev->to_remove == NULL)
3575 mddev->to_remove = &md_redundancy_group; 3575 mddev->to_remove = &md_redundancy_group;
3576 } 3576 }
3577 3577
3578 if (mddev->pers->sync_request == NULL && 3578 if (mddev->pers->sync_request == NULL &&
3579 mddev->external) { 3579 mddev->external) {
3580 /* We are converting from a no-redundancy array 3580 /* We are converting from a no-redundancy array
3581 * to a redundancy array and metadata is managed 3581 * to a redundancy array and metadata is managed
3582 * externally so we need to be sure that writes 3582 * externally so we need to be sure that writes
3583 * won't block due to a need to transition 3583 * won't block due to a need to transition
3584 * clean->dirty 3584 * clean->dirty
3585 * until external management is started. 3585 * until external management is started.
3586 */ 3586 */
3587 mddev->in_sync = 0; 3587 mddev->in_sync = 0;
3588 mddev->safemode_delay = 0; 3588 mddev->safemode_delay = 0;
3589 mddev->safemode = 0; 3589 mddev->safemode = 0;
3590 } 3590 }
3591 3591
3592 rdev_for_each(rdev, mddev) { 3592 rdev_for_each(rdev, mddev) {
3593 if (rdev->raid_disk < 0) 3593 if (rdev->raid_disk < 0)
3594 continue; 3594 continue;
3595 if (rdev->new_raid_disk >= mddev->raid_disks) 3595 if (rdev->new_raid_disk >= mddev->raid_disks)
3596 rdev->new_raid_disk = -1; 3596 rdev->new_raid_disk = -1;
3597 if (rdev->new_raid_disk == rdev->raid_disk) 3597 if (rdev->new_raid_disk == rdev->raid_disk)
3598 continue; 3598 continue;
3599 sysfs_unlink_rdev(mddev, rdev); 3599 sysfs_unlink_rdev(mddev, rdev);
3600 } 3600 }
3601 rdev_for_each(rdev, mddev) { 3601 rdev_for_each(rdev, mddev) {
3602 if (rdev->raid_disk < 0) 3602 if (rdev->raid_disk < 0)
3603 continue; 3603 continue;
3604 if (rdev->new_raid_disk == rdev->raid_disk) 3604 if (rdev->new_raid_disk == rdev->raid_disk)
3605 continue; 3605 continue;
3606 rdev->raid_disk = rdev->new_raid_disk; 3606 rdev->raid_disk = rdev->new_raid_disk;
3607 if (rdev->raid_disk < 0) 3607 if (rdev->raid_disk < 0)
3608 clear_bit(In_sync, &rdev->flags); 3608 clear_bit(In_sync, &rdev->flags);
3609 else { 3609 else {
3610 if (sysfs_link_rdev(mddev, rdev)) 3610 if (sysfs_link_rdev(mddev, rdev))
3611 printk(KERN_WARNING "md: cannot register rd%d" 3611 printk(KERN_WARNING "md: cannot register rd%d"
3612 " for %s after level change\n", 3612 " for %s after level change\n",
3613 rdev->raid_disk, mdname(mddev)); 3613 rdev->raid_disk, mdname(mddev));
3614 } 3614 }
3615 } 3615 }
3616 3616
3617 module_put(mddev->pers->owner); 3617 module_put(mddev->pers->owner);
3618 mddev->pers = pers; 3618 mddev->pers = pers;
3619 mddev->private = priv; 3619 mddev->private = priv;
3620 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3620 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3621 mddev->level = mddev->new_level; 3621 mddev->level = mddev->new_level;
3622 mddev->layout = mddev->new_layout; 3622 mddev->layout = mddev->new_layout;
3623 mddev->chunk_sectors = mddev->new_chunk_sectors; 3623 mddev->chunk_sectors = mddev->new_chunk_sectors;
3624 mddev->delta_disks = 0; 3624 mddev->delta_disks = 0;
3625 mddev->reshape_backwards = 0; 3625 mddev->reshape_backwards = 0;
3626 mddev->degraded = 0; 3626 mddev->degraded = 0;
3627 if (mddev->pers->sync_request == NULL) { 3627 if (mddev->pers->sync_request == NULL) {
3628 /* this is now an array without redundancy, so 3628 /* this is now an array without redundancy, so
3629 * it must always be in_sync 3629 * it must always be in_sync
3630 */ 3630 */
3631 mddev->in_sync = 1; 3631 mddev->in_sync = 1;
3632 del_timer_sync(&mddev->safemode_timer); 3632 del_timer_sync(&mddev->safemode_timer);
3633 } 3633 }
3634 pers->run(mddev); 3634 pers->run(mddev);
3635 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3635 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3636 mddev_resume(mddev); 3636 mddev_resume(mddev);
3637 sysfs_notify(&mddev->kobj, NULL, "level"); 3637 sysfs_notify(&mddev->kobj, NULL, "level");
3638 md_new_event(mddev); 3638 md_new_event(mddev);
3639 return rv; 3639 return rv;
3640 } 3640 }
3641 3641
3642 static struct md_sysfs_entry md_level = 3642 static struct md_sysfs_entry md_level =
3643 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3643 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
3644 3644
3645 3645
3646 static ssize_t 3646 static ssize_t
3647 layout_show(struct mddev *mddev, char *page) 3647 layout_show(struct mddev *mddev, char *page)
3648 { 3648 {
3649 /* just a number, not meaningful for all levels */ 3649 /* just a number, not meaningful for all levels */
3650 if (mddev->reshape_position != MaxSector && 3650 if (mddev->reshape_position != MaxSector &&
3651 mddev->layout != mddev->new_layout) 3651 mddev->layout != mddev->new_layout)
3652 return sprintf(page, "%d (%d)\n", 3652 return sprintf(page, "%d (%d)\n",
3653 mddev->new_layout, mddev->layout); 3653 mddev->new_layout, mddev->layout);
3654 return sprintf(page, "%d\n", mddev->layout); 3654 return sprintf(page, "%d\n", mddev->layout);
3655 } 3655 }
3656 3656
3657 static ssize_t 3657 static ssize_t
3658 layout_store(struct mddev *mddev, const char *buf, size_t len) 3658 layout_store(struct mddev *mddev, const char *buf, size_t len)
3659 { 3659 {
3660 char *e; 3660 char *e;
3661 unsigned long n = simple_strtoul(buf, &e, 10); 3661 unsigned long n = simple_strtoul(buf, &e, 10);
3662 3662
3663 if (!*buf || (*e && *e != '\n')) 3663 if (!*buf || (*e && *e != '\n'))
3664 return -EINVAL; 3664 return -EINVAL;
3665 3665
3666 if (mddev->pers) { 3666 if (mddev->pers) {
3667 int err; 3667 int err;
3668 if (mddev->pers->check_reshape == NULL) 3668 if (mddev->pers->check_reshape == NULL)
3669 return -EBUSY; 3669 return -EBUSY;
3670 mddev->new_layout = n; 3670 mddev->new_layout = n;
3671 err = mddev->pers->check_reshape(mddev); 3671 err = mddev->pers->check_reshape(mddev);
3672 if (err) { 3672 if (err) {
3673 mddev->new_layout = mddev->layout; 3673 mddev->new_layout = mddev->layout;
3674 return err; 3674 return err;
3675 } 3675 }
3676 } else { 3676 } else {
3677 mddev->new_layout = n; 3677 mddev->new_layout = n;
3678 if (mddev->reshape_position == MaxSector) 3678 if (mddev->reshape_position == MaxSector)
3679 mddev->layout = n; 3679 mddev->layout = n;
3680 } 3680 }
3681 return len; 3681 return len;
3682 } 3682 }
3683 static struct md_sysfs_entry md_layout = 3683 static struct md_sysfs_entry md_layout =
3684 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3684 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3685 3685
3686 3686
3687 static ssize_t 3687 static ssize_t
3688 raid_disks_show(struct mddev *mddev, char *page) 3688 raid_disks_show(struct mddev *mddev, char *page)
3689 { 3689 {
3690 if (mddev->raid_disks == 0) 3690 if (mddev->raid_disks == 0)
3691 return 0; 3691 return 0;
3692 if (mddev->reshape_position != MaxSector && 3692 if (mddev->reshape_position != MaxSector &&
3693 mddev->delta_disks != 0) 3693 mddev->delta_disks != 0)
3694 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3694 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3695 mddev->raid_disks - mddev->delta_disks); 3695 mddev->raid_disks - mddev->delta_disks);
3696 return sprintf(page, "%d\n", mddev->raid_disks); 3696 return sprintf(page, "%d\n", mddev->raid_disks);
3697 } 3697 }
3698 3698
3699 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3699 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3700 3700
3701 static ssize_t 3701 static ssize_t
3702 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3702 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3703 { 3703 {
3704 char *e; 3704 char *e;
3705 int rv = 0; 3705 int rv = 0;
3706 unsigned long n = simple_strtoul(buf, &e, 10); 3706 unsigned long n = simple_strtoul(buf, &e, 10);
3707 3707
3708 if (!*buf || (*e && *e != '\n')) 3708 if (!*buf || (*e && *e != '\n'))
3709 return -EINVAL; 3709 return -EINVAL;
3710 3710
3711 if (mddev->pers) 3711 if (mddev->pers)
3712 rv = update_raid_disks(mddev, n); 3712 rv = update_raid_disks(mddev, n);
3713 else if (mddev->reshape_position != MaxSector) { 3713 else if (mddev->reshape_position != MaxSector) {
3714 struct md_rdev *rdev; 3714 struct md_rdev *rdev;
3715 int olddisks = mddev->raid_disks - mddev->delta_disks; 3715 int olddisks = mddev->raid_disks - mddev->delta_disks;
3716 3716
3717 rdev_for_each(rdev, mddev) { 3717 rdev_for_each(rdev, mddev) {
3718 if (olddisks < n && 3718 if (olddisks < n &&
3719 rdev->data_offset < rdev->new_data_offset) 3719 rdev->data_offset < rdev->new_data_offset)
3720 return -EINVAL; 3720 return -EINVAL;
3721 if (olddisks > n && 3721 if (olddisks > n &&
3722 rdev->data_offset > rdev->new_data_offset) 3722 rdev->data_offset > rdev->new_data_offset)
3723 return -EINVAL; 3723 return -EINVAL;
3724 } 3724 }
3725 mddev->delta_disks = n - olddisks; 3725 mddev->delta_disks = n - olddisks;
3726 mddev->raid_disks = n; 3726 mddev->raid_disks = n;
3727 mddev->reshape_backwards = (mddev->delta_disks < 0); 3727 mddev->reshape_backwards = (mddev->delta_disks < 0);
3728 } else 3728 } else
3729 mddev->raid_disks = n; 3729 mddev->raid_disks = n;
3730 return rv ? rv : len; 3730 return rv ? rv : len;
3731 } 3731 }
3732 static struct md_sysfs_entry md_raid_disks = 3732 static struct md_sysfs_entry md_raid_disks =
3733 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3733 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3734 3734
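Editor's note: when raid_disks is written before the array is started but with reshape_position already set, raid_disks_store() derives delta_disks from the pre-reshape disk count (raid_disks - delta_disks) and lets the sign of the new delta decide reshape_backwards. A small numeric sketch of that bookkeeping, with made-up values:

/* Arithmetic used by raid_disks_store() for an array being assembled
 * mid-reshape: delta_disks becomes the difference from the old disk count. */
#include <stdio.h>

int main(void)
{
	int raid_disks = 6, delta_disks = 2;		/* a 4 -> 6 reshape already recorded */
	int olddisks = raid_disks - delta_disks;	/* 4 */
	int n = 5;					/* newly written raid_disks value */

	delta_disks = n - olddisks;			/* 1 */
	raid_disks = n;
	printf("raid_disks=%d delta_disks=%d backwards=%d\n",
	       raid_disks, delta_disks, delta_disks < 0);
	return 0;
}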
3735 static ssize_t 3735 static ssize_t
3736 chunk_size_show(struct mddev *mddev, char *page) 3736 chunk_size_show(struct mddev *mddev, char *page)
3737 { 3737 {
3738 if (mddev->reshape_position != MaxSector && 3738 if (mddev->reshape_position != MaxSector &&
3739 mddev->chunk_sectors != mddev->new_chunk_sectors) 3739 mddev->chunk_sectors != mddev->new_chunk_sectors)
3740 return sprintf(page, "%d (%d)\n", 3740 return sprintf(page, "%d (%d)\n",
3741 mddev->new_chunk_sectors << 9, 3741 mddev->new_chunk_sectors << 9,
3742 mddev->chunk_sectors << 9); 3742 mddev->chunk_sectors << 9);
3743 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3743 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3744 } 3744 }
3745 3745
3746 static ssize_t 3746 static ssize_t
3747 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3747 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3748 { 3748 {
3749 char *e; 3749 char *e;
3750 unsigned long n = simple_strtoul(buf, &e, 10); 3750 unsigned long n = simple_strtoul(buf, &e, 10);
3751 3751
3752 if (!*buf || (*e && *e != '\n')) 3752 if (!*buf || (*e && *e != '\n'))
3753 return -EINVAL; 3753 return -EINVAL;
3754 3754
3755 if (mddev->pers) { 3755 if (mddev->pers) {
3756 int err; 3756 int err;
3757 if (mddev->pers->check_reshape == NULL) 3757 if (mddev->pers->check_reshape == NULL)
3758 return -EBUSY; 3758 return -EBUSY;
3759 mddev->new_chunk_sectors = n >> 9; 3759 mddev->new_chunk_sectors = n >> 9;
3760 err = mddev->pers->check_reshape(mddev); 3760 err = mddev->pers->check_reshape(mddev);
3761 if (err) { 3761 if (err) {
3762 mddev->new_chunk_sectors = mddev->chunk_sectors; 3762 mddev->new_chunk_sectors = mddev->chunk_sectors;
3763 return err; 3763 return err;
3764 } 3764 }
3765 } else { 3765 } else {
3766 mddev->new_chunk_sectors = n >> 9; 3766 mddev->new_chunk_sectors = n >> 9;
3767 if (mddev->reshape_position == MaxSector) 3767 if (mddev->reshape_position == MaxSector)
3768 mddev->chunk_sectors = n >> 9; 3768 mddev->chunk_sectors = n >> 9;
3769 } 3769 }
3770 return len; 3770 return len;
3771 } 3771 }
3772 static struct md_sysfs_entry md_chunk_size = 3772 static struct md_sysfs_entry md_chunk_size =
3773 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3773 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3774 3774
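Editor's note: chunk_size is exposed and written in bytes but stored internally in 512-byte sectors, hence the << 9 on the read side and >> 9 on the write side of the handlers above. A quick sketch of the round trip:

/* Byte <-> sector conversion used by chunk_size_show()/chunk_size_store(). */
#include <stdio.h>

int main(void)
{
	unsigned long bytes = 65536;		/* value written to the sysfs file */
	int chunk_sectors = bytes >> 9;		/* stored internally: 128 sectors */

	printf("stored %d sectors, shown as %d bytes\n",
	       chunk_sectors, chunk_sectors << 9);
	return 0;
}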
3775 static ssize_t 3775 static ssize_t
3776 resync_start_show(struct mddev *mddev, char *page) 3776 resync_start_show(struct mddev *mddev, char *page)
3777 { 3777 {
3778 if (mddev->recovery_cp == MaxSector) 3778 if (mddev->recovery_cp == MaxSector)
3779 return sprintf(page, "none\n"); 3779 return sprintf(page, "none\n");
3780 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3780 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3781 } 3781 }
3782 3782
3783 static ssize_t 3783 static ssize_t
3784 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3784 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3785 { 3785 {
3786 char *e; 3786 char *e;
3787 unsigned long long n = simple_strtoull(buf, &e, 10); 3787 unsigned long long n = simple_strtoull(buf, &e, 10);
3788 3788
3789 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3789 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3790 return -EBUSY; 3790 return -EBUSY;
3791 if (cmd_match(buf, "none")) 3791 if (cmd_match(buf, "none"))
3792 n = MaxSector; 3792 n = MaxSector;
3793 else if (!*buf || (*e && *e != '\n')) 3793 else if (!*buf || (*e && *e != '\n'))
3794 return -EINVAL; 3794 return -EINVAL;
3795 3795
3796 mddev->recovery_cp = n; 3796 mddev->recovery_cp = n;
3797 if (mddev->pers) 3797 if (mddev->pers)
3798 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3798 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3799 return len; 3799 return len;
3800 } 3800 }
3801 static struct md_sysfs_entry md_resync_start = 3801 static struct md_sysfs_entry md_resync_start =
3802 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 3802 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3803 3803
3804 /* 3804 /*
3805 * The array state can be: 3805 * The array state can be:
3806 * 3806 *
3807 * clear 3807 * clear
3808 * No devices, no size, no level 3808 * No devices, no size, no level
3809 * Equivalent to STOP_ARRAY ioctl 3809 * Equivalent to STOP_ARRAY ioctl
3810 * inactive 3810 * inactive
3811 * May have some settings, but array is not active 3811 * May have some settings, but array is not active
3812 * all IO results in error 3812 * all IO results in error
3813 * When written, doesn't tear down array, but just stops it 3813 * When written, doesn't tear down array, but just stops it
3814 * suspended (not supported yet) 3814 * suspended (not supported yet)
3815 * All IO requests will block. The array can be reconfigured. 3815 * All IO requests will block. The array can be reconfigured.
3816 * Writing this, if accepted, will block until array is quiescent 3816 * Writing this, if accepted, will block until array is quiescent
3817 * readonly 3817 * readonly
3818 * no resync can happen. no superblocks get written. 3818 * no resync can happen. no superblocks get written.
3819 * write requests fail 3819 * write requests fail
3820 * read-auto 3820 * read-auto
3821 * like readonly, but behaves like 'clean' on a write request. 3821 * like readonly, but behaves like 'clean' on a write request.
3822 * 3822 *
3823 * clean - no pending writes, but otherwise active. 3823 * clean - no pending writes, but otherwise active.
3824 * When written to inactive array, starts without resync 3824 * When written to inactive array, starts without resync
3825 * If a write request arrives then 3825 * If a write request arrives then
3826 * if metadata is known, mark 'dirty' and switch to 'active'. 3826 * if metadata is known, mark 'dirty' and switch to 'active'.
3827 * if not known, block and switch to write-pending 3827 * if not known, block and switch to write-pending
3828 * If written to an active array that has pending writes, then fails. 3828 * If written to an active array that has pending writes, then fails.
3829 * active 3829 * active
3830 * fully active: IO and resync can be happening. 3830 * fully active: IO and resync can be happening.
3831 * When written to inactive array, starts with resync 3831 * When written to inactive array, starts with resync
3832 * 3832 *
3833 * write-pending 3833 * write-pending
3834 * clean, but writes are blocked waiting for 'active' to be written. 3834 * clean, but writes are blocked waiting for 'active' to be written.
3835 * 3835 *
3836 * active-idle 3836 * active-idle
3837 * like active, but no writes have been seen for a while (100msec). 3837 * like active, but no writes have been seen for a while (100msec).
3838 * 3838 *
3839 */ 3839 */
3840 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3840 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3841 write_pending, active_idle, bad_word}; 3841 write_pending, active_idle, bad_word};
3842 static char *array_states[] = { 3842 static char *array_states[] = {
3843 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3843 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3844 "write-pending", "active-idle", NULL }; 3844 "write-pending", "active-idle", NULL };
3845 3845
3846 static int match_word(const char *word, char **list) 3846 static int match_word(const char *word, char **list)
3847 { 3847 {
3848 int n; 3848 int n;
3849 for (n=0; list[n]; n++) 3849 for (n=0; list[n]; n++)
3850 if (cmd_match(word, list[n])) 3850 if (cmd_match(word, list[n]))
3851 break; 3851 break;
3852 return n; 3852 return n;
3853 } 3853 }
3854 3854
3855 static ssize_t 3855 static ssize_t
3856 array_state_show(struct mddev *mddev, char *page) 3856 array_state_show(struct mddev *mddev, char *page)
3857 { 3857 {
3858 enum array_state st = inactive; 3858 enum array_state st = inactive;
3859 3859
3860 if (mddev->pers) 3860 if (mddev->pers)
3861 switch(mddev->ro) { 3861 switch(mddev->ro) {
3862 case 1: 3862 case 1:
3863 st = readonly; 3863 st = readonly;
3864 break; 3864 break;
3865 case 2: 3865 case 2:
3866 st = read_auto; 3866 st = read_auto;
3867 break; 3867 break;
3868 case 0: 3868 case 0:
3869 if (mddev->in_sync) 3869 if (mddev->in_sync)
3870 st = clean; 3870 st = clean;
3871 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3871 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3872 st = write_pending; 3872 st = write_pending;
3873 else if (mddev->safemode) 3873 else if (mddev->safemode)
3874 st = active_idle; 3874 st = active_idle;
3875 else 3875 else
3876 st = active; 3876 st = active;
3877 } 3877 }
3878 else { 3878 else {
3879 if (list_empty(&mddev->disks) && 3879 if (list_empty(&mddev->disks) &&
3880 mddev->raid_disks == 0 && 3880 mddev->raid_disks == 0 &&
3881 mddev->dev_sectors == 0) 3881 mddev->dev_sectors == 0)
3882 st = clear; 3882 st = clear;
3883 else 3883 else
3884 st = inactive; 3884 st = inactive;
3885 } 3885 }
3886 return sprintf(page, "%s\n", array_states[st]); 3886 return sprintf(page, "%s\n", array_states[st]);
3887 } 3887 }
3888 3888
3889 static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); 3889 static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
3890 static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); 3890 static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
3891 static int do_md_run(struct mddev * mddev); 3891 static int do_md_run(struct mddev * mddev);
3892 static int restart_array(struct mddev *mddev); 3892 static int restart_array(struct mddev *mddev);
3893 3893
3894 static ssize_t 3894 static ssize_t
3895 array_state_store(struct mddev *mddev, const char *buf, size_t len) 3895 array_state_store(struct mddev *mddev, const char *buf, size_t len)
3896 { 3896 {
3897 int err = -EINVAL; 3897 int err = -EINVAL;
3898 enum array_state st = match_word(buf, array_states); 3898 enum array_state st = match_word(buf, array_states);
3899 switch(st) { 3899 switch(st) {
3900 case bad_word: 3900 case bad_word:
3901 break; 3901 break;
3902 case clear: 3902 case clear:
3903 /* stopping an active array */ 3903 /* stopping an active array */
3904 err = do_md_stop(mddev, 0, NULL); 3904 err = do_md_stop(mddev, 0, NULL);
3905 break; 3905 break;
3906 case inactive: 3906 case inactive:
3907 /* stopping an active array */ 3907 /* stopping an active array */
3908 if (mddev->pers) 3908 if (mddev->pers)
3909 err = do_md_stop(mddev, 2, NULL); 3909 err = do_md_stop(mddev, 2, NULL);
3910 else 3910 else
3911 err = 0; /* already inactive */ 3911 err = 0; /* already inactive */
3912 break; 3912 break;
3913 case suspended: 3913 case suspended:
3914 break; /* not supported yet */ 3914 break; /* not supported yet */
3915 case readonly: 3915 case readonly:
3916 if (mddev->pers) 3916 if (mddev->pers)
3917 err = md_set_readonly(mddev, NULL); 3917 err = md_set_readonly(mddev, NULL);
3918 else { 3918 else {
3919 mddev->ro = 1; 3919 mddev->ro = 1;
3920 set_disk_ro(mddev->gendisk, 1); 3920 set_disk_ro(mddev->gendisk, 1);
3921 err = do_md_run(mddev); 3921 err = do_md_run(mddev);
3922 } 3922 }
3923 break; 3923 break;
3924 case read_auto: 3924 case read_auto:
3925 if (mddev->pers) { 3925 if (mddev->pers) {
3926 if (mddev->ro == 0) 3926 if (mddev->ro == 0)
3927 err = md_set_readonly(mddev, NULL); 3927 err = md_set_readonly(mddev, NULL);
3928 else if (mddev->ro == 1) 3928 else if (mddev->ro == 1)
3929 err = restart_array(mddev); 3929 err = restart_array(mddev);
3930 if (err == 0) { 3930 if (err == 0) {
3931 mddev->ro = 2; 3931 mddev->ro = 2;
3932 set_disk_ro(mddev->gendisk, 0); 3932 set_disk_ro(mddev->gendisk, 0);
3933 } 3933 }
3934 } else { 3934 } else {
3935 mddev->ro = 2; 3935 mddev->ro = 2;
3936 err = do_md_run(mddev); 3936 err = do_md_run(mddev);
3937 } 3937 }
3938 break; 3938 break;
3939 case clean: 3939 case clean:
3940 if (mddev->pers) { 3940 if (mddev->pers) {
3941 restart_array(mddev); 3941 restart_array(mddev);
3942 spin_lock_irq(&mddev->write_lock); 3942 spin_lock_irq(&mddev->write_lock);
3943 if (atomic_read(&mddev->writes_pending) == 0) { 3943 if (atomic_read(&mddev->writes_pending) == 0) {
3944 if (mddev->in_sync == 0) { 3944 if (mddev->in_sync == 0) {
3945 mddev->in_sync = 1; 3945 mddev->in_sync = 1;
3946 if (mddev->safemode == 1) 3946 if (mddev->safemode == 1)
3947 mddev->safemode = 0; 3947 mddev->safemode = 0;
3948 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3948 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3949 } 3949 }
3950 err = 0; 3950 err = 0;
3951 } else 3951 } else
3952 err = -EBUSY; 3952 err = -EBUSY;
3953 spin_unlock_irq(&mddev->write_lock); 3953 spin_unlock_irq(&mddev->write_lock);
3954 } else 3954 } else
3955 err = -EINVAL; 3955 err = -EINVAL;
3956 break; 3956 break;
3957 case active: 3957 case active:
3958 if (mddev->pers) { 3958 if (mddev->pers) {
3959 restart_array(mddev); 3959 restart_array(mddev);
3960 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3960 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3961 wake_up(&mddev->sb_wait); 3961 wake_up(&mddev->sb_wait);
3962 err = 0; 3962 err = 0;
3963 } else { 3963 } else {
3964 mddev->ro = 0; 3964 mddev->ro = 0;
3965 set_disk_ro(mddev->gendisk, 0); 3965 set_disk_ro(mddev->gendisk, 0);
3966 err = do_md_run(mddev); 3966 err = do_md_run(mddev);
3967 } 3967 }
3968 break; 3968 break;
3969 case write_pending: 3969 case write_pending:
3970 case active_idle: 3970 case active_idle:
3971 /* these cannot be set */ 3971 /* these cannot be set */
3972 break; 3972 break;
3973 } 3973 }
3974 if (err) 3974 if (err)
3975 return err; 3975 return err;
3976 else { 3976 else {
3977 if (mddev->hold_active == UNTIL_IOCTL) 3977 if (mddev->hold_active == UNTIL_IOCTL)
3978 mddev->hold_active = 0; 3978 mddev->hold_active = 0;
3979 sysfs_notify_dirent_safe(mddev->sysfs_state); 3979 sysfs_notify_dirent_safe(mddev->sysfs_state);
3980 return len; 3980 return len;
3981 } 3981 }
3982 } 3982 }
3983 static struct md_sysfs_entry md_array_state = 3983 static struct md_sysfs_entry md_array_state =
3984 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3984 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
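The state machine documented in the comment block above is driven entirely through this array_state attribute. As a minimal user-space sketch (the device name md0, and therefore the path /sys/block/md0/md/array_state, is an illustrative assumption and not part of this commit), the current state can be read back and a transition to 'clean' requested like so:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		char state[64];
		ssize_t n;
		int fd = open("/sys/block/md0/md/array_state", O_RDWR); /* assumed path */

		if (fd < 0) {
			perror("open array_state");
			return 1;
		}

		/* array_state_show() formats the current state as a single word. */
		n = read(fd, state, sizeof(state) - 1);
		if (n > 0) {
			state[n] = '\0';
			printf("current state: %s", state);
		}

		/* Rewind so the whole word lands at offset 0 before the store runs. */
		lseek(fd, 0, SEEK_SET);
		if (write(fd, "clean", strlen("clean")) < 0)
			perror("write clean");

		close(fd);
		return 0;
	}

Any of the other documented words ("readonly", "active", and so on) are written the same way; array_state_store() returns -EINVAL or -EBUSY, as described above, when the requested transition is not allowed.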
3985 3985
3986 static ssize_t 3986 static ssize_t
3987 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 3987 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3988 return sprintf(page, "%d\n", 3988 return sprintf(page, "%d\n",
3989 atomic_read(&mddev->max_corr_read_errors)); 3989 atomic_read(&mddev->max_corr_read_errors));
3990 } 3990 }
3991 3991
3992 static ssize_t 3992 static ssize_t
3993 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 3993 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3994 { 3994 {
3995 char *e; 3995 char *e;
3996 unsigned long n = simple_strtoul(buf, &e, 10); 3996 unsigned long n = simple_strtoul(buf, &e, 10);
3997 3997
3998 if (*buf && (*e == 0 || *e == '\n')) { 3998 if (*buf && (*e == 0 || *e == '\n')) {
3999 atomic_set(&mddev->max_corr_read_errors, n); 3999 atomic_set(&mddev->max_corr_read_errors, n);
4000 return len; 4000 return len;
4001 } 4001 }
4002 return -EINVAL; 4002 return -EINVAL;
4003 } 4003 }
4004 4004
4005 static struct md_sysfs_entry max_corr_read_errors = 4005 static struct md_sysfs_entry max_corr_read_errors =
4006 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 4006 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
4007 max_corrected_read_errors_store); 4007 max_corrected_read_errors_store);
4008 4008
4009 static ssize_t 4009 static ssize_t
4010 null_show(struct mddev *mddev, char *page) 4010 null_show(struct mddev *mddev, char *page)
4011 { 4011 {
4012 return -EINVAL; 4012 return -EINVAL;
4013 } 4013 }
4014 4014
4015 static ssize_t 4015 static ssize_t
4016 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 4016 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
4017 { 4017 {
4018 /* buf must be %d:%d\n? giving major and minor numbers */ 4018 /* buf must be %d:%d\n? giving major and minor numbers */
4019 /* The new device is added to the array. 4019 /* The new device is added to the array.
4020 * If the array has a persistent superblock, we read the 4020 * If the array has a persistent superblock, we read the
4021 * superblock to initialise info and check validity. 4021 * superblock to initialise info and check validity.
4022 * Otherwise, only checking done is that in bind_rdev_to_array, 4022 * Otherwise, only checking done is that in bind_rdev_to_array,
4023 * which mainly checks size. 4023 * which mainly checks size.
4024 */ 4024 */
4025 char *e; 4025 char *e;
4026 int major = simple_strtoul(buf, &e, 10); 4026 int major = simple_strtoul(buf, &e, 10);
4027 int minor; 4027 int minor;
4028 dev_t dev; 4028 dev_t dev;
4029 struct md_rdev *rdev; 4029 struct md_rdev *rdev;
4030 int err; 4030 int err;
4031 4031
4032 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 4032 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4033 return -EINVAL; 4033 return -EINVAL;
4034 minor = simple_strtoul(e+1, &e, 10); 4034 minor = simple_strtoul(e+1, &e, 10);
4035 if (*e && *e != '\n') 4035 if (*e && *e != '\n')
4036 return -EINVAL; 4036 return -EINVAL;
4037 dev = MKDEV(major, minor); 4037 dev = MKDEV(major, minor);
4038 if (major != MAJOR(dev) || 4038 if (major != MAJOR(dev) ||
4039 minor != MINOR(dev)) 4039 minor != MINOR(dev))
4040 return -EOVERFLOW; 4040 return -EOVERFLOW;
4041 4041
4042 4042
4043 if (mddev->persistent) { 4043 if (mddev->persistent) {
4044 rdev = md_import_device(dev, mddev->major_version, 4044 rdev = md_import_device(dev, mddev->major_version,
4045 mddev->minor_version); 4045 mddev->minor_version);
4046 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4046 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4047 struct md_rdev *rdev0 4047 struct md_rdev *rdev0
4048 = list_entry(mddev->disks.next, 4048 = list_entry(mddev->disks.next,
4049 struct md_rdev, same_set); 4049 struct md_rdev, same_set);
4050 err = super_types[mddev->major_version] 4050 err = super_types[mddev->major_version]
4051 .load_super(rdev, rdev0, mddev->minor_version); 4051 .load_super(rdev, rdev0, mddev->minor_version);
4052 if (err < 0) 4052 if (err < 0)
4053 goto out; 4053 goto out;
4054 } 4054 }
4055 } else if (mddev->external) 4055 } else if (mddev->external)
4056 rdev = md_import_device(dev, -2, -1); 4056 rdev = md_import_device(dev, -2, -1);
4057 else 4057 else
4058 rdev = md_import_device(dev, -1, -1); 4058 rdev = md_import_device(dev, -1, -1);
4059 4059
4060 if (IS_ERR(rdev)) 4060 if (IS_ERR(rdev))
4061 return PTR_ERR(rdev); 4061 return PTR_ERR(rdev);
4062 err = bind_rdev_to_array(rdev, mddev); 4062 err = bind_rdev_to_array(rdev, mddev);
4063 out: 4063 out:
4064 if (err) 4064 if (err)
4065 export_rdev(rdev); 4065 export_rdev(rdev);
4066 return err ? err : len; 4066 return err ? err : len;
4067 } 4067 }
4068 4068
4069 static struct md_sysfs_entry md_new_device = 4069 static struct md_sysfs_entry md_new_device =
4070 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4070 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4071 4071
4072 static ssize_t 4072 static ssize_t
4073 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4073 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4074 { 4074 {
4075 char *end; 4075 char *end;
4076 unsigned long chunk, end_chunk; 4076 unsigned long chunk, end_chunk;
4077 4077
4078 if (!mddev->bitmap) 4078 if (!mddev->bitmap)
4079 goto out; 4079 goto out;
4080 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4080 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4081 while (*buf) { 4081 while (*buf) {
4082 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4082 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4083 if (buf == end) break; 4083 if (buf == end) break;
4084 if (*end == '-') { /* range */ 4084 if (*end == '-') { /* range */
4085 buf = end + 1; 4085 buf = end + 1;
4086 end_chunk = simple_strtoul(buf, &end, 0); 4086 end_chunk = simple_strtoul(buf, &end, 0);
4087 if (buf == end) break; 4087 if (buf == end) break;
4088 } 4088 }
4089 if (*end && !isspace(*end)) break; 4089 if (*end && !isspace(*end)) break;
4090 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4090 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4091 buf = skip_spaces(end); 4091 buf = skip_spaces(end);
4092 } 4092 }
4093 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4093 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4094 out: 4094 out:
4095 return len; 4095 return len;
4096 } 4096 }
4097 4097
4098 static struct md_sysfs_entry md_bitmap = 4098 static struct md_sysfs_entry md_bitmap =
4099 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4099 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4100 4100
4101 static ssize_t 4101 static ssize_t
4102 size_show(struct mddev *mddev, char *page) 4102 size_show(struct mddev *mddev, char *page)
4103 { 4103 {
4104 return sprintf(page, "%llu\n", 4104 return sprintf(page, "%llu\n",
4105 (unsigned long long)mddev->dev_sectors / 2); 4105 (unsigned long long)mddev->dev_sectors / 2);
4106 } 4106 }
4107 4107
4108 static int update_size(struct mddev *mddev, sector_t num_sectors); 4108 static int update_size(struct mddev *mddev, sector_t num_sectors);
4109 4109
4110 static ssize_t 4110 static ssize_t
4111 size_store(struct mddev *mddev, const char *buf, size_t len) 4111 size_store(struct mddev *mddev, const char *buf, size_t len)
4112 { 4112 {
4113 /* If array is inactive, we can reduce the component size, but 4113 /* If array is inactive, we can reduce the component size, but
4114 * not increase it (except from 0). 4114 * not increase it (except from 0).
4115 * If array is active, we can try an on-line resize 4115 * If array is active, we can try an on-line resize
4116 */ 4116 */
4117 sector_t sectors; 4117 sector_t sectors;
4118 int err = strict_blocks_to_sectors(buf, &sectors); 4118 int err = strict_blocks_to_sectors(buf, &sectors);
4119 4119
4120 if (err < 0) 4120 if (err < 0)
4121 return err; 4121 return err;
4122 if (mddev->pers) { 4122 if (mddev->pers) {
4123 err = update_size(mddev, sectors); 4123 err = update_size(mddev, sectors);
4124 md_update_sb(mddev, 1); 4124 md_update_sb(mddev, 1);
4125 } else { 4125 } else {
4126 if (mddev->dev_sectors == 0 || 4126 if (mddev->dev_sectors == 0 ||
4127 mddev->dev_sectors > sectors) 4127 mddev->dev_sectors > sectors)
4128 mddev->dev_sectors = sectors; 4128 mddev->dev_sectors = sectors;
4129 else 4129 else
4130 err = -ENOSPC; 4130 err = -ENOSPC;
4131 } 4131 }
4132 return err ? err : len; 4132 return err ? err : len;
4133 } 4133 }
4134 4134
4135 static struct md_sysfs_entry md_size = 4135 static struct md_sysfs_entry md_size =
4136 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4136 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4137 4137
4138 4138
4139 /* Metadata version. 4139 /* Metadata version.
4140 * This is one of 4140 * This is one of
4141 * 'none' for arrays with no metadata (good luck...) 4141 * 'none' for arrays with no metadata (good luck...)
4142 * 'external' for arrays with externally managed metadata, 4142 * 'external' for arrays with externally managed metadata,
4143 * or N.M for internally known formats 4143 * or N.M for internally known formats
4144 */ 4144 */
4145 static ssize_t 4145 static ssize_t
4146 metadata_show(struct mddev *mddev, char *page) 4146 metadata_show(struct mddev *mddev, char *page)
4147 { 4147 {
4148 if (mddev->persistent) 4148 if (mddev->persistent)
4149 return sprintf(page, "%d.%d\n", 4149 return sprintf(page, "%d.%d\n",
4150 mddev->major_version, mddev->minor_version); 4150 mddev->major_version, mddev->minor_version);
4151 else if (mddev->external) 4151 else if (mddev->external)
4152 return sprintf(page, "external:%s\n", mddev->metadata_type); 4152 return sprintf(page, "external:%s\n", mddev->metadata_type);
4153 else 4153 else
4154 return sprintf(page, "none\n"); 4154 return sprintf(page, "none\n");
4155 } 4155 }
4156 4156
4157 static ssize_t 4157 static ssize_t
4158 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4158 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4159 { 4159 {
4160 int major, minor; 4160 int major, minor;
4161 char *e; 4161 char *e;
4162 /* Changing the details of 'external' metadata is 4162 /* Changing the details of 'external' metadata is
4163 * always permitted. Otherwise there must be 4163 * always permitted. Otherwise there must be
4164 * no devices attached to the array. 4164 * no devices attached to the array.
4165 */ 4165 */
4166 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4166 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4167 ; 4167 ;
4168 else if (!list_empty(&mddev->disks)) 4168 else if (!list_empty(&mddev->disks))
4169 return -EBUSY; 4169 return -EBUSY;
4170 4170
4171 if (cmd_match(buf, "none")) { 4171 if (cmd_match(buf, "none")) {
4172 mddev->persistent = 0; 4172 mddev->persistent = 0;
4173 mddev->external = 0; 4173 mddev->external = 0;
4174 mddev->major_version = 0; 4174 mddev->major_version = 0;
4175 mddev->minor_version = 90; 4175 mddev->minor_version = 90;
4176 return len; 4176 return len;
4177 } 4177 }
4178 if (strncmp(buf, "external:", 9) == 0) { 4178 if (strncmp(buf, "external:", 9) == 0) {
4179 size_t namelen = len-9; 4179 size_t namelen = len-9;
4180 if (namelen >= sizeof(mddev->metadata_type)) 4180 if (namelen >= sizeof(mddev->metadata_type))
4181 namelen = sizeof(mddev->metadata_type)-1; 4181 namelen = sizeof(mddev->metadata_type)-1;
4182 strncpy(mddev->metadata_type, buf+9, namelen); 4182 strncpy(mddev->metadata_type, buf+9, namelen);
4183 mddev->metadata_type[namelen] = 0; 4183 mddev->metadata_type[namelen] = 0;
4184 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4184 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4185 mddev->metadata_type[--namelen] = 0; 4185 mddev->metadata_type[--namelen] = 0;
4186 mddev->persistent = 0; 4186 mddev->persistent = 0;
4187 mddev->external = 1; 4187 mddev->external = 1;
4188 mddev->major_version = 0; 4188 mddev->major_version = 0;
4189 mddev->minor_version = 90; 4189 mddev->minor_version = 90;
4190 return len; 4190 return len;
4191 } 4191 }
4192 major = simple_strtoul(buf, &e, 10); 4192 major = simple_strtoul(buf, &e, 10);
4193 if (e==buf || *e != '.') 4193 if (e==buf || *e != '.')
4194 return -EINVAL; 4194 return -EINVAL;
4195 buf = e+1; 4195 buf = e+1;
4196 minor = simple_strtoul(buf, &e, 10); 4196 minor = simple_strtoul(buf, &e, 10);
4197 if (e==buf || (*e && *e != '\n') ) 4197 if (e==buf || (*e && *e != '\n') )
4198 return -EINVAL; 4198 return -EINVAL;
4199 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4199 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4200 return -ENOENT; 4200 return -ENOENT;
4201 mddev->major_version = major; 4201 mddev->major_version = major;
4202 mddev->minor_version = minor; 4202 mddev->minor_version = minor;
4203 mddev->persistent = 1; 4203 mddev->persistent = 1;
4204 mddev->external = 0; 4204 mddev->external = 0;
4205 return len; 4205 return len;
4206 } 4206 }
4207 4207
4208 static struct md_sysfs_entry md_metadata = 4208 static struct md_sysfs_entry md_metadata =
4209 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4209 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4210 4210
4211 static ssize_t 4211 static ssize_t
4212 action_show(struct mddev *mddev, char *page) 4212 action_show(struct mddev *mddev, char *page)
4213 { 4213 {
4214 char *type = "idle"; 4214 char *type = "idle";
4215 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4215 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4216 type = "frozen"; 4216 type = "frozen";
4217 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4217 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4218 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 4218 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
4219 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4219 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4220 type = "reshape"; 4220 type = "reshape";
4221 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4221 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4222 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 4222 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4223 type = "resync"; 4223 type = "resync";
4224 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 4224 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4225 type = "check"; 4225 type = "check";
4226 else 4226 else
4227 type = "repair"; 4227 type = "repair";
4228 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 4228 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4229 type = "recover"; 4229 type = "recover";
4230 } 4230 }
4231 return sprintf(page, "%s\n", type); 4231 return sprintf(page, "%s\n", type);
4232 } 4232 }
4233 4233
4234 static void reap_sync_thread(struct mddev *mddev);
4235
4236 static ssize_t 4234 static ssize_t
4237 action_store(struct mddev *mddev, const char *page, size_t len) 4235 action_store(struct mddev *mddev, const char *page, size_t len)
4238 { 4236 {
4239 if (!mddev->pers || !mddev->pers->sync_request) 4237 if (!mddev->pers || !mddev->pers->sync_request)
4240 return -EINVAL; 4238 return -EINVAL;
4241 4239
4242 if (cmd_match(page, "frozen")) 4240 if (cmd_match(page, "frozen"))
4243 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4241 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4244 else 4242 else
4245 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4243 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4246 4244
4247 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4245 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4248 if (mddev->sync_thread) { 4246 if (mddev->sync_thread) {
4249 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4247 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4250 reap_sync_thread(mddev); 4248 md_reap_sync_thread(mddev);
4251 } 4249 }
4252 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4250 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4253 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4251 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4254 return -EBUSY; 4252 return -EBUSY;
4255 else if (cmd_match(page, "resync")) 4253 else if (cmd_match(page, "resync"))
4256 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4254 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4257 else if (cmd_match(page, "recover")) { 4255 else if (cmd_match(page, "recover")) {
4258 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4256 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4259 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4257 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4260 } else if (cmd_match(page, "reshape")) { 4258 } else if (cmd_match(page, "reshape")) {
4261 int err; 4259 int err;
4262 if (mddev->pers->start_reshape == NULL) 4260 if (mddev->pers->start_reshape == NULL)
4263 return -EINVAL; 4261 return -EINVAL;
4264 err = mddev->pers->start_reshape(mddev); 4262 err = mddev->pers->start_reshape(mddev);
4265 if (err) 4263 if (err)
4266 return err; 4264 return err;
4267 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4265 sysfs_notify(&mddev->kobj, NULL, "degraded");
4268 } else { 4266 } else {
4269 if (cmd_match(page, "check")) 4267 if (cmd_match(page, "check"))
4270 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4268 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4271 else if (!cmd_match(page, "repair")) 4269 else if (!cmd_match(page, "repair"))
4272 return -EINVAL; 4270 return -EINVAL;
4273 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4271 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4274 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4272 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4275 } 4273 }
4276 if (mddev->ro == 2) { 4274 if (mddev->ro == 2) {
4277 /* A write to sync_action is enough to justify 4275 /* A write to sync_action is enough to justify
4278 * canceling read-auto mode 4276 * canceling read-auto mode
4279 */ 4277 */
4280 mddev->ro = 0; 4278 mddev->ro = 0;
4281 md_wakeup_thread(mddev->sync_thread); 4279 md_wakeup_thread(mddev->sync_thread);
4282 } 4280 }
4283 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4281 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4284 md_wakeup_thread(mddev->thread); 4282 md_wakeup_thread(mddev->thread);
4285 sysfs_notify_dirent_safe(mddev->sysfs_action); 4283 sysfs_notify_dirent_safe(mddev->sysfs_action);
4286 return len; 4284 return len;
4287 } 4285 }
4288 4286
4289 static ssize_t 4287 static ssize_t
4290 mismatch_cnt_show(struct mddev *mddev, char *page) 4288 mismatch_cnt_show(struct mddev *mddev, char *page)
4291 { 4289 {
4292 return sprintf(page, "%llu\n", 4290 return sprintf(page, "%llu\n",
4293 (unsigned long long) 4291 (unsigned long long)
4294 atomic64_read(&mddev->resync_mismatches)); 4292 atomic64_read(&mddev->resync_mismatches));
4295 } 4293 }
4296 4294
4297 static struct md_sysfs_entry md_scan_mode = 4295 static struct md_sysfs_entry md_scan_mode =
4298 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4296 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4299 4297
4300 4298
4301 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4299 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
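action_store() above is one of the call sites this commit converts: writing "idle" or "frozen" while a sync thread is running sets MD_RECOVERY_INTR and now reaps the thread through the exported md_reap_sync_thread(). From user space the same interface drives scrubs; a small sketch, again assuming an array whose attributes live under /sys/block/md0/md (an illustrative assumption), starts a "check", waits for sync_action to fall back to "idle", and then reads mismatch_cnt:

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	/* Read one sysfs attribute into buf and NUL-terminate it. */
	static ssize_t read_attr(const char *path, char *buf, size_t len)
	{
		int fd = open(path, O_RDONLY);
		ssize_t n;

		if (fd < 0)
			return -1;
		n = read(fd, buf, len - 1);
		close(fd);
		if (n > 0)
			buf[n] = '\0';
		return n;
	}

	int main(void)
	{
		char buf[64];
		int fd = open("/sys/block/md0/md/sync_action", O_WRONLY); /* assumed path */

		/* Kick off a read-only scrub; action_store() sets MD_RECOVERY_CHECK. */
		if (fd < 0 || write(fd, "check", 5) < 0) {
			perror("start check");
			return 1;
		}
		close(fd);

		/* Poll until action_show() reports "idle" again. */
		do {
			sleep(5);
			if (read_attr("/sys/block/md0/md/sync_action", buf, sizeof(buf)) < 0)
				return 1;
		} while (strncmp(buf, "idle", 4) != 0);

		/* mismatch_cnt reflects resync_mismatches accumulated by the check. */
		if (read_attr("/sys/block/md0/md/mismatch_cnt", buf, sizeof(buf)) > 0)
			printf("mismatch_cnt: %s", buf);
		return 0;
	}

Writing "idle" at any point during the run takes the md_reap_sync_thread() path shown above, interrupting the check instead of waiting for it to finish.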
4302 4300
4303 static ssize_t 4301 static ssize_t
4304 sync_min_show(struct mddev *mddev, char *page) 4302 sync_min_show(struct mddev *mddev, char *page)
4305 { 4303 {
4306 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4304 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4307 mddev->sync_speed_min ? "local": "system"); 4305 mddev->sync_speed_min ? "local": "system");
4308 } 4306 }
4309 4307
4310 static ssize_t 4308 static ssize_t
4311 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4309 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4312 { 4310 {
4313 int min; 4311 int min;
4314 char *e; 4312 char *e;
4315 if (strncmp(buf, "system", 6)==0) { 4313 if (strncmp(buf, "system", 6)==0) {
4316 mddev->sync_speed_min = 0; 4314 mddev->sync_speed_min = 0;
4317 return len; 4315 return len;
4318 } 4316 }
4319 min = simple_strtoul(buf, &e, 10); 4317 min = simple_strtoul(buf, &e, 10);
4320 if (buf == e || (*e && *e != '\n') || min <= 0) 4318 if (buf == e || (*e && *e != '\n') || min <= 0)
4321 return -EINVAL; 4319 return -EINVAL;
4322 mddev->sync_speed_min = min; 4320 mddev->sync_speed_min = min;
4323 return len; 4321 return len;
4324 } 4322 }
4325 4323
4326 static struct md_sysfs_entry md_sync_min = 4324 static struct md_sysfs_entry md_sync_min =
4327 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4325 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4328 4326
4329 static ssize_t 4327 static ssize_t
4330 sync_max_show(struct mddev *mddev, char *page) 4328 sync_max_show(struct mddev *mddev, char *page)
4331 { 4329 {
4332 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4330 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4333 mddev->sync_speed_max ? "local": "system"); 4331 mddev->sync_speed_max ? "local": "system");
4334 } 4332 }
4335 4333
4336 static ssize_t 4334 static ssize_t
4337 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4335 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4338 { 4336 {
4339 int max; 4337 int max;
4340 char *e; 4338 char *e;
4341 if (strncmp(buf, "system", 6)==0) { 4339 if (strncmp(buf, "system", 6)==0) {
4342 mddev->sync_speed_max = 0; 4340 mddev->sync_speed_max = 0;
4343 return len; 4341 return len;
4344 } 4342 }
4345 max = simple_strtoul(buf, &e, 10); 4343 max = simple_strtoul(buf, &e, 10);
4346 if (buf == e || (*e && *e != '\n') || max <= 0) 4344 if (buf == e || (*e && *e != '\n') || max <= 0)
4347 return -EINVAL; 4345 return -EINVAL;
4348 mddev->sync_speed_max = max; 4346 mddev->sync_speed_max = max;
4349 return len; 4347 return len;
4350 } 4348 }
4351 4349
4352 static struct md_sysfs_entry md_sync_max = 4350 static struct md_sysfs_entry md_sync_max =
4353 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4351 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4354 4352
4355 static ssize_t 4353 static ssize_t
4356 degraded_show(struct mddev *mddev, char *page) 4354 degraded_show(struct mddev *mddev, char *page)
4357 { 4355 {
4358 return sprintf(page, "%d\n", mddev->degraded); 4356 return sprintf(page, "%d\n", mddev->degraded);
4359 } 4357 }
4360 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4358 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4361 4359
4362 static ssize_t 4360 static ssize_t
4363 sync_force_parallel_show(struct mddev *mddev, char *page) 4361 sync_force_parallel_show(struct mddev *mddev, char *page)
4364 { 4362 {
4365 return sprintf(page, "%d\n", mddev->parallel_resync); 4363 return sprintf(page, "%d\n", mddev->parallel_resync);
4366 } 4364 }
4367 4365
4368 static ssize_t 4366 static ssize_t
4369 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4367 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4370 { 4368 {
4371 long n; 4369 long n;
4372 4370
4373 if (strict_strtol(buf, 10, &n)) 4371 if (strict_strtol(buf, 10, &n))
4374 return -EINVAL; 4372 return -EINVAL;
4375 4373
4376 if (n != 0 && n != 1) 4374 if (n != 0 && n != 1)
4377 return -EINVAL; 4375 return -EINVAL;
4378 4376
4379 mddev->parallel_resync = n; 4377 mddev->parallel_resync = n;
4380 4378
4381 if (mddev->sync_thread) 4379 if (mddev->sync_thread)
4382 wake_up(&resync_wait); 4380 wake_up(&resync_wait);
4383 4381
4384 return len; 4382 return len;
4385 } 4383 }
4386 4384
4387 /* force parallel resync, even with shared block devices */ 4385 /* force parallel resync, even with shared block devices */
4388 static struct md_sysfs_entry md_sync_force_parallel = 4386 static struct md_sysfs_entry md_sync_force_parallel =
4389 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4387 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4390 sync_force_parallel_show, sync_force_parallel_store); 4388 sync_force_parallel_show, sync_force_parallel_store);
4391 4389
4392 static ssize_t 4390 static ssize_t
4393 sync_speed_show(struct mddev *mddev, char *page) 4391 sync_speed_show(struct mddev *mddev, char *page)
4394 { 4392 {
4395 unsigned long resync, dt, db; 4393 unsigned long resync, dt, db;
4396 if (mddev->curr_resync == 0) 4394 if (mddev->curr_resync == 0)
4397 return sprintf(page, "none\n"); 4395 return sprintf(page, "none\n");
4398 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4396 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4399 dt = (jiffies - mddev->resync_mark) / HZ; 4397 dt = (jiffies - mddev->resync_mark) / HZ;
4400 if (!dt) dt++; 4398 if (!dt) dt++;
4401 db = resync - mddev->resync_mark_cnt; 4399 db = resync - mddev->resync_mark_cnt;
4402 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4400 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4403 } 4401 }
4404 4402
4405 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4403 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4406 4404
4407 static ssize_t 4405 static ssize_t
4408 sync_completed_show(struct mddev *mddev, char *page) 4406 sync_completed_show(struct mddev *mddev, char *page)
4409 { 4407 {
4410 unsigned long long max_sectors, resync; 4408 unsigned long long max_sectors, resync;
4411 4409
4412 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4410 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4413 return sprintf(page, "none\n"); 4411 return sprintf(page, "none\n");
4414 4412
4415 if (mddev->curr_resync == 1 || 4413 if (mddev->curr_resync == 1 ||
4416 mddev->curr_resync == 2) 4414 mddev->curr_resync == 2)
4417 return sprintf(page, "delayed\n"); 4415 return sprintf(page, "delayed\n");
4418 4416
4419 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4417 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4420 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4418 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4421 max_sectors = mddev->resync_max_sectors; 4419 max_sectors = mddev->resync_max_sectors;
4422 else 4420 else
4423 max_sectors = mddev->dev_sectors; 4421 max_sectors = mddev->dev_sectors;
4424 4422
4425 resync = mddev->curr_resync_completed; 4423 resync = mddev->curr_resync_completed;
4426 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4424 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4427 } 4425 }
4428 4426
4429 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 4427 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4430 4428
4431 static ssize_t 4429 static ssize_t
4432 min_sync_show(struct mddev *mddev, char *page) 4430 min_sync_show(struct mddev *mddev, char *page)
4433 { 4431 {
4434 return sprintf(page, "%llu\n", 4432 return sprintf(page, "%llu\n",
4435 (unsigned long long)mddev->resync_min); 4433 (unsigned long long)mddev->resync_min);
4436 } 4434 }
4437 static ssize_t 4435 static ssize_t
4438 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4436 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4439 { 4437 {
4440 unsigned long long min; 4438 unsigned long long min;
4441 if (strict_strtoull(buf, 10, &min)) 4439 if (strict_strtoull(buf, 10, &min))
4442 return -EINVAL; 4440 return -EINVAL;
4443 if (min > mddev->resync_max) 4441 if (min > mddev->resync_max)
4444 return -EINVAL; 4442 return -EINVAL;
4445 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4443 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4446 return -EBUSY; 4444 return -EBUSY;
4447 4445
4448 /* Must be a multiple of chunk_size */ 4446 /* Must be a multiple of chunk_size */
4449 if (mddev->chunk_sectors) { 4447 if (mddev->chunk_sectors) {
4450 sector_t temp = min; 4448 sector_t temp = min;
4451 if (sector_div(temp, mddev->chunk_sectors)) 4449 if (sector_div(temp, mddev->chunk_sectors))
4452 return -EINVAL; 4450 return -EINVAL;
4453 } 4451 }
4454 mddev->resync_min = min; 4452 mddev->resync_min = min;
4455 4453
4456 return len; 4454 return len;
4457 } 4455 }
4458 4456
4459 static struct md_sysfs_entry md_min_sync = 4457 static struct md_sysfs_entry md_min_sync =
4460 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4458 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4461 4459
4462 static ssize_t 4460 static ssize_t
4463 max_sync_show(struct mddev *mddev, char *page) 4461 max_sync_show(struct mddev *mddev, char *page)
4464 { 4462 {
4465 if (mddev->resync_max == MaxSector) 4463 if (mddev->resync_max == MaxSector)
4466 return sprintf(page, "max\n"); 4464 return sprintf(page, "max\n");
4467 else 4465 else
4468 return sprintf(page, "%llu\n", 4466 return sprintf(page, "%llu\n",
4469 (unsigned long long)mddev->resync_max); 4467 (unsigned long long)mddev->resync_max);
4470 } 4468 }
4471 static ssize_t 4469 static ssize_t
4472 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4470 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4473 { 4471 {
4474 if (strncmp(buf, "max", 3) == 0) 4472 if (strncmp(buf, "max", 3) == 0)
4475 mddev->resync_max = MaxSector; 4473 mddev->resync_max = MaxSector;
4476 else { 4474 else {
4477 unsigned long long max; 4475 unsigned long long max;
4478 if (strict_strtoull(buf, 10, &max)) 4476 if (strict_strtoull(buf, 10, &max))
4479 return -EINVAL; 4477 return -EINVAL;
4480 if (max < mddev->resync_min) 4478 if (max < mddev->resync_min)
4481 return -EINVAL; 4479 return -EINVAL;
4482 if (max < mddev->resync_max && 4480 if (max < mddev->resync_max &&
4483 mddev->ro == 0 && 4481 mddev->ro == 0 &&
4484 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4482 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4485 return -EBUSY; 4483 return -EBUSY;
4486 4484
4487 /* Must be a multiple of chunk_size */ 4485 /* Must be a multiple of chunk_size */
4488 if (mddev->chunk_sectors) { 4486 if (mddev->chunk_sectors) {
4489 sector_t temp = max; 4487 sector_t temp = max;
4490 if (sector_div(temp, mddev->chunk_sectors)) 4488 if (sector_div(temp, mddev->chunk_sectors))
4491 return -EINVAL; 4489 return -EINVAL;
4492 } 4490 }
4493 mddev->resync_max = max; 4491 mddev->resync_max = max;
4494 } 4492 }
4495 wake_up(&mddev->recovery_wait); 4493 wake_up(&mddev->recovery_wait);
4496 return len; 4494 return len;
4497 } 4495 }
4498 4496
4499 static struct md_sysfs_entry md_max_sync = 4497 static struct md_sysfs_entry md_max_sync =
4500 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4498 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4501 4499
4502 static ssize_t 4500 static ssize_t
4503 suspend_lo_show(struct mddev *mddev, char *page) 4501 suspend_lo_show(struct mddev *mddev, char *page)
4504 { 4502 {
4505 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4503 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4506 } 4504 }
4507 4505
4508 static ssize_t 4506 static ssize_t
4509 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4507 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4510 { 4508 {
4511 char *e; 4509 char *e;
4512 unsigned long long new = simple_strtoull(buf, &e, 10); 4510 unsigned long long new = simple_strtoull(buf, &e, 10);
4513 unsigned long long old = mddev->suspend_lo; 4511 unsigned long long old = mddev->suspend_lo;
4514 4512
4515 if (mddev->pers == NULL || 4513 if (mddev->pers == NULL ||
4516 mddev->pers->quiesce == NULL) 4514 mddev->pers->quiesce == NULL)
4517 return -EINVAL; 4515 return -EINVAL;
4518 if (buf == e || (*e && *e != '\n')) 4516 if (buf == e || (*e && *e != '\n'))
4519 return -EINVAL; 4517 return -EINVAL;
4520 4518
4521 mddev->suspend_lo = new; 4519 mddev->suspend_lo = new;
4522 if (new >= old) 4520 if (new >= old)
4523 /* Shrinking suspended region */ 4521 /* Shrinking suspended region */
4524 mddev->pers->quiesce(mddev, 2); 4522 mddev->pers->quiesce(mddev, 2);
4525 else { 4523 else {
4526 /* Expanding suspended region - need to wait */ 4524 /* Expanding suspended region - need to wait */
4527 mddev->pers->quiesce(mddev, 1); 4525 mddev->pers->quiesce(mddev, 1);
4528 mddev->pers->quiesce(mddev, 0); 4526 mddev->pers->quiesce(mddev, 0);
4529 } 4527 }
4530 return len; 4528 return len;
4531 } 4529 }
4532 static struct md_sysfs_entry md_suspend_lo = 4530 static struct md_sysfs_entry md_suspend_lo =
4533 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4531 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4534 4532
4535 4533
4536 static ssize_t 4534 static ssize_t
4537 suspend_hi_show(struct mddev *mddev, char *page) 4535 suspend_hi_show(struct mddev *mddev, char *page)
4538 { 4536 {
4539 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4537 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4540 } 4538 }
4541 4539
4542 static ssize_t 4540 static ssize_t
4543 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4541 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4544 { 4542 {
4545 char *e; 4543 char *e;
4546 unsigned long long new = simple_strtoull(buf, &e, 10); 4544 unsigned long long new = simple_strtoull(buf, &e, 10);
4547 unsigned long long old = mddev->suspend_hi; 4545 unsigned long long old = mddev->suspend_hi;
4548 4546
4549 if (mddev->pers == NULL || 4547 if (mddev->pers == NULL ||
4550 mddev->pers->quiesce == NULL) 4548 mddev->pers->quiesce == NULL)
4551 return -EINVAL; 4549 return -EINVAL;
4552 if (buf == e || (*e && *e != '\n')) 4550 if (buf == e || (*e && *e != '\n'))
4553 return -EINVAL; 4551 return -EINVAL;
4554 4552
4555 mddev->suspend_hi = new; 4553 mddev->suspend_hi = new;
4556 if (new <= old) 4554 if (new <= old)
4557 /* Shrinking suspended region */ 4555 /* Shrinking suspended region */
4558 mddev->pers->quiesce(mddev, 2); 4556 mddev->pers->quiesce(mddev, 2);
4559 else { 4557 else {
4560 /* Expanding suspended region - need to wait */ 4558 /* Expanding suspended region - need to wait */
4561 mddev->pers->quiesce(mddev, 1); 4559 mddev->pers->quiesce(mddev, 1);
4562 mddev->pers->quiesce(mddev, 0); 4560 mddev->pers->quiesce(mddev, 0);
4563 } 4561 }
4564 return len; 4562 return len;
4565 } 4563 }
4566 static struct md_sysfs_entry md_suspend_hi = 4564 static struct md_sysfs_entry md_suspend_hi =
4567 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4565 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4568 4566
4569 static ssize_t 4567 static ssize_t
4570 reshape_position_show(struct mddev *mddev, char *page) 4568 reshape_position_show(struct mddev *mddev, char *page)
4571 { 4569 {
4572 if (mddev->reshape_position != MaxSector) 4570 if (mddev->reshape_position != MaxSector)
4573 return sprintf(page, "%llu\n", 4571 return sprintf(page, "%llu\n",
4574 (unsigned long long)mddev->reshape_position); 4572 (unsigned long long)mddev->reshape_position);
4575 strcpy(page, "none\n"); 4573 strcpy(page, "none\n");
4576 return 5; 4574 return 5;
4577 } 4575 }
4578 4576
4579 static ssize_t 4577 static ssize_t
4580 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4578 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4581 { 4579 {
4582 struct md_rdev *rdev; 4580 struct md_rdev *rdev;
4583 char *e; 4581 char *e;
4584 unsigned long long new = simple_strtoull(buf, &e, 10); 4582 unsigned long long new = simple_strtoull(buf, &e, 10);
4585 if (mddev->pers) 4583 if (mddev->pers)
4586 return -EBUSY; 4584 return -EBUSY;
4587 if (buf == e || (*e && *e != '\n')) 4585 if (buf == e || (*e && *e != '\n'))
4588 return -EINVAL; 4586 return -EINVAL;
4589 mddev->reshape_position = new; 4587 mddev->reshape_position = new;
4590 mddev->delta_disks = 0; 4588 mddev->delta_disks = 0;
4591 mddev->reshape_backwards = 0; 4589 mddev->reshape_backwards = 0;
4592 mddev->new_level = mddev->level; 4590 mddev->new_level = mddev->level;
4593 mddev->new_layout = mddev->layout; 4591 mddev->new_layout = mddev->layout;
4594 mddev->new_chunk_sectors = mddev->chunk_sectors; 4592 mddev->new_chunk_sectors = mddev->chunk_sectors;
4595 rdev_for_each(rdev, mddev) 4593 rdev_for_each(rdev, mddev)
4596 rdev->new_data_offset = rdev->data_offset; 4594 rdev->new_data_offset = rdev->data_offset;
4597 return len; 4595 return len;
4598 } 4596 }
4599 4597
4600 static struct md_sysfs_entry md_reshape_position = 4598 static struct md_sysfs_entry md_reshape_position =
4601 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4599 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4602 reshape_position_store); 4600 reshape_position_store);
4603 4601
4604 static ssize_t 4602 static ssize_t
4605 reshape_direction_show(struct mddev *mddev, char *page) 4603 reshape_direction_show(struct mddev *mddev, char *page)
4606 { 4604 {
4607 return sprintf(page, "%s\n", 4605 return sprintf(page, "%s\n",
4608 mddev->reshape_backwards ? "backwards" : "forwards"); 4606 mddev->reshape_backwards ? "backwards" : "forwards");
4609 } 4607 }
4610 4608
4611 static ssize_t 4609 static ssize_t
4612 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4610 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4613 { 4611 {
4614 int backwards = 0; 4612 int backwards = 0;
4615 if (cmd_match(buf, "forwards")) 4613 if (cmd_match(buf, "forwards"))
4616 backwards = 0; 4614 backwards = 0;
4617 else if (cmd_match(buf, "backwards")) 4615 else if (cmd_match(buf, "backwards"))
4618 backwards = 1; 4616 backwards = 1;
4619 else 4617 else
4620 return -EINVAL; 4618 return -EINVAL;
4621 if (mddev->reshape_backwards == backwards) 4619 if (mddev->reshape_backwards == backwards)
4622 return len; 4620 return len;
4623 4621
4624 /* check if we are allowed to change */ 4622 /* check if we are allowed to change */
4625 if (mddev->delta_disks) 4623 if (mddev->delta_disks)
4626 return -EBUSY; 4624 return -EBUSY;
4627 4625
4628 if (mddev->persistent && 4626 if (mddev->persistent &&
4629 mddev->major_version == 0) 4627 mddev->major_version == 0)
4630 return -EINVAL; 4628 return -EINVAL;
4631 4629
4632 mddev->reshape_backwards = backwards; 4630 mddev->reshape_backwards = backwards;
4633 return len; 4631 return len;
4634 } 4632 }
4635 4633
4636 static struct md_sysfs_entry md_reshape_direction = 4634 static struct md_sysfs_entry md_reshape_direction =
4637 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4635 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4638 reshape_direction_store); 4636 reshape_direction_store);
4639 4637
4640 static ssize_t 4638 static ssize_t
4641 array_size_show(struct mddev *mddev, char *page) 4639 array_size_show(struct mddev *mddev, char *page)
4642 { 4640 {
4643 if (mddev->external_size) 4641 if (mddev->external_size)
4644 return sprintf(page, "%llu\n", 4642 return sprintf(page, "%llu\n",
4645 (unsigned long long)mddev->array_sectors/2); 4643 (unsigned long long)mddev->array_sectors/2);
4646 else 4644 else
4647 return sprintf(page, "default\n"); 4645 return sprintf(page, "default\n");
4648 } 4646 }
4649 4647
4650 static ssize_t 4648 static ssize_t
4651 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4649 array_size_store(struct mddev *mddev, const char *buf, size_t len)
4652 { 4650 {
4653 sector_t sectors; 4651 sector_t sectors;
4654 4652
4655 if (strncmp(buf, "default", 7) == 0) { 4653 if (strncmp(buf, "default", 7) == 0) {
4656 if (mddev->pers) 4654 if (mddev->pers)
4657 sectors = mddev->pers->size(mddev, 0, 0); 4655 sectors = mddev->pers->size(mddev, 0, 0);
4658 else 4656 else
4659 sectors = mddev->array_sectors; 4657 sectors = mddev->array_sectors;
4660 4658
4661 mddev->external_size = 0; 4659 mddev->external_size = 0;
4662 } else { 4660 } else {
4663 if (strict_blocks_to_sectors(buf, &sectors) < 0) 4661 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4664 return -EINVAL; 4662 return -EINVAL;
4665 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4663 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4666 return -E2BIG; 4664 return -E2BIG;
4667 4665
4668 mddev->external_size = 1; 4666 mddev->external_size = 1;
4669 } 4667 }
4670 4668
4671 mddev->array_sectors = sectors; 4669 mddev->array_sectors = sectors;
4672 if (mddev->pers) { 4670 if (mddev->pers) {
4673 set_capacity(mddev->gendisk, mddev->array_sectors); 4671 set_capacity(mddev->gendisk, mddev->array_sectors);
4674 revalidate_disk(mddev->gendisk); 4672 revalidate_disk(mddev->gendisk);
4675 } 4673 }
4676 return len; 4674 return len;
4677 } 4675 }
4678 4676
4679 static struct md_sysfs_entry md_array_size = 4677 static struct md_sysfs_entry md_array_size =
4680 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4678 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4681 array_size_store); 4679 array_size_store);
4682 4680
4683 static struct attribute *md_default_attrs[] = { 4681 static struct attribute *md_default_attrs[] = {
4684 &md_level.attr, 4682 &md_level.attr,
4685 &md_layout.attr, 4683 &md_layout.attr,
4686 &md_raid_disks.attr, 4684 &md_raid_disks.attr,
4687 &md_chunk_size.attr, 4685 &md_chunk_size.attr,
4688 &md_size.attr, 4686 &md_size.attr,
4689 &md_resync_start.attr, 4687 &md_resync_start.attr,
4690 &md_metadata.attr, 4688 &md_metadata.attr,
4691 &md_new_device.attr, 4689 &md_new_device.attr,
4692 &md_safe_delay.attr, 4690 &md_safe_delay.attr,
4693 &md_array_state.attr, 4691 &md_array_state.attr,
4694 &md_reshape_position.attr, 4692 &md_reshape_position.attr,
4695 &md_reshape_direction.attr, 4693 &md_reshape_direction.attr,
4696 &md_array_size.attr, 4694 &md_array_size.attr,
4697 &max_corr_read_errors.attr, 4695 &max_corr_read_errors.attr,
4698 NULL, 4696 NULL,
4699 }; 4697 };
4700 4698
4701 static struct attribute *md_redundancy_attrs[] = { 4699 static struct attribute *md_redundancy_attrs[] = {
4702 &md_scan_mode.attr, 4700 &md_scan_mode.attr,
4703 &md_mismatches.attr, 4701 &md_mismatches.attr,
4704 &md_sync_min.attr, 4702 &md_sync_min.attr,
4705 &md_sync_max.attr, 4703 &md_sync_max.attr,
4706 &md_sync_speed.attr, 4704 &md_sync_speed.attr,
4707 &md_sync_force_parallel.attr, 4705 &md_sync_force_parallel.attr,
4708 &md_sync_completed.attr, 4706 &md_sync_completed.attr,
4709 &md_min_sync.attr, 4707 &md_min_sync.attr,
4710 &md_max_sync.attr, 4708 &md_max_sync.attr,
4711 &md_suspend_lo.attr, 4709 &md_suspend_lo.attr,
4712 &md_suspend_hi.attr, 4710 &md_suspend_hi.attr,
4713 &md_bitmap.attr, 4711 &md_bitmap.attr,
4714 &md_degraded.attr, 4712 &md_degraded.attr,
4715 NULL, 4713 NULL,
4716 }; 4714 };
4717 static struct attribute_group md_redundancy_group = { 4715 static struct attribute_group md_redundancy_group = {
4718 .name = NULL, 4716 .name = NULL,
4719 .attrs = md_redundancy_attrs, 4717 .attrs = md_redundancy_attrs,
4720 }; 4718 };
4721 4719
4722 4720
4723 static ssize_t 4721 static ssize_t
4724 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4722 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4725 { 4723 {
4726 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4724 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4727 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4725 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4728 ssize_t rv; 4726 ssize_t rv;
4729 4727
4730 if (!entry->show) 4728 if (!entry->show)
4731 return -EIO; 4729 return -EIO;
4732 spin_lock(&all_mddevs_lock); 4730 spin_lock(&all_mddevs_lock);
4733 if (list_empty(&mddev->all_mddevs)) { 4731 if (list_empty(&mddev->all_mddevs)) {
4734 spin_unlock(&all_mddevs_lock); 4732 spin_unlock(&all_mddevs_lock);
4735 return -EBUSY; 4733 return -EBUSY;
4736 } 4734 }
4737 mddev_get(mddev); 4735 mddev_get(mddev);
4738 spin_unlock(&all_mddevs_lock); 4736 spin_unlock(&all_mddevs_lock);
4739 4737
4740 rv = mddev_lock(mddev); 4738 rv = mddev_lock(mddev);
4741 if (!rv) { 4739 if (!rv) {
4742 rv = entry->show(mddev, page); 4740 rv = entry->show(mddev, page);
4743 mddev_unlock(mddev); 4741 mddev_unlock(mddev);
4744 } 4742 }
4745 mddev_put(mddev); 4743 mddev_put(mddev);
4746 return rv; 4744 return rv;
4747 } 4745 }
4748 4746
4749 static ssize_t 4747 static ssize_t
4750 md_attr_store(struct kobject *kobj, struct attribute *attr, 4748 md_attr_store(struct kobject *kobj, struct attribute *attr,
4751 const char *page, size_t length) 4749 const char *page, size_t length)
4752 { 4750 {
4753 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4751 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4754 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4752 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4755 ssize_t rv; 4753 ssize_t rv;
4756 4754
4757 if (!entry->store) 4755 if (!entry->store)
4758 return -EIO; 4756 return -EIO;
4759 if (!capable(CAP_SYS_ADMIN)) 4757 if (!capable(CAP_SYS_ADMIN))
4760 return -EACCES; 4758 return -EACCES;
4761 spin_lock(&all_mddevs_lock); 4759 spin_lock(&all_mddevs_lock);
4762 if (list_empty(&mddev->all_mddevs)) { 4760 if (list_empty(&mddev->all_mddevs)) {
4763 spin_unlock(&all_mddevs_lock); 4761 spin_unlock(&all_mddevs_lock);
4764 return -EBUSY; 4762 return -EBUSY;
4765 } 4763 }
4766 mddev_get(mddev); 4764 mddev_get(mddev);
4767 spin_unlock(&all_mddevs_lock); 4765 spin_unlock(&all_mddevs_lock);
4768 if (entry->store == new_dev_store) 4766 if (entry->store == new_dev_store)
4769 flush_workqueue(md_misc_wq); 4767 flush_workqueue(md_misc_wq);
4770 rv = mddev_lock(mddev); 4768 rv = mddev_lock(mddev);
4771 if (!rv) { 4769 if (!rv) {
4772 rv = entry->store(mddev, page, length); 4770 rv = entry->store(mddev, page, length);
4773 mddev_unlock(mddev); 4771 mddev_unlock(mddev);
4774 } 4772 }
4775 mddev_put(mddev); 4773 mddev_put(mddev);
4776 return rv; 4774 return rv;
4777 } 4775 }
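
md_attr_show() and md_attr_store() above funnel every sysfs read and write on an array's "md" kobject through a per-attribute show/store pair, after pinning the mddev under all_mddevs_lock and taking the mddev lock. A minimal sketch of the entry structure they rely on, reconstructed only from the container_of() calls and the entry->show()/entry->store() call signatures visible here; the attribute name and handler below are invented for illustration:

    #include <linux/sysfs.h>
    #include "md.h"   /* struct mddev */

    /* Illustrative reconstruction; the real definition lives elsewhere in md.c. */
    struct md_sysfs_entry {
        struct attribute attr;                                    /* name + mode */
        ssize_t (*show)(struct mddev *mddev, char *page);
        ssize_t (*store)(struct mddev *mddev, const char *page, size_t len);
    };

    /* A hypothetical read-only attribute that could sit in md_default_attrs[]. */
    static ssize_t example_show(struct mddev *mddev, char *page)
    {
        return sprintf(page, "%d\n", mddev->raid_disks);
    }

    static struct md_sysfs_entry md_example =
        __ATTR(example, S_IRUGO, example_show, NULL);
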
4778 4776
4779 static void md_free(struct kobject *ko) 4777 static void md_free(struct kobject *ko)
4780 { 4778 {
4781 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4779 struct mddev *mddev = container_of(ko, struct mddev, kobj);
4782 4780
4783 if (mddev->sysfs_state) 4781 if (mddev->sysfs_state)
4784 sysfs_put(mddev->sysfs_state); 4782 sysfs_put(mddev->sysfs_state);
4785 4783
4786 if (mddev->gendisk) { 4784 if (mddev->gendisk) {
4787 del_gendisk(mddev->gendisk); 4785 del_gendisk(mddev->gendisk);
4788 put_disk(mddev->gendisk); 4786 put_disk(mddev->gendisk);
4789 } 4787 }
4790 if (mddev->queue) 4788 if (mddev->queue)
4791 blk_cleanup_queue(mddev->queue); 4789 blk_cleanup_queue(mddev->queue);
4792 4790
4793 kfree(mddev); 4791 kfree(mddev);
4794 } 4792 }
4795 4793
4796 static const struct sysfs_ops md_sysfs_ops = { 4794 static const struct sysfs_ops md_sysfs_ops = {
4797 .show = md_attr_show, 4795 .show = md_attr_show,
4798 .store = md_attr_store, 4796 .store = md_attr_store,
4799 }; 4797 };
4800 static struct kobj_type md_ktype = { 4798 static struct kobj_type md_ktype = {
4801 .release = md_free, 4799 .release = md_free,
4802 .sysfs_ops = &md_sysfs_ops, 4800 .sysfs_ops = &md_sysfs_ops,
4803 .default_attrs = md_default_attrs, 4801 .default_attrs = md_default_attrs,
4804 }; 4802 };
4805 4803
4806 int mdp_major = 0; 4804 int mdp_major = 0;
4807 4805
4808 static void mddev_delayed_delete(struct work_struct *ws) 4806 static void mddev_delayed_delete(struct work_struct *ws)
4809 { 4807 {
4810 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4808 struct mddev *mddev = container_of(ws, struct mddev, del_work);
4811 4809
4812 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4810 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4813 kobject_del(&mddev->kobj); 4811 kobject_del(&mddev->kobj);
4814 kobject_put(&mddev->kobj); 4812 kobject_put(&mddev->kobj);
4815 } 4813 }
4816 4814
4817 static int md_alloc(dev_t dev, char *name) 4815 static int md_alloc(dev_t dev, char *name)
4818 { 4816 {
4819 static DEFINE_MUTEX(disks_mutex); 4817 static DEFINE_MUTEX(disks_mutex);
4820 struct mddev *mddev = mddev_find(dev); 4818 struct mddev *mddev = mddev_find(dev);
4821 struct gendisk *disk; 4819 struct gendisk *disk;
4822 int partitioned; 4820 int partitioned;
4823 int shift; 4821 int shift;
4824 int unit; 4822 int unit;
4825 int error; 4823 int error;
4826 4824
4827 if (!mddev) 4825 if (!mddev)
4828 return -ENODEV; 4826 return -ENODEV;
4829 4827
4830 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4828 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4831 shift = partitioned ? MdpMinorShift : 0; 4829 shift = partitioned ? MdpMinorShift : 0;
4832 unit = MINOR(mddev->unit) >> shift; 4830 unit = MINOR(mddev->unit) >> shift;
4833 4831
4834 /* wait for any previous instance of this device to be 4832 /* wait for any previous instance of this device to be
4835 * completely removed (mddev_delayed_delete). 4833 * completely removed (mddev_delayed_delete).
4836 */ 4834 */
4837 flush_workqueue(md_misc_wq); 4835 flush_workqueue(md_misc_wq);
4838 4836
4839 mutex_lock(&disks_mutex); 4837 mutex_lock(&disks_mutex);
4840 error = -EEXIST; 4838 error = -EEXIST;
4841 if (mddev->gendisk) 4839 if (mddev->gendisk)
4842 goto abort; 4840 goto abort;
4843 4841
4844 if (name) { 4842 if (name) {
4845 /* Need to ensure that 'name' is not a duplicate. 4843 /* Need to ensure that 'name' is not a duplicate.
4846 */ 4844 */
4847 struct mddev *mddev2; 4845 struct mddev *mddev2;
4848 spin_lock(&all_mddevs_lock); 4846 spin_lock(&all_mddevs_lock);
4849 4847
4850 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4848 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4851 if (mddev2->gendisk && 4849 if (mddev2->gendisk &&
4852 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4850 strcmp(mddev2->gendisk->disk_name, name) == 0) {
4853 spin_unlock(&all_mddevs_lock); 4851 spin_unlock(&all_mddevs_lock);
4854 goto abort; 4852 goto abort;
4855 } 4853 }
4856 spin_unlock(&all_mddevs_lock); 4854 spin_unlock(&all_mddevs_lock);
4857 } 4855 }
4858 4856
4859 error = -ENOMEM; 4857 error = -ENOMEM;
4860 mddev->queue = blk_alloc_queue(GFP_KERNEL); 4858 mddev->queue = blk_alloc_queue(GFP_KERNEL);
4861 if (!mddev->queue) 4859 if (!mddev->queue)
4862 goto abort; 4860 goto abort;
4863 mddev->queue->queuedata = mddev; 4861 mddev->queue->queuedata = mddev;
4864 4862
4865 blk_queue_make_request(mddev->queue, md_make_request); 4863 blk_queue_make_request(mddev->queue, md_make_request);
4866 blk_set_stacking_limits(&mddev->queue->limits); 4864 blk_set_stacking_limits(&mddev->queue->limits);
4867 4865
4868 disk = alloc_disk(1 << shift); 4866 disk = alloc_disk(1 << shift);
4869 if (!disk) { 4867 if (!disk) {
4870 blk_cleanup_queue(mddev->queue); 4868 blk_cleanup_queue(mddev->queue);
4871 mddev->queue = NULL; 4869 mddev->queue = NULL;
4872 goto abort; 4870 goto abort;
4873 } 4871 }
4874 disk->major = MAJOR(mddev->unit); 4872 disk->major = MAJOR(mddev->unit);
4875 disk->first_minor = unit << shift; 4873 disk->first_minor = unit << shift;
4876 if (name) 4874 if (name)
4877 strcpy(disk->disk_name, name); 4875 strcpy(disk->disk_name, name);
4878 else if (partitioned) 4876 else if (partitioned)
4879 sprintf(disk->disk_name, "md_d%d", unit); 4877 sprintf(disk->disk_name, "md_d%d", unit);
4880 else 4878 else
4881 sprintf(disk->disk_name, "md%d", unit); 4879 sprintf(disk->disk_name, "md%d", unit);
4882 disk->fops = &md_fops; 4880 disk->fops = &md_fops;
4883 disk->private_data = mddev; 4881 disk->private_data = mddev;
4884 disk->queue = mddev->queue; 4882 disk->queue = mddev->queue;
4885 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 4883 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4886 /* Allow extended partitions. This makes the 4884 /* Allow extended partitions. This makes the
4887 * 'mdp' device redundant, but we can't really 4885 * 'mdp' device redundant, but we can't really
4888 * remove it now. 4886 * remove it now.
4889 */ 4887 */
4890 disk->flags |= GENHD_FL_EXT_DEVT; 4888 disk->flags |= GENHD_FL_EXT_DEVT;
4891 mddev->gendisk = disk; 4889 mddev->gendisk = disk;
4892 /* As soon as we call add_disk(), another thread could get 4890 /* As soon as we call add_disk(), another thread could get
4893 * through to md_open, so make sure it doesn't get too far 4891 * through to md_open, so make sure it doesn't get too far
4894 */ 4892 */
4895 mutex_lock(&mddev->open_mutex); 4893 mutex_lock(&mddev->open_mutex);
4896 add_disk(disk); 4894 add_disk(disk);
4897 4895
4898 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4896 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4899 &disk_to_dev(disk)->kobj, "%s", "md"); 4897 &disk_to_dev(disk)->kobj, "%s", "md");
4900 if (error) { 4898 if (error) {
4901 /* This isn't possible, but as kobject_init_and_add is marked 4899 /* This isn't possible, but as kobject_init_and_add is marked
4902 * __must_check, we must do something with the result 4900 * __must_check, we must do something with the result
4903 */ 4901 */
4904 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 4902 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4905 disk->disk_name); 4903 disk->disk_name);
4906 error = 0; 4904 error = 0;
4907 } 4905 }
4908 if (mddev->kobj.sd && 4906 if (mddev->kobj.sd &&
4909 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4907 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4910 printk(KERN_DEBUG "pointless warning\n"); 4908 printk(KERN_DEBUG "pointless warning\n");
4911 mutex_unlock(&mddev->open_mutex); 4909 mutex_unlock(&mddev->open_mutex);
4912 abort: 4910 abort:
4913 mutex_unlock(&disks_mutex); 4911 mutex_unlock(&disks_mutex);
4914 if (!error && mddev->kobj.sd) { 4912 if (!error && mddev->kobj.sd) {
4915 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4913 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4916 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 4914 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4917 } 4915 }
4918 mddev_put(mddev); 4916 mddev_put(mddev);
4919 return error; 4917 return error;
4920 } 4918 }
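
md_alloc() sizes the gendisk from the requested dev_t: a plain MD_MAJOR array gets a single minor and is named "md%d", while a partitionable mdp array reserves a whole block of minors per unit and is named "md_d%d", with the block size set by MdpMinorShift (6 in this era's md.h). A small stand-alone illustration of that arithmetic; the helper name is invented for the example:

    #include <stdio.h>

    #define MdpMinorShift 6   /* per drivers/md/md.h */

    /* Hypothetical helper: minor-number layout for one partitionable unit. */
    static void mdp_layout(int unit, int *first_minor, int *nr_minors)
    {
        *first_minor = unit << MdpMinorShift;  /* disk->first_minor = unit << shift */
        *nr_minors   = 1 << MdpMinorShift;     /* alloc_disk(1 << shift)            */
    }

    int main(void)
    {
        int first, count;

        mdp_layout(2, &first, &count);
        printf("md_d2: first minor %d, %d minors available for partitions\n",
               first, count);
        return 0;
    }
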
4921 4919
4922 static struct kobject *md_probe(dev_t dev, int *part, void *data) 4920 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4923 { 4921 {
4924 md_alloc(dev, NULL); 4922 md_alloc(dev, NULL);
4925 return NULL; 4923 return NULL;
4926 } 4924 }
4927 4925
4928 static int add_named_array(const char *val, struct kernel_param *kp) 4926 static int add_named_array(const char *val, struct kernel_param *kp)
4929 { 4927 {
4930 /* val must be "md_*" where * is not all digits. 4928 /* val must be "md_*" where * is not all digits.
4931 * We allocate an array with a large free minor number, and 4929 * We allocate an array with a large free minor number, and
4932 * set the name to val. val must not already be an active name. 4930 * set the name to val. val must not already be an active name.
4933 */ 4931 */
4934 int len = strlen(val); 4932 int len = strlen(val);
4935 char buf[DISK_NAME_LEN]; 4933 char buf[DISK_NAME_LEN];
4936 4934
4937 while (len && val[len-1] == '\n') 4935 while (len && val[len-1] == '\n')
4938 len--; 4936 len--;
4939 if (len >= DISK_NAME_LEN) 4937 if (len >= DISK_NAME_LEN)
4940 return -E2BIG; 4938 return -E2BIG;
4941 strlcpy(buf, val, len+1); 4939 strlcpy(buf, val, len+1);
4942 if (strncmp(buf, "md_", 3) != 0) 4940 if (strncmp(buf, "md_", 3) != 0)
4943 return -EINVAL; 4941 return -EINVAL;
4944 return md_alloc(0, buf); 4942 return md_alloc(0, buf);
4945 } 4943 }
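
add_named_array() is the handler behind md's "create a named array" module parameter: the written string must start with "md_" and fit in DISK_NAME_LEN, and is then handed to md_alloc() with dev 0 so a free minor is picked. A hedged userspace sketch of driving it; the parameter is exposed as new_array by md.c of this era, but that name and the sysfs path are assumptions, not shown in this hunk:

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    /* Assumed path of the module parameter wired to add_named_array(). */
    #define NEW_ARRAY_PARAM "/sys/module/md_mod/parameters/new_array"

    int main(void)
    {
        FILE *f = fopen(NEW_ARRAY_PARAM, "w");

        if (!f) {
            perror("open " NEW_ARRAY_PARAM);
            return 1;
        }
        /* Must begin with "md_"; the trailing newline is stripped by the kernel. */
        if (fputs("md_home\n", f) == EOF)
            fprintf(stderr, "write failed: %s\n", strerror(errno));
        fclose(f);
        return 0;
    }
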
4946 4944
4947 static void md_safemode_timeout(unsigned long data) 4945 static void md_safemode_timeout(unsigned long data)
4948 { 4946 {
4949 struct mddev *mddev = (struct mddev *) data; 4947 struct mddev *mddev = (struct mddev *) data;
4950 4948
4951 if (!atomic_read(&mddev->writes_pending)) { 4949 if (!atomic_read(&mddev->writes_pending)) {
4952 mddev->safemode = 1; 4950 mddev->safemode = 1;
4953 if (mddev->external) 4951 if (mddev->external)
4954 sysfs_notify_dirent_safe(mddev->sysfs_state); 4952 sysfs_notify_dirent_safe(mddev->sysfs_state);
4955 } 4953 }
4956 md_wakeup_thread(mddev->thread); 4954 md_wakeup_thread(mddev->thread);
4957 } 4955 }
4958 4956
4959 static int start_dirty_degraded; 4957 static int start_dirty_degraded;
4960 4958
4961 int md_run(struct mddev *mddev) 4959 int md_run(struct mddev *mddev)
4962 { 4960 {
4963 int err; 4961 int err;
4964 struct md_rdev *rdev; 4962 struct md_rdev *rdev;
4965 struct md_personality *pers; 4963 struct md_personality *pers;
4966 4964
4967 if (list_empty(&mddev->disks)) 4965 if (list_empty(&mddev->disks))
4968 /* cannot run an array with no devices.. */ 4966 /* cannot run an array with no devices.. */
4969 return -EINVAL; 4967 return -EINVAL;
4970 4968
4971 if (mddev->pers) 4969 if (mddev->pers)
4972 return -EBUSY; 4970 return -EBUSY;
4973 /* Cannot run until previous stop completes properly */ 4971 /* Cannot run until previous stop completes properly */
4974 if (mddev->sysfs_active) 4972 if (mddev->sysfs_active)
4975 return -EBUSY; 4973 return -EBUSY;
4976 4974
4977 /* 4975 /*
4978 * Analyze all RAID superblock(s) 4976 * Analyze all RAID superblock(s)
4979 */ 4977 */
4980 if (!mddev->raid_disks) { 4978 if (!mddev->raid_disks) {
4981 if (!mddev->persistent) 4979 if (!mddev->persistent)
4982 return -EINVAL; 4980 return -EINVAL;
4983 analyze_sbs(mddev); 4981 analyze_sbs(mddev);
4984 } 4982 }
4985 4983
4986 if (mddev->level != LEVEL_NONE) 4984 if (mddev->level != LEVEL_NONE)
4987 request_module("md-level-%d", mddev->level); 4985 request_module("md-level-%d", mddev->level);
4988 else if (mddev->clevel[0]) 4986 else if (mddev->clevel[0])
4989 request_module("md-%s", mddev->clevel); 4987 request_module("md-%s", mddev->clevel);
4990 4988
4991 /* 4989 /*
4992 * Drop all container device buffers, from now on 4990 * Drop all container device buffers, from now on
4993 * the only valid external interface is through the md 4991 * the only valid external interface is through the md
4994 * device. 4992 * device.
4995 */ 4993 */
4996 rdev_for_each(rdev, mddev) { 4994 rdev_for_each(rdev, mddev) {
4997 if (test_bit(Faulty, &rdev->flags)) 4995 if (test_bit(Faulty, &rdev->flags))
4998 continue; 4996 continue;
4999 sync_blockdev(rdev->bdev); 4997 sync_blockdev(rdev->bdev);
5000 invalidate_bdev(rdev->bdev); 4998 invalidate_bdev(rdev->bdev);
5001 4999
5002 /* perform some consistency tests on the device. 5000 /* perform some consistency tests on the device.
5003 * We don't want the data to overlap the metadata, 5001 * We don't want the data to overlap the metadata,
5004 * Internal Bitmap issues have been handled elsewhere. 5002 * Internal Bitmap issues have been handled elsewhere.
5005 */ 5003 */
5006 if (rdev->meta_bdev) { 5004 if (rdev->meta_bdev) {
5007 /* Nothing to check */; 5005 /* Nothing to check */;
5008 } else if (rdev->data_offset < rdev->sb_start) { 5006 } else if (rdev->data_offset < rdev->sb_start) {
5009 if (mddev->dev_sectors && 5007 if (mddev->dev_sectors &&
5010 rdev->data_offset + mddev->dev_sectors 5008 rdev->data_offset + mddev->dev_sectors
5011 > rdev->sb_start) { 5009 > rdev->sb_start) {
5012 printk("md: %s: data overlaps metadata\n", 5010 printk("md: %s: data overlaps metadata\n",
5013 mdname(mddev)); 5011 mdname(mddev));
5014 return -EINVAL; 5012 return -EINVAL;
5015 } 5013 }
5016 } else { 5014 } else {
5017 if (rdev->sb_start + rdev->sb_size/512 5015 if (rdev->sb_start + rdev->sb_size/512
5018 > rdev->data_offset) { 5016 > rdev->data_offset) {
5019 printk("md: %s: metadata overlaps data\n", 5017 printk("md: %s: metadata overlaps data\n",
5020 mdname(mddev)); 5018 mdname(mddev));
5021 return -EINVAL; 5019 return -EINVAL;
5022 } 5020 }
5023 } 5021 }
5024 sysfs_notify_dirent_safe(rdev->sysfs_state); 5022 sysfs_notify_dirent_safe(rdev->sysfs_state);
5025 } 5023 }
5026 5024
5027 if (mddev->bio_set == NULL) 5025 if (mddev->bio_set == NULL)
5028 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5026 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5029 5027
5030 spin_lock(&pers_lock); 5028 spin_lock(&pers_lock);
5031 pers = find_pers(mddev->level, mddev->clevel); 5029 pers = find_pers(mddev->level, mddev->clevel);
5032 if (!pers || !try_module_get(pers->owner)) { 5030 if (!pers || !try_module_get(pers->owner)) {
5033 spin_unlock(&pers_lock); 5031 spin_unlock(&pers_lock);
5034 if (mddev->level != LEVEL_NONE) 5032 if (mddev->level != LEVEL_NONE)
5035 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5033 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5036 mddev->level); 5034 mddev->level);
5037 else 5035 else
5038 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5036 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5039 mddev->clevel); 5037 mddev->clevel);
5040 return -EINVAL; 5038 return -EINVAL;
5041 } 5039 }
5042 mddev->pers = pers; 5040 mddev->pers = pers;
5043 spin_unlock(&pers_lock); 5041 spin_unlock(&pers_lock);
5044 if (mddev->level != pers->level) { 5042 if (mddev->level != pers->level) {
5045 mddev->level = pers->level; 5043 mddev->level = pers->level;
5046 mddev->new_level = pers->level; 5044 mddev->new_level = pers->level;
5047 } 5045 }
5048 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5046 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5049 5047
5050 if (mddev->reshape_position != MaxSector && 5048 if (mddev->reshape_position != MaxSector &&
5051 pers->start_reshape == NULL) { 5049 pers->start_reshape == NULL) {
5052 /* This personality cannot handle reshaping... */ 5050 /* This personality cannot handle reshaping... */
5053 mddev->pers = NULL; 5051 mddev->pers = NULL;
5054 module_put(pers->owner); 5052 module_put(pers->owner);
5055 return -EINVAL; 5053 return -EINVAL;
5056 } 5054 }
5057 5055
5058 if (pers->sync_request) { 5056 if (pers->sync_request) {
5059 /* Warn if this is a potentially silly 5057 /* Warn if this is a potentially silly
5060 * configuration. 5058 * configuration.
5061 */ 5059 */
5062 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5060 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5063 struct md_rdev *rdev2; 5061 struct md_rdev *rdev2;
5064 int warned = 0; 5062 int warned = 0;
5065 5063
5066 rdev_for_each(rdev, mddev) 5064 rdev_for_each(rdev, mddev)
5067 rdev_for_each(rdev2, mddev) { 5065 rdev_for_each(rdev2, mddev) {
5068 if (rdev < rdev2 && 5066 if (rdev < rdev2 &&
5069 rdev->bdev->bd_contains == 5067 rdev->bdev->bd_contains ==
5070 rdev2->bdev->bd_contains) { 5068 rdev2->bdev->bd_contains) {
5071 printk(KERN_WARNING 5069 printk(KERN_WARNING
5072 "%s: WARNING: %s appears to be" 5070 "%s: WARNING: %s appears to be"
5073 " on the same physical disk as" 5071 " on the same physical disk as"
5074 " %s.\n", 5072 " %s.\n",
5075 mdname(mddev), 5073 mdname(mddev),
5076 bdevname(rdev->bdev,b), 5074 bdevname(rdev->bdev,b),
5077 bdevname(rdev2->bdev,b2)); 5075 bdevname(rdev2->bdev,b2));
5078 warned = 1; 5076 warned = 1;
5079 } 5077 }
5080 } 5078 }
5081 5079
5082 if (warned) 5080 if (warned)
5083 printk(KERN_WARNING 5081 printk(KERN_WARNING
5084 "True protection against single-disk" 5082 "True protection against single-disk"
5085 " failure might be compromised.\n"); 5083 " failure might be compromised.\n");
5086 } 5084 }
5087 5085
5088 mddev->recovery = 0; 5086 mddev->recovery = 0;
5089 /* may be over-ridden by personality */ 5087 /* may be over-ridden by personality */
5090 mddev->resync_max_sectors = mddev->dev_sectors; 5088 mddev->resync_max_sectors = mddev->dev_sectors;
5091 5089
5092 mddev->ok_start_degraded = start_dirty_degraded; 5090 mddev->ok_start_degraded = start_dirty_degraded;
5093 5091
5094 if (start_readonly && mddev->ro == 0) 5092 if (start_readonly && mddev->ro == 0)
5095 mddev->ro = 2; /* read-only, but switch on first write */ 5093 mddev->ro = 2; /* read-only, but switch on first write */
5096 5094
5097 err = mddev->pers->run(mddev); 5095 err = mddev->pers->run(mddev);
5098 if (err) 5096 if (err)
5099 printk(KERN_ERR "md: pers->run() failed ...\n"); 5097 printk(KERN_ERR "md: pers->run() failed ...\n");
5100 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 5098 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5101 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5099 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5102 " but 'external_size' not in effect?\n", __func__); 5100 " but 'external_size' not in effect?\n", __func__);
5103 printk(KERN_ERR 5101 printk(KERN_ERR
5104 "md: invalid array_size %llu > default size %llu\n", 5102 "md: invalid array_size %llu > default size %llu\n",
5105 (unsigned long long)mddev->array_sectors / 2, 5103 (unsigned long long)mddev->array_sectors / 2,
5106 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 5104 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5107 err = -EINVAL; 5105 err = -EINVAL;
5108 mddev->pers->stop(mddev); 5106 mddev->pers->stop(mddev);
5109 } 5107 }
5110 if (err == 0 && mddev->pers->sync_request && 5108 if (err == 0 && mddev->pers->sync_request &&
5111 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5109 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5112 err = bitmap_create(mddev); 5110 err = bitmap_create(mddev);
5113 if (err) { 5111 if (err) {
5114 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5112 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5115 mdname(mddev), err); 5113 mdname(mddev), err);
5116 mddev->pers->stop(mddev); 5114 mddev->pers->stop(mddev);
5117 } 5115 }
5118 } 5116 }
5119 if (err) { 5117 if (err) {
5120 module_put(mddev->pers->owner); 5118 module_put(mddev->pers->owner);
5121 mddev->pers = NULL; 5119 mddev->pers = NULL;
5122 bitmap_destroy(mddev); 5120 bitmap_destroy(mddev);
5123 return err; 5121 return err;
5124 } 5122 }
5125 if (mddev->pers->sync_request) { 5123 if (mddev->pers->sync_request) {
5126 if (mddev->kobj.sd && 5124 if (mddev->kobj.sd &&
5127 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5125 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5128 printk(KERN_WARNING 5126 printk(KERN_WARNING
5129 "md: cannot register extra attributes for %s\n", 5127 "md: cannot register extra attributes for %s\n",
5130 mdname(mddev)); 5128 mdname(mddev));
5131 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5129 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5132 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5130 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5133 mddev->ro = 0; 5131 mddev->ro = 0;
5134 5132
5135 atomic_set(&mddev->writes_pending,0); 5133 atomic_set(&mddev->writes_pending,0);
5136 atomic_set(&mddev->max_corr_read_errors, 5134 atomic_set(&mddev->max_corr_read_errors,
5137 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5135 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5138 mddev->safemode = 0; 5136 mddev->safemode = 0;
5139 mddev->safemode_timer.function = md_safemode_timeout; 5137 mddev->safemode_timer.function = md_safemode_timeout;
5140 mddev->safemode_timer.data = (unsigned long) mddev; 5138 mddev->safemode_timer.data = (unsigned long) mddev;
5141 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5139 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5142 mddev->in_sync = 1; 5140 mddev->in_sync = 1;
5143 smp_wmb(); 5141 smp_wmb();
5144 mddev->ready = 1; 5142 mddev->ready = 1;
5145 rdev_for_each(rdev, mddev) 5143 rdev_for_each(rdev, mddev)
5146 if (rdev->raid_disk >= 0) 5144 if (rdev->raid_disk >= 0)
5147 if (sysfs_link_rdev(mddev, rdev)) 5145 if (sysfs_link_rdev(mddev, rdev))
5148 /* failure here is OK */; 5146 /* failure here is OK */;
5149 5147
5150 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5148 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5151 5149
5152 if (mddev->flags) 5150 if (mddev->flags)
5153 md_update_sb(mddev, 0); 5151 md_update_sb(mddev, 0);
5154 5152
5155 md_new_event(mddev); 5153 md_new_event(mddev);
5156 sysfs_notify_dirent_safe(mddev->sysfs_state); 5154 sysfs_notify_dirent_safe(mddev->sysfs_state);
5157 sysfs_notify_dirent_safe(mddev->sysfs_action); 5155 sysfs_notify_dirent_safe(mddev->sysfs_action);
5158 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5156 sysfs_notify(&mddev->kobj, NULL, "degraded");
5159 return 0; 5157 return 0;
5160 } 5158 }
5161 EXPORT_SYMBOL_GPL(md_run); 5159 EXPORT_SYMBOL_GPL(md_run);
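
md_run() drives array start-up entirely through the struct md_personality hooks it looks up with find_pers(): ->run() starts the array, ->size() supplies the default capacity checked against array_sectors, ->stop() backs out a failed start, and a non-NULL ->sync_request gates creation of the md_redundancy_group attributes and the bitmap. A skeletal personality showing only the hooks exercised above; the names and the LEVEL_NONE placeholder are illustrative, and a real personality registers itself and fills in many more methods:

    #include <linux/module.h>
    #include "md.h"   /* struct mddev, struct md_personality */

    /* Sketch of the contract md_run() exercises, not a usable personality. */
    static int demo_run(struct mddev *mddev)
    {
        /* allocate private state, set mddev->array_sectors, ... */
        return 0;
    }

    static int demo_stop(struct mddev *mddev)
    {
        /* release whatever demo_run() set up */
        return 0;
    }

    static sector_t demo_size(struct mddev *mddev, sector_t sectors, int raid_disks)
    {
        /* default array size when no external size is imposed */
        return mddev->dev_sectors;
    }

    static struct md_personality demo_personality = {
        .name  = "demo",
        .level = LEVEL_NONE,    /* placeholder level for illustration */
        .owner = THIS_MODULE,
        .run   = demo_run,
        .stop  = demo_stop,
        .size  = demo_size,
        /* .sync_request left NULL: no resync thread, no sync_action attribute */
    };
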
5162 5160
5163 static int do_md_run(struct mddev *mddev) 5161 static int do_md_run(struct mddev *mddev)
5164 { 5162 {
5165 int err; 5163 int err;
5166 5164
5167 err = md_run(mddev); 5165 err = md_run(mddev);
5168 if (err) 5166 if (err)
5169 goto out; 5167 goto out;
5170 err = bitmap_load(mddev); 5168 err = bitmap_load(mddev);
5171 if (err) { 5169 if (err) {
5172 bitmap_destroy(mddev); 5170 bitmap_destroy(mddev);
5173 goto out; 5171 goto out;
5174 } 5172 }
5175 5173
5176 md_wakeup_thread(mddev->thread); 5174 md_wakeup_thread(mddev->thread);
5177 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5175 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5178 5176
5179 set_capacity(mddev->gendisk, mddev->array_sectors); 5177 set_capacity(mddev->gendisk, mddev->array_sectors);
5180 revalidate_disk(mddev->gendisk); 5178 revalidate_disk(mddev->gendisk);
5181 mddev->changed = 1; 5179 mddev->changed = 1;
5182 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5180 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5183 out: 5181 out:
5184 return err; 5182 return err;
5185 } 5183 }
5186 5184
5187 static int restart_array(struct mddev *mddev) 5185 static int restart_array(struct mddev *mddev)
5188 { 5186 {
5189 struct gendisk *disk = mddev->gendisk; 5187 struct gendisk *disk = mddev->gendisk;
5190 5188
5191 /* Complain if it has no devices */ 5189 /* Complain if it has no devices */
5192 if (list_empty(&mddev->disks)) 5190 if (list_empty(&mddev->disks))
5193 return -ENXIO; 5191 return -ENXIO;
5194 if (!mddev->pers) 5192 if (!mddev->pers)
5195 return -EINVAL; 5193 return -EINVAL;
5196 if (!mddev->ro) 5194 if (!mddev->ro)
5197 return -EBUSY; 5195 return -EBUSY;
5198 mddev->safemode = 0; 5196 mddev->safemode = 0;
5199 mddev->ro = 0; 5197 mddev->ro = 0;
5200 set_disk_ro(disk, 0); 5198 set_disk_ro(disk, 0);
5201 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5199 printk(KERN_INFO "md: %s switched to read-write mode.\n",
5202 mdname(mddev)); 5200 mdname(mddev));
5203 /* Kick recovery or resync if necessary */ 5201 /* Kick recovery or resync if necessary */
5204 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5202 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5205 md_wakeup_thread(mddev->thread); 5203 md_wakeup_thread(mddev->thread);
5206 md_wakeup_thread(mddev->sync_thread); 5204 md_wakeup_thread(mddev->sync_thread);
5207 sysfs_notify_dirent_safe(mddev->sysfs_state); 5205 sysfs_notify_dirent_safe(mddev->sysfs_state);
5208 return 0; 5206 return 0;
5209 } 5207 }
5210 5208
5211 /* similar to deny_write_access, but accounts for our holding a reference 5209 /* similar to deny_write_access, but accounts for our holding a reference
5212 * to the file ourselves */ 5210 * to the file ourselves */
5213 static int deny_bitmap_write_access(struct file * file) 5211 static int deny_bitmap_write_access(struct file * file)
5214 { 5212 {
5215 struct inode *inode = file->f_mapping->host; 5213 struct inode *inode = file->f_mapping->host;
5216 5214
5217 spin_lock(&inode->i_lock); 5215 spin_lock(&inode->i_lock);
5218 if (atomic_read(&inode->i_writecount) > 1) { 5216 if (atomic_read(&inode->i_writecount) > 1) {
5219 spin_unlock(&inode->i_lock); 5217 spin_unlock(&inode->i_lock);
5220 return -ETXTBSY; 5218 return -ETXTBSY;
5221 } 5219 }
5222 atomic_set(&inode->i_writecount, -1); 5220 atomic_set(&inode->i_writecount, -1);
5223 spin_unlock(&inode->i_lock); 5221 spin_unlock(&inode->i_lock);
5224 5222
5225 return 0; 5223 return 0;
5226 } 5224 }
5227 5225
5228 void restore_bitmap_write_access(struct file *file) 5226 void restore_bitmap_write_access(struct file *file)
5229 { 5227 {
5230 struct inode *inode = file->f_mapping->host; 5228 struct inode *inode = file->f_mapping->host;
5231 5229
5232 spin_lock(&inode->i_lock); 5230 spin_lock(&inode->i_lock);
5233 atomic_set(&inode->i_writecount, 1); 5231 atomic_set(&inode->i_writecount, 1);
5234 spin_unlock(&inode->i_lock); 5232 spin_unlock(&inode->i_lock);
5235 } 5233 }
5236 5234
5237 static void md_clean(struct mddev *mddev) 5235 static void md_clean(struct mddev *mddev)
5238 { 5236 {
5239 mddev->array_sectors = 0; 5237 mddev->array_sectors = 0;
5240 mddev->external_size = 0; 5238 mddev->external_size = 0;
5241 mddev->dev_sectors = 0; 5239 mddev->dev_sectors = 0;
5242 mddev->raid_disks = 0; 5240 mddev->raid_disks = 0;
5243 mddev->recovery_cp = 0; 5241 mddev->recovery_cp = 0;
5244 mddev->resync_min = 0; 5242 mddev->resync_min = 0;
5245 mddev->resync_max = MaxSector; 5243 mddev->resync_max = MaxSector;
5246 mddev->reshape_position = MaxSector; 5244 mddev->reshape_position = MaxSector;
5247 mddev->external = 0; 5245 mddev->external = 0;
5248 mddev->persistent = 0; 5246 mddev->persistent = 0;
5249 mddev->level = LEVEL_NONE; 5247 mddev->level = LEVEL_NONE;
5250 mddev->clevel[0] = 0; 5248 mddev->clevel[0] = 0;
5251 mddev->flags = 0; 5249 mddev->flags = 0;
5252 mddev->ro = 0; 5250 mddev->ro = 0;
5253 mddev->metadata_type[0] = 0; 5251 mddev->metadata_type[0] = 0;
5254 mddev->chunk_sectors = 0; 5252 mddev->chunk_sectors = 0;
5255 mddev->ctime = mddev->utime = 0; 5253 mddev->ctime = mddev->utime = 0;
5256 mddev->layout = 0; 5254 mddev->layout = 0;
5257 mddev->max_disks = 0; 5255 mddev->max_disks = 0;
5258 mddev->events = 0; 5256 mddev->events = 0;
5259 mddev->can_decrease_events = 0; 5257 mddev->can_decrease_events = 0;
5260 mddev->delta_disks = 0; 5258 mddev->delta_disks = 0;
5261 mddev->reshape_backwards = 0; 5259 mddev->reshape_backwards = 0;
5262 mddev->new_level = LEVEL_NONE; 5260 mddev->new_level = LEVEL_NONE;
5263 mddev->new_layout = 0; 5261 mddev->new_layout = 0;
5264 mddev->new_chunk_sectors = 0; 5262 mddev->new_chunk_sectors = 0;
5265 mddev->curr_resync = 0; 5263 mddev->curr_resync = 0;
5266 atomic64_set(&mddev->resync_mismatches, 0); 5264 atomic64_set(&mddev->resync_mismatches, 0);
5267 mddev->suspend_lo = mddev->suspend_hi = 0; 5265 mddev->suspend_lo = mddev->suspend_hi = 0;
5268 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5266 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5269 mddev->recovery = 0; 5267 mddev->recovery = 0;
5270 mddev->in_sync = 0; 5268 mddev->in_sync = 0;
5271 mddev->changed = 0; 5269 mddev->changed = 0;
5272 mddev->degraded = 0; 5270 mddev->degraded = 0;
5273 mddev->safemode = 0; 5271 mddev->safemode = 0;
5274 mddev->merge_check_needed = 0; 5272 mddev->merge_check_needed = 0;
5275 mddev->bitmap_info.offset = 0; 5273 mddev->bitmap_info.offset = 0;
5276 mddev->bitmap_info.default_offset = 0; 5274 mddev->bitmap_info.default_offset = 0;
5277 mddev->bitmap_info.default_space = 0; 5275 mddev->bitmap_info.default_space = 0;
5278 mddev->bitmap_info.chunksize = 0; 5276 mddev->bitmap_info.chunksize = 0;
5279 mddev->bitmap_info.daemon_sleep = 0; 5277 mddev->bitmap_info.daemon_sleep = 0;
5280 mddev->bitmap_info.max_write_behind = 0; 5278 mddev->bitmap_info.max_write_behind = 0;
5281 } 5279 }
5282 5280
5283 static void __md_stop_writes(struct mddev *mddev) 5281 static void __md_stop_writes(struct mddev *mddev)
5284 { 5282 {
5285 if (mddev->sync_thread) { 5283 if (mddev->sync_thread) {
5286 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5284 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5287 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5285 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5288 reap_sync_thread(mddev); 5286 md_reap_sync_thread(mddev);
5289 } 5287 }
5290 5288
5291 del_timer_sync(&mddev->safemode_timer); 5289 del_timer_sync(&mddev->safemode_timer);
5292 5290
5293 bitmap_flush(mddev); 5291 bitmap_flush(mddev);
5294 md_super_wait(mddev); 5292 md_super_wait(mddev);
5295 5293
5296 if (mddev->ro == 0 && 5294 if (mddev->ro == 0 &&
5297 (!mddev->in_sync || mddev->flags)) { 5295 (!mddev->in_sync || mddev->flags)) {
5298 /* mark array as shutdown cleanly */ 5296 /* mark array as shutdown cleanly */
5299 mddev->in_sync = 1; 5297 mddev->in_sync = 1;
5300 md_update_sb(mddev, 1); 5298 md_update_sb(mddev, 1);
5301 } 5299 }
5302 } 5300 }
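
The only change in this hunk is the call site in __md_stop_writes(): the old static reap_sync_thread() becomes the newly exported md_reap_sync_thread(), so the freeze/interrupt/reap sequence used here can be reproduced outside md.c. A hedged sketch of how an external caller such as dm-raid.c might use it; the surrounding function is invented, and only md_reap_sync_thread() and the recovery bits come from the code above:

    #include "md.h"   /* struct mddev, MD_RECOVERY_*, md_reap_sync_thread() */

    /* Illustrative teardown of an in-flight resync from outside md.c. */
    static void demo_quiesce_resync(struct mddev *mddev)
    {
        if (!mddev->sync_thread)
            return;

        set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);  /* keep a new resync from starting */
        set_bit(MD_RECOVERY_INTR, &mddev->recovery);    /* ask the running one to stop     */
        md_reap_sync_thread(mddev);                     /* callable here thanks to the export */
    }
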
5303 5301
5304 void md_stop_writes(struct mddev *mddev) 5302 void md_stop_writes(struct mddev *mddev)
5305 { 5303 {
5306 mddev_lock(mddev); 5304 mddev_lock(mddev);
5307 __md_stop_writes(mddev); 5305 __md_stop_writes(mddev);
5308 mddev_unlock(mddev); 5306 mddev_unlock(mddev);
5309 } 5307 }
5310 EXPORT_SYMBOL_GPL(md_stop_writes); 5308 EXPORT_SYMBOL_GPL(md_stop_writes);
5311 5309
5312 static void __md_stop(struct mddev *mddev) 5310 static void __md_stop(struct mddev *mddev)
5313 { 5311 {
5314 mddev->ready = 0; 5312 mddev->ready = 0;
5315 mddev->pers->stop(mddev); 5313 mddev->pers->stop(mddev);
5316 if (mddev->pers->sync_request && mddev->to_remove == NULL) 5314 if (mddev->pers->sync_request && mddev->to_remove == NULL)
5317 mddev->to_remove = &md_redundancy_group; 5315 mddev->to_remove = &md_redundancy_group;
5318 module_put(mddev->pers->owner); 5316 module_put(mddev->pers->owner);
5319 mddev->pers = NULL; 5317 mddev->pers = NULL;
5320 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5318 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5321 } 5319 }
5322 5320
5323 void md_stop(struct mddev *mddev) 5321 void md_stop(struct mddev *mddev)
5324 { 5322 {
5325 /* stop the array and free any attached data structures. 5323 /* stop the array and free any attached data structures.
5326 * This is called from dm-raid 5324 * This is called from dm-raid
5327 */ 5325 */
5328 __md_stop(mddev); 5326 __md_stop(mddev);
5329 bitmap_destroy(mddev); 5327 bitmap_destroy(mddev);
5330 if (mddev->bio_set) 5328 if (mddev->bio_set)
5331 bioset_free(mddev->bio_set); 5329 bioset_free(mddev->bio_set);
5332 } 5330 }
5333 5331
5334 EXPORT_SYMBOL_GPL(md_stop); 5332 EXPORT_SYMBOL_GPL(md_stop);
5335 5333
5336 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5334 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5337 { 5335 {
5338 int err = 0; 5336 int err = 0;
5339 mutex_lock(&mddev->open_mutex); 5337 mutex_lock(&mddev->open_mutex);
5340 if (atomic_read(&mddev->openers) > !!bdev) { 5338 if (atomic_read(&mddev->openers) > !!bdev) {
5341 printk("md: %s still in use.\n",mdname(mddev)); 5339 printk("md: %s still in use.\n",mdname(mddev));
5342 err = -EBUSY; 5340 err = -EBUSY;
5343 goto out; 5341 goto out;
5344 } 5342 }
5345 if (bdev) 5343 if (bdev)
5346 sync_blockdev(bdev); 5344 sync_blockdev(bdev);
5347 if (mddev->pers) { 5345 if (mddev->pers) {
5348 __md_stop_writes(mddev); 5346 __md_stop_writes(mddev);
5349 5347
5350 err = -ENXIO; 5348 err = -ENXIO;
5351 if (mddev->ro==1) 5349 if (mddev->ro==1)
5352 goto out; 5350 goto out;
5353 mddev->ro = 1; 5351 mddev->ro = 1;
5354 set_disk_ro(mddev->gendisk, 1); 5352 set_disk_ro(mddev->gendisk, 1);
5355 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5353 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5356 sysfs_notify_dirent_safe(mddev->sysfs_state); 5354 sysfs_notify_dirent_safe(mddev->sysfs_state);
5357 err = 0; 5355 err = 0;
5358 } 5356 }
5359 out: 5357 out:
5360 mutex_unlock(&mddev->open_mutex); 5358 mutex_unlock(&mddev->open_mutex);
5361 return err; 5359 return err;
5362 } 5360 }
5363 5361
5364 /* mode: 5362 /* mode:
5365 * 0 - completely stop and disassemble array 5363 * 0 - completely stop and disassemble array
5366 * 2 - stop but do not disassemble array 5364 * 2 - stop but do not disassemble array
5367 */ 5365 */
5368 static int do_md_stop(struct mddev * mddev, int mode, 5366 static int do_md_stop(struct mddev * mddev, int mode,
5369 struct block_device *bdev) 5367 struct block_device *bdev)
5370 { 5368 {
5371 struct gendisk *disk = mddev->gendisk; 5369 struct gendisk *disk = mddev->gendisk;
5372 struct md_rdev *rdev; 5370 struct md_rdev *rdev;
5373 5371
5374 mutex_lock(&mddev->open_mutex); 5372 mutex_lock(&mddev->open_mutex);
5375 if (atomic_read(&mddev->openers) > !!bdev || 5373 if (atomic_read(&mddev->openers) > !!bdev ||
5376 mddev->sysfs_active) { 5374 mddev->sysfs_active) {
5377 printk("md: %s still in use.\n",mdname(mddev)); 5375 printk("md: %s still in use.\n",mdname(mddev));
5378 mutex_unlock(&mddev->open_mutex); 5376 mutex_unlock(&mddev->open_mutex);
5379 return -EBUSY; 5377 return -EBUSY;
5380 } 5378 }
5381 if (bdev) 5379 if (bdev)
5382 /* It is possible IO was issued on some other 5380 /* It is possible IO was issued on some other
5383 * open file which was closed before we took ->open_mutex. 5381 * open file which was closed before we took ->open_mutex.
5384 * As that was not the last close __blkdev_put will not 5382 * As that was not the last close __blkdev_put will not
5385 * have called sync_blockdev, so we must. 5383 * have called sync_blockdev, so we must.
5386 */ 5384 */
5387 sync_blockdev(bdev); 5385 sync_blockdev(bdev);
5388 5386
5389 if (mddev->pers) { 5387 if (mddev->pers) {
5390 if (mddev->ro) 5388 if (mddev->ro)
5391 set_disk_ro(disk, 0); 5389 set_disk_ro(disk, 0);
5392 5390
5393 __md_stop_writes(mddev); 5391 __md_stop_writes(mddev);
5394 __md_stop(mddev); 5392 __md_stop(mddev);
5395 mddev->queue->merge_bvec_fn = NULL; 5393 mddev->queue->merge_bvec_fn = NULL;
5396 mddev->queue->backing_dev_info.congested_fn = NULL; 5394 mddev->queue->backing_dev_info.congested_fn = NULL;
5397 5395
5398 /* tell userspace to handle 'inactive' */ 5396 /* tell userspace to handle 'inactive' */
5399 sysfs_notify_dirent_safe(mddev->sysfs_state); 5397 sysfs_notify_dirent_safe(mddev->sysfs_state);
5400 5398
5401 rdev_for_each(rdev, mddev) 5399 rdev_for_each(rdev, mddev)
5402 if (rdev->raid_disk >= 0) 5400 if (rdev->raid_disk >= 0)
5403 sysfs_unlink_rdev(mddev, rdev); 5401 sysfs_unlink_rdev(mddev, rdev);
5404 5402
5405 set_capacity(disk, 0); 5403 set_capacity(disk, 0);
5406 mutex_unlock(&mddev->open_mutex); 5404 mutex_unlock(&mddev->open_mutex);
5407 mddev->changed = 1; 5405 mddev->changed = 1;
5408 revalidate_disk(disk); 5406 revalidate_disk(disk);
5409 5407
5410 if (mddev->ro) 5408 if (mddev->ro)
5411 mddev->ro = 0; 5409 mddev->ro = 0;
5412 } else 5410 } else
5413 mutex_unlock(&mddev->open_mutex); 5411 mutex_unlock(&mddev->open_mutex);
5414 /* 5412 /*
5415 * Free resources if final stop 5413 * Free resources if final stop
5416 */ 5414 */
5417 if (mode == 0) { 5415 if (mode == 0) {
5418 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5416 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5419 5417
5420 bitmap_destroy(mddev); 5418 bitmap_destroy(mddev);
5421 if (mddev->bitmap_info.file) { 5419 if (mddev->bitmap_info.file) {
5422 restore_bitmap_write_access(mddev->bitmap_info.file); 5420 restore_bitmap_write_access(mddev->bitmap_info.file);
5423 fput(mddev->bitmap_info.file); 5421 fput(mddev->bitmap_info.file);
5424 mddev->bitmap_info.file = NULL; 5422 mddev->bitmap_info.file = NULL;
5425 } 5423 }
5426 mddev->bitmap_info.offset = 0; 5424 mddev->bitmap_info.offset = 0;
5427 5425
5428 export_array(mddev); 5426 export_array(mddev);
5429 5427
5430 md_clean(mddev); 5428 md_clean(mddev);
5431 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5429 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5432 if (mddev->hold_active == UNTIL_STOP) 5430 if (mddev->hold_active == UNTIL_STOP)
5433 mddev->hold_active = 0; 5431 mddev->hold_active = 0;
5434 } 5432 }
5435 blk_integrity_unregister(disk); 5433 blk_integrity_unregister(disk);
5436 md_new_event(mddev); 5434 md_new_event(mddev);
5437 sysfs_notify_dirent_safe(mddev->sysfs_state); 5435 sysfs_notify_dirent_safe(mddev->sysfs_state);
5438 return 0; 5436 return 0;
5439 } 5437 }
5440 5438
5441 #ifndef MODULE 5439 #ifndef MODULE
5442 static void autorun_array(struct mddev *mddev) 5440 static void autorun_array(struct mddev *mddev)
5443 { 5441 {
5444 struct md_rdev *rdev; 5442 struct md_rdev *rdev;
5445 int err; 5443 int err;
5446 5444
5447 if (list_empty(&mddev->disks)) 5445 if (list_empty(&mddev->disks))
5448 return; 5446 return;
5449 5447
5450 printk(KERN_INFO "md: running: "); 5448 printk(KERN_INFO "md: running: ");
5451 5449
5452 rdev_for_each(rdev, mddev) { 5450 rdev_for_each(rdev, mddev) {
5453 char b[BDEVNAME_SIZE]; 5451 char b[BDEVNAME_SIZE];
5454 printk("<%s>", bdevname(rdev->bdev,b)); 5452 printk("<%s>", bdevname(rdev->bdev,b));
5455 } 5453 }
5456 printk("\n"); 5454 printk("\n");
5457 5455
5458 err = do_md_run(mddev); 5456 err = do_md_run(mddev);
5459 if (err) { 5457 if (err) {
5460 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5458 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5461 do_md_stop(mddev, 0, NULL); 5459 do_md_stop(mddev, 0, NULL);
5462 } 5460 }
5463 } 5461 }
5464 5462
5465 /* 5463 /*
5466 * let's try to run arrays based on all disks that have arrived 5464 * let's try to run arrays based on all disks that have arrived
5467 * until now. (those are in pending_raid_disks) 5465 * until now. (those are in pending_raid_disks)
5468 * 5466 *
5469 * the method: pick the first pending disk, collect all disks with 5467 * the method: pick the first pending disk, collect all disks with
5470 * the same UUID, remove all from the pending list and put them into 5468 * the same UUID, remove all from the pending list and put them into
5471 * the 'same_array' list. Then order this list based on superblock 5469 * the 'same_array' list. Then order this list based on superblock
5472 * update time (freshest comes first), kick out 'old' disks and 5470 * update time (freshest comes first), kick out 'old' disks and
5473 * compare superblocks. If everything's fine then run it. 5471 * compare superblocks. If everything's fine then run it.
5474 * 5472 *
5475 * If "unit" is allocated, then bump its reference count 5473 * If "unit" is allocated, then bump its reference count
5476 */ 5474 */
5477 static void autorun_devices(int part) 5475 static void autorun_devices(int part)
5478 { 5476 {
5479 struct md_rdev *rdev0, *rdev, *tmp; 5477 struct md_rdev *rdev0, *rdev, *tmp;
5480 struct mddev *mddev; 5478 struct mddev *mddev;
5481 char b[BDEVNAME_SIZE]; 5479 char b[BDEVNAME_SIZE];
5482 5480
5483 printk(KERN_INFO "md: autorun ...\n"); 5481 printk(KERN_INFO "md: autorun ...\n");
5484 while (!list_empty(&pending_raid_disks)) { 5482 while (!list_empty(&pending_raid_disks)) {
5485 int unit; 5483 int unit;
5486 dev_t dev; 5484 dev_t dev;
5487 LIST_HEAD(candidates); 5485 LIST_HEAD(candidates);
5488 rdev0 = list_entry(pending_raid_disks.next, 5486 rdev0 = list_entry(pending_raid_disks.next,
5489 struct md_rdev, same_set); 5487 struct md_rdev, same_set);
5490 5488
5491 printk(KERN_INFO "md: considering %s ...\n", 5489 printk(KERN_INFO "md: considering %s ...\n",
5492 bdevname(rdev0->bdev,b)); 5490 bdevname(rdev0->bdev,b));
5493 INIT_LIST_HEAD(&candidates); 5491 INIT_LIST_HEAD(&candidates);
5494 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5492 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5495 if (super_90_load(rdev, rdev0, 0) >= 0) { 5493 if (super_90_load(rdev, rdev0, 0) >= 0) {
5496 printk(KERN_INFO "md: adding %s ...\n", 5494 printk(KERN_INFO "md: adding %s ...\n",
5497 bdevname(rdev->bdev,b)); 5495 bdevname(rdev->bdev,b));
5498 list_move(&rdev->same_set, &candidates); 5496 list_move(&rdev->same_set, &candidates);
5499 } 5497 }
5500 /* 5498 /*
5501 * now we have a set of devices, with all of them having 5499 * now we have a set of devices, with all of them having
5502 * mostly sane superblocks. It's time to allocate the 5500 * mostly sane superblocks. It's time to allocate the
5503 * mddev. 5501 * mddev.
5504 */ 5502 */
5505 if (part) { 5503 if (part) {
5506 dev = MKDEV(mdp_major, 5504 dev = MKDEV(mdp_major,
5507 rdev0->preferred_minor << MdpMinorShift); 5505 rdev0->preferred_minor << MdpMinorShift);
5508 unit = MINOR(dev) >> MdpMinorShift; 5506 unit = MINOR(dev) >> MdpMinorShift;
5509 } else { 5507 } else {
5510 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5508 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5511 unit = MINOR(dev); 5509 unit = MINOR(dev);
5512 } 5510 }
5513 if (rdev0->preferred_minor != unit) { 5511 if (rdev0->preferred_minor != unit) {
5514 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5512 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5515 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5513 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5516 break; 5514 break;
5517 } 5515 }
5518 5516
5519 md_probe(dev, NULL, NULL); 5517 md_probe(dev, NULL, NULL);
5520 mddev = mddev_find(dev); 5518 mddev = mddev_find(dev);
5521 if (!mddev || !mddev->gendisk) { 5519 if (!mddev || !mddev->gendisk) {
5522 if (mddev) 5520 if (mddev)
5523 mddev_put(mddev); 5521 mddev_put(mddev);
5524 printk(KERN_ERR 5522 printk(KERN_ERR
5525 "md: cannot allocate memory for md drive.\n"); 5523 "md: cannot allocate memory for md drive.\n");
5526 break; 5524 break;
5527 } 5525 }
5528 if (mddev_lock(mddev)) 5526 if (mddev_lock(mddev))
5529 printk(KERN_WARNING "md: %s locked, cannot run\n", 5527 printk(KERN_WARNING "md: %s locked, cannot run\n",
5530 mdname(mddev)); 5528 mdname(mddev));
5531 else if (mddev->raid_disks || mddev->major_version 5529 else if (mddev->raid_disks || mddev->major_version
5532 || !list_empty(&mddev->disks)) { 5530 || !list_empty(&mddev->disks)) {
5533 printk(KERN_WARNING 5531 printk(KERN_WARNING
5534 "md: %s already running, cannot run %s\n", 5532 "md: %s already running, cannot run %s\n",
5535 mdname(mddev), bdevname(rdev0->bdev,b)); 5533 mdname(mddev), bdevname(rdev0->bdev,b));
5536 mddev_unlock(mddev); 5534 mddev_unlock(mddev);
5537 } else { 5535 } else {
5538 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5536 printk(KERN_INFO "md: created %s\n", mdname(mddev));
5539 mddev->persistent = 1; 5537 mddev->persistent = 1;
5540 rdev_for_each_list(rdev, tmp, &candidates) { 5538 rdev_for_each_list(rdev, tmp, &candidates) {
5541 list_del_init(&rdev->same_set); 5539 list_del_init(&rdev->same_set);
5542 if (bind_rdev_to_array(rdev, mddev)) 5540 if (bind_rdev_to_array(rdev, mddev))
5543 export_rdev(rdev); 5541 export_rdev(rdev);
5544 } 5542 }
5545 autorun_array(mddev); 5543 autorun_array(mddev);
5546 mddev_unlock(mddev); 5544 mddev_unlock(mddev);
5547 } 5545 }
5548 /* on success, candidates will be empty, on error 5546 /* on success, candidates will be empty, on error
5549 * it won't... 5547 * it won't...
5550 */ 5548 */
5551 rdev_for_each_list(rdev, tmp, &candidates) { 5549 rdev_for_each_list(rdev, tmp, &candidates) {
5552 list_del_init(&rdev->same_set); 5550 list_del_init(&rdev->same_set);
5553 export_rdev(rdev); 5551 export_rdev(rdev);
5554 } 5552 }
5555 mddev_put(mddev); 5553 mddev_put(mddev);
5556 } 5554 }
5557 printk(KERN_INFO "md: ... autorun DONE.\n"); 5555 printk(KERN_INFO "md: ... autorun DONE.\n");
5558 } 5556 }
5559 #endif /* !MODULE */ 5557 #endif /* !MODULE */
5560 5558
5561 static int get_version(void __user * arg) 5559 static int get_version(void __user * arg)
5562 { 5560 {
5563 mdu_version_t ver; 5561 mdu_version_t ver;
5564 5562
5565 ver.major = MD_MAJOR_VERSION; 5563 ver.major = MD_MAJOR_VERSION;
5566 ver.minor = MD_MINOR_VERSION; 5564 ver.minor = MD_MINOR_VERSION;
5567 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5565 ver.patchlevel = MD_PATCHLEVEL_VERSION;
5568 5566
5569 if (copy_to_user(arg, &ver, sizeof(ver))) 5567 if (copy_to_user(arg, &ver, sizeof(ver)))
5570 return -EFAULT; 5568 return -EFAULT;
5571 5569
5572 return 0; 5570 return 0;
5573 } 5571 }
5574 5572
5575 static int get_array_info(struct mddev * mddev, void __user * arg) 5573 static int get_array_info(struct mddev * mddev, void __user * arg)
5576 { 5574 {
5577 mdu_array_info_t info; 5575 mdu_array_info_t info;
5578 int nr,working,insync,failed,spare; 5576 int nr,working,insync,failed,spare;
5579 struct md_rdev *rdev; 5577 struct md_rdev *rdev;
5580 5578
5581 nr = working = insync = failed = spare = 0; 5579 nr = working = insync = failed = spare = 0;
5582 rcu_read_lock(); 5580 rcu_read_lock();
5583 rdev_for_each_rcu(rdev, mddev) { 5581 rdev_for_each_rcu(rdev, mddev) {
5584 nr++; 5582 nr++;
5585 if (test_bit(Faulty, &rdev->flags)) 5583 if (test_bit(Faulty, &rdev->flags))
5586 failed++; 5584 failed++;
5587 else { 5585 else {
5588 working++; 5586 working++;
5589 if (test_bit(In_sync, &rdev->flags)) 5587 if (test_bit(In_sync, &rdev->flags))
5590 insync++; 5588 insync++;
5591 else 5589 else
5592 spare++; 5590 spare++;
5593 } 5591 }
5594 } 5592 }
5595 rcu_read_unlock(); 5593 rcu_read_unlock();
5596 5594
5597 info.major_version = mddev->major_version; 5595 info.major_version = mddev->major_version;
5598 info.minor_version = mddev->minor_version; 5596 info.minor_version = mddev->minor_version;
5599 info.patch_version = MD_PATCHLEVEL_VERSION; 5597 info.patch_version = MD_PATCHLEVEL_VERSION;
5600 info.ctime = mddev->ctime; 5598 info.ctime = mddev->ctime;
5601 info.level = mddev->level; 5599 info.level = mddev->level;
5602 info.size = mddev->dev_sectors / 2; 5600 info.size = mddev->dev_sectors / 2;
5603 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5601 if (info.size != mddev->dev_sectors / 2) /* overflow */
5604 info.size = -1; 5602 info.size = -1;
5605 info.nr_disks = nr; 5603 info.nr_disks = nr;
5606 info.raid_disks = mddev->raid_disks; 5604 info.raid_disks = mddev->raid_disks;
5607 info.md_minor = mddev->md_minor; 5605 info.md_minor = mddev->md_minor;
5608 info.not_persistent= !mddev->persistent; 5606 info.not_persistent= !mddev->persistent;
5609 5607
5610 info.utime = mddev->utime; 5608 info.utime = mddev->utime;
5611 info.state = 0; 5609 info.state = 0;
5612 if (mddev->in_sync) 5610 if (mddev->in_sync)
5613 info.state = (1<<MD_SB_CLEAN); 5611 info.state = (1<<MD_SB_CLEAN);
5614 if (mddev->bitmap && mddev->bitmap_info.offset) 5612 if (mddev->bitmap && mddev->bitmap_info.offset)
5615 info.state = (1<<MD_SB_BITMAP_PRESENT); 5613 info.state = (1<<MD_SB_BITMAP_PRESENT);
5616 info.active_disks = insync; 5614 info.active_disks = insync;
5617 info.working_disks = working; 5615 info.working_disks = working;
5618 info.failed_disks = failed; 5616 info.failed_disks = failed;
5619 info.spare_disks = spare; 5617 info.spare_disks = spare;
5620 5618
5621 info.layout = mddev->layout; 5619 info.layout = mddev->layout;
5622 info.chunk_size = mddev->chunk_sectors << 9; 5620 info.chunk_size = mddev->chunk_sectors << 9;
5623 5621
5624 if (copy_to_user(arg, &info, sizeof(info))) 5622 if (copy_to_user(arg, &info, sizeof(info)))
5625 return -EFAULT; 5623 return -EFAULT;
5626 5624
5627 return 0; 5625 return 0;
5628 } 5626 }
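
get_array_info() backs the GET_ARRAY_INFO ioctl that mdadm and friends use; the mdu_array_info_t it fills is the userspace structure from <linux/raid/md_u.h>. A minimal userspace sketch, assuming the UAPI headers are installed; the device path is just an example and error handling is trimmed:

    #include <stdio.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/major.h>       /* MD_MAJOR, used by the ioctl macros */
    #include <linux/raid/md_u.h>   /* mdu_array_info_t, GET_ARRAY_INFO */

    int main(void)
    {
        mdu_array_info_t info;
        int fd = open("/dev/md0", O_RDONLY);

        if (fd < 0) {
            perror("open /dev/md0");
            return 1;
        }
        if (ioctl(fd, GET_ARRAY_INFO, &info) == 0)
            printf("level %d, %d raid disks, %d active, %d failed\n",
                   info.level, info.raid_disks,
                   info.active_disks, info.failed_disks);
        else
            perror("GET_ARRAY_INFO");
        close(fd);
        return 0;
    }
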
5629 5627
5630 static int get_bitmap_file(struct mddev * mddev, void __user * arg) 5628 static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5631 { 5629 {
5632 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5630 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5633 char *ptr, *buf = NULL; 5631 char *ptr, *buf = NULL;
5634 int err = -ENOMEM; 5632 int err = -ENOMEM;
5635 5633
5636 if (md_allow_write(mddev)) 5634 if (md_allow_write(mddev))
5637 file = kmalloc(sizeof(*file), GFP_NOIO); 5635 file = kmalloc(sizeof(*file), GFP_NOIO);
5638 else 5636 else
5639 file = kmalloc(sizeof(*file), GFP_KERNEL); 5637 file = kmalloc(sizeof(*file), GFP_KERNEL);
5640 5638
5641 if (!file) 5639 if (!file)
5642 goto out; 5640 goto out;
5643 5641
5644 /* bitmap disabled, zero the first byte and copy out */ 5642 /* bitmap disabled, zero the first byte and copy out */
5645 if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5643 if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5646 file->pathname[0] = '\0'; 5644 file->pathname[0] = '\0';
5647 goto copy_out; 5645 goto copy_out;
5648 } 5646 }
5649 5647
5650 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 5648 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5651 if (!buf) 5649 if (!buf)
5652 goto out; 5650 goto out;
5653 5651
5654 ptr = d_path(&mddev->bitmap->storage.file->f_path, 5652 ptr = d_path(&mddev->bitmap->storage.file->f_path,
5655 buf, sizeof(file->pathname)); 5653 buf, sizeof(file->pathname));
5656 if (IS_ERR(ptr)) 5654 if (IS_ERR(ptr))
5657 goto out; 5655 goto out;
5658 5656
5659 strcpy(file->pathname, ptr); 5657 strcpy(file->pathname, ptr);
5660 5658
5661 copy_out: 5659 copy_out:
5662 err = 0; 5660 err = 0;
5663 if (copy_to_user(arg, file, sizeof(*file))) 5661 if (copy_to_user(arg, file, sizeof(*file)))
5664 err = -EFAULT; 5662 err = -EFAULT;
5665 out: 5663 out:
5666 kfree(buf); 5664 kfree(buf);
5667 kfree(file); 5665 kfree(file);
5668 return err; 5666 return err;
5669 } 5667 }
5670 5668
5671 static int get_disk_info(struct mddev * mddev, void __user * arg) 5669 static int get_disk_info(struct mddev * mddev, void __user * arg)
5672 { 5670 {
5673 mdu_disk_info_t info; 5671 mdu_disk_info_t info;
5674 struct md_rdev *rdev; 5672 struct md_rdev *rdev;
5675 5673
5676 if (copy_from_user(&info, arg, sizeof(info))) 5674 if (copy_from_user(&info, arg, sizeof(info)))
5677 return -EFAULT; 5675 return -EFAULT;
5678 5676
5679 rcu_read_lock(); 5677 rcu_read_lock();
5680 rdev = find_rdev_nr_rcu(mddev, info.number); 5678 rdev = find_rdev_nr_rcu(mddev, info.number);
5681 if (rdev) { 5679 if (rdev) {
5682 info.major = MAJOR(rdev->bdev->bd_dev); 5680 info.major = MAJOR(rdev->bdev->bd_dev);
5683 info.minor = MINOR(rdev->bdev->bd_dev); 5681 info.minor = MINOR(rdev->bdev->bd_dev);
5684 info.raid_disk = rdev->raid_disk; 5682 info.raid_disk = rdev->raid_disk;
5685 info.state = 0; 5683 info.state = 0;
5686 if (test_bit(Faulty, &rdev->flags)) 5684 if (test_bit(Faulty, &rdev->flags))
5687 info.state |= (1<<MD_DISK_FAULTY); 5685 info.state |= (1<<MD_DISK_FAULTY);
5688 else if (test_bit(In_sync, &rdev->flags)) { 5686 else if (test_bit(In_sync, &rdev->flags)) {
5689 info.state |= (1<<MD_DISK_ACTIVE); 5687 info.state |= (1<<MD_DISK_ACTIVE);
5690 info.state |= (1<<MD_DISK_SYNC); 5688 info.state |= (1<<MD_DISK_SYNC);
5691 } 5689 }
5692 if (test_bit(WriteMostly, &rdev->flags)) 5690 if (test_bit(WriteMostly, &rdev->flags))
5693 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5691 info.state |= (1<<MD_DISK_WRITEMOSTLY);
5694 } else { 5692 } else {
5695 info.major = info.minor = 0; 5693 info.major = info.minor = 0;
5696 info.raid_disk = -1; 5694 info.raid_disk = -1;
5697 info.state = (1<<MD_DISK_REMOVED); 5695 info.state = (1<<MD_DISK_REMOVED);
5698 } 5696 }
5699 rcu_read_unlock(); 5697 rcu_read_unlock();
5700 5698
5701 if (copy_to_user(arg, &info, sizeof(info))) 5699 if (copy_to_user(arg, &info, sizeof(info)))
5702 return -EFAULT; 5700 return -EFAULT;
5703 5701
5704 return 0; 5702 return 0;
5705 } 5703 }
5706 5704
5707 static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info) 5705 static int add_new_disk(struct mddev * mddev, mdu_disk_info_t *info)
5708 { 5706 {
5709 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5707 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5710 struct md_rdev *rdev; 5708 struct md_rdev *rdev;
5711 dev_t dev = MKDEV(info->major,info->minor); 5709 dev_t dev = MKDEV(info->major,info->minor);
5712 5710
5713 if (info->major != MAJOR(dev) || info->minor != MINOR(dev)) 5711 if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
5714 return -EOVERFLOW; 5712 return -EOVERFLOW;
5715 5713
5716 if (!mddev->raid_disks) { 5714 if (!mddev->raid_disks) {
5717 int err; 5715 int err;
5718 /* expecting a device which has a superblock */ 5716 /* expecting a device which has a superblock */
5719 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version); 5717 rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
5720 if (IS_ERR(rdev)) { 5718 if (IS_ERR(rdev)) {
5721 printk(KERN_WARNING 5719 printk(KERN_WARNING
5722 "md: md_import_device returned %ld\n", 5720 "md: md_import_device returned %ld\n",
5723 PTR_ERR(rdev)); 5721 PTR_ERR(rdev));
5724 return PTR_ERR(rdev); 5722 return PTR_ERR(rdev);
5725 } 5723 }
5726 if (!list_empty(&mddev->disks)) { 5724 if (!list_empty(&mddev->disks)) {
5727 struct md_rdev *rdev0 5725 struct md_rdev *rdev0
5728 = list_entry(mddev->disks.next, 5726 = list_entry(mddev->disks.next,
5729 struct md_rdev, same_set); 5727 struct md_rdev, same_set);
5730 err = super_types[mddev->major_version] 5728 err = super_types[mddev->major_version]
5731 .load_super(rdev, rdev0, mddev->minor_version); 5729 .load_super(rdev, rdev0, mddev->minor_version);
5732 if (err < 0) { 5730 if (err < 0) {
5733 printk(KERN_WARNING 5731 printk(KERN_WARNING
5734 "md: %s has different UUID to %s\n", 5732 "md: %s has different UUID to %s\n",
5735 bdevname(rdev->bdev,b), 5733 bdevname(rdev->bdev,b),
5736 bdevname(rdev0->bdev,b2)); 5734 bdevname(rdev0->bdev,b2));
5737 export_rdev(rdev); 5735 export_rdev(rdev);
5738 return -EINVAL; 5736 return -EINVAL;
5739 } 5737 }
5740 } 5738 }
5741 err = bind_rdev_to_array(rdev, mddev); 5739 err = bind_rdev_to_array(rdev, mddev);
5742 if (err) 5740 if (err)
5743 export_rdev(rdev); 5741 export_rdev(rdev);
5744 return err; 5742 return err;
5745 } 5743 }
5746 5744
5747 /* 5745 /*
5748 * add_new_disk can be used once the array is assembled 5746 * add_new_disk can be used once the array is assembled
5749 * to add "hot spares". They must already have a superblock 5747 * to add "hot spares". They must already have a superblock
5750 * written 5748 * written
5751 */ 5749 */
5752 if (mddev->pers) { 5750 if (mddev->pers) {
5753 int err; 5751 int err;
5754 if (!mddev->pers->hot_add_disk) { 5752 if (!mddev->pers->hot_add_disk) {
5755 printk(KERN_WARNING 5753 printk(KERN_WARNING
5756 "%s: personality does not support diskops!\n", 5754 "%s: personality does not support diskops!\n",
5757 mdname(mddev)); 5755 mdname(mddev));
5758 return -EINVAL; 5756 return -EINVAL;
5759 } 5757 }
5760 if (mddev->persistent) 5758 if (mddev->persistent)
5761 rdev = md_import_device(dev, mddev->major_version, 5759 rdev = md_import_device(dev, mddev->major_version,
5762 mddev->minor_version); 5760 mddev->minor_version);
5763 else 5761 else
5764 rdev = md_import_device(dev, -1, -1); 5762 rdev = md_import_device(dev, -1, -1);
5765 if (IS_ERR(rdev)) { 5763 if (IS_ERR(rdev)) {
5766 printk(KERN_WARNING 5764 printk(KERN_WARNING
5767 "md: md_import_device returned %ld\n", 5765 "md: md_import_device returned %ld\n",
5768 PTR_ERR(rdev)); 5766 PTR_ERR(rdev));
5769 return PTR_ERR(rdev); 5767 return PTR_ERR(rdev);
5770 } 5768 }
5771 /* set saved_raid_disk if appropriate */ 5769 /* set saved_raid_disk if appropriate */
5772 if (!mddev->persistent) { 5770 if (!mddev->persistent) {
5773 if (info->state & (1<<MD_DISK_SYNC) && 5771 if (info->state & (1<<MD_DISK_SYNC) &&
5774 info->raid_disk < mddev->raid_disks) { 5772 info->raid_disk < mddev->raid_disks) {
5775 rdev->raid_disk = info->raid_disk; 5773 rdev->raid_disk = info->raid_disk;
5776 set_bit(In_sync, &rdev->flags); 5774 set_bit(In_sync, &rdev->flags);
5777 } else 5775 } else
5778 rdev->raid_disk = -1; 5776 rdev->raid_disk = -1;
5779 } else 5777 } else
5780 super_types[mddev->major_version]. 5778 super_types[mddev->major_version].
5781 validate_super(mddev, rdev); 5779 validate_super(mddev, rdev);
5782 if ((info->state & (1<<MD_DISK_SYNC)) && 5780 if ((info->state & (1<<MD_DISK_SYNC)) &&
5783 rdev->raid_disk != info->raid_disk) { 5781 rdev->raid_disk != info->raid_disk) {
5784 /* This was a hot-add request, but events doesn't 5782 /* This was a hot-add request, but events doesn't
5785 * match, so reject it. 5783 * match, so reject it.
5786 */ 5784 */
5787 export_rdev(rdev); 5785 export_rdev(rdev);
5788 return -EINVAL; 5786 return -EINVAL;
5789 } 5787 }
5790 5788
5791 if (test_bit(In_sync, &rdev->flags)) 5789 if (test_bit(In_sync, &rdev->flags))
5792 rdev->saved_raid_disk = rdev->raid_disk; 5790 rdev->saved_raid_disk = rdev->raid_disk;
5793 else 5791 else
5794 rdev->saved_raid_disk = -1; 5792 rdev->saved_raid_disk = -1;
5795 5793
5796 clear_bit(In_sync, &rdev->flags); /* just to be sure */ 5794 clear_bit(In_sync, &rdev->flags); /* just to be sure */
5797 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5795 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5798 set_bit(WriteMostly, &rdev->flags); 5796 set_bit(WriteMostly, &rdev->flags);
5799 else 5797 else
5800 clear_bit(WriteMostly, &rdev->flags); 5798 clear_bit(WriteMostly, &rdev->flags);
5801 5799
5802 rdev->raid_disk = -1; 5800 rdev->raid_disk = -1;
5803 err = bind_rdev_to_array(rdev, mddev); 5801 err = bind_rdev_to_array(rdev, mddev);
5804 if (!err && !mddev->pers->hot_remove_disk) { 5802 if (!err && !mddev->pers->hot_remove_disk) {
5805 /* If there is hot_add_disk but no hot_remove_disk 5803 /* If there is hot_add_disk but no hot_remove_disk
5806 * then added disks for geometry changes, 5804 * then added disks for geometry changes,
5807 * and should be added immediately. 5805 * and should be added immediately.
5808 */ 5806 */
5809 super_types[mddev->major_version]. 5807 super_types[mddev->major_version].
5810 validate_super(mddev, rdev); 5808 validate_super(mddev, rdev);
5811 err = mddev->pers->hot_add_disk(mddev, rdev); 5809 err = mddev->pers->hot_add_disk(mddev, rdev);
5812 if (err) 5810 if (err)
5813 unbind_rdev_from_array(rdev); 5811 unbind_rdev_from_array(rdev);
5814 } 5812 }
5815 if (err) 5813 if (err)
5816 export_rdev(rdev); 5814 export_rdev(rdev);
5817 else 5815 else
5818 sysfs_notify_dirent_safe(rdev->sysfs_state); 5816 sysfs_notify_dirent_safe(rdev->sysfs_state);
5819 5817
5820 set_bit(MD_CHANGE_DEVS, &mddev->flags); 5818 set_bit(MD_CHANGE_DEVS, &mddev->flags);
5821 if (mddev->degraded) 5819 if (mddev->degraded)
5822 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 5820 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
5823 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5821 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5824 if (!err) 5822 if (!err)
5825 md_new_event(mddev); 5823 md_new_event(mddev);
5826 md_wakeup_thread(mddev->thread); 5824 md_wakeup_thread(mddev->thread);
5827 return err; 5825 return err;
5828 } 5826 }
5829 5827
5830 /* otherwise, add_new_disk is only allowed 5828 /* otherwise, add_new_disk is only allowed
5831 * for major_version==0 superblocks 5829 * for major_version==0 superblocks
5832 */ 5830 */
5833 if (mddev->major_version != 0) { 5831 if (mddev->major_version != 0) {
5834 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n", 5832 printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
5835 mdname(mddev)); 5833 mdname(mddev));
5836 return -EINVAL; 5834 return -EINVAL;
5837 } 5835 }
5838 5836
5839 if (!(info->state & (1<<MD_DISK_FAULTY))) { 5837 if (!(info->state & (1<<MD_DISK_FAULTY))) {
5840 int err; 5838 int err;
5841 rdev = md_import_device(dev, -1, 0); 5839 rdev = md_import_device(dev, -1, 0);
5842 if (IS_ERR(rdev)) { 5840 if (IS_ERR(rdev)) {
5843 printk(KERN_WARNING 5841 printk(KERN_WARNING
5844 "md: error, md_import_device() returned %ld\n", 5842 "md: error, md_import_device() returned %ld\n",
5845 PTR_ERR(rdev)); 5843 PTR_ERR(rdev));
5846 return PTR_ERR(rdev); 5844 return PTR_ERR(rdev);
5847 } 5845 }
5848 rdev->desc_nr = info->number; 5846 rdev->desc_nr = info->number;
5849 if (info->raid_disk < mddev->raid_disks) 5847 if (info->raid_disk < mddev->raid_disks)
5850 rdev->raid_disk = info->raid_disk; 5848 rdev->raid_disk = info->raid_disk;
5851 else 5849 else
5852 rdev->raid_disk = -1; 5850 rdev->raid_disk = -1;
5853 5851
5854 if (rdev->raid_disk < mddev->raid_disks) 5852 if (rdev->raid_disk < mddev->raid_disks)
5855 if (info->state & (1<<MD_DISK_SYNC)) 5853 if (info->state & (1<<MD_DISK_SYNC))
5856 set_bit(In_sync, &rdev->flags); 5854 set_bit(In_sync, &rdev->flags);
5857 5855
5858 if (info->state & (1<<MD_DISK_WRITEMOSTLY)) 5856 if (info->state & (1<<MD_DISK_WRITEMOSTLY))
5859 set_bit(WriteMostly, &rdev->flags); 5857 set_bit(WriteMostly, &rdev->flags);
5860 5858
5861 if (!mddev->persistent) { 5859 if (!mddev->persistent) {
5862 printk(KERN_INFO "md: nonpersistent superblock ...\n"); 5860 printk(KERN_INFO "md: nonpersistent superblock ...\n");
5863 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5861 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5864 } else 5862 } else
5865 rdev->sb_start = calc_dev_sboffset(rdev); 5863 rdev->sb_start = calc_dev_sboffset(rdev);
5866 rdev->sectors = rdev->sb_start; 5864 rdev->sectors = rdev->sb_start;
5867 5865
5868 err = bind_rdev_to_array(rdev, mddev); 5866 err = bind_rdev_to_array(rdev, mddev);
5869 if (err) { 5867 if (err) {
5870 export_rdev(rdev); 5868 export_rdev(rdev);
5871 return err; 5869 return err;
5872 } 5870 }
5873 } 5871 }
5874 5872
5875 return 0; 5873 return 0;
5876 } 5874 }
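To make the hot-spare path above concrete, here is a minimal userspace sketch (illustrative only, not part of this commit; the helper name and error handling are made up) of hot-adding a component to a running array with the ADD_NEW_DISK ioctl, assuming the device already carries a superblock for this array as the comment in add_new_disk() requires:

#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <linux/raid/md_u.h>

/* Hot-add "disk_dev" as a spare to the running array opened at "md_dev". */
static int add_spare(const char *md_dev, const char *disk_dev)
{
        mdu_disk_info_t dinfo = { 0 };
        struct stat st;
        int fd, ret;

        if (stat(disk_dev, &st) < 0)
                return -1;
        fd = open(md_dev, O_RDWR);
        if (fd < 0)
                return -1;
        dinfo.major = major(st.st_rdev);
        dinfo.minor = minor(st.st_rdev);
        dinfo.raid_disk = -1;   /* let md pick a slot during recovery */
        dinfo.state = 0;        /* plain spare, not claimed as in-sync */
        ret = ioctl(fd, ADD_NEW_DISK, &dinfo);
        close(fd);
        return ret;
}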
5877 5875
5878 static int hot_remove_disk(struct mddev * mddev, dev_t dev) 5876 static int hot_remove_disk(struct mddev * mddev, dev_t dev)
5879 { 5877 {
5880 char b[BDEVNAME_SIZE]; 5878 char b[BDEVNAME_SIZE];
5881 struct md_rdev *rdev; 5879 struct md_rdev *rdev;
5882 5880
5883 rdev = find_rdev(mddev, dev); 5881 rdev = find_rdev(mddev, dev);
5884 if (!rdev) 5882 if (!rdev)
5885 return -ENXIO; 5883 return -ENXIO;
5886 5884
5887 clear_bit(Blocked, &rdev->flags); 5885 clear_bit(Blocked, &rdev->flags);
5888 remove_and_add_spares(mddev, rdev); 5886 remove_and_add_spares(mddev, rdev);
5889 5887
5890 if (rdev->raid_disk >= 0) 5888 if (rdev->raid_disk >= 0)
5891 goto busy; 5889 goto busy;
5892 5890
5893 kick_rdev_from_array(rdev); 5891 kick_rdev_from_array(rdev);
5894 md_update_sb(mddev, 1); 5892 md_update_sb(mddev, 1);
5895 md_new_event(mddev); 5893 md_new_event(mddev);
5896 5894
5897 return 0; 5895 return 0;
5898 busy: 5896 busy:
5899 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n", 5897 printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
5900 bdevname(rdev->bdev,b), mdname(mddev)); 5898 bdevname(rdev->bdev,b), mdname(mddev));
5901 return -EBUSY; 5899 return -EBUSY;
5902 } 5900 }
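A companion sketch for the removal side (again illustrative, helper name made up): a component is normally failed first with SET_DISK_FAULTY and then detached with HOT_REMOVE_DISK, which lands in hot_remove_disk() above and answers -EBUSY for a slot that is still active:

#include <sys/ioctl.h>
#include <sys/sysmacros.h>
#include <linux/raid/md_u.h>

/* Fail and then detach the component identified by maj:min. */
static int fail_and_remove(int md_fd, unsigned int maj, unsigned int min)
{
        dev_t dev = makedev(maj, min);

        /* Mark the component Faulty so its slot can be vacated ... */
        if (ioctl(md_fd, SET_DISK_FAULTY, dev) < 0)
                return -1;
        /* ... then detach it; -EBUSY means it is still in use. */
        return ioctl(md_fd, HOT_REMOVE_DISK, dev);
}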
5903 5901
5904 static int hot_add_disk(struct mddev * mddev, dev_t dev) 5902 static int hot_add_disk(struct mddev * mddev, dev_t dev)
5905 { 5903 {
5906 char b[BDEVNAME_SIZE]; 5904 char b[BDEVNAME_SIZE];
5907 int err; 5905 int err;
5908 struct md_rdev *rdev; 5906 struct md_rdev *rdev;
5909 5907
5910 if (!mddev->pers) 5908 if (!mddev->pers)
5911 return -ENODEV; 5909 return -ENODEV;
5912 5910
5913 if (mddev->major_version != 0) { 5911 if (mddev->major_version != 0) {
5914 printk(KERN_WARNING "%s: HOT_ADD may only be used with" 5912 printk(KERN_WARNING "%s: HOT_ADD may only be used with"
5915 " version-0 superblocks.\n", 5913 " version-0 superblocks.\n",
5916 mdname(mddev)); 5914 mdname(mddev));
5917 return -EINVAL; 5915 return -EINVAL;
5918 } 5916 }
5919 if (!mddev->pers->hot_add_disk) { 5917 if (!mddev->pers->hot_add_disk) {
5920 printk(KERN_WARNING 5918 printk(KERN_WARNING
5921 "%s: personality does not support diskops!\n", 5919 "%s: personality does not support diskops!\n",
5922 mdname(mddev)); 5920 mdname(mddev));
5923 return -EINVAL; 5921 return -EINVAL;
5924 } 5922 }
5925 5923
5926 rdev = md_import_device(dev, -1, 0); 5924 rdev = md_import_device(dev, -1, 0);
5927 if (IS_ERR(rdev)) { 5925 if (IS_ERR(rdev)) {
5928 printk(KERN_WARNING 5926 printk(KERN_WARNING
5929 "md: error, md_import_device() returned %ld\n", 5927 "md: error, md_import_device() returned %ld\n",
5930 PTR_ERR(rdev)); 5928 PTR_ERR(rdev));
5931 return -EINVAL; 5929 return -EINVAL;
5932 } 5930 }
5933 5931
5934 if (mddev->persistent) 5932 if (mddev->persistent)
5935 rdev->sb_start = calc_dev_sboffset(rdev); 5933 rdev->sb_start = calc_dev_sboffset(rdev);
5936 else 5934 else
5937 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512; 5935 rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
5938 5936
5939 rdev->sectors = rdev->sb_start; 5937 rdev->sectors = rdev->sb_start;
5940 5938
5941 if (test_bit(Faulty, &rdev->flags)) { 5939 if (test_bit(Faulty, &rdev->flags)) {
5942 printk(KERN_WARNING 5940 printk(KERN_WARNING
5943 "md: can not hot-add faulty %s disk to %s!\n", 5941 "md: can not hot-add faulty %s disk to %s!\n",
5944 bdevname(rdev->bdev,b), mdname(mddev)); 5942 bdevname(rdev->bdev,b), mdname(mddev));
5945 err = -EINVAL; 5943 err = -EINVAL;
5946 goto abort_export; 5944 goto abort_export;
5947 } 5945 }
5948 clear_bit(In_sync, &rdev->flags); 5946 clear_bit(In_sync, &rdev->flags);
5949 rdev->desc_nr = -1; 5947 rdev->desc_nr = -1;
5950 rdev->saved_raid_disk = -1; 5948 rdev->saved_raid_disk = -1;
5951 err = bind_rdev_to_array(rdev, mddev); 5949 err = bind_rdev_to_array(rdev, mddev);
5952 if (err) 5950 if (err)
5953 goto abort_export; 5951 goto abort_export;
5954 5952
5955 /* 5953 /*
5956 * The rest should better be atomic, we can have disk failures 5954 * The rest should better be atomic, we can have disk failures
5957 * noticed in interrupt contexts ... 5955 * noticed in interrupt contexts ...
5958 */ 5956 */
5959 5957
5960 rdev->raid_disk = -1; 5958 rdev->raid_disk = -1;
5961 5959
5962 md_update_sb(mddev, 1); 5960 md_update_sb(mddev, 1);
5963 5961
5964 /* 5962 /*
5965 * Kick recovery, maybe this spare has to be added to the 5963 * Kick recovery, maybe this spare has to be added to the
5966 * array immediately. 5964 * array immediately.
5967 */ 5965 */
5968 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5966 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5969 md_wakeup_thread(mddev->thread); 5967 md_wakeup_thread(mddev->thread);
5970 md_new_event(mddev); 5968 md_new_event(mddev);
5971 return 0; 5969 return 0;
5972 5970
5973 abort_export: 5971 abort_export:
5974 export_rdev(rdev); 5972 export_rdev(rdev);
5975 return err; 5973 return err;
5976 } 5974 }
5977 5975
5978 static int set_bitmap_file(struct mddev *mddev, int fd) 5976 static int set_bitmap_file(struct mddev *mddev, int fd)
5979 { 5977 {
5980 int err; 5978 int err;
5981 5979
5982 if (mddev->pers) { 5980 if (mddev->pers) {
5983 if (!mddev->pers->quiesce) 5981 if (!mddev->pers->quiesce)
5984 return -EBUSY; 5982 return -EBUSY;
5985 if (mddev->recovery || mddev->sync_thread) 5983 if (mddev->recovery || mddev->sync_thread)
5986 return -EBUSY; 5984 return -EBUSY;
5987 /* we should be able to change the bitmap.. */ 5985 /* we should be able to change the bitmap.. */
5988 } 5986 }
5989 5987
5990 5988
5991 if (fd >= 0) { 5989 if (fd >= 0) {
5992 if (mddev->bitmap) 5990 if (mddev->bitmap)
5993 return -EEXIST; /* cannot add when bitmap is present */ 5991 return -EEXIST; /* cannot add when bitmap is present */
5994 mddev->bitmap_info.file = fget(fd); 5992 mddev->bitmap_info.file = fget(fd);
5995 5993
5996 if (mddev->bitmap_info.file == NULL) { 5994 if (mddev->bitmap_info.file == NULL) {
5997 printk(KERN_ERR "%s: error: failed to get bitmap file\n", 5995 printk(KERN_ERR "%s: error: failed to get bitmap file\n",
5998 mdname(mddev)); 5996 mdname(mddev));
5999 return -EBADF; 5997 return -EBADF;
6000 } 5998 }
6001 5999
6002 err = deny_bitmap_write_access(mddev->bitmap_info.file); 6000 err = deny_bitmap_write_access(mddev->bitmap_info.file);
6003 if (err) { 6001 if (err) {
6004 printk(KERN_ERR "%s: error: bitmap file is already in use\n", 6002 printk(KERN_ERR "%s: error: bitmap file is already in use\n",
6005 mdname(mddev)); 6003 mdname(mddev));
6006 fput(mddev->bitmap_info.file); 6004 fput(mddev->bitmap_info.file);
6007 mddev->bitmap_info.file = NULL; 6005 mddev->bitmap_info.file = NULL;
6008 return err; 6006 return err;
6009 } 6007 }
6010 mddev->bitmap_info.offset = 0; /* file overrides offset */ 6008 mddev->bitmap_info.offset = 0; /* file overrides offset */
6011 } else if (mddev->bitmap == NULL) 6009 } else if (mddev->bitmap == NULL)
6012 return -ENOENT; /* cannot remove what isn't there */ 6010 return -ENOENT; /* cannot remove what isn't there */
6013 err = 0; 6011 err = 0;
6014 if (mddev->pers) { 6012 if (mddev->pers) {
6015 mddev->pers->quiesce(mddev, 1); 6013 mddev->pers->quiesce(mddev, 1);
6016 if (fd >= 0) { 6014 if (fd >= 0) {
6017 err = bitmap_create(mddev); 6015 err = bitmap_create(mddev);
6018 if (!err) 6016 if (!err)
6019 err = bitmap_load(mddev); 6017 err = bitmap_load(mddev);
6020 } 6018 }
6021 if (fd < 0 || err) { 6019 if (fd < 0 || err) {
6022 bitmap_destroy(mddev); 6020 bitmap_destroy(mddev);
6023 fd = -1; /* make sure to put the file */ 6021 fd = -1; /* make sure to put the file */
6024 } 6022 }
6025 mddev->pers->quiesce(mddev, 0); 6023 mddev->pers->quiesce(mddev, 0);
6026 } 6024 }
6027 if (fd < 0) { 6025 if (fd < 0) {
6028 if (mddev->bitmap_info.file) { 6026 if (mddev->bitmap_info.file) {
6029 restore_bitmap_write_access(mddev->bitmap_info.file); 6027 restore_bitmap_write_access(mddev->bitmap_info.file);
6030 fput(mddev->bitmap_info.file); 6028 fput(mddev->bitmap_info.file);
6031 } 6029 }
6032 mddev->bitmap_info.file = NULL; 6030 mddev->bitmap_info.file = NULL;
6033 } 6031 }
6034 6032
6035 return err; 6033 return err;
6036 } 6034 }
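For reference, a hedged userspace sketch of driving set_bitmap_file() above (paths and helper names are invented): SET_BITMAP_FILE takes a plain integer file descriptor rather than a pointer, which is why the compat handler below leaves its argument unconverted; a non-negative descriptor attaches a file-backed bitmap, -1 detaches it:

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int attach_bitmap(int md_fd, const char *path)
{
        int bfd = open(path, O_RDWR);

        if (bfd < 0)
                return -1;
        return ioctl(md_fd, SET_BITMAP_FILE, bfd);      /* fd >= 0: add */
}

static int detach_bitmap(int md_fd)
{
        return ioctl(md_fd, SET_BITMAP_FILE, -1);       /* fd < 0: remove */
}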
6037 6035
6038 /* 6036 /*
6039 * set_array_info is used two different ways 6037 * set_array_info is used two different ways
6040 * The original usage is when creating a new array. 6038 * The original usage is when creating a new array.
6041 * In this usage, raid_disks is > 0 and it together with 6039 * In this usage, raid_disks is > 0 and it together with
6042 * level, size, not_persistent, layout, chunk_size determine the 6040 * level, size, not_persistent, layout, chunk_size determine the
6043 * shape of the array. 6041 * shape of the array.
6044 * This will always create an array with a type-0.90.0 superblock. 6042 * This will always create an array with a type-0.90.0 superblock.
6045 * The newer usage is when assembling an array. 6043 * The newer usage is when assembling an array.
6046 * In this case raid_disks will be 0, and the major_version field is 6044 * In this case raid_disks will be 0, and the major_version field is
6047 * used to determine which style super-blocks are to be found on the devices. 6045 * used to determine which style super-blocks are to be found on the devices.
6048 * The minor and patch _version numbers are also kept in case the 6046 * The minor and patch _version numbers are also kept in case the
6049 * super_block handler wishes to interpret them. 6047 * super_block handler wishes to interpret them.
6050 */ 6048 */
6051 static int set_array_info(struct mddev * mddev, mdu_array_info_t *info) 6049 static int set_array_info(struct mddev * mddev, mdu_array_info_t *info)
6052 { 6050 {
6053 6051
6054 if (info->raid_disks == 0) { 6052 if (info->raid_disks == 0) {
6055 /* just setting version number for superblock loading */ 6053 /* just setting version number for superblock loading */
6056 if (info->major_version < 0 || 6054 if (info->major_version < 0 ||
6057 info->major_version >= ARRAY_SIZE(super_types) || 6055 info->major_version >= ARRAY_SIZE(super_types) ||
6058 super_types[info->major_version].name == NULL) { 6056 super_types[info->major_version].name == NULL) {
6059 /* maybe try to auto-load a module? */ 6057 /* maybe try to auto-load a module? */
6060 printk(KERN_INFO 6058 printk(KERN_INFO
6061 "md: superblock version %d not known\n", 6059 "md: superblock version %d not known\n",
6062 info->major_version); 6060 info->major_version);
6063 return -EINVAL; 6061 return -EINVAL;
6064 } 6062 }
6065 mddev->major_version = info->major_version; 6063 mddev->major_version = info->major_version;
6066 mddev->minor_version = info->minor_version; 6064 mddev->minor_version = info->minor_version;
6067 mddev->patch_version = info->patch_version; 6065 mddev->patch_version = info->patch_version;
6068 mddev->persistent = !info->not_persistent; 6066 mddev->persistent = !info->not_persistent;
6069 /* ensure mddev_put doesn't delete this now that there 6067 /* ensure mddev_put doesn't delete this now that there
6070 * is some minimal configuration. 6068 * is some minimal configuration.
6071 */ 6069 */
6072 mddev->ctime = get_seconds(); 6070 mddev->ctime = get_seconds();
6073 return 0; 6071 return 0;
6074 } 6072 }
6075 mddev->major_version = MD_MAJOR_VERSION; 6073 mddev->major_version = MD_MAJOR_VERSION;
6076 mddev->minor_version = MD_MINOR_VERSION; 6074 mddev->minor_version = MD_MINOR_VERSION;
6077 mddev->patch_version = MD_PATCHLEVEL_VERSION; 6075 mddev->patch_version = MD_PATCHLEVEL_VERSION;
6078 mddev->ctime = get_seconds(); 6076 mddev->ctime = get_seconds();
6079 6077
6080 mddev->level = info->level; 6078 mddev->level = info->level;
6081 mddev->clevel[0] = 0; 6079 mddev->clevel[0] = 0;
6082 mddev->dev_sectors = 2 * (sector_t)info->size; 6080 mddev->dev_sectors = 2 * (sector_t)info->size;
6083 mddev->raid_disks = info->raid_disks; 6081 mddev->raid_disks = info->raid_disks;
6084 /* don't set md_minor, it is determined by which /dev/md* was 6082 /* don't set md_minor, it is determined by which /dev/md* was
6085 * opened 6083 * opened
6086 */ 6084 */
6087 if (info->state & (1<<MD_SB_CLEAN)) 6085 if (info->state & (1<<MD_SB_CLEAN))
6088 mddev->recovery_cp = MaxSector; 6086 mddev->recovery_cp = MaxSector;
6089 else 6087 else
6090 mddev->recovery_cp = 0; 6088 mddev->recovery_cp = 0;
6091 mddev->persistent = ! info->not_persistent; 6089 mddev->persistent = ! info->not_persistent;
6092 mddev->external = 0; 6090 mddev->external = 0;
6093 6091
6094 mddev->layout = info->layout; 6092 mddev->layout = info->layout;
6095 mddev->chunk_sectors = info->chunk_size >> 9; 6093 mddev->chunk_sectors = info->chunk_size >> 9;
6096 6094
6097 mddev->max_disks = MD_SB_DISKS; 6095 mddev->max_disks = MD_SB_DISKS;
6098 6096
6099 if (mddev->persistent) 6097 if (mddev->persistent)
6100 mddev->flags = 0; 6098 mddev->flags = 0;
6101 set_bit(MD_CHANGE_DEVS, &mddev->flags); 6099 set_bit(MD_CHANGE_DEVS, &mddev->flags);
6102 6100
6103 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 6101 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
6104 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 6102 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
6105 mddev->bitmap_info.offset = 0; 6103 mddev->bitmap_info.offset = 0;
6106 6104
6107 mddev->reshape_position = MaxSector; 6105 mddev->reshape_position = MaxSector;
6108 6106
6109 /* 6107 /*
6110 * Generate a 128 bit UUID 6108 * Generate a 128 bit UUID
6111 */ 6109 */
6112 get_random_bytes(mddev->uuid, 16); 6110 get_random_bytes(mddev->uuid, 16);
6113 6111
6114 mddev->new_level = mddev->level; 6112 mddev->new_level = mddev->level;
6115 mddev->new_chunk_sectors = mddev->chunk_sectors; 6113 mddev->new_chunk_sectors = mddev->chunk_sectors;
6116 mddev->new_layout = mddev->layout; 6114 mddev->new_layout = mddev->layout;
6117 mddev->delta_disks = 0; 6115 mddev->delta_disks = 0;
6118 mddev->reshape_backwards = 0; 6116 mddev->reshape_backwards = 0;
6119 6117
6120 return 0; 6118 return 0;
6121 } 6119 }
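The two usages described in the comment above, sketched from the userspace side (values are illustrative, not from the source):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

/* Usage 1: create a new array; raid_disks > 0 shapes it (always v0.90). */
static int create_array(int md_fd)
{
        mdu_array_info_t info;

        memset(&info, 0, sizeof(info));
        info.level = 1;                 /* RAID1, purely illustrative */
        info.raid_disks = 2;
        info.size = 0;                  /* KiB; 0 lets md size it from the components */
        info.chunk_size = 64 * 1024;    /* bytes; only meaningful for striped levels */
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}

/* Usage 2: prepare for assembly; raid_disks == 0, only the versions matter. */
static int prepare_assembly(int md_fd)
{
        mdu_array_info_t info;

        memset(&info, 0, sizeof(info));
        info.major_version = 0;
        info.minor_version = 90;
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}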
6122 6120
6123 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors) 6121 void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
6124 { 6122 {
6125 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__); 6123 WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);
6126 6124
6127 if (mddev->external_size) 6125 if (mddev->external_size)
6128 return; 6126 return;
6129 6127
6130 mddev->array_sectors = array_sectors; 6128 mddev->array_sectors = array_sectors;
6131 } 6129 }
6132 EXPORT_SYMBOL(md_set_array_sectors); 6130 EXPORT_SYMBOL(md_set_array_sectors);
6133 6131
6134 static int update_size(struct mddev *mddev, sector_t num_sectors) 6132 static int update_size(struct mddev *mddev, sector_t num_sectors)
6135 { 6133 {
6136 struct md_rdev *rdev; 6134 struct md_rdev *rdev;
6137 int rv; 6135 int rv;
6138 int fit = (num_sectors == 0); 6136 int fit = (num_sectors == 0);
6139 6137
6140 if (mddev->pers->resize == NULL) 6138 if (mddev->pers->resize == NULL)
6141 return -EINVAL; 6139 return -EINVAL;
6142 /* The "num_sectors" is the number of sectors of each device that 6140 /* The "num_sectors" is the number of sectors of each device that
6143 * is used. This can only make sense for arrays with redundancy. 6141 * is used. This can only make sense for arrays with redundancy.
6144 * linear and raid0 always use whatever space is available. We can only 6142 * linear and raid0 always use whatever space is available. We can only
6145 * consider changing this number if no resync or reconstruction is 6143 * consider changing this number if no resync or reconstruction is
6146 * happening, and if the new size is acceptable. It must fit before the 6144 * happening, and if the new size is acceptable. It must fit before the
6147 * sb_start or, if that is <data_offset, it must fit before the size 6145 * sb_start or, if that is <data_offset, it must fit before the size
6148 * of each device. If num_sectors is zero, we find the largest size 6146 * of each device. If num_sectors is zero, we find the largest size
6149 * that fits. 6147 * that fits.
6150 */ 6148 */
6151 if (mddev->sync_thread) 6149 if (mddev->sync_thread)
6152 return -EBUSY; 6150 return -EBUSY;
6153 6151
6154 rdev_for_each(rdev, mddev) { 6152 rdev_for_each(rdev, mddev) {
6155 sector_t avail = rdev->sectors; 6153 sector_t avail = rdev->sectors;
6156 6154
6157 if (fit && (num_sectors == 0 || num_sectors > avail)) 6155 if (fit && (num_sectors == 0 || num_sectors > avail))
6158 num_sectors = avail; 6156 num_sectors = avail;
6159 if (avail < num_sectors) 6157 if (avail < num_sectors)
6160 return -ENOSPC; 6158 return -ENOSPC;
6161 } 6159 }
6162 rv = mddev->pers->resize(mddev, num_sectors); 6160 rv = mddev->pers->resize(mddev, num_sectors);
6163 if (!rv) 6161 if (!rv)
6164 revalidate_disk(mddev->gendisk); 6162 revalidate_disk(mddev->gendisk);
6165 return rv; 6163 return rv;
6166 } 6164 }
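Tying the comment in update_size() to its caller: the function is reached through update_array_info() when only the size field of a SET_ARRAY_INFO call differs from the current value. A hedged sketch of growing every member to the largest usable size (essentially a "grow to max" operation; the helper name is made up):

#include <sys/ioctl.h>
#include <linux/raid/md_u.h>

static int grow_to_max(int md_fd)
{
        mdu_array_info_t info;

        if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
                return -1;
        info.size = 0;  /* KiB; 0 asks update_size() for the largest fit */
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}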
6167 6165
6168 static int update_raid_disks(struct mddev *mddev, int raid_disks) 6166 static int update_raid_disks(struct mddev *mddev, int raid_disks)
6169 { 6167 {
6170 int rv; 6168 int rv;
6171 struct md_rdev *rdev; 6169 struct md_rdev *rdev;
6172 /* change the number of raid disks */ 6170 /* change the number of raid disks */
6173 if (mddev->pers->check_reshape == NULL) 6171 if (mddev->pers->check_reshape == NULL)
6174 return -EINVAL; 6172 return -EINVAL;
6175 if (raid_disks <= 0 || 6173 if (raid_disks <= 0 ||
6176 (mddev->max_disks && raid_disks >= mddev->max_disks)) 6174 (mddev->max_disks && raid_disks >= mddev->max_disks))
6177 return -EINVAL; 6175 return -EINVAL;
6178 if (mddev->sync_thread || mddev->reshape_position != MaxSector) 6176 if (mddev->sync_thread || mddev->reshape_position != MaxSector)
6179 return -EBUSY; 6177 return -EBUSY;
6180 6178
6181 rdev_for_each(rdev, mddev) { 6179 rdev_for_each(rdev, mddev) {
6182 if (mddev->raid_disks < raid_disks && 6180 if (mddev->raid_disks < raid_disks &&
6183 rdev->data_offset < rdev->new_data_offset) 6181 rdev->data_offset < rdev->new_data_offset)
6184 return -EINVAL; 6182 return -EINVAL;
6185 if (mddev->raid_disks > raid_disks && 6183 if (mddev->raid_disks > raid_disks &&
6186 rdev->data_offset > rdev->new_data_offset) 6184 rdev->data_offset > rdev->new_data_offset)
6187 return -EINVAL; 6185 return -EINVAL;
6188 } 6186 }
6189 6187
6190 mddev->delta_disks = raid_disks - mddev->raid_disks; 6188 mddev->delta_disks = raid_disks - mddev->raid_disks;
6191 if (mddev->delta_disks < 0) 6189 if (mddev->delta_disks < 0)
6192 mddev->reshape_backwards = 1; 6190 mddev->reshape_backwards = 1;
6193 else if (mddev->delta_disks > 0) 6191 else if (mddev->delta_disks > 0)
6194 mddev->reshape_backwards = 0; 6192 mddev->reshape_backwards = 0;
6195 6193
6196 rv = mddev->pers->check_reshape(mddev); 6194 rv = mddev->pers->check_reshape(mddev);
6197 if (rv < 0) { 6195 if (rv < 0) {
6198 mddev->delta_disks = 0; 6196 mddev->delta_disks = 0;
6199 mddev->reshape_backwards = 0; 6197 mddev->reshape_backwards = 0;
6200 } 6198 }
6201 return rv; 6199 return rv;
6202 } 6200 }
6203 6201
6204 6202
6205 /* 6203 /*
6206 * update_array_info is used to change the configuration of an 6204 * update_array_info is used to change the configuration of an
6207 * on-line array. 6205 * on-line array.
6208 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size 6206 * The version, ctime,level,size,raid_disks,not_persistent, layout,chunk_size
6209 * fields in the info are checked against the array. 6207 * fields in the info are checked against the array.
6210 * Any differences that cannot be handled will cause an error. 6208 * Any differences that cannot be handled will cause an error.
6211 * Normally, only one change can be managed at a time. 6209 * Normally, only one change can be managed at a time.
6212 */ 6210 */
6213 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) 6211 static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
6214 { 6212 {
6215 int rv = 0; 6213 int rv = 0;
6216 int cnt = 0; 6214 int cnt = 0;
6217 int state = 0; 6215 int state = 0;
6218 6216
6219 /* calculate expected state, ignoring low bits */ 6217 /* calculate expected state, ignoring low bits */
6220 if (mddev->bitmap && mddev->bitmap_info.offset) 6218 if (mddev->bitmap && mddev->bitmap_info.offset)
6221 state |= (1 << MD_SB_BITMAP_PRESENT); 6219 state |= (1 << MD_SB_BITMAP_PRESENT);
6222 6220
6223 if (mddev->major_version != info->major_version || 6221 if (mddev->major_version != info->major_version ||
6224 mddev->minor_version != info->minor_version || 6222 mddev->minor_version != info->minor_version ||
6225 /* mddev->patch_version != info->patch_version || */ 6223 /* mddev->patch_version != info->patch_version || */
6226 mddev->ctime != info->ctime || 6224 mddev->ctime != info->ctime ||
6227 mddev->level != info->level || 6225 mddev->level != info->level ||
6228 /* mddev->layout != info->layout || */ 6226 /* mddev->layout != info->layout || */
6229 !mddev->persistent != info->not_persistent|| 6227 !mddev->persistent != info->not_persistent||
6230 mddev->chunk_sectors != info->chunk_size >> 9 || 6228 mddev->chunk_sectors != info->chunk_size >> 9 ||
6231 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */ 6229 /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
6232 ((state^info->state) & 0xfffffe00) 6230 ((state^info->state) & 0xfffffe00)
6233 ) 6231 )
6234 return -EINVAL; 6232 return -EINVAL;
6235 /* Check there is only one change */ 6233 /* Check there is only one change */
6236 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6234 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6237 cnt++; 6235 cnt++;
6238 if (mddev->raid_disks != info->raid_disks) 6236 if (mddev->raid_disks != info->raid_disks)
6239 cnt++; 6237 cnt++;
6240 if (mddev->layout != info->layout) 6238 if (mddev->layout != info->layout)
6241 cnt++; 6239 cnt++;
6242 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) 6240 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
6243 cnt++; 6241 cnt++;
6244 if (cnt == 0) 6242 if (cnt == 0)
6245 return 0; 6243 return 0;
6246 if (cnt > 1) 6244 if (cnt > 1)
6247 return -EINVAL; 6245 return -EINVAL;
6248 6246
6249 if (mddev->layout != info->layout) { 6247 if (mddev->layout != info->layout) {
6250 /* Change layout 6248 /* Change layout
6251 * we don't need to do anything at the md level, the 6249 * we don't need to do anything at the md level, the
6252 * personality will take care of it all. 6250 * personality will take care of it all.
6253 */ 6251 */
6254 if (mddev->pers->check_reshape == NULL) 6252 if (mddev->pers->check_reshape == NULL)
6255 return -EINVAL; 6253 return -EINVAL;
6256 else { 6254 else {
6257 mddev->new_layout = info->layout; 6255 mddev->new_layout = info->layout;
6258 rv = mddev->pers->check_reshape(mddev); 6256 rv = mddev->pers->check_reshape(mddev);
6259 if (rv) 6257 if (rv)
6260 mddev->new_layout = mddev->layout; 6258 mddev->new_layout = mddev->layout;
6261 return rv; 6259 return rv;
6262 } 6260 }
6263 } 6261 }
6264 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size) 6262 if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
6265 rv = update_size(mddev, (sector_t)info->size * 2); 6263 rv = update_size(mddev, (sector_t)info->size * 2);
6266 6264
6267 if (mddev->raid_disks != info->raid_disks) 6265 if (mddev->raid_disks != info->raid_disks)
6268 rv = update_raid_disks(mddev, info->raid_disks); 6266 rv = update_raid_disks(mddev, info->raid_disks);
6269 6267
6270 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) { 6268 if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
6271 if (mddev->pers->quiesce == NULL) 6269 if (mddev->pers->quiesce == NULL)
6272 return -EINVAL; 6270 return -EINVAL;
6273 if (mddev->recovery || mddev->sync_thread) 6271 if (mddev->recovery || mddev->sync_thread)
6274 return -EBUSY; 6272 return -EBUSY;
6275 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) { 6273 if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
6276 /* add the bitmap */ 6274 /* add the bitmap */
6277 if (mddev->bitmap) 6275 if (mddev->bitmap)
6278 return -EEXIST; 6276 return -EEXIST;
6279 if (mddev->bitmap_info.default_offset == 0) 6277 if (mddev->bitmap_info.default_offset == 0)
6280 return -EINVAL; 6278 return -EINVAL;
6281 mddev->bitmap_info.offset = 6279 mddev->bitmap_info.offset =
6282 mddev->bitmap_info.default_offset; 6280 mddev->bitmap_info.default_offset;
6283 mddev->bitmap_info.space = 6281 mddev->bitmap_info.space =
6284 mddev->bitmap_info.default_space; 6282 mddev->bitmap_info.default_space;
6285 mddev->pers->quiesce(mddev, 1); 6283 mddev->pers->quiesce(mddev, 1);
6286 rv = bitmap_create(mddev); 6284 rv = bitmap_create(mddev);
6287 if (!rv) 6285 if (!rv)
6288 rv = bitmap_load(mddev); 6286 rv = bitmap_load(mddev);
6289 if (rv) 6287 if (rv)
6290 bitmap_destroy(mddev); 6288 bitmap_destroy(mddev);
6291 mddev->pers->quiesce(mddev, 0); 6289 mddev->pers->quiesce(mddev, 0);
6292 } else { 6290 } else {
6293 /* remove the bitmap */ 6291 /* remove the bitmap */
6294 if (!mddev->bitmap) 6292 if (!mddev->bitmap)
6295 return -ENOENT; 6293 return -ENOENT;
6296 if (mddev->bitmap->storage.file) 6294 if (mddev->bitmap->storage.file)
6297 return -EINVAL; 6295 return -EINVAL;
6298 mddev->pers->quiesce(mddev, 1); 6296 mddev->pers->quiesce(mddev, 1);
6299 bitmap_destroy(mddev); 6297 bitmap_destroy(mddev);
6300 mddev->pers->quiesce(mddev, 0); 6298 mddev->pers->quiesce(mddev, 0);
6301 mddev->bitmap_info.offset = 0; 6299 mddev->bitmap_info.offset = 0;
6302 } 6300 }
6303 } 6301 }
6304 md_update_sb(mddev, 1); 6302 md_update_sb(mddev, 1);
6305 return rv; 6303 return rv;
6306 } 6304 }
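As a worked example of the "only one change" rule above (a sketch, assuming the array's superblock format supports an internal bitmap; helper name invented): starting from the current GET_ARRAY_INFO state and flipping only MD_SB_BITMAP_PRESENT asks update_array_info() to add an internal bitmap, whereas also changing, say, the size in the same call would push cnt past 1 and return -EINVAL:

#include <sys/ioctl.h>
#include <linux/raid/md_p.h>    /* MD_SB_BITMAP_PRESENT */
#include <linux/raid/md_u.h>

static int add_internal_bitmap(int md_fd)
{
        mdu_array_info_t info;

        if (ioctl(md_fd, GET_ARRAY_INFO, &info) < 0)
                return -1;
        info.state |= (1 << MD_SB_BITMAP_PRESENT);      /* the single change */
        return ioctl(md_fd, SET_ARRAY_INFO, &info);
}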
6307 6305
6308 static int set_disk_faulty(struct mddev *mddev, dev_t dev) 6306 static int set_disk_faulty(struct mddev *mddev, dev_t dev)
6309 { 6307 {
6310 struct md_rdev *rdev; 6308 struct md_rdev *rdev;
6311 int err = 0; 6309 int err = 0;
6312 6310
6313 if (mddev->pers == NULL) 6311 if (mddev->pers == NULL)
6314 return -ENODEV; 6312 return -ENODEV;
6315 6313
6316 rcu_read_lock(); 6314 rcu_read_lock();
6317 rdev = find_rdev_rcu(mddev, dev); 6315 rdev = find_rdev_rcu(mddev, dev);
6318 if (!rdev) 6316 if (!rdev)
6319 err = -ENODEV; 6317 err = -ENODEV;
6320 else { 6318 else {
6321 md_error(mddev, rdev); 6319 md_error(mddev, rdev);
6322 if (!test_bit(Faulty, &rdev->flags)) 6320 if (!test_bit(Faulty, &rdev->flags))
6323 err = -EBUSY; 6321 err = -EBUSY;
6324 } 6322 }
6325 rcu_read_unlock(); 6323 rcu_read_unlock();
6326 return err; 6324 return err;
6327 } 6325 }
6328 6326
6329 /* 6327 /*
6330 * We have a problem here: there is no easy way to give a CHS 6328 * We have a problem here: there is no easy way to give a CHS
6331 * virtual geometry. We currently pretend that we have a 2 heads 6329 * virtual geometry. We currently pretend that we have a 2 heads
6332 * 4 sectors (with a BIG number of cylinders...). This drives 6330 * 4 sectors (with a BIG number of cylinders...). This drives
6333 * dosfs just mad... ;-) 6331 * dosfs just mad... ;-)
6334 */ 6332 */
6335 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo) 6333 static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
6336 { 6334 {
6337 struct mddev *mddev = bdev->bd_disk->private_data; 6335 struct mddev *mddev = bdev->bd_disk->private_data;
6338 6336
6339 geo->heads = 2; 6337 geo->heads = 2;
6340 geo->sectors = 4; 6338 geo->sectors = 4;
6341 geo->cylinders = mddev->array_sectors / 8; 6339 geo->cylinders = mddev->array_sectors / 8;
6342 return 0; 6340 return 0;
6343 } 6341 }
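For scale, a worked example of the made-up geometry above (my arithmetic, not from the source): a 1 TiB array is 2,147,483,648 512-byte sectors, so with 2 heads and 4 sectors per track md_getgeo() reports 2,147,483,648 / 8 = 268,435,456 cylinders, far beyond what any CHS-oriented tool such as dosfs expects.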
6344 6342
6345 static int md_ioctl(struct block_device *bdev, fmode_t mode, 6343 static int md_ioctl(struct block_device *bdev, fmode_t mode,
6346 unsigned int cmd, unsigned long arg) 6344 unsigned int cmd, unsigned long arg)
6347 { 6345 {
6348 int err = 0; 6346 int err = 0;
6349 void __user *argp = (void __user *)arg; 6347 void __user *argp = (void __user *)arg;
6350 struct mddev *mddev = NULL; 6348 struct mddev *mddev = NULL;
6351 int ro; 6349 int ro;
6352 6350
6353 switch (cmd) { 6351 switch (cmd) {
6354 case RAID_VERSION: 6352 case RAID_VERSION:
6355 case GET_ARRAY_INFO: 6353 case GET_ARRAY_INFO:
6356 case GET_DISK_INFO: 6354 case GET_DISK_INFO:
6357 break; 6355 break;
6358 default: 6356 default:
6359 if (!capable(CAP_SYS_ADMIN)) 6357 if (!capable(CAP_SYS_ADMIN))
6360 return -EACCES; 6358 return -EACCES;
6361 } 6359 }
6362 6360
6363 /* 6361 /*
6364 * Commands dealing with the RAID driver but not any 6362 * Commands dealing with the RAID driver but not any
6365 * particular array: 6363 * particular array:
6366 */ 6364 */
6367 switch (cmd) { 6365 switch (cmd) {
6368 case RAID_VERSION: 6366 case RAID_VERSION:
6369 err = get_version(argp); 6367 err = get_version(argp);
6370 goto done; 6368 goto done;
6371 6369
6372 case PRINT_RAID_DEBUG: 6370 case PRINT_RAID_DEBUG:
6373 err = 0; 6371 err = 0;
6374 md_print_devices(); 6372 md_print_devices();
6375 goto done; 6373 goto done;
6376 6374
6377 #ifndef MODULE 6375 #ifndef MODULE
6378 case RAID_AUTORUN: 6376 case RAID_AUTORUN:
6379 err = 0; 6377 err = 0;
6380 autostart_arrays(arg); 6378 autostart_arrays(arg);
6381 goto done; 6379 goto done;
6382 #endif 6380 #endif
6383 default:; 6381 default:;
6384 } 6382 }
6385 6383
6386 /* 6384 /*
6387 * Commands creating/starting a new array: 6385 * Commands creating/starting a new array:
6388 */ 6386 */
6389 6387
6390 mddev = bdev->bd_disk->private_data; 6388 mddev = bdev->bd_disk->private_data;
6391 6389
6392 if (!mddev) { 6390 if (!mddev) {
6393 BUG(); 6391 BUG();
6394 goto abort; 6392 goto abort;
6395 } 6393 }
6396 6394
6397 /* Some actions do not require the mutex */ 6395 /* Some actions do not require the mutex */
6398 switch (cmd) { 6396 switch (cmd) {
6399 case GET_ARRAY_INFO: 6397 case GET_ARRAY_INFO:
6400 if (!mddev->raid_disks && !mddev->external) 6398 if (!mddev->raid_disks && !mddev->external)
6401 err = -ENODEV; 6399 err = -ENODEV;
6402 else 6400 else
6403 err = get_array_info(mddev, argp); 6401 err = get_array_info(mddev, argp);
6404 goto abort; 6402 goto abort;
6405 6403
6406 case GET_DISK_INFO: 6404 case GET_DISK_INFO:
6407 if (!mddev->raid_disks && !mddev->external) 6405 if (!mddev->raid_disks && !mddev->external)
6408 err = -ENODEV; 6406 err = -ENODEV;
6409 else 6407 else
6410 err = get_disk_info(mddev, argp); 6408 err = get_disk_info(mddev, argp);
6411 goto abort; 6409 goto abort;
6412 6410
6413 case SET_DISK_FAULTY: 6411 case SET_DISK_FAULTY:
6414 err = set_disk_faulty(mddev, new_decode_dev(arg)); 6412 err = set_disk_faulty(mddev, new_decode_dev(arg));
6415 goto abort; 6413 goto abort;
6416 } 6414 }
6417 6415
6418 if (cmd == ADD_NEW_DISK) 6416 if (cmd == ADD_NEW_DISK)
6419 /* need to ensure md_delayed_delete() has completed */ 6417 /* need to ensure md_delayed_delete() has completed */
6420 flush_workqueue(md_misc_wq); 6418 flush_workqueue(md_misc_wq);
6421 6419
6422 err = mddev_lock(mddev); 6420 err = mddev_lock(mddev);
6423 if (err) { 6421 if (err) {
6424 printk(KERN_INFO 6422 printk(KERN_INFO
6425 "md: ioctl lock interrupted, reason %d, cmd %d\n", 6423 "md: ioctl lock interrupted, reason %d, cmd %d\n",
6426 err, cmd); 6424 err, cmd);
6427 goto abort; 6425 goto abort;
6428 } 6426 }
6429 6427
6430 if (cmd == SET_ARRAY_INFO) { 6428 if (cmd == SET_ARRAY_INFO) {
6431 mdu_array_info_t info; 6429 mdu_array_info_t info;
6432 if (!arg) 6430 if (!arg)
6433 memset(&info, 0, sizeof(info)); 6431 memset(&info, 0, sizeof(info));
6434 else if (copy_from_user(&info, argp, sizeof(info))) { 6432 else if (copy_from_user(&info, argp, sizeof(info))) {
6435 err = -EFAULT; 6433 err = -EFAULT;
6436 goto abort_unlock; 6434 goto abort_unlock;
6437 } 6435 }
6438 if (mddev->pers) { 6436 if (mddev->pers) {
6439 err = update_array_info(mddev, &info); 6437 err = update_array_info(mddev, &info);
6440 if (err) { 6438 if (err) {
6441 printk(KERN_WARNING "md: couldn't update" 6439 printk(KERN_WARNING "md: couldn't update"
6442 " array info. %d\n", err); 6440 " array info. %d\n", err);
6443 goto abort_unlock; 6441 goto abort_unlock;
6444 } 6442 }
6445 goto done_unlock; 6443 goto done_unlock;
6446 } 6444 }
6447 if (!list_empty(&mddev->disks)) { 6445 if (!list_empty(&mddev->disks)) {
6448 printk(KERN_WARNING 6446 printk(KERN_WARNING
6449 "md: array %s already has disks!\n", 6447 "md: array %s already has disks!\n",
6450 mdname(mddev)); 6448 mdname(mddev));
6451 err = -EBUSY; 6449 err = -EBUSY;
6452 goto abort_unlock; 6450 goto abort_unlock;
6453 } 6451 }
6454 if (mddev->raid_disks) { 6452 if (mddev->raid_disks) {
6455 printk(KERN_WARNING 6453 printk(KERN_WARNING
6456 "md: array %s already initialised!\n", 6454 "md: array %s already initialised!\n",
6457 mdname(mddev)); 6455 mdname(mddev));
6458 err = -EBUSY; 6456 err = -EBUSY;
6459 goto abort_unlock; 6457 goto abort_unlock;
6460 } 6458 }
6461 err = set_array_info(mddev, &info); 6459 err = set_array_info(mddev, &info);
6462 if (err) { 6460 if (err) {
6463 printk(KERN_WARNING "md: couldn't set" 6461 printk(KERN_WARNING "md: couldn't set"
6464 " array info. %d\n", err); 6462 " array info. %d\n", err);
6465 goto abort_unlock; 6463 goto abort_unlock;
6466 } 6464 }
6467 goto done_unlock; 6465 goto done_unlock;
6468 } 6466 }
6469 6467
6470 /* 6468 /*
6471 * Commands querying/configuring an existing array: 6469 * Commands querying/configuring an existing array:
6472 */ 6470 */
6473 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY, 6471 /* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
6474 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */ 6472 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
6475 if ((!mddev->raid_disks && !mddev->external) 6473 if ((!mddev->raid_disks && !mddev->external)
6476 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY 6474 && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
6477 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE 6475 && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
6478 && cmd != GET_BITMAP_FILE) { 6476 && cmd != GET_BITMAP_FILE) {
6479 err = -ENODEV; 6477 err = -ENODEV;
6480 goto abort_unlock; 6478 goto abort_unlock;
6481 } 6479 }
6482 6480
6483 /* 6481 /*
6484 * Commands even a read-only array can execute: 6482 * Commands even a read-only array can execute:
6485 */ 6483 */
6486 switch (cmd) { 6484 switch (cmd) {
6487 case GET_BITMAP_FILE: 6485 case GET_BITMAP_FILE:
6488 err = get_bitmap_file(mddev, argp); 6486 err = get_bitmap_file(mddev, argp);
6489 goto done_unlock; 6487 goto done_unlock;
6490 6488
6491 case RESTART_ARRAY_RW: 6489 case RESTART_ARRAY_RW:
6492 err = restart_array(mddev); 6490 err = restart_array(mddev);
6493 goto done_unlock; 6491 goto done_unlock;
6494 6492
6495 case STOP_ARRAY: 6493 case STOP_ARRAY:
6496 err = do_md_stop(mddev, 0, bdev); 6494 err = do_md_stop(mddev, 0, bdev);
6497 goto done_unlock; 6495 goto done_unlock;
6498 6496
6499 case STOP_ARRAY_RO: 6497 case STOP_ARRAY_RO:
6500 err = md_set_readonly(mddev, bdev); 6498 err = md_set_readonly(mddev, bdev);
6501 goto done_unlock; 6499 goto done_unlock;
6502 6500
6503 case HOT_REMOVE_DISK: 6501 case HOT_REMOVE_DISK:
6504 err = hot_remove_disk(mddev, new_decode_dev(arg)); 6502 err = hot_remove_disk(mddev, new_decode_dev(arg));
6505 goto done_unlock; 6503 goto done_unlock;
6506 6504
6507 case ADD_NEW_DISK: 6505 case ADD_NEW_DISK:
6508 /* We can support ADD_NEW_DISK on read-only arrays 6506 /* We can support ADD_NEW_DISK on read-only arrays
6509 * only if we are re-adding a preexisting device. 6507 * only if we are re-adding a preexisting device.
6510 * So require mddev->pers and MD_DISK_SYNC. 6508 * So require mddev->pers and MD_DISK_SYNC.
6511 */ 6509 */
6512 if (mddev->pers) { 6510 if (mddev->pers) {
6513 mdu_disk_info_t info; 6511 mdu_disk_info_t info;
6514 if (copy_from_user(&info, argp, sizeof(info))) 6512 if (copy_from_user(&info, argp, sizeof(info)))
6515 err = -EFAULT; 6513 err = -EFAULT;
6516 else if (!(info.state & (1<<MD_DISK_SYNC))) 6514 else if (!(info.state & (1<<MD_DISK_SYNC)))
6517 /* Need to clear read-only for this */ 6515 /* Need to clear read-only for this */
6518 break; 6516 break;
6519 else 6517 else
6520 err = add_new_disk(mddev, &info); 6518 err = add_new_disk(mddev, &info);
6521 goto done_unlock; 6519 goto done_unlock;
6522 } 6520 }
6523 break; 6521 break;
6524 6522
6525 case BLKROSET: 6523 case BLKROSET:
6526 if (get_user(ro, (int __user *)(arg))) { 6524 if (get_user(ro, (int __user *)(arg))) {
6527 err = -EFAULT; 6525 err = -EFAULT;
6528 goto done_unlock; 6526 goto done_unlock;
6529 } 6527 }
6530 err = -EINVAL; 6528 err = -EINVAL;
6531 6529
6532 /* if the bdev is going readonly the value of mddev->ro 6530 /* if the bdev is going readonly the value of mddev->ro
6533 * does not matter, no writes are coming 6531 * does not matter, no writes are coming
6534 */ 6532 */
6535 if (ro) 6533 if (ro)
6536 goto done_unlock; 6534 goto done_unlock;
6537 6535
6538 /* are we already prepared for writes? */ 6536 /* are we already prepared for writes? */
6539 if (mddev->ro != 1) 6537 if (mddev->ro != 1)
6540 goto done_unlock; 6538 goto done_unlock;
6541 6539
6542 /* transitioning to readauto need only happen for 6540 /* transitioning to readauto need only happen for
6543 * arrays that call md_write_start 6541 * arrays that call md_write_start
6544 */ 6542 */
6545 if (mddev->pers) { 6543 if (mddev->pers) {
6546 err = restart_array(mddev); 6544 err = restart_array(mddev);
6547 if (err == 0) { 6545 if (err == 0) {
6548 mddev->ro = 2; 6546 mddev->ro = 2;
6549 set_disk_ro(mddev->gendisk, 0); 6547 set_disk_ro(mddev->gendisk, 0);
6550 } 6548 }
6551 } 6549 }
6552 goto done_unlock; 6550 goto done_unlock;
6553 } 6551 }
6554 6552
6555 /* 6553 /*
6556 * The remaining ioctls are changing the state of the 6554 * The remaining ioctls are changing the state of the
6557 * superblock, so we do not allow them on read-only arrays. 6555 * superblock, so we do not allow them on read-only arrays.
6558 * However non-MD ioctls (e.g. get-size) will still come through 6556 * However non-MD ioctls (e.g. get-size) will still come through
6559 * here and hit the 'default' below, so only disallow 6557 * here and hit the 'default' below, so only disallow
6560 * 'md' ioctls, and switch to rw mode if started auto-readonly. 6558 * 'md' ioctls, and switch to rw mode if started auto-readonly.
6561 */ 6559 */
6562 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) { 6560 if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
6563 if (mddev->ro == 2) { 6561 if (mddev->ro == 2) {
6564 mddev->ro = 0; 6562 mddev->ro = 0;
6565 sysfs_notify_dirent_safe(mddev->sysfs_state); 6563 sysfs_notify_dirent_safe(mddev->sysfs_state);
6566 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6564 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6567 /* mddev_unlock will wake thread */ 6565 /* mddev_unlock will wake thread */
6568 /* If a device failed while we were read-only, we 6566 /* If a device failed while we were read-only, we
6569 * need to make sure the metadata is updated now. 6567 * need to make sure the metadata is updated now.
6570 */ 6568 */
6571 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 6569 if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
6572 mddev_unlock(mddev); 6570 mddev_unlock(mddev);
6573 wait_event(mddev->sb_wait, 6571 wait_event(mddev->sb_wait,
6574 !test_bit(MD_CHANGE_DEVS, &mddev->flags) && 6572 !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
6575 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 6573 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
6576 mddev_lock(mddev); 6574 mddev_lock(mddev);
6577 } 6575 }
6578 } else { 6576 } else {
6579 err = -EROFS; 6577 err = -EROFS;
6580 goto abort_unlock; 6578 goto abort_unlock;
6581 } 6579 }
6582 } 6580 }
6583 6581
6584 switch (cmd) { 6582 switch (cmd) {
6585 case ADD_NEW_DISK: 6583 case ADD_NEW_DISK:
6586 { 6584 {
6587 mdu_disk_info_t info; 6585 mdu_disk_info_t info;
6588 if (copy_from_user(&info, argp, sizeof(info))) 6586 if (copy_from_user(&info, argp, sizeof(info)))
6589 err = -EFAULT; 6587 err = -EFAULT;
6590 else 6588 else
6591 err = add_new_disk(mddev, &info); 6589 err = add_new_disk(mddev, &info);
6592 goto done_unlock; 6590 goto done_unlock;
6593 } 6591 }
6594 6592
6595 case HOT_ADD_DISK: 6593 case HOT_ADD_DISK:
6596 err = hot_add_disk(mddev, new_decode_dev(arg)); 6594 err = hot_add_disk(mddev, new_decode_dev(arg));
6597 goto done_unlock; 6595 goto done_unlock;
6598 6596
6599 case RUN_ARRAY: 6597 case RUN_ARRAY:
6600 err = do_md_run(mddev); 6598 err = do_md_run(mddev);
6601 goto done_unlock; 6599 goto done_unlock;
6602 6600
6603 case SET_BITMAP_FILE: 6601 case SET_BITMAP_FILE:
6604 err = set_bitmap_file(mddev, (int)arg); 6602 err = set_bitmap_file(mddev, (int)arg);
6605 goto done_unlock; 6603 goto done_unlock;
6606 6604
6607 default: 6605 default:
6608 err = -EINVAL; 6606 err = -EINVAL;
6609 goto abort_unlock; 6607 goto abort_unlock;
6610 } 6608 }
6611 6609
6612 done_unlock: 6610 done_unlock:
6613 abort_unlock: 6611 abort_unlock:
6614 if (mddev->hold_active == UNTIL_IOCTL && 6612 if (mddev->hold_active == UNTIL_IOCTL &&
6615 err != -EINVAL) 6613 err != -EINVAL)
6616 mddev->hold_active = 0; 6614 mddev->hold_active = 0;
6617 mddev_unlock(mddev); 6615 mddev_unlock(mddev);
6618 6616
6619 return err; 6617 return err;
6620 done: 6618 done:
6621 if (err) 6619 if (err)
6622 MD_BUG(); 6620 MD_BUG();
6623 abort: 6621 abort:
6624 return err; 6622 return err;
6625 } 6623 }
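Referring back to the BLKROSET branch of md_ioctl() above, a hedged userspace sketch of pushing an auto-read-only array back to writable (helper name invented): BLKROSET takes a pointer to an int, and ro == 0 is the only value this code acts on; going read-only is left to the block layer:

#include <sys/ioctl.h>
#include <linux/fs.h>   /* BLKROSET */

static int make_writable(int md_fd)
{
        int ro = 0;     /* 0: request read-write; md then calls restart_array() */

        return ioctl(md_fd, BLKROSET, &ro);
}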
6626 #ifdef CONFIG_COMPAT 6624 #ifdef CONFIG_COMPAT
6627 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode, 6625 static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
6628 unsigned int cmd, unsigned long arg) 6626 unsigned int cmd, unsigned long arg)
6629 { 6627 {
6630 switch (cmd) { 6628 switch (cmd) {
6631 case HOT_REMOVE_DISK: 6629 case HOT_REMOVE_DISK:
6632 case HOT_ADD_DISK: 6630 case HOT_ADD_DISK:
6633 case SET_DISK_FAULTY: 6631 case SET_DISK_FAULTY:
6634 case SET_BITMAP_FILE: 6632 case SET_BITMAP_FILE:
6635 /* These take in integer arg, do not convert */ 6633 /* These take in integer arg, do not convert */
6636 break; 6634 break;
6637 default: 6635 default:
6638 arg = (unsigned long)compat_ptr(arg); 6636 arg = (unsigned long)compat_ptr(arg);
6639 break; 6637 break;
6640 } 6638 }
6641 6639
6642 return md_ioctl(bdev, mode, cmd, arg); 6640 return md_ioctl(bdev, mode, cmd, arg);
6643 } 6641 }
6644 #endif /* CONFIG_COMPAT */ 6642 #endif /* CONFIG_COMPAT */
6645 6643
6646 static int md_open(struct block_device *bdev, fmode_t mode) 6644 static int md_open(struct block_device *bdev, fmode_t mode)
6647 { 6645 {
6648 /* 6646 /*
6649 * Succeed if we can lock the mddev, which confirms that 6647 * Succeed if we can lock the mddev, which confirms that
6650 * it isn't being stopped right now. 6648 * it isn't being stopped right now.
6651 */ 6649 */
6652 struct mddev *mddev = mddev_find(bdev->bd_dev); 6650 struct mddev *mddev = mddev_find(bdev->bd_dev);
6653 int err; 6651 int err;
6654 6652
6655 if (!mddev) 6653 if (!mddev)
6656 return -ENODEV; 6654 return -ENODEV;
6657 6655
6658 if (mddev->gendisk != bdev->bd_disk) { 6656 if (mddev->gendisk != bdev->bd_disk) {
6659 /* we are racing with mddev_put which is discarding this 6657 /* we are racing with mddev_put which is discarding this
6660 * bd_disk. 6658 * bd_disk.
6661 */ 6659 */
6662 mddev_put(mddev); 6660 mddev_put(mddev);
6663 /* Wait until bdev->bd_disk is definitely gone */ 6661 /* Wait until bdev->bd_disk is definitely gone */
6664 flush_workqueue(md_misc_wq); 6662 flush_workqueue(md_misc_wq);
6665 /* Then retry the open from the top */ 6663 /* Then retry the open from the top */
6666 return -ERESTARTSYS; 6664 return -ERESTARTSYS;
6667 } 6665 }
6668 BUG_ON(mddev != bdev->bd_disk->private_data); 6666 BUG_ON(mddev != bdev->bd_disk->private_data);
6669 6667
6670 if ((err = mutex_lock_interruptible(&mddev->open_mutex))) 6668 if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
6671 goto out; 6669 goto out;
6672 6670
6673 err = 0; 6671 err = 0;
6674 atomic_inc(&mddev->openers); 6672 atomic_inc(&mddev->openers);
6675 mutex_unlock(&mddev->open_mutex); 6673 mutex_unlock(&mddev->open_mutex);
6676 6674
6677 check_disk_change(bdev); 6675 check_disk_change(bdev);
6678 out: 6676 out:
6679 return err; 6677 return err;
6680 } 6678 }
6681 6679
6682 static int md_release(struct gendisk *disk, fmode_t mode) 6680 static int md_release(struct gendisk *disk, fmode_t mode)
6683 { 6681 {
6684 struct mddev *mddev = disk->private_data; 6682 struct mddev *mddev = disk->private_data;
6685 6683
6686 BUG_ON(!mddev); 6684 BUG_ON(!mddev);
6687 atomic_dec(&mddev->openers); 6685 atomic_dec(&mddev->openers);
6688 mddev_put(mddev); 6686 mddev_put(mddev);
6689 6687
6690 return 0; 6688 return 0;
6691 } 6689 }
6692 6690
6693 static int md_media_changed(struct gendisk *disk) 6691 static int md_media_changed(struct gendisk *disk)
6694 { 6692 {
6695 struct mddev *mddev = disk->private_data; 6693 struct mddev *mddev = disk->private_data;
6696 6694
6697 return mddev->changed; 6695 return mddev->changed;
6698 } 6696 }
6699 6697
6700 static int md_revalidate(struct gendisk *disk) 6698 static int md_revalidate(struct gendisk *disk)
6701 { 6699 {
6702 struct mddev *mddev = disk->private_data; 6700 struct mddev *mddev = disk->private_data;
6703 6701
6704 mddev->changed = 0; 6702 mddev->changed = 0;
6705 return 0; 6703 return 0;
6706 } 6704 }
6707 static const struct block_device_operations md_fops = 6705 static const struct block_device_operations md_fops =
6708 { 6706 {
6709 .owner = THIS_MODULE, 6707 .owner = THIS_MODULE,
6710 .open = md_open, 6708 .open = md_open,
6711 .release = md_release, 6709 .release = md_release,
6712 .ioctl = md_ioctl, 6710 .ioctl = md_ioctl,
6713 #ifdef CONFIG_COMPAT 6711 #ifdef CONFIG_COMPAT
6714 .compat_ioctl = md_compat_ioctl, 6712 .compat_ioctl = md_compat_ioctl,
6715 #endif 6713 #endif
6716 .getgeo = md_getgeo, 6714 .getgeo = md_getgeo,
6717 .media_changed = md_media_changed, 6715 .media_changed = md_media_changed,
6718 .revalidate_disk= md_revalidate, 6716 .revalidate_disk= md_revalidate,
6719 }; 6717 };
6720 6718
6721 static int md_thread(void * arg) 6719 static int md_thread(void * arg)
6722 { 6720 {
6723 struct md_thread *thread = arg; 6721 struct md_thread *thread = arg;
6724 6722
6725 /* 6723 /*
6726 * md_thread is a 'system-thread', its priority should be very 6724 * md_thread is a 'system-thread', its priority should be very
6727 * high. We avoid resource deadlocks individually in each 6725 * high. We avoid resource deadlocks individually in each
6728 * raid personality. (RAID5 does preallocation) We also use RR and 6726 * raid personality. (RAID5 does preallocation) We also use RR and
6729 * the very same RT priority as kswapd, thus we will never get 6727 * the very same RT priority as kswapd, thus we will never get
6730 * into a priority inversion deadlock. 6728 * into a priority inversion deadlock.
6731 * 6729 *
6732 * we definitely have to have equal or higher priority than 6730 * we definitely have to have equal or higher priority than
6733 * bdflush, otherwise bdflush will deadlock if there are too 6731 * bdflush, otherwise bdflush will deadlock if there are too
6734 * many dirty RAID5 blocks. 6732 * many dirty RAID5 blocks.
6735 */ 6733 */
6736 6734
6737 allow_signal(SIGKILL); 6735 allow_signal(SIGKILL);
6738 while (!kthread_should_stop()) { 6736 while (!kthread_should_stop()) {
6739 6737
6740 /* We need to wait INTERRUPTIBLE so that 6738 /* We need to wait INTERRUPTIBLE so that
6741 * we don't add to the load-average. 6739 * we don't add to the load-average.
6742 * That means we need to be sure no signals are 6740 * That means we need to be sure no signals are
6743 * pending 6741 * pending
6744 */ 6742 */
6745 if (signal_pending(current)) 6743 if (signal_pending(current))
6746 flush_signals(current); 6744 flush_signals(current);
6747 6745
6748 wait_event_interruptible_timeout 6746 wait_event_interruptible_timeout
6749 (thread->wqueue, 6747 (thread->wqueue,
6750 test_bit(THREAD_WAKEUP, &thread->flags) 6748 test_bit(THREAD_WAKEUP, &thread->flags)
6751 || kthread_should_stop(), 6749 || kthread_should_stop(),
6752 thread->timeout); 6750 thread->timeout);
6753 6751
6754 clear_bit(THREAD_WAKEUP, &thread->flags); 6752 clear_bit(THREAD_WAKEUP, &thread->flags);
6755 if (!kthread_should_stop()) 6753 if (!kthread_should_stop())
6756 thread->run(thread); 6754 thread->run(thread);
6757 } 6755 }
6758 6756
6759 return 0; 6757 return 0;
6760 } 6758 }
6761 6759
6762 void md_wakeup_thread(struct md_thread *thread) 6760 void md_wakeup_thread(struct md_thread *thread)
6763 { 6761 {
6764 if (thread) { 6762 if (thread) {
6765 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 6763 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6766 set_bit(THREAD_WAKEUP, &thread->flags); 6764 set_bit(THREAD_WAKEUP, &thread->flags);
6767 wake_up(&thread->wqueue); 6765 wake_up(&thread->wqueue);
6768 } 6766 }
6769 } 6767 }
6770 6768
6771 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6769 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6772 struct mddev *mddev, const char *name) 6770 struct mddev *mddev, const char *name)
6773 { 6771 {
6774 struct md_thread *thread; 6772 struct md_thread *thread;
6775 6773
6776 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 6774 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
6777 if (!thread) 6775 if (!thread)
6778 return NULL; 6776 return NULL;
6779 6777
6780 init_waitqueue_head(&thread->wqueue); 6778 init_waitqueue_head(&thread->wqueue);
6781 6779
6782 thread->run = run; 6780 thread->run = run;
6783 thread->mddev = mddev; 6781 thread->mddev = mddev;
6784 thread->timeout = MAX_SCHEDULE_TIMEOUT; 6782 thread->timeout = MAX_SCHEDULE_TIMEOUT;
6785 thread->tsk = kthread_run(md_thread, thread, 6783 thread->tsk = kthread_run(md_thread, thread,
6786 "%s_%s", 6784 "%s_%s",
6787 mdname(thread->mddev), 6785 mdname(thread->mddev),
6788 name); 6786 name);
6789 if (IS_ERR(thread->tsk)) { 6787 if (IS_ERR(thread->tsk)) {
6790 kfree(thread); 6788 kfree(thread);
6791 return NULL; 6789 return NULL;
6792 } 6790 }
6793 return thread; 6791 return thread;
6794 } 6792 }
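A brief sketch of how a personality is expected to use this thread API (the "myraid" names are illustrative; the real users live in raid1.c, raid5.c and friends):

/* Work function: md_thread() calls this each time THREAD_WAKEUP is set. */
static void myraid_worker(struct md_thread *thread)
{
        struct mddev *mddev = thread->mddev;

        /* drain whatever work the personality queued for this mddev ... */
        md_check_recovery(mddev);       /* typical: let md schedule recovery too */
}

static int myraid_run(struct mddev *mddev)
{
        /* ... personality set-up ... */
        mddev->thread = md_register_thread(myraid_worker, mddev, "myraid");
        if (!mddev->thread)
                return -ENOMEM;
        return 0;
}

static int myraid_stop(struct mddev *mddev)
{
        md_unregister_thread(&mddev->thread);
        return 0;
}

/* and from I/O or completion paths, whenever new work is queued:
 *      md_wakeup_thread(mddev->thread);
 */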
6795 6793
6796 void md_unregister_thread(struct md_thread **threadp) 6794 void md_unregister_thread(struct md_thread **threadp)
6797 { 6795 {
6798 struct md_thread *thread = *threadp; 6796 struct md_thread *thread = *threadp;
6799 if (!thread) 6797 if (!thread)
6800 return; 6798 return;
6801 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6799 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6802 /* Locking ensures that mddev_unlock does not wake_up a 6800 /* Locking ensures that mddev_unlock does not wake_up a
6803 * non-existent thread 6801 * non-existent thread
6804 */ 6802 */
6805 spin_lock(&pers_lock); 6803 spin_lock(&pers_lock);
6806 *threadp = NULL; 6804 *threadp = NULL;
6807 spin_unlock(&pers_lock); 6805 spin_unlock(&pers_lock);
6808 6806
6809 kthread_stop(thread->tsk); 6807 kthread_stop(thread->tsk);
6810 kfree(thread); 6808 kfree(thread);
6811 } 6809 }
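
For context, a minimal sketch of how a raid personality would use this thread API (the callback, the 'conf' structure and the thread name below are made up for illustration; the real users are raid1.c, raid5.c and friends):

    static void example_raid_thread(struct md_thread *thread)
    {
            struct mddev *mddev = thread->mddev;

            /* drain whatever work md_wakeup_thread() signalled */
    }

    /* in the personality's ->run() setup */
    conf->thread = md_register_thread(example_raid_thread, mddev, "example");
    if (!conf->thread)
            goto abort;

    /* whenever new work has been queued */
    md_wakeup_thread(conf->thread);

    /* on shutdown; clears the pointer under pers_lock before stopping the task */
    md_unregister_thread(&conf->thread);
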
6812 6810
6813 void md_error(struct mddev *mddev, struct md_rdev *rdev) 6811 void md_error(struct mddev *mddev, struct md_rdev *rdev)
6814 { 6812 {
6815 if (!mddev) { 6813 if (!mddev) {
6816 MD_BUG(); 6814 MD_BUG();
6817 return; 6815 return;
6818 } 6816 }
6819 6817
6820 if (!rdev || test_bit(Faulty, &rdev->flags)) 6818 if (!rdev || test_bit(Faulty, &rdev->flags))
6821 return; 6819 return;
6822 6820
6823 if (!mddev->pers || !mddev->pers->error_handler) 6821 if (!mddev->pers || !mddev->pers->error_handler)
6824 return; 6822 return;
6825 mddev->pers->error_handler(mddev,rdev); 6823 mddev->pers->error_handler(mddev,rdev);
6826 if (mddev->degraded) 6824 if (mddev->degraded)
6827 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6825 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6828 sysfs_notify_dirent_safe(rdev->sysfs_state); 6826 sysfs_notify_dirent_safe(rdev->sysfs_state);
6829 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6827 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6830 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6828 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6831 md_wakeup_thread(mddev->thread); 6829 md_wakeup_thread(mddev->thread);
6832 if (mddev->event_work.func) 6830 if (mddev->event_work.func)
6833 queue_work(md_misc_wq, &mddev->event_work); 6831 queue_work(md_misc_wq, &mddev->event_work);
6834 md_new_event_inintr(mddev); 6832 md_new_event_inintr(mddev);
6835 } 6833 }
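
As a usage sketch (hypothetical names, modelled on how personalities report failures from a write-completion path; in the real drivers bi_private points at a per-request structure rather than the rdev itself):

    static void example_end_write_request(struct bio *bio, int error)
    {
            struct md_rdev *rdev = bio->bi_private;

            if (error)
                    md_error(rdev->mddev, rdev);    /* mark the member faulty, kick recovery */
            bio_put(bio);
    }
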
6836 6834
6837 /* seq_file implementation /proc/mdstat */ 6835 /* seq_file implementation /proc/mdstat */
6838 6836
6839 static void status_unused(struct seq_file *seq) 6837 static void status_unused(struct seq_file *seq)
6840 { 6838 {
6841 int i = 0; 6839 int i = 0;
6842 struct md_rdev *rdev; 6840 struct md_rdev *rdev;
6843 6841
6844 seq_printf(seq, "unused devices: "); 6842 seq_printf(seq, "unused devices: ");
6845 6843
6846 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 6844 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6847 char b[BDEVNAME_SIZE]; 6845 char b[BDEVNAME_SIZE];
6848 i++; 6846 i++;
6849 seq_printf(seq, "%s ", 6847 seq_printf(seq, "%s ",
6850 bdevname(rdev->bdev,b)); 6848 bdevname(rdev->bdev,b));
6851 } 6849 }
6852 if (!i) 6850 if (!i)
6853 seq_printf(seq, "<none>"); 6851 seq_printf(seq, "<none>");
6854 6852
6855 seq_printf(seq, "\n"); 6853 seq_printf(seq, "\n");
6856 } 6854 }
6857 6855
6858 6856
6859 static void status_resync(struct seq_file *seq, struct mddev * mddev) 6857 static void status_resync(struct seq_file *seq, struct mddev * mddev)
6860 { 6858 {
6861 sector_t max_sectors, resync, res; 6859 sector_t max_sectors, resync, res;
6862 unsigned long dt, db; 6860 unsigned long dt, db;
6863 sector_t rt; 6861 sector_t rt;
6864 int scale; 6862 int scale;
6865 unsigned int per_milli; 6863 unsigned int per_milli;
6866 6864
6867 if (mddev->curr_resync <= 3) 6865 if (mddev->curr_resync <= 3)
6868 resync = 0; 6866 resync = 0;
6869 else 6867 else
6870 resync = mddev->curr_resync 6868 resync = mddev->curr_resync
6871 - atomic_read(&mddev->recovery_active); 6869 - atomic_read(&mddev->recovery_active);
6872 6870
6873 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6871 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6874 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6872 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6875 max_sectors = mddev->resync_max_sectors; 6873 max_sectors = mddev->resync_max_sectors;
6876 else 6874 else
6877 max_sectors = mddev->dev_sectors; 6875 max_sectors = mddev->dev_sectors;
6878 6876
6879 /* 6877 /*
6880 * Should not happen. 6878 * Should not happen.
6881 */ 6879 */
6882 if (!max_sectors) { 6880 if (!max_sectors) {
6883 MD_BUG(); 6881 MD_BUG();
6884 return; 6882 return;
6885 } 6883 }
6886 /* Pick 'scale' such that (resync>>scale)*1000 will fit 6884 /* Pick 'scale' such that (resync>>scale)*1000 will fit
6887 * in a sector_t, and (max_sectors>>scale) will fit in a 6885 * in a sector_t, and (max_sectors>>scale) will fit in a
6888 * u32, as those are the requirements for sector_div. 6886 * u32, as those are the requirements for sector_div.
6889 * Thus 'scale' must be at least 10 6887 * Thus 'scale' must be at least 10
6890 */ 6888 */
6891 scale = 10; 6889 scale = 10;
6892 if (sizeof(sector_t) > sizeof(unsigned long)) { 6890 if (sizeof(sector_t) > sizeof(unsigned long)) {
6893 while ( max_sectors/2 > (1ULL<<(scale+32))) 6891 while ( max_sectors/2 > (1ULL<<(scale+32)))
6894 scale++; 6892 scale++;
6895 } 6893 }
6896 res = (resync>>scale)*1000; 6894 res = (resync>>scale)*1000;
6897 sector_div(res, (u32)((max_sectors>>scale)+1)); 6895 sector_div(res, (u32)((max_sectors>>scale)+1));
6898 6896
6899 per_milli = res; 6897 per_milli = res;
6900 { 6898 {
6901 int i, x = per_milli/50, y = 20-x; 6899 int i, x = per_milli/50, y = 20-x;
6902 seq_printf(seq, "["); 6900 seq_printf(seq, "[");
6903 for (i = 0; i < x; i++) 6901 for (i = 0; i < x; i++)
6904 seq_printf(seq, "="); 6902 seq_printf(seq, "=");
6905 seq_printf(seq, ">"); 6903 seq_printf(seq, ">");
6906 for (i = 0; i < y; i++) 6904 for (i = 0; i < y; i++)
6907 seq_printf(seq, "."); 6905 seq_printf(seq, ".");
6908 seq_printf(seq, "] "); 6906 seq_printf(seq, "] ");
6909 } 6907 }
6910 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 6908 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6911 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 6909 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6912 "reshape" : 6910 "reshape" :
6913 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 6911 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6914 "check" : 6912 "check" :
6915 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 6913 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6916 "resync" : "recovery"))), 6914 "resync" : "recovery"))),
6917 per_milli/10, per_milli % 10, 6915 per_milli/10, per_milli % 10,
6918 (unsigned long long) resync/2, 6916 (unsigned long long) resync/2,
6919 (unsigned long long) max_sectors/2); 6917 (unsigned long long) max_sectors/2);
6920 6918
6921 /* 6919 /*
6922 * dt: time from mark until now 6920 * dt: time from mark until now
6923 * db: blocks written from mark until now 6921 * db: blocks written from mark until now
6924 * rt: remaining time 6922 * rt: remaining time
6925 * 6923 *
6926 * rt is a sector_t, so could be 32bit or 64bit. 6924 * rt is a sector_t, so could be 32bit or 64bit.
6927 * So we divide before multiply in case it is 32bit and close 6925 * So we divide before multiply in case it is 32bit and close
6928 * to the limit. 6926 * to the limit.
6929 * We scale the divisor (db) by 32 to avoid losing precision 6927 * We scale the divisor (db) by 32 to avoid losing precision
6930 * near the end of resync when the number of remaining sectors 6928 * near the end of resync when the number of remaining sectors
6931 * is close to 'db'. 6929 * is close to 'db'.
6932 * We then divide rt by 32 after multiplying by db to compensate. 6930 * We then divide rt by 32 after multiplying by db to compensate.
6933 * The '+1' avoids division by zero if db is very small. 6931 * The '+1' avoids division by zero if db is very small.
6934 */ 6932 */
6935 dt = ((jiffies - mddev->resync_mark) / HZ); 6933 dt = ((jiffies - mddev->resync_mark) / HZ);
6936 if (!dt) dt++; 6934 if (!dt) dt++;
6937 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 6935 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6938 - mddev->resync_mark_cnt; 6936 - mddev->resync_mark_cnt;
6939 6937
6940 rt = max_sectors - resync; /* number of remaining sectors */ 6938 rt = max_sectors - resync; /* number of remaining sectors */
6941 sector_div(rt, db/32+1); 6939 sector_div(rt, db/32+1);
6942 rt *= dt; 6940 rt *= dt;
6943 rt >>= 5; 6941 rt >>= 5;
6944 6942
6945 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 6943 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6946 ((unsigned long)rt % 60)/6); 6944 ((unsigned long)rt % 60)/6);
6947 6945
6948 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 6946 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6949 } 6947 }
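
A stand-alone illustration of the progress arithmetic above with made-up sizes; it only restates what status_resync() computes, to show how 'scale' keeps both operands small enough for sector_div():

    #include <stdio.h>

    int main(void)
    {
            unsigned long long max_sectors = 3907029168ULL; /* array size in sectors */
            unsigned long long resync      = 1953514584ULL; /* roughly half done     */
            int scale = 10;

            while (max_sectors / 2 > (1ULL << (scale + 32)))
                    scale++;                                /* same rule as above    */

            unsigned long long res = (resync >> scale) * 1000;
            unsigned int per_milli = res / ((max_sectors >> scale) + 1);

            printf("%u.%u%%\n", per_milli / 10, per_milli % 10);    /* prints 50.0% */
            return 0;
    }
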
6950 6948
6951 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 6949 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6952 { 6950 {
6953 struct list_head *tmp; 6951 struct list_head *tmp;
6954 loff_t l = *pos; 6952 loff_t l = *pos;
6955 struct mddev *mddev; 6953 struct mddev *mddev;
6956 6954
6957 if (l >= 0x10000) 6955 if (l >= 0x10000)
6958 return NULL; 6956 return NULL;
6959 if (!l--) 6957 if (!l--)
6960 /* header */ 6958 /* header */
6961 return (void*)1; 6959 return (void*)1;
6962 6960
6963 spin_lock(&all_mddevs_lock); 6961 spin_lock(&all_mddevs_lock);
6964 list_for_each(tmp,&all_mddevs) 6962 list_for_each(tmp,&all_mddevs)
6965 if (!l--) { 6963 if (!l--) {
6966 mddev = list_entry(tmp, struct mddev, all_mddevs); 6964 mddev = list_entry(tmp, struct mddev, all_mddevs);
6967 mddev_get(mddev); 6965 mddev_get(mddev);
6968 spin_unlock(&all_mddevs_lock); 6966 spin_unlock(&all_mddevs_lock);
6969 return mddev; 6967 return mddev;
6970 } 6968 }
6971 spin_unlock(&all_mddevs_lock); 6969 spin_unlock(&all_mddevs_lock);
6972 if (!l--) 6970 if (!l--)
6973 return (void*)2;/* tail */ 6971 return (void*)2;/* tail */
6974 return NULL; 6972 return NULL;
6975 } 6973 }
6976 6974
6977 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 6975 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
6978 { 6976 {
6979 struct list_head *tmp; 6977 struct list_head *tmp;
6980 struct mddev *next_mddev, *mddev = v; 6978 struct mddev *next_mddev, *mddev = v;
6981 6979
6982 ++*pos; 6980 ++*pos;
6983 if (v == (void*)2) 6981 if (v == (void*)2)
6984 return NULL; 6982 return NULL;
6985 6983
6986 spin_lock(&all_mddevs_lock); 6984 spin_lock(&all_mddevs_lock);
6987 if (v == (void*)1) 6985 if (v == (void*)1)
6988 tmp = all_mddevs.next; 6986 tmp = all_mddevs.next;
6989 else 6987 else
6990 tmp = mddev->all_mddevs.next; 6988 tmp = mddev->all_mddevs.next;
6991 if (tmp != &all_mddevs) 6989 if (tmp != &all_mddevs)
6992 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 6990 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
6993 else { 6991 else {
6994 next_mddev = (void*)2; 6992 next_mddev = (void*)2;
6995 *pos = 0x10000; 6993 *pos = 0x10000;
6996 } 6994 }
6997 spin_unlock(&all_mddevs_lock); 6995 spin_unlock(&all_mddevs_lock);
6998 6996
6999 if (v != (void*)1) 6997 if (v != (void*)1)
7000 mddev_put(mddev); 6998 mddev_put(mddev);
7001 return next_mddev; 6999 return next_mddev;
7002 7000
7003 } 7001 }
7004 7002
7005 static void md_seq_stop(struct seq_file *seq, void *v) 7003 static void md_seq_stop(struct seq_file *seq, void *v)
7006 { 7004 {
7007 struct mddev *mddev = v; 7005 struct mddev *mddev = v;
7008 7006
7009 if (mddev && v != (void*)1 && v != (void*)2) 7007 if (mddev && v != (void*)1 && v != (void*)2)
7010 mddev_put(mddev); 7008 mddev_put(mddev);
7011 } 7009 }
7012 7010
7013 static int md_seq_show(struct seq_file *seq, void *v) 7011 static int md_seq_show(struct seq_file *seq, void *v)
7014 { 7012 {
7015 struct mddev *mddev = v; 7013 struct mddev *mddev = v;
7016 sector_t sectors; 7014 sector_t sectors;
7017 struct md_rdev *rdev; 7015 struct md_rdev *rdev;
7018 7016
7019 if (v == (void*)1) { 7017 if (v == (void*)1) {
7020 struct md_personality *pers; 7018 struct md_personality *pers;
7021 seq_printf(seq, "Personalities : "); 7019 seq_printf(seq, "Personalities : ");
7022 spin_lock(&pers_lock); 7020 spin_lock(&pers_lock);
7023 list_for_each_entry(pers, &pers_list, list) 7021 list_for_each_entry(pers, &pers_list, list)
7024 seq_printf(seq, "[%s] ", pers->name); 7022 seq_printf(seq, "[%s] ", pers->name);
7025 7023
7026 spin_unlock(&pers_lock); 7024 spin_unlock(&pers_lock);
7027 seq_printf(seq, "\n"); 7025 seq_printf(seq, "\n");
7028 seq->poll_event = atomic_read(&md_event_count); 7026 seq->poll_event = atomic_read(&md_event_count);
7029 return 0; 7027 return 0;
7030 } 7028 }
7031 if (v == (void*)2) { 7029 if (v == (void*)2) {
7032 status_unused(seq); 7030 status_unused(seq);
7033 return 0; 7031 return 0;
7034 } 7032 }
7035 7033
7036 if (mddev_lock(mddev) < 0) 7034 if (mddev_lock(mddev) < 0)
7037 return -EINTR; 7035 return -EINTR;
7038 7036
7039 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7037 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7040 seq_printf(seq, "%s : %sactive", mdname(mddev), 7038 seq_printf(seq, "%s : %sactive", mdname(mddev),
7041 mddev->pers ? "" : "in"); 7039 mddev->pers ? "" : "in");
7042 if (mddev->pers) { 7040 if (mddev->pers) {
7043 if (mddev->ro==1) 7041 if (mddev->ro==1)
7044 seq_printf(seq, " (read-only)"); 7042 seq_printf(seq, " (read-only)");
7045 if (mddev->ro==2) 7043 if (mddev->ro==2)
7046 seq_printf(seq, " (auto-read-only)"); 7044 seq_printf(seq, " (auto-read-only)");
7047 seq_printf(seq, " %s", mddev->pers->name); 7045 seq_printf(seq, " %s", mddev->pers->name);
7048 } 7046 }
7049 7047
7050 sectors = 0; 7048 sectors = 0;
7051 rdev_for_each(rdev, mddev) { 7049 rdev_for_each(rdev, mddev) {
7052 char b[BDEVNAME_SIZE]; 7050 char b[BDEVNAME_SIZE];
7053 seq_printf(seq, " %s[%d]", 7051 seq_printf(seq, " %s[%d]",
7054 bdevname(rdev->bdev,b), rdev->desc_nr); 7052 bdevname(rdev->bdev,b), rdev->desc_nr);
7055 if (test_bit(WriteMostly, &rdev->flags)) 7053 if (test_bit(WriteMostly, &rdev->flags))
7056 seq_printf(seq, "(W)"); 7054 seq_printf(seq, "(W)");
7057 if (test_bit(Faulty, &rdev->flags)) { 7055 if (test_bit(Faulty, &rdev->flags)) {
7058 seq_printf(seq, "(F)"); 7056 seq_printf(seq, "(F)");
7059 continue; 7057 continue;
7060 } 7058 }
7061 if (rdev->raid_disk < 0) 7059 if (rdev->raid_disk < 0)
7062 seq_printf(seq, "(S)"); /* spare */ 7060 seq_printf(seq, "(S)"); /* spare */
7063 if (test_bit(Replacement, &rdev->flags)) 7061 if (test_bit(Replacement, &rdev->flags))
7064 seq_printf(seq, "(R)"); 7062 seq_printf(seq, "(R)");
7065 sectors += rdev->sectors; 7063 sectors += rdev->sectors;
7066 } 7064 }
7067 7065
7068 if (!list_empty(&mddev->disks)) { 7066 if (!list_empty(&mddev->disks)) {
7069 if (mddev->pers) 7067 if (mddev->pers)
7070 seq_printf(seq, "\n %llu blocks", 7068 seq_printf(seq, "\n %llu blocks",
7071 (unsigned long long) 7069 (unsigned long long)
7072 mddev->array_sectors / 2); 7070 mddev->array_sectors / 2);
7073 else 7071 else
7074 seq_printf(seq, "\n %llu blocks", 7072 seq_printf(seq, "\n %llu blocks",
7075 (unsigned long long)sectors / 2); 7073 (unsigned long long)sectors / 2);
7076 } 7074 }
7077 if (mddev->persistent) { 7075 if (mddev->persistent) {
7078 if (mddev->major_version != 0 || 7076 if (mddev->major_version != 0 ||
7079 mddev->minor_version != 90) { 7077 mddev->minor_version != 90) {
7080 seq_printf(seq," super %d.%d", 7078 seq_printf(seq," super %d.%d",
7081 mddev->major_version, 7079 mddev->major_version,
7082 mddev->minor_version); 7080 mddev->minor_version);
7083 } 7081 }
7084 } else if (mddev->external) 7082 } else if (mddev->external)
7085 seq_printf(seq, " super external:%s", 7083 seq_printf(seq, " super external:%s",
7086 mddev->metadata_type); 7084 mddev->metadata_type);
7087 else 7085 else
7088 seq_printf(seq, " super non-persistent"); 7086 seq_printf(seq, " super non-persistent");
7089 7087
7090 if (mddev->pers) { 7088 if (mddev->pers) {
7091 mddev->pers->status(seq, mddev); 7089 mddev->pers->status(seq, mddev);
7092 seq_printf(seq, "\n "); 7090 seq_printf(seq, "\n ");
7093 if (mddev->pers->sync_request) { 7091 if (mddev->pers->sync_request) {
7094 if (mddev->curr_resync > 2) { 7092 if (mddev->curr_resync > 2) {
7095 status_resync(seq, mddev); 7093 status_resync(seq, mddev);
7096 seq_printf(seq, "\n "); 7094 seq_printf(seq, "\n ");
7097 } else if (mddev->curr_resync >= 1) 7095 } else if (mddev->curr_resync >= 1)
7098 seq_printf(seq, "\tresync=DELAYED\n "); 7096 seq_printf(seq, "\tresync=DELAYED\n ");
7099 else if (mddev->recovery_cp < MaxSector) 7097 else if (mddev->recovery_cp < MaxSector)
7100 seq_printf(seq, "\tresync=PENDING\n "); 7098 seq_printf(seq, "\tresync=PENDING\n ");
7101 } 7099 }
7102 } else 7100 } else
7103 seq_printf(seq, "\n "); 7101 seq_printf(seq, "\n ");
7104 7102
7105 bitmap_status(seq, mddev->bitmap); 7103 bitmap_status(seq, mddev->bitmap);
7106 7104
7107 seq_printf(seq, "\n"); 7105 seq_printf(seq, "\n");
7108 } 7106 }
7109 mddev_unlock(mddev); 7107 mddev_unlock(mddev);
7110 7108
7111 return 0; 7109 return 0;
7112 } 7110 }
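
Put together, the show callback above (header line, one section per array, then status_unused()) renders /proc/mdstat output along these lines; the sample is illustrative only, for a two-disk RAID1 mid-resync:

    Personalities : [raid1] [raid6] [raid5] [raid4]
    md0 : active raid1 sdb1[1] sda1[0]
          976630336 blocks super 1.2 [2/2] [UU]
          [=========>...........]  resync = 45.3% (442628224/976630336) finish=81.1min speed=109696K/sec

    unused devices: <none>
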
7113 7111
7114 static const struct seq_operations md_seq_ops = { 7112 static const struct seq_operations md_seq_ops = {
7115 .start = md_seq_start, 7113 .start = md_seq_start,
7116 .next = md_seq_next, 7114 .next = md_seq_next,
7117 .stop = md_seq_stop, 7115 .stop = md_seq_stop,
7118 .show = md_seq_show, 7116 .show = md_seq_show,
7119 }; 7117 };
7120 7118
7121 static int md_seq_open(struct inode *inode, struct file *file) 7119 static int md_seq_open(struct inode *inode, struct file *file)
7122 { 7120 {
7123 struct seq_file *seq; 7121 struct seq_file *seq;
7124 int error; 7122 int error;
7125 7123
7126 error = seq_open(file, &md_seq_ops); 7124 error = seq_open(file, &md_seq_ops);
7127 if (error) 7125 if (error)
7128 return error; 7126 return error;
7129 7127
7130 seq = file->private_data; 7128 seq = file->private_data;
7131 seq->poll_event = atomic_read(&md_event_count); 7129 seq->poll_event = atomic_read(&md_event_count);
7132 return error; 7130 return error;
7133 } 7131 }
7134 7132
7135 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7133 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7136 { 7134 {
7137 struct seq_file *seq = filp->private_data; 7135 struct seq_file *seq = filp->private_data;
7138 int mask; 7136 int mask;
7139 7137
7140 poll_wait(filp, &md_event_waiters, wait); 7138 poll_wait(filp, &md_event_waiters, wait);
7141 7139
7142 /* always allow read */ 7140 /* always allow read */
7143 mask = POLLIN | POLLRDNORM; 7141 mask = POLLIN | POLLRDNORM;
7144 7142
7145 if (seq->poll_event != atomic_read(&md_event_count)) 7143 if (seq->poll_event != atomic_read(&md_event_count))
7146 mask |= POLLERR | POLLPRI; 7144 mask |= POLLERR | POLLPRI;
7147 return mask; 7145 return mask;
7148 } 7146 }
7149 7147
7150 static const struct file_operations md_seq_fops = { 7148 static const struct file_operations md_seq_fops = {
7151 .owner = THIS_MODULE, 7149 .owner = THIS_MODULE,
7152 .open = md_seq_open, 7150 .open = md_seq_open,
7153 .read = seq_read, 7151 .read = seq_read,
7154 .llseek = seq_lseek, 7152 .llseek = seq_lseek,
7155 .release = seq_release_private, 7153 .release = seq_release_private,
7156 .poll = mdstat_poll, 7154 .poll = mdstat_poll,
7157 }; 7155 };
7158 7156
7159 int register_md_personality(struct md_personality *p) 7157 int register_md_personality(struct md_personality *p)
7160 { 7158 {
7161 spin_lock(&pers_lock); 7159 spin_lock(&pers_lock);
7162 list_add_tail(&p->list, &pers_list); 7160 list_add_tail(&p->list, &pers_list);
7163 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 7161 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
7164 spin_unlock(&pers_lock); 7162 spin_unlock(&pers_lock);
7165 return 0; 7163 return 0;
7166 } 7164 }
7167 7165
7168 int unregister_md_personality(struct md_personality *p) 7166 int unregister_md_personality(struct md_personality *p)
7169 { 7167 {
7170 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 7168 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7171 spin_lock(&pers_lock); 7169 spin_lock(&pers_lock);
7172 list_del_init(&p->list); 7170 list_del_init(&p->list);
7173 spin_unlock(&pers_lock); 7171 spin_unlock(&pers_lock);
7174 return 0; 7172 return 0;
7175 } 7173 }
7176 7174
7177 static int is_mddev_idle(struct mddev *mddev, int init) 7175 static int is_mddev_idle(struct mddev *mddev, int init)
7178 { 7176 {
7179 struct md_rdev * rdev; 7177 struct md_rdev * rdev;
7180 int idle; 7178 int idle;
7181 int curr_events; 7179 int curr_events;
7182 7180
7183 idle = 1; 7181 idle = 1;
7184 rcu_read_lock(); 7182 rcu_read_lock();
7185 rdev_for_each_rcu(rdev, mddev) { 7183 rdev_for_each_rcu(rdev, mddev) {
7186 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7184 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7187 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7185 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7188 (int)part_stat_read(&disk->part0, sectors[1]) - 7186 (int)part_stat_read(&disk->part0, sectors[1]) -
7189 atomic_read(&disk->sync_io); 7187 atomic_read(&disk->sync_io);
7190 /* sync IO will cause sync_io to increase before the disk_stats 7188 /* sync IO will cause sync_io to increase before the disk_stats
7191 * as sync_io is counted when a request starts, and 7189 * as sync_io is counted when a request starts, and
7192 * disk_stats is counted when it completes. 7190 * disk_stats is counted when it completes.
7193 * So resync activity will cause curr_events to be smaller than 7191 * So resync activity will cause curr_events to be smaller than
7194 * when there was no such activity. 7192 * when there was no such activity.
7195 * non-sync IO will cause disk_stat to increase without 7193 * non-sync IO will cause disk_stat to increase without
7196 * increasing sync_io so curr_events will (eventually) 7194 * increasing sync_io so curr_events will (eventually)
7197 * be larger than it was before. Once it becomes 7195 * be larger than it was before. Once it becomes
7198 * substantially larger, the test below will cause 7196 * substantially larger, the test below will cause
7199 * the array to appear non-idle, and resync will slow 7197 * the array to appear non-idle, and resync will slow
7200 * down. 7198 * down.
7201 * If there is a lot of outstanding resync activity when 7199 * If there is a lot of outstanding resync activity when
7202 * we set last_event to curr_events, then all that activity 7200 * we set last_event to curr_events, then all that activity
7203 * completing might cause the array to appear non-idle 7201 * completing might cause the array to appear non-idle
7204 * and resync will be slowed down even though there might 7202 * and resync will be slowed down even though there might
7205 * not have been non-resync activity. This will only 7203 * not have been non-resync activity. This will only
7206 * happen once though. 'last_events' will soon reflect 7204 * happen once though. 'last_events' will soon reflect
7207 * the state where there is little or no outstanding 7205 * the state where there is little or no outstanding
7208 * resync requests, and further resync activity will 7206 * resync requests, and further resync activity will
7209 * always make curr_events less than last_events. 7207 * always make curr_events less than last_events.
7210 * 7208 *
7211 */ 7209 */
7212 if (init || curr_events - rdev->last_events > 64) { 7210 if (init || curr_events - rdev->last_events > 64) {
7213 rdev->last_events = curr_events; 7211 rdev->last_events = curr_events;
7214 idle = 0; 7212 idle = 0;
7215 } 7213 }
7216 } 7214 }
7217 rcu_read_unlock(); 7215 rcu_read_unlock();
7218 return idle; 7216 return idle;
7219 } 7217 }
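
The sync_io counter subtracted above is fed from the submission side: personalities account their resync/recovery I/O through a small helper in md.h. Reproduced here from memory as a reference, so treat the exact body as an approximation:

    static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
    {
            atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
    }
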
7220 7218
7221 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7219 void md_done_sync(struct mddev *mddev, int blocks, int ok)
7222 { 7220 {
7223 /* another "blocks" (512byte) blocks have been synced */ 7221 /* another "blocks" (512byte) blocks have been synced */
7224 atomic_sub(blocks, &mddev->recovery_active); 7222 atomic_sub(blocks, &mddev->recovery_active);
7225 wake_up(&mddev->recovery_wait); 7223 wake_up(&mddev->recovery_wait);
7226 if (!ok) { 7224 if (!ok) {
7227 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7225 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7228 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7226 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7229 md_wakeup_thread(mddev->thread); 7227 md_wakeup_thread(mddev->thread);
7230 // stop recovery, signal do_sync .... 7228 // stop recovery, signal do_sync ....
7231 } 7229 }
7232 } 7230 }
7233 7231
7234 7232
7235 /* md_write_start(mddev, bi) 7233 /* md_write_start(mddev, bi)
7236 * If we need to update some array metadata (e.g. 'active' flag 7234 * If we need to update some array metadata (e.g. 'active' flag
7237 * in superblock) before writing, schedule a superblock update 7235 * in superblock) before writing, schedule a superblock update
7238 * and wait for it to complete. 7236 * and wait for it to complete.
7239 */ 7237 */
7240 void md_write_start(struct mddev *mddev, struct bio *bi) 7238 void md_write_start(struct mddev *mddev, struct bio *bi)
7241 { 7239 {
7242 int did_change = 0; 7240 int did_change = 0;
7243 if (bio_data_dir(bi) != WRITE) 7241 if (bio_data_dir(bi) != WRITE)
7244 return; 7242 return;
7245 7243
7246 BUG_ON(mddev->ro == 1); 7244 BUG_ON(mddev->ro == 1);
7247 if (mddev->ro == 2) { 7245 if (mddev->ro == 2) {
7248 /* need to switch to read/write */ 7246 /* need to switch to read/write */
7249 mddev->ro = 0; 7247 mddev->ro = 0;
7250 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7248 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7251 md_wakeup_thread(mddev->thread); 7249 md_wakeup_thread(mddev->thread);
7252 md_wakeup_thread(mddev->sync_thread); 7250 md_wakeup_thread(mddev->sync_thread);
7253 did_change = 1; 7251 did_change = 1;
7254 } 7252 }
7255 atomic_inc(&mddev->writes_pending); 7253 atomic_inc(&mddev->writes_pending);
7256 if (mddev->safemode == 1) 7254 if (mddev->safemode == 1)
7257 mddev->safemode = 0; 7255 mddev->safemode = 0;
7258 if (mddev->in_sync) { 7256 if (mddev->in_sync) {
7259 spin_lock_irq(&mddev->write_lock); 7257 spin_lock_irq(&mddev->write_lock);
7260 if (mddev->in_sync) { 7258 if (mddev->in_sync) {
7261 mddev->in_sync = 0; 7259 mddev->in_sync = 0;
7262 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7260 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7263 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7261 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7264 md_wakeup_thread(mddev->thread); 7262 md_wakeup_thread(mddev->thread);
7265 did_change = 1; 7263 did_change = 1;
7266 } 7264 }
7267 spin_unlock_irq(&mddev->write_lock); 7265 spin_unlock_irq(&mddev->write_lock);
7268 } 7266 }
7269 if (did_change) 7267 if (did_change)
7270 sysfs_notify_dirent_safe(mddev->sysfs_state); 7268 sysfs_notify_dirent_safe(mddev->sysfs_state);
7271 wait_event(mddev->sb_wait, 7269 wait_event(mddev->sb_wait,
7272 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7270 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7273 } 7271 }
7274 7272
7275 void md_write_end(struct mddev *mddev) 7273 void md_write_end(struct mddev *mddev)
7276 { 7274 {
7277 if (atomic_dec_and_test(&mddev->writes_pending)) { 7275 if (atomic_dec_and_test(&mddev->writes_pending)) {
7278 if (mddev->safemode == 2) 7276 if (mddev->safemode == 2)
7279 md_wakeup_thread(mddev->thread); 7277 md_wakeup_thread(mddev->thread);
7280 else if (mddev->safemode_delay) 7278 else if (mddev->safemode_delay)
7281 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7279 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7282 } 7280 }
7283 } 7281 }
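
A sketch of the intended pairing (hypothetical function names; the real callers are the raid personalities): md_write_start() runs in the request path before any data is issued, md_write_end() in the completion path once the write finishes.

    static void example_make_request(struct mddev *mddev, struct bio *bio)
    {
            md_write_start(mddev, bio);     /* marks the array 'active'; may block on a sb update */
            /* ... map the bio and submit it to the member devices ... */
    }

    static void example_write_done(struct mddev *mddev)
    {
            md_write_end(mddev);            /* the last writer arms the safemode timer */
    }
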
7284 7282
7285 /* md_allow_write(mddev) 7283 /* md_allow_write(mddev)
7286 * Calling this ensures that the array is marked 'active' so that writes 7284 * Calling this ensures that the array is marked 'active' so that writes
7287 * may proceed without blocking. It is important to call this before 7285 * may proceed without blocking. It is important to call this before
7288 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7286 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7289 * Must be called with mddev_lock held. 7287 * Must be called with mddev_lock held.
7290 * 7288 *
7291 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 7289 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7292 * is dropped, so return -EAGAIN after notifying userspace. 7290 * is dropped, so return -EAGAIN after notifying userspace.
7293 */ 7291 */
7294 int md_allow_write(struct mddev *mddev) 7292 int md_allow_write(struct mddev *mddev)
7295 { 7293 {
7296 if (!mddev->pers) 7294 if (!mddev->pers)
7297 return 0; 7295 return 0;
7298 if (mddev->ro) 7296 if (mddev->ro)
7299 return 0; 7297 return 0;
7300 if (!mddev->pers->sync_request) 7298 if (!mddev->pers->sync_request)
7301 return 0; 7299 return 0;
7302 7300
7303 spin_lock_irq(&mddev->write_lock); 7301 spin_lock_irq(&mddev->write_lock);
7304 if (mddev->in_sync) { 7302 if (mddev->in_sync) {
7305 mddev->in_sync = 0; 7303 mddev->in_sync = 0;
7306 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7304 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7307 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7305 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7308 if (mddev->safemode_delay && 7306 if (mddev->safemode_delay &&
7309 mddev->safemode == 0) 7307 mddev->safemode == 0)
7310 mddev->safemode = 1; 7308 mddev->safemode = 1;
7311 spin_unlock_irq(&mddev->write_lock); 7309 spin_unlock_irq(&mddev->write_lock);
7312 md_update_sb(mddev, 0); 7310 md_update_sb(mddev, 0);
7313 sysfs_notify_dirent_safe(mddev->sysfs_state); 7311 sysfs_notify_dirent_safe(mddev->sysfs_state);
7314 } else 7312 } else
7315 spin_unlock_irq(&mddev->write_lock); 7313 spin_unlock_irq(&mddev->write_lock);
7316 7314
7317 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7315 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7318 return -EAGAIN; 7316 return -EAGAIN;
7319 else 7317 else
7320 return 0; 7318 return 0;
7321 } 7319 }
7322 EXPORT_SYMBOL_GPL(md_allow_write); 7320 EXPORT_SYMBOL_GPL(md_allow_write);
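
A minimal caller sketch (hypothetical), following the rule in the comment above: call md_allow_write() under mddev_lock before a GFP_KERNEL allocation, and pass -EAGAIN back up so the lock is dropped and the operation retried:

    static int example_resize(struct mddev *mddev, size_t newsize)
    {
            void *buf;
            int err = md_allow_write(mddev);

            if (err)                        /* -EAGAIN in the ->external case */
                    return err;

            buf = kmalloc(newsize, GFP_KERNEL);     /* now safe to block */
            if (!buf)
                    return -ENOMEM;
            /* ... use the new buffer ... */
            kfree(buf);
            return 0;
    }
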
7323 7321
7324 #define SYNC_MARKS 10 7322 #define SYNC_MARKS 10
7325 #define SYNC_MARK_STEP (3*HZ) 7323 #define SYNC_MARK_STEP (3*HZ)
7326 #define UPDATE_FREQUENCY (5*60*HZ) 7324 #define UPDATE_FREQUENCY (5*60*HZ)
7327 void md_do_sync(struct md_thread *thread) 7325 void md_do_sync(struct md_thread *thread)
7328 { 7326 {
7329 struct mddev *mddev = thread->mddev; 7327 struct mddev *mddev = thread->mddev;
7330 struct mddev *mddev2; 7328 struct mddev *mddev2;
7331 unsigned int currspeed = 0, 7329 unsigned int currspeed = 0,
7332 window; 7330 window;
7333 sector_t max_sectors,j, io_sectors; 7331 sector_t max_sectors,j, io_sectors;
7334 unsigned long mark[SYNC_MARKS]; 7332 unsigned long mark[SYNC_MARKS];
7335 unsigned long update_time; 7333 unsigned long update_time;
7336 sector_t mark_cnt[SYNC_MARKS]; 7334 sector_t mark_cnt[SYNC_MARKS];
7337 int last_mark,m; 7335 int last_mark,m;
7338 struct list_head *tmp; 7336 struct list_head *tmp;
7339 sector_t last_check; 7337 sector_t last_check;
7340 int skipped = 0; 7338 int skipped = 0;
7341 struct md_rdev *rdev; 7339 struct md_rdev *rdev;
7342 char *desc; 7340 char *desc;
7343 struct blk_plug plug; 7341 struct blk_plug plug;
7344 7342
7345 /* just in case the thread restarts... */ 7343 /* just in case the thread restarts... */
7346 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7344 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7347 return; 7345 return;
7348 if (mddev->ro) /* never try to sync a read-only array */ 7346 if (mddev->ro) /* never try to sync a read-only array */
7349 return; 7347 return;
7350 7348
7351 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7349 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7352 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 7350 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
7353 desc = "data-check"; 7351 desc = "data-check";
7354 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7352 else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7355 desc = "requested-resync"; 7353 desc = "requested-resync";
7356 else 7354 else
7357 desc = "resync"; 7355 desc = "resync";
7358 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7356 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7359 desc = "reshape"; 7357 desc = "reshape";
7360 else 7358 else
7361 desc = "recovery"; 7359 desc = "recovery";
7362 7360
7363 /* we overload curr_resync somewhat here. 7361 /* we overload curr_resync somewhat here.
7364 * 0 == not engaged in resync at all 7362 * 0 == not engaged in resync at all
7365 * 2 == checking that there is no conflict with another sync 7363 * 2 == checking that there is no conflict with another sync
7366 * 1 == like 2, but have yielded to allow conflicting resync to 7364 * 1 == like 2, but have yielded to allow conflicting resync to
7367 * commence 7365 * commence
7368 * other == active in resync - this many blocks 7366 * other == active in resync - this many blocks
7369 * 7367 *
7370 * Before starting a resync we must have set curr_resync to 7368 * Before starting a resync we must have set curr_resync to
7371 * 2, and then checked that every "conflicting" array has curr_resync 7369 * 2, and then checked that every "conflicting" array has curr_resync
7372 * less than ours. When we find one that is the same or higher 7370 * less than ours. When we find one that is the same or higher
7373 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 7371 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7374 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 7372 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7375 * This will mean we have to start checking from the beginning again. 7373 * This will mean we have to start checking from the beginning again.
7376 * 7374 *
7377 */ 7375 */
7378 7376
7379 do { 7377 do {
7380 mddev->curr_resync = 2; 7378 mddev->curr_resync = 2;
7381 7379
7382 try_again: 7380 try_again:
7383 if (kthread_should_stop()) 7381 if (kthread_should_stop())
7384 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7382 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7385 7383
7386 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7384 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7387 goto skip; 7385 goto skip;
7388 for_each_mddev(mddev2, tmp) { 7386 for_each_mddev(mddev2, tmp) {
7389 if (mddev2 == mddev) 7387 if (mddev2 == mddev)
7390 continue; 7388 continue;
7391 if (!mddev->parallel_resync 7389 if (!mddev->parallel_resync
7392 && mddev2->curr_resync 7390 && mddev2->curr_resync
7393 && match_mddev_units(mddev, mddev2)) { 7391 && match_mddev_units(mddev, mddev2)) {
7394 DEFINE_WAIT(wq); 7392 DEFINE_WAIT(wq);
7395 if (mddev < mddev2 && mddev->curr_resync == 2) { 7393 if (mddev < mddev2 && mddev->curr_resync == 2) {
7396 /* arbitrarily yield */ 7394 /* arbitrarily yield */
7397 mddev->curr_resync = 1; 7395 mddev->curr_resync = 1;
7398 wake_up(&resync_wait); 7396 wake_up(&resync_wait);
7399 } 7397 }
7400 if (mddev > mddev2 && mddev->curr_resync == 1) 7398 if (mddev > mddev2 && mddev->curr_resync == 1)
7401 /* no need to wait here, we can wait the next 7399 /* no need to wait here, we can wait the next
7402 * time 'round when curr_resync == 2 7400 * time 'round when curr_resync == 2
7403 */ 7401 */
7404 continue; 7402 continue;
7405 /* We need to wait 'interruptible' so as not to 7403 /* We need to wait 'interruptible' so as not to
7406 * contribute to the load average, and not to 7404 * contribute to the load average, and not to
7407 * be caught by 'softlockup' 7405 * be caught by 'softlockup'
7408 */ 7406 */
7409 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7407 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7410 if (!kthread_should_stop() && 7408 if (!kthread_should_stop() &&
7411 mddev2->curr_resync >= mddev->curr_resync) { 7409 mddev2->curr_resync >= mddev->curr_resync) {
7412 printk(KERN_INFO "md: delaying %s of %s" 7410 printk(KERN_INFO "md: delaying %s of %s"
7413 " until %s has finished (they" 7411 " until %s has finished (they"
7414 " share one or more physical units)\n", 7412 " share one or more physical units)\n",
7415 desc, mdname(mddev), mdname(mddev2)); 7413 desc, mdname(mddev), mdname(mddev2));
7416 mddev_put(mddev2); 7414 mddev_put(mddev2);
7417 if (signal_pending(current)) 7415 if (signal_pending(current))
7418 flush_signals(current); 7416 flush_signals(current);
7419 schedule(); 7417 schedule();
7420 finish_wait(&resync_wait, &wq); 7418 finish_wait(&resync_wait, &wq);
7421 goto try_again; 7419 goto try_again;
7422 } 7420 }
7423 finish_wait(&resync_wait, &wq); 7421 finish_wait(&resync_wait, &wq);
7424 } 7422 }
7425 } 7423 }
7426 } while (mddev->curr_resync < 2); 7424 } while (mddev->curr_resync < 2);
7427 7425
7428 j = 0; 7426 j = 0;
7429 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7427 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7430 /* resync follows the size requested by the personality, 7428 /* resync follows the size requested by the personality,
7431 * which defaults to physical size, but can be virtual size 7429 * which defaults to physical size, but can be virtual size
7432 */ 7430 */
7433 max_sectors = mddev->resync_max_sectors; 7431 max_sectors = mddev->resync_max_sectors;
7434 atomic64_set(&mddev->resync_mismatches, 0); 7432 atomic64_set(&mddev->resync_mismatches, 0);
7435 /* we don't use the checkpoint if there's a bitmap */ 7433 /* we don't use the checkpoint if there's a bitmap */
7436 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7434 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7437 j = mddev->resync_min; 7435 j = mddev->resync_min;
7438 else if (!mddev->bitmap) 7436 else if (!mddev->bitmap)
7439 j = mddev->recovery_cp; 7437 j = mddev->recovery_cp;
7440 7438
7441 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7439 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7442 max_sectors = mddev->resync_max_sectors; 7440 max_sectors = mddev->resync_max_sectors;
7443 else { 7441 else {
7444 /* recovery follows the physical size of devices */ 7442 /* recovery follows the physical size of devices */
7445 max_sectors = mddev->dev_sectors; 7443 max_sectors = mddev->dev_sectors;
7446 j = MaxSector; 7444 j = MaxSector;
7447 rcu_read_lock(); 7445 rcu_read_lock();
7448 rdev_for_each_rcu(rdev, mddev) 7446 rdev_for_each_rcu(rdev, mddev)
7449 if (rdev->raid_disk >= 0 && 7447 if (rdev->raid_disk >= 0 &&
7450 !test_bit(Faulty, &rdev->flags) && 7448 !test_bit(Faulty, &rdev->flags) &&
7451 !test_bit(In_sync, &rdev->flags) && 7449 !test_bit(In_sync, &rdev->flags) &&
7452 rdev->recovery_offset < j) 7450 rdev->recovery_offset < j)
7453 j = rdev->recovery_offset; 7451 j = rdev->recovery_offset;
7454 rcu_read_unlock(); 7452 rcu_read_unlock();
7455 } 7453 }
7456 7454
7457 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 7455 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7458 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 7456 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
7459 " %d KB/sec/disk.\n", speed_min(mddev)); 7457 " %d KB/sec/disk.\n", speed_min(mddev));
7460 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 7458 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7461 "(but not more than %d KB/sec) for %s.\n", 7459 "(but not more than %d KB/sec) for %s.\n",
7462 speed_max(mddev), desc); 7460 speed_max(mddev), desc);
7463 7461
7464 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 7462 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7465 7463
7466 io_sectors = 0; 7464 io_sectors = 0;
7467 for (m = 0; m < SYNC_MARKS; m++) { 7465 for (m = 0; m < SYNC_MARKS; m++) {
7468 mark[m] = jiffies; 7466 mark[m] = jiffies;
7469 mark_cnt[m] = io_sectors; 7467 mark_cnt[m] = io_sectors;
7470 } 7468 }
7471 last_mark = 0; 7469 last_mark = 0;
7472 mddev->resync_mark = mark[last_mark]; 7470 mddev->resync_mark = mark[last_mark];
7473 mddev->resync_mark_cnt = mark_cnt[last_mark]; 7471 mddev->resync_mark_cnt = mark_cnt[last_mark];
7474 7472
7475 /* 7473 /*
7476 * Tune reconstruction: 7474 * Tune reconstruction:
7477 */ 7475 */
7478 window = 32*(PAGE_SIZE/512); 7476 window = 32*(PAGE_SIZE/512);
7479 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 7477 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7480 window/2, (unsigned long long)max_sectors/2); 7478 window/2, (unsigned long long)max_sectors/2);
7481 7479
7482 atomic_set(&mddev->recovery_active, 0); 7480 atomic_set(&mddev->recovery_active, 0);
7483 last_check = 0; 7481 last_check = 0;
7484 7482
7485 if (j>2) { 7483 if (j>2) {
7486 printk(KERN_INFO 7484 printk(KERN_INFO
7487 "md: resuming %s of %s from checkpoint.\n", 7485 "md: resuming %s of %s from checkpoint.\n",
7488 desc, mdname(mddev)); 7486 desc, mdname(mddev));
7489 mddev->curr_resync = j; 7487 mddev->curr_resync = j;
7490 } else 7488 } else
7491 mddev->curr_resync = 3; /* no longer delayed */ 7489 mddev->curr_resync = 3; /* no longer delayed */
7492 mddev->curr_resync_completed = j; 7490 mddev->curr_resync_completed = j;
7493 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7491 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7494 md_new_event(mddev); 7492 md_new_event(mddev);
7495 update_time = jiffies; 7493 update_time = jiffies;
7496 7494
7497 blk_start_plug(&plug); 7495 blk_start_plug(&plug);
7498 while (j < max_sectors) { 7496 while (j < max_sectors) {
7499 sector_t sectors; 7497 sector_t sectors;
7500 7498
7501 skipped = 0; 7499 skipped = 0;
7502 7500
7503 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7501 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7504 ((mddev->curr_resync > mddev->curr_resync_completed && 7502 ((mddev->curr_resync > mddev->curr_resync_completed &&
7505 (mddev->curr_resync - mddev->curr_resync_completed) 7503 (mddev->curr_resync - mddev->curr_resync_completed)
7506 > (max_sectors >> 4)) || 7504 > (max_sectors >> 4)) ||
7507 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 7505 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7508 (j - mddev->curr_resync_completed)*2 7506 (j - mddev->curr_resync_completed)*2
7509 >= mddev->resync_max - mddev->curr_resync_completed 7507 >= mddev->resync_max - mddev->curr_resync_completed
7510 )) { 7508 )) {
7511 /* time to update curr_resync_completed */ 7509 /* time to update curr_resync_completed */
7512 wait_event(mddev->recovery_wait, 7510 wait_event(mddev->recovery_wait,
7513 atomic_read(&mddev->recovery_active) == 0); 7511 atomic_read(&mddev->recovery_active) == 0);
7514 mddev->curr_resync_completed = j; 7512 mddev->curr_resync_completed = j;
7515 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 7513 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7516 j > mddev->recovery_cp) 7514 j > mddev->recovery_cp)
7517 mddev->recovery_cp = j; 7515 mddev->recovery_cp = j;
7518 update_time = jiffies; 7516 update_time = jiffies;
7519 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7517 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7520 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7518 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7521 } 7519 }
7522 7520
7523 while (j >= mddev->resync_max && !kthread_should_stop()) { 7521 while (j >= mddev->resync_max && !kthread_should_stop()) {
7524 /* As this condition is controlled by user-space, 7522 /* As this condition is controlled by user-space,
7525 * we can block indefinitely, so use '_interruptible' 7523 * we can block indefinitely, so use '_interruptible'
7526 * to avoid triggering warnings. 7524 * to avoid triggering warnings.
7527 */ 7525 */
7528 flush_signals(current); /* just in case */ 7526 flush_signals(current); /* just in case */
7529 wait_event_interruptible(mddev->recovery_wait, 7527 wait_event_interruptible(mddev->recovery_wait,
7530 mddev->resync_max > j 7528 mddev->resync_max > j
7531 || kthread_should_stop()); 7529 || kthread_should_stop());
7532 } 7530 }
7533 7531
7534 if (kthread_should_stop()) 7532 if (kthread_should_stop())
7535 goto interrupted; 7533 goto interrupted;
7536 7534
7537 sectors = mddev->pers->sync_request(mddev, j, &skipped, 7535 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7538 currspeed < speed_min(mddev)); 7536 currspeed < speed_min(mddev));
7539 if (sectors == 0) { 7537 if (sectors == 0) {
7540 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7538 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7541 goto out; 7539 goto out;
7542 } 7540 }
7543 7541
7544 if (!skipped) { /* actual IO requested */ 7542 if (!skipped) { /* actual IO requested */
7545 io_sectors += sectors; 7543 io_sectors += sectors;
7546 atomic_add(sectors, &mddev->recovery_active); 7544 atomic_add(sectors, &mddev->recovery_active);
7547 } 7545 }
7548 7546
7549 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7547 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7550 break; 7548 break;
7551 7549
7552 j += sectors; 7550 j += sectors;
7553 if (j > 2) 7551 if (j > 2)
7554 mddev->curr_resync = j; 7552 mddev->curr_resync = j;
7555 mddev->curr_mark_cnt = io_sectors; 7553 mddev->curr_mark_cnt = io_sectors;
7556 if (last_check == 0) 7554 if (last_check == 0)
7557 /* this is the earliest that rebuild will be 7555 /* this is the earliest that rebuild will be
7558 * visible in /proc/mdstat 7556 * visible in /proc/mdstat
7559 */ 7557 */
7560 md_new_event(mddev); 7558 md_new_event(mddev);
7561 7559
7562 if (last_check + window > io_sectors || j == max_sectors) 7560 if (last_check + window > io_sectors || j == max_sectors)
7563 continue; 7561 continue;
7564 7562
7565 last_check = io_sectors; 7563 last_check = io_sectors;
7566 repeat: 7564 repeat:
7567 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7565 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7568 /* step marks */ 7566 /* step marks */
7569 int next = (last_mark+1) % SYNC_MARKS; 7567 int next = (last_mark+1) % SYNC_MARKS;
7570 7568
7571 mddev->resync_mark = mark[next]; 7569 mddev->resync_mark = mark[next];
7572 mddev->resync_mark_cnt = mark_cnt[next]; 7570 mddev->resync_mark_cnt = mark_cnt[next];
7573 mark[next] = jiffies; 7571 mark[next] = jiffies;
7574 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 7572 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7575 last_mark = next; 7573 last_mark = next;
7576 } 7574 }
7577 7575
7578 7576
7579 if (kthread_should_stop()) 7577 if (kthread_should_stop())
7580 goto interrupted; 7578 goto interrupted;
7581 7579
7582 7580
7583 /* 7581 /*
7584 * this loop exits only if we are either slower than 7582 * this loop exits only if we are either slower than
7585 * the 'hard' speed limit, or the system was IO-idle for 7583 * the 'hard' speed limit, or the system was IO-idle for
7586 * a jiffy. 7584 * a jiffy.
7587 * the system might be non-idle CPU-wise, but we only care 7585 * the system might be non-idle CPU-wise, but we only care
7588 * about not overloading the IO subsystem. (things like an 7586 * about not overloading the IO subsystem. (things like an
7589 * e2fsck being done on the RAID array should execute fast) 7587 * e2fsck being done on the RAID array should execute fast)
7590 */ 7588 */
7591 cond_resched(); 7589 cond_resched();
7592 7590
7593 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 7591 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
7594 /((jiffies-mddev->resync_mark)/HZ +1) +1; 7592 /((jiffies-mddev->resync_mark)/HZ +1) +1;
7595 7593
7596 if (currspeed > speed_min(mddev)) { 7594 if (currspeed > speed_min(mddev)) {
7597 if ((currspeed > speed_max(mddev)) || 7595 if ((currspeed > speed_max(mddev)) ||
7598 !is_mddev_idle(mddev, 0)) { 7596 !is_mddev_idle(mddev, 0)) {
7599 msleep(500); 7597 msleep(500);
7600 goto repeat; 7598 goto repeat;
7601 } 7599 }
7602 } 7600 }
7603 } 7601 }
7604 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc); 7602 printk(KERN_INFO "md: %s: %s done.\n",mdname(mddev), desc);
7605 /* 7603 /*
7606 * this also signals 'finished resyncing' to md_stop 7604 * this also signals 'finished resyncing' to md_stop
7607 */ 7605 */
7608 out: 7606 out:
7609 blk_finish_plug(&plug); 7607 blk_finish_plug(&plug);
7610 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7608 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7611 7609
7612 /* tell personality that we are finished */ 7610 /* tell personality that we are finished */
7613 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 7611 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7614 7612
7615 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 7613 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7616 mddev->curr_resync > 2) { 7614 mddev->curr_resync > 2) {
7617 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7615 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7618 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7616 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7619 if (mddev->curr_resync >= mddev->recovery_cp) { 7617 if (mddev->curr_resync >= mddev->recovery_cp) {
7620 printk(KERN_INFO 7618 printk(KERN_INFO
7621 "md: checkpointing %s of %s.\n", 7619 "md: checkpointing %s of %s.\n",
7622 desc, mdname(mddev)); 7620 desc, mdname(mddev));
7623 if (test_bit(MD_RECOVERY_ERROR, 7621 if (test_bit(MD_RECOVERY_ERROR,
7624 &mddev->recovery)) 7622 &mddev->recovery))
7625 mddev->recovery_cp = 7623 mddev->recovery_cp =
7626 mddev->curr_resync_completed; 7624 mddev->curr_resync_completed;
7627 else 7625 else
7628 mddev->recovery_cp = 7626 mddev->recovery_cp =
7629 mddev->curr_resync; 7627 mddev->curr_resync;
7630 } 7628 }
7631 } else 7629 } else
7632 mddev->recovery_cp = MaxSector; 7630 mddev->recovery_cp = MaxSector;
7633 } else { 7631 } else {
7634 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7632 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7635 mddev->curr_resync = MaxSector; 7633 mddev->curr_resync = MaxSector;
7636 rcu_read_lock(); 7634 rcu_read_lock();
7637 rdev_for_each_rcu(rdev, mddev) 7635 rdev_for_each_rcu(rdev, mddev)
7638 if (rdev->raid_disk >= 0 && 7636 if (rdev->raid_disk >= 0 &&
7639 mddev->delta_disks >= 0 && 7637 mddev->delta_disks >= 0 &&
7640 !test_bit(Faulty, &rdev->flags) && 7638 !test_bit(Faulty, &rdev->flags) &&
7641 !test_bit(In_sync, &rdev->flags) && 7639 !test_bit(In_sync, &rdev->flags) &&
7642 rdev->recovery_offset < mddev->curr_resync) 7640 rdev->recovery_offset < mddev->curr_resync)
7643 rdev->recovery_offset = mddev->curr_resync; 7641 rdev->recovery_offset = mddev->curr_resync;
7644 rcu_read_unlock(); 7642 rcu_read_unlock();
7645 } 7643 }
7646 } 7644 }
7647 skip: 7645 skip:
7648 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7646 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7649 7647
7650 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7648 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7651 /* We completed so min/max setting can be forgotten if used. */ 7649 /* We completed so min/max setting can be forgotten if used. */
7652 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7650 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7653 mddev->resync_min = 0; 7651 mddev->resync_min = 0;
7654 mddev->resync_max = MaxSector; 7652 mddev->resync_max = MaxSector;
7655 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7653 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7656 mddev->resync_min = mddev->curr_resync_completed; 7654 mddev->resync_min = mddev->curr_resync_completed;
7657 mddev->curr_resync = 0; 7655 mddev->curr_resync = 0;
7658 wake_up(&resync_wait); 7656 wake_up(&resync_wait);
7659 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7657 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7660 md_wakeup_thread(mddev->thread); 7658 md_wakeup_thread(mddev->thread);
7661 return; 7659 return;
7662 7660
7663 interrupted: 7661 interrupted:
7664 /* 7662 /*
7665 * got a signal, exit. 7663 * got a signal, exit.
7666 */ 7664 */
7667 printk(KERN_INFO 7665 printk(KERN_INFO
7668 "md: md_do_sync() got signal ... exiting\n"); 7666 "md: md_do_sync() got signal ... exiting\n");
7669 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7667 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7670 goto out; 7668 goto out;
7671 7669
7672 } 7670 }
7673 EXPORT_SYMBOL_GPL(md_do_sync); 7671 EXPORT_SYMBOL_GPL(md_do_sync);
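
For readability, the magic values md_do_sync() overloads onto mddev->curr_resync (documented in the comment near the top of the function) could be named as below; this enum is illustrative only and does not exist in the source, which uses the bare numbers:

    enum {
            CURR_RESYNC_NONE    = 0,        /* not engaged in resync at all */
            CURR_RESYNC_YIELDED = 1,        /* yielded to a conflicting resync */
            CURR_RESYNC_PROBING = 2,        /* checking for conflicts with other arrays */
            CURR_RESYNC_ACTIVE  = 3,        /* started, no longer delayed */
            /* larger values: sectors completed so far */
    };
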
7674 7672
7675 static int remove_and_add_spares(struct mddev *mddev, 7673 static int remove_and_add_spares(struct mddev *mddev,
7676 struct md_rdev *this) 7674 struct md_rdev *this)
7677 { 7675 {
7678 struct md_rdev *rdev; 7676 struct md_rdev *rdev;
7679 int spares = 0; 7677 int spares = 0;
7680 int removed = 0; 7678 int removed = 0;
7681 7679
7682 rdev_for_each(rdev, mddev) 7680 rdev_for_each(rdev, mddev)
7683 if ((this == NULL || rdev == this) && 7681 if ((this == NULL || rdev == this) &&
7684 rdev->raid_disk >= 0 && 7682 rdev->raid_disk >= 0 &&
7685 !test_bit(Blocked, &rdev->flags) && 7683 !test_bit(Blocked, &rdev->flags) &&
7686 (test_bit(Faulty, &rdev->flags) || 7684 (test_bit(Faulty, &rdev->flags) ||
7687 ! test_bit(In_sync, &rdev->flags)) && 7685 ! test_bit(In_sync, &rdev->flags)) &&
7688 atomic_read(&rdev->nr_pending)==0) { 7686 atomic_read(&rdev->nr_pending)==0) {
7689 if (mddev->pers->hot_remove_disk( 7687 if (mddev->pers->hot_remove_disk(
7690 mddev, rdev) == 0) { 7688 mddev, rdev) == 0) {
7691 sysfs_unlink_rdev(mddev, rdev); 7689 sysfs_unlink_rdev(mddev, rdev);
7692 rdev->raid_disk = -1; 7690 rdev->raid_disk = -1;
7693 removed++; 7691 removed++;
7694 } 7692 }
7695 } 7693 }
7696 if (removed && mddev->kobj.sd) 7694 if (removed && mddev->kobj.sd)
7697 sysfs_notify(&mddev->kobj, NULL, "degraded"); 7695 sysfs_notify(&mddev->kobj, NULL, "degraded");
7698 7696
7699 if (this) 7697 if (this)
7700 goto no_add; 7698 goto no_add;
7701 7699
7702 rdev_for_each(rdev, mddev) { 7700 rdev_for_each(rdev, mddev) {
7703 if (rdev->raid_disk >= 0 && 7701 if (rdev->raid_disk >= 0 &&
7704 !test_bit(In_sync, &rdev->flags) && 7702 !test_bit(In_sync, &rdev->flags) &&
7705 !test_bit(Faulty, &rdev->flags)) 7703 !test_bit(Faulty, &rdev->flags))
7706 spares++; 7704 spares++;
7707 if (rdev->raid_disk >= 0) 7705 if (rdev->raid_disk >= 0)
7708 continue; 7706 continue;
7709 if (test_bit(Faulty, &rdev->flags)) 7707 if (test_bit(Faulty, &rdev->flags))
7710 continue; 7708 continue;
7711 if (mddev->ro && 7709 if (mddev->ro &&
7712 rdev->saved_raid_disk < 0) 7710 rdev->saved_raid_disk < 0)
7713 continue; 7711 continue;
7714 7712
7715 rdev->recovery_offset = 0; 7713 rdev->recovery_offset = 0;
7716 if (rdev->saved_raid_disk >= 0 && mddev->in_sync) { 7714 if (rdev->saved_raid_disk >= 0 && mddev->in_sync) {
7717 spin_lock_irq(&mddev->write_lock); 7715 spin_lock_irq(&mddev->write_lock);
7718 if (mddev->in_sync) 7716 if (mddev->in_sync)
7719 /* OK, this device, which is in_sync, 7717 /* OK, this device, which is in_sync,
7720 * will definitely be noticed before 7718 * will definitely be noticed before
7721 * the next write, so recovery isn't 7719 * the next write, so recovery isn't
7722 * needed. 7720 * needed.
7723 */ 7721 */
7724 rdev->recovery_offset = mddev->recovery_cp; 7722 rdev->recovery_offset = mddev->recovery_cp;
7725 spin_unlock_irq(&mddev->write_lock); 7723 spin_unlock_irq(&mddev->write_lock);
7726 } 7724 }
7727 if (mddev->ro && rdev->recovery_offset != MaxSector) 7725 if (mddev->ro && rdev->recovery_offset != MaxSector)
7728 /* not safe to add this disk now */ 7726 /* not safe to add this disk now */
7729 continue; 7727 continue;
7730 if (mddev->pers-> 7728 if (mddev->pers->
7731 hot_add_disk(mddev, rdev) == 0) { 7729 hot_add_disk(mddev, rdev) == 0) {
7732 if (sysfs_link_rdev(mddev, rdev)) 7730 if (sysfs_link_rdev(mddev, rdev))
7733 /* failure here is OK */; 7731 /* failure here is OK */;
7734 spares++; 7732 spares++;
7735 md_new_event(mddev); 7733 md_new_event(mddev);
7736 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7734 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7737 } 7735 }
7738 } 7736 }
7739 no_add: 7737 no_add:
7740 if (removed) 7738 if (removed)
7741 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7739 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7742 return spares; 7740 return spares;
7743 } 7741 }
7744 7742
7745 static void reap_sync_thread(struct mddev *mddev)
7746 {
7747 struct md_rdev *rdev;
7748
7749 /* resync has finished, collect result */
7750 md_unregister_thread(&mddev->sync_thread);
7751 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7752 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7753 /* success...*/
7754 /* activate any spares */
7755 if (mddev->pers->spare_active(mddev)) {
7756 sysfs_notify(&mddev->kobj, NULL,
7757 "degraded");
7758 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7759 }
7760 }
7761 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7762 mddev->pers->finish_reshape)
7763 mddev->pers->finish_reshape(mddev);
7764
7765 /* If array is no longer degraded, then any saved_raid_disk
7766 * information must be scrapped. Also if any device is now
7767 * In_sync we must scrap the saved_raid_disk for that device
7768 * so that the superblock for an incrementally recovered device
7769 * is written out.
7770 */
7771 rdev_for_each(rdev, mddev)
7772 if (!mddev->degraded ||
7773 test_bit(In_sync, &rdev->flags))
7774 rdev->saved_raid_disk = -1;
7775
7776 md_update_sb(mddev, 1);
7777 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7778 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7779 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7780 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7781 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7782 /* flag recovery needed just to double check */
7783 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7784 sysfs_notify_dirent_safe(mddev->sysfs_action);
7785 md_new_event(mddev);
7786 if (mddev->event_work.func)
7787 queue_work(md_misc_wq, &mddev->event_work);
7788 }
7789
7790 /* 7743 /*
7791 * This routine is regularly called by all per-raid-array threads to 7744 * This routine is regularly called by all per-raid-array threads to
7792 * deal with generic issues like resync and super-block update. 7745 * deal with generic issues like resync and super-block update.
7793 * Raid personalities that don't have a thread (linear/raid0) do not 7746 * Raid personalities that don't have a thread (linear/raid0) do not
7794 * need this as they never do any recovery or update the superblock. 7747 * need this as they never do any recovery or update the superblock.
7795 * 7748 *
7796 * It does not do any resync itself, but rather "forks" off other threads 7749 * It does not do any resync itself, but rather "forks" off other threads
7797 * to do that as needed. 7750 * to do that as needed.
7798 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 7751 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7799 * "->recovery" and create a thread at ->sync_thread. 7752 * "->recovery" and create a thread at ->sync_thread.
7800 * When the thread finishes it sets MD_RECOVERY_DONE 7753 * When the thread finishes it sets MD_RECOVERY_DONE
7801 * and wakes up this thread which will reap the thread and finish up. 7754 * and wakes up this thread which will reap the thread and finish up.
7802 * This thread also removes any faulty devices (with nr_pending == 0). 7755 * This thread also removes any faulty devices (with nr_pending == 0).
7803 * 7756 *
7804 * The overall approach is: 7757 * The overall approach is:
7805 * 1/ if the superblock needs updating, update it. 7758 * 1/ if the superblock needs updating, update it.
7806 * 2/ If a recovery thread is running, don't do anything else. 7759 * 2/ If a recovery thread is running, don't do anything else.
7807 * 3/ If recovery has finished, clean up, possibly marking spares active. 7760 * 3/ If recovery has finished, clean up, possibly marking spares active.
7808 * 4/ If there are any faulty devices, remove them. 7761 * 4/ If there are any faulty devices, remove them.
7809 * 5/ If array is degraded, try to add spare devices 7762 * 5/ If array is degraded, try to add spare devices
7810 * 6/ If array has spares or is not in-sync, start a resync thread. 7763 * 6/ If array has spares or is not in-sync, start a resync thread.
7811 */ 7764 */
7812 void md_check_recovery(struct mddev *mddev) 7765 void md_check_recovery(struct mddev *mddev)
7813 { 7766 {
7814 if (mddev->suspended) 7767 if (mddev->suspended)
7815 return; 7768 return;
7816 7769
7817 if (mddev->bitmap) 7770 if (mddev->bitmap)
7818 bitmap_daemon_work(mddev); 7771 bitmap_daemon_work(mddev);
7819 7772
7820 if (signal_pending(current)) { 7773 if (signal_pending(current)) {
7821 if (mddev->pers->sync_request && !mddev->external) { 7774 if (mddev->pers->sync_request && !mddev->external) {
7822 printk(KERN_INFO "md: %s in immediate safe mode\n", 7775 printk(KERN_INFO "md: %s in immediate safe mode\n",
7823 mdname(mddev)); 7776 mdname(mddev));
7824 mddev->safemode = 2; 7777 mddev->safemode = 2;
7825 } 7778 }
7826 flush_signals(current); 7779 flush_signals(current);
7827 } 7780 }
7828 7781
7829 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 7782 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7830 return; 7783 return;
7831 if ( ! ( 7784 if ( ! (
7832 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) || 7785 (mddev->flags & ~ (1<<MD_CHANGE_PENDING)) ||
7833 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7786 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7834 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 7787 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7835 (mddev->external == 0 && mddev->safemode == 1) || 7788 (mddev->external == 0 && mddev->safemode == 1) ||
7836 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 7789 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7837 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 7790 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7838 )) 7791 ))
7839 return; 7792 return;
7840 7793
7841 if (mddev_trylock(mddev)) { 7794 if (mddev_trylock(mddev)) {
7842 int spares = 0; 7795 int spares = 0;
7843 7796
7844 if (mddev->ro) { 7797 if (mddev->ro) {
7845 /* On a read-only array we can: 7798 /* On a read-only array we can:
7846 * - remove failed devices 7799 * - remove failed devices
7847 * - add already-in_sync devices if the array itself 7800 * - add already-in_sync devices if the array itself
7848 * is in-sync. 7801 * is in-sync.
7849 * As we only add devices that are already in-sync, 7802 * As we only add devices that are already in-sync,
7850 * we can activate the spares immediately. 7803 * we can activate the spares immediately.
7851 */ 7804 */
7852 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7805 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7853 remove_and_add_spares(mddev, NULL); 7806 remove_and_add_spares(mddev, NULL);
7854 mddev->pers->spare_active(mddev); 7807 mddev->pers->spare_active(mddev);
7855 goto unlock; 7808 goto unlock;
7856 } 7809 }
7857 7810
7858 if (!mddev->external) { 7811 if (!mddev->external) {
7859 int did_change = 0; 7812 int did_change = 0;
7860 spin_lock_irq(&mddev->write_lock); 7813 spin_lock_irq(&mddev->write_lock);
7861 if (mddev->safemode && 7814 if (mddev->safemode &&
7862 !atomic_read(&mddev->writes_pending) && 7815 !atomic_read(&mddev->writes_pending) &&
7863 !mddev->in_sync && 7816 !mddev->in_sync &&
7864 mddev->recovery_cp == MaxSector) { 7817 mddev->recovery_cp == MaxSector) {
7865 mddev->in_sync = 1; 7818 mddev->in_sync = 1;
7866 did_change = 1; 7819 did_change = 1;
7867 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7820 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7868 } 7821 }
7869 if (mddev->safemode == 1) 7822 if (mddev->safemode == 1)
7870 mddev->safemode = 0; 7823 mddev->safemode = 0;
7871 spin_unlock_irq(&mddev->write_lock); 7824 spin_unlock_irq(&mddev->write_lock);
7872 if (did_change) 7825 if (did_change)
7873 sysfs_notify_dirent_safe(mddev->sysfs_state); 7826 sysfs_notify_dirent_safe(mddev->sysfs_state);
7874 } 7827 }
7875 7828
7876 if (mddev->flags) 7829 if (mddev->flags)
7877 md_update_sb(mddev, 0); 7830 md_update_sb(mddev, 0);
7878 7831
7879 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7832 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7880 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7833 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7881 /* resync/recovery still happening */ 7834 /* resync/recovery still happening */
7882 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7835 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7883 goto unlock; 7836 goto unlock;
7884 } 7837 }
7885 if (mddev->sync_thread) { 7838 if (mddev->sync_thread) {
7886 reap_sync_thread(mddev); 7839 md_reap_sync_thread(mddev);
7887 goto unlock; 7840 goto unlock;
7888 } 7841 }
7889 /* Set RUNNING before clearing NEEDED to avoid 7842 /* Set RUNNING before clearing NEEDED to avoid
7890 * any transients in the value of "sync_action". 7843 * any transients in the value of "sync_action".
7891 */ 7844 */
7892 mddev->curr_resync_completed = 0; 7845 mddev->curr_resync_completed = 0;
7893 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7846 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7894 /* Clear some bits that don't mean anything, but 7847 /* Clear some bits that don't mean anything, but
7895 * might be left set 7848 * might be left set
7896 */ 7849 */
7897 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 7850 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7898 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7851 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7899 7852
7900 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7853 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7901 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 7854 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7902 goto unlock; 7855 goto unlock;
7903 /* no recovery is running. 7856 /* no recovery is running.
7904 * remove any failed drives, then 7857 * remove any failed drives, then
7905 * add spares if possible. 7858 * add spares if possible.
7906 * Spares are also removed and re-added, to allow 7859 * Spares are also removed and re-added, to allow
7907 * the personality to fail the re-add. 7860 * the personality to fail the re-add.
7908 */ 7861 */
7909 7862
7910 if (mddev->reshape_position != MaxSector) { 7863 if (mddev->reshape_position != MaxSector) {
7911 if (mddev->pers->check_reshape == NULL || 7864 if (mddev->pers->check_reshape == NULL ||
7912 mddev->pers->check_reshape(mddev) != 0) 7865 mddev->pers->check_reshape(mddev) != 0)
7913 /* Cannot proceed */ 7866 /* Cannot proceed */
7914 goto unlock; 7867 goto unlock;
7915 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7868 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7916 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7869 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7917 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 7870 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
7918 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7871 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7919 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7872 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7920 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7873 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7921 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7874 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7922 } else if (mddev->recovery_cp < MaxSector) { 7875 } else if (mddev->recovery_cp < MaxSector) {
7923 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7876 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7924 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7877 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7925 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 7878 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7926 /* nothing to be done ... */ 7879 /* nothing to be done ... */
7927 goto unlock; 7880 goto unlock;
7928 7881
7929 if (mddev->pers->sync_request) { 7882 if (mddev->pers->sync_request) {
7930 if (spares) { 7883 if (spares) {
7931 /* We are adding a device or devices to an array 7884 /* We are adding a device or devices to an array
7932 * which has the bitmap stored on all devices. 7885 * which has the bitmap stored on all devices.
7933 * So make sure all bitmap pages get written 7886 * So make sure all bitmap pages get written
7934 */ 7887 */
7935 bitmap_write_all(mddev->bitmap); 7888 bitmap_write_all(mddev->bitmap);
7936 } 7889 }
7937 mddev->sync_thread = md_register_thread(md_do_sync, 7890 mddev->sync_thread = md_register_thread(md_do_sync,
7938 mddev, 7891 mddev,
7939 "resync"); 7892 "resync");
7940 if (!mddev->sync_thread) { 7893 if (!mddev->sync_thread) {
7941 printk(KERN_ERR "%s: could not start resync" 7894 printk(KERN_ERR "%s: could not start resync"
7942 " thread...\n", 7895 " thread...\n",
7943 mdname(mddev)); 7896 mdname(mddev));
7944 /* leave the spares where they are, it shouldn't hurt */ 7897 /* leave the spares where they are, it shouldn't hurt */
7945 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7898 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7946 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7899 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7947 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7900 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7948 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7901 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7949 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7902 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7950 } else 7903 } else
7951 md_wakeup_thread(mddev->sync_thread); 7904 md_wakeup_thread(mddev->sync_thread);
7952 sysfs_notify_dirent_safe(mddev->sysfs_action); 7905 sysfs_notify_dirent_safe(mddev->sysfs_action);
7953 md_new_event(mddev); 7906 md_new_event(mddev);
7954 } 7907 }
7955 unlock: 7908 unlock:
7956 if (!mddev->sync_thread) { 7909 if (!mddev->sync_thread) {
7957 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7910 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7958 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 7911 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7959 &mddev->recovery)) 7912 &mddev->recovery))
7960 if (mddev->sysfs_action) 7913 if (mddev->sysfs_action)
7961 sysfs_notify_dirent_safe(mddev->sysfs_action); 7914 sysfs_notify_dirent_safe(mddev->sysfs_action);
7962 } 7915 }
7963 mddev_unlock(mddev); 7916 mddev_unlock(mddev);
7964 } 7917 }
7965 } 7918 }
7966 7919
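Editorial aside: the check/fork/reap cycle described in the header comment of md_check_recovery() above is normally driven from each personality's per-array daemon thread. The fragment below is a hedged sketch of that typical call pattern; raidX_daemon and raidX_request_recovery are hypothetical names (not part of this patch), and struct md_thread's mddev back-pointer is assumed from md.h. The md_* calls and recovery bits are the ones used elsewhere in this file.

/* Sketch only: how a personality thread typically drives md_check_recovery().
 * raidX_daemon / raidX_request_recovery are hypothetical placeholder names.
 */
static void raidX_daemon(struct md_thread *thread)
{
	struct mddev *mddev = thread->mddev;

	/* ... service the personality's pending I/O work ... */

	/* let the generic layer start, monitor or reap a resync thread */
	md_check_recovery(mddev);
}

/* wherever a resync/recovery becomes necessary: */
static void raidX_request_recovery(struct mddev *mddev)
{
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);	/* daemon will call md_check_recovery() */
}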
7920 void md_reap_sync_thread(struct mddev *mddev)
7921 {
7922 struct md_rdev *rdev;
7923
7924 /* resync has finished, collect result */
7925 md_unregister_thread(&mddev->sync_thread);
7926 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7927 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7928 /* success...*/
7929 /* activate any spares */
7930 if (mddev->pers->spare_active(mddev)) {
7931 sysfs_notify(&mddev->kobj, NULL,
7932 "degraded");
7933 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7934 }
7935 }
7936 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7937 mddev->pers->finish_reshape)
7938 mddev->pers->finish_reshape(mddev);
7939
7940 /* If array is no longer degraded, then any saved_raid_disk
7941 * information must be scrapped. Also if any device is now
7942 * In_sync we must scrap the saved_raid_disk for that device
7943 * so that the superblock for an incrementally recovered device
7944 * is written out.
7945 */
7946 rdev_for_each(rdev, mddev)
7947 if (!mddev->degraded ||
7948 test_bit(In_sync, &rdev->flags))
7949 rdev->saved_raid_disk = -1;
7950
7951 md_update_sb(mddev, 1);
7952 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7953 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7954 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7955 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7956 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7957 /* flag recovery needed just to double check */
7958 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7959 sysfs_notify_dirent_safe(mddev->sysfs_action);
7960 md_new_event(mddev);
7961 if (mddev->event_work.func)
7962 queue_work(md_misc_wq, &mddev->event_work);
7963 }
7964
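Editorial note: because md_reap_sync_thread() now has external linkage (the static reap_sync_thread() above is removed), code outside this file can collect a finished resync itself. A minimal, hypothetical sketch, assuming the caller holds the array lock as md_check_recovery() does via mddev_trylock() and that a declaration is visible through md.h; example_collect_resync is an invented name, not part of this patch.

/* Hypothetical external caller; shown for illustration only. */
static void example_collect_resync(struct mddev *mddev)
{
	if (mddev->sync_thread &&
	    test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		/* unregisters the thread, activates spares, updates the superblock */
		md_reap_sync_thread(mddev);
}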
7967 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 7965 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7968 { 7966 {
7969 sysfs_notify_dirent_safe(rdev->sysfs_state); 7967 sysfs_notify_dirent_safe(rdev->sysfs_state);
7970 wait_event_timeout(rdev->blocked_wait, 7968 wait_event_timeout(rdev->blocked_wait,
7971 !test_bit(Blocked, &rdev->flags) && 7969 !test_bit(Blocked, &rdev->flags) &&
7972 !test_bit(BlockedBadBlocks, &rdev->flags), 7970 !test_bit(BlockedBadBlocks, &rdev->flags),
7973 msecs_to_jiffies(5000)); 7971 msecs_to_jiffies(5000));
7974 rdev_dec_pending(rdev, mddev); 7972 rdev_dec_pending(rdev, mddev);
7975 } 7973 }
7976 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7974 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7977 7975
7978 void md_finish_reshape(struct mddev *mddev) 7976 void md_finish_reshape(struct mddev *mddev)
7979 { 7977 {
7980 /* called by personality module when reshape completes. */ 7978 /* called by personality module when reshape completes. */
7981 struct md_rdev *rdev; 7979 struct md_rdev *rdev;
7982 7980
7983 rdev_for_each(rdev, mddev) { 7981 rdev_for_each(rdev, mddev) {
7984 if (rdev->data_offset > rdev->new_data_offset) 7982 if (rdev->data_offset > rdev->new_data_offset)
7985 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 7983 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
7986 else 7984 else
7987 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 7985 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
7988 rdev->data_offset = rdev->new_data_offset; 7986 rdev->data_offset = rdev->new_data_offset;
7989 } 7987 }
7990 } 7988 }
7991 EXPORT_SYMBOL(md_finish_reshape); 7989 EXPORT_SYMBOL(md_finish_reshape);
7992 7990
7993 /* Bad block management. 7991 /* Bad block management.
7994 * We can record which blocks on each device are 'bad' and so just 7992 * We can record which blocks on each device are 'bad' and so just
7995 * fail those blocks, or that stripe, rather than the whole device. 7993 * fail those blocks, or that stripe, rather than the whole device.
7996 * Entries in the bad-block table are 64bits wide. This comprises: 7994 * Entries in the bad-block table are 64bits wide. This comprises:
7997 * Length of bad-range, in sectors: 0-511 for lengths 1-512 7995 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7998 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) 7996 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7999 * A 'shift' can be set so that larger blocks are tracked and 7997 * A 'shift' can be set so that larger blocks are tracked and
8000 * consequently larger devices can be covered. 7998 * consequently larger devices can be covered.
8001 * 'Acknowledged' flag - 1 bit. - the most significant bit. 7999 * 'Acknowledged' flag - 1 bit. - the most significant bit.
8002 * 8000 *
8003 * Locking of the bad-block table uses a seqlock so md_is_badblock 8001 * Locking of the bad-block table uses a seqlock so md_is_badblock
8004 * might need to retry if it is very unlucky. 8002 * might need to retry if it is very unlucky.
8005 * We will sometimes want to check for bad blocks in a bi_end_io function, 8003 * We will sometimes want to check for bad blocks in a bi_end_io function,
8006 * so we use the write_seqlock_irq variant. 8004 * so we use the write_seqlock_irq variant.
8007 * 8005 *
8008 * When looking for a bad block we specify a range and want to 8006 * When looking for a bad block we specify a range and want to
8009 * know if any block in the range is bad. So we binary-search 8007 * know if any block in the range is bad. So we binary-search
8010 * to the last range that starts at-or-before the given endpoint, 8008 * to the last range that starts at-or-before the given endpoint,
8011 * (or "before the sector after the target range") 8009 * (or "before the sector after the target range")
8012 * then see if it ends after the given start. 8010 * then see if it ends after the given start.
8013 * We return 8011 * We return
8014 * 0 if there are no known bad blocks in the range 8012 * 0 if there are no known bad blocks in the range
8015 * 1 if there are known bad blocks which are all acknowledged 8013 * 1 if there are known bad blocks which are all acknowledged
8016 * -1 if there are bad blocks which have not yet been acknowledged in metadata. 8014 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
8017 * plus the start/length of the first bad section we overlap. 8015 * plus the start/length of the first bad section we overlap.
8018 */ 8016 */
8019 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 8017 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
8020 sector_t *first_bad, int *bad_sectors) 8018 sector_t *first_bad, int *bad_sectors)
8021 { 8019 {
8022 int hi; 8020 int hi;
8023 int lo; 8021 int lo;
8024 u64 *p = bb->page; 8022 u64 *p = bb->page;
8025 int rv; 8023 int rv;
8026 sector_t target = s + sectors; 8024 sector_t target = s + sectors;
8027 unsigned seq; 8025 unsigned seq;
8028 8026
8029 if (bb->shift > 0) { 8027 if (bb->shift > 0) {
8030 /* round the start down, and the end up */ 8028 /* round the start down, and the end up */
8031 s >>= bb->shift; 8029 s >>= bb->shift;
8032 target += (1<<bb->shift) - 1; 8030 target += (1<<bb->shift) - 1;
8033 target >>= bb->shift; 8031 target >>= bb->shift;
8034 sectors = target - s; 8032 sectors = target - s;
8035 } 8033 }
8036 /* 'target' is now the first block after the bad range */ 8034 /* 'target' is now the first block after the bad range */
8037 8035
8038 retry: 8036 retry:
8039 seq = read_seqbegin(&bb->lock); 8037 seq = read_seqbegin(&bb->lock);
8040 lo = 0; 8038 lo = 0;
8041 rv = 0; 8039 rv = 0;
8042 hi = bb->count; 8040 hi = bb->count;
8043 8041
8044 /* Binary search between lo and hi for 'target' 8042 /* Binary search between lo and hi for 'target'
8045 * i.e. for the last range that starts before 'target' 8043 * i.e. for the last range that starts before 'target'
8046 */ 8044 */
8047 /* INVARIANT: ranges before 'lo' and at-or-after 'hi' 8045 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
8048 * are known not to be the last range before target. 8046 * are known not to be the last range before target.
8049 * VARIANT: hi-lo is the number of possible 8047 * VARIANT: hi-lo is the number of possible
8050 * ranges, and decreases until it reaches 1 8048 * ranges, and decreases until it reaches 1
8051 */ 8049 */
8052 while (hi - lo > 1) { 8050 while (hi - lo > 1) {
8053 int mid = (lo + hi) / 2; 8051 int mid = (lo + hi) / 2;
8054 sector_t a = BB_OFFSET(p[mid]); 8052 sector_t a = BB_OFFSET(p[mid]);
8055 if (a < target) 8053 if (a < target)
8056 /* This could still be the one, earlier ranges 8054 /* This could still be the one, earlier ranges
8057 * could not. */ 8055 * could not. */
8058 lo = mid; 8056 lo = mid;
8059 else 8057 else
8060 /* This and later ranges are definitely out. */ 8058 /* This and later ranges are definitely out. */
8061 hi = mid; 8059 hi = mid;
8062 } 8060 }
8063 /* 'lo' might be the last that started before target, but 'hi' isn't */ 8061 /* 'lo' might be the last that started before target, but 'hi' isn't */
8064 if (hi > lo) { 8062 if (hi > lo) {
8065 /* need to check all ranges that end after 's' to see if 8063 /* need to check all ranges that end after 's' to see if
8066 * any are unacknowledged. 8064 * any are unacknowledged.
8067 */ 8065 */
8068 while (lo >= 0 && 8066 while (lo >= 0 &&
8069 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8067 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8070 if (BB_OFFSET(p[lo]) < target) { 8068 if (BB_OFFSET(p[lo]) < target) {
8071 /* starts before the end, and finishes after 8069 /* starts before the end, and finishes after
8072 * the start, so they must overlap 8070 * the start, so they must overlap
8073 */ 8071 */
8074 if (rv != -1 && BB_ACK(p[lo])) 8072 if (rv != -1 && BB_ACK(p[lo]))
8075 rv = 1; 8073 rv = 1;
8076 else 8074 else
8077 rv = -1; 8075 rv = -1;
8078 *first_bad = BB_OFFSET(p[lo]); 8076 *first_bad = BB_OFFSET(p[lo]);
8079 *bad_sectors = BB_LEN(p[lo]); 8077 *bad_sectors = BB_LEN(p[lo]);
8080 } 8078 }
8081 lo--; 8079 lo--;
8082 } 8080 }
8083 } 8081 }
8084 8082
8085 if (read_seqretry(&bb->lock, seq)) 8083 if (read_seqretry(&bb->lock, seq))
8086 goto retry; 8084 goto retry;
8087 8085
8088 return rv; 8086 return rv;
8089 } 8087 }
8090 EXPORT_SYMBOL_GPL(md_is_badblock); 8088 EXPORT_SYMBOL_GPL(md_is_badblock);
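Editorial aside: the 64-bit entry layout described before md_is_badblock() (bits 0-8 hold length-1 for 1-512 sectors, bits 9-62 hold the 54-bit start sector, bit 63 is the 'acknowledged' flag) can be exercised with a small stand-alone program. The BB_* macros below are re-derived from that comment purely for illustration; treat them as a sketch of the layout, not the kernel's exact definitions.

#include <stdint.h>
#include <stdio.h>

/* pack/unpack helpers matching the layout described above (illustrative only) */
#define BB_MAKE(start, len, ack) \
	(((uint64_t)(start) << 9) | (((uint64_t)(len) - 1) & 0x1ff) | \
	 ((uint64_t)((ack) ? 1 : 0) << 63))
#define BB_OFFSET(e)	(((e) >> 9) & ((1ULL << 54) - 1))
#define BB_LEN(e)	(((e) & 0x1ff) + 1)
#define BB_ACK(e)	((int)((e) >> 63))

int main(void)
{
	/* record 8 bad sectors starting at sector 123456, already acknowledged */
	uint64_t e = BB_MAKE(123456, 8, 1);

	printf("start=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned long long)BB_LEN(e),
	       BB_ACK(e));
	return 0;
}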
8091 8089
8092 /* 8090 /*
8093 * Add a range of bad blocks to the table. 8091 * Add a range of bad blocks to the table.
8094 * This might extend the table, or might contract it 8092 * This might extend the table, or might contract it
8095 * if two adjacent ranges can be merged. 8093 * if two adjacent ranges can be merged.
8096 * We binary-search to find the 'insertion' point, then 8094 * We binary-search to find the 'insertion' point, then
8097 * decide how best to handle it. 8095 * decide how best to handle it.
8098 */ 8096 */
8099 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 8097 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8100 int acknowledged) 8098 int acknowledged)
8101 { 8099 {
8102 u64 *p; 8100 u64 *p;
8103 int lo, hi; 8101 int lo, hi;
8104 int rv = 1; 8102 int rv = 1;
8105 8103
8106 if (bb->shift < 0) 8104 if (bb->shift < 0)
8107 /* badblocks are disabled */ 8105 /* badblocks are disabled */
8108 return 0; 8106 return 0;
8109 8107
8110 if (bb->shift) { 8108 if (bb->shift) {
8111 /* round the start down, and the end up */ 8109 /* round the start down, and the end up */
8112 sector_t next = s + sectors; 8110 sector_t next = s + sectors;
8113 s >>= bb->shift; 8111 s >>= bb->shift;
8114 next += (1<<bb->shift) - 1; 8112 next += (1<<bb->shift) - 1;
8115 next >>= bb->shift; 8113 next >>= bb->shift;
8116 sectors = next - s; 8114 sectors = next - s;
8117 } 8115 }
8118 8116
8119 write_seqlock_irq(&bb->lock); 8117 write_seqlock_irq(&bb->lock);
8120 8118
8121 p = bb->page; 8119 p = bb->page;
8122 lo = 0; 8120 lo = 0;
8123 hi = bb->count; 8121 hi = bb->count;
8124 /* Find the last range that starts at-or-before 's' */ 8122 /* Find the last range that starts at-or-before 's' */
8125 while (hi - lo > 1) { 8123 while (hi - lo > 1) {
8126 int mid = (lo + hi) / 2; 8124 int mid = (lo + hi) / 2;
8127 sector_t a = BB_OFFSET(p[mid]); 8125 sector_t a = BB_OFFSET(p[mid]);
8128 if (a <= s) 8126 if (a <= s)
8129 lo = mid; 8127 lo = mid;
8130 else 8128 else
8131 hi = mid; 8129 hi = mid;
8132 } 8130 }
8133 if (hi > lo && BB_OFFSET(p[lo]) > s) 8131 if (hi > lo && BB_OFFSET(p[lo]) > s)
8134 hi = lo; 8132 hi = lo;
8135 8133
8136 if (hi > lo) { 8134 if (hi > lo) {
8137 /* we found a range that might merge with the start 8135 /* we found a range that might merge with the start
8138 * of our new range 8136 * of our new range
8139 */ 8137 */
8140 sector_t a = BB_OFFSET(p[lo]); 8138 sector_t a = BB_OFFSET(p[lo]);
8141 sector_t e = a + BB_LEN(p[lo]); 8139 sector_t e = a + BB_LEN(p[lo]);
8142 int ack = BB_ACK(p[lo]); 8140 int ack = BB_ACK(p[lo]);
8143 if (e >= s) { 8141 if (e >= s) {
8144 /* Yes, we can merge with a previous range */ 8142 /* Yes, we can merge with a previous range */
8145 if (s == a && s + sectors >= e) 8143 if (s == a && s + sectors >= e)
8146 /* new range covers old */ 8144 /* new range covers old */
8147 ack = acknowledged; 8145 ack = acknowledged;
8148 else 8146 else
8149 ack = ack && acknowledged; 8147 ack = ack && acknowledged;
8150 8148
8151 if (e < s + sectors) 8149 if (e < s + sectors)
8152 e = s + sectors; 8150 e = s + sectors;
8153 if (e - a <= BB_MAX_LEN) { 8151 if (e - a <= BB_MAX_LEN) {
8154 p[lo] = BB_MAKE(a, e-a, ack); 8152 p[lo] = BB_MAKE(a, e-a, ack);
8155 s = e; 8153 s = e;
8156 } else { 8154 } else {
8157 /* does not all fit in one range, 8155 /* does not all fit in one range,
8158 * make p[lo] maximal 8156 * make p[lo] maximal
8159 */ 8157 */
8160 if (BB_LEN(p[lo]) != BB_MAX_LEN) 8158 if (BB_LEN(p[lo]) != BB_MAX_LEN)
8161 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); 8159 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8162 s = a + BB_MAX_LEN; 8160 s = a + BB_MAX_LEN;
8163 } 8161 }
8164 sectors = e - s; 8162 sectors = e - s;
8165 } 8163 }
8166 } 8164 }
8167 if (sectors && hi < bb->count) { 8165 if (sectors && hi < bb->count) {
8168 /* 'hi' points to the first range that starts after 's'. 8166 /* 'hi' points to the first range that starts after 's'.
8169 * Maybe we can merge with the start of that range */ 8167 * Maybe we can merge with the start of that range */
8170 sector_t a = BB_OFFSET(p[hi]); 8168 sector_t a = BB_OFFSET(p[hi]);
8171 sector_t e = a + BB_LEN(p[hi]); 8169 sector_t e = a + BB_LEN(p[hi]);
8172 int ack = BB_ACK(p[hi]); 8170 int ack = BB_ACK(p[hi]);
8173 if (a <= s + sectors) { 8171 if (a <= s + sectors) {
8174 /* merging is possible */ 8172 /* merging is possible */
8175 if (e <= s + sectors) { 8173 if (e <= s + sectors) {
8176 /* full overlap */ 8174 /* full overlap */
8177 e = s + sectors; 8175 e = s + sectors;
8178 ack = acknowledged; 8176 ack = acknowledged;
8179 } else 8177 } else
8180 ack = ack && acknowledged; 8178 ack = ack && acknowledged;
8181 8179
8182 a = s; 8180 a = s;
8183 if (e - a <= BB_MAX_LEN) { 8181 if (e - a <= BB_MAX_LEN) {
8184 p[hi] = BB_MAKE(a, e-a, ack); 8182 p[hi] = BB_MAKE(a, e-a, ack);
8185 s = e; 8183 s = e;
8186 } else { 8184 } else {
8187 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); 8185 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8188 s = a + BB_MAX_LEN; 8186 s = a + BB_MAX_LEN;
8189 } 8187 }
8190 sectors = e - s; 8188 sectors = e - s;
8191 lo = hi; 8189 lo = hi;
8192 hi++; 8190 hi++;
8193 } 8191 }
8194 } 8192 }
8195 if (sectors == 0 && hi < bb->count) { 8193 if (sectors == 0 && hi < bb->count) {
8196 /* we might be able to combine lo and hi */ 8194 /* we might be able to combine lo and hi */
8197 /* Note: 's' is at the end of 'lo' */ 8195 /* Note: 's' is at the end of 'lo' */
8198 sector_t a = BB_OFFSET(p[hi]); 8196 sector_t a = BB_OFFSET(p[hi]);
8199 int lolen = BB_LEN(p[lo]); 8197 int lolen = BB_LEN(p[lo]);
8200 int hilen = BB_LEN(p[hi]); 8198 int hilen = BB_LEN(p[hi]);
8201 int newlen = lolen + hilen - (s - a); 8199 int newlen = lolen + hilen - (s - a);
8202 if (s >= a && newlen < BB_MAX_LEN) { 8200 if (s >= a && newlen < BB_MAX_LEN) {
8203 /* yes, we can combine them */ 8201 /* yes, we can combine them */
8204 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); 8202 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8205 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); 8203 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8206 memmove(p + hi, p + hi + 1, 8204 memmove(p + hi, p + hi + 1,
8207 (bb->count - hi - 1) * 8); 8205 (bb->count - hi - 1) * 8);
8208 bb->count--; 8206 bb->count--;
8209 } 8207 }
8210 } 8208 }
8211 while (sectors) { 8209 while (sectors) {
8212 /* didn't merge (it all). 8210 /* didn't merge (it all).
8213 * Need to add a range just before 'hi' */ 8211 * Need to add a range just before 'hi' */
8214 if (bb->count >= MD_MAX_BADBLOCKS) { 8212 if (bb->count >= MD_MAX_BADBLOCKS) {
8215 /* No room for more */ 8213 /* No room for more */
8216 rv = 0; 8214 rv = 0;
8217 break; 8215 break;
8218 } else { 8216 } else {
8219 int this_sectors = sectors; 8217 int this_sectors = sectors;
8220 memmove(p + hi + 1, p + hi, 8218 memmove(p + hi + 1, p + hi,
8221 (bb->count - hi) * 8); 8219 (bb->count - hi) * 8);
8222 bb->count++; 8220 bb->count++;
8223 8221
8224 if (this_sectors > BB_MAX_LEN) 8222 if (this_sectors > BB_MAX_LEN)
8225 this_sectors = BB_MAX_LEN; 8223 this_sectors = BB_MAX_LEN;
8226 p[hi] = BB_MAKE(s, this_sectors, acknowledged); 8224 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8227 sectors -= this_sectors; 8225 sectors -= this_sectors;
8228 s += this_sectors; 8226 s += this_sectors;
8229 } 8227 }
8230 } 8228 }
8231 8229
8232 bb->changed = 1; 8230 bb->changed = 1;
8233 if (!acknowledged) 8231 if (!acknowledged)
8234 bb->unacked_exist = 1; 8232 bb->unacked_exist = 1;
8235 write_sequnlock_irq(&bb->lock); 8233 write_sequnlock_irq(&bb->lock);
8236 8234
8237 return rv; 8235 return rv;
8238 } 8236 }
8239 8237
8240 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8238 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8241 int is_new) 8239 int is_new)
8242 { 8240 {
8243 int rv; 8241 int rv;
8244 if (is_new) 8242 if (is_new)
8245 s += rdev->new_data_offset; 8243 s += rdev->new_data_offset;
8246 else 8244 else
8247 s += rdev->data_offset; 8245 s += rdev->data_offset;
8248 rv = md_set_badblocks(&rdev->badblocks, 8246 rv = md_set_badblocks(&rdev->badblocks,
8249 s, sectors, 0); 8247 s, sectors, 0);
8250 if (rv) { 8248 if (rv) {
8251 /* Make sure they get written out promptly */ 8249 /* Make sure they get written out promptly */
8252 sysfs_notify_dirent_safe(rdev->sysfs_state); 8250 sysfs_notify_dirent_safe(rdev->sysfs_state);
8253 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8251 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8254 md_wakeup_thread(rdev->mddev->thread); 8252 md_wakeup_thread(rdev->mddev->thread);
8255 } 8253 }
8256 return rv; 8254 return rv;
8257 } 8255 }
8258 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8256 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8259 8257
8260 /* 8258 /*
8261 * Remove a range of bad blocks from the table. 8259 * Remove a range of bad blocks from the table.
8262 * This may involve extending the table if we split a region, 8260 * This may involve extending the table if we split a region,
8263 * but it must not fail. So if the table becomes full, we just 8261 * but it must not fail. So if the table becomes full, we just
8264 * drop the remove request. 8262 * drop the remove request.
8265 */ 8263 */
8266 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) 8264 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8267 { 8265 {
8268 u64 *p; 8266 u64 *p;
8269 int lo, hi; 8267 int lo, hi;
8270 sector_t target = s + sectors; 8268 sector_t target = s + sectors;
8271 int rv = 0; 8269 int rv = 0;
8272 8270
8273 if (bb->shift > 0) { 8271 if (bb->shift > 0) {
8274 /* When clearing we round the start up and the end down. 8272 /* When clearing we round the start up and the end down.
8275 * This should not matter as the shift should align with 8273 * This should not matter as the shift should align with
8276 * the block size and no rounding should ever be needed. 8274 * the block size and no rounding should ever be needed.
8277 * However it is better to think a block is bad when it 8275 * However it is better to think a block is bad when it
8278 * isn't than to think a block is not bad when it is. 8276 * isn't than to think a block is not bad when it is.
8279 */ 8277 */
8280 s += (1<<bb->shift) - 1; 8278 s += (1<<bb->shift) - 1;
8281 s >>= bb->shift; 8279 s >>= bb->shift;
8282 target >>= bb->shift; 8280 target >>= bb->shift;
8283 sectors = target - s; 8281 sectors = target - s;
8284 } 8282 }
8285 8283
8286 write_seqlock_irq(&bb->lock); 8284 write_seqlock_irq(&bb->lock);
8287 8285
8288 p = bb->page; 8286 p = bb->page;
8289 lo = 0; 8287 lo = 0;
8290 hi = bb->count; 8288 hi = bb->count;
8291 /* Find the last range that starts before 'target' */ 8289 /* Find the last range that starts before 'target' */
8292 while (hi - lo > 1) { 8290 while (hi - lo > 1) {
8293 int mid = (lo + hi) / 2; 8291 int mid = (lo + hi) / 2;
8294 sector_t a = BB_OFFSET(p[mid]); 8292 sector_t a = BB_OFFSET(p[mid]);
8295 if (a < target) 8293 if (a < target)
8296 lo = mid; 8294 lo = mid;
8297 else 8295 else
8298 hi = mid; 8296 hi = mid;
8299 } 8297 }
8300 if (hi > lo) { 8298 if (hi > lo) {
8301 /* p[lo] is the last range that could overlap the 8299 /* p[lo] is the last range that could overlap the
8302 * current range. Earlier ranges could also overlap, 8300 * current range. Earlier ranges could also overlap,
8303 * but only this one can overlap the end of the range. 8301 * but only this one can overlap the end of the range.
8304 */ 8302 */
8305 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { 8303 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8306 /* Partial overlap, leave the tail of this range */ 8304 /* Partial overlap, leave the tail of this range */
8307 int ack = BB_ACK(p[lo]); 8305 int ack = BB_ACK(p[lo]);
8308 sector_t a = BB_OFFSET(p[lo]); 8306 sector_t a = BB_OFFSET(p[lo]);
8309 sector_t end = a + BB_LEN(p[lo]); 8307 sector_t end = a + BB_LEN(p[lo]);
8310 8308
8311 if (a < s) { 8309 if (a < s) {
8312 /* we need to split this range */ 8310 /* we need to split this range */
8313 if (bb->count >= MD_MAX_BADBLOCKS) { 8311 if (bb->count >= MD_MAX_BADBLOCKS) {
8314 rv = 0; 8312 rv = 0;
8315 goto out; 8313 goto out;
8316 } 8314 }
8317 memmove(p+lo+1, p+lo, (bb->count - lo) * 8); 8315 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8318 bb->count++; 8316 bb->count++;
8319 p[lo] = BB_MAKE(a, s-a, ack); 8317 p[lo] = BB_MAKE(a, s-a, ack);
8320 lo++; 8318 lo++;
8321 } 8319 }
8322 p[lo] = BB_MAKE(target, end - target, ack); 8320 p[lo] = BB_MAKE(target, end - target, ack);
8323 /* there is no longer an overlap */ 8321 /* there is no longer an overlap */
8324 hi = lo; 8322 hi = lo;
8325 lo--; 8323 lo--;
8326 } 8324 }
8327 while (lo >= 0 && 8325 while (lo >= 0 &&
8328 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8326 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8329 /* This range does overlap */ 8327 /* This range does overlap */
8330 if (BB_OFFSET(p[lo]) < s) { 8328 if (BB_OFFSET(p[lo]) < s) {
8331 /* Keep the early parts of this range. */ 8329 /* Keep the early parts of this range. */
8332 int ack = BB_ACK(p[lo]); 8330 int ack = BB_ACK(p[lo]);
8333 sector_t start = BB_OFFSET(p[lo]); 8331 sector_t start = BB_OFFSET(p[lo]);
8334 p[lo] = BB_MAKE(start, s - start, ack); 8332 p[lo] = BB_MAKE(start, s - start, ack);
8335 /* now lo doesn't overlap, so.. */ 8333 /* now lo doesn't overlap, so.. */
8336 break; 8334 break;
8337 } 8335 }
8338 lo--; 8336 lo--;
8339 } 8337 }
8340 /* 'lo' is strictly before, 'hi' is strictly after, 8338 /* 'lo' is strictly before, 'hi' is strictly after,
8341 * anything between needs to be discarded 8339 * anything between needs to be discarded
8342 */ 8340 */
8343 if (hi - lo > 1) { 8341 if (hi - lo > 1) {
8344 memmove(p+lo+1, p+hi, (bb->count - hi) * 8); 8342 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8345 bb->count -= (hi - lo - 1); 8343 bb->count -= (hi - lo - 1);
8346 } 8344 }
8347 } 8345 }
8348 8346
8349 bb->changed = 1; 8347 bb->changed = 1;
8350 out: 8348 out:
8351 write_sequnlock_irq(&bb->lock); 8349 write_sequnlock_irq(&bb->lock);
8352 return rv; 8350 return rv;
8353 } 8351 }
8354 8352
8355 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8353 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8356 int is_new) 8354 int is_new)
8357 { 8355 {
8358 if (is_new) 8356 if (is_new)
8359 s += rdev->new_data_offset; 8357 s += rdev->new_data_offset;
8360 else 8358 else
8361 s += rdev->data_offset; 8359 s += rdev->data_offset;
8362 return md_clear_badblocks(&rdev->badblocks, 8360 return md_clear_badblocks(&rdev->badblocks,
8363 s, sectors); 8361 s, sectors);
8364 } 8362 }
8365 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8363 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8366 8364
8367 /* 8365 /*
8368 * Acknowledge all bad blocks in a list. 8366 * Acknowledge all bad blocks in a list.
8369 * This only succeeds if ->changed is clear. It is used by 8367 * This only succeeds if ->changed is clear. It is used by
8370 * in-kernel metadata updates 8368 * in-kernel metadata updates
8371 */ 8369 */
8372 void md_ack_all_badblocks(struct badblocks *bb) 8370 void md_ack_all_badblocks(struct badblocks *bb)
8373 { 8371 {
8374 if (bb->page == NULL || bb->changed) 8372 if (bb->page == NULL || bb->changed)
8375 /* no point even trying */ 8373 /* no point even trying */
8376 return; 8374 return;
8377 write_seqlock_irq(&bb->lock); 8375 write_seqlock_irq(&bb->lock);
8378 8376
8379 if (bb->changed == 0 && bb->unacked_exist) { 8377 if (bb->changed == 0 && bb->unacked_exist) {
8380 u64 *p = bb->page; 8378 u64 *p = bb->page;
8381 int i; 8379 int i;
8382 for (i = 0; i < bb->count ; i++) { 8380 for (i = 0; i < bb->count ; i++) {
8383 if (!BB_ACK(p[i])) { 8381 if (!BB_ACK(p[i])) {
8384 sector_t start = BB_OFFSET(p[i]); 8382 sector_t start = BB_OFFSET(p[i]);
8385 int len = BB_LEN(p[i]); 8383 int len = BB_LEN(p[i]);
8386 p[i] = BB_MAKE(start, len, 1); 8384 p[i] = BB_MAKE(start, len, 1);
8387 } 8385 }
8388 } 8386 }
8389 bb->unacked_exist = 0; 8387 bb->unacked_exist = 0;
8390 } 8388 }
8391 write_sequnlock_irq(&bb->lock); 8389 write_sequnlock_irq(&bb->lock);
8392 } 8390 }
8393 EXPORT_SYMBOL_GPL(md_ack_all_badblocks); 8391 EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8394 8392
8395 /* sysfs access to bad-blocks list. 8393 /* sysfs access to bad-blocks list.
8396 * We present two files. 8394 * We present two files.
8397 * 'bad-blocks' lists sector numbers and lengths of ranges that 8395 * 'bad-blocks' lists sector numbers and lengths of ranges that
8398 * are recorded as bad. The list is truncated to fit within 8396 * are recorded as bad. The list is truncated to fit within
8399 * the one-page limit of sysfs. 8397 * the one-page limit of sysfs.
8400 * Writing "sector length" to this file adds an acknowledged 8398 * Writing "sector length" to this file adds an acknowledged
8401 * bad block to the list. 8399 * bad block to the list.
8402 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 8400 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8403 * been acknowledged. Writing to this file adds bad blocks 8401 * been acknowledged. Writing to this file adds bad blocks
8404 * without acknowledging them. This is largely for testing. 8402 * without acknowledging them. This is largely for testing.
8405 */ 8403 */
8406 8404
8407 static ssize_t 8405 static ssize_t
8408 badblocks_show(struct badblocks *bb, char *page, int unack) 8406 badblocks_show(struct badblocks *bb, char *page, int unack)
8409 { 8407 {
8410 size_t len; 8408 size_t len;
8411 int i; 8409 int i;
8412 u64 *p = bb->page; 8410 u64 *p = bb->page;
8413 unsigned seq; 8411 unsigned seq;
8414 8412
8415 if (bb->shift < 0) 8413 if (bb->shift < 0)
8416 return 0; 8414 return 0;
8417 8415
8418 retry: 8416 retry:
8419 seq = read_seqbegin(&bb->lock); 8417 seq = read_seqbegin(&bb->lock);
8420 8418
8421 len = 0; 8419 len = 0;
8422 i = 0; 8420 i = 0;
8423 8421
8424 while (len < PAGE_SIZE && i < bb->count) { 8422 while (len < PAGE_SIZE && i < bb->count) {
8425 sector_t s = BB_OFFSET(p[i]); 8423 sector_t s = BB_OFFSET(p[i]);
8426 unsigned int length = BB_LEN(p[i]); 8424 unsigned int length = BB_LEN(p[i]);
8427 int ack = BB_ACK(p[i]); 8425 int ack = BB_ACK(p[i]);
8428 i++; 8426 i++;
8429 8427
8430 if (unack && ack) 8428 if (unack && ack)
8431 continue; 8429 continue;
8432 8430
8433 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", 8431 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8434 (unsigned long long)s << bb->shift, 8432 (unsigned long long)s << bb->shift,
8435 length << bb->shift); 8433 length << bb->shift);
8436 } 8434 }
8437 if (unack && len == 0) 8435 if (unack && len == 0)
8438 bb->unacked_exist = 0; 8436 bb->unacked_exist = 0;
8439 8437
8440 if (read_seqretry(&bb->lock, seq)) 8438 if (read_seqretry(&bb->lock, seq))
8441 goto retry; 8439 goto retry;
8442 8440
8443 return len; 8441 return len;
8444 } 8442 }
8445 8443
8446 #define DO_DEBUG 1 8444 #define DO_DEBUG 1
8447 8445
8448 static ssize_t 8446 static ssize_t
8449 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) 8447 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8450 { 8448 {
8451 unsigned long long sector; 8449 unsigned long long sector;
8452 int length; 8450 int length;
8453 char newline; 8451 char newline;
8454 #ifdef DO_DEBUG 8452 #ifdef DO_DEBUG
8455 /* Allow clearing via sysfs *only* for testing/debugging. 8453 /* Allow clearing via sysfs *only* for testing/debugging.
8456 * Normally only a successful write may clear a badblock 8454 * Normally only a successful write may clear a badblock
8457 */ 8455 */
8458 int clear = 0; 8456 int clear = 0;
8459 if (page[0] == '-') { 8457 if (page[0] == '-') {
8460 clear = 1; 8458 clear = 1;
8461 page++; 8459 page++;
8462 } 8460 }
8463 #endif /* DO_DEBUG */ 8461 #endif /* DO_DEBUG */
8464 8462
8465 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { 8463 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8466 case 3: 8464 case 3:
8467 if (newline != '\n') 8465 if (newline != '\n')
8468 return -EINVAL; 8466 return -EINVAL;
8469 case 2: 8467 case 2:
8470 if (length <= 0) 8468 if (length <= 0)
8471 return -EINVAL; 8469 return -EINVAL;
8472 break; 8470 break;
8473 default: 8471 default:
8474 return -EINVAL; 8472 return -EINVAL;
8475 } 8473 }
8476 8474
8477 #ifdef DO_DEBUG 8475 #ifdef DO_DEBUG
8478 if (clear) { 8476 if (clear) {
8479 md_clear_badblocks(bb, sector, length); 8477 md_clear_badblocks(bb, sector, length);
8480 return len; 8478 return len;
8481 } 8479 }
8482 #endif /* DO_DEBUG */ 8480 #endif /* DO_DEBUG */
8483 if (md_set_badblocks(bb, sector, length, !unack)) 8481 if (md_set_badblocks(bb, sector, length, !unack))
8484 return len; 8482 return len;
8485 else 8483 else
8486 return -ENOSPC; 8484 return -ENOSPC;
8487 } 8485 }
8488 8486
8489 static int md_notify_reboot(struct notifier_block *this, 8487 static int md_notify_reboot(struct notifier_block *this,
8490 unsigned long code, void *x) 8488 unsigned long code, void *x)
8491 { 8489 {
8492 struct list_head *tmp; 8490 struct list_head *tmp;
8493 struct mddev *mddev; 8491 struct mddev *mddev;
8494 int need_delay = 0; 8492 int need_delay = 0;
8495 8493
8496 for_each_mddev(mddev, tmp) { 8494 for_each_mddev(mddev, tmp) {
8497 if (mddev_trylock(mddev)) { 8495 if (mddev_trylock(mddev)) {
8498 if (mddev->pers) 8496 if (mddev->pers)
8499 __md_stop_writes(mddev); 8497 __md_stop_writes(mddev);
8500 mddev->safemode = 2; 8498 mddev->safemode = 2;
8501 mddev_unlock(mddev); 8499 mddev_unlock(mddev);
8502 } 8500 }
8503 need_delay = 1; 8501 need_delay = 1;
8504 } 8502 }
8505 /* 8503 /*
8506 * certain more exotic SCSI devices are known to be 8504 * certain more exotic SCSI devices are known to be
8507 * volatile wrt too early system reboots. While the 8505 * volatile wrt too early system reboots. While the
8508 * right place to handle this issue is the given 8506 * right place to handle this issue is the given
8509 * driver, we do want to have a safe RAID driver ... 8507 * driver, we do want to have a safe RAID driver ...
8510 */ 8508 */
8511 if (need_delay) 8509 if (need_delay)
8512 mdelay(1000*1); 8510 mdelay(1000*1);
8513 8511
8514 return NOTIFY_DONE; 8512 return NOTIFY_DONE;
8515 } 8513 }
8516 8514
8517 static struct notifier_block md_notifier = { 8515 static struct notifier_block md_notifier = {
8518 .notifier_call = md_notify_reboot, 8516 .notifier_call = md_notify_reboot,
8519 .next = NULL, 8517 .next = NULL,
8520 .priority = INT_MAX, /* before any real devices */ 8518 .priority = INT_MAX, /* before any real devices */
8521 }; 8519 };
8522 8520
8523 static void md_geninit(void) 8521 static void md_geninit(void)
8524 { 8522 {
8525 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8523 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8526 8524
8527 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8525 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8528 } 8526 }
8529 8527
8530 static int __init md_init(void) 8528 static int __init md_init(void)
8531 { 8529 {
8532 int ret = -ENOMEM; 8530 int ret = -ENOMEM;
8533 8531
8534 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8532 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8535 if (!md_wq) 8533 if (!md_wq)
8536 goto err_wq; 8534 goto err_wq;
8537 8535
8538 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8536 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8539 if (!md_misc_wq) 8537 if (!md_misc_wq)
8540 goto err_misc_wq; 8538 goto err_misc_wq;
8541 8539
8542 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8540 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8543 goto err_md; 8541 goto err_md;
8544 8542
8545 if ((ret = register_blkdev(0, "mdp")) < 0) 8543 if ((ret = register_blkdev(0, "mdp")) < 0)
8546 goto err_mdp; 8544 goto err_mdp;
8547 mdp_major = ret; 8545 mdp_major = ret;
8548 8546
8549 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 8547 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
8550 md_probe, NULL, NULL); 8548 md_probe, NULL, NULL);
8551 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8549 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8552 md_probe, NULL, NULL); 8550 md_probe, NULL, NULL);
8553 8551
8554 register_reboot_notifier(&md_notifier); 8552 register_reboot_notifier(&md_notifier);
8555 raid_table_header = register_sysctl_table(raid_root_table); 8553 raid_table_header = register_sysctl_table(raid_root_table);
8556 8554
8557 md_geninit(); 8555 md_geninit();
8558 return 0; 8556 return 0;
8559 8557
8560 err_mdp: 8558 err_mdp:
8561 unregister_blkdev(MD_MAJOR, "md"); 8559 unregister_blkdev(MD_MAJOR, "md");
8562 err_md: 8560 err_md:
8563 destroy_workqueue(md_misc_wq); 8561 destroy_workqueue(md_misc_wq);
8564 err_misc_wq: 8562 err_misc_wq:
8565 destroy_workqueue(md_wq); 8563 destroy_workqueue(md_wq);
8566 err_wq: 8564 err_wq:
8567 return ret; 8565 return ret;
8568 } 8566 }
8569 8567
8570 #ifndef MODULE 8568 #ifndef MODULE
8571 8569
8572 /* 8570 /*
8573 * Searches all registered partitions for autorun RAID arrays 8571 * Searches all registered partitions for autorun RAID arrays
8574 * at boot time. 8572 * at boot time.
8575 */ 8573 */
8576 8574
8577 static LIST_HEAD(all_detected_devices); 8575 static LIST_HEAD(all_detected_devices);
8578 struct detected_devices_node { 8576 struct detected_devices_node {
8579 struct list_head list; 8577 struct list_head list;
8580 dev_t dev; 8578 dev_t dev;
8581 }; 8579 };
8582 8580
8583 void md_autodetect_dev(dev_t dev) 8581 void md_autodetect_dev(dev_t dev)
8584 { 8582 {
8585 struct detected_devices_node *node_detected_dev; 8583 struct detected_devices_node *node_detected_dev;
8586 8584
8587 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 8585 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8588 if (node_detected_dev) { 8586 if (node_detected_dev) {
8589 node_detected_dev->dev = dev; 8587 node_detected_dev->dev = dev;
8590 list_add_tail(&node_detected_dev->list, &all_detected_devices); 8588 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8591 } else { 8589 } else {
8592 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 8590 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8593 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 8591 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8594 } 8592 }
8595 } 8593 }
8596 8594
8597 8595
8598 static void autostart_arrays(int part) 8596 static void autostart_arrays(int part)
8599 { 8597 {
8600 struct md_rdev *rdev; 8598 struct md_rdev *rdev;
8601 struct detected_devices_node *node_detected_dev; 8599 struct detected_devices_node *node_detected_dev;
8602 dev_t dev; 8600 dev_t dev;
8603 int i_scanned, i_passed; 8601 int i_scanned, i_passed;
8604 8602
8605 i_scanned = 0; 8603 i_scanned = 0;
8606 i_passed = 0; 8604 i_passed = 0;
8607 8605
8608 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 8606 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8609 8607
8610 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 8608 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8611 i_scanned++; 8609 i_scanned++;
8612 node_detected_dev = list_entry(all_detected_devices.next, 8610 node_detected_dev = list_entry(all_detected_devices.next,
8613 struct detected_devices_node, list); 8611 struct detected_devices_node, list);
8614 list_del(&node_detected_dev->list); 8612 list_del(&node_detected_dev->list);
8615 dev = node_detected_dev->dev; 8613 dev = node_detected_dev->dev;
8616 kfree(node_detected_dev); 8614 kfree(node_detected_dev);
8617 rdev = md_import_device(dev,0, 90); 8615 rdev = md_import_device(dev,0, 90);
8618 if (IS_ERR(rdev)) 8616 if (IS_ERR(rdev))
8619 continue; 8617 continue;
8620 8618
8621 if (test_bit(Faulty, &rdev->flags)) { 8619 if (test_bit(Faulty, &rdev->flags)) {
8622 MD_BUG(); 8620 MD_BUG();
8623 continue; 8621 continue;
8624 } 8622 }
8625 set_bit(AutoDetected, &rdev->flags); 8623 set_bit(AutoDetected, &rdev->flags);
8626 list_add(&rdev->same_set, &pending_raid_disks); 8624 list_add(&rdev->same_set, &pending_raid_disks);
8627 i_passed++; 8625 i_passed++;
8628 } 8626 }
8629 8627
8630 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 8628 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8631 i_scanned, i_passed); 8629 i_scanned, i_passed);
8632 8630
8633 autorun_devices(part); 8631 autorun_devices(part);
8634 } 8632 }
8635 8633
8636 #endif /* !MODULE */ 8634 #endif /* !MODULE */
8637 8635
8638 static __exit void md_exit(void) 8636 static __exit void md_exit(void)
8639 { 8637 {
8640 struct mddev *mddev; 8638 struct mddev *mddev;
8641 struct list_head *tmp; 8639 struct list_head *tmp;
8642 8640
8643 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 8641 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
8644 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 8642 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8645 8643
8646 unregister_blkdev(MD_MAJOR,"md"); 8644 unregister_blkdev(MD_MAJOR,"md");
8647 unregister_blkdev(mdp_major, "mdp"); 8645 unregister_blkdev(mdp_major, "mdp");
8648 unregister_reboot_notifier(&md_notifier); 8646 unregister_reboot_notifier(&md_notifier);
8649 unregister_sysctl_table(raid_table_header); 8647 unregister_sysctl_table(raid_table_header);
8650 remove_proc_entry("mdstat", NULL); 8648 remove_proc_entry("mdstat", NULL);
8651 for_each_mddev(mddev, tmp) { 8649 for_each_mddev(mddev, tmp) {
1 /* 1 /*
2 md.h : kernel internal structure of the Linux MD driver 2 md.h : kernel internal structure of the Linux MD driver
3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman 3 Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
4 4
5 This program is free software; you can redistribute it and/or modify 5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by 6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 2, or (at your option) 7 the Free Software Foundation; either version 2, or (at your option)
8 any later version. 8 any later version.
9 9
10 You should have received a copy of the GNU General Public License 10 You should have received a copy of the GNU General Public License
11 (for example /usr/src/linux/COPYING); if not, write to the Free 11 (for example /usr/src/linux/COPYING); if not, write to the Free
12 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 12 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
13 */ 13 */
14 14
15 #ifndef _MD_MD_H 15 #ifndef _MD_MD_H
16 #define _MD_MD_H 16 #define _MD_MD_H
17 17
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/kobject.h> 19 #include <linux/kobject.h>
20 #include <linux/list.h> 20 #include <linux/list.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/mutex.h> 22 #include <linux/mutex.h>
23 #include <linux/timer.h> 23 #include <linux/timer.h>
24 #include <linux/wait.h> 24 #include <linux/wait.h>
25 #include <linux/workqueue.h> 25 #include <linux/workqueue.h>
26 26
27 #define MaxSector (~(sector_t)0) 27 #define MaxSector (~(sector_t)0)
28 28
29 /* Bad block numbers are stored sorted in a single page. 29 /* Bad block numbers are stored sorted in a single page.
30 * 64bits is used for each block or extent. 30 * 64bits is used for each block or extent.
31 * 54 bits are sector number, 9 bits are extent size, 31 * 54 bits are sector number, 9 bits are extent size,
32 * 1 bit is an 'acknowledged' flag. 32 * 1 bit is an 'acknowledged' flag.
33 */ 33 */
34 #define MD_MAX_BADBLOCKS (PAGE_SIZE/8) 34 #define MD_MAX_BADBLOCKS (PAGE_SIZE/8)
35 35
36 /* 36 /*
37 * MD's 'extended' device 37 * MD's 'extended' device
38 */ 38 */
39 struct md_rdev { 39 struct md_rdev {
40 struct list_head same_set; /* RAID devices within the same set */ 40 struct list_head same_set; /* RAID devices within the same set */
41 41
42 sector_t sectors; /* Device size (in 512bytes sectors) */ 42 sector_t sectors; /* Device size (in 512bytes sectors) */
43 struct mddev *mddev; /* RAID array if running */ 43 struct mddev *mddev; /* RAID array if running */
44 int last_events; /* IO event timestamp */ 44 int last_events; /* IO event timestamp */
45 45
46 /* 46 /*
47 * If meta_bdev is non-NULL, it means that a separate device is 47 * If meta_bdev is non-NULL, it means that a separate device is
48 * being used to store the metadata (superblock/bitmap) which 48 * being used to store the metadata (superblock/bitmap) which
49 * would otherwise be contained on the same device as the data (bdev). 49 * would otherwise be contained on the same device as the data (bdev).
50 */ 50 */
51 struct block_device *meta_bdev; 51 struct block_device *meta_bdev;
52 struct block_device *bdev; /* block device handle */ 52 struct block_device *bdev; /* block device handle */
53 53
54 struct page *sb_page, *bb_page; 54 struct page *sb_page, *bb_page;
55 int sb_loaded; 55 int sb_loaded;
56 __u64 sb_events; 56 __u64 sb_events;
57 sector_t data_offset; /* start of data in array */ 57 sector_t data_offset; /* start of data in array */
58 sector_t new_data_offset;/* only relevant while reshaping */ 58 sector_t new_data_offset;/* only relevant while reshaping */
59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */ 59 sector_t sb_start; /* offset of the super block (in 512byte sectors) */
60 int sb_size; /* bytes in the superblock */ 60 int sb_size; /* bytes in the superblock */
61 int preferred_minor; /* autorun support */ 61 int preferred_minor; /* autorun support */
62 62
63 struct kobject kobj; 63 struct kobject kobj;
64 64
65 /* A device can be in one of three states based on two flags: 65 /* A device can be in one of three states based on two flags:
66 * Not working: faulty==1 in_sync==0 66 * Not working: faulty==1 in_sync==0
67 * Fully working: faulty==0 in_sync==1 67 * Fully working: faulty==0 in_sync==1
68 * Working, but not 68 * Working, but not
69 * in sync with array 69 * in sync with array
70 * faulty==0 in_sync==0 70 * faulty==0 in_sync==0
71 * 71 *
72 * It can never have faulty==1, in_sync==1 72 * It can never have faulty==1, in_sync==1
73 * This reduces the burden of testing multiple flags in many cases 73 * This reduces the burden of testing multiple flags in many cases
74 */ 74 */
75 75
76 unsigned long flags; /* bit set of 'enum flag_bits' bits. */ 76 unsigned long flags; /* bit set of 'enum flag_bits' bits. */
77 wait_queue_head_t blocked_wait; 77 wait_queue_head_t blocked_wait;
78 78
79 int desc_nr; /* descriptor index in the superblock */ 79 int desc_nr; /* descriptor index in the superblock */
80 int raid_disk; /* role of device in array */ 80 int raid_disk; /* role of device in array */
81 int new_raid_disk; /* role that the device will have in 81 int new_raid_disk; /* role that the device will have in
82 * the array after a level-change completes. 82 * the array after a level-change completes.
83 */ 83 */
84 int saved_raid_disk; /* role that device used to have in the 84 int saved_raid_disk; /* role that device used to have in the
85 * array and could again if we did a partial 85 * array and could again if we did a partial
86 * resync from the bitmap 86 * resync from the bitmap
87 */ 87 */
88 sector_t recovery_offset;/* If this device has been partially 88 sector_t recovery_offset;/* If this device has been partially
89 * recovered, this is where we were 89 * recovered, this is where we were
90 * up to. 90 * up to.
91 */ 91 */
92 92
93 atomic_t nr_pending; /* number of pending requests. 93 atomic_t nr_pending; /* number of pending requests.
94 * only maintained for arrays that 94 * only maintained for arrays that
95 * support hot removal 95 * support hot removal
96 */ 96 */
97 atomic_t read_errors; /* number of consecutive read errors that 97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore. 98 * we have tried to ignore.
99 */ 99 */
100 struct timespec last_read_error; /* monotonic time since our 100 struct timespec last_read_error; /* monotonic time since our
101 * last read error 101 * last read error
102 */ 102 */
103 atomic_t corrected_errors; /* number of corrected read errors, 103 atomic_t corrected_errors; /* number of corrected read errors,
104 * for reporting to userspace and storing 104 * for reporting to userspace and storing
105 * in superblock. 105 * in superblock.
106 */ 106 */
107 struct work_struct del_work; /* used for delayed sysfs removal */ 107 struct work_struct del_work; /* used for delayed sysfs removal */
108 108
109 struct sysfs_dirent *sysfs_state; /* handle for 'state' 109 struct sysfs_dirent *sysfs_state; /* handle for 'state'
110 * sysfs entry */ 110 * sysfs entry */
111 111
112 struct badblocks { 112 struct badblocks {
113 int count; /* count of bad blocks */ 113 int count; /* count of bad blocks */
114 int unacked_exist; /* there probably are unacknowledged 114 int unacked_exist; /* there probably are unacknowledged
115 * bad blocks. This is only cleared 115 * bad blocks. This is only cleared
116 * when a read discovers none 116 * when a read discovers none
117 */ 117 */
118 int shift; /* shift from sectors to block size 118 int shift; /* shift from sectors to block size
119 * a -ve shift means badblocks are 119 * a -ve shift means badblocks are
120 * disabled.*/ 120 * disabled.*/
121 u64 *page; /* badblock list */ 121 u64 *page; /* badblock list */
122 int changed; 122 int changed;
123 seqlock_t lock; 123 seqlock_t lock;
124 124
125 sector_t sector; 125 sector_t sector;
126 sector_t size; /* in sectors */ 126 sector_t size; /* in sectors */
127 } badblocks; 127 } badblocks;
128 }; 128 };
129 enum flag_bits { 129 enum flag_bits {
130 Faulty, /* device is known to have a fault */ 130 Faulty, /* device is known to have a fault */
131 In_sync, /* device is in_sync with rest of array */ 131 In_sync, /* device is in_sync with rest of array */
132 Unmerged, /* device is being added to array and should 132 Unmerged, /* device is being added to array and should
133 * be considerred for bvec_merge_fn but not 133 * be considerred for bvec_merge_fn but not
134 * yet for actual IO 134 * yet for actual IO
135 */ 135 */
136 WriteMostly, /* Avoid reading if at all possible */ 136 WriteMostly, /* Avoid reading if at all possible */
137 AutoDetected, /* added by auto-detect */ 137 AutoDetected, /* added by auto-detect */
138 Blocked, /* An error occurred but has not yet 138 Blocked, /* An error occurred but has not yet
139 * been acknowledged by the metadata 139 * been acknowledged by the metadata
140 * handler, so don't allow writes 140 * handler, so don't allow writes
141 * until it is cleared */ 141 * until it is cleared */
142 WriteErrorSeen, /* A write error has been seen on this 142 WriteErrorSeen, /* A write error has been seen on this
143 * device 143 * device
144 */ 144 */
145 FaultRecorded, /* Intermediate state for clearing 145 FaultRecorded, /* Intermediate state for clearing
146 * Blocked. The Fault is/will-be 146 * Blocked. The Fault is/will-be
147 * recorded in the metadata, but that 147 * recorded in the metadata, but that
148 * metadata hasn't been stored safely 148 * metadata hasn't been stored safely
149 * on disk yet. 149 * on disk yet.
150 */ 150 */
151 BlockedBadBlocks, /* A writer is blocked because they 151 BlockedBadBlocks, /* A writer is blocked because they
152 * found an unacknowledged bad-block. 152 * found an unacknowledged bad-block.
153 * This can safely be cleared at any 153 * This can safely be cleared at any
154 * time, and the writer will re-check. 154 * time, and the writer will re-check.
155 * It may be set at any time, and at 155 * It may be set at any time, and at
156 * worst the writer will timeout and 156 * worst the writer will timeout and
157 * re-check. So setting it as 157 * re-check. So setting it as
158 * accurately as possible is good, but 158 * accurately as possible is good, but
159 * not absolutely critical. 159 * not absolutely critical.
160 */ 160 */
161 WantReplacement, /* This device is a candidate to be 161 WantReplacement, /* This device is a candidate to be
162 * hot-replaced, either because it has 162 * hot-replaced, either because it has
163 * reported some faults, or because 163 * reported some faults, or because
164 * of explicit request. 164 * of explicit request.
165 */ 165 */
166 Replacement, /* This device is a replacement for 166 Replacement, /* This device is a replacement for
167 * a want_replacement device with same 167 * a want_replacement device with same
168 * raid_disk number. 168 * raid_disk number.
169 */ 169 */
170 }; 170 };
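The comment in struct md_rdev above reduces a member device's health to two bits, Faulty and In_sync, with the faulty==1/in_sync==1 combination never occurring. A hedged kernel-style sketch of that three-state rule, using the flag bits just defined (sk_rdev_state() is illustrative and not part of md.h):

/* classify an rdev per the three-state comment in struct md_rdev */
static const char *sk_rdev_state(struct md_rdev *rdev)
{
	if (test_bit(Faulty, &rdev->flags))
		return "not working";		/* faulty==1, in_sync==0 */
	if (test_bit(In_sync, &rdev->flags))
		return "fully working";		/* faulty==0, in_sync==1 */
	return "working, not yet in sync";	/* faulty==0, in_sync==0 */
}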
171 171
172 #define BB_LEN_MASK (0x00000000000001FFULL) 172 #define BB_LEN_MASK (0x00000000000001FFULL)
173 #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL) 173 #define BB_OFFSET_MASK (0x7FFFFFFFFFFFFE00ULL)
174 #define BB_ACK_MASK (0x8000000000000000ULL) 174 #define BB_ACK_MASK (0x8000000000000000ULL)
175 #define BB_MAX_LEN 512 175 #define BB_MAX_LEN 512
176 #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9) 176 #define BB_OFFSET(x) (((x) & BB_OFFSET_MASK) >> 9)
177 #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1) 177 #define BB_LEN(x) (((x) & BB_LEN_MASK) + 1)
178 #define BB_ACK(x) (!!((x) & BB_ACK_MASK)) 178 #define BB_ACK(x) (!!((x) & BB_ACK_MASK))
179 #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63)) 179 #define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
180 180
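The BB_* macros above pack one bad-block record into 64 bits: the start sector in bits 9..62, the extent length minus one in bits 0..8, and the 'acknowledged' flag in bit 63. A stand-alone sketch of the same arithmetic, compiled outside the kernel purely to show a worked example (the SK_* names are hypothetical copies of the masks above):

#include <stdio.h>
#include <stdint.h>

#define SK_BB_LEN_MASK    0x00000000000001FFULL
#define SK_BB_OFFSET_MASK 0x7FFFFFFFFFFFFE00ULL
#define SK_BB_ACK_MASK    0x8000000000000000ULL

int main(void)
{
	/* encode: a 512-sector bad extent starting at sector 123456, acked */
	uint64_t e = ((uint64_t)123456 << 9) | (uint64_t)(512 - 1) |
		     ((uint64_t)1 << 63);

	/* decode with the same masks BB_OFFSET()/BB_LEN()/BB_ACK() use */
	printf("offset=%llu len=%llu ack=%d\n",
	       (unsigned long long)((e & SK_BB_OFFSET_MASK) >> 9),
	       (unsigned long long)((e & SK_BB_LEN_MASK) + 1),
	       (int)!!(e & SK_BB_ACK_MASK));
	/* prints: offset=123456 len=512 ack=1 */
	return 0;
}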
181 extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 181 extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
182 sector_t *first_bad, int *bad_sectors); 182 sector_t *first_bad, int *bad_sectors);
183 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors, 183 static inline int is_badblock(struct md_rdev *rdev, sector_t s, int sectors,
184 sector_t *first_bad, int *bad_sectors) 184 sector_t *first_bad, int *bad_sectors)
185 { 185 {
186 if (unlikely(rdev->badblocks.count)) { 186 if (unlikely(rdev->badblocks.count)) {
187 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s, 187 int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
188 sectors, 188 sectors,
189 first_bad, bad_sectors); 189 first_bad, bad_sectors);
190 if (rv) 190 if (rv)
191 *first_bad -= rdev->data_offset; 191 *first_bad -= rdev->data_offset;
192 return rv; 192 return rv;
193 } 193 }
194 return 0; 194 return 0;
195 } 195 }
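is_badblock() reports, through first_bad and bad_sectors, where a requested range first overlaps a recorded bad block. A hedged sketch of the typical caller pattern of trimming a read so it stops short of the bad range (sk_trim_read() is hypothetical, loosely modelled on how RAID personalities use the return values):

static int sk_trim_read(struct md_rdev *rdev, sector_t sector, int *sectors)
{
	sector_t first_bad;
	int bad_sectors;

	if (!is_badblock(rdev, sector, *sectors, &first_bad, &bad_sectors))
		return 0;		/* whole range is clean */
	if (first_bad <= sector)
		return -EIO;		/* starts on a bad block: try another device */
	*sectors = first_bad - sector;	/* read only up to the bad block */
	return 1;
}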
196 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 196 extern int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
197 int is_new); 197 int is_new);
198 extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 198 extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
199 int is_new); 199 int is_new);
200 extern void md_ack_all_badblocks(struct badblocks *bb); 200 extern void md_ack_all_badblocks(struct badblocks *bb);
201 201
202 struct mddev { 202 struct mddev {
203 void *private; 203 void *private;
204 struct md_personality *pers; 204 struct md_personality *pers;
205 dev_t unit; 205 dev_t unit;
206 int md_minor; 206 int md_minor;
207 struct list_head disks; 207 struct list_head disks;
208 unsigned long flags; 208 unsigned long flags;
209 #define MD_CHANGE_DEVS 0 /* Some device status has changed */ 209 #define MD_CHANGE_DEVS 0 /* Some device status has changed */
210 #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */ 210 #define MD_CHANGE_CLEAN 1 /* transition to or from 'clean' */
211 #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */ 211 #define MD_CHANGE_PENDING 2 /* switch from 'clean' to 'active' in progress */
212 #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */ 212 #define MD_ARRAY_FIRST_USE 3 /* First use of array, needs initialization */
213 213
214 int suspended; 214 int suspended;
215 atomic_t active_io; 215 atomic_t active_io;
216 int ro; 216 int ro;
217 int sysfs_active; /* set when sysfs deletes 217 int sysfs_active; /* set when sysfs deletes
218 * are happening, so run/ 218 * are happening, so run/
219 * takeover/stop are not safe 219 * takeover/stop are not safe
220 */ 220 */
221 int ready; /* See when safe to pass 221 int ready; /* See when safe to pass
222 * IO requests down */ 222 * IO requests down */
223 struct gendisk *gendisk; 223 struct gendisk *gendisk;
224 224
225 struct kobject kobj; 225 struct kobject kobj;
226 int hold_active; 226 int hold_active;
227 #define UNTIL_IOCTL 1 227 #define UNTIL_IOCTL 1
228 #define UNTIL_STOP 2 228 #define UNTIL_STOP 2
229 229
230 /* Superblock information */ 230 /* Superblock information */
231 int major_version, 231 int major_version,
232 minor_version, 232 minor_version,
233 patch_version; 233 patch_version;
234 int persistent; 234 int persistent;
235 int external; /* metadata is 235 int external; /* metadata is
236 * managed externally */ 236 * managed externally */
237 char metadata_type[17]; /* externally set*/ 237 char metadata_type[17]; /* externally set*/
238 int chunk_sectors; 238 int chunk_sectors;
239 time_t ctime, utime; 239 time_t ctime, utime;
240 int level, layout; 240 int level, layout;
241 char clevel[16]; 241 char clevel[16];
242 int raid_disks; 242 int raid_disks;
243 int max_disks; 243 int max_disks;
244 sector_t dev_sectors; /* used size of 244 sector_t dev_sectors; /* used size of
245 * component devices */ 245 * component devices */
246 sector_t array_sectors; /* exported array size */ 246 sector_t array_sectors; /* exported array size */
247 int external_size; /* size managed 247 int external_size; /* size managed
248 * externally */ 248 * externally */
249 __u64 events; 249 __u64 events;
250 /* If the last 'event' was simply a clean->dirty transition, and 250 /* If the last 'event' was simply a clean->dirty transition, and
251 * we didn't write it to the spares, then it is safe and simple 251 * we didn't write it to the spares, then it is safe and simple
252 * to just decrement the event count on a dirty->clean transition. 252 * to just decrement the event count on a dirty->clean transition.
253 * So we record that possibility here. 253 * So we record that possibility here.
254 */ 254 */
255 int can_decrease_events; 255 int can_decrease_events;
256 256
257 char uuid[16]; 257 char uuid[16];
258 258
259 /* If the array is being reshaped, we need to record the 259 /* If the array is being reshaped, we need to record the
260 * new shape and an indication of where we are up to. 260 * new shape and an indication of where we are up to.
261 * This is written to the superblock. 261 * This is written to the superblock.
262 * If reshape_position is MaxSector, then no reshape is happening (yet). 262 * If reshape_position is MaxSector, then no reshape is happening (yet).
263 */ 263 */
264 sector_t reshape_position; 264 sector_t reshape_position;
265 int delta_disks, new_level, new_layout; 265 int delta_disks, new_level, new_layout;
266 int new_chunk_sectors; 266 int new_chunk_sectors;
267 int reshape_backwards; 267 int reshape_backwards;
268 268
269 struct md_thread *thread; /* management thread */ 269 struct md_thread *thread; /* management thread */
270 struct md_thread *sync_thread; /* doing resync or reconstruct */ 270 struct md_thread *sync_thread; /* doing resync or reconstruct */
271 sector_t curr_resync; /* last block scheduled */ 271 sector_t curr_resync; /* last block scheduled */
272 /* As resync requests can complete out of order, we cannot easily track 272 /* As resync requests can complete out of order, we cannot easily track
273 * how much resync has been completed. So we occasionally pause until 273 * how much resync has been completed. So we occasionally pause until
274 * everything completes, then set curr_resync_completed to curr_resync. 274 * everything completes, then set curr_resync_completed to curr_resync.
275 * As such it may be well behind the real resync mark, but it is a value 275 * As such it may be well behind the real resync mark, but it is a value
276 * we are certain of. 276 * we are certain of.
277 */ 277 */
278 sector_t curr_resync_completed; 278 sector_t curr_resync_completed;
279 unsigned long resync_mark; /* a recent timestamp */ 279 unsigned long resync_mark; /* a recent timestamp */
280 sector_t resync_mark_cnt;/* blocks written at resync_mark */ 280 sector_t resync_mark_cnt;/* blocks written at resync_mark */
281 sector_t curr_mark_cnt; /* blocks scheduled now */ 281 sector_t curr_mark_cnt; /* blocks scheduled now */
282 282
283 sector_t resync_max_sectors; /* may be set by personality */ 283 sector_t resync_max_sectors; /* may be set by personality */
284 284
285 atomic64_t resync_mismatches; /* count of sectors where 285 atomic64_t resync_mismatches; /* count of sectors where
286 * parity/replica mismatch found 286 * parity/replica mismatch found
287 */ 287 */
288 288
289 /* allow user-space to request suspension of IO to regions of the array */ 289 /* allow user-space to request suspension of IO to regions of the array */
290 sector_t suspend_lo; 290 sector_t suspend_lo;
291 sector_t suspend_hi; 291 sector_t suspend_hi;
292 /* if zero, use the system-wide default */ 292 /* if zero, use the system-wide default */
293 int sync_speed_min; 293 int sync_speed_min;
294 int sync_speed_max; 294 int sync_speed_max;
295 295
296 /* resync even though the same disks are shared among md-devices */ 296 /* resync even though the same disks are shared among md-devices */
297 int parallel_resync; 297 int parallel_resync;
298 298
299 int ok_start_degraded; 299 int ok_start_degraded;
300 /* recovery/resync flags 300 /* recovery/resync flags
301 * NEEDED: we might need to start a resync/recover 301 * NEEDED: we might need to start a resync/recover
302 * RUNNING: a thread is running, or about to be started 302 * RUNNING: a thread is running, or about to be started
303 * SYNC: actually doing a resync, not a recovery 303 * SYNC: actually doing a resync, not a recovery
304 * RECOVER: doing recovery, or need to try it. 304 * RECOVER: doing recovery, or need to try it.
305 * INTR: resync needs to be aborted for some reason 305 * INTR: resync needs to be aborted for some reason
306 * DONE: thread is done and is waiting to be reaped 306 * DONE: thread is done and is waiting to be reaped
307 * REQUEST: user-space has requested a sync (used with SYNC) 307 * REQUEST: user-space has requested a sync (used with SYNC)
308 * CHECK: user-space request for check-only, no repair 308 * CHECK: user-space request for check-only, no repair
309 * RESHAPE: A reshape is happening 309 * RESHAPE: A reshape is happening
310 * ERROR: sync-action interrupted because io-error 310 * ERROR: sync-action interrupted because io-error
311 * 311 *
312 * If neither SYNC or RESHAPE are set, then it is a recovery. 312 * If neither SYNC or RESHAPE are set, then it is a recovery.
313 */ 313 */
314 #define MD_RECOVERY_RUNNING 0 314 #define MD_RECOVERY_RUNNING 0
315 #define MD_RECOVERY_SYNC 1 315 #define MD_RECOVERY_SYNC 1
316 #define MD_RECOVERY_RECOVER 2 316 #define MD_RECOVERY_RECOVER 2
317 #define MD_RECOVERY_INTR 3 317 #define MD_RECOVERY_INTR 3
318 #define MD_RECOVERY_DONE 4 318 #define MD_RECOVERY_DONE 4
319 #define MD_RECOVERY_NEEDED 5 319 #define MD_RECOVERY_NEEDED 5
320 #define MD_RECOVERY_REQUESTED 6 320 #define MD_RECOVERY_REQUESTED 6
321 #define MD_RECOVERY_CHECK 7 321 #define MD_RECOVERY_CHECK 7
322 #define MD_RECOVERY_RESHAPE 8 322 #define MD_RECOVERY_RESHAPE 8
323 #define MD_RECOVERY_FROZEN 9 323 #define MD_RECOVERY_FROZEN 9
324 #define MD_RECOVERY_ERROR 10 324 #define MD_RECOVERY_ERROR 10
325 325
326 unsigned long recovery; 326 unsigned long recovery;
327 /* If a RAID personality determines that recovery (of a particular 327 /* If a RAID personality determines that recovery (of a particular
328 * device) will fail due to a read error on the source device, it 328 * device) will fail due to a read error on the source device, it
329 * takes a copy of this number and does not attempt recovery again 329 * takes a copy of this number and does not attempt recovery again
330 * until this number changes. 330 * until this number changes.
331 */ 331 */
332 int recovery_disabled; 332 int recovery_disabled;
333 333
334 int in_sync; /* know to not need resync */ 334 int in_sync; /* know to not need resync */
335 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so 335 /* 'open_mutex' avoids races between 'md_open' and 'do_md_stop', so
336 * that we are never stopping an array while it is open. 336 * that we are never stopping an array while it is open.
337 * 'reconfig_mutex' protects all other reconfiguration. 337 * 'reconfig_mutex' protects all other reconfiguration.
338 * These locks are separate due to conflicting interactions 338 * These locks are separate due to conflicting interactions
339 * with bdev->bd_mutex. 339 * with bdev->bd_mutex.
340 * Lock ordering is: 340 * Lock ordering is:
341 * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk 341 * reconfig_mutex -> bd_mutex : e.g. do_md_run -> revalidate_disk
342 * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open 342 * bd_mutex -> open_mutex: e.g. __blkdev_get -> md_open
343 */ 343 */
344 struct mutex open_mutex; 344 struct mutex open_mutex;
345 struct mutex reconfig_mutex; 345 struct mutex reconfig_mutex;
346 atomic_t active; /* general refcount */ 346 atomic_t active; /* general refcount */
347 atomic_t openers; /* number of active opens */ 347 atomic_t openers; /* number of active opens */
348 348
349 int changed; /* True if we might need to 349 int changed; /* True if we might need to
350 * reread partition info */ 350 * reread partition info */
351 int degraded; /* whether md should consider 351 int degraded; /* whether md should consider
352 * adding a spare 352 * adding a spare
353 */ 353 */
354 int merge_check_needed; /* at least one 354 int merge_check_needed; /* at least one
355 * member device 355 * member device
356 * has a 356 * has a
357 * merge_bvec_fn */ 357 * merge_bvec_fn */
358 358
359 atomic_t recovery_active; /* blocks scheduled, but not written */ 359 atomic_t recovery_active; /* blocks scheduled, but not written */
360 wait_queue_head_t recovery_wait; 360 wait_queue_head_t recovery_wait;
361 sector_t recovery_cp; 361 sector_t recovery_cp;
362 sector_t resync_min; /* user requested sync 362 sector_t resync_min; /* user requested sync
363 * starts here */ 363 * starts here */
364 sector_t resync_max; /* resync should pause 364 sector_t resync_max; /* resync should pause
365 * when it gets here */ 365 * when it gets here */
366 366
367 struct sysfs_dirent *sysfs_state; /* handle for 'array_state' 367 struct sysfs_dirent *sysfs_state; /* handle for 'array_state'
368 * file in sysfs. 368 * file in sysfs.
369 */ 369 */
370 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */ 370 struct sysfs_dirent *sysfs_action; /* handle for 'sync_action' */
371 371
372 struct work_struct del_work; /* used for delayed sysfs removal */ 372 struct work_struct del_work; /* used for delayed sysfs removal */
373 373
374 spinlock_t write_lock; 374 spinlock_t write_lock;
375 wait_queue_head_t sb_wait; /* for waiting on superblock updates */ 375 wait_queue_head_t sb_wait; /* for waiting on superblock updates */
376 atomic_t pending_writes; /* number of active superblock writes */ 376 atomic_t pending_writes; /* number of active superblock writes */
377 377
378 unsigned int safemode; /* if set, update "clean" superblock 378 unsigned int safemode; /* if set, update "clean" superblock
379 * when no writes pending. 379 * when no writes pending.
380 */ 380 */
381 unsigned int safemode_delay; 381 unsigned int safemode_delay;
382 struct timer_list safemode_timer; 382 struct timer_list safemode_timer;
383 atomic_t writes_pending; 383 atomic_t writes_pending;
384 struct request_queue *queue; /* for plugging ... */ 384 struct request_queue *queue; /* for plugging ... */
385 385
386 struct bitmap *bitmap; /* the bitmap for the device */ 386 struct bitmap *bitmap; /* the bitmap for the device */
387 struct { 387 struct {
388 struct file *file; /* the bitmap file */ 388 struct file *file; /* the bitmap file */
389 loff_t offset; /* offset from superblock of 389 loff_t offset; /* offset from superblock of
390 * start of bitmap. May be 390 * start of bitmap. May be
391 * negative, but not '0' 391 * negative, but not '0'
392 * For external metadata, offset 392 * For external metadata, offset
393 * from start of device. 393 * from start of device.
394 */ 394 */
395 unsigned long space; /* space available at this offset */ 395 unsigned long space; /* space available at this offset */
396 loff_t default_offset; /* this is the offset to use when 396 loff_t default_offset; /* this is the offset to use when
397 * hot-adding a bitmap. It should 397 * hot-adding a bitmap. It should
398 * eventually be settable by sysfs. 398 * eventually be settable by sysfs.
399 */ 399 */
400 unsigned long default_space; /* space available at 400 unsigned long default_space; /* space available at
401 * default offset */ 401 * default offset */
402 struct mutex mutex; 402 struct mutex mutex;
403 unsigned long chunksize; 403 unsigned long chunksize;
404 unsigned long daemon_sleep; /* how many jiffies between updates? */ 404 unsigned long daemon_sleep; /* how many jiffies between updates? */
405 unsigned long max_write_behind; /* write-behind mode */ 405 unsigned long max_write_behind; /* write-behind mode */
406 int external; 406 int external;
407 } bitmap_info; 407 } bitmap_info;
408 408
409 atomic_t max_corr_read_errors; /* max read retries */ 409 atomic_t max_corr_read_errors; /* max read retries */
410 struct list_head all_mddevs; 410 struct list_head all_mddevs;
411 411
412 struct attribute_group *to_remove; 412 struct attribute_group *to_remove;
413 413
414 struct bio_set *bio_set; 414 struct bio_set *bio_set;
415 415
416 /* Generic flush handling. 416 /* Generic flush handling.
417 * The last to finish preflush schedules a worker to submit 417 * The last to finish preflush schedules a worker to submit
418 * the rest of the request (without the REQ_FLUSH flag). 418 * the rest of the request (without the REQ_FLUSH flag).
419 */ 419 */
420 struct bio *flush_bio; 420 struct bio *flush_bio;
421 atomic_t flush_pending; 421 atomic_t flush_pending;
422 struct work_struct flush_work; 422 struct work_struct flush_work;
423 struct work_struct event_work; /* used by dm to report failure event */ 423 struct work_struct event_work; /* used by dm to report failure event */
424 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev); 424 void (*sync_super)(struct mddev *mddev, struct md_rdev *rdev);
425 }; 425 };
426 426
427 427
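The MD_RECOVERY_* bits above combine as the comment on mddev->recovery describes: a running sync thread with neither SYNC nor RESHAPE set is doing a recovery. A hedged helper that decodes the common combinations (simplified relative to the full reporting logic in md.c; sk_sync_action() is illustrative):

static const char *sk_sync_action(struct mddev *mddev)
{
	if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
		return "idle";
	if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
		return "reshape";
	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
		return test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
			? "check" : "resync";
	return "recover";
}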
428 static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev) 428 static inline void rdev_dec_pending(struct md_rdev *rdev, struct mddev *mddev)
429 { 429 {
430 int faulty = test_bit(Faulty, &rdev->flags); 430 int faulty = test_bit(Faulty, &rdev->flags);
431 if (atomic_dec_and_test(&rdev->nr_pending) && faulty) 431 if (atomic_dec_and_test(&rdev->nr_pending) && faulty)
432 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 432 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
433 } 433 }
434 434
435 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors) 435 static inline void md_sync_acct(struct block_device *bdev, unsigned long nr_sectors)
436 { 436 {
437 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io); 437 atomic_add(nr_sectors, &bdev->bd_contains->bd_disk->sync_io);
438 } 438 }
439 439
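rdev_dec_pending() above drops a reference that a personality takes on nr_pending before submitting I/O to a member device; when the last request on a Faulty device drains, it sets MD_RECOVERY_NEEDED so the spare-handling logic runs. A hedged sketch of that pairing (the sk_* functions and the bi_private convention are illustrative, not taken from a specific personality):

static void sk_end_io(struct bio *bio, int error)
{
	struct md_rdev *rdev = bio->bi_private;

	rdev_dec_pending(rdev, rdev->mddev);	/* may flag recovery if Faulty */
	bio_put(bio);
}

static void sk_submit_to_rdev(struct md_rdev *rdev, struct bio *bio)
{
	atomic_inc(&rdev->nr_pending);		/* pin rdev while I/O is in flight */
	bio->bi_private = rdev;
	bio->bi_end_io = sk_end_io;
	generic_make_request(bio);
}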
440 struct md_personality 440 struct md_personality
441 { 441 {
442 char *name; 442 char *name;
443 int level; 443 int level;
444 struct list_head list; 444 struct list_head list;
445 struct module *owner; 445 struct module *owner;
446 void (*make_request)(struct mddev *mddev, struct bio *bio); 446 void (*make_request)(struct mddev *mddev, struct bio *bio);
447 int (*run)(struct mddev *mddev); 447 int (*run)(struct mddev *mddev);
448 int (*stop)(struct mddev *mddev); 448 int (*stop)(struct mddev *mddev);
449 void (*status)(struct seq_file *seq, struct mddev *mddev); 449 void (*status)(struct seq_file *seq, struct mddev *mddev);
450 /* error_handler must set ->faulty and clear ->in_sync 450 /* error_handler must set ->faulty and clear ->in_sync
451 * if appropriate, and should abort recovery if needed 451 * if appropriate, and should abort recovery if needed
452 */ 452 */
453 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev); 453 void (*error_handler)(struct mddev *mddev, struct md_rdev *rdev);
454 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev); 454 int (*hot_add_disk) (struct mddev *mddev, struct md_rdev *rdev);
455 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev); 455 int (*hot_remove_disk) (struct mddev *mddev, struct md_rdev *rdev);
456 int (*spare_active) (struct mddev *mddev); 456 int (*spare_active) (struct mddev *mddev);
457 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster); 457 sector_t (*sync_request)(struct mddev *mddev, sector_t sector_nr, int *skipped, int go_faster);
458 int (*resize) (struct mddev *mddev, sector_t sectors); 458 int (*resize) (struct mddev *mddev, sector_t sectors);
459 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks); 459 sector_t (*size) (struct mddev *mddev, sector_t sectors, int raid_disks);
460 int (*check_reshape) (struct mddev *mddev); 460 int (*check_reshape) (struct mddev *mddev);
461 int (*start_reshape) (struct mddev *mddev); 461 int (*start_reshape) (struct mddev *mddev);
462 void (*finish_reshape) (struct mddev *mddev); 462 void (*finish_reshape) (struct mddev *mddev);
463 /* quiesce moves between quiescence states 463 /* quiesce moves between quiescence states
464 * 0 - fully active 464 * 0 - fully active
465 * 1 - no new requests allowed 465 * 1 - no new requests allowed
466 * others - reserved 466 * others - reserved
467 */ 467 */
468 void (*quiesce) (struct mddev *mddev, int state); 468 void (*quiesce) (struct mddev *mddev, int state);
469 /* takeover is used to transition an array from one 469 /* takeover is used to transition an array from one
470 * personality to another. The new personality must be able 470 * personality to another. The new personality must be able
471 * to handle the data in the current layout. 471 * to handle the data in the current layout.
472 * e.g. 2drive raid1 -> 2drive raid5 472 * e.g. 2drive raid1 -> 2drive raid5
473 * ndrive raid5 -> degraded n+1drive raid6 with special layout 473 * ndrive raid5 -> degraded n+1drive raid6 with special layout
474 * If the takeover succeeds, a new 'private' structure is returned. 474 * If the takeover succeeds, a new 'private' structure is returned.
475 * This needs to be installed and then ->run used to activate the 475 * This needs to be installed and then ->run used to activate the
476 * array. 476 * array.
477 */ 477 */
478 void *(*takeover) (struct mddev *mddev); 478 void *(*takeover) (struct mddev *mddev);
479 }; 479 };
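struct md_personality above is the hook table a RAID level implements and registers with register_md_personality() (declared further down). A hedged skeleton of the minimum wiring, with hypothetical sk_* callbacks standing in for real implementations:

static void sk_make_request(struct mddev *mddev, struct bio *bio) { /* map/queue the bio */ }
static int  sk_run(struct mddev *mddev)  { return 0; }	/* set up private state */
static int  sk_stop(struct mddev *mddev) { return 0; }	/* tear down private state */
static void sk_status(struct seq_file *seq, struct mddev *mddev) { }

static struct md_personality sk_personality = {
	.name		= "sketch",
	.level		= 1,		/* the RAID level implemented, e.g. 1 */
	.owner		= THIS_MODULE,
	.make_request	= sk_make_request,
	.run		= sk_run,
	.stop		= sk_stop,
	.status		= sk_status,
};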
480 480
481 481
482 struct md_sysfs_entry { 482 struct md_sysfs_entry {
483 struct attribute attr; 483 struct attribute attr;
484 ssize_t (*show)(struct mddev *, char *); 484 ssize_t (*show)(struct mddev *, char *);
485 ssize_t (*store)(struct mddev *, const char *, size_t); 485 ssize_t (*store)(struct mddev *, const char *, size_t);
486 }; 486 };
487 extern struct attribute_group md_bitmap_group; 487 extern struct attribute_group md_bitmap_group;
488 488
489 static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name) 489 static inline struct sysfs_dirent *sysfs_get_dirent_safe(struct sysfs_dirent *sd, char *name)
490 { 490 {
491 if (sd) 491 if (sd)
492 return sysfs_get_dirent(sd, NULL, name); 492 return sysfs_get_dirent(sd, NULL, name);
493 return sd; 493 return sd;
494 } 494 }
495 static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd) 495 static inline void sysfs_notify_dirent_safe(struct sysfs_dirent *sd)
496 { 496 {
497 if (sd) 497 if (sd)
498 sysfs_notify_dirent(sd); 498 sysfs_notify_dirent(sd);
499 } 499 }
500 500
501 static inline char * mdname (struct mddev * mddev) 501 static inline char * mdname (struct mddev * mddev)
502 { 502 {
503 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX"; 503 return mddev->gendisk ? mddev->gendisk->disk_name : "mdX";
504 } 504 }
505 505
506 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev) 506 static inline int sysfs_link_rdev(struct mddev *mddev, struct md_rdev *rdev)
507 { 507 {
508 char nm[20]; 508 char nm[20];
509 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { 509 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
510 sprintf(nm, "rd%d", rdev->raid_disk); 510 sprintf(nm, "rd%d", rdev->raid_disk);
511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm); 511 return sysfs_create_link(&mddev->kobj, &rdev->kobj, nm);
512 } else 512 } else
513 return 0; 513 return 0;
514 } 514 }
515 515
516 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) 516 static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
517 { 517 {
518 char nm[20]; 518 char nm[20];
519 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) { 519 if (!test_bit(Replacement, &rdev->flags) && mddev->kobj.sd) {
520 sprintf(nm, "rd%d", rdev->raid_disk); 520 sprintf(nm, "rd%d", rdev->raid_disk);
521 sysfs_remove_link(&mddev->kobj, nm); 521 sysfs_remove_link(&mddev->kobj, nm);
522 } 522 }
523 } 523 }
524 524
525 /* 525 /*
526 * iterates through some rdev ringlist. It's safe to remove the 526 * iterates through some rdev ringlist. It's safe to remove the
527 * current 'rdev'. Dont touch 'tmp' though. 527 * current 'rdev'. Dont touch 'tmp' though.
528 */ 528 */
529 #define rdev_for_each_list(rdev, tmp, head) \ 529 #define rdev_for_each_list(rdev, tmp, head) \
530 list_for_each_entry_safe(rdev, tmp, head, same_set) 530 list_for_each_entry_safe(rdev, tmp, head, same_set)
531 531
532 /* 532 /*
533 * iterates through the 'same array disks' ringlist 533 * iterates through the 'same array disks' ringlist
534 */ 534 */
535 #define rdev_for_each(rdev, mddev) \ 535 #define rdev_for_each(rdev, mddev) \
536 list_for_each_entry(rdev, &((mddev)->disks), same_set) 536 list_for_each_entry(rdev, &((mddev)->disks), same_set)
537 537
538 #define rdev_for_each_safe(rdev, tmp, mddev) \ 538 #define rdev_for_each_safe(rdev, tmp, mddev) \
539 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 539 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
540 540
541 #define rdev_for_each_rcu(rdev, mddev) \ 541 #define rdev_for_each_rcu(rdev, mddev) \
542 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) 542 list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set)
543 543
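The iterators above walk mddev->disks; the plain rdev_for_each() variant assumes the list is stable (e.g. reconfig_mutex held), while the _rcu form is for readers under rcu_read_lock(). A hedged example counting healthy members (sk_count_in_sync() is illustrative):

static int sk_count_in_sync(struct mddev *mddev)
{
	struct md_rdev *rdev;
	int cnt = 0;

	rdev_for_each(rdev, mddev)
		if (test_bit(In_sync, &rdev->flags) &&
		    !test_bit(Faulty, &rdev->flags))
			cnt++;
	return cnt;
}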
544 struct md_thread { 544 struct md_thread {
545 void (*run) (struct md_thread *thread); 545 void (*run) (struct md_thread *thread);
546 struct mddev *mddev; 546 struct mddev *mddev;
547 wait_queue_head_t wqueue; 547 wait_queue_head_t wqueue;
548 unsigned long flags; 548 unsigned long flags;
549 struct task_struct *tsk; 549 struct task_struct *tsk;
550 unsigned long timeout; 550 unsigned long timeout;
551 void *private; 551 void *private;
552 }; 552 };
553 553
554 #define THREAD_WAKEUP 0 554 #define THREAD_WAKEUP 0
555 555
556 static inline void safe_put_page(struct page *p) 556 static inline void safe_put_page(struct page *p)
557 { 557 {
558 if (p) put_page(p); 558 if (p) put_page(p);
559 } 559 }
560 560
561 extern int register_md_personality(struct md_personality *p); 561 extern int register_md_personality(struct md_personality *p);
562 extern int unregister_md_personality(struct md_personality *p); 562 extern int unregister_md_personality(struct md_personality *p);
563 extern struct md_thread *md_register_thread( 563 extern struct md_thread *md_register_thread(
564 void (*run)(struct md_thread *thread), 564 void (*run)(struct md_thread *thread),
565 struct mddev *mddev, 565 struct mddev *mddev,
566 const char *name); 566 const char *name);
567 extern void md_unregister_thread(struct md_thread **threadp); 567 extern void md_unregister_thread(struct md_thread **threadp);
568 extern void md_wakeup_thread(struct md_thread *thread); 568 extern void md_wakeup_thread(struct md_thread *thread);
569 extern void md_check_recovery(struct mddev *mddev); 569 extern void md_check_recovery(struct mddev *mddev);
570 extern void md_reap_sync_thread(struct mddev *mddev);
570 extern void md_write_start(struct mddev *mddev, struct bio *bi); 571 extern void md_write_start(struct mddev *mddev, struct bio *bi);
571 extern void md_write_end(struct mddev *mddev); 572 extern void md_write_end(struct mddev *mddev);
572 extern void md_done_sync(struct mddev *mddev, int blocks, int ok); 573 extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
573 extern void md_error(struct mddev *mddev, struct md_rdev *rdev); 574 extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
574 extern void md_finish_reshape(struct mddev *mddev); 575 extern void md_finish_reshape(struct mddev *mddev);
575 576
576 extern int mddev_congested(struct mddev *mddev, int bits); 577 extern int mddev_congested(struct mddev *mddev, int bits);
577 extern void md_flush_request(struct mddev *mddev, struct bio *bio); 578 extern void md_flush_request(struct mddev *mddev, struct bio *bio);
578 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 579 extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
579 sector_t sector, int size, struct page *page); 580 sector_t sector, int size, struct page *page);
580 extern void md_super_wait(struct mddev *mddev); 581 extern void md_super_wait(struct mddev *mddev);
581 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 582 extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
582 struct page *page, int rw, bool metadata_op); 583 struct page *page, int rw, bool metadata_op);
583 extern void md_do_sync(struct md_thread *thread); 584 extern void md_do_sync(struct md_thread *thread);
584 extern void md_new_event(struct mddev *mddev); 585 extern void md_new_event(struct mddev *mddev);
585 extern int md_allow_write(struct mddev *mddev); 586 extern int md_allow_write(struct mddev *mddev);
586 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); 587 extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev);
587 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors); 588 extern void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors);
588 extern int md_check_no_bitmap(struct mddev *mddev); 589 extern int md_check_no_bitmap(struct mddev *mddev);
589 extern int md_integrity_register(struct mddev *mddev); 590 extern int md_integrity_register(struct mddev *mddev);
590 extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev); 591 extern void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev);
591 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale); 592 extern int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale);
592 extern void restore_bitmap_write_access(struct file *file); 593 extern void restore_bitmap_write_access(struct file *file);
593 594
594 extern void mddev_init(struct mddev *mddev); 595 extern void mddev_init(struct mddev *mddev);
595 extern int md_run(struct mddev *mddev); 596 extern int md_run(struct mddev *mddev);
596 extern void md_stop(struct mddev *mddev); 597 extern void md_stop(struct mddev *mddev);
597 extern void md_stop_writes(struct mddev *mddev); 598 extern void md_stop_writes(struct mddev *mddev);
598 extern int md_rdev_init(struct md_rdev *rdev); 599 extern int md_rdev_init(struct md_rdev *rdev);
599 extern void md_rdev_clear(struct md_rdev *rdev); 600 extern void md_rdev_clear(struct md_rdev *rdev);
600 601
601 extern void mddev_suspend(struct mddev *mddev); 602 extern void mddev_suspend(struct mddev *mddev);
602 extern void mddev_resume(struct mddev *mddev); 603 extern void mddev_resume(struct mddev *mddev);
603 extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 604 extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
604 struct mddev *mddev); 605 struct mddev *mddev);
605 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 606 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
606 struct mddev *mddev); 607 struct mddev *mddev);
607 extern void md_trim_bio(struct bio *bio, int offset, int size); 608 extern void md_trim_bio(struct bio *bio, int offset, int size);
608 609
609 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule); 610 extern void md_unplug(struct blk_plug_cb *cb, bool from_schedule);
610 static inline int mddev_check_plugged(struct mddev *mddev) 611 static inline int mddev_check_plugged(struct mddev *mddev)
611 { 612 {
612 return !!blk_check_plugged(md_unplug, mddev, 613 return !!blk_check_plugged(md_unplug, mddev,
613 sizeof(struct blk_plug_cb)); 614 sizeof(struct blk_plug_cb));
614 } 615 }
615 #endif /* _MD_MD_H */ 616 #endif /* _MD_MD_H */
616 617
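md_reap_sync_thread() is the newly added extern in the block above. As a hedged illustration only of how an external caller might use it once a resync/recovery thread signals completion, and not code taken from any in-tree caller:

/* illustrative sketch; assumes the caller holds mddev's reconfig mutex,
 * as md_check_recovery() does when it reaps its own sync thread */
static void sk_maybe_reap(struct mddev *mddev)
{
	if (mddev->sync_thread &&
	    test_bit(MD_RECOVERY_DONE, &mddev->recovery))
		md_reap_sync_thread(mddev);	/* unregisters mddev->sync_thread
						 * and queues superblock updates */
}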