Commit ca755175f245b91f72cfa474aaa8acd9c26996f4

Authored by Linus Torvalds

Merge tag 'md/3.15-fixes' of git://neil.brown.name/md

Pull two md bugfixes from Neil Brown:
 "Two md bugfixes for possible corruption when restarting reshape

  If a raid5/6 reshape is restarted (After stopping and re-assembling
  the array) and the array is marked read-only (or read-auto), then the
  reshape will appear to complete immediately, without actually moving
  anything around.  This can result in corruption.

  There are two patches which do much the same thing in different
  places.  They are separate because one is an older bug and so can be
  applied to more -stable kernels"

* tag 'md/3.15-fixes' of git://neil.brown.name/md:
  md: always set MD_RECOVERY_INTR when interrupting a reshape thread.
  md: always set MD_RECOVERY_INTR when aborting a reshape or other "resync".
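The shared idea behind both patches, as the message above describes, is that any path which stops or reaps a reshape/resync thread must first mark it as interrupted, so that a later restart of a read-only (or read-auto) array does not record the reshape as already finished. The fragment below is only a rough sketch of that pattern for illustration; the actual hunks are not shown in this excerpt, and the surrounding function is assumed:

	/* Illustrative sketch, not the literal patch: flag the resync/reshape
	 * as interrupted before tearing the thread down, so the sync thread
	 * does not report the operation as having completed.
	 */
	if (mddev->sync_thread) {
		set_bit(MD_RECOVERY_INTR, &mddev->recovery);
		md_wakeup_thread(mddev->sync_thread);
	}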

Showing 1 changed file: drivers/md/md.c

1 /* 1 /*
2 md.c : Multiple Devices driver for Linux 2 md.c : Multiple Devices driver for Linux
3 Copyright (C) 1998, 1999, 2000 Ingo Molnar 3 Copyright (C) 1998, 1999, 2000 Ingo Molnar
4 4
5 completely rewritten, based on the MD driver code from Marc Zyngier 5 completely rewritten, based on the MD driver code from Marc Zyngier
6 6
7 Changes: 7 Changes:
8 8
9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar 9 - RAID-1/RAID-5 extensions by Miguel de Icaza, Gadi Oxman, Ingo Molnar
10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com> 10 - RAID-6 extensions by H. Peter Anvin <hpa@zytor.com>
11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net> 11 - boot support for linear and striped mode by Harald Hoyer <HarryH@Royal.Net>
12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su> 12 - kerneld support by Boris Tobotras <boris@xtalk.msk.su>
13 - kmod support by: Cyrus Durgin 13 - kmod support by: Cyrus Durgin
14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com> 14 - RAID0 bugfixes: Mark Anthony Lisher <markal@iname.com>
15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au> 15 - Devfs support by Richard Gooch <rgooch@atnf.csiro.au>
16 16
17 - lots of fixes and improvements to the RAID1/RAID5 and generic 17 - lots of fixes and improvements to the RAID1/RAID5 and generic
18 RAID code (such as request based resynchronization): 18 RAID code (such as request based resynchronization):
19 19
20 Neil Brown <neilb@cse.unsw.edu.au>. 20 Neil Brown <neilb@cse.unsw.edu.au>.
21 21
22 - persistent bitmap code 22 - persistent bitmap code
23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. 23 Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
24 24
25 This program is free software; you can redistribute it and/or modify 25 This program is free software; you can redistribute it and/or modify
26 it under the terms of the GNU General Public License as published by 26 it under the terms of the GNU General Public License as published by
27 the Free Software Foundation; either version 2, or (at your option) 27 the Free Software Foundation; either version 2, or (at your option)
28 any later version. 28 any later version.
29 29
30 You should have received a copy of the GNU General Public License 30 You should have received a copy of the GNU General Public License
31 (for example /usr/src/linux/COPYING); if not, write to the Free 31 (for example /usr/src/linux/COPYING); if not, write to the Free
32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 32 Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
33 */ 33 */
34 34
35 #include <linux/kthread.h> 35 #include <linux/kthread.h>
36 #include <linux/blkdev.h> 36 #include <linux/blkdev.h>
37 #include <linux/sysctl.h> 37 #include <linux/sysctl.h>
38 #include <linux/seq_file.h> 38 #include <linux/seq_file.h>
39 #include <linux/fs.h> 39 #include <linux/fs.h>
40 #include <linux/poll.h> 40 #include <linux/poll.h>
41 #include <linux/ctype.h> 41 #include <linux/ctype.h>
42 #include <linux/string.h> 42 #include <linux/string.h>
43 #include <linux/hdreg.h> 43 #include <linux/hdreg.h>
44 #include <linux/proc_fs.h> 44 #include <linux/proc_fs.h>
45 #include <linux/random.h> 45 #include <linux/random.h>
46 #include <linux/module.h> 46 #include <linux/module.h>
47 #include <linux/reboot.h> 47 #include <linux/reboot.h>
48 #include <linux/file.h> 48 #include <linux/file.h>
49 #include <linux/compat.h> 49 #include <linux/compat.h>
50 #include <linux/delay.h> 50 #include <linux/delay.h>
51 #include <linux/raid/md_p.h> 51 #include <linux/raid/md_p.h>
52 #include <linux/raid/md_u.h> 52 #include <linux/raid/md_u.h>
53 #include <linux/slab.h> 53 #include <linux/slab.h>
54 #include "md.h" 54 #include "md.h"
55 #include "bitmap.h" 55 #include "bitmap.h"
56 56
57 #ifndef MODULE 57 #ifndef MODULE
58 static void autostart_arrays(int part); 58 static void autostart_arrays(int part);
59 #endif 59 #endif
60 60
61 /* pers_list is a list of registered personalities protected 61 /* pers_list is a list of registered personalities protected
62 * by pers_lock. 62 * by pers_lock.
63 * pers_lock does extra service to protect accesses to 63 * pers_lock does extra service to protect accesses to
64 * mddev->thread when the mutex cannot be held. 64 * mddev->thread when the mutex cannot be held.
65 */ 65 */
66 static LIST_HEAD(pers_list); 66 static LIST_HEAD(pers_list);
67 static DEFINE_SPINLOCK(pers_lock); 67 static DEFINE_SPINLOCK(pers_lock);
68 68
69 static void md_print_devices(void); 69 static void md_print_devices(void);
70 70
71 static DECLARE_WAIT_QUEUE_HEAD(resync_wait); 71 static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
72 static struct workqueue_struct *md_wq; 72 static struct workqueue_struct *md_wq;
73 static struct workqueue_struct *md_misc_wq; 73 static struct workqueue_struct *md_misc_wq;
74 74
75 static int remove_and_add_spares(struct mddev *mddev, 75 static int remove_and_add_spares(struct mddev *mddev,
76 struct md_rdev *this); 76 struct md_rdev *this);
77 77
78 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 78 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
79 79
80 /* 80 /*
81 * Default number of read corrections we'll attempt on an rdev 81 * Default number of read corrections we'll attempt on an rdev
82 * before ejecting it from the array. We divide the read error 82 * before ejecting it from the array. We divide the read error
83 * count by 2 for every hour elapsed between read errors. 83 * count by 2 for every hour elapsed between read errors.
84 */ 84 */
85 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 85 #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
86 /* 86 /*
87 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 87 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
88 * is 1000 KB/sec, so the extra system load does not show up that much. 88 * is 1000 KB/sec, so the extra system load does not show up that much.
89 * Increase it if you want to have more _guaranteed_ speed. Note that 89 * Increase it if you want to have more _guaranteed_ speed. Note that
90 * the RAID driver will use the maximum available bandwidth if the IO 90 * the RAID driver will use the maximum available bandwidth if the IO
91 * subsystem is idle. There is also an 'absolute maximum' reconstruction 91 * subsystem is idle. There is also an 'absolute maximum' reconstruction
92 * speed limit - in case reconstruction slows down your system despite 92 * speed limit - in case reconstruction slows down your system despite
93 * idle IO detection. 93 * idle IO detection.
94 * 94 *
95 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max. 95 * you can change it via /proc/sys/dev/raid/speed_limit_min and _max.
96 * or /sys/block/mdX/md/sync_speed_{min,max} 96 * or /sys/block/mdX/md/sync_speed_{min,max}
97 */ 97 */
98 98
99 static int sysctl_speed_limit_min = 1000; 99 static int sysctl_speed_limit_min = 1000;
100 static int sysctl_speed_limit_max = 200000; 100 static int sysctl_speed_limit_max = 200000;
101 static inline int speed_min(struct mddev *mddev) 101 static inline int speed_min(struct mddev *mddev)
102 { 102 {
103 return mddev->sync_speed_min ? 103 return mddev->sync_speed_min ?
104 mddev->sync_speed_min : sysctl_speed_limit_min; 104 mddev->sync_speed_min : sysctl_speed_limit_min;
105 } 105 }
106 106
107 static inline int speed_max(struct mddev *mddev) 107 static inline int speed_max(struct mddev *mddev)
108 { 108 {
109 return mddev->sync_speed_max ? 109 return mddev->sync_speed_max ?
110 mddev->sync_speed_max : sysctl_speed_limit_max; 110 mddev->sync_speed_max : sysctl_speed_limit_max;
111 } 111 }
112 112
113 static struct ctl_table_header *raid_table_header; 113 static struct ctl_table_header *raid_table_header;
114 114
115 static struct ctl_table raid_table[] = { 115 static struct ctl_table raid_table[] = {
116 { 116 {
117 .procname = "speed_limit_min", 117 .procname = "speed_limit_min",
118 .data = &sysctl_speed_limit_min, 118 .data = &sysctl_speed_limit_min,
119 .maxlen = sizeof(int), 119 .maxlen = sizeof(int),
120 .mode = S_IRUGO|S_IWUSR, 120 .mode = S_IRUGO|S_IWUSR,
121 .proc_handler = proc_dointvec, 121 .proc_handler = proc_dointvec,
122 }, 122 },
123 { 123 {
124 .procname = "speed_limit_max", 124 .procname = "speed_limit_max",
125 .data = &sysctl_speed_limit_max, 125 .data = &sysctl_speed_limit_max,
126 .maxlen = sizeof(int), 126 .maxlen = sizeof(int),
127 .mode = S_IRUGO|S_IWUSR, 127 .mode = S_IRUGO|S_IWUSR,
128 .proc_handler = proc_dointvec, 128 .proc_handler = proc_dointvec,
129 }, 129 },
130 { } 130 { }
131 }; 131 };
132 132
133 static struct ctl_table raid_dir_table[] = { 133 static struct ctl_table raid_dir_table[] = {
134 { 134 {
135 .procname = "raid", 135 .procname = "raid",
136 .maxlen = 0, 136 .maxlen = 0,
137 .mode = S_IRUGO|S_IXUGO, 137 .mode = S_IRUGO|S_IXUGO,
138 .child = raid_table, 138 .child = raid_table,
139 }, 139 },
140 { } 140 { }
141 }; 141 };
142 142
143 static struct ctl_table raid_root_table[] = { 143 static struct ctl_table raid_root_table[] = {
144 { 144 {
145 .procname = "dev", 145 .procname = "dev",
146 .maxlen = 0, 146 .maxlen = 0,
147 .mode = 0555, 147 .mode = 0555,
148 .child = raid_dir_table, 148 .child = raid_dir_table,
149 }, 149 },
150 { } 150 { }
151 }; 151 };
152 152
153 static const struct block_device_operations md_fops; 153 static const struct block_device_operations md_fops;
154 154
155 static int start_readonly; 155 static int start_readonly;
156 156
157 /* bio_clone_mddev 157 /* bio_clone_mddev
158 * like bio_clone, but with a local bio set 158 * like bio_clone, but with a local bio set
159 */ 159 */
160 160
161 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, 161 struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
162 struct mddev *mddev) 162 struct mddev *mddev)
163 { 163 {
164 struct bio *b; 164 struct bio *b;
165 165
166 if (!mddev || !mddev->bio_set) 166 if (!mddev || !mddev->bio_set)
167 return bio_alloc(gfp_mask, nr_iovecs); 167 return bio_alloc(gfp_mask, nr_iovecs);
168 168
169 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set); 169 b = bio_alloc_bioset(gfp_mask, nr_iovecs, mddev->bio_set);
170 if (!b) 170 if (!b)
171 return NULL; 171 return NULL;
172 return b; 172 return b;
173 } 173 }
174 EXPORT_SYMBOL_GPL(bio_alloc_mddev); 174 EXPORT_SYMBOL_GPL(bio_alloc_mddev);
175 175
176 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, 176 struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
177 struct mddev *mddev) 177 struct mddev *mddev)
178 { 178 {
179 if (!mddev || !mddev->bio_set) 179 if (!mddev || !mddev->bio_set)
180 return bio_clone(bio, gfp_mask); 180 return bio_clone(bio, gfp_mask);
181 181
182 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); 182 return bio_clone_bioset(bio, gfp_mask, mddev->bio_set);
183 } 183 }
184 EXPORT_SYMBOL_GPL(bio_clone_mddev); 184 EXPORT_SYMBOL_GPL(bio_clone_mddev);
185 185
186 /* 186 /*
187 * We have a system wide 'event count' that is incremented 187 * We have a system wide 'event count' that is incremented
188 * on any 'interesting' event, and readers of /proc/mdstat 188 * on any 'interesting' event, and readers of /proc/mdstat
189 * can use 'poll' or 'select' to find out when the event 189 * can use 'poll' or 'select' to find out when the event
190 * count increases. 190 * count increases.
191 * 191 *
192 * Events are: 192 * Events are:
193 * start array, stop array, error, add device, remove device, 193 * start array, stop array, error, add device, remove device,
194 * start build, activate spare 194 * start build, activate spare
195 */ 195 */
196 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters); 196 static DECLARE_WAIT_QUEUE_HEAD(md_event_waiters);
197 static atomic_t md_event_count; 197 static atomic_t md_event_count;
198 void md_new_event(struct mddev *mddev) 198 void md_new_event(struct mddev *mddev)
199 { 199 {
200 atomic_inc(&md_event_count); 200 atomic_inc(&md_event_count);
201 wake_up(&md_event_waiters); 201 wake_up(&md_event_waiters);
202 } 202 }
203 EXPORT_SYMBOL_GPL(md_new_event); 203 EXPORT_SYMBOL_GPL(md_new_event);
204 204
205 /* Alternate version that can be called from interrupts 205 /* Alternate version that can be called from interrupts
206 * when calling sysfs_notify isn't needed. 206 * when calling sysfs_notify isn't needed.
207 */ 207 */
208 static void md_new_event_inintr(struct mddev *mddev) 208 static void md_new_event_inintr(struct mddev *mddev)
209 { 209 {
210 atomic_inc(&md_event_count); 210 atomic_inc(&md_event_count);
211 wake_up(&md_event_waiters); 211 wake_up(&md_event_waiters);
212 } 212 }
213 213
214 /* 214 /*
215 * Enables to iterate over all existing md arrays 215 * Enables to iterate over all existing md arrays
216 * all_mddevs_lock protects this list. 216 * all_mddevs_lock protects this list.
217 */ 217 */
218 static LIST_HEAD(all_mddevs); 218 static LIST_HEAD(all_mddevs);
219 static DEFINE_SPINLOCK(all_mddevs_lock); 219 static DEFINE_SPINLOCK(all_mddevs_lock);
220 220
221 221
222 /* 222 /*
223 * iterates through all used mddevs in the system. 223 * iterates through all used mddevs in the system.
224 * We take care to grab the all_mddevs_lock whenever navigating 224 * We take care to grab the all_mddevs_lock whenever navigating
225 * the list, and to always hold a refcount when unlocked. 225 * the list, and to always hold a refcount when unlocked.
226 * Any code which breaks out of this loop while own 226 * Any code which breaks out of this loop while own
227 * a reference to the current mddev and must mddev_put it. 227 * a reference to the current mddev and must mddev_put it.
228 */ 228 */
229 #define for_each_mddev(_mddev,_tmp) \ 229 #define for_each_mddev(_mddev,_tmp) \
230 \ 230 \
231 for (({ spin_lock(&all_mddevs_lock); \ 231 for (({ spin_lock(&all_mddevs_lock); \
232 _tmp = all_mddevs.next; \ 232 _tmp = all_mddevs.next; \
233 _mddev = NULL;}); \ 233 _mddev = NULL;}); \
234 ({ if (_tmp != &all_mddevs) \ 234 ({ if (_tmp != &all_mddevs) \
235 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\ 235 mddev_get(list_entry(_tmp, struct mddev, all_mddevs));\
236 spin_unlock(&all_mddevs_lock); \ 236 spin_unlock(&all_mddevs_lock); \
237 if (_mddev) mddev_put(_mddev); \ 237 if (_mddev) mddev_put(_mddev); \
238 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \ 238 _mddev = list_entry(_tmp, struct mddev, all_mddevs); \
239 _tmp != &all_mddevs;}); \ 239 _tmp != &all_mddevs;}); \
240 ({ spin_lock(&all_mddevs_lock); \ 240 ({ spin_lock(&all_mddevs_lock); \
241 _tmp = _tmp->next;}) \ 241 _tmp = _tmp->next;}) \
242 ) 242 )
243 243
244 244
245 /* Rather than calling directly into the personality make_request function, 245 /* Rather than calling directly into the personality make_request function,
246 * IO requests come here first so that we can check if the device is 246 * IO requests come here first so that we can check if the device is
247 * being suspended pending a reconfiguration. 247 * being suspended pending a reconfiguration.
248 * We hold a refcount over the call to ->make_request. By the time that 248 * We hold a refcount over the call to ->make_request. By the time that
249 * call has finished, the bio has been linked into some internal structure 249 * call has finished, the bio has been linked into some internal structure
250 * and so is visible to ->quiesce(), so we don't need the refcount any more. 250 * and so is visible to ->quiesce(), so we don't need the refcount any more.
251 */ 251 */
252 static void md_make_request(struct request_queue *q, struct bio *bio) 252 static void md_make_request(struct request_queue *q, struct bio *bio)
253 { 253 {
254 const int rw = bio_data_dir(bio); 254 const int rw = bio_data_dir(bio);
255 struct mddev *mddev = q->queuedata; 255 struct mddev *mddev = q->queuedata;
256 int cpu; 256 int cpu;
257 unsigned int sectors; 257 unsigned int sectors;
258 258
259 if (mddev == NULL || mddev->pers == NULL 259 if (mddev == NULL || mddev->pers == NULL
260 || !mddev->ready) { 260 || !mddev->ready) {
261 bio_io_error(bio); 261 bio_io_error(bio);
262 return; 262 return;
263 } 263 }
264 if (mddev->ro == 1 && unlikely(rw == WRITE)) { 264 if (mddev->ro == 1 && unlikely(rw == WRITE)) {
265 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS); 265 bio_endio(bio, bio_sectors(bio) == 0 ? 0 : -EROFS);
266 return; 266 return;
267 } 267 }
268 smp_rmb(); /* Ensure implications of 'active' are visible */ 268 smp_rmb(); /* Ensure implications of 'active' are visible */
269 rcu_read_lock(); 269 rcu_read_lock();
270 if (mddev->suspended) { 270 if (mddev->suspended) {
271 DEFINE_WAIT(__wait); 271 DEFINE_WAIT(__wait);
272 for (;;) { 272 for (;;) {
273 prepare_to_wait(&mddev->sb_wait, &__wait, 273 prepare_to_wait(&mddev->sb_wait, &__wait,
274 TASK_UNINTERRUPTIBLE); 274 TASK_UNINTERRUPTIBLE);
275 if (!mddev->suspended) 275 if (!mddev->suspended)
276 break; 276 break;
277 rcu_read_unlock(); 277 rcu_read_unlock();
278 schedule(); 278 schedule();
279 rcu_read_lock(); 279 rcu_read_lock();
280 } 280 }
281 finish_wait(&mddev->sb_wait, &__wait); 281 finish_wait(&mddev->sb_wait, &__wait);
282 } 282 }
283 atomic_inc(&mddev->active_io); 283 atomic_inc(&mddev->active_io);
284 rcu_read_unlock(); 284 rcu_read_unlock();
285 285
286 /* 286 /*
287 * save the sectors now since our bio can 287 * save the sectors now since our bio can
288 * go away inside make_request 288 * go away inside make_request
289 */ 289 */
290 sectors = bio_sectors(bio); 290 sectors = bio_sectors(bio);
291 mddev->pers->make_request(mddev, bio); 291 mddev->pers->make_request(mddev, bio);
292 292
293 cpu = part_stat_lock(); 293 cpu = part_stat_lock();
294 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]); 294 part_stat_inc(cpu, &mddev->gendisk->part0, ios[rw]);
295 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors); 295 part_stat_add(cpu, &mddev->gendisk->part0, sectors[rw], sectors);
296 part_stat_unlock(); 296 part_stat_unlock();
297 297
298 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended) 298 if (atomic_dec_and_test(&mddev->active_io) && mddev->suspended)
299 wake_up(&mddev->sb_wait); 299 wake_up(&mddev->sb_wait);
300 } 300 }
301 301
302 /* mddev_suspend makes sure no new requests are submitted 302 /* mddev_suspend makes sure no new requests are submitted
303 * to the device, and that any requests that have been submitted 303 * to the device, and that any requests that have been submitted
304 * are completely handled. 304 * are completely handled.
305 * Once ->stop is called and completes, the module will be completely 305 * Once ->stop is called and completes, the module will be completely
306 * unused. 306 * unused.
307 */ 307 */
308 void mddev_suspend(struct mddev *mddev) 308 void mddev_suspend(struct mddev *mddev)
309 { 309 {
310 BUG_ON(mddev->suspended); 310 BUG_ON(mddev->suspended);
311 mddev->suspended = 1; 311 mddev->suspended = 1;
312 synchronize_rcu(); 312 synchronize_rcu();
313 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0); 313 wait_event(mddev->sb_wait, atomic_read(&mddev->active_io) == 0);
314 mddev->pers->quiesce(mddev, 1); 314 mddev->pers->quiesce(mddev, 1);
315 315
316 del_timer_sync(&mddev->safemode_timer); 316 del_timer_sync(&mddev->safemode_timer);
317 } 317 }
318 EXPORT_SYMBOL_GPL(mddev_suspend); 318 EXPORT_SYMBOL_GPL(mddev_suspend);
319 319
320 void mddev_resume(struct mddev *mddev) 320 void mddev_resume(struct mddev *mddev)
321 { 321 {
322 mddev->suspended = 0; 322 mddev->suspended = 0;
323 wake_up(&mddev->sb_wait); 323 wake_up(&mddev->sb_wait);
324 mddev->pers->quiesce(mddev, 0); 324 mddev->pers->quiesce(mddev, 0);
325 325
326 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 326 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
327 md_wakeup_thread(mddev->thread); 327 md_wakeup_thread(mddev->thread);
328 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 328 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
329 } 329 }
330 EXPORT_SYMBOL_GPL(mddev_resume); 330 EXPORT_SYMBOL_GPL(mddev_resume);
331 331
332 int mddev_congested(struct mddev *mddev, int bits) 332 int mddev_congested(struct mddev *mddev, int bits)
333 { 333 {
334 return mddev->suspended; 334 return mddev->suspended;
335 } 335 }
336 EXPORT_SYMBOL(mddev_congested); 336 EXPORT_SYMBOL(mddev_congested);
337 337
338 /* 338 /*
339 * Generic flush handling for md 339 * Generic flush handling for md
340 */ 340 */
341 341
342 static void md_end_flush(struct bio *bio, int err) 342 static void md_end_flush(struct bio *bio, int err)
343 { 343 {
344 struct md_rdev *rdev = bio->bi_private; 344 struct md_rdev *rdev = bio->bi_private;
345 struct mddev *mddev = rdev->mddev; 345 struct mddev *mddev = rdev->mddev;
346 346
347 rdev_dec_pending(rdev, mddev); 347 rdev_dec_pending(rdev, mddev);
348 348
349 if (atomic_dec_and_test(&mddev->flush_pending)) { 349 if (atomic_dec_and_test(&mddev->flush_pending)) {
350 /* The pre-request flush has finished */ 350 /* The pre-request flush has finished */
351 queue_work(md_wq, &mddev->flush_work); 351 queue_work(md_wq, &mddev->flush_work);
352 } 352 }
353 bio_put(bio); 353 bio_put(bio);
354 } 354 }
355 355
356 static void md_submit_flush_data(struct work_struct *ws); 356 static void md_submit_flush_data(struct work_struct *ws);
357 357
358 static void submit_flushes(struct work_struct *ws) 358 static void submit_flushes(struct work_struct *ws)
359 { 359 {
360 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 360 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
361 struct md_rdev *rdev; 361 struct md_rdev *rdev;
362 362
363 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 363 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
364 atomic_set(&mddev->flush_pending, 1); 364 atomic_set(&mddev->flush_pending, 1);
365 rcu_read_lock(); 365 rcu_read_lock();
366 rdev_for_each_rcu(rdev, mddev) 366 rdev_for_each_rcu(rdev, mddev)
367 if (rdev->raid_disk >= 0 && 367 if (rdev->raid_disk >= 0 &&
368 !test_bit(Faulty, &rdev->flags)) { 368 !test_bit(Faulty, &rdev->flags)) {
369 /* Take two references, one is dropped 369 /* Take two references, one is dropped
370 * when request finishes, one after 370 * when request finishes, one after
371 * we reclaim rcu_read_lock 371 * we reclaim rcu_read_lock
372 */ 372 */
373 struct bio *bi; 373 struct bio *bi;
374 atomic_inc(&rdev->nr_pending); 374 atomic_inc(&rdev->nr_pending);
375 atomic_inc(&rdev->nr_pending); 375 atomic_inc(&rdev->nr_pending);
376 rcu_read_unlock(); 376 rcu_read_unlock();
377 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev); 377 bi = bio_alloc_mddev(GFP_NOIO, 0, mddev);
378 bi->bi_end_io = md_end_flush; 378 bi->bi_end_io = md_end_flush;
379 bi->bi_private = rdev; 379 bi->bi_private = rdev;
380 bi->bi_bdev = rdev->bdev; 380 bi->bi_bdev = rdev->bdev;
381 atomic_inc(&mddev->flush_pending); 381 atomic_inc(&mddev->flush_pending);
382 submit_bio(WRITE_FLUSH, bi); 382 submit_bio(WRITE_FLUSH, bi);
383 rcu_read_lock(); 383 rcu_read_lock();
384 rdev_dec_pending(rdev, mddev); 384 rdev_dec_pending(rdev, mddev);
385 } 385 }
386 rcu_read_unlock(); 386 rcu_read_unlock();
387 if (atomic_dec_and_test(&mddev->flush_pending)) 387 if (atomic_dec_and_test(&mddev->flush_pending))
388 queue_work(md_wq, &mddev->flush_work); 388 queue_work(md_wq, &mddev->flush_work);
389 } 389 }
390 390
391 static void md_submit_flush_data(struct work_struct *ws) 391 static void md_submit_flush_data(struct work_struct *ws)
392 { 392 {
393 struct mddev *mddev = container_of(ws, struct mddev, flush_work); 393 struct mddev *mddev = container_of(ws, struct mddev, flush_work);
394 struct bio *bio = mddev->flush_bio; 394 struct bio *bio = mddev->flush_bio;
395 395
396 if (bio->bi_iter.bi_size == 0) 396 if (bio->bi_iter.bi_size == 0)
397 /* an empty barrier - all done */ 397 /* an empty barrier - all done */
398 bio_endio(bio, 0); 398 bio_endio(bio, 0);
399 else { 399 else {
400 bio->bi_rw &= ~REQ_FLUSH; 400 bio->bi_rw &= ~REQ_FLUSH;
401 mddev->pers->make_request(mddev, bio); 401 mddev->pers->make_request(mddev, bio);
402 } 402 }
403 403
404 mddev->flush_bio = NULL; 404 mddev->flush_bio = NULL;
405 wake_up(&mddev->sb_wait); 405 wake_up(&mddev->sb_wait);
406 } 406 }
407 407
408 void md_flush_request(struct mddev *mddev, struct bio *bio) 408 void md_flush_request(struct mddev *mddev, struct bio *bio)
409 { 409 {
410 spin_lock_irq(&mddev->write_lock); 410 spin_lock_irq(&mddev->write_lock);
411 wait_event_lock_irq(mddev->sb_wait, 411 wait_event_lock_irq(mddev->sb_wait,
412 !mddev->flush_bio, 412 !mddev->flush_bio,
413 mddev->write_lock); 413 mddev->write_lock);
414 mddev->flush_bio = bio; 414 mddev->flush_bio = bio;
415 spin_unlock_irq(&mddev->write_lock); 415 spin_unlock_irq(&mddev->write_lock);
416 416
417 INIT_WORK(&mddev->flush_work, submit_flushes); 417 INIT_WORK(&mddev->flush_work, submit_flushes);
418 queue_work(md_wq, &mddev->flush_work); 418 queue_work(md_wq, &mddev->flush_work);
419 } 419 }
420 EXPORT_SYMBOL(md_flush_request); 420 EXPORT_SYMBOL(md_flush_request);
421 421
422 void md_unplug(struct blk_plug_cb *cb, bool from_schedule) 422 void md_unplug(struct blk_plug_cb *cb, bool from_schedule)
423 { 423 {
424 struct mddev *mddev = cb->data; 424 struct mddev *mddev = cb->data;
425 md_wakeup_thread(mddev->thread); 425 md_wakeup_thread(mddev->thread);
426 kfree(cb); 426 kfree(cb);
427 } 427 }
428 EXPORT_SYMBOL(md_unplug); 428 EXPORT_SYMBOL(md_unplug);
429 429
430 static inline struct mddev *mddev_get(struct mddev *mddev) 430 static inline struct mddev *mddev_get(struct mddev *mddev)
431 { 431 {
432 atomic_inc(&mddev->active); 432 atomic_inc(&mddev->active);
433 return mddev; 433 return mddev;
434 } 434 }
435 435
436 static void mddev_delayed_delete(struct work_struct *ws); 436 static void mddev_delayed_delete(struct work_struct *ws);
437 437
438 static void mddev_put(struct mddev *mddev) 438 static void mddev_put(struct mddev *mddev)
439 { 439 {
440 struct bio_set *bs = NULL; 440 struct bio_set *bs = NULL;
441 441
442 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock)) 442 if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
443 return; 443 return;
444 if (!mddev->raid_disks && list_empty(&mddev->disks) && 444 if (!mddev->raid_disks && list_empty(&mddev->disks) &&
445 mddev->ctime == 0 && !mddev->hold_active) { 445 mddev->ctime == 0 && !mddev->hold_active) {
446 /* Array is not configured at all, and not held active, 446 /* Array is not configured at all, and not held active,
447 * so destroy it */ 447 * so destroy it */
448 list_del_init(&mddev->all_mddevs); 448 list_del_init(&mddev->all_mddevs);
449 bs = mddev->bio_set; 449 bs = mddev->bio_set;
450 mddev->bio_set = NULL; 450 mddev->bio_set = NULL;
451 if (mddev->gendisk) { 451 if (mddev->gendisk) {
452 /* We did a probe so need to clean up. Call 452 /* We did a probe so need to clean up. Call
453 * queue_work inside the spinlock so that 453 * queue_work inside the spinlock so that
454 * flush_workqueue() after mddev_find will 454 * flush_workqueue() after mddev_find will
455 * succeed in waiting for the work to be done. 455 * succeed in waiting for the work to be done.
456 */ 456 */
457 INIT_WORK(&mddev->del_work, mddev_delayed_delete); 457 INIT_WORK(&mddev->del_work, mddev_delayed_delete);
458 queue_work(md_misc_wq, &mddev->del_work); 458 queue_work(md_misc_wq, &mddev->del_work);
459 } else 459 } else
460 kfree(mddev); 460 kfree(mddev);
461 } 461 }
462 spin_unlock(&all_mddevs_lock); 462 spin_unlock(&all_mddevs_lock);
463 if (bs) 463 if (bs)
464 bioset_free(bs); 464 bioset_free(bs);
465 } 465 }
466 466
467 void mddev_init(struct mddev *mddev) 467 void mddev_init(struct mddev *mddev)
468 { 468 {
469 mutex_init(&mddev->open_mutex); 469 mutex_init(&mddev->open_mutex);
470 mutex_init(&mddev->reconfig_mutex); 470 mutex_init(&mddev->reconfig_mutex);
471 mutex_init(&mddev->bitmap_info.mutex); 471 mutex_init(&mddev->bitmap_info.mutex);
472 INIT_LIST_HEAD(&mddev->disks); 472 INIT_LIST_HEAD(&mddev->disks);
473 INIT_LIST_HEAD(&mddev->all_mddevs); 473 INIT_LIST_HEAD(&mddev->all_mddevs);
474 init_timer(&mddev->safemode_timer); 474 init_timer(&mddev->safemode_timer);
475 atomic_set(&mddev->active, 1); 475 atomic_set(&mddev->active, 1);
476 atomic_set(&mddev->openers, 0); 476 atomic_set(&mddev->openers, 0);
477 atomic_set(&mddev->active_io, 0); 477 atomic_set(&mddev->active_io, 0);
478 spin_lock_init(&mddev->write_lock); 478 spin_lock_init(&mddev->write_lock);
479 atomic_set(&mddev->flush_pending, 0); 479 atomic_set(&mddev->flush_pending, 0);
480 init_waitqueue_head(&mddev->sb_wait); 480 init_waitqueue_head(&mddev->sb_wait);
481 init_waitqueue_head(&mddev->recovery_wait); 481 init_waitqueue_head(&mddev->recovery_wait);
482 mddev->reshape_position = MaxSector; 482 mddev->reshape_position = MaxSector;
483 mddev->reshape_backwards = 0; 483 mddev->reshape_backwards = 0;
484 mddev->last_sync_action = "none"; 484 mddev->last_sync_action = "none";
485 mddev->resync_min = 0; 485 mddev->resync_min = 0;
486 mddev->resync_max = MaxSector; 486 mddev->resync_max = MaxSector;
487 mddev->level = LEVEL_NONE; 487 mddev->level = LEVEL_NONE;
488 } 488 }
489 EXPORT_SYMBOL_GPL(mddev_init); 489 EXPORT_SYMBOL_GPL(mddev_init);
490 490
491 static struct mddev * mddev_find(dev_t unit) 491 static struct mddev * mddev_find(dev_t unit)
492 { 492 {
493 struct mddev *mddev, *new = NULL; 493 struct mddev *mddev, *new = NULL;
494 494
495 if (unit && MAJOR(unit) != MD_MAJOR) 495 if (unit && MAJOR(unit) != MD_MAJOR)
496 unit &= ~((1<<MdpMinorShift)-1); 496 unit &= ~((1<<MdpMinorShift)-1);
497 497
498 retry: 498 retry:
499 spin_lock(&all_mddevs_lock); 499 spin_lock(&all_mddevs_lock);
500 500
501 if (unit) { 501 if (unit) {
502 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 502 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
503 if (mddev->unit == unit) { 503 if (mddev->unit == unit) {
504 mddev_get(mddev); 504 mddev_get(mddev);
505 spin_unlock(&all_mddevs_lock); 505 spin_unlock(&all_mddevs_lock);
506 kfree(new); 506 kfree(new);
507 return mddev; 507 return mddev;
508 } 508 }
509 509
510 if (new) { 510 if (new) {
511 list_add(&new->all_mddevs, &all_mddevs); 511 list_add(&new->all_mddevs, &all_mddevs);
512 spin_unlock(&all_mddevs_lock); 512 spin_unlock(&all_mddevs_lock);
513 new->hold_active = UNTIL_IOCTL; 513 new->hold_active = UNTIL_IOCTL;
514 return new; 514 return new;
515 } 515 }
516 } else if (new) { 516 } else if (new) {
517 /* find an unused unit number */ 517 /* find an unused unit number */
518 static int next_minor = 512; 518 static int next_minor = 512;
519 int start = next_minor; 519 int start = next_minor;
520 int is_free = 0; 520 int is_free = 0;
521 int dev = 0; 521 int dev = 0;
522 while (!is_free) { 522 while (!is_free) {
523 dev = MKDEV(MD_MAJOR, next_minor); 523 dev = MKDEV(MD_MAJOR, next_minor);
524 next_minor++; 524 next_minor++;
525 if (next_minor > MINORMASK) 525 if (next_minor > MINORMASK)
526 next_minor = 0; 526 next_minor = 0;
527 if (next_minor == start) { 527 if (next_minor == start) {
528 /* Oh dear, all in use. */ 528 /* Oh dear, all in use. */
529 spin_unlock(&all_mddevs_lock); 529 spin_unlock(&all_mddevs_lock);
530 kfree(new); 530 kfree(new);
531 return NULL; 531 return NULL;
532 } 532 }
533 533
534 is_free = 1; 534 is_free = 1;
535 list_for_each_entry(mddev, &all_mddevs, all_mddevs) 535 list_for_each_entry(mddev, &all_mddevs, all_mddevs)
536 if (mddev->unit == dev) { 536 if (mddev->unit == dev) {
537 is_free = 0; 537 is_free = 0;
538 break; 538 break;
539 } 539 }
540 } 540 }
541 new->unit = dev; 541 new->unit = dev;
542 new->md_minor = MINOR(dev); 542 new->md_minor = MINOR(dev);
543 new->hold_active = UNTIL_STOP; 543 new->hold_active = UNTIL_STOP;
544 list_add(&new->all_mddevs, &all_mddevs); 544 list_add(&new->all_mddevs, &all_mddevs);
545 spin_unlock(&all_mddevs_lock); 545 spin_unlock(&all_mddevs_lock);
546 return new; 546 return new;
547 } 547 }
548 spin_unlock(&all_mddevs_lock); 548 spin_unlock(&all_mddevs_lock);
549 549
550 new = kzalloc(sizeof(*new), GFP_KERNEL); 550 new = kzalloc(sizeof(*new), GFP_KERNEL);
551 if (!new) 551 if (!new)
552 return NULL; 552 return NULL;
553 553
554 new->unit = unit; 554 new->unit = unit;
555 if (MAJOR(unit) == MD_MAJOR) 555 if (MAJOR(unit) == MD_MAJOR)
556 new->md_minor = MINOR(unit); 556 new->md_minor = MINOR(unit);
557 else 557 else
558 new->md_minor = MINOR(unit) >> MdpMinorShift; 558 new->md_minor = MINOR(unit) >> MdpMinorShift;
559 559
560 mddev_init(new); 560 mddev_init(new);
561 561
562 goto retry; 562 goto retry;
563 } 563 }
564 564
565 static inline int __must_check mddev_lock(struct mddev * mddev) 565 static inline int __must_check mddev_lock(struct mddev * mddev)
566 { 566 {
567 return mutex_lock_interruptible(&mddev->reconfig_mutex); 567 return mutex_lock_interruptible(&mddev->reconfig_mutex);
568 } 568 }
569 569
570 /* Sometimes we need to take the lock in a situation where 570 /* Sometimes we need to take the lock in a situation where
571 * failure due to interrupts is not acceptable. 571 * failure due to interrupts is not acceptable.
572 */ 572 */
573 static inline void mddev_lock_nointr(struct mddev * mddev) 573 static inline void mddev_lock_nointr(struct mddev * mddev)
574 { 574 {
575 mutex_lock(&mddev->reconfig_mutex); 575 mutex_lock(&mddev->reconfig_mutex);
576 } 576 }
577 577
578 static inline int mddev_is_locked(struct mddev *mddev) 578 static inline int mddev_is_locked(struct mddev *mddev)
579 { 579 {
580 return mutex_is_locked(&mddev->reconfig_mutex); 580 return mutex_is_locked(&mddev->reconfig_mutex);
581 } 581 }
582 582
583 static inline int mddev_trylock(struct mddev * mddev) 583 static inline int mddev_trylock(struct mddev * mddev)
584 { 584 {
585 return mutex_trylock(&mddev->reconfig_mutex); 585 return mutex_trylock(&mddev->reconfig_mutex);
586 } 586 }
587 587
588 static struct attribute_group md_redundancy_group; 588 static struct attribute_group md_redundancy_group;
589 589
590 static void mddev_unlock(struct mddev * mddev) 590 static void mddev_unlock(struct mddev * mddev)
591 { 591 {
592 if (mddev->to_remove) { 592 if (mddev->to_remove) {
593 /* These cannot be removed under reconfig_mutex as 593 /* These cannot be removed under reconfig_mutex as
594 * an access to the files will try to take reconfig_mutex 594 * an access to the files will try to take reconfig_mutex
595 * while holding the file unremovable, which leads to 595 * while holding the file unremovable, which leads to
596 * a deadlock. 596 * a deadlock.
597 * So hold set sysfs_active while the remove in happeing, 597 * So hold set sysfs_active while the remove in happeing,
598 * and anything else which might set ->to_remove or my 598 * and anything else which might set ->to_remove or my
599 * otherwise change the sysfs namespace will fail with 599 * otherwise change the sysfs namespace will fail with
600 * -EBUSY if sysfs_active is still set. 600 * -EBUSY if sysfs_active is still set.
601 * We set sysfs_active under reconfig_mutex and elsewhere 601 * We set sysfs_active under reconfig_mutex and elsewhere
602 * test it under the same mutex to ensure its correct value 602 * test it under the same mutex to ensure its correct value
603 * is seen. 603 * is seen.
604 */ 604 */
605 struct attribute_group *to_remove = mddev->to_remove; 605 struct attribute_group *to_remove = mddev->to_remove;
606 mddev->to_remove = NULL; 606 mddev->to_remove = NULL;
607 mddev->sysfs_active = 1; 607 mddev->sysfs_active = 1;
608 mutex_unlock(&mddev->reconfig_mutex); 608 mutex_unlock(&mddev->reconfig_mutex);
609 609
610 if (mddev->kobj.sd) { 610 if (mddev->kobj.sd) {
611 if (to_remove != &md_redundancy_group) 611 if (to_remove != &md_redundancy_group)
612 sysfs_remove_group(&mddev->kobj, to_remove); 612 sysfs_remove_group(&mddev->kobj, to_remove);
613 if (mddev->pers == NULL || 613 if (mddev->pers == NULL ||
614 mddev->pers->sync_request == NULL) { 614 mddev->pers->sync_request == NULL) {
615 sysfs_remove_group(&mddev->kobj, &md_redundancy_group); 615 sysfs_remove_group(&mddev->kobj, &md_redundancy_group);
616 if (mddev->sysfs_action) 616 if (mddev->sysfs_action)
617 sysfs_put(mddev->sysfs_action); 617 sysfs_put(mddev->sysfs_action);
618 mddev->sysfs_action = NULL; 618 mddev->sysfs_action = NULL;
619 } 619 }
620 } 620 }
621 mddev->sysfs_active = 0; 621 mddev->sysfs_active = 0;
622 } else 622 } else
623 mutex_unlock(&mddev->reconfig_mutex); 623 mutex_unlock(&mddev->reconfig_mutex);
624 624
625 /* As we've dropped the mutex we need a spinlock to 625 /* As we've dropped the mutex we need a spinlock to
626 * make sure the thread doesn't disappear 626 * make sure the thread doesn't disappear
627 */ 627 */
628 spin_lock(&pers_lock); 628 spin_lock(&pers_lock);
629 md_wakeup_thread(mddev->thread); 629 md_wakeup_thread(mddev->thread);
630 spin_unlock(&pers_lock); 630 spin_unlock(&pers_lock);
631 } 631 }
632 632
633 static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) 633 static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
634 { 634 {
635 struct md_rdev *rdev; 635 struct md_rdev *rdev;
636 636
637 rdev_for_each(rdev, mddev) 637 rdev_for_each(rdev, mddev)
638 if (rdev->desc_nr == nr) 638 if (rdev->desc_nr == nr)
639 return rdev; 639 return rdev;
640 640
641 return NULL; 641 return NULL;
642 } 642 }
643 643
644 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr) 644 static struct md_rdev *find_rdev_nr_rcu(struct mddev *mddev, int nr)
645 { 645 {
646 struct md_rdev *rdev; 646 struct md_rdev *rdev;
647 647
648 rdev_for_each_rcu(rdev, mddev) 648 rdev_for_each_rcu(rdev, mddev)
649 if (rdev->desc_nr == nr) 649 if (rdev->desc_nr == nr)
650 return rdev; 650 return rdev;
651 651
652 return NULL; 652 return NULL;
653 } 653 }
654 654
655 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev) 655 static struct md_rdev *find_rdev(struct mddev *mddev, dev_t dev)
656 { 656 {
657 struct md_rdev *rdev; 657 struct md_rdev *rdev;
658 658
659 rdev_for_each(rdev, mddev) 659 rdev_for_each(rdev, mddev)
660 if (rdev->bdev->bd_dev == dev) 660 if (rdev->bdev->bd_dev == dev)
661 return rdev; 661 return rdev;
662 662
663 return NULL; 663 return NULL;
664 } 664 }
665 665
666 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev) 666 static struct md_rdev *find_rdev_rcu(struct mddev *mddev, dev_t dev)
667 { 667 {
668 struct md_rdev *rdev; 668 struct md_rdev *rdev;
669 669
670 rdev_for_each_rcu(rdev, mddev) 670 rdev_for_each_rcu(rdev, mddev)
671 if (rdev->bdev->bd_dev == dev) 671 if (rdev->bdev->bd_dev == dev)
672 return rdev; 672 return rdev;
673 673
674 return NULL; 674 return NULL;
675 } 675 }
676 676
677 static struct md_personality *find_pers(int level, char *clevel) 677 static struct md_personality *find_pers(int level, char *clevel)
678 { 678 {
679 struct md_personality *pers; 679 struct md_personality *pers;
680 list_for_each_entry(pers, &pers_list, list) { 680 list_for_each_entry(pers, &pers_list, list) {
681 if (level != LEVEL_NONE && pers->level == level) 681 if (level != LEVEL_NONE && pers->level == level)
682 return pers; 682 return pers;
683 if (strcmp(pers->name, clevel)==0) 683 if (strcmp(pers->name, clevel)==0)
684 return pers; 684 return pers;
685 } 685 }
686 return NULL; 686 return NULL;
687 } 687 }
688 688
689 /* return the offset of the super block in 512byte sectors */ 689 /* return the offset of the super block in 512byte sectors */
690 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev) 690 static inline sector_t calc_dev_sboffset(struct md_rdev *rdev)
691 { 691 {
692 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512; 692 sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
693 return MD_NEW_SIZE_SECTORS(num_sectors); 693 return MD_NEW_SIZE_SECTORS(num_sectors);
694 } 694 }
695 695
696 static int alloc_disk_sb(struct md_rdev * rdev) 696 static int alloc_disk_sb(struct md_rdev * rdev)
697 { 697 {
698 if (rdev->sb_page) 698 if (rdev->sb_page)
699 MD_BUG(); 699 MD_BUG();
700 700
701 rdev->sb_page = alloc_page(GFP_KERNEL); 701 rdev->sb_page = alloc_page(GFP_KERNEL);
702 if (!rdev->sb_page) { 702 if (!rdev->sb_page) {
703 printk(KERN_ALERT "md: out of memory.\n"); 703 printk(KERN_ALERT "md: out of memory.\n");
704 return -ENOMEM; 704 return -ENOMEM;
705 } 705 }
706 706
707 return 0; 707 return 0;
708 } 708 }
709 709
710 void md_rdev_clear(struct md_rdev *rdev) 710 void md_rdev_clear(struct md_rdev *rdev)
711 { 711 {
712 if (rdev->sb_page) { 712 if (rdev->sb_page) {
713 put_page(rdev->sb_page); 713 put_page(rdev->sb_page);
714 rdev->sb_loaded = 0; 714 rdev->sb_loaded = 0;
715 rdev->sb_page = NULL; 715 rdev->sb_page = NULL;
716 rdev->sb_start = 0; 716 rdev->sb_start = 0;
717 rdev->sectors = 0; 717 rdev->sectors = 0;
718 } 718 }
719 if (rdev->bb_page) { 719 if (rdev->bb_page) {
720 put_page(rdev->bb_page); 720 put_page(rdev->bb_page);
721 rdev->bb_page = NULL; 721 rdev->bb_page = NULL;
722 } 722 }
723 kfree(rdev->badblocks.page); 723 kfree(rdev->badblocks.page);
724 rdev->badblocks.page = NULL; 724 rdev->badblocks.page = NULL;
725 } 725 }
726 EXPORT_SYMBOL_GPL(md_rdev_clear); 726 EXPORT_SYMBOL_GPL(md_rdev_clear);
727 727
728 static void super_written(struct bio *bio, int error) 728 static void super_written(struct bio *bio, int error)
729 { 729 {
730 struct md_rdev *rdev = bio->bi_private; 730 struct md_rdev *rdev = bio->bi_private;
731 struct mddev *mddev = rdev->mddev; 731 struct mddev *mddev = rdev->mddev;
732 732
733 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) { 733 if (error || !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
734 printk("md: super_written gets error=%d, uptodate=%d\n", 734 printk("md: super_written gets error=%d, uptodate=%d\n",
735 error, test_bit(BIO_UPTODATE, &bio->bi_flags)); 735 error, test_bit(BIO_UPTODATE, &bio->bi_flags));
736 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags)); 736 WARN_ON(test_bit(BIO_UPTODATE, &bio->bi_flags));
737 md_error(mddev, rdev); 737 md_error(mddev, rdev);
738 } 738 }
739 739
740 if (atomic_dec_and_test(&mddev->pending_writes)) 740 if (atomic_dec_and_test(&mddev->pending_writes))
741 wake_up(&mddev->sb_wait); 741 wake_up(&mddev->sb_wait);
742 bio_put(bio); 742 bio_put(bio);
743 } 743 }
744 744
745 void md_super_write(struct mddev *mddev, struct md_rdev *rdev, 745 void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
746 sector_t sector, int size, struct page *page) 746 sector_t sector, int size, struct page *page)
747 { 747 {
748 /* write first size bytes of page to sector of rdev 748 /* write first size bytes of page to sector of rdev
749 * Increment mddev->pending_writes before returning 749 * Increment mddev->pending_writes before returning
750 * and decrement it on completion, waking up sb_wait 750 * and decrement it on completion, waking up sb_wait
751 * if zero is reached. 751 * if zero is reached.
752 * If an error occurred, call md_error 752 * If an error occurred, call md_error
753 */ 753 */
754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev); 754 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
755 755
756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev; 756 bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
757 bio->bi_iter.bi_sector = sector; 757 bio->bi_iter.bi_sector = sector;
758 bio_add_page(bio, page, size, 0); 758 bio_add_page(bio, page, size, 0);
759 bio->bi_private = rdev; 759 bio->bi_private = rdev;
760 bio->bi_end_io = super_written; 760 bio->bi_end_io = super_written;
761 761
762 atomic_inc(&mddev->pending_writes); 762 atomic_inc(&mddev->pending_writes);
763 submit_bio(WRITE_FLUSH_FUA, bio); 763 submit_bio(WRITE_FLUSH_FUA, bio);
764 } 764 }
765 765
766 void md_super_wait(struct mddev *mddev) 766 void md_super_wait(struct mddev *mddev)
767 { 767 {
768 /* wait for all superblock writes that were scheduled to complete */ 768 /* wait for all superblock writes that were scheduled to complete */
769 DEFINE_WAIT(wq); 769 DEFINE_WAIT(wq);
770 for(;;) { 770 for(;;) {
771 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE); 771 prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
772 if (atomic_read(&mddev->pending_writes)==0) 772 if (atomic_read(&mddev->pending_writes)==0)
773 break; 773 break;
774 schedule(); 774 schedule();
775 } 775 }
776 finish_wait(&mddev->sb_wait, &wq); 776 finish_wait(&mddev->sb_wait, &wq);
777 } 777 }
778 778
779 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, 779 int sync_page_io(struct md_rdev *rdev, sector_t sector, int size,
780 struct page *page, int rw, bool metadata_op) 780 struct page *page, int rw, bool metadata_op)
781 { 781 {
782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev); 782 struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
783 int ret; 783 int ret;
784 784
785 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ? 785 bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
786 rdev->meta_bdev : rdev->bdev; 786 rdev->meta_bdev : rdev->bdev;
787 if (metadata_op) 787 if (metadata_op)
788 bio->bi_iter.bi_sector = sector + rdev->sb_start; 788 bio->bi_iter.bi_sector = sector + rdev->sb_start;
789 else if (rdev->mddev->reshape_position != MaxSector && 789 else if (rdev->mddev->reshape_position != MaxSector &&
790 (rdev->mddev->reshape_backwards == 790 (rdev->mddev->reshape_backwards ==
791 (sector >= rdev->mddev->reshape_position))) 791 (sector >= rdev->mddev->reshape_position)))
792 bio->bi_iter.bi_sector = sector + rdev->new_data_offset; 792 bio->bi_iter.bi_sector = sector + rdev->new_data_offset;
793 else 793 else
794 bio->bi_iter.bi_sector = sector + rdev->data_offset; 794 bio->bi_iter.bi_sector = sector + rdev->data_offset;
795 bio_add_page(bio, page, size, 0); 795 bio_add_page(bio, page, size, 0);
796 submit_bio_wait(rw, bio); 796 submit_bio_wait(rw, bio);
797 797
798 ret = test_bit(BIO_UPTODATE, &bio->bi_flags); 798 ret = test_bit(BIO_UPTODATE, &bio->bi_flags);
799 bio_put(bio); 799 bio_put(bio);
800 return ret; 800 return ret;
801 } 801 }
802 EXPORT_SYMBOL_GPL(sync_page_io); 802 EXPORT_SYMBOL_GPL(sync_page_io);
803 803
804 static int read_disk_sb(struct md_rdev * rdev, int size) 804 static int read_disk_sb(struct md_rdev * rdev, int size)
805 { 805 {
806 char b[BDEVNAME_SIZE]; 806 char b[BDEVNAME_SIZE];
807 if (!rdev->sb_page) { 807 if (!rdev->sb_page) {
808 MD_BUG(); 808 MD_BUG();
809 return -EINVAL; 809 return -EINVAL;
810 } 810 }
811 if (rdev->sb_loaded) 811 if (rdev->sb_loaded)
812 return 0; 812 return 0;
813 813
814 814
815 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true)) 815 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
816 goto fail; 816 goto fail;
817 rdev->sb_loaded = 1; 817 rdev->sb_loaded = 1;
818 return 0; 818 return 0;
819 819
820 fail: 820 fail:
821 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n", 821 printk(KERN_WARNING "md: disabled device %s, could not read superblock.\n",
822 bdevname(rdev->bdev,b)); 822 bdevname(rdev->bdev,b));
823 return -EINVAL; 823 return -EINVAL;
824 } 824 }
825 825
826 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2) 826 static int uuid_equal(mdp_super_t *sb1, mdp_super_t *sb2)
827 { 827 {
828 return sb1->set_uuid0 == sb2->set_uuid0 && 828 return sb1->set_uuid0 == sb2->set_uuid0 &&
829 sb1->set_uuid1 == sb2->set_uuid1 && 829 sb1->set_uuid1 == sb2->set_uuid1 &&
830 sb1->set_uuid2 == sb2->set_uuid2 && 830 sb1->set_uuid2 == sb2->set_uuid2 &&
831 sb1->set_uuid3 == sb2->set_uuid3; 831 sb1->set_uuid3 == sb2->set_uuid3;
832 } 832 }
833 833
834 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2) 834 static int sb_equal(mdp_super_t *sb1, mdp_super_t *sb2)
835 { 835 {
836 int ret; 836 int ret;
837 mdp_super_t *tmp1, *tmp2; 837 mdp_super_t *tmp1, *tmp2;
838 838
839 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL); 839 tmp1 = kmalloc(sizeof(*tmp1),GFP_KERNEL);
840 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL); 840 tmp2 = kmalloc(sizeof(*tmp2),GFP_KERNEL);
841 841
842 if (!tmp1 || !tmp2) { 842 if (!tmp1 || !tmp2) {
843 ret = 0; 843 ret = 0;
844 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n"); 844 printk(KERN_INFO "md.c sb_equal(): failed to allocate memory!\n");
845 goto abort; 845 goto abort;
846 } 846 }
847 847
848 *tmp1 = *sb1; 848 *tmp1 = *sb1;
849 *tmp2 = *sb2; 849 *tmp2 = *sb2;
850 850
851 /* 851 /*
852 * nr_disks is not constant 852 * nr_disks is not constant
853 */ 853 */
854 tmp1->nr_disks = 0; 854 tmp1->nr_disks = 0;
855 tmp2->nr_disks = 0; 855 tmp2->nr_disks = 0;
856 856
857 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0); 857 ret = (memcmp(tmp1, tmp2, MD_SB_GENERIC_CONSTANT_WORDS * 4) == 0);
858 abort: 858 abort:
859 kfree(tmp1); 859 kfree(tmp1);
860 kfree(tmp2); 860 kfree(tmp2);
861 return ret; 861 return ret;
862 } 862 }
863 863
864 864
865 static u32 md_csum_fold(u32 csum) 865 static u32 md_csum_fold(u32 csum)
866 { 866 {
867 csum = (csum & 0xffff) + (csum >> 16); 867 csum = (csum & 0xffff) + (csum >> 16);
868 return (csum & 0xffff) + (csum >> 16); 868 return (csum & 0xffff) + (csum >> 16);
869 } 869 }
870 870
871 static unsigned int calc_sb_csum(mdp_super_t * sb) 871 static unsigned int calc_sb_csum(mdp_super_t * sb)
872 { 872 {
873 u64 newcsum = 0; 873 u64 newcsum = 0;
874 u32 *sb32 = (u32*)sb; 874 u32 *sb32 = (u32*)sb;
875 int i; 875 int i;
876 unsigned int disk_csum, csum; 876 unsigned int disk_csum, csum;
877 877
878 disk_csum = sb->sb_csum; 878 disk_csum = sb->sb_csum;
879 sb->sb_csum = 0; 879 sb->sb_csum = 0;
880 880
881 for (i = 0; i < MD_SB_BYTES/4 ; i++) 881 for (i = 0; i < MD_SB_BYTES/4 ; i++)
882 newcsum += sb32[i]; 882 newcsum += sb32[i];
883 csum = (newcsum & 0xffffffff) + (newcsum>>32); 883 csum = (newcsum & 0xffffffff) + (newcsum>>32);
884 884
885 885
886 #ifdef CONFIG_ALPHA 886 #ifdef CONFIG_ALPHA
887 /* This used to use csum_partial, which was wrong for several 887 /* This used to use csum_partial, which was wrong for several
888 * reasons including that different results are returned on 888 * reasons including that different results are returned on
889 * different architectures. It isn't critical that we get exactly 889 * different architectures. It isn't critical that we get exactly
890 * the same return value as before (we always csum_fold before 890 * the same return value as before (we always csum_fold before
891 * testing, and that removes any differences). However as we 891 * testing, and that removes any differences). However as we
892 * know that csum_partial always returned a 16bit value on 892 * know that csum_partial always returned a 16bit value on
893 * alphas, do a fold to maximise conformity to previous behaviour. 893 * alphas, do a fold to maximise conformity to previous behaviour.
894 */ 894 */
895 sb->sb_csum = md_csum_fold(disk_csum); 895 sb->sb_csum = md_csum_fold(disk_csum);
896 #else 896 #else
897 sb->sb_csum = disk_csum; 897 sb->sb_csum = disk_csum;
898 #endif 898 #endif
899 return csum; 899 return csum;
900 } 900 }
901 901
902 902
903 /* 903 /*
904 * Handle superblock details. 904 * Handle superblock details.
905 * We want to be able to handle multiple superblock formats 905 * We want to be able to handle multiple superblock formats
906 * so we have a common interface to them all, and an array of 906 * so we have a common interface to them all, and an array of
907 * different handlers. 907 * different handlers.
908 * We rely on user-space to write the initial superblock, and support 908 * We rely on user-space to write the initial superblock, and support
909 * reading and updating of superblocks. 909 * reading and updating of superblocks.
910 * Interface methods are: 910 * Interface methods are:
911 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version) 911 * int load_super(struct md_rdev *dev, struct md_rdev *refdev, int minor_version)
912 * loads and validates a superblock on dev. 912 * loads and validates a superblock on dev.
913 * if refdev != NULL, compare superblocks on both devices 913 * if refdev != NULL, compare superblocks on both devices
914 * Return: 914 * Return:
915 * 0 - dev has a superblock that is compatible with refdev 915 * 0 - dev has a superblock that is compatible with refdev
916 * 1 - dev has a superblock that is compatible and newer than refdev 916 * 1 - dev has a superblock that is compatible and newer than refdev
917 * so dev should be used as the refdev in future 917 * so dev should be used as the refdev in future
918 * -EINVAL superblock incompatible or invalid 918 * -EINVAL superblock incompatible or invalid
919 * -othererror e.g. -EIO 919 * -othererror e.g. -EIO
920 * 920 *
921 * int validate_super(struct mddev *mddev, struct md_rdev *dev) 921 * int validate_super(struct mddev *mddev, struct md_rdev *dev)
922 * Verify that dev is acceptable into mddev. 922 * Verify that dev is acceptable into mddev.
923 * The first time, mddev->raid_disks will be 0, and data from 923 * The first time, mddev->raid_disks will be 0, and data from
924 * dev should be merged in. Subsequent calls check that dev 924 * dev should be merged in. Subsequent calls check that dev
925 * is new enough. Return 0 or -EINVAL 925 * is new enough. Return 0 or -EINVAL
926 * 926 *
927 * void sync_super(struct mddev *mddev, struct md_rdev *dev) 927 * void sync_super(struct mddev *mddev, struct md_rdev *dev)
928 * Update the superblock for rdev with data in mddev 928 * Update the superblock for rdev with data in mddev
929 * This does not write to disc. 929 * This does not write to disc.
930 * 930 *
931 */ 931 */
932 932
933 struct super_type { 933 struct super_type {
934 char *name; 934 char *name;
935 struct module *owner; 935 struct module *owner;
936 int (*load_super)(struct md_rdev *rdev, 936 int (*load_super)(struct md_rdev *rdev,
937 struct md_rdev *refdev, 937 struct md_rdev *refdev,
938 int minor_version); 938 int minor_version);
939 int (*validate_super)(struct mddev *mddev, 939 int (*validate_super)(struct mddev *mddev,
940 struct md_rdev *rdev); 940 struct md_rdev *rdev);
941 void (*sync_super)(struct mddev *mddev, 941 void (*sync_super)(struct mddev *mddev,
942 struct md_rdev *rdev); 942 struct md_rdev *rdev);
943 unsigned long long (*rdev_size_change)(struct md_rdev *rdev, 943 unsigned long long (*rdev_size_change)(struct md_rdev *rdev,
944 sector_t num_sectors); 944 sector_t num_sectors);
945 int (*allow_new_offset)(struct md_rdev *rdev, 945 int (*allow_new_offset)(struct md_rdev *rdev,
946 unsigned long long new_offset); 946 unsigned long long new_offset);
947 }; 947 };
948 948
949 /* 949 /*
950 * Check that the given mddev has no bitmap. 950 * Check that the given mddev has no bitmap.
951 * 951 *
952 * This function is called from the run method of all personalities that do not 952 * This function is called from the run method of all personalities that do not
953 * support bitmaps. It prints an error message and returns non-zero if mddev 953 * support bitmaps. It prints an error message and returns non-zero if mddev
954 * has a bitmap. Otherwise, it returns 0. 954 * has a bitmap. Otherwise, it returns 0.
955 * 955 *
956 */ 956 */
957 int md_check_no_bitmap(struct mddev *mddev) 957 int md_check_no_bitmap(struct mddev *mddev)
958 { 958 {
959 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset) 959 if (!mddev->bitmap_info.file && !mddev->bitmap_info.offset)
960 return 0; 960 return 0;
961 printk(KERN_ERR "%s: bitmaps are not supported for %s\n", 961 printk(KERN_ERR "%s: bitmaps are not supported for %s\n",
962 mdname(mddev), mddev->pers->name); 962 mdname(mddev), mddev->pers->name);
963 return 1; 963 return 1;
964 } 964 }
965 EXPORT_SYMBOL(md_check_no_bitmap); 965 EXPORT_SYMBOL(md_check_no_bitmap);
966 966
967 /* 967 /*
968 * load_super for 0.90.0 968 * load_super for 0.90.0
969 */ 969 */
970 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 970 static int super_90_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
971 { 971 {
972 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 972 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
973 mdp_super_t *sb; 973 mdp_super_t *sb;
974 int ret; 974 int ret;
975 975
976 /* 976 /*
977 * Calculate the position of the superblock (512-byte sectors); 977 * Calculate the position of the superblock (512-byte sectors);
978 * it's at the end of the disk. 978 * it's at the end of the disk.
979 * 979 *
980 * It also happens to be a multiple of 4KB. 980 * It also happens to be a multiple of 4KB.
981 */ 981 */
982 rdev->sb_start = calc_dev_sboffset(rdev); 982 rdev->sb_start = calc_dev_sboffset(rdev);
983 983
984 ret = read_disk_sb(rdev, MD_SB_BYTES); 984 ret = read_disk_sb(rdev, MD_SB_BYTES);
985 if (ret) return ret; 985 if (ret) return ret;
986 986
987 ret = -EINVAL; 987 ret = -EINVAL;
988 988
989 bdevname(rdev->bdev, b); 989 bdevname(rdev->bdev, b);
990 sb = page_address(rdev->sb_page); 990 sb = page_address(rdev->sb_page);
991 991
992 if (sb->md_magic != MD_SB_MAGIC) { 992 if (sb->md_magic != MD_SB_MAGIC) {
993 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 993 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
994 b); 994 b);
995 goto abort; 995 goto abort;
996 } 996 }
997 997
998 if (sb->major_version != 0 || 998 if (sb->major_version != 0 ||
999 sb->minor_version < 90 || 999 sb->minor_version < 90 ||
1000 sb->minor_version > 91) { 1000 sb->minor_version > 91) {
1001 printk(KERN_WARNING "Bad version number %d.%d on %s\n", 1001 printk(KERN_WARNING "Bad version number %d.%d on %s\n",
1002 sb->major_version, sb->minor_version, 1002 sb->major_version, sb->minor_version,
1003 b); 1003 b);
1004 goto abort; 1004 goto abort;
1005 } 1005 }
1006 1006
1007 if (sb->raid_disks <= 0) 1007 if (sb->raid_disks <= 0)
1008 goto abort; 1008 goto abort;
1009 1009
1010 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) { 1010 if (md_csum_fold(calc_sb_csum(sb)) != md_csum_fold(sb->sb_csum)) {
1011 printk(KERN_WARNING "md: invalid superblock checksum on %s\n", 1011 printk(KERN_WARNING "md: invalid superblock checksum on %s\n",
1012 b); 1012 b);
1013 goto abort; 1013 goto abort;
1014 } 1014 }
1015 1015
1016 rdev->preferred_minor = sb->md_minor; 1016 rdev->preferred_minor = sb->md_minor;
1017 rdev->data_offset = 0; 1017 rdev->data_offset = 0;
1018 rdev->new_data_offset = 0; 1018 rdev->new_data_offset = 0;
1019 rdev->sb_size = MD_SB_BYTES; 1019 rdev->sb_size = MD_SB_BYTES;
1020 rdev->badblocks.shift = -1; 1020 rdev->badblocks.shift = -1;
1021 1021
1022 if (sb->level == LEVEL_MULTIPATH) 1022 if (sb->level == LEVEL_MULTIPATH)
1023 rdev->desc_nr = -1; 1023 rdev->desc_nr = -1;
1024 else 1024 else
1025 rdev->desc_nr = sb->this_disk.number; 1025 rdev->desc_nr = sb->this_disk.number;
1026 1026
1027 if (!refdev) { 1027 if (!refdev) {
1028 ret = 1; 1028 ret = 1;
1029 } else { 1029 } else {
1030 __u64 ev1, ev2; 1030 __u64 ev1, ev2;
1031 mdp_super_t *refsb = page_address(refdev->sb_page); 1031 mdp_super_t *refsb = page_address(refdev->sb_page);
1032 if (!uuid_equal(refsb, sb)) { 1032 if (!uuid_equal(refsb, sb)) {
1033 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1033 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1034 b, bdevname(refdev->bdev,b2)); 1034 b, bdevname(refdev->bdev,b2));
1035 goto abort; 1035 goto abort;
1036 } 1036 }
1037 if (!sb_equal(refsb, sb)) { 1037 if (!sb_equal(refsb, sb)) {
1038 printk(KERN_WARNING "md: %s has same UUID" 1038 printk(KERN_WARNING "md: %s has same UUID"
1039 " but different superblock to %s\n", 1039 " but different superblock to %s\n",
1040 b, bdevname(refdev->bdev, b2)); 1040 b, bdevname(refdev->bdev, b2));
1041 goto abort; 1041 goto abort;
1042 } 1042 }
1043 ev1 = md_event(sb); 1043 ev1 = md_event(sb);
1044 ev2 = md_event(refsb); 1044 ev2 = md_event(refsb);
1045 if (ev1 > ev2) 1045 if (ev1 > ev2)
1046 ret = 1; 1046 ret = 1;
1047 else 1047 else
1048 ret = 0; 1048 ret = 0;
1049 } 1049 }
1050 rdev->sectors = rdev->sb_start; 1050 rdev->sectors = rdev->sb_start;
1051 /* Limit to 4TB as metadata cannot record more than that. 1051 /* Limit to 4TB as metadata cannot record more than that.
1052 * (not needed for Linear and RAID0 as metadata doesn't 1052 * (not needed for Linear and RAID0 as metadata doesn't
1053 * record this size) 1053 * record this size)
1054 */ 1054 */
1055 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1) 1055 if (rdev->sectors >= (2ULL << 32) && sb->level >= 1)
1056 rdev->sectors = (2ULL << 32) - 2; 1056 rdev->sectors = (2ULL << 32) - 2;
1057 1057
1058 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1) 1058 if (rdev->sectors < ((sector_t)sb->size) * 2 && sb->level >= 1)
1059 /* "this cannot possibly happen" ... */ 1059 /* "this cannot possibly happen" ... */
1060 ret = -EINVAL; 1060 ret = -EINVAL;
1061 1061
1062 abort: 1062 abort:
1063 return ret; 1063 return ret;
1064 } 1064 }
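
/*
 * Illustrative sketch (not part of md.c): how the end-of-device placement
 * used by super_90_load() above is conventionally derived, and why
 * rdev->sectors is capped just below 4TB.  It assumes the usual 64K
 * (128-sector) reservation at the end of the device; the helper name is
 * hypothetical.
 */
static sector_t example_sb_90_offset(sector_t dev_sectors)
{
        const sector_t reserved = 128;  /* 64K in 512-byte sectors */

        /* round the device size down to a 64K boundary, then step back one
         * reservation: the 0.90 superblock lives in that final 64K window */
        return (dev_sectors & ~(reserved - 1)) - reserved;
}

/* The 0.90 format records the per-device size as a 32-bit count of 1K
 * blocks, so sizes of 2^32 KB (4TB) or more cannot be represented; that is
 * why the code above clamps rdev->sectors to (2ULL << 32) - 2 sectors for
 * levels that record the size.
 */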
1065 1065
1066 /* 1066 /*
1067 * validate_super for 0.90.0 1067 * validate_super for 0.90.0
1068 */ 1068 */
1069 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev) 1069 static int super_90_validate(struct mddev *mddev, struct md_rdev *rdev)
1070 { 1070 {
1071 mdp_disk_t *desc; 1071 mdp_disk_t *desc;
1072 mdp_super_t *sb = page_address(rdev->sb_page); 1072 mdp_super_t *sb = page_address(rdev->sb_page);
1073 __u64 ev1 = md_event(sb); 1073 __u64 ev1 = md_event(sb);
1074 1074
1075 rdev->raid_disk = -1; 1075 rdev->raid_disk = -1;
1076 clear_bit(Faulty, &rdev->flags); 1076 clear_bit(Faulty, &rdev->flags);
1077 clear_bit(In_sync, &rdev->flags); 1077 clear_bit(In_sync, &rdev->flags);
1078 clear_bit(Bitmap_sync, &rdev->flags); 1078 clear_bit(Bitmap_sync, &rdev->flags);
1079 clear_bit(WriteMostly, &rdev->flags); 1079 clear_bit(WriteMostly, &rdev->flags);
1080 1080
1081 if (mddev->raid_disks == 0) { 1081 if (mddev->raid_disks == 0) {
1082 mddev->major_version = 0; 1082 mddev->major_version = 0;
1083 mddev->minor_version = sb->minor_version; 1083 mddev->minor_version = sb->minor_version;
1084 mddev->patch_version = sb->patch_version; 1084 mddev->patch_version = sb->patch_version;
1085 mddev->external = 0; 1085 mddev->external = 0;
1086 mddev->chunk_sectors = sb->chunk_size >> 9; 1086 mddev->chunk_sectors = sb->chunk_size >> 9;
1087 mddev->ctime = sb->ctime; 1087 mddev->ctime = sb->ctime;
1088 mddev->utime = sb->utime; 1088 mddev->utime = sb->utime;
1089 mddev->level = sb->level; 1089 mddev->level = sb->level;
1090 mddev->clevel[0] = 0; 1090 mddev->clevel[0] = 0;
1091 mddev->layout = sb->layout; 1091 mddev->layout = sb->layout;
1092 mddev->raid_disks = sb->raid_disks; 1092 mddev->raid_disks = sb->raid_disks;
1093 mddev->dev_sectors = ((sector_t)sb->size) * 2; 1093 mddev->dev_sectors = ((sector_t)sb->size) * 2;
1094 mddev->events = ev1; 1094 mddev->events = ev1;
1095 mddev->bitmap_info.offset = 0; 1095 mddev->bitmap_info.offset = 0;
1096 mddev->bitmap_info.space = 0; 1096 mddev->bitmap_info.space = 0;
1097 /* bitmap can use 60K after the 4K superblock */ 1097 /* bitmap can use 60K after the 4K superblock */
1098 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9; 1098 mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
1099 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9); 1099 mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
1100 mddev->reshape_backwards = 0; 1100 mddev->reshape_backwards = 0;
1101 1101
1102 if (mddev->minor_version >= 91) { 1102 if (mddev->minor_version >= 91) {
1103 mddev->reshape_position = sb->reshape_position; 1103 mddev->reshape_position = sb->reshape_position;
1104 mddev->delta_disks = sb->delta_disks; 1104 mddev->delta_disks = sb->delta_disks;
1105 mddev->new_level = sb->new_level; 1105 mddev->new_level = sb->new_level;
1106 mddev->new_layout = sb->new_layout; 1106 mddev->new_layout = sb->new_layout;
1107 mddev->new_chunk_sectors = sb->new_chunk >> 9; 1107 mddev->new_chunk_sectors = sb->new_chunk >> 9;
1108 if (mddev->delta_disks < 0) 1108 if (mddev->delta_disks < 0)
1109 mddev->reshape_backwards = 1; 1109 mddev->reshape_backwards = 1;
1110 } else { 1110 } else {
1111 mddev->reshape_position = MaxSector; 1111 mddev->reshape_position = MaxSector;
1112 mddev->delta_disks = 0; 1112 mddev->delta_disks = 0;
1113 mddev->new_level = mddev->level; 1113 mddev->new_level = mddev->level;
1114 mddev->new_layout = mddev->layout; 1114 mddev->new_layout = mddev->layout;
1115 mddev->new_chunk_sectors = mddev->chunk_sectors; 1115 mddev->new_chunk_sectors = mddev->chunk_sectors;
1116 } 1116 }
1117 1117
1118 if (sb->state & (1<<MD_SB_CLEAN)) 1118 if (sb->state & (1<<MD_SB_CLEAN))
1119 mddev->recovery_cp = MaxSector; 1119 mddev->recovery_cp = MaxSector;
1120 else { 1120 else {
1121 if (sb->events_hi == sb->cp_events_hi && 1121 if (sb->events_hi == sb->cp_events_hi &&
1122 sb->events_lo == sb->cp_events_lo) { 1122 sb->events_lo == sb->cp_events_lo) {
1123 mddev->recovery_cp = sb->recovery_cp; 1123 mddev->recovery_cp = sb->recovery_cp;
1124 } else 1124 } else
1125 mddev->recovery_cp = 0; 1125 mddev->recovery_cp = 0;
1126 } 1126 }
1127 1127
1128 memcpy(mddev->uuid+0, &sb->set_uuid0, 4); 1128 memcpy(mddev->uuid+0, &sb->set_uuid0, 4);
1129 memcpy(mddev->uuid+4, &sb->set_uuid1, 4); 1129 memcpy(mddev->uuid+4, &sb->set_uuid1, 4);
1130 memcpy(mddev->uuid+8, &sb->set_uuid2, 4); 1130 memcpy(mddev->uuid+8, &sb->set_uuid2, 4);
1131 memcpy(mddev->uuid+12,&sb->set_uuid3, 4); 1131 memcpy(mddev->uuid+12,&sb->set_uuid3, 4);
1132 1132
1133 mddev->max_disks = MD_SB_DISKS; 1133 mddev->max_disks = MD_SB_DISKS;
1134 1134
1135 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) && 1135 if (sb->state & (1<<MD_SB_BITMAP_PRESENT) &&
1136 mddev->bitmap_info.file == NULL) { 1136 mddev->bitmap_info.file == NULL) {
1137 mddev->bitmap_info.offset = 1137 mddev->bitmap_info.offset =
1138 mddev->bitmap_info.default_offset; 1138 mddev->bitmap_info.default_offset;
1139 mddev->bitmap_info.space = 1139 mddev->bitmap_info.space =
1140 mddev->bitmap_info.default_space; 1140 mddev->bitmap_info.default_space;
1141 } 1141 }
1142 1142
1143 } else if (mddev->pers == NULL) { 1143 } else if (mddev->pers == NULL) {
1144 /* Insist on a good event counter while assembling, except 1144 /* Insist on a good event counter while assembling, except
1145 * for spares (which don't need an event count) */ 1145 * for spares (which don't need an event count) */
1146 ++ev1; 1146 ++ev1;
1147 if (sb->disks[rdev->desc_nr].state & ( 1147 if (sb->disks[rdev->desc_nr].state & (
1148 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE))) 1148 (1<<MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE)))
1149 if (ev1 < mddev->events) 1149 if (ev1 < mddev->events)
1150 return -EINVAL; 1150 return -EINVAL;
1151 } else if (mddev->bitmap) { 1151 } else if (mddev->bitmap) {
1152 /* if adding to array with a bitmap, then we can accept an 1152 /* if adding to array with a bitmap, then we can accept an
1153 * older device ... but not too old. 1153 * older device ... but not too old.
1154 */ 1154 */
1155 if (ev1 < mddev->bitmap->events_cleared) 1155 if (ev1 < mddev->bitmap->events_cleared)
1156 return 0; 1156 return 0;
1157 if (ev1 < mddev->events) 1157 if (ev1 < mddev->events)
1158 set_bit(Bitmap_sync, &rdev->flags); 1158 set_bit(Bitmap_sync, &rdev->flags);
1159 } else { 1159 } else {
1160 if (ev1 < mddev->events) 1160 if (ev1 < mddev->events)
1161 /* just a hot-add of a new device, leave raid_disk at -1 */ 1161 /* just a hot-add of a new device, leave raid_disk at -1 */
1162 return 0; 1162 return 0;
1163 } 1163 }
1164 1164
1165 if (mddev->level != LEVEL_MULTIPATH) { 1165 if (mddev->level != LEVEL_MULTIPATH) {
1166 desc = sb->disks + rdev->desc_nr; 1166 desc = sb->disks + rdev->desc_nr;
1167 1167
1168 if (desc->state & (1<<MD_DISK_FAULTY)) 1168 if (desc->state & (1<<MD_DISK_FAULTY))
1169 set_bit(Faulty, &rdev->flags); 1169 set_bit(Faulty, &rdev->flags);
1170 else if (desc->state & (1<<MD_DISK_SYNC) /* && 1170 else if (desc->state & (1<<MD_DISK_SYNC) /* &&
1171 desc->raid_disk < mddev->raid_disks */) { 1171 desc->raid_disk < mddev->raid_disks */) {
1172 set_bit(In_sync, &rdev->flags); 1172 set_bit(In_sync, &rdev->flags);
1173 rdev->raid_disk = desc->raid_disk; 1173 rdev->raid_disk = desc->raid_disk;
1174 rdev->saved_raid_disk = desc->raid_disk; 1174 rdev->saved_raid_disk = desc->raid_disk;
1175 } else if (desc->state & (1<<MD_DISK_ACTIVE)) { 1175 } else if (desc->state & (1<<MD_DISK_ACTIVE)) {
1176 /* active but not in sync implies recovery up to 1176 /* active but not in sync implies recovery up to
1177 * reshape position. We don't know exactly where 1177 * reshape position. We don't know exactly where
1178 * that is, so set to zero for now */ 1178 * that is, so set to zero for now */
1179 if (mddev->minor_version >= 91) { 1179 if (mddev->minor_version >= 91) {
1180 rdev->recovery_offset = 0; 1180 rdev->recovery_offset = 0;
1181 rdev->raid_disk = desc->raid_disk; 1181 rdev->raid_disk = desc->raid_disk;
1182 } 1182 }
1183 } 1183 }
1184 if (desc->state & (1<<MD_DISK_WRITEMOSTLY)) 1184 if (desc->state & (1<<MD_DISK_WRITEMOSTLY))
1185 set_bit(WriteMostly, &rdev->flags); 1185 set_bit(WriteMostly, &rdev->flags);
1186 } else /* MULTIPATH are always insync */ 1186 } else /* MULTIPATH are always insync */
1187 set_bit(In_sync, &rdev->flags); 1187 set_bit(In_sync, &rdev->flags);
1188 return 0; 1188 return 0;
1189 } 1189 }
1190 1190
1191 /* 1191 /*
1192 * sync_super for 0.90.0 1192 * sync_super for 0.90.0
1193 */ 1193 */
1194 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) 1194 static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1195 { 1195 {
1196 mdp_super_t *sb; 1196 mdp_super_t *sb;
1197 struct md_rdev *rdev2; 1197 struct md_rdev *rdev2;
1198 int next_spare = mddev->raid_disks; 1198 int next_spare = mddev->raid_disks;
1199 1199
1200 1200
1201 /* make rdev->sb match mddev data.. 1201 /* make rdev->sb match mddev data..
1202 * 1202 *
1203 * 1/ zero out disks 1203 * 1/ zero out disks
1204 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare); 1204 * 2/ Add info for each disk, keeping track of highest desc_nr (next_spare);
1205 * 3/ any empty disks < next_spare become removed 1205 * 3/ any empty disks < next_spare become removed
1206 * 1206 *
1207 * disks[0] gets initialised to REMOVED because 1207 * disks[0] gets initialised to REMOVED because
1208 * we cannot be sure from other fields if it has 1208 * we cannot be sure from other fields if it has
1209 * been initialised or not. 1209 * been initialised or not.
1210 */ 1210 */
1211 int i; 1211 int i;
1212 int active=0, working=0,failed=0,spare=0,nr_disks=0; 1212 int active=0, working=0,failed=0,spare=0,nr_disks=0;
1213 1213
1214 rdev->sb_size = MD_SB_BYTES; 1214 rdev->sb_size = MD_SB_BYTES;
1215 1215
1216 sb = page_address(rdev->sb_page); 1216 sb = page_address(rdev->sb_page);
1217 1217
1218 memset(sb, 0, sizeof(*sb)); 1218 memset(sb, 0, sizeof(*sb));
1219 1219
1220 sb->md_magic = MD_SB_MAGIC; 1220 sb->md_magic = MD_SB_MAGIC;
1221 sb->major_version = mddev->major_version; 1221 sb->major_version = mddev->major_version;
1222 sb->patch_version = mddev->patch_version; 1222 sb->patch_version = mddev->patch_version;
1223 sb->gvalid_words = 0; /* ignored */ 1223 sb->gvalid_words = 0; /* ignored */
1224 memcpy(&sb->set_uuid0, mddev->uuid+0, 4); 1224 memcpy(&sb->set_uuid0, mddev->uuid+0, 4);
1225 memcpy(&sb->set_uuid1, mddev->uuid+4, 4); 1225 memcpy(&sb->set_uuid1, mddev->uuid+4, 4);
1226 memcpy(&sb->set_uuid2, mddev->uuid+8, 4); 1226 memcpy(&sb->set_uuid2, mddev->uuid+8, 4);
1227 memcpy(&sb->set_uuid3, mddev->uuid+12,4); 1227 memcpy(&sb->set_uuid3, mddev->uuid+12,4);
1228 1228
1229 sb->ctime = mddev->ctime; 1229 sb->ctime = mddev->ctime;
1230 sb->level = mddev->level; 1230 sb->level = mddev->level;
1231 sb->size = mddev->dev_sectors / 2; 1231 sb->size = mddev->dev_sectors / 2;
1232 sb->raid_disks = mddev->raid_disks; 1232 sb->raid_disks = mddev->raid_disks;
1233 sb->md_minor = mddev->md_minor; 1233 sb->md_minor = mddev->md_minor;
1234 sb->not_persistent = 0; 1234 sb->not_persistent = 0;
1235 sb->utime = mddev->utime; 1235 sb->utime = mddev->utime;
1236 sb->state = 0; 1236 sb->state = 0;
1237 sb->events_hi = (mddev->events>>32); 1237 sb->events_hi = (mddev->events>>32);
1238 sb->events_lo = (u32)mddev->events; 1238 sb->events_lo = (u32)mddev->events;
1239 1239
1240 if (mddev->reshape_position == MaxSector) 1240 if (mddev->reshape_position == MaxSector)
1241 sb->minor_version = 90; 1241 sb->minor_version = 90;
1242 else { 1242 else {
1243 sb->minor_version = 91; 1243 sb->minor_version = 91;
1244 sb->reshape_position = mddev->reshape_position; 1244 sb->reshape_position = mddev->reshape_position;
1245 sb->new_level = mddev->new_level; 1245 sb->new_level = mddev->new_level;
1246 sb->delta_disks = mddev->delta_disks; 1246 sb->delta_disks = mddev->delta_disks;
1247 sb->new_layout = mddev->new_layout; 1247 sb->new_layout = mddev->new_layout;
1248 sb->new_chunk = mddev->new_chunk_sectors << 9; 1248 sb->new_chunk = mddev->new_chunk_sectors << 9;
1249 } 1249 }
1250 mddev->minor_version = sb->minor_version; 1250 mddev->minor_version = sb->minor_version;
1251 if (mddev->in_sync) 1251 if (mddev->in_sync)
1252 { 1252 {
1253 sb->recovery_cp = mddev->recovery_cp; 1253 sb->recovery_cp = mddev->recovery_cp;
1254 sb->cp_events_hi = (mddev->events>>32); 1254 sb->cp_events_hi = (mddev->events>>32);
1255 sb->cp_events_lo = (u32)mddev->events; 1255 sb->cp_events_lo = (u32)mddev->events;
1256 if (mddev->recovery_cp == MaxSector) 1256 if (mddev->recovery_cp == MaxSector)
1257 sb->state = (1<< MD_SB_CLEAN); 1257 sb->state = (1<< MD_SB_CLEAN);
1258 } else 1258 } else
1259 sb->recovery_cp = 0; 1259 sb->recovery_cp = 0;
1260 1260
1261 sb->layout = mddev->layout; 1261 sb->layout = mddev->layout;
1262 sb->chunk_size = mddev->chunk_sectors << 9; 1262 sb->chunk_size = mddev->chunk_sectors << 9;
1263 1263
1264 if (mddev->bitmap && mddev->bitmap_info.file == NULL) 1264 if (mddev->bitmap && mddev->bitmap_info.file == NULL)
1265 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1265 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1266 1266
1267 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1267 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1268 rdev_for_each(rdev2, mddev) { 1268 rdev_for_each(rdev2, mddev) {
1269 mdp_disk_t *d; 1269 mdp_disk_t *d;
1270 int desc_nr; 1270 int desc_nr;
1271 int is_active = test_bit(In_sync, &rdev2->flags); 1271 int is_active = test_bit(In_sync, &rdev2->flags);
1272 1272
1273 if (rdev2->raid_disk >= 0 && 1273 if (rdev2->raid_disk >= 0 &&
1274 sb->minor_version >= 91) 1274 sb->minor_version >= 91)
1275 /* we have nowhere to store the recovery_offset, 1275 /* we have nowhere to store the recovery_offset,
1276 * but if it is not below the reshape_position, 1276 * but if it is not below the reshape_position,
1277 * we can piggy-back on that. 1277 * we can piggy-back on that.
1278 */ 1278 */
1279 is_active = 1; 1279 is_active = 1;
1280 if (rdev2->raid_disk < 0 || 1280 if (rdev2->raid_disk < 0 ||
1281 test_bit(Faulty, &rdev2->flags)) 1281 test_bit(Faulty, &rdev2->flags))
1282 is_active = 0; 1282 is_active = 0;
1283 if (is_active) 1283 if (is_active)
1284 desc_nr = rdev2->raid_disk; 1284 desc_nr = rdev2->raid_disk;
1285 else 1285 else
1286 desc_nr = next_spare++; 1286 desc_nr = next_spare++;
1287 rdev2->desc_nr = desc_nr; 1287 rdev2->desc_nr = desc_nr;
1288 d = &sb->disks[rdev2->desc_nr]; 1288 d = &sb->disks[rdev2->desc_nr];
1289 nr_disks++; 1289 nr_disks++;
1290 d->number = rdev2->desc_nr; 1290 d->number = rdev2->desc_nr;
1291 d->major = MAJOR(rdev2->bdev->bd_dev); 1291 d->major = MAJOR(rdev2->bdev->bd_dev);
1292 d->minor = MINOR(rdev2->bdev->bd_dev); 1292 d->minor = MINOR(rdev2->bdev->bd_dev);
1293 if (is_active) 1293 if (is_active)
1294 d->raid_disk = rdev2->raid_disk; 1294 d->raid_disk = rdev2->raid_disk;
1295 else 1295 else
1296 d->raid_disk = rdev2->desc_nr; /* compatibility */ 1296 d->raid_disk = rdev2->desc_nr; /* compatibility */
1297 if (test_bit(Faulty, &rdev2->flags)) 1297 if (test_bit(Faulty, &rdev2->flags))
1298 d->state = (1<<MD_DISK_FAULTY); 1298 d->state = (1<<MD_DISK_FAULTY);
1299 else if (is_active) { 1299 else if (is_active) {
1300 d->state = (1<<MD_DISK_ACTIVE); 1300 d->state = (1<<MD_DISK_ACTIVE);
1301 if (test_bit(In_sync, &rdev2->flags)) 1301 if (test_bit(In_sync, &rdev2->flags))
1302 d->state |= (1<<MD_DISK_SYNC); 1302 d->state |= (1<<MD_DISK_SYNC);
1303 active++; 1303 active++;
1304 working++; 1304 working++;
1305 } else { 1305 } else {
1306 d->state = 0; 1306 d->state = 0;
1307 spare++; 1307 spare++;
1308 working++; 1308 working++;
1309 } 1309 }
1310 if (test_bit(WriteMostly, &rdev2->flags)) 1310 if (test_bit(WriteMostly, &rdev2->flags))
1311 d->state |= (1<<MD_DISK_WRITEMOSTLY); 1311 d->state |= (1<<MD_DISK_WRITEMOSTLY);
1312 } 1312 }
1313 /* now set the "removed" and "faulty" bits on any missing devices */ 1313 /* now set the "removed" and "faulty" bits on any missing devices */
1314 for (i=0 ; i < mddev->raid_disks ; i++) { 1314 for (i=0 ; i < mddev->raid_disks ; i++) {
1315 mdp_disk_t *d = &sb->disks[i]; 1315 mdp_disk_t *d = &sb->disks[i];
1316 if (d->state == 0 && d->number == 0) { 1316 if (d->state == 0 && d->number == 0) {
1317 d->number = i; 1317 d->number = i;
1318 d->raid_disk = i; 1318 d->raid_disk = i;
1319 d->state = (1<<MD_DISK_REMOVED); 1319 d->state = (1<<MD_DISK_REMOVED);
1320 d->state |= (1<<MD_DISK_FAULTY); 1320 d->state |= (1<<MD_DISK_FAULTY);
1321 failed++; 1321 failed++;
1322 } 1322 }
1323 } 1323 }
1324 sb->nr_disks = nr_disks; 1324 sb->nr_disks = nr_disks;
1325 sb->active_disks = active; 1325 sb->active_disks = active;
1326 sb->working_disks = working; 1326 sb->working_disks = working;
1327 sb->failed_disks = failed; 1327 sb->failed_disks = failed;
1328 sb->spare_disks = spare; 1328 sb->spare_disks = spare;
1329 1329
1330 sb->this_disk = sb->disks[rdev->desc_nr]; 1330 sb->this_disk = sb->disks[rdev->desc_nr];
1331 sb->sb_csum = calc_sb_csum(sb); 1331 sb->sb_csum = calc_sb_csum(sb);
1332 } 1332 }
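
/*
 * Illustrative sketch (not part of md.c): the descriptor-number rule that
 * super_90_sync() above applies while rebuilding sb->disks[].  Active
 * members reuse their raid slot as the descriptor index; spares and faulty
 * devices are appended after the member slots, starting at raid_disks.
 * The helper name is hypothetical.
 */
static int example_assign_desc_nr(int raid_disk, int is_active, int *next_spare)
{
        if (is_active)
                return raid_disk;       /* members keep their slot number */
        return (*next_spare)++;         /* others are numbered from raid_disks up */
}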
1333 1333
1334 /* 1334 /*
1335 * rdev_size_change for 0.90.0 1335 * rdev_size_change for 0.90.0
1336 */ 1336 */
1337 static unsigned long long 1337 static unsigned long long
1338 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1338 super_90_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1339 { 1339 {
1340 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1340 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1341 return 0; /* component must fit device */ 1341 return 0; /* component must fit device */
1342 if (rdev->mddev->bitmap_info.offset) 1342 if (rdev->mddev->bitmap_info.offset)
1343 return 0; /* can't move bitmap */ 1343 return 0; /* can't move bitmap */
1344 rdev->sb_start = calc_dev_sboffset(rdev); 1344 rdev->sb_start = calc_dev_sboffset(rdev);
1345 if (!num_sectors || num_sectors > rdev->sb_start) 1345 if (!num_sectors || num_sectors > rdev->sb_start)
1346 num_sectors = rdev->sb_start; 1346 num_sectors = rdev->sb_start;
1347 /* Limit to 4TB as metadata cannot record more than that. 1347 /* Limit to 4TB as metadata cannot record more than that.
1348 * 4TB == 2^32 KB, or 2*2^32 sectors. 1348 * 4TB == 2^32 KB, or 2*2^32 sectors.
1349 */ 1349 */
1350 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1) 1350 if (num_sectors >= (2ULL << 32) && rdev->mddev->level >= 1)
1351 num_sectors = (2ULL << 32) - 2; 1351 num_sectors = (2ULL << 32) - 2;
1352 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1352 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1353 rdev->sb_page); 1353 rdev->sb_page);
1354 md_super_wait(rdev->mddev); 1354 md_super_wait(rdev->mddev);
1355 return num_sectors; 1355 return num_sectors;
1356 } 1356 }
1357 1357
1358 static int 1358 static int
1359 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset) 1359 super_90_allow_new_offset(struct md_rdev *rdev, unsigned long long new_offset)
1360 { 1360 {
1361 /* non-zero offset changes not possible with v0.90 */ 1361 /* non-zero offset changes not possible with v0.90 */
1362 return new_offset == 0; 1362 return new_offset == 0;
1363 } 1363 }
1364 1364
1365 /* 1365 /*
1366 * version 1 superblock 1366 * version 1 superblock
1367 */ 1367 */
1368 1368
1369 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) 1369 static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1370 { 1370 {
1371 __le32 disk_csum; 1371 __le32 disk_csum;
1372 u32 csum; 1372 u32 csum;
1373 unsigned long long newcsum; 1373 unsigned long long newcsum;
1374 int size = 256 + le32_to_cpu(sb->max_dev)*2; 1374 int size = 256 + le32_to_cpu(sb->max_dev)*2;
1375 __le32 *isuper = (__le32*)sb; 1375 __le32 *isuper = (__le32*)sb;
1376 1376
1377 disk_csum = sb->sb_csum; 1377 disk_csum = sb->sb_csum;
1378 sb->sb_csum = 0; 1378 sb->sb_csum = 0;
1379 newcsum = 0; 1379 newcsum = 0;
1380 for (; size >= 4; size -= 4) 1380 for (; size >= 4; size -= 4)
1381 newcsum += le32_to_cpu(*isuper++); 1381 newcsum += le32_to_cpu(*isuper++);
1382 1382
1383 if (size == 2) 1383 if (size == 2)
1384 newcsum += le16_to_cpu(*(__le16*) isuper); 1384 newcsum += le16_to_cpu(*(__le16*) isuper);
1385 1385
1386 csum = (newcsum & 0xffffffff) + (newcsum >> 32); 1386 csum = (newcsum & 0xffffffff) + (newcsum >> 32);
1387 sb->sb_csum = disk_csum; 1387 sb->sb_csum = disk_csum;
1388 return cpu_to_le32(csum); 1388 return cpu_to_le32(csum);
1389 } 1389 }
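
/*
 * Illustrative sketch (not part of md.c): the same folding checksum as
 * calc_sb_1_csum() above, written over a plain buffer so the arithmetic is
 * easier to follow.  'len' is 256 bytes of fixed superblock plus two bytes
 * per possible device, and the sb_csum field is assumed to already be zero
 * in the buffer.  The function name is hypothetical.
 */
static u32 example_sb_1_csum(const __le32 *buf, int len)
{
        unsigned long long sum = 0;

        for (; len >= 4; len -= 4)              /* sum every full 32-bit word */
                sum += le32_to_cpu(*buf++);
        if (len == 2)                           /* the 2-byte roles can leave a half word */
                sum += le16_to_cpu(*(const __le16 *)buf);

        /* fold the 64-bit running sum back into 32 bits */
        return (sum & 0xffffffff) + (sum >> 32);
}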
1390 1390
1391 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 1391 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1392 int acknowledged); 1392 int acknowledged);
1393 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version) 1393 static int super_1_load(struct md_rdev *rdev, struct md_rdev *refdev, int minor_version)
1394 { 1394 {
1395 struct mdp_superblock_1 *sb; 1395 struct mdp_superblock_1 *sb;
1396 int ret; 1396 int ret;
1397 sector_t sb_start; 1397 sector_t sb_start;
1398 sector_t sectors; 1398 sector_t sectors;
1399 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 1399 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
1400 int bmask; 1400 int bmask;
1401 1401
1402 /* 1402 /*
1403 * Calculate the position of the superblock in 512-byte sectors. 1403 * Calculate the position of the superblock in 512-byte sectors.
1404 * It is always aligned to a 4K boundary and 1404 * It is always aligned to a 4K boundary and
1405 * depending on minor_version, it can be: 1405 * depending on minor_version, it can be:
1406 * 0: At least 8K, but less than 12K, from end of device 1406 * 0: At least 8K, but less than 12K, from end of device
1407 * 1: At start of device 1407 * 1: At start of device
1408 * 2: 4K from start of device. 1408 * 2: 4K from start of device.
1409 */ 1409 */
1410 switch(minor_version) { 1410 switch(minor_version) {
1411 case 0: 1411 case 0:
1412 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9; 1412 sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
1413 sb_start -= 8*2; 1413 sb_start -= 8*2;
1414 sb_start &= ~(sector_t)(4*2-1); 1414 sb_start &= ~(sector_t)(4*2-1);
1415 break; 1415 break;
1416 case 1: 1416 case 1:
1417 sb_start = 0; 1417 sb_start = 0;
1418 break; 1418 break;
1419 case 2: 1419 case 2:
1420 sb_start = 8; 1420 sb_start = 8;
1421 break; 1421 break;
1422 default: 1422 default:
1423 return -EINVAL; 1423 return -EINVAL;
1424 } 1424 }
1425 rdev->sb_start = sb_start; 1425 rdev->sb_start = sb_start;
1426 1426
1427 /* superblock is rarely larger than 1K, but it can be larger, 1427 /* superblock is rarely larger than 1K, but it can be larger,
1428 * and it is safe to read 4k, so we do that 1428 * and it is safe to read 4k, so we do that
1429 */ 1429 */
1430 ret = read_disk_sb(rdev, 4096); 1430 ret = read_disk_sb(rdev, 4096);
1431 if (ret) return ret; 1431 if (ret) return ret;
1432 1432
1433 1433
1434 sb = page_address(rdev->sb_page); 1434 sb = page_address(rdev->sb_page);
1435 1435
1436 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1436 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1437 sb->major_version != cpu_to_le32(1) || 1437 sb->major_version != cpu_to_le32(1) ||
1438 le32_to_cpu(sb->max_dev) > (4096-256)/2 || 1438 le32_to_cpu(sb->max_dev) > (4096-256)/2 ||
1439 le64_to_cpu(sb->super_offset) != rdev->sb_start || 1439 le64_to_cpu(sb->super_offset) != rdev->sb_start ||
1440 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0) 1440 (le32_to_cpu(sb->feature_map) & ~MD_FEATURE_ALL) != 0)
1441 return -EINVAL; 1441 return -EINVAL;
1442 1442
1443 if (calc_sb_1_csum(sb) != sb->sb_csum) { 1443 if (calc_sb_1_csum(sb) != sb->sb_csum) {
1444 printk("md: invalid superblock checksum on %s\n", 1444 printk("md: invalid superblock checksum on %s\n",
1445 bdevname(rdev->bdev,b)); 1445 bdevname(rdev->bdev,b));
1446 return -EINVAL; 1446 return -EINVAL;
1447 } 1447 }
1448 if (le64_to_cpu(sb->data_size) < 10) { 1448 if (le64_to_cpu(sb->data_size) < 10) {
1449 printk("md: data_size too small on %s\n", 1449 printk("md: data_size too small on %s\n",
1450 bdevname(rdev->bdev,b)); 1450 bdevname(rdev->bdev,b));
1451 return -EINVAL; 1451 return -EINVAL;
1452 } 1452 }
1453 if (sb->pad0 || 1453 if (sb->pad0 ||
1454 sb->pad3[0] || 1454 sb->pad3[0] ||
1455 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1]))) 1455 memcmp(sb->pad3, sb->pad3+1, sizeof(sb->pad3) - sizeof(sb->pad3[1])))
1456 /* Some padding is non-zero, might be a new feature */ 1456 /* Some padding is non-zero, might be a new feature */
1457 return -EINVAL; 1457 return -EINVAL;
1458 1458
1459 rdev->preferred_minor = 0xffff; 1459 rdev->preferred_minor = 0xffff;
1460 rdev->data_offset = le64_to_cpu(sb->data_offset); 1460 rdev->data_offset = le64_to_cpu(sb->data_offset);
1461 rdev->new_data_offset = rdev->data_offset; 1461 rdev->new_data_offset = rdev->data_offset;
1462 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) && 1462 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE) &&
1463 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) 1463 (le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
1464 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset); 1464 rdev->new_data_offset += (s32)le32_to_cpu(sb->new_offset);
1465 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); 1465 atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read));
1466 1466
1467 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; 1467 rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256;
1468 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1468 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1469 if (rdev->sb_size & bmask) 1469 if (rdev->sb_size & bmask)
1470 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1470 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1471 1471
1472 if (minor_version 1472 if (minor_version
1473 && rdev->data_offset < sb_start + (rdev->sb_size/512)) 1473 && rdev->data_offset < sb_start + (rdev->sb_size/512))
1474 return -EINVAL; 1474 return -EINVAL;
1475 if (minor_version 1475 if (minor_version
1476 && rdev->new_data_offset < sb_start + (rdev->sb_size/512)) 1476 && rdev->new_data_offset < sb_start + (rdev->sb_size/512))
1477 return -EINVAL; 1477 return -EINVAL;
1478 1478
1479 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH)) 1479 if (sb->level == cpu_to_le32(LEVEL_MULTIPATH))
1480 rdev->desc_nr = -1; 1480 rdev->desc_nr = -1;
1481 else 1481 else
1482 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1482 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1483 1483
1484 if (!rdev->bb_page) { 1484 if (!rdev->bb_page) {
1485 rdev->bb_page = alloc_page(GFP_KERNEL); 1485 rdev->bb_page = alloc_page(GFP_KERNEL);
1486 if (!rdev->bb_page) 1486 if (!rdev->bb_page)
1487 return -ENOMEM; 1487 return -ENOMEM;
1488 } 1488 }
1489 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && 1489 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1490 rdev->badblocks.count == 0) { 1490 rdev->badblocks.count == 0) {
1491 /* need to load the bad block list. 1491 /* need to load the bad block list.
1492 * Currently we limit it to one page. 1492 * Currently we limit it to one page.
1493 */ 1493 */
1494 s32 offset; 1494 s32 offset;
1495 sector_t bb_sector; 1495 sector_t bb_sector;
1496 u64 *bbp; 1496 u64 *bbp;
1497 int i; 1497 int i;
1498 int sectors = le16_to_cpu(sb->bblog_size); 1498 int sectors = le16_to_cpu(sb->bblog_size);
1499 if (sectors > (PAGE_SIZE / 512)) 1499 if (sectors > (PAGE_SIZE / 512))
1500 return -EINVAL; 1500 return -EINVAL;
1501 offset = le32_to_cpu(sb->bblog_offset); 1501 offset = le32_to_cpu(sb->bblog_offset);
1502 if (offset == 0) 1502 if (offset == 0)
1503 return -EINVAL; 1503 return -EINVAL;
1504 bb_sector = (long long)offset; 1504 bb_sector = (long long)offset;
1505 if (!sync_page_io(rdev, bb_sector, sectors << 9, 1505 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1506 rdev->bb_page, READ, true)) 1506 rdev->bb_page, READ, true))
1507 return -EIO; 1507 return -EIO;
1508 bbp = (u64 *)page_address(rdev->bb_page); 1508 bbp = (u64 *)page_address(rdev->bb_page);
1509 rdev->badblocks.shift = sb->bblog_shift; 1509 rdev->badblocks.shift = sb->bblog_shift;
1510 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { 1510 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1511 u64 bb = le64_to_cpu(*bbp); 1511 u64 bb = le64_to_cpu(*bbp);
1512 int count = bb & (0x3ff); 1512 int count = bb & (0x3ff);
1513 u64 sector = bb >> 10; 1513 u64 sector = bb >> 10;
1514 sector <<= sb->bblog_shift; 1514 sector <<= sb->bblog_shift;
1515 count <<= sb->bblog_shift; 1515 count <<= sb->bblog_shift;
1516 if (bb + 1 == 0) 1516 if (bb + 1 == 0)
1517 break; 1517 break;
1518 if (md_set_badblocks(&rdev->badblocks, 1518 if (md_set_badblocks(&rdev->badblocks,
1519 sector, count, 1) == 0) 1519 sector, count, 1) == 0)
1520 return -EINVAL; 1520 return -EINVAL;
1521 } 1521 }
1522 } else if (sb->bblog_offset != 0) 1522 } else if (sb->bblog_offset != 0)
1523 rdev->badblocks.shift = 0; 1523 rdev->badblocks.shift = 0;
1524 1524
1525 if (!refdev) { 1525 if (!refdev) {
1526 ret = 1; 1526 ret = 1;
1527 } else { 1527 } else {
1528 __u64 ev1, ev2; 1528 __u64 ev1, ev2;
1529 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); 1529 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1530 1530
1531 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1531 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1532 sb->level != refsb->level || 1532 sb->level != refsb->level ||
1533 sb->layout != refsb->layout || 1533 sb->layout != refsb->layout ||
1534 sb->chunksize != refsb->chunksize) { 1534 sb->chunksize != refsb->chunksize) {
1535 printk(KERN_WARNING "md: %s has strangely different" 1535 printk(KERN_WARNING "md: %s has strangely different"
1536 " superblock to %s\n", 1536 " superblock to %s\n",
1537 bdevname(rdev->bdev,b), 1537 bdevname(rdev->bdev,b),
1538 bdevname(refdev->bdev,b2)); 1538 bdevname(refdev->bdev,b2));
1539 return -EINVAL; 1539 return -EINVAL;
1540 } 1540 }
1541 ev1 = le64_to_cpu(sb->events); 1541 ev1 = le64_to_cpu(sb->events);
1542 ev2 = le64_to_cpu(refsb->events); 1542 ev2 = le64_to_cpu(refsb->events);
1543 1543
1544 if (ev1 > ev2) 1544 if (ev1 > ev2)
1545 ret = 1; 1545 ret = 1;
1546 else 1546 else
1547 ret = 0; 1547 ret = 0;
1548 } 1548 }
1549 if (minor_version) { 1549 if (minor_version) {
1550 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9); 1550 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9);
1551 sectors -= rdev->data_offset; 1551 sectors -= rdev->data_offset;
1552 } else 1552 } else
1553 sectors = rdev->sb_start; 1553 sectors = rdev->sb_start;
1554 if (sectors < le64_to_cpu(sb->data_size)) 1554 if (sectors < le64_to_cpu(sb->data_size))
1555 return -EINVAL; 1555 return -EINVAL;
1556 rdev->sectors = le64_to_cpu(sb->data_size); 1556 rdev->sectors = le64_to_cpu(sb->data_size);
1557 return ret; 1557 return ret;
1558 } 1558 }
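
/*
 * Illustrative sketch (not part of md.c): where super_1_load() above looks
 * for the v1.x superblock for each minor version, written as a standalone
 * helper.  The name is hypothetical; the arithmetic mirrors the switch at
 * the top of super_1_load().
 */
static sector_t example_sb_1_offset(int minor_version, sector_t dev_sectors)
{
        switch (minor_version) {
        case 0:
                /* at least 8K from the end of the device, rounded down to 4K */
                return (dev_sectors - 8*2) & ~(sector_t)(4*2 - 1);
        case 1:
                return 0;               /* at the very start of the device */
        case 2:
                return 8;               /* 4K (eight 512-byte sectors) in */
        default:
                return MaxSector;       /* unsupported minor version */
        }
}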
1559 1559
1560 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev) 1560 static int super_1_validate(struct mddev *mddev, struct md_rdev *rdev)
1561 { 1561 {
1562 struct mdp_superblock_1 *sb = page_address(rdev->sb_page); 1562 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1563 __u64 ev1 = le64_to_cpu(sb->events); 1563 __u64 ev1 = le64_to_cpu(sb->events);
1564 1564
1565 rdev->raid_disk = -1; 1565 rdev->raid_disk = -1;
1566 clear_bit(Faulty, &rdev->flags); 1566 clear_bit(Faulty, &rdev->flags);
1567 clear_bit(In_sync, &rdev->flags); 1567 clear_bit(In_sync, &rdev->flags);
1568 clear_bit(Bitmap_sync, &rdev->flags); 1568 clear_bit(Bitmap_sync, &rdev->flags);
1569 clear_bit(WriteMostly, &rdev->flags); 1569 clear_bit(WriteMostly, &rdev->flags);
1570 1570
1571 if (mddev->raid_disks == 0) { 1571 if (mddev->raid_disks == 0) {
1572 mddev->major_version = 1; 1572 mddev->major_version = 1;
1573 mddev->patch_version = 0; 1573 mddev->patch_version = 0;
1574 mddev->external = 0; 1574 mddev->external = 0;
1575 mddev->chunk_sectors = le32_to_cpu(sb->chunksize); 1575 mddev->chunk_sectors = le32_to_cpu(sb->chunksize);
1576 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1); 1576 mddev->ctime = le64_to_cpu(sb->ctime) & ((1ULL << 32)-1);
1577 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1); 1577 mddev->utime = le64_to_cpu(sb->utime) & ((1ULL << 32)-1);
1578 mddev->level = le32_to_cpu(sb->level); 1578 mddev->level = le32_to_cpu(sb->level);
1579 mddev->clevel[0] = 0; 1579 mddev->clevel[0] = 0;
1580 mddev->layout = le32_to_cpu(sb->layout); 1580 mddev->layout = le32_to_cpu(sb->layout);
1581 mddev->raid_disks = le32_to_cpu(sb->raid_disks); 1581 mddev->raid_disks = le32_to_cpu(sb->raid_disks);
1582 mddev->dev_sectors = le64_to_cpu(sb->size); 1582 mddev->dev_sectors = le64_to_cpu(sb->size);
1583 mddev->events = ev1; 1583 mddev->events = ev1;
1584 mddev->bitmap_info.offset = 0; 1584 mddev->bitmap_info.offset = 0;
1585 mddev->bitmap_info.space = 0; 1585 mddev->bitmap_info.space = 0;
1586 /* Default location for bitmap is 1K after superblock 1586 /* Default location for bitmap is 1K after superblock
1587 * using 3K - total of 4K 1587 * using 3K - total of 4K
1588 */ 1588 */
1589 mddev->bitmap_info.default_offset = 1024 >> 9; 1589 mddev->bitmap_info.default_offset = 1024 >> 9;
1590 mddev->bitmap_info.default_space = (4096-1024) >> 9; 1590 mddev->bitmap_info.default_space = (4096-1024) >> 9;
1591 mddev->reshape_backwards = 0; 1591 mddev->reshape_backwards = 0;
1592 1592
1593 mddev->recovery_cp = le64_to_cpu(sb->resync_offset); 1593 mddev->recovery_cp = le64_to_cpu(sb->resync_offset);
1594 memcpy(mddev->uuid, sb->set_uuid, 16); 1594 memcpy(mddev->uuid, sb->set_uuid, 16);
1595 1595
1596 mddev->max_disks = (4096-256)/2; 1596 mddev->max_disks = (4096-256)/2;
1597 1597
1598 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) && 1598 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) &&
1599 mddev->bitmap_info.file == NULL) { 1599 mddev->bitmap_info.file == NULL) {
1600 mddev->bitmap_info.offset = 1600 mddev->bitmap_info.offset =
1601 (__s32)le32_to_cpu(sb->bitmap_offset); 1601 (__s32)le32_to_cpu(sb->bitmap_offset);
1602 /* Metadata doesn't record how much space is available. 1602 /* Metadata doesn't record how much space is available.
1603 * For 1.0, we assume we can use up to the superblock 1603 * For 1.0, we assume we can use up to the superblock
1604 * if before, else to 4K beyond superblock. 1604 * if before, else to 4K beyond superblock.
1605 * For others, assume no change is possible. 1605 * For others, assume no change is possible.
1606 */ 1606 */
1607 if (mddev->minor_version > 0) 1607 if (mddev->minor_version > 0)
1608 mddev->bitmap_info.space = 0; 1608 mddev->bitmap_info.space = 0;
1609 else if (mddev->bitmap_info.offset > 0) 1609 else if (mddev->bitmap_info.offset > 0)
1610 mddev->bitmap_info.space = 1610 mddev->bitmap_info.space =
1611 8 - mddev->bitmap_info.offset; 1611 8 - mddev->bitmap_info.offset;
1612 else 1612 else
1613 mddev->bitmap_info.space = 1613 mddev->bitmap_info.space =
1614 -mddev->bitmap_info.offset; 1614 -mddev->bitmap_info.offset;
1615 } 1615 }
1616 1616
1617 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) { 1617 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_RESHAPE_ACTIVE)) {
1618 mddev->reshape_position = le64_to_cpu(sb->reshape_position); 1618 mddev->reshape_position = le64_to_cpu(sb->reshape_position);
1619 mddev->delta_disks = le32_to_cpu(sb->delta_disks); 1619 mddev->delta_disks = le32_to_cpu(sb->delta_disks);
1620 mddev->new_level = le32_to_cpu(sb->new_level); 1620 mddev->new_level = le32_to_cpu(sb->new_level);
1621 mddev->new_layout = le32_to_cpu(sb->new_layout); 1621 mddev->new_layout = le32_to_cpu(sb->new_layout);
1622 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk); 1622 mddev->new_chunk_sectors = le32_to_cpu(sb->new_chunk);
1623 if (mddev->delta_disks < 0 || 1623 if (mddev->delta_disks < 0 ||
1624 (mddev->delta_disks == 0 && 1624 (mddev->delta_disks == 0 &&
1625 (le32_to_cpu(sb->feature_map) 1625 (le32_to_cpu(sb->feature_map)
1626 & MD_FEATURE_RESHAPE_BACKWARDS))) 1626 & MD_FEATURE_RESHAPE_BACKWARDS)))
1627 mddev->reshape_backwards = 1; 1627 mddev->reshape_backwards = 1;
1628 } else { 1628 } else {
1629 mddev->reshape_position = MaxSector; 1629 mddev->reshape_position = MaxSector;
1630 mddev->delta_disks = 0; 1630 mddev->delta_disks = 0;
1631 mddev->new_level = mddev->level; 1631 mddev->new_level = mddev->level;
1632 mddev->new_layout = mddev->layout; 1632 mddev->new_layout = mddev->layout;
1633 mddev->new_chunk_sectors = mddev->chunk_sectors; 1633 mddev->new_chunk_sectors = mddev->chunk_sectors;
1634 } 1634 }
1635 1635
1636 } else if (mddev->pers == NULL) { 1636 } else if (mddev->pers == NULL) {
1637 /* Insist on a good event counter while assembling, except for 1637 /* Insist on a good event counter while assembling, except for
1638 * spares (which don't need an event count) */ 1638 * spares (which don't need an event count) */
1639 ++ev1; 1639 ++ev1;
1640 if (rdev->desc_nr >= 0 && 1640 if (rdev->desc_nr >= 0 &&
1641 rdev->desc_nr < le32_to_cpu(sb->max_dev) && 1641 rdev->desc_nr < le32_to_cpu(sb->max_dev) &&
1642 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe) 1642 le16_to_cpu(sb->dev_roles[rdev->desc_nr]) < 0xfffe)
1643 if (ev1 < mddev->events) 1643 if (ev1 < mddev->events)
1644 return -EINVAL; 1644 return -EINVAL;
1645 } else if (mddev->bitmap) { 1645 } else if (mddev->bitmap) {
1646 /* If adding to array with a bitmap, then we can accept an 1646 /* If adding to array with a bitmap, then we can accept an
1647 * older device, but not too old. 1647 * older device, but not too old.
1648 */ 1648 */
1649 if (ev1 < mddev->bitmap->events_cleared) 1649 if (ev1 < mddev->bitmap->events_cleared)
1650 return 0; 1650 return 0;
1651 if (ev1 < mddev->events) 1651 if (ev1 < mddev->events)
1652 set_bit(Bitmap_sync, &rdev->flags); 1652 set_bit(Bitmap_sync, &rdev->flags);
1653 } else { 1653 } else {
1654 if (ev1 < mddev->events) 1654 if (ev1 < mddev->events)
1655 /* just a hot-add of a new device, leave raid_disk at -1 */ 1655 /* just a hot-add of a new device, leave raid_disk at -1 */
1656 return 0; 1656 return 0;
1657 } 1657 }
1658 if (mddev->level != LEVEL_MULTIPATH) { 1658 if (mddev->level != LEVEL_MULTIPATH) {
1659 int role; 1659 int role;
1660 if (rdev->desc_nr < 0 || 1660 if (rdev->desc_nr < 0 ||
1661 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { 1661 rdev->desc_nr >= le32_to_cpu(sb->max_dev)) {
1662 role = 0xffff; 1662 role = 0xffff;
1663 rdev->desc_nr = -1; 1663 rdev->desc_nr = -1;
1664 } else 1664 } else
1665 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); 1665 role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]);
1666 switch(role) { 1666 switch(role) {
1667 case 0xffff: /* spare */ 1667 case 0xffff: /* spare */
1668 break; 1668 break;
1669 case 0xfffe: /* faulty */ 1669 case 0xfffe: /* faulty */
1670 set_bit(Faulty, &rdev->flags); 1670 set_bit(Faulty, &rdev->flags);
1671 break; 1671 break;
1672 default: 1672 default:
1673 rdev->saved_raid_disk = role; 1673 rdev->saved_raid_disk = role;
1674 if ((le32_to_cpu(sb->feature_map) & 1674 if ((le32_to_cpu(sb->feature_map) &
1675 MD_FEATURE_RECOVERY_OFFSET)) { 1675 MD_FEATURE_RECOVERY_OFFSET)) {
1676 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset); 1676 rdev->recovery_offset = le64_to_cpu(sb->recovery_offset);
1677 if (!(le32_to_cpu(sb->feature_map) & 1677 if (!(le32_to_cpu(sb->feature_map) &
1678 MD_FEATURE_RECOVERY_BITMAP)) 1678 MD_FEATURE_RECOVERY_BITMAP))
1679 rdev->saved_raid_disk = -1; 1679 rdev->saved_raid_disk = -1;
1680 } else 1680 } else
1681 set_bit(In_sync, &rdev->flags); 1681 set_bit(In_sync, &rdev->flags);
1682 rdev->raid_disk = role; 1682 rdev->raid_disk = role;
1683 break; 1683 break;
1684 } 1684 }
1685 if (sb->devflags & WriteMostly1) 1685 if (sb->devflags & WriteMostly1)
1686 set_bit(WriteMostly, &rdev->flags); 1686 set_bit(WriteMostly, &rdev->flags);
1687 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT) 1687 if (le32_to_cpu(sb->feature_map) & MD_FEATURE_REPLACEMENT)
1688 set_bit(Replacement, &rdev->flags); 1688 set_bit(Replacement, &rdev->flags);
1689 } else /* MULTIPATH are always insync */ 1689 } else /* MULTIPATH are always insync */
1690 set_bit(In_sync, &rdev->flags); 1690 set_bit(In_sync, &rdev->flags);
1691 1691
1692 return 0; 1692 return 0;
1693 } 1693 }
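
/*
 * Illustrative sketch (not part of md.c): how the 16-bit dev_roles[] values
 * consumed by super_1_validate() above encode a device's role.  The enum
 * and helper names are hypothetical.
 */
enum example_role { EXAMPLE_SPARE, EXAMPLE_FAULTY, EXAMPLE_MEMBER };

static enum example_role example_decode_role(u16 role, int *slot)
{
        if (role == 0xffff)             /* spare: not currently part of the array */
                return EXAMPLE_SPARE;
        if (role == 0xfffe)             /* recorded as faulty */
                return EXAMPLE_FAULTY;
        *slot = role;                   /* anything else is the raid_disk slot */
        return EXAMPLE_MEMBER;
}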
1694 1694
1695 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev) 1695 static void super_1_sync(struct mddev *mddev, struct md_rdev *rdev)
1696 { 1696 {
1697 struct mdp_superblock_1 *sb; 1697 struct mdp_superblock_1 *sb;
1698 struct md_rdev *rdev2; 1698 struct md_rdev *rdev2;
1699 int max_dev, i; 1699 int max_dev, i;
1700 /* make rdev->sb match mddev and rdev data. */ 1700 /* make rdev->sb match mddev and rdev data. */
1701 1701
1702 sb = page_address(rdev->sb_page); 1702 sb = page_address(rdev->sb_page);
1703 1703
1704 sb->feature_map = 0; 1704 sb->feature_map = 0;
1705 sb->pad0 = 0; 1705 sb->pad0 = 0;
1706 sb->recovery_offset = cpu_to_le64(0); 1706 sb->recovery_offset = cpu_to_le64(0);
1707 memset(sb->pad3, 0, sizeof(sb->pad3)); 1707 memset(sb->pad3, 0, sizeof(sb->pad3));
1708 1708
1709 sb->utime = cpu_to_le64((__u64)mddev->utime); 1709 sb->utime = cpu_to_le64((__u64)mddev->utime);
1710 sb->events = cpu_to_le64(mddev->events); 1710 sb->events = cpu_to_le64(mddev->events);
1711 if (mddev->in_sync) 1711 if (mddev->in_sync)
1712 sb->resync_offset = cpu_to_le64(mddev->recovery_cp); 1712 sb->resync_offset = cpu_to_le64(mddev->recovery_cp);
1713 else 1713 else
1714 sb->resync_offset = cpu_to_le64(0); 1714 sb->resync_offset = cpu_to_le64(0);
1715 1715
1716 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors)); 1716 sb->cnt_corrected_read = cpu_to_le32(atomic_read(&rdev->corrected_errors));
1717 1717
1718 sb->raid_disks = cpu_to_le32(mddev->raid_disks); 1718 sb->raid_disks = cpu_to_le32(mddev->raid_disks);
1719 sb->size = cpu_to_le64(mddev->dev_sectors); 1719 sb->size = cpu_to_le64(mddev->dev_sectors);
1720 sb->chunksize = cpu_to_le32(mddev->chunk_sectors); 1720 sb->chunksize = cpu_to_le32(mddev->chunk_sectors);
1721 sb->level = cpu_to_le32(mddev->level); 1721 sb->level = cpu_to_le32(mddev->level);
1722 sb->layout = cpu_to_le32(mddev->layout); 1722 sb->layout = cpu_to_le32(mddev->layout);
1723 1723
1724 if (test_bit(WriteMostly, &rdev->flags)) 1724 if (test_bit(WriteMostly, &rdev->flags))
1725 sb->devflags |= WriteMostly1; 1725 sb->devflags |= WriteMostly1;
1726 else 1726 else
1727 sb->devflags &= ~WriteMostly1; 1727 sb->devflags &= ~WriteMostly1;
1728 sb->data_offset = cpu_to_le64(rdev->data_offset); 1728 sb->data_offset = cpu_to_le64(rdev->data_offset);
1729 sb->data_size = cpu_to_le64(rdev->sectors); 1729 sb->data_size = cpu_to_le64(rdev->sectors);
1730 1730
1731 if (mddev->bitmap && mddev->bitmap_info.file == NULL) { 1731 if (mddev->bitmap && mddev->bitmap_info.file == NULL) {
1732 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset); 1732 sb->bitmap_offset = cpu_to_le32((__u32)mddev->bitmap_info.offset);
1733 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); 1733 sb->feature_map = cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
1734 } 1734 }
1735 1735
1736 if (rdev->raid_disk >= 0 && 1736 if (rdev->raid_disk >= 0 &&
1737 !test_bit(In_sync, &rdev->flags)) { 1737 !test_bit(In_sync, &rdev->flags)) {
1738 sb->feature_map |= 1738 sb->feature_map |=
1739 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET); 1739 cpu_to_le32(MD_FEATURE_RECOVERY_OFFSET);
1740 sb->recovery_offset = 1740 sb->recovery_offset =
1741 cpu_to_le64(rdev->recovery_offset); 1741 cpu_to_le64(rdev->recovery_offset);
1742 if (rdev->saved_raid_disk >= 0 && mddev->bitmap) 1742 if (rdev->saved_raid_disk >= 0 && mddev->bitmap)
1743 sb->feature_map |= 1743 sb->feature_map |=
1744 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP); 1744 cpu_to_le32(MD_FEATURE_RECOVERY_BITMAP);
1745 } 1745 }
1746 if (test_bit(Replacement, &rdev->flags)) 1746 if (test_bit(Replacement, &rdev->flags))
1747 sb->feature_map |= 1747 sb->feature_map |=
1748 cpu_to_le32(MD_FEATURE_REPLACEMENT); 1748 cpu_to_le32(MD_FEATURE_REPLACEMENT);
1749 1749
1750 if (mddev->reshape_position != MaxSector) { 1750 if (mddev->reshape_position != MaxSector) {
1751 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); 1751 sb->feature_map |= cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
1752 sb->reshape_position = cpu_to_le64(mddev->reshape_position); 1752 sb->reshape_position = cpu_to_le64(mddev->reshape_position);
1753 sb->new_layout = cpu_to_le32(mddev->new_layout); 1753 sb->new_layout = cpu_to_le32(mddev->new_layout);
1754 sb->delta_disks = cpu_to_le32(mddev->delta_disks); 1754 sb->delta_disks = cpu_to_le32(mddev->delta_disks);
1755 sb->new_level = cpu_to_le32(mddev->new_level); 1755 sb->new_level = cpu_to_le32(mddev->new_level);
1756 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1756 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1757 if (mddev->delta_disks == 0 && 1757 if (mddev->delta_disks == 0 &&
1758 mddev->reshape_backwards) 1758 mddev->reshape_backwards)
1759 sb->feature_map 1759 sb->feature_map
1760 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); 1760 |= cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
1761 if (rdev->new_data_offset != rdev->data_offset) { 1761 if (rdev->new_data_offset != rdev->data_offset) {
1762 sb->feature_map 1762 sb->feature_map
1763 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET); 1763 |= cpu_to_le32(MD_FEATURE_NEW_OFFSET);
1764 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset 1764 sb->new_offset = cpu_to_le32((__u32)(rdev->new_data_offset
1765 - rdev->data_offset)); 1765 - rdev->data_offset));
1766 } 1766 }
1767 } 1767 }
1768 1768
1769 if (rdev->badblocks.count == 0) 1769 if (rdev->badblocks.count == 0)
1770 /* Nothing to do for bad blocks */ ; 1770 /* Nothing to do for bad blocks */ ;
1771 else if (sb->bblog_offset == 0) 1771 else if (sb->bblog_offset == 0)
1772 /* Cannot record bad blocks on this device */ 1772 /* Cannot record bad blocks on this device */
1773 md_error(mddev, rdev); 1773 md_error(mddev, rdev);
1774 else { 1774 else {
1775 struct badblocks *bb = &rdev->badblocks; 1775 struct badblocks *bb = &rdev->badblocks;
1776 u64 *bbp = (u64 *)page_address(rdev->bb_page); 1776 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1777 u64 *p = bb->page; 1777 u64 *p = bb->page;
1778 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); 1778 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1779 if (bb->changed) { 1779 if (bb->changed) {
1780 unsigned seq; 1780 unsigned seq;
1781 1781
1782 retry: 1782 retry:
1783 seq = read_seqbegin(&bb->lock); 1783 seq = read_seqbegin(&bb->lock);
1784 1784
1785 memset(bbp, 0xff, PAGE_SIZE); 1785 memset(bbp, 0xff, PAGE_SIZE);
1786 1786
1787 for (i = 0 ; i < bb->count ; i++) { 1787 for (i = 0 ; i < bb->count ; i++) {
1788 u64 internal_bb = p[i]; 1788 u64 internal_bb = p[i];
1789 u64 store_bb = ((BB_OFFSET(internal_bb) << 10) 1789 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1790 | BB_LEN(internal_bb)); 1790 | BB_LEN(internal_bb));
1791 bbp[i] = cpu_to_le64(store_bb); 1791 bbp[i] = cpu_to_le64(store_bb);
1792 } 1792 }
1793 bb->changed = 0; 1793 bb->changed = 0;
1794 if (read_seqretry(&bb->lock, seq)) 1794 if (read_seqretry(&bb->lock, seq))
1795 goto retry; 1795 goto retry;
1796 1796
1797 bb->sector = (rdev->sb_start + 1797 bb->sector = (rdev->sb_start +
1798 (int)le32_to_cpu(sb->bblog_offset)); 1798 (int)le32_to_cpu(sb->bblog_offset));
1799 bb->size = le16_to_cpu(sb->bblog_size); 1799 bb->size = le16_to_cpu(sb->bblog_size);
1800 } 1800 }
1801 } 1801 }
1802 1802
1803 max_dev = 0; 1803 max_dev = 0;
1804 rdev_for_each(rdev2, mddev) 1804 rdev_for_each(rdev2, mddev)
1805 if (rdev2->desc_nr+1 > max_dev) 1805 if (rdev2->desc_nr+1 > max_dev)
1806 max_dev = rdev2->desc_nr+1; 1806 max_dev = rdev2->desc_nr+1;
1807 1807
1808 if (max_dev > le32_to_cpu(sb->max_dev)) { 1808 if (max_dev > le32_to_cpu(sb->max_dev)) {
1809 int bmask; 1809 int bmask;
1810 sb->max_dev = cpu_to_le32(max_dev); 1810 sb->max_dev = cpu_to_le32(max_dev);
1811 rdev->sb_size = max_dev * 2 + 256; 1811 rdev->sb_size = max_dev * 2 + 256;
1812 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; 1812 bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1;
1813 if (rdev->sb_size & bmask) 1813 if (rdev->sb_size & bmask)
1814 rdev->sb_size = (rdev->sb_size | bmask) + 1; 1814 rdev->sb_size = (rdev->sb_size | bmask) + 1;
1815 } else 1815 } else
1816 max_dev = le32_to_cpu(sb->max_dev); 1816 max_dev = le32_to_cpu(sb->max_dev);
1817 1817
1818 for (i=0; i<max_dev;i++) 1818 for (i=0; i<max_dev;i++)
1819 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1819 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1820 1820
1821 rdev_for_each(rdev2, mddev) { 1821 rdev_for_each(rdev2, mddev) {
1822 i = rdev2->desc_nr; 1822 i = rdev2->desc_nr;
1823 if (test_bit(Faulty, &rdev2->flags)) 1823 if (test_bit(Faulty, &rdev2->flags))
1824 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1824 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1825 else if (test_bit(In_sync, &rdev2->flags)) 1825 else if (test_bit(In_sync, &rdev2->flags))
1826 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1826 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1827 else if (rdev2->raid_disk >= 0) 1827 else if (rdev2->raid_disk >= 0)
1828 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk); 1828 sb->dev_roles[i] = cpu_to_le16(rdev2->raid_disk);
1829 else 1829 else
1830 sb->dev_roles[i] = cpu_to_le16(0xffff); 1830 sb->dev_roles[i] = cpu_to_le16(0xffff);
1831 } 1831 }
1832 1832
1833 sb->sb_csum = calc_sb_1_csum(sb); 1833 sb->sb_csum = calc_sb_1_csum(sb);
1834 } 1834 }
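
/*
 * Illustrative sketch (not part of md.c): the on-disk bad-block record
 * format written by super_1_sync() above and parsed by super_1_load().
 * Each record is a little-endian u64 with the length in the low 10 bits
 * and the start sector in the remaining bits, both scaled by 2^bblog_shift;
 * an all-ones word marks unused space in the log page.  The struct and
 * helper names are hypothetical.
 */
struct example_badblock { u64 sector; int len; };

static int example_decode_badblock(u64 raw, int shift, struct example_badblock *out)
{
        if (raw + 1 == 0)                       /* 0xffff... padding: no more records */
                return 0;
        out->len = (raw & 0x3ff) << shift;      /* low 10 bits: length in sectors */
        out->sector = (raw >> 10) << shift;     /* remaining bits: start sector */
        return 1;
}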
1835 1835
1836 static unsigned long long 1836 static unsigned long long
1837 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors) 1837 super_1_rdev_size_change(struct md_rdev *rdev, sector_t num_sectors)
1838 { 1838 {
1839 struct mdp_superblock_1 *sb; 1839 struct mdp_superblock_1 *sb;
1840 sector_t max_sectors; 1840 sector_t max_sectors;
1841 if (num_sectors && num_sectors < rdev->mddev->dev_sectors) 1841 if (num_sectors && num_sectors < rdev->mddev->dev_sectors)
1842 return 0; /* component must fit device */ 1842 return 0; /* component must fit device */
1843 if (rdev->data_offset != rdev->new_data_offset) 1843 if (rdev->data_offset != rdev->new_data_offset)
1844 return 0; /* too confusing */ 1844 return 0; /* too confusing */
1845 if (rdev->sb_start < rdev->data_offset) { 1845 if (rdev->sb_start < rdev->data_offset) {
1846 /* minor versions 1 and 2; superblock before data */ 1846 /* minor versions 1 and 2; superblock before data */
1847 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9; 1847 max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
1848 max_sectors -= rdev->data_offset; 1848 max_sectors -= rdev->data_offset;
1849 if (!num_sectors || num_sectors > max_sectors) 1849 if (!num_sectors || num_sectors > max_sectors)
1850 num_sectors = max_sectors; 1850 num_sectors = max_sectors;
1851 } else if (rdev->mddev->bitmap_info.offset) { 1851 } else if (rdev->mddev->bitmap_info.offset) {
1852 /* minor version 0 with bitmap we can't move */ 1852 /* minor version 0 with bitmap we can't move */
1853 return 0; 1853 return 0;
1854 } else { 1854 } else {
1855 /* minor version 0; superblock after data */ 1855 /* minor version 0; superblock after data */
1856 sector_t sb_start; 1856 sector_t sb_start;
1857 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2; 1857 sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
1858 sb_start &= ~(sector_t)(4*2 - 1); 1858 sb_start &= ~(sector_t)(4*2 - 1);
1859 max_sectors = rdev->sectors + sb_start - rdev->sb_start; 1859 max_sectors = rdev->sectors + sb_start - rdev->sb_start;
1860 if (!num_sectors || num_sectors > max_sectors) 1860 if (!num_sectors || num_sectors > max_sectors)
1861 num_sectors = max_sectors; 1861 num_sectors = max_sectors;
1862 rdev->sb_start = sb_start; 1862 rdev->sb_start = sb_start;
1863 } 1863 }
1864 sb = page_address(rdev->sb_page); 1864 sb = page_address(rdev->sb_page);
1865 sb->data_size = cpu_to_le64(num_sectors); 1865 sb->data_size = cpu_to_le64(num_sectors);
1866 sb->super_offset = rdev->sb_start; 1866 sb->super_offset = rdev->sb_start;
1867 sb->sb_csum = calc_sb_1_csum(sb); 1867 sb->sb_csum = calc_sb_1_csum(sb);
1868 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size, 1868 md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
1869 rdev->sb_page); 1869 rdev->sb_page);
1870 md_super_wait(rdev->mddev); 1870 md_super_wait(rdev->mddev);
1871 return num_sectors; 1871 return num_sectors;
1872 1872
1873 } 1873 }
1874 1874
1875 static int 1875 static int
1876 super_1_allow_new_offset(struct md_rdev *rdev, 1876 super_1_allow_new_offset(struct md_rdev *rdev,
1877 unsigned long long new_offset) 1877 unsigned long long new_offset)
1878 { 1878 {
1879 /* All necessary checks on new >= old have been done */ 1879 /* All necessary checks on new >= old have been done */
1880 struct bitmap *bitmap; 1880 struct bitmap *bitmap;
1881 if (new_offset >= rdev->data_offset) 1881 if (new_offset >= rdev->data_offset)
1882 return 1; 1882 return 1;
1883 1883
1884 /* with 1.0 metadata, there is no metadata to tread on 1884 /* with 1.0 metadata, there is no metadata to tread on
1885 * so we can always move back */ 1885 * so we can always move back */
1886 if (rdev->mddev->minor_version == 0) 1886 if (rdev->mddev->minor_version == 0)
1887 return 1; 1887 return 1;
1888 1888
1889 /* otherwise we must be sure not to step on 1889 /* otherwise we must be sure not to step on
1890 * any metadata, so stay: 1890 * any metadata, so stay:
1891 * 36K beyond start of superblock 1891 * 36K beyond start of superblock
1892 * beyond end of badblocks 1892 * beyond end of badblocks
1893 * beyond write-intent bitmap 1893 * beyond write-intent bitmap
1894 */ 1894 */
1895 if (rdev->sb_start + (32+4)*2 > new_offset) 1895 if (rdev->sb_start + (32+4)*2 > new_offset)
1896 return 0; 1896 return 0;
1897 bitmap = rdev->mddev->bitmap; 1897 bitmap = rdev->mddev->bitmap;
1898 if (bitmap && !rdev->mddev->bitmap_info.file && 1898 if (bitmap && !rdev->mddev->bitmap_info.file &&
1899 rdev->sb_start + rdev->mddev->bitmap_info.offset + 1899 rdev->sb_start + rdev->mddev->bitmap_info.offset +
1900 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset) 1900 bitmap->storage.file_pages * (PAGE_SIZE>>9) > new_offset)
1901 return 0; 1901 return 0;
1902 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset) 1902 if (rdev->badblocks.sector + rdev->badblocks.size > new_offset)
1903 return 0; 1903 return 0;
1904 1904
1905 return 1; 1905 return 1;
1906 } 1906 }
1907 1907
1908 static struct super_type super_types[] = { 1908 static struct super_type super_types[] = {
1909 [0] = { 1909 [0] = {
1910 .name = "0.90.0", 1910 .name = "0.90.0",
1911 .owner = THIS_MODULE, 1911 .owner = THIS_MODULE,
1912 .load_super = super_90_load, 1912 .load_super = super_90_load,
1913 .validate_super = super_90_validate, 1913 .validate_super = super_90_validate,
1914 .sync_super = super_90_sync, 1914 .sync_super = super_90_sync,
1915 .rdev_size_change = super_90_rdev_size_change, 1915 .rdev_size_change = super_90_rdev_size_change,
1916 .allow_new_offset = super_90_allow_new_offset, 1916 .allow_new_offset = super_90_allow_new_offset,
1917 }, 1917 },
1918 [1] = { 1918 [1] = {
1919 .name = "md-1", 1919 .name = "md-1",
1920 .owner = THIS_MODULE, 1920 .owner = THIS_MODULE,
1921 .load_super = super_1_load, 1921 .load_super = super_1_load,
1922 .validate_super = super_1_validate, 1922 .validate_super = super_1_validate,
1923 .sync_super = super_1_sync, 1923 .sync_super = super_1_sync,
1924 .rdev_size_change = super_1_rdev_size_change, 1924 .rdev_size_change = super_1_rdev_size_change,
1925 .allow_new_offset = super_1_allow_new_offset, 1925 .allow_new_offset = super_1_allow_new_offset,
1926 }, 1926 },
1927 }; 1927 };
1928 1928
1929 static void sync_super(struct mddev *mddev, struct md_rdev *rdev) 1929 static void sync_super(struct mddev *mddev, struct md_rdev *rdev)
1930 { 1930 {
1931 if (mddev->sync_super) { 1931 if (mddev->sync_super) {
1932 mddev->sync_super(mddev, rdev); 1932 mddev->sync_super(mddev, rdev);
1933 return; 1933 return;
1934 } 1934 }
1935 1935
1936 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types)); 1936 BUG_ON(mddev->major_version >= ARRAY_SIZE(super_types));
1937 1937
1938 super_types[mddev->major_version].sync_super(mddev, rdev); 1938 super_types[mddev->major_version].sync_super(mddev, rdev);
1939 } 1939 }
1940 1940
1941 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2) 1941 static int match_mddev_units(struct mddev *mddev1, struct mddev *mddev2)
1942 { 1942 {
1943 struct md_rdev *rdev, *rdev2; 1943 struct md_rdev *rdev, *rdev2;
1944 1944
1945 rcu_read_lock(); 1945 rcu_read_lock();
1946 rdev_for_each_rcu(rdev, mddev1) 1946 rdev_for_each_rcu(rdev, mddev1)
1947 rdev_for_each_rcu(rdev2, mddev2) 1947 rdev_for_each_rcu(rdev2, mddev2)
1948 if (rdev->bdev->bd_contains == 1948 if (rdev->bdev->bd_contains ==
1949 rdev2->bdev->bd_contains) { 1949 rdev2->bdev->bd_contains) {
1950 rcu_read_unlock(); 1950 rcu_read_unlock();
1951 return 1; 1951 return 1;
1952 } 1952 }
1953 rcu_read_unlock(); 1953 rcu_read_unlock();
1954 return 0; 1954 return 0;
1955 } 1955 }
1956 1956
1957 static LIST_HEAD(pending_raid_disks); 1957 static LIST_HEAD(pending_raid_disks);
1958 1958
1959 /* 1959 /*
1960 * Try to register data integrity profile for an mddev 1960 * Try to register data integrity profile for an mddev
1961 * 1961 *
1962 * This is called when an array is started and after a disk has been kicked 1962 * This is called when an array is started and after a disk has been kicked
1963 * from the array. It only succeeds if all working and active component devices 1963 * from the array. It only succeeds if all working and active component devices
1964 * are integrity capable with matching profiles. 1964 * are integrity capable with matching profiles.
1965 */ 1965 */
1966 int md_integrity_register(struct mddev *mddev) 1966 int md_integrity_register(struct mddev *mddev)
1967 { 1967 {
1968 struct md_rdev *rdev, *reference = NULL; 1968 struct md_rdev *rdev, *reference = NULL;
1969 1969
1970 if (list_empty(&mddev->disks)) 1970 if (list_empty(&mddev->disks))
1971 return 0; /* nothing to do */ 1971 return 0; /* nothing to do */
1972 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1972 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1973 return 0; /* shouldn't register, or already is */ 1973 return 0; /* shouldn't register, or already is */
1974 rdev_for_each(rdev, mddev) { 1974 rdev_for_each(rdev, mddev) {
1975 /* skip spares and non-functional disks */ 1975 /* skip spares and non-functional disks */
1976 if (test_bit(Faulty, &rdev->flags)) 1976 if (test_bit(Faulty, &rdev->flags))
1977 continue; 1977 continue;
1978 if (rdev->raid_disk < 0) 1978 if (rdev->raid_disk < 0)
1979 continue; 1979 continue;
1980 if (!reference) { 1980 if (!reference) {
1981 /* Use the first rdev as the reference */ 1981 /* Use the first rdev as the reference */
1982 reference = rdev; 1982 reference = rdev;
1983 continue; 1983 continue;
1984 } 1984 }
1985 /* does this rdev's profile match the reference profile? */ 1985 /* does this rdev's profile match the reference profile? */
1986 if (blk_integrity_compare(reference->bdev->bd_disk, 1986 if (blk_integrity_compare(reference->bdev->bd_disk,
1987 rdev->bdev->bd_disk) < 0) 1987 rdev->bdev->bd_disk) < 0)
1988 return -EINVAL; 1988 return -EINVAL;
1989 } 1989 }
1990 if (!reference || !bdev_get_integrity(reference->bdev)) 1990 if (!reference || !bdev_get_integrity(reference->bdev))
1991 return 0; 1991 return 0;
1992 /* 1992 /*
1993 * All component devices are integrity capable and have matching 1993 * All component devices are integrity capable and have matching
1994 * profiles, register the common profile for the md device. 1994 * profiles, register the common profile for the md device.
1995 */ 1995 */
1996 if (blk_integrity_register(mddev->gendisk, 1996 if (blk_integrity_register(mddev->gendisk,
1997 bdev_get_integrity(reference->bdev)) != 0) { 1997 bdev_get_integrity(reference->bdev)) != 0) {
1998 printk(KERN_ERR "md: failed to register integrity for %s\n", 1998 printk(KERN_ERR "md: failed to register integrity for %s\n",
1999 mdname(mddev)); 1999 mdname(mddev));
2000 return -EINVAL; 2000 return -EINVAL;
2001 } 2001 }
2002 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev)); 2002 printk(KERN_NOTICE "md: data integrity enabled on %s\n", mdname(mddev));
2003 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) { 2003 if (bioset_integrity_create(mddev->bio_set, BIO_POOL_SIZE)) {
2004 printk(KERN_ERR "md: failed to create integrity pool for %s\n", 2004 printk(KERN_ERR "md: failed to create integrity pool for %s\n",
2005 mdname(mddev)); 2005 mdname(mddev));
2006 return -EINVAL; 2006 return -EINVAL;
2007 } 2007 }
2008 return 0; 2008 return 0;
2009 } 2009 }
2010 EXPORT_SYMBOL(md_integrity_register); 2010 EXPORT_SYMBOL(md_integrity_register);
2011 2011
2012 /* Disable data integrity if non-capable/non-matching disk is being added */ 2012 /* Disable data integrity if non-capable/non-matching disk is being added */
2013 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) 2013 void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev)
2014 { 2014 {
2015 struct blk_integrity *bi_rdev; 2015 struct blk_integrity *bi_rdev;
2016 struct blk_integrity *bi_mddev; 2016 struct blk_integrity *bi_mddev;
2017 2017
2018 if (!mddev->gendisk) 2018 if (!mddev->gendisk)
2019 return; 2019 return;
2020 2020
2021 bi_rdev = bdev_get_integrity(rdev->bdev); 2021 bi_rdev = bdev_get_integrity(rdev->bdev);
2022 bi_mddev = blk_get_integrity(mddev->gendisk); 2022 bi_mddev = blk_get_integrity(mddev->gendisk);
2023 2023
2024 if (!bi_mddev) /* nothing to do */ 2024 if (!bi_mddev) /* nothing to do */
2025 return; 2025 return;
2026 if (rdev->raid_disk < 0) /* skip spares */ 2026 if (rdev->raid_disk < 0) /* skip spares */
2027 return; 2027 return;
2028 if (bi_rdev && blk_integrity_compare(mddev->gendisk, 2028 if (bi_rdev && blk_integrity_compare(mddev->gendisk,
2029 rdev->bdev->bd_disk) >= 0) 2029 rdev->bdev->bd_disk) >= 0)
2030 return; 2030 return;
2031 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); 2031 printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev));
2032 blk_integrity_unregister(mddev->gendisk); 2032 blk_integrity_unregister(mddev->gendisk);
2033 } 2033 }
2034 EXPORT_SYMBOL(md_integrity_add_rdev); 2034 EXPORT_SYMBOL(md_integrity_add_rdev);
2035 2035
2036 static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev) 2036 static int bind_rdev_to_array(struct md_rdev * rdev, struct mddev * mddev)
2037 { 2037 {
2038 char b[BDEVNAME_SIZE]; 2038 char b[BDEVNAME_SIZE];
2039 struct kobject *ko; 2039 struct kobject *ko;
2040 char *s; 2040 char *s;
2041 int err; 2041 int err;
2042 2042
2043 if (rdev->mddev) { 2043 if (rdev->mddev) {
2044 MD_BUG(); 2044 MD_BUG();
2045 return -EINVAL; 2045 return -EINVAL;
2046 } 2046 }
2047 2047
2048 /* prevent duplicates */ 2048 /* prevent duplicates */
2049 if (find_rdev(mddev, rdev->bdev->bd_dev)) 2049 if (find_rdev(mddev, rdev->bdev->bd_dev))
2050 return -EEXIST; 2050 return -EEXIST;
2051 2051
2052 /* make sure rdev->sectors exceeds mddev->dev_sectors */ 2052 /* make sure rdev->sectors exceeds mddev->dev_sectors */
2053 if (rdev->sectors && (mddev->dev_sectors == 0 || 2053 if (rdev->sectors && (mddev->dev_sectors == 0 ||
2054 rdev->sectors < mddev->dev_sectors)) { 2054 rdev->sectors < mddev->dev_sectors)) {
2055 if (mddev->pers) { 2055 if (mddev->pers) {
2056 /* Cannot change size, so fail 2056 /* Cannot change size, so fail
2057 * If mddev->level <= 0, then we don't care 2057 * If mddev->level <= 0, then we don't care
2058 * about aligning sizes (e.g. linear) 2058 * about aligning sizes (e.g. linear)
2059 */ 2059 */
2060 if (mddev->level > 0) 2060 if (mddev->level > 0)
2061 return -ENOSPC; 2061 return -ENOSPC;
2062 } else 2062 } else
2063 mddev->dev_sectors = rdev->sectors; 2063 mddev->dev_sectors = rdev->sectors;
2064 } 2064 }
2065 2065
2066 /* Verify rdev->desc_nr is unique. 2066 /* Verify rdev->desc_nr is unique.
2067 * If it is -1, assign a free number, else 2067 * If it is -1, assign a free number, else
2068 * check number is not in use 2068 * check number is not in use
2069 */ 2069 */
2070 if (rdev->desc_nr < 0) { 2070 if (rdev->desc_nr < 0) {
2071 int choice = 0; 2071 int choice = 0;
2072 if (mddev->pers) choice = mddev->raid_disks; 2072 if (mddev->pers) choice = mddev->raid_disks;
2073 while (find_rdev_nr(mddev, choice)) 2073 while (find_rdev_nr(mddev, choice))
2074 choice++; 2074 choice++;
2075 rdev->desc_nr = choice; 2075 rdev->desc_nr = choice;
2076 } else { 2076 } else {
2077 if (find_rdev_nr(mddev, rdev->desc_nr)) 2077 if (find_rdev_nr(mddev, rdev->desc_nr))
2078 return -EBUSY; 2078 return -EBUSY;
2079 } 2079 }
2080 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { 2080 if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) {
2081 printk(KERN_WARNING "md: %s: array is limited to %d devices\n", 2081 printk(KERN_WARNING "md: %s: array is limited to %d devices\n",
2082 mdname(mddev), mddev->max_disks); 2082 mdname(mddev), mddev->max_disks);
2083 return -EBUSY; 2083 return -EBUSY;
2084 } 2084 }
2085 bdevname(rdev->bdev,b); 2085 bdevname(rdev->bdev,b);
2086 while ( (s=strchr(b, '/')) != NULL) 2086 while ( (s=strchr(b, '/')) != NULL)
2087 *s = '!'; 2087 *s = '!';
2088 2088
2089 rdev->mddev = mddev; 2089 rdev->mddev = mddev;
2090 printk(KERN_INFO "md: bind<%s>\n", b); 2090 printk(KERN_INFO "md: bind<%s>\n", b);
2091 2091
2092 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b))) 2092 if ((err = kobject_add(&rdev->kobj, &mddev->kobj, "dev-%s", b)))
2093 goto fail; 2093 goto fail;
2094 2094
2095 ko = &part_to_dev(rdev->bdev->bd_part)->kobj; 2095 ko = &part_to_dev(rdev->bdev->bd_part)->kobj;
2096 if (sysfs_create_link(&rdev->kobj, ko, "block")) 2096 if (sysfs_create_link(&rdev->kobj, ko, "block"))
2097 /* failure here is OK */; 2097 /* failure here is OK */;
2098 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); 2098 rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
2099 2099
2100 list_add_rcu(&rdev->same_set, &mddev->disks); 2100 list_add_rcu(&rdev->same_set, &mddev->disks);
2101 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2101 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
2102 2102
2103 /* May as well allow recovery to be retried once */ 2103 /* May as well allow recovery to be retried once */
2104 mddev->recovery_disabled++; 2104 mddev->recovery_disabled++;
2105 2105
2106 return 0; 2106 return 0;
2107 2107
2108 fail: 2108 fail:
2109 printk(KERN_WARNING "md: failed to register dev-%s for %s\n", 2109 printk(KERN_WARNING "md: failed to register dev-%s for %s\n",
2110 b, mdname(mddev)); 2110 b, mdname(mddev));
2111 return err; 2111 return err;
2112 } 2112 }
2113 2113
2114 static void md_delayed_delete(struct work_struct *ws) 2114 static void md_delayed_delete(struct work_struct *ws)
2115 { 2115 {
2116 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work); 2116 struct md_rdev *rdev = container_of(ws, struct md_rdev, del_work);
2117 kobject_del(&rdev->kobj); 2117 kobject_del(&rdev->kobj);
2118 kobject_put(&rdev->kobj); 2118 kobject_put(&rdev->kobj);
2119 } 2119 }
2120 2120
2121 static void unbind_rdev_from_array(struct md_rdev * rdev) 2121 static void unbind_rdev_from_array(struct md_rdev * rdev)
2122 { 2122 {
2123 char b[BDEVNAME_SIZE]; 2123 char b[BDEVNAME_SIZE];
2124 if (!rdev->mddev) { 2124 if (!rdev->mddev) {
2125 MD_BUG(); 2125 MD_BUG();
2126 return; 2126 return;
2127 } 2127 }
2128 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk); 2128 bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
2129 list_del_rcu(&rdev->same_set); 2129 list_del_rcu(&rdev->same_set);
2130 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b)); 2130 printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
2131 rdev->mddev = NULL; 2131 rdev->mddev = NULL;
2132 sysfs_remove_link(&rdev->kobj, "block"); 2132 sysfs_remove_link(&rdev->kobj, "block");
2133 sysfs_put(rdev->sysfs_state); 2133 sysfs_put(rdev->sysfs_state);
2134 rdev->sysfs_state = NULL; 2134 rdev->sysfs_state = NULL;
2135 rdev->badblocks.count = 0; 2135 rdev->badblocks.count = 0;
2136 /* We need to delay this, otherwise we can deadlock when 2136 /* We need to delay this, otherwise we can deadlock when
2137 * writing 'remove' to "dev/state". We also need 2137 * writing 'remove' to "dev/state". We also need
2138 * to delay it due to rcu usage. 2138 * to delay it due to rcu usage.
2139 */ 2139 */
2140 synchronize_rcu(); 2140 synchronize_rcu();
2141 INIT_WORK(&rdev->del_work, md_delayed_delete); 2141 INIT_WORK(&rdev->del_work, md_delayed_delete);
2142 kobject_get(&rdev->kobj); 2142 kobject_get(&rdev->kobj);
2143 queue_work(md_misc_wq, &rdev->del_work); 2143 queue_work(md_misc_wq, &rdev->del_work);
2144 } 2144 }
2145 2145
2146 /* 2146 /*
2147 * prevent the device from being mounted, repartitioned or 2147 * prevent the device from being mounted, repartitioned or
2148 * otherwise reused by a RAID array (or any other kernel 2148 * otherwise reused by a RAID array (or any other kernel
2149 * subsystem), by bd_claiming the device. 2149 * subsystem), by bd_claiming the device.
2150 */ 2150 */
2151 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared) 2151 static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
2152 { 2152 {
2153 int err = 0; 2153 int err = 0;
2154 struct block_device *bdev; 2154 struct block_device *bdev;
2155 char b[BDEVNAME_SIZE]; 2155 char b[BDEVNAME_SIZE];
2156 2156
2157 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL, 2157 bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
2158 shared ? (struct md_rdev *)lock_rdev : rdev); 2158 shared ? (struct md_rdev *)lock_rdev : rdev);
2159 if (IS_ERR(bdev)) { 2159 if (IS_ERR(bdev)) {
2160 printk(KERN_ERR "md: could not open %s.\n", 2160 printk(KERN_ERR "md: could not open %s.\n",
2161 __bdevname(dev, b)); 2161 __bdevname(dev, b));
2162 return PTR_ERR(bdev); 2162 return PTR_ERR(bdev);
2163 } 2163 }
2164 rdev->bdev = bdev; 2164 rdev->bdev = bdev;
2165 return err; 2165 return err;
2166 } 2166 }
2167 2167
2168 static void unlock_rdev(struct md_rdev *rdev) 2168 static void unlock_rdev(struct md_rdev *rdev)
2169 { 2169 {
2170 struct block_device *bdev = rdev->bdev; 2170 struct block_device *bdev = rdev->bdev;
2171 rdev->bdev = NULL; 2171 rdev->bdev = NULL;
2172 if (!bdev) 2172 if (!bdev)
2173 MD_BUG(); 2173 MD_BUG();
2174 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); 2174 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
2175 } 2175 }
2176 2176
2177 void md_autodetect_dev(dev_t dev); 2177 void md_autodetect_dev(dev_t dev);
2178 2178
2179 static void export_rdev(struct md_rdev * rdev) 2179 static void export_rdev(struct md_rdev * rdev)
2180 { 2180 {
2181 char b[BDEVNAME_SIZE]; 2181 char b[BDEVNAME_SIZE];
2182 printk(KERN_INFO "md: export_rdev(%s)\n", 2182 printk(KERN_INFO "md: export_rdev(%s)\n",
2183 bdevname(rdev->bdev,b)); 2183 bdevname(rdev->bdev,b));
2184 if (rdev->mddev) 2184 if (rdev->mddev)
2185 MD_BUG(); 2185 MD_BUG();
2186 md_rdev_clear(rdev); 2186 md_rdev_clear(rdev);
2187 #ifndef MODULE 2187 #ifndef MODULE
2188 if (test_bit(AutoDetected, &rdev->flags)) 2188 if (test_bit(AutoDetected, &rdev->flags))
2189 md_autodetect_dev(rdev->bdev->bd_dev); 2189 md_autodetect_dev(rdev->bdev->bd_dev);
2190 #endif 2190 #endif
2191 unlock_rdev(rdev); 2191 unlock_rdev(rdev);
2192 kobject_put(&rdev->kobj); 2192 kobject_put(&rdev->kobj);
2193 } 2193 }
2194 2194
2195 static void kick_rdev_from_array(struct md_rdev * rdev) 2195 static void kick_rdev_from_array(struct md_rdev * rdev)
2196 { 2196 {
2197 unbind_rdev_from_array(rdev); 2197 unbind_rdev_from_array(rdev);
2198 export_rdev(rdev); 2198 export_rdev(rdev);
2199 } 2199 }
2200 2200
2201 static void export_array(struct mddev *mddev) 2201 static void export_array(struct mddev *mddev)
2202 { 2202 {
2203 struct md_rdev *rdev, *tmp; 2203 struct md_rdev *rdev, *tmp;
2204 2204
2205 rdev_for_each_safe(rdev, tmp, mddev) { 2205 rdev_for_each_safe(rdev, tmp, mddev) {
2206 if (!rdev->mddev) { 2206 if (!rdev->mddev) {
2207 MD_BUG(); 2207 MD_BUG();
2208 continue; 2208 continue;
2209 } 2209 }
2210 kick_rdev_from_array(rdev); 2210 kick_rdev_from_array(rdev);
2211 } 2211 }
2212 if (!list_empty(&mddev->disks)) 2212 if (!list_empty(&mddev->disks))
2213 MD_BUG(); 2213 MD_BUG();
2214 mddev->raid_disks = 0; 2214 mddev->raid_disks = 0;
2215 mddev->major_version = 0; 2215 mddev->major_version = 0;
2216 } 2216 }
2217 2217
2218 static void print_desc(mdp_disk_t *desc) 2218 static void print_desc(mdp_disk_t *desc)
2219 { 2219 {
2220 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number, 2220 printk(" DISK<N:%d,(%d,%d),R:%d,S:%d>\n", desc->number,
2221 desc->major,desc->minor,desc->raid_disk,desc->state); 2221 desc->major,desc->minor,desc->raid_disk,desc->state);
2222 } 2222 }
2223 2223
2224 static void print_sb_90(mdp_super_t *sb) 2224 static void print_sb_90(mdp_super_t *sb)
2225 { 2225 {
2226 int i; 2226 int i;
2227 2227
2228 printk(KERN_INFO 2228 printk(KERN_INFO
2229 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n", 2229 "md: SB: (V:%d.%d.%d) ID:<%08x.%08x.%08x.%08x> CT:%08x\n",
2230 sb->major_version, sb->minor_version, sb->patch_version, 2230 sb->major_version, sb->minor_version, sb->patch_version,
2231 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3, 2231 sb->set_uuid0, sb->set_uuid1, sb->set_uuid2, sb->set_uuid3,
2232 sb->ctime); 2232 sb->ctime);
2233 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n", 2233 printk(KERN_INFO "md: L%d S%08d ND:%d RD:%d md%d LO:%d CS:%d\n",
2234 sb->level, sb->size, sb->nr_disks, sb->raid_disks, 2234 sb->level, sb->size, sb->nr_disks, sb->raid_disks,
2235 sb->md_minor, sb->layout, sb->chunk_size); 2235 sb->md_minor, sb->layout, sb->chunk_size);
2236 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d" 2236 printk(KERN_INFO "md: UT:%08x ST:%d AD:%d WD:%d"
2237 " FD:%d SD:%d CSUM:%08x E:%08lx\n", 2237 " FD:%d SD:%d CSUM:%08x E:%08lx\n",
2238 sb->utime, sb->state, sb->active_disks, sb->working_disks, 2238 sb->utime, sb->state, sb->active_disks, sb->working_disks,
2239 sb->failed_disks, sb->spare_disks, 2239 sb->failed_disks, sb->spare_disks,
2240 sb->sb_csum, (unsigned long)sb->events_lo); 2240 sb->sb_csum, (unsigned long)sb->events_lo);
2241 2241
2242 printk(KERN_INFO); 2242 printk(KERN_INFO);
2243 for (i = 0; i < MD_SB_DISKS; i++) { 2243 for (i = 0; i < MD_SB_DISKS; i++) {
2244 mdp_disk_t *desc; 2244 mdp_disk_t *desc;
2245 2245
2246 desc = sb->disks + i; 2246 desc = sb->disks + i;
2247 if (desc->number || desc->major || desc->minor || 2247 if (desc->number || desc->major || desc->minor ||
2248 desc->raid_disk || (desc->state && (desc->state != 4))) { 2248 desc->raid_disk || (desc->state && (desc->state != 4))) {
2249 printk(" D %2d: ", i); 2249 printk(" D %2d: ", i);
2250 print_desc(desc); 2250 print_desc(desc);
2251 } 2251 }
2252 } 2252 }
2253 printk(KERN_INFO "md: THIS: "); 2253 printk(KERN_INFO "md: THIS: ");
2254 print_desc(&sb->this_disk); 2254 print_desc(&sb->this_disk);
2255 } 2255 }
2256 2256
2257 static void print_sb_1(struct mdp_superblock_1 *sb) 2257 static void print_sb_1(struct mdp_superblock_1 *sb)
2258 { 2258 {
2259 __u8 *uuid; 2259 __u8 *uuid;
2260 2260
2261 uuid = sb->set_uuid; 2261 uuid = sb->set_uuid;
2262 printk(KERN_INFO 2262 printk(KERN_INFO
2263 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n" 2263 "md: SB: (V:%u) (F:0x%08x) Array-ID:<%pU>\n"
2264 "md: Name: \"%s\" CT:%llu\n", 2264 "md: Name: \"%s\" CT:%llu\n",
2265 le32_to_cpu(sb->major_version), 2265 le32_to_cpu(sb->major_version),
2266 le32_to_cpu(sb->feature_map), 2266 le32_to_cpu(sb->feature_map),
2267 uuid, 2267 uuid,
2268 sb->set_name, 2268 sb->set_name,
2269 (unsigned long long)le64_to_cpu(sb->ctime) 2269 (unsigned long long)le64_to_cpu(sb->ctime)
2270 & MD_SUPERBLOCK_1_TIME_SEC_MASK); 2270 & MD_SUPERBLOCK_1_TIME_SEC_MASK);
2271 2271
2272 uuid = sb->device_uuid; 2272 uuid = sb->device_uuid;
2273 printk(KERN_INFO 2273 printk(KERN_INFO
2274 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu" 2274 "md: L%u SZ%llu RD:%u LO:%u CS:%u DO:%llu DS:%llu SO:%llu"
2275 " RO:%llu\n" 2275 " RO:%llu\n"
2276 "md: Dev:%08x UUID: %pU\n" 2276 "md: Dev:%08x UUID: %pU\n"
2277 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n" 2277 "md: (F:0x%08x) UT:%llu Events:%llu ResyncOffset:%llu CSUM:0x%08x\n"
2278 "md: (MaxDev:%u) \n", 2278 "md: (MaxDev:%u) \n",
2279 le32_to_cpu(sb->level), 2279 le32_to_cpu(sb->level),
2280 (unsigned long long)le64_to_cpu(sb->size), 2280 (unsigned long long)le64_to_cpu(sb->size),
2281 le32_to_cpu(sb->raid_disks), 2281 le32_to_cpu(sb->raid_disks),
2282 le32_to_cpu(sb->layout), 2282 le32_to_cpu(sb->layout),
2283 le32_to_cpu(sb->chunksize), 2283 le32_to_cpu(sb->chunksize),
2284 (unsigned long long)le64_to_cpu(sb->data_offset), 2284 (unsigned long long)le64_to_cpu(sb->data_offset),
2285 (unsigned long long)le64_to_cpu(sb->data_size), 2285 (unsigned long long)le64_to_cpu(sb->data_size),
2286 (unsigned long long)le64_to_cpu(sb->super_offset), 2286 (unsigned long long)le64_to_cpu(sb->super_offset),
2287 (unsigned long long)le64_to_cpu(sb->recovery_offset), 2287 (unsigned long long)le64_to_cpu(sb->recovery_offset),
2288 le32_to_cpu(sb->dev_number), 2288 le32_to_cpu(sb->dev_number),
2289 uuid, 2289 uuid,
2290 sb->devflags, 2290 sb->devflags,
2291 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK, 2291 (unsigned long long)le64_to_cpu(sb->utime) & MD_SUPERBLOCK_1_TIME_SEC_MASK,
2292 (unsigned long long)le64_to_cpu(sb->events), 2292 (unsigned long long)le64_to_cpu(sb->events),
2293 (unsigned long long)le64_to_cpu(sb->resync_offset), 2293 (unsigned long long)le64_to_cpu(sb->resync_offset),
2294 le32_to_cpu(sb->sb_csum), 2294 le32_to_cpu(sb->sb_csum),
2295 le32_to_cpu(sb->max_dev) 2295 le32_to_cpu(sb->max_dev)
2296 ); 2296 );
2297 } 2297 }
2298 2298
2299 static void print_rdev(struct md_rdev *rdev, int major_version) 2299 static void print_rdev(struct md_rdev *rdev, int major_version)
2300 { 2300 {
2301 char b[BDEVNAME_SIZE]; 2301 char b[BDEVNAME_SIZE];
2302 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n", 2302 printk(KERN_INFO "md: rdev %s, Sect:%08llu F:%d S:%d DN:%u\n",
2303 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors, 2303 bdevname(rdev->bdev, b), (unsigned long long)rdev->sectors,
2304 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags), 2304 test_bit(Faulty, &rdev->flags), test_bit(In_sync, &rdev->flags),
2305 rdev->desc_nr); 2305 rdev->desc_nr);
2306 if (rdev->sb_loaded) { 2306 if (rdev->sb_loaded) {
2307 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2307 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2308 switch (major_version) { 2308 switch (major_version) {
2309 case 0: 2309 case 0:
2310 print_sb_90(page_address(rdev->sb_page)); 2310 print_sb_90(page_address(rdev->sb_page));
2311 break; 2311 break;
2312 case 1: 2312 case 1:
2313 print_sb_1(page_address(rdev->sb_page)); 2313 print_sb_1(page_address(rdev->sb_page));
2314 break; 2314 break;
2315 } 2315 }
2316 } else 2316 } else
2317 printk(KERN_INFO "md: no rdev superblock!\n"); 2317 printk(KERN_INFO "md: no rdev superblock!\n");
2318 } 2318 }
2319 2319
2320 static void md_print_devices(void) 2320 static void md_print_devices(void)
2321 { 2321 {
2322 struct list_head *tmp; 2322 struct list_head *tmp;
2323 struct md_rdev *rdev; 2323 struct md_rdev *rdev;
2324 struct mddev *mddev; 2324 struct mddev *mddev;
2325 char b[BDEVNAME_SIZE]; 2325 char b[BDEVNAME_SIZE];
2326 2326
2327 printk("\n"); 2327 printk("\n");
2328 printk("md: **********************************\n"); 2328 printk("md: **********************************\n");
2329 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); 2329 printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n");
2330 printk("md: **********************************\n"); 2330 printk("md: **********************************\n");
2331 for_each_mddev(mddev, tmp) { 2331 for_each_mddev(mddev, tmp) {
2332 2332
2333 if (mddev->bitmap) 2333 if (mddev->bitmap)
2334 bitmap_print_sb(mddev->bitmap); 2334 bitmap_print_sb(mddev->bitmap);
2335 else 2335 else
2336 printk("%s: ", mdname(mddev)); 2336 printk("%s: ", mdname(mddev));
2337 rdev_for_each(rdev, mddev) 2337 rdev_for_each(rdev, mddev)
2338 printk("<%s>", bdevname(rdev->bdev,b)); 2338 printk("<%s>", bdevname(rdev->bdev,b));
2339 printk("\n"); 2339 printk("\n");
2340 2340
2341 rdev_for_each(rdev, mddev) 2341 rdev_for_each(rdev, mddev)
2342 print_rdev(rdev, mddev->major_version); 2342 print_rdev(rdev, mddev->major_version);
2343 } 2343 }
2344 printk("md: **********************************\n"); 2344 printk("md: **********************************\n");
2345 printk("\n"); 2345 printk("\n");
2346 } 2346 }
2347 2347
2348 2348
2349 static void sync_sbs(struct mddev * mddev, int nospares) 2349 static void sync_sbs(struct mddev * mddev, int nospares)
2350 { 2350 {
2351 /* Update each superblock (in-memory image), but 2351 /* Update each superblock (in-memory image), but
2352 * if we are allowed to, skip spares which already 2352 * if we are allowed to, skip spares which already
2353 * have the right event counter, or have one earlier 2353 * have the right event counter, or have one earlier
2354 * (which would mean they aren't being marked as dirty 2354 * (which would mean they aren't being marked as dirty
2355 * with the rest of the array) 2355 * with the rest of the array)
2356 */ 2356 */
2357 struct md_rdev *rdev; 2357 struct md_rdev *rdev;
2358 rdev_for_each(rdev, mddev) { 2358 rdev_for_each(rdev, mddev) {
2359 if (rdev->sb_events == mddev->events || 2359 if (rdev->sb_events == mddev->events ||
2360 (nospares && 2360 (nospares &&
2361 rdev->raid_disk < 0 && 2361 rdev->raid_disk < 0 &&
2362 rdev->sb_events+1 == mddev->events)) { 2362 rdev->sb_events+1 == mddev->events)) {
2363 /* Don't update this superblock */ 2363 /* Don't update this superblock */
2364 rdev->sb_loaded = 2; 2364 rdev->sb_loaded = 2;
2365 } else { 2365 } else {
2366 sync_super(mddev, rdev); 2366 sync_super(mddev, rdev);
2367 rdev->sb_loaded = 1; 2367 rdev->sb_loaded = 1;
2368 } 2368 }
2369 } 2369 }
2370 } 2370 }
2371 2371
2372 static void md_update_sb(struct mddev * mddev, int force_change) 2372 static void md_update_sb(struct mddev * mddev, int force_change)
2373 { 2373 {
2374 struct md_rdev *rdev; 2374 struct md_rdev *rdev;
2375 int sync_req; 2375 int sync_req;
2376 int nospares = 0; 2376 int nospares = 0;
2377 int any_badblocks_changed = 0; 2377 int any_badblocks_changed = 0;
2378 2378
2379 if (mddev->ro) { 2379 if (mddev->ro) {
2380 if (force_change) 2380 if (force_change)
2381 set_bit(MD_CHANGE_DEVS, &mddev->flags); 2381 set_bit(MD_CHANGE_DEVS, &mddev->flags);
2382 return; 2382 return;
2383 } 2383 }
2384 repeat: 2384 repeat:
2385 /* First make sure individual recovery_offsets are correct */ 2385 /* First make sure individual recovery_offsets are correct */
2386 rdev_for_each(rdev, mddev) { 2386 rdev_for_each(rdev, mddev) {
2387 if (rdev->raid_disk >= 0 && 2387 if (rdev->raid_disk >= 0 &&
2388 mddev->delta_disks >= 0 && 2388 mddev->delta_disks >= 0 &&
2389 !test_bit(In_sync, &rdev->flags) && 2389 !test_bit(In_sync, &rdev->flags) &&
2390 mddev->curr_resync_completed > rdev->recovery_offset) 2390 mddev->curr_resync_completed > rdev->recovery_offset)
2391 rdev->recovery_offset = mddev->curr_resync_completed; 2391 rdev->recovery_offset = mddev->curr_resync_completed;
2392 2392
2393 } 2393 }
2394 if (!mddev->persistent) { 2394 if (!mddev->persistent) {
2395 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2395 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2396 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2396 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2397 if (!mddev->external) { 2397 if (!mddev->external) {
2398 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2398 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2399 rdev_for_each(rdev, mddev) { 2399 rdev_for_each(rdev, mddev) {
2400 if (rdev->badblocks.changed) { 2400 if (rdev->badblocks.changed) {
2401 rdev->badblocks.changed = 0; 2401 rdev->badblocks.changed = 0;
2402 md_ack_all_badblocks(&rdev->badblocks); 2402 md_ack_all_badblocks(&rdev->badblocks);
2403 md_error(mddev, rdev); 2403 md_error(mddev, rdev);
2404 } 2404 }
2405 clear_bit(Blocked, &rdev->flags); 2405 clear_bit(Blocked, &rdev->flags);
2406 clear_bit(BlockedBadBlocks, &rdev->flags); 2406 clear_bit(BlockedBadBlocks, &rdev->flags);
2407 wake_up(&rdev->blocked_wait); 2407 wake_up(&rdev->blocked_wait);
2408 } 2408 }
2409 } 2409 }
2410 wake_up(&mddev->sb_wait); 2410 wake_up(&mddev->sb_wait);
2411 return; 2411 return;
2412 } 2412 }
2413 2413
2414 spin_lock_irq(&mddev->write_lock); 2414 spin_lock_irq(&mddev->write_lock);
2415 2415
2416 mddev->utime = get_seconds(); 2416 mddev->utime = get_seconds();
2417 2417
2418 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags)) 2418 if (test_and_clear_bit(MD_CHANGE_DEVS, &mddev->flags))
2419 force_change = 1; 2419 force_change = 1;
2420 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags)) 2420 if (test_and_clear_bit(MD_CHANGE_CLEAN, &mddev->flags))
2421 /* just a clean <-> dirty transition, possibly leave spares alone, 2421 /* just a clean <-> dirty transition, possibly leave spares alone,
2422 * though if events isn't the right even/odd, we will have to do 2422 * though if events isn't the right even/odd, we will have to do
2423 * spares after all 2423 * spares after all
2424 */ 2424 */
2425 nospares = 1; 2425 nospares = 1;
2426 if (force_change) 2426 if (force_change)
2427 nospares = 0; 2427 nospares = 0;
2428 if (mddev->degraded) 2428 if (mddev->degraded)
2429 /* If the array is degraded, then skipping spares is both 2429 /* If the array is degraded, then skipping spares is both
2430 * dangerous and fairly pointless. 2430 * dangerous and fairly pointless.
2431 * Dangerous because a device that was removed from the array 2431 * Dangerous because a device that was removed from the array
2432 * might have an event_count that still looks up-to-date, 2432 * might have an event_count that still looks up-to-date,
2433 * so it can be re-added without a resync. 2433 * so it can be re-added without a resync.
2434 * Pointless because if there are any spares to skip, 2434 * Pointless because if there are any spares to skip,
2435 * then a recovery will happen and soon that array won't 2435 * then a recovery will happen and soon that array won't
2436 * be degraded any more and the spare can go back to sleep then. 2436 * be degraded any more and the spare can go back to sleep then.
2437 */ 2437 */
2438 nospares = 0; 2438 nospares = 0;
2439 2439
2440 sync_req = mddev->in_sync; 2440 sync_req = mddev->in_sync;
2441 2441
2442 /* If this is just a dirty<->clean transition, and the array is clean 2442 /* If this is just a dirty<->clean transition, and the array is clean
2443 * and 'events' is odd, we can roll back to the previous clean state */ 2443 * and 'events' is odd, we can roll back to the previous clean state */
2444 if (nospares 2444 if (nospares
2445 && (mddev->in_sync && mddev->recovery_cp == MaxSector) 2445 && (mddev->in_sync && mddev->recovery_cp == MaxSector)
2446 && mddev->can_decrease_events 2446 && mddev->can_decrease_events
2447 && mddev->events != 1) { 2447 && mddev->events != 1) {
2448 mddev->events--; 2448 mddev->events--;
2449 mddev->can_decrease_events = 0; 2449 mddev->can_decrease_events = 0;
2450 } else { 2450 } else {
2451 /* otherwise we have to go forward and ... */ 2451 /* otherwise we have to go forward and ... */
2452 mddev->events ++; 2452 mddev->events ++;
2453 mddev->can_decrease_events = nospares; 2453 mddev->can_decrease_events = nospares;
2454 } 2454 }
2455 2455
2456 if (!mddev->events) { 2456 if (!mddev->events) {
2457 /* 2457 /*
2458 * oops, this 64-bit counter should never wrap. 2458 * oops, this 64-bit counter should never wrap.
2459 * Either we are in around ~1 trillion A.C., assuming 2459 * Either we are in around ~1 trillion A.C., assuming
2460 * 1 reboot per second, or we have a bug: 2460 * 1 reboot per second, or we have a bug:
2461 */ 2461 */
2462 MD_BUG(); 2462 MD_BUG();
2463 mddev->events --; 2463 mddev->events --;
2464 } 2464 }
2465 2465
2466 rdev_for_each(rdev, mddev) { 2466 rdev_for_each(rdev, mddev) {
2467 if (rdev->badblocks.changed) 2467 if (rdev->badblocks.changed)
2468 any_badblocks_changed++; 2468 any_badblocks_changed++;
2469 if (test_bit(Faulty, &rdev->flags)) 2469 if (test_bit(Faulty, &rdev->flags))
2470 set_bit(FaultRecorded, &rdev->flags); 2470 set_bit(FaultRecorded, &rdev->flags);
2471 } 2471 }
2472 2472
2473 sync_sbs(mddev, nospares); 2473 sync_sbs(mddev, nospares);
2474 spin_unlock_irq(&mddev->write_lock); 2474 spin_unlock_irq(&mddev->write_lock);
2475 2475
2476 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n", 2476 pr_debug("md: updating %s RAID superblock on device (in sync %d)\n",
2477 mdname(mddev), mddev->in_sync); 2477 mdname(mddev), mddev->in_sync);
2478 2478
2479 bitmap_update_sb(mddev->bitmap); 2479 bitmap_update_sb(mddev->bitmap);
2480 rdev_for_each(rdev, mddev) { 2480 rdev_for_each(rdev, mddev) {
2481 char b[BDEVNAME_SIZE]; 2481 char b[BDEVNAME_SIZE];
2482 2482
2483 if (rdev->sb_loaded != 1) 2483 if (rdev->sb_loaded != 1)
2484 continue; /* no noise on spare devices */ 2484 continue; /* no noise on spare devices */
2485 2485
2486 if (!test_bit(Faulty, &rdev->flags)) { 2486 if (!test_bit(Faulty, &rdev->flags)) {
2487 md_super_write(mddev,rdev, 2487 md_super_write(mddev,rdev,
2488 rdev->sb_start, rdev->sb_size, 2488 rdev->sb_start, rdev->sb_size,
2489 rdev->sb_page); 2489 rdev->sb_page);
2490 pr_debug("md: (write) %s's sb offset: %llu\n", 2490 pr_debug("md: (write) %s's sb offset: %llu\n",
2491 bdevname(rdev->bdev, b), 2491 bdevname(rdev->bdev, b),
2492 (unsigned long long)rdev->sb_start); 2492 (unsigned long long)rdev->sb_start);
2493 rdev->sb_events = mddev->events; 2493 rdev->sb_events = mddev->events;
2494 if (rdev->badblocks.size) { 2494 if (rdev->badblocks.size) {
2495 md_super_write(mddev, rdev, 2495 md_super_write(mddev, rdev,
2496 rdev->badblocks.sector, 2496 rdev->badblocks.sector,
2497 rdev->badblocks.size << 9, 2497 rdev->badblocks.size << 9,
2498 rdev->bb_page); 2498 rdev->bb_page);
2499 rdev->badblocks.size = 0; 2499 rdev->badblocks.size = 0;
2500 } 2500 }
2501 2501
2502 } else 2502 } else
2503 pr_debug("md: %s (skipping faulty)\n", 2503 pr_debug("md: %s (skipping faulty)\n",
2504 bdevname(rdev->bdev, b)); 2504 bdevname(rdev->bdev, b));
2505 2505
2506 if (mddev->level == LEVEL_MULTIPATH) 2506 if (mddev->level == LEVEL_MULTIPATH)
2507 /* only need to write one superblock... */ 2507 /* only need to write one superblock... */
2508 break; 2508 break;
2509 } 2509 }
2510 md_super_wait(mddev); 2510 md_super_wait(mddev);
2511 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */ 2511 /* if there was a failure, MD_CHANGE_DEVS was set, and we re-write super */
2512 2512
2513 spin_lock_irq(&mddev->write_lock); 2513 spin_lock_irq(&mddev->write_lock);
2514 if (mddev->in_sync != sync_req || 2514 if (mddev->in_sync != sync_req ||
2515 test_bit(MD_CHANGE_DEVS, &mddev->flags)) { 2515 test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
2516 /* have to write it out again */ 2516 /* have to write it out again */
2517 spin_unlock_irq(&mddev->write_lock); 2517 spin_unlock_irq(&mddev->write_lock);
2518 goto repeat; 2518 goto repeat;
2519 } 2519 }
2520 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2520 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2521 spin_unlock_irq(&mddev->write_lock); 2521 spin_unlock_irq(&mddev->write_lock);
2522 wake_up(&mddev->sb_wait); 2522 wake_up(&mddev->sb_wait);
2523 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2523 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2524 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2524 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2525 2525
2526 rdev_for_each(rdev, mddev) { 2526 rdev_for_each(rdev, mddev) {
2527 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2527 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2528 clear_bit(Blocked, &rdev->flags); 2528 clear_bit(Blocked, &rdev->flags);
2529 2529
2530 if (any_badblocks_changed) 2530 if (any_badblocks_changed)
2531 md_ack_all_badblocks(&rdev->badblocks); 2531 md_ack_all_badblocks(&rdev->badblocks);
2532 clear_bit(BlockedBadBlocks, &rdev->flags); 2532 clear_bit(BlockedBadBlocks, &rdev->flags);
2533 wake_up(&rdev->blocked_wait); 2533 wake_up(&rdev->blocked_wait);
2534 } 2534 }
2535 } 2535 }
2536 2536
2537 /* words written to sysfs files may, or may not, be \n terminated. 2537 /* words written to sysfs files may, or may not, be \n terminated.
2538 * We want to accept either case. For this we use cmd_match. 2538 * We want to accept either case. For this we use cmd_match.
2539 */ 2539 */
2540 static int cmd_match(const char *cmd, const char *str) 2540 static int cmd_match(const char *cmd, const char *str)
2541 { 2541 {
2542 /* See if cmd, written into a sysfs file, matches 2542 /* See if cmd, written into a sysfs file, matches
2543 * str. They must either be the same, or cmd can 2543 * str. They must either be the same, or cmd can
2544 * have a trailing newline 2544 * have a trailing newline
2545 */ 2545 */
2546 while (*cmd && *str && *cmd == *str) { 2546 while (*cmd && *str && *cmd == *str) {
2547 cmd++; 2547 cmd++;
2548 str++; 2548 str++;
2549 } 2549 }
2550 if (*cmd == '\n') 2550 if (*cmd == '\n')
2551 cmd++; 2551 cmd++;
2552 if (*str || *cmd) 2552 if (*str || *cmd)
2553 return 0; 2553 return 0;
2554 return 1; 2554 return 1;
2555 } 2555 }
2556 2556
2557 struct rdev_sysfs_entry { 2557 struct rdev_sysfs_entry {
2558 struct attribute attr; 2558 struct attribute attr;
2559 ssize_t (*show)(struct md_rdev *, char *); 2559 ssize_t (*show)(struct md_rdev *, char *);
2560 ssize_t (*store)(struct md_rdev *, const char *, size_t); 2560 ssize_t (*store)(struct md_rdev *, const char *, size_t);
2561 }; 2561 };
2562 2562
2563 static ssize_t 2563 static ssize_t
2564 state_show(struct md_rdev *rdev, char *page) 2564 state_show(struct md_rdev *rdev, char *page)
2565 { 2565 {
2566 char *sep = ""; 2566 char *sep = "";
2567 size_t len = 0; 2567 size_t len = 0;
2568 2568
2569 if (test_bit(Faulty, &rdev->flags) || 2569 if (test_bit(Faulty, &rdev->flags) ||
2570 rdev->badblocks.unacked_exist) { 2570 rdev->badblocks.unacked_exist) {
2571 len+= sprintf(page+len, "%sfaulty",sep); 2571 len+= sprintf(page+len, "%sfaulty",sep);
2572 sep = ","; 2572 sep = ",";
2573 } 2573 }
2574 if (test_bit(In_sync, &rdev->flags)) { 2574 if (test_bit(In_sync, &rdev->flags)) {
2575 len += sprintf(page+len, "%sin_sync",sep); 2575 len += sprintf(page+len, "%sin_sync",sep);
2576 sep = ","; 2576 sep = ",";
2577 } 2577 }
2578 if (test_bit(WriteMostly, &rdev->flags)) { 2578 if (test_bit(WriteMostly, &rdev->flags)) {
2579 len += sprintf(page+len, "%swrite_mostly",sep); 2579 len += sprintf(page+len, "%swrite_mostly",sep);
2580 sep = ","; 2580 sep = ",";
2581 } 2581 }
2582 if (test_bit(Blocked, &rdev->flags) || 2582 if (test_bit(Blocked, &rdev->flags) ||
2583 (rdev->badblocks.unacked_exist 2583 (rdev->badblocks.unacked_exist
2584 && !test_bit(Faulty, &rdev->flags))) { 2584 && !test_bit(Faulty, &rdev->flags))) {
2585 len += sprintf(page+len, "%sblocked", sep); 2585 len += sprintf(page+len, "%sblocked", sep);
2586 sep = ","; 2586 sep = ",";
2587 } 2587 }
2588 if (!test_bit(Faulty, &rdev->flags) && 2588 if (!test_bit(Faulty, &rdev->flags) &&
2589 !test_bit(In_sync, &rdev->flags)) { 2589 !test_bit(In_sync, &rdev->flags)) {
2590 len += sprintf(page+len, "%sspare", sep); 2590 len += sprintf(page+len, "%sspare", sep);
2591 sep = ","; 2591 sep = ",";
2592 } 2592 }
2593 if (test_bit(WriteErrorSeen, &rdev->flags)) { 2593 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2594 len += sprintf(page+len, "%swrite_error", sep); 2594 len += sprintf(page+len, "%swrite_error", sep);
2595 sep = ","; 2595 sep = ",";
2596 } 2596 }
2597 if (test_bit(WantReplacement, &rdev->flags)) { 2597 if (test_bit(WantReplacement, &rdev->flags)) {
2598 len += sprintf(page+len, "%swant_replacement", sep); 2598 len += sprintf(page+len, "%swant_replacement", sep);
2599 sep = ","; 2599 sep = ",";
2600 } 2600 }
2601 if (test_bit(Replacement, &rdev->flags)) { 2601 if (test_bit(Replacement, &rdev->flags)) {
2602 len += sprintf(page+len, "%sreplacement", sep); 2602 len += sprintf(page+len, "%sreplacement", sep);
2603 sep = ","; 2603 sep = ",";
2604 } 2604 }
2605 2605
2606 return len+sprintf(page+len, "\n"); 2606 return len+sprintf(page+len, "\n");
2607 } 2607 }
2608 2608
2609 static ssize_t 2609 static ssize_t
2610 state_store(struct md_rdev *rdev, const char *buf, size_t len) 2610 state_store(struct md_rdev *rdev, const char *buf, size_t len)
2611 { 2611 {
2612 /* can write 2612 /* can write
2613 * faulty - simulates an error 2613 * faulty - simulates an error
2614 * remove - disconnects the device 2614 * remove - disconnects the device
2615 * writemostly - sets write_mostly 2615 * writemostly - sets write_mostly
2616 * -writemostly - clears write_mostly 2616 * -writemostly - clears write_mostly
2617 * blocked - sets the Blocked flags 2617 * blocked - sets the Blocked flags
2618 * -blocked - clears the Blocked and possibly simulates an error 2618 * -blocked - clears the Blocked and possibly simulates an error
2619 * insync - sets Insync providing device isn't active 2619 * insync - sets Insync providing device isn't active
2620 * -insync - clear Insync for a device with a slot assigned, 2620 * -insync - clear Insync for a device with a slot assigned,
2621 * so that it gets rebuilt based on bitmap 2621 * so that it gets rebuilt based on bitmap
2622 * write_error - sets WriteErrorSeen 2622 * write_error - sets WriteErrorSeen
2623 * -write_error - clears WriteErrorSeen 2623 * -write_error - clears WriteErrorSeen
2624 */ 2624 */
2625 int err = -EINVAL; 2625 int err = -EINVAL;
2626 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2626 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
2627 md_error(rdev->mddev, rdev); 2627 md_error(rdev->mddev, rdev);
2628 if (test_bit(Faulty, &rdev->flags)) 2628 if (test_bit(Faulty, &rdev->flags))
2629 err = 0; 2629 err = 0;
2630 else 2630 else
2631 err = -EBUSY; 2631 err = -EBUSY;
2632 } else if (cmd_match(buf, "remove")) { 2632 } else if (cmd_match(buf, "remove")) {
2633 if (rdev->raid_disk >= 0) 2633 if (rdev->raid_disk >= 0)
2634 err = -EBUSY; 2634 err = -EBUSY;
2635 else { 2635 else {
2636 struct mddev *mddev = rdev->mddev; 2636 struct mddev *mddev = rdev->mddev;
2637 kick_rdev_from_array(rdev); 2637 kick_rdev_from_array(rdev);
2638 if (mddev->pers) 2638 if (mddev->pers)
2639 md_update_sb(mddev, 1); 2639 md_update_sb(mddev, 1);
2640 md_new_event(mddev); 2640 md_new_event(mddev);
2641 err = 0; 2641 err = 0;
2642 } 2642 }
2643 } else if (cmd_match(buf, "writemostly")) { 2643 } else if (cmd_match(buf, "writemostly")) {
2644 set_bit(WriteMostly, &rdev->flags); 2644 set_bit(WriteMostly, &rdev->flags);
2645 err = 0; 2645 err = 0;
2646 } else if (cmd_match(buf, "-writemostly")) { 2646 } else if (cmd_match(buf, "-writemostly")) {
2647 clear_bit(WriteMostly, &rdev->flags); 2647 clear_bit(WriteMostly, &rdev->flags);
2648 err = 0; 2648 err = 0;
2649 } else if (cmd_match(buf, "blocked")) { 2649 } else if (cmd_match(buf, "blocked")) {
2650 set_bit(Blocked, &rdev->flags); 2650 set_bit(Blocked, &rdev->flags);
2651 err = 0; 2651 err = 0;
2652 } else if (cmd_match(buf, "-blocked")) { 2652 } else if (cmd_match(buf, "-blocked")) {
2653 if (!test_bit(Faulty, &rdev->flags) && 2653 if (!test_bit(Faulty, &rdev->flags) &&
2654 rdev->badblocks.unacked_exist) { 2654 rdev->badblocks.unacked_exist) {
2655 /* metadata handler doesn't understand badblocks, 2655 /* metadata handler doesn't understand badblocks,
2656 * so we need to fail the device 2656 * so we need to fail the device
2657 */ 2657 */
2658 md_error(rdev->mddev, rdev); 2658 md_error(rdev->mddev, rdev);
2659 } 2659 }
2660 clear_bit(Blocked, &rdev->flags); 2660 clear_bit(Blocked, &rdev->flags);
2661 clear_bit(BlockedBadBlocks, &rdev->flags); 2661 clear_bit(BlockedBadBlocks, &rdev->flags);
2662 wake_up(&rdev->blocked_wait); 2662 wake_up(&rdev->blocked_wait);
2663 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2663 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2664 md_wakeup_thread(rdev->mddev->thread); 2664 md_wakeup_thread(rdev->mddev->thread);
2665 2665
2666 err = 0; 2666 err = 0;
2667 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2667 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2668 set_bit(In_sync, &rdev->flags); 2668 set_bit(In_sync, &rdev->flags);
2669 err = 0; 2669 err = 0;
2670 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) { 2670 } else if (cmd_match(buf, "-insync") && rdev->raid_disk >= 0) {
2671 clear_bit(In_sync, &rdev->flags); 2671 clear_bit(In_sync, &rdev->flags);
2672 rdev->saved_raid_disk = rdev->raid_disk; 2672 rdev->saved_raid_disk = rdev->raid_disk;
2673 rdev->raid_disk = -1; 2673 rdev->raid_disk = -1;
2674 err = 0; 2674 err = 0;
2675 } else if (cmd_match(buf, "write_error")) { 2675 } else if (cmd_match(buf, "write_error")) {
2676 set_bit(WriteErrorSeen, &rdev->flags); 2676 set_bit(WriteErrorSeen, &rdev->flags);
2677 err = 0; 2677 err = 0;
2678 } else if (cmd_match(buf, "-write_error")) { 2678 } else if (cmd_match(buf, "-write_error")) {
2679 clear_bit(WriteErrorSeen, &rdev->flags); 2679 clear_bit(WriteErrorSeen, &rdev->flags);
2680 err = 0; 2680 err = 0;
2681 } else if (cmd_match(buf, "want_replacement")) { 2681 } else if (cmd_match(buf, "want_replacement")) {
2682 /* Any non-spare device that is not a replacement can 2682 /* Any non-spare device that is not a replacement can
2683 * become want_replacement at any time, but we then need to 2683 * become want_replacement at any time, but we then need to
2684 * check if recovery is needed. 2684 * check if recovery is needed.
2685 */ 2685 */
2686 if (rdev->raid_disk >= 0 && 2686 if (rdev->raid_disk >= 0 &&
2687 !test_bit(Replacement, &rdev->flags)) 2687 !test_bit(Replacement, &rdev->flags))
2688 set_bit(WantReplacement, &rdev->flags); 2688 set_bit(WantReplacement, &rdev->flags);
2689 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2689 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2690 md_wakeup_thread(rdev->mddev->thread); 2690 md_wakeup_thread(rdev->mddev->thread);
2691 err = 0; 2691 err = 0;
2692 } else if (cmd_match(buf, "-want_replacement")) { 2692 } else if (cmd_match(buf, "-want_replacement")) {
2693 /* Clearing 'want_replacement' is always allowed. 2693 /* Clearing 'want_replacement' is always allowed.
2694 * Once replacement starts it is too late though. 2694 * Once replacement starts it is too late though.
2695 */ 2695 */
2696 err = 0; 2696 err = 0;
2697 clear_bit(WantReplacement, &rdev->flags); 2697 clear_bit(WantReplacement, &rdev->flags);
2698 } else if (cmd_match(buf, "replacement")) { 2698 } else if (cmd_match(buf, "replacement")) {
2699 /* Can only set a device as a replacement when array has not 2699 /* Can only set a device as a replacement when array has not
2700 * yet been started. Once running, replacement is automatic 2700 * yet been started. Once running, replacement is automatic
2701 * from spares, or by assigning 'slot'. 2701 * from spares, or by assigning 'slot'.
2702 */ 2702 */
2703 if (rdev->mddev->pers) 2703 if (rdev->mddev->pers)
2704 err = -EBUSY; 2704 err = -EBUSY;
2705 else { 2705 else {
2706 set_bit(Replacement, &rdev->flags); 2706 set_bit(Replacement, &rdev->flags);
2707 err = 0; 2707 err = 0;
2708 } 2708 }
2709 } else if (cmd_match(buf, "-replacement")) { 2709 } else if (cmd_match(buf, "-replacement")) {
2710 /* Similarly, can only clear Replacement before start */ 2710 /* Similarly, can only clear Replacement before start */
2711 if (rdev->mddev->pers) 2711 if (rdev->mddev->pers)
2712 err = -EBUSY; 2712 err = -EBUSY;
2713 else { 2713 else {
2714 clear_bit(Replacement, &rdev->flags); 2714 clear_bit(Replacement, &rdev->flags);
2715 err = 0; 2715 err = 0;
2716 } 2716 }
2717 } 2717 }
2718 if (!err) 2718 if (!err)
2719 sysfs_notify_dirent_safe(rdev->sysfs_state); 2719 sysfs_notify_dirent_safe(rdev->sysfs_state);
2720 return err ? err : len; 2720 return err ? err : len;
2721 } 2721 }
2722 static struct rdev_sysfs_entry rdev_state = 2722 static struct rdev_sysfs_entry rdev_state =
2723 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); 2723 __ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store);
2724 2724
2725 static ssize_t 2725 static ssize_t
2726 errors_show(struct md_rdev *rdev, char *page) 2726 errors_show(struct md_rdev *rdev, char *page)
2727 { 2727 {
2728 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors)); 2728 return sprintf(page, "%d\n", atomic_read(&rdev->corrected_errors));
2729 } 2729 }
2730 2730
2731 static ssize_t 2731 static ssize_t
2732 errors_store(struct md_rdev *rdev, const char *buf, size_t len) 2732 errors_store(struct md_rdev *rdev, const char *buf, size_t len)
2733 { 2733 {
2734 char *e; 2734 char *e;
2735 unsigned long n = simple_strtoul(buf, &e, 10); 2735 unsigned long n = simple_strtoul(buf, &e, 10);
2736 if (*buf && (*e == 0 || *e == '\n')) { 2736 if (*buf && (*e == 0 || *e == '\n')) {
2737 atomic_set(&rdev->corrected_errors, n); 2737 atomic_set(&rdev->corrected_errors, n);
2738 return len; 2738 return len;
2739 } 2739 }
2740 return -EINVAL; 2740 return -EINVAL;
2741 } 2741 }
2742 static struct rdev_sysfs_entry rdev_errors = 2742 static struct rdev_sysfs_entry rdev_errors =
2743 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store); 2743 __ATTR(errors, S_IRUGO|S_IWUSR, errors_show, errors_store);
2744 2744
2745 static ssize_t 2745 static ssize_t
2746 slot_show(struct md_rdev *rdev, char *page) 2746 slot_show(struct md_rdev *rdev, char *page)
2747 { 2747 {
2748 if (rdev->raid_disk < 0) 2748 if (rdev->raid_disk < 0)
2749 return sprintf(page, "none\n"); 2749 return sprintf(page, "none\n");
2750 else 2750 else
2751 return sprintf(page, "%d\n", rdev->raid_disk); 2751 return sprintf(page, "%d\n", rdev->raid_disk);
2752 } 2752 }
2753 2753
2754 static ssize_t 2754 static ssize_t
2755 slot_store(struct md_rdev *rdev, const char *buf, size_t len) 2755 slot_store(struct md_rdev *rdev, const char *buf, size_t len)
2756 { 2756 {
2757 char *e; 2757 char *e;
2758 int err; 2758 int err;
2759 int slot = simple_strtoul(buf, &e, 10); 2759 int slot = simple_strtoul(buf, &e, 10);
2760 if (strncmp(buf, "none", 4)==0) 2760 if (strncmp(buf, "none", 4)==0)
2761 slot = -1; 2761 slot = -1;
2762 else if (e==buf || (*e && *e!= '\n')) 2762 else if (e==buf || (*e && *e!= '\n'))
2763 return -EINVAL; 2763 return -EINVAL;
2764 if (rdev->mddev->pers && slot == -1) { 2764 if (rdev->mddev->pers && slot == -1) {
2765 /* Setting 'slot' on an active array requires also 2765 /* Setting 'slot' on an active array requires also
2766 * updating the 'rd%d' link, and communicating 2766 * updating the 'rd%d' link, and communicating
2767 * with the personality with ->hot_*_disk. 2767 * with the personality with ->hot_*_disk.
2768 * For now we only support removing 2768 * For now we only support removing
2769 * failed/spare devices. This normally happens automatically, 2769 * failed/spare devices. This normally happens automatically,
2770 * but not when the metadata is externally managed. 2770 * but not when the metadata is externally managed.
2771 */ 2771 */
2772 if (rdev->raid_disk == -1) 2772 if (rdev->raid_disk == -1)
2773 return -EEXIST; 2773 return -EEXIST;
2774 /* personality does all needed checks */ 2774 /* personality does all needed checks */
2775 if (rdev->mddev->pers->hot_remove_disk == NULL) 2775 if (rdev->mddev->pers->hot_remove_disk == NULL)
2776 return -EINVAL; 2776 return -EINVAL;
2777 clear_bit(Blocked, &rdev->flags); 2777 clear_bit(Blocked, &rdev->flags);
2778 remove_and_add_spares(rdev->mddev, rdev); 2778 remove_and_add_spares(rdev->mddev, rdev);
2779 if (rdev->raid_disk >= 0) 2779 if (rdev->raid_disk >= 0)
2780 return -EBUSY; 2780 return -EBUSY;
2781 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2781 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2782 md_wakeup_thread(rdev->mddev->thread); 2782 md_wakeup_thread(rdev->mddev->thread);
2783 } else if (rdev->mddev->pers) { 2783 } else if (rdev->mddev->pers) {
2784 /* Activating a spare .. or possibly reactivating 2784 /* Activating a spare .. or possibly reactivating
2785 * if we ever get bitmaps working here. 2785 * if we ever get bitmaps working here.
2786 */ 2786 */
2787 2787
2788 if (rdev->raid_disk != -1) 2788 if (rdev->raid_disk != -1)
2789 return -EBUSY; 2789 return -EBUSY;
2790 2790
2791 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery)) 2791 if (test_bit(MD_RECOVERY_RUNNING, &rdev->mddev->recovery))
2792 return -EBUSY; 2792 return -EBUSY;
2793 2793
2794 if (rdev->mddev->pers->hot_add_disk == NULL) 2794 if (rdev->mddev->pers->hot_add_disk == NULL)
2795 return -EINVAL; 2795 return -EINVAL;
2796 2796
2797 if (slot >= rdev->mddev->raid_disks && 2797 if (slot >= rdev->mddev->raid_disks &&
2798 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2798 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2799 return -ENOSPC; 2799 return -ENOSPC;
2800 2800
2801 rdev->raid_disk = slot; 2801 rdev->raid_disk = slot;
2802 if (test_bit(In_sync, &rdev->flags)) 2802 if (test_bit(In_sync, &rdev->flags))
2803 rdev->saved_raid_disk = slot; 2803 rdev->saved_raid_disk = slot;
2804 else 2804 else
2805 rdev->saved_raid_disk = -1; 2805 rdev->saved_raid_disk = -1;
2806 clear_bit(In_sync, &rdev->flags); 2806 clear_bit(In_sync, &rdev->flags);
2807 clear_bit(Bitmap_sync, &rdev->flags); 2807 clear_bit(Bitmap_sync, &rdev->flags);
2808 err = rdev->mddev->pers-> 2808 err = rdev->mddev->pers->
2809 hot_add_disk(rdev->mddev, rdev); 2809 hot_add_disk(rdev->mddev, rdev);
2810 if (err) { 2810 if (err) {
2811 rdev->raid_disk = -1; 2811 rdev->raid_disk = -1;
2812 return err; 2812 return err;
2813 } else 2813 } else
2814 sysfs_notify_dirent_safe(rdev->sysfs_state); 2814 sysfs_notify_dirent_safe(rdev->sysfs_state);
2815 if (sysfs_link_rdev(rdev->mddev, rdev)) 2815 if (sysfs_link_rdev(rdev->mddev, rdev))
2816 /* failure here is OK */; 2816 /* failure here is OK */;
2817 /* don't wakeup anyone, leave that to userspace. */ 2817 /* don't wakeup anyone, leave that to userspace. */
2818 } else { 2818 } else {
2819 if (slot >= rdev->mddev->raid_disks && 2819 if (slot >= rdev->mddev->raid_disks &&
2820 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks) 2820 slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
2821 return -ENOSPC; 2821 return -ENOSPC;
2822 rdev->raid_disk = slot; 2822 rdev->raid_disk = slot;
2823 /* assume it is working */ 2823 /* assume it is working */
2824 clear_bit(Faulty, &rdev->flags); 2824 clear_bit(Faulty, &rdev->flags);
2825 clear_bit(WriteMostly, &rdev->flags); 2825 clear_bit(WriteMostly, &rdev->flags);
2826 set_bit(In_sync, &rdev->flags); 2826 set_bit(In_sync, &rdev->flags);
2827 sysfs_notify_dirent_safe(rdev->sysfs_state); 2827 sysfs_notify_dirent_safe(rdev->sysfs_state);
2828 } 2828 }
2829 return len; 2829 return len;
2830 } 2830 }
2831 2831
2832 2832
2833 static struct rdev_sysfs_entry rdev_slot = 2833 static struct rdev_sysfs_entry rdev_slot =
2834 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store); 2834 __ATTR(slot, S_IRUGO|S_IWUSR, slot_show, slot_store);
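/* Illustrative usage (assuming the usual md sysfs layout, which is not shown
 * in this file excerpt): this attribute appears per component device, e.g.
 * /sys/block/md0/md/dev-sda1/slot.  Writing "none" asks the personality to
 * hot-remove a failed or spare device from a running array; writing a number
 * such as "2" asks it to hot-add the device into that raid slot.
 */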
2835 2835
2836 static ssize_t 2836 static ssize_t
2837 offset_show(struct md_rdev *rdev, char *page) 2837 offset_show(struct md_rdev *rdev, char *page)
2838 { 2838 {
2839 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset); 2839 return sprintf(page, "%llu\n", (unsigned long long)rdev->data_offset);
2840 } 2840 }
2841 2841
2842 static ssize_t 2842 static ssize_t
2843 offset_store(struct md_rdev *rdev, const char *buf, size_t len) 2843 offset_store(struct md_rdev *rdev, const char *buf, size_t len)
2844 { 2844 {
2845 unsigned long long offset; 2845 unsigned long long offset;
2846 if (kstrtoull(buf, 10, &offset) < 0) 2846 if (kstrtoull(buf, 10, &offset) < 0)
2847 return -EINVAL; 2847 return -EINVAL;
2848 if (rdev->mddev->pers && rdev->raid_disk >= 0) 2848 if (rdev->mddev->pers && rdev->raid_disk >= 0)
2849 return -EBUSY; 2849 return -EBUSY;
2850 if (rdev->sectors && rdev->mddev->external) 2850 if (rdev->sectors && rdev->mddev->external)
2851 /* Must set offset before size, so overlap checks 2851 /* Must set offset before size, so overlap checks
2852 * can be sane */ 2852 * can be sane */
2853 return -EBUSY; 2853 return -EBUSY;
2854 rdev->data_offset = offset; 2854 rdev->data_offset = offset;
2855 rdev->new_data_offset = offset; 2855 rdev->new_data_offset = offset;
2856 return len; 2856 return len;
2857 } 2857 }
2858 2858
2859 static struct rdev_sysfs_entry rdev_offset = 2859 static struct rdev_sysfs_entry rdev_offset =
2860 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store); 2860 __ATTR(offset, S_IRUGO|S_IWUSR, offset_show, offset_store);
2861 2861
2862 static ssize_t new_offset_show(struct md_rdev *rdev, char *page) 2862 static ssize_t new_offset_show(struct md_rdev *rdev, char *page)
2863 { 2863 {
2864 return sprintf(page, "%llu\n", 2864 return sprintf(page, "%llu\n",
2865 (unsigned long long)rdev->new_data_offset); 2865 (unsigned long long)rdev->new_data_offset);
2866 } 2866 }
2867 2867
2868 static ssize_t new_offset_store(struct md_rdev *rdev, 2868 static ssize_t new_offset_store(struct md_rdev *rdev,
2869 const char *buf, size_t len) 2869 const char *buf, size_t len)
2870 { 2870 {
2871 unsigned long long new_offset; 2871 unsigned long long new_offset;
2872 struct mddev *mddev = rdev->mddev; 2872 struct mddev *mddev = rdev->mddev;
2873 2873
2874 if (kstrtoull(buf, 10, &new_offset) < 0) 2874 if (kstrtoull(buf, 10, &new_offset) < 0)
2875 return -EINVAL; 2875 return -EINVAL;
2876 2876
2877 if (mddev->sync_thread) 2877 if (mddev->sync_thread)
2878 return -EBUSY; 2878 return -EBUSY;
2879 if (new_offset == rdev->data_offset) 2879 if (new_offset == rdev->data_offset)
2880 /* reset is always permitted */ 2880 /* reset is always permitted */
2881 ; 2881 ;
2882 else if (new_offset > rdev->data_offset) { 2882 else if (new_offset > rdev->data_offset) {
2883 /* must not push array size beyond rdev_sectors */ 2883 /* must not push array size beyond rdev_sectors */
2884 if (new_offset - rdev->data_offset 2884 if (new_offset - rdev->data_offset
2885 + mddev->dev_sectors > rdev->sectors) 2885 + mddev->dev_sectors > rdev->sectors)
2886 return -E2BIG; 2886 return -E2BIG;
2887 } 2887 }
2888 /* Metadata worries about other space details. */ 2888 /* Metadata worries about other space details. */
2889 2889
2890 /* decreasing the offset is inconsistent with a backwards 2890 /* decreasing the offset is inconsistent with a backwards
2891 * reshape. 2891 * reshape.
2892 */ 2892 */
2893 if (new_offset < rdev->data_offset && 2893 if (new_offset < rdev->data_offset &&
2894 mddev->reshape_backwards) 2894 mddev->reshape_backwards)
2895 return -EINVAL; 2895 return -EINVAL;
2896 /* Increasing offset is inconsistent with forwards 2896 /* Increasing offset is inconsistent with forwards
2897 * reshape. reshape_direction should be set to 2897 * reshape. reshape_direction should be set to
2898 * 'backwards' first. 2898 * 'backwards' first.
2899 */ 2899 */
2900 if (new_offset > rdev->data_offset && 2900 if (new_offset > rdev->data_offset &&
2901 !mddev->reshape_backwards) 2901 !mddev->reshape_backwards)
2902 return -EINVAL; 2902 return -EINVAL;
2903 2903
2904 if (mddev->pers && mddev->persistent && 2904 if (mddev->pers && mddev->persistent &&
2905 !super_types[mddev->major_version] 2905 !super_types[mddev->major_version]
2906 .allow_new_offset(rdev, new_offset)) 2906 .allow_new_offset(rdev, new_offset))
2907 return -E2BIG; 2907 return -E2BIG;
2908 rdev->new_data_offset = new_offset; 2908 rdev->new_data_offset = new_offset;
2909 if (new_offset > rdev->data_offset) 2909 if (new_offset > rdev->data_offset)
2910 mddev->reshape_backwards = 1; 2910 mddev->reshape_backwards = 1;
2911 else if (new_offset < rdev->data_offset) 2911 else if (new_offset < rdev->data_offset)
2912 mddev->reshape_backwards = 0; 2912 mddev->reshape_backwards = 0;
2913 2913
2914 return len; 2914 return len;
2915 } 2915 }
2916 static struct rdev_sysfs_entry rdev_new_offset = 2916 static struct rdev_sysfs_entry rdev_new_offset =
2917 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store); 2917 __ATTR(new_offset, S_IRUGO|S_IWUSR, new_offset_show, new_offset_store);
2918 2918
2919 static ssize_t 2919 static ssize_t
2920 rdev_size_show(struct md_rdev *rdev, char *page) 2920 rdev_size_show(struct md_rdev *rdev, char *page)
2921 { 2921 {
2922 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2); 2922 return sprintf(page, "%llu\n", (unsigned long long)rdev->sectors / 2);
2923 } 2923 }
2924 2924
2925 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2) 2925 static int overlaps(sector_t s1, sector_t l1, sector_t s2, sector_t l2)
2926 { 2926 {
2927 /* check if two start/length pairs overlap */ 2927 /* check if two start/length pairs overlap */
2928 if (s1+l1 <= s2) 2928 if (s1+l1 <= s2)
2929 return 0; 2929 return 0;
2930 if (s2+l2 <= s1) 2930 if (s2+l2 <= s1)
2931 return 0; 2931 return 0;
2932 return 1; 2932 return 1;
2933 } 2933 }
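/* Illustrative cases: the ranges are treated as half-open, so
 * overlaps(0, 100, 50, 100) returns 1 (sectors 50..99 are shared),
 * while overlaps(0, 50, 50, 100) returns 0 (the ranges only touch).
 */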
2934 2934
2935 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors) 2935 static int strict_blocks_to_sectors(const char *buf, sector_t *sectors)
2936 { 2936 {
2937 unsigned long long blocks; 2937 unsigned long long blocks;
2938 sector_t new; 2938 sector_t new;
2939 2939
2940 if (kstrtoull(buf, 10, &blocks) < 0) 2940 if (kstrtoull(buf, 10, &blocks) < 0)
2941 return -EINVAL; 2941 return -EINVAL;
2942 2942
2943 if (blocks & 1ULL << (8 * sizeof(blocks) - 1)) 2943 if (blocks & 1ULL << (8 * sizeof(blocks) - 1))
2944 return -EINVAL; /* sector conversion overflow */ 2944 return -EINVAL; /* sector conversion overflow */
2945 2945
2946 new = blocks * 2; 2946 new = blocks * 2;
2947 if (new != blocks * 2) 2947 if (new != blocks * 2)
2948 return -EINVAL; /* unsigned long long to sector_t overflow */ 2948 return -EINVAL; /* unsigned long long to sector_t overflow */
2949 2949
2950 *sectors = new; 2950 *sectors = new;
2951 return 0; 2951 return 0;
2952 } 2952 }
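/* Illustrative case: sizes here are given in 1K blocks, so a buffer of "4"
 * stores 8 in *sectors (eight 512-byte sectors).  A value with the top bit
 * set is rejected up front because doubling it would overflow.
 */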
2953 2953
2954 static ssize_t 2954 static ssize_t
2955 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) 2955 rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2956 { 2956 {
2957 struct mddev *my_mddev = rdev->mddev; 2957 struct mddev *my_mddev = rdev->mddev;
2958 sector_t oldsectors = rdev->sectors; 2958 sector_t oldsectors = rdev->sectors;
2959 sector_t sectors; 2959 sector_t sectors;
2960 2960
2961 if (strict_blocks_to_sectors(buf, &sectors) < 0) 2961 if (strict_blocks_to_sectors(buf, &sectors) < 0)
2962 return -EINVAL; 2962 return -EINVAL;
2963 if (rdev->data_offset != rdev->new_data_offset) 2963 if (rdev->data_offset != rdev->new_data_offset)
2964 return -EINVAL; /* too confusing */ 2964 return -EINVAL; /* too confusing */
2965 if (my_mddev->pers && rdev->raid_disk >= 0) { 2965 if (my_mddev->pers && rdev->raid_disk >= 0) {
2966 if (my_mddev->persistent) { 2966 if (my_mddev->persistent) {
2967 sectors = super_types[my_mddev->major_version]. 2967 sectors = super_types[my_mddev->major_version].
2968 rdev_size_change(rdev, sectors); 2968 rdev_size_change(rdev, sectors);
2969 if (!sectors) 2969 if (!sectors)
2970 return -EBUSY; 2970 return -EBUSY;
2971 } else if (!sectors) 2971 } else if (!sectors)
2972 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) - 2972 sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
2973 rdev->data_offset; 2973 rdev->data_offset;
2974 if (!my_mddev->pers->resize) 2974 if (!my_mddev->pers->resize)
2975 /* Cannot change size for RAID0 or Linear etc */ 2975 /* Cannot change size for RAID0 or Linear etc */
2976 return -EINVAL; 2976 return -EINVAL;
2977 } 2977 }
2978 if (sectors < my_mddev->dev_sectors) 2978 if (sectors < my_mddev->dev_sectors)
2979 return -EINVAL; /* component must fit device */ 2979 return -EINVAL; /* component must fit device */
2980 2980
2981 rdev->sectors = sectors; 2981 rdev->sectors = sectors;
2982 if (sectors > oldsectors && my_mddev->external) { 2982 if (sectors > oldsectors && my_mddev->external) {
2983 /* need to check that all other rdevs with the same ->bdev 2983 /* need to check that all other rdevs with the same ->bdev
2984 * do not overlap. We need to unlock the mddev to avoid 2984 * do not overlap. We need to unlock the mddev to avoid
2985 * a deadlock. We have already changed rdev->sectors, and if 2985 * a deadlock. We have already changed rdev->sectors, and if
2986 * we have to change it back, we will have the lock again. 2986 * we have to change it back, we will have the lock again.
2987 */ 2987 */
2988 struct mddev *mddev; 2988 struct mddev *mddev;
2989 int overlap = 0; 2989 int overlap = 0;
2990 struct list_head *tmp; 2990 struct list_head *tmp;
2991 2991
2992 mddev_unlock(my_mddev); 2992 mddev_unlock(my_mddev);
2993 for_each_mddev(mddev, tmp) { 2993 for_each_mddev(mddev, tmp) {
2994 struct md_rdev *rdev2; 2994 struct md_rdev *rdev2;
2995 2995
2996 mddev_lock_nointr(mddev); 2996 mddev_lock_nointr(mddev);
2997 rdev_for_each(rdev2, mddev) 2997 rdev_for_each(rdev2, mddev)
2998 if (rdev->bdev == rdev2->bdev && 2998 if (rdev->bdev == rdev2->bdev &&
2999 rdev != rdev2 && 2999 rdev != rdev2 &&
3000 overlaps(rdev->data_offset, rdev->sectors, 3000 overlaps(rdev->data_offset, rdev->sectors,
3001 rdev2->data_offset, 3001 rdev2->data_offset,
3002 rdev2->sectors)) { 3002 rdev2->sectors)) {
3003 overlap = 1; 3003 overlap = 1;
3004 break; 3004 break;
3005 } 3005 }
3006 mddev_unlock(mddev); 3006 mddev_unlock(mddev);
3007 if (overlap) { 3007 if (overlap) {
3008 mddev_put(mddev); 3008 mddev_put(mddev);
3009 break; 3009 break;
3010 } 3010 }
3011 } 3011 }
3012 mddev_lock_nointr(my_mddev); 3012 mddev_lock_nointr(my_mddev);
3013 if (overlap) { 3013 if (overlap) {
3014 /* Someone else could have slipped in a size 3014 /* Someone else could have slipped in a size
3015 * change here, but doing so is just silly. 3015 * change here, but doing so is just silly.
3016 * We put oldsectors back because we *know* it is 3016 * We put oldsectors back because we *know* it is
3017 * safe, and trust userspace not to race with 3017 * safe, and trust userspace not to race with
3018 * itself 3018 * itself
3019 */ 3019 */
3020 rdev->sectors = oldsectors; 3020 rdev->sectors = oldsectors;
3021 return -EBUSY; 3021 return -EBUSY;
3022 } 3022 }
3023 } 3023 }
3024 return len; 3024 return len;
3025 } 3025 }
3026 3026
3027 static struct rdev_sysfs_entry rdev_size = 3027 static struct rdev_sysfs_entry rdev_size =
3028 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store); 3028 __ATTR(size, S_IRUGO|S_IWUSR, rdev_size_show, rdev_size_store);
3029 3029
3030 3030
3031 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page) 3031 static ssize_t recovery_start_show(struct md_rdev *rdev, char *page)
3032 { 3032 {
3033 unsigned long long recovery_start = rdev->recovery_offset; 3033 unsigned long long recovery_start = rdev->recovery_offset;
3034 3034
3035 if (test_bit(In_sync, &rdev->flags) || 3035 if (test_bit(In_sync, &rdev->flags) ||
3036 recovery_start == MaxSector) 3036 recovery_start == MaxSector)
3037 return sprintf(page, "none\n"); 3037 return sprintf(page, "none\n");
3038 3038
3039 return sprintf(page, "%llu\n", recovery_start); 3039 return sprintf(page, "%llu\n", recovery_start);
3040 } 3040 }
3041 3041
3042 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len) 3042 static ssize_t recovery_start_store(struct md_rdev *rdev, const char *buf, size_t len)
3043 { 3043 {
3044 unsigned long long recovery_start; 3044 unsigned long long recovery_start;
3045 3045
3046 if (cmd_match(buf, "none")) 3046 if (cmd_match(buf, "none"))
3047 recovery_start = MaxSector; 3047 recovery_start = MaxSector;
3048 else if (kstrtoull(buf, 10, &recovery_start)) 3048 else if (kstrtoull(buf, 10, &recovery_start))
3049 return -EINVAL; 3049 return -EINVAL;
3050 3050
3051 if (rdev->mddev->pers && 3051 if (rdev->mddev->pers &&
3052 rdev->raid_disk >= 0) 3052 rdev->raid_disk >= 0)
3053 return -EBUSY; 3053 return -EBUSY;
3054 3054
3055 rdev->recovery_offset = recovery_start; 3055 rdev->recovery_offset = recovery_start;
3056 if (recovery_start == MaxSector) 3056 if (recovery_start == MaxSector)
3057 set_bit(In_sync, &rdev->flags); 3057 set_bit(In_sync, &rdev->flags);
3058 else 3058 else
3059 clear_bit(In_sync, &rdev->flags); 3059 clear_bit(In_sync, &rdev->flags);
3060 return len; 3060 return len;
3061 } 3061 }
3062 3062
3063 static struct rdev_sysfs_entry rdev_recovery_start = 3063 static struct rdev_sysfs_entry rdev_recovery_start =
3064 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 3064 __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
3065 3065
3066 3066
3067 static ssize_t 3067 static ssize_t
3068 badblocks_show(struct badblocks *bb, char *page, int unack); 3068 badblocks_show(struct badblocks *bb, char *page, int unack);
3069 static ssize_t 3069 static ssize_t
3070 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); 3070 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
3071 3071
3072 static ssize_t bb_show(struct md_rdev *rdev, char *page) 3072 static ssize_t bb_show(struct md_rdev *rdev, char *page)
3073 { 3073 {
3074 return badblocks_show(&rdev->badblocks, page, 0); 3074 return badblocks_show(&rdev->badblocks, page, 0);
3075 } 3075 }
3076 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len) 3076 static ssize_t bb_store(struct md_rdev *rdev, const char *page, size_t len)
3077 { 3077 {
3078 int rv = badblocks_store(&rdev->badblocks, page, len, 0); 3078 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
3079 /* Maybe that ack was all we needed */ 3079 /* Maybe that ack was all we needed */
3080 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) 3080 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
3081 wake_up(&rdev->blocked_wait); 3081 wake_up(&rdev->blocked_wait);
3082 return rv; 3082 return rv;
3083 } 3083 }
3084 static struct rdev_sysfs_entry rdev_bad_blocks = 3084 static struct rdev_sysfs_entry rdev_bad_blocks =
3085 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); 3085 __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
3086 3086
3087 3087
3088 static ssize_t ubb_show(struct md_rdev *rdev, char *page) 3088 static ssize_t ubb_show(struct md_rdev *rdev, char *page)
3089 { 3089 {
3090 return badblocks_show(&rdev->badblocks, page, 1); 3090 return badblocks_show(&rdev->badblocks, page, 1);
3091 } 3091 }
3092 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len) 3092 static ssize_t ubb_store(struct md_rdev *rdev, const char *page, size_t len)
3093 { 3093 {
3094 return badblocks_store(&rdev->badblocks, page, len, 1); 3094 return badblocks_store(&rdev->badblocks, page, len, 1);
3095 } 3095 }
3096 static struct rdev_sysfs_entry rdev_unack_bad_blocks = 3096 static struct rdev_sysfs_entry rdev_unack_bad_blocks =
3097 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); 3097 __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
3098 3098
3099 static struct attribute *rdev_default_attrs[] = { 3099 static struct attribute *rdev_default_attrs[] = {
3100 &rdev_state.attr, 3100 &rdev_state.attr,
3101 &rdev_errors.attr, 3101 &rdev_errors.attr,
3102 &rdev_slot.attr, 3102 &rdev_slot.attr,
3103 &rdev_offset.attr, 3103 &rdev_offset.attr,
3104 &rdev_new_offset.attr, 3104 &rdev_new_offset.attr,
3105 &rdev_size.attr, 3105 &rdev_size.attr,
3106 &rdev_recovery_start.attr, 3106 &rdev_recovery_start.attr,
3107 &rdev_bad_blocks.attr, 3107 &rdev_bad_blocks.attr,
3108 &rdev_unack_bad_blocks.attr, 3108 &rdev_unack_bad_blocks.attr,
3109 NULL, 3109 NULL,
3110 }; 3110 };
3111 static ssize_t 3111 static ssize_t
3112 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 3112 rdev_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
3113 { 3113 {
3114 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3114 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3115 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3115 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3116 struct mddev *mddev = rdev->mddev; 3116 struct mddev *mddev = rdev->mddev;
3117 ssize_t rv; 3117 ssize_t rv;
3118 3118
3119 if (!entry->show) 3119 if (!entry->show)
3120 return -EIO; 3120 return -EIO;
3121 3121
3122 rv = mddev ? mddev_lock(mddev) : -EBUSY; 3122 rv = mddev ? mddev_lock(mddev) : -EBUSY;
3123 if (!rv) { 3123 if (!rv) {
3124 if (rdev->mddev == NULL) 3124 if (rdev->mddev == NULL)
3125 rv = -EBUSY; 3125 rv = -EBUSY;
3126 else 3126 else
3127 rv = entry->show(rdev, page); 3127 rv = entry->show(rdev, page);
3128 mddev_unlock(mddev); 3128 mddev_unlock(mddev);
3129 } 3129 }
3130 return rv; 3130 return rv;
3131 } 3131 }
3132 3132
3133 static ssize_t 3133 static ssize_t
3134 rdev_attr_store(struct kobject *kobj, struct attribute *attr, 3134 rdev_attr_store(struct kobject *kobj, struct attribute *attr,
3135 const char *page, size_t length) 3135 const char *page, size_t length)
3136 { 3136 {
3137 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr); 3137 struct rdev_sysfs_entry *entry = container_of(attr, struct rdev_sysfs_entry, attr);
3138 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj); 3138 struct md_rdev *rdev = container_of(kobj, struct md_rdev, kobj);
3139 ssize_t rv; 3139 ssize_t rv;
3140 struct mddev *mddev = rdev->mddev; 3140 struct mddev *mddev = rdev->mddev;
3141 3141
3142 if (!entry->store) 3142 if (!entry->store)
3143 return -EIO; 3143 return -EIO;
3144 if (!capable(CAP_SYS_ADMIN)) 3144 if (!capable(CAP_SYS_ADMIN))
3145 return -EACCES; 3145 return -EACCES;
3146 rv = mddev ? mddev_lock(mddev): -EBUSY; 3146 rv = mddev ? mddev_lock(mddev): -EBUSY;
3147 if (!rv) { 3147 if (!rv) {
3148 if (rdev->mddev == NULL) 3148 if (rdev->mddev == NULL)
3149 rv = -EBUSY; 3149 rv = -EBUSY;
3150 else 3150 else
3151 rv = entry->store(rdev, page, length); 3151 rv = entry->store(rdev, page, length);
3152 mddev_unlock(mddev); 3152 mddev_unlock(mddev);
3153 } 3153 }
3154 return rv; 3154 return rv;
3155 } 3155 }
3156 3156
3157 static void rdev_free(struct kobject *ko) 3157 static void rdev_free(struct kobject *ko)
3158 { 3158 {
3159 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj); 3159 struct md_rdev *rdev = container_of(ko, struct md_rdev, kobj);
3160 kfree(rdev); 3160 kfree(rdev);
3161 } 3161 }
3162 static const struct sysfs_ops rdev_sysfs_ops = { 3162 static const struct sysfs_ops rdev_sysfs_ops = {
3163 .show = rdev_attr_show, 3163 .show = rdev_attr_show,
3164 .store = rdev_attr_store, 3164 .store = rdev_attr_store,
3165 }; 3165 };
3166 static struct kobj_type rdev_ktype = { 3166 static struct kobj_type rdev_ktype = {
3167 .release = rdev_free, 3167 .release = rdev_free,
3168 .sysfs_ops = &rdev_sysfs_ops, 3168 .sysfs_ops = &rdev_sysfs_ops,
3169 .default_attrs = rdev_default_attrs, 3169 .default_attrs = rdev_default_attrs,
3170 }; 3170 };
3171 3171
3172 int md_rdev_init(struct md_rdev *rdev) 3172 int md_rdev_init(struct md_rdev *rdev)
3173 { 3173 {
3174 rdev->desc_nr = -1; 3174 rdev->desc_nr = -1;
3175 rdev->saved_raid_disk = -1; 3175 rdev->saved_raid_disk = -1;
3176 rdev->raid_disk = -1; 3176 rdev->raid_disk = -1;
3177 rdev->flags = 0; 3177 rdev->flags = 0;
3178 rdev->data_offset = 0; 3178 rdev->data_offset = 0;
3179 rdev->new_data_offset = 0; 3179 rdev->new_data_offset = 0;
3180 rdev->sb_events = 0; 3180 rdev->sb_events = 0;
3181 rdev->last_read_error.tv_sec = 0; 3181 rdev->last_read_error.tv_sec = 0;
3182 rdev->last_read_error.tv_nsec = 0; 3182 rdev->last_read_error.tv_nsec = 0;
3183 rdev->sb_loaded = 0; 3183 rdev->sb_loaded = 0;
3184 rdev->bb_page = NULL; 3184 rdev->bb_page = NULL;
3185 atomic_set(&rdev->nr_pending, 0); 3185 atomic_set(&rdev->nr_pending, 0);
3186 atomic_set(&rdev->read_errors, 0); 3186 atomic_set(&rdev->read_errors, 0);
3187 atomic_set(&rdev->corrected_errors, 0); 3187 atomic_set(&rdev->corrected_errors, 0);
3188 3188
3189 INIT_LIST_HEAD(&rdev->same_set); 3189 INIT_LIST_HEAD(&rdev->same_set);
3190 init_waitqueue_head(&rdev->blocked_wait); 3190 init_waitqueue_head(&rdev->blocked_wait);
3191 3191
3192 /* Add space to store bad block list. 3192 /* Add space to store bad block list.
3193 * This reserves the space even on arrays where it cannot 3193 * This reserves the space even on arrays where it cannot
3194 * be used - I wonder if that matters 3194 * be used - I wonder if that matters
3195 */ 3195 */
3196 rdev->badblocks.count = 0; 3196 rdev->badblocks.count = 0;
3197 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */ 3197 rdev->badblocks.shift = -1; /* disabled until explicitly enabled */
3198 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); 3198 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3199 seqlock_init(&rdev->badblocks.lock); 3199 seqlock_init(&rdev->badblocks.lock);
3200 if (rdev->badblocks.page == NULL) 3200 if (rdev->badblocks.page == NULL)
3201 return -ENOMEM; 3201 return -ENOMEM;
3202 3202
3203 return 0; 3203 return 0;
3204 } 3204 }
3205 EXPORT_SYMBOL_GPL(md_rdev_init); 3205 EXPORT_SYMBOL_GPL(md_rdev_init);
3206 /* 3206 /*
3207 * Import a device. If 'super_format' >= 0, then sanity check the superblock 3207 * Import a device. If 'super_format' >= 0, then sanity check the superblock
3208 * 3208 *
3209 * mark the device faulty if: 3209 * mark the device faulty if:
3210 * 3210 *
3211 * - the device is nonexistent (zero size) 3211 * - the device is nonexistent (zero size)
3212 * - the device has no valid superblock 3212 * - the device has no valid superblock
3213 * 3213 *
3214 * a faulty rdev _never_ has rdev->sb set. 3214 * a faulty rdev _never_ has rdev->sb set.
3215 */ 3215 */
3216 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor) 3216 static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
3217 { 3217 {
3218 char b[BDEVNAME_SIZE]; 3218 char b[BDEVNAME_SIZE];
3219 int err; 3219 int err;
3220 struct md_rdev *rdev; 3220 struct md_rdev *rdev;
3221 sector_t size; 3221 sector_t size;
3222 3222
3223 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL); 3223 rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
3224 if (!rdev) { 3224 if (!rdev) {
3225 printk(KERN_ERR "md: could not alloc mem for new device!\n"); 3225 printk(KERN_ERR "md: could not alloc mem for new device!\n");
3226 return ERR_PTR(-ENOMEM); 3226 return ERR_PTR(-ENOMEM);
3227 } 3227 }
3228 3228
3229 err = md_rdev_init(rdev); 3229 err = md_rdev_init(rdev);
3230 if (err) 3230 if (err)
3231 goto abort_free; 3231 goto abort_free;
3232 err = alloc_disk_sb(rdev); 3232 err = alloc_disk_sb(rdev);
3233 if (err) 3233 if (err)
3234 goto abort_free; 3234 goto abort_free;
3235 3235
3236 err = lock_rdev(rdev, newdev, super_format == -2); 3236 err = lock_rdev(rdev, newdev, super_format == -2);
3237 if (err) 3237 if (err)
3238 goto abort_free; 3238 goto abort_free;
3239 3239
3240 kobject_init(&rdev->kobj, &rdev_ktype); 3240 kobject_init(&rdev->kobj, &rdev_ktype);
3241 3241
3242 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS; 3242 size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
3243 if (!size) { 3243 if (!size) {
3244 printk(KERN_WARNING 3244 printk(KERN_WARNING
3245 "md: %s has zero or unknown size, marking faulty!\n", 3245 "md: %s has zero or unknown size, marking faulty!\n",
3246 bdevname(rdev->bdev,b)); 3246 bdevname(rdev->bdev,b));
3247 err = -EINVAL; 3247 err = -EINVAL;
3248 goto abort_free; 3248 goto abort_free;
3249 } 3249 }
3250 3250
3251 if (super_format >= 0) { 3251 if (super_format >= 0) {
3252 err = super_types[super_format]. 3252 err = super_types[super_format].
3253 load_super(rdev, NULL, super_minor); 3253 load_super(rdev, NULL, super_minor);
3254 if (err == -EINVAL) { 3254 if (err == -EINVAL) {
3255 printk(KERN_WARNING 3255 printk(KERN_WARNING
3256 "md: %s does not have a valid v%d.%d " 3256 "md: %s does not have a valid v%d.%d "
3257 "superblock, not importing!\n", 3257 "superblock, not importing!\n",
3258 bdevname(rdev->bdev,b), 3258 bdevname(rdev->bdev,b),
3259 super_format, super_minor); 3259 super_format, super_minor);
3260 goto abort_free; 3260 goto abort_free;
3261 } 3261 }
3262 if (err < 0) { 3262 if (err < 0) {
3263 printk(KERN_WARNING 3263 printk(KERN_WARNING
3264 "md: could not read %s's sb, not importing!\n", 3264 "md: could not read %s's sb, not importing!\n",
3265 bdevname(rdev->bdev,b)); 3265 bdevname(rdev->bdev,b));
3266 goto abort_free; 3266 goto abort_free;
3267 } 3267 }
3268 } 3268 }
3269 3269
3270 return rdev; 3270 return rdev;
3271 3271
3272 abort_free: 3272 abort_free:
3273 if (rdev->bdev) 3273 if (rdev->bdev)
3274 unlock_rdev(rdev); 3274 unlock_rdev(rdev);
3275 md_rdev_clear(rdev); 3275 md_rdev_clear(rdev);
3276 kfree(rdev); 3276 kfree(rdev);
3277 return ERR_PTR(err); 3277 return ERR_PTR(err);
3278 } 3278 }
3279 3279
3280 /* 3280 /*
3281 * Check a full RAID array for plausibility 3281 * Check a full RAID array for plausibility
3282 */ 3282 */
3283 3283
3284 3284
3285 static void analyze_sbs(struct mddev * mddev) 3285 static void analyze_sbs(struct mddev * mddev)
3286 { 3286 {
3287 int i; 3287 int i;
3288 struct md_rdev *rdev, *freshest, *tmp; 3288 struct md_rdev *rdev, *freshest, *tmp;
3289 char b[BDEVNAME_SIZE]; 3289 char b[BDEVNAME_SIZE];
3290 3290
3291 freshest = NULL; 3291 freshest = NULL;
3292 rdev_for_each_safe(rdev, tmp, mddev) 3292 rdev_for_each_safe(rdev, tmp, mddev)
3293 switch (super_types[mddev->major_version]. 3293 switch (super_types[mddev->major_version].
3294 load_super(rdev, freshest, mddev->minor_version)) { 3294 load_super(rdev, freshest, mddev->minor_version)) {
3295 case 1: 3295 case 1:
3296 freshest = rdev; 3296 freshest = rdev;
3297 break; 3297 break;
3298 case 0: 3298 case 0:
3299 break; 3299 break;
3300 default: 3300 default:
3301 printk( KERN_ERR \ 3301 printk( KERN_ERR \
3302 "md: fatal superblock inconsistency in %s" 3302 "md: fatal superblock inconsistency in %s"
3303 " -- removing from array\n", 3303 " -- removing from array\n",
3304 bdevname(rdev->bdev,b)); 3304 bdevname(rdev->bdev,b));
3305 kick_rdev_from_array(rdev); 3305 kick_rdev_from_array(rdev);
3306 } 3306 }
3307 3307
3308 3308
3309 super_types[mddev->major_version]. 3309 super_types[mddev->major_version].
3310 validate_super(mddev, freshest); 3310 validate_super(mddev, freshest);
3311 3311
3312 i = 0; 3312 i = 0;
3313 rdev_for_each_safe(rdev, tmp, mddev) { 3313 rdev_for_each_safe(rdev, tmp, mddev) {
3314 if (mddev->max_disks && 3314 if (mddev->max_disks &&
3315 (rdev->desc_nr >= mddev->max_disks || 3315 (rdev->desc_nr >= mddev->max_disks ||
3316 i > mddev->max_disks)) { 3316 i > mddev->max_disks)) {
3317 printk(KERN_WARNING 3317 printk(KERN_WARNING
3318 "md: %s: %s: only %d devices permitted\n", 3318 "md: %s: %s: only %d devices permitted\n",
3319 mdname(mddev), bdevname(rdev->bdev, b), 3319 mdname(mddev), bdevname(rdev->bdev, b),
3320 mddev->max_disks); 3320 mddev->max_disks);
3321 kick_rdev_from_array(rdev); 3321 kick_rdev_from_array(rdev);
3322 continue; 3322 continue;
3323 } 3323 }
3324 if (rdev != freshest) 3324 if (rdev != freshest)
3325 if (super_types[mddev->major_version]. 3325 if (super_types[mddev->major_version].
3326 validate_super(mddev, rdev)) { 3326 validate_super(mddev, rdev)) {
3327 printk(KERN_WARNING "md: kicking non-fresh %s" 3327 printk(KERN_WARNING "md: kicking non-fresh %s"
3328 " from array!\n", 3328 " from array!\n",
3329 bdevname(rdev->bdev,b)); 3329 bdevname(rdev->bdev,b));
3330 kick_rdev_from_array(rdev); 3330 kick_rdev_from_array(rdev);
3331 continue; 3331 continue;
3332 } 3332 }
3333 if (mddev->level == LEVEL_MULTIPATH) { 3333 if (mddev->level == LEVEL_MULTIPATH) {
3334 rdev->desc_nr = i++; 3334 rdev->desc_nr = i++;
3335 rdev->raid_disk = rdev->desc_nr; 3335 rdev->raid_disk = rdev->desc_nr;
3336 set_bit(In_sync, &rdev->flags); 3336 set_bit(In_sync, &rdev->flags);
3337 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) { 3337 } else if (rdev->raid_disk >= (mddev->raid_disks - min(0, mddev->delta_disks))) {
3338 rdev->raid_disk = -1; 3338 rdev->raid_disk = -1;
3339 clear_bit(In_sync, &rdev->flags); 3339 clear_bit(In_sync, &rdev->flags);
3340 } 3340 }
3341 } 3341 }
3342 } 3342 }
3343 3343
3344 /* Read a fixed-point number. 3344 /* Read a fixed-point number.
3345 * Numbers in sysfs attributes should be in "standard" units where 3345 * Numbers in sysfs attributes should be in "standard" units where
3346 * possible, so time should be in seconds. 3346 * possible, so time should be in seconds.
3347 * However we internally use a much smaller unit such as 3347 * However we internally use a much smaller unit such as
3348 * milliseconds or jiffies. 3348 * milliseconds or jiffies.
3349 * This function takes a decimal number with a possible fractional 3349 * This function takes a decimal number with a possible fractional
3350 * component, and produces an integer which is the result of 3350 * component, and produces an integer which is the result of
3351 * multiplying that number by 10^'scale', 3351 * multiplying that number by 10^'scale',
3352 * all without any floating-point arithmetic. 3352 * all without any floating-point arithmetic.
3353 */ 3353 */
3354 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale) 3354 int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
3355 { 3355 {
3356 unsigned long result = 0; 3356 unsigned long result = 0;
3357 long decimals = -1; 3357 long decimals = -1;
3358 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) { 3358 while (isdigit(*cp) || (*cp == '.' && decimals < 0)) {
3359 if (*cp == '.') 3359 if (*cp == '.')
3360 decimals = 0; 3360 decimals = 0;
3361 else if (decimals < scale) { 3361 else if (decimals < scale) {
3362 unsigned int value; 3362 unsigned int value;
3363 value = *cp - '0'; 3363 value = *cp - '0';
3364 result = result * 10 + value; 3364 result = result * 10 + value;
3365 if (decimals >= 0) 3365 if (decimals >= 0)
3366 decimals++; 3366 decimals++;
3367 } 3367 }
3368 cp++; 3368 cp++;
3369 } 3369 }
3370 if (*cp == '\n') 3370 if (*cp == '\n')
3371 cp++; 3371 cp++;
3372 if (*cp) 3372 if (*cp)
3373 return -EINVAL; 3373 return -EINVAL;
3374 if (decimals < 0) 3374 if (decimals < 0)
3375 decimals = 0; 3375 decimals = 0;
3376 while (decimals < scale) { 3376 while (decimals < scale) {
3377 result *= 10; 3377 result *= 10;
3378 decimals ++; 3378 decimals ++;
3379 } 3379 }
3380 *res = result; 3380 *res = result;
3381 return 0; 3381 return 0;
3382 } 3382 }
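/* Worked example: strict_strtoul_scaled("1.50", &res, 3) accumulates 150
 * from the digits, then multiplies by 10 once more to pad to three decimal
 * places, leaving res == 1500, i.e. 1.50 seconds expressed in milliseconds.
 */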
3383 3383
3384 3384
3385 static void md_safemode_timeout(unsigned long data); 3385 static void md_safemode_timeout(unsigned long data);
3386 3386
3387 static ssize_t 3387 static ssize_t
3388 safe_delay_show(struct mddev *mddev, char *page) 3388 safe_delay_show(struct mddev *mddev, char *page)
3389 { 3389 {
3390 int msec = (mddev->safemode_delay*1000)/HZ; 3390 int msec = (mddev->safemode_delay*1000)/HZ;
3391 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000); 3391 return sprintf(page, "%d.%03d\n", msec/1000, msec%1000);
3392 } 3392 }
3393 static ssize_t 3393 static ssize_t
3394 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len) 3394 safe_delay_store(struct mddev *mddev, const char *cbuf, size_t len)
3395 { 3395 {
3396 unsigned long msec; 3396 unsigned long msec;
3397 3397
3398 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0) 3398 if (strict_strtoul_scaled(cbuf, &msec, 3) < 0)
3399 return -EINVAL; 3399 return -EINVAL;
3400 if (msec == 0) 3400 if (msec == 0)
3401 mddev->safemode_delay = 0; 3401 mddev->safemode_delay = 0;
3402 else { 3402 else {
3403 unsigned long old_delay = mddev->safemode_delay; 3403 unsigned long old_delay = mddev->safemode_delay;
3404 mddev->safemode_delay = (msec*HZ)/1000; 3404 mddev->safemode_delay = (msec*HZ)/1000;
3405 if (mddev->safemode_delay == 0) 3405 if (mddev->safemode_delay == 0)
3406 mddev->safemode_delay = 1; 3406 mddev->safemode_delay = 1;
3407 if (mddev->safemode_delay < old_delay || old_delay == 0) 3407 if (mddev->safemode_delay < old_delay || old_delay == 0)
3408 md_safemode_timeout((unsigned long)mddev); 3408 md_safemode_timeout((unsigned long)mddev);
3409 } 3409 }
3410 return len; 3410 return len;
3411 } 3411 }
3412 static struct md_sysfs_entry md_safe_delay = 3412 static struct md_sysfs_entry md_safe_delay =
3413 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store); 3413 __ATTR(safe_mode_delay, S_IRUGO|S_IWUSR,safe_delay_show, safe_delay_store);
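/* Usage sketch (the sysfs path is assumed, not shown in this excerpt):
 * writing "0.200" to /sys/block/mdX/md/safe_mode_delay is parsed by
 * strict_strtoul_scaled() into 200 and stored as (200*HZ)/1000 jiffies
 * in mddev->safemode_delay.
 */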
3414 3414
3415 static ssize_t 3415 static ssize_t
3416 level_show(struct mddev *mddev, char *page) 3416 level_show(struct mddev *mddev, char *page)
3417 { 3417 {
3418 struct md_personality *p = mddev->pers; 3418 struct md_personality *p = mddev->pers;
3419 if (p) 3419 if (p)
3420 return sprintf(page, "%s\n", p->name); 3420 return sprintf(page, "%s\n", p->name);
3421 else if (mddev->clevel[0]) 3421 else if (mddev->clevel[0])
3422 return sprintf(page, "%s\n", mddev->clevel); 3422 return sprintf(page, "%s\n", mddev->clevel);
3423 else if (mddev->level != LEVEL_NONE) 3423 else if (mddev->level != LEVEL_NONE)
3424 return sprintf(page, "%d\n", mddev->level); 3424 return sprintf(page, "%d\n", mddev->level);
3425 else 3425 else
3426 return 0; 3426 return 0;
3427 } 3427 }
3428 3428
3429 static ssize_t 3429 static ssize_t
3430 level_store(struct mddev *mddev, const char *buf, size_t len) 3430 level_store(struct mddev *mddev, const char *buf, size_t len)
3431 { 3431 {
3432 char clevel[16]; 3432 char clevel[16];
3433 ssize_t rv = len; 3433 ssize_t rv = len;
3434 struct md_personality *pers; 3434 struct md_personality *pers;
3435 long level; 3435 long level;
3436 void *priv; 3436 void *priv;
3437 struct md_rdev *rdev; 3437 struct md_rdev *rdev;
3438 3438
3439 if (mddev->pers == NULL) { 3439 if (mddev->pers == NULL) {
3440 if (len == 0) 3440 if (len == 0)
3441 return 0; 3441 return 0;
3442 if (len >= sizeof(mddev->clevel)) 3442 if (len >= sizeof(mddev->clevel))
3443 return -ENOSPC; 3443 return -ENOSPC;
3444 strncpy(mddev->clevel, buf, len); 3444 strncpy(mddev->clevel, buf, len);
3445 if (mddev->clevel[len-1] == '\n') 3445 if (mddev->clevel[len-1] == '\n')
3446 len--; 3446 len--;
3447 mddev->clevel[len] = 0; 3447 mddev->clevel[len] = 0;
3448 mddev->level = LEVEL_NONE; 3448 mddev->level = LEVEL_NONE;
3449 return rv; 3449 return rv;
3450 } 3450 }
3451 3451
3452 /* request to change the personality. Need to ensure: 3452 /* request to change the personality. Need to ensure:
3453 * - array is not engaged in resync/recovery/reshape 3453 * - array is not engaged in resync/recovery/reshape
3454 * - old personality can be suspended 3454 * - old personality can be suspended
3455 * - new personality will access other array. 3455 * - new personality will access other array.
3456 */ 3456 */
3457 3457
3458 if (mddev->sync_thread || 3458 if (mddev->sync_thread ||
3459 mddev->reshape_position != MaxSector || 3459 mddev->reshape_position != MaxSector ||
3460 mddev->sysfs_active) 3460 mddev->sysfs_active)
3461 return -EBUSY; 3461 return -EBUSY;
3462 3462
3463 if (!mddev->pers->quiesce) { 3463 if (!mddev->pers->quiesce) {
3464 printk(KERN_WARNING "md: %s: %s does not support online personality change\n", 3464 printk(KERN_WARNING "md: %s: %s does not support online personality change\n",
3465 mdname(mddev), mddev->pers->name); 3465 mdname(mddev), mddev->pers->name);
3466 return -EINVAL; 3466 return -EINVAL;
3467 } 3467 }
3468 3468
3469 /* Now find the new personality */ 3469 /* Now find the new personality */
3470 if (len == 0 || len >= sizeof(clevel)) 3470 if (len == 0 || len >= sizeof(clevel))
3471 return -EINVAL; 3471 return -EINVAL;
3472 strncpy(clevel, buf, len); 3472 strncpy(clevel, buf, len);
3473 if (clevel[len-1] == '\n') 3473 if (clevel[len-1] == '\n')
3474 len--; 3474 len--;
3475 clevel[len] = 0; 3475 clevel[len] = 0;
3476 if (kstrtol(clevel, 10, &level)) 3476 if (kstrtol(clevel, 10, &level))
3477 level = LEVEL_NONE; 3477 level = LEVEL_NONE;
3478 3478
3479 if (request_module("md-%s", clevel) != 0) 3479 if (request_module("md-%s", clevel) != 0)
3480 request_module("md-level-%s", clevel); 3480 request_module("md-level-%s", clevel);
3481 spin_lock(&pers_lock); 3481 spin_lock(&pers_lock);
3482 pers = find_pers(level, clevel); 3482 pers = find_pers(level, clevel);
3483 if (!pers || !try_module_get(pers->owner)) { 3483 if (!pers || !try_module_get(pers->owner)) {
3484 spin_unlock(&pers_lock); 3484 spin_unlock(&pers_lock);
3485 printk(KERN_WARNING "md: personality %s not loaded\n", clevel); 3485 printk(KERN_WARNING "md: personality %s not loaded\n", clevel);
3486 return -EINVAL; 3486 return -EINVAL;
3487 } 3487 }
3488 spin_unlock(&pers_lock); 3488 spin_unlock(&pers_lock);
3489 3489
3490 if (pers == mddev->pers) { 3490 if (pers == mddev->pers) {
3491 /* Nothing to do! */ 3491 /* Nothing to do! */
3492 module_put(pers->owner); 3492 module_put(pers->owner);
3493 return rv; 3493 return rv;
3494 } 3494 }
3495 if (!pers->takeover) { 3495 if (!pers->takeover) {
3496 module_put(pers->owner); 3496 module_put(pers->owner);
3497 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n", 3497 printk(KERN_WARNING "md: %s: %s does not support personality takeover\n",
3498 mdname(mddev), clevel); 3498 mdname(mddev), clevel);
3499 return -EINVAL; 3499 return -EINVAL;
3500 } 3500 }
3501 3501
3502 rdev_for_each(rdev, mddev) 3502 rdev_for_each(rdev, mddev)
3503 rdev->new_raid_disk = rdev->raid_disk; 3503 rdev->new_raid_disk = rdev->raid_disk;
3504 3504
3505 /* ->takeover must set new_* and/or delta_disks 3505 /* ->takeover must set new_* and/or delta_disks
3506 * if it succeeds, and may set them when it fails. 3506 * if it succeeds, and may set them when it fails.
3507 */ 3507 */
3508 priv = pers->takeover(mddev); 3508 priv = pers->takeover(mddev);
3509 if (IS_ERR(priv)) { 3509 if (IS_ERR(priv)) {
3510 mddev->new_level = mddev->level; 3510 mddev->new_level = mddev->level;
3511 mddev->new_layout = mddev->layout; 3511 mddev->new_layout = mddev->layout;
3512 mddev->new_chunk_sectors = mddev->chunk_sectors; 3512 mddev->new_chunk_sectors = mddev->chunk_sectors;
3513 mddev->raid_disks -= mddev->delta_disks; 3513 mddev->raid_disks -= mddev->delta_disks;
3514 mddev->delta_disks = 0; 3514 mddev->delta_disks = 0;
3515 mddev->reshape_backwards = 0; 3515 mddev->reshape_backwards = 0;
3516 module_put(pers->owner); 3516 module_put(pers->owner);
3517 printk(KERN_WARNING "md: %s: %s would not accept array\n", 3517 printk(KERN_WARNING "md: %s: %s would not accept array\n",
3518 mdname(mddev), clevel); 3518 mdname(mddev), clevel);
3519 return PTR_ERR(priv); 3519 return PTR_ERR(priv);
3520 } 3520 }
3521 3521
3522 /* Looks like we have a winner */ 3522 /* Looks like we have a winner */
3523 mddev_suspend(mddev); 3523 mddev_suspend(mddev);
3524 mddev->pers->stop(mddev); 3524 mddev->pers->stop(mddev);
3525 3525
3526 if (mddev->pers->sync_request == NULL && 3526 if (mddev->pers->sync_request == NULL &&
3527 pers->sync_request != NULL) { 3527 pers->sync_request != NULL) {
3528 /* need to add the md_redundancy_group */ 3528 /* need to add the md_redundancy_group */
3529 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 3529 if (sysfs_create_group(&mddev->kobj, &md_redundancy_group))
3530 printk(KERN_WARNING 3530 printk(KERN_WARNING
3531 "md: cannot register extra attributes for %s\n", 3531 "md: cannot register extra attributes for %s\n",
3532 mdname(mddev)); 3532 mdname(mddev));
3533 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action"); 3533 mddev->sysfs_action = sysfs_get_dirent(mddev->kobj.sd, "sync_action");
3534 } 3534 }
3535 if (mddev->pers->sync_request != NULL && 3535 if (mddev->pers->sync_request != NULL &&
3536 pers->sync_request == NULL) { 3536 pers->sync_request == NULL) {
3537 /* need to remove the md_redundancy_group */ 3537 /* need to remove the md_redundancy_group */
3538 if (mddev->to_remove == NULL) 3538 if (mddev->to_remove == NULL)
3539 mddev->to_remove = &md_redundancy_group; 3539 mddev->to_remove = &md_redundancy_group;
3540 } 3540 }
3541 3541
3542 if (mddev->pers->sync_request == NULL && 3542 if (mddev->pers->sync_request == NULL &&
3543 mddev->external) { 3543 mddev->external) {
3544 /* We are converting from a no-redundancy array 3544 /* We are converting from a no-redundancy array
3545 * to a redundancy array and metadata is managed 3545 * to a redundancy array and metadata is managed
3546 * externally so we need to be sure that writes 3546 * externally so we need to be sure that writes
3547 * won't block due to a need to transition 3547 * won't block due to a need to transition
3548 * clean->dirty 3548 * clean->dirty
3549 * until external management is started. 3549 * until external management is started.
3550 */ 3550 */
3551 mddev->in_sync = 0; 3551 mddev->in_sync = 0;
3552 mddev->safemode_delay = 0; 3552 mddev->safemode_delay = 0;
3553 mddev->safemode = 0; 3553 mddev->safemode = 0;
3554 } 3554 }
3555 3555
3556 rdev_for_each(rdev, mddev) { 3556 rdev_for_each(rdev, mddev) {
3557 if (rdev->raid_disk < 0) 3557 if (rdev->raid_disk < 0)
3558 continue; 3558 continue;
3559 if (rdev->new_raid_disk >= mddev->raid_disks) 3559 if (rdev->new_raid_disk >= mddev->raid_disks)
3560 rdev->new_raid_disk = -1; 3560 rdev->new_raid_disk = -1;
3561 if (rdev->new_raid_disk == rdev->raid_disk) 3561 if (rdev->new_raid_disk == rdev->raid_disk)
3562 continue; 3562 continue;
3563 sysfs_unlink_rdev(mddev, rdev); 3563 sysfs_unlink_rdev(mddev, rdev);
3564 } 3564 }
3565 rdev_for_each(rdev, mddev) { 3565 rdev_for_each(rdev, mddev) {
3566 if (rdev->raid_disk < 0) 3566 if (rdev->raid_disk < 0)
3567 continue; 3567 continue;
3568 if (rdev->new_raid_disk == rdev->raid_disk) 3568 if (rdev->new_raid_disk == rdev->raid_disk)
3569 continue; 3569 continue;
3570 rdev->raid_disk = rdev->new_raid_disk; 3570 rdev->raid_disk = rdev->new_raid_disk;
3571 if (rdev->raid_disk < 0) 3571 if (rdev->raid_disk < 0)
3572 clear_bit(In_sync, &rdev->flags); 3572 clear_bit(In_sync, &rdev->flags);
3573 else { 3573 else {
3574 if (sysfs_link_rdev(mddev, rdev)) 3574 if (sysfs_link_rdev(mddev, rdev))
3575 printk(KERN_WARNING "md: cannot register rd%d" 3575 printk(KERN_WARNING "md: cannot register rd%d"
3576 " for %s after level change\n", 3576 " for %s after level change\n",
3577 rdev->raid_disk, mdname(mddev)); 3577 rdev->raid_disk, mdname(mddev));
3578 } 3578 }
3579 } 3579 }
3580 3580
3581 module_put(mddev->pers->owner); 3581 module_put(mddev->pers->owner);
3582 mddev->pers = pers; 3582 mddev->pers = pers;
3583 mddev->private = priv; 3583 mddev->private = priv;
3584 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 3584 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
3585 mddev->level = mddev->new_level; 3585 mddev->level = mddev->new_level;
3586 mddev->layout = mddev->new_layout; 3586 mddev->layout = mddev->new_layout;
3587 mddev->chunk_sectors = mddev->new_chunk_sectors; 3587 mddev->chunk_sectors = mddev->new_chunk_sectors;
3588 mddev->delta_disks = 0; 3588 mddev->delta_disks = 0;
3589 mddev->reshape_backwards = 0; 3589 mddev->reshape_backwards = 0;
3590 mddev->degraded = 0; 3590 mddev->degraded = 0;
3591 if (mddev->pers->sync_request == NULL) { 3591 if (mddev->pers->sync_request == NULL) {
3592 /* this is now an array without redundancy, so 3592 /* this is now an array without redundancy, so
3593 * it must always be in_sync 3593 * it must always be in_sync
3594 */ 3594 */
3595 mddev->in_sync = 1; 3595 mddev->in_sync = 1;
3596 del_timer_sync(&mddev->safemode_timer); 3596 del_timer_sync(&mddev->safemode_timer);
3597 } 3597 }
3598 blk_set_stacking_limits(&mddev->queue->limits); 3598 blk_set_stacking_limits(&mddev->queue->limits);
3599 pers->run(mddev); 3599 pers->run(mddev);
3600 set_bit(MD_CHANGE_DEVS, &mddev->flags); 3600 set_bit(MD_CHANGE_DEVS, &mddev->flags);
3601 mddev_resume(mddev); 3601 mddev_resume(mddev);
3602 if (!mddev->thread) 3602 if (!mddev->thread)
3603 md_update_sb(mddev, 1); 3603 md_update_sb(mddev, 1);
3604 sysfs_notify(&mddev->kobj, NULL, "level"); 3604 sysfs_notify(&mddev->kobj, NULL, "level");
3605 md_new_event(mddev); 3605 md_new_event(mddev);
3606 return rv; 3606 return rv;
3607 } 3607 }
3608 3608
3609 static struct md_sysfs_entry md_level = 3609 static struct md_sysfs_entry md_level =
3610 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store); 3610 __ATTR(level, S_IRUGO|S_IWUSR, level_show, level_store);
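/* Usage sketch (sysfs path assumed): writing a personality name such as
 * "raid6" to /sys/block/mdX/md/level on a running array requests the
 * matching "md-raid6" module alias if needed and attempts an in-place
 * takeover through pers->takeover(); on an array that has not been started
 * yet it only records the requested level in mddev->clevel.
 */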
3611 3611
3612 3612
3613 static ssize_t 3613 static ssize_t
3614 layout_show(struct mddev *mddev, char *page) 3614 layout_show(struct mddev *mddev, char *page)
3615 { 3615 {
3616 /* just a number, not meaningful for all levels */ 3616 /* just a number, not meaningful for all levels */
3617 if (mddev->reshape_position != MaxSector && 3617 if (mddev->reshape_position != MaxSector &&
3618 mddev->layout != mddev->new_layout) 3618 mddev->layout != mddev->new_layout)
3619 return sprintf(page, "%d (%d)\n", 3619 return sprintf(page, "%d (%d)\n",
3620 mddev->new_layout, mddev->layout); 3620 mddev->new_layout, mddev->layout);
3621 return sprintf(page, "%d\n", mddev->layout); 3621 return sprintf(page, "%d\n", mddev->layout);
3622 } 3622 }
3623 3623
3624 static ssize_t 3624 static ssize_t
3625 layout_store(struct mddev *mddev, const char *buf, size_t len) 3625 layout_store(struct mddev *mddev, const char *buf, size_t len)
3626 { 3626 {
3627 char *e; 3627 char *e;
3628 unsigned long n = simple_strtoul(buf, &e, 10); 3628 unsigned long n = simple_strtoul(buf, &e, 10);
3629 3629
3630 if (!*buf || (*e && *e != '\n')) 3630 if (!*buf || (*e && *e != '\n'))
3631 return -EINVAL; 3631 return -EINVAL;
3632 3632
3633 if (mddev->pers) { 3633 if (mddev->pers) {
3634 int err; 3634 int err;
3635 if (mddev->pers->check_reshape == NULL) 3635 if (mddev->pers->check_reshape == NULL)
3636 return -EBUSY; 3636 return -EBUSY;
3637 mddev->new_layout = n; 3637 mddev->new_layout = n;
3638 err = mddev->pers->check_reshape(mddev); 3638 err = mddev->pers->check_reshape(mddev);
3639 if (err) { 3639 if (err) {
3640 mddev->new_layout = mddev->layout; 3640 mddev->new_layout = mddev->layout;
3641 return err; 3641 return err;
3642 } 3642 }
3643 } else { 3643 } else {
3644 mddev->new_layout = n; 3644 mddev->new_layout = n;
3645 if (mddev->reshape_position == MaxSector) 3645 if (mddev->reshape_position == MaxSector)
3646 mddev->layout = n; 3646 mddev->layout = n;
3647 } 3647 }
3648 return len; 3648 return len;
3649 } 3649 }
3650 static struct md_sysfs_entry md_layout = 3650 static struct md_sysfs_entry md_layout =
3651 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store); 3651 __ATTR(layout, S_IRUGO|S_IWUSR, layout_show, layout_store);
3652 3652
3653 3653
3654 static ssize_t 3654 static ssize_t
3655 raid_disks_show(struct mddev *mddev, char *page) 3655 raid_disks_show(struct mddev *mddev, char *page)
3656 { 3656 {
3657 if (mddev->raid_disks == 0) 3657 if (mddev->raid_disks == 0)
3658 return 0; 3658 return 0;
3659 if (mddev->reshape_position != MaxSector && 3659 if (mddev->reshape_position != MaxSector &&
3660 mddev->delta_disks != 0) 3660 mddev->delta_disks != 0)
3661 return sprintf(page, "%d (%d)\n", mddev->raid_disks, 3661 return sprintf(page, "%d (%d)\n", mddev->raid_disks,
3662 mddev->raid_disks - mddev->delta_disks); 3662 mddev->raid_disks - mddev->delta_disks);
3663 return sprintf(page, "%d\n", mddev->raid_disks); 3663 return sprintf(page, "%d\n", mddev->raid_disks);
3664 } 3664 }
3665 3665
3666 static int update_raid_disks(struct mddev *mddev, int raid_disks); 3666 static int update_raid_disks(struct mddev *mddev, int raid_disks);
3667 3667
3668 static ssize_t 3668 static ssize_t
3669 raid_disks_store(struct mddev *mddev, const char *buf, size_t len) 3669 raid_disks_store(struct mddev *mddev, const char *buf, size_t len)
3670 { 3670 {
3671 char *e; 3671 char *e;
3672 int rv = 0; 3672 int rv = 0;
3673 unsigned long n = simple_strtoul(buf, &e, 10); 3673 unsigned long n = simple_strtoul(buf, &e, 10);
3674 3674
3675 if (!*buf || (*e && *e != '\n')) 3675 if (!*buf || (*e && *e != '\n'))
3676 return -EINVAL; 3676 return -EINVAL;
3677 3677
3678 if (mddev->pers) 3678 if (mddev->pers)
3679 rv = update_raid_disks(mddev, n); 3679 rv = update_raid_disks(mddev, n);
3680 else if (mddev->reshape_position != MaxSector) { 3680 else if (mddev->reshape_position != MaxSector) {
3681 struct md_rdev *rdev; 3681 struct md_rdev *rdev;
3682 int olddisks = mddev->raid_disks - mddev->delta_disks; 3682 int olddisks = mddev->raid_disks - mddev->delta_disks;
3683 3683
3684 rdev_for_each(rdev, mddev) { 3684 rdev_for_each(rdev, mddev) {
3685 if (olddisks < n && 3685 if (olddisks < n &&
3686 rdev->data_offset < rdev->new_data_offset) 3686 rdev->data_offset < rdev->new_data_offset)
3687 return -EINVAL; 3687 return -EINVAL;
3688 if (olddisks > n && 3688 if (olddisks > n &&
3689 rdev->data_offset > rdev->new_data_offset) 3689 rdev->data_offset > rdev->new_data_offset)
3690 return -EINVAL; 3690 return -EINVAL;
3691 } 3691 }
3692 mddev->delta_disks = n - olddisks; 3692 mddev->delta_disks = n - olddisks;
3693 mddev->raid_disks = n; 3693 mddev->raid_disks = n;
3694 mddev->reshape_backwards = (mddev->delta_disks < 0); 3694 mddev->reshape_backwards = (mddev->delta_disks < 0);
3695 } else 3695 } else
3696 mddev->raid_disks = n; 3696 mddev->raid_disks = n;
3697 return rv ? rv : len; 3697 return rv ? rv : len;
3698 } 3698 }
3699 static struct md_sysfs_entry md_raid_disks = 3699 static struct md_sysfs_entry md_raid_disks =
3700 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store); 3700 __ATTR(raid_disks, S_IRUGO|S_IWUSR, raid_disks_show, raid_disks_store);
3701 3701
3702 static ssize_t 3702 static ssize_t
3703 chunk_size_show(struct mddev *mddev, char *page) 3703 chunk_size_show(struct mddev *mddev, char *page)
3704 { 3704 {
3705 if (mddev->reshape_position != MaxSector && 3705 if (mddev->reshape_position != MaxSector &&
3706 mddev->chunk_sectors != mddev->new_chunk_sectors) 3706 mddev->chunk_sectors != mddev->new_chunk_sectors)
3707 return sprintf(page, "%d (%d)\n", 3707 return sprintf(page, "%d (%d)\n",
3708 mddev->new_chunk_sectors << 9, 3708 mddev->new_chunk_sectors << 9,
3709 mddev->chunk_sectors << 9); 3709 mddev->chunk_sectors << 9);
3710 return sprintf(page, "%d\n", mddev->chunk_sectors << 9); 3710 return sprintf(page, "%d\n", mddev->chunk_sectors << 9);
3711 } 3711 }
3712 3712
3713 static ssize_t 3713 static ssize_t
3714 chunk_size_store(struct mddev *mddev, const char *buf, size_t len) 3714 chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
3715 { 3715 {
3716 char *e; 3716 char *e;
3717 unsigned long n = simple_strtoul(buf, &e, 10); 3717 unsigned long n = simple_strtoul(buf, &e, 10);
3718 3718
3719 if (!*buf || (*e && *e != '\n')) 3719 if (!*buf || (*e && *e != '\n'))
3720 return -EINVAL; 3720 return -EINVAL;
3721 3721
3722 if (mddev->pers) { 3722 if (mddev->pers) {
3723 int err; 3723 int err;
3724 if (mddev->pers->check_reshape == NULL) 3724 if (mddev->pers->check_reshape == NULL)
3725 return -EBUSY; 3725 return -EBUSY;
3726 mddev->new_chunk_sectors = n >> 9; 3726 mddev->new_chunk_sectors = n >> 9;
3727 err = mddev->pers->check_reshape(mddev); 3727 err = mddev->pers->check_reshape(mddev);
3728 if (err) { 3728 if (err) {
3729 mddev->new_chunk_sectors = mddev->chunk_sectors; 3729 mddev->new_chunk_sectors = mddev->chunk_sectors;
3730 return err; 3730 return err;
3731 } 3731 }
3732 } else { 3732 } else {
3733 mddev->new_chunk_sectors = n >> 9; 3733 mddev->new_chunk_sectors = n >> 9;
3734 if (mddev->reshape_position == MaxSector) 3734 if (mddev->reshape_position == MaxSector)
3735 mddev->chunk_sectors = n >> 9; 3735 mddev->chunk_sectors = n >> 9;
3736 } 3736 }
3737 return len; 3737 return len;
3738 } 3738 }
3739 static struct md_sysfs_entry md_chunk_size = 3739 static struct md_sysfs_entry md_chunk_size =
3740 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store); 3740 __ATTR(chunk_size, S_IRUGO|S_IWUSR, chunk_size_show, chunk_size_store);
3741 3741
3742 static ssize_t 3742 static ssize_t
3743 resync_start_show(struct mddev *mddev, char *page) 3743 resync_start_show(struct mddev *mddev, char *page)
3744 { 3744 {
3745 if (mddev->recovery_cp == MaxSector) 3745 if (mddev->recovery_cp == MaxSector)
3746 return sprintf(page, "none\n"); 3746 return sprintf(page, "none\n");
3747 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp); 3747 return sprintf(page, "%llu\n", (unsigned long long)mddev->recovery_cp);
3748 } 3748 }
3749 3749
3750 static ssize_t 3750 static ssize_t
3751 resync_start_store(struct mddev *mddev, const char *buf, size_t len) 3751 resync_start_store(struct mddev *mddev, const char *buf, size_t len)
3752 { 3752 {
3753 char *e; 3753 char *e;
3754 unsigned long long n = simple_strtoull(buf, &e, 10); 3754 unsigned long long n = simple_strtoull(buf, &e, 10);
3755 3755
3756 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 3756 if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
3757 return -EBUSY; 3757 return -EBUSY;
3758 if (cmd_match(buf, "none")) 3758 if (cmd_match(buf, "none"))
3759 n = MaxSector; 3759 n = MaxSector;
3760 else if (!*buf || (*e && *e != '\n')) 3760 else if (!*buf || (*e && *e != '\n'))
3761 return -EINVAL; 3761 return -EINVAL;
3762 3762
3763 mddev->recovery_cp = n; 3763 mddev->recovery_cp = n;
3764 if (mddev->pers) 3764 if (mddev->pers)
3765 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3765 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3766 return len; 3766 return len;
3767 } 3767 }
3768 static struct md_sysfs_entry md_resync_start = 3768 static struct md_sysfs_entry md_resync_start =
3769 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); 3769 __ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store);
3770 3770
3771 /* 3771 /*
3772 * The array state can be: 3772 * The array state can be:
3773 * 3773 *
3774 * clear 3774 * clear
3775 * No devices, no size, no level 3775 * No devices, no size, no level
3776 * Equivalent to STOP_ARRAY ioctl 3776 * Equivalent to STOP_ARRAY ioctl
3777 * inactive 3777 * inactive
3778 * May have some settings, but array is not active 3778 * May have some settings, but array is not active
3779 * all IO results in error 3779 * all IO results in error
3780 * When written, doesn't tear down array, but just stops it 3780 * When written, doesn't tear down array, but just stops it
3781 * suspended (not supported yet) 3781 * suspended (not supported yet)
3782 * All IO requests will block. The array can be reconfigured. 3782 * All IO requests will block. The array can be reconfigured.
3783 * Writing this, if accepted, will block until array is quiescent 3783 * Writing this, if accepted, will block until array is quiescent
3784 * readonly 3784 * readonly
3785 * no resync can happen. no superblocks get written. 3785 * no resync can happen. no superblocks get written.
3786 * write requests fail 3786 * write requests fail
3787 * read-auto 3787 * read-auto
3788 * like readonly, but behaves like 'clean' on a write request. 3788 * like readonly, but behaves like 'clean' on a write request.
3789 * 3789 *
3790 * clean - no pending writes, but otherwise active. 3790 * clean - no pending writes, but otherwise active.
3791 * When written to inactive array, starts without resync 3791 * When written to inactive array, starts without resync
3792 * If a write request arrives then 3792 * If a write request arrives then
3793 * if metadata is known, mark 'dirty' and switch to 'active'. 3793 * if metadata is known, mark 'dirty' and switch to 'active'.
3794 * if not known, block and switch to write-pending 3794 * if not known, block and switch to write-pending
3795 * If written to an active array that has pending writes, then fails. 3795 * If written to an active array that has pending writes, then fails.
3796 * active 3796 * active
3797 * fully active: IO and resync can be happening. 3797 * fully active: IO and resync can be happening.
3798 * When written to inactive array, starts with resync 3798 * When written to inactive array, starts with resync
3799 * 3799 *
3800 * write-pending 3800 * write-pending
3801 * clean, but writes are blocked waiting for 'active' to be written. 3801 * clean, but writes are blocked waiting for 'active' to be written.
3802 * 3802 *
3803 * active-idle 3803 * active-idle
3804 * like active, but no writes have been seen for a while (100msec). 3804 * like active, but no writes have been seen for a while (100msec).
3805 * 3805 *
3806 */ 3806 */
3807 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active, 3807 enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
3808 write_pending, active_idle, bad_word}; 3808 write_pending, active_idle, bad_word};
3809 static char *array_states[] = { 3809 static char *array_states[] = {
3810 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active", 3810 "clear", "inactive", "suspended", "readonly", "read-auto", "clean", "active",
3811 "write-pending", "active-idle", NULL }; 3811 "write-pending", "active-idle", NULL };
3812 3812
3813 static int match_word(const char *word, char **list) 3813 static int match_word(const char *word, char **list)
3814 { 3814 {
3815 int n; 3815 int n;
3816 for (n=0; list[n]; n++) 3816 for (n=0; list[n]; n++)
3817 if (cmd_match(word, list[n])) 3817 if (cmd_match(word, list[n]))
3818 break; 3818 break;
3819 return n; 3819 return n;
3820 } 3820 }
3821 3821
3822 static ssize_t 3822 static ssize_t
3823 array_state_show(struct mddev *mddev, char *page) 3823 array_state_show(struct mddev *mddev, char *page)
3824 { 3824 {
3825 enum array_state st = inactive; 3825 enum array_state st = inactive;
3826 3826
3827 if (mddev->pers) 3827 if (mddev->pers)
3828 switch(mddev->ro) { 3828 switch(mddev->ro) {
3829 case 1: 3829 case 1:
3830 st = readonly; 3830 st = readonly;
3831 break; 3831 break;
3832 case 2: 3832 case 2:
3833 st = read_auto; 3833 st = read_auto;
3834 break; 3834 break;
3835 case 0: 3835 case 0:
3836 if (mddev->in_sync) 3836 if (mddev->in_sync)
3837 st = clean; 3837 st = clean;
3838 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 3838 else if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
3839 st = write_pending; 3839 st = write_pending;
3840 else if (mddev->safemode) 3840 else if (mddev->safemode)
3841 st = active_idle; 3841 st = active_idle;
3842 else 3842 else
3843 st = active; 3843 st = active;
3844 } 3844 }
3845 else { 3845 else {
3846 if (list_empty(&mddev->disks) && 3846 if (list_empty(&mddev->disks) &&
3847 mddev->raid_disks == 0 && 3847 mddev->raid_disks == 0 &&
3848 mddev->dev_sectors == 0) 3848 mddev->dev_sectors == 0)
3849 st = clear; 3849 st = clear;
3850 else 3850 else
3851 st = inactive; 3851 st = inactive;
3852 } 3852 }
3853 return sprintf(page, "%s\n", array_states[st]); 3853 return sprintf(page, "%s\n", array_states[st]);
3854 } 3854 }
3855 3855
3856 static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev); 3856 static int do_md_stop(struct mddev * mddev, int ro, struct block_device *bdev);
3857 static int md_set_readonly(struct mddev * mddev, struct block_device *bdev); 3857 static int md_set_readonly(struct mddev * mddev, struct block_device *bdev);
3858 static int do_md_run(struct mddev * mddev); 3858 static int do_md_run(struct mddev * mddev);
3859 static int restart_array(struct mddev *mddev); 3859 static int restart_array(struct mddev *mddev);
3860 3860
3861 static ssize_t 3861 static ssize_t
3862 array_state_store(struct mddev *mddev, const char *buf, size_t len) 3862 array_state_store(struct mddev *mddev, const char *buf, size_t len)
3863 { 3863 {
3864 int err = -EINVAL; 3864 int err = -EINVAL;
3865 enum array_state st = match_word(buf, array_states); 3865 enum array_state st = match_word(buf, array_states);
3866 switch(st) { 3866 switch(st) {
3867 case bad_word: 3867 case bad_word:
3868 break; 3868 break;
3869 case clear: 3869 case clear:
3870 /* stopping an active array */ 3870 /* stopping an active array */
3871 err = do_md_stop(mddev, 0, NULL); 3871 err = do_md_stop(mddev, 0, NULL);
3872 break; 3872 break;
3873 case inactive: 3873 case inactive:
3874 /* stopping an active array */ 3874 /* stopping an active array */
3875 if (mddev->pers) 3875 if (mddev->pers)
3876 err = do_md_stop(mddev, 2, NULL); 3876 err = do_md_stop(mddev, 2, NULL);
3877 else 3877 else
3878 err = 0; /* already inactive */ 3878 err = 0; /* already inactive */
3879 break; 3879 break;
3880 case suspended: 3880 case suspended:
3881 break; /* not supported yet */ 3881 break; /* not supported yet */
3882 case readonly: 3882 case readonly:
3883 if (mddev->pers) 3883 if (mddev->pers)
3884 err = md_set_readonly(mddev, NULL); 3884 err = md_set_readonly(mddev, NULL);
3885 else { 3885 else {
3886 mddev->ro = 1; 3886 mddev->ro = 1;
3887 set_disk_ro(mddev->gendisk, 1); 3887 set_disk_ro(mddev->gendisk, 1);
3888 err = do_md_run(mddev); 3888 err = do_md_run(mddev);
3889 } 3889 }
3890 break; 3890 break;
3891 case read_auto: 3891 case read_auto:
3892 if (mddev->pers) { 3892 if (mddev->pers) {
3893 if (mddev->ro == 0) 3893 if (mddev->ro == 0)
3894 err = md_set_readonly(mddev, NULL); 3894 err = md_set_readonly(mddev, NULL);
3895 else if (mddev->ro == 1) 3895 else if (mddev->ro == 1)
3896 err = restart_array(mddev); 3896 err = restart_array(mddev);
3897 if (err == 0) { 3897 if (err == 0) {
3898 mddev->ro = 2; 3898 mddev->ro = 2;
3899 set_disk_ro(mddev->gendisk, 0); 3899 set_disk_ro(mddev->gendisk, 0);
3900 } 3900 }
3901 } else { 3901 } else {
3902 mddev->ro = 2; 3902 mddev->ro = 2;
3903 err = do_md_run(mddev); 3903 err = do_md_run(mddev);
3904 } 3904 }
3905 break; 3905 break;
3906 case clean: 3906 case clean:
3907 if (mddev->pers) { 3907 if (mddev->pers) {
3908 restart_array(mddev); 3908 restart_array(mddev);
3909 spin_lock_irq(&mddev->write_lock); 3909 spin_lock_irq(&mddev->write_lock);
3910 if (atomic_read(&mddev->writes_pending) == 0) { 3910 if (atomic_read(&mddev->writes_pending) == 0) {
3911 if (mddev->in_sync == 0) { 3911 if (mddev->in_sync == 0) {
3912 mddev->in_sync = 1; 3912 mddev->in_sync = 1;
3913 if (mddev->safemode == 1) 3913 if (mddev->safemode == 1)
3914 mddev->safemode = 0; 3914 mddev->safemode = 0;
3915 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 3915 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
3916 } 3916 }
3917 err = 0; 3917 err = 0;
3918 } else 3918 } else
3919 err = -EBUSY; 3919 err = -EBUSY;
3920 spin_unlock_irq(&mddev->write_lock); 3920 spin_unlock_irq(&mddev->write_lock);
3921 } else 3921 } else
3922 err = -EINVAL; 3922 err = -EINVAL;
3923 break; 3923 break;
3924 case active: 3924 case active:
3925 if (mddev->pers) { 3925 if (mddev->pers) {
3926 restart_array(mddev); 3926 restart_array(mddev);
3927 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 3927 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
3928 wake_up(&mddev->sb_wait); 3928 wake_up(&mddev->sb_wait);
3929 err = 0; 3929 err = 0;
3930 } else { 3930 } else {
3931 mddev->ro = 0; 3931 mddev->ro = 0;
3932 set_disk_ro(mddev->gendisk, 0); 3932 set_disk_ro(mddev->gendisk, 0);
3933 err = do_md_run(mddev); 3933 err = do_md_run(mddev);
3934 } 3934 }
3935 break; 3935 break;
3936 case write_pending: 3936 case write_pending:
3937 case active_idle: 3937 case active_idle:
3938 /* these cannot be set */ 3938 /* these cannot be set */
3939 break; 3939 break;
3940 } 3940 }
3941 if (err) 3941 if (err)
3942 return err; 3942 return err;
3943 else { 3943 else {
3944 if (mddev->hold_active == UNTIL_IOCTL) 3944 if (mddev->hold_active == UNTIL_IOCTL)
3945 mddev->hold_active = 0; 3945 mddev->hold_active = 0;
3946 sysfs_notify_dirent_safe(mddev->sysfs_state); 3946 sysfs_notify_dirent_safe(mddev->sysfs_state);
3947 return len; 3947 return len;
3948 } 3948 }
3949 } 3949 }
3950 static struct md_sysfs_entry md_array_state = 3950 static struct md_sysfs_entry md_array_state =
3951 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3951 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3952 3952
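Editor's aside, not part of this diff: a hedged userspace sketch of driving the array_state file documented above; the md0 device name is an assumption and error handling is minimal.

#include <stdio.h>

int main(void)
{
	char state[64];
	FILE *f = fopen("/sys/block/md0/md/array_state", "r"); /* assumed array */

	if (!f || !fgets(state, sizeof(state), f)) {
		perror("array_state");
		return 1;
	}
	fclose(f);
	printf("current state: %s", state);  /* e.g. "active\n" or "clean\n" */

	/* Writing "clean" only succeeds when no writes are pending,
	 * mirroring the writes_pending check in array_state_store(). */
	f = fopen("/sys/block/md0/md/array_state", "w");
	if (f) {
		fputs("clean", f);
		fclose(f);
	}
	return 0;
}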
3953 static ssize_t 3953 static ssize_t
3954 max_corrected_read_errors_show(struct mddev *mddev, char *page) { 3954 max_corrected_read_errors_show(struct mddev *mddev, char *page) {
3955 return sprintf(page, "%d\n", 3955 return sprintf(page, "%d\n",
3956 atomic_read(&mddev->max_corr_read_errors)); 3956 atomic_read(&mddev->max_corr_read_errors));
3957 } 3957 }
3958 3958
3959 static ssize_t 3959 static ssize_t
3960 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len) 3960 max_corrected_read_errors_store(struct mddev *mddev, const char *buf, size_t len)
3961 { 3961 {
3962 char *e; 3962 char *e;
3963 unsigned long n = simple_strtoul(buf, &e, 10); 3963 unsigned long n = simple_strtoul(buf, &e, 10);
3964 3964
3965 if (*buf && (*e == 0 || *e == '\n')) { 3965 if (*buf && (*e == 0 || *e == '\n')) {
3966 atomic_set(&mddev->max_corr_read_errors, n); 3966 atomic_set(&mddev->max_corr_read_errors, n);
3967 return len; 3967 return len;
3968 } 3968 }
3969 return -EINVAL; 3969 return -EINVAL;
3970 } 3970 }
3971 3971
3972 static struct md_sysfs_entry max_corr_read_errors = 3972 static struct md_sysfs_entry max_corr_read_errors =
3973 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show, 3973 __ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3974 max_corrected_read_errors_store); 3974 max_corrected_read_errors_store);
3975 3975
3976 static ssize_t 3976 static ssize_t
3977 null_show(struct mddev *mddev, char *page) 3977 null_show(struct mddev *mddev, char *page)
3978 { 3978 {
3979 return -EINVAL; 3979 return -EINVAL;
3980 } 3980 }
3981 3981
3982 static ssize_t 3982 static ssize_t
3983 new_dev_store(struct mddev *mddev, const char *buf, size_t len) 3983 new_dev_store(struct mddev *mddev, const char *buf, size_t len)
3984 { 3984 {
3985 /* buf must be %d:%d\n? giving major and minor numbers */ 3985 /* buf must be %d:%d\n? giving major and minor numbers */
3986 /* The new device is added to the array. 3986 /* The new device is added to the array.
3987 * If the array has a persistent superblock, we read the 3987 * If the array has a persistent superblock, we read the
3988 * superblock to initialise info and check validity. 3988 * superblock to initialise info and check validity.
3989 * Otherwise, only checking done is that in bind_rdev_to_array, 3989 * Otherwise, only checking done is that in bind_rdev_to_array,
3990 * which mainly checks size. 3990 * which mainly checks size.
3991 */ 3991 */
3992 char *e; 3992 char *e;
3993 int major = simple_strtoul(buf, &e, 10); 3993 int major = simple_strtoul(buf, &e, 10);
3994 int minor; 3994 int minor;
3995 dev_t dev; 3995 dev_t dev;
3996 struct md_rdev *rdev; 3996 struct md_rdev *rdev;
3997 int err; 3997 int err;
3998 3998
3999 if (!*buf || *e != ':' || !e[1] || e[1] == '\n') 3999 if (!*buf || *e != ':' || !e[1] || e[1] == '\n')
4000 return -EINVAL; 4000 return -EINVAL;
4001 minor = simple_strtoul(e+1, &e, 10); 4001 minor = simple_strtoul(e+1, &e, 10);
4002 if (*e && *e != '\n') 4002 if (*e && *e != '\n')
4003 return -EINVAL; 4003 return -EINVAL;
4004 dev = MKDEV(major, minor); 4004 dev = MKDEV(major, minor);
4005 if (major != MAJOR(dev) || 4005 if (major != MAJOR(dev) ||
4006 minor != MINOR(dev)) 4006 minor != MINOR(dev))
4007 return -EOVERFLOW; 4007 return -EOVERFLOW;
4008 4008
4009 4009
4010 if (mddev->persistent) { 4010 if (mddev->persistent) {
4011 rdev = md_import_device(dev, mddev->major_version, 4011 rdev = md_import_device(dev, mddev->major_version,
4012 mddev->minor_version); 4012 mddev->minor_version);
4013 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) { 4013 if (!IS_ERR(rdev) && !list_empty(&mddev->disks)) {
4014 struct md_rdev *rdev0 4014 struct md_rdev *rdev0
4015 = list_entry(mddev->disks.next, 4015 = list_entry(mddev->disks.next,
4016 struct md_rdev, same_set); 4016 struct md_rdev, same_set);
4017 err = super_types[mddev->major_version] 4017 err = super_types[mddev->major_version]
4018 .load_super(rdev, rdev0, mddev->minor_version); 4018 .load_super(rdev, rdev0, mddev->minor_version);
4019 if (err < 0) 4019 if (err < 0)
4020 goto out; 4020 goto out;
4021 } 4021 }
4022 } else if (mddev->external) 4022 } else if (mddev->external)
4023 rdev = md_import_device(dev, -2, -1); 4023 rdev = md_import_device(dev, -2, -1);
4024 else 4024 else
4025 rdev = md_import_device(dev, -1, -1); 4025 rdev = md_import_device(dev, -1, -1);
4026 4026
4027 if (IS_ERR(rdev)) 4027 if (IS_ERR(rdev))
4028 return PTR_ERR(rdev); 4028 return PTR_ERR(rdev);
4029 err = bind_rdev_to_array(rdev, mddev); 4029 err = bind_rdev_to_array(rdev, mddev);
4030 out: 4030 out:
4031 if (err) 4031 if (err)
4032 export_rdev(rdev); 4032 export_rdev(rdev);
4033 return err ? err : len; 4033 return err ? err : len;
4034 } 4034 }
4035 4035
4036 static struct md_sysfs_entry md_new_device = 4036 static struct md_sysfs_entry md_new_device =
4037 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store); 4037 __ATTR(new_dev, S_IWUSR, null_show, new_dev_store);
4038 4038
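Editor's aside, not part of this diff: new_dev_store() above expects a "major:minor" pair. A minimal userspace sketch that derives the pair from a device node; /dev/sdb1 and md0 are assumptions.

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int main(void)
{
	struct stat st;
	FILE *f;

	if (stat("/dev/sdb1", &st) != 0)               /* assumed component device */
		return 1;

	f = fopen("/sys/block/md0/md/new_dev", "w");   /* assumed array */
	if (!f)
		return 1;
	/* new_dev_store() parses exactly this "major:minor" form */
	fprintf(f, "%u:%u", major(st.st_rdev), minor(st.st_rdev));
	fclose(f);
	return 0;
}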
4039 static ssize_t 4039 static ssize_t
4040 bitmap_store(struct mddev *mddev, const char *buf, size_t len) 4040 bitmap_store(struct mddev *mddev, const char *buf, size_t len)
4041 { 4041 {
4042 char *end; 4042 char *end;
4043 unsigned long chunk, end_chunk; 4043 unsigned long chunk, end_chunk;
4044 4044
4045 if (!mddev->bitmap) 4045 if (!mddev->bitmap)
4046 goto out; 4046 goto out;
4047 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */ 4047 /* buf should be <chunk> <chunk> ... or <chunk>-<chunk> ... (range) */
4048 while (*buf) { 4048 while (*buf) {
4049 chunk = end_chunk = simple_strtoul(buf, &end, 0); 4049 chunk = end_chunk = simple_strtoul(buf, &end, 0);
4050 if (buf == end) break; 4050 if (buf == end) break;
4051 if (*end == '-') { /* range */ 4051 if (*end == '-') { /* range */
4052 buf = end + 1; 4052 buf = end + 1;
4053 end_chunk = simple_strtoul(buf, &end, 0); 4053 end_chunk = simple_strtoul(buf, &end, 0);
4054 if (buf == end) break; 4054 if (buf == end) break;
4055 } 4055 }
4056 if (*end && !isspace(*end)) break; 4056 if (*end && !isspace(*end)) break;
4057 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk); 4057 bitmap_dirty_bits(mddev->bitmap, chunk, end_chunk);
4058 buf = skip_spaces(end); 4058 buf = skip_spaces(end);
4059 } 4059 }
4060 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */ 4060 bitmap_unplug(mddev->bitmap); /* flush the bits to disk */
4061 out: 4061 out:
4062 return len; 4062 return len;
4063 } 4063 }
4064 4064
4065 static struct md_sysfs_entry md_bitmap = 4065 static struct md_sysfs_entry md_bitmap =
4066 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store); 4066 __ATTR(bitmap_set_bits, S_IWUSR, null_show, bitmap_store);
4067 4067
4068 static ssize_t 4068 static ssize_t
4069 size_show(struct mddev *mddev, char *page) 4069 size_show(struct mddev *mddev, char *page)
4070 { 4070 {
4071 return sprintf(page, "%llu\n", 4071 return sprintf(page, "%llu\n",
4072 (unsigned long long)mddev->dev_sectors / 2); 4072 (unsigned long long)mddev->dev_sectors / 2);
4073 } 4073 }
4074 4074
4075 static int update_size(struct mddev *mddev, sector_t num_sectors); 4075 static int update_size(struct mddev *mddev, sector_t num_sectors);
4076 4076
4077 static ssize_t 4077 static ssize_t
4078 size_store(struct mddev *mddev, const char *buf, size_t len) 4078 size_store(struct mddev *mddev, const char *buf, size_t len)
4079 { 4079 {
4080 /* If array is inactive, we can reduce the component size, but 4080 /* If array is inactive, we can reduce the component size, but
4081 * not increase it (except from 0). 4081 * not increase it (except from 0).
4082 * If array is active, we can try an on-line resize 4082 * If array is active, we can try an on-line resize
4083 */ 4083 */
4084 sector_t sectors; 4084 sector_t sectors;
4085 int err = strict_blocks_to_sectors(buf, &sectors); 4085 int err = strict_blocks_to_sectors(buf, &sectors);
4086 4086
4087 if (err < 0) 4087 if (err < 0)
4088 return err; 4088 return err;
4089 if (mddev->pers) { 4089 if (mddev->pers) {
4090 err = update_size(mddev, sectors); 4090 err = update_size(mddev, sectors);
4091 md_update_sb(mddev, 1); 4091 md_update_sb(mddev, 1);
4092 } else { 4092 } else {
4093 if (mddev->dev_sectors == 0 || 4093 if (mddev->dev_sectors == 0 ||
4094 mddev->dev_sectors > sectors) 4094 mddev->dev_sectors > sectors)
4095 mddev->dev_sectors = sectors; 4095 mddev->dev_sectors = sectors;
4096 else 4096 else
4097 err = -ENOSPC; 4097 err = -ENOSPC;
4098 } 4098 }
4099 return err ? err : len; 4099 return err ? err : len;
4100 } 4100 }
4101 4101
4102 static struct md_sysfs_entry md_size = 4102 static struct md_sysfs_entry md_size =
4103 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store); 4103 __ATTR(component_size, S_IRUGO|S_IWUSR, size_show, size_store);
4104 4104
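Editor's aside, not part of this diff: size_show() reports dev_sectors / 2 because the component_size file is expressed in 1 KiB blocks while md counts 512-byte sectors. A worked example with a hypothetical member size:

#include <stdio.h>

int main(void)
{
	unsigned long long dev_sectors = 1953525168ULL; /* hypothetical ~1 TB member */
	unsigned long long kib = dev_sectors / 2;       /* what size_show() prints   */

	printf("%llu\n", kib);                          /* 976762584 KiB             */
	return 0;
}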
4105 4105
4106 /* Metadata version. 4106 /* Metadata version.
4107 * This is one of 4107 * This is one of
4108 * 'none' for arrays with no metadata (good luck...) 4108 * 'none' for arrays with no metadata (good luck...)
4109 * 'external' for arrays with externally managed metadata, 4109 * 'external' for arrays with externally managed metadata,
4110 * or N.M for internally known formats 4110 * or N.M for internally known formats
4111 */ 4111 */
4112 static ssize_t 4112 static ssize_t
4113 metadata_show(struct mddev *mddev, char *page) 4113 metadata_show(struct mddev *mddev, char *page)
4114 { 4114 {
4115 if (mddev->persistent) 4115 if (mddev->persistent)
4116 return sprintf(page, "%d.%d\n", 4116 return sprintf(page, "%d.%d\n",
4117 mddev->major_version, mddev->minor_version); 4117 mddev->major_version, mddev->minor_version);
4118 else if (mddev->external) 4118 else if (mddev->external)
4119 return sprintf(page, "external:%s\n", mddev->metadata_type); 4119 return sprintf(page, "external:%s\n", mddev->metadata_type);
4120 else 4120 else
4121 return sprintf(page, "none\n"); 4121 return sprintf(page, "none\n");
4122 } 4122 }
4123 4123
4124 static ssize_t 4124 static ssize_t
4125 metadata_store(struct mddev *mddev, const char *buf, size_t len) 4125 metadata_store(struct mddev *mddev, const char *buf, size_t len)
4126 { 4126 {
4127 int major, minor; 4127 int major, minor;
4128 char *e; 4128 char *e;
4129 /* Changing the details of 'external' metadata is 4129 /* Changing the details of 'external' metadata is
4130 * always permitted. Otherwise there must be 4130 * always permitted. Otherwise there must be
4131 * no devices attached to the array. 4131 * no devices attached to the array.
4132 */ 4132 */
4133 if (mddev->external && strncmp(buf, "external:", 9) == 0) 4133 if (mddev->external && strncmp(buf, "external:", 9) == 0)
4134 ; 4134 ;
4135 else if (!list_empty(&mddev->disks)) 4135 else if (!list_empty(&mddev->disks))
4136 return -EBUSY; 4136 return -EBUSY;
4137 4137
4138 if (cmd_match(buf, "none")) { 4138 if (cmd_match(buf, "none")) {
4139 mddev->persistent = 0; 4139 mddev->persistent = 0;
4140 mddev->external = 0; 4140 mddev->external = 0;
4141 mddev->major_version = 0; 4141 mddev->major_version = 0;
4142 mddev->minor_version = 90; 4142 mddev->minor_version = 90;
4143 return len; 4143 return len;
4144 } 4144 }
4145 if (strncmp(buf, "external:", 9) == 0) { 4145 if (strncmp(buf, "external:", 9) == 0) {
4146 size_t namelen = len-9; 4146 size_t namelen = len-9;
4147 if (namelen >= sizeof(mddev->metadata_type)) 4147 if (namelen >= sizeof(mddev->metadata_type))
4148 namelen = sizeof(mddev->metadata_type)-1; 4148 namelen = sizeof(mddev->metadata_type)-1;
4149 strncpy(mddev->metadata_type, buf+9, namelen); 4149 strncpy(mddev->metadata_type, buf+9, namelen);
4150 mddev->metadata_type[namelen] = 0; 4150 mddev->metadata_type[namelen] = 0;
4151 if (namelen && mddev->metadata_type[namelen-1] == '\n') 4151 if (namelen && mddev->metadata_type[namelen-1] == '\n')
4152 mddev->metadata_type[--namelen] = 0; 4152 mddev->metadata_type[--namelen] = 0;
4153 mddev->persistent = 0; 4153 mddev->persistent = 0;
4154 mddev->external = 1; 4154 mddev->external = 1;
4155 mddev->major_version = 0; 4155 mddev->major_version = 0;
4156 mddev->minor_version = 90; 4156 mddev->minor_version = 90;
4157 return len; 4157 return len;
4158 } 4158 }
4159 major = simple_strtoul(buf, &e, 10); 4159 major = simple_strtoul(buf, &e, 10);
4160 if (e==buf || *e != '.') 4160 if (e==buf || *e != '.')
4161 return -EINVAL; 4161 return -EINVAL;
4162 buf = e+1; 4162 buf = e+1;
4163 minor = simple_strtoul(buf, &e, 10); 4163 minor = simple_strtoul(buf, &e, 10);
4164 if (e==buf || (*e && *e != '\n') ) 4164 if (e==buf || (*e && *e != '\n') )
4165 return -EINVAL; 4165 return -EINVAL;
4166 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL) 4166 if (major >= ARRAY_SIZE(super_types) || super_types[major].name == NULL)
4167 return -ENOENT; 4167 return -ENOENT;
4168 mddev->major_version = major; 4168 mddev->major_version = major;
4169 mddev->minor_version = minor; 4169 mddev->minor_version = minor;
4170 mddev->persistent = 1; 4170 mddev->persistent = 1;
4171 mddev->external = 0; 4171 mddev->external = 0;
4172 return len; 4172 return len;
4173 } 4173 }
4174 4174
4175 static struct md_sysfs_entry md_metadata = 4175 static struct md_sysfs_entry md_metadata =
4176 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); 4176 __ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store);
4177 4177
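Editor's aside, not part of this diff: examples of strings metadata_store() above accepts, collected as a small table; "imsm" is only an illustrative external-metadata name.

static const char * const metadata_version_examples[] = {
	"none",          /* non-persistent superblock; major/minor left at 0.90 */
	"0.90",          /* internally known format: major 0, minor 90          */
	"1.2",           /* internally known format: major 1, minor 2           */
	"external:imsm", /* externally managed; the text after ':' is copied
	                  * into mddev->metadata_type (illustrative name)       */
};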
4178 static ssize_t 4178 static ssize_t
4179 action_show(struct mddev *mddev, char *page) 4179 action_show(struct mddev *mddev, char *page)
4180 { 4180 {
4181 char *type = "idle"; 4181 char *type = "idle";
4182 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 4182 if (test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
4183 type = "frozen"; 4183 type = "frozen";
4184 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4184 else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4185 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) { 4185 (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))) {
4186 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4186 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4187 type = "reshape"; 4187 type = "reshape";
4188 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 4188 else if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
4189 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 4189 if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
4190 type = "resync"; 4190 type = "resync";
4191 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) 4191 else if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery))
4192 type = "check"; 4192 type = "check";
4193 else 4193 else
4194 type = "repair"; 4194 type = "repair";
4195 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery)) 4195 } else if (test_bit(MD_RECOVERY_RECOVER, &mddev->recovery))
4196 type = "recover"; 4196 type = "recover";
4197 } 4197 }
4198 return sprintf(page, "%s\n", type); 4198 return sprintf(page, "%s\n", type);
4199 } 4199 }
4200 4200
4201 static ssize_t 4201 static ssize_t
4202 action_store(struct mddev *mddev, const char *page, size_t len) 4202 action_store(struct mddev *mddev, const char *page, size_t len)
4203 { 4203 {
4204 if (!mddev->pers || !mddev->pers->sync_request) 4204 if (!mddev->pers || !mddev->pers->sync_request)
4205 return -EINVAL; 4205 return -EINVAL;
4206 4206
4207 if (cmd_match(page, "frozen")) 4207 if (cmd_match(page, "frozen"))
4208 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4208 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4209 else 4209 else
4210 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 4210 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
4211 4211
4212 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) { 4212 if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
4213 if (mddev->sync_thread) { 4213 if (mddev->sync_thread) {
4214 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 4214 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
4215 md_reap_sync_thread(mddev); 4215 md_reap_sync_thread(mddev);
4216 } 4216 }
4217 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) || 4217 } else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
4218 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 4218 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
4219 return -EBUSY; 4219 return -EBUSY;
4220 else if (cmd_match(page, "resync")) 4220 else if (cmd_match(page, "resync"))
4221 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4221 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4222 else if (cmd_match(page, "recover")) { 4222 else if (cmd_match(page, "recover")) {
4223 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 4223 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
4224 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4224 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4225 } else if (cmd_match(page, "reshape")) { 4225 } else if (cmd_match(page, "reshape")) {
4226 int err; 4226 int err;
4227 if (mddev->pers->start_reshape == NULL) 4227 if (mddev->pers->start_reshape == NULL)
4228 return -EINVAL; 4228 return -EINVAL;
4229 err = mddev->pers->start_reshape(mddev); 4229 err = mddev->pers->start_reshape(mddev);
4230 if (err) 4230 if (err)
4231 return err; 4231 return err;
4232 sysfs_notify(&mddev->kobj, NULL, "degraded"); 4232 sysfs_notify(&mddev->kobj, NULL, "degraded");
4233 } else { 4233 } else {
4234 if (cmd_match(page, "check")) 4234 if (cmd_match(page, "check"))
4235 set_bit(MD_RECOVERY_CHECK, &mddev->recovery); 4235 set_bit(MD_RECOVERY_CHECK, &mddev->recovery);
4236 else if (!cmd_match(page, "repair")) 4236 else if (!cmd_match(page, "repair"))
4237 return -EINVAL; 4237 return -EINVAL;
4238 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 4238 set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
4239 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 4239 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
4240 } 4240 }
4241 if (mddev->ro == 2) { 4241 if (mddev->ro == 2) {
4242 /* A write to sync_action is enough to justify 4242 /* A write to sync_action is enough to justify
4243 * canceling read-auto mode 4243 * canceling read-auto mode
4244 */ 4244 */
4245 mddev->ro = 0; 4245 mddev->ro = 0;
4246 md_wakeup_thread(mddev->sync_thread); 4246 md_wakeup_thread(mddev->sync_thread);
4247 } 4247 }
4248 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4248 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4249 md_wakeup_thread(mddev->thread); 4249 md_wakeup_thread(mddev->thread);
4250 sysfs_notify_dirent_safe(mddev->sysfs_action); 4250 sysfs_notify_dirent_safe(mddev->sysfs_action);
4251 return len; 4251 return len;
4252 } 4252 }
4253 4253
4254 static struct md_sysfs_entry md_scan_mode = 4254 static struct md_sysfs_entry md_scan_mode =
4255 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); 4255 __ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store);
4256 4256
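Editor's aside, not part of this diff: a hedged userspace sketch of kicking off a "check" through the sync_action file handled above and reading back mismatch_cnt when it is done; md0 is an assumption.

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/block/md0/md/sync_action", "w"); /* assumed array */
	char cnt[32];

	if (!f)
		return 1;
	fputs("check", f);   /* action_store() sets MD_RECOVERY_CHECK and _SYNC */
	fclose(f);

	/* ... once the check finishes, mismatch_cnt reports what it found */
	f = fopen("/sys/block/md0/md/mismatch_cnt", "r");
	if (f && fgets(cnt, sizeof(cnt), f))
		printf("mismatch_cnt: %s", cnt);
	if (f)
		fclose(f);
	return 0;
}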
4257 static ssize_t 4257 static ssize_t
4258 last_sync_action_show(struct mddev *mddev, char *page) 4258 last_sync_action_show(struct mddev *mddev, char *page)
4259 { 4259 {
4260 return sprintf(page, "%s\n", mddev->last_sync_action); 4260 return sprintf(page, "%s\n", mddev->last_sync_action);
4261 } 4261 }
4262 4262
4263 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action); 4263 static struct md_sysfs_entry md_last_scan_mode = __ATTR_RO(last_sync_action);
4264 4264
4265 static ssize_t 4265 static ssize_t
4266 mismatch_cnt_show(struct mddev *mddev, char *page) 4266 mismatch_cnt_show(struct mddev *mddev, char *page)
4267 { 4267 {
4268 return sprintf(page, "%llu\n", 4268 return sprintf(page, "%llu\n",
4269 (unsigned long long) 4269 (unsigned long long)
4270 atomic64_read(&mddev->resync_mismatches)); 4270 atomic64_read(&mddev->resync_mismatches));
4271 } 4271 }
4272 4272
4273 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt); 4273 static struct md_sysfs_entry md_mismatches = __ATTR_RO(mismatch_cnt);
4274 4274
4275 static ssize_t 4275 static ssize_t
4276 sync_min_show(struct mddev *mddev, char *page) 4276 sync_min_show(struct mddev *mddev, char *page)
4277 { 4277 {
4278 return sprintf(page, "%d (%s)\n", speed_min(mddev), 4278 return sprintf(page, "%d (%s)\n", speed_min(mddev),
4279 mddev->sync_speed_min ? "local": "system"); 4279 mddev->sync_speed_min ? "local": "system");
4280 } 4280 }
4281 4281
4282 static ssize_t 4282 static ssize_t
4283 sync_min_store(struct mddev *mddev, const char *buf, size_t len) 4283 sync_min_store(struct mddev *mddev, const char *buf, size_t len)
4284 { 4284 {
4285 int min; 4285 int min;
4286 char *e; 4286 char *e;
4287 if (strncmp(buf, "system", 6)==0) { 4287 if (strncmp(buf, "system", 6)==0) {
4288 mddev->sync_speed_min = 0; 4288 mddev->sync_speed_min = 0;
4289 return len; 4289 return len;
4290 } 4290 }
4291 min = simple_strtoul(buf, &e, 10); 4291 min = simple_strtoul(buf, &e, 10);
4292 if (buf == e || (*e && *e != '\n') || min <= 0) 4292 if (buf == e || (*e && *e != '\n') || min <= 0)
4293 return -EINVAL; 4293 return -EINVAL;
4294 mddev->sync_speed_min = min; 4294 mddev->sync_speed_min = min;
4295 return len; 4295 return len;
4296 } 4296 }
4297 4297
4298 static struct md_sysfs_entry md_sync_min = 4298 static struct md_sysfs_entry md_sync_min =
4299 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store); 4299 __ATTR(sync_speed_min, S_IRUGO|S_IWUSR, sync_min_show, sync_min_store);
4300 4300
4301 static ssize_t 4301 static ssize_t
4302 sync_max_show(struct mddev *mddev, char *page) 4302 sync_max_show(struct mddev *mddev, char *page)
4303 { 4303 {
4304 return sprintf(page, "%d (%s)\n", speed_max(mddev), 4304 return sprintf(page, "%d (%s)\n", speed_max(mddev),
4305 mddev->sync_speed_max ? "local": "system"); 4305 mddev->sync_speed_max ? "local": "system");
4306 } 4306 }
4307 4307
4308 static ssize_t 4308 static ssize_t
4309 sync_max_store(struct mddev *mddev, const char *buf, size_t len) 4309 sync_max_store(struct mddev *mddev, const char *buf, size_t len)
4310 { 4310 {
4311 int max; 4311 int max;
4312 char *e; 4312 char *e;
4313 if (strncmp(buf, "system", 6)==0) { 4313 if (strncmp(buf, "system", 6)==0) {
4314 mddev->sync_speed_max = 0; 4314 mddev->sync_speed_max = 0;
4315 return len; 4315 return len;
4316 } 4316 }
4317 max = simple_strtoul(buf, &e, 10); 4317 max = simple_strtoul(buf, &e, 10);
4318 if (buf == e || (*e && *e != '\n') || max <= 0) 4318 if (buf == e || (*e && *e != '\n') || max <= 0)
4319 return -EINVAL; 4319 return -EINVAL;
4320 mddev->sync_speed_max = max; 4320 mddev->sync_speed_max = max;
4321 return len; 4321 return len;
4322 } 4322 }
4323 4323
4324 static struct md_sysfs_entry md_sync_max = 4324 static struct md_sysfs_entry md_sync_max =
4325 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store); 4325 __ATTR(sync_speed_max, S_IRUGO|S_IWUSR, sync_max_show, sync_max_store);
4326 4326
4327 static ssize_t 4327 static ssize_t
4328 degraded_show(struct mddev *mddev, char *page) 4328 degraded_show(struct mddev *mddev, char *page)
4329 { 4329 {
4330 return sprintf(page, "%d\n", mddev->degraded); 4330 return sprintf(page, "%d\n", mddev->degraded);
4331 } 4331 }
4332 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded); 4332 static struct md_sysfs_entry md_degraded = __ATTR_RO(degraded);
4333 4333
4334 static ssize_t 4334 static ssize_t
4335 sync_force_parallel_show(struct mddev *mddev, char *page) 4335 sync_force_parallel_show(struct mddev *mddev, char *page)
4336 { 4336 {
4337 return sprintf(page, "%d\n", mddev->parallel_resync); 4337 return sprintf(page, "%d\n", mddev->parallel_resync);
4338 } 4338 }
4339 4339
4340 static ssize_t 4340 static ssize_t
4341 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len) 4341 sync_force_parallel_store(struct mddev *mddev, const char *buf, size_t len)
4342 { 4342 {
4343 long n; 4343 long n;
4344 4344
4345 if (kstrtol(buf, 10, &n)) 4345 if (kstrtol(buf, 10, &n))
4346 return -EINVAL; 4346 return -EINVAL;
4347 4347
4348 if (n != 0 && n != 1) 4348 if (n != 0 && n != 1)
4349 return -EINVAL; 4349 return -EINVAL;
4350 4350
4351 mddev->parallel_resync = n; 4351 mddev->parallel_resync = n;
4352 4352
4353 if (mddev->sync_thread) 4353 if (mddev->sync_thread)
4354 wake_up(&resync_wait); 4354 wake_up(&resync_wait);
4355 4355
4356 return len; 4356 return len;
4357 } 4357 }
4358 4358
4359 /* force parallel resync, even with shared block devices */ 4359 /* force parallel resync, even with shared block devices */
4360 static struct md_sysfs_entry md_sync_force_parallel = 4360 static struct md_sysfs_entry md_sync_force_parallel =
4361 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR, 4361 __ATTR(sync_force_parallel, S_IRUGO|S_IWUSR,
4362 sync_force_parallel_show, sync_force_parallel_store); 4362 sync_force_parallel_show, sync_force_parallel_store);
4363 4363
4364 static ssize_t 4364 static ssize_t
4365 sync_speed_show(struct mddev *mddev, char *page) 4365 sync_speed_show(struct mddev *mddev, char *page)
4366 { 4366 {
4367 unsigned long resync, dt, db; 4367 unsigned long resync, dt, db;
4368 if (mddev->curr_resync == 0) 4368 if (mddev->curr_resync == 0)
4369 return sprintf(page, "none\n"); 4369 return sprintf(page, "none\n");
4370 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active); 4370 resync = mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active);
4371 dt = (jiffies - mddev->resync_mark) / HZ; 4371 dt = (jiffies - mddev->resync_mark) / HZ;
4372 if (!dt) dt++; 4372 if (!dt) dt++;
4373 db = resync - mddev->resync_mark_cnt; 4373 db = resync - mddev->resync_mark_cnt;
4374 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */ 4374 return sprintf(page, "%lu\n", db/dt/2); /* K/sec */
4375 } 4375 }
4376 4376
4377 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed); 4377 static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
4378 4378
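Editor's aside, not part of this diff: sync_speed_show() divides the sectors moved since the last mark by the elapsed seconds and then by 2, so the result is KiB per second. A worked example with made-up numbers:

int main(void)
{
	unsigned long db = 409600; /* sectors moved since resync_mark (hypothetical) */
	unsigned long dt = 10;     /* seconds since resync_mark                      */

	return (db / dt / 2) == 20480 ? 0 : 1; /* 20480 K/sec, roughly 20 MB/s */
}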
4379 static ssize_t 4379 static ssize_t
4380 sync_completed_show(struct mddev *mddev, char *page) 4380 sync_completed_show(struct mddev *mddev, char *page)
4381 { 4381 {
4382 unsigned long long max_sectors, resync; 4382 unsigned long long max_sectors, resync;
4383 4383
4384 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4384 if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4385 return sprintf(page, "none\n"); 4385 return sprintf(page, "none\n");
4386 4386
4387 if (mddev->curr_resync == 1 || 4387 if (mddev->curr_resync == 1 ||
4388 mddev->curr_resync == 2) 4388 mddev->curr_resync == 2)
4389 return sprintf(page, "delayed\n"); 4389 return sprintf(page, "delayed\n");
4390 4390
4391 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 4391 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
4392 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 4392 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
4393 max_sectors = mddev->resync_max_sectors; 4393 max_sectors = mddev->resync_max_sectors;
4394 else 4394 else
4395 max_sectors = mddev->dev_sectors; 4395 max_sectors = mddev->dev_sectors;
4396 4396
4397 resync = mddev->curr_resync_completed; 4397 resync = mddev->curr_resync_completed;
4398 return sprintf(page, "%llu / %llu\n", resync, max_sectors); 4398 return sprintf(page, "%llu / %llu\n", resync, max_sectors);
4399 } 4399 }
4400 4400
4401 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); 4401 static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
4402 4402
4403 static ssize_t 4403 static ssize_t
4404 min_sync_show(struct mddev *mddev, char *page) 4404 min_sync_show(struct mddev *mddev, char *page)
4405 { 4405 {
4406 return sprintf(page, "%llu\n", 4406 return sprintf(page, "%llu\n",
4407 (unsigned long long)mddev->resync_min); 4407 (unsigned long long)mddev->resync_min);
4408 } 4408 }
4409 static ssize_t 4409 static ssize_t
4410 min_sync_store(struct mddev *mddev, const char *buf, size_t len) 4410 min_sync_store(struct mddev *mddev, const char *buf, size_t len)
4411 { 4411 {
4412 unsigned long long min; 4412 unsigned long long min;
4413 if (kstrtoull(buf, 10, &min)) 4413 if (kstrtoull(buf, 10, &min))
4414 return -EINVAL; 4414 return -EINVAL;
4415 if (min > mddev->resync_max) 4415 if (min > mddev->resync_max)
4416 return -EINVAL; 4416 return -EINVAL;
4417 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4417 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4418 return -EBUSY; 4418 return -EBUSY;
4419 4419
4420 /* Must be a multiple of chunk_size */ 4420 /* Must be a multiple of chunk_size */
4421 if (mddev->chunk_sectors) { 4421 if (mddev->chunk_sectors) {
4422 sector_t temp = min; 4422 sector_t temp = min;
4423 if (sector_div(temp, mddev->chunk_sectors)) 4423 if (sector_div(temp, mddev->chunk_sectors))
4424 return -EINVAL; 4424 return -EINVAL;
4425 } 4425 }
4426 mddev->resync_min = min; 4426 mddev->resync_min = min;
4427 4427
4428 return len; 4428 return len;
4429 } 4429 }
4430 4430
4431 static struct md_sysfs_entry md_min_sync = 4431 static struct md_sysfs_entry md_min_sync =
4432 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store); 4432 __ATTR(sync_min, S_IRUGO|S_IWUSR, min_sync_show, min_sync_store);
4433 4433
4434 static ssize_t 4434 static ssize_t
4435 max_sync_show(struct mddev *mddev, char *page) 4435 max_sync_show(struct mddev *mddev, char *page)
4436 { 4436 {
4437 if (mddev->resync_max == MaxSector) 4437 if (mddev->resync_max == MaxSector)
4438 return sprintf(page, "max\n"); 4438 return sprintf(page, "max\n");
4439 else 4439 else
4440 return sprintf(page, "%llu\n", 4440 return sprintf(page, "%llu\n",
4441 (unsigned long long)mddev->resync_max); 4441 (unsigned long long)mddev->resync_max);
4442 } 4442 }
4443 static ssize_t 4443 static ssize_t
4444 max_sync_store(struct mddev *mddev, const char *buf, size_t len) 4444 max_sync_store(struct mddev *mddev, const char *buf, size_t len)
4445 { 4445 {
4446 if (strncmp(buf, "max", 3) == 0) 4446 if (strncmp(buf, "max", 3) == 0)
4447 mddev->resync_max = MaxSector; 4447 mddev->resync_max = MaxSector;
4448 else { 4448 else {
4449 unsigned long long max; 4449 unsigned long long max;
4450 if (kstrtoull(buf, 10, &max)) 4450 if (kstrtoull(buf, 10, &max))
4451 return -EINVAL; 4451 return -EINVAL;
4452 if (max < mddev->resync_min) 4452 if (max < mddev->resync_min)
4453 return -EINVAL; 4453 return -EINVAL;
4454 if (max < mddev->resync_max && 4454 if (max < mddev->resync_max &&
4455 mddev->ro == 0 && 4455 mddev->ro == 0 &&
4456 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 4456 test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
4457 return -EBUSY; 4457 return -EBUSY;
4458 4458
4459 /* Must be a multiple of chunk_size */ 4459 /* Must be a multiple of chunk_size */
4460 if (mddev->chunk_sectors) { 4460 if (mddev->chunk_sectors) {
4461 sector_t temp = max; 4461 sector_t temp = max;
4462 if (sector_div(temp, mddev->chunk_sectors)) 4462 if (sector_div(temp, mddev->chunk_sectors))
4463 return -EINVAL; 4463 return -EINVAL;
4464 } 4464 }
4465 mddev->resync_max = max; 4465 mddev->resync_max = max;
4466 } 4466 }
4467 wake_up(&mddev->recovery_wait); 4467 wake_up(&mddev->recovery_wait);
4468 return len; 4468 return len;
4469 } 4469 }
4470 4470
4471 static struct md_sysfs_entry md_max_sync = 4471 static struct md_sysfs_entry md_max_sync =
4472 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store); 4472 __ATTR(sync_max, S_IRUGO|S_IWUSR, max_sync_show, max_sync_store);
4473 4473
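Editor's aside, not part of this diff: both min_sync_store() and max_sync_store() reject bounds that are not a whole number of chunks; sector_div() leaves exactly the remainder they test. A userspace-style sketch of the same check with hypothetical numbers:

static int valid_resync_bound(unsigned long long sectors, unsigned int chunk_sectors)
{
	if (chunk_sectors == 0)
		return 1;                          /* no chunking, any value is fine  */
	return (sectors % chunk_sectors) == 0;     /* nonzero remainder means -EINVAL */
}

/* valid_resync_bound(1048576, 1024) -> 1; valid_resync_bound(1000, 1024) -> 0 */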
4474 static ssize_t 4474 static ssize_t
4475 suspend_lo_show(struct mddev *mddev, char *page) 4475 suspend_lo_show(struct mddev *mddev, char *page)
4476 { 4476 {
4477 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo); 4477 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_lo);
4478 } 4478 }
4479 4479
4480 static ssize_t 4480 static ssize_t
4481 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len) 4481 suspend_lo_store(struct mddev *mddev, const char *buf, size_t len)
4482 { 4482 {
4483 char *e; 4483 char *e;
4484 unsigned long long new = simple_strtoull(buf, &e, 10); 4484 unsigned long long new = simple_strtoull(buf, &e, 10);
4485 unsigned long long old = mddev->suspend_lo; 4485 unsigned long long old = mddev->suspend_lo;
4486 4486
4487 if (mddev->pers == NULL || 4487 if (mddev->pers == NULL ||
4488 mddev->pers->quiesce == NULL) 4488 mddev->pers->quiesce == NULL)
4489 return -EINVAL; 4489 return -EINVAL;
4490 if (buf == e || (*e && *e != '\n')) 4490 if (buf == e || (*e && *e != '\n'))
4491 return -EINVAL; 4491 return -EINVAL;
4492 4492
4493 mddev->suspend_lo = new; 4493 mddev->suspend_lo = new;
4494 if (new >= old) 4494 if (new >= old)
4495 /* Shrinking suspended region */ 4495 /* Shrinking suspended region */
4496 mddev->pers->quiesce(mddev, 2); 4496 mddev->pers->quiesce(mddev, 2);
4497 else { 4497 else {
4498 /* Expanding suspended region - need to wait */ 4498 /* Expanding suspended region - need to wait */
4499 mddev->pers->quiesce(mddev, 1); 4499 mddev->pers->quiesce(mddev, 1);
4500 mddev->pers->quiesce(mddev, 0); 4500 mddev->pers->quiesce(mddev, 0);
4501 } 4501 }
4502 return len; 4502 return len;
4503 } 4503 }
4504 static struct md_sysfs_entry md_suspend_lo = 4504 static struct md_sysfs_entry md_suspend_lo =
4505 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store); 4505 __ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
4506 4506
4507 4507
4508 static ssize_t 4508 static ssize_t
4509 suspend_hi_show(struct mddev *mddev, char *page) 4509 suspend_hi_show(struct mddev *mddev, char *page)
4510 { 4510 {
4511 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi); 4511 return sprintf(page, "%llu\n", (unsigned long long)mddev->suspend_hi);
4512 } 4512 }
4513 4513
4514 static ssize_t 4514 static ssize_t
4515 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len) 4515 suspend_hi_store(struct mddev *mddev, const char *buf, size_t len)
4516 { 4516 {
4517 char *e; 4517 char *e;
4518 unsigned long long new = simple_strtoull(buf, &e, 10); 4518 unsigned long long new = simple_strtoull(buf, &e, 10);
4519 unsigned long long old = mddev->suspend_hi; 4519 unsigned long long old = mddev->suspend_hi;
4520 4520
4521 if (mddev->pers == NULL || 4521 if (mddev->pers == NULL ||
4522 mddev->pers->quiesce == NULL) 4522 mddev->pers->quiesce == NULL)
4523 return -EINVAL; 4523 return -EINVAL;
4524 if (buf == e || (*e && *e != '\n')) 4524 if (buf == e || (*e && *e != '\n'))
4525 return -EINVAL; 4525 return -EINVAL;
4526 4526
4527 mddev->suspend_hi = new; 4527 mddev->suspend_hi = new;
4528 if (new <= old) 4528 if (new <= old)
4529 /* Shrinking suspended region */ 4529 /* Shrinking suspended region */
4530 mddev->pers->quiesce(mddev, 2); 4530 mddev->pers->quiesce(mddev, 2);
4531 else { 4531 else {
4532 /* Expanding suspended region - need to wait */ 4532 /* Expanding suspended region - need to wait */
4533 mddev->pers->quiesce(mddev, 1); 4533 mddev->pers->quiesce(mddev, 1);
4534 mddev->pers->quiesce(mddev, 0); 4534 mddev->pers->quiesce(mddev, 0);
4535 } 4535 }
4536 return len; 4536 return len;
4537 } 4537 }
4538 static struct md_sysfs_entry md_suspend_hi = 4538 static struct md_sysfs_entry md_suspend_hi =
4539 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store); 4539 __ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
4540 4540
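Editor's aside, not part of this diff: suspend_lo and suspend_hi above delimit a sector range [suspend_lo, suspend_hi) in which I/O is held off; expanding the range makes the store path wait for the personality to quiesce. A hedged userspace sketch, assuming an array named md0:

#include <stdio.h>

/* suspend I/O to sectors [0, 16384) of the assumed array, then lift it */
static int write_val(const char *path, unsigned long long v)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	fprintf(f, "%llu", v);
	return fclose(f);
}

int main(void)
{
	write_val("/sys/block/md0/md/suspend_lo", 0);
	write_val("/sys/block/md0/md/suspend_hi", 16384); /* expanding: store waits for quiesce */
	/* ... do whatever maintenance needed the quiet region ... */
	write_val("/sys/block/md0/md/suspend_hi", 0);     /* shrinking: I/O resumes */
	return 0;
}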
4541 static ssize_t 4541 static ssize_t
4542 reshape_position_show(struct mddev *mddev, char *page) 4542 reshape_position_show(struct mddev *mddev, char *page)
4543 { 4543 {
4544 if (mddev->reshape_position != MaxSector) 4544 if (mddev->reshape_position != MaxSector)
4545 return sprintf(page, "%llu\n", 4545 return sprintf(page, "%llu\n",
4546 (unsigned long long)mddev->reshape_position); 4546 (unsigned long long)mddev->reshape_position);
4547 strcpy(page, "none\n"); 4547 strcpy(page, "none\n");
4548 return 5; 4548 return 5;
4549 } 4549 }
4550 4550
4551 static ssize_t 4551 static ssize_t
4552 reshape_position_store(struct mddev *mddev, const char *buf, size_t len) 4552 reshape_position_store(struct mddev *mddev, const char *buf, size_t len)
4553 { 4553 {
4554 struct md_rdev *rdev; 4554 struct md_rdev *rdev;
4555 char *e; 4555 char *e;
4556 unsigned long long new = simple_strtoull(buf, &e, 10); 4556 unsigned long long new = simple_strtoull(buf, &e, 10);
4557 if (mddev->pers) 4557 if (mddev->pers)
4558 return -EBUSY; 4558 return -EBUSY;
4559 if (buf == e || (*e && *e != '\n')) 4559 if (buf == e || (*e && *e != '\n'))
4560 return -EINVAL; 4560 return -EINVAL;
4561 mddev->reshape_position = new; 4561 mddev->reshape_position = new;
4562 mddev->delta_disks = 0; 4562 mddev->delta_disks = 0;
4563 mddev->reshape_backwards = 0; 4563 mddev->reshape_backwards = 0;
4564 mddev->new_level = mddev->level; 4564 mddev->new_level = mddev->level;
4565 mddev->new_layout = mddev->layout; 4565 mddev->new_layout = mddev->layout;
4566 mddev->new_chunk_sectors = mddev->chunk_sectors; 4566 mddev->new_chunk_sectors = mddev->chunk_sectors;
4567 rdev_for_each(rdev, mddev) 4567 rdev_for_each(rdev, mddev)
4568 rdev->new_data_offset = rdev->data_offset; 4568 rdev->new_data_offset = rdev->data_offset;
4569 return len; 4569 return len;
4570 } 4570 }
4571 4571
4572 static struct md_sysfs_entry md_reshape_position = 4572 static struct md_sysfs_entry md_reshape_position =
4573 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show, 4573 __ATTR(reshape_position, S_IRUGO|S_IWUSR, reshape_position_show,
4574 reshape_position_store); 4574 reshape_position_store);
4575 4575
4576 static ssize_t 4576 static ssize_t
4577 reshape_direction_show(struct mddev *mddev, char *page) 4577 reshape_direction_show(struct mddev *mddev, char *page)
4578 { 4578 {
4579 return sprintf(page, "%s\n", 4579 return sprintf(page, "%s\n",
4580 mddev->reshape_backwards ? "backwards" : "forwards"); 4580 mddev->reshape_backwards ? "backwards" : "forwards");
4581 } 4581 }
4582 4582
4583 static ssize_t 4583 static ssize_t
4584 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len) 4584 reshape_direction_store(struct mddev *mddev, const char *buf, size_t len)
4585 { 4585 {
4586 int backwards = 0; 4586 int backwards = 0;
4587 if (cmd_match(buf, "forwards")) 4587 if (cmd_match(buf, "forwards"))
4588 backwards = 0; 4588 backwards = 0;
4589 else if (cmd_match(buf, "backwards")) 4589 else if (cmd_match(buf, "backwards"))
4590 backwards = 1; 4590 backwards = 1;
4591 else 4591 else
4592 return -EINVAL; 4592 return -EINVAL;
4593 if (mddev->reshape_backwards == backwards) 4593 if (mddev->reshape_backwards == backwards)
4594 return len; 4594 return len;
4595 4595
4596 /* check if we are allowed to change */ 4596 /* check if we are allowed to change */
4597 if (mddev->delta_disks) 4597 if (mddev->delta_disks)
4598 return -EBUSY; 4598 return -EBUSY;
4599 4599
4600 if (mddev->persistent && 4600 if (mddev->persistent &&
4601 mddev->major_version == 0) 4601 mddev->major_version == 0)
4602 return -EINVAL; 4602 return -EINVAL;
4603 4603
4604 mddev->reshape_backwards = backwards; 4604 mddev->reshape_backwards = backwards;
4605 return len; 4605 return len;
4606 } 4606 }
4607 4607
4608 static struct md_sysfs_entry md_reshape_direction = 4608 static struct md_sysfs_entry md_reshape_direction =
4609 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show, 4609 __ATTR(reshape_direction, S_IRUGO|S_IWUSR, reshape_direction_show,
4610 reshape_direction_store); 4610 reshape_direction_store);
4611 4611
4612 static ssize_t 4612 static ssize_t
4613 array_size_show(struct mddev *mddev, char *page) 4613 array_size_show(struct mddev *mddev, char *page)
4614 { 4614 {
4615 if (mddev->external_size) 4615 if (mddev->external_size)
4616 return sprintf(page, "%llu\n", 4616 return sprintf(page, "%llu\n",
4617 (unsigned long long)mddev->array_sectors/2); 4617 (unsigned long long)mddev->array_sectors/2);
4618 else 4618 else
4619 return sprintf(page, "default\n"); 4619 return sprintf(page, "default\n");
4620 } 4620 }
4621 4621
4622 static ssize_t 4622 static ssize_t
4623 array_size_store(struct mddev *mddev, const char *buf, size_t len) 4623 array_size_store(struct mddev *mddev, const char *buf, size_t len)
4624 { 4624 {
4625 sector_t sectors; 4625 sector_t sectors;
4626 4626
4627 if (strncmp(buf, "default", 7) == 0) { 4627 if (strncmp(buf, "default", 7) == 0) {
4628 if (mddev->pers) 4628 if (mddev->pers)
4629 sectors = mddev->pers->size(mddev, 0, 0); 4629 sectors = mddev->pers->size(mddev, 0, 0);
4630 else 4630 else
4631 sectors = mddev->array_sectors; 4631 sectors = mddev->array_sectors;
4632 4632
4633 mddev->external_size = 0; 4633 mddev->external_size = 0;
4634 } else { 4634 } else {
4635 if (strict_blocks_to_sectors(buf, &sectors) < 0) 4635 if (strict_blocks_to_sectors(buf, &sectors) < 0)
4636 return -EINVAL; 4636 return -EINVAL;
4637 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors) 4637 if (mddev->pers && mddev->pers->size(mddev, 0, 0) < sectors)
4638 return -E2BIG; 4638 return -E2BIG;
4639 4639
4640 mddev->external_size = 1; 4640 mddev->external_size = 1;
4641 } 4641 }
4642 4642
4643 mddev->array_sectors = sectors; 4643 mddev->array_sectors = sectors;
4644 if (mddev->pers) { 4644 if (mddev->pers) {
4645 set_capacity(mddev->gendisk, mddev->array_sectors); 4645 set_capacity(mddev->gendisk, mddev->array_sectors);
4646 revalidate_disk(mddev->gendisk); 4646 revalidate_disk(mddev->gendisk);
4647 } 4647 }
4648 return len; 4648 return len;
4649 } 4649 }
4650 4650
4651 static struct md_sysfs_entry md_array_size = 4651 static struct md_sysfs_entry md_array_size =
4652 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show, 4652 __ATTR(array_size, S_IRUGO|S_IWUSR, array_size_show,
4653 array_size_store); 4653 array_size_store);
4654 4654
4655 static struct attribute *md_default_attrs[] = { 4655 static struct attribute *md_default_attrs[] = {
4656 &md_level.attr, 4656 &md_level.attr,
4657 &md_layout.attr, 4657 &md_layout.attr,
4658 &md_raid_disks.attr, 4658 &md_raid_disks.attr,
4659 &md_chunk_size.attr, 4659 &md_chunk_size.attr,
4660 &md_size.attr, 4660 &md_size.attr,
4661 &md_resync_start.attr, 4661 &md_resync_start.attr,
4662 &md_metadata.attr, 4662 &md_metadata.attr,
4663 &md_new_device.attr, 4663 &md_new_device.attr,
4664 &md_safe_delay.attr, 4664 &md_safe_delay.attr,
4665 &md_array_state.attr, 4665 &md_array_state.attr,
4666 &md_reshape_position.attr, 4666 &md_reshape_position.attr,
4667 &md_reshape_direction.attr, 4667 &md_reshape_direction.attr,
4668 &md_array_size.attr, 4668 &md_array_size.attr,
4669 &max_corr_read_errors.attr, 4669 &max_corr_read_errors.attr,
4670 NULL, 4670 NULL,
4671 }; 4671 };
4672 4672
4673 static struct attribute *md_redundancy_attrs[] = { 4673 static struct attribute *md_redundancy_attrs[] = {
4674 &md_scan_mode.attr, 4674 &md_scan_mode.attr,
4675 &md_last_scan_mode.attr, 4675 &md_last_scan_mode.attr,
4676 &md_mismatches.attr, 4676 &md_mismatches.attr,
4677 &md_sync_min.attr, 4677 &md_sync_min.attr,
4678 &md_sync_max.attr, 4678 &md_sync_max.attr,
4679 &md_sync_speed.attr, 4679 &md_sync_speed.attr,
4680 &md_sync_force_parallel.attr, 4680 &md_sync_force_parallel.attr,
4681 &md_sync_completed.attr, 4681 &md_sync_completed.attr,
4682 &md_min_sync.attr, 4682 &md_min_sync.attr,
4683 &md_max_sync.attr, 4683 &md_max_sync.attr,
4684 &md_suspend_lo.attr, 4684 &md_suspend_lo.attr,
4685 &md_suspend_hi.attr, 4685 &md_suspend_hi.attr,
4686 &md_bitmap.attr, 4686 &md_bitmap.attr,
4687 &md_degraded.attr, 4687 &md_degraded.attr,
4688 NULL, 4688 NULL,
4689 }; 4689 };
4690 static struct attribute_group md_redundancy_group = { 4690 static struct attribute_group md_redundancy_group = {
4691 .name = NULL, 4691 .name = NULL,
4692 .attrs = md_redundancy_attrs, 4692 .attrs = md_redundancy_attrs,
4693 }; 4693 };
4694 4694
4695 4695
4696 static ssize_t 4696 static ssize_t
4697 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4697 md_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4698 { 4698 {
4699 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4699 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4700 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4700 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4701 ssize_t rv; 4701 ssize_t rv;
4702 4702
4703 if (!entry->show) 4703 if (!entry->show)
4704 return -EIO; 4704 return -EIO;
4705 spin_lock(&all_mddevs_lock); 4705 spin_lock(&all_mddevs_lock);
4706 if (list_empty(&mddev->all_mddevs)) { 4706 if (list_empty(&mddev->all_mddevs)) {
4707 spin_unlock(&all_mddevs_lock); 4707 spin_unlock(&all_mddevs_lock);
4708 return -EBUSY; 4708 return -EBUSY;
4709 } 4709 }
4710 mddev_get(mddev); 4710 mddev_get(mddev);
4711 spin_unlock(&all_mddevs_lock); 4711 spin_unlock(&all_mddevs_lock);
4712 4712
4713 rv = mddev_lock(mddev); 4713 rv = mddev_lock(mddev);
4714 if (!rv) { 4714 if (!rv) {
4715 rv = entry->show(mddev, page); 4715 rv = entry->show(mddev, page);
4716 mddev_unlock(mddev); 4716 mddev_unlock(mddev);
4717 } 4717 }
4718 mddev_put(mddev); 4718 mddev_put(mddev);
4719 return rv; 4719 return rv;
4720 } 4720 }
4721 4721
4722 static ssize_t 4722 static ssize_t
4723 md_attr_store(struct kobject *kobj, struct attribute *attr, 4723 md_attr_store(struct kobject *kobj, struct attribute *attr,
4724 const char *page, size_t length) 4724 const char *page, size_t length)
4725 { 4725 {
4726 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr); 4726 struct md_sysfs_entry *entry = container_of(attr, struct md_sysfs_entry, attr);
4727 struct mddev *mddev = container_of(kobj, struct mddev, kobj); 4727 struct mddev *mddev = container_of(kobj, struct mddev, kobj);
4728 ssize_t rv; 4728 ssize_t rv;
4729 4729
4730 if (!entry->store) 4730 if (!entry->store)
4731 return -EIO; 4731 return -EIO;
4732 if (!capable(CAP_SYS_ADMIN)) 4732 if (!capable(CAP_SYS_ADMIN))
4733 return -EACCES; 4733 return -EACCES;
4734 spin_lock(&all_mddevs_lock); 4734 spin_lock(&all_mddevs_lock);
4735 if (list_empty(&mddev->all_mddevs)) { 4735 if (list_empty(&mddev->all_mddevs)) {
4736 spin_unlock(&all_mddevs_lock); 4736 spin_unlock(&all_mddevs_lock);
4737 return -EBUSY; 4737 return -EBUSY;
4738 } 4738 }
4739 mddev_get(mddev); 4739 mddev_get(mddev);
4740 spin_unlock(&all_mddevs_lock); 4740 spin_unlock(&all_mddevs_lock);
4741 if (entry->store == new_dev_store) 4741 if (entry->store == new_dev_store)
4742 flush_workqueue(md_misc_wq); 4742 flush_workqueue(md_misc_wq);
4743 rv = mddev_lock(mddev); 4743 rv = mddev_lock(mddev);
4744 if (!rv) { 4744 if (!rv) {
4745 rv = entry->store(mddev, page, length); 4745 rv = entry->store(mddev, page, length);
4746 mddev_unlock(mddev); 4746 mddev_unlock(mddev);
4747 } 4747 }
4748 mddev_put(mddev); 4748 mddev_put(mddev);
4749 return rv; 4749 return rv;
4750 } 4750 }
4751 4751
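/*
 * kobject release method for the mddev: drop the cached array_state
 * sysfs dirent, unregister and release the gendisk, tear down the
 * request queue and finally free the mddev itself.
 */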
4752 static void md_free(struct kobject *ko) 4752 static void md_free(struct kobject *ko)
4753 { 4753 {
4754 struct mddev *mddev = container_of(ko, struct mddev, kobj); 4754 struct mddev *mddev = container_of(ko, struct mddev, kobj);
4755 4755
4756 if (mddev->sysfs_state) 4756 if (mddev->sysfs_state)
4757 sysfs_put(mddev->sysfs_state); 4757 sysfs_put(mddev->sysfs_state);
4758 4758
4759 if (mddev->gendisk) { 4759 if (mddev->gendisk) {
4760 del_gendisk(mddev->gendisk); 4760 del_gendisk(mddev->gendisk);
4761 put_disk(mddev->gendisk); 4761 put_disk(mddev->gendisk);
4762 } 4762 }
4763 if (mddev->queue) 4763 if (mddev->queue)
4764 blk_cleanup_queue(mddev->queue); 4764 blk_cleanup_queue(mddev->queue);
4765 4765
4766 kfree(mddev); 4766 kfree(mddev);
4767 } 4767 }
4768 4768
4769 static const struct sysfs_ops md_sysfs_ops = { 4769 static const struct sysfs_ops md_sysfs_ops = {
4770 .show = md_attr_show, 4770 .show = md_attr_show,
4771 .store = md_attr_store, 4771 .store = md_attr_store,
4772 }; 4772 };
4773 static struct kobj_type md_ktype = { 4773 static struct kobj_type md_ktype = {
4774 .release = md_free, 4774 .release = md_free,
4775 .sysfs_ops = &md_sysfs_ops, 4775 .sysfs_ops = &md_sysfs_ops,
4776 .default_attrs = md_default_attrs, 4776 .default_attrs = md_default_attrs,
4777 }; 4777 };
4778 4778
4779 int mdp_major = 0; 4779 int mdp_major = 0;
4780 4780
4781 static void mddev_delayed_delete(struct work_struct *ws) 4781 static void mddev_delayed_delete(struct work_struct *ws)
4782 { 4782 {
4783 struct mddev *mddev = container_of(ws, struct mddev, del_work); 4783 struct mddev *mddev = container_of(ws, struct mddev, del_work);
4784 4784
4785 sysfs_remove_group(&mddev->kobj, &md_bitmap_group); 4785 sysfs_remove_group(&mddev->kobj, &md_bitmap_group);
4786 kobject_del(&mddev->kobj); 4786 kobject_del(&mddev->kobj);
4787 kobject_put(&mddev->kobj); 4787 kobject_put(&mddev->kobj);
4788 } 4788 }
4789 4789
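/*
 * Allocate the block-layer objects for one md device: the request
 * queue, the gendisk (named "mdN"/"md_dN" from the unit number, or
 * from the caller-supplied name) and the "md" kobject that carries the
 * sysfs attributes.  Used both for dev_t-based arrays (md_probe) and
 * for named "md_*" arrays (add_named_array).
 */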
4790 static int md_alloc(dev_t dev, char *name) 4790 static int md_alloc(dev_t dev, char *name)
4791 { 4791 {
4792 static DEFINE_MUTEX(disks_mutex); 4792 static DEFINE_MUTEX(disks_mutex);
4793 struct mddev *mddev = mddev_find(dev); 4793 struct mddev *mddev = mddev_find(dev);
4794 struct gendisk *disk; 4794 struct gendisk *disk;
4795 int partitioned; 4795 int partitioned;
4796 int shift; 4796 int shift;
4797 int unit; 4797 int unit;
4798 int error; 4798 int error;
4799 4799
4800 if (!mddev) 4800 if (!mddev)
4801 return -ENODEV; 4801 return -ENODEV;
4802 4802
4803 partitioned = (MAJOR(mddev->unit) != MD_MAJOR); 4803 partitioned = (MAJOR(mddev->unit) != MD_MAJOR);
4804 shift = partitioned ? MdpMinorShift : 0; 4804 shift = partitioned ? MdpMinorShift : 0;
4805 unit = MINOR(mddev->unit) >> shift; 4805 unit = MINOR(mddev->unit) >> shift;
4806 4806
4807 /* wait for any previous instance of this device to be 4807 /* wait for any previous instance of this device to be
4808 * completely removed (mddev_delayed_delete). 4808 * completely removed (mddev_delayed_delete).
4809 */ 4809 */
4810 flush_workqueue(md_misc_wq); 4810 flush_workqueue(md_misc_wq);
4811 4811
4812 mutex_lock(&disks_mutex); 4812 mutex_lock(&disks_mutex);
4813 error = -EEXIST; 4813 error = -EEXIST;
4814 if (mddev->gendisk) 4814 if (mddev->gendisk)
4815 goto abort; 4815 goto abort;
4816 4816
4817 if (name) { 4817 if (name) {
4818 /* Need to ensure that 'name' is not a duplicate. 4818 /* Need to ensure that 'name' is not a duplicate.
4819 */ 4819 */
4820 struct mddev *mddev2; 4820 struct mddev *mddev2;
4821 spin_lock(&all_mddevs_lock); 4821 spin_lock(&all_mddevs_lock);
4822 4822
4823 list_for_each_entry(mddev2, &all_mddevs, all_mddevs) 4823 list_for_each_entry(mddev2, &all_mddevs, all_mddevs)
4824 if (mddev2->gendisk && 4824 if (mddev2->gendisk &&
4825 strcmp(mddev2->gendisk->disk_name, name) == 0) { 4825 strcmp(mddev2->gendisk->disk_name, name) == 0) {
4826 spin_unlock(&all_mddevs_lock); 4826 spin_unlock(&all_mddevs_lock);
4827 goto abort; 4827 goto abort;
4828 } 4828 }
4829 spin_unlock(&all_mddevs_lock); 4829 spin_unlock(&all_mddevs_lock);
4830 } 4830 }
4831 4831
4832 error = -ENOMEM; 4832 error = -ENOMEM;
4833 mddev->queue = blk_alloc_queue(GFP_KERNEL); 4833 mddev->queue = blk_alloc_queue(GFP_KERNEL);
4834 if (!mddev->queue) 4834 if (!mddev->queue)
4835 goto abort; 4835 goto abort;
4836 mddev->queue->queuedata = mddev; 4836 mddev->queue->queuedata = mddev;
4837 4837
4838 blk_queue_make_request(mddev->queue, md_make_request); 4838 blk_queue_make_request(mddev->queue, md_make_request);
4839 blk_set_stacking_limits(&mddev->queue->limits); 4839 blk_set_stacking_limits(&mddev->queue->limits);
4840 4840
4841 disk = alloc_disk(1 << shift); 4841 disk = alloc_disk(1 << shift);
4842 if (!disk) { 4842 if (!disk) {
4843 blk_cleanup_queue(mddev->queue); 4843 blk_cleanup_queue(mddev->queue);
4844 mddev->queue = NULL; 4844 mddev->queue = NULL;
4845 goto abort; 4845 goto abort;
4846 } 4846 }
4847 disk->major = MAJOR(mddev->unit); 4847 disk->major = MAJOR(mddev->unit);
4848 disk->first_minor = unit << shift; 4848 disk->first_minor = unit << shift;
4849 if (name) 4849 if (name)
4850 strcpy(disk->disk_name, name); 4850 strcpy(disk->disk_name, name);
4851 else if (partitioned) 4851 else if (partitioned)
4852 sprintf(disk->disk_name, "md_d%d", unit); 4852 sprintf(disk->disk_name, "md_d%d", unit);
4853 else 4853 else
4854 sprintf(disk->disk_name, "md%d", unit); 4854 sprintf(disk->disk_name, "md%d", unit);
4855 disk->fops = &md_fops; 4855 disk->fops = &md_fops;
4856 disk->private_data = mddev; 4856 disk->private_data = mddev;
4857 disk->queue = mddev->queue; 4857 disk->queue = mddev->queue;
4858 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA); 4858 blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
4859 /* Allow extended partitions. This makes the 4859 /* Allow extended partitions. This makes the
4860 * 'mdp' device redundant, but we can't really 4860 * 'mdp' device redundant, but we can't really
4861 * remove it now. 4861 * remove it now.
4862 */ 4862 */
4863 disk->flags |= GENHD_FL_EXT_DEVT; 4863 disk->flags |= GENHD_FL_EXT_DEVT;
4864 mddev->gendisk = disk; 4864 mddev->gendisk = disk;
4865 /* As soon as we call add_disk(), another thread could get 4865 /* As soon as we call add_disk(), another thread could get
4866 * through to md_open, so make sure it doesn't get too far 4866 * through to md_open, so make sure it doesn't get too far
4867 */ 4867 */
4868 mutex_lock(&mddev->open_mutex); 4868 mutex_lock(&mddev->open_mutex);
4869 add_disk(disk); 4869 add_disk(disk);
4870 4870
4871 error = kobject_init_and_add(&mddev->kobj, &md_ktype, 4871 error = kobject_init_and_add(&mddev->kobj, &md_ktype,
4872 &disk_to_dev(disk)->kobj, "%s", "md"); 4872 &disk_to_dev(disk)->kobj, "%s", "md");
4873 if (error) { 4873 if (error) {
4874 /* This isn't possible, but as kobject_init_and_add is marked 4874 /* This isn't possible, but as kobject_init_and_add is marked
4875 * __must_check, we must do something with the result 4875 * __must_check, we must do something with the result
4876 */ 4876 */
4877 printk(KERN_WARNING "md: cannot register %s/md - name in use\n", 4877 printk(KERN_WARNING "md: cannot register %s/md - name in use\n",
4878 disk->disk_name); 4878 disk->disk_name);
4879 error = 0; 4879 error = 0;
4880 } 4880 }
4881 if (mddev->kobj.sd && 4881 if (mddev->kobj.sd &&
4882 sysfs_create_group(&mddev->kobj, &md_bitmap_group)) 4882 sysfs_create_group(&mddev->kobj, &md_bitmap_group))
4883 printk(KERN_DEBUG "pointless warning\n"); 4883 printk(KERN_DEBUG "pointless warning\n");
4884 mutex_unlock(&mddev->open_mutex); 4884 mutex_unlock(&mddev->open_mutex);
4885 abort: 4885 abort:
4886 mutex_unlock(&disks_mutex); 4886 mutex_unlock(&disks_mutex);
4887 if (!error && mddev->kobj.sd) { 4887 if (!error && mddev->kobj.sd) {
4888 kobject_uevent(&mddev->kobj, KOBJ_ADD); 4888 kobject_uevent(&mddev->kobj, KOBJ_ADD);
4889 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state"); 4889 mddev->sysfs_state = sysfs_get_dirent_safe(mddev->kobj.sd, "array_state");
4890 } 4890 }
4891 mddev_put(mddev); 4891 mddev_put(mddev);
4892 return error; 4892 return error;
4893 } 4893 }
4894 4894
4895 static struct kobject *md_probe(dev_t dev, int *part, void *data) 4895 static struct kobject *md_probe(dev_t dev, int *part, void *data)
4896 { 4896 {
4897 md_alloc(dev, NULL); 4897 md_alloc(dev, NULL);
4898 return NULL; 4898 return NULL;
4899 } 4899 }
4900 4900
4901 static int add_named_array(const char *val, struct kernel_param *kp) 4901 static int add_named_array(const char *val, struct kernel_param *kp)
4902 { 4902 {
4903 /* val must be "md_*" where * is not all digits. 4903 /* val must be "md_*" where * is not all digits.
4904 * We allocate an array with a large free minor number, and 4904 * We allocate an array with a large free minor number, and
4905 * set the name to val. val must not already be an active name. 4905 * set the name to val. val must not already be an active name.
4906 */ 4906 */
4907 int len = strlen(val); 4907 int len = strlen(val);
4908 char buf[DISK_NAME_LEN]; 4908 char buf[DISK_NAME_LEN];
4909 4909
4910 while (len && val[len-1] == '\n') 4910 while (len && val[len-1] == '\n')
4911 len--; 4911 len--;
4912 if (len >= DISK_NAME_LEN) 4912 if (len >= DISK_NAME_LEN)
4913 return -E2BIG; 4913 return -E2BIG;
4914 strlcpy(buf, val, len+1); 4914 strlcpy(buf, val, len+1);
4915 if (strncmp(buf, "md_", 3) != 0) 4915 if (strncmp(buf, "md_", 3) != 0)
4916 return -EINVAL; 4916 return -EINVAL;
4917 return md_alloc(0, buf); 4917 return md_alloc(0, buf);
4918 } 4918 }
4919 4919
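/*
 * Safemode timer callback: if no writes are pending when the delay
 * expires, enter safemode so the array can be marked clean; externally
 * managed metadata is notified through the array_state sysfs file.
 * Always wake the main thread so it can act on the new state.
 */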
4920 static void md_safemode_timeout(unsigned long data) 4920 static void md_safemode_timeout(unsigned long data)
4921 { 4921 {
4922 struct mddev *mddev = (struct mddev *) data; 4922 struct mddev *mddev = (struct mddev *) data;
4923 4923
4924 if (!atomic_read(&mddev->writes_pending)) { 4924 if (!atomic_read(&mddev->writes_pending)) {
4925 mddev->safemode = 1; 4925 mddev->safemode = 1;
4926 if (mddev->external) 4926 if (mddev->external)
4927 sysfs_notify_dirent_safe(mddev->sysfs_state); 4927 sysfs_notify_dirent_safe(mddev->sysfs_state);
4928 } 4928 }
4929 md_wakeup_thread(mddev->thread); 4929 md_wakeup_thread(mddev->thread);
4930 } 4930 }
4931 4931
4932 static int start_dirty_degraded; 4932 static int start_dirty_degraded;
4933 4933
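/*
 * Bring an assembled array to life: analyze member superblocks if
 * needed, load and bind the personality module, check that data and
 * metadata regions do not overlap, create the bitmap when one is
 * configured, and (for personalities that resync) register the extra
 * redundancy sysfs attributes.  Bitmap contents are loaded and the
 * capacity is set later, in do_md_run().
 */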
4934 int md_run(struct mddev *mddev) 4934 int md_run(struct mddev *mddev)
4935 { 4935 {
4936 int err; 4936 int err;
4937 struct md_rdev *rdev; 4937 struct md_rdev *rdev;
4938 struct md_personality *pers; 4938 struct md_personality *pers;
4939 4939
4940 if (list_empty(&mddev->disks)) 4940 if (list_empty(&mddev->disks))
4941 /* cannot run an array with no devices. */ 4941 /* cannot run an array with no devices. */
4942 return -EINVAL; 4942 return -EINVAL;
4943 4943
4944 if (mddev->pers) 4944 if (mddev->pers)
4945 return -EBUSY; 4945 return -EBUSY;
4946 /* Cannot run until previous stop completes properly */ 4946 /* Cannot run until previous stop completes properly */
4947 if (mddev->sysfs_active) 4947 if (mddev->sysfs_active)
4948 return -EBUSY; 4948 return -EBUSY;
4949 4949
4950 /* 4950 /*
4951 * Analyze all RAID superblock(s) 4951 * Analyze all RAID superblock(s)
4952 */ 4952 */
4953 if (!mddev->raid_disks) { 4953 if (!mddev->raid_disks) {
4954 if (!mddev->persistent) 4954 if (!mddev->persistent)
4955 return -EINVAL; 4955 return -EINVAL;
4956 analyze_sbs(mddev); 4956 analyze_sbs(mddev);
4957 } 4957 }
4958 4958
4959 if (mddev->level != LEVEL_NONE) 4959 if (mddev->level != LEVEL_NONE)
4960 request_module("md-level-%d", mddev->level); 4960 request_module("md-level-%d", mddev->level);
4961 else if (mddev->clevel[0]) 4961 else if (mddev->clevel[0])
4962 request_module("md-%s", mddev->clevel); 4962 request_module("md-%s", mddev->clevel);
4963 4963
4964 /* 4964 /*
4965 * Drop all container device buffers, from now on 4965 * Drop all container device buffers, from now on
4966 * the only valid external interface is through the md 4966 * the only valid external interface is through the md
4967 * device. 4967 * device.
4968 */ 4968 */
4969 rdev_for_each(rdev, mddev) { 4969 rdev_for_each(rdev, mddev) {
4970 if (test_bit(Faulty, &rdev->flags)) 4970 if (test_bit(Faulty, &rdev->flags))
4971 continue; 4971 continue;
4972 sync_blockdev(rdev->bdev); 4972 sync_blockdev(rdev->bdev);
4973 invalidate_bdev(rdev->bdev); 4973 invalidate_bdev(rdev->bdev);
4974 4974
4975 /* perform some consistency tests on the device. 4975 /* perform some consistency tests on the device.
4976 * We don't want the data to overlap the metadata. 4976 * We don't want the data to overlap the metadata.
4977 * Internal Bitmap issues have been handled elsewhere. 4977 * Internal Bitmap issues have been handled elsewhere.
4978 */ 4978 */
4979 if (rdev->meta_bdev) { 4979 if (rdev->meta_bdev) {
4980 /* Nothing to check */; 4980 /* Nothing to check */;
4981 } else if (rdev->data_offset < rdev->sb_start) { 4981 } else if (rdev->data_offset < rdev->sb_start) {
4982 if (mddev->dev_sectors && 4982 if (mddev->dev_sectors &&
4983 rdev->data_offset + mddev->dev_sectors 4983 rdev->data_offset + mddev->dev_sectors
4984 > rdev->sb_start) { 4984 > rdev->sb_start) {
4985 printk("md: %s: data overlaps metadata\n", 4985 printk("md: %s: data overlaps metadata\n",
4986 mdname(mddev)); 4986 mdname(mddev));
4987 return -EINVAL; 4987 return -EINVAL;
4988 } 4988 }
4989 } else { 4989 } else {
4990 if (rdev->sb_start + rdev->sb_size/512 4990 if (rdev->sb_start + rdev->sb_size/512
4991 > rdev->data_offset) { 4991 > rdev->data_offset) {
4992 printk("md: %s: metadata overlaps data\n", 4992 printk("md: %s: metadata overlaps data\n",
4993 mdname(mddev)); 4993 mdname(mddev));
4994 return -EINVAL; 4994 return -EINVAL;
4995 } 4995 }
4996 } 4996 }
4997 sysfs_notify_dirent_safe(rdev->sysfs_state); 4997 sysfs_notify_dirent_safe(rdev->sysfs_state);
4998 } 4998 }
4999 4999
5000 if (mddev->bio_set == NULL) 5000 if (mddev->bio_set == NULL)
5001 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); 5001 mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0);
5002 5002
5003 spin_lock(&pers_lock); 5003 spin_lock(&pers_lock);
5004 pers = find_pers(mddev->level, mddev->clevel); 5004 pers = find_pers(mddev->level, mddev->clevel);
5005 if (!pers || !try_module_get(pers->owner)) { 5005 if (!pers || !try_module_get(pers->owner)) {
5006 spin_unlock(&pers_lock); 5006 spin_unlock(&pers_lock);
5007 if (mddev->level != LEVEL_NONE) 5007 if (mddev->level != LEVEL_NONE)
5008 printk(KERN_WARNING "md: personality for level %d is not loaded!\n", 5008 printk(KERN_WARNING "md: personality for level %d is not loaded!\n",
5009 mddev->level); 5009 mddev->level);
5010 else 5010 else
5011 printk(KERN_WARNING "md: personality for level %s is not loaded!\n", 5011 printk(KERN_WARNING "md: personality for level %s is not loaded!\n",
5012 mddev->clevel); 5012 mddev->clevel);
5013 return -EINVAL; 5013 return -EINVAL;
5014 } 5014 }
5015 mddev->pers = pers; 5015 mddev->pers = pers;
5016 spin_unlock(&pers_lock); 5016 spin_unlock(&pers_lock);
5017 if (mddev->level != pers->level) { 5017 if (mddev->level != pers->level) {
5018 mddev->level = pers->level; 5018 mddev->level = pers->level;
5019 mddev->new_level = pers->level; 5019 mddev->new_level = pers->level;
5020 } 5020 }
5021 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); 5021 strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel));
5022 5022
5023 if (mddev->reshape_position != MaxSector && 5023 if (mddev->reshape_position != MaxSector &&
5024 pers->start_reshape == NULL) { 5024 pers->start_reshape == NULL) {
5025 /* This personality cannot handle reshaping... */ 5025 /* This personality cannot handle reshaping... */
5026 mddev->pers = NULL; 5026 mddev->pers = NULL;
5027 module_put(pers->owner); 5027 module_put(pers->owner);
5028 return -EINVAL; 5028 return -EINVAL;
5029 } 5029 }
5030 5030
5031 if (pers->sync_request) { 5031 if (pers->sync_request) {
5032 /* Warn if this is a potentially silly 5032 /* Warn if this is a potentially silly
5033 * configuration. 5033 * configuration.
5034 */ 5034 */
5035 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE]; 5035 char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
5036 struct md_rdev *rdev2; 5036 struct md_rdev *rdev2;
5037 int warned = 0; 5037 int warned = 0;
5038 5038
5039 rdev_for_each(rdev, mddev) 5039 rdev_for_each(rdev, mddev)
5040 rdev_for_each(rdev2, mddev) { 5040 rdev_for_each(rdev2, mddev) {
5041 if (rdev < rdev2 && 5041 if (rdev < rdev2 &&
5042 rdev->bdev->bd_contains == 5042 rdev->bdev->bd_contains ==
5043 rdev2->bdev->bd_contains) { 5043 rdev2->bdev->bd_contains) {
5044 printk(KERN_WARNING 5044 printk(KERN_WARNING
5045 "%s: WARNING: %s appears to be" 5045 "%s: WARNING: %s appears to be"
5046 " on the same physical disk as" 5046 " on the same physical disk as"
5047 " %s.\n", 5047 " %s.\n",
5048 mdname(mddev), 5048 mdname(mddev),
5049 bdevname(rdev->bdev,b), 5049 bdevname(rdev->bdev,b),
5050 bdevname(rdev2->bdev,b2)); 5050 bdevname(rdev2->bdev,b2));
5051 warned = 1; 5051 warned = 1;
5052 } 5052 }
5053 } 5053 }
5054 5054
5055 if (warned) 5055 if (warned)
5056 printk(KERN_WARNING 5056 printk(KERN_WARNING
5057 "True protection against single-disk" 5057 "True protection against single-disk"
5058 " failure might be compromised.\n"); 5058 " failure might be compromised.\n");
5059 } 5059 }
5060 5060
5061 mddev->recovery = 0; 5061 mddev->recovery = 0;
5062 /* may be overridden by personality */ 5062 /* may be overridden by personality */
5063 mddev->resync_max_sectors = mddev->dev_sectors; 5063 mddev->resync_max_sectors = mddev->dev_sectors;
5064 5064
5065 mddev->ok_start_degraded = start_dirty_degraded; 5065 mddev->ok_start_degraded = start_dirty_degraded;
5066 5066
5067 if (start_readonly && mddev->ro == 0) 5067 if (start_readonly && mddev->ro == 0)
5068 mddev->ro = 2; /* read-only, but switch on first write */ 5068 mddev->ro = 2; /* read-only, but switch on first write */
5069 5069
5070 err = mddev->pers->run(mddev); 5070 err = mddev->pers->run(mddev);
5071 if (err) 5071 if (err)
5072 printk(KERN_ERR "md: pers->run() failed ...\n"); 5072 printk(KERN_ERR "md: pers->run() failed ...\n");
5073 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) { 5073 else if (mddev->pers->size(mddev, 0, 0) < mddev->array_sectors) {
5074 WARN_ONCE(!mddev->external_size, "%s: default size too small," 5074 WARN_ONCE(!mddev->external_size, "%s: default size too small,"
5075 " but 'external_size' not in effect?\n", __func__); 5075 " but 'external_size' not in effect?\n", __func__);
5076 printk(KERN_ERR 5076 printk(KERN_ERR
5077 "md: invalid array_size %llu > default size %llu\n", 5077 "md: invalid array_size %llu > default size %llu\n",
5078 (unsigned long long)mddev->array_sectors / 2, 5078 (unsigned long long)mddev->array_sectors / 2,
5079 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2); 5079 (unsigned long long)mddev->pers->size(mddev, 0, 0) / 2);
5080 err = -EINVAL; 5080 err = -EINVAL;
5081 mddev->pers->stop(mddev); 5081 mddev->pers->stop(mddev);
5082 } 5082 }
5083 if (err == 0 && mddev->pers->sync_request && 5083 if (err == 0 && mddev->pers->sync_request &&
5084 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) { 5084 (mddev->bitmap_info.file || mddev->bitmap_info.offset)) {
5085 err = bitmap_create(mddev); 5085 err = bitmap_create(mddev);
5086 if (err) { 5086 if (err) {
5087 printk(KERN_ERR "%s: failed to create bitmap (%d)\n", 5087 printk(KERN_ERR "%s: failed to create bitmap (%d)\n",
5088 mdname(mddev), err); 5088 mdname(mddev), err);
5089 mddev->pers->stop(mddev); 5089 mddev->pers->stop(mddev);
5090 } 5090 }
5091 } 5091 }
5092 if (err) { 5092 if (err) {
5093 module_put(mddev->pers->owner); 5093 module_put(mddev->pers->owner);
5094 mddev->pers = NULL; 5094 mddev->pers = NULL;
5095 bitmap_destroy(mddev); 5095 bitmap_destroy(mddev);
5096 return err; 5096 return err;
5097 } 5097 }
5098 if (mddev->pers->sync_request) { 5098 if (mddev->pers->sync_request) {
5099 if (mddev->kobj.sd && 5099 if (mddev->kobj.sd &&
5100 sysfs_create_group(&mddev->kobj, &md_redundancy_group)) 5100 sysfs_create_group(&mddev->kobj, &md_redundancy_group))
5101 printk(KERN_WARNING 5101 printk(KERN_WARNING
5102 "md: cannot register extra attributes for %s\n", 5102 "md: cannot register extra attributes for %s\n",
5103 mdname(mddev)); 5103 mdname(mddev));
5104 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action"); 5104 mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
5105 } else if (mddev->ro == 2) /* auto-readonly not meaningful */ 5105 } else if (mddev->ro == 2) /* auto-readonly not meaningful */
5106 mddev->ro = 0; 5106 mddev->ro = 0;
5107 5107
5108 atomic_set(&mddev->writes_pending,0); 5108 atomic_set(&mddev->writes_pending,0);
5109 atomic_set(&mddev->max_corr_read_errors, 5109 atomic_set(&mddev->max_corr_read_errors,
5110 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS); 5110 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
5111 mddev->safemode = 0; 5111 mddev->safemode = 0;
5112 mddev->safemode_timer.function = md_safemode_timeout; 5112 mddev->safemode_timer.function = md_safemode_timeout;
5113 mddev->safemode_timer.data = (unsigned long) mddev; 5113 mddev->safemode_timer.data = (unsigned long) mddev;
5114 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ 5114 mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
5115 mddev->in_sync = 1; 5115 mddev->in_sync = 1;
5116 smp_wmb(); 5116 smp_wmb();
5117 mddev->ready = 1; 5117 mddev->ready = 1;
5118 rdev_for_each(rdev, mddev) 5118 rdev_for_each(rdev, mddev)
5119 if (rdev->raid_disk >= 0) 5119 if (rdev->raid_disk >= 0)
5120 if (sysfs_link_rdev(mddev, rdev)) 5120 if (sysfs_link_rdev(mddev, rdev))
5121 /* failure here is OK */; 5121 /* failure here is OK */;
5122 5122
5123 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5123 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5124 5124
5125 if (mddev->flags & MD_UPDATE_SB_FLAGS) 5125 if (mddev->flags & MD_UPDATE_SB_FLAGS)
5126 md_update_sb(mddev, 0); 5126 md_update_sb(mddev, 0);
5127 5127
5128 md_new_event(mddev); 5128 md_new_event(mddev);
5129 sysfs_notify_dirent_safe(mddev->sysfs_state); 5129 sysfs_notify_dirent_safe(mddev->sysfs_state);
5130 sysfs_notify_dirent_safe(mddev->sysfs_action); 5130 sysfs_notify_dirent_safe(mddev->sysfs_action);
5131 sysfs_notify(&mddev->kobj, NULL, "degraded"); 5131 sysfs_notify(&mddev->kobj, NULL, "degraded");
5132 return 0; 5132 return 0;
5133 } 5133 }
5134 EXPORT_SYMBOL_GPL(md_run); 5134 EXPORT_SYMBOL_GPL(md_run);
5135 5135
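/*
 * Full array start used by the ioctl and autorun paths: md_run() plus
 * loading the bitmap contents, waking the main and sync threads (which
 * may resume a reshape), setting the gendisk capacity and announcing
 * the change to userspace with a uevent.
 */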
5136 static int do_md_run(struct mddev *mddev) 5136 static int do_md_run(struct mddev *mddev)
5137 { 5137 {
5138 int err; 5138 int err;
5139 5139
5140 err = md_run(mddev); 5140 err = md_run(mddev);
5141 if (err) 5141 if (err)
5142 goto out; 5142 goto out;
5143 err = bitmap_load(mddev); 5143 err = bitmap_load(mddev);
5144 if (err) { 5144 if (err) {
5145 bitmap_destroy(mddev); 5145 bitmap_destroy(mddev);
5146 goto out; 5146 goto out;
5147 } 5147 }
5148 5148
5149 md_wakeup_thread(mddev->thread); 5149 md_wakeup_thread(mddev->thread);
5150 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ 5150 md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */
5151 5151
5152 set_capacity(mddev->gendisk, mddev->array_sectors); 5152 set_capacity(mddev->gendisk, mddev->array_sectors);
5153 revalidate_disk(mddev->gendisk); 5153 revalidate_disk(mddev->gendisk);
5154 mddev->changed = 1; 5154 mddev->changed = 1;
5155 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5155 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5156 out: 5156 out:
5157 return err; 5157 return err;
5158 } 5158 }
5159 5159
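/*
 * Switch an assembled but read-only array back to read-write, clear
 * safemode and kick off any pending recovery or resync.
 */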
5160 static int restart_array(struct mddev *mddev) 5160 static int restart_array(struct mddev *mddev)
5161 { 5161 {
5162 struct gendisk *disk = mddev->gendisk; 5162 struct gendisk *disk = mddev->gendisk;
5163 5163
5164 /* Complain if it has no devices */ 5164 /* Complain if it has no devices */
5165 if (list_empty(&mddev->disks)) 5165 if (list_empty(&mddev->disks))
5166 return -ENXIO; 5166 return -ENXIO;
5167 if (!mddev->pers) 5167 if (!mddev->pers)
5168 return -EINVAL; 5168 return -EINVAL;
5169 if (!mddev->ro) 5169 if (!mddev->ro)
5170 return -EBUSY; 5170 return -EBUSY;
5171 mddev->safemode = 0; 5171 mddev->safemode = 0;
5172 mddev->ro = 0; 5172 mddev->ro = 0;
5173 set_disk_ro(disk, 0); 5173 set_disk_ro(disk, 0);
5174 printk(KERN_INFO "md: %s switched to read-write mode.\n", 5174 printk(KERN_INFO "md: %s switched to read-write mode.\n",
5175 mdname(mddev)); 5175 mdname(mddev));
5176 /* Kick recovery or resync if necessary */ 5176 /* Kick recovery or resync if necessary */
5177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 5177 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
5178 md_wakeup_thread(mddev->thread); 5178 md_wakeup_thread(mddev->thread);
5179 md_wakeup_thread(mddev->sync_thread); 5179 md_wakeup_thread(mddev->sync_thread);
5180 sysfs_notify_dirent_safe(mddev->sysfs_state); 5180 sysfs_notify_dirent_safe(mddev->sysfs_state);
5181 return 0; 5181 return 0;
5182 } 5182 }
5183 5183
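/*
 * Reset every field of the mddev to its default so the structure can
 * be reused after the array has been fully stopped.
 */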
5184 static void md_clean(struct mddev *mddev) 5184 static void md_clean(struct mddev *mddev)
5185 { 5185 {
5186 mddev->array_sectors = 0; 5186 mddev->array_sectors = 0;
5187 mddev->external_size = 0; 5187 mddev->external_size = 0;
5188 mddev->dev_sectors = 0; 5188 mddev->dev_sectors = 0;
5189 mddev->raid_disks = 0; 5189 mddev->raid_disks = 0;
5190 mddev->recovery_cp = 0; 5190 mddev->recovery_cp = 0;
5191 mddev->resync_min = 0; 5191 mddev->resync_min = 0;
5192 mddev->resync_max = MaxSector; 5192 mddev->resync_max = MaxSector;
5193 mddev->reshape_position = MaxSector; 5193 mddev->reshape_position = MaxSector;
5194 mddev->external = 0; 5194 mddev->external = 0;
5195 mddev->persistent = 0; 5195 mddev->persistent = 0;
5196 mddev->level = LEVEL_NONE; 5196 mddev->level = LEVEL_NONE;
5197 mddev->clevel[0] = 0; 5197 mddev->clevel[0] = 0;
5198 mddev->flags = 0; 5198 mddev->flags = 0;
5199 mddev->ro = 0; 5199 mddev->ro = 0;
5200 mddev->metadata_type[0] = 0; 5200 mddev->metadata_type[0] = 0;
5201 mddev->chunk_sectors = 0; 5201 mddev->chunk_sectors = 0;
5202 mddev->ctime = mddev->utime = 0; 5202 mddev->ctime = mddev->utime = 0;
5203 mddev->layout = 0; 5203 mddev->layout = 0;
5204 mddev->max_disks = 0; 5204 mddev->max_disks = 0;
5205 mddev->events = 0; 5205 mddev->events = 0;
5206 mddev->can_decrease_events = 0; 5206 mddev->can_decrease_events = 0;
5207 mddev->delta_disks = 0; 5207 mddev->delta_disks = 0;
5208 mddev->reshape_backwards = 0; 5208 mddev->reshape_backwards = 0;
5209 mddev->new_level = LEVEL_NONE; 5209 mddev->new_level = LEVEL_NONE;
5210 mddev->new_layout = 0; 5210 mddev->new_layout = 0;
5211 mddev->new_chunk_sectors = 0; 5211 mddev->new_chunk_sectors = 0;
5212 mddev->curr_resync = 0; 5212 mddev->curr_resync = 0;
5213 atomic64_set(&mddev->resync_mismatches, 0); 5213 atomic64_set(&mddev->resync_mismatches, 0);
5214 mddev->suspend_lo = mddev->suspend_hi = 0; 5214 mddev->suspend_lo = mddev->suspend_hi = 0;
5215 mddev->sync_speed_min = mddev->sync_speed_max = 0; 5215 mddev->sync_speed_min = mddev->sync_speed_max = 0;
5216 mddev->recovery = 0; 5216 mddev->recovery = 0;
5217 mddev->in_sync = 0; 5217 mddev->in_sync = 0;
5218 mddev->changed = 0; 5218 mddev->changed = 0;
5219 mddev->degraded = 0; 5219 mddev->degraded = 0;
5220 mddev->safemode = 0; 5220 mddev->safemode = 0;
5221 mddev->merge_check_needed = 0; 5221 mddev->merge_check_needed = 0;
5222 mddev->bitmap_info.offset = 0; 5222 mddev->bitmap_info.offset = 0;
5223 mddev->bitmap_info.default_offset = 0; 5223 mddev->bitmap_info.default_offset = 0;
5224 mddev->bitmap_info.default_space = 0; 5224 mddev->bitmap_info.default_space = 0;
5225 mddev->bitmap_info.chunksize = 0; 5225 mddev->bitmap_info.chunksize = 0;
5226 mddev->bitmap_info.daemon_sleep = 0; 5226 mddev->bitmap_info.daemon_sleep = 0;
5227 mddev->bitmap_info.max_write_behind = 0; 5227 mddev->bitmap_info.max_write_behind = 0;
5228 } 5228 }
5229 5229
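/*
 * Quiesce all write-out: freeze recovery, interrupt and reap any
 * running sync thread, stop the safemode timer, flush the bitmap and
 * wait for superblock writes, then mark the array clean if it was
 * read-write.  Caller must hold the array lock.
 */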
5230 static void __md_stop_writes(struct mddev *mddev) 5230 static void __md_stop_writes(struct mddev *mddev)
5231 { 5231 {
5232 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5232 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5233 if (mddev->sync_thread) { 5233 if (mddev->sync_thread) {
5234 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5234 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5235 md_reap_sync_thread(mddev); 5235 md_reap_sync_thread(mddev);
5236 } 5236 }
5237 5237
5238 del_timer_sync(&mddev->safemode_timer); 5238 del_timer_sync(&mddev->safemode_timer);
5239 5239
5240 bitmap_flush(mddev); 5240 bitmap_flush(mddev);
5241 md_super_wait(mddev); 5241 md_super_wait(mddev);
5242 5242
5243 if (mddev->ro == 0 && 5243 if (mddev->ro == 0 &&
5244 (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) { 5244 (!mddev->in_sync || (mddev->flags & MD_UPDATE_SB_FLAGS))) {
5245 /* mark array as shutdown cleanly */ 5245 /* mark array as shutdown cleanly */
5246 mddev->in_sync = 1; 5246 mddev->in_sync = 1;
5247 md_update_sb(mddev, 1); 5247 md_update_sb(mddev, 1);
5248 } 5248 }
5249 } 5249 }
5250 5250
5251 void md_stop_writes(struct mddev *mddev) 5251 void md_stop_writes(struct mddev *mddev)
5252 { 5252 {
5253 mddev_lock_nointr(mddev); 5253 mddev_lock_nointr(mddev);
5254 __md_stop_writes(mddev); 5254 __md_stop_writes(mddev);
5255 mddev_unlock(mddev); 5255 mddev_unlock(mddev);
5256 } 5256 }
5257 EXPORT_SYMBOL_GPL(md_stop_writes); 5257 EXPORT_SYMBOL_GPL(md_stop_writes);
5258 5258
5259 static void __md_stop(struct mddev *mddev) 5259 static void __md_stop(struct mddev *mddev)
5260 { 5260 {
5261 mddev->ready = 0; 5261 mddev->ready = 0;
5262 mddev->pers->stop(mddev); 5262 mddev->pers->stop(mddev);
5263 if (mddev->pers->sync_request && mddev->to_remove == NULL) 5263 if (mddev->pers->sync_request && mddev->to_remove == NULL)
5264 mddev->to_remove = &md_redundancy_group; 5264 mddev->to_remove = &md_redundancy_group;
5265 module_put(mddev->pers->owner); 5265 module_put(mddev->pers->owner);
5266 mddev->pers = NULL; 5266 mddev->pers = NULL;
5267 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5267 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5268 } 5268 }
5269 5269
5270 void md_stop(struct mddev *mddev) 5270 void md_stop(struct mddev *mddev)
5271 { 5271 {
5272 /* stop the array and free any attached data structures. 5272 /* stop the array and free any attached data structures.
5273 * This is called from dm-raid 5273 * This is called from dm-raid
5274 */ 5274 */
5275 __md_stop(mddev); 5275 __md_stop(mddev);
5276 bitmap_destroy(mddev); 5276 bitmap_destroy(mddev);
5277 if (mddev->bio_set) 5277 if (mddev->bio_set)
5278 bioset_free(mddev->bio_set); 5278 bioset_free(mddev->bio_set);
5279 } 5279 }
5280 5280
5281 EXPORT_SYMBOL_GPL(md_stop); 5281 EXPORT_SYMBOL_GPL(md_stop);
5282 5282
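/*
 * Transition a running array to read-only (ro == 1).  Any running
 * resync or reshape is interrupted and waited for first; the switch is
 * refused with -EBUSY while other openers remain, and -ENXIO is
 * returned if the array is already read-only.
 */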
5283 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev) 5283 static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
5284 { 5284 {
5285 int err = 0; 5285 int err = 0;
5286 int did_freeze = 0; 5286 int did_freeze = 0;
5287 5287
5288 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5288 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5289 did_freeze = 1; 5289 did_freeze = 1;
5290 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5290 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5291 md_wakeup_thread(mddev->thread); 5291 md_wakeup_thread(mddev->thread);
5292 } 5292 }
5293 if (mddev->sync_thread) { 5293 if (mddev->sync_thread) {
5294 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5294 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5295 /* Thread might be blocked waiting for metadata update 5295 /* Thread might be blocked waiting for metadata update
5296 * which will now never happen */ 5296 * which will now never happen */
5297 wake_up_process(mddev->sync_thread->tsk); 5297 wake_up_process(mddev->sync_thread->tsk);
5298 } 5298 }
5299 mddev_unlock(mddev); 5299 mddev_unlock(mddev);
5300 wait_event(resync_wait, mddev->sync_thread == NULL); 5300 wait_event(resync_wait, mddev->sync_thread == NULL);
5301 mddev_lock_nointr(mddev); 5301 mddev_lock_nointr(mddev);
5302 5302
5303 mutex_lock(&mddev->open_mutex); 5303 mutex_lock(&mddev->open_mutex);
5304 if (atomic_read(&mddev->openers) > !!bdev || 5304 if (atomic_read(&mddev->openers) > !!bdev ||
5305 mddev->sync_thread || 5305 mddev->sync_thread ||
5306 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5306 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5307 printk("md: %s still in use.\n",mdname(mddev)); 5307 printk("md: %s still in use.\n",mdname(mddev));
5308 if (did_freeze) { 5308 if (did_freeze) {
5309 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5309 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5310 md_wakeup_thread(mddev->thread); 5310 md_wakeup_thread(mddev->thread);
5311 } 5311 }
5312 err = -EBUSY; 5312 err = -EBUSY;
5313 goto out; 5313 goto out;
5314 } 5314 }
5315 if (mddev->pers) { 5315 if (mddev->pers) {
5316 __md_stop_writes(mddev); 5316 __md_stop_writes(mddev);
5317 5317
5318 err = -ENXIO; 5318 err = -ENXIO;
5319 if (mddev->ro==1) 5319 if (mddev->ro==1)
5320 goto out; 5320 goto out;
5321 mddev->ro = 1; 5321 mddev->ro = 1;
5322 set_disk_ro(mddev->gendisk, 1); 5322 set_disk_ro(mddev->gendisk, 1);
5323 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5323 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5324 sysfs_notify_dirent_safe(mddev->sysfs_state); 5324 sysfs_notify_dirent_safe(mddev->sysfs_state);
5325 err = 0; 5325 err = 0;
5326 } 5326 }
5327 out: 5327 out:
5328 mutex_unlock(&mddev->open_mutex); 5328 mutex_unlock(&mddev->open_mutex);
5329 return err; 5329 return err;
5330 } 5330 }
5331 5331
5332 /* mode: 5332 /* mode:
5333 * 0 - completely stop and dis-assemble array 5333 * 0 - completely stop and dis-assemble array
5334 * 2 - stop but do not disassemble array 5334 * 2 - stop but do not disassemble array
5335 */ 5335 */
5336 static int do_md_stop(struct mddev * mddev, int mode, 5336 static int do_md_stop(struct mddev * mddev, int mode,
5337 struct block_device *bdev) 5337 struct block_device *bdev)
5338 { 5338 {
5339 struct gendisk *disk = mddev->gendisk; 5339 struct gendisk *disk = mddev->gendisk;
5340 struct md_rdev *rdev; 5340 struct md_rdev *rdev;
5341 int did_freeze = 0; 5341 int did_freeze = 0;
5342 5342
5343 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) { 5343 if (!test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) {
5344 did_freeze = 1; 5344 did_freeze = 1;
5345 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5345 set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5346 md_wakeup_thread(mddev->thread); 5346 md_wakeup_thread(mddev->thread);
5347 } 5347 }
5348 if (mddev->sync_thread) { 5348 if (mddev->sync_thread) {
5349 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 5349 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
5350 /* Thread might be blocked waiting for metadata update 5350 /* Thread might be blocked waiting for metadata update
5351 * which will now never happen */ 5351 * which will now never happen */
5352 wake_up_process(mddev->sync_thread->tsk); 5352 wake_up_process(mddev->sync_thread->tsk);
5353 } 5353 }
5354 mddev_unlock(mddev); 5354 mddev_unlock(mddev);
5355 wait_event(resync_wait, mddev->sync_thread == NULL); 5355 wait_event(resync_wait, mddev->sync_thread == NULL);
5356 mddev_lock_nointr(mddev); 5356 mddev_lock_nointr(mddev);
5357 5357
5358 mutex_lock(&mddev->open_mutex); 5358 mutex_lock(&mddev->open_mutex);
5359 if (atomic_read(&mddev->openers) > !!bdev || 5359 if (atomic_read(&mddev->openers) > !!bdev ||
5360 mddev->sysfs_active || 5360 mddev->sysfs_active ||
5361 mddev->sync_thread || 5361 mddev->sync_thread ||
5362 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) { 5362 (bdev && !test_bit(MD_STILL_CLOSED, &mddev->flags))) {
5363 printk("md: %s still in use.\n",mdname(mddev)); 5363 printk("md: %s still in use.\n",mdname(mddev));
5364 mutex_unlock(&mddev->open_mutex); 5364 mutex_unlock(&mddev->open_mutex);
5365 if (did_freeze) { 5365 if (did_freeze) {
5366 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery); 5366 clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
5367 md_wakeup_thread(mddev->thread); 5367 md_wakeup_thread(mddev->thread);
5368 } 5368 }
5369 return -EBUSY; 5369 return -EBUSY;
5370 } 5370 }
5371 if (mddev->pers) { 5371 if (mddev->pers) {
5372 if (mddev->ro) 5372 if (mddev->ro)
5373 set_disk_ro(disk, 0); 5373 set_disk_ro(disk, 0);
5374 5374
5375 __md_stop_writes(mddev); 5375 __md_stop_writes(mddev);
5376 __md_stop(mddev); 5376 __md_stop(mddev);
5377 mddev->queue->merge_bvec_fn = NULL; 5377 mddev->queue->merge_bvec_fn = NULL;
5378 mddev->queue->backing_dev_info.congested_fn = NULL; 5378 mddev->queue->backing_dev_info.congested_fn = NULL;
5379 5379
5380 /* tell userspace to handle 'inactive' */ 5380 /* tell userspace to handle 'inactive' */
5381 sysfs_notify_dirent_safe(mddev->sysfs_state); 5381 sysfs_notify_dirent_safe(mddev->sysfs_state);
5382 5382
5383 rdev_for_each(rdev, mddev) 5383 rdev_for_each(rdev, mddev)
5384 if (rdev->raid_disk >= 0) 5384 if (rdev->raid_disk >= 0)
5385 sysfs_unlink_rdev(mddev, rdev); 5385 sysfs_unlink_rdev(mddev, rdev);
5386 5386
5387 set_capacity(disk, 0); 5387 set_capacity(disk, 0);
5388 mutex_unlock(&mddev->open_mutex); 5388 mutex_unlock(&mddev->open_mutex);
5389 mddev->changed = 1; 5389 mddev->changed = 1;
5390 revalidate_disk(disk); 5390 revalidate_disk(disk);
5391 5391
5392 if (mddev->ro) 5392 if (mddev->ro)
5393 mddev->ro = 0; 5393 mddev->ro = 0;
5394 } else 5394 } else
5395 mutex_unlock(&mddev->open_mutex); 5395 mutex_unlock(&mddev->open_mutex);
5396 /* 5396 /*
5397 * Free resources if final stop 5397 * Free resources if final stop
5398 */ 5398 */
5399 if (mode == 0) { 5399 if (mode == 0) {
5400 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev)); 5400 printk(KERN_INFO "md: %s stopped.\n", mdname(mddev));
5401 5401
5402 bitmap_destroy(mddev); 5402 bitmap_destroy(mddev);
5403 if (mddev->bitmap_info.file) { 5403 if (mddev->bitmap_info.file) {
5404 fput(mddev->bitmap_info.file); 5404 fput(mddev->bitmap_info.file);
5405 mddev->bitmap_info.file = NULL; 5405 mddev->bitmap_info.file = NULL;
5406 } 5406 }
5407 mddev->bitmap_info.offset = 0; 5407 mddev->bitmap_info.offset = 0;
5408 5408
5409 export_array(mddev); 5409 export_array(mddev);
5410 5410
5411 md_clean(mddev); 5411 md_clean(mddev);
5412 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE); 5412 kobject_uevent(&disk_to_dev(mddev->gendisk)->kobj, KOBJ_CHANGE);
5413 if (mddev->hold_active == UNTIL_STOP) 5413 if (mddev->hold_active == UNTIL_STOP)
5414 mddev->hold_active = 0; 5414 mddev->hold_active = 0;
5415 } 5415 }
5416 blk_integrity_unregister(disk); 5416 blk_integrity_unregister(disk);
5417 md_new_event(mddev); 5417 md_new_event(mddev);
5418 sysfs_notify_dirent_safe(mddev->sysfs_state); 5418 sysfs_notify_dirent_safe(mddev->sysfs_state);
5419 return 0; 5419 return 0;
5420 } 5420 }
5421 5421
5422 #ifndef MODULE 5422 #ifndef MODULE
5423 static void autorun_array(struct mddev *mddev) 5423 static void autorun_array(struct mddev *mddev)
5424 { 5424 {
5425 struct md_rdev *rdev; 5425 struct md_rdev *rdev;
5426 int err; 5426 int err;
5427 5427
5428 if (list_empty(&mddev->disks)) 5428 if (list_empty(&mddev->disks))
5429 return; 5429 return;
5430 5430
5431 printk(KERN_INFO "md: running: "); 5431 printk(KERN_INFO "md: running: ");
5432 5432
5433 rdev_for_each(rdev, mddev) { 5433 rdev_for_each(rdev, mddev) {
5434 char b[BDEVNAME_SIZE]; 5434 char b[BDEVNAME_SIZE];
5435 printk("<%s>", bdevname(rdev->bdev,b)); 5435 printk("<%s>", bdevname(rdev->bdev,b));
5436 } 5436 }
5437 printk("\n"); 5437 printk("\n");
5438 5438
5439 err = do_md_run(mddev); 5439 err = do_md_run(mddev);
5440 if (err) { 5440 if (err) {
5441 printk(KERN_WARNING "md: do_md_run() returned %d\n", err); 5441 printk(KERN_WARNING "md: do_md_run() returned %d\n", err);
5442 do_md_stop(mddev, 0, NULL); 5442 do_md_stop(mddev, 0, NULL);
5443 } 5443 }
5444 } 5444 }
5445 5445
5446 /* 5446 /*
5447 * let's try to run arrays based on all disks that have arrived 5447 * let's try to run arrays based on all disks that have arrived
5448 * until now. (those are in pending_raid_disks) 5448 * until now. (those are in pending_raid_disks)
5449 * 5449 *
5450 * the method: pick the first pending disk, collect all disks with 5450 * the method: pick the first pending disk, collect all disks with
5451 * the same UUID, remove all from the pending list and put them into 5451 * the same UUID, remove all from the pending list and put them into
5452 * the 'same_array' list. Then order this list based on superblock 5452 * the 'same_array' list. Then order this list based on superblock
5453 * update time (freshest comes first), kick out 'old' disks and 5453 * update time (freshest comes first), kick out 'old' disks and
5454 * compare superblocks. If everything's fine then run it. 5454 * compare superblocks. If everything's fine then run it.
5455 * 5455 *
5456 * If "unit" is allocated, then bump its reference count 5456 * If "unit" is allocated, then bump its reference count
5457 */ 5457 */
5458 static void autorun_devices(int part) 5458 static void autorun_devices(int part)
5459 { 5459 {
5460 struct md_rdev *rdev0, *rdev, *tmp; 5460 struct md_rdev *rdev0, *rdev, *tmp;
5461 struct mddev *mddev; 5461 struct mddev *mddev;
5462 char b[BDEVNAME_SIZE]; 5462 char b[BDEVNAME_SIZE];
5463 5463
5464 printk(KERN_INFO "md: autorun ...\n"); 5464 printk(KERN_INFO "md: autorun ...\n");
5465 while (!list_empty(&pending_raid_disks)) { 5465 while (!list_empty(&pending_raid_disks)) {
5466 int unit; 5466 int unit;
5467 dev_t dev; 5467 dev_t dev;
5468 LIST_HEAD(candidates); 5468 LIST_HEAD(candidates);
5469 rdev0 = list_entry(pending_raid_disks.next, 5469 rdev0 = list_entry(pending_raid_disks.next,
5470 struct md_rdev, same_set); 5470 struct md_rdev, same_set);
5471 5471
5472 printk(KERN_INFO "md: considering %s ...\n", 5472 printk(KERN_INFO "md: considering %s ...\n",
5473 bdevname(rdev0->bdev,b)); 5473 bdevname(rdev0->bdev,b));
5474 INIT_LIST_HEAD(&candidates); 5474 INIT_LIST_HEAD(&candidates);
5475 rdev_for_each_list(rdev, tmp, &pending_raid_disks) 5475 rdev_for_each_list(rdev, tmp, &pending_raid_disks)
5476 if (super_90_load(rdev, rdev0, 0) >= 0) { 5476 if (super_90_load(rdev, rdev0, 0) >= 0) {
5477 printk(KERN_INFO "md: adding %s ...\n", 5477 printk(KERN_INFO "md: adding %s ...\n",
5478 bdevname(rdev->bdev,b)); 5478 bdevname(rdev->bdev,b));
5479 list_move(&rdev->same_set, &candidates); 5479 list_move(&rdev->same_set, &candidates);
5480 } 5480 }
5481 /* 5481 /*
5482 * now we have a set of devices, with all of them having 5482 * now we have a set of devices, with all of them having
5483 * mostly sane superblocks. It's time to allocate the 5483 * mostly sane superblocks. It's time to allocate the
5484 * mddev. 5484 * mddev.
5485 */ 5485 */
5486 if (part) { 5486 if (part) {
5487 dev = MKDEV(mdp_major, 5487 dev = MKDEV(mdp_major,
5488 rdev0->preferred_minor << MdpMinorShift); 5488 rdev0->preferred_minor << MdpMinorShift);
5489 unit = MINOR(dev) >> MdpMinorShift; 5489 unit = MINOR(dev) >> MdpMinorShift;
5490 } else { 5490 } else {
5491 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor); 5491 dev = MKDEV(MD_MAJOR, rdev0->preferred_minor);
5492 unit = MINOR(dev); 5492 unit = MINOR(dev);
5493 } 5493 }
5494 if (rdev0->preferred_minor != unit) { 5494 if (rdev0->preferred_minor != unit) {
5495 printk(KERN_INFO "md: unit number in %s is bad: %d\n", 5495 printk(KERN_INFO "md: unit number in %s is bad: %d\n",
5496 bdevname(rdev0->bdev, b), rdev0->preferred_minor); 5496 bdevname(rdev0->bdev, b), rdev0->preferred_minor);
5497 break; 5497 break;
5498 } 5498 }
5499 5499
5500 md_probe(dev, NULL, NULL); 5500 md_probe(dev, NULL, NULL);
5501 mddev = mddev_find(dev); 5501 mddev = mddev_find(dev);
5502 if (!mddev || !mddev->gendisk) { 5502 if (!mddev || !mddev->gendisk) {
5503 if (mddev) 5503 if (mddev)
5504 mddev_put(mddev); 5504 mddev_put(mddev);
5505 printk(KERN_ERR 5505 printk(KERN_ERR
5506 "md: cannot allocate memory for md drive.\n"); 5506 "md: cannot allocate memory for md drive.\n");
5507 break; 5507 break;
5508 } 5508 }
5509 if (mddev_lock(mddev)) 5509 if (mddev_lock(mddev))
5510 printk(KERN_WARNING "md: %s locked, cannot run\n", 5510 printk(KERN_WARNING "md: %s locked, cannot run\n",
5511 mdname(mddev)); 5511 mdname(mddev));
5512 else if (mddev->raid_disks || mddev->major_version 5512 else if (mddev->raid_disks || mddev->major_version
5513 || !list_empty(&mddev->disks)) { 5513 || !list_empty(&mddev->disks)) {
5514 printk(KERN_WARNING 5514 printk(KERN_WARNING
5515 "md: %s already running, cannot run %s\n", 5515 "md: %s already running, cannot run %s\n",
5516 mdname(mddev), bdevname(rdev0->bdev,b)); 5516 mdname(mddev), bdevname(rdev0->bdev,b));
5517 mddev_unlock(mddev); 5517 mddev_unlock(mddev);
5518 } else { 5518 } else {
5519 printk(KERN_INFO "md: created %s\n", mdname(mddev)); 5519 printk(KERN_INFO "md: created %s\n", mdname(mddev));
5520 mddev->persistent = 1; 5520 mddev->persistent = 1;
5521 rdev_for_each_list(rdev, tmp, &candidates) { 5521 rdev_for_each_list(rdev, tmp, &candidates) {
5522 list_del_init(&rdev->same_set); 5522 list_del_init(&rdev->same_set);
5523 if (bind_rdev_to_array(rdev, mddev)) 5523 if (bind_rdev_to_array(rdev, mddev))
5524 export_rdev(rdev); 5524 export_rdev(rdev);
5525 } 5525 }
5526 autorun_array(mddev); 5526 autorun_array(mddev);
5527 mddev_unlock(mddev); 5527 mddev_unlock(mddev);
5528 } 5528 }
5529 /* on success, candidates will be empty, on error 5529 /* on success, candidates will be empty, on error
5530 * they won't be... 5530 * they won't be...
5531 */ 5531 */
5532 rdev_for_each_list(rdev, tmp, &candidates) { 5532 rdev_for_each_list(rdev, tmp, &candidates) {
5533 list_del_init(&rdev->same_set); 5533 list_del_init(&rdev->same_set);
5534 export_rdev(rdev); 5534 export_rdev(rdev);
5535 } 5535 }
5536 mddev_put(mddev); 5536 mddev_put(mddev);
5537 } 5537 }
5538 printk(KERN_INFO "md: ... autorun DONE.\n"); 5538 printk(KERN_INFO "md: ... autorun DONE.\n");
5539 } 5539 }
5540 #endif /* !MODULE */ 5540 #endif /* !MODULE */
5541 5541
5542 static int get_version(void __user * arg) 5542 static int get_version(void __user * arg)
5543 { 5543 {
5544 mdu_version_t ver; 5544 mdu_version_t ver;
5545 5545
5546 ver.major = MD_MAJOR_VERSION; 5546 ver.major = MD_MAJOR_VERSION;
5547 ver.minor = MD_MINOR_VERSION; 5547 ver.minor = MD_MINOR_VERSION;
5548 ver.patchlevel = MD_PATCHLEVEL_VERSION; 5548 ver.patchlevel = MD_PATCHLEVEL_VERSION;
5549 5549
5550 if (copy_to_user(arg, &ver, sizeof(ver))) 5550 if (copy_to_user(arg, &ver, sizeof(ver)))
5551 return -EFAULT; 5551 return -EFAULT;
5552 5552
5553 return 0; 5553 return 0;
5554 } 5554 }
5555 5555
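/*
 * GET_ARRAY_INFO ioctl: count member devices under RCU and copy an
 * mdu_array_info_t summary of the array state to userspace.
 */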
5556 static int get_array_info(struct mddev * mddev, void __user * arg) 5556 static int get_array_info(struct mddev * mddev, void __user * arg)
5557 { 5557 {
5558 mdu_array_info_t info; 5558 mdu_array_info_t info;
5559 int nr,working,insync,failed,spare; 5559 int nr,working,insync,failed,spare;
5560 struct md_rdev *rdev; 5560 struct md_rdev *rdev;
5561 5561
5562 nr = working = insync = failed = spare = 0; 5562 nr = working = insync = failed = spare = 0;
5563 rcu_read_lock(); 5563 rcu_read_lock();
5564 rdev_for_each_rcu(rdev, mddev) { 5564 rdev_for_each_rcu(rdev, mddev) {
5565 nr++; 5565 nr++;
5566 if (test_bit(Faulty, &rdev->flags)) 5566 if (test_bit(Faulty, &rdev->flags))
5567 failed++; 5567 failed++;
5568 else { 5568 else {
5569 working++; 5569 working++;
5570 if (test_bit(In_sync, &rdev->flags)) 5570 if (test_bit(In_sync, &rdev->flags))
5571 insync++; 5571 insync++;
5572 else 5572 else
5573 spare++; 5573 spare++;
5574 } 5574 }
5575 } 5575 }
5576 rcu_read_unlock(); 5576 rcu_read_unlock();
5577 5577
5578 info.major_version = mddev->major_version; 5578 info.major_version = mddev->major_version;
5579 info.minor_version = mddev->minor_version; 5579 info.minor_version = mddev->minor_version;
5580 info.patch_version = MD_PATCHLEVEL_VERSION; 5580 info.patch_version = MD_PATCHLEVEL_VERSION;
5581 info.ctime = mddev->ctime; 5581 info.ctime = mddev->ctime;
5582 info.level = mddev->level; 5582 info.level = mddev->level;
5583 info.size = mddev->dev_sectors / 2; 5583 info.size = mddev->dev_sectors / 2;
5584 if (info.size != mddev->dev_sectors / 2) /* overflow */ 5584 if (info.size != mddev->dev_sectors / 2) /* overflow */
5585 info.size = -1; 5585 info.size = -1;
5586 info.nr_disks = nr; 5586 info.nr_disks = nr;
5587 info.raid_disks = mddev->raid_disks; 5587 info.raid_disks = mddev->raid_disks;
5588 info.md_minor = mddev->md_minor; 5588 info.md_minor = mddev->md_minor;
5589 info.not_persistent= !mddev->persistent; 5589 info.not_persistent= !mddev->persistent;
5590 5590
5591 info.utime = mddev->utime; 5591 info.utime = mddev->utime;
5592 info.state = 0; 5592 info.state = 0;
5593 if (mddev->in_sync) 5593 if (mddev->in_sync)
5594 info.state = (1<<MD_SB_CLEAN); 5594 info.state = (1<<MD_SB_CLEAN);
5595 if (mddev->bitmap && mddev->bitmap_info.offset) 5595 if (mddev->bitmap && mddev->bitmap_info.offset)
5596 info.state = (1<<MD_SB_BITMAP_PRESENT); 5596 info.state = (1<<MD_SB_BITMAP_PRESENT);
5597 info.active_disks = insync; 5597 info.active_disks = insync;
5598 info.working_disks = working; 5598 info.working_disks = working;
5599 info.failed_disks = failed; 5599 info.failed_disks = failed;
5600 info.spare_disks = spare; 5600 info.spare_disks = spare;
5601 5601
5602 info.layout = mddev->layout; 5602 info.layout = mddev->layout;
5603 info.chunk_size = mddev->chunk_sectors << 9; 5603 info.chunk_size = mddev->chunk_sectors << 9;
5604 5604
5605 if (copy_to_user(arg, &info, sizeof(info))) 5605 if (copy_to_user(arg, &info, sizeof(info)))
5606 return -EFAULT; 5606 return -EFAULT;
5607 5607
5608 return 0; 5608 return 0;
5609 } 5609 }
5610 5610
5611 static int get_bitmap_file(struct mddev * mddev, void __user * arg) 5611 static int get_bitmap_file(struct mddev * mddev, void __user * arg)
5612 { 5612 {
5613 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */ 5613 mdu_bitmap_file_t *file = NULL; /* too big for stack allocation */
5614 char *ptr, *buf = NULL; 5614 char *ptr, *buf = NULL;
5615 int err = -ENOMEM; 5615 int err = -ENOMEM;
5616 5616
5617 file = kmalloc(sizeof(*file), GFP_NOIO); 5617 file = kmalloc(sizeof(*file), GFP_NOIO);
5618 5618
5619 if (!file) 5619 if (!file)
5620 goto out; 5620 goto out;
5621 5621
5622 /* bitmap disabled, zero the first byte and copy out */ 5622 /* bitmap disabled, zero the first byte and copy out */
5623 if (!mddev->bitmap || !mddev->bitmap->storage.file) { 5623 if (!mddev->bitmap || !mddev->bitmap->storage.file) {
5624 file->pathname[0] = '\0'; 5624 file->pathname[0] = '\0';
5625 goto copy_out; 5625 goto copy_out;
5626 } 5626 }
5627 5627
5628 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL); 5628 buf = kmalloc(sizeof(file->pathname), GFP_KERNEL);
5629 if (!buf) 5629 if (!buf)
5630 goto out; 5630 goto out;
5631 5631
5632 ptr = d_path(&mddev->bitmap->storage.file->f_path, 5632 ptr = d_path(&mddev->bitmap->storage.file->f_path,
5633 buf, sizeof(file->pathname)); 5633 buf, sizeof(file->pathname));
5634 if (IS_ERR(ptr)) 5634 if (IS_ERR(ptr))
5635 goto out; 5635 goto out;
5636 5636
5637 strcpy(file->pathname, ptr); 5637 strcpy(file->pathname, ptr);
5638 5638
5639 copy_out: 5639 copy_out:
5640 err = 0; 5640 err = 0;
5641 if (copy_to_user(arg, file, sizeof(*file))) 5641 if (copy_to_user(arg, file, sizeof(*file)))
5642 err = -EFAULT; 5642 err = -EFAULT;
5643 out: 5643 out:
5644 kfree(buf); 5644 kfree(buf);
5645 kfree(file); 5645 kfree(file);
5646 return err; 5646 return err;
5647 } 5647 }
5648 5648
5649 static int get_disk_info(struct mddev * mddev, void __user * arg) 5649 static int get_disk_info(struct mddev * mddev, void __user * arg)
5650 { 5650 {
5651 mdu_disk_info_t info; 5651 mdu_disk_info_t info;
5652 struct md_rdev *rdev; 5652 struct md_rdev *rdev;
5653 5653
5654 if (copy_from_user(&info, arg, sizeof(info))) 5654 if (copy_from_user(&info, arg, sizeof(info)))
5655 return -EFAULT; 5655 return -EFAULT;
5656 5656
5657 rcu_read_lock(); 5657 rcu_read_lock();
5658 rdev = find_rdev_nr_rcu(mddev, info.number); 5658 rdev = find_rdev_nr_rcu(mddev, info.number);
5659 if (rdev) { 5659 if (rdev) {
5660 info.major = MAJOR(rdev->bdev->bd_dev); 5660 info.major = MAJOR(rdev->bdev->bd_dev);
5661 info.minor = MINOR(rdev->bdev->bd_dev); 5661 info.minor = MINOR(rdev->bdev->bd_dev);
5662 info.raid_disk = rdev->raid_disk; 5662 info.raid_disk = rdev->raid_disk;
5663 info.state = 0; 5663 info.state = 0;
5664 if (test_bit(Faulty, &rdev->flags)) 5664 if (test_bit(Faulty, &rdev->flags))
5665 info.state |= (1<<MD_DISK_FAULTY); 5665 info.state |= (1<<MD_DISK_FAULTY);
5666 else if (test_bit(In_sync, &rdev->flags)) { 5666 else if (test_bit(In_sync, &rdev->flags)) {
5667 info.state |= (1<<MD_DISK_ACTIVE); 5667 info.state |= (1<<MD_DISK_ACTIVE);
5668 info.state |= (1<<MD_DISK_SYNC); 5668 info.state |= (1<<MD_DISK_SYNC);
5669 } 5669 }
5670 if (test_bit(WriteMostly, &rdev->flags)) 5670 if (test_bit(WriteMostly, &rdev->flags))
5671 info.state |= (1<<MD_DISK_WRITEMOSTLY); 5671 info.state |= (1<<MD_DISK_WRITEMOSTLY);
5672 } else { 5672 } else {
5673 info.major = info.minor = 0; 5673 info.major = info.minor = 0;
5674 info.raid_disk = -1; 5674 info.raid_disk = -1;
5675 info.state = (1<<MD_DISK_REMOVED); 5675 info.state = (1<<MD_DISK_REMOVED);
5676 } 5676 }
5677 rcu_read_unlock(); 5677 rcu_read_unlock();
5678 5678
5679 if (copy_to_user(arg, &info, sizeof(info))) 5679 if (copy_to_user(arg, &info, sizeof(info)))
5680 return -EFAULT; 5680 return -EFAULT;
5681 5681
5682 return 0; 5682 return 0;
5683 } 5683 }
5684 5684
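/*
 * ADD_NEW_DISK ioctl: if the array has not been assembled yet
 * (raid_disks == 0) the device is imported from its superblock and
 * checked for a matching UUID before being bound; on a running array
 * the device is imported, validated and hot-added, typically as a
 * spare.
 */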
static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info)
{
	char b[BDEVNAME_SIZE], b2[BDEVNAME_SIZE];
	struct md_rdev *rdev;
	dev_t dev = MKDEV(info->major,info->minor);

	if (info->major != MAJOR(dev) || info->minor != MINOR(dev))
		return -EOVERFLOW;

	if (!mddev->raid_disks) {
		int err;
		/* expecting a device which has a superblock */
		rdev = md_import_device(dev, mddev->major_version, mddev->minor_version);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		if (!list_empty(&mddev->disks)) {
			struct md_rdev *rdev0
				= list_entry(mddev->disks.next,
					     struct md_rdev, same_set);
			err = super_types[mddev->major_version]
				.load_super(rdev, rdev0, mddev->minor_version);
			if (err < 0) {
				printk(KERN_WARNING
					"md: %s has different UUID to %s\n",
					bdevname(rdev->bdev,b),
					bdevname(rdev0->bdev,b2));
				export_rdev(rdev);
				return -EINVAL;
			}
		}
		err = bind_rdev_to_array(rdev, mddev);
		if (err)
			export_rdev(rdev);
		return err;
	}

	/*
	 * add_new_disk can be used once the array is assembled
	 * to add "hot spares". They must already have a superblock
	 * written
	 */
	if (mddev->pers) {
		int err;
		if (!mddev->pers->hot_add_disk) {
			printk(KERN_WARNING
				"%s: personality does not support diskops!\n",
				mdname(mddev));
			return -EINVAL;
		}
		if (mddev->persistent)
			rdev = md_import_device(dev, mddev->major_version,
						mddev->minor_version);
		else
			rdev = md_import_device(dev, -1, -1);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: md_import_device returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		/* set saved_raid_disk if appropriate */
		if (!mddev->persistent) {
			if (info->state & (1<<MD_DISK_SYNC) &&
			    info->raid_disk < mddev->raid_disks) {
				rdev->raid_disk = info->raid_disk;
				set_bit(In_sync, &rdev->flags);
				clear_bit(Bitmap_sync, &rdev->flags);
			} else
				rdev->raid_disk = -1;
			rdev->saved_raid_disk = rdev->raid_disk;
		} else
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
		if ((info->state & (1<<MD_DISK_SYNC)) &&
		    rdev->raid_disk != info->raid_disk) {
			/* This was a hot-add request, but the event
			 * counts don't match, so reject it.
			 */
			export_rdev(rdev);
			return -EINVAL;
		}

		clear_bit(In_sync, &rdev->flags); /* just to be sure */
		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);
		else
			clear_bit(WriteMostly, &rdev->flags);

		rdev->raid_disk = -1;
		err = bind_rdev_to_array(rdev, mddev);
		if (!err && !mddev->pers->hot_remove_disk) {
			/* If there is hot_add_disk but no hot_remove_disk,
			 * then added disks are for geometry changes
			 * and should be made active immediately.
			 */
			super_types[mddev->major_version].
				validate_super(mddev, rdev);
			err = mddev->pers->hot_add_disk(mddev, rdev);
			if (err)
				unbind_rdev_from_array(rdev);
		}
		if (err)
			export_rdev(rdev);
		else
			sysfs_notify_dirent_safe(rdev->sysfs_state);

		set_bit(MD_CHANGE_DEVS, &mddev->flags);
		if (mddev->degraded)
			set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		if (!err)
			md_new_event(mddev);
		md_wakeup_thread(mddev->thread);
		return err;
	}

	/* otherwise, add_new_disk is only allowed
	 * for major_version==0 superblocks
	 */
	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: ADD_NEW_DISK not supported\n",
		       mdname(mddev));
		return -EINVAL;
	}

	if (!(info->state & (1<<MD_DISK_FAULTY))) {
		int err;
		rdev = md_import_device(dev, -1, 0);
		if (IS_ERR(rdev)) {
			printk(KERN_WARNING
				"md: error, md_import_device() returned %ld\n",
				PTR_ERR(rdev));
			return PTR_ERR(rdev);
		}
		rdev->desc_nr = info->number;
		if (info->raid_disk < mddev->raid_disks)
			rdev->raid_disk = info->raid_disk;
		else
			rdev->raid_disk = -1;

		if (rdev->raid_disk < mddev->raid_disks)
			if (info->state & (1<<MD_DISK_SYNC))
				set_bit(In_sync, &rdev->flags);

		if (info->state & (1<<MD_DISK_WRITEMOSTLY))
			set_bit(WriteMostly, &rdev->flags);

		if (!mddev->persistent) {
			printk(KERN_INFO "md: nonpersistent superblock ...\n");
			rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
		} else
			rdev->sb_start = calc_dev_sboffset(rdev);
		rdev->sectors = rdev->sb_start;

		err = bind_rdev_to_array(rdev, mddev);
		if (err) {
			export_rdev(rdev);
			return err;
		}
	}

	return 0;
}

static int hot_remove_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	struct md_rdev *rdev;

	rdev = find_rdev(mddev, dev);
	if (!rdev)
		return -ENXIO;

	clear_bit(Blocked, &rdev->flags);
	remove_and_add_spares(mddev, rdev);

	if (rdev->raid_disk >= 0)
		goto busy;

	kick_rdev_from_array(rdev);
	md_update_sb(mddev, 1);
	md_new_event(mddev);

	return 0;
busy:
	printk(KERN_WARNING "md: cannot remove active disk %s from %s ...\n",
	       bdevname(rdev->bdev,b), mdname(mddev));
	return -EBUSY;
}

static int hot_add_disk(struct mddev *mddev, dev_t dev)
{
	char b[BDEVNAME_SIZE];
	int err;
	struct md_rdev *rdev;

	if (!mddev->pers)
		return -ENODEV;

	if (mddev->major_version != 0) {
		printk(KERN_WARNING "%s: HOT_ADD may only be used with"
			" version-0 superblocks.\n",
			mdname(mddev));
		return -EINVAL;
	}
	if (!mddev->pers->hot_add_disk) {
		printk(KERN_WARNING
			"%s: personality does not support diskops!\n",
			mdname(mddev));
		return -EINVAL;
	}

	rdev = md_import_device(dev, -1, 0);
	if (IS_ERR(rdev)) {
		printk(KERN_WARNING
			"md: error, md_import_device() returned %ld\n",
			PTR_ERR(rdev));
		return -EINVAL;
	}

	if (mddev->persistent)
		rdev->sb_start = calc_dev_sboffset(rdev);
	else
		rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;

	rdev->sectors = rdev->sb_start;

	if (test_bit(Faulty, &rdev->flags)) {
		printk(KERN_WARNING
			"md: can not hot-add faulty %s disk to %s!\n",
			bdevname(rdev->bdev,b), mdname(mddev));
		err = -EINVAL;
		goto abort_export;
	}
	clear_bit(In_sync, &rdev->flags);
	rdev->desc_nr = -1;
	rdev->saved_raid_disk = -1;
	err = bind_rdev_to_array(rdev, mddev);
	if (err)
		goto abort_export;

	/*
	 * The rest should better be atomic, we can have disk failures
	 * noticed in interrupt contexts ...
	 */

	rdev->raid_disk = -1;

	md_update_sb(mddev, 1);

	/*
	 * Kick recovery, maybe this spare has to be added to the
	 * array immediately.
	 */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
	md_new_event(mddev);
	return 0;

abort_export:
	export_rdev(rdev);
	return err;
}

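/*
 * SET_BITMAP_FILE: fd >= 0 attaches a file-backed write-intent bitmap to
 * the array, fd < 0 detaches the current one.  The personality is quiesced
 * around the change.
 */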
static int set_bitmap_file(struct mddev *mddev, int fd)
{
	int err = 0;

	if (mddev->pers) {
		if (!mddev->pers->quiesce)
			return -EBUSY;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		/* we should be able to change the bitmap.. */
	}


	if (fd >= 0) {
		struct inode *inode;
		if (mddev->bitmap)
			return -EEXIST; /* cannot add when bitmap is present */
		mddev->bitmap_info.file = fget(fd);

		if (mddev->bitmap_info.file == NULL) {
			printk(KERN_ERR "%s: error: failed to get bitmap file\n",
			       mdname(mddev));
			return -EBADF;
		}

		inode = mddev->bitmap_info.file->f_mapping->host;
		if (!S_ISREG(inode->i_mode)) {
			printk(KERN_ERR "%s: error: bitmap file must be a regular file\n",
			       mdname(mddev));
			err = -EBADF;
		} else if (!(mddev->bitmap_info.file->f_mode & FMODE_WRITE)) {
			printk(KERN_ERR "%s: error: bitmap file must open for write\n",
			       mdname(mddev));
			err = -EBADF;
		} else if (atomic_read(&inode->i_writecount) != 1) {
			printk(KERN_ERR "%s: error: bitmap file is already in use\n",
			       mdname(mddev));
			err = -EBUSY;
		}
		if (err) {
			fput(mddev->bitmap_info.file);
			mddev->bitmap_info.file = NULL;
			return err;
		}
		mddev->bitmap_info.offset = 0; /* file overrides offset */
	} else if (mddev->bitmap == NULL)
		return -ENOENT; /* cannot remove what isn't there */
	err = 0;
	if (mddev->pers) {
		mddev->pers->quiesce(mddev, 1);
		if (fd >= 0) {
			err = bitmap_create(mddev);
			if (!err)
				err = bitmap_load(mddev);
		}
		if (fd < 0 || err) {
			bitmap_destroy(mddev);
			fd = -1; /* make sure to put the file */
		}
		mddev->pers->quiesce(mddev, 0);
	}
	if (fd < 0) {
		if (mddev->bitmap_info.file)
			fput(mddev->bitmap_info.file);
		mddev->bitmap_info.file = NULL;
	}

	return err;
}

/*
 * set_array_info is used in two different ways.
 * The original usage is when creating a new array.
 * In this usage, raid_disks is > 0 and it, together with
 *  level, size, not_persistent, layout and chunksize, determines the
 *  shape of the array.
 *  This will always create an array with a type-0.90.0 superblock.
 * The newer usage is when assembling an array.
 *  In this case raid_disks will be 0, and the major_version field is
 *  used to determine which style of superblocks is to be found on the devices.
 *  The minor and patch _version numbers are also kept in case the
 *  superblock handler wishes to interpret them.
 */
static int set_array_info(struct mddev *mddev, mdu_array_info_t *info)
{

	if (info->raid_disks == 0) {
		/* just setting version number for superblock loading */
		if (info->major_version < 0 ||
		    info->major_version >= ARRAY_SIZE(super_types) ||
		    super_types[info->major_version].name == NULL) {
			/* maybe try to auto-load a module? */
			printk(KERN_INFO
				"md: superblock version %d not known\n",
				info->major_version);
			return -EINVAL;
		}
		mddev->major_version = info->major_version;
		mddev->minor_version = info->minor_version;
		mddev->patch_version = info->patch_version;
		mddev->persistent = !info->not_persistent;
		/* ensure mddev_put doesn't delete this now that there
		 * is some minimal configuration.
		 */
		mddev->ctime = get_seconds();
		return 0;
	}
	mddev->major_version = MD_MAJOR_VERSION;
	mddev->minor_version = MD_MINOR_VERSION;
	mddev->patch_version = MD_PATCHLEVEL_VERSION;
	mddev->ctime = get_seconds();

	mddev->level = info->level;
	mddev->clevel[0] = 0;
	mddev->dev_sectors = 2 * (sector_t)info->size;
	mddev->raid_disks = info->raid_disks;
	/* don't set md_minor, it is determined by which /dev/md* was
	 * opened
	 */
	if (info->state & (1<<MD_SB_CLEAN))
		mddev->recovery_cp = MaxSector;
	else
		mddev->recovery_cp = 0;
	mddev->persistent = !info->not_persistent;
	mddev->external = 0;

	mddev->layout = info->layout;
	mddev->chunk_sectors = info->chunk_size >> 9;

	mddev->max_disks = MD_SB_DISKS;

	if (mddev->persistent)
		mddev->flags = 0;
	set_bit(MD_CHANGE_DEVS, &mddev->flags);

	mddev->bitmap_info.default_offset = MD_SB_BYTES >> 9;
	mddev->bitmap_info.default_space = 64*2 - (MD_SB_BYTES >> 9);
	mddev->bitmap_info.offset = 0;

	mddev->reshape_position = MaxSector;

	/*
	 * Generate a 128 bit UUID
	 */
	get_random_bytes(mddev->uuid, 16);

	mddev->new_level = mddev->level;
	mddev->new_chunk_sectors = mddev->chunk_sectors;
	mddev->new_layout = mddev->layout;
	mddev->delta_disks = 0;
	mddev->reshape_backwards = 0;

	return 0;
}

void md_set_array_sectors(struct mddev *mddev, sector_t array_sectors)
{
	WARN(!mddev_is_locked(mddev), "%s: unlocked mddev!\n", __func__);

	if (mddev->external_size)
		return;

	mddev->array_sectors = array_sectors;
}
EXPORT_SYMBOL(md_set_array_sectors);

static int update_size(struct mddev *mddev, sector_t num_sectors)
{
	struct md_rdev *rdev;
	int rv;
	int fit = (num_sectors == 0);

	if (mddev->pers->resize == NULL)
		return -EINVAL;
	/* The "num_sectors" is the number of sectors of each device that
	 * is used. This can only make sense for arrays with redundancy.
	 * linear and raid0 always use whatever space is available. We can only
	 * consider changing this number if no resync or reconstruction is
	 * happening, and if the new size is acceptable. It must fit before the
	 * sb_start or, if that is <data_offset, it must fit before the size
	 * of each device. If num_sectors is zero, we find the largest size
	 * that fits.
	 */
	if (mddev->sync_thread)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		sector_t avail = rdev->sectors;

		if (fit && (num_sectors == 0 || num_sectors > avail))
			num_sectors = avail;
		if (avail < num_sectors)
			return -ENOSPC;
	}
	rv = mddev->pers->resize(mddev, num_sectors);
	if (!rv)
		revalidate_disk(mddev->gendisk);
	return rv;
}

static int update_raid_disks(struct mddev *mddev, int raid_disks)
{
	int rv;
	struct md_rdev *rdev;
	/* change the number of raid disks */
	if (mddev->pers->check_reshape == NULL)
		return -EINVAL;
	if (raid_disks <= 0 ||
	    (mddev->max_disks && raid_disks >= mddev->max_disks))
		return -EINVAL;
	if (mddev->sync_thread || mddev->reshape_position != MaxSector)
		return -EBUSY;

	rdev_for_each(rdev, mddev) {
		if (mddev->raid_disks < raid_disks &&
		    rdev->data_offset < rdev->new_data_offset)
			return -EINVAL;
		if (mddev->raid_disks > raid_disks &&
		    rdev->data_offset > rdev->new_data_offset)
			return -EINVAL;
	}

	mddev->delta_disks = raid_disks - mddev->raid_disks;
	if (mddev->delta_disks < 0)
		mddev->reshape_backwards = 1;
	else if (mddev->delta_disks > 0)
		mddev->reshape_backwards = 0;

	rv = mddev->pers->check_reshape(mddev);
	if (rv < 0) {
		mddev->delta_disks = 0;
		mddev->reshape_backwards = 0;
	}
	return rv;
}


/*
 * update_array_info is used to change the configuration of an
 * on-line array.
 * The version, ctime, level, size, raid_disks, not_persistent, layout and
 * chunk_size fields in the info are checked against the array.
 * Any differences that cannot be handled will cause an error.
 * Normally, only one change can be managed at a time.
 */
static int update_array_info(struct mddev *mddev, mdu_array_info_t *info)
{
	int rv = 0;
	int cnt = 0;
	int state = 0;

	/* calculate expected state, ignoring low bits */
	if (mddev->bitmap && mddev->bitmap_info.offset)
		state |= (1 << MD_SB_BITMAP_PRESENT);

	if (mddev->major_version != info->major_version ||
	    mddev->minor_version != info->minor_version ||
/* 	    mddev->patch_version != info->patch_version || */
	    mddev->ctime != info->ctime ||
	    mddev->level != info->level ||
/* 	    mddev->layout != info->layout || */
	    !mddev->persistent != info->not_persistent ||
	    mddev->chunk_sectors != info->chunk_size >> 9 ||
	    /* ignore bottom 8 bits of state, and allow SB_BITMAP_PRESENT to change */
	    ((state ^ info->state) & 0xfffffe00)
		)
		return -EINVAL;
	/* Check there is only one change */
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		cnt++;
	if (mddev->raid_disks != info->raid_disks)
		cnt++;
	if (mddev->layout != info->layout)
		cnt++;
	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT))
		cnt++;
	if (cnt == 0)
		return 0;
	if (cnt > 1)
		return -EINVAL;

	if (mddev->layout != info->layout) {
		/* Change layout
		 * we don't need to do anything at the md level, the
		 * personality will take care of it all.
		 */
		if (mddev->pers->check_reshape == NULL)
			return -EINVAL;
		else {
			mddev->new_layout = info->layout;
			rv = mddev->pers->check_reshape(mddev);
			if (rv)
				mddev->new_layout = mddev->layout;
			return rv;
		}
	}
	if (info->size >= 0 && mddev->dev_sectors / 2 != info->size)
		rv = update_size(mddev, (sector_t)info->size * 2);

	if (mddev->raid_disks != info->raid_disks)
		rv = update_raid_disks(mddev, info->raid_disks);

	if ((state ^ info->state) & (1<<MD_SB_BITMAP_PRESENT)) {
		if (mddev->pers->quiesce == NULL)
			return -EINVAL;
		if (mddev->recovery || mddev->sync_thread)
			return -EBUSY;
		if (info->state & (1<<MD_SB_BITMAP_PRESENT)) {
			/* add the bitmap */
			if (mddev->bitmap)
				return -EEXIST;
			if (mddev->bitmap_info.default_offset == 0)
				return -EINVAL;
			mddev->bitmap_info.offset =
				mddev->bitmap_info.default_offset;
			mddev->bitmap_info.space =
				mddev->bitmap_info.default_space;
			mddev->pers->quiesce(mddev, 1);
			rv = bitmap_create(mddev);
			if (!rv)
				rv = bitmap_load(mddev);
			if (rv)
				bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
		} else {
			/* remove the bitmap */
			if (!mddev->bitmap)
				return -ENOENT;
			if (mddev->bitmap->storage.file)
				return -EINVAL;
			mddev->pers->quiesce(mddev, 1);
			bitmap_destroy(mddev);
			mddev->pers->quiesce(mddev, 0);
			mddev->bitmap_info.offset = 0;
		}
	}
	md_update_sb(mddev, 1);
	return rv;
}

static int set_disk_faulty(struct mddev *mddev, dev_t dev)
{
	struct md_rdev *rdev;
	int err = 0;

	if (mddev->pers == NULL)
		return -ENODEV;

	rcu_read_lock();
	rdev = find_rdev_rcu(mddev, dev);
	if (!rdev)
		err = -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
	return err;
}

/*
 * We have a problem here: there is no easy way to give a CHS
 * virtual geometry. We currently pretend that we have a 2-head,
 * 4-sector geometry (with a BIG number of cylinders...). This drives
 * dosfs just mad... ;-)
 */
static int md_getgeo(struct block_device *bdev, struct hd_geometry *geo)
{
	struct mddev *mddev = bdev->bd_disk->private_data;

	geo->heads = 2;
	geo->sectors = 4;
	geo->cylinders = mddev->array_sectors / 8;
	return 0;
}

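/* Only the ioctl commands listed here are handled by md;
 * md_ioctl() rejects everything else with -ENOTTY.
 */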
static inline bool md_ioctl_valid(unsigned int cmd)
{
	switch (cmd) {
	case ADD_NEW_DISK:
	case BLKROSET:
	case GET_ARRAY_INFO:
	case GET_BITMAP_FILE:
	case GET_DISK_INFO:
	case HOT_ADD_DISK:
	case HOT_REMOVE_DISK:
	case PRINT_RAID_DEBUG:
	case RAID_AUTORUN:
	case RAID_VERSION:
	case RESTART_ARRAY_RW:
	case RUN_ARRAY:
	case SET_ARRAY_INFO:
	case SET_BITMAP_FILE:
	case SET_DISK_FAULTY:
	case STOP_ARRAY:
	case STOP_ARRAY_RO:
		return true;
	default:
		return false;
	}
}

static int md_ioctl(struct block_device *bdev, fmode_t mode,
			unsigned int cmd, unsigned long arg)
{
	int err = 0;
	void __user *argp = (void __user *)arg;
	struct mddev *mddev = NULL;
	int ro;

	if (!md_ioctl_valid(cmd))
		return -ENOTTY;

	switch (cmd) {
	case RAID_VERSION:
	case GET_ARRAY_INFO:
	case GET_DISK_INFO:
		break;
	default:
		if (!capable(CAP_SYS_ADMIN))
			return -EACCES;
	}

	/*
	 * Commands dealing with the RAID driver but not any
	 * particular array:
	 */
	switch (cmd) {
	case RAID_VERSION:
		err = get_version(argp);
		goto done;

	case PRINT_RAID_DEBUG:
		err = 0;
		md_print_devices();
		goto done;

#ifndef MODULE
	case RAID_AUTORUN:
		err = 0;
		autostart_arrays(arg);
		goto done;
#endif
	default:;
	}

	/*
	 * Commands creating/starting a new array:
	 */

	mddev = bdev->bd_disk->private_data;

	if (!mddev) {
		BUG();
		goto abort;
	}

	/* Some actions do not require the mutex */
	switch (cmd) {
	case GET_ARRAY_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_array_info(mddev, argp);
		goto abort;

	case GET_DISK_INFO:
		if (!mddev->raid_disks && !mddev->external)
			err = -ENODEV;
		else
			err = get_disk_info(mddev, argp);
		goto abort;

	case SET_DISK_FAULTY:
		err = set_disk_faulty(mddev, new_decode_dev(arg));
		goto abort;
	}

	if (cmd == ADD_NEW_DISK)
		/* need to ensure md_delayed_delete() has completed */
		flush_workqueue(md_misc_wq);

	if (cmd == HOT_REMOVE_DISK)
		/* need to ensure recovery thread has run */
		wait_event_interruptible_timeout(mddev->sb_wait,
						 !test_bit(MD_RECOVERY_NEEDED,
							   &mddev->flags),
						 msecs_to_jiffies(5000));
	if (cmd == STOP_ARRAY || cmd == STOP_ARRAY_RO) {
		/* Need to flush page cache, and ensure no-one else opens
		 * and writes
		 */
		mutex_lock(&mddev->open_mutex);
		if (atomic_read(&mddev->openers) > 1) {
			mutex_unlock(&mddev->open_mutex);
			err = -EBUSY;
			goto abort;
		}
		set_bit(MD_STILL_CLOSED, &mddev->flags);
		mutex_unlock(&mddev->open_mutex);
		sync_blockdev(bdev);
	}
	err = mddev_lock(mddev);
	if (err) {
		printk(KERN_INFO
			"md: ioctl lock interrupted, reason %d, cmd %d\n",
			err, cmd);
		goto abort;
	}

	if (cmd == SET_ARRAY_INFO) {
		mdu_array_info_t info;
		if (!arg)
			memset(&info, 0, sizeof(info));
		else if (copy_from_user(&info, argp, sizeof(info))) {
			err = -EFAULT;
			goto abort_unlock;
		}
		if (mddev->pers) {
			err = update_array_info(mddev, &info);
			if (err) {
				printk(KERN_WARNING "md: couldn't update"
				       " array info. %d\n", err);
				goto abort_unlock;
			}
			goto done_unlock;
		}
		if (!list_empty(&mddev->disks)) {
			printk(KERN_WARNING
			       "md: array %s already has disks!\n",
			       mdname(mddev));
			err = -EBUSY;
			goto abort_unlock;
		}
		if (mddev->raid_disks) {
			printk(KERN_WARNING
			       "md: array %s already initialised!\n",
			       mdname(mddev));
			err = -EBUSY;
			goto abort_unlock;
		}
		err = set_array_info(mddev, &info);
		if (err) {
			printk(KERN_WARNING "md: couldn't set"
			       " array info. %d\n", err);
			goto abort_unlock;
		}
		goto done_unlock;
	}

	/*
	 * Commands querying/configuring an existing array:
	 */
	/* if we are not initialised yet, only ADD_NEW_DISK, STOP_ARRAY,
	 * RUN_ARRAY, and GET_ and SET_BITMAP_FILE are allowed */
	if ((!mddev->raid_disks && !mddev->external)
	    && cmd != ADD_NEW_DISK && cmd != STOP_ARRAY
	    && cmd != RUN_ARRAY && cmd != SET_BITMAP_FILE
	    && cmd != GET_BITMAP_FILE) {
		err = -ENODEV;
		goto abort_unlock;
	}

	/*
	 * Commands even a read-only array can execute:
	 */
	switch (cmd) {
	case GET_BITMAP_FILE:
		err = get_bitmap_file(mddev, argp);
		goto done_unlock;

	case RESTART_ARRAY_RW:
		err = restart_array(mddev);
		goto done_unlock;

	case STOP_ARRAY:
		err = do_md_stop(mddev, 0, bdev);
		goto done_unlock;

	case STOP_ARRAY_RO:
		err = md_set_readonly(mddev, bdev);
		goto done_unlock;

	case HOT_REMOVE_DISK:
		err = hot_remove_disk(mddev, new_decode_dev(arg));
		goto done_unlock;

	case ADD_NEW_DISK:
		/* We can support ADD_NEW_DISK on read-only arrays
		 * only if we are re-adding a preexisting device.
		 * So require mddev->pers and MD_DISK_SYNC.
		 */
		if (mddev->pers) {
			mdu_disk_info_t info;
			if (copy_from_user(&info, argp, sizeof(info)))
				err = -EFAULT;
			else if (!(info.state & (1<<MD_DISK_SYNC)))
				/* Need to clear read-only for this */
				break;
			else
				err = add_new_disk(mddev, &info);
			goto done_unlock;
		}
		break;

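	/* BLKROSET: the argument is 0 to make the device writable and
	 * non-zero to make it read-only.  Note that mddev->ro uses
	 * 0 for read-write, 1 for read-only and 2 for read-auto
	 * (stay read-only until the first write arrives).
	 */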
	case BLKROSET:
		if (get_user(ro, (int __user *)(arg))) {
			err = -EFAULT;
			goto done_unlock;
		}
		err = -EINVAL;

		/* if the bdev is going readonly the value of mddev->ro
		 * does not matter, no writes are coming
		 */
		if (ro)
			goto done_unlock;

		/* are we already prepared for writes? */
		if (mddev->ro != 1)
			goto done_unlock;

		/* transitioning to readauto need only happen for
		 * arrays that call md_write_start
		 */
		if (mddev->pers) {
			err = restart_array(mddev);
			if (err == 0) {
				mddev->ro = 2;
				set_disk_ro(mddev->gendisk, 0);
			}
		}
		goto done_unlock;
	}

	/*
	 * The remaining ioctls are changing the state of the
	 * superblock, so we do not allow them on read-only arrays.
	 * However non-MD ioctls (e.g. get-size) will still come through
	 * here and hit the 'default' below, so only disallow
	 * 'md' ioctls, and switch to rw mode if started auto-readonly.
	 */
	if (_IOC_TYPE(cmd) == MD_MAJOR && mddev->ro && mddev->pers) {
		if (mddev->ro == 2) {
			mddev->ro = 0;
			sysfs_notify_dirent_safe(mddev->sysfs_state);
			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
			/* mddev_unlock will wake thread */
			/* If a device failed while we were read-only, we
			 * need to make sure the metadata is updated now.
			 */
			if (test_bit(MD_CHANGE_DEVS, &mddev->flags)) {
				mddev_unlock(mddev);
				wait_event(mddev->sb_wait,
					   !test_bit(MD_CHANGE_DEVS, &mddev->flags) &&
					   !test_bit(MD_CHANGE_PENDING, &mddev->flags));
				mddev_lock_nointr(mddev);
			}
		} else {
			err = -EROFS;
			goto abort_unlock;
		}
	}

	switch (cmd) {
	case ADD_NEW_DISK:
	{
		mdu_disk_info_t info;
		if (copy_from_user(&info, argp, sizeof(info)))
			err = -EFAULT;
		else
			err = add_new_disk(mddev, &info);
		goto done_unlock;
	}

	case HOT_ADD_DISK:
		err = hot_add_disk(mddev, new_decode_dev(arg));
		goto done_unlock;

	case RUN_ARRAY:
		err = do_md_run(mddev);
		goto done_unlock;

	case SET_BITMAP_FILE:
		err = set_bitmap_file(mddev, (int)arg);
		goto done_unlock;

	default:
		err = -EINVAL;
		goto abort_unlock;
	}

done_unlock:
abort_unlock:
	if (mddev->hold_active == UNTIL_IOCTL &&
	    err != -EINVAL)
		mddev->hold_active = 0;
	mddev_unlock(mddev);

	return err;
done:
	if (err)
		MD_BUG();
abort:
	return err;
}
#ifdef CONFIG_COMPAT
static int md_compat_ioctl(struct block_device *bdev, fmode_t mode,
			   unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case HOT_REMOVE_DISK:
	case HOT_ADD_DISK:
	case SET_DISK_FAULTY:
	case SET_BITMAP_FILE:
		/* These take in integer arg, do not convert */
		break;
	default:
		arg = (unsigned long)compat_ptr(arg);
		break;
	}

	return md_ioctl(bdev, mode, cmd, arg);
}
#endif /* CONFIG_COMPAT */

static int md_open(struct block_device *bdev, fmode_t mode)
{
	/*
	 * Succeed if we can lock the mddev, which confirms that
	 * it isn't being stopped right now.
	 */
	struct mddev *mddev = mddev_find(bdev->bd_dev);
	int err;

	if (!mddev)
		return -ENODEV;

	if (mddev->gendisk != bdev->bd_disk) {
		/* we are racing with mddev_put which is discarding this
		 * bd_disk.
		 */
		mddev_put(mddev);
		/* Wait until bdev->bd_disk is definitely gone */
		flush_workqueue(md_misc_wq);
		/* Then retry the open from the top */
		return -ERESTARTSYS;
	}
	BUG_ON(mddev != bdev->bd_disk->private_data);

	if ((err = mutex_lock_interruptible(&mddev->open_mutex)))
		goto out;

	err = 0;
	atomic_inc(&mddev->openers);
	clear_bit(MD_STILL_CLOSED, &mddev->flags);
	mutex_unlock(&mddev->open_mutex);

	check_disk_change(bdev);
 out:
	return err;
}

static void md_release(struct gendisk *disk, fmode_t mode)
{
	struct mddev *mddev = disk->private_data;

	BUG_ON(!mddev);
	atomic_dec(&mddev->openers);
	mddev_put(mddev);
}

static int md_media_changed(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	return mddev->changed;
}

static int md_revalidate(struct gendisk *disk)
{
	struct mddev *mddev = disk->private_data;

	mddev->changed = 0;
	return 0;
}
static const struct block_device_operations md_fops =
{
	.owner		= THIS_MODULE,
	.open		= md_open,
	.release	= md_release,
	.ioctl		= md_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= md_compat_ioctl,
#endif
	.getgeo		= md_getgeo,
	.media_changed	= md_media_changed,
	.revalidate_disk = md_revalidate,
};

static int md_thread(void *arg)
{
	struct md_thread *thread = arg;

	/*
	 * md_thread is a 'system-thread'; its priority should be very
	 * high. We avoid resource deadlocks individually in each
	 * raid personality. (RAID5 does preallocation.) We also use RR and
	 * the very same RT priority as kswapd, thus we will never get
	 * into a priority inversion deadlock.
	 *
	 * we definitely have to have equal or higher priority than
	 * bdflush, otherwise bdflush will deadlock if there are too
	 * many dirty RAID5 blocks.
	 */
6769 6769
6770 allow_signal(SIGKILL); 6770 allow_signal(SIGKILL);
6771 while (!kthread_should_stop()) { 6771 while (!kthread_should_stop()) {
6772 6772
6773 /* We need to wait INTERRUPTIBLE so that 6773 /* We need to wait INTERRUPTIBLE so that
6774 * we don't add to the load-average. 6774 * we don't add to the load-average.
6775 * That means we need to be sure no signals are 6775 * That means we need to be sure no signals are
6776 * pending 6776 * pending
6777 */ 6777 */
6778 if (signal_pending(current)) 6778 if (signal_pending(current))
6779 flush_signals(current); 6779 flush_signals(current);
6780 6780
6781 wait_event_interruptible_timeout 6781 wait_event_interruptible_timeout
6782 (thread->wqueue, 6782 (thread->wqueue,
6783 test_bit(THREAD_WAKEUP, &thread->flags) 6783 test_bit(THREAD_WAKEUP, &thread->flags)
6784 || kthread_should_stop(), 6784 || kthread_should_stop(),
6785 thread->timeout); 6785 thread->timeout);
6786 6786
6787 clear_bit(THREAD_WAKEUP, &thread->flags); 6787 clear_bit(THREAD_WAKEUP, &thread->flags);
6788 if (!kthread_should_stop()) 6788 if (!kthread_should_stop())
6789 thread->run(thread); 6789 thread->run(thread);
6790 } 6790 }
6791 6791
6792 return 0; 6792 return 0;
6793 } 6793 }
6794 6794
6795 void md_wakeup_thread(struct md_thread *thread) 6795 void md_wakeup_thread(struct md_thread *thread)
6796 { 6796 {
6797 if (thread) { 6797 if (thread) {
6798 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm); 6798 pr_debug("md: waking up MD thread %s.\n", thread->tsk->comm);
6799 set_bit(THREAD_WAKEUP, &thread->flags); 6799 set_bit(THREAD_WAKEUP, &thread->flags);
6800 wake_up(&thread->wqueue); 6800 wake_up(&thread->wqueue);
6801 } 6801 }
6802 } 6802 }
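
The md_thread()/md_wakeup_thread() pair above is a flag-plus-waitqueue handshake: the waker sets THREAD_WAKEUP and wakes the queue, and the thread clears the flag before calling ->run(), so a wakeup that arrives while work is in progress is not lost. As a rough, hedged analogue only (plain userspace C with pthreads, not kernel code; every name below is invented), the same shape looks like:

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool wakeup_flag;   /* stands in for THREAD_WAKEUP */
static bool should_stop;   /* stands in for kthread_should_stop() */

static void *worker(void *arg)
{
	pthread_mutex_lock(&lock);
	while (!should_stop) {
		while (!wakeup_flag && !should_stop)
			pthread_cond_wait(&cond, &lock);   /* sleep like wait_event_*() */
		wakeup_flag = false;                       /* clear_bit(THREAD_WAKEUP) */
		pthread_mutex_unlock(&lock);
		if (!should_stop)
			puts("worker: one pass of thread->run()");
		pthread_mutex_lock(&lock);
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

static void wake_worker(void)                              /* md_wakeup_thread() analogue */
{
	pthread_mutex_lock(&lock);
	wakeup_flag = true;
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	pthread_t tid;

	pthread_create(&tid, NULL, worker, NULL);
	wake_worker();
	pthread_mutex_lock(&lock);
	should_stop = true;                                /* kthread_stop() analogue */
	pthread_cond_signal(&cond);
	pthread_mutex_unlock(&lock);
	pthread_join(tid, NULL);
	return 0;
}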
6803 6803
6804 struct md_thread *md_register_thread(void (*run) (struct md_thread *), 6804 struct md_thread *md_register_thread(void (*run) (struct md_thread *),
6805 struct mddev *mddev, const char *name) 6805 struct mddev *mddev, const char *name)
6806 { 6806 {
6807 struct md_thread *thread; 6807 struct md_thread *thread;
6808 6808
6809 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL); 6809 thread = kzalloc(sizeof(struct md_thread), GFP_KERNEL);
6810 if (!thread) 6810 if (!thread)
6811 return NULL; 6811 return NULL;
6812 6812
6813 init_waitqueue_head(&thread->wqueue); 6813 init_waitqueue_head(&thread->wqueue);
6814 6814
6815 thread->run = run; 6815 thread->run = run;
6816 thread->mddev = mddev; 6816 thread->mddev = mddev;
6817 thread->timeout = MAX_SCHEDULE_TIMEOUT; 6817 thread->timeout = MAX_SCHEDULE_TIMEOUT;
6818 thread->tsk = kthread_run(md_thread, thread, 6818 thread->tsk = kthread_run(md_thread, thread,
6819 "%s_%s", 6819 "%s_%s",
6820 mdname(thread->mddev), 6820 mdname(thread->mddev),
6821 name); 6821 name);
6822 if (IS_ERR(thread->tsk)) { 6822 if (IS_ERR(thread->tsk)) {
6823 kfree(thread); 6823 kfree(thread);
6824 return NULL; 6824 return NULL;
6825 } 6825 }
6826 return thread; 6826 return thread;
6827 } 6827 }
6828 6828
6829 void md_unregister_thread(struct md_thread **threadp) 6829 void md_unregister_thread(struct md_thread **threadp)
6830 { 6830 {
6831 struct md_thread *thread = *threadp; 6831 struct md_thread *thread = *threadp;
6832 if (!thread) 6832 if (!thread)
6833 return; 6833 return;
6834 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk)); 6834 pr_debug("interrupting MD-thread pid %d\n", task_pid_nr(thread->tsk));
6835 /* Locking ensures that mddev_unlock does not wake_up a 6835 /* Locking ensures that mddev_unlock does not wake_up a
6836 * non-existent thread 6836 * non-existent thread
6837 */ 6837 */
6838 spin_lock(&pers_lock); 6838 spin_lock(&pers_lock);
6839 *threadp = NULL; 6839 *threadp = NULL;
6840 spin_unlock(&pers_lock); 6840 spin_unlock(&pers_lock);
6841 6841
6842 kthread_stop(thread->tsk); 6842 kthread_stop(thread->tsk);
6843 kfree(thread); 6843 kfree(thread);
6844 } 6844 }
6845 6845
6846 void md_error(struct mddev *mddev, struct md_rdev *rdev) 6846 void md_error(struct mddev *mddev, struct md_rdev *rdev)
6847 { 6847 {
6848 if (!mddev) { 6848 if (!mddev) {
6849 MD_BUG(); 6849 MD_BUG();
6850 return; 6850 return;
6851 } 6851 }
6852 6852
6853 if (!rdev || test_bit(Faulty, &rdev->flags)) 6853 if (!rdev || test_bit(Faulty, &rdev->flags))
6854 return; 6854 return;
6855 6855
6856 if (!mddev->pers || !mddev->pers->error_handler) 6856 if (!mddev->pers || !mddev->pers->error_handler)
6857 return; 6857 return;
6858 mddev->pers->error_handler(mddev,rdev); 6858 mddev->pers->error_handler(mddev,rdev);
6859 if (mddev->degraded) 6859 if (mddev->degraded)
6860 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 6860 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
6861 sysfs_notify_dirent_safe(rdev->sysfs_state); 6861 sysfs_notify_dirent_safe(rdev->sysfs_state);
6862 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 6862 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
6863 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 6863 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
6864 md_wakeup_thread(mddev->thread); 6864 md_wakeup_thread(mddev->thread);
6865 if (mddev->event_work.func) 6865 if (mddev->event_work.func)
6866 queue_work(md_misc_wq, &mddev->event_work); 6866 queue_work(md_misc_wq, &mddev->event_work);
6867 md_new_event_inintr(mddev); 6867 md_new_event_inintr(mddev);
6868 } 6868 }
6869 6869
6870 /* seq_file implementation /proc/mdstat */ 6870 /* seq_file implementation /proc/mdstat */
6871 6871
6872 static void status_unused(struct seq_file *seq) 6872 static void status_unused(struct seq_file *seq)
6873 { 6873 {
6874 int i = 0; 6874 int i = 0;
6875 struct md_rdev *rdev; 6875 struct md_rdev *rdev;
6876 6876
6877 seq_printf(seq, "unused devices: "); 6877 seq_printf(seq, "unused devices: ");
6878 6878
6879 list_for_each_entry(rdev, &pending_raid_disks, same_set) { 6879 list_for_each_entry(rdev, &pending_raid_disks, same_set) {
6880 char b[BDEVNAME_SIZE]; 6880 char b[BDEVNAME_SIZE];
6881 i++; 6881 i++;
6882 seq_printf(seq, "%s ", 6882 seq_printf(seq, "%s ",
6883 bdevname(rdev->bdev,b)); 6883 bdevname(rdev->bdev,b));
6884 } 6884 }
6885 if (!i) 6885 if (!i)
6886 seq_printf(seq, "<none>"); 6886 seq_printf(seq, "<none>");
6887 6887
6888 seq_printf(seq, "\n"); 6888 seq_printf(seq, "\n");
6889 } 6889 }
6890 6890
6891 6891
6892 static void status_resync(struct seq_file *seq, struct mddev * mddev) 6892 static void status_resync(struct seq_file *seq, struct mddev * mddev)
6893 { 6893 {
6894 sector_t max_sectors, resync, res; 6894 sector_t max_sectors, resync, res;
6895 unsigned long dt, db; 6895 unsigned long dt, db;
6896 sector_t rt; 6896 sector_t rt;
6897 int scale; 6897 int scale;
6898 unsigned int per_milli; 6898 unsigned int per_milli;
6899 6899
6900 if (mddev->curr_resync <= 3) 6900 if (mddev->curr_resync <= 3)
6901 resync = 0; 6901 resync = 0;
6902 else 6902 else
6903 resync = mddev->curr_resync 6903 resync = mddev->curr_resync
6904 - atomic_read(&mddev->recovery_active); 6904 - atomic_read(&mddev->recovery_active);
6905 6905
6906 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) || 6906 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
6907 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 6907 test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
6908 max_sectors = mddev->resync_max_sectors; 6908 max_sectors = mddev->resync_max_sectors;
6909 else 6909 else
6910 max_sectors = mddev->dev_sectors; 6910 max_sectors = mddev->dev_sectors;
6911 6911
6912 /* 6912 /*
6913 * Should not happen. 6913 * Should not happen.
6914 */ 6914 */
6915 if (!max_sectors) { 6915 if (!max_sectors) {
6916 MD_BUG(); 6916 MD_BUG();
6917 return; 6917 return;
6918 } 6918 }
6919 /* Pick 'scale' such that (resync>>scale)*1000 will fit 6919 /* Pick 'scale' such that (resync>>scale)*1000 will fit
6920 * in a sector_t, and (max_sectors>>scale) will fit in a 6920 * in a sector_t, and (max_sectors>>scale) will fit in a
6921 * u32, as those are the requirements for sector_div. 6921 * u32, as those are the requirements for sector_div.
6922 * Thus 'scale' must be at least 10 6922 * Thus 'scale' must be at least 10
6923 */ 6923 */
6924 scale = 10; 6924 scale = 10;
6925 if (sizeof(sector_t) > sizeof(unsigned long)) { 6925 if (sizeof(sector_t) > sizeof(unsigned long)) {
6926 while ( max_sectors/2 > (1ULL<<(scale+32))) 6926 while ( max_sectors/2 > (1ULL<<(scale+32)))
6927 scale++; 6927 scale++;
6928 } 6928 }
6929 res = (resync>>scale)*1000; 6929 res = (resync>>scale)*1000;
6930 sector_div(res, (u32)((max_sectors>>scale)+1)); 6930 sector_div(res, (u32)((max_sectors>>scale)+1));
6931 6931
6932 per_milli = res; 6932 per_milli = res;
6933 { 6933 {
6934 int i, x = per_milli/50, y = 20-x; 6934 int i, x = per_milli/50, y = 20-x;
6935 seq_printf(seq, "["); 6935 seq_printf(seq, "[");
6936 for (i = 0; i < x; i++) 6936 for (i = 0; i < x; i++)
6937 seq_printf(seq, "="); 6937 seq_printf(seq, "=");
6938 seq_printf(seq, ">"); 6938 seq_printf(seq, ">");
6939 for (i = 0; i < y; i++) 6939 for (i = 0; i < y; i++)
6940 seq_printf(seq, "."); 6940 seq_printf(seq, ".");
6941 seq_printf(seq, "] "); 6941 seq_printf(seq, "] ");
6942 } 6942 }
6943 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)", 6943 seq_printf(seq, " %s =%3u.%u%% (%llu/%llu)",
6944 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)? 6944 (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)?
6945 "reshape" : 6945 "reshape" :
6946 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)? 6946 (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)?
6947 "check" : 6947 "check" :
6948 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ? 6948 (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ?
6949 "resync" : "recovery"))), 6949 "resync" : "recovery"))),
6950 per_milli/10, per_milli % 10, 6950 per_milli/10, per_milli % 10,
6951 (unsigned long long) resync/2, 6951 (unsigned long long) resync/2,
6952 (unsigned long long) max_sectors/2); 6952 (unsigned long long) max_sectors/2);
6953 6953
6954 /* 6954 /*
6955 * dt: time from mark until now 6955 * dt: time from mark until now
6956 * db: blocks written from mark until now 6956 * db: blocks written from mark until now
6957 * rt: remaining time 6957 * rt: remaining time
6958 * 6958 *
6959 * rt is a sector_t, so could be 32bit or 64bit. 6959 * rt is a sector_t, so could be 32bit or 64bit.
6960 * So we divide before multiply in case it is 32bit and close 6960 * So we divide before multiply in case it is 32bit and close
6961 * to the limit. 6961 * to the limit.
6962 * We scale the divisor (db) by 32 to avoid losing precision 6962 * We scale the divisor (db) by 32 to avoid losing precision
6963 * near the end of resync when the number of remaining sectors 6963 * near the end of resync when the number of remaining sectors
6964 * is close to 'db'. 6964 * is close to 'db'.
6965 * We then divide rt by 32 after multiplying by db to compensate. 6965 * We then divide rt by 32 after multiplying by db to compensate.
6966 * The '+1' avoids division by zero if db is very small. 6966 * The '+1' avoids division by zero if db is very small.
6967 */ 6967 */
6968 dt = ((jiffies - mddev->resync_mark) / HZ); 6968 dt = ((jiffies - mddev->resync_mark) / HZ);
6969 if (!dt) dt++; 6969 if (!dt) dt++;
6970 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active)) 6970 db = (mddev->curr_mark_cnt - atomic_read(&mddev->recovery_active))
6971 - mddev->resync_mark_cnt; 6971 - mddev->resync_mark_cnt;
6972 6972
6973 rt = max_sectors - resync; /* number of remaining sectors */ 6973 rt = max_sectors - resync; /* number of remaining sectors */
6974 sector_div(rt, db/32+1); 6974 sector_div(rt, db/32+1);
6975 rt *= dt; 6975 rt *= dt;
6976 rt >>= 5; 6976 rt >>= 5;
6977 6977
6978 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60, 6978 seq_printf(seq, " finish=%lu.%lumin", (unsigned long)rt / 60,
6979 ((unsigned long)rt % 60)/6); 6979 ((unsigned long)rt % 60)/6);
6980 6980
6981 seq_printf(seq, " speed=%ldK/sec", db/2/dt); 6981 seq_printf(seq, " speed=%ldK/sec", db/2/dt);
6982 } 6982 }
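
For reference, the per-mille and finish-time arithmetic above reduces to two small fixed-point tricks. A standalone sketch (ordinary userspace C, ignoring the recovery_active in-flight adjustment; every name below is invented) that reproduces the same numbers:

#include <stdint.h>
#include <stdio.h>

/* Progress in per-mille: shift both counters right by 'scale' so that
 * (resync >> scale) * 1000 fits in 64 bits and (max_sectors >> scale)
 * fits in 32 bits, which is what sector_div() requires in the kernel. */
static unsigned int progress_per_milli(uint64_t resync, uint64_t max_sectors)
{
	int scale = 10;

	while (max_sectors / 2 > (1ULL << (scale + 32)))
		scale++;
	return (unsigned int)(((resync >> scale) * 1000) /
			      ((uint32_t)(max_sectors >> scale) + 1));
}

/* Remaining time: divide the outstanding sectors by the recent rate.
 * 'db' (sectors done since the last mark) is scaled down by 32 before the
 * divide and the factor is taken back out at the end, so the estimate
 * keeps some precision when only a few sectors remain. */
static uint64_t eta_seconds(uint64_t max_sectors, uint64_t resync,
			    unsigned long dt, unsigned long db)
{
	uint64_t rt = max_sectors - resync;	/* sectors still to go */

	if (!dt)
		dt = 1;
	rt /= db / 32 + 1;
	rt *= dt;
	return rt >> 5;
}

int main(void)
{
	uint64_t max = 2048ULL * 1024 * 1024;	/* 1 TiB array, in 512-byte sectors */
	uint64_t cur = max * 4 / 10;		/* roughly 40% resynced */

	/* pretend the whole run so far is one mark interval of 2000 seconds */
	printf("%u/1000 done, ~%llu s to finish\n",
	       progress_per_milli(cur, max),
	       (unsigned long long)eta_seconds(max, cur, 2000, (unsigned long)cur));
	return 0;
}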
6983 6983
6984 static void *md_seq_start(struct seq_file *seq, loff_t *pos) 6984 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
6985 { 6985 {
6986 struct list_head *tmp; 6986 struct list_head *tmp;
6987 loff_t l = *pos; 6987 loff_t l = *pos;
6988 struct mddev *mddev; 6988 struct mddev *mddev;
6989 6989
6990 if (l >= 0x10000) 6990 if (l >= 0x10000)
6991 return NULL; 6991 return NULL;
6992 if (!l--) 6992 if (!l--)
6993 /* header */ 6993 /* header */
6994 return (void*)1; 6994 return (void*)1;
6995 6995
6996 spin_lock(&all_mddevs_lock); 6996 spin_lock(&all_mddevs_lock);
6997 list_for_each(tmp,&all_mddevs) 6997 list_for_each(tmp,&all_mddevs)
6998 if (!l--) { 6998 if (!l--) {
6999 mddev = list_entry(tmp, struct mddev, all_mddevs); 6999 mddev = list_entry(tmp, struct mddev, all_mddevs);
7000 mddev_get(mddev); 7000 mddev_get(mddev);
7001 spin_unlock(&all_mddevs_lock); 7001 spin_unlock(&all_mddevs_lock);
7002 return mddev; 7002 return mddev;
7003 } 7003 }
7004 spin_unlock(&all_mddevs_lock); 7004 spin_unlock(&all_mddevs_lock);
7005 if (!l--) 7005 if (!l--)
7006 return (void*)2;/* tail */ 7006 return (void*)2;/* tail */
7007 return NULL; 7007 return NULL;
7008 } 7008 }
7009 7009
7010 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos) 7010 static void *md_seq_next(struct seq_file *seq, void *v, loff_t *pos)
7011 { 7011 {
7012 struct list_head *tmp; 7012 struct list_head *tmp;
7013 struct mddev *next_mddev, *mddev = v; 7013 struct mddev *next_mddev, *mddev = v;
7014 7014
7015 ++*pos; 7015 ++*pos;
7016 if (v == (void*)2) 7016 if (v == (void*)2)
7017 return NULL; 7017 return NULL;
7018 7018
7019 spin_lock(&all_mddevs_lock); 7019 spin_lock(&all_mddevs_lock);
7020 if (v == (void*)1) 7020 if (v == (void*)1)
7021 tmp = all_mddevs.next; 7021 tmp = all_mddevs.next;
7022 else 7022 else
7023 tmp = mddev->all_mddevs.next; 7023 tmp = mddev->all_mddevs.next;
7024 if (tmp != &all_mddevs) 7024 if (tmp != &all_mddevs)
7025 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs)); 7025 next_mddev = mddev_get(list_entry(tmp,struct mddev,all_mddevs));
7026 else { 7026 else {
7027 next_mddev = (void*)2; 7027 next_mddev = (void*)2;
7028 *pos = 0x10000; 7028 *pos = 0x10000;
7029 } 7029 }
7030 spin_unlock(&all_mddevs_lock); 7030 spin_unlock(&all_mddevs_lock);
7031 7031
7032 if (v != (void*)1) 7032 if (v != (void*)1)
7033 mddev_put(mddev); 7033 mddev_put(mddev);
7034 return next_mddev; 7034 return next_mddev;
7035 7035
7036 } 7036 }
7037 7037
7038 static void md_seq_stop(struct seq_file *seq, void *v) 7038 static void md_seq_stop(struct seq_file *seq, void *v)
7039 { 7039 {
7040 struct mddev *mddev = v; 7040 struct mddev *mddev = v;
7041 7041
7042 if (mddev && v != (void*)1 && v != (void*)2) 7042 if (mddev && v != (void*)1 && v != (void*)2)
7043 mddev_put(mddev); 7043 mddev_put(mddev);
7044 } 7044 }
7045 7045
7046 static int md_seq_show(struct seq_file *seq, void *v) 7046 static int md_seq_show(struct seq_file *seq, void *v)
7047 { 7047 {
7048 struct mddev *mddev = v; 7048 struct mddev *mddev = v;
7049 sector_t sectors; 7049 sector_t sectors;
7050 struct md_rdev *rdev; 7050 struct md_rdev *rdev;
7051 7051
7052 if (v == (void*)1) { 7052 if (v == (void*)1) {
7053 struct md_personality *pers; 7053 struct md_personality *pers;
7054 seq_printf(seq, "Personalities : "); 7054 seq_printf(seq, "Personalities : ");
7055 spin_lock(&pers_lock); 7055 spin_lock(&pers_lock);
7056 list_for_each_entry(pers, &pers_list, list) 7056 list_for_each_entry(pers, &pers_list, list)
7057 seq_printf(seq, "[%s] ", pers->name); 7057 seq_printf(seq, "[%s] ", pers->name);
7058 7058
7059 spin_unlock(&pers_lock); 7059 spin_unlock(&pers_lock);
7060 seq_printf(seq, "\n"); 7060 seq_printf(seq, "\n");
7061 seq->poll_event = atomic_read(&md_event_count); 7061 seq->poll_event = atomic_read(&md_event_count);
7062 return 0; 7062 return 0;
7063 } 7063 }
7064 if (v == (void*)2) { 7064 if (v == (void*)2) {
7065 status_unused(seq); 7065 status_unused(seq);
7066 return 0; 7066 return 0;
7067 } 7067 }
7068 7068
7069 if (mddev_lock(mddev) < 0) 7069 if (mddev_lock(mddev) < 0)
7070 return -EINTR; 7070 return -EINTR;
7071 7071
7072 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { 7072 if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) {
7073 seq_printf(seq, "%s : %sactive", mdname(mddev), 7073 seq_printf(seq, "%s : %sactive", mdname(mddev),
7074 mddev->pers ? "" : "in"); 7074 mddev->pers ? "" : "in");
7075 if (mddev->pers) { 7075 if (mddev->pers) {
7076 if (mddev->ro==1) 7076 if (mddev->ro==1)
7077 seq_printf(seq, " (read-only)"); 7077 seq_printf(seq, " (read-only)");
7078 if (mddev->ro==2) 7078 if (mddev->ro==2)
7079 seq_printf(seq, " (auto-read-only)"); 7079 seq_printf(seq, " (auto-read-only)");
7080 seq_printf(seq, " %s", mddev->pers->name); 7080 seq_printf(seq, " %s", mddev->pers->name);
7081 } 7081 }
7082 7082
7083 sectors = 0; 7083 sectors = 0;
7084 rdev_for_each(rdev, mddev) { 7084 rdev_for_each(rdev, mddev) {
7085 char b[BDEVNAME_SIZE]; 7085 char b[BDEVNAME_SIZE];
7086 seq_printf(seq, " %s[%d]", 7086 seq_printf(seq, " %s[%d]",
7087 bdevname(rdev->bdev,b), rdev->desc_nr); 7087 bdevname(rdev->bdev,b), rdev->desc_nr);
7088 if (test_bit(WriteMostly, &rdev->flags)) 7088 if (test_bit(WriteMostly, &rdev->flags))
7089 seq_printf(seq, "(W)"); 7089 seq_printf(seq, "(W)");
7090 if (test_bit(Faulty, &rdev->flags)) { 7090 if (test_bit(Faulty, &rdev->flags)) {
7091 seq_printf(seq, "(F)"); 7091 seq_printf(seq, "(F)");
7092 continue; 7092 continue;
7093 } 7093 }
7094 if (rdev->raid_disk < 0) 7094 if (rdev->raid_disk < 0)
7095 seq_printf(seq, "(S)"); /* spare */ 7095 seq_printf(seq, "(S)"); /* spare */
7096 if (test_bit(Replacement, &rdev->flags)) 7096 if (test_bit(Replacement, &rdev->flags))
7097 seq_printf(seq, "(R)"); 7097 seq_printf(seq, "(R)");
7098 sectors += rdev->sectors; 7098 sectors += rdev->sectors;
7099 } 7099 }
7100 7100
7101 if (!list_empty(&mddev->disks)) { 7101 if (!list_empty(&mddev->disks)) {
7102 if (mddev->pers) 7102 if (mddev->pers)
7103 seq_printf(seq, "\n %llu blocks", 7103 seq_printf(seq, "\n %llu blocks",
7104 (unsigned long long) 7104 (unsigned long long)
7105 mddev->array_sectors / 2); 7105 mddev->array_sectors / 2);
7106 else 7106 else
7107 seq_printf(seq, "\n %llu blocks", 7107 seq_printf(seq, "\n %llu blocks",
7108 (unsigned long long)sectors / 2); 7108 (unsigned long long)sectors / 2);
7109 } 7109 }
7110 if (mddev->persistent) { 7110 if (mddev->persistent) {
7111 if (mddev->major_version != 0 || 7111 if (mddev->major_version != 0 ||
7112 mddev->minor_version != 90) { 7112 mddev->minor_version != 90) {
7113 seq_printf(seq," super %d.%d", 7113 seq_printf(seq," super %d.%d",
7114 mddev->major_version, 7114 mddev->major_version,
7115 mddev->minor_version); 7115 mddev->minor_version);
7116 } 7116 }
7117 } else if (mddev->external) 7117 } else if (mddev->external)
7118 seq_printf(seq, " super external:%s", 7118 seq_printf(seq, " super external:%s",
7119 mddev->metadata_type); 7119 mddev->metadata_type);
7120 else 7120 else
7121 seq_printf(seq, " super non-persistent"); 7121 seq_printf(seq, " super non-persistent");
7122 7122
7123 if (mddev->pers) { 7123 if (mddev->pers) {
7124 mddev->pers->status(seq, mddev); 7124 mddev->pers->status(seq, mddev);
7125 seq_printf(seq, "\n "); 7125 seq_printf(seq, "\n ");
7126 if (mddev->pers->sync_request) { 7126 if (mddev->pers->sync_request) {
7127 if (mddev->curr_resync > 2) { 7127 if (mddev->curr_resync > 2) {
7128 status_resync(seq, mddev); 7128 status_resync(seq, mddev);
7129 seq_printf(seq, "\n "); 7129 seq_printf(seq, "\n ");
7130 } else if (mddev->curr_resync >= 1) 7130 } else if (mddev->curr_resync >= 1)
7131 seq_printf(seq, "\tresync=DELAYED\n "); 7131 seq_printf(seq, "\tresync=DELAYED\n ");
7132 else if (mddev->recovery_cp < MaxSector) 7132 else if (mddev->recovery_cp < MaxSector)
7133 seq_printf(seq, "\tresync=PENDING\n "); 7133 seq_printf(seq, "\tresync=PENDING\n ");
7134 } 7134 }
7135 } else 7135 } else
7136 seq_printf(seq, "\n "); 7136 seq_printf(seq, "\n ");
7137 7137
7138 bitmap_status(seq, mddev->bitmap); 7138 bitmap_status(seq, mddev->bitmap);
7139 7139
7140 seq_printf(seq, "\n"); 7140 seq_printf(seq, "\n");
7141 } 7141 }
7142 mddev_unlock(mddev); 7142 mddev_unlock(mddev);
7143 7143
7144 return 0; 7144 return 0;
7145 } 7145 }
7146 7146
7147 static const struct seq_operations md_seq_ops = { 7147 static const struct seq_operations md_seq_ops = {
7148 .start = md_seq_start, 7148 .start = md_seq_start,
7149 .next = md_seq_next, 7149 .next = md_seq_next,
7150 .stop = md_seq_stop, 7150 .stop = md_seq_stop,
7151 .show = md_seq_show, 7151 .show = md_seq_show,
7152 }; 7152 };
7153 7153
7154 static int md_seq_open(struct inode *inode, struct file *file) 7154 static int md_seq_open(struct inode *inode, struct file *file)
7155 { 7155 {
7156 struct seq_file *seq; 7156 struct seq_file *seq;
7157 int error; 7157 int error;
7158 7158
7159 error = seq_open(file, &md_seq_ops); 7159 error = seq_open(file, &md_seq_ops);
7160 if (error) 7160 if (error)
7161 return error; 7161 return error;
7162 7162
7163 seq = file->private_data; 7163 seq = file->private_data;
7164 seq->poll_event = atomic_read(&md_event_count); 7164 seq->poll_event = atomic_read(&md_event_count);
7165 return error; 7165 return error;
7166 } 7166 }
7167 7167
7168 static int md_unloading; 7168 static int md_unloading;
7169 static unsigned int mdstat_poll(struct file *filp, poll_table *wait) 7169 static unsigned int mdstat_poll(struct file *filp, poll_table *wait)
7170 { 7170 {
7171 struct seq_file *seq = filp->private_data; 7171 struct seq_file *seq = filp->private_data;
7172 int mask; 7172 int mask;
7173 7173
7174 if (md_unloading) 7174 if (md_unloading)
7175 return POLLIN|POLLRDNORM|POLLERR|POLLPRI; 7175 return POLLIN|POLLRDNORM|POLLERR|POLLPRI;
7176 poll_wait(filp, &md_event_waiters, wait); 7176 poll_wait(filp, &md_event_waiters, wait);
7177 7177
7178 /* always allow read */ 7178 /* always allow read */
7179 mask = POLLIN | POLLRDNORM; 7179 mask = POLLIN | POLLRDNORM;
7180 7180
7181 if (seq->poll_event != atomic_read(&md_event_count)) 7181 if (seq->poll_event != atomic_read(&md_event_count))
7182 mask |= POLLERR | POLLPRI; 7182 mask |= POLLERR | POLLPRI;
7183 return mask; 7183 return mask;
7184 } 7184 }
7185 7185
7186 static const struct file_operations md_seq_fops = { 7186 static const struct file_operations md_seq_fops = {
7187 .owner = THIS_MODULE, 7187 .owner = THIS_MODULE,
7188 .open = md_seq_open, 7188 .open = md_seq_open,
7189 .read = seq_read, 7189 .read = seq_read,
7190 .llseek = seq_lseek, 7190 .llseek = seq_lseek,
7191 .release = seq_release_private, 7191 .release = seq_release_private,
7192 .poll = mdstat_poll, 7192 .poll = mdstat_poll,
7193 }; 7193 };
7194 7194
7195 int register_md_personality(struct md_personality *p) 7195 int register_md_personality(struct md_personality *p)
7196 { 7196 {
7197 spin_lock(&pers_lock); 7197 spin_lock(&pers_lock);
7198 list_add_tail(&p->list, &pers_list); 7198 list_add_tail(&p->list, &pers_list);
7199 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level); 7199 printk(KERN_INFO "md: %s personality registered for level %d\n", p->name, p->level);
7200 spin_unlock(&pers_lock); 7200 spin_unlock(&pers_lock);
7201 return 0; 7201 return 0;
7202 } 7202 }
7203 7203
7204 int unregister_md_personality(struct md_personality *p) 7204 int unregister_md_personality(struct md_personality *p)
7205 { 7205 {
7206 printk(KERN_INFO "md: %s personality unregistered\n", p->name); 7206 printk(KERN_INFO "md: %s personality unregistered\n", p->name);
7207 spin_lock(&pers_lock); 7207 spin_lock(&pers_lock);
7208 list_del_init(&p->list); 7208 list_del_init(&p->list);
7209 spin_unlock(&pers_lock); 7209 spin_unlock(&pers_lock);
7210 return 0; 7210 return 0;
7211 } 7211 }
7212 7212
7213 static int is_mddev_idle(struct mddev *mddev, int init) 7213 static int is_mddev_idle(struct mddev *mddev, int init)
7214 { 7214 {
7215 struct md_rdev * rdev; 7215 struct md_rdev * rdev;
7216 int idle; 7216 int idle;
7217 int curr_events; 7217 int curr_events;
7218 7218
7219 idle = 1; 7219 idle = 1;
7220 rcu_read_lock(); 7220 rcu_read_lock();
7221 rdev_for_each_rcu(rdev, mddev) { 7221 rdev_for_each_rcu(rdev, mddev) {
7222 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; 7222 struct gendisk *disk = rdev->bdev->bd_contains->bd_disk;
7223 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) + 7223 curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
7224 (int)part_stat_read(&disk->part0, sectors[1]) - 7224 (int)part_stat_read(&disk->part0, sectors[1]) -
7225 atomic_read(&disk->sync_io); 7225 atomic_read(&disk->sync_io);
7226 /* sync IO will cause sync_io to increase before the disk_stats 7226 /* sync IO will cause sync_io to increase before the disk_stats
7227 * as sync_io is counted when a request starts, and 7227 * as sync_io is counted when a request starts, and
7228 * disk_stats is counted when it completes. 7228 * disk_stats is counted when it completes.
7229 * So resync activity will cause curr_events to be smaller than 7229 * So resync activity will cause curr_events to be smaller than
7230 * when there was no such activity. 7230 * when there was no such activity.
7231 * non-sync IO will cause disk_stat to increase without 7231 * non-sync IO will cause disk_stat to increase without
7232 * increasing sync_io so curr_events will (eventually) 7232 * increasing sync_io so curr_events will (eventually)
7233 * be larger than it was before. Once it becomes 7233 * be larger than it was before. Once it becomes
7234 * substantially larger, the test below will cause 7234 * substantially larger, the test below will cause
7235 * the array to appear non-idle, and resync will slow 7235 * the array to appear non-idle, and resync will slow
7236 * down. 7236 * down.
7237 * If there is a lot of outstanding resync activity when 7237 * If there is a lot of outstanding resync activity when
7238 * we set last_event to curr_events, then all that activity 7238 * we set last_event to curr_events, then all that activity
7239 * completing might cause the array to appear non-idle 7239 * completing might cause the array to appear non-idle
7240 * and resync will be slowed down even though there might 7240 * and resync will be slowed down even though there might
7241 * not have been non-resync activity. This will only 7241 * not have been non-resync activity. This will only
7242 * happen once though. 'last_events' will soon reflect 7242 * happen once though. 'last_events' will soon reflect
7243 * the state where there is little or no outstanding 7243 * the state where there is little or no outstanding
7244 * resync requests, and further resync activity will 7244 * resync requests, and further resync activity will
7245 * always make curr_events less than last_events. 7245 * always make curr_events less than last_events.
7246 * 7246 *
7247 */ 7247 */
7248 if (init || curr_events - rdev->last_events > 64) { 7248 if (init || curr_events - rdev->last_events > 64) {
7249 rdev->last_events = curr_events; 7249 rdev->last_events = curr_events;
7250 idle = 0; 7250 idle = 0;
7251 } 7251 }
7252 } 7252 }
7253 rcu_read_unlock(); 7253 rcu_read_unlock();
7254 return idle; 7254 return idle;
7255 } 7255 }
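
Put another way, the long comment above describes a simple delta check: count all completed sectors, subtract the ones resync itself issued, and call the array busy once that number has crept more than a small slack past a saved baseline. A minimal sketch of just that test (hypothetical names, not the kernel helpers):

#include <stdbool.h>

struct disk_sample {
	long last_events;			/* baseline, like rdev->last_events */
};

static bool disk_is_idle(struct disk_sample *d, long sectors_completed,
			 long sync_sectors_issued, bool init)
{
	/* only non-resync I/O makes 'curr' grow over time */
	long curr = sectors_completed - sync_sectors_issued;

	if (init || curr - d->last_events > 64) {
		d->last_events = curr;		/* move the baseline, report "busy" */
		return false;
	}
	return true;				/* little or no new normal I/O */
}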
7256 7256
7257 void md_done_sync(struct mddev *mddev, int blocks, int ok) 7257 void md_done_sync(struct mddev *mddev, int blocks, int ok)
7258 { 7258 {
7259 /* another "blocks" (512-byte) blocks have been synced */ 7259 /* another "blocks" (512-byte) blocks have been synced */
7260 atomic_sub(blocks, &mddev->recovery_active); 7260 atomic_sub(blocks, &mddev->recovery_active);
7261 wake_up(&mddev->recovery_wait); 7261 wake_up(&mddev->recovery_wait);
7262 if (!ok) { 7262 if (!ok) {
7263 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7263 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7264 set_bit(MD_RECOVERY_ERROR, &mddev->recovery); 7264 set_bit(MD_RECOVERY_ERROR, &mddev->recovery);
7265 md_wakeup_thread(mddev->thread); 7265 md_wakeup_thread(mddev->thread);
7266 // stop recovery, signal do_sync .... 7266 // stop recovery, signal do_sync ....
7267 } 7267 }
7268 } 7268 }
7269 7269
7270 7270
7271 /* md_write_start(mddev, bi) 7271 /* md_write_start(mddev, bi)
7272 * If we need to update some array metadata (e.g. 'active' flag 7272 * If we need to update some array metadata (e.g. 'active' flag
7273 * in superblock) before writing, schedule a superblock update 7273 * in superblock) before writing, schedule a superblock update
7274 * and wait for it to complete. 7274 * and wait for it to complete.
7275 */ 7275 */
7276 void md_write_start(struct mddev *mddev, struct bio *bi) 7276 void md_write_start(struct mddev *mddev, struct bio *bi)
7277 { 7277 {
7278 int did_change = 0; 7278 int did_change = 0;
7279 if (bio_data_dir(bi) != WRITE) 7279 if (bio_data_dir(bi) != WRITE)
7280 return; 7280 return;
7281 7281
7282 BUG_ON(mddev->ro == 1); 7282 BUG_ON(mddev->ro == 1);
7283 if (mddev->ro == 2) { 7283 if (mddev->ro == 2) {
7284 /* need to switch to read/write */ 7284 /* need to switch to read/write */
7285 mddev->ro = 0; 7285 mddev->ro = 0;
7286 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7286 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7287 md_wakeup_thread(mddev->thread); 7287 md_wakeup_thread(mddev->thread);
7288 md_wakeup_thread(mddev->sync_thread); 7288 md_wakeup_thread(mddev->sync_thread);
7289 did_change = 1; 7289 did_change = 1;
7290 } 7290 }
7291 atomic_inc(&mddev->writes_pending); 7291 atomic_inc(&mddev->writes_pending);
7292 if (mddev->safemode == 1) 7292 if (mddev->safemode == 1)
7293 mddev->safemode = 0; 7293 mddev->safemode = 0;
7294 if (mddev->in_sync) { 7294 if (mddev->in_sync) {
7295 spin_lock_irq(&mddev->write_lock); 7295 spin_lock_irq(&mddev->write_lock);
7296 if (mddev->in_sync) { 7296 if (mddev->in_sync) {
7297 mddev->in_sync = 0; 7297 mddev->in_sync = 0;
7298 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7298 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7299 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7299 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7300 md_wakeup_thread(mddev->thread); 7300 md_wakeup_thread(mddev->thread);
7301 did_change = 1; 7301 did_change = 1;
7302 } 7302 }
7303 spin_unlock_irq(&mddev->write_lock); 7303 spin_unlock_irq(&mddev->write_lock);
7304 } 7304 }
7305 if (did_change) 7305 if (did_change)
7306 sysfs_notify_dirent_safe(mddev->sysfs_state); 7306 sysfs_notify_dirent_safe(mddev->sysfs_state);
7307 wait_event(mddev->sb_wait, 7307 wait_event(mddev->sb_wait,
7308 !test_bit(MD_CHANGE_PENDING, &mddev->flags)); 7308 !test_bit(MD_CHANGE_PENDING, &mddev->flags));
7309 } 7309 }
7310 7310
7311 void md_write_end(struct mddev *mddev) 7311 void md_write_end(struct mddev *mddev)
7312 { 7312 {
7313 if (atomic_dec_and_test(&mddev->writes_pending)) { 7313 if (atomic_dec_and_test(&mddev->writes_pending)) {
7314 if (mddev->safemode == 2) 7314 if (mddev->safemode == 2)
7315 md_wakeup_thread(mddev->thread); 7315 md_wakeup_thread(mddev->thread);
7316 else if (mddev->safemode_delay) 7316 else if (mddev->safemode_delay)
7317 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay); 7317 mod_timer(&mddev->safemode_timer, jiffies + mddev->safemode_delay);
7318 } 7318 }
7319 } 7319 }
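
The comment on md_write_start() above states the contract; in practice a personality brackets each write with the pair, roughly as in this hedged, simplified sketch (both functions below are invented for illustration):

static void personality_make_write(struct mddev *mddev, struct bio *bio)
{
	/* may block until the superblock has been marked 'active' */
	md_write_start(mddev, bio);
	/* ... build and submit the member-device writes for 'bio' ... */
}

static void personality_write_done(struct mddev *mddev)
{
	/* called from the personality's end_io path once the write completes;
	 * the last writer arms the safemode timer so the array can go 'clean' */
	md_write_end(mddev);
}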
7320 7320
7321 /* md_allow_write(mddev) 7321 /* md_allow_write(mddev)
7322 * Calling this ensures that the array is marked 'active' so that writes 7322 * Calling this ensures that the array is marked 'active' so that writes
7323 * may proceed without blocking. It is important to call this before 7323 * may proceed without blocking. It is important to call this before
7324 * attempting a GFP_KERNEL allocation while holding the mddev lock. 7324 * attempting a GFP_KERNEL allocation while holding the mddev lock.
7325 * Must be called with mddev_lock held. 7325 * Must be called with mddev_lock held.
7326 * 7326 *
7327 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock 7327 * In the ->external case MD_CHANGE_CLEAN can not be cleared until mddev->lock
7328 * is dropped, so return -EAGAIN after notifying userspace. 7328 * is dropped, so return -EAGAIN after notifying userspace.
7329 */ 7329 */
7330 int md_allow_write(struct mddev *mddev) 7330 int md_allow_write(struct mddev *mddev)
7331 { 7331 {
7332 if (!mddev->pers) 7332 if (!mddev->pers)
7333 return 0; 7333 return 0;
7334 if (mddev->ro) 7334 if (mddev->ro)
7335 return 0; 7335 return 0;
7336 if (!mddev->pers->sync_request) 7336 if (!mddev->pers->sync_request)
7337 return 0; 7337 return 0;
7338 7338
7339 spin_lock_irq(&mddev->write_lock); 7339 spin_lock_irq(&mddev->write_lock);
7340 if (mddev->in_sync) { 7340 if (mddev->in_sync) {
7341 mddev->in_sync = 0; 7341 mddev->in_sync = 0;
7342 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7342 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7343 set_bit(MD_CHANGE_PENDING, &mddev->flags); 7343 set_bit(MD_CHANGE_PENDING, &mddev->flags);
7344 if (mddev->safemode_delay && 7344 if (mddev->safemode_delay &&
7345 mddev->safemode == 0) 7345 mddev->safemode == 0)
7346 mddev->safemode = 1; 7346 mddev->safemode = 1;
7347 spin_unlock_irq(&mddev->write_lock); 7347 spin_unlock_irq(&mddev->write_lock);
7348 md_update_sb(mddev, 0); 7348 md_update_sb(mddev, 0);
7349 sysfs_notify_dirent_safe(mddev->sysfs_state); 7349 sysfs_notify_dirent_safe(mddev->sysfs_state);
7350 } else 7350 } else
7351 spin_unlock_irq(&mddev->write_lock); 7351 spin_unlock_irq(&mddev->write_lock);
7352 7352
7353 if (test_bit(MD_CHANGE_PENDING, &mddev->flags)) 7353 if (test_bit(MD_CHANGE_PENDING, &mddev->flags))
7354 return -EAGAIN; 7354 return -EAGAIN;
7355 else 7355 else
7356 return 0; 7356 return 0;
7357 } 7357 }
7358 EXPORT_SYMBOL_GPL(md_allow_write); 7358 EXPORT_SYMBOL_GPL(md_allow_write);
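
A hedged example of the calling convention described in the comment above (the caller below is entirely hypothetical): call md_allow_write() first, and only then attempt a GFP_KERNEL allocation while still holding the mddev lock.

static int example_grow_buffers(struct mddev *mddev)	/* hypothetical caller */
{
	void *buf;
	int err = md_allow_write(mddev);	/* mark the array 'active' before we may block */

	if (err)				/* -EAGAIN: external metadata, update still pending */
		return err;

	buf = kzalloc(4096, GFP_KERNEL);	/* allocation may now sleep safely */
	if (!buf)
		return -ENOMEM;
	/* ... use buf ... */
	kfree(buf);
	return 0;
}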
7359 7359
7360 #define SYNC_MARKS 10 7360 #define SYNC_MARKS 10
7361 #define SYNC_MARK_STEP (3*HZ) 7361 #define SYNC_MARK_STEP (3*HZ)
7362 #define UPDATE_FREQUENCY (5*60*HZ) 7362 #define UPDATE_FREQUENCY (5*60*HZ)
7363 void md_do_sync(struct md_thread *thread) 7363 void md_do_sync(struct md_thread *thread)
7364 { 7364 {
7365 struct mddev *mddev = thread->mddev; 7365 struct mddev *mddev = thread->mddev;
7366 struct mddev *mddev2; 7366 struct mddev *mddev2;
7367 unsigned int currspeed = 0, 7367 unsigned int currspeed = 0,
7368 window; 7368 window;
7369 sector_t max_sectors,j, io_sectors; 7369 sector_t max_sectors,j, io_sectors;
7370 unsigned long mark[SYNC_MARKS]; 7370 unsigned long mark[SYNC_MARKS];
7371 unsigned long update_time; 7371 unsigned long update_time;
7372 sector_t mark_cnt[SYNC_MARKS]; 7372 sector_t mark_cnt[SYNC_MARKS];
7373 int last_mark,m; 7373 int last_mark,m;
7374 struct list_head *tmp; 7374 struct list_head *tmp;
7375 sector_t last_check; 7375 sector_t last_check;
7376 int skipped = 0; 7376 int skipped = 0;
7377 struct md_rdev *rdev; 7377 struct md_rdev *rdev;
7378 char *desc, *action = NULL; 7378 char *desc, *action = NULL;
7379 struct blk_plug plug; 7379 struct blk_plug plug;
7380 7380
7381 /* just in case thread restarts... */ 7381 /* just in case thread restarts... */
7382 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery)) 7382 if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
7383 return; 7383 return;
7384 if (mddev->ro) /* never try to sync a read-only array */ 7384 if (mddev->ro) {/* never try to sync a read-only array */
7385 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7385 return; 7386 return;
7387 }
7386 7388
7387 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7389 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7388 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) { 7390 if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
7389 desc = "data-check"; 7391 desc = "data-check";
7390 action = "check"; 7392 action = "check";
7391 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7393 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7392 desc = "requested-resync"; 7394 desc = "requested-resync";
7393 action = "repair"; 7395 action = "repair";
7394 } else 7396 } else
7395 desc = "resync"; 7397 desc = "resync";
7396 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7398 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7397 desc = "reshape"; 7399 desc = "reshape";
7398 else 7400 else
7399 desc = "recovery"; 7401 desc = "recovery";
7400 7402
7401 mddev->last_sync_action = action ?: desc; 7403 mddev->last_sync_action = action ?: desc;
7402 7404
7403 /* we overload curr_resync somewhat here. 7405 /* we overload curr_resync somewhat here.
7404 * 0 == not engaged in resync at all 7406 * 0 == not engaged in resync at all
7405 * 2 == checking that there is no conflict with another sync 7407 * 2 == checking that there is no conflict with another sync
7406 * 1 == like 2, but have yielded to allow conflicting resync to 7408 * 1 == like 2, but have yielded to allow conflicting resync to
7407 * commence 7409 * commence
7408 * other == active in resync - this many blocks 7410 * other == active in resync - this many blocks
7409 * 7411 *
7410 * Before starting a resync we must have set curr_resync to 7412 * Before starting a resync we must have set curr_resync to
7411 * 2, and then checked that every "conflicting" array has curr_resync 7413 * 2, and then checked that every "conflicting" array has curr_resync
7412 * less than ours. When we find one that is the same or higher 7414 * less than ours. When we find one that is the same or higher
7413 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync 7415 * we wait on resync_wait. To avoid deadlock, we reduce curr_resync
7414 * to 1 if we choose to yield (based arbitrarily on address of mddev structure). 7416 * to 1 if we choose to yield (based arbitrarily on address of mddev structure).
7415 * This will mean we have to start checking from the beginning again. 7417 * This will mean we have to start checking from the beginning again.
7416 * 7418 *
7417 */ 7419 */
7418 7420
7419 do { 7421 do {
7420 mddev->curr_resync = 2; 7422 mddev->curr_resync = 2;
7421 7423
7422 try_again: 7424 try_again:
7423 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7425 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7424 goto skip; 7426 goto skip;
7425 for_each_mddev(mddev2, tmp) { 7427 for_each_mddev(mddev2, tmp) {
7426 if (mddev2 == mddev) 7428 if (mddev2 == mddev)
7427 continue; 7429 continue;
7428 if (!mddev->parallel_resync 7430 if (!mddev->parallel_resync
7429 && mddev2->curr_resync 7431 && mddev2->curr_resync
7430 && match_mddev_units(mddev, mddev2)) { 7432 && match_mddev_units(mddev, mddev2)) {
7431 DEFINE_WAIT(wq); 7433 DEFINE_WAIT(wq);
7432 if (mddev < mddev2 && mddev->curr_resync == 2) { 7434 if (mddev < mddev2 && mddev->curr_resync == 2) {
7433 /* arbitrarily yield */ 7435 /* arbitrarily yield */
7434 mddev->curr_resync = 1; 7436 mddev->curr_resync = 1;
7435 wake_up(&resync_wait); 7437 wake_up(&resync_wait);
7436 } 7438 }
7437 if (mddev > mddev2 && mddev->curr_resync == 1) 7439 if (mddev > mddev2 && mddev->curr_resync == 1)
7438 /* no need to wait here, we can wait the next 7440 /* no need to wait here, we can wait the next
7439 * time 'round when curr_resync == 2 7441 * time 'round when curr_resync == 2
7440 */ 7442 */
7441 continue; 7443 continue;
7442 /* We need to wait 'interruptible' so as not to 7444 /* We need to wait 'interruptible' so as not to
7443 * contribute to the load average, and not to 7445 * contribute to the load average, and not to
7444 * be caught by 'softlockup' 7446 * be caught by 'softlockup'
7445 */ 7447 */
7446 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE); 7448 prepare_to_wait(&resync_wait, &wq, TASK_INTERRUPTIBLE);
7447 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7449 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7448 mddev2->curr_resync >= mddev->curr_resync) { 7450 mddev2->curr_resync >= mddev->curr_resync) {
7449 printk(KERN_INFO "md: delaying %s of %s" 7451 printk(KERN_INFO "md: delaying %s of %s"
7450 " until %s has finished (they" 7452 " until %s has finished (they"
7451 " share one or more physical units)\n", 7453 " share one or more physical units)\n",
7452 desc, mdname(mddev), mdname(mddev2)); 7454 desc, mdname(mddev), mdname(mddev2));
7453 mddev_put(mddev2); 7455 mddev_put(mddev2);
7454 if (signal_pending(current)) 7456 if (signal_pending(current))
7455 flush_signals(current); 7457 flush_signals(current);
7456 schedule(); 7458 schedule();
7457 finish_wait(&resync_wait, &wq); 7459 finish_wait(&resync_wait, &wq);
7458 goto try_again; 7460 goto try_again;
7459 } 7461 }
7460 finish_wait(&resync_wait, &wq); 7462 finish_wait(&resync_wait, &wq);
7461 } 7463 }
7462 } 7464 }
7463 } while (mddev->curr_resync < 2); 7465 } while (mddev->curr_resync < 2);
7464 7466
7465 j = 0; 7467 j = 0;
7466 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7468 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7467 /* resync follows the size requested by the personality, 7469 /* resync follows the size requested by the personality,
7468 * which defaults to physical size, but can be virtual size 7470 * which defaults to physical size, but can be virtual size
7469 */ 7471 */
7470 max_sectors = mddev->resync_max_sectors; 7472 max_sectors = mddev->resync_max_sectors;
7471 atomic64_set(&mddev->resync_mismatches, 0); 7473 atomic64_set(&mddev->resync_mismatches, 0);
7472 /* we don't use the checkpoint if there's a bitmap */ 7474 /* we don't use the checkpoint if there's a bitmap */
7473 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7475 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7474 j = mddev->resync_min; 7476 j = mddev->resync_min;
7475 else if (!mddev->bitmap) 7477 else if (!mddev->bitmap)
7476 j = mddev->recovery_cp; 7478 j = mddev->recovery_cp;
7477 7479
7478 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) 7480 } else if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
7479 max_sectors = mddev->resync_max_sectors; 7481 max_sectors = mddev->resync_max_sectors;
7480 else { 7482 else {
7481 /* recovery follows the physical size of devices */ 7483 /* recovery follows the physical size of devices */
7482 max_sectors = mddev->dev_sectors; 7484 max_sectors = mddev->dev_sectors;
7483 j = MaxSector; 7485 j = MaxSector;
7484 rcu_read_lock(); 7486 rcu_read_lock();
7485 rdev_for_each_rcu(rdev, mddev) 7487 rdev_for_each_rcu(rdev, mddev)
7486 if (rdev->raid_disk >= 0 && 7488 if (rdev->raid_disk >= 0 &&
7487 !test_bit(Faulty, &rdev->flags) && 7489 !test_bit(Faulty, &rdev->flags) &&
7488 !test_bit(In_sync, &rdev->flags) && 7490 !test_bit(In_sync, &rdev->flags) &&
7489 rdev->recovery_offset < j) 7491 rdev->recovery_offset < j)
7490 j = rdev->recovery_offset; 7492 j = rdev->recovery_offset;
7491 rcu_read_unlock(); 7493 rcu_read_unlock();
7492 } 7494 }
7493 7495
7494 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev)); 7496 printk(KERN_INFO "md: %s of RAID array %s\n", desc, mdname(mddev));
7495 printk(KERN_INFO "md: minimum _guaranteed_ speed:" 7497 printk(KERN_INFO "md: minimum _guaranteed_ speed:"
7496 " %d KB/sec/disk.\n", speed_min(mddev)); 7498 " %d KB/sec/disk.\n", speed_min(mddev));
7497 printk(KERN_INFO "md: using maximum available idle IO bandwidth " 7499 printk(KERN_INFO "md: using maximum available idle IO bandwidth "
7498 "(but not more than %d KB/sec) for %s.\n", 7500 "(but not more than %d KB/sec) for %s.\n",
7499 speed_max(mddev), desc); 7501 speed_max(mddev), desc);
7500 7502
7501 is_mddev_idle(mddev, 1); /* this initializes IO event counters */ 7503 is_mddev_idle(mddev, 1); /* this initializes IO event counters */
7502 7504
7503 io_sectors = 0; 7505 io_sectors = 0;
7504 for (m = 0; m < SYNC_MARKS; m++) { 7506 for (m = 0; m < SYNC_MARKS; m++) {
7505 mark[m] = jiffies; 7507 mark[m] = jiffies;
7506 mark_cnt[m] = io_sectors; 7508 mark_cnt[m] = io_sectors;
7507 } 7509 }
7508 last_mark = 0; 7510 last_mark = 0;
7509 mddev->resync_mark = mark[last_mark]; 7511 mddev->resync_mark = mark[last_mark];
7510 mddev->resync_mark_cnt = mark_cnt[last_mark]; 7512 mddev->resync_mark_cnt = mark_cnt[last_mark];
7511 7513
7512 /* 7514 /*
7513 * Tune reconstruction: 7515 * Tune reconstruction:
7514 */ 7516 */
7515 window = 32*(PAGE_SIZE/512); 7517 window = 32*(PAGE_SIZE/512);
7516 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n", 7518 printk(KERN_INFO "md: using %dk window, over a total of %lluk.\n",
7517 window/2, (unsigned long long)max_sectors/2); 7519 window/2, (unsigned long long)max_sectors/2);
7518 7520
7519 atomic_set(&mddev->recovery_active, 0); 7521 atomic_set(&mddev->recovery_active, 0);
7520 last_check = 0; 7522 last_check = 0;
7521 7523
7522 if (j>2) { 7524 if (j>2) {
7523 printk(KERN_INFO 7525 printk(KERN_INFO
7524 "md: resuming %s of %s from checkpoint.\n", 7526 "md: resuming %s of %s from checkpoint.\n",
7525 desc, mdname(mddev)); 7527 desc, mdname(mddev));
7526 mddev->curr_resync = j; 7528 mddev->curr_resync = j;
7527 } else 7529 } else
7528 mddev->curr_resync = 3; /* no longer delayed */ 7530 mddev->curr_resync = 3; /* no longer delayed */
7529 mddev->curr_resync_completed = j; 7531 mddev->curr_resync_completed = j;
7530 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7532 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7531 md_new_event(mddev); 7533 md_new_event(mddev);
7532 update_time = jiffies; 7534 update_time = jiffies;
7533 7535
7534 blk_start_plug(&plug); 7536 blk_start_plug(&plug);
7535 while (j < max_sectors) { 7537 while (j < max_sectors) {
7536 sector_t sectors; 7538 sector_t sectors;
7537 7539
7538 skipped = 0; 7540 skipped = 0;
7539 7541
7540 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7542 if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7541 ((mddev->curr_resync > mddev->curr_resync_completed && 7543 ((mddev->curr_resync > mddev->curr_resync_completed &&
7542 (mddev->curr_resync - mddev->curr_resync_completed) 7544 (mddev->curr_resync - mddev->curr_resync_completed)
7543 > (max_sectors >> 4)) || 7545 > (max_sectors >> 4)) ||
7544 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) || 7546 time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
7545 (j - mddev->curr_resync_completed)*2 7547 (j - mddev->curr_resync_completed)*2
7546 >= mddev->resync_max - mddev->curr_resync_completed 7548 >= mddev->resync_max - mddev->curr_resync_completed
7547 )) { 7549 )) {
7548 /* time to update curr_resync_completed */ 7550 /* time to update curr_resync_completed */
7549 wait_event(mddev->recovery_wait, 7551 wait_event(mddev->recovery_wait,
7550 atomic_read(&mddev->recovery_active) == 0); 7552 atomic_read(&mddev->recovery_active) == 0);
7551 mddev->curr_resync_completed = j; 7553 mddev->curr_resync_completed = j;
7552 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) && 7554 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
7553 j > mddev->recovery_cp) 7555 j > mddev->recovery_cp)
7554 mddev->recovery_cp = j; 7556 mddev->recovery_cp = j;
7555 update_time = jiffies; 7557 update_time = jiffies;
7556 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7558 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7557 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 7559 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
7558 } 7560 }
7559 7561
7560 while (j >= mddev->resync_max && 7562 while (j >= mddev->resync_max &&
7561 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7563 !test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7562 /* As this condition is controlled by user-space, 7564 /* As this condition is controlled by user-space,
7563 * we can block indefinitely, so use '_interruptible' 7565 * we can block indefinitely, so use '_interruptible'
7564 * to avoid triggering warnings. 7566 * to avoid triggering warnings.
7565 */ 7567 */
7566 flush_signals(current); /* just in case */ 7568 flush_signals(current); /* just in case */
7567 wait_event_interruptible(mddev->recovery_wait, 7569 wait_event_interruptible(mddev->recovery_wait,
7568 mddev->resync_max > j 7570 mddev->resync_max > j
7569 || test_bit(MD_RECOVERY_INTR, 7571 || test_bit(MD_RECOVERY_INTR,
7570 &mddev->recovery)); 7572 &mddev->recovery));
7571 } 7573 }
7572 7574
7573 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7575 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7574 break; 7576 break;
7575 7577
7576 sectors = mddev->pers->sync_request(mddev, j, &skipped, 7578 sectors = mddev->pers->sync_request(mddev, j, &skipped,
7577 currspeed < speed_min(mddev)); 7579 currspeed < speed_min(mddev));
7578 if (sectors == 0) { 7580 if (sectors == 0) {
7579 set_bit(MD_RECOVERY_INTR, &mddev->recovery); 7581 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7580 break; 7582 break;
7581 } 7583 }
7582 7584
7583 if (!skipped) { /* actual IO requested */ 7585 if (!skipped) { /* actual IO requested */
7584 io_sectors += sectors; 7586 io_sectors += sectors;
7585 atomic_add(sectors, &mddev->recovery_active); 7587 atomic_add(sectors, &mddev->recovery_active);
7586 } 7588 }
7587 7589
7588 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7590 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7589 break; 7591 break;
7590 7592
7591 j += sectors; 7593 j += sectors;
7592 if (j > 2) 7594 if (j > 2)
7593 mddev->curr_resync = j; 7595 mddev->curr_resync = j;
7594 mddev->curr_mark_cnt = io_sectors; 7596 mddev->curr_mark_cnt = io_sectors;
7595 if (last_check == 0) 7597 if (last_check == 0)
7596 /* this is the earliest that rebuild will be 7598 /* this is the earliest that rebuild will be
7597 * visible in /proc/mdstat 7599 * visible in /proc/mdstat
7598 */ 7600 */
7599 md_new_event(mddev); 7601 md_new_event(mddev);
7600 7602
7601 if (last_check + window > io_sectors || j == max_sectors) 7603 if (last_check + window > io_sectors || j == max_sectors)
7602 continue; 7604 continue;
7603 7605
7604 last_check = io_sectors; 7606 last_check = io_sectors;
7605 repeat: 7607 repeat:
7606 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7608 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
7607 /* step marks */ 7609 /* step marks */
7608 int next = (last_mark+1) % SYNC_MARKS; 7610 int next = (last_mark+1) % SYNC_MARKS;
7609 7611
7610 mddev->resync_mark = mark[next]; 7612 mddev->resync_mark = mark[next];
7611 mddev->resync_mark_cnt = mark_cnt[next]; 7613 mddev->resync_mark_cnt = mark_cnt[next];
7612 mark[next] = jiffies; 7614 mark[next] = jiffies;
7613 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active); 7615 mark_cnt[next] = io_sectors - atomic_read(&mddev->recovery_active);
7614 last_mark = next; 7616 last_mark = next;
7615 } 7617 }
7616 7618
7617 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7619 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7618 break; 7620 break;
7619 7621
7620 /* 7622 /*
7621 * this loop exits only when either we are slower than 7623 * this loop exits only when either we are slower than
7622 * the 'hard' speed limit, or the system was IO-idle for 7624 * the 'hard' speed limit, or the system was IO-idle for
7623 * a jiffy. 7625 * a jiffy.
7624 * the system might be non-idle CPU-wise, but we only care 7626 * the system might be non-idle CPU-wise, but we only care
7625 * about not overloading the IO subsystem. (things like an 7627 * about not overloading the IO subsystem. (things like an
7626 * e2fsck being done on the RAID array should execute fast) 7628 * e2fsck being done on the RAID array should execute fast)
7627 */ 7629 */
7628 cond_resched(); 7630 cond_resched();
7629 7631
7630 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2 7632 currspeed = ((unsigned long)(io_sectors-mddev->resync_mark_cnt))/2
7631 /((jiffies-mddev->resync_mark)/HZ +1) +1; 7633 /((jiffies-mddev->resync_mark)/HZ +1) +1;
7632 7634
7633 if (currspeed > speed_min(mddev)) { 7635 if (currspeed > speed_min(mddev)) {
7634 if ((currspeed > speed_max(mddev)) || 7636 if ((currspeed > speed_max(mddev)) ||
7635 !is_mddev_idle(mddev, 0)) { 7637 !is_mddev_idle(mddev, 0)) {
7636 msleep(500); 7638 msleep(500);
7637 goto repeat; 7639 goto repeat;
7638 } 7640 }
7639 } 7641 }
7640 } 7642 }
7641 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc, 7643 printk(KERN_INFO "md: %s: %s %s.\n",mdname(mddev), desc,
7642 test_bit(MD_RECOVERY_INTR, &mddev->recovery) 7644 test_bit(MD_RECOVERY_INTR, &mddev->recovery)
7643 ? "interrupted" : "done"); 7645 ? "interrupted" : "done");
7644 /* 7646 /*
7645 * this also signals 'finished resyncing' to md_stop 7647 * this also signals 'finished resyncing' to md_stop
7646 */ 7648 */
7647 blk_finish_plug(&plug); 7649 blk_finish_plug(&plug);
7648 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active)); 7650 wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
7649 7651
7650 /* tell personality that we are finished */ 7652 /* tell personality that we are finished */
7651 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1); 7653 mddev->pers->sync_request(mddev, max_sectors, &skipped, 1);
7652 7654
7653 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && 7655 if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
7654 mddev->curr_resync > 2) { 7656 mddev->curr_resync > 2) {
7655 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) { 7657 if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
7656 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7658 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7657 if (mddev->curr_resync >= mddev->recovery_cp) { 7659 if (mddev->curr_resync >= mddev->recovery_cp) {
7658 printk(KERN_INFO 7660 printk(KERN_INFO
7659 "md: checkpointing %s of %s.\n", 7661 "md: checkpointing %s of %s.\n",
7660 desc, mdname(mddev)); 7662 desc, mdname(mddev));
7661 if (test_bit(MD_RECOVERY_ERROR, 7663 if (test_bit(MD_RECOVERY_ERROR,
7662 &mddev->recovery)) 7664 &mddev->recovery))
7663 mddev->recovery_cp = 7665 mddev->recovery_cp =
7664 mddev->curr_resync_completed; 7666 mddev->curr_resync_completed;
7665 else 7667 else
7666 mddev->recovery_cp = 7668 mddev->recovery_cp =
7667 mddev->curr_resync; 7669 mddev->curr_resync;
7668 } 7670 }
7669 } else 7671 } else
7670 mddev->recovery_cp = MaxSector; 7672 mddev->recovery_cp = MaxSector;
7671 } else { 7673 } else {
7672 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7674 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7673 mddev->curr_resync = MaxSector; 7675 mddev->curr_resync = MaxSector;
7674 rcu_read_lock(); 7676 rcu_read_lock();
7675 rdev_for_each_rcu(rdev, mddev) 7677 rdev_for_each_rcu(rdev, mddev)
7676 if (rdev->raid_disk >= 0 && 7678 if (rdev->raid_disk >= 0 &&
7677 mddev->delta_disks >= 0 && 7679 mddev->delta_disks >= 0 &&
7678 !test_bit(Faulty, &rdev->flags) && 7680 !test_bit(Faulty, &rdev->flags) &&
7679 !test_bit(In_sync, &rdev->flags) && 7681 !test_bit(In_sync, &rdev->flags) &&
7680 rdev->recovery_offset < mddev->curr_resync) 7682 rdev->recovery_offset < mddev->curr_resync)
7681 rdev->recovery_offset = mddev->curr_resync; 7683 rdev->recovery_offset = mddev->curr_resync;
7682 rcu_read_unlock(); 7684 rcu_read_unlock();
7683 } 7685 }
7684 } 7686 }
7685 skip: 7687 skip:
7686 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7688 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7687 7689
7688 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { 7690 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
7689 /* We completed so min/max setting can be forgotten if used. */ 7691 /* We completed so min/max setting can be forgotten if used. */
7690 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7692 if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7691 mddev->resync_min = 0; 7693 mddev->resync_min = 0;
7692 mddev->resync_max = MaxSector; 7694 mddev->resync_max = MaxSector;
7693 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) 7695 } else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
7694 mddev->resync_min = mddev->curr_resync_completed; 7696 mddev->resync_min = mddev->curr_resync_completed;
7695 mddev->curr_resync = 0; 7697 mddev->curr_resync = 0;
7696 wake_up(&resync_wait); 7698 wake_up(&resync_wait);
7697 set_bit(MD_RECOVERY_DONE, &mddev->recovery); 7699 set_bit(MD_RECOVERY_DONE, &mddev->recovery);
7698 md_wakeup_thread(mddev->thread); 7700 md_wakeup_thread(mddev->thread);
7699 return; 7701 return;
7700 } 7702 }
7701 EXPORT_SYMBOL_GPL(md_do_sync); 7703 EXPORT_SYMBOL_GPL(md_do_sync);
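
For clarity, the throttling step inside the main loop of md_do_sync() boils down to the comparison sketched below (a userspace-style distillation, names invented): speed is measured in KB/s over the sliding mark window, below speed_min the resync always runs at full rate, and above speed_max, or whenever normal I/O is detected, the loop sleeps 500 ms and re-checks.

#include <stdbool.h>

static bool should_back_off(unsigned long sectors_since_mark,
			    unsigned long seconds_since_mark,
			    unsigned int speed_min_kbs,
			    unsigned int speed_max_kbs,
			    bool array_idle)
{
	/* same arithmetic as the currspeed line above: sectors/2 gives KB,
	 * and the +1 terms avoid division by zero right after a mark */
	unsigned long currspeed =
		(sectors_since_mark / 2) / (seconds_since_mark + 1) + 1;

	if (currspeed <= speed_min_kbs)
		return false;			/* guaranteed minimum: never throttle */

	/* the caller would msleep(500) and re-evaluate when this is true */
	return currspeed > speed_max_kbs || !array_idle;
}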
7702 7704
7703 static int remove_and_add_spares(struct mddev *mddev, 7705 static int remove_and_add_spares(struct mddev *mddev,
7704 struct md_rdev *this) 7706 struct md_rdev *this)
7705 { 7707 {
7706 struct md_rdev *rdev; 7708 struct md_rdev *rdev;
7707 int spares = 0; 7709 int spares = 0;
7708 int removed = 0; 7710 int removed = 0;
7709 7711
7710 rdev_for_each(rdev, mddev) 7712 rdev_for_each(rdev, mddev)
7711 if ((this == NULL || rdev == this) && 7713 if ((this == NULL || rdev == this) &&
7712 rdev->raid_disk >= 0 && 7714 rdev->raid_disk >= 0 &&
7713 !test_bit(Blocked, &rdev->flags) && 7715 !test_bit(Blocked, &rdev->flags) &&
7714 (test_bit(Faulty, &rdev->flags) || 7716 (test_bit(Faulty, &rdev->flags) ||
7715 ! test_bit(In_sync, &rdev->flags)) && 7717 ! test_bit(In_sync, &rdev->flags)) &&
7716 atomic_read(&rdev->nr_pending)==0) { 7718 atomic_read(&rdev->nr_pending)==0) {
7717 if (mddev->pers->hot_remove_disk( 7719 if (mddev->pers->hot_remove_disk(
7718 mddev, rdev) == 0) { 7720 mddev, rdev) == 0) {
7719 sysfs_unlink_rdev(mddev, rdev); 7721 sysfs_unlink_rdev(mddev, rdev);
7720 rdev->raid_disk = -1; 7722 rdev->raid_disk = -1;
7721 removed++; 7723 removed++;
7722 } 7724 }
7723 } 7725 }
7724 if (removed && mddev->kobj.sd) 7726 if (removed && mddev->kobj.sd)
7725 sysfs_notify(&mddev->kobj, NULL, "degraded"); 7727 sysfs_notify(&mddev->kobj, NULL, "degraded");
7726 7728
7727 if (this) 7729 if (this)
7728 goto no_add; 7730 goto no_add;
7729 7731
7730 rdev_for_each(rdev, mddev) { 7732 rdev_for_each(rdev, mddev) {
7731 if (rdev->raid_disk >= 0 && 7733 if (rdev->raid_disk >= 0 &&
7732 !test_bit(In_sync, &rdev->flags) && 7734 !test_bit(In_sync, &rdev->flags) &&
7733 !test_bit(Faulty, &rdev->flags)) 7735 !test_bit(Faulty, &rdev->flags))
7734 spares++; 7736 spares++;
7735 if (rdev->raid_disk >= 0) 7737 if (rdev->raid_disk >= 0)
7736 continue; 7738 continue;
7737 if (test_bit(Faulty, &rdev->flags)) 7739 if (test_bit(Faulty, &rdev->flags))
7738 continue; 7740 continue;
7739 if (mddev->ro && 7741 if (mddev->ro &&
7740 ! (rdev->saved_raid_disk >= 0 && 7742 ! (rdev->saved_raid_disk >= 0 &&
7741 !test_bit(Bitmap_sync, &rdev->flags))) 7743 !test_bit(Bitmap_sync, &rdev->flags)))
7742 continue; 7744 continue;
7743 7745
7744 if (rdev->saved_raid_disk < 0) 7746 if (rdev->saved_raid_disk < 0)
7745 rdev->recovery_offset = 0; 7747 rdev->recovery_offset = 0;
7746 if (mddev->pers-> 7748 if (mddev->pers->
7747 hot_add_disk(mddev, rdev) == 0) { 7749 hot_add_disk(mddev, rdev) == 0) {
7748 if (sysfs_link_rdev(mddev, rdev)) 7750 if (sysfs_link_rdev(mddev, rdev))
7749 /* failure here is OK */; 7751 /* failure here is OK */;
7750 spares++; 7752 spares++;
7751 md_new_event(mddev); 7753 md_new_event(mddev);
7752 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7754 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7753 } 7755 }
7754 } 7756 }
7755 no_add: 7757 no_add:
7756 if (removed) 7758 if (removed)
7757 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7759 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7758 return spares; 7760 return spares;
7759 } 7761 }
7760 7762
7761 /* 7763 /*
7762 * This routine is regularly called by all per-raid-array threads to 7764 * This routine is regularly called by all per-raid-array threads to
7763 * deal with generic issues like resync and super-block update. 7765 * deal with generic issues like resync and super-block update.
7764 * Raid personalities that don't have a thread (linear/raid0) do not 7766 * Raid personalities that don't have a thread (linear/raid0) do not
7765 * need this as they never do any recovery or update the superblock. 7767 * need this as they never do any recovery or update the superblock.
7766 * 7768 *
7767 * It does not do any resync itself, but rather "forks" off other threads 7769 * It does not do any resync itself, but rather "forks" off other threads
7768 * to do that as needed. 7770 * to do that as needed.
7769 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in 7771 * When it is determined that resync is needed, we set MD_RECOVERY_RUNNING in
7770 * "->recovery" and create a thread at ->sync_thread. 7772 * "->recovery" and create a thread at ->sync_thread.
7771 * When the thread finishes it sets MD_RECOVERY_DONE 7773 * When the thread finishes it sets MD_RECOVERY_DONE
7772 * and wakes up this thread, which will reap the thread and finish up. 7774 * and wakes up this thread, which will reap the thread and finish up.
7773 * This thread also removes any faulty devices (with nr_pending == 0). 7775 * This thread also removes any faulty devices (with nr_pending == 0).
7774 * 7776 *
7775 * The overall approach is: 7777 * The overall approach is:
7776 * 1/ if the superblock needs updating, update it. 7778 * 1/ if the superblock needs updating, update it.
7777 * 2/ If a recovery thread is running, don't do anything else. 7779 * 2/ If a recovery thread is running, don't do anything else.
7778 * 3/ If recovery has finished, clean up, possibly marking spares active. 7780 * 3/ If recovery has finished, clean up, possibly marking spares active.
7779 * 4/ If there are any faulty devices, remove them. 7781 * 4/ If there are any faulty devices, remove them.
7780 * 5/ If array is degraded, try to add spare devices 7782 * 5/ If array is degraded, try to add spare devices
7781 * 6/ If array has spares or is not in-sync, start a resync thread. 7783 * 6/ If array has spares or is not in-sync, start a resync thread.
7782 */ 7784 */
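/* Editor's note: a minimal sketch (not part of the original change) of how
 * callers typically kick this state machine - request a recovery pass and
 * wake the per-array thread, which then runs md_check_recovery() below:
 */
#if 0	/* illustrative only, not built */
static void example_request_recovery(struct mddev *mddev)
{
	/* ask md_check_recovery() to start a resync/recovery on its next run */
	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
	md_wakeup_thread(mddev->thread);
}
#endif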
7783 void md_check_recovery(struct mddev *mddev) 7785 void md_check_recovery(struct mddev *mddev)
7784 { 7786 {
7785 if (mddev->suspended) 7787 if (mddev->suspended)
7786 return; 7788 return;
7787 7789
7788 if (mddev->bitmap) 7790 if (mddev->bitmap)
7789 bitmap_daemon_work(mddev); 7791 bitmap_daemon_work(mddev);
7790 7792
7791 if (signal_pending(current)) { 7793 if (signal_pending(current)) {
7792 if (mddev->pers->sync_request && !mddev->external) { 7794 if (mddev->pers->sync_request && !mddev->external) {
7793 printk(KERN_INFO "md: %s in immediate safe mode\n", 7795 printk(KERN_INFO "md: %s in immediate safe mode\n",
7794 mdname(mddev)); 7796 mdname(mddev));
7795 mddev->safemode = 2; 7797 mddev->safemode = 2;
7796 } 7798 }
7797 flush_signals(current); 7799 flush_signals(current);
7798 } 7800 }
7799 7801
7800 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery)) 7802 if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
7801 return; 7803 return;
7802 if ( ! ( 7804 if ( ! (
7803 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) || 7805 (mddev->flags & MD_UPDATE_SB_FLAGS & ~ (1<<MD_CHANGE_PENDING)) ||
7804 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7806 test_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7805 test_bit(MD_RECOVERY_DONE, &mddev->recovery) || 7807 test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
7806 (mddev->external == 0 && mddev->safemode == 1) || 7808 (mddev->external == 0 && mddev->safemode == 1) ||
7807 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending) 7809 (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
7808 && !mddev->in_sync && mddev->recovery_cp == MaxSector) 7810 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
7809 )) 7811 ))
7810 return; 7812 return;
7811 7813
7812 if (mddev_trylock(mddev)) { 7814 if (mddev_trylock(mddev)) {
7813 int spares = 0; 7815 int spares = 0;
7814 7816
7815 if (mddev->ro) { 7817 if (mddev->ro) {
7816 /* On a read-only array we can: 7818 /* On a read-only array we can:
7817 * - remove failed devices 7819 * - remove failed devices
7818 * - add already-in_sync devices if the array itself 7820 * - add already-in_sync devices if the array itself
7819 * is in-sync. 7821 * is in-sync.
7820 * As we only add devices that are already in-sync, 7822 * As we only add devices that are already in-sync,
7821 * we can activate the spares immediately. 7823 * we can activate the spares immediately.
7822 */ 7824 */
7823 remove_and_add_spares(mddev, NULL); 7825 remove_and_add_spares(mddev, NULL);
7824 /* There is no thread, but we need to call 7826 /* There is no thread, but we need to call
7825 * ->spare_active and clear saved_raid_disk 7827 * ->spare_active and clear saved_raid_disk
7826 */ 7828 */
7829 set_bit(MD_RECOVERY_INTR, &mddev->recovery);
7827 md_reap_sync_thread(mddev); 7830 md_reap_sync_thread(mddev);
7828 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7831 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7829 goto unlock; 7832 goto unlock;
7830 } 7833 }
7831 7834
7832 if (!mddev->external) { 7835 if (!mddev->external) {
7833 int did_change = 0; 7836 int did_change = 0;
7834 spin_lock_irq(&mddev->write_lock); 7837 spin_lock_irq(&mddev->write_lock);
7835 if (mddev->safemode && 7838 if (mddev->safemode &&
7836 !atomic_read(&mddev->writes_pending) && 7839 !atomic_read(&mddev->writes_pending) &&
7837 !mddev->in_sync && 7840 !mddev->in_sync &&
7838 mddev->recovery_cp == MaxSector) { 7841 mddev->recovery_cp == MaxSector) {
7839 mddev->in_sync = 1; 7842 mddev->in_sync = 1;
7840 did_change = 1; 7843 did_change = 1;
7841 set_bit(MD_CHANGE_CLEAN, &mddev->flags); 7844 set_bit(MD_CHANGE_CLEAN, &mddev->flags);
7842 } 7845 }
7843 if (mddev->safemode == 1) 7846 if (mddev->safemode == 1)
7844 mddev->safemode = 0; 7847 mddev->safemode = 0;
7845 spin_unlock_irq(&mddev->write_lock); 7848 spin_unlock_irq(&mddev->write_lock);
7846 if (did_change) 7849 if (did_change)
7847 sysfs_notify_dirent_safe(mddev->sysfs_state); 7850 sysfs_notify_dirent_safe(mddev->sysfs_state);
7848 } 7851 }
7849 7852
7850 if (mddev->flags & MD_UPDATE_SB_FLAGS) 7853 if (mddev->flags & MD_UPDATE_SB_FLAGS)
7851 md_update_sb(mddev, 0); 7854 md_update_sb(mddev, 0);
7852 7855
7853 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) && 7856 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) &&
7854 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) { 7857 !test_bit(MD_RECOVERY_DONE, &mddev->recovery)) {
7855 /* resync/recovery still happening */ 7858 /* resync/recovery still happening */
7856 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7859 clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7857 goto unlock; 7860 goto unlock;
7858 } 7861 }
7859 if (mddev->sync_thread) { 7862 if (mddev->sync_thread) {
7860 md_reap_sync_thread(mddev); 7863 md_reap_sync_thread(mddev);
7861 goto unlock; 7864 goto unlock;
7862 } 7865 }
7863 /* Set RUNNING before clearing NEEDED to avoid 7866 /* Set RUNNING before clearing NEEDED to avoid
7864 * any transients in the value of "sync_action". 7867 * any transients in the value of "sync_action".
7865 */ 7868 */
7866 mddev->curr_resync_completed = 0; 7869 mddev->curr_resync_completed = 0;
7867 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7870 set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7868 /* Clear some bits that don't mean anything, but 7871 /* Clear some bits that don't mean anything, but
7869 * might be left set 7872 * might be left set
7870 */ 7873 */
7871 clear_bit(MD_RECOVERY_INTR, &mddev->recovery); 7874 clear_bit(MD_RECOVERY_INTR, &mddev->recovery);
7872 clear_bit(MD_RECOVERY_DONE, &mddev->recovery); 7875 clear_bit(MD_RECOVERY_DONE, &mddev->recovery);
7873 7876
7874 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) || 7877 if (!test_and_clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery) ||
7875 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery)) 7878 test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
7876 goto unlock; 7879 goto unlock;
7877 /* no recovery is running. 7880 /* no recovery is running.
7878 * remove any failed drives, then 7881 * remove any failed drives, then
7879 * add spares if possible. 7882 * add spares if possible.
7880 * Spares are also removed and re-added, to allow 7883 * Spares are also removed and re-added, to allow
7881 * the personality to fail the re-add. 7884 * the personality to fail the re-add.
7882 */ 7885 */
7883 7886
7884 if (mddev->reshape_position != MaxSector) { 7887 if (mddev->reshape_position != MaxSector) {
7885 if (mddev->pers->check_reshape == NULL || 7888 if (mddev->pers->check_reshape == NULL ||
7886 mddev->pers->check_reshape(mddev) != 0) 7889 mddev->pers->check_reshape(mddev) != 0)
7887 /* Cannot proceed */ 7890 /* Cannot proceed */
7888 goto unlock; 7891 goto unlock;
7889 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7892 set_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7890 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7893 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7891 } else if ((spares = remove_and_add_spares(mddev, NULL))) { 7894 } else if ((spares = remove_and_add_spares(mddev, NULL))) {
7892 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7895 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7893 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7896 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7894 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7897 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7895 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7898 set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7896 } else if (mddev->recovery_cp < MaxSector) { 7899 } else if (mddev->recovery_cp < MaxSector) {
7897 set_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7900 set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7898 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery); 7901 clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
7899 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) 7902 } else if (!test_bit(MD_RECOVERY_SYNC, &mddev->recovery))
7900 /* nothing to be done ... */ 7903 /* nothing to be done ... */
7901 goto unlock; 7904 goto unlock;
7902 7905
7903 if (mddev->pers->sync_request) { 7906 if (mddev->pers->sync_request) {
7904 if (spares) { 7907 if (spares) {
7905 /* We are adding a device or devices to an array 7908 /* We are adding a device or devices to an array
7906 * which has the bitmap stored on all devices. 7909 * which has the bitmap stored on all devices.
7907 * So make sure all bitmap pages get written 7910 * So make sure all bitmap pages get written
7908 */ 7911 */
7909 bitmap_write_all(mddev->bitmap); 7912 bitmap_write_all(mddev->bitmap);
7910 } 7913 }
7911 mddev->sync_thread = md_register_thread(md_do_sync, 7914 mddev->sync_thread = md_register_thread(md_do_sync,
7912 mddev, 7915 mddev,
7913 "resync"); 7916 "resync");
7914 if (!mddev->sync_thread) { 7917 if (!mddev->sync_thread) {
7915 printk(KERN_ERR "%s: could not start resync" 7918 printk(KERN_ERR "%s: could not start resync"
7916 " thread...\n", 7919 " thread...\n",
7917 mdname(mddev)); 7920 mdname(mddev));
7918 /* leave the spares where they are, it shouldn't hurt */ 7921 /* leave the spares where they are, it shouldn't hurt */
7919 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7922 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7920 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7923 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7921 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7924 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7922 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7925 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7923 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7926 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7924 } else 7927 } else
7925 md_wakeup_thread(mddev->sync_thread); 7928 md_wakeup_thread(mddev->sync_thread);
7926 sysfs_notify_dirent_safe(mddev->sysfs_action); 7929 sysfs_notify_dirent_safe(mddev->sysfs_action);
7927 md_new_event(mddev); 7930 md_new_event(mddev);
7928 } 7931 }
7929 unlock: 7932 unlock:
7930 wake_up(&mddev->sb_wait); 7933 wake_up(&mddev->sb_wait);
7931 7934
7932 if (!mddev->sync_thread) { 7935 if (!mddev->sync_thread) {
7933 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7936 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7934 if (test_and_clear_bit(MD_RECOVERY_RECOVER, 7937 if (test_and_clear_bit(MD_RECOVERY_RECOVER,
7935 &mddev->recovery)) 7938 &mddev->recovery))
7936 if (mddev->sysfs_action) 7939 if (mddev->sysfs_action)
7937 sysfs_notify_dirent_safe(mddev->sysfs_action); 7940 sysfs_notify_dirent_safe(mddev->sysfs_action);
7938 } 7941 }
7939 mddev_unlock(mddev); 7942 mddev_unlock(mddev);
7940 } 7943 }
7941 } 7944 }
7942 7945
7943 void md_reap_sync_thread(struct mddev *mddev) 7946 void md_reap_sync_thread(struct mddev *mddev)
7944 { 7947 {
7945 struct md_rdev *rdev; 7948 struct md_rdev *rdev;
7946 7949
7947 /* resync has finished, collect result */ 7950 /* resync has finished, collect result */
7948 md_unregister_thread(&mddev->sync_thread); 7951 md_unregister_thread(&mddev->sync_thread);
7949 wake_up(&resync_wait); 7952 wake_up(&resync_wait);
7950 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) && 7953 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
7951 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { 7954 !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
7952 /* success...*/ 7955 /* success...*/
7953 /* activate any spares */ 7956 /* activate any spares */
7954 if (mddev->pers->spare_active(mddev)) { 7957 if (mddev->pers->spare_active(mddev)) {
7955 sysfs_notify(&mddev->kobj, NULL, 7958 sysfs_notify(&mddev->kobj, NULL,
7956 "degraded"); 7959 "degraded");
7957 set_bit(MD_CHANGE_DEVS, &mddev->flags); 7960 set_bit(MD_CHANGE_DEVS, &mddev->flags);
7958 } 7961 }
7959 } 7962 }
7960 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) && 7963 if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
7961 mddev->pers->finish_reshape) 7964 mddev->pers->finish_reshape)
7962 mddev->pers->finish_reshape(mddev); 7965 mddev->pers->finish_reshape(mddev);
7963 7966
7964 /* If array is no longer degraded, then any saved_raid_disk 7967 /* If array is no longer degraded, then any saved_raid_disk
7965 * information must be scrapped. 7968 * information must be scrapped.
7966 */ 7969 */
7967 if (!mddev->degraded) 7970 if (!mddev->degraded)
7968 rdev_for_each(rdev, mddev) 7971 rdev_for_each(rdev, mddev)
7969 rdev->saved_raid_disk = -1; 7972 rdev->saved_raid_disk = -1;
7970 7973
7971 md_update_sb(mddev, 1); 7974 md_update_sb(mddev, 1);
7972 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery); 7975 clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
7973 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); 7976 clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
7974 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery); 7977 clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
7975 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery); 7978 clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
7976 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery); 7979 clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
7977 /* flag recovery needed just to double check */ 7980 /* flag recovery needed just to double check */
7978 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7981 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7979 sysfs_notify_dirent_safe(mddev->sysfs_action); 7982 sysfs_notify_dirent_safe(mddev->sysfs_action);
7980 md_new_event(mddev); 7983 md_new_event(mddev);
7981 if (mddev->event_work.func) 7984 if (mddev->event_work.func)
7982 queue_work(md_misc_wq, &mddev->event_work); 7985 queue_work(md_misc_wq, &mddev->event_work);
7983 } 7986 }
7984 7987
7985 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev) 7988 void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev)
7986 { 7989 {
7987 sysfs_notify_dirent_safe(rdev->sysfs_state); 7990 sysfs_notify_dirent_safe(rdev->sysfs_state);
7988 wait_event_timeout(rdev->blocked_wait, 7991 wait_event_timeout(rdev->blocked_wait,
7989 !test_bit(Blocked, &rdev->flags) && 7992 !test_bit(Blocked, &rdev->flags) &&
7990 !test_bit(BlockedBadBlocks, &rdev->flags), 7993 !test_bit(BlockedBadBlocks, &rdev->flags),
7991 msecs_to_jiffies(5000)); 7994 msecs_to_jiffies(5000));
7992 rdev_dec_pending(rdev, mddev); 7995 rdev_dec_pending(rdev, mddev);
7993 } 7996 }
7994 EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7997 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7995 7998
7996 void md_finish_reshape(struct mddev *mddev) 7999 void md_finish_reshape(struct mddev *mddev)
7997 { 8000 {
7998 /* called by the personality module when a reshape completes. */ 8001 /* called by the personality module when a reshape completes. */
7999 struct md_rdev *rdev; 8002 struct md_rdev *rdev;
8000 8003
8001 rdev_for_each(rdev, mddev) { 8004 rdev_for_each(rdev, mddev) {
8002 if (rdev->data_offset > rdev->new_data_offset) 8005 if (rdev->data_offset > rdev->new_data_offset)
8003 rdev->sectors += rdev->data_offset - rdev->new_data_offset; 8006 rdev->sectors += rdev->data_offset - rdev->new_data_offset;
8004 else 8007 else
8005 rdev->sectors -= rdev->new_data_offset - rdev->data_offset; 8008 rdev->sectors -= rdev->new_data_offset - rdev->data_offset;
8006 rdev->data_offset = rdev->new_data_offset; 8009 rdev->data_offset = rdev->new_data_offset;
8007 } 8010 }
8008 } 8011 }
8009 EXPORT_SYMBOL(md_finish_reshape); 8012 EXPORT_SYMBOL(md_finish_reshape);
8010 8013
8011 /* Bad block management. 8014 /* Bad block management.
8012 * We can record which blocks on each device are 'bad' and so just 8015 * We can record which blocks on each device are 'bad' and so just
8013 * fail those blocks, or that stripe, rather than the whole device. 8016 * fail those blocks, or that stripe, rather than the whole device.
8014 * Entries in the bad-block table are 64 bits wide. This comprises: 8017 * Entries in the bad-block table are 64 bits wide. This comprises:
8015 * Length of bad-range, in sectors: 0-511 for lengths 1-512 8018 * Length of bad-range, in sectors: 0-511 for lengths 1-512
8016 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) 8019 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
8017 * A 'shift' can be set so that larger blocks are tracked and 8020 * A 'shift' can be set so that larger blocks are tracked and
8018 * consequently larger devices can be covered. 8021 * consequently larger devices can be covered.
8019 * 'Acknowledged' flag - 1 bit - the most significant bit. 8022 * 'Acknowledged' flag - 1 bit - the most significant bit.
8020 * 8023 *
8021 * Locking of the bad-block table uses a seqlock so md_is_badblock 8024 * Locking of the bad-block table uses a seqlock so md_is_badblock
8022 * might need to retry if it is very unlucky. 8025 * might need to retry if it is very unlucky.
8023 * We will sometimes want to check for bad blocks in a bi_end_io function, 8026 * We will sometimes want to check for bad blocks in a bi_end_io function,
8024 * so we use the write_seqlock_irq variant. 8027 * so we use the write_seqlock_irq variant.
8025 * 8028 *
8026 * When looking for a bad block we specify a range and want to 8029 * When looking for a bad block we specify a range and want to
8027 * know if any block in the range is bad. So we binary-search 8030 * know if any block in the range is bad. So we binary-search
8028 * to the last range that starts at-or-before the given endpoint, 8031 * to the last range that starts at-or-before the given endpoint,
8029 * (or "before the sector after the target range") 8032 * (or "before the sector after the target range")
8030 * then see if it ends after the given start. 8033 * then see if it ends after the given start.
8031 * We return 8034 * We return
8032 * 0 if there are no known bad blocks in the range 8035 * 0 if there are no known bad blocks in the range
8033 * 1 if there are known bad blocks which are all acknowledged 8036 * 1 if there are known bad blocks which are all acknowledged
8034 * -1 if there are bad blocks which have not yet been acknowledged in metadata. 8037 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
8035 * plus the start/length of the first bad section we overlap. 8038 * plus the start/length of the first bad section we overlap.
8036 */ 8039 */
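/* Editor's note: an illustrative sketch of how a single table entry is
 * packed, assuming the BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK helpers follow the
 * layout described above (bits 0-8: length-1, bits 9-62: start sector,
 * bit 63: acknowledged):
 */
#if 0	/* illustrative only, not built */
static void example_pack_entry(void)
{
	/* record 8 bad sectors starting at sector 4096, acknowledged */
	u64 entry = BB_MAKE(4096, 8, 1);

	WARN_ON(BB_OFFSET(entry) != 4096);	/* start sector recovered */
	WARN_ON(BB_LEN(entry) != 8);		/* length recovered */
	WARN_ON(!BB_ACK(entry));		/* most significant bit set */
}
#endif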
8037 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, 8040 int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
8038 sector_t *first_bad, int *bad_sectors) 8041 sector_t *first_bad, int *bad_sectors)
8039 { 8042 {
8040 int hi; 8043 int hi;
8041 int lo; 8044 int lo;
8042 u64 *p = bb->page; 8045 u64 *p = bb->page;
8043 int rv; 8046 int rv;
8044 sector_t target = s + sectors; 8047 sector_t target = s + sectors;
8045 unsigned seq; 8048 unsigned seq;
8046 8049
8047 if (bb->shift > 0) { 8050 if (bb->shift > 0) {
8048 /* round the start down, and the end up */ 8051 /* round the start down, and the end up */
8049 s >>= bb->shift; 8052 s >>= bb->shift;
8050 target += (1<<bb->shift) - 1; 8053 target += (1<<bb->shift) - 1;
8051 target >>= bb->shift; 8054 target >>= bb->shift;
8052 sectors = target - s; 8055 sectors = target - s;
8053 } 8056 }
8054 /* 'target' is now the first block after the bad range */ 8057 /* 'target' is now the first block after the bad range */
8055 8058
8056 retry: 8059 retry:
8057 seq = read_seqbegin(&bb->lock); 8060 seq = read_seqbegin(&bb->lock);
8058 lo = 0; 8061 lo = 0;
8059 rv = 0; 8062 rv = 0;
8060 hi = bb->count; 8063 hi = bb->count;
8061 8064
8062 /* Binary search between lo and hi for 'target' 8065 /* Binary search between lo and hi for 'target'
8063 * i.e. for the last range that starts before 'target' 8066 * i.e. for the last range that starts before 'target'
8064 */ 8067 */
8065 /* INVARIANT: ranges before 'lo' and at-or-after 'hi' 8068 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
8066 * are known not to be the last range before target. 8069 * are known not to be the last range before target.
8067 * VARIANT: hi-lo is the number of possible 8070 * VARIANT: hi-lo is the number of possible
8068 * ranges, and decreases until it reaches 1 8071 * ranges, and decreases until it reaches 1
8069 */ 8072 */
8070 while (hi - lo > 1) { 8073 while (hi - lo > 1) {
8071 int mid = (lo + hi) / 2; 8074 int mid = (lo + hi) / 2;
8072 sector_t a = BB_OFFSET(p[mid]); 8075 sector_t a = BB_OFFSET(p[mid]);
8073 if (a < target) 8076 if (a < target)
8074 /* This could still be the one, earlier ranges 8077 /* This could still be the one, earlier ranges
8075 * could not. */ 8078 * could not. */
8076 lo = mid; 8079 lo = mid;
8077 else 8080 else
8078 /* This and later ranges are definitely out. */ 8081 /* This and later ranges are definitely out. */
8079 hi = mid; 8082 hi = mid;
8080 } 8083 }
8081 /* 'lo' might be the last that started before target, but 'hi' isn't */ 8084 /* 'lo' might be the last that started before target, but 'hi' isn't */
8082 if (hi > lo) { 8085 if (hi > lo) {
8083 /* need to check all ranges that end after 's' to see if 8086 /* need to check all ranges that end after 's' to see if
8084 * any are unacknowledged. 8087 * any are unacknowledged.
8085 */ 8088 */
8086 while (lo >= 0 && 8089 while (lo >= 0 &&
8087 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8090 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8088 if (BB_OFFSET(p[lo]) < target) { 8091 if (BB_OFFSET(p[lo]) < target) {
8089 /* starts before the end, and finishes after 8092 /* starts before the end, and finishes after
8090 * the start, so they must overlap 8093 * the start, so they must overlap
8091 */ 8094 */
8092 if (rv != -1 && BB_ACK(p[lo])) 8095 if (rv != -1 && BB_ACK(p[lo]))
8093 rv = 1; 8096 rv = 1;
8094 else 8097 else
8095 rv = -1; 8098 rv = -1;
8096 *first_bad = BB_OFFSET(p[lo]); 8099 *first_bad = BB_OFFSET(p[lo]);
8097 *bad_sectors = BB_LEN(p[lo]); 8100 *bad_sectors = BB_LEN(p[lo]);
8098 } 8101 }
8099 lo--; 8102 lo--;
8100 } 8103 }
8101 } 8104 }
8102 8105
8103 if (read_seqretry(&bb->lock, seq)) 8106 if (read_seqretry(&bb->lock, seq))
8104 goto retry; 8107 goto retry;
8105 8108
8106 return rv; 8109 return rv;
8107 } 8110 }
8108 EXPORT_SYMBOL_GPL(md_is_badblock); 8111 EXPORT_SYMBOL_GPL(md_is_badblock);
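/* Editor's note: a hedged usage sketch of md_is_badblock() from a caller's
 * point of view; the helper name is hypothetical, but rdev->badblocks and
 * the return convention are as documented above:
 */
#if 0	/* illustrative only, not built */
static bool example_range_is_clean(struct md_rdev *rdev,
				   sector_t sector, int nr_sectors)
{
	sector_t first_bad;
	int bad_sectors;

	/* 0: no bad blocks; 1: acknowledged bad blocks starting at first_bad
	 * for bad_sectors sectors; -1: unacknowledged bad blocks
	 */
	return md_is_badblock(&rdev->badblocks, sector, nr_sectors,
			      &first_bad, &bad_sectors) == 0;
}
#endif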
8109 8112
8110 /* 8113 /*
8111 * Add a range of bad blocks to the table. 8114 * Add a range of bad blocks to the table.
8112 * This might extend the table, or might contract it 8115 * This might extend the table, or might contract it
8113 * if two adjacent ranges can be merged. 8116 * if two adjacent ranges can be merged.
8114 * We binary-search to find the 'insertion' point, then 8117 * We binary-search to find the 'insertion' point, then
8115 * decide how best to handle it. 8118 * decide how best to handle it.
8116 */ 8119 */
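/* Editor's note: a worked example of the merge logic below (illustrative).
 * Suppose the table holds two acknowledged ranges, 8 sectors at 100 and
 * 8 sectors at 120, and 16 new bad sectors starting at 104 are added:
 * the first merge extends the range at 100 to 20 sectors (so it now ends
 * at 120 and nothing is left to insert), and the final "combine lo and hi"
 * step collapses the table to a single 28-sector range starting at 100.
 */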
8117 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, 8120 static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
8118 int acknowledged) 8121 int acknowledged)
8119 { 8122 {
8120 u64 *p; 8123 u64 *p;
8121 int lo, hi; 8124 int lo, hi;
8122 int rv = 1; 8125 int rv = 1;
8123 unsigned long flags; 8126 unsigned long flags;
8124 8127
8125 if (bb->shift < 0) 8128 if (bb->shift < 0)
8126 /* badblocks are disabled */ 8129 /* badblocks are disabled */
8127 return 0; 8130 return 0;
8128 8131
8129 if (bb->shift) { 8132 if (bb->shift) {
8130 /* round the start down, and the end up */ 8133 /* round the start down, and the end up */
8131 sector_t next = s + sectors; 8134 sector_t next = s + sectors;
8132 s >>= bb->shift; 8135 s >>= bb->shift;
8133 next += (1<<bb->shift) - 1; 8136 next += (1<<bb->shift) - 1;
8134 next >>= bb->shift; 8137 next >>= bb->shift;
8135 sectors = next - s; 8138 sectors = next - s;
8136 } 8139 }
8137 8140
8138 write_seqlock_irqsave(&bb->lock, flags); 8141 write_seqlock_irqsave(&bb->lock, flags);
8139 8142
8140 p = bb->page; 8143 p = bb->page;
8141 lo = 0; 8144 lo = 0;
8142 hi = bb->count; 8145 hi = bb->count;
8143 /* Find the last range that starts at-or-before 's' */ 8146 /* Find the last range that starts at-or-before 's' */
8144 while (hi - lo > 1) { 8147 while (hi - lo > 1) {
8145 int mid = (lo + hi) / 2; 8148 int mid = (lo + hi) / 2;
8146 sector_t a = BB_OFFSET(p[mid]); 8149 sector_t a = BB_OFFSET(p[mid]);
8147 if (a <= s) 8150 if (a <= s)
8148 lo = mid; 8151 lo = mid;
8149 else 8152 else
8150 hi = mid; 8153 hi = mid;
8151 } 8154 }
8152 if (hi > lo && BB_OFFSET(p[lo]) > s) 8155 if (hi > lo && BB_OFFSET(p[lo]) > s)
8153 hi = lo; 8156 hi = lo;
8154 8157
8155 if (hi > lo) { 8158 if (hi > lo) {
8156 /* we found a range that might merge with the start 8159 /* we found a range that might merge with the start
8157 * of our new range 8160 * of our new range
8158 */ 8161 */
8159 sector_t a = BB_OFFSET(p[lo]); 8162 sector_t a = BB_OFFSET(p[lo]);
8160 sector_t e = a + BB_LEN(p[lo]); 8163 sector_t e = a + BB_LEN(p[lo]);
8161 int ack = BB_ACK(p[lo]); 8164 int ack = BB_ACK(p[lo]);
8162 if (e >= s) { 8165 if (e >= s) {
8163 /* Yes, we can merge with a previous range */ 8166 /* Yes, we can merge with a previous range */
8164 if (s == a && s + sectors >= e) 8167 if (s == a && s + sectors >= e)
8165 /* new range covers old */ 8168 /* new range covers old */
8166 ack = acknowledged; 8169 ack = acknowledged;
8167 else 8170 else
8168 ack = ack && acknowledged; 8171 ack = ack && acknowledged;
8169 8172
8170 if (e < s + sectors) 8173 if (e < s + sectors)
8171 e = s + sectors; 8174 e = s + sectors;
8172 if (e - a <= BB_MAX_LEN) { 8175 if (e - a <= BB_MAX_LEN) {
8173 p[lo] = BB_MAKE(a, e-a, ack); 8176 p[lo] = BB_MAKE(a, e-a, ack);
8174 s = e; 8177 s = e;
8175 } else { 8178 } else {
8176 /* does not all fit in one range, 8179 /* does not all fit in one range,
8177 * make p[lo] maximal 8180 * make p[lo] maximal
8178 */ 8181 */
8179 if (BB_LEN(p[lo]) != BB_MAX_LEN) 8182 if (BB_LEN(p[lo]) != BB_MAX_LEN)
8180 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); 8183 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
8181 s = a + BB_MAX_LEN; 8184 s = a + BB_MAX_LEN;
8182 } 8185 }
8183 sectors = e - s; 8186 sectors = e - s;
8184 } 8187 }
8185 } 8188 }
8186 if (sectors && hi < bb->count) { 8189 if (sectors && hi < bb->count) {
8187 /* 'hi' points to the first range that starts after 's'. 8190 /* 'hi' points to the first range that starts after 's'.
8188 * Maybe we can merge with the start of that range */ 8191 * Maybe we can merge with the start of that range */
8189 sector_t a = BB_OFFSET(p[hi]); 8192 sector_t a = BB_OFFSET(p[hi]);
8190 sector_t e = a + BB_LEN(p[hi]); 8193 sector_t e = a + BB_LEN(p[hi]);
8191 int ack = BB_ACK(p[hi]); 8194 int ack = BB_ACK(p[hi]);
8192 if (a <= s + sectors) { 8195 if (a <= s + sectors) {
8193 /* merging is possible */ 8196 /* merging is possible */
8194 if (e <= s + sectors) { 8197 if (e <= s + sectors) {
8195 /* full overlap */ 8198 /* full overlap */
8196 e = s + sectors; 8199 e = s + sectors;
8197 ack = acknowledged; 8200 ack = acknowledged;
8198 } else 8201 } else
8199 ack = ack && acknowledged; 8202 ack = ack && acknowledged;
8200 8203
8201 a = s; 8204 a = s;
8202 if (e - a <= BB_MAX_LEN) { 8205 if (e - a <= BB_MAX_LEN) {
8203 p[hi] = BB_MAKE(a, e-a, ack); 8206 p[hi] = BB_MAKE(a, e-a, ack);
8204 s = e; 8207 s = e;
8205 } else { 8208 } else {
8206 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); 8209 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
8207 s = a + BB_MAX_LEN; 8210 s = a + BB_MAX_LEN;
8208 } 8211 }
8209 sectors = e - s; 8212 sectors = e - s;
8210 lo = hi; 8213 lo = hi;
8211 hi++; 8214 hi++;
8212 } 8215 }
8213 } 8216 }
8214 if (sectors == 0 && hi < bb->count) { 8217 if (sectors == 0 && hi < bb->count) {
8215 /* we might be able to combine lo and hi */ 8218 /* we might be able to combine lo and hi */
8216 /* Note: 's' is at the end of 'lo' */ 8219 /* Note: 's' is at the end of 'lo' */
8217 sector_t a = BB_OFFSET(p[hi]); 8220 sector_t a = BB_OFFSET(p[hi]);
8218 int lolen = BB_LEN(p[lo]); 8221 int lolen = BB_LEN(p[lo]);
8219 int hilen = BB_LEN(p[hi]); 8222 int hilen = BB_LEN(p[hi]);
8220 int newlen = lolen + hilen - (s - a); 8223 int newlen = lolen + hilen - (s - a);
8221 if (s >= a && newlen < BB_MAX_LEN) { 8224 if (s >= a && newlen < BB_MAX_LEN) {
8222 /* yes, we can combine them */ 8225 /* yes, we can combine them */
8223 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); 8226 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
8224 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); 8227 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
8225 memmove(p + hi, p + hi + 1, 8228 memmove(p + hi, p + hi + 1,
8226 (bb->count - hi - 1) * 8); 8229 (bb->count - hi - 1) * 8);
8227 bb->count--; 8230 bb->count--;
8228 } 8231 }
8229 } 8232 }
8230 while (sectors) { 8233 while (sectors) {
8231 /* didn't merge (all of it). 8234 /* didn't merge (all of it).
8232 * Need to add a range just before 'hi' */ 8235 * Need to add a range just before 'hi' */
8233 if (bb->count >= MD_MAX_BADBLOCKS) { 8236 if (bb->count >= MD_MAX_BADBLOCKS) {
8234 /* No room for more */ 8237 /* No room for more */
8235 rv = 0; 8238 rv = 0;
8236 break; 8239 break;
8237 } else { 8240 } else {
8238 int this_sectors = sectors; 8241 int this_sectors = sectors;
8239 memmove(p + hi + 1, p + hi, 8242 memmove(p + hi + 1, p + hi,
8240 (bb->count - hi) * 8); 8243 (bb->count - hi) * 8);
8241 bb->count++; 8244 bb->count++;
8242 8245
8243 if (this_sectors > BB_MAX_LEN) 8246 if (this_sectors > BB_MAX_LEN)
8244 this_sectors = BB_MAX_LEN; 8247 this_sectors = BB_MAX_LEN;
8245 p[hi] = BB_MAKE(s, this_sectors, acknowledged); 8248 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
8246 sectors -= this_sectors; 8249 sectors -= this_sectors;
8247 s += this_sectors; 8250 s += this_sectors;
8248 } 8251 }
8249 } 8252 }
8250 8253
8251 bb->changed = 1; 8254 bb->changed = 1;
8252 if (!acknowledged) 8255 if (!acknowledged)
8253 bb->unacked_exist = 1; 8256 bb->unacked_exist = 1;
8254 write_sequnlock_irqrestore(&bb->lock, flags); 8257 write_sequnlock_irqrestore(&bb->lock, flags);
8255 8258
8256 return rv; 8259 return rv;
8257 } 8260 }
8258 8261
8259 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8262 int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8260 int is_new) 8263 int is_new)
8261 { 8264 {
8262 int rv; 8265 int rv;
8263 if (is_new) 8266 if (is_new)
8264 s += rdev->new_data_offset; 8267 s += rdev->new_data_offset;
8265 else 8268 else
8266 s += rdev->data_offset; 8269 s += rdev->data_offset;
8267 rv = md_set_badblocks(&rdev->badblocks, 8270 rv = md_set_badblocks(&rdev->badblocks,
8268 s, sectors, 0); 8271 s, sectors, 0);
8269 if (rv) { 8272 if (rv) {
8270 /* Make sure they get written out promptly */ 8273 /* Make sure they get written out promptly */
8271 sysfs_notify_dirent_safe(rdev->sysfs_state); 8274 sysfs_notify_dirent_safe(rdev->sysfs_state);
8272 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); 8275 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
8273 md_wakeup_thread(rdev->mddev->thread); 8276 md_wakeup_thread(rdev->mddev->thread);
8274 } 8277 }
8275 return rv; 8278 return rv;
8276 } 8279 }
8277 EXPORT_SYMBOL_GPL(rdev_set_badblocks); 8280 EXPORT_SYMBOL_GPL(rdev_set_badblocks);
8278 8281
8279 /* 8282 /*
8280 * Remove a range of bad blocks from the table. 8283 * Remove a range of bad blocks from the table.
8281 * This may involve extending the table if we split a region, 8284 * This may involve extending the table if we split a region,
8282 * but it must not fail. So if the table becomes full, we just 8285 * but it must not fail. So if the table becomes full, we just
8283 * drop the remove request. 8286 * drop the remove request.
8284 */ 8287 */
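/* Editor's note: a worked example of the split case below (illustrative).
 * Clearing sectors 104-111 from a single 16-sector range starting at 100
 * keeps the head (4 sectors at 100), inserts a new entry for the tail
 * (4 sectors at 112) and so grows bb->count by one - which is why the
 * request is silently dropped, rather than failed, when the table is full.
 */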
8285 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) 8288 static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
8286 { 8289 {
8287 u64 *p; 8290 u64 *p;
8288 int lo, hi; 8291 int lo, hi;
8289 sector_t target = s + sectors; 8292 sector_t target = s + sectors;
8290 int rv = 0; 8293 int rv = 0;
8291 8294
8292 if (bb->shift > 0) { 8295 if (bb->shift > 0) {
8293 /* When clearing we round the start up and the end down. 8296 /* When clearing we round the start up and the end down.
8294 * This should not matter as the shift should align with 8297 * This should not matter as the shift should align with
8295 * the block size and no rounding should ever be needed. 8298 * the block size and no rounding should ever be needed.
8296 * However it is better to think a block is bad when it 8299 * However it is better to think a block is bad when it
8297 * isn't than to think a block is not bad when it is. 8300 * isn't than to think a block is not bad when it is.
8298 */ 8301 */
8299 s += (1<<bb->shift) - 1; 8302 s += (1<<bb->shift) - 1;
8300 s >>= bb->shift; 8303 s >>= bb->shift;
8301 target >>= bb->shift; 8304 target >>= bb->shift;
8302 sectors = target - s; 8305 sectors = target - s;
8303 } 8306 }
8304 8307
8305 write_seqlock_irq(&bb->lock); 8308 write_seqlock_irq(&bb->lock);
8306 8309
8307 p = bb->page; 8310 p = bb->page;
8308 lo = 0; 8311 lo = 0;
8309 hi = bb->count; 8312 hi = bb->count;
8310 /* Find the last range that starts before 'target' */ 8313 /* Find the last range that starts before 'target' */
8311 while (hi - lo > 1) { 8314 while (hi - lo > 1) {
8312 int mid = (lo + hi) / 2; 8315 int mid = (lo + hi) / 2;
8313 sector_t a = BB_OFFSET(p[mid]); 8316 sector_t a = BB_OFFSET(p[mid]);
8314 if (a < target) 8317 if (a < target)
8315 lo = mid; 8318 lo = mid;
8316 else 8319 else
8317 hi = mid; 8320 hi = mid;
8318 } 8321 }
8319 if (hi > lo) { 8322 if (hi > lo) {
8320 /* p[lo] is the last range that could overlap the 8323 /* p[lo] is the last range that could overlap the
8321 * current range. Earlier ranges could also overlap, 8324 * current range. Earlier ranges could also overlap,
8322 * but only this one can overlap the end of the range. 8325 * but only this one can overlap the end of the range.
8323 */ 8326 */
8324 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { 8327 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
8325 /* Partial overlap, leave the tail of this range */ 8328 /* Partial overlap, leave the tail of this range */
8326 int ack = BB_ACK(p[lo]); 8329 int ack = BB_ACK(p[lo]);
8327 sector_t a = BB_OFFSET(p[lo]); 8330 sector_t a = BB_OFFSET(p[lo]);
8328 sector_t end = a + BB_LEN(p[lo]); 8331 sector_t end = a + BB_LEN(p[lo]);
8329 8332
8330 if (a < s) { 8333 if (a < s) {
8331 /* we need to split this range */ 8334 /* we need to split this range */
8332 if (bb->count >= MD_MAX_BADBLOCKS) { 8335 if (bb->count >= MD_MAX_BADBLOCKS) {
8333 rv = 0; 8336 rv = 0;
8334 goto out; 8337 goto out;
8335 } 8338 }
8336 memmove(p+lo+1, p+lo, (bb->count - lo) * 8); 8339 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
8337 bb->count++; 8340 bb->count++;
8338 p[lo] = BB_MAKE(a, s-a, ack); 8341 p[lo] = BB_MAKE(a, s-a, ack);
8339 lo++; 8342 lo++;
8340 } 8343 }
8341 p[lo] = BB_MAKE(target, end - target, ack); 8344 p[lo] = BB_MAKE(target, end - target, ack);
8342 /* there is no longer an overlap */ 8345 /* there is no longer an overlap */
8343 hi = lo; 8346 hi = lo;
8344 lo--; 8347 lo--;
8345 } 8348 }
8346 while (lo >= 0 && 8349 while (lo >= 0 &&
8347 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { 8350 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
8348 /* This range does overlap */ 8351 /* This range does overlap */
8349 if (BB_OFFSET(p[lo]) < s) { 8352 if (BB_OFFSET(p[lo]) < s) {
8350 /* Keep the early parts of this range. */ 8353 /* Keep the early parts of this range. */
8351 int ack = BB_ACK(p[lo]); 8354 int ack = BB_ACK(p[lo]);
8352 sector_t start = BB_OFFSET(p[lo]); 8355 sector_t start = BB_OFFSET(p[lo]);
8353 p[lo] = BB_MAKE(start, s - start, ack); 8356 p[lo] = BB_MAKE(start, s - start, ack);
8354 /* now 'lo' doesn't overlap, so.. */ 8357 /* now 'lo' doesn't overlap, so.. */
8355 break; 8358 break;
8356 } 8359 }
8357 lo--; 8360 lo--;
8358 } 8361 }
8359 /* 'lo' is strictly before, 'hi' is strictly after, 8362 /* 'lo' is strictly before, 'hi' is strictly after,
8360 * anything between needs to be discarded 8363 * anything between needs to be discarded
8361 */ 8364 */
8362 if (hi - lo > 1) { 8365 if (hi - lo > 1) {
8363 memmove(p+lo+1, p+hi, (bb->count - hi) * 8); 8366 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
8364 bb->count -= (hi - lo - 1); 8367 bb->count -= (hi - lo - 1);
8365 } 8368 }
8366 } 8369 }
8367 8370
8368 bb->changed = 1; 8371 bb->changed = 1;
8369 out: 8372 out:
8370 write_sequnlock_irq(&bb->lock); 8373 write_sequnlock_irq(&bb->lock);
8371 return rv; 8374 return rv;
8372 } 8375 }
8373 8376
8374 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors, 8377 int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
8375 int is_new) 8378 int is_new)
8376 { 8379 {
8377 if (is_new) 8380 if (is_new)
8378 s += rdev->new_data_offset; 8381 s += rdev->new_data_offset;
8379 else 8382 else
8380 s += rdev->data_offset; 8383 s += rdev->data_offset;
8381 return md_clear_badblocks(&rdev->badblocks, 8384 return md_clear_badblocks(&rdev->badblocks,
8382 s, sectors); 8385 s, sectors);
8383 } 8386 }
8384 EXPORT_SYMBOL_GPL(rdev_clear_badblocks); 8387 EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
8385 8388
8386 /* 8389 /*
8387 * Acknowledge all bad blocks in a list. 8390 * Acknowledge all bad blocks in a list.
8388 * This only succeeds if ->changed is clear. It is used by 8391 * This only succeeds if ->changed is clear. It is used by
8389 * in-kernel metadata updates 8392 * in-kernel metadata updates
8390 */ 8393 */
8391 void md_ack_all_badblocks(struct badblocks *bb) 8394 void md_ack_all_badblocks(struct badblocks *bb)
8392 { 8395 {
8393 if (bb->page == NULL || bb->changed) 8396 if (bb->page == NULL || bb->changed)
8394 /* no point even trying */ 8397 /* no point even trying */
8395 return; 8398 return;
8396 write_seqlock_irq(&bb->lock); 8399 write_seqlock_irq(&bb->lock);
8397 8400
8398 if (bb->changed == 0 && bb->unacked_exist) { 8401 if (bb->changed == 0 && bb->unacked_exist) {
8399 u64 *p = bb->page; 8402 u64 *p = bb->page;
8400 int i; 8403 int i;
8401 for (i = 0; i < bb->count ; i++) { 8404 for (i = 0; i < bb->count ; i++) {
8402 if (!BB_ACK(p[i])) { 8405 if (!BB_ACK(p[i])) {
8403 sector_t start = BB_OFFSET(p[i]); 8406 sector_t start = BB_OFFSET(p[i]);
8404 int len = BB_LEN(p[i]); 8407 int len = BB_LEN(p[i]);
8405 p[i] = BB_MAKE(start, len, 1); 8408 p[i] = BB_MAKE(start, len, 1);
8406 } 8409 }
8407 } 8410 }
8408 bb->unacked_exist = 0; 8411 bb->unacked_exist = 0;
8409 } 8412 }
8410 write_sequnlock_irq(&bb->lock); 8413 write_sequnlock_irq(&bb->lock);
8411 } 8414 }
8412 EXPORT_SYMBOL_GPL(md_ack_all_badblocks); 8415 EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
8413 8416
8414 /* sysfs access to bad-blocks list. 8417 /* sysfs access to bad-blocks list.
8415 * We present two files. 8418 * We present two files.
8416 * 'bad-blocks' lists sector numbers and lengths of ranges that 8419 * 'bad-blocks' lists sector numbers and lengths of ranges that
8417 * are recorded as bad. The list is truncated to fit within 8420 * are recorded as bad. The list is truncated to fit within
8418 * the one-page limit of sysfs. 8421 * the one-page limit of sysfs.
8419 * Writing "sector length" to this file adds an acknowledged 8422 * Writing "sector length" to this file adds an acknowledged
8420 * bad block to the list. 8423 * bad block to the list.
8421 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet 8424 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
8422 * been acknowledged. Writing to this file adds bad blocks 8425 * been acknowledged. Writing to this file adds bad blocks
8423 * without acknowledging them. This is largely for testing. 8426 * without acknowledging them. This is largely for testing.
8424 */ 8427 */
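/* Editor's note: illustrative only.  Reading either file yields one
 * "sector length" pair per line in the format produced by badblocks_show(),
 * e.g.
 *	2097152 8
 *	4194304 16
 * and writing a line such as "4096 16" records 16 sectors starting at
 * sector 4096; with DO_DEBUG defined, a leading '-' clears the range
 * instead (see badblocks_store() below).
 */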
8425 8428
8426 static ssize_t 8429 static ssize_t
8427 badblocks_show(struct badblocks *bb, char *page, int unack) 8430 badblocks_show(struct badblocks *bb, char *page, int unack)
8428 { 8431 {
8429 size_t len; 8432 size_t len;
8430 int i; 8433 int i;
8431 u64 *p = bb->page; 8434 u64 *p = bb->page;
8432 unsigned seq; 8435 unsigned seq;
8433 8436
8434 if (bb->shift < 0) 8437 if (bb->shift < 0)
8435 return 0; 8438 return 0;
8436 8439
8437 retry: 8440 retry:
8438 seq = read_seqbegin(&bb->lock); 8441 seq = read_seqbegin(&bb->lock);
8439 8442
8440 len = 0; 8443 len = 0;
8441 i = 0; 8444 i = 0;
8442 8445
8443 while (len < PAGE_SIZE && i < bb->count) { 8446 while (len < PAGE_SIZE && i < bb->count) {
8444 sector_t s = BB_OFFSET(p[i]); 8447 sector_t s = BB_OFFSET(p[i]);
8445 unsigned int length = BB_LEN(p[i]); 8448 unsigned int length = BB_LEN(p[i]);
8446 int ack = BB_ACK(p[i]); 8449 int ack = BB_ACK(p[i]);
8447 i++; 8450 i++;
8448 8451
8449 if (unack && ack) 8452 if (unack && ack)
8450 continue; 8453 continue;
8451 8454
8452 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", 8455 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
8453 (unsigned long long)s << bb->shift, 8456 (unsigned long long)s << bb->shift,
8454 length << bb->shift); 8457 length << bb->shift);
8455 } 8458 }
8456 if (unack && len == 0) 8459 if (unack && len == 0)
8457 bb->unacked_exist = 0; 8460 bb->unacked_exist = 0;
8458 8461
8459 if (read_seqretry(&bb->lock, seq)) 8462 if (read_seqretry(&bb->lock, seq))
8460 goto retry; 8463 goto retry;
8461 8464
8462 return len; 8465 return len;
8463 } 8466 }
8464 8467
8465 #define DO_DEBUG 1 8468 #define DO_DEBUG 1
8466 8469
8467 static ssize_t 8470 static ssize_t
8468 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) 8471 badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8469 { 8472 {
8470 unsigned long long sector; 8473 unsigned long long sector;
8471 int length; 8474 int length;
8472 char newline; 8475 char newline;
8473 #ifdef DO_DEBUG 8476 #ifdef DO_DEBUG
8474 /* Allow clearing via sysfs *only* for testing/debugging. 8477 /* Allow clearing via sysfs *only* for testing/debugging.
8475 * Normally only a successful write may clear a badblock 8478 * Normally only a successful write may clear a badblock
8476 */ 8479 */
8477 int clear = 0; 8480 int clear = 0;
8478 if (page[0] == '-') { 8481 if (page[0] == '-') {
8479 clear = 1; 8482 clear = 1;
8480 page++; 8483 page++;
8481 } 8484 }
8482 #endif /* DO_DEBUG */ 8485 #endif /* DO_DEBUG */
8483 8486
8484 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { 8487 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8485 case 3: 8488 case 3:
8486 if (newline != '\n') 8489 if (newline != '\n')
8487 return -EINVAL; 8490 return -EINVAL;
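		/* fall through - still need to validate the length */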
8488 case 2: 8491 case 2:
8489 if (length <= 0) 8492 if (length <= 0)
8490 return -EINVAL; 8493 return -EINVAL;
8491 break; 8494 break;
8492 default: 8495 default:
8493 return -EINVAL; 8496 return -EINVAL;
8494 } 8497 }
8495 8498
8496 #ifdef DO_DEBUG 8499 #ifdef DO_DEBUG
8497 if (clear) { 8500 if (clear) {
8498 md_clear_badblocks(bb, sector, length); 8501 md_clear_badblocks(bb, sector, length);
8499 return len; 8502 return len;
8500 } 8503 }
8501 #endif /* DO_DEBUG */ 8504 #endif /* DO_DEBUG */
8502 if (md_set_badblocks(bb, sector, length, !unack)) 8505 if (md_set_badblocks(bb, sector, length, !unack))
8503 return len; 8506 return len;
8504 else 8507 else
8505 return -ENOSPC; 8508 return -ENOSPC;
8506 } 8509 }
8507 8510
8508 static int md_notify_reboot(struct notifier_block *this, 8511 static int md_notify_reboot(struct notifier_block *this,
8509 unsigned long code, void *x) 8512 unsigned long code, void *x)
8510 { 8513 {
8511 struct list_head *tmp; 8514 struct list_head *tmp;
8512 struct mddev *mddev; 8515 struct mddev *mddev;
8513 int need_delay = 0; 8516 int need_delay = 0;
8514 8517
8515 for_each_mddev(mddev, tmp) { 8518 for_each_mddev(mddev, tmp) {
8516 if (mddev_trylock(mddev)) { 8519 if (mddev_trylock(mddev)) {
8517 if (mddev->pers) 8520 if (mddev->pers)
8518 __md_stop_writes(mddev); 8521 __md_stop_writes(mddev);
8519 if (mddev->persistent) 8522 if (mddev->persistent)
8520 mddev->safemode = 2; 8523 mddev->safemode = 2;
8521 mddev_unlock(mddev); 8524 mddev_unlock(mddev);
8522 } 8525 }
8523 need_delay = 1; 8526 need_delay = 1;
8524 } 8527 }
8525 /* 8528 /*
8526 * certain more exotic SCSI devices are known to be 8529 * certain more exotic SCSI devices are known to be
8527 * volatile wrt too early system reboots. While the 8530 * volatile wrt too early system reboots. While the
8528 * right place to handle this issue is the given 8531 * right place to handle this issue is the given
8529 * driver, we do want to have a safe RAID driver ... 8532 * driver, we do want to have a safe RAID driver ...
8530 */ 8533 */
8531 if (need_delay) 8534 if (need_delay)
8532 mdelay(1000*1); 8535 mdelay(1000*1);
8533 8536
8534 return NOTIFY_DONE; 8537 return NOTIFY_DONE;
8535 } 8538 }
8536 8539
8537 static struct notifier_block md_notifier = { 8540 static struct notifier_block md_notifier = {
8538 .notifier_call = md_notify_reboot, 8541 .notifier_call = md_notify_reboot,
8539 .next = NULL, 8542 .next = NULL,
8540 .priority = INT_MAX, /* before any real devices */ 8543 .priority = INT_MAX, /* before any real devices */
8541 }; 8544 };
8542 8545
8543 static void md_geninit(void) 8546 static void md_geninit(void)
8544 { 8547 {
8545 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t)); 8548 pr_debug("md: sizeof(mdp_super_t) = %d\n", (int)sizeof(mdp_super_t));
8546 8549
8547 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops); 8550 proc_create("mdstat", S_IRUGO, NULL, &md_seq_fops);
8548 } 8551 }
8549 8552
8550 static int __init md_init(void) 8553 static int __init md_init(void)
8551 { 8554 {
8552 int ret = -ENOMEM; 8555 int ret = -ENOMEM;
8553 8556
8554 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0); 8557 md_wq = alloc_workqueue("md", WQ_MEM_RECLAIM, 0);
8555 if (!md_wq) 8558 if (!md_wq)
8556 goto err_wq; 8559 goto err_wq;
8557 8560
8558 md_misc_wq = alloc_workqueue("md_misc", 0, 0); 8561 md_misc_wq = alloc_workqueue("md_misc", 0, 0);
8559 if (!md_misc_wq) 8562 if (!md_misc_wq)
8560 goto err_misc_wq; 8563 goto err_misc_wq;
8561 8564
8562 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0) 8565 if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
8563 goto err_md; 8566 goto err_md;
8564 8567
8565 if ((ret = register_blkdev(0, "mdp")) < 0) 8568 if ((ret = register_blkdev(0, "mdp")) < 0)
8566 goto err_mdp; 8569 goto err_mdp;
8567 mdp_major = ret; 8570 mdp_major = ret;
8568 8571
8569 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE, 8572 blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
8570 md_probe, NULL, NULL); 8573 md_probe, NULL, NULL);
8571 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE, 8574 blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
8572 md_probe, NULL, NULL); 8575 md_probe, NULL, NULL);
8573 8576
8574 register_reboot_notifier(&md_notifier); 8577 register_reboot_notifier(&md_notifier);
8575 raid_table_header = register_sysctl_table(raid_root_table); 8578 raid_table_header = register_sysctl_table(raid_root_table);
8576 8579
8577 md_geninit(); 8580 md_geninit();
8578 return 0; 8581 return 0;
8579 8582
8580 err_mdp: 8583 err_mdp:
8581 unregister_blkdev(MD_MAJOR, "md"); 8584 unregister_blkdev(MD_MAJOR, "md");
8582 err_md: 8585 err_md:
8583 destroy_workqueue(md_misc_wq); 8586 destroy_workqueue(md_misc_wq);
8584 err_misc_wq: 8587 err_misc_wq:
8585 destroy_workqueue(md_wq); 8588 destroy_workqueue(md_wq);
8586 err_wq: 8589 err_wq:
8587 return ret; 8590 return ret;
8588 } 8591 }
8589 8592
8590 #ifndef MODULE 8593 #ifndef MODULE
8591 8594
8592 /* 8595 /*
8593 * Searches all registered partitions for autorun RAID arrays 8596 * Searches all registered partitions for autorun RAID arrays
8594 * at boot time. 8597 * at boot time.
8595 */ 8598 */
8596 8599
8597 static LIST_HEAD(all_detected_devices); 8600 static LIST_HEAD(all_detected_devices);
8598 struct detected_devices_node { 8601 struct detected_devices_node {
8599 struct list_head list; 8602 struct list_head list;
8600 dev_t dev; 8603 dev_t dev;
8601 }; 8604 };
8602 8605
8603 void md_autodetect_dev(dev_t dev) 8606 void md_autodetect_dev(dev_t dev)
8604 { 8607 {
8605 struct detected_devices_node *node_detected_dev; 8608 struct detected_devices_node *node_detected_dev;
8606 8609
8607 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL); 8610 node_detected_dev = kzalloc(sizeof(*node_detected_dev), GFP_KERNEL);
8608 if (node_detected_dev) { 8611 if (node_detected_dev) {
8609 node_detected_dev->dev = dev; 8612 node_detected_dev->dev = dev;
8610 list_add_tail(&node_detected_dev->list, &all_detected_devices); 8613 list_add_tail(&node_detected_dev->list, &all_detected_devices);
8611 } else { 8614 } else {
8612 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed" 8615 printk(KERN_CRIT "md: md_autodetect_dev: kzalloc failed"
8613 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev)); 8616 ", skipping dev(%d,%d)\n", MAJOR(dev), MINOR(dev));
8614 } 8617 }
8615 } 8618 }
8616 8619
8617 8620
8618 static void autostart_arrays(int part) 8621 static void autostart_arrays(int part)
8619 { 8622 {
8620 struct md_rdev *rdev; 8623 struct md_rdev *rdev;
8621 struct detected_devices_node *node_detected_dev; 8624 struct detected_devices_node *node_detected_dev;
8622 dev_t dev; 8625 dev_t dev;
8623 int i_scanned, i_passed; 8626 int i_scanned, i_passed;
8624 8627
8625 i_scanned = 0; 8628 i_scanned = 0;
8626 i_passed = 0; 8629 i_passed = 0;
8627 8630
8628 printk(KERN_INFO "md: Autodetecting RAID arrays.\n"); 8631 printk(KERN_INFO "md: Autodetecting RAID arrays.\n");
8629 8632
8630 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) { 8633 while (!list_empty(&all_detected_devices) && i_scanned < INT_MAX) {
8631 i_scanned++; 8634 i_scanned++;
8632 node_detected_dev = list_entry(all_detected_devices.next, 8635 node_detected_dev = list_entry(all_detected_devices.next,
8633 struct detected_devices_node, list); 8636 struct detected_devices_node, list);
8634 list_del(&node_detected_dev->list); 8637 list_del(&node_detected_dev->list);
8635 dev = node_detected_dev->dev; 8638 dev = node_detected_dev->dev;
8636 kfree(node_detected_dev); 8639 kfree(node_detected_dev);
8637 rdev = md_import_device(dev,0, 90); 8640 rdev = md_import_device(dev,0, 90);
8638 if (IS_ERR(rdev)) 8641 if (IS_ERR(rdev))
8639 continue; 8642 continue;
8640 8643
8641 if (test_bit(Faulty, &rdev->flags)) { 8644 if (test_bit(Faulty, &rdev->flags)) {
8642 MD_BUG(); 8645 MD_BUG();
8643 continue; 8646 continue;
8644 } 8647 }
8645 set_bit(AutoDetected, &rdev->flags); 8648 set_bit(AutoDetected, &rdev->flags);
8646 list_add(&rdev->same_set, &pending_raid_disks); 8649 list_add(&rdev->same_set, &pending_raid_disks);
8647 i_passed++; 8650 i_passed++;
8648 } 8651 }
8649 8652
8650 printk(KERN_INFO "md: Scanned %d and added %d devices.\n", 8653 printk(KERN_INFO "md: Scanned %d and added %d devices.\n",
8651 i_scanned, i_passed); 8654 i_scanned, i_passed);
8652 8655
8653 autorun_devices(part); 8656 autorun_devices(part);
8654 } 8657 }
8655 8658
8656 #endif /* !MODULE */ 8659 #endif /* !MODULE */
8657 8660
8658 static __exit void md_exit(void) 8661 static __exit void md_exit(void)
8659 { 8662 {
8660 struct mddev *mddev; 8663 struct mddev *mddev;
8661 struct list_head *tmp; 8664 struct list_head *tmp;
8662 int delay = 1; 8665 int delay = 1;
8663 8666
8664 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS); 8667 blk_unregister_region(MKDEV(MD_MAJOR,0), 1U << MINORBITS);
8665 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS); 8668 blk_unregister_region(MKDEV(mdp_major,0), 1U << MINORBITS);
8666 8669
8667 unregister_blkdev(MD_MAJOR,"md"); 8670 unregister_blkdev(MD_MAJOR,"md");
8668 unregister_blkdev(mdp_major, "mdp"); 8671 unregister_blkdev(mdp_major, "mdp");
8669 unregister_reboot_notifier(&md_notifier); 8672 unregister_reboot_notifier(&md_notifier);
8670 unregister_sysctl_table(raid_table_header); 8673 unregister_sysctl_table(raid_table_header);
8671 8674
8672 /* We cannot unload the modules while some process is 8675 /* We cannot unload the modules while some process is
8673 * waiting for us in select() or poll() - wake them up 8676 * waiting for us in select() or poll() - wake them up
8674 */ 8677 */
8675 md_unloading = 1; 8678 md_unloading = 1;
8676 while (waitqueue_active(&md_event_waiters)) { 8679 while (waitqueue_active(&md_event_waiters)) {
8677 /* not safe to leave yet */ 8680 /* not safe to leave yet */
8678 wake_up(&md_event_waiters); 8681 wake_up(&md_event_waiters);
8679 msleep(delay); 8682 msleep(delay);
8680 delay += delay; 8683 delay += delay;
8681 } 8684 }
8682 remove_proc_entry("mdstat", NULL); 8685 remove_proc_entry("mdstat", NULL);
8683 8686
8684 for_each_mddev(mddev, tmp) { 8687 for_each_mddev(mddev, tmp) {
8685 export_array(mddev); 8688 export_array(mddev);
8686 mddev->hold_active = 0; 8689 mddev->hold_active = 0;
8687 } 8690 }
8688 destroy_workqueue(md_misc_wq); 8691 destroy_workqueue(md_misc_wq);
8689 destroy_workqueue(md_wq); 8692 destroy_workqueue(md_wq);
8690 } 8693 }
8691 8694
8692 subsys_initcall(md_init); 8695 subsys_initcall(md_init);
8693 module_exit(md_exit) 8696 module_exit(md_exit)
8694 8697
8695 static int get_ro(char *buffer, struct kernel_param *kp) 8698 static int get_ro(char *buffer, struct kernel_param *kp)
8696 { 8699 {
8697 return sprintf(buffer, "%d", start_readonly); 8700 return sprintf(buffer, "%d", start_readonly);
8698 } 8701 }
8699 static int set_ro(const char *val, struct kernel_param *kp) 8702 static int set_ro(const char *val, struct kernel_param *kp)
8700 { 8703 {
8701 char *e; 8704 char *e;
8702 int num = simple_strtoul(val, &e, 10); 8705 int num = simple_strtoul(val, &e, 10);
8703 if (*val && (*e == '\0' || *e == '\n')) { 8706 if (*val && (*e == '\0' || *e == '\n')) {
8704 start_readonly = num; 8707 start_readonly = num;
8705 return 0; 8708 return 0;
8706 } 8709 }
8707 return -EINVAL; 8710 return -EINVAL;
8708 } 8711 }
8709 8712
8710 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR); 8713 module_param_call(start_ro, set_ro, get_ro, NULL, S_IRUSR|S_IWUSR);
8711 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR); 8714 module_param(start_dirty_degraded, int, S_IRUGO|S_IWUSR);
8712 8715
8713 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR); 8716 module_param_call(new_array, add_named_array, NULL, NULL, S_IWUSR);
8714 8717
8715 EXPORT_SYMBOL(register_md_personality); 8718 EXPORT_SYMBOL(register_md_personality);
8716 EXPORT_SYMBOL(unregister_md_personality); 8719 EXPORT_SYMBOL(unregister_md_personality);
8717 EXPORT_SYMBOL(md_error); 8720 EXPORT_SYMBOL(md_error);
8718 EXPORT_SYMBOL(md_done_sync); 8721 EXPORT_SYMBOL(md_done_sync);
8719 EXPORT_SYMBOL(md_write_start); 8722 EXPORT_SYMBOL(md_write_start);
8720 EXPORT_SYMBOL(md_write_end); 8723 EXPORT_SYMBOL(md_write_end);
8721 EXPORT_SYMBOL(md_register_thread); 8724 EXPORT_SYMBOL(md_register_thread);
8722 EXPORT_SYMBOL(md_unregister_thread); 8725 EXPORT_SYMBOL(md_unregister_thread);
8723 EXPORT_SYMBOL(md_wakeup_thread); 8726 EXPORT_SYMBOL(md_wakeup_thread);
8724 EXPORT_SYMBOL(md_check_recovery); 8727 EXPORT_SYMBOL(md_check_recovery);
8725 EXPORT_SYMBOL(md_reap_sync_thread); 8728 EXPORT_SYMBOL(md_reap_sync_thread);
8726 MODULE_LICENSE("GPL"); 8729 MODULE_LICENSE("GPL");
8727 MODULE_DESCRIPTION("MD RAID framework"); 8730 MODULE_DESCRIPTION("MD RAID framework");
8728 MODULE_ALIAS("md"); 8731 MODULE_ALIAS("md");
8729 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR); 8732 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
8730 8733