Commit 5e3604168b5994e591db2dece06867a035cd5146

Authored by Jonathan E Brassow
Committed by Greg Kroah-Hartman
1 parent 73b249403d

dm raid: set MD_CHANGE_DEVS when rebuilding

commit 3aa3b2b2b1edb813dc5342d0108befc39541542d upstream.

The 'rebuild' parameter is used to rebuild individual devices in an
array (e.g. resynchronize a RAID1 device or recalculate a parity device
in higher RAID levels).  The MD_CHANGE_DEVS flag must be set when this
parameter is given in order to write out the superblocks and make the
change take immediate effect.  The code that handles new devices in
super_load already sets MD_CHANGE_DEVS and 'FirstUse'.  (The 'FirstUse'
flag was being set as a special case for rebuilds in
super_init_validation.)

Add a condition for rebuilds in super_load to take care of both flags
without the special case in 'super_init_validation'.

Signed-off-by: Jonathan Brassow <jbrassow@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
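
In short, super_load() now treats a device explicitly marked for rebuild
(!In_sync with recovery_offset == 0) the same way as a brand-new device with
no matching superblock: it writes a fresh superblock, sets 'FirstUse', and
sets MD_CHANGE_DEVS so the superblocks are flushed immediately.  The special
case in super_init_validation() is reduced to a DMINFO() notice.  A condensed
view of the new check, paraphrased from the hunk below (blank lines dropped):

	/*
	 * Two cases that we want to write new superblocks and rebuild:
	 * 1) New device (no matching magic number)
	 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
	 */
	if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
	    (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
		super_sync(rdev->mddev, rdev);
		set_bit(FirstUse, &rdev->flags);
		/* Force writing of superblocks to disk */
		set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
		/* Any superblock is better than none, choose that if given */
		return refdev ? 0 : 1;
	}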

Showing 1 changed file with 10 additions and 6 deletions (inline diff)

drivers/md/dm-raid.c
1 /* 1 /*
2 * Copyright (C) 2010-2011 Neil Brown 2 * Copyright (C) 2010-2011 Neil Brown
3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2010-2011 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include <linux/slab.h> 8 #include <linux/slab.h>
9 #include <linux/module.h> 9 #include <linux/module.h>
10 10
11 #include "md.h" 11 #include "md.h"
12 #include "raid1.h" 12 #include "raid1.h"
13 #include "raid5.h" 13 #include "raid5.h"
14 #include "bitmap.h" 14 #include "bitmap.h"
15 15
16 #include <linux/device-mapper.h> 16 #include <linux/device-mapper.h>
17 17
18 #define DM_MSG_PREFIX "raid" 18 #define DM_MSG_PREFIX "raid"
19 19
20 /* 20 /*
21 * The following flags are used by dm-raid.c to set up the array state. 21 * The following flags are used by dm-raid.c to set up the array state.
22 * They must be cleared before md_run is called. 22 * They must be cleared before md_run is called.
23 */ 23 */
24 #define FirstUse 10 /* rdev flag */ 24 #define FirstUse 10 /* rdev flag */
25 25
26 struct raid_dev { 26 struct raid_dev {
27 /* 27 /*
28 * Two DM devices, one to hold metadata and one to hold the 28 * Two DM devices, one to hold metadata and one to hold the
29 * actual data/parity. The reason for this is to not confuse 29 * actual data/parity. The reason for this is to not confuse
30 * ti->len and give more flexibility in altering size and 30 * ti->len and give more flexibility in altering size and
31 * characteristics. 31 * characteristics.
32 * 32 *
33 * While it is possible for this device to be associated 33 * While it is possible for this device to be associated
34 * with a different physical device than the data_dev, it 34 * with a different physical device than the data_dev, it
35 * is intended for it to be the same. 35 * is intended for it to be the same.
36 * |--------- Physical Device ---------| 36 * |--------- Physical Device ---------|
37 * |- meta_dev -|------ data_dev ------| 37 * |- meta_dev -|------ data_dev ------|
38 */ 38 */
39 struct dm_dev *meta_dev; 39 struct dm_dev *meta_dev;
40 struct dm_dev *data_dev; 40 struct dm_dev *data_dev;
41 struct md_rdev rdev; 41 struct md_rdev rdev;
42 }; 42 };
43 43
44 /* 44 /*
45 * Flags for rs->print_flags field. 45 * Flags for rs->print_flags field.
46 */ 46 */
47 #define DMPF_SYNC 0x1 47 #define DMPF_SYNC 0x1
48 #define DMPF_NOSYNC 0x2 48 #define DMPF_NOSYNC 0x2
49 #define DMPF_REBUILD 0x4 49 #define DMPF_REBUILD 0x4
50 #define DMPF_DAEMON_SLEEP 0x8 50 #define DMPF_DAEMON_SLEEP 0x8
51 #define DMPF_MIN_RECOVERY_RATE 0x10 51 #define DMPF_MIN_RECOVERY_RATE 0x10
52 #define DMPF_MAX_RECOVERY_RATE 0x20 52 #define DMPF_MAX_RECOVERY_RATE 0x20
53 #define DMPF_MAX_WRITE_BEHIND 0x40 53 #define DMPF_MAX_WRITE_BEHIND 0x40
54 #define DMPF_STRIPE_CACHE 0x80 54 #define DMPF_STRIPE_CACHE 0x80
55 #define DMPF_REGION_SIZE 0X100 55 #define DMPF_REGION_SIZE 0X100
56 struct raid_set { 56 struct raid_set {
57 struct dm_target *ti; 57 struct dm_target *ti;
58 58
59 uint64_t print_flags; 59 uint64_t print_flags;
60 60
61 struct mddev md; 61 struct mddev md;
62 struct raid_type *raid_type; 62 struct raid_type *raid_type;
63 struct dm_target_callbacks callbacks; 63 struct dm_target_callbacks callbacks;
64 64
65 struct raid_dev dev[0]; 65 struct raid_dev dev[0];
66 }; 66 };
67 67
68 /* Supported raid types and properties. */ 68 /* Supported raid types and properties. */
69 static struct raid_type { 69 static struct raid_type {
70 const char *name; /* RAID algorithm. */ 70 const char *name; /* RAID algorithm. */
71 const char *descr; /* Descriptor text for logging. */ 71 const char *descr; /* Descriptor text for logging. */
72 const unsigned parity_devs; /* # of parity devices. */ 72 const unsigned parity_devs; /* # of parity devices. */
73 const unsigned minimal_devs; /* minimal # of devices in set. */ 73 const unsigned minimal_devs; /* minimal # of devices in set. */
74 const unsigned level; /* RAID level. */ 74 const unsigned level; /* RAID level. */
75 const unsigned algorithm; /* RAID algorithm. */ 75 const unsigned algorithm; /* RAID algorithm. */
76 } raid_types[] = { 76 } raid_types[] = {
77 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */}, 77 {"raid1", "RAID1 (mirroring)", 0, 2, 1, 0 /* NONE */},
78 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0}, 78 {"raid4", "RAID4 (dedicated parity disk)", 1, 2, 5, ALGORITHM_PARITY_0},
79 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC}, 79 {"raid5_la", "RAID5 (left asymmetric)", 1, 2, 5, ALGORITHM_LEFT_ASYMMETRIC},
80 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC}, 80 {"raid5_ra", "RAID5 (right asymmetric)", 1, 2, 5, ALGORITHM_RIGHT_ASYMMETRIC},
81 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC}, 81 {"raid5_ls", "RAID5 (left symmetric)", 1, 2, 5, ALGORITHM_LEFT_SYMMETRIC},
82 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC}, 82 {"raid5_rs", "RAID5 (right symmetric)", 1, 2, 5, ALGORITHM_RIGHT_SYMMETRIC},
83 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART}, 83 {"raid6_zr", "RAID6 (zero restart)", 2, 4, 6, ALGORITHM_ROTATING_ZERO_RESTART},
84 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART}, 84 {"raid6_nr", "RAID6 (N restart)", 2, 4, 6, ALGORITHM_ROTATING_N_RESTART},
85 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE} 85 {"raid6_nc", "RAID6 (N continue)", 2, 4, 6, ALGORITHM_ROTATING_N_CONTINUE}
86 }; 86 };
87 87
88 static struct raid_type *get_raid_type(char *name) 88 static struct raid_type *get_raid_type(char *name)
89 { 89 {
90 int i; 90 int i;
91 91
92 for (i = 0; i < ARRAY_SIZE(raid_types); i++) 92 for (i = 0; i < ARRAY_SIZE(raid_types); i++)
93 if (!strcmp(raid_types[i].name, name)) 93 if (!strcmp(raid_types[i].name, name))
94 return &raid_types[i]; 94 return &raid_types[i];
95 95
96 return NULL; 96 return NULL;
97 } 97 }
98 98
99 static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs) 99 static struct raid_set *context_alloc(struct dm_target *ti, struct raid_type *raid_type, unsigned raid_devs)
100 { 100 {
101 unsigned i; 101 unsigned i;
102 struct raid_set *rs; 102 struct raid_set *rs;
103 sector_t sectors_per_dev; 103 sector_t sectors_per_dev;
104 104
105 if (raid_devs <= raid_type->parity_devs) { 105 if (raid_devs <= raid_type->parity_devs) {
106 ti->error = "Insufficient number of devices"; 106 ti->error = "Insufficient number of devices";
107 return ERR_PTR(-EINVAL); 107 return ERR_PTR(-EINVAL);
108 } 108 }
109 109
110 sectors_per_dev = ti->len; 110 sectors_per_dev = ti->len;
111 if ((raid_type->level > 1) && 111 if ((raid_type->level > 1) &&
112 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) { 112 sector_div(sectors_per_dev, (raid_devs - raid_type->parity_devs))) {
113 ti->error = "Target length not divisible by number of data devices"; 113 ti->error = "Target length not divisible by number of data devices";
114 return ERR_PTR(-EINVAL); 114 return ERR_PTR(-EINVAL);
115 } 115 }
116 116
117 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL); 117 rs = kzalloc(sizeof(*rs) + raid_devs * sizeof(rs->dev[0]), GFP_KERNEL);
118 if (!rs) { 118 if (!rs) {
119 ti->error = "Cannot allocate raid context"; 119 ti->error = "Cannot allocate raid context";
120 return ERR_PTR(-ENOMEM); 120 return ERR_PTR(-ENOMEM);
121 } 121 }
122 122
123 mddev_init(&rs->md); 123 mddev_init(&rs->md);
124 124
125 rs->ti = ti; 125 rs->ti = ti;
126 rs->raid_type = raid_type; 126 rs->raid_type = raid_type;
127 rs->md.raid_disks = raid_devs; 127 rs->md.raid_disks = raid_devs;
128 rs->md.level = raid_type->level; 128 rs->md.level = raid_type->level;
129 rs->md.new_level = rs->md.level; 129 rs->md.new_level = rs->md.level;
130 rs->md.dev_sectors = sectors_per_dev; 130 rs->md.dev_sectors = sectors_per_dev;
131 rs->md.layout = raid_type->algorithm; 131 rs->md.layout = raid_type->algorithm;
132 rs->md.new_layout = rs->md.layout; 132 rs->md.new_layout = rs->md.layout;
133 rs->md.delta_disks = 0; 133 rs->md.delta_disks = 0;
134 rs->md.recovery_cp = 0; 134 rs->md.recovery_cp = 0;
135 135
136 for (i = 0; i < raid_devs; i++) 136 for (i = 0; i < raid_devs; i++)
137 md_rdev_init(&rs->dev[i].rdev); 137 md_rdev_init(&rs->dev[i].rdev);
138 138
139 /* 139 /*
140 * Remaining items to be initialized by further RAID params: 140 * Remaining items to be initialized by further RAID params:
141 * rs->md.persistent 141 * rs->md.persistent
142 * rs->md.external 142 * rs->md.external
143 * rs->md.chunk_sectors 143 * rs->md.chunk_sectors
144 * rs->md.new_chunk_sectors 144 * rs->md.new_chunk_sectors
145 */ 145 */
146 146
147 return rs; 147 return rs;
148 } 148 }
149 149
150 static void context_free(struct raid_set *rs) 150 static void context_free(struct raid_set *rs)
151 { 151 {
152 int i; 152 int i;
153 153
154 for (i = 0; i < rs->md.raid_disks; i++) { 154 for (i = 0; i < rs->md.raid_disks; i++) {
155 if (rs->dev[i].meta_dev) 155 if (rs->dev[i].meta_dev)
156 dm_put_device(rs->ti, rs->dev[i].meta_dev); 156 dm_put_device(rs->ti, rs->dev[i].meta_dev);
157 if (rs->dev[i].rdev.sb_page) 157 if (rs->dev[i].rdev.sb_page)
158 put_page(rs->dev[i].rdev.sb_page); 158 put_page(rs->dev[i].rdev.sb_page);
159 rs->dev[i].rdev.sb_page = NULL; 159 rs->dev[i].rdev.sb_page = NULL;
160 rs->dev[i].rdev.sb_loaded = 0; 160 rs->dev[i].rdev.sb_loaded = 0;
161 if (rs->dev[i].data_dev) 161 if (rs->dev[i].data_dev)
162 dm_put_device(rs->ti, rs->dev[i].data_dev); 162 dm_put_device(rs->ti, rs->dev[i].data_dev);
163 } 163 }
164 164
165 kfree(rs); 165 kfree(rs);
166 } 166 }
167 167
168 /* 168 /*
169 * For every device we have two words 169 * For every device we have two words
170 * <meta_dev>: meta device name or '-' if missing 170 * <meta_dev>: meta device name or '-' if missing
171 * <data_dev>: data device name or '-' if missing 171 * <data_dev>: data device name or '-' if missing
172 * 172 *
173 * The following are permitted: 173 * The following are permitted:
174 * - - 174 * - -
175 * - <data_dev> 175 * - <data_dev>
176 * <meta_dev> <data_dev> 176 * <meta_dev> <data_dev>
177 * 177 *
178 * The following is not allowed: 178 * The following is not allowed:
179 * <meta_dev> - 179 * <meta_dev> -
180 * 180 *
181 * This code parses those words. If there is a failure, 181 * This code parses those words. If there is a failure,
182 * the caller must use context_free to unwind the operations. 182 * the caller must use context_free to unwind the operations.
183 */ 183 */
184 static int dev_parms(struct raid_set *rs, char **argv) 184 static int dev_parms(struct raid_set *rs, char **argv)
185 { 185 {
186 int i; 186 int i;
187 int rebuild = 0; 187 int rebuild = 0;
188 int metadata_available = 0; 188 int metadata_available = 0;
189 int ret = 0; 189 int ret = 0;
190 190
191 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) { 191 for (i = 0; i < rs->md.raid_disks; i++, argv += 2) {
192 rs->dev[i].rdev.raid_disk = i; 192 rs->dev[i].rdev.raid_disk = i;
193 193
194 rs->dev[i].meta_dev = NULL; 194 rs->dev[i].meta_dev = NULL;
195 rs->dev[i].data_dev = NULL; 195 rs->dev[i].data_dev = NULL;
196 196
197 /* 197 /*
198 * There are no offsets, since there is a separate device 198 * There are no offsets, since there is a separate device
199 * for data and metadata. 199 * for data and metadata.
200 */ 200 */
201 rs->dev[i].rdev.data_offset = 0; 201 rs->dev[i].rdev.data_offset = 0;
202 rs->dev[i].rdev.mddev = &rs->md; 202 rs->dev[i].rdev.mddev = &rs->md;
203 203
204 if (strcmp(argv[0], "-")) { 204 if (strcmp(argv[0], "-")) {
205 ret = dm_get_device(rs->ti, argv[0], 205 ret = dm_get_device(rs->ti, argv[0],
206 dm_table_get_mode(rs->ti->table), 206 dm_table_get_mode(rs->ti->table),
207 &rs->dev[i].meta_dev); 207 &rs->dev[i].meta_dev);
208 rs->ti->error = "RAID metadata device lookup failure"; 208 rs->ti->error = "RAID metadata device lookup failure";
209 if (ret) 209 if (ret)
210 return ret; 210 return ret;
211 211
212 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL); 212 rs->dev[i].rdev.sb_page = alloc_page(GFP_KERNEL);
213 if (!rs->dev[i].rdev.sb_page) 213 if (!rs->dev[i].rdev.sb_page)
214 return -ENOMEM; 214 return -ENOMEM;
215 } 215 }
216 216
217 if (!strcmp(argv[1], "-")) { 217 if (!strcmp(argv[1], "-")) {
218 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) && 218 if (!test_bit(In_sync, &rs->dev[i].rdev.flags) &&
219 (!rs->dev[i].rdev.recovery_offset)) { 219 (!rs->dev[i].rdev.recovery_offset)) {
220 rs->ti->error = "Drive designated for rebuild not specified"; 220 rs->ti->error = "Drive designated for rebuild not specified";
221 return -EINVAL; 221 return -EINVAL;
222 } 222 }
223 223
224 rs->ti->error = "No data device supplied with metadata device"; 224 rs->ti->error = "No data device supplied with metadata device";
225 if (rs->dev[i].meta_dev) 225 if (rs->dev[i].meta_dev)
226 return -EINVAL; 226 return -EINVAL;
227 227
228 continue; 228 continue;
229 } 229 }
230 230
231 ret = dm_get_device(rs->ti, argv[1], 231 ret = dm_get_device(rs->ti, argv[1],
232 dm_table_get_mode(rs->ti->table), 232 dm_table_get_mode(rs->ti->table),
233 &rs->dev[i].data_dev); 233 &rs->dev[i].data_dev);
234 if (ret) { 234 if (ret) {
235 rs->ti->error = "RAID device lookup failure"; 235 rs->ti->error = "RAID device lookup failure";
236 return ret; 236 return ret;
237 } 237 }
238 238
239 if (rs->dev[i].meta_dev) { 239 if (rs->dev[i].meta_dev) {
240 metadata_available = 1; 240 metadata_available = 1;
241 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev; 241 rs->dev[i].rdev.meta_bdev = rs->dev[i].meta_dev->bdev;
242 } 242 }
243 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev; 243 rs->dev[i].rdev.bdev = rs->dev[i].data_dev->bdev;
244 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks); 244 list_add(&rs->dev[i].rdev.same_set, &rs->md.disks);
245 if (!test_bit(In_sync, &rs->dev[i].rdev.flags)) 245 if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
246 rebuild++; 246 rebuild++;
247 } 247 }
248 248
249 if (metadata_available) { 249 if (metadata_available) {
250 rs->md.external = 0; 250 rs->md.external = 0;
251 rs->md.persistent = 1; 251 rs->md.persistent = 1;
252 rs->md.major_version = 2; 252 rs->md.major_version = 2;
253 } else if (rebuild && !rs->md.recovery_cp) { 253 } else if (rebuild && !rs->md.recovery_cp) {
254 /* 254 /*
255 * Without metadata, we will not be able to tell if the array 255 * Without metadata, we will not be able to tell if the array
256 * is in-sync or not - we must assume it is not. Therefore, 256 * is in-sync or not - we must assume it is not. Therefore,
257 * it is impossible to rebuild a drive. 257 * it is impossible to rebuild a drive.
258 * 258 *
259 * Even if there is metadata, the on-disk information may 259 * Even if there is metadata, the on-disk information may
260 * indicate that the array is not in-sync and it will then 260 * indicate that the array is not in-sync and it will then
261 * fail at that time. 261 * fail at that time.
262 * 262 *
263 * User could specify 'nosync' option if desperate. 263 * User could specify 'nosync' option if desperate.
264 */ 264 */
265 DMERR("Unable to rebuild drive while array is not in-sync"); 265 DMERR("Unable to rebuild drive while array is not in-sync");
266 rs->ti->error = "RAID device lookup failure"; 266 rs->ti->error = "RAID device lookup failure";
267 return -EINVAL; 267 return -EINVAL;
268 } 268 }
269 269
270 return 0; 270 return 0;
271 } 271 }
272 272
273 /* 273 /*
274 * validate_region_size 274 * validate_region_size
275 * @rs 275 * @rs
276 * @region_size: region size in sectors. If 0, pick a size (4MiB default). 276 * @region_size: region size in sectors. If 0, pick a size (4MiB default).
277 * 277 *
278 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size'). 278 * Set rs->md.bitmap_info.chunksize (which really refers to 'region size').
279 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap. 279 * Ensure that (ti->len/region_size < 2^21) - required by MD bitmap.
280 * 280 *
281 * Returns: 0 on success, -EINVAL on failure. 281 * Returns: 0 on success, -EINVAL on failure.
282 */ 282 */
283 static int validate_region_size(struct raid_set *rs, unsigned long region_size) 283 static int validate_region_size(struct raid_set *rs, unsigned long region_size)
284 { 284 {
285 unsigned long min_region_size = rs->ti->len / (1 << 21); 285 unsigned long min_region_size = rs->ti->len / (1 << 21);
286 286
287 if (!region_size) { 287 if (!region_size) {
288 /* 288 /*
289 * Choose a reasonable default. All figures in sectors. 289 * Choose a reasonable default. All figures in sectors.
290 */ 290 */
291 if (min_region_size > (1 << 13)) { 291 if (min_region_size > (1 << 13)) {
292 DMINFO("Choosing default region size of %lu sectors", 292 DMINFO("Choosing default region size of %lu sectors",
293 region_size); 293 region_size);
294 region_size = min_region_size; 294 region_size = min_region_size;
295 } else { 295 } else {
296 DMINFO("Choosing default region size of 4MiB"); 296 DMINFO("Choosing default region size of 4MiB");
297 region_size = 1 << 13; /* sectors */ 297 region_size = 1 << 13; /* sectors */
298 } 298 }
299 } else { 299 } else {
300 /* 300 /*
301 * Validate user-supplied value. 301 * Validate user-supplied value.
302 */ 302 */
303 if (region_size > rs->ti->len) { 303 if (region_size > rs->ti->len) {
304 rs->ti->error = "Supplied region size is too large"; 304 rs->ti->error = "Supplied region size is too large";
305 return -EINVAL; 305 return -EINVAL;
306 } 306 }
307 307
308 if (region_size < min_region_size) { 308 if (region_size < min_region_size) {
309 DMERR("Supplied region_size (%lu sectors) below minimum (%lu)", 309 DMERR("Supplied region_size (%lu sectors) below minimum (%lu)",
310 region_size, min_region_size); 310 region_size, min_region_size);
311 rs->ti->error = "Supplied region size is too small"; 311 rs->ti->error = "Supplied region size is too small";
312 return -EINVAL; 312 return -EINVAL;
313 } 313 }
314 314
315 if (!is_power_of_2(region_size)) { 315 if (!is_power_of_2(region_size)) {
316 rs->ti->error = "Region size is not a power of 2"; 316 rs->ti->error = "Region size is not a power of 2";
317 return -EINVAL; 317 return -EINVAL;
318 } 318 }
319 319
320 if (region_size < rs->md.chunk_sectors) { 320 if (region_size < rs->md.chunk_sectors) {
321 rs->ti->error = "Region size is smaller than the chunk size"; 321 rs->ti->error = "Region size is smaller than the chunk size";
322 return -EINVAL; 322 return -EINVAL;
323 } 323 }
324 } 324 }
325 325
326 /* 326 /*
327 * Convert sectors to bytes. 327 * Convert sectors to bytes.
328 */ 328 */
329 rs->md.bitmap_info.chunksize = (region_size << 9); 329 rs->md.bitmap_info.chunksize = (region_size << 9);
330 330
331 return 0; 331 return 0;
332 } 332 }
333 333
334 /* 334 /*
335 * Possible arguments are... 335 * Possible arguments are...
336 * <chunk_size> [optional_args] 336 * <chunk_size> [optional_args]
337 * 337 *
338 * Argument definitions 338 * Argument definitions
339 * <chunk_size> The number of sectors per disk that 339 * <chunk_size> The number of sectors per disk that
340 * will form the "stripe" 340 * will form the "stripe"
341 * [[no]sync] Force or prevent recovery of the 341 * [[no]sync] Force or prevent recovery of the
342 * entire array 342 * entire array
343 * [rebuild <idx>] Rebuild the drive indicated by the index 343 * [rebuild <idx>] Rebuild the drive indicated by the index
344 * [daemon_sleep <ms>] Time between bitmap daemon work to 344 * [daemon_sleep <ms>] Time between bitmap daemon work to
345 * clear bits 345 * clear bits
346 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization 346 * [min_recovery_rate <kB/sec/disk>] Throttle RAID initialization
347 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization 347 * [max_recovery_rate <kB/sec/disk>] Throttle RAID initialization
348 * [write_mostly <idx>] Indicate a write mostly drive via index 348 * [write_mostly <idx>] Indicate a write mostly drive via index
349 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm) 349 * [max_write_behind <sectors>] See '-write-behind=' (man mdadm)
350 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs 350 * [stripe_cache <sectors>] Stripe cache size for higher RAIDs
351 * [region_size <sectors>] Defines granularity of bitmap 351 * [region_size <sectors>] Defines granularity of bitmap
352 */ 352 */
353 static int parse_raid_params(struct raid_set *rs, char **argv, 353 static int parse_raid_params(struct raid_set *rs, char **argv,
354 unsigned num_raid_params) 354 unsigned num_raid_params)
355 { 355 {
356 unsigned i, rebuild_cnt = 0; 356 unsigned i, rebuild_cnt = 0;
357 unsigned long value, region_size = 0; 357 unsigned long value, region_size = 0;
358 char *key; 358 char *key;
359 359
360 /* 360 /*
361 * First, parse the in-order required arguments 361 * First, parse the in-order required arguments
362 * "chunk_size" is the only argument of this type. 362 * "chunk_size" is the only argument of this type.
363 */ 363 */
364 if ((strict_strtoul(argv[0], 10, &value) < 0)) { 364 if ((strict_strtoul(argv[0], 10, &value) < 0)) {
365 rs->ti->error = "Bad chunk size"; 365 rs->ti->error = "Bad chunk size";
366 return -EINVAL; 366 return -EINVAL;
367 } else if (rs->raid_type->level == 1) { 367 } else if (rs->raid_type->level == 1) {
368 if (value) 368 if (value)
369 DMERR("Ignoring chunk size parameter for RAID 1"); 369 DMERR("Ignoring chunk size parameter for RAID 1");
370 value = 0; 370 value = 0;
371 } else if (!is_power_of_2(value)) { 371 } else if (!is_power_of_2(value)) {
372 rs->ti->error = "Chunk size must be a power of 2"; 372 rs->ti->error = "Chunk size must be a power of 2";
373 return -EINVAL; 373 return -EINVAL;
374 } else if (value < 8) { 374 } else if (value < 8) {
375 rs->ti->error = "Chunk size value is too small"; 375 rs->ti->error = "Chunk size value is too small";
376 return -EINVAL; 376 return -EINVAL;
377 } 377 }
378 378
379 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value; 379 rs->md.new_chunk_sectors = rs->md.chunk_sectors = value;
380 argv++; 380 argv++;
381 num_raid_params--; 381 num_raid_params--;
382 382
383 /* 383 /*
384 * We set each individual device as In_sync with a completed 384 * We set each individual device as In_sync with a completed
385 * 'recovery_offset'. If there has been a device failure or 385 * 'recovery_offset'. If there has been a device failure or
386 * replacement then one of the following cases applies: 386 * replacement then one of the following cases applies:
387 * 387 *
388 * 1) User specifies 'rebuild'. 388 * 1) User specifies 'rebuild'.
389 * - Device is reset when param is read. 389 * - Device is reset when param is read.
390 * 2) A new device is supplied. 390 * 2) A new device is supplied.
391 * - No matching superblock found, resets device. 391 * - No matching superblock found, resets device.
392 * 3) Device failure was transient and returns on reload. 392 * 3) Device failure was transient and returns on reload.
393 * - Failure noticed, resets device for bitmap replay. 393 * - Failure noticed, resets device for bitmap replay.
394 * 4) Device hadn't completed recovery after previous failure. 394 * 4) Device hadn't completed recovery after previous failure.
395 * - Superblock is read and overrides recovery_offset. 395 * - Superblock is read and overrides recovery_offset.
396 * 396 *
397 * What is found in the superblocks of the devices is always 397 * What is found in the superblocks of the devices is always
398 * authoritative, unless 'rebuild' or '[no]sync' was specified. 398 * authoritative, unless 'rebuild' or '[no]sync' was specified.
399 */ 399 */
400 for (i = 0; i < rs->md.raid_disks; i++) { 400 for (i = 0; i < rs->md.raid_disks; i++) {
401 set_bit(In_sync, &rs->dev[i].rdev.flags); 401 set_bit(In_sync, &rs->dev[i].rdev.flags);
402 rs->dev[i].rdev.recovery_offset = MaxSector; 402 rs->dev[i].rdev.recovery_offset = MaxSector;
403 } 403 }
404 404
405 /* 405 /*
406 * Second, parse the unordered optional arguments 406 * Second, parse the unordered optional arguments
407 */ 407 */
408 for (i = 0; i < num_raid_params; i++) { 408 for (i = 0; i < num_raid_params; i++) {
409 if (!strcasecmp(argv[i], "nosync")) { 409 if (!strcasecmp(argv[i], "nosync")) {
410 rs->md.recovery_cp = MaxSector; 410 rs->md.recovery_cp = MaxSector;
411 rs->print_flags |= DMPF_NOSYNC; 411 rs->print_flags |= DMPF_NOSYNC;
412 continue; 412 continue;
413 } 413 }
414 if (!strcasecmp(argv[i], "sync")) { 414 if (!strcasecmp(argv[i], "sync")) {
415 rs->md.recovery_cp = 0; 415 rs->md.recovery_cp = 0;
416 rs->print_flags |= DMPF_SYNC; 416 rs->print_flags |= DMPF_SYNC;
417 continue; 417 continue;
418 } 418 }
419 419
420 /* The rest of the optional arguments come in key/value pairs */ 420 /* The rest of the optional arguments come in key/value pairs */
421 if ((i + 1) >= num_raid_params) { 421 if ((i + 1) >= num_raid_params) {
422 rs->ti->error = "Wrong number of raid parameters given"; 422 rs->ti->error = "Wrong number of raid parameters given";
423 return -EINVAL; 423 return -EINVAL;
424 } 424 }
425 425
426 key = argv[i++]; 426 key = argv[i++];
427 if (strict_strtoul(argv[i], 10, &value) < 0) { 427 if (strict_strtoul(argv[i], 10, &value) < 0) {
428 rs->ti->error = "Bad numerical argument given in raid params"; 428 rs->ti->error = "Bad numerical argument given in raid params";
429 return -EINVAL; 429 return -EINVAL;
430 } 430 }
431 431
432 if (!strcasecmp(key, "rebuild")) { 432 if (!strcasecmp(key, "rebuild")) {
433 rebuild_cnt++; 433 rebuild_cnt++;
434 if (((rs->raid_type->level != 1) && 434 if (((rs->raid_type->level != 1) &&
435 (rebuild_cnt > rs->raid_type->parity_devs)) || 435 (rebuild_cnt > rs->raid_type->parity_devs)) ||
436 ((rs->raid_type->level == 1) && 436 ((rs->raid_type->level == 1) &&
437 (rebuild_cnt > (rs->md.raid_disks - 1)))) { 437 (rebuild_cnt > (rs->md.raid_disks - 1)))) {
438 rs->ti->error = "Too many rebuild devices specified for given RAID type"; 438 rs->ti->error = "Too many rebuild devices specified for given RAID type";
439 return -EINVAL; 439 return -EINVAL;
440 } 440 }
441 if (value > rs->md.raid_disks) { 441 if (value > rs->md.raid_disks) {
442 rs->ti->error = "Invalid rebuild index given"; 442 rs->ti->error = "Invalid rebuild index given";
443 return -EINVAL; 443 return -EINVAL;
444 } 444 }
445 clear_bit(In_sync, &rs->dev[value].rdev.flags); 445 clear_bit(In_sync, &rs->dev[value].rdev.flags);
446 rs->dev[value].rdev.recovery_offset = 0; 446 rs->dev[value].rdev.recovery_offset = 0;
447 rs->print_flags |= DMPF_REBUILD; 447 rs->print_flags |= DMPF_REBUILD;
448 } else if (!strcasecmp(key, "write_mostly")) { 448 } else if (!strcasecmp(key, "write_mostly")) {
449 if (rs->raid_type->level != 1) { 449 if (rs->raid_type->level != 1) {
450 rs->ti->error = "write_mostly option is only valid for RAID1"; 450 rs->ti->error = "write_mostly option is only valid for RAID1";
451 return -EINVAL; 451 return -EINVAL;
452 } 452 }
453 if (value >= rs->md.raid_disks) { 453 if (value >= rs->md.raid_disks) {
454 rs->ti->error = "Invalid write_mostly drive index given"; 454 rs->ti->error = "Invalid write_mostly drive index given";
455 return -EINVAL; 455 return -EINVAL;
456 } 456 }
457 set_bit(WriteMostly, &rs->dev[value].rdev.flags); 457 set_bit(WriteMostly, &rs->dev[value].rdev.flags);
458 } else if (!strcasecmp(key, "max_write_behind")) { 458 } else if (!strcasecmp(key, "max_write_behind")) {
459 if (rs->raid_type->level != 1) { 459 if (rs->raid_type->level != 1) {
460 rs->ti->error = "max_write_behind option is only valid for RAID1"; 460 rs->ti->error = "max_write_behind option is only valid for RAID1";
461 return -EINVAL; 461 return -EINVAL;
462 } 462 }
463 rs->print_flags |= DMPF_MAX_WRITE_BEHIND; 463 rs->print_flags |= DMPF_MAX_WRITE_BEHIND;
464 464
465 /* 465 /*
466 * In device-mapper, we specify things in sectors, but 466 * In device-mapper, we specify things in sectors, but
467 * MD records this value in kB 467 * MD records this value in kB
468 */ 468 */
469 value /= 2; 469 value /= 2;
470 if (value > COUNTER_MAX) { 470 if (value > COUNTER_MAX) {
471 rs->ti->error = "Max write-behind limit out of range"; 471 rs->ti->error = "Max write-behind limit out of range";
472 return -EINVAL; 472 return -EINVAL;
473 } 473 }
474 rs->md.bitmap_info.max_write_behind = value; 474 rs->md.bitmap_info.max_write_behind = value;
475 } else if (!strcasecmp(key, "daemon_sleep")) { 475 } else if (!strcasecmp(key, "daemon_sleep")) {
476 rs->print_flags |= DMPF_DAEMON_SLEEP; 476 rs->print_flags |= DMPF_DAEMON_SLEEP;
477 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) { 477 if (!value || (value > MAX_SCHEDULE_TIMEOUT)) {
478 rs->ti->error = "daemon sleep period out of range"; 478 rs->ti->error = "daemon sleep period out of range";
479 return -EINVAL; 479 return -EINVAL;
480 } 480 }
481 rs->md.bitmap_info.daemon_sleep = value; 481 rs->md.bitmap_info.daemon_sleep = value;
482 } else if (!strcasecmp(key, "stripe_cache")) { 482 } else if (!strcasecmp(key, "stripe_cache")) {
483 rs->print_flags |= DMPF_STRIPE_CACHE; 483 rs->print_flags |= DMPF_STRIPE_CACHE;
484 484
485 /* 485 /*
486 * In device-mapper, we specify things in sectors, but 486 * In device-mapper, we specify things in sectors, but
487 * MD records this value in kB 487 * MD records this value in kB
488 */ 488 */
489 value /= 2; 489 value /= 2;
490 490
491 if (rs->raid_type->level < 5) { 491 if (rs->raid_type->level < 5) {
492 rs->ti->error = "Inappropriate argument: stripe_cache"; 492 rs->ti->error = "Inappropriate argument: stripe_cache";
493 return -EINVAL; 493 return -EINVAL;
494 } 494 }
495 if (raid5_set_cache_size(&rs->md, (int)value)) { 495 if (raid5_set_cache_size(&rs->md, (int)value)) {
496 rs->ti->error = "Bad stripe_cache size"; 496 rs->ti->error = "Bad stripe_cache size";
497 return -EINVAL; 497 return -EINVAL;
498 } 498 }
499 } else if (!strcasecmp(key, "min_recovery_rate")) { 499 } else if (!strcasecmp(key, "min_recovery_rate")) {
500 rs->print_flags |= DMPF_MIN_RECOVERY_RATE; 500 rs->print_flags |= DMPF_MIN_RECOVERY_RATE;
501 if (value > INT_MAX) { 501 if (value > INT_MAX) {
502 rs->ti->error = "min_recovery_rate out of range"; 502 rs->ti->error = "min_recovery_rate out of range";
503 return -EINVAL; 503 return -EINVAL;
504 } 504 }
505 rs->md.sync_speed_min = (int)value; 505 rs->md.sync_speed_min = (int)value;
506 } else if (!strcasecmp(key, "max_recovery_rate")) { 506 } else if (!strcasecmp(key, "max_recovery_rate")) {
507 rs->print_flags |= DMPF_MAX_RECOVERY_RATE; 507 rs->print_flags |= DMPF_MAX_RECOVERY_RATE;
508 if (value > INT_MAX) { 508 if (value > INT_MAX) {
509 rs->ti->error = "max_recovery_rate out of range"; 509 rs->ti->error = "max_recovery_rate out of range";
510 return -EINVAL; 510 return -EINVAL;
511 } 511 }
512 rs->md.sync_speed_max = (int)value; 512 rs->md.sync_speed_max = (int)value;
513 } else if (!strcasecmp(key, "region_size")) { 513 } else if (!strcasecmp(key, "region_size")) {
514 rs->print_flags |= DMPF_REGION_SIZE; 514 rs->print_flags |= DMPF_REGION_SIZE;
515 region_size = value; 515 region_size = value;
516 } else { 516 } else {
517 DMERR("Unable to parse RAID parameter: %s", key); 517 DMERR("Unable to parse RAID parameter: %s", key);
518 rs->ti->error = "Unable to parse RAID parameters"; 518 rs->ti->error = "Unable to parse RAID parameters";
519 return -EINVAL; 519 return -EINVAL;
520 } 520 }
521 } 521 }
522 522
523 if (validate_region_size(rs, region_size)) 523 if (validate_region_size(rs, region_size))
524 return -EINVAL; 524 return -EINVAL;
525 525
526 if (rs->md.chunk_sectors) 526 if (rs->md.chunk_sectors)
527 rs->ti->split_io = rs->md.chunk_sectors; 527 rs->ti->split_io = rs->md.chunk_sectors;
528 else 528 else
529 rs->ti->split_io = region_size; 529 rs->ti->split_io = region_size;
530 530
531 if (rs->md.chunk_sectors) 531 if (rs->md.chunk_sectors)
532 rs->ti->split_io = rs->md.chunk_sectors; 532 rs->ti->split_io = rs->md.chunk_sectors;
533 else 533 else
534 rs->ti->split_io = region_size; 534 rs->ti->split_io = region_size;
535 535
536 /* Assume there are no metadata devices until the drives are parsed */ 536 /* Assume there are no metadata devices until the drives are parsed */
537 rs->md.persistent = 0; 537 rs->md.persistent = 0;
538 rs->md.external = 1; 538 rs->md.external = 1;
539 539
540 return 0; 540 return 0;
541 } 541 }
542 542
543 static void do_table_event(struct work_struct *ws) 543 static void do_table_event(struct work_struct *ws)
544 { 544 {
545 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work); 545 struct raid_set *rs = container_of(ws, struct raid_set, md.event_work);
546 546
547 dm_table_event(rs->ti->table); 547 dm_table_event(rs->ti->table);
548 } 548 }
549 549
550 static int raid_is_congested(struct dm_target_callbacks *cb, int bits) 550 static int raid_is_congested(struct dm_target_callbacks *cb, int bits)
551 { 551 {
552 struct raid_set *rs = container_of(cb, struct raid_set, callbacks); 552 struct raid_set *rs = container_of(cb, struct raid_set, callbacks);
553 553
554 if (rs->raid_type->level == 1) 554 if (rs->raid_type->level == 1)
555 return md_raid1_congested(&rs->md, bits); 555 return md_raid1_congested(&rs->md, bits);
556 556
557 return md_raid5_congested(&rs->md, bits); 557 return md_raid5_congested(&rs->md, bits);
558 } 558 }
559 559
560 /* 560 /*
561 * This structure is never routinely used by userspace, unlike md superblocks. 561 * This structure is never routinely used by userspace, unlike md superblocks.
562 * Devices with this superblock should only ever be accessed via device-mapper. 562 * Devices with this superblock should only ever be accessed via device-mapper.
563 */ 563 */
564 #define DM_RAID_MAGIC 0x64526D44 564 #define DM_RAID_MAGIC 0x64526D44
565 struct dm_raid_superblock { 565 struct dm_raid_superblock {
566 __le32 magic; /* "DmRd" */ 566 __le32 magic; /* "DmRd" */
567 __le32 features; /* Used to indicate possible future changes */ 567 __le32 features; /* Used to indicate possible future changes */
568 568
569 __le32 num_devices; /* Number of devices in this array. (Max 64) */ 569 __le32 num_devices; /* Number of devices in this array. (Max 64) */
570 __le32 array_position; /* The position of this drive in the array */ 570 __le32 array_position; /* The position of this drive in the array */
571 571
572 __le64 events; /* Incremented by md when superblock updated */ 572 __le64 events; /* Incremented by md when superblock updated */
573 __le64 failed_devices; /* Bit field of devices to indicate failures */ 573 __le64 failed_devices; /* Bit field of devices to indicate failures */
574 574
575 /* 575 /*
576 * This offset tracks the progress of the repair or replacement of 576 * This offset tracks the progress of the repair or replacement of
577 * an individual drive. 577 * an individual drive.
578 */ 578 */
579 __le64 disk_recovery_offset; 579 __le64 disk_recovery_offset;
580 580
581 /* 581 /*
582 * This offset tracks the progress of the initial array 582 * This offset tracks the progress of the initial array
583 * synchronisation/parity calculation. 583 * synchronisation/parity calculation.
584 */ 584 */
585 __le64 array_resync_offset; 585 __le64 array_resync_offset;
586 586
587 /* 587 /*
588 * RAID characteristics 588 * RAID characteristics
589 */ 589 */
590 __le32 level; 590 __le32 level;
591 __le32 layout; 591 __le32 layout;
592 __le32 stripe_sectors; 592 __le32 stripe_sectors;
593 593
594 __u8 pad[452]; /* Round struct to 512 bytes. */ 594 __u8 pad[452]; /* Round struct to 512 bytes. */
595 /* Always set to 0 when writing. */ 595 /* Always set to 0 when writing. */
596 } __packed; 596 } __packed;
597 597
598 static int read_disk_sb(struct md_rdev *rdev, int size) 598 static int read_disk_sb(struct md_rdev *rdev, int size)
599 { 599 {
600 BUG_ON(!rdev->sb_page); 600 BUG_ON(!rdev->sb_page);
601 601
602 if (rdev->sb_loaded) 602 if (rdev->sb_loaded)
603 return 0; 603 return 0;
604 604
605 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 605 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
606 DMERR("Failed to read device superblock"); 606 DMERR("Failed to read device superblock");
607 return -EINVAL; 607 return -EINVAL;
608 } 608 }
609 609
610 rdev->sb_loaded = 1; 610 rdev->sb_loaded = 1;
611 611
612 return 0; 612 return 0;
613 } 613 }
614 614
615 static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 615 static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
616 { 616 {
617 struct md_rdev *r, *t; 617 struct md_rdev *r, *t;
618 uint64_t failed_devices; 618 uint64_t failed_devices;
619 struct dm_raid_superblock *sb; 619 struct dm_raid_superblock *sb;
620 620
621 sb = page_address(rdev->sb_page); 621 sb = page_address(rdev->sb_page);
622 failed_devices = le64_to_cpu(sb->failed_devices); 622 failed_devices = le64_to_cpu(sb->failed_devices);
623 623
624 rdev_for_each(r, t, mddev) 624 rdev_for_each(r, t, mddev)
625 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 625 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
626 failed_devices |= (1ULL << r->raid_disk); 626 failed_devices |= (1ULL << r->raid_disk);
627 627
628 memset(sb, 0, sizeof(*sb)); 628 memset(sb, 0, sizeof(*sb));
629 629
630 sb->magic = cpu_to_le32(DM_RAID_MAGIC); 630 sb->magic = cpu_to_le32(DM_RAID_MAGIC);
631 sb->features = cpu_to_le32(0); /* No features yet */ 631 sb->features = cpu_to_le32(0); /* No features yet */
632 632
633 sb->num_devices = cpu_to_le32(mddev->raid_disks); 633 sb->num_devices = cpu_to_le32(mddev->raid_disks);
634 sb->array_position = cpu_to_le32(rdev->raid_disk); 634 sb->array_position = cpu_to_le32(rdev->raid_disk);
635 635
636 sb->events = cpu_to_le64(mddev->events); 636 sb->events = cpu_to_le64(mddev->events);
637 sb->failed_devices = cpu_to_le64(failed_devices); 637 sb->failed_devices = cpu_to_le64(failed_devices);
638 638
639 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset); 639 sb->disk_recovery_offset = cpu_to_le64(rdev->recovery_offset);
640 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp); 640 sb->array_resync_offset = cpu_to_le64(mddev->recovery_cp);
641 641
642 sb->level = cpu_to_le32(mddev->level); 642 sb->level = cpu_to_le32(mddev->level);
643 sb->layout = cpu_to_le32(mddev->layout); 643 sb->layout = cpu_to_le32(mddev->layout);
644 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors); 644 sb->stripe_sectors = cpu_to_le32(mddev->chunk_sectors);
645 } 645 }
646 646
647 /* 647 /*
648 * super_load 648 * super_load
649 * 649 *
650 * This function creates a superblock if one is not found on the device 650 * This function creates a superblock if one is not found on the device
651 * and will decide which superblock to use if there's a choice. 651 * and will decide which superblock to use if there's a choice.
652 * 652 *
653 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise 653 * Return: 1 if use rdev, 0 if use refdev, -Exxx otherwise
654 */ 654 */
655 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) 655 static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
656 { 656 {
657 int ret; 657 int ret;
658 struct dm_raid_superblock *sb; 658 struct dm_raid_superblock *sb;
659 struct dm_raid_superblock *refsb; 659 struct dm_raid_superblock *refsb;
660 uint64_t events_sb, events_refsb; 660 uint64_t events_sb, events_refsb;
661 661
662 rdev->sb_start = 0; 662 rdev->sb_start = 0;
663 rdev->sb_size = sizeof(*sb); 663 rdev->sb_size = sizeof(*sb);
664 664
665 ret = read_disk_sb(rdev, rdev->sb_size); 665 ret = read_disk_sb(rdev, rdev->sb_size);
666 if (ret) 666 if (ret)
667 return ret; 667 return ret;
668 668
669 sb = page_address(rdev->sb_page); 669 sb = page_address(rdev->sb_page);
-670 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) {
+670
+671 /*
+672 * Two cases that we want to write new superblocks and rebuild:
+673 * 1) New device (no matching magic number)
+674 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
+675 */
+676 if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
+677 (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
671 super_sync(rdev->mddev, rdev); 678 super_sync(rdev->mddev, rdev);
672 679
673 set_bit(FirstUse, &rdev->flags); 680 set_bit(FirstUse, &rdev->flags);
674 681
675 /* Force writing of superblocks to disk */ 682 /* Force writing of superblocks to disk */
676 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags); 683 set_bit(MD_CHANGE_DEVS, &rdev->mddev->flags);
677 684
678 /* Any superblock is better than none, choose that if given */ 685 /* Any superblock is better than none, choose that if given */
679 return refdev ? 0 : 1; 686 return refdev ? 0 : 1;
680 } 687 }
681 688
682 if (!refdev) 689 if (!refdev)
683 return 1; 690 return 1;
684 691
685 events_sb = le64_to_cpu(sb->events); 692 events_sb = le64_to_cpu(sb->events);
686 693
687 refsb = page_address(refdev->sb_page); 694 refsb = page_address(refdev->sb_page);
688 events_refsb = le64_to_cpu(refsb->events); 695 events_refsb = le64_to_cpu(refsb->events);
689 696
690 return (events_sb > events_refsb) ? 1 : 0; 697 return (events_sb > events_refsb) ? 1 : 0;
691 } 698 }
692 699
693 static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) 700 static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
694 { 701 {
695 int role; 702 int role;
696 struct raid_set *rs = container_of(mddev, struct raid_set, md); 703 struct raid_set *rs = container_of(mddev, struct raid_set, md);
697 uint64_t events_sb; 704 uint64_t events_sb;
698 uint64_t failed_devices; 705 uint64_t failed_devices;
699 struct dm_raid_superblock *sb; 706 struct dm_raid_superblock *sb;
700 uint32_t new_devs = 0; 707 uint32_t new_devs = 0;
701 uint32_t rebuilds = 0; 708 uint32_t rebuilds = 0;
702 struct md_rdev *r, *t; 709 struct md_rdev *r, *t;
703 struct dm_raid_superblock *sb2; 710 struct dm_raid_superblock *sb2;
704 711
705 sb = page_address(rdev->sb_page); 712 sb = page_address(rdev->sb_page);
706 events_sb = le64_to_cpu(sb->events); 713 events_sb = le64_to_cpu(sb->events);
707 failed_devices = le64_to_cpu(sb->failed_devices); 714 failed_devices = le64_to_cpu(sb->failed_devices);
708 715
709 /* 716 /*
710 * Initialise to 1 if this is a new superblock. 717 * Initialise to 1 if this is a new superblock.
711 */ 718 */
712 mddev->events = events_sb ? : 1; 719 mddev->events = events_sb ? : 1;
713 720
714 /* 721 /*
715 * Reshaping is not currently allowed 722 * Reshaping is not currently allowed
716 */ 723 */
717 if ((le32_to_cpu(sb->level) != mddev->level) || 724 if ((le32_to_cpu(sb->level) != mddev->level) ||
718 (le32_to_cpu(sb->layout) != mddev->layout) || 725 (le32_to_cpu(sb->layout) != mddev->layout) ||
719 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) { 726 (le32_to_cpu(sb->stripe_sectors) != mddev->chunk_sectors)) {
720 DMERR("Reshaping arrays not yet supported."); 727 DMERR("Reshaping arrays not yet supported.");
721 return -EINVAL; 728 return -EINVAL;
722 } 729 }
723 730
724 /* We can only change the number of devices in RAID1 right now */ 731 /* We can only change the number of devices in RAID1 right now */
725 if ((rs->raid_type->level != 1) && 732 if ((rs->raid_type->level != 1) &&
726 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) { 733 (le32_to_cpu(sb->num_devices) != mddev->raid_disks)) {
727 DMERR("Reshaping arrays not yet supported."); 734 DMERR("Reshaping arrays not yet supported.");
728 return -EINVAL; 735 return -EINVAL;
729 } 736 }
730 737
731 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))) 738 if (!(rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC)))
732 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset); 739 mddev->recovery_cp = le64_to_cpu(sb->array_resync_offset);
733 740
734 /* 741 /*
735 * During load, we set FirstUse if a new superblock was written. 742 * During load, we set FirstUse if a new superblock was written.
736 * There are two reasons we might not have a superblock: 743 * There are two reasons we might not have a superblock:
737 * 1) The array is brand new - in which case, all of the 744 * 1) The array is brand new - in which case, all of the
738 * devices must have their In_sync bit set. Also, 745 * devices must have their In_sync bit set. Also,
739 * recovery_cp must be 0, unless forced. 746 * recovery_cp must be 0, unless forced.
740 * 2) This is a new device being added to an old array 747 * 2) This is a new device being added to an old array
741 * and the new device needs to be rebuilt - in which 748 * and the new device needs to be rebuilt - in which
742 * case the In_sync bit will /not/ be set and 749 * case the In_sync bit will /not/ be set and
743 * recovery_cp must be MaxSector. 750 * recovery_cp must be MaxSector.
744 */ 751 */
745 rdev_for_each(r, t, mddev) { 752 rdev_for_each(r, t, mddev) {
746 if (!test_bit(In_sync, &r->flags)) { 753 if (!test_bit(In_sync, &r->flags)) {
-747 if (!test_bit(FirstUse, &r->flags))
-748 DMERR("Superblock area of "
-749 "rebuild device %d should have been "
-750 "cleared.", r->raid_disk);
-751 set_bit(FirstUse, &r->flags);
+754 DMINFO("Device %d specified for rebuild: "
+755 "Clearing superblock", r->raid_disk);
752 rebuilds++; 756 rebuilds++;
753 } else if (test_bit(FirstUse, &r->flags)) 757 } else if (test_bit(FirstUse, &r->flags))
754 new_devs++; 758 new_devs++;
755 } 759 }
756 760
757 if (!rebuilds) { 761 if (!rebuilds) {
758 if (new_devs == mddev->raid_disks) { 762 if (new_devs == mddev->raid_disks) {
759 DMINFO("Superblocks created for new array"); 763 DMINFO("Superblocks created for new array");
760 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags); 764 set_bit(MD_ARRAY_FIRST_USE, &mddev->flags);
761 } else if (new_devs) { 765 } else if (new_devs) {
762 DMERR("New device injected " 766 DMERR("New device injected "
763 "into existing array without 'rebuild' " 767 "into existing array without 'rebuild' "
764 "parameter specified"); 768 "parameter specified");
765 return -EINVAL; 769 return -EINVAL;
766 } 770 }
767 } else if (new_devs) { 771 } else if (new_devs) {
768 DMERR("'rebuild' devices cannot be " 772 DMERR("'rebuild' devices cannot be "
769 "injected into an array with other first-time devices"); 773 "injected into an array with other first-time devices");
770 return -EINVAL; 774 return -EINVAL;
771 } else if (mddev->recovery_cp != MaxSector) { 775 } else if (mddev->recovery_cp != MaxSector) {
772 DMERR("'rebuild' specified while array is not in-sync"); 776 DMERR("'rebuild' specified while array is not in-sync");
773 return -EINVAL; 777 return -EINVAL;
774 } 778 }
775 779
776 /* 780 /*
777 * Now we set the Faulty bit for those devices that are 781 * Now we set the Faulty bit for those devices that are
778 * recorded in the superblock as failed. 782 * recorded in the superblock as failed.
779 */ 783 */
780 rdev_for_each(r, t, mddev) { 784 rdev_for_each(r, t, mddev) {
781 if (!r->sb_page) 785 if (!r->sb_page)
782 continue; 786 continue;
783 sb2 = page_address(r->sb_page); 787 sb2 = page_address(r->sb_page);
784 sb2->failed_devices = 0; 788 sb2->failed_devices = 0;
785 789
786 /* 790 /*
787 * Check for any device re-ordering. 791 * Check for any device re-ordering.
788 */ 792 */
789 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) { 793 if (!test_bit(FirstUse, &r->flags) && (r->raid_disk >= 0)) {
790 role = le32_to_cpu(sb2->array_position); 794 role = le32_to_cpu(sb2->array_position);
791 if (role != r->raid_disk) { 795 if (role != r->raid_disk) {
792 if (rs->raid_type->level != 1) { 796 if (rs->raid_type->level != 1) {
793 rs->ti->error = "Cannot change device " 797 rs->ti->error = "Cannot change device "
794 "positions in RAID array"; 798 "positions in RAID array";
795 return -EINVAL; 799 return -EINVAL;
796 } 800 }
797 DMINFO("RAID1 device #%d now at position #%d", 801 DMINFO("RAID1 device #%d now at position #%d",
798 role, r->raid_disk); 802 role, r->raid_disk);
799 } 803 }
800 804
801 /* 805 /*
802 * Partial recovery is performed on 806 * Partial recovery is performed on
803 * returning failed devices. 807 * returning failed devices.
804 */ 808 */
805 if (failed_devices & (1 << role)) 809 if (failed_devices & (1 << role))
806 set_bit(Faulty, &r->flags); 810 set_bit(Faulty, &r->flags);
807 } 811 }
808 } 812 }
809 813
810 return 0; 814 return 0;
811 } 815 }
812 816
813 static int super_validate(struct mddev *mddev, struct md_rdev *rdev) 817 static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
814 { 818 {
815 struct dm_raid_superblock *sb = page_address(rdev->sb_page); 819 struct dm_raid_superblock *sb = page_address(rdev->sb_page);
816 820
817 /* 821 /*
818 * If mddev->events is not set, we know we have not yet initialized 822 * If mddev->events is not set, we know we have not yet initialized
819 * the array. 823 * the array.
820 */ 824 */
821 if (!mddev->events && super_init_validation(mddev, rdev)) 825 if (!mddev->events && super_init_validation(mddev, rdev))
822 return -EINVAL; 826 return -EINVAL;
823 827
824 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */ 828 mddev->bitmap_info.offset = 4096 >> 9; /* Enable bitmap creation */
825 rdev->mddev->bitmap_info.default_offset = 4096 >> 9; 829 rdev->mddev->bitmap_info.default_offset = 4096 >> 9;
826 if (!test_bit(FirstUse, &rdev->flags)) { 830 if (!test_bit(FirstUse, &rdev->flags)) {
827 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset); 831 rdev->recovery_offset = le64_to_cpu(sb->disk_recovery_offset);
828 if (rdev->recovery_offset != MaxSector) 832 if (rdev->recovery_offset != MaxSector)
829 clear_bit(In_sync, &rdev->flags); 833 clear_bit(In_sync, &rdev->flags);
830 } 834 }
831 835
832 /* 836 /*
833 * If a device comes back, set it as not In_sync and no longer faulty. 837 * If a device comes back, set it as not In_sync and no longer faulty.
834 */ 838 */
835 if (test_bit(Faulty, &rdev->flags)) { 839 if (test_bit(Faulty, &rdev->flags)) {
836 clear_bit(Faulty, &rdev->flags); 840 clear_bit(Faulty, &rdev->flags);
837 clear_bit(In_sync, &rdev->flags); 841 clear_bit(In_sync, &rdev->flags);
838 rdev->saved_raid_disk = rdev->raid_disk; 842 rdev->saved_raid_disk = rdev->raid_disk;
839 rdev->recovery_offset = 0; 843 rdev->recovery_offset = 0;
840 } 844 }
841 845
842 clear_bit(FirstUse, &rdev->flags); 846 clear_bit(FirstUse, &rdev->flags);
843 847
844 return 0; 848 return 0;
845 } 849 }
846 850
847 /* 851 /*
848 * Analyse superblocks and select the freshest. 852 * Analyse superblocks and select the freshest.
849 */ 853 */
850 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 854 static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
851 { 855 {
852 int ret; 856 int ret;
853 struct md_rdev *rdev, *freshest, *tmp; 857 struct md_rdev *rdev, *freshest, *tmp;
854 struct mddev *mddev = &rs->md; 858 struct mddev *mddev = &rs->md;
855 859
856 freshest = NULL; 860 freshest = NULL;
857 rdev_for_each(rdev, tmp, mddev) { 861 rdev_for_each(rdev, tmp, mddev) {
858 if (!rdev->meta_bdev) 862 if (!rdev->meta_bdev)
859 continue; 863 continue;
860 864
861 ret = super_load(rdev, freshest); 865 ret = super_load(rdev, freshest);
862 866
863 switch (ret) { 867 switch (ret) {
864 case 1: 868 case 1:
865 freshest = rdev; 869 freshest = rdev;
866 break; 870 break;
867 case 0: 871 case 0:
868 break; 872 break;
869 default: 873 default:
870 ti->error = "Failed to load superblock"; 874 ti->error = "Failed to load superblock";
871 return ret; 875 return ret;
872 } 876 }
873 } 877 }
874 878
875 if (!freshest) 879 if (!freshest)
876 return 0; 880 return 0;
877 881
878 /* 882 /*
879 * Validation of the freshest device provides the source of 883 * Validation of the freshest device provides the source of
880 * validation for the remaining devices. 884 * validation for the remaining devices.
881 */ 885 */
882 ti->error = "Unable to assemble array: Invalid superblocks"; 886 ti->error = "Unable to assemble array: Invalid superblocks";
883 if (super_validate(mddev, freshest)) 887 if (super_validate(mddev, freshest))
884 return -EINVAL; 888 return -EINVAL;
885 889
886 rdev_for_each(rdev, tmp, mddev) 890 rdev_for_each(rdev, tmp, mddev)
887 if ((rdev != freshest) && super_validate(mddev, rdev)) 891 if ((rdev != freshest) && super_validate(mddev, rdev))
888 return -EINVAL; 892 return -EINVAL;
889 893
890 return 0; 894 return 0;
891 } 895 }
892 896
893 /* 897 /*
894 * Construct a RAID4/5/6 mapping: 898 * Construct a RAID4/5/6 mapping:
895 * Args: 899 * Args:
896 * <raid_type> <#raid_params> <raid_params> \ 900 * <raid_type> <#raid_params> <raid_params> \
897 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> } 901 * <#raid_devs> { <meta_dev1> <dev1> .. <meta_devN> <devN> }
898 * 902 *
899 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for 903 * <raid_params> varies by <raid_type>. See 'parse_raid_params' for
900 * details on possible <raid_params>. 904 * details on possible <raid_params>.
901 */ 905 */
static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
{
	int ret;
	struct raid_type *rt;
	unsigned long num_raid_params, num_raid_devs;
	struct raid_set *rs = NULL;

	/* Must have at least <raid_type> <#raid_params> */
	if (argc < 2) {
		ti->error = "Too few arguments";
		return -EINVAL;
	}

	/* raid type */
	rt = get_raid_type(argv[0]);
	if (!rt) {
		ti->error = "Unrecognised raid_type";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* number of RAID parameters */
	if (strict_strtoul(argv[0], 10, &num_raid_params) < 0) {
		ti->error = "Cannot understand number of RAID parameters";
		return -EINVAL;
	}
	argc--;
	argv++;

	/* Skip over RAID params for now and find out # of devices */
	if (num_raid_params + 1 > argc) {
		ti->error = "Arguments do not agree with counts given";
		return -EINVAL;
	}

	if ((strict_strtoul(argv[num_raid_params], 10, &num_raid_devs) < 0) ||
	    (num_raid_devs >= INT_MAX)) {
		ti->error = "Cannot understand number of raid devices";
		return -EINVAL;
	}

	rs = context_alloc(ti, rt, (unsigned)num_raid_devs);
	if (IS_ERR(rs))
		return PTR_ERR(rs);

	ret = parse_raid_params(rs, argv, (unsigned)num_raid_params);
	if (ret)
		goto bad;

	ret = -EINVAL;

	argc -= num_raid_params + 1; /* +1: we already have num_raid_devs */
	argv += num_raid_params + 1;

	if (argc != (num_raid_devs * 2)) {
		ti->error = "Supplied RAID devices does not match the count given";
		goto bad;
	}

	ret = dev_parms(rs, argv);
	if (ret)
		goto bad;

	rs->md.sync_super = super_sync;
	ret = analyse_superblocks(ti, rs);
	if (ret)
		goto bad;

	INIT_WORK(&rs->md.event_work, do_table_event);
	ti->private = rs;

	mutex_lock(&rs->md.reconfig_mutex);
	ret = md_run(&rs->md);
	rs->md.in_sync = 0; /* Assume already marked dirty */
	mutex_unlock(&rs->md.reconfig_mutex);

	if (ret) {
		ti->error = "Fail to run raid array";
		goto bad;
	}

	rs->callbacks.congested_fn = raid_is_congested;
	dm_table_add_target_callbacks(ti->table, &rs->callbacks);

	mddev_suspend(&rs->md);
	return 0;

bad:
	context_free(rs);

	return ret;
}
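/*
 * Worked example (hypothetical argument values, not taken from this commit):
 * for the target arguments
 *
 *	raid5_la 1 128 3 - 8:16 - 8:32 - 8:48
 *
 * argc starts at 10.  After consuming <raid_type> and <#raid_params>, argc is
 * 8, num_raid_params is 1, and argv[num_raid_params] is "3", so num_raid_devs
 * becomes 3.  Skipping num_raid_params + 1 arguments leaves argc == 6, which
 * matches num_raid_devs * 2 (one <meta_dev> <dev> pair per device), so the
 * constructor proceeds to dev_parms().
 */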

static void raid_dtr(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	list_del_init(&rs->callbacks.list);
	md_stop(&rs->md);
	context_free(rs);
}

static int raid_map(struct dm_target *ti, struct bio *bio, union map_info *map_context)
{
	struct raid_set *rs = ti->private;
	struct mddev *mddev = &rs->md;

	mddev->pers->make_request(mddev, bio);

	return DM_MAPIO_SUBMITTED;
}

static int raid_status(struct dm_target *ti, status_type_t type,
		       char *result, unsigned maxlen)
{
	struct raid_set *rs = ti->private;
	unsigned raid_param_cnt = 1; /* at least 1 for chunksize */
	unsigned sz = 0;
	int i, array_in_sync = 0;
	sector_t sync;

	switch (type) {
	case STATUSTYPE_INFO:
		DMEMIT("%s %d ", rs->raid_type->name, rs->md.raid_disks);

		if (test_bit(MD_RECOVERY_RUNNING, &rs->md.recovery))
			sync = rs->md.curr_resync_completed;
		else
			sync = rs->md.recovery_cp;

		if (sync >= rs->md.resync_max_sectors) {
			array_in_sync = 1;
			sync = rs->md.resync_max_sectors;
		} else {
			/*
			 * The array may be doing an initial sync, or it may
			 * be rebuilding individual components.  If all the
			 * devices are In_sync, then it is the array that is
			 * being initialized.
			 */
			for (i = 0; i < rs->md.raid_disks; i++)
				if (!test_bit(In_sync, &rs->dev[i].rdev.flags))
					array_in_sync = 1;
		}
		/*
		 * Status characters:
		 *  'D' = Dead/Failed device
		 *  'a' = Alive but not in-sync
		 *  'A' = Alive and in-sync
		 */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (test_bit(Faulty, &rs->dev[i].rdev.flags))
				DMEMIT("D");
			else if (!array_in_sync ||
				 !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT("a");
			else
				DMEMIT("A");
		}

		/*
		 * In-sync ratio:
		 *  The in-sync ratio shows the progress of:
		 *   - Initializing the array
		 *   - Rebuilding a subset of devices of the array
		 *  The user can distinguish between the two by referring
		 *  to the status characters.
		 */
		DMEMIT(" %llu/%llu",
		       (unsigned long long) sync,
		       (unsigned long long) rs->md.resync_max_sectors);

		break;
	case STATUSTYPE_TABLE:
		/* The string you would use to construct this array */
		for (i = 0; i < rs->md.raid_disks; i++) {
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2; /* for rebuilds */
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				raid_param_cnt += 2;
		}

		raid_param_cnt += (hweight64(rs->print_flags & ~DMPF_REBUILD) * 2);
		if (rs->print_flags & (DMPF_SYNC | DMPF_NOSYNC))
			raid_param_cnt--;

		DMEMIT("%s %u %u", rs->raid_type->name,
		       raid_param_cnt, rs->md.chunk_sectors);

		if ((rs->print_flags & DMPF_SYNC) &&
		    (rs->md.recovery_cp == MaxSector))
			DMEMIT(" sync");
		if (rs->print_flags & DMPF_NOSYNC)
			DMEMIT(" nosync");

		for (i = 0; i < rs->md.raid_disks; i++)
			if ((rs->print_flags & DMPF_REBUILD) &&
			    rs->dev[i].data_dev &&
			    !test_bit(In_sync, &rs->dev[i].rdev.flags))
				DMEMIT(" rebuild %u", i);

		if (rs->print_flags & DMPF_DAEMON_SLEEP)
			DMEMIT(" daemon_sleep %lu",
			       rs->md.bitmap_info.daemon_sleep);

		if (rs->print_flags & DMPF_MIN_RECOVERY_RATE)
			DMEMIT(" min_recovery_rate %d", rs->md.sync_speed_min);

		if (rs->print_flags & DMPF_MAX_RECOVERY_RATE)
			DMEMIT(" max_recovery_rate %d", rs->md.sync_speed_max);

		for (i = 0; i < rs->md.raid_disks; i++)
			if (rs->dev[i].data_dev &&
			    test_bit(WriteMostly, &rs->dev[i].rdev.flags))
				DMEMIT(" write_mostly %u", i);

		if (rs->print_flags & DMPF_MAX_WRITE_BEHIND)
			DMEMIT(" max_write_behind %lu",
			       rs->md.bitmap_info.max_write_behind);

		if (rs->print_flags & DMPF_STRIPE_CACHE) {
			struct r5conf *conf = rs->md.private;

			/* convert from kiB to sectors */
			DMEMIT(" stripe_cache %d",
			       conf ? conf->max_nr_stripes * 2 : 0);
		}

		if (rs->print_flags & DMPF_REGION_SIZE)
			DMEMIT(" region_size %lu",
			       rs->md.bitmap_info.chunksize >> 9);

		DMEMIT(" %d", rs->md.raid_disks);
		for (i = 0; i < rs->md.raid_disks; i++) {
			if (rs->dev[i].meta_dev)
				DMEMIT(" %s", rs->dev[i].meta_dev->name);
			else
				DMEMIT(" -");

			if (rs->dev[i].data_dev)
				DMEMIT(" %s", rs->dev[i].data_dev->name);
			else
				DMEMIT(" -");
		}
	}

	return 0;
}
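/*
 * Illustrative STATUSTYPE_INFO output (hypothetical numbers, not from this
 * commit): a 3-device raid5_la set whose third member is still being
 * rebuilt might report
 *
 *	raid5_la 3 AAa 122880/976762368
 *
 * i.e. raid type, device count, the per-device status characters, and the
 * in-sync ratio emitted above.
 */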

static int raid_iterate_devices(struct dm_target *ti, iterate_devices_callout_fn fn, void *data)
{
	struct raid_set *rs = ti->private;
	unsigned i;
	int ret = 0;

	for (i = 0; !ret && i < rs->md.raid_disks; i++)
		if (rs->dev[i].data_dev)
			ret = fn(ti,
				 rs->dev[i].data_dev,
				 0, /* No offset on data devs */
				 rs->md.dev_sectors,
				 data);

	return ret;
}

static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
{
	struct raid_set *rs = ti->private;
	unsigned chunk_size = rs->md.chunk_sectors << 9;
	struct r5conf *conf = rs->md.private;

	blk_limits_io_min(limits, chunk_size);
	blk_limits_io_opt(limits, chunk_size * (conf->raid_disks - conf->max_degraded));
}
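/*
 * Worked example (hypothetical geometry, not from this commit): with 64KiB
 * chunks (chunk_size = 128 << 9 = 65536 bytes) on a 6-device raid6 set,
 * max_degraded is 2, so the optimal I/O size advertised above is
 * 65536 * (6 - 2) = 256KiB, i.e. one full stripe of data.
 */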

static void raid_presuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	md_stop_writes(&rs->md);
}

static void raid_postsuspend(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	mddev_suspend(&rs->md);
}

static void raid_resume(struct dm_target *ti)
{
	struct raid_set *rs = ti->private;

	bitmap_load(&rs->md);
	mddev_resume(&rs->md);
}

static struct target_type raid_target = {
	.name = "raid",
	.version = {1, 1, 0},
	.module = THIS_MODULE,
	.ctr = raid_ctr,
	.dtr = raid_dtr,
	.map = raid_map,
	.status = raid_status,
	.iterate_devices = raid_iterate_devices,
	.io_hints = raid_io_hints,
	.presuspend = raid_presuspend,
	.postsuspend = raid_postsuspend,
	.resume = raid_resume,
};

static int __init dm_raid_init(void)
{
	return dm_register_target(&raid_target);
}

static void __exit dm_raid_exit(void)
{
	dm_unregister_target(&raid_target);
}

module_init(dm_raid_init);
module_exit(dm_raid_exit);

MODULE_DESCRIPTION(DM_NAME " raid4/5/6 target");
MODULE_ALIAS("dm-raid4");
MODULE_ALIAS("dm-raid5");
MODULE_ALIAS("dm-raid6");
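
For context, a minimal userspace sketch (illustrative only; it is not part of
this commit) of loading a table that uses the "raid" target through
libdevmapper's dm_task API.  The device names, device-mapper name and size
below are placeholders; in practice this is normally done with dmsetup or an
LVM tool rather than hand-written code.

	#include <stdio.h>
	#include <stdint.h>
	#include <libdevmapper.h>

	int main(void)
	{
		struct dm_task *dmt;
		/* Placeholder geometry: 3 data devices, 64KiB (128-sector) chunks. */
		const char *params =
			"raid5_la 1 128 3 - /dev/sdb1 - /dev/sdc1 - /dev/sdd1";
		uint64_t size_sectors = 2097152;	/* placeholder logical size (1GiB) */
		int r = 1;

		dmt = dm_task_create(DM_DEVICE_CREATE);
		if (!dmt)
			return 1;

		if (!dm_task_set_name(dmt, "example_raid5") ||
		    !dm_task_add_target(dmt, 0, size_sectors, "raid", params) ||
		    !dm_task_run(dmt))
			fprintf(stderr, "creating dm-raid device failed\n");
		else
			r = 0;

		dm_task_destroy(dmt);
		return r;
	}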