Commit 4184153f9e483f9bb63339ed316e059962fe9794

Authored by Mikulas Patocka
Committed by Alasdair G Kergon
Parent: f1e5398746

dm raid1: support flush

Flush support for dm-raid1.

When an empty barrier is received, it is submitted to all the mirror devices via dm-io.
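
The mechanism is to route the zero-length barrier through the ordinary write path: do_writes() puts it straight on the sync list, and do_write() then issues it to every mirror leg in a single dm_io call with the barrier flag propagated into the request. Below is a minimal sketch of that dispatch using the structures and dm-io calls that appear in the diff; submit_empty_barrier() is a hypothetical name for illustration only (the driver itself reuses do_write()), and error handling plus region-hash accounting are omitted.

/*
 * Illustration only, not part of the commit: forward a zero-length
 * barrier to every mirror leg with one dm_io call.
 */
static void submit_empty_barrier(struct mirror_set *ms, struct bio *bio)
{
	unsigned i;
	struct dm_io_region io[ms->nr_mirrors];
	struct dm_io_request io_req = {
		.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),	/* keep the barrier flag */
		.mem.type = DM_IO_BVEC,
		.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
		.notify.fn = write_callback,	/* completes the bio once all legs answer */
		.notify.context = bio,
		.client = ms->io_client,
	};

	for (i = 0; i < ms->nr_mirrors; i++) {
		io[i].bdev = ms->mirror[i].dev->bdev;
		io[i].sector = 0;	/* map_sector() returns 0 when bi_size == 0 */
		io[i].count = 0;	/* empty barrier carries no data */
	}

	bio_set_m(bio, get_default_mirror(ms));	/* write_callback() needs the mirror set */
	BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
}

Because such a bio never touches a region, mirror_end_io() in the diff below skips the dm_rh_dec() that normal writes require.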

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>

Showing 2 changed files with 34 additions and 4 deletions

drivers/md/dm-raid1.c
1 /* 1 /*
2 * Copyright (C) 2003 Sistina Software Limited. 2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include "dm-bio-record.h" 8 #include "dm-bio-record.h"
9 9
10 #include <linux/init.h> 10 #include <linux/init.h>
11 #include <linux/mempool.h> 11 #include <linux/mempool.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/pagemap.h> 13 #include <linux/pagemap.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/workqueue.h> 15 #include <linux/workqueue.h>
16 #include <linux/device-mapper.h> 16 #include <linux/device-mapper.h>
17 #include <linux/dm-io.h> 17 #include <linux/dm-io.h>
18 #include <linux/dm-dirty-log.h> 18 #include <linux/dm-dirty-log.h>
19 #include <linux/dm-kcopyd.h> 19 #include <linux/dm-kcopyd.h>
20 #include <linux/dm-region-hash.h> 20 #include <linux/dm-region-hash.h>
21 21
22 #define DM_MSG_PREFIX "raid1" 22 #define DM_MSG_PREFIX "raid1"
23 23
24 #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ 24 #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */
25 #define DM_IO_PAGES 64 25 #define DM_IO_PAGES 64
26 #define DM_KCOPYD_PAGES 64 26 #define DM_KCOPYD_PAGES 64
27 27
28 #define DM_RAID1_HANDLE_ERRORS 0x01 28 #define DM_RAID1_HANDLE_ERRORS 0x01
29 #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) 29 #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS)
30 30
31 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); 31 static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped);
32 32
33 /*----------------------------------------------------------------- 33 /*-----------------------------------------------------------------
34 * Mirror set structures. 34 * Mirror set structures.
35 *---------------------------------------------------------------*/ 35 *---------------------------------------------------------------*/
36 enum dm_raid1_error { 36 enum dm_raid1_error {
37 DM_RAID1_WRITE_ERROR, 37 DM_RAID1_WRITE_ERROR,
38 DM_RAID1_SYNC_ERROR, 38 DM_RAID1_SYNC_ERROR,
39 DM_RAID1_READ_ERROR 39 DM_RAID1_READ_ERROR
40 }; 40 };
41 41
42 struct mirror { 42 struct mirror {
43 struct mirror_set *ms; 43 struct mirror_set *ms;
44 atomic_t error_count; 44 atomic_t error_count;
45 unsigned long error_type; 45 unsigned long error_type;
46 struct dm_dev *dev; 46 struct dm_dev *dev;
47 sector_t offset; 47 sector_t offset;
48 }; 48 };
49 49
50 struct mirror_set { 50 struct mirror_set {
51 struct dm_target *ti; 51 struct dm_target *ti;
52 struct list_head list; 52 struct list_head list;
53 53
54 uint64_t features; 54 uint64_t features;
55 55
56 spinlock_t lock; /* protects the lists */ 56 spinlock_t lock; /* protects the lists */
57 struct bio_list reads; 57 struct bio_list reads;
58 struct bio_list writes; 58 struct bio_list writes;
59 struct bio_list failures; 59 struct bio_list failures;
60 60
61 struct dm_region_hash *rh; 61 struct dm_region_hash *rh;
62 struct dm_kcopyd_client *kcopyd_client; 62 struct dm_kcopyd_client *kcopyd_client;
63 struct dm_io_client *io_client; 63 struct dm_io_client *io_client;
64 mempool_t *read_record_pool; 64 mempool_t *read_record_pool;
65 65
66 /* recovery */ 66 /* recovery */
67 region_t nr_regions; 67 region_t nr_regions;
68 int in_sync; 68 int in_sync;
69 int log_failure; 69 int log_failure;
70 atomic_t suspend; 70 atomic_t suspend;
71 71
72 atomic_t default_mirror; /* Default mirror */ 72 atomic_t default_mirror; /* Default mirror */
73 73
74 struct workqueue_struct *kmirrord_wq; 74 struct workqueue_struct *kmirrord_wq;
75 struct work_struct kmirrord_work; 75 struct work_struct kmirrord_work;
76 struct timer_list timer; 76 struct timer_list timer;
77 unsigned long timer_pending; 77 unsigned long timer_pending;
78 78
79 struct work_struct trigger_event; 79 struct work_struct trigger_event;
80 80
81 unsigned nr_mirrors; 81 unsigned nr_mirrors;
82 struct mirror mirror[0]; 82 struct mirror mirror[0];
83 }; 83 };
84 84
85 static void wakeup_mirrord(void *context) 85 static void wakeup_mirrord(void *context)
86 { 86 {
87 struct mirror_set *ms = context; 87 struct mirror_set *ms = context;
88 88
89 queue_work(ms->kmirrord_wq, &ms->kmirrord_work); 89 queue_work(ms->kmirrord_wq, &ms->kmirrord_work);
90 } 90 }
91 91
92 static void delayed_wake_fn(unsigned long data) 92 static void delayed_wake_fn(unsigned long data)
93 { 93 {
94 struct mirror_set *ms = (struct mirror_set *) data; 94 struct mirror_set *ms = (struct mirror_set *) data;
95 95
96 clear_bit(0, &ms->timer_pending); 96 clear_bit(0, &ms->timer_pending);
97 wakeup_mirrord(ms); 97 wakeup_mirrord(ms);
98 } 98 }
99 99
100 static void delayed_wake(struct mirror_set *ms) 100 static void delayed_wake(struct mirror_set *ms)
101 { 101 {
102 if (test_and_set_bit(0, &ms->timer_pending)) 102 if (test_and_set_bit(0, &ms->timer_pending))
103 return; 103 return;
104 104
105 ms->timer.expires = jiffies + HZ / 5; 105 ms->timer.expires = jiffies + HZ / 5;
106 ms->timer.data = (unsigned long) ms; 106 ms->timer.data = (unsigned long) ms;
107 ms->timer.function = delayed_wake_fn; 107 ms->timer.function = delayed_wake_fn;
108 add_timer(&ms->timer); 108 add_timer(&ms->timer);
109 } 109 }
110 110
111 static void wakeup_all_recovery_waiters(void *context) 111 static void wakeup_all_recovery_waiters(void *context)
112 { 112 {
113 wake_up_all(&_kmirrord_recovery_stopped); 113 wake_up_all(&_kmirrord_recovery_stopped);
114 } 114 }
115 115
116 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) 116 static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw)
117 { 117 {
118 unsigned long flags; 118 unsigned long flags;
119 int should_wake = 0; 119 int should_wake = 0;
120 struct bio_list *bl; 120 struct bio_list *bl;
121 121
122 bl = (rw == WRITE) ? &ms->writes : &ms->reads; 122 bl = (rw == WRITE) ? &ms->writes : &ms->reads;
123 spin_lock_irqsave(&ms->lock, flags); 123 spin_lock_irqsave(&ms->lock, flags);
124 should_wake = !(bl->head); 124 should_wake = !(bl->head);
125 bio_list_add(bl, bio); 125 bio_list_add(bl, bio);
126 spin_unlock_irqrestore(&ms->lock, flags); 126 spin_unlock_irqrestore(&ms->lock, flags);
127 127
128 if (should_wake) 128 if (should_wake)
129 wakeup_mirrord(ms); 129 wakeup_mirrord(ms);
130 } 130 }
131 131
132 static void dispatch_bios(void *context, struct bio_list *bio_list) 132 static void dispatch_bios(void *context, struct bio_list *bio_list)
133 { 133 {
134 struct mirror_set *ms = context; 134 struct mirror_set *ms = context;
135 struct bio *bio; 135 struct bio *bio;
136 136
137 while ((bio = bio_list_pop(bio_list))) 137 while ((bio = bio_list_pop(bio_list)))
138 queue_bio(ms, bio, WRITE); 138 queue_bio(ms, bio, WRITE);
139 } 139 }
140 140
141 #define MIN_READ_RECORDS 20 141 #define MIN_READ_RECORDS 20
142 struct dm_raid1_read_record { 142 struct dm_raid1_read_record {
143 struct mirror *m; 143 struct mirror *m;
144 struct dm_bio_details details; 144 struct dm_bio_details details;
145 }; 145 };
146 146
147 static struct kmem_cache *_dm_raid1_read_record_cache; 147 static struct kmem_cache *_dm_raid1_read_record_cache;
148 148
149 /* 149 /*
150 * Every mirror should look like this one. 150 * Every mirror should look like this one.
151 */ 151 */
152 #define DEFAULT_MIRROR 0 152 #define DEFAULT_MIRROR 0
153 153
154 /* 154 /*
155 * This is yucky. We squirrel the mirror struct away inside 155 * This is yucky. We squirrel the mirror struct away inside
156 * bi_next for read/write buffers. This is safe since the bh 156 * bi_next for read/write buffers. This is safe since the bh
157 * doesn't get submitted to the lower levels of block layer. 157 * doesn't get submitted to the lower levels of block layer.
158 */ 158 */
159 static struct mirror *bio_get_m(struct bio *bio) 159 static struct mirror *bio_get_m(struct bio *bio)
160 { 160 {
161 return (struct mirror *) bio->bi_next; 161 return (struct mirror *) bio->bi_next;
162 } 162 }
163 163
164 static void bio_set_m(struct bio *bio, struct mirror *m) 164 static void bio_set_m(struct bio *bio, struct mirror *m)
165 { 165 {
166 bio->bi_next = (struct bio *) m; 166 bio->bi_next = (struct bio *) m;
167 } 167 }
168 168
169 static struct mirror *get_default_mirror(struct mirror_set *ms) 169 static struct mirror *get_default_mirror(struct mirror_set *ms)
170 { 170 {
171 return &ms->mirror[atomic_read(&ms->default_mirror)]; 171 return &ms->mirror[atomic_read(&ms->default_mirror)];
172 } 172 }
173 173
174 static void set_default_mirror(struct mirror *m) 174 static void set_default_mirror(struct mirror *m)
175 { 175 {
176 struct mirror_set *ms = m->ms; 176 struct mirror_set *ms = m->ms;
177 struct mirror *m0 = &(ms->mirror[0]); 177 struct mirror *m0 = &(ms->mirror[0]);
178 178
179 atomic_set(&ms->default_mirror, m - m0); 179 atomic_set(&ms->default_mirror, m - m0);
180 } 180 }
181 181
182 /* fail_mirror 182 /* fail_mirror
183 * @m: mirror device to fail 183 * @m: mirror device to fail
184 * @error_type: one of the enum's, DM_RAID1_*_ERROR 184 * @error_type: one of the enum's, DM_RAID1_*_ERROR
185 * 185 *
186 * If errors are being handled, record the type of 186 * If errors are being handled, record the type of
187 * error encountered for this device. If this type 187 * error encountered for this device. If this type
188 * of error has already been recorded, we can return; 188 * of error has already been recorded, we can return;
189 * otherwise, we must signal userspace by triggering 189 * otherwise, we must signal userspace by triggering
190 * an event. Additionally, if the device is the 190 * an event. Additionally, if the device is the
191 * primary device, we must choose a new primary, but 191 * primary device, we must choose a new primary, but
192 * only if the mirror is in-sync. 192 * only if the mirror is in-sync.
193 * 193 *
194 * This function must not block. 194 * This function must not block.
195 */ 195 */
196 static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) 196 static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type)
197 { 197 {
198 struct mirror_set *ms = m->ms; 198 struct mirror_set *ms = m->ms;
199 struct mirror *new; 199 struct mirror *new;
200 200
201 /* 201 /*
202 * error_count is used for nothing more than a 202 * error_count is used for nothing more than a
203 * simple way to tell if a device has encountered 203 * simple way to tell if a device has encountered
204 * errors. 204 * errors.
205 */ 205 */
206 atomic_inc(&m->error_count); 206 atomic_inc(&m->error_count);
207 207
208 if (test_and_set_bit(error_type, &m->error_type)) 208 if (test_and_set_bit(error_type, &m->error_type))
209 return; 209 return;
210 210
211 if (!errors_handled(ms)) 211 if (!errors_handled(ms))
212 return; 212 return;
213 213
214 if (m != get_default_mirror(ms)) 214 if (m != get_default_mirror(ms))
215 goto out; 215 goto out;
216 216
217 if (!ms->in_sync) { 217 if (!ms->in_sync) {
218 /* 218 /*
219 * Better to issue requests to same failing device 219 * Better to issue requests to same failing device
220 * than to risk returning corrupt data. 220 * than to risk returning corrupt data.
221 */ 221 */
222 DMERR("Primary mirror (%s) failed while out-of-sync: " 222 DMERR("Primary mirror (%s) failed while out-of-sync: "
223 "Reads may fail.", m->dev->name); 223 "Reads may fail.", m->dev->name);
224 goto out; 224 goto out;
225 } 225 }
226 226
227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) 227 for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++)
228 if (!atomic_read(&new->error_count)) { 228 if (!atomic_read(&new->error_count)) {
229 set_default_mirror(new); 229 set_default_mirror(new);
230 break; 230 break;
231 } 231 }
232 232
233 if (unlikely(new == ms->mirror + ms->nr_mirrors)) 233 if (unlikely(new == ms->mirror + ms->nr_mirrors))
234 DMWARN("All sides of mirror have failed."); 234 DMWARN("All sides of mirror have failed.");
235 235
236 out: 236 out:
237 schedule_work(&ms->trigger_event); 237 schedule_work(&ms->trigger_event);
238 } 238 }
239 239
240 /*----------------------------------------------------------------- 240 /*-----------------------------------------------------------------
241 * Recovery. 241 * Recovery.
242 * 242 *
243 * When a mirror is first activated we may find that some regions 243 * When a mirror is first activated we may find that some regions
244 * are in the no-sync state. We have to recover these by 244 * are in the no-sync state. We have to recover these by
245 * recopying from the default mirror to all the others. 245 * recopying from the default mirror to all the others.
246 *---------------------------------------------------------------*/ 246 *---------------------------------------------------------------*/
247 static void recovery_complete(int read_err, unsigned long write_err, 247 static void recovery_complete(int read_err, unsigned long write_err,
248 void *context) 248 void *context)
249 { 249 {
250 struct dm_region *reg = context; 250 struct dm_region *reg = context;
251 struct mirror_set *ms = dm_rh_region_context(reg); 251 struct mirror_set *ms = dm_rh_region_context(reg);
252 int m, bit = 0; 252 int m, bit = 0;
253 253
254 if (read_err) { 254 if (read_err) {
255 /* Read error means the failure of default mirror. */ 255 /* Read error means the failure of default mirror. */
256 DMERR_LIMIT("Unable to read primary mirror during recovery"); 256 DMERR_LIMIT("Unable to read primary mirror during recovery");
257 fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); 257 fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR);
258 } 258 }
259 259
260 if (write_err) { 260 if (write_err) {
261 DMERR_LIMIT("Write error during recovery (error = 0x%lx)", 261 DMERR_LIMIT("Write error during recovery (error = 0x%lx)",
262 write_err); 262 write_err);
263 /* 263 /*
264 * Bits correspond to devices (excluding default mirror). 264 * Bits correspond to devices (excluding default mirror).
265 * The default mirror cannot change during recovery. 265 * The default mirror cannot change during recovery.
266 */ 266 */
267 for (m = 0; m < ms->nr_mirrors; m++) { 267 for (m = 0; m < ms->nr_mirrors; m++) {
268 if (&ms->mirror[m] == get_default_mirror(ms)) 268 if (&ms->mirror[m] == get_default_mirror(ms))
269 continue; 269 continue;
270 if (test_bit(bit, &write_err)) 270 if (test_bit(bit, &write_err))
271 fail_mirror(ms->mirror + m, 271 fail_mirror(ms->mirror + m,
272 DM_RAID1_SYNC_ERROR); 272 DM_RAID1_SYNC_ERROR);
273 bit++; 273 bit++;
274 } 274 }
275 } 275 }
276 276
277 dm_rh_recovery_end(reg, !(read_err || write_err)); 277 dm_rh_recovery_end(reg, !(read_err || write_err));
278 } 278 }
279 279
280 static int recover(struct mirror_set *ms, struct dm_region *reg) 280 static int recover(struct mirror_set *ms, struct dm_region *reg)
281 { 281 {
282 int r; 282 int r;
283 unsigned i; 283 unsigned i;
284 struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; 284 struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest;
285 struct mirror *m; 285 struct mirror *m;
286 unsigned long flags = 0; 286 unsigned long flags = 0;
287 region_t key = dm_rh_get_region_key(reg); 287 region_t key = dm_rh_get_region_key(reg);
288 sector_t region_size = dm_rh_get_region_size(ms->rh); 288 sector_t region_size = dm_rh_get_region_size(ms->rh);
289 289
290 /* fill in the source */ 290 /* fill in the source */
291 m = get_default_mirror(ms); 291 m = get_default_mirror(ms);
292 from.bdev = m->dev->bdev; 292 from.bdev = m->dev->bdev;
293 from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); 293 from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
294 if (key == (ms->nr_regions - 1)) { 294 if (key == (ms->nr_regions - 1)) {
295 /* 295 /*
296 * The final region may be smaller than 296 * The final region may be smaller than
297 * region_size. 297 * region_size.
298 */ 298 */
299 from.count = ms->ti->len & (region_size - 1); 299 from.count = ms->ti->len & (region_size - 1);
300 if (!from.count) 300 if (!from.count)
301 from.count = region_size; 301 from.count = region_size;
302 } else 302 } else
303 from.count = region_size; 303 from.count = region_size;
304 304
305 /* fill in the destinations */ 305 /* fill in the destinations */
306 for (i = 0, dest = to; i < ms->nr_mirrors; i++) { 306 for (i = 0, dest = to; i < ms->nr_mirrors; i++) {
307 if (&ms->mirror[i] == get_default_mirror(ms)) 307 if (&ms->mirror[i] == get_default_mirror(ms))
308 continue; 308 continue;
309 309
310 m = ms->mirror + i; 310 m = ms->mirror + i;
311 dest->bdev = m->dev->bdev; 311 dest->bdev = m->dev->bdev;
312 dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); 312 dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key);
313 dest->count = from.count; 313 dest->count = from.count;
314 dest++; 314 dest++;
315 } 315 }
316 316
317 /* hand to kcopyd */ 317 /* hand to kcopyd */
318 if (!errors_handled(ms)) 318 if (!errors_handled(ms))
319 set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); 319 set_bit(DM_KCOPYD_IGNORE_ERROR, &flags);
320 320
321 r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, 321 r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to,
322 flags, recovery_complete, reg); 322 flags, recovery_complete, reg);
323 323
324 return r; 324 return r;
325 } 325 }
326 326
327 static void do_recovery(struct mirror_set *ms) 327 static void do_recovery(struct mirror_set *ms)
328 { 328 {
329 struct dm_region *reg; 329 struct dm_region *reg;
330 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 330 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
331 int r; 331 int r;
332 332
333 /* 333 /*
334 * Start quiescing some regions. 334 * Start quiescing some regions.
335 */ 335 */
336 dm_rh_recovery_prepare(ms->rh); 336 dm_rh_recovery_prepare(ms->rh);
337 337
338 /* 338 /*
339 * Copy any already quiesced regions. 339 * Copy any already quiesced regions.
340 */ 340 */
341 while ((reg = dm_rh_recovery_start(ms->rh))) { 341 while ((reg = dm_rh_recovery_start(ms->rh))) {
342 r = recover(ms, reg); 342 r = recover(ms, reg);
343 if (r) 343 if (r)
344 dm_rh_recovery_end(reg, 0); 344 dm_rh_recovery_end(reg, 0);
345 } 345 }
346 346
347 /* 347 /*
348 * Update the in sync flag. 348 * Update the in sync flag.
349 */ 349 */
350 if (!ms->in_sync && 350 if (!ms->in_sync &&
351 (log->type->get_sync_count(log) == ms->nr_regions)) { 351 (log->type->get_sync_count(log) == ms->nr_regions)) {
352 /* the sync is complete */ 352 /* the sync is complete */
353 dm_table_event(ms->ti->table); 353 dm_table_event(ms->ti->table);
354 ms->in_sync = 1; 354 ms->in_sync = 1;
355 } 355 }
356 } 356 }
357 357
358 /*----------------------------------------------------------------- 358 /*-----------------------------------------------------------------
359 * Reads 359 * Reads
360 *---------------------------------------------------------------*/ 360 *---------------------------------------------------------------*/
361 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) 361 static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector)
362 { 362 {
363 struct mirror *m = get_default_mirror(ms); 363 struct mirror *m = get_default_mirror(ms);
364 364
365 do { 365 do {
366 if (likely(!atomic_read(&m->error_count))) 366 if (likely(!atomic_read(&m->error_count)))
367 return m; 367 return m;
368 368
369 if (m-- == ms->mirror) 369 if (m-- == ms->mirror)
370 m += ms->nr_mirrors; 370 m += ms->nr_mirrors;
371 } while (m != get_default_mirror(ms)); 371 } while (m != get_default_mirror(ms));
372 372
373 return NULL; 373 return NULL;
374 } 374 }
375 375
376 static int default_ok(struct mirror *m) 376 static int default_ok(struct mirror *m)
377 { 377 {
378 struct mirror *default_mirror = get_default_mirror(m->ms); 378 struct mirror *default_mirror = get_default_mirror(m->ms);
379 379
380 return !atomic_read(&default_mirror->error_count); 380 return !atomic_read(&default_mirror->error_count);
381 } 381 }
382 382
383 static int mirror_available(struct mirror_set *ms, struct bio *bio) 383 static int mirror_available(struct mirror_set *ms, struct bio *bio)
384 { 384 {
385 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 385 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
386 region_t region = dm_rh_bio_to_region(ms->rh, bio); 386 region_t region = dm_rh_bio_to_region(ms->rh, bio);
387 387
388 if (log->type->in_sync(log, region, 0)) 388 if (log->type->in_sync(log, region, 0))
389 return choose_mirror(ms, bio->bi_sector) ? 1 : 0; 389 return choose_mirror(ms, bio->bi_sector) ? 1 : 0;
390 390
391 return 0; 391 return 0;
392 } 392 }
393 393
394 /* 394 /*
395 * remap a buffer to a particular mirror. 395 * remap a buffer to a particular mirror.
396 */ 396 */
397 static sector_t map_sector(struct mirror *m, struct bio *bio) 397 static sector_t map_sector(struct mirror *m, struct bio *bio)
398 { 398 {
399 + if (unlikely(!bio->bi_size))
400 + return 0;
399 return m->offset + (bio->bi_sector - m->ms->ti->begin); 401 return m->offset + (bio->bi_sector - m->ms->ti->begin);
400 } 402 }
401 403
402 static void map_bio(struct mirror *m, struct bio *bio) 404 static void map_bio(struct mirror *m, struct bio *bio)
403 { 405 {
404 bio->bi_bdev = m->dev->bdev; 406 bio->bi_bdev = m->dev->bdev;
405 bio->bi_sector = map_sector(m, bio); 407 bio->bi_sector = map_sector(m, bio);
406 } 408 }
407 409
408 static void map_region(struct dm_io_region *io, struct mirror *m, 410 static void map_region(struct dm_io_region *io, struct mirror *m,
409 struct bio *bio) 411 struct bio *bio)
410 { 412 {
411 io->bdev = m->dev->bdev; 413 io->bdev = m->dev->bdev;
412 io->sector = map_sector(m, bio); 414 io->sector = map_sector(m, bio);
413 io->count = bio->bi_size >> 9; 415 io->count = bio->bi_size >> 9;
414 } 416 }
415 417
416 /*----------------------------------------------------------------- 418 /*-----------------------------------------------------------------
417 * Reads 419 * Reads
418 *---------------------------------------------------------------*/ 420 *---------------------------------------------------------------*/
419 static void read_callback(unsigned long error, void *context) 421 static void read_callback(unsigned long error, void *context)
420 { 422 {
421 struct bio *bio = context; 423 struct bio *bio = context;
422 struct mirror *m; 424 struct mirror *m;
423 425
424 m = bio_get_m(bio); 426 m = bio_get_m(bio);
425 bio_set_m(bio, NULL); 427 bio_set_m(bio, NULL);
426 428
427 if (likely(!error)) { 429 if (likely(!error)) {
428 bio_endio(bio, 0); 430 bio_endio(bio, 0);
429 return; 431 return;
430 } 432 }
431 433
432 fail_mirror(m, DM_RAID1_READ_ERROR); 434 fail_mirror(m, DM_RAID1_READ_ERROR);
433 435
434 if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { 436 if (likely(default_ok(m)) || mirror_available(m->ms, bio)) {
435 DMWARN_LIMIT("Read failure on mirror device %s. " 437 DMWARN_LIMIT("Read failure on mirror device %s. "
436 "Trying alternative device.", 438 "Trying alternative device.",
437 m->dev->name); 439 m->dev->name);
438 queue_bio(m->ms, bio, bio_rw(bio)); 440 queue_bio(m->ms, bio, bio_rw(bio));
439 return; 441 return;
440 } 442 }
441 443
442 DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", 444 DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.",
443 m->dev->name); 445 m->dev->name);
444 bio_endio(bio, -EIO); 446 bio_endio(bio, -EIO);
445 } 447 }
446 448
447 /* Asynchronous read. */ 449 /* Asynchronous read. */
448 static void read_async_bio(struct mirror *m, struct bio *bio) 450 static void read_async_bio(struct mirror *m, struct bio *bio)
449 { 451 {
450 struct dm_io_region io; 452 struct dm_io_region io;
451 struct dm_io_request io_req = { 453 struct dm_io_request io_req = {
452 .bi_rw = READ, 454 .bi_rw = READ,
453 .mem.type = DM_IO_BVEC, 455 .mem.type = DM_IO_BVEC,
454 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 456 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
455 .notify.fn = read_callback, 457 .notify.fn = read_callback,
456 .notify.context = bio, 458 .notify.context = bio,
457 .client = m->ms->io_client, 459 .client = m->ms->io_client,
458 }; 460 };
459 461
460 map_region(&io, m, bio); 462 map_region(&io, m, bio);
461 bio_set_m(bio, m); 463 bio_set_m(bio, m);
462 BUG_ON(dm_io(&io_req, 1, &io, NULL)); 464 BUG_ON(dm_io(&io_req, 1, &io, NULL));
463 } 465 }
464 466
465 static inline int region_in_sync(struct mirror_set *ms, region_t region, 467 static inline int region_in_sync(struct mirror_set *ms, region_t region,
466 int may_block) 468 int may_block)
467 { 469 {
468 int state = dm_rh_get_state(ms->rh, region, may_block); 470 int state = dm_rh_get_state(ms->rh, region, may_block);
469 return state == DM_RH_CLEAN || state == DM_RH_DIRTY; 471 return state == DM_RH_CLEAN || state == DM_RH_DIRTY;
470 } 472 }
471 473
472 static void do_reads(struct mirror_set *ms, struct bio_list *reads) 474 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
473 { 475 {
474 region_t region; 476 region_t region;
475 struct bio *bio; 477 struct bio *bio;
476 struct mirror *m; 478 struct mirror *m;
477 479
478 while ((bio = bio_list_pop(reads))) { 480 while ((bio = bio_list_pop(reads))) {
479 region = dm_rh_bio_to_region(ms->rh, bio); 481 region = dm_rh_bio_to_region(ms->rh, bio);
480 m = get_default_mirror(ms); 482 m = get_default_mirror(ms);
481 483
482 /* 484 /*
483 * We can only read balance if the region is in sync. 485 * We can only read balance if the region is in sync.
484 */ 486 */
485 if (likely(region_in_sync(ms, region, 1))) 487 if (likely(region_in_sync(ms, region, 1)))
486 m = choose_mirror(ms, bio->bi_sector); 488 m = choose_mirror(ms, bio->bi_sector);
487 else if (m && atomic_read(&m->error_count)) 489 else if (m && atomic_read(&m->error_count))
488 m = NULL; 490 m = NULL;
489 491
490 if (likely(m)) 492 if (likely(m))
491 read_async_bio(m, bio); 493 read_async_bio(m, bio);
492 else 494 else
493 bio_endio(bio, -EIO); 495 bio_endio(bio, -EIO);
494 } 496 }
495 } 497 }
496 498
497 /*----------------------------------------------------------------- 499 /*-----------------------------------------------------------------
498 * Writes. 500 * Writes.
499 * 501 *
500 * We do different things with the write io depending on the 502 * We do different things with the write io depending on the
501 * state of the region that it's in: 503 * state of the region that it's in:
502 * 504 *
503 * SYNC: increment pending, use kcopyd to write to *all* mirrors 505 * SYNC: increment pending, use kcopyd to write to *all* mirrors
504 * RECOVERING: delay the io until recovery completes 506 * RECOVERING: delay the io until recovery completes
505 * NOSYNC: increment pending, just write to the default mirror 507 * NOSYNC: increment pending, just write to the default mirror
506 *---------------------------------------------------------------*/ 508 *---------------------------------------------------------------*/
507 509
508 510
509 static void write_callback(unsigned long error, void *context) 511 static void write_callback(unsigned long error, void *context)
510 { 512 {
511 unsigned i, ret = 0; 513 unsigned i, ret = 0;
512 struct bio *bio = (struct bio *) context; 514 struct bio *bio = (struct bio *) context;
513 struct mirror_set *ms; 515 struct mirror_set *ms;
514 int uptodate = 0; 516 int uptodate = 0;
515 int should_wake = 0; 517 int should_wake = 0;
516 unsigned long flags; 518 unsigned long flags;
517 519
518 ms = bio_get_m(bio)->ms; 520 ms = bio_get_m(bio)->ms;
519 bio_set_m(bio, NULL); 521 bio_set_m(bio, NULL);
520 522
521 /* 523 /*
522 * NOTE: We don't decrement the pending count here, 524 * NOTE: We don't decrement the pending count here,
523 * instead it is done by the targets endio function. 525 * instead it is done by the targets endio function.
524 * This way we handle both writes to SYNC and NOSYNC 526 * This way we handle both writes to SYNC and NOSYNC
525 * regions with the same code. 527 * regions with the same code.
526 */ 528 */
527 if (likely(!error)) 529 if (likely(!error))
528 goto out; 530 goto out;
529 531
530 for (i = 0; i < ms->nr_mirrors; i++) 532 for (i = 0; i < ms->nr_mirrors; i++)
531 if (test_bit(i, &error)) 533 if (test_bit(i, &error))
532 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); 534 fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR);
533 else 535 else
534 uptodate = 1; 536 uptodate = 1;
535 537
536 if (unlikely(!uptodate)) { 538 if (unlikely(!uptodate)) {
537 DMERR("All replicated volumes dead, failing I/O"); 539 DMERR("All replicated volumes dead, failing I/O");
538 /* None of the writes succeeded, fail the I/O. */ 540 /* None of the writes succeeded, fail the I/O. */
539 ret = -EIO; 541 ret = -EIO;
540 } else if (errors_handled(ms)) { 542 } else if (errors_handled(ms)) {
541 /* 543 /*
542 * Need to raise event. Since raising 544 * Need to raise event. Since raising
543 * events can block, we need to do it in 545 * events can block, we need to do it in
544 * the main thread. 546 * the main thread.
545 */ 547 */
546 spin_lock_irqsave(&ms->lock, flags); 548 spin_lock_irqsave(&ms->lock, flags);
547 if (!ms->failures.head) 549 if (!ms->failures.head)
548 should_wake = 1; 550 should_wake = 1;
549 bio_list_add(&ms->failures, bio); 551 bio_list_add(&ms->failures, bio);
550 spin_unlock_irqrestore(&ms->lock, flags); 552 spin_unlock_irqrestore(&ms->lock, flags);
551 if (should_wake) 553 if (should_wake)
552 wakeup_mirrord(ms); 554 wakeup_mirrord(ms);
553 return; 555 return;
554 } 556 }
555 out: 557 out:
556 bio_endio(bio, ret); 558 bio_endio(bio, ret);
557 } 559 }
558 560
559 static void do_write(struct mirror_set *ms, struct bio *bio) 561 static void do_write(struct mirror_set *ms, struct bio *bio)
560 { 562 {
561 unsigned int i; 563 unsigned int i;
562 struct dm_io_region io[ms->nr_mirrors], *dest = io; 564 struct dm_io_region io[ms->nr_mirrors], *dest = io;
563 struct mirror *m; 565 struct mirror *m;
564 struct dm_io_request io_req = { 566 struct dm_io_request io_req = {
565 - .bi_rw = WRITE,
567 + .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
566 .mem.type = DM_IO_BVEC, 568 .mem.type = DM_IO_BVEC,
567 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, 569 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
568 .notify.fn = write_callback, 570 .notify.fn = write_callback,
569 .notify.context = bio, 571 .notify.context = bio,
570 .client = ms->io_client, 572 .client = ms->io_client,
571 }; 573 };
572 574
573 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) 575 for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
574 map_region(dest++, m, bio); 576 map_region(dest++, m, bio);
575 577
576 /* 578 /*
577 * Use default mirror because we only need it to retrieve the reference 579 * Use default mirror because we only need it to retrieve the reference
578 * to the mirror set in write_callback(). 580 * to the mirror set in write_callback().
579 */ 581 */
580 bio_set_m(bio, get_default_mirror(ms)); 582 bio_set_m(bio, get_default_mirror(ms));
581 583
582 BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); 584 BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
583 } 585 }
584 586
585 static void do_writes(struct mirror_set *ms, struct bio_list *writes) 587 static void do_writes(struct mirror_set *ms, struct bio_list *writes)
586 { 588 {
587 int state; 589 int state;
588 struct bio *bio; 590 struct bio *bio;
589 struct bio_list sync, nosync, recover, *this_list = NULL; 591 struct bio_list sync, nosync, recover, *this_list = NULL;
590 struct bio_list requeue; 592 struct bio_list requeue;
591 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 593 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
592 region_t region; 594 region_t region;
593 595
594 if (!writes->head) 596 if (!writes->head)
595 return; 597 return;
596 598
597 /* 599 /*
598 * Classify each write. 600 * Classify each write.
599 */ 601 */
600 bio_list_init(&sync); 602 bio_list_init(&sync);
601 bio_list_init(&nosync); 603 bio_list_init(&nosync);
602 bio_list_init(&recover); 604 bio_list_init(&recover);
603 bio_list_init(&requeue); 605 bio_list_init(&requeue);
604 606
605 while ((bio = bio_list_pop(writes))) { 607 while ((bio = bio_list_pop(writes))) {
608 + if (unlikely(bio_empty_barrier(bio))) {
609 + bio_list_add(&sync, bio);
610 + continue;
611 + }
612 +
606 region = dm_rh_bio_to_region(ms->rh, bio); 613 region = dm_rh_bio_to_region(ms->rh, bio);
607 614
608 if (log->type->is_remote_recovering && 615 if (log->type->is_remote_recovering &&
609 log->type->is_remote_recovering(log, region)) { 616 log->type->is_remote_recovering(log, region)) {
610 bio_list_add(&requeue, bio); 617 bio_list_add(&requeue, bio);
611 continue; 618 continue;
612 } 619 }
613 620
614 state = dm_rh_get_state(ms->rh, region, 1); 621 state = dm_rh_get_state(ms->rh, region, 1);
615 switch (state) { 622 switch (state) {
616 case DM_RH_CLEAN: 623 case DM_RH_CLEAN:
617 case DM_RH_DIRTY: 624 case DM_RH_DIRTY:
618 this_list = &sync; 625 this_list = &sync;
619 break; 626 break;
620 627
621 case DM_RH_NOSYNC: 628 case DM_RH_NOSYNC:
622 this_list = &nosync; 629 this_list = &nosync;
623 break; 630 break;
624 631
625 case DM_RH_RECOVERING: 632 case DM_RH_RECOVERING:
626 this_list = &recover; 633 this_list = &recover;
627 break; 634 break;
628 } 635 }
629 636
630 bio_list_add(this_list, bio); 637 bio_list_add(this_list, bio);
631 } 638 }
632 639
633 /* 640 /*
634 * Add bios that are delayed due to remote recovery 641 * Add bios that are delayed due to remote recovery
635 * back on to the write queue 642 * back on to the write queue
636 */ 643 */
637 if (unlikely(requeue.head)) { 644 if (unlikely(requeue.head)) {
638 spin_lock_irq(&ms->lock); 645 spin_lock_irq(&ms->lock);
639 bio_list_merge(&ms->writes, &requeue); 646 bio_list_merge(&ms->writes, &requeue);
640 spin_unlock_irq(&ms->lock); 647 spin_unlock_irq(&ms->lock);
641 delayed_wake(ms); 648 delayed_wake(ms);
642 } 649 }
643 650
644 /* 651 /*
645 * Increment the pending counts for any regions that will 652 * Increment the pending counts for any regions that will
646 * be written to (writes to recover regions are going to 653 * be written to (writes to recover regions are going to
647 * be delayed). 654 * be delayed).
648 */ 655 */
649 dm_rh_inc_pending(ms->rh, &sync); 656 dm_rh_inc_pending(ms->rh, &sync);
650 dm_rh_inc_pending(ms->rh, &nosync); 657 dm_rh_inc_pending(ms->rh, &nosync);
651 658
652 /* 659 /*
653 * If the flush fails on a previous call and succeeds here, 660 * If the flush fails on a previous call and succeeds here,
654 * we must not reset the log_failure variable. We need 661 * we must not reset the log_failure variable. We need
655 * userspace interaction to do that. 662 * userspace interaction to do that.
656 */ 663 */
657 ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; 664 ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure;
658 665
659 /* 666 /*
660 * Dispatch io. 667 * Dispatch io.
661 */ 668 */
662 if (unlikely(ms->log_failure)) { 669 if (unlikely(ms->log_failure)) {
663 spin_lock_irq(&ms->lock); 670 spin_lock_irq(&ms->lock);
664 bio_list_merge(&ms->failures, &sync); 671 bio_list_merge(&ms->failures, &sync);
665 spin_unlock_irq(&ms->lock); 672 spin_unlock_irq(&ms->lock);
666 wakeup_mirrord(ms); 673 wakeup_mirrord(ms);
667 } else 674 } else
668 while ((bio = bio_list_pop(&sync))) 675 while ((bio = bio_list_pop(&sync)))
669 do_write(ms, bio); 676 do_write(ms, bio);
670 677
671 while ((bio = bio_list_pop(&recover))) 678 while ((bio = bio_list_pop(&recover)))
672 dm_rh_delay(ms->rh, bio); 679 dm_rh_delay(ms->rh, bio);
673 680
674 while ((bio = bio_list_pop(&nosync))) { 681 while ((bio = bio_list_pop(&nosync))) {
675 map_bio(get_default_mirror(ms), bio); 682 map_bio(get_default_mirror(ms), bio);
676 generic_make_request(bio); 683 generic_make_request(bio);
677 } 684 }
678 } 685 }
679 686
680 static void do_failures(struct mirror_set *ms, struct bio_list *failures) 687 static void do_failures(struct mirror_set *ms, struct bio_list *failures)
681 { 688 {
682 struct bio *bio; 689 struct bio *bio;
683 690
684 if (!failures->head) 691 if (!failures->head)
685 return; 692 return;
686 693
687 if (!ms->log_failure) { 694 if (!ms->log_failure) {
688 while ((bio = bio_list_pop(failures))) { 695 while ((bio = bio_list_pop(failures))) {
689 ms->in_sync = 0; 696 ms->in_sync = 0;
690 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); 697 dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0);
691 } 698 }
692 return; 699 return;
693 } 700 }
694 701
695 /* 702 /*
696 * If the log has failed, unattempted writes are being 703 * If the log has failed, unattempted writes are being
697 * put on the failures list. We can't issue those writes 704 * put on the failures list. We can't issue those writes
698 * until a log has been marked, so we must store them. 705 * until a log has been marked, so we must store them.
699 * 706 *
700 * If a 'noflush' suspend is in progress, we can requeue 707 * If a 'noflush' suspend is in progress, we can requeue
701 * the I/O's to the core. This give userspace a chance 708 * the I/O's to the core. This give userspace a chance
702 * to reconfigure the mirror, at which point the core 709 * to reconfigure the mirror, at which point the core
703 * will reissue the writes. If the 'noflush' flag is 710 * will reissue the writes. If the 'noflush' flag is
704 * not set, we have no choice but to return errors. 711 * not set, we have no choice but to return errors.
705 * 712 *
706 * Some writes on the failures list may have been 713 * Some writes on the failures list may have been
707 * submitted before the log failure and represent a 714 * submitted before the log failure and represent a
708 * failure to write to one of the devices. It is ok 715 * failure to write to one of the devices. It is ok
709 * for us to treat them the same and requeue them 716 * for us to treat them the same and requeue them
710 * as well. 717 * as well.
711 */ 718 */
712 if (dm_noflush_suspending(ms->ti)) { 719 if (dm_noflush_suspending(ms->ti)) {
713 while ((bio = bio_list_pop(failures))) 720 while ((bio = bio_list_pop(failures)))
714 bio_endio(bio, DM_ENDIO_REQUEUE); 721 bio_endio(bio, DM_ENDIO_REQUEUE);
715 return; 722 return;
716 } 723 }
717 724
718 if (atomic_read(&ms->suspend)) { 725 if (atomic_read(&ms->suspend)) {
719 while ((bio = bio_list_pop(failures))) 726 while ((bio = bio_list_pop(failures)))
720 bio_endio(bio, -EIO); 727 bio_endio(bio, -EIO);
721 return; 728 return;
722 } 729 }
723 730
724 spin_lock_irq(&ms->lock); 731 spin_lock_irq(&ms->lock);
725 bio_list_merge(&ms->failures, failures); 732 bio_list_merge(&ms->failures, failures);
726 spin_unlock_irq(&ms->lock); 733 spin_unlock_irq(&ms->lock);
727 734
728 delayed_wake(ms); 735 delayed_wake(ms);
729 } 736 }
730 737
731 static void trigger_event(struct work_struct *work) 738 static void trigger_event(struct work_struct *work)
732 { 739 {
733 struct mirror_set *ms = 740 struct mirror_set *ms =
734 container_of(work, struct mirror_set, trigger_event); 741 container_of(work, struct mirror_set, trigger_event);
735 742
736 dm_table_event(ms->ti->table); 743 dm_table_event(ms->ti->table);
737 } 744 }
738 745
739 /*----------------------------------------------------------------- 746 /*-----------------------------------------------------------------
740 * kmirrord 747 * kmirrord
741 *---------------------------------------------------------------*/ 748 *---------------------------------------------------------------*/
742 static void do_mirror(struct work_struct *work) 749 static void do_mirror(struct work_struct *work)
743 { 750 {
744 struct mirror_set *ms = container_of(work, struct mirror_set, 751 struct mirror_set *ms = container_of(work, struct mirror_set,
745 kmirrord_work); 752 kmirrord_work);
746 struct bio_list reads, writes, failures; 753 struct bio_list reads, writes, failures;
747 unsigned long flags; 754 unsigned long flags;
748 755
749 spin_lock_irqsave(&ms->lock, flags); 756 spin_lock_irqsave(&ms->lock, flags);
750 reads = ms->reads; 757 reads = ms->reads;
751 writes = ms->writes; 758 writes = ms->writes;
752 failures = ms->failures; 759 failures = ms->failures;
753 bio_list_init(&ms->reads); 760 bio_list_init(&ms->reads);
754 bio_list_init(&ms->writes); 761 bio_list_init(&ms->writes);
755 bio_list_init(&ms->failures); 762 bio_list_init(&ms->failures);
756 spin_unlock_irqrestore(&ms->lock, flags); 763 spin_unlock_irqrestore(&ms->lock, flags);
757 764
758 dm_rh_update_states(ms->rh, errors_handled(ms)); 765 dm_rh_update_states(ms->rh, errors_handled(ms));
759 do_recovery(ms); 766 do_recovery(ms);
760 do_reads(ms, &reads); 767 do_reads(ms, &reads);
761 do_writes(ms, &writes); 768 do_writes(ms, &writes);
762 do_failures(ms, &failures); 769 do_failures(ms, &failures);
763 770
764 dm_table_unplug_all(ms->ti->table); 771 dm_table_unplug_all(ms->ti->table);
765 } 772 }
766 773
767 /*----------------------------------------------------------------- 774 /*-----------------------------------------------------------------
768 * Target functions 775 * Target functions
769 *---------------------------------------------------------------*/ 776 *---------------------------------------------------------------*/
770 static struct mirror_set *alloc_context(unsigned int nr_mirrors, 777 static struct mirror_set *alloc_context(unsigned int nr_mirrors,
771 uint32_t region_size, 778 uint32_t region_size,
772 struct dm_target *ti, 779 struct dm_target *ti,
773 struct dm_dirty_log *dl) 780 struct dm_dirty_log *dl)
774 { 781 {
775 size_t len; 782 size_t len;
776 struct mirror_set *ms = NULL; 783 struct mirror_set *ms = NULL;
777 784
778 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); 785 len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors);
779 786
780 ms = kzalloc(len, GFP_KERNEL); 787 ms = kzalloc(len, GFP_KERNEL);
781 if (!ms) { 788 if (!ms) {
782 ti->error = "Cannot allocate mirror context"; 789 ti->error = "Cannot allocate mirror context";
783 return NULL; 790 return NULL;
784 } 791 }
785 792
786 spin_lock_init(&ms->lock); 793 spin_lock_init(&ms->lock);
787 794
788 ms->ti = ti; 795 ms->ti = ti;
789 ms->nr_mirrors = nr_mirrors; 796 ms->nr_mirrors = nr_mirrors;
790 ms->nr_regions = dm_sector_div_up(ti->len, region_size); 797 ms->nr_regions = dm_sector_div_up(ti->len, region_size);
791 ms->in_sync = 0; 798 ms->in_sync = 0;
792 ms->log_failure = 0; 799 ms->log_failure = 0;
793 atomic_set(&ms->suspend, 0); 800 atomic_set(&ms->suspend, 0);
794 atomic_set(&ms->default_mirror, DEFAULT_MIRROR); 801 atomic_set(&ms->default_mirror, DEFAULT_MIRROR);
795 802
796 ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, 803 ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS,
797 _dm_raid1_read_record_cache); 804 _dm_raid1_read_record_cache);
798 805
799 if (!ms->read_record_pool) { 806 if (!ms->read_record_pool) {
800 ti->error = "Error creating mirror read_record_pool"; 807 ti->error = "Error creating mirror read_record_pool";
801 kfree(ms); 808 kfree(ms);
802 return NULL; 809 return NULL;
803 } 810 }
804 811
805 ms->io_client = dm_io_client_create(DM_IO_PAGES); 812 ms->io_client = dm_io_client_create(DM_IO_PAGES);
806 if (IS_ERR(ms->io_client)) { 813 if (IS_ERR(ms->io_client)) {
807 ti->error = "Error creating dm_io client"; 814 ti->error = "Error creating dm_io client";
808 mempool_destroy(ms->read_record_pool); 815 mempool_destroy(ms->read_record_pool);
809 kfree(ms); 816 kfree(ms);
810 return NULL; 817 return NULL;
811 } 818 }
812 819
813 ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, 820 ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord,
814 wakeup_all_recovery_waiters, 821 wakeup_all_recovery_waiters,
815 ms->ti->begin, MAX_RECOVERY, 822 ms->ti->begin, MAX_RECOVERY,
816 dl, region_size, ms->nr_regions); 823 dl, region_size, ms->nr_regions);
817 if (IS_ERR(ms->rh)) { 824 if (IS_ERR(ms->rh)) {
818 ti->error = "Error creating dirty region hash"; 825 ti->error = "Error creating dirty region hash";
819 dm_io_client_destroy(ms->io_client); 826 dm_io_client_destroy(ms->io_client);
820 mempool_destroy(ms->read_record_pool); 827 mempool_destroy(ms->read_record_pool);
821 kfree(ms); 828 kfree(ms);
822 return NULL; 829 return NULL;
823 } 830 }
824 831
825 return ms; 832 return ms;
826 } 833 }
827 834
828 static void free_context(struct mirror_set *ms, struct dm_target *ti, 835 static void free_context(struct mirror_set *ms, struct dm_target *ti,
829 unsigned int m) 836 unsigned int m)
830 { 837 {
831 while (m--) 838 while (m--)
832 dm_put_device(ti, ms->mirror[m].dev); 839 dm_put_device(ti, ms->mirror[m].dev);
833 840
834 dm_io_client_destroy(ms->io_client); 841 dm_io_client_destroy(ms->io_client);
835 dm_region_hash_destroy(ms->rh); 842 dm_region_hash_destroy(ms->rh);
836 mempool_destroy(ms->read_record_pool); 843 mempool_destroy(ms->read_record_pool);
837 kfree(ms); 844 kfree(ms);
838 } 845 }
839 846
840 static int get_mirror(struct mirror_set *ms, struct dm_target *ti, 847 static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
841 unsigned int mirror, char **argv) 848 unsigned int mirror, char **argv)
842 { 849 {
843 unsigned long long offset; 850 unsigned long long offset;
844 851
845 if (sscanf(argv[1], "%llu", &offset) != 1) { 852 if (sscanf(argv[1], "%llu", &offset) != 1) {
846 ti->error = "Invalid offset"; 853 ti->error = "Invalid offset";
847 return -EINVAL; 854 return -EINVAL;
848 } 855 }
849 856
850 if (dm_get_device(ti, argv[0], offset, ti->len, 857 if (dm_get_device(ti, argv[0], offset, ti->len,
851 dm_table_get_mode(ti->table), 858 dm_table_get_mode(ti->table),
852 &ms->mirror[mirror].dev)) { 859 &ms->mirror[mirror].dev)) {
853 ti->error = "Device lookup failure"; 860 ti->error = "Device lookup failure";
854 return -ENXIO; 861 return -ENXIO;
855 } 862 }
856 863
857 ms->mirror[mirror].ms = ms; 864 ms->mirror[mirror].ms = ms;
858 atomic_set(&(ms->mirror[mirror].error_count), 0); 865 atomic_set(&(ms->mirror[mirror].error_count), 0);
859 ms->mirror[mirror].error_type = 0; 866 ms->mirror[mirror].error_type = 0;
860 ms->mirror[mirror].offset = offset; 867 ms->mirror[mirror].offset = offset;
861 868
862 return 0; 869 return 0;
863 } 870 }
864 871
865 /* 872 /*
866 * Create dirty log: log_type #log_params <log_params> 873 * Create dirty log: log_type #log_params <log_params>
867 */ 874 */
868 static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, 875 static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
869 unsigned argc, char **argv, 876 unsigned argc, char **argv,
870 unsigned *args_used) 877 unsigned *args_used)
871 { 878 {
872 unsigned param_count; 879 unsigned param_count;
873 struct dm_dirty_log *dl; 880 struct dm_dirty_log *dl;
874 881
875 if (argc < 2) { 882 if (argc < 2) {
876 ti->error = "Insufficient mirror log arguments"; 883 ti->error = "Insufficient mirror log arguments";
877 return NULL; 884 return NULL;
878 } 885 }
879 886
880 if (sscanf(argv[1], "%u", &param_count) != 1) { 887 if (sscanf(argv[1], "%u", &param_count) != 1) {
881 ti->error = "Invalid mirror log argument count"; 888 ti->error = "Invalid mirror log argument count";
882 return NULL; 889 return NULL;
883 } 890 }
884 891
885 *args_used = 2 + param_count; 892 *args_used = 2 + param_count;
886 893
887 if (argc < *args_used) { 894 if (argc < *args_used) {
888 ti->error = "Insufficient mirror log arguments"; 895 ti->error = "Insufficient mirror log arguments";
889 return NULL; 896 return NULL;
890 } 897 }
891 898
892 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); 899 dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2);
893 if (!dl) { 900 if (!dl) {
894 ti->error = "Error creating mirror dirty log"; 901 ti->error = "Error creating mirror dirty log";
895 return NULL; 902 return NULL;
896 } 903 }
897 904
898 return dl; 905 return dl;
899 } 906 }
900 907
901 static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, 908 static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
902 unsigned *args_used) 909 unsigned *args_used)
903 { 910 {
904 unsigned num_features; 911 unsigned num_features;
905 struct dm_target *ti = ms->ti; 912 struct dm_target *ti = ms->ti;
906 913
907 *args_used = 0; 914 *args_used = 0;
908 915
909 if (!argc) 916 if (!argc)
910 return 0; 917 return 0;
911 918
912 if (sscanf(argv[0], "%u", &num_features) != 1) { 919 if (sscanf(argv[0], "%u", &num_features) != 1) {
913 ti->error = "Invalid number of features"; 920 ti->error = "Invalid number of features";
914 return -EINVAL; 921 return -EINVAL;
915 } 922 }
916 923
917 argc--; 924 argc--;
918 argv++; 925 argv++;
919 (*args_used)++; 926 (*args_used)++;
920 927
921 if (num_features > argc) { 928 if (num_features > argc) {
922 ti->error = "Not enough arguments to support feature count"; 929 ti->error = "Not enough arguments to support feature count";
923 return -EINVAL; 930 return -EINVAL;
924 } 931 }
925 932
926 if (!strcmp("handle_errors", argv[0])) 933 if (!strcmp("handle_errors", argv[0]))
927 ms->features |= DM_RAID1_HANDLE_ERRORS; 934 ms->features |= DM_RAID1_HANDLE_ERRORS;
928 else { 935 else {
929 ti->error = "Unrecognised feature requested"; 936 ti->error = "Unrecognised feature requested";
930 return -EINVAL; 937 return -EINVAL;
931 } 938 }
932 939
933 (*args_used)++; 940 (*args_used)++;
934 941
935 return 0; 942 return 0;
936 } 943 }
937 944
938 /* 945 /*
939 * Construct a mirror mapping: 946 * Construct a mirror mapping:
940 * 947 *
941 * log_type #log_params <log_params> 948 * log_type #log_params <log_params>
942 * #mirrors [mirror_path offset]{2,} 949 * #mirrors [mirror_path offset]{2,}
943 * [#features <features>] 950 * [#features <features>]
944 * 951 *
945 * log_type is "core" or "disk" 952 * log_type is "core" or "disk"
946 * #log_params is between 1 and 3 953 * #log_params is between 1 and 3
947 * 954 *
948 * If present, features must be "handle_errors". 955 * If present, features must be "handle_errors".
949 */ 956 */
950 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) 957 static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
951 { 958 {
952 int r; 959 int r;
953 unsigned int nr_mirrors, m, args_used; 960 unsigned int nr_mirrors, m, args_used;
954 struct mirror_set *ms; 961 struct mirror_set *ms;
955 struct dm_dirty_log *dl; 962 struct dm_dirty_log *dl;
956 963
957 dl = create_dirty_log(ti, argc, argv, &args_used); 964 dl = create_dirty_log(ti, argc, argv, &args_used);
958 if (!dl) 965 if (!dl)
959 return -EINVAL; 966 return -EINVAL;
960 967
961 argv += args_used; 968 argv += args_used;
962 argc -= args_used; 969 argc -= args_used;
963 970
964 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 971 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 ||
965 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 972 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
966 ti->error = "Invalid number of mirrors"; 973 ti->error = "Invalid number of mirrors";
967 dm_dirty_log_destroy(dl); 974 dm_dirty_log_destroy(dl);
968 return -EINVAL; 975 return -EINVAL;
969 } 976 }
970 977
971 argv++, argc--; 978 argv++, argc--;
972 979
973 if (argc < nr_mirrors * 2) { 980 if (argc < nr_mirrors * 2) {
974 ti->error = "Too few mirror arguments"; 981 ti->error = "Too few mirror arguments";
975 dm_dirty_log_destroy(dl); 982 dm_dirty_log_destroy(dl);
976 return -EINVAL; 983 return -EINVAL;
977 } 984 }
978 985
979 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); 986 ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl);
980 if (!ms) { 987 if (!ms) {
981 dm_dirty_log_destroy(dl); 988 dm_dirty_log_destroy(dl);
982 return -ENOMEM; 989 return -ENOMEM;
983 } 990 }
984 991
985 /* Get the mirror parameter sets */ 992 /* Get the mirror parameter sets */
986 for (m = 0; m < nr_mirrors; m++) { 993 for (m = 0; m < nr_mirrors; m++) {
987 r = get_mirror(ms, ti, m, argv); 994 r = get_mirror(ms, ti, m, argv);
988 if (r) { 995 if (r) {
989 free_context(ms, ti, m); 996 free_context(ms, ti, m);
990 return r; 997 return r;
991 } 998 }
992 argv += 2; 999 argv += 2;
993 argc -= 2; 1000 argc -= 2;
994 } 1001 }
995 1002
996 ti->private = ms; 1003 ti->private = ms;
997 ti->split_io = dm_rh_get_region_size(ms->rh); 1004 ti->split_io = dm_rh_get_region_size(ms->rh);
1005 + ti->num_flush_requests = 1;
998 1006
999 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); 1007 ms->kmirrord_wq = create_singlethread_workqueue("kmirrord");
1000 if (!ms->kmirrord_wq) { 1008 if (!ms->kmirrord_wq) {
1001 DMERR("couldn't start kmirrord"); 1009 DMERR("couldn't start kmirrord");
1002 r = -ENOMEM; 1010 r = -ENOMEM;
1003 goto err_free_context; 1011 goto err_free_context;
1004 } 1012 }
1005 INIT_WORK(&ms->kmirrord_work, do_mirror); 1013 INIT_WORK(&ms->kmirrord_work, do_mirror);
1006 init_timer(&ms->timer); 1014 init_timer(&ms->timer);
1007 ms->timer_pending = 0; 1015 ms->timer_pending = 0;
1008 INIT_WORK(&ms->trigger_event, trigger_event); 1016 INIT_WORK(&ms->trigger_event, trigger_event);
1009 1017
1010 r = parse_features(ms, argc, argv, &args_used); 1018 r = parse_features(ms, argc, argv, &args_used);
1011 if (r) 1019 if (r)
1012 goto err_destroy_wq; 1020 goto err_destroy_wq;
1013 1021
1014 argv += args_used; 1022 argv += args_used;
1015 argc -= args_used; 1023 argc -= args_used;
1016 1024
1017 /* 1025 /*
1018 * Any read-balancing addition depends on the 1026 * Any read-balancing addition depends on the
1019 * DM_RAID1_HANDLE_ERRORS flag being present. 1027 * DM_RAID1_HANDLE_ERRORS flag being present.
1020 * This is because the decision to balance depends 1028 * This is because the decision to balance depends
1021 * on the sync state of a region. If the above 1029 * on the sync state of a region. If the above
1022 * flag is not present, we ignore errors; and 1030 * flag is not present, we ignore errors; and
1023 * the sync state may be inaccurate. 1031 * the sync state may be inaccurate.
1024 */ 1032 */
1025 1033
1026 if (argc) { 1034 if (argc) {
1027 ti->error = "Too many mirror arguments"; 1035 ti->error = "Too many mirror arguments";
1028 r = -EINVAL; 1036 r = -EINVAL;
1029 goto err_destroy_wq; 1037 goto err_destroy_wq;
1030 } 1038 }
1031 1039
1032 r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); 1040 r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client);
1033 if (r) 1041 if (r)
1034 goto err_destroy_wq; 1042 goto err_destroy_wq;
1035 1043
1036 wakeup_mirrord(ms); 1044 wakeup_mirrord(ms);
1037 return 0; 1045 return 0;
1038 1046
1039 err_destroy_wq: 1047 err_destroy_wq:
1040 destroy_workqueue(ms->kmirrord_wq); 1048 destroy_workqueue(ms->kmirrord_wq);
1041 err_free_context: 1049 err_free_context:
1042 free_context(ms, ti, ms->nr_mirrors); 1050 free_context(ms, ti, ms->nr_mirrors);
1043 return r; 1051 return r;
1044 } 1052 }
1045 1053
1046 static void mirror_dtr(struct dm_target *ti) 1054 static void mirror_dtr(struct dm_target *ti)
1047 { 1055 {
1048 struct mirror_set *ms = (struct mirror_set *) ti->private; 1056 struct mirror_set *ms = (struct mirror_set *) ti->private;
1049 1057
1050 del_timer_sync(&ms->timer); 1058 del_timer_sync(&ms->timer);
1051 flush_workqueue(ms->kmirrord_wq); 1059 flush_workqueue(ms->kmirrord_wq);
1052 flush_scheduled_work(); 1060 flush_scheduled_work();
1053 dm_kcopyd_client_destroy(ms->kcopyd_client); 1061 dm_kcopyd_client_destroy(ms->kcopyd_client);
1054 destroy_workqueue(ms->kmirrord_wq); 1062 destroy_workqueue(ms->kmirrord_wq);
1055 free_context(ms, ti, ms->nr_mirrors); 1063 free_context(ms, ti, ms->nr_mirrors);
1056 } 1064 }
1057 1065
1058 /* 1066 /*
1059 * Mirror mapping function 1067 * Mirror mapping function
1060 */ 1068 */
1061 static int mirror_map(struct dm_target *ti, struct bio *bio, 1069 static int mirror_map(struct dm_target *ti, struct bio *bio,
1062 union map_info *map_context) 1070 union map_info *map_context)
1063 { 1071 {
1064 int r, rw = bio_rw(bio); 1072 int r, rw = bio_rw(bio);
1065 struct mirror *m; 1073 struct mirror *m;
1066 struct mirror_set *ms = ti->private; 1074 struct mirror_set *ms = ti->private;
1067 struct dm_raid1_read_record *read_record = NULL; 1075 struct dm_raid1_read_record *read_record = NULL;
1068 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1076 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1069 1077
1070 if (rw == WRITE) { 1078 if (rw == WRITE) {
1071 /* Save region for mirror_end_io() handler */ 1079 /* Save region for mirror_end_io() handler */
1072 map_context->ll = dm_rh_bio_to_region(ms->rh, bio); 1080 map_context->ll = dm_rh_bio_to_region(ms->rh, bio);
1073 queue_bio(ms, bio, rw); 1081 queue_bio(ms, bio, rw);
1074 return DM_MAPIO_SUBMITTED; 1082 return DM_MAPIO_SUBMITTED;
1075 } 1083 }
1076 1084
1077 r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); 1085 r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0);
1078 if (r < 0 && r != -EWOULDBLOCK) 1086 if (r < 0 && r != -EWOULDBLOCK)
1079 return r; 1087 return r;
1080 1088
1081 /* 1089 /*
1082 * If region is not in-sync queue the bio. 1090 * If region is not in-sync queue the bio.
1083 */ 1091 */
1084 if (!r || (r == -EWOULDBLOCK)) { 1092 if (!r || (r == -EWOULDBLOCK)) {
1085 if (rw == READA) 1093 if (rw == READA)
1086 return -EWOULDBLOCK; 1094 return -EWOULDBLOCK;
1087 1095
1088 queue_bio(ms, bio, rw); 1096 queue_bio(ms, bio, rw);
1089 return DM_MAPIO_SUBMITTED; 1097 return DM_MAPIO_SUBMITTED;
1090 } 1098 }
1091 1099
1092 /* 1100 /*
1093 * The region is in-sync and we can perform reads directly. 1101 * The region is in-sync and we can perform reads directly.
1094 * Store enough information so we can retry if it fails. 1102 * Store enough information so we can retry if it fails.
1095 */ 1103 */
1096 m = choose_mirror(ms, bio->bi_sector); 1104 m = choose_mirror(ms, bio->bi_sector);
1097 if (unlikely(!m)) 1105 if (unlikely(!m))
1098 return -EIO; 1106 return -EIO;
1099 1107
1100 read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); 1108 read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO);
1101 if (likely(read_record)) { 1109 if (likely(read_record)) {
1102 dm_bio_record(&read_record->details, bio); 1110 dm_bio_record(&read_record->details, bio);
1103 map_context->ptr = read_record; 1111 map_context->ptr = read_record;
1104 read_record->m = m; 1112 read_record->m = m;
1105 } 1113 }
1106 1114
1107 map_bio(m, bio); 1115 map_bio(m, bio);
1108 1116
1109 return DM_MAPIO_REMAPPED; 1117 return DM_MAPIO_REMAPPED;
1110 } 1118 }
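
The mapping path above never performs a write from map context: writes are queued for the kmirrord worker (DM_MAPIO_SUBMITTED), while reads are only remapped directly when the region is in sync and a usable leg exists. Below is a minimal userspace sketch of that decision table; the toy_* names and return values are illustrative stand-ins, not the device-mapper API.

/* Hypothetical userspace sketch of the mirror_map() decision table.
 * Types and names (toy_bio, MAP_SUBMITTED, ...) are illustrative only. */
#include <stdio.h>

enum map_result { MAP_SUBMITTED, MAP_REMAPPED, MAP_EWOULDBLOCK, MAP_EIO };

struct toy_bio { int is_write; int is_readahead; };

/* in_sync: 1 = region in sync, 0 = not yet, -1 = log would block */
static enum map_result toy_mirror_map(struct toy_bio *bio, int in_sync,
				      int have_mirror)
{
	if (bio->is_write)
		return MAP_SUBMITTED;        /* writes always go via the worker */

	if (in_sync <= 0) {                  /* region not known to be in sync */
		if (bio->is_readahead)
			return MAP_EWOULDBLOCK; /* drop readahead, don't wait */
		return MAP_SUBMITTED;        /* queue the read until it syncs */
	}

	if (!have_mirror)
		return MAP_EIO;              /* no usable leg left */

	return MAP_REMAPPED;                 /* read serviced directly by a leg */
}

int main(void)
{
	struct toy_bio rd = { .is_write = 0, .is_readahead = 0 };
	struct toy_bio ra = { .is_write = 0, .is_readahead = 1 };

	printf("%d %d %d\n",
	       toy_mirror_map(&rd, 1, 1),    /* REMAPPED */
	       toy_mirror_map(&rd, 0, 1),    /* SUBMITTED */
	       toy_mirror_map(&ra, 0, 1));   /* EWOULDBLOCK */
	return 0;
}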
1111 1119
1112 static int mirror_end_io(struct dm_target *ti, struct bio *bio, 1120 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
1113 int error, union map_info *map_context) 1121 int error, union map_info *map_context)
1114 { 1122 {
1115 int rw = bio_rw(bio); 1123 int rw = bio_rw(bio);
1116 struct mirror_set *ms = (struct mirror_set *) ti->private; 1124 struct mirror_set *ms = (struct mirror_set *) ti->private;
1117 struct mirror *m = NULL; 1125 struct mirror *m = NULL;
1118 struct dm_bio_details *bd = NULL; 1126 struct dm_bio_details *bd = NULL;
1119 struct dm_raid1_read_record *read_record = map_context->ptr; 1127 struct dm_raid1_read_record *read_record = map_context->ptr;
1120 1128
1121 /* 1129 /*
1122 * We need to dec pending if this was a write. 1130 * We need to dec pending if this was a write.
1123 */ 1131 */
1124 if (rw == WRITE) { 1132 if (rw == WRITE) {
1125 dm_rh_dec(ms->rh, map_context->ll); 1133 if (likely(!bio_empty_barrier(bio)))
1134 dm_rh_dec(ms->rh, map_context->ll);
1126 return error; 1135 return error;
1127 } 1136 }
1128 1137
1129 if (error == -EOPNOTSUPP) 1138 if (error == -EOPNOTSUPP)
1130 goto out; 1139 goto out;
1131 1140
1132 if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) 1141 if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD))
1133 goto out; 1142 goto out;
1134 1143
1135 if (unlikely(error)) { 1144 if (unlikely(error)) {
1136 if (!read_record) { 1145 if (!read_record) {
1137 /* 1146 /*
1138 * There wasn't enough memory to record necessary 1147 * There wasn't enough memory to record necessary
1139 * information for a retry or there was no other 1148 * information for a retry or there was no other
1140 * mirror in-sync. 1149 * mirror in-sync.
1141 */ 1150 */
1142 DMERR_LIMIT("Mirror read failed."); 1151 DMERR_LIMIT("Mirror read failed.");
1143 return -EIO; 1152 return -EIO;
1144 } 1153 }
1145 1154
1146 m = read_record->m; 1155 m = read_record->m;
1147 1156
1148 DMERR("Mirror read failed from %s. Trying alternative device.", 1157 DMERR("Mirror read failed from %s. Trying alternative device.",
1149 m->dev->name); 1158 m->dev->name);
1150 1159
1151 fail_mirror(m, DM_RAID1_READ_ERROR); 1160 fail_mirror(m, DM_RAID1_READ_ERROR);
1152 1161
1153 /* 1162 /*
1154 * A failed read is requeued for another attempt using an intact 1163 * A failed read is requeued for another attempt using an intact
1155 * mirror. 1164 * mirror.
1156 */ 1165 */
1157 if (default_ok(m) || mirror_available(ms, bio)) { 1166 if (default_ok(m) || mirror_available(ms, bio)) {
1158 bd = &read_record->details; 1167 bd = &read_record->details;
1159 1168
1160 dm_bio_restore(bd, bio); 1169 dm_bio_restore(bd, bio);
1161 mempool_free(read_record, ms->read_record_pool); 1170 mempool_free(read_record, ms->read_record_pool);
1162 map_context->ptr = NULL; 1171 map_context->ptr = NULL;
1163 queue_bio(ms, bio, rw); 1172 queue_bio(ms, bio, rw);
1164 return 1; 1173 return 1;
1165 } 1174 }
1166 DMERR("All replicated volumes dead, failing I/O"); 1175 DMERR("All replicated volumes dead, failing I/O");
1167 } 1176 }
1168 1177
1169 out: 1178 out:
1170 if (read_record) { 1179 if (read_record) {
1171 mempool_free(read_record, ms->read_record_pool); 1180 mempool_free(read_record, ms->read_record_pool);
1172 map_context->ptr = NULL; 1181 map_context->ptr = NULL;
1173 } 1182 }
1174 1183
1175 return error; 1184 return error;
1176 } 1185 }
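
The one functional change in mirror_end_io() above is the bio_empty_barrier() test: because dm_rh_inc_pending() (in dm-region-hash.c, further down) no longer counts empty barriers against any region, the completion side must not call dm_rh_dec() for them either, or a region's pending count would underflow. A small userspace sketch of that symmetry, with illustrative names rather than the kernel's atomic_t helpers:

/* Toy model of the region "pending" counter: only bios that were
 * counted on submission may be uncounted on completion. */
#include <assert.h>
#include <stdio.h>

struct toy_region { int pending; };

static void submit_write(struct toy_region *reg, int empty_barrier)
{
	if (empty_barrier)
		return;                 /* flush carries no data: not counted */
	reg->pending++;
}

static void end_write(struct toy_region *reg, int empty_barrier)
{
	if (empty_barrier)
		return;                 /* must match the submission side */
	assert(reg->pending > 0);       /* would underflow otherwise */
	reg->pending--;
}

int main(void)
{
	struct toy_region reg = { 0 };

	submit_write(&reg, 0);  end_write(&reg, 0);   /* normal write */
	submit_write(&reg, 1);  end_write(&reg, 1);   /* empty barrier */

	printf("pending = %d\n", reg.pending);        /* 0: balanced */
	return 0;
}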
1177 1186
1178 static void mirror_presuspend(struct dm_target *ti) 1187 static void mirror_presuspend(struct dm_target *ti)
1179 { 1188 {
1180 struct mirror_set *ms = (struct mirror_set *) ti->private; 1189 struct mirror_set *ms = (struct mirror_set *) ti->private;
1181 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1190 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1182 1191
1183 atomic_set(&ms->suspend, 1); 1192 atomic_set(&ms->suspend, 1);
1184 1193
1185 /* 1194 /*
1186 * We must finish up all the work that we've 1195 * We must finish up all the work that we've
1187 * generated (i.e. recovery work). 1196 * generated (i.e. recovery work).
1188 */ 1197 */
1189 dm_rh_stop_recovery(ms->rh); 1198 dm_rh_stop_recovery(ms->rh);
1190 1199
1191 wait_event(_kmirrord_recovery_stopped, 1200 wait_event(_kmirrord_recovery_stopped,
1192 !dm_rh_recovery_in_flight(ms->rh)); 1201 !dm_rh_recovery_in_flight(ms->rh));
1193 1202
1194 if (log->type->presuspend && log->type->presuspend(log)) 1203 if (log->type->presuspend && log->type->presuspend(log))
1195 /* FIXME: need better error handling */ 1204 /* FIXME: need better error handling */
1196 DMWARN("log presuspend failed"); 1205 DMWARN("log presuspend failed");
1197 1206
1198 /* 1207 /*
1199 * Now that recovery is complete/stopped and the 1208 * Now that recovery is complete/stopped and the
1200 * delayed bios are queued, we need to wait for 1209 * delayed bios are queued, we need to wait for
1201 * the worker thread to complete. This way, 1210 * the worker thread to complete. This way,
1202 * we know that all of our I/O has been pushed. 1211 * we know that all of our I/O has been pushed.
1203 */ 1212 */
1204 flush_workqueue(ms->kmirrord_wq); 1213 flush_workqueue(ms->kmirrord_wq);
1205 } 1214 }
1206 1215
1207 static void mirror_postsuspend(struct dm_target *ti) 1216 static void mirror_postsuspend(struct dm_target *ti)
1208 { 1217 {
1209 struct mirror_set *ms = ti->private; 1218 struct mirror_set *ms = ti->private;
1210 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1219 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1211 1220
1212 if (log->type->postsuspend && log->type->postsuspend(log)) 1221 if (log->type->postsuspend && log->type->postsuspend(log))
1213 /* FIXME: need better error handling */ 1222 /* FIXME: need better error handling */
1214 DMWARN("log postsuspend failed"); 1223 DMWARN("log postsuspend failed");
1215 } 1224 }
1216 1225
1217 static void mirror_resume(struct dm_target *ti) 1226 static void mirror_resume(struct dm_target *ti)
1218 { 1227 {
1219 struct mirror_set *ms = ti->private; 1228 struct mirror_set *ms = ti->private;
1220 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1229 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1221 1230
1222 atomic_set(&ms->suspend, 0); 1231 atomic_set(&ms->suspend, 0);
1223 if (log->type->resume && log->type->resume(log)) 1232 if (log->type->resume && log->type->resume(log))
1224 /* FIXME: need better error handling */ 1233 /* FIXME: need better error handling */
1225 DMWARN("log resume failed"); 1234 DMWARN("log resume failed");
1226 dm_rh_start_recovery(ms->rh); 1235 dm_rh_start_recovery(ms->rh);
1227 } 1236 }
1228 1237
1229 /* 1238 /*
1230 * device_status_char 1239 * device_status_char
1231 * @m: mirror device/leg we want the status of 1240 * @m: mirror device/leg we want the status of
1232 * 1241 *
1233 * We return one character representing the most severe error 1242 * We return one character representing the most severe error
1234 * we have encountered. 1243 * we have encountered.
1235 * A => Alive - No failures 1244 * A => Alive - No failures
1236 * D => Dead - A write failure occurred leaving mirror out-of-sync 1245 * D => Dead - A write failure occurred leaving mirror out-of-sync
1237 * S => Sync - A synchronization failure occurred, mirror out-of-sync 1246 * S => Sync - A synchronization failure occurred, mirror out-of-sync
1238 * R => Read - A read failure occurred, mirror data unaffected 1247 * R => Read - A read failure occurred, mirror data unaffected
1239 * 1248 *
1240 * Returns: <char> 1249 * Returns: <char>
1241 */ 1250 */
1242 static char device_status_char(struct mirror *m) 1251 static char device_status_char(struct mirror *m)
1243 { 1252 {
1244 if (!atomic_read(&(m->error_count))) 1253 if (!atomic_read(&(m->error_count)))
1245 return 'A'; 1254 return 'A';
1246 1255
1247 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : 1256 return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' :
1248 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : 1257 (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' :
1249 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; 1258 (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U';
1250 } 1259 }
1251 1260
1252 1261
1253 static int mirror_status(struct dm_target *ti, status_type_t type, 1262 static int mirror_status(struct dm_target *ti, status_type_t type,
1254 char *result, unsigned int maxlen) 1263 char *result, unsigned int maxlen)
1255 { 1264 {
1256 unsigned int m, sz = 0; 1265 unsigned int m, sz = 0;
1257 struct mirror_set *ms = (struct mirror_set *) ti->private; 1266 struct mirror_set *ms = (struct mirror_set *) ti->private;
1258 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); 1267 struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh);
1259 char buffer[ms->nr_mirrors + 1]; 1268 char buffer[ms->nr_mirrors + 1];
1260 1269
1261 switch (type) { 1270 switch (type) {
1262 case STATUSTYPE_INFO: 1271 case STATUSTYPE_INFO:
1263 DMEMIT("%d ", ms->nr_mirrors); 1272 DMEMIT("%d ", ms->nr_mirrors);
1264 for (m = 0; m < ms->nr_mirrors; m++) { 1273 for (m = 0; m < ms->nr_mirrors; m++) {
1265 DMEMIT("%s ", ms->mirror[m].dev->name); 1274 DMEMIT("%s ", ms->mirror[m].dev->name);
1266 buffer[m] = device_status_char(&(ms->mirror[m])); 1275 buffer[m] = device_status_char(&(ms->mirror[m]));
1267 } 1276 }
1268 buffer[m] = '\0'; 1277 buffer[m] = '\0';
1269 1278
1270 DMEMIT("%llu/%llu 1 %s ", 1279 DMEMIT("%llu/%llu 1 %s ",
1271 (unsigned long long)log->type->get_sync_count(log), 1280 (unsigned long long)log->type->get_sync_count(log),
1272 (unsigned long long)ms->nr_regions, buffer); 1281 (unsigned long long)ms->nr_regions, buffer);
1273 1282
1274 sz += log->type->status(log, type, result+sz, maxlen-sz); 1283 sz += log->type->status(log, type, result+sz, maxlen-sz);
1275 1284
1276 break; 1285 break;
1277 1286
1278 case STATUSTYPE_TABLE: 1287 case STATUSTYPE_TABLE:
1279 sz = log->type->status(log, type, result, maxlen); 1288 sz = log->type->status(log, type, result, maxlen);
1280 1289
1281 DMEMIT("%d", ms->nr_mirrors); 1290 DMEMIT("%d", ms->nr_mirrors);
1282 for (m = 0; m < ms->nr_mirrors; m++) 1291 for (m = 0; m < ms->nr_mirrors; m++)
1283 DMEMIT(" %s %llu", ms->mirror[m].dev->name, 1292 DMEMIT(" %s %llu", ms->mirror[m].dev->name,
1284 (unsigned long long)ms->mirror[m].offset); 1293 (unsigned long long)ms->mirror[m].offset);
1285 1294
1286 if (ms->features & DM_RAID1_HANDLE_ERRORS) 1295 if (ms->features & DM_RAID1_HANDLE_ERRORS)
1287 DMEMIT(" 1 handle_errors"); 1296 DMEMIT(" 1 handle_errors");
1288 } 1297 }
1289 1298
1290 return 0; 1299 return 0;
1291 } 1300 }
1292 1301
1293 static int mirror_iterate_devices(struct dm_target *ti, 1302 static int mirror_iterate_devices(struct dm_target *ti,
1294 iterate_devices_callout_fn fn, void *data) 1303 iterate_devices_callout_fn fn, void *data)
1295 { 1304 {
1296 struct mirror_set *ms = ti->private; 1305 struct mirror_set *ms = ti->private;
1297 int ret = 0; 1306 int ret = 0;
1298 unsigned i; 1307 unsigned i;
1299 1308
1300 for (i = 0; !ret && i < ms->nr_mirrors; i++) 1309 for (i = 0; !ret && i < ms->nr_mirrors; i++)
1301 ret = fn(ti, ms->mirror[i].dev, 1310 ret = fn(ti, ms->mirror[i].dev,
1302 ms->mirror[i].offset, ti->len, data); 1311 ms->mirror[i].offset, ti->len, data);
1303 1312
1304 return ret; 1313 return ret;
1305 } 1314 }
1306 1315
1307 static struct target_type mirror_target = { 1316 static struct target_type mirror_target = {
1308 .name = "mirror", 1317 .name = "mirror",
1309 .version = {1, 12, 0}, 1318 .version = {1, 12, 0},
1310 .module = THIS_MODULE, 1319 .module = THIS_MODULE,
1311 .ctr = mirror_ctr, 1320 .ctr = mirror_ctr,
1312 .dtr = mirror_dtr, 1321 .dtr = mirror_dtr,
1313 .map = mirror_map, 1322 .map = mirror_map,
1314 .end_io = mirror_end_io, 1323 .end_io = mirror_end_io,
1315 .presuspend = mirror_presuspend, 1324 .presuspend = mirror_presuspend,
1316 .postsuspend = mirror_postsuspend, 1325 .postsuspend = mirror_postsuspend,
1317 .resume = mirror_resume, 1326 .resume = mirror_resume,
1318 .status = mirror_status, 1327 .status = mirror_status,
1319 .iterate_devices = mirror_iterate_devices, 1328 .iterate_devices = mirror_iterate_devices,
1320 }; 1329 };
1321 1330
1322 static int __init dm_mirror_init(void) 1331 static int __init dm_mirror_init(void)
1323 { 1332 {
1324 int r; 1333 int r;
1325 1334
1326 _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); 1335 _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0);
1327 if (!_dm_raid1_read_record_cache) { 1336 if (!_dm_raid1_read_record_cache) {
1328 DMERR("Can't allocate dm_raid1_read_record cache"); 1337 DMERR("Can't allocate dm_raid1_read_record cache");
1329 r = -ENOMEM; 1338 r = -ENOMEM;
1330 goto bad_cache; 1339 goto bad_cache;
1331 } 1340 }
1332 1341
1333 r = dm_register_target(&mirror_target); 1342 r = dm_register_target(&mirror_target);
1334 if (r < 0) { 1343 if (r < 0) {
1335 DMERR("Failed to register mirror target"); 1344 DMERR("Failed to register mirror target");
1336 goto bad_target; 1345 goto bad_target;
1337 } 1346 }
1338 1347
1339 return 0; 1348 return 0;
1340 1349
1341 bad_target: 1350 bad_target:
1342 kmem_cache_destroy(_dm_raid1_read_record_cache); 1351 kmem_cache_destroy(_dm_raid1_read_record_cache);
1343 bad_cache: 1352 bad_cache:
1344 return r; 1353 return r;
1345 } 1354 }
1346 1355
1347 static void __exit dm_mirror_exit(void) 1356 static void __exit dm_mirror_exit(void)
1348 { 1357 {
1349 dm_unregister_target(&mirror_target); 1358 dm_unregister_target(&mirror_target);
1350 kmem_cache_destroy(_dm_raid1_read_record_cache); 1359 kmem_cache_destroy(_dm_raid1_read_record_cache);
1351 } 1360 }
1352 1361
1353 /* Module hooks */ 1362 /* Module hooks */
1354 module_init(dm_mirror_init); 1363 module_init(dm_mirror_init);
1355 module_exit(dm_mirror_exit); 1364 module_exit(dm_mirror_exit);
1356 1365
1357 MODULE_DESCRIPTION(DM_NAME " mirror target"); 1366 MODULE_DESCRIPTION(DM_NAME " mirror target");
1358 MODULE_AUTHOR("Joe Thornber"); 1367 MODULE_AUTHOR("Joe Thornber");
1359 MODULE_LICENSE("GPL"); 1368 MODULE_LICENSE("GPL");
1360 1369
drivers/md/dm-region-hash.c
1 /* 1 /*
2 * Copyright (C) 2003 Sistina Software Limited. 2 * Copyright (C) 2003 Sistina Software Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include <linux/dm-dirty-log.h> 8 #include <linux/dm-dirty-log.h>
9 #include <linux/dm-region-hash.h> 9 #include <linux/dm-region-hash.h>
10 10
11 #include <linux/ctype.h> 11 #include <linux/ctype.h>
12 #include <linux/init.h> 12 #include <linux/init.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/vmalloc.h> 14 #include <linux/vmalloc.h>
15 15
16 #include "dm.h" 16 #include "dm.h"
17 17
18 #define DM_MSG_PREFIX "region hash" 18 #define DM_MSG_PREFIX "region hash"
19 19
20 /*----------------------------------------------------------------- 20 /*-----------------------------------------------------------------
21 * Region hash 21 * Region hash
22 * 22 *
23 * The mirror splits itself up into discrete regions. Each 23 * The mirror splits itself up into discrete regions. Each
24 * region can be in one of three states: clean, dirty, 24 * region can be in one of three states: clean, dirty,
25 * nosync. There is no need to put clean regions in the hash. 25 * nosync. There is no need to put clean regions in the hash.
26 * 26 *
27 * In addition to being present in the hash table a region _may_ 27 * In addition to being present in the hash table a region _may_
28 * be present on one of three lists. 28 * be present on one of three lists.
29 * 29 *
30 * clean_regions: Regions on this list have no io pending to 30 * clean_regions: Regions on this list have no io pending to
31 * them, they are in sync, we are no longer interested in them, 31 * them, they are in sync, we are no longer interested in them,
32 * they are dull. dm_rh_update_states() will remove them from the 32 * they are dull. dm_rh_update_states() will remove them from the
33 * hash table. 33 * hash table.
34 * 34 *
35 * quiesced_regions: These regions have been spun down, ready 35 * quiesced_regions: These regions have been spun down, ready
36 * for recovery. rh_recovery_start() will remove regions from 36 * for recovery. rh_recovery_start() will remove regions from
37 * this list and hand them to kmirrord, which will schedule the 37 * this list and hand them to kmirrord, which will schedule the
38 * recovery io with kcopyd. 38 * recovery io with kcopyd.
39 * 39 *
40 * recovered_regions: Regions that kcopyd has successfully 40 * recovered_regions: Regions that kcopyd has successfully
41 * recovered. dm_rh_update_states() will now schedule any delayed 41 * recovered. dm_rh_update_states() will now schedule any delayed
42 * io, up the recovery_count, and remove the region from the 42 * io, up the recovery_count, and remove the region from the
43 * hash. 43 * hash.
44 * 44 *
45 * There are 2 locks: 45 * There are 2 locks:
46 * A rw spin lock 'hash_lock' protects just the hash table, 46 * A rw spin lock 'hash_lock' protects just the hash table,
47 * this is never held in write mode from interrupt context, 47 * this is never held in write mode from interrupt context,
48 * which I believe means that we only have to disable irqs when 48 * which I believe means that we only have to disable irqs when
49 * doing a write lock. 49 * doing a write lock.
50 * 50 *
51 * An ordinary spin lock 'region_lock' that protects the three 51 * An ordinary spin lock 'region_lock' that protects the three
52 * lists in the region_hash, with the 'state', 'list' and 52 * lists in the region_hash, with the 'state', 'list' and
53 * 'delayed_bios' fields of the regions. This is used from irq 53 * 'delayed_bios' fields of the regions. This is used from irq
54 * context, so all other uses will have to suspend local irqs. 54 * context, so all other uses will have to suspend local irqs.
55 *---------------------------------------------------------------*/ 55 *---------------------------------------------------------------*/
56 struct dm_region_hash { 56 struct dm_region_hash {
57 uint32_t region_size; 57 uint32_t region_size;
58 unsigned region_shift; 58 unsigned region_shift;
59 59
60 /* holds persistent region state */ 60 /* holds persistent region state */
61 struct dm_dirty_log *log; 61 struct dm_dirty_log *log;
62 62
63 /* hash table */ 63 /* hash table */
64 rwlock_t hash_lock; 64 rwlock_t hash_lock;
65 mempool_t *region_pool; 65 mempool_t *region_pool;
66 unsigned mask; 66 unsigned mask;
67 unsigned nr_buckets; 67 unsigned nr_buckets;
68 unsigned prime; 68 unsigned prime;
69 unsigned shift; 69 unsigned shift;
70 struct list_head *buckets; 70 struct list_head *buckets;
71 71
72 unsigned max_recovery; /* Max # of regions to recover in parallel */ 72 unsigned max_recovery; /* Max # of regions to recover in parallel */
73 73
74 spinlock_t region_lock; 74 spinlock_t region_lock;
75 atomic_t recovery_in_flight; 75 atomic_t recovery_in_flight;
76 struct semaphore recovery_count; 76 struct semaphore recovery_count;
77 struct list_head clean_regions; 77 struct list_head clean_regions;
78 struct list_head quiesced_regions; 78 struct list_head quiesced_regions;
79 struct list_head recovered_regions; 79 struct list_head recovered_regions;
80 struct list_head failed_recovered_regions; 80 struct list_head failed_recovered_regions;
81 81
82 /*
83 * If there was a barrier failure no regions can be marked clean.
84 */
85 int barrier_failure;
86
82 void *context; 87 void *context;
83 sector_t target_begin; 88 sector_t target_begin;
84 89
85 /* Callback function to schedule bios writes */ 90 /* Callback function to schedule bios writes */
86 void (*dispatch_bios)(void *context, struct bio_list *bios); 91 void (*dispatch_bios)(void *context, struct bio_list *bios);
87 92
88 /* Callback function to wakeup callers worker thread. */ 93 /* Callback function to wakeup callers worker thread. */
89 void (*wakeup_workers)(void *context); 94 void (*wakeup_workers)(void *context);
90 95
91 /* Callback function to wakeup callers recovery waiters. */ 96 /* Callback function to wakeup callers recovery waiters. */
92 void (*wakeup_all_recovery_waiters)(void *context); 97 void (*wakeup_all_recovery_waiters)(void *context);
93 }; 98 };
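
As a reading aid for the header comment and structure above, the sketch below tabulates which list (if any) a region sits on in each state. It is a standalone toy summary based on that description, not the kernel's dm_region bookkeeping.

/* Toy summary of region states and the list a region may sit on.
 * Names and strings are illustrative only. */
#include <stdio.h>

enum toy_state { R_CLEAN, R_DIRTY, R_NOSYNC, R_RECOVERING };

static const char *list_for(enum toy_state s, int recovery_done)
{
	switch (s) {
	case R_CLEAN:
		return "clean_regions (idle; dropped from the hash later)";
	case R_DIRTY:
		return "no list (writes in flight, tracked only in the hash)";
	case R_NOSYNC:
		return "no list (kept until recovered or table reload)";
	case R_RECOVERING:
		return recovery_done ?
			"recovered_regions (or failed_recovered_regions)" :
			"quiesced_regions (waiting for kcopyd)";
	}
	return "?";
}

int main(void)
{
	printf("CLEAN      -> %s\n", list_for(R_CLEAN, 0));
	printf("DIRTY      -> %s\n", list_for(R_DIRTY, 0));
	printf("RECOVERING -> %s\n", list_for(R_RECOVERING, 0));
	printf("RECOVERING -> %s\n", list_for(R_RECOVERING, 1));
	return 0;
}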
94 99
95 struct dm_region { 100 struct dm_region {
96 struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ 101 struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */
97 region_t key; 102 region_t key;
98 int state; 103 int state;
99 104
100 struct list_head hash_list; 105 struct list_head hash_list;
101 struct list_head list; 106 struct list_head list;
102 107
103 atomic_t pending; 108 atomic_t pending;
104 struct bio_list delayed_bios; 109 struct bio_list delayed_bios;
105 }; 110 };
106 111
107 /* 112 /*
108 * Conversion fns 113 * Conversion fns
109 */ 114 */
110 static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) 115 static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector)
111 { 116 {
112 return sector >> rh->region_shift; 117 return sector >> rh->region_shift;
113 } 118 }
114 119
115 sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) 120 sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region)
116 { 121 {
117 return region << rh->region_shift; 122 return region << rh->region_shift;
118 } 123 }
119 EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); 124 EXPORT_SYMBOL_GPL(dm_rh_region_to_sector);
120 125
121 region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) 126 region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio)
122 { 127 {
123 return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); 128 return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin);
124 } 129 }
125 EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); 130 EXPORT_SYMBOL_GPL(dm_rh_bio_to_region);
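
These conversions rely on region_size being a power of two, so a shift replaces a division. A worked userspace example, assuming a hypothetical region_size of 1024 sectors and a target starting at sector 2048:

/* Worked example of the sector <-> region conversions above, assuming
 * region_size = 1024 sectors, so region_shift = 10. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t region_size = 1024;         /* sectors, power of two */
	unsigned region_shift = 0;
	uint64_t target_begin = 2048;        /* where the target starts */
	uint64_t bi_sector = 2048 + 5000;    /* absolute bio sector */

	while ((1u << region_shift) < region_size)
		region_shift++;              /* ffs(1024) - 1 == 10 */

	uint64_t region = (bi_sector - target_begin) >> region_shift;
	uint64_t region_start = region << region_shift;

	/* 5000 >> 10 = 4, and region 4 starts at target-relative sector 4096 */
	printf("region %llu starts at target-relative sector %llu\n",
	       (unsigned long long)region, (unsigned long long)region_start);
	return 0;
}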
126 131
127 void *dm_rh_region_context(struct dm_region *reg) 132 void *dm_rh_region_context(struct dm_region *reg)
128 { 133 {
129 return reg->rh->context; 134 return reg->rh->context;
130 } 135 }
131 EXPORT_SYMBOL_GPL(dm_rh_region_context); 136 EXPORT_SYMBOL_GPL(dm_rh_region_context);
132 137
133 region_t dm_rh_get_region_key(struct dm_region *reg) 138 region_t dm_rh_get_region_key(struct dm_region *reg)
134 { 139 {
135 return reg->key; 140 return reg->key;
136 } 141 }
137 EXPORT_SYMBOL_GPL(dm_rh_get_region_key); 142 EXPORT_SYMBOL_GPL(dm_rh_get_region_key);
138 143
139 sector_t dm_rh_get_region_size(struct dm_region_hash *rh) 144 sector_t dm_rh_get_region_size(struct dm_region_hash *rh)
140 { 145 {
141 return rh->region_size; 146 return rh->region_size;
142 } 147 }
143 EXPORT_SYMBOL_GPL(dm_rh_get_region_size); 148 EXPORT_SYMBOL_GPL(dm_rh_get_region_size);
144 149
145 /* 150 /*
146 * FIXME: shall we pass in a structure instead of all these args to 151 * FIXME: shall we pass in a structure instead of all these args to
147 * dm_region_hash_create()???? 152 * dm_region_hash_create()????
148 */ 153 */
149 #define RH_HASH_MULT 2654435387U 154 #define RH_HASH_MULT 2654435387U
150 #define RH_HASH_SHIFT 12 155 #define RH_HASH_SHIFT 12
151 156
152 #define MIN_REGIONS 64 157 #define MIN_REGIONS 64
153 struct dm_region_hash *dm_region_hash_create( 158 struct dm_region_hash *dm_region_hash_create(
154 void *context, void (*dispatch_bios)(void *context, 159 void *context, void (*dispatch_bios)(void *context,
155 struct bio_list *bios), 160 struct bio_list *bios),
156 void (*wakeup_workers)(void *context), 161 void (*wakeup_workers)(void *context),
157 void (*wakeup_all_recovery_waiters)(void *context), 162 void (*wakeup_all_recovery_waiters)(void *context),
158 sector_t target_begin, unsigned max_recovery, 163 sector_t target_begin, unsigned max_recovery,
159 struct dm_dirty_log *log, uint32_t region_size, 164 struct dm_dirty_log *log, uint32_t region_size,
160 region_t nr_regions) 165 region_t nr_regions)
161 { 166 {
162 struct dm_region_hash *rh; 167 struct dm_region_hash *rh;
163 unsigned nr_buckets, max_buckets; 168 unsigned nr_buckets, max_buckets;
164 size_t i; 169 size_t i;
165 170
166 /* 171 /*
167 * Calculate a suitable number of buckets for our hash 172 * Calculate a suitable number of buckets for our hash
168 * table. 173 * table.
169 */ 174 */
170 max_buckets = nr_regions >> 6; 175 max_buckets = nr_regions >> 6;
171 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) 176 for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1)
172 ; 177 ;
173 nr_buckets >>= 1; 178 nr_buckets >>= 1;
174 179
175 rh = kmalloc(sizeof(*rh), GFP_KERNEL); 180 rh = kmalloc(sizeof(*rh), GFP_KERNEL);
176 if (!rh) { 181 if (!rh) {
177 DMERR("unable to allocate region hash memory"); 182 DMERR("unable to allocate region hash memory");
178 return ERR_PTR(-ENOMEM); 183 return ERR_PTR(-ENOMEM);
179 } 184 }
180 185
181 rh->context = context; 186 rh->context = context;
182 rh->dispatch_bios = dispatch_bios; 187 rh->dispatch_bios = dispatch_bios;
183 rh->wakeup_workers = wakeup_workers; 188 rh->wakeup_workers = wakeup_workers;
184 rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; 189 rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters;
185 rh->target_begin = target_begin; 190 rh->target_begin = target_begin;
186 rh->max_recovery = max_recovery; 191 rh->max_recovery = max_recovery;
187 rh->log = log; 192 rh->log = log;
188 rh->region_size = region_size; 193 rh->region_size = region_size;
189 rh->region_shift = ffs(region_size) - 1; 194 rh->region_shift = ffs(region_size) - 1;
190 rwlock_init(&rh->hash_lock); 195 rwlock_init(&rh->hash_lock);
191 rh->mask = nr_buckets - 1; 196 rh->mask = nr_buckets - 1;
192 rh->nr_buckets = nr_buckets; 197 rh->nr_buckets = nr_buckets;
193 198
194 rh->shift = RH_HASH_SHIFT; 199 rh->shift = RH_HASH_SHIFT;
195 rh->prime = RH_HASH_MULT; 200 rh->prime = RH_HASH_MULT;
196 201
197 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); 202 rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets));
198 if (!rh->buckets) { 203 if (!rh->buckets) {
199 DMERR("unable to allocate region hash bucket memory"); 204 DMERR("unable to allocate region hash bucket memory");
200 kfree(rh); 205 kfree(rh);
201 return ERR_PTR(-ENOMEM); 206 return ERR_PTR(-ENOMEM);
202 } 207 }
203 208
204 for (i = 0; i < nr_buckets; i++) 209 for (i = 0; i < nr_buckets; i++)
205 INIT_LIST_HEAD(rh->buckets + i); 210 INIT_LIST_HEAD(rh->buckets + i);
206 211
207 spin_lock_init(&rh->region_lock); 212 spin_lock_init(&rh->region_lock);
208 sema_init(&rh->recovery_count, 0); 213 sema_init(&rh->recovery_count, 0);
209 atomic_set(&rh->recovery_in_flight, 0); 214 atomic_set(&rh->recovery_in_flight, 0);
210 INIT_LIST_HEAD(&rh->clean_regions); 215 INIT_LIST_HEAD(&rh->clean_regions);
211 INIT_LIST_HEAD(&rh->quiesced_regions); 216 INIT_LIST_HEAD(&rh->quiesced_regions);
212 INIT_LIST_HEAD(&rh->recovered_regions); 217 INIT_LIST_HEAD(&rh->recovered_regions);
213 INIT_LIST_HEAD(&rh->failed_recovered_regions); 218 INIT_LIST_HEAD(&rh->failed_recovered_regions);
219 rh->barrier_failure = 0;
214 220
215 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, 221 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
216 sizeof(struct dm_region)); 222 sizeof(struct dm_region));
217 if (!rh->region_pool) { 223 if (!rh->region_pool) {
218 vfree(rh->buckets); 224 vfree(rh->buckets);
219 kfree(rh); 225 kfree(rh);
220 rh = ERR_PTR(-ENOMEM); 226 rh = ERR_PTR(-ENOMEM);
221 } 227 }
222 228
223 return rh; 229 return rh;
224 } 230 }
225 EXPORT_SYMBOL_GPL(dm_region_hash_create); 231 EXPORT_SYMBOL_GPL(dm_region_hash_create);
226 232
227 void dm_region_hash_destroy(struct dm_region_hash *rh) 233 void dm_region_hash_destroy(struct dm_region_hash *rh)
228 { 234 {
229 unsigned h; 235 unsigned h;
230 struct dm_region *reg, *nreg; 236 struct dm_region *reg, *nreg;
231 237
232 BUG_ON(!list_empty(&rh->quiesced_regions)); 238 BUG_ON(!list_empty(&rh->quiesced_regions));
233 for (h = 0; h < rh->nr_buckets; h++) { 239 for (h = 0; h < rh->nr_buckets; h++) {
234 list_for_each_entry_safe(reg, nreg, rh->buckets + h, 240 list_for_each_entry_safe(reg, nreg, rh->buckets + h,
235 hash_list) { 241 hash_list) {
236 BUG_ON(atomic_read(&reg->pending)); 242 BUG_ON(atomic_read(&reg->pending));
237 mempool_free(reg, rh->region_pool); 243 mempool_free(reg, rh->region_pool);
238 } 244 }
239 } 245 }
240 246
241 if (rh->log) 247 if (rh->log)
242 dm_dirty_log_destroy(rh->log); 248 dm_dirty_log_destroy(rh->log);
243 249
244 if (rh->region_pool) 250 if (rh->region_pool)
245 mempool_destroy(rh->region_pool); 251 mempool_destroy(rh->region_pool);
246 252
247 vfree(rh->buckets); 253 vfree(rh->buckets);
248 kfree(rh); 254 kfree(rh);
249 } 255 }
250 EXPORT_SYMBOL_GPL(dm_region_hash_destroy); 256 EXPORT_SYMBOL_GPL(dm_region_hash_destroy);
251 257
252 struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) 258 struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh)
253 { 259 {
254 return rh->log; 260 return rh->log;
255 } 261 }
256 EXPORT_SYMBOL_GPL(dm_rh_dirty_log); 262 EXPORT_SYMBOL_GPL(dm_rh_dirty_log);
257 263
258 static unsigned rh_hash(struct dm_region_hash *rh, region_t region) 264 static unsigned rh_hash(struct dm_region_hash *rh, region_t region)
259 { 265 {
260 return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; 266 return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask;
261 } 267 }
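
rh_hash() is a plain multiplicative hash: multiply the region number by a large constant, drop the low bits, then mask down to the power-of-two bucket count. A small standalone demo using the constants defined above and an assumed 128-bucket table:

/* Demo of the multiplicative hash used by rh_hash().  The constants
 * match the #defines above; the bucket count is just an example. */
#include <stdio.h>
#include <stdint.h>

#define RH_HASH_MULT  2654435387U
#define RH_HASH_SHIFT 12

static unsigned toy_rh_hash(uint64_t region, unsigned mask)
{
	return (unsigned)((region * RH_HASH_MULT) >> RH_HASH_SHIFT) & mask;
}

int main(void)
{
	unsigned nr_buckets = 128, mask = nr_buckets - 1;
	uint64_t r;

	/* Consecutive region numbers scatter across buckets. */
	for (r = 0; r < 8; r++)
		printf("region %llu -> bucket %u\n",
		       (unsigned long long)r, toy_rh_hash(r, mask));
	return 0;
}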
262 268
263 static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) 269 static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region)
264 { 270 {
265 struct dm_region *reg; 271 struct dm_region *reg;
266 struct list_head *bucket = rh->buckets + rh_hash(rh, region); 272 struct list_head *bucket = rh->buckets + rh_hash(rh, region);
267 273
268 list_for_each_entry(reg, bucket, hash_list) 274 list_for_each_entry(reg, bucket, hash_list)
269 if (reg->key == region) 275 if (reg->key == region)
270 return reg; 276 return reg;
271 277
272 return NULL; 278 return NULL;
273 } 279 }
274 280
275 static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) 281 static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg)
276 { 282 {
277 list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key)); 283 list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key));
278 } 284 }
279 285
280 static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) 286 static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region)
281 { 287 {
282 struct dm_region *reg, *nreg; 288 struct dm_region *reg, *nreg;
283 289
284 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); 290 nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC);
285 if (unlikely(!nreg)) 291 if (unlikely(!nreg))
286 nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); 292 nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL);
287 293
288 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? 294 nreg->state = rh->log->type->in_sync(rh->log, region, 1) ?
289 DM_RH_CLEAN : DM_RH_NOSYNC; 295 DM_RH_CLEAN : DM_RH_NOSYNC;
290 nreg->rh = rh; 296 nreg->rh = rh;
291 nreg->key = region; 297 nreg->key = region;
292 INIT_LIST_HEAD(&nreg->list); 298 INIT_LIST_HEAD(&nreg->list);
293 atomic_set(&nreg->pending, 0); 299 atomic_set(&nreg->pending, 0);
294 bio_list_init(&nreg->delayed_bios); 300 bio_list_init(&nreg->delayed_bios);
295 301
296 write_lock_irq(&rh->hash_lock); 302 write_lock_irq(&rh->hash_lock);
297 reg = __rh_lookup(rh, region); 303 reg = __rh_lookup(rh, region);
298 if (reg) 304 if (reg)
299 /* We lost the race. */ 305 /* We lost the race. */
300 mempool_free(nreg, rh->region_pool); 306 mempool_free(nreg, rh->region_pool);
301 else { 307 else {
302 __rh_insert(rh, nreg); 308 __rh_insert(rh, nreg);
303 if (nreg->state == DM_RH_CLEAN) { 309 if (nreg->state == DM_RH_CLEAN) {
304 spin_lock(&rh->region_lock); 310 spin_lock(&rh->region_lock);
305 list_add(&nreg->list, &rh->clean_regions); 311 list_add(&nreg->list, &rh->clean_regions);
306 spin_unlock(&rh->region_lock); 312 spin_unlock(&rh->region_lock);
307 } 313 }
308 314
309 reg = nreg; 315 reg = nreg;
310 } 316 }
311 write_unlock_irq(&rh->hash_lock); 317 write_unlock_irq(&rh->hash_lock);
312 318
313 return reg; 319 return reg;
314 } 320 }
315 321
316 static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) 322 static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region)
317 { 323 {
318 struct dm_region *reg; 324 struct dm_region *reg;
319 325
320 reg = __rh_lookup(rh, region); 326 reg = __rh_lookup(rh, region);
321 if (!reg) { 327 if (!reg) {
322 read_unlock(&rh->hash_lock); 328 read_unlock(&rh->hash_lock);
323 reg = __rh_alloc(rh, region); 329 reg = __rh_alloc(rh, region);
324 read_lock(&rh->hash_lock); 330 read_lock(&rh->hash_lock);
325 } 331 }
326 332
327 return reg; 333 return reg;
328 } 334 }
329 335
330 int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) 336 int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block)
331 { 337 {
332 int r; 338 int r;
333 struct dm_region *reg; 339 struct dm_region *reg;
334 340
335 read_lock(&rh->hash_lock); 341 read_lock(&rh->hash_lock);
336 reg = __rh_lookup(rh, region); 342 reg = __rh_lookup(rh, region);
337 read_unlock(&rh->hash_lock); 343 read_unlock(&rh->hash_lock);
338 344
339 if (reg) 345 if (reg)
340 return reg->state; 346 return reg->state;
341 347
342 /* 348 /*
343 * The region wasn't in the hash, so we fall back to the 349 * The region wasn't in the hash, so we fall back to the
344 * dirty log. 350 * dirty log.
345 */ 351 */
346 r = rh->log->type->in_sync(rh->log, region, may_block); 352 r = rh->log->type->in_sync(rh->log, region, may_block);
347 353
348 /* 354 /*
349 * Any error from the dirty log (eg. -EWOULDBLOCK) gets 355 * Any error from the dirty log (eg. -EWOULDBLOCK) gets
350 * taken as a DM_RH_NOSYNC 356 * taken as a DM_RH_NOSYNC
351 */ 357 */
352 return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; 358 return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC;
353 } 359 }
354 EXPORT_SYMBOL_GPL(dm_rh_get_state); 360 EXPORT_SYMBOL_GPL(dm_rh_get_state);
355 361
356 static void complete_resync_work(struct dm_region *reg, int success) 362 static void complete_resync_work(struct dm_region *reg, int success)
357 { 363 {
358 struct dm_region_hash *rh = reg->rh; 364 struct dm_region_hash *rh = reg->rh;
359 365
360 rh->log->type->set_region_sync(rh->log, reg->key, success); 366 rh->log->type->set_region_sync(rh->log, reg->key, success);
361 367
362 /* 368 /*
363 * Dispatch the bios before we call 'wake_up_all'. 369 * Dispatch the bios before we call 'wake_up_all'.
364 * This is important because if we are suspending, 370 * This is important because if we are suspending,
365 * we want to know that recovery is complete and 371 * we want to know that recovery is complete and
366 * the work queue is flushed. If we wake_up_all 372 * the work queue is flushed. If we wake_up_all
367 * before we dispatch_bios (queue bios and call wake()), 373 * before we dispatch_bios (queue bios and call wake()),
368 * then we risk suspending before the work queue 374 * then we risk suspending before the work queue
369 * has been properly flushed. 375 * has been properly flushed.
370 */ 376 */
371 rh->dispatch_bios(rh->context, &reg->delayed_bios); 377 rh->dispatch_bios(rh->context, &reg->delayed_bios);
372 if (atomic_dec_and_test(&rh->recovery_in_flight)) 378 if (atomic_dec_and_test(&rh->recovery_in_flight))
373 rh->wakeup_all_recovery_waiters(rh->context); 379 rh->wakeup_all_recovery_waiters(rh->context);
374 up(&rh->recovery_count); 380 up(&rh->recovery_count);
375 } 381 }
376 382
377 /* dm_rh_mark_nosync 383 /* dm_rh_mark_nosync
378 * @ms 384 * @ms
379 * @bio 385 * @bio
380 * @done 386 * @done
381 * @error 387 * @error
382 * 388 *
383 * The bio was written on some mirror(s) but failed on other mirror(s). 389 * The bio was written on some mirror(s) but failed on other mirror(s).
384 * We can successfully endio the bio but should avoid the region being 390 * We can successfully endio the bio but should avoid the region being
385 * marked clean by setting the state DM_RH_NOSYNC. 391 * marked clean by setting the state DM_RH_NOSYNC.
386 * 392 *
387 * This function is _not_ safe in interrupt context! 393 * This function is _not_ safe in interrupt context!
388 */ 394 */
389 void dm_rh_mark_nosync(struct dm_region_hash *rh, 395 void dm_rh_mark_nosync(struct dm_region_hash *rh,
390 struct bio *bio, unsigned done, int error) 396 struct bio *bio, unsigned done, int error)
391 { 397 {
392 unsigned long flags; 398 unsigned long flags;
393 struct dm_dirty_log *log = rh->log; 399 struct dm_dirty_log *log = rh->log;
394 struct dm_region *reg; 400 struct dm_region *reg;
395 region_t region = dm_rh_bio_to_region(rh, bio); 401 region_t region = dm_rh_bio_to_region(rh, bio);
396 int recovering = 0; 402 int recovering = 0;
397 403
404 if (bio_empty_barrier(bio)) {
405 rh->barrier_failure = 1;
406 return;
407 }
408
398 /* We must inform the log that the sync count has changed. */ 409 /* We must inform the log that the sync count has changed. */
399 log->type->set_region_sync(log, region, 0); 410 log->type->set_region_sync(log, region, 0);
400 411
401 read_lock(&rh->hash_lock); 412 read_lock(&rh->hash_lock);
402 reg = __rh_find(rh, region); 413 reg = __rh_find(rh, region);
403 read_unlock(&rh->hash_lock); 414 read_unlock(&rh->hash_lock);
404 415
405 /* region hash entry should exist because write was in-flight */ 416 /* region hash entry should exist because write was in-flight */
406 BUG_ON(!reg); 417 BUG_ON(!reg);
407 BUG_ON(!list_empty(&reg->list)); 418 BUG_ON(!list_empty(&reg->list));
408 419
409 spin_lock_irqsave(&rh->region_lock, flags); 420 spin_lock_irqsave(&rh->region_lock, flags);
410 /* 421 /*
411 * Possible cases: 422 * Possible cases:
412 * 1) DM_RH_DIRTY 423 * 1) DM_RH_DIRTY
413 * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed 424 * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed
414 * 3) DM_RH_RECOVERING: flushing pending writes 425 * 3) DM_RH_RECOVERING: flushing pending writes
415 * Either case, the region should have not been connected to list. 426 * Either case, the region should have not been connected to list.
416 */ 427 */
417 recovering = (reg->state == DM_RH_RECOVERING); 428 recovering = (reg->state == DM_RH_RECOVERING);
418 reg->state = DM_RH_NOSYNC; 429 reg->state = DM_RH_NOSYNC;
419 BUG_ON(!list_empty(&reg->list)); 430 BUG_ON(!list_empty(&reg->list));
420 spin_unlock_irqrestore(&rh->region_lock, flags); 431 spin_unlock_irqrestore(&rh->region_lock, flags);
421 432
422 bio_endio(bio, error); 433 bio_endio(bio, error);
423 if (recovering) 434 if (recovering)
424 complete_resync_work(reg, 0); 435 complete_resync_work(reg, 0);
425 } 436 }
426 EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); 437 EXPORT_SYMBOL_GPL(dm_rh_mark_nosync);
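
An empty barrier spans the whole mirror set rather than one region, so when it fails there is no single region to push out of sync; the new code instead latches the set-wide barrier_failure flag, which dm_rh_dec() consults on every later write completion (see the sketch after that function). Below is a minimal userspace sketch of that latch-once, consult-later pattern, with illustrative names only:

/* Minimal sketch of the barrier-failure latch: a failed flush sets a
 * sticky flag, and every later write completion is treated as possibly
 * lost.  Not the dm-region-hash API. */
#include <stdio.h>

struct toy_set { int barrier_failure; };

static void flush_completed(struct toy_set *s, int error)
{
	if (error)
		s->barrier_failure = 1;   /* don't know what reached the disk */
}

static const char *write_completed(struct toy_set *s)
{
	/* Once the latch is set, regions must be resynced, not marked clean. */
	return s->barrier_failure ? "mark region NOSYNC (force resync)"
				  : "mark region CLEAN";
}

int main(void)
{
	struct toy_set s = { 0 };

	printf("before failure: %s\n", write_completed(&s));
	flush_completed(&s, -5 /* e.g. -EIO */);
	printf("after failure:  %s\n", write_completed(&s));
	return 0;
}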
427 438
428 void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) 439 void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled)
429 { 440 {
430 struct dm_region *reg, *next; 441 struct dm_region *reg, *next;
431 442
432 LIST_HEAD(clean); 443 LIST_HEAD(clean);
433 LIST_HEAD(recovered); 444 LIST_HEAD(recovered);
434 LIST_HEAD(failed_recovered); 445 LIST_HEAD(failed_recovered);
435 446
436 /* 447 /*
437 * Quickly grab the lists. 448 * Quickly grab the lists.
438 */ 449 */
439 write_lock_irq(&rh->hash_lock); 450 write_lock_irq(&rh->hash_lock);
440 spin_lock(&rh->region_lock); 451 spin_lock(&rh->region_lock);
441 if (!list_empty(&rh->clean_regions)) { 452 if (!list_empty(&rh->clean_regions)) {
442 list_splice_init(&rh->clean_regions, &clean); 453 list_splice_init(&rh->clean_regions, &clean);
443 454
444 list_for_each_entry(reg, &clean, list) 455 list_for_each_entry(reg, &clean, list)
445 list_del(&reg->hash_list); 456 list_del(&reg->hash_list);
446 } 457 }
447 458
448 if (!list_empty(&rh->recovered_regions)) { 459 if (!list_empty(&rh->recovered_regions)) {
449 list_splice_init(&rh->recovered_regions, &recovered); 460 list_splice_init(&rh->recovered_regions, &recovered);
450 461
451 list_for_each_entry(reg, &recovered, list) 462 list_for_each_entry(reg, &recovered, list)
452 list_del(&reg->hash_list); 463 list_del(&reg->hash_list);
453 } 464 }
454 465
455 if (!list_empty(&rh->failed_recovered_regions)) { 466 if (!list_empty(&rh->failed_recovered_regions)) {
456 list_splice_init(&rh->failed_recovered_regions, 467 list_splice_init(&rh->failed_recovered_regions,
457 &failed_recovered); 468 &failed_recovered);
458 469
459 list_for_each_entry(reg, &failed_recovered, list) 470 list_for_each_entry(reg, &failed_recovered, list)
460 list_del(&reg->hash_list); 471 list_del(&reg->hash_list);
461 } 472 }
462 473
463 spin_unlock(&rh->region_lock); 474 spin_unlock(&rh->region_lock);
464 write_unlock_irq(&rh->hash_lock); 475 write_unlock_irq(&rh->hash_lock);
465 476
466 /* 477 /*
467 * All the regions on the recovered and clean lists have 478 * All the regions on the recovered and clean lists have
468 * now been pulled out of the system, so no need to do 479 * now been pulled out of the system, so no need to do
469 * any more locking. 480 * any more locking.
470 */ 481 */
471 list_for_each_entry_safe(reg, next, &recovered, list) { 482 list_for_each_entry_safe(reg, next, &recovered, list) {
472 rh->log->type->clear_region(rh->log, reg->key); 483 rh->log->type->clear_region(rh->log, reg->key);
473 complete_resync_work(reg, 1); 484 complete_resync_work(reg, 1);
474 mempool_free(reg, rh->region_pool); 485 mempool_free(reg, rh->region_pool);
475 } 486 }
476 487
477 list_for_each_entry_safe(reg, next, &failed_recovered, list) { 488 list_for_each_entry_safe(reg, next, &failed_recovered, list) {
478 complete_resync_work(reg, errors_handled ? 0 : 1); 489 complete_resync_work(reg, errors_handled ? 0 : 1);
479 mempool_free(reg, rh->region_pool); 490 mempool_free(reg, rh->region_pool);
480 } 491 }
481 492
482 list_for_each_entry_safe(reg, next, &clean, list) { 493 list_for_each_entry_safe(reg, next, &clean, list) {
483 rh->log->type->clear_region(rh->log, reg->key); 494 rh->log->type->clear_region(rh->log, reg->key);
484 mempool_free(reg, rh->region_pool); 495 mempool_free(reg, rh->region_pool);
485 } 496 }
486 497
487 rh->log->type->flush(rh->log); 498 rh->log->type->flush(rh->log);
488 } 499 }
489 EXPORT_SYMBOL_GPL(dm_rh_update_states); 500 EXPORT_SYMBOL_GPL(dm_rh_update_states);
490 501
491 static void rh_inc(struct dm_region_hash *rh, region_t region) 502 static void rh_inc(struct dm_region_hash *rh, region_t region)
492 { 503 {
493 struct dm_region *reg; 504 struct dm_region *reg;
494 505
495 read_lock(&rh->hash_lock); 506 read_lock(&rh->hash_lock);
496 reg = __rh_find(rh, region); 507 reg = __rh_find(rh, region);
497 508
498 spin_lock_irq(&rh->region_lock); 509 spin_lock_irq(&rh->region_lock);
499 atomic_inc(&reg->pending); 510 atomic_inc(&reg->pending);
500 511
501 if (reg->state == DM_RH_CLEAN) { 512 if (reg->state == DM_RH_CLEAN) {
502 reg->state = DM_RH_DIRTY; 513 reg->state = DM_RH_DIRTY;
503 list_del_init(&reg->list); /* take off the clean list */ 514 list_del_init(&reg->list); /* take off the clean list */
504 spin_unlock_irq(&rh->region_lock); 515 spin_unlock_irq(&rh->region_lock);
505 516
506 rh->log->type->mark_region(rh->log, reg->key); 517 rh->log->type->mark_region(rh->log, reg->key);
507 } else 518 } else
508 spin_unlock_irq(&rh->region_lock); 519 spin_unlock_irq(&rh->region_lock);
509 520
510 521
511 read_unlock(&rh->hash_lock); 522 read_unlock(&rh->hash_lock);
512 } 523 }
513 524
514 void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) 525 void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios)
515 { 526 {
516 struct bio *bio; 527 struct bio *bio;
517 528
518 for (bio = bios->head; bio; bio = bio->bi_next) 529 for (bio = bios->head; bio; bio = bio->bi_next) {
530 if (bio_empty_barrier(bio))
531 continue;
519 rh_inc(rh, dm_rh_bio_to_region(rh, bio)); 532 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
533 }
520 } 534 }
521 EXPORT_SYMBOL_GPL(dm_rh_inc_pending); 535 EXPORT_SYMBOL_GPL(dm_rh_inc_pending);
522 536
523 void dm_rh_dec(struct dm_region_hash *rh, region_t region) 537 void dm_rh_dec(struct dm_region_hash *rh, region_t region)
524 { 538 {
525 unsigned long flags; 539 unsigned long flags;
526 struct dm_region *reg; 540 struct dm_region *reg;
527 int should_wake = 0; 541 int should_wake = 0;
528 542
529 read_lock(&rh->hash_lock); 543 read_lock(&rh->hash_lock);
530 reg = __rh_lookup(rh, region); 544 reg = __rh_lookup(rh, region);
531 read_unlock(&rh->hash_lock); 545 read_unlock(&rh->hash_lock);
532 546
533 spin_lock_irqsave(&rh->region_lock, flags); 547 spin_lock_irqsave(&rh->region_lock, flags);
534 if (atomic_dec_and_test(&reg->pending)) { 548 if (atomic_dec_and_test(&reg->pending)) {
535 /* 549 /*
536 * There is no pending I/O for this region. 550 * There is no pending I/O for this region.
537 * We can move the region to corresponding list for next action. 551 * We can move the region to corresponding list for next action.
538 * At this point, the region is not yet connected to any list. 552 * At this point, the region is not yet connected to any list.
539 * 553 *
540 * If the state is DM_RH_NOSYNC, the region should be kept off 554 * If the state is DM_RH_NOSYNC, the region should be kept off
541 * from clean list. 555 * from clean list.
542 * The hash entry for DM_RH_NOSYNC will remain in memory 556 * The hash entry for DM_RH_NOSYNC will remain in memory
543 * until the region is recovered or the map is reloaded. 557 * until the region is recovered or the map is reloaded.
544 */ 558 */
545 559
546 /* do nothing for DM_RH_NOSYNC */ 560 /* do nothing for DM_RH_NOSYNC */
547 if (reg->state == DM_RH_RECOVERING) { 561 if (unlikely(rh->barrier_failure)) {
562 /*
563 * If a write barrier failed some time ago, we
564 * don't know whether or not this write made it
565 * to the disk, so we must resync the device.
566 */
567 reg->state = DM_RH_NOSYNC;
568 } else if (reg->state == DM_RH_RECOVERING) {
548 list_add_tail(&reg->list, &rh->quiesced_regions); 569 list_add_tail(&reg->list, &rh->quiesced_regions);
549 } else if (reg->state == DM_RH_DIRTY) { 570 } else if (reg->state == DM_RH_DIRTY) {
550 reg->state = DM_RH_CLEAN; 571 reg->state = DM_RH_CLEAN;
551 list_add(&reg->list, &rh->clean_regions); 572 list_add(&reg->list, &rh->clean_regions);
552 } 573 }
553 should_wake = 1; 574 should_wake = 1;
554 } 575 }
555 spin_unlock_irqrestore(&rh->region_lock, flags); 576 spin_unlock_irqrestore(&rh->region_lock, flags);
556 577
557 if (should_wake) 578 if (should_wake)
558 rh->wakeup_workers(rh->context); 579 rh->wakeup_workers(rh->context);
559 } 580 }
560 EXPORT_SYMBOL_GPL(dm_rh_dec); 581 EXPORT_SYMBOL_GPL(dm_rh_dec);
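
When the last pending write on a region completes, dm_rh_dec() decides where the region goes next; the branch added by this commit keeps regions off the clean list once any barrier has failed, forcing a resync instead. A hedged userspace sketch of that decision, with toy state names standing in for DM_RH_*:

/* Userspace sketch of the dm_rh_dec() end-of-I/O decision, including
 * the barrier_failure branch.  States and strings are illustrative. */
#include <stdio.h>

enum toy_state { R_CLEAN, R_DIRTY, R_NOSYNC, R_RECOVERING };

static const char *last_write_done(enum toy_state *state, int barrier_failure)
{
	if (barrier_failure) {
		/* A flush failed at some point: this write may never have
		 * reached the disk, so the region must be resynced. */
		*state = R_NOSYNC;
		return "stay off the clean list";
	}
	if (*state == R_RECOVERING)
		return "move to quiesced_regions (ready for kcopyd)";
	if (*state == R_DIRTY) {
		*state = R_CLEAN;
		return "move to clean_regions";
	}
	return "NOSYNC: do nothing";         /* kept until recovery/reload */
}

int main(void)
{
	enum toy_state s = R_DIRTY;

	printf("%s\n", last_write_done(&s, 0));   /* -> clean_regions */
	s = R_DIRTY;
	printf("%s\n", last_write_done(&s, 1));   /* -> forced NOSYNC */
	return 0;
}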
561 582
562 /* 583 /*
563 * Starts quiescing a region in preparation for recovery. 584 * Starts quiescing a region in preparation for recovery.
564 */ 585 */
565 static int __rh_recovery_prepare(struct dm_region_hash *rh) 586 static int __rh_recovery_prepare(struct dm_region_hash *rh)
566 { 587 {
567 int r; 588 int r;
568 region_t region; 589 region_t region;
569 struct dm_region *reg; 590 struct dm_region *reg;
570 591
571 /* 592 /*
572 * Ask the dirty log what's next. 593 * Ask the dirty log what's next.
573 */ 594 */
574 r = rh->log->type->get_resync_work(rh->log, &region); 595 r = rh->log->type->get_resync_work(rh->log, &region);
575 if (r <= 0) 596 if (r <= 0)
576 return r; 597 return r;
577 598
578 /* 599 /*
579 * Get this region, and start it quiescing by setting the 600 * Get this region, and start it quiescing by setting the
580 * recovering flag. 601 * recovering flag.
581 */ 602 */
582 read_lock(&rh->hash_lock); 603 read_lock(&rh->hash_lock);
583 reg = __rh_find(rh, region); 604 reg = __rh_find(rh, region);
584 read_unlock(&rh->hash_lock); 605 read_unlock(&rh->hash_lock);
585 606
586 spin_lock_irq(&rh->region_lock); 607 spin_lock_irq(&rh->region_lock);
587 reg->state = DM_RH_RECOVERING; 608 reg->state = DM_RH_RECOVERING;
588 609
589 /* Already quiesced ? */ 610 /* Already quiesced ? */
590 if (atomic_read(&reg->pending)) 611 if (atomic_read(&reg->pending))
591 list_del_init(&reg->list); 612 list_del_init(&reg->list);
592 else 613 else
593 list_move(&reg->list, &rh->quiesced_regions); 614 list_move(&reg->list, &rh->quiesced_regions);
594 615
595 spin_unlock_irq(&rh->region_lock); 616 spin_unlock_irq(&rh->region_lock);
596 617
597 return 1; 618 return 1;
598 } 619 }
599 620
600 void dm_rh_recovery_prepare(struct dm_region_hash *rh) 621 void dm_rh_recovery_prepare(struct dm_region_hash *rh)
601 { 622 {
602 /* Extra reference to avoid race with dm_rh_stop_recovery */ 623 /* Extra reference to avoid race with dm_rh_stop_recovery */
603 atomic_inc(&rh->recovery_in_flight); 624 atomic_inc(&rh->recovery_in_flight);
604 625
605 while (!down_trylock(&rh->recovery_count)) { 626 while (!down_trylock(&rh->recovery_count)) {
606 atomic_inc(&rh->recovery_in_flight); 627 atomic_inc(&rh->recovery_in_flight);
607 if (__rh_recovery_prepare(rh) <= 0) { 628 if (__rh_recovery_prepare(rh) <= 0) {
608 atomic_dec(&rh->recovery_in_flight); 629 atomic_dec(&rh->recovery_in_flight);
609 up(&rh->recovery_count); 630 up(&rh->recovery_count);
610 break; 631 break;
611 } 632 }
612 } 633 }
613 634
614 /* Drop the extra reference */ 635 /* Drop the extra reference */
615 if (atomic_dec_and_test(&rh->recovery_in_flight)) 636 if (atomic_dec_and_test(&rh->recovery_in_flight))
616 rh->wakeup_all_recovery_waiters(rh->context); 637 rh->wakeup_all_recovery_waiters(rh->context);
617 } 638 }
618 EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); 639 EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare);
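
dm_rh_recovery_prepare() holds one extra recovery_in_flight reference while it queues work, so the "last recovery finished" wakeup cannot fire between iterations of the loop. The toy program below models that extra-reference pattern with plain (non-atomic) counters; names are illustrative only.

/* Toy version of the "hold one extra reference while queueing work"
 * pattern: the waiter wakeup can only fire once the extra reference is
 * dropped at the end.  Plain ints stand in for atomic_t. */
#include <stdio.h>

static int in_flight;

static void wake_waiters(void)
{
	printf("all recovery finished, waking waiters\n");
}

static void put_ref(void)
{
	if (--in_flight == 0)
		wake_waiters();
}

static void prepare_recovery(int regions_to_queue)
{
	int i;

	in_flight++;                       /* extra ref: no early wakeup */

	for (i = 0; i < regions_to_queue; i++) {
		in_flight++;               /* one ref per queued region */
		printf("queued region %d\n", i);
	}

	put_ref();                         /* drop the extra ref */
}

int main(void)
{
	prepare_recovery(2);
	put_ref();                         /* region 0 recovered */
	put_ref();                         /* region 1 recovered: wakeup */
	return 0;
}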
619 640
620 /* 641 /*
621 * Returns any quiesced regions. 642 * Returns any quiesced regions.
622 */ 643 */
623 struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) 644 struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh)
624 { 645 {
625 struct dm_region *reg = NULL; 646 struct dm_region *reg = NULL;
626 647
627 spin_lock_irq(&rh->region_lock); 648 spin_lock_irq(&rh->region_lock);
628 if (!list_empty(&rh->quiesced_regions)) { 649 if (!list_empty(&rh->quiesced_regions)) {
629 reg = list_entry(rh->quiesced_regions.next, 650 reg = list_entry(rh->quiesced_regions.next,
630 struct dm_region, list); 651 struct dm_region, list);
631 list_del_init(&reg->list); /* remove from the quiesced list */ 652 list_del_init(&reg->list); /* remove from the quiesced list */
632 } 653 }
633 spin_unlock_irq(&rh->region_lock); 654 spin_unlock_irq(&rh->region_lock);
634 655
635 return reg; 656 return reg;
636 } 657 }
637 EXPORT_SYMBOL_GPL(dm_rh_recovery_start); 658 EXPORT_SYMBOL_GPL(dm_rh_recovery_start);
638 659
639 void dm_rh_recovery_end(struct dm_region *reg, int success) 660 void dm_rh_recovery_end(struct dm_region *reg, int success)
640 { 661 {
641 struct dm_region_hash *rh = reg->rh; 662 struct dm_region_hash *rh = reg->rh;
642 663
643 spin_lock_irq(&rh->region_lock); 664 spin_lock_irq(&rh->region_lock);
644 if (success) 665 if (success)
645 list_add(&reg->list, &reg->rh->recovered_regions); 666 list_add(&reg->list, &reg->rh->recovered_regions);
646 else { 667 else {
647 reg->state = DM_RH_NOSYNC; 668 reg->state = DM_RH_NOSYNC;
648 list_add(&reg->list, &reg->rh->failed_recovered_regions); 669 list_add(&reg->list, &reg->rh->failed_recovered_regions);
649 } 670 }
650 spin_unlock_irq(&rh->region_lock); 671 spin_unlock_irq(&rh->region_lock);
651 672
652 rh->wakeup_workers(rh->context); 673 rh->wakeup_workers(rh->context);
653 } 674 }
654 EXPORT_SYMBOL_GPL(dm_rh_recovery_end); 675 EXPORT_SYMBOL_GPL(dm_rh_recovery_end);
655 676
656 /* Return recovery in flight count. */ 677 /* Return recovery in flight count. */
657 int dm_rh_recovery_in_flight(struct dm_region_hash *rh) 678 int dm_rh_recovery_in_flight(struct dm_region_hash *rh)
658 { 679 {
659 return atomic_read(&rh->recovery_in_flight); 680 return atomic_read(&rh->recovery_in_flight);
660 } 681 }
661 EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); 682 EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight);
662 683
663 int dm_rh_flush(struct dm_region_hash *rh) 684 int dm_rh_flush(struct dm_region_hash *rh)
664 { 685 {
665 return rh->log->type->flush(rh->log); 686 return rh->log->type->flush(rh->log);
666 } 687 }
667 EXPORT_SYMBOL_GPL(dm_rh_flush); 688 EXPORT_SYMBOL_GPL(dm_rh_flush);
668 689
669 void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) 690 void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio)
670 { 691 {
671 struct dm_region *reg; 692 struct dm_region *reg;
672 693
673 read_lock(&rh->hash_lock); 694 read_lock(&rh->hash_lock);
674 reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); 695 reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio));
675 bio_list_add(&reg->delayed_bios, bio); 696 bio_list_add(&reg->delayed_bios, bio);
676 read_unlock(&rh->hash_lock); 697 read_unlock(&rh->hash_lock);
677 } 698 }
678 EXPORT_SYMBOL_GPL(dm_rh_delay); 699 EXPORT_SYMBOL_GPL(dm_rh_delay);
679 700
680 void dm_rh_stop_recovery(struct dm_region_hash *rh) 701 void dm_rh_stop_recovery(struct dm_region_hash *rh)
681 { 702 {
682 int i; 703 int i;
683 704
684 /* wait for any recovering regions */ 705 /* wait for any recovering regions */
685 for (i = 0; i < rh->max_recovery; i++) 706 for (i = 0; i < rh->max_recovery; i++)
686 down(&rh->recovery_count); 707 down(&rh->recovery_count);
687 } 708 }
688 EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); 709 EXPORT_SYMBOL_GPL(dm_rh_stop_recovery);
689 710
690 void dm_rh_start_recovery(struct dm_region_hash *rh) 711 void dm_rh_start_recovery(struct dm_region_hash *rh)
691 { 712 {
692 int i; 713 int i;
693 714
694 for (i = 0; i < rh->max_recovery; i++) 715 for (i = 0; i < rh->max_recovery; i++)
695 up(&rh->recovery_count); 716 up(&rh->recovery_count);
696 717
697 rh->wakeup_workers(rh->context); 718 rh->wakeup_workers(rh->context);
698 } 719 }
699 EXPORT_SYMBOL_GPL(dm_rh_start_recovery); 720 EXPORT_SYMBOL_GPL(dm_rh_start_recovery);
700 721
701 MODULE_DESCRIPTION(DM_NAME " region hash"); 722 MODULE_DESCRIPTION(DM_NAME " region hash");
702 MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); 723 MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>");
703 MODULE_LICENSE("GPL"); 724 MODULE_LICENSE("GPL");
704 725