Commit 4184153f9e483f9bb63339ed316e059962fe9794
Committed by Alasdair G Kergon
1 parent f1e5398746
Exists in master and in 7 other branches
dm raid1: support flush
Flush support for dm-raid1.

When it receives an empty barrier, submit it to all the devices via dm-io.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
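In short, do_writes() now routes any empty barrier onto the sync list so it reaches do_write(), do_write() carries the WRITE_BARRIER flag through to dm-io so the barrier is issued to every mirror leg, and map_sector() becomes a no-op for zero-size bios so each leg sees sector 0, count 0. Condensed into one function, the dispatch looks roughly like this (a sketch assuming the mirror_set helpers visible in the diff below; it collapses the bio-list bookkeeping and is not a verbatim copy of the driver):

	/*
	 * Sketch of how dm-raid1 replicates an empty barrier after this patch.
	 * Assumes the struct mirror_set, map_region() and write_callback()
	 * helpers shown in the diff below.
	 */
	static void mirror_flush_sketch(struct mirror_set *ms, struct bio *bio)
	{
		unsigned i;
		struct mirror *m;
		struct dm_io_region io[ms->nr_mirrors];
		struct dm_io_request io_req = {
			/* Forward the barrier flag so dm-io issues a barrier write. */
			.bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
			.mem.type = DM_IO_BVEC,
			.mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
			.notify.fn = write_callback,
			.notify.context = bio,
			.client = ms->io_client,
		};

		/*
		 * bio->bi_size == 0 for an empty barrier, so map_region() fills in
		 * sector 0 and count 0 for every leg: no data is written, each
		 * device only receives the barrier itself.
		 */
		for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++)
			map_region(&io[i], m, bio);

		/* Stash the mirror set for write_callback(), as do_write() does. */
		bio_set_m(bio, get_default_mirror(ms));
		BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL));
	}

The completion path is unchanged: write_callback() ends the original bio once dm-io reports back from all the legs.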
Showing 2 changed files with 34 additions and 4 deletions Inline Diff
drivers/md/dm-raid1.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Sistina Software Limited. | 2 | * Copyright (C) 2003 Sistina Software Limited. |
3 | * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. |
4 | * | 4 | * |
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include "dm-bio-record.h" | 8 | #include "dm-bio-record.h" |
9 | 9 | ||
10 | #include <linux/init.h> | 10 | #include <linux/init.h> |
11 | #include <linux/mempool.h> | 11 | #include <linux/mempool.h> |
12 | #include <linux/module.h> | 12 | #include <linux/module.h> |
13 | #include <linux/pagemap.h> | 13 | #include <linux/pagemap.h> |
14 | #include <linux/slab.h> | 14 | #include <linux/slab.h> |
15 | #include <linux/workqueue.h> | 15 | #include <linux/workqueue.h> |
16 | #include <linux/device-mapper.h> | 16 | #include <linux/device-mapper.h> |
17 | #include <linux/dm-io.h> | 17 | #include <linux/dm-io.h> |
18 | #include <linux/dm-dirty-log.h> | 18 | #include <linux/dm-dirty-log.h> |
19 | #include <linux/dm-kcopyd.h> | 19 | #include <linux/dm-kcopyd.h> |
20 | #include <linux/dm-region-hash.h> | 20 | #include <linux/dm-region-hash.h> |
21 | 21 | ||
22 | #define DM_MSG_PREFIX "raid1" | 22 | #define DM_MSG_PREFIX "raid1" |
23 | 23 | ||
24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ | 24 | #define MAX_RECOVERY 1 /* Maximum number of regions recovered in parallel. */ |
25 | #define DM_IO_PAGES 64 | 25 | #define DM_IO_PAGES 64 |
26 | #define DM_KCOPYD_PAGES 64 | 26 | #define DM_KCOPYD_PAGES 64 |
27 | 27 | ||
28 | #define DM_RAID1_HANDLE_ERRORS 0x01 | 28 | #define DM_RAID1_HANDLE_ERRORS 0x01 |
29 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) | 29 | #define errors_handled(p) ((p)->features & DM_RAID1_HANDLE_ERRORS) |
30 | 30 | ||
31 | static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); | 31 | static DECLARE_WAIT_QUEUE_HEAD(_kmirrord_recovery_stopped); |
32 | 32 | ||
33 | /*----------------------------------------------------------------- | 33 | /*----------------------------------------------------------------- |
34 | * Mirror set structures. | 34 | * Mirror set structures. |
35 | *---------------------------------------------------------------*/ | 35 | *---------------------------------------------------------------*/ |
36 | enum dm_raid1_error { | 36 | enum dm_raid1_error { |
37 | DM_RAID1_WRITE_ERROR, | 37 | DM_RAID1_WRITE_ERROR, |
38 | DM_RAID1_SYNC_ERROR, | 38 | DM_RAID1_SYNC_ERROR, |
39 | DM_RAID1_READ_ERROR | 39 | DM_RAID1_READ_ERROR |
40 | }; | 40 | }; |
41 | 41 | ||
42 | struct mirror { | 42 | struct mirror { |
43 | struct mirror_set *ms; | 43 | struct mirror_set *ms; |
44 | atomic_t error_count; | 44 | atomic_t error_count; |
45 | unsigned long error_type; | 45 | unsigned long error_type; |
46 | struct dm_dev *dev; | 46 | struct dm_dev *dev; |
47 | sector_t offset; | 47 | sector_t offset; |
48 | }; | 48 | }; |
49 | 49 | ||
50 | struct mirror_set { | 50 | struct mirror_set { |
51 | struct dm_target *ti; | 51 | struct dm_target *ti; |
52 | struct list_head list; | 52 | struct list_head list; |
53 | 53 | ||
54 | uint64_t features; | 54 | uint64_t features; |
55 | 55 | ||
56 | spinlock_t lock; /* protects the lists */ | 56 | spinlock_t lock; /* protects the lists */ |
57 | struct bio_list reads; | 57 | struct bio_list reads; |
58 | struct bio_list writes; | 58 | struct bio_list writes; |
59 | struct bio_list failures; | 59 | struct bio_list failures; |
60 | 60 | ||
61 | struct dm_region_hash *rh; | 61 | struct dm_region_hash *rh; |
62 | struct dm_kcopyd_client *kcopyd_client; | 62 | struct dm_kcopyd_client *kcopyd_client; |
63 | struct dm_io_client *io_client; | 63 | struct dm_io_client *io_client; |
64 | mempool_t *read_record_pool; | 64 | mempool_t *read_record_pool; |
65 | 65 | ||
66 | /* recovery */ | 66 | /* recovery */ |
67 | region_t nr_regions; | 67 | region_t nr_regions; |
68 | int in_sync; | 68 | int in_sync; |
69 | int log_failure; | 69 | int log_failure; |
70 | atomic_t suspend; | 70 | atomic_t suspend; |
71 | 71 | ||
72 | atomic_t default_mirror; /* Default mirror */ | 72 | atomic_t default_mirror; /* Default mirror */ |
73 | 73 | ||
74 | struct workqueue_struct *kmirrord_wq; | 74 | struct workqueue_struct *kmirrord_wq; |
75 | struct work_struct kmirrord_work; | 75 | struct work_struct kmirrord_work; |
76 | struct timer_list timer; | 76 | struct timer_list timer; |
77 | unsigned long timer_pending; | 77 | unsigned long timer_pending; |
78 | 78 | ||
79 | struct work_struct trigger_event; | 79 | struct work_struct trigger_event; |
80 | 80 | ||
81 | unsigned nr_mirrors; | 81 | unsigned nr_mirrors; |
82 | struct mirror mirror[0]; | 82 | struct mirror mirror[0]; |
83 | }; | 83 | }; |
84 | 84 | ||
85 | static void wakeup_mirrord(void *context) | 85 | static void wakeup_mirrord(void *context) |
86 | { | 86 | { |
87 | struct mirror_set *ms = context; | 87 | struct mirror_set *ms = context; |
88 | 88 | ||
89 | queue_work(ms->kmirrord_wq, &ms->kmirrord_work); | 89 | queue_work(ms->kmirrord_wq, &ms->kmirrord_work); |
90 | } | 90 | } |
91 | 91 | ||
92 | static void delayed_wake_fn(unsigned long data) | 92 | static void delayed_wake_fn(unsigned long data) |
93 | { | 93 | { |
94 | struct mirror_set *ms = (struct mirror_set *) data; | 94 | struct mirror_set *ms = (struct mirror_set *) data; |
95 | 95 | ||
96 | clear_bit(0, &ms->timer_pending); | 96 | clear_bit(0, &ms->timer_pending); |
97 | wakeup_mirrord(ms); | 97 | wakeup_mirrord(ms); |
98 | } | 98 | } |
99 | 99 | ||
100 | static void delayed_wake(struct mirror_set *ms) | 100 | static void delayed_wake(struct mirror_set *ms) |
101 | { | 101 | { |
102 | if (test_and_set_bit(0, &ms->timer_pending)) | 102 | if (test_and_set_bit(0, &ms->timer_pending)) |
103 | return; | 103 | return; |
104 | 104 | ||
105 | ms->timer.expires = jiffies + HZ / 5; | 105 | ms->timer.expires = jiffies + HZ / 5; |
106 | ms->timer.data = (unsigned long) ms; | 106 | ms->timer.data = (unsigned long) ms; |
107 | ms->timer.function = delayed_wake_fn; | 107 | ms->timer.function = delayed_wake_fn; |
108 | add_timer(&ms->timer); | 108 | add_timer(&ms->timer); |
109 | } | 109 | } |
110 | 110 | ||
111 | static void wakeup_all_recovery_waiters(void *context) | 111 | static void wakeup_all_recovery_waiters(void *context) |
112 | { | 112 | { |
113 | wake_up_all(&_kmirrord_recovery_stopped); | 113 | wake_up_all(&_kmirrord_recovery_stopped); |
114 | } | 114 | } |
115 | 115 | ||
116 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) | 116 | static void queue_bio(struct mirror_set *ms, struct bio *bio, int rw) |
117 | { | 117 | { |
118 | unsigned long flags; | 118 | unsigned long flags; |
119 | int should_wake = 0; | 119 | int should_wake = 0; |
120 | struct bio_list *bl; | 120 | struct bio_list *bl; |
121 | 121 | ||
122 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; | 122 | bl = (rw == WRITE) ? &ms->writes : &ms->reads; |
123 | spin_lock_irqsave(&ms->lock, flags); | 123 | spin_lock_irqsave(&ms->lock, flags); |
124 | should_wake = !(bl->head); | 124 | should_wake = !(bl->head); |
125 | bio_list_add(bl, bio); | 125 | bio_list_add(bl, bio); |
126 | spin_unlock_irqrestore(&ms->lock, flags); | 126 | spin_unlock_irqrestore(&ms->lock, flags); |
127 | 127 | ||
128 | if (should_wake) | 128 | if (should_wake) |
129 | wakeup_mirrord(ms); | 129 | wakeup_mirrord(ms); |
130 | } | 130 | } |
131 | 131 | ||
132 | static void dispatch_bios(void *context, struct bio_list *bio_list) | 132 | static void dispatch_bios(void *context, struct bio_list *bio_list) |
133 | { | 133 | { |
134 | struct mirror_set *ms = context; | 134 | struct mirror_set *ms = context; |
135 | struct bio *bio; | 135 | struct bio *bio; |
136 | 136 | ||
137 | while ((bio = bio_list_pop(bio_list))) | 137 | while ((bio = bio_list_pop(bio_list))) |
138 | queue_bio(ms, bio, WRITE); | 138 | queue_bio(ms, bio, WRITE); |
139 | } | 139 | } |
140 | 140 | ||
141 | #define MIN_READ_RECORDS 20 | 141 | #define MIN_READ_RECORDS 20 |
142 | struct dm_raid1_read_record { | 142 | struct dm_raid1_read_record { |
143 | struct mirror *m; | 143 | struct mirror *m; |
144 | struct dm_bio_details details; | 144 | struct dm_bio_details details; |
145 | }; | 145 | }; |
146 | 146 | ||
147 | static struct kmem_cache *_dm_raid1_read_record_cache; | 147 | static struct kmem_cache *_dm_raid1_read_record_cache; |
148 | 148 | ||
149 | /* | 149 | /* |
150 | * Every mirror should look like this one. | 150 | * Every mirror should look like this one. |
151 | */ | 151 | */ |
152 | #define DEFAULT_MIRROR 0 | 152 | #define DEFAULT_MIRROR 0 |
153 | 153 | ||
154 | /* | 154 | /* |
155 | * This is yucky. We squirrel the mirror struct away inside | 155 | * This is yucky. We squirrel the mirror struct away inside |
156 | * bi_next for read/write buffers. This is safe since the bh | 156 | * bi_next for read/write buffers. This is safe since the bh |
157 | * doesn't get submitted to the lower levels of block layer. | 157 | * doesn't get submitted to the lower levels of block layer. |
158 | */ | 158 | */ |
159 | static struct mirror *bio_get_m(struct bio *bio) | 159 | static struct mirror *bio_get_m(struct bio *bio) |
160 | { | 160 | { |
161 | return (struct mirror *) bio->bi_next; | 161 | return (struct mirror *) bio->bi_next; |
162 | } | 162 | } |
163 | 163 | ||
164 | static void bio_set_m(struct bio *bio, struct mirror *m) | 164 | static void bio_set_m(struct bio *bio, struct mirror *m) |
165 | { | 165 | { |
166 | bio->bi_next = (struct bio *) m; | 166 | bio->bi_next = (struct bio *) m; |
167 | } | 167 | } |
168 | 168 | ||
169 | static struct mirror *get_default_mirror(struct mirror_set *ms) | 169 | static struct mirror *get_default_mirror(struct mirror_set *ms) |
170 | { | 170 | { |
171 | return &ms->mirror[atomic_read(&ms->default_mirror)]; | 171 | return &ms->mirror[atomic_read(&ms->default_mirror)]; |
172 | } | 172 | } |
173 | 173 | ||
174 | static void set_default_mirror(struct mirror *m) | 174 | static void set_default_mirror(struct mirror *m) |
175 | { | 175 | { |
176 | struct mirror_set *ms = m->ms; | 176 | struct mirror_set *ms = m->ms; |
177 | struct mirror *m0 = &(ms->mirror[0]); | 177 | struct mirror *m0 = &(ms->mirror[0]); |
178 | 178 | ||
179 | atomic_set(&ms->default_mirror, m - m0); | 179 | atomic_set(&ms->default_mirror, m - m0); |
180 | } | 180 | } |
181 | 181 | ||
182 | /* fail_mirror | 182 | /* fail_mirror |
183 | * @m: mirror device to fail | 183 | * @m: mirror device to fail |
184 | * @error_type: one of the enum's, DM_RAID1_*_ERROR | 184 | * @error_type: one of the enum's, DM_RAID1_*_ERROR |
185 | * | 185 | * |
186 | * If errors are being handled, record the type of | 186 | * If errors are being handled, record the type of |
187 | * error encountered for this device. If this type | 187 | * error encountered for this device. If this type |
188 | * of error has already been recorded, we can return; | 188 | * of error has already been recorded, we can return; |
189 | * otherwise, we must signal userspace by triggering | 189 | * otherwise, we must signal userspace by triggering |
190 | * an event. Additionally, if the device is the | 190 | * an event. Additionally, if the device is the |
191 | * primary device, we must choose a new primary, but | 191 | * primary device, we must choose a new primary, but |
192 | * only if the mirror is in-sync. | 192 | * only if the mirror is in-sync. |
193 | * | 193 | * |
194 | * This function must not block. | 194 | * This function must not block. |
195 | */ | 195 | */ |
196 | static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) | 196 | static void fail_mirror(struct mirror *m, enum dm_raid1_error error_type) |
197 | { | 197 | { |
198 | struct mirror_set *ms = m->ms; | 198 | struct mirror_set *ms = m->ms; |
199 | struct mirror *new; | 199 | struct mirror *new; |
200 | 200 | ||
201 | /* | 201 | /* |
202 | * error_count is used for nothing more than a | 202 | * error_count is used for nothing more than a |
203 | * simple way to tell if a device has encountered | 203 | * simple way to tell if a device has encountered |
204 | * errors. | 204 | * errors. |
205 | */ | 205 | */ |
206 | atomic_inc(&m->error_count); | 206 | atomic_inc(&m->error_count); |
207 | 207 | ||
208 | if (test_and_set_bit(error_type, &m->error_type)) | 208 | if (test_and_set_bit(error_type, &m->error_type)) |
209 | return; | 209 | return; |
210 | 210 | ||
211 | if (!errors_handled(ms)) | 211 | if (!errors_handled(ms)) |
212 | return; | 212 | return; |
213 | 213 | ||
214 | if (m != get_default_mirror(ms)) | 214 | if (m != get_default_mirror(ms)) |
215 | goto out; | 215 | goto out; |
216 | 216 | ||
217 | if (!ms->in_sync) { | 217 | if (!ms->in_sync) { |
218 | /* | 218 | /* |
219 | * Better to issue requests to same failing device | 219 | * Better to issue requests to same failing device |
220 | * than to risk returning corrupt data. | 220 | * than to risk returning corrupt data. |
221 | */ | 221 | */ |
222 | DMERR("Primary mirror (%s) failed while out-of-sync: " | 222 | DMERR("Primary mirror (%s) failed while out-of-sync: " |
223 | "Reads may fail.", m->dev->name); | 223 | "Reads may fail.", m->dev->name); |
224 | goto out; | 224 | goto out; |
225 | } | 225 | } |
226 | 226 | ||
227 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) | 227 | for (new = ms->mirror; new < ms->mirror + ms->nr_mirrors; new++) |
228 | if (!atomic_read(&new->error_count)) { | 228 | if (!atomic_read(&new->error_count)) { |
229 | set_default_mirror(new); | 229 | set_default_mirror(new); |
230 | break; | 230 | break; |
231 | } | 231 | } |
232 | 232 | ||
233 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) | 233 | if (unlikely(new == ms->mirror + ms->nr_mirrors)) |
234 | DMWARN("All sides of mirror have failed."); | 234 | DMWARN("All sides of mirror have failed."); |
235 | 235 | ||
236 | out: | 236 | out: |
237 | schedule_work(&ms->trigger_event); | 237 | schedule_work(&ms->trigger_event); |
238 | } | 238 | } |
239 | 239 | ||
240 | /*----------------------------------------------------------------- | 240 | /*----------------------------------------------------------------- |
241 | * Recovery. | 241 | * Recovery. |
242 | * | 242 | * |
243 | * When a mirror is first activated we may find that some regions | 243 | * When a mirror is first activated we may find that some regions |
244 | * are in the no-sync state. We have to recover these by | 244 | * are in the no-sync state. We have to recover these by |
245 | * recopying from the default mirror to all the others. | 245 | * recopying from the default mirror to all the others. |
246 | *---------------------------------------------------------------*/ | 246 | *---------------------------------------------------------------*/ |
247 | static void recovery_complete(int read_err, unsigned long write_err, | 247 | static void recovery_complete(int read_err, unsigned long write_err, |
248 | void *context) | 248 | void *context) |
249 | { | 249 | { |
250 | struct dm_region *reg = context; | 250 | struct dm_region *reg = context; |
251 | struct mirror_set *ms = dm_rh_region_context(reg); | 251 | struct mirror_set *ms = dm_rh_region_context(reg); |
252 | int m, bit = 0; | 252 | int m, bit = 0; |
253 | 253 | ||
254 | if (read_err) { | 254 | if (read_err) { |
255 | /* Read error means the failure of default mirror. */ | 255 | /* Read error means the failure of default mirror. */ |
256 | DMERR_LIMIT("Unable to read primary mirror during recovery"); | 256 | DMERR_LIMIT("Unable to read primary mirror during recovery"); |
257 | fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); | 257 | fail_mirror(get_default_mirror(ms), DM_RAID1_SYNC_ERROR); |
258 | } | 258 | } |
259 | 259 | ||
260 | if (write_err) { | 260 | if (write_err) { |
261 | DMERR_LIMIT("Write error during recovery (error = 0x%lx)", | 261 | DMERR_LIMIT("Write error during recovery (error = 0x%lx)", |
262 | write_err); | 262 | write_err); |
263 | /* | 263 | /* |
264 | * Bits correspond to devices (excluding default mirror). | 264 | * Bits correspond to devices (excluding default mirror). |
265 | * The default mirror cannot change during recovery. | 265 | * The default mirror cannot change during recovery. |
266 | */ | 266 | */ |
267 | for (m = 0; m < ms->nr_mirrors; m++) { | 267 | for (m = 0; m < ms->nr_mirrors; m++) { |
268 | if (&ms->mirror[m] == get_default_mirror(ms)) | 268 | if (&ms->mirror[m] == get_default_mirror(ms)) |
269 | continue; | 269 | continue; |
270 | if (test_bit(bit, &write_err)) | 270 | if (test_bit(bit, &write_err)) |
271 | fail_mirror(ms->mirror + m, | 271 | fail_mirror(ms->mirror + m, |
272 | DM_RAID1_SYNC_ERROR); | 272 | DM_RAID1_SYNC_ERROR); |
273 | bit++; | 273 | bit++; |
274 | } | 274 | } |
275 | } | 275 | } |
276 | 276 | ||
277 | dm_rh_recovery_end(reg, !(read_err || write_err)); | 277 | dm_rh_recovery_end(reg, !(read_err || write_err)); |
278 | } | 278 | } |
279 | 279 | ||
280 | static int recover(struct mirror_set *ms, struct dm_region *reg) | 280 | static int recover(struct mirror_set *ms, struct dm_region *reg) |
281 | { | 281 | { |
282 | int r; | 282 | int r; |
283 | unsigned i; | 283 | unsigned i; |
284 | struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; | 284 | struct dm_io_region from, to[DM_KCOPYD_MAX_REGIONS], *dest; |
285 | struct mirror *m; | 285 | struct mirror *m; |
286 | unsigned long flags = 0; | 286 | unsigned long flags = 0; |
287 | region_t key = dm_rh_get_region_key(reg); | 287 | region_t key = dm_rh_get_region_key(reg); |
288 | sector_t region_size = dm_rh_get_region_size(ms->rh); | 288 | sector_t region_size = dm_rh_get_region_size(ms->rh); |
289 | 289 | ||
290 | /* fill in the source */ | 290 | /* fill in the source */ |
291 | m = get_default_mirror(ms); | 291 | m = get_default_mirror(ms); |
292 | from.bdev = m->dev->bdev; | 292 | from.bdev = m->dev->bdev; |
293 | from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); | 293 | from.sector = m->offset + dm_rh_region_to_sector(ms->rh, key); |
294 | if (key == (ms->nr_regions - 1)) { | 294 | if (key == (ms->nr_regions - 1)) { |
295 | /* | 295 | /* |
296 | * The final region may be smaller than | 296 | * The final region may be smaller than |
297 | * region_size. | 297 | * region_size. |
298 | */ | 298 | */ |
299 | from.count = ms->ti->len & (region_size - 1); | 299 | from.count = ms->ti->len & (region_size - 1); |
300 | if (!from.count) | 300 | if (!from.count) |
301 | from.count = region_size; | 301 | from.count = region_size; |
302 | } else | 302 | } else |
303 | from.count = region_size; | 303 | from.count = region_size; |
304 | 304 | ||
305 | /* fill in the destinations */ | 305 | /* fill in the destinations */ |
306 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { | 306 | for (i = 0, dest = to; i < ms->nr_mirrors; i++) { |
307 | if (&ms->mirror[i] == get_default_mirror(ms)) | 307 | if (&ms->mirror[i] == get_default_mirror(ms)) |
308 | continue; | 308 | continue; |
309 | 309 | ||
310 | m = ms->mirror + i; | 310 | m = ms->mirror + i; |
311 | dest->bdev = m->dev->bdev; | 311 | dest->bdev = m->dev->bdev; |
312 | dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); | 312 | dest->sector = m->offset + dm_rh_region_to_sector(ms->rh, key); |
313 | dest->count = from.count; | 313 | dest->count = from.count; |
314 | dest++; | 314 | dest++; |
315 | } | 315 | } |
316 | 316 | ||
317 | /* hand to kcopyd */ | 317 | /* hand to kcopyd */ |
318 | if (!errors_handled(ms)) | 318 | if (!errors_handled(ms)) |
319 | set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); | 319 | set_bit(DM_KCOPYD_IGNORE_ERROR, &flags); |
320 | 320 | ||
321 | r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, | 321 | r = dm_kcopyd_copy(ms->kcopyd_client, &from, ms->nr_mirrors - 1, to, |
322 | flags, recovery_complete, reg); | 322 | flags, recovery_complete, reg); |
323 | 323 | ||
324 | return r; | 324 | return r; |
325 | } | 325 | } |
326 | 326 | ||
327 | static void do_recovery(struct mirror_set *ms) | 327 | static void do_recovery(struct mirror_set *ms) |
328 | { | 328 | { |
329 | struct dm_region *reg; | 329 | struct dm_region *reg; |
330 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 330 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
331 | int r; | 331 | int r; |
332 | 332 | ||
333 | /* | 333 | /* |
334 | * Start quiescing some regions. | 334 | * Start quiescing some regions. |
335 | */ | 335 | */ |
336 | dm_rh_recovery_prepare(ms->rh); | 336 | dm_rh_recovery_prepare(ms->rh); |
337 | 337 | ||
338 | /* | 338 | /* |
339 | * Copy any already quiesced regions. | 339 | * Copy any already quiesced regions. |
340 | */ | 340 | */ |
341 | while ((reg = dm_rh_recovery_start(ms->rh))) { | 341 | while ((reg = dm_rh_recovery_start(ms->rh))) { |
342 | r = recover(ms, reg); | 342 | r = recover(ms, reg); |
343 | if (r) | 343 | if (r) |
344 | dm_rh_recovery_end(reg, 0); | 344 | dm_rh_recovery_end(reg, 0); |
345 | } | 345 | } |
346 | 346 | ||
347 | /* | 347 | /* |
348 | * Update the in sync flag. | 348 | * Update the in sync flag. |
349 | */ | 349 | */ |
350 | if (!ms->in_sync && | 350 | if (!ms->in_sync && |
351 | (log->type->get_sync_count(log) == ms->nr_regions)) { | 351 | (log->type->get_sync_count(log) == ms->nr_regions)) { |
352 | /* the sync is complete */ | 352 | /* the sync is complete */ |
353 | dm_table_event(ms->ti->table); | 353 | dm_table_event(ms->ti->table); |
354 | ms->in_sync = 1; | 354 | ms->in_sync = 1; |
355 | } | 355 | } |
356 | } | 356 | } |
357 | 357 | ||
358 | /*----------------------------------------------------------------- | 358 | /*----------------------------------------------------------------- |
359 | * Reads | 359 | * Reads |
360 | *---------------------------------------------------------------*/ | 360 | *---------------------------------------------------------------*/ |
361 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) | 361 | static struct mirror *choose_mirror(struct mirror_set *ms, sector_t sector) |
362 | { | 362 | { |
363 | struct mirror *m = get_default_mirror(ms); | 363 | struct mirror *m = get_default_mirror(ms); |
364 | 364 | ||
365 | do { | 365 | do { |
366 | if (likely(!atomic_read(&m->error_count))) | 366 | if (likely(!atomic_read(&m->error_count))) |
367 | return m; | 367 | return m; |
368 | 368 | ||
369 | if (m-- == ms->mirror) | 369 | if (m-- == ms->mirror) |
370 | m += ms->nr_mirrors; | 370 | m += ms->nr_mirrors; |
371 | } while (m != get_default_mirror(ms)); | 371 | } while (m != get_default_mirror(ms)); |
372 | 372 | ||
373 | return NULL; | 373 | return NULL; |
374 | } | 374 | } |
375 | 375 | ||
376 | static int default_ok(struct mirror *m) | 376 | static int default_ok(struct mirror *m) |
377 | { | 377 | { |
378 | struct mirror *default_mirror = get_default_mirror(m->ms); | 378 | struct mirror *default_mirror = get_default_mirror(m->ms); |
379 | 379 | ||
380 | return !atomic_read(&default_mirror->error_count); | 380 | return !atomic_read(&default_mirror->error_count); |
381 | } | 381 | } |
382 | 382 | ||
383 | static int mirror_available(struct mirror_set *ms, struct bio *bio) | 383 | static int mirror_available(struct mirror_set *ms, struct bio *bio) |
384 | { | 384 | { |
385 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 385 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
386 | region_t region = dm_rh_bio_to_region(ms->rh, bio); | 386 | region_t region = dm_rh_bio_to_region(ms->rh, bio); |
387 | 387 | ||
388 | if (log->type->in_sync(log, region, 0)) | 388 | if (log->type->in_sync(log, region, 0)) |
389 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; | 389 | return choose_mirror(ms, bio->bi_sector) ? 1 : 0; |
390 | 390 | ||
391 | return 0; | 391 | return 0; |
392 | } | 392 | } |
393 | 393 | ||
394 | /* | 394 | /* |
395 | * remap a buffer to a particular mirror. | 395 | * remap a buffer to a particular mirror. |
396 | */ | 396 | */ |
397 | static sector_t map_sector(struct mirror *m, struct bio *bio) | 397 | static sector_t map_sector(struct mirror *m, struct bio *bio) |
398 | { | 398 | { |
399 | if (unlikely(!bio->bi_size)) | ||
400 | return 0; | ||
399 | return m->offset + (bio->bi_sector - m->ms->ti->begin); | 401 | return m->offset + (bio->bi_sector - m->ms->ti->begin); |
400 | } | 402 | } |
401 | 403 | ||
402 | static void map_bio(struct mirror *m, struct bio *bio) | 404 | static void map_bio(struct mirror *m, struct bio *bio) |
403 | { | 405 | { |
404 | bio->bi_bdev = m->dev->bdev; | 406 | bio->bi_bdev = m->dev->bdev; |
405 | bio->bi_sector = map_sector(m, bio); | 407 | bio->bi_sector = map_sector(m, bio); |
406 | } | 408 | } |
407 | 409 | ||
408 | static void map_region(struct dm_io_region *io, struct mirror *m, | 410 | static void map_region(struct dm_io_region *io, struct mirror *m, |
409 | struct bio *bio) | 411 | struct bio *bio) |
410 | { | 412 | { |
411 | io->bdev = m->dev->bdev; | 413 | io->bdev = m->dev->bdev; |
412 | io->sector = map_sector(m, bio); | 414 | io->sector = map_sector(m, bio); |
413 | io->count = bio->bi_size >> 9; | 415 | io->count = bio->bi_size >> 9; |
414 | } | 416 | } |
415 | 417 | ||
416 | /*----------------------------------------------------------------- | 418 | /*----------------------------------------------------------------- |
417 | * Reads | 419 | * Reads |
418 | *---------------------------------------------------------------*/ | 420 | *---------------------------------------------------------------*/ |
419 | static void read_callback(unsigned long error, void *context) | 421 | static void read_callback(unsigned long error, void *context) |
420 | { | 422 | { |
421 | struct bio *bio = context; | 423 | struct bio *bio = context; |
422 | struct mirror *m; | 424 | struct mirror *m; |
423 | 425 | ||
424 | m = bio_get_m(bio); | 426 | m = bio_get_m(bio); |
425 | bio_set_m(bio, NULL); | 427 | bio_set_m(bio, NULL); |
426 | 428 | ||
427 | if (likely(!error)) { | 429 | if (likely(!error)) { |
428 | bio_endio(bio, 0); | 430 | bio_endio(bio, 0); |
429 | return; | 431 | return; |
430 | } | 432 | } |
431 | 433 | ||
432 | fail_mirror(m, DM_RAID1_READ_ERROR); | 434 | fail_mirror(m, DM_RAID1_READ_ERROR); |
433 | 435 | ||
434 | if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { | 436 | if (likely(default_ok(m)) || mirror_available(m->ms, bio)) { |
435 | DMWARN_LIMIT("Read failure on mirror device %s. " | 437 | DMWARN_LIMIT("Read failure on mirror device %s. " |
436 | "Trying alternative device.", | 438 | "Trying alternative device.", |
437 | m->dev->name); | 439 | m->dev->name); |
438 | queue_bio(m->ms, bio, bio_rw(bio)); | 440 | queue_bio(m->ms, bio, bio_rw(bio)); |
439 | return; | 441 | return; |
440 | } | 442 | } |
441 | 443 | ||
442 | DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", | 444 | DMERR_LIMIT("Read failure on mirror device %s. Failing I/O.", |
443 | m->dev->name); | 445 | m->dev->name); |
444 | bio_endio(bio, -EIO); | 446 | bio_endio(bio, -EIO); |
445 | } | 447 | } |
446 | 448 | ||
447 | /* Asynchronous read. */ | 449 | /* Asynchronous read. */ |
448 | static void read_async_bio(struct mirror *m, struct bio *bio) | 450 | static void read_async_bio(struct mirror *m, struct bio *bio) |
449 | { | 451 | { |
450 | struct dm_io_region io; | 452 | struct dm_io_region io; |
451 | struct dm_io_request io_req = { | 453 | struct dm_io_request io_req = { |
452 | .bi_rw = READ, | 454 | .bi_rw = READ, |
453 | .mem.type = DM_IO_BVEC, | 455 | .mem.type = DM_IO_BVEC, |
454 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 456 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
455 | .notify.fn = read_callback, | 457 | .notify.fn = read_callback, |
456 | .notify.context = bio, | 458 | .notify.context = bio, |
457 | .client = m->ms->io_client, | 459 | .client = m->ms->io_client, |
458 | }; | 460 | }; |
459 | 461 | ||
460 | map_region(&io, m, bio); | 462 | map_region(&io, m, bio); |
461 | bio_set_m(bio, m); | 463 | bio_set_m(bio, m); |
462 | BUG_ON(dm_io(&io_req, 1, &io, NULL)); | 464 | BUG_ON(dm_io(&io_req, 1, &io, NULL)); |
463 | } | 465 | } |
464 | 466 | ||
465 | static inline int region_in_sync(struct mirror_set *ms, region_t region, | 467 | static inline int region_in_sync(struct mirror_set *ms, region_t region, |
466 | int may_block) | 468 | int may_block) |
467 | { | 469 | { |
468 | int state = dm_rh_get_state(ms->rh, region, may_block); | 470 | int state = dm_rh_get_state(ms->rh, region, may_block); |
469 | return state == DM_RH_CLEAN || state == DM_RH_DIRTY; | 471 | return state == DM_RH_CLEAN || state == DM_RH_DIRTY; |
470 | } | 472 | } |
471 | 473 | ||
472 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) | 474 | static void do_reads(struct mirror_set *ms, struct bio_list *reads) |
473 | { | 475 | { |
474 | region_t region; | 476 | region_t region; |
475 | struct bio *bio; | 477 | struct bio *bio; |
476 | struct mirror *m; | 478 | struct mirror *m; |
477 | 479 | ||
478 | while ((bio = bio_list_pop(reads))) { | 480 | while ((bio = bio_list_pop(reads))) { |
479 | region = dm_rh_bio_to_region(ms->rh, bio); | 481 | region = dm_rh_bio_to_region(ms->rh, bio); |
480 | m = get_default_mirror(ms); | 482 | m = get_default_mirror(ms); |
481 | 483 | ||
482 | /* | 484 | /* |
483 | * We can only read balance if the region is in sync. | 485 | * We can only read balance if the region is in sync. |
484 | */ | 486 | */ |
485 | if (likely(region_in_sync(ms, region, 1))) | 487 | if (likely(region_in_sync(ms, region, 1))) |
486 | m = choose_mirror(ms, bio->bi_sector); | 488 | m = choose_mirror(ms, bio->bi_sector); |
487 | else if (m && atomic_read(&m->error_count)) | 489 | else if (m && atomic_read(&m->error_count)) |
488 | m = NULL; | 490 | m = NULL; |
489 | 491 | ||
490 | if (likely(m)) | 492 | if (likely(m)) |
491 | read_async_bio(m, bio); | 493 | read_async_bio(m, bio); |
492 | else | 494 | else |
493 | bio_endio(bio, -EIO); | 495 | bio_endio(bio, -EIO); |
494 | } | 496 | } |
495 | } | 497 | } |
496 | 498 | ||
497 | /*----------------------------------------------------------------- | 499 | /*----------------------------------------------------------------- |
498 | * Writes. | 500 | * Writes. |
499 | * | 501 | * |
500 | * We do different things with the write io depending on the | 502 | * We do different things with the write io depending on the |
501 | * state of the region that it's in: | 503 | * state of the region that it's in: |
502 | * | 504 | * |
503 | * SYNC: increment pending, use kcopyd to write to *all* mirrors | 505 | * SYNC: increment pending, use kcopyd to write to *all* mirrors |
504 | * RECOVERING: delay the io until recovery completes | 506 | * RECOVERING: delay the io until recovery completes |
505 | * NOSYNC: increment pending, just write to the default mirror | 507 | * NOSYNC: increment pending, just write to the default mirror |
506 | *---------------------------------------------------------------*/ | 508 | *---------------------------------------------------------------*/ |
507 | 509 | ||
508 | 510 | ||
509 | static void write_callback(unsigned long error, void *context) | 511 | static void write_callback(unsigned long error, void *context) |
510 | { | 512 | { |
511 | unsigned i, ret = 0; | 513 | unsigned i, ret = 0; |
512 | struct bio *bio = (struct bio *) context; | 514 | struct bio *bio = (struct bio *) context; |
513 | struct mirror_set *ms; | 515 | struct mirror_set *ms; |
514 | int uptodate = 0; | 516 | int uptodate = 0; |
515 | int should_wake = 0; | 517 | int should_wake = 0; |
516 | unsigned long flags; | 518 | unsigned long flags; |
517 | 519 | ||
518 | ms = bio_get_m(bio)->ms; | 520 | ms = bio_get_m(bio)->ms; |
519 | bio_set_m(bio, NULL); | 521 | bio_set_m(bio, NULL); |
520 | 522 | ||
521 | /* | 523 | /* |
522 | * NOTE: We don't decrement the pending count here, | 524 | * NOTE: We don't decrement the pending count here, |
523 | * instead it is done by the targets endio function. | 525 | * instead it is done by the targets endio function. |
524 | * This way we handle both writes to SYNC and NOSYNC | 526 | * This way we handle both writes to SYNC and NOSYNC |
525 | * regions with the same code. | 527 | * regions with the same code. |
526 | */ | 528 | */ |
527 | if (likely(!error)) | 529 | if (likely(!error)) |
528 | goto out; | 530 | goto out; |
529 | 531 | ||
530 | for (i = 0; i < ms->nr_mirrors; i++) | 532 | for (i = 0; i < ms->nr_mirrors; i++) |
531 | if (test_bit(i, &error)) | 533 | if (test_bit(i, &error)) |
532 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); | 534 | fail_mirror(ms->mirror + i, DM_RAID1_WRITE_ERROR); |
533 | else | 535 | else |
534 | uptodate = 1; | 536 | uptodate = 1; |
535 | 537 | ||
536 | if (unlikely(!uptodate)) { | 538 | if (unlikely(!uptodate)) { |
537 | DMERR("All replicated volumes dead, failing I/O"); | 539 | DMERR("All replicated volumes dead, failing I/O"); |
538 | /* None of the writes succeeded, fail the I/O. */ | 540 | /* None of the writes succeeded, fail the I/O. */ |
539 | ret = -EIO; | 541 | ret = -EIO; |
540 | } else if (errors_handled(ms)) { | 542 | } else if (errors_handled(ms)) { |
541 | /* | 543 | /* |
542 | * Need to raise event. Since raising | 544 | * Need to raise event. Since raising |
543 | * events can block, we need to do it in | 545 | * events can block, we need to do it in |
544 | * the main thread. | 546 | * the main thread. |
545 | */ | 547 | */ |
546 | spin_lock_irqsave(&ms->lock, flags); | 548 | spin_lock_irqsave(&ms->lock, flags); |
547 | if (!ms->failures.head) | 549 | if (!ms->failures.head) |
548 | should_wake = 1; | 550 | should_wake = 1; |
549 | bio_list_add(&ms->failures, bio); | 551 | bio_list_add(&ms->failures, bio); |
550 | spin_unlock_irqrestore(&ms->lock, flags); | 552 | spin_unlock_irqrestore(&ms->lock, flags); |
551 | if (should_wake) | 553 | if (should_wake) |
552 | wakeup_mirrord(ms); | 554 | wakeup_mirrord(ms); |
553 | return; | 555 | return; |
554 | } | 556 | } |
555 | out: | 557 | out: |
556 | bio_endio(bio, ret); | 558 | bio_endio(bio, ret); |
557 | } | 559 | } |
558 | 560 | ||
559 | static void do_write(struct mirror_set *ms, struct bio *bio) | 561 | static void do_write(struct mirror_set *ms, struct bio *bio) |
560 | { | 562 | { |
561 | unsigned int i; | 563 | unsigned int i; |
562 | struct dm_io_region io[ms->nr_mirrors], *dest = io; | 564 | struct dm_io_region io[ms->nr_mirrors], *dest = io; |
563 | struct mirror *m; | 565 | struct mirror *m; |
564 | struct dm_io_request io_req = { | 566 | struct dm_io_request io_req = { |
565 | .bi_rw = WRITE, | 567 | .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER), |
566 | .mem.type = DM_IO_BVEC, | 568 | .mem.type = DM_IO_BVEC, |
567 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, | 569 | .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx, |
568 | .notify.fn = write_callback, | 570 | .notify.fn = write_callback, |
569 | .notify.context = bio, | 571 | .notify.context = bio, |
570 | .client = ms->io_client, | 572 | .client = ms->io_client, |
571 | }; | 573 | }; |
572 | 574 | ||
573 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) | 575 | for (i = 0, m = ms->mirror; i < ms->nr_mirrors; i++, m++) |
574 | map_region(dest++, m, bio); | 576 | map_region(dest++, m, bio); |
575 | 577 | ||
576 | /* | 578 | /* |
577 | * Use default mirror because we only need it to retrieve the reference | 579 | * Use default mirror because we only need it to retrieve the reference |
578 | * to the mirror set in write_callback(). | 580 | * to the mirror set in write_callback(). |
579 | */ | 581 | */ |
580 | bio_set_m(bio, get_default_mirror(ms)); | 582 | bio_set_m(bio, get_default_mirror(ms)); |
581 | 583 | ||
582 | BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); | 584 | BUG_ON(dm_io(&io_req, ms->nr_mirrors, io, NULL)); |
583 | } | 585 | } |
584 | 586 | ||
585 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) | 587 | static void do_writes(struct mirror_set *ms, struct bio_list *writes) |
586 | { | 588 | { |
587 | int state; | 589 | int state; |
588 | struct bio *bio; | 590 | struct bio *bio; |
589 | struct bio_list sync, nosync, recover, *this_list = NULL; | 591 | struct bio_list sync, nosync, recover, *this_list = NULL; |
590 | struct bio_list requeue; | 592 | struct bio_list requeue; |
591 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 593 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
592 | region_t region; | 594 | region_t region; |
593 | 595 | ||
594 | if (!writes->head) | 596 | if (!writes->head) |
595 | return; | 597 | return; |
596 | 598 | ||
597 | /* | 599 | /* |
598 | * Classify each write. | 600 | * Classify each write. |
599 | */ | 601 | */ |
600 | bio_list_init(&sync); | 602 | bio_list_init(&sync); |
601 | bio_list_init(&nosync); | 603 | bio_list_init(&nosync); |
602 | bio_list_init(&recover); | 604 | bio_list_init(&recover); |
603 | bio_list_init(&requeue); | 605 | bio_list_init(&requeue); |
604 | 606 | ||
605 | while ((bio = bio_list_pop(writes))) { | 607 | while ((bio = bio_list_pop(writes))) { |
608 | if (unlikely(bio_empty_barrier(bio))) { | ||
609 | bio_list_add(&sync, bio); | ||
610 | continue; | ||
611 | } | ||
612 | |||
606 | region = dm_rh_bio_to_region(ms->rh, bio); | 613 | region = dm_rh_bio_to_region(ms->rh, bio); |
607 | 614 | ||
608 | if (log->type->is_remote_recovering && | 615 | if (log->type->is_remote_recovering && |
609 | log->type->is_remote_recovering(log, region)) { | 616 | log->type->is_remote_recovering(log, region)) { |
610 | bio_list_add(&requeue, bio); | 617 | bio_list_add(&requeue, bio); |
611 | continue; | 618 | continue; |
612 | } | 619 | } |
613 | 620 | ||
614 | state = dm_rh_get_state(ms->rh, region, 1); | 621 | state = dm_rh_get_state(ms->rh, region, 1); |
615 | switch (state) { | 622 | switch (state) { |
616 | case DM_RH_CLEAN: | 623 | case DM_RH_CLEAN: |
617 | case DM_RH_DIRTY: | 624 | case DM_RH_DIRTY: |
618 | this_list = &sync; | 625 | this_list = &sync; |
619 | break; | 626 | break; |
620 | 627 | ||
621 | case DM_RH_NOSYNC: | 628 | case DM_RH_NOSYNC: |
622 | this_list = &nosync; | 629 | this_list = &nosync; |
623 | break; | 630 | break; |
624 | 631 | ||
625 | case DM_RH_RECOVERING: | 632 | case DM_RH_RECOVERING: |
626 | this_list = &recover; | 633 | this_list = &recover; |
627 | break; | 634 | break; |
628 | } | 635 | } |
629 | 636 | ||
630 | bio_list_add(this_list, bio); | 637 | bio_list_add(this_list, bio); |
631 | } | 638 | } |
632 | 639 | ||
633 | /* | 640 | /* |
634 | * Add bios that are delayed due to remote recovery | 641 | * Add bios that are delayed due to remote recovery |
635 | * back on to the write queue | 642 | * back on to the write queue |
636 | */ | 643 | */ |
637 | if (unlikely(requeue.head)) { | 644 | if (unlikely(requeue.head)) { |
638 | spin_lock_irq(&ms->lock); | 645 | spin_lock_irq(&ms->lock); |
639 | bio_list_merge(&ms->writes, &requeue); | 646 | bio_list_merge(&ms->writes, &requeue); |
640 | spin_unlock_irq(&ms->lock); | 647 | spin_unlock_irq(&ms->lock); |
641 | delayed_wake(ms); | 648 | delayed_wake(ms); |
642 | } | 649 | } |
643 | 650 | ||
644 | /* | 651 | /* |
645 | * Increment the pending counts for any regions that will | 652 | * Increment the pending counts for any regions that will |
646 | * be written to (writes to recover regions are going to | 653 | * be written to (writes to recover regions are going to |
647 | * be delayed). | 654 | * be delayed). |
648 | */ | 655 | */ |
649 | dm_rh_inc_pending(ms->rh, &sync); | 656 | dm_rh_inc_pending(ms->rh, &sync); |
650 | dm_rh_inc_pending(ms->rh, &nosync); | 657 | dm_rh_inc_pending(ms->rh, &nosync); |
651 | 658 | ||
652 | /* | 659 | /* |
653 | * If the flush fails on a previous call and succeeds here, | 660 | * If the flush fails on a previous call and succeeds here, |
654 | * we must not reset the log_failure variable. We need | 661 | * we must not reset the log_failure variable. We need |
655 | * userspace interaction to do that. | 662 | * userspace interaction to do that. |
656 | */ | 663 | */ |
657 | ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; | 664 | ms->log_failure = dm_rh_flush(ms->rh) ? 1 : ms->log_failure; |
658 | 665 | ||
659 | /* | 666 | /* |
660 | * Dispatch io. | 667 | * Dispatch io. |
661 | */ | 668 | */ |
662 | if (unlikely(ms->log_failure)) { | 669 | if (unlikely(ms->log_failure)) { |
663 | spin_lock_irq(&ms->lock); | 670 | spin_lock_irq(&ms->lock); |
664 | bio_list_merge(&ms->failures, &sync); | 671 | bio_list_merge(&ms->failures, &sync); |
665 | spin_unlock_irq(&ms->lock); | 672 | spin_unlock_irq(&ms->lock); |
666 | wakeup_mirrord(ms); | 673 | wakeup_mirrord(ms); |
667 | } else | 674 | } else |
668 | while ((bio = bio_list_pop(&sync))) | 675 | while ((bio = bio_list_pop(&sync))) |
669 | do_write(ms, bio); | 676 | do_write(ms, bio); |
670 | 677 | ||
671 | while ((bio = bio_list_pop(&recover))) | 678 | while ((bio = bio_list_pop(&recover))) |
672 | dm_rh_delay(ms->rh, bio); | 679 | dm_rh_delay(ms->rh, bio); |
673 | 680 | ||
674 | while ((bio = bio_list_pop(&nosync))) { | 681 | while ((bio = bio_list_pop(&nosync))) { |
675 | map_bio(get_default_mirror(ms), bio); | 682 | map_bio(get_default_mirror(ms), bio); |
676 | generic_make_request(bio); | 683 | generic_make_request(bio); |
677 | } | 684 | } |
678 | } | 685 | } |
679 | 686 | ||
680 | static void do_failures(struct mirror_set *ms, struct bio_list *failures) | 687 | static void do_failures(struct mirror_set *ms, struct bio_list *failures) |
681 | { | 688 | { |
682 | struct bio *bio; | 689 | struct bio *bio; |
683 | 690 | ||
684 | if (!failures->head) | 691 | if (!failures->head) |
685 | return; | 692 | return; |
686 | 693 | ||
687 | if (!ms->log_failure) { | 694 | if (!ms->log_failure) { |
688 | while ((bio = bio_list_pop(failures))) { | 695 | while ((bio = bio_list_pop(failures))) { |
689 | ms->in_sync = 0; | 696 | ms->in_sync = 0; |
690 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); | 697 | dm_rh_mark_nosync(ms->rh, bio, bio->bi_size, 0); |
691 | } | 698 | } |
692 | return; | 699 | return; |
693 | } | 700 | } |
694 | 701 | ||
695 | /* | 702 | /* |
696 | * If the log has failed, unattempted writes are being | 703 | * If the log has failed, unattempted writes are being |
697 | * put on the failures list. We can't issue those writes | 704 | * put on the failures list. We can't issue those writes |
698 | * until a log has been marked, so we must store them. | 705 | * until a log has been marked, so we must store them. |
699 | * | 706 | * |
700 | * If a 'noflush' suspend is in progress, we can requeue | 707 | * If a 'noflush' suspend is in progress, we can requeue |
701 | * the I/O's to the core. This give userspace a chance | 708 | * the I/O's to the core. This give userspace a chance |
702 | * to reconfigure the mirror, at which point the core | 709 | * to reconfigure the mirror, at which point the core |
703 | * will reissue the writes. If the 'noflush' flag is | 710 | * will reissue the writes. If the 'noflush' flag is |
704 | * not set, we have no choice but to return errors. | 711 | * not set, we have no choice but to return errors. |
705 | * | 712 | * |
706 | * Some writes on the failures list may have been | 713 | * Some writes on the failures list may have been |
707 | * submitted before the log failure and represent a | 714 | * submitted before the log failure and represent a |
708 | * failure to write to one of the devices. It is ok | 715 | * failure to write to one of the devices. It is ok |
709 | * for us to treat them the same and requeue them | 716 | * for us to treat them the same and requeue them |
710 | * as well. | 717 | * as well. |
711 | */ | 718 | */ |
712 | if (dm_noflush_suspending(ms->ti)) { | 719 | if (dm_noflush_suspending(ms->ti)) { |
713 | while ((bio = bio_list_pop(failures))) | 720 | while ((bio = bio_list_pop(failures))) |
714 | bio_endio(bio, DM_ENDIO_REQUEUE); | 721 | bio_endio(bio, DM_ENDIO_REQUEUE); |
715 | return; | 722 | return; |
716 | } | 723 | } |
717 | 724 | ||
718 | if (atomic_read(&ms->suspend)) { | 725 | if (atomic_read(&ms->suspend)) { |
719 | while ((bio = bio_list_pop(failures))) | 726 | while ((bio = bio_list_pop(failures))) |
720 | bio_endio(bio, -EIO); | 727 | bio_endio(bio, -EIO); |
721 | return; | 728 | return; |
722 | } | 729 | } |
723 | 730 | ||
724 | spin_lock_irq(&ms->lock); | 731 | spin_lock_irq(&ms->lock); |
725 | bio_list_merge(&ms->failures, failures); | 732 | bio_list_merge(&ms->failures, failures); |
726 | spin_unlock_irq(&ms->lock); | 733 | spin_unlock_irq(&ms->lock); |
727 | 734 | ||
728 | delayed_wake(ms); | 735 | delayed_wake(ms); |
729 | } | 736 | } |
730 | 737 | ||
731 | static void trigger_event(struct work_struct *work) | 738 | static void trigger_event(struct work_struct *work) |
732 | { | 739 | { |
733 | struct mirror_set *ms = | 740 | struct mirror_set *ms = |
734 | container_of(work, struct mirror_set, trigger_event); | 741 | container_of(work, struct mirror_set, trigger_event); |
735 | 742 | ||
736 | dm_table_event(ms->ti->table); | 743 | dm_table_event(ms->ti->table); |
737 | } | 744 | } |
738 | 745 | ||
739 | /*----------------------------------------------------------------- | 746 | /*----------------------------------------------------------------- |
740 | * kmirrord | 747 | * kmirrord |
741 | *---------------------------------------------------------------*/ | 748 | *---------------------------------------------------------------*/ |
742 | static void do_mirror(struct work_struct *work) | 749 | static void do_mirror(struct work_struct *work) |
743 | { | 750 | { |
744 | struct mirror_set *ms = container_of(work, struct mirror_set, | 751 | struct mirror_set *ms = container_of(work, struct mirror_set, |
745 | kmirrord_work); | 752 | kmirrord_work); |
746 | struct bio_list reads, writes, failures; | 753 | struct bio_list reads, writes, failures; |
747 | unsigned long flags; | 754 | unsigned long flags; |
748 | 755 | ||
749 | spin_lock_irqsave(&ms->lock, flags); | 756 | spin_lock_irqsave(&ms->lock, flags); |
750 | reads = ms->reads; | 757 | reads = ms->reads; |
751 | writes = ms->writes; | 758 | writes = ms->writes; |
752 | failures = ms->failures; | 759 | failures = ms->failures; |
753 | bio_list_init(&ms->reads); | 760 | bio_list_init(&ms->reads); |
754 | bio_list_init(&ms->writes); | 761 | bio_list_init(&ms->writes); |
755 | bio_list_init(&ms->failures); | 762 | bio_list_init(&ms->failures); |
756 | spin_unlock_irqrestore(&ms->lock, flags); | 763 | spin_unlock_irqrestore(&ms->lock, flags); |
757 | 764 | ||
758 | dm_rh_update_states(ms->rh, errors_handled(ms)); | 765 | dm_rh_update_states(ms->rh, errors_handled(ms)); |
759 | do_recovery(ms); | 766 | do_recovery(ms); |
760 | do_reads(ms, &reads); | 767 | do_reads(ms, &reads); |
761 | do_writes(ms, &writes); | 768 | do_writes(ms, &writes); |
762 | do_failures(ms, &failures); | 769 | do_failures(ms, &failures); |
763 | 770 | ||
764 | dm_table_unplug_all(ms->ti->table); | 771 | dm_table_unplug_all(ms->ti->table); |
765 | } | 772 | } |
766 | 773 | ||
767 | /*----------------------------------------------------------------- | 774 | /*----------------------------------------------------------------- |
768 | * Target functions | 775 | * Target functions |
769 | *---------------------------------------------------------------*/ | 776 | *---------------------------------------------------------------*/ |
770 | static struct mirror_set *alloc_context(unsigned int nr_mirrors, | 777 | static struct mirror_set *alloc_context(unsigned int nr_mirrors, |
771 | uint32_t region_size, | 778 | uint32_t region_size, |
772 | struct dm_target *ti, | 779 | struct dm_target *ti, |
773 | struct dm_dirty_log *dl) | 780 | struct dm_dirty_log *dl) |
774 | { | 781 | { |
775 | size_t len; | 782 | size_t len; |
776 | struct mirror_set *ms = NULL; | 783 | struct mirror_set *ms = NULL; |
777 | 784 | ||
778 | len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); | 785 | len = sizeof(*ms) + (sizeof(ms->mirror[0]) * nr_mirrors); |
779 | 786 | ||
780 | ms = kzalloc(len, GFP_KERNEL); | 787 | ms = kzalloc(len, GFP_KERNEL); |
781 | if (!ms) { | 788 | if (!ms) { |
782 | ti->error = "Cannot allocate mirror context"; | 789 | ti->error = "Cannot allocate mirror context"; |
783 | return NULL; | 790 | return NULL; |
784 | } | 791 | } |
785 | 792 | ||
786 | spin_lock_init(&ms->lock); | 793 | spin_lock_init(&ms->lock); |
787 | 794 | ||
788 | ms->ti = ti; | 795 | ms->ti = ti; |
789 | ms->nr_mirrors = nr_mirrors; | 796 | ms->nr_mirrors = nr_mirrors; |
790 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); | 797 | ms->nr_regions = dm_sector_div_up(ti->len, region_size); |
791 | ms->in_sync = 0; | 798 | ms->in_sync = 0; |
792 | ms->log_failure = 0; | 799 | ms->log_failure = 0; |
793 | atomic_set(&ms->suspend, 0); | 800 | atomic_set(&ms->suspend, 0); |
794 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); | 801 | atomic_set(&ms->default_mirror, DEFAULT_MIRROR); |
795 | 802 | ||
796 | ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, | 803 | ms->read_record_pool = mempool_create_slab_pool(MIN_READ_RECORDS, |
797 | _dm_raid1_read_record_cache); | 804 | _dm_raid1_read_record_cache); |
798 | 805 | ||
799 | if (!ms->read_record_pool) { | 806 | if (!ms->read_record_pool) { |
800 | ti->error = "Error creating mirror read_record_pool"; | 807 | ti->error = "Error creating mirror read_record_pool"; |
801 | kfree(ms); | 808 | kfree(ms); |
802 | return NULL; | 809 | return NULL; |
803 | } | 810 | } |
804 | 811 | ||
805 | ms->io_client = dm_io_client_create(DM_IO_PAGES); | 812 | ms->io_client = dm_io_client_create(DM_IO_PAGES); |
806 | if (IS_ERR(ms->io_client)) { | 813 | if (IS_ERR(ms->io_client)) { |
807 | ti->error = "Error creating dm_io client"; | 814 | ti->error = "Error creating dm_io client"; |
808 | mempool_destroy(ms->read_record_pool); | 815 | mempool_destroy(ms->read_record_pool); |
809 | kfree(ms); | 816 | kfree(ms); |
810 | return NULL; | 817 | return NULL; |
811 | } | 818 | } |
812 | 819 | ||
813 | ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, | 820 | ms->rh = dm_region_hash_create(ms, dispatch_bios, wakeup_mirrord, |
814 | wakeup_all_recovery_waiters, | 821 | wakeup_all_recovery_waiters, |
815 | ms->ti->begin, MAX_RECOVERY, | 822 | ms->ti->begin, MAX_RECOVERY, |
816 | dl, region_size, ms->nr_regions); | 823 | dl, region_size, ms->nr_regions); |
817 | if (IS_ERR(ms->rh)) { | 824 | if (IS_ERR(ms->rh)) { |
818 | ti->error = "Error creating dirty region hash"; | 825 | ti->error = "Error creating dirty region hash"; |
819 | dm_io_client_destroy(ms->io_client); | 826 | dm_io_client_destroy(ms->io_client); |
820 | mempool_destroy(ms->read_record_pool); | 827 | mempool_destroy(ms->read_record_pool); |
821 | kfree(ms); | 828 | kfree(ms); |
822 | return NULL; | 829 | return NULL; |
823 | } | 830 | } |
824 | 831 | ||
825 | return ms; | 832 | return ms; |
826 | } | 833 | } |
827 | 834 | ||
828 | static void free_context(struct mirror_set *ms, struct dm_target *ti, | 835 | static void free_context(struct mirror_set *ms, struct dm_target *ti, |
829 | unsigned int m) | 836 | unsigned int m) |
830 | { | 837 | { |
831 | while (m--) | 838 | while (m--) |
832 | dm_put_device(ti, ms->mirror[m].dev); | 839 | dm_put_device(ti, ms->mirror[m].dev); |
833 | 840 | ||
834 | dm_io_client_destroy(ms->io_client); | 841 | dm_io_client_destroy(ms->io_client); |
835 | dm_region_hash_destroy(ms->rh); | 842 | dm_region_hash_destroy(ms->rh); |
836 | mempool_destroy(ms->read_record_pool); | 843 | mempool_destroy(ms->read_record_pool); |
837 | kfree(ms); | 844 | kfree(ms); |
838 | } | 845 | } |
839 | 846 | ||
840 | static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | 847 | static int get_mirror(struct mirror_set *ms, struct dm_target *ti, |
841 | unsigned int mirror, char **argv) | 848 | unsigned int mirror, char **argv) |
842 | { | 849 | { |
843 | unsigned long long offset; | 850 | unsigned long long offset; |
844 | 851 | ||
845 | if (sscanf(argv[1], "%llu", &offset) != 1) { | 852 | if (sscanf(argv[1], "%llu", &offset) != 1) { |
846 | ti->error = "Invalid offset"; | 853 | ti->error = "Invalid offset"; |
847 | return -EINVAL; | 854 | return -EINVAL; |
848 | } | 855 | } |
849 | 856 | ||
850 | if (dm_get_device(ti, argv[0], offset, ti->len, | 857 | if (dm_get_device(ti, argv[0], offset, ti->len, |
851 | dm_table_get_mode(ti->table), | 858 | dm_table_get_mode(ti->table), |
852 | &ms->mirror[mirror].dev)) { | 859 | &ms->mirror[mirror].dev)) { |
853 | ti->error = "Device lookup failure"; | 860 | ti->error = "Device lookup failure"; |
854 | return -ENXIO; | 861 | return -ENXIO; |
855 | } | 862 | } |
856 | 863 | ||
857 | ms->mirror[mirror].ms = ms; | 864 | ms->mirror[mirror].ms = ms; |
858 | atomic_set(&(ms->mirror[mirror].error_count), 0); | 865 | atomic_set(&(ms->mirror[mirror].error_count), 0); |
859 | ms->mirror[mirror].error_type = 0; | 866 | ms->mirror[mirror].error_type = 0; |
860 | ms->mirror[mirror].offset = offset; | 867 | ms->mirror[mirror].offset = offset; |
861 | 868 | ||
862 | return 0; | 869 | return 0; |
863 | } | 870 | } |
864 | 871 | ||
865 | /* | 872 | /* |
866 | * Create dirty log: log_type #log_params <log_params> | 873 | * Create dirty log: log_type #log_params <log_params> |
867 | */ | 874 | */ |
868 | static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | 875 | static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, |
869 | unsigned argc, char **argv, | 876 | unsigned argc, char **argv, |
870 | unsigned *args_used) | 877 | unsigned *args_used) |
871 | { | 878 | { |
872 | unsigned param_count; | 879 | unsigned param_count; |
873 | struct dm_dirty_log *dl; | 880 | struct dm_dirty_log *dl; |
874 | 881 | ||
875 | if (argc < 2) { | 882 | if (argc < 2) { |
876 | ti->error = "Insufficient mirror log arguments"; | 883 | ti->error = "Insufficient mirror log arguments"; |
877 | return NULL; | 884 | return NULL; |
878 | } | 885 | } |
879 | 886 | ||
880 | if (sscanf(argv[1], "%u", &param_count) != 1) { | 887 | if (sscanf(argv[1], "%u", &param_count) != 1) { |
881 | ti->error = "Invalid mirror log argument count"; | 888 | ti->error = "Invalid mirror log argument count"; |
882 | return NULL; | 889 | return NULL; |
883 | } | 890 | } |
884 | 891 | ||
885 | *args_used = 2 + param_count; | 892 | *args_used = 2 + param_count; |
886 | 893 | ||
887 | if (argc < *args_used) { | 894 | if (argc < *args_used) { |
888 | ti->error = "Insufficient mirror log arguments"; | 895 | ti->error = "Insufficient mirror log arguments"; |
889 | return NULL; | 896 | return NULL; |
890 | } | 897 | } |
891 | 898 | ||
892 | dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); | 899 | dl = dm_dirty_log_create(argv[0], ti, param_count, argv + 2); |
893 | if (!dl) { | 900 | if (!dl) { |
894 | ti->error = "Error creating mirror dirty log"; | 901 | ti->error = "Error creating mirror dirty log"; |
895 | return NULL; | 902 | return NULL; |
896 | } | 903 | } |
897 | 904 | ||
898 | return dl; | 905 | return dl; |
899 | } | 906 | } |
900 | 907 | ||
901 | static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, | 908 | static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, |
902 | unsigned *args_used) | 909 | unsigned *args_used) |
903 | { | 910 | { |
904 | unsigned num_features; | 911 | unsigned num_features; |
905 | struct dm_target *ti = ms->ti; | 912 | struct dm_target *ti = ms->ti; |
906 | 913 | ||
907 | *args_used = 0; | 914 | *args_used = 0; |
908 | 915 | ||
909 | if (!argc) | 916 | if (!argc) |
910 | return 0; | 917 | return 0; |
911 | 918 | ||
912 | if (sscanf(argv[0], "%u", &num_features) != 1) { | 919 | if (sscanf(argv[0], "%u", &num_features) != 1) { |
913 | ti->error = "Invalid number of features"; | 920 | ti->error = "Invalid number of features"; |
914 | return -EINVAL; | 921 | return -EINVAL; |
915 | } | 922 | } |
916 | 923 | ||
917 | argc--; | 924 | argc--; |
918 | argv++; | 925 | argv++; |
919 | (*args_used)++; | 926 | (*args_used)++; |
920 | 927 | ||
921 | if (num_features > argc) { | 928 | if (num_features > argc) { |
922 | ti->error = "Not enough arguments to support feature count"; | 929 | ti->error = "Not enough arguments to support feature count"; |
923 | return -EINVAL; | 930 | return -EINVAL; |
924 | } | 931 | } |
925 | 932 | ||
926 | if (!strcmp("handle_errors", argv[0])) | 933 | if (!strcmp("handle_errors", argv[0])) |
927 | ms->features |= DM_RAID1_HANDLE_ERRORS; | 934 | ms->features |= DM_RAID1_HANDLE_ERRORS; |
928 | else { | 935 | else { |
929 | ti->error = "Unrecognised feature requested"; | 936 | ti->error = "Unrecognised feature requested"; |
930 | return -EINVAL; | 937 | return -EINVAL; |
931 | } | 938 | } |
932 | 939 | ||
933 | (*args_used)++; | 940 | (*args_used)++; |
934 | 941 | ||
935 | return 0; | 942 | return 0; |
936 | } | 943 | } |
937 | 944 | ||
938 | /* | 945 | /* |
939 | * Construct a mirror mapping: | 946 | * Construct a mirror mapping: |
940 | * | 947 | * |
941 | * log_type #log_params <log_params> | 948 | * log_type #log_params <log_params> |
942 | * #mirrors [mirror_path offset]{2,} | 949 | * #mirrors [mirror_path offset]{2,} |
943 | * [#features <features>] | 950 | * [#features <features>] |
944 | * | 951 | * |
945 | * log_type is "core" or "disk" | 952 | * log_type is "core" or "disk" |
946 | * #log_params is between 1 and 3 | 953 | * #log_params is between 1 and 3 |
947 | * | 954 | * |
948 | * If present, features must be "handle_errors". | 955 | * If present, features must be "handle_errors". |
949 | */ | 956 | */ |
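As a worked example of the table format documented in the comment above (a sketch only, not part of this patch; the device paths, offsets and region size are invented for illustration), the argument vector for a two-leg mirror using a core log could look like this:

	/* Hypothetical example only -- not taken from the patch. */
	static char *example_mirror_argv[] = {
		"core", "2", "1024", "nosync",	/* log_type, #log_params, <log_params> */
		"2",				/* #mirrors */
		"/dev/sda1", "0",		/* first mirror_path and offset */
		"/dev/sdb1", "0",		/* second mirror_path and offset */
		"1", "handle_errors",		/* optional #features <features> */
	};

mirror_ctr() below consumes all eleven strings exactly: four for the dirty log, one for the mirror count, two per leg, and two for the feature list, leaving argc at zero.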
950 | static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | 957 | static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) |
951 | { | 958 | { |
952 | int r; | 959 | int r; |
953 | unsigned int nr_mirrors, m, args_used; | 960 | unsigned int nr_mirrors, m, args_used; |
954 | struct mirror_set *ms; | 961 | struct mirror_set *ms; |
955 | struct dm_dirty_log *dl; | 962 | struct dm_dirty_log *dl; |
956 | 963 | ||
957 | dl = create_dirty_log(ti, argc, argv, &args_used); | 964 | dl = create_dirty_log(ti, argc, argv, &args_used); |
958 | if (!dl) | 965 | if (!dl) |
959 | return -EINVAL; | 966 | return -EINVAL; |
960 | 967 | ||
961 | argv += args_used; | 968 | argv += args_used; |
962 | argc -= args_used; | 969 | argc -= args_used; |
963 | 970 | ||
964 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | 971 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || |
965 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { | 972 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { |
966 | ti->error = "Invalid number of mirrors"; | 973 | ti->error = "Invalid number of mirrors"; |
967 | dm_dirty_log_destroy(dl); | 974 | dm_dirty_log_destroy(dl); |
968 | return -EINVAL; | 975 | return -EINVAL; |
969 | } | 976 | } |
970 | 977 | ||
971 | argv++, argc--; | 978 | argv++, argc--; |
972 | 979 | ||
973 | if (argc < nr_mirrors * 2) { | 980 | if (argc < nr_mirrors * 2) { |
974 | ti->error = "Too few mirror arguments"; | 981 | ti->error = "Too few mirror arguments"; |
975 | dm_dirty_log_destroy(dl); | 982 | dm_dirty_log_destroy(dl); |
976 | return -EINVAL; | 983 | return -EINVAL; |
977 | } | 984 | } |
978 | 985 | ||
979 | ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); | 986 | ms = alloc_context(nr_mirrors, dl->type->get_region_size(dl), ti, dl); |
980 | if (!ms) { | 987 | if (!ms) { |
981 | dm_dirty_log_destroy(dl); | 988 | dm_dirty_log_destroy(dl); |
982 | return -ENOMEM; | 989 | return -ENOMEM; |
983 | } | 990 | } |
984 | 991 | ||
985 | /* Get the mirror parameter sets */ | 992 | /* Get the mirror parameter sets */ |
986 | for (m = 0; m < nr_mirrors; m++) { | 993 | for (m = 0; m < nr_mirrors; m++) { |
987 | r = get_mirror(ms, ti, m, argv); | 994 | r = get_mirror(ms, ti, m, argv); |
988 | if (r) { | 995 | if (r) { |
989 | free_context(ms, ti, m); | 996 | free_context(ms, ti, m); |
990 | return r; | 997 | return r; |
991 | } | 998 | } |
992 | argv += 2; | 999 | argv += 2; |
993 | argc -= 2; | 1000 | argc -= 2; |
994 | } | 1001 | } |
995 | 1002 | ||
996 | ti->private = ms; | 1003 | ti->private = ms; |
997 | ti->split_io = dm_rh_get_region_size(ms->rh); | 1004 | ti->split_io = dm_rh_get_region_size(ms->rh); |
1005 | ti->num_flush_requests = 1; | ||
998 | 1006 | ||
999 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); | 1007 | ms->kmirrord_wq = create_singlethread_workqueue("kmirrord"); |
1000 | if (!ms->kmirrord_wq) { | 1008 | if (!ms->kmirrord_wq) { |
1001 | DMERR("couldn't start kmirrord"); | 1009 | DMERR("couldn't start kmirrord"); |
1002 | r = -ENOMEM; | 1010 | r = -ENOMEM; |
1003 | goto err_free_context; | 1011 | goto err_free_context; |
1004 | } | 1012 | } |
1005 | INIT_WORK(&ms->kmirrord_work, do_mirror); | 1013 | INIT_WORK(&ms->kmirrord_work, do_mirror); |
1006 | init_timer(&ms->timer); | 1014 | init_timer(&ms->timer); |
1007 | ms->timer_pending = 0; | 1015 | ms->timer_pending = 0; |
1008 | INIT_WORK(&ms->trigger_event, trigger_event); | 1016 | INIT_WORK(&ms->trigger_event, trigger_event); |
1009 | 1017 | ||
1010 | r = parse_features(ms, argc, argv, &args_used); | 1018 | r = parse_features(ms, argc, argv, &args_used); |
1011 | if (r) | 1019 | if (r) |
1012 | goto err_destroy_wq; | 1020 | goto err_destroy_wq; |
1013 | 1021 | ||
1014 | argv += args_used; | 1022 | argv += args_used; |
1015 | argc -= args_used; | 1023 | argc -= args_used; |
1016 | 1024 | ||
1017 | /* | 1025 | /* |
1018 | * Any read-balancing addition depends on the | 1026 | * Any read-balancing addition depends on the |
1019 | * DM_RAID1_HANDLE_ERRORS flag being present. | 1027 | * DM_RAID1_HANDLE_ERRORS flag being present. |
1020 | * This is because the decision to balance depends | 1028 | * This is because the decision to balance depends |
1021 | * on the sync state of a region. If the above | 1029 | * on the sync state of a region. If the above |
1022 | * flag is not present, we ignore errors; and | 1030 | * flag is not present, we ignore errors; and |
1023 | * the sync state may be inaccurate. | 1031 | * the sync state may be inaccurate. |
1024 | */ | 1032 | */ |
1025 | 1033 | ||
1026 | if (argc) { | 1034 | if (argc) { |
1027 | ti->error = "Too many mirror arguments"; | 1035 | ti->error = "Too many mirror arguments"; |
1028 | r = -EINVAL; | 1036 | r = -EINVAL; |
1029 | goto err_destroy_wq; | 1037 | goto err_destroy_wq; |
1030 | } | 1038 | } |
1031 | 1039 | ||
1032 | r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); | 1040 | r = dm_kcopyd_client_create(DM_KCOPYD_PAGES, &ms->kcopyd_client); |
1033 | if (r) | 1041 | if (r) |
1034 | goto err_destroy_wq; | 1042 | goto err_destroy_wq; |
1035 | 1043 | ||
1036 | wakeup_mirrord(ms); | 1044 | wakeup_mirrord(ms); |
1037 | return 0; | 1045 | return 0; |
1038 | 1046 | ||
1039 | err_destroy_wq: | 1047 | err_destroy_wq: |
1040 | destroy_workqueue(ms->kmirrord_wq); | 1048 | destroy_workqueue(ms->kmirrord_wq); |
1041 | err_free_context: | 1049 | err_free_context: |
1042 | free_context(ms, ti, ms->nr_mirrors); | 1050 | free_context(ms, ti, ms->nr_mirrors); |
1043 | return r; | 1051 | return r; |
1044 | } | 1052 | } |
1045 | 1053 | ||
1046 | static void mirror_dtr(struct dm_target *ti) | 1054 | static void mirror_dtr(struct dm_target *ti) |
1047 | { | 1055 | { |
1048 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1056 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1049 | 1057 | ||
1050 | del_timer_sync(&ms->timer); | 1058 | del_timer_sync(&ms->timer); |
1051 | flush_workqueue(ms->kmirrord_wq); | 1059 | flush_workqueue(ms->kmirrord_wq); |
1052 | flush_scheduled_work(); | 1060 | flush_scheduled_work(); |
1053 | dm_kcopyd_client_destroy(ms->kcopyd_client); | 1061 | dm_kcopyd_client_destroy(ms->kcopyd_client); |
1054 | destroy_workqueue(ms->kmirrord_wq); | 1062 | destroy_workqueue(ms->kmirrord_wq); |
1055 | free_context(ms, ti, ms->nr_mirrors); | 1063 | free_context(ms, ti, ms->nr_mirrors); |
1056 | } | 1064 | } |
1057 | 1065 | ||
1058 | /* | 1066 | /* |
1059 | * Mirror mapping function | 1067 | * Mirror mapping function |
1060 | */ | 1068 | */ |
1061 | static int mirror_map(struct dm_target *ti, struct bio *bio, | 1069 | static int mirror_map(struct dm_target *ti, struct bio *bio, |
1062 | union map_info *map_context) | 1070 | union map_info *map_context) |
1063 | { | 1071 | { |
1064 | int r, rw = bio_rw(bio); | 1072 | int r, rw = bio_rw(bio); |
1065 | struct mirror *m; | 1073 | struct mirror *m; |
1066 | struct mirror_set *ms = ti->private; | 1074 | struct mirror_set *ms = ti->private; |
1067 | struct dm_raid1_read_record *read_record = NULL; | 1075 | struct dm_raid1_read_record *read_record = NULL; |
1068 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1076 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1069 | 1077 | ||
1070 | if (rw == WRITE) { | 1078 | if (rw == WRITE) { |
1071 | /* Save region for mirror_end_io() handler */ | 1079 | /* Save region for mirror_end_io() handler */ |
1072 | map_context->ll = dm_rh_bio_to_region(ms->rh, bio); | 1080 | map_context->ll = dm_rh_bio_to_region(ms->rh, bio); |
1073 | queue_bio(ms, bio, rw); | 1081 | queue_bio(ms, bio, rw); |
1074 | return DM_MAPIO_SUBMITTED; | 1082 | return DM_MAPIO_SUBMITTED; |
1075 | } | 1083 | } |
1076 | 1084 | ||
1077 | r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); | 1085 | r = log->type->in_sync(log, dm_rh_bio_to_region(ms->rh, bio), 0); |
1078 | if (r < 0 && r != -EWOULDBLOCK) | 1086 | if (r < 0 && r != -EWOULDBLOCK) |
1079 | return r; | 1087 | return r; |
1080 | 1088 | ||
1081 | /* | 1089 | /* |
1082 | * If region is not in-sync queue the bio. | 1090 | * If region is not in-sync queue the bio. |
1083 | */ | 1091 | */ |
1084 | if (!r || (r == -EWOULDBLOCK)) { | 1092 | if (!r || (r == -EWOULDBLOCK)) { |
1085 | if (rw == READA) | 1093 | if (rw == READA) |
1086 | return -EWOULDBLOCK; | 1094 | return -EWOULDBLOCK; |
1087 | 1095 | ||
1088 | queue_bio(ms, bio, rw); | 1096 | queue_bio(ms, bio, rw); |
1089 | return DM_MAPIO_SUBMITTED; | 1097 | return DM_MAPIO_SUBMITTED; |
1090 | } | 1098 | } |
1091 | 1099 | ||
1092 | /* | 1100 | /* |
1093 | * The region is in-sync and we can perform reads directly. | 1101 | * The region is in-sync and we can perform reads directly. |
1094 | * Store enough information so we can retry if it fails. | 1102 | * Store enough information so we can retry if it fails. |
1095 | */ | 1103 | */ |
1096 | m = choose_mirror(ms, bio->bi_sector); | 1104 | m = choose_mirror(ms, bio->bi_sector); |
1097 | if (unlikely(!m)) | 1105 | if (unlikely(!m)) |
1098 | return -EIO; | 1106 | return -EIO; |
1099 | 1107 | ||
1100 | read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); | 1108 | read_record = mempool_alloc(ms->read_record_pool, GFP_NOIO); |
1101 | if (likely(read_record)) { | 1109 | if (likely(read_record)) { |
1102 | dm_bio_record(&read_record->details, bio); | 1110 | dm_bio_record(&read_record->details, bio); |
1103 | map_context->ptr = read_record; | 1111 | map_context->ptr = read_record; |
1104 | read_record->m = m; | 1112 | read_record->m = m; |
1105 | } | 1113 | } |
1106 | 1114 | ||
1107 | map_bio(m, bio); | 1115 | map_bio(m, bio); |
1108 | 1116 | ||
1109 | return DM_MAPIO_REMAPPED; | 1117 | return DM_MAPIO_REMAPPED; |
1110 | } | 1118 | } |
1111 | 1119 | ||
1112 | static int mirror_end_io(struct dm_target *ti, struct bio *bio, | 1120 | static int mirror_end_io(struct dm_target *ti, struct bio *bio, |
1113 | int error, union map_info *map_context) | 1121 | int error, union map_info *map_context) |
1114 | { | 1122 | { |
1115 | int rw = bio_rw(bio); | 1123 | int rw = bio_rw(bio); |
1116 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1124 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1117 | struct mirror *m = NULL; | 1125 | struct mirror *m = NULL; |
1118 | struct dm_bio_details *bd = NULL; | 1126 | struct dm_bio_details *bd = NULL; |
1119 | struct dm_raid1_read_record *read_record = map_context->ptr; | 1127 | struct dm_raid1_read_record *read_record = map_context->ptr; |
1120 | 1128 | ||
1121 | /* | 1129 | /* |
1122 | * We need to dec pending if this was a write. | 1130 | * We need to dec pending if this was a write. |
1123 | */ | 1131 | */ |
1124 | if (rw == WRITE) { | 1132 | if (rw == WRITE) { |
1125 | dm_rh_dec(ms->rh, map_context->ll); | 1133 | if (likely(!bio_empty_barrier(bio))) |
1134 | dm_rh_dec(ms->rh, map_context->ll); | ||
1126 | return error; | 1135 | return error; |
1127 | } | 1136 | } |
1128 | 1137 | ||
1129 | if (error == -EOPNOTSUPP) | 1138 | if (error == -EOPNOTSUPP) |
1130 | goto out; | 1139 | goto out; |
1131 | 1140 | ||
1132 | if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) | 1141 | if ((error == -EWOULDBLOCK) && bio_rw_flagged(bio, BIO_RW_AHEAD)) |
1133 | goto out; | 1142 | goto out; |
1134 | 1143 | ||
1135 | if (unlikely(error)) { | 1144 | if (unlikely(error)) { |
1136 | if (!read_record) { | 1145 | if (!read_record) { |
1137 | /* | 1146 | /* |
1138 | * There wasn't enough memory to record necessary | 1147 | * There wasn't enough memory to record necessary |
1139 | * information for a retry or there was no other | 1148 | * information for a retry or there was no other |
1140 | * mirror in-sync. | 1149 | * mirror in-sync. |
1141 | */ | 1150 | */ |
1142 | DMERR_LIMIT("Mirror read failed."); | 1151 | DMERR_LIMIT("Mirror read failed."); |
1143 | return -EIO; | 1152 | return -EIO; |
1144 | } | 1153 | } |
1145 | 1154 | ||
1146 | m = read_record->m; | 1155 | m = read_record->m; |
1147 | 1156 | ||
1148 | DMERR("Mirror read failed from %s. Trying alternative device.", | 1157 | DMERR("Mirror read failed from %s. Trying alternative device.", |
1149 | m->dev->name); | 1158 | m->dev->name); |
1150 | 1159 | ||
1151 | fail_mirror(m, DM_RAID1_READ_ERROR); | 1160 | fail_mirror(m, DM_RAID1_READ_ERROR); |
1152 | 1161 | ||
1153 | /* | 1162 | /* |
1154 | * A failed read is requeued for another attempt using an intact | 1163 | * A failed read is requeued for another attempt using an intact |
1155 | * mirror. | 1164 | * mirror. |
1156 | */ | 1165 | */ |
1157 | if (default_ok(m) || mirror_available(ms, bio)) { | 1166 | if (default_ok(m) || mirror_available(ms, bio)) { |
1158 | bd = &read_record->details; | 1167 | bd = &read_record->details; |
1159 | 1168 | ||
1160 | dm_bio_restore(bd, bio); | 1169 | dm_bio_restore(bd, bio); |
1161 | mempool_free(read_record, ms->read_record_pool); | 1170 | mempool_free(read_record, ms->read_record_pool); |
1162 | map_context->ptr = NULL; | 1171 | map_context->ptr = NULL; |
1163 | queue_bio(ms, bio, rw); | 1172 | queue_bio(ms, bio, rw); |
1164 | return 1; | 1173 | return 1; |
1165 | } | 1174 | } |
1166 | DMERR("All replicated volumes dead, failing I/O"); | 1175 | DMERR("All replicated volumes dead, failing I/O"); |
1167 | } | 1176 | } |
1168 | 1177 | ||
1169 | out: | 1178 | out: |
1170 | if (read_record) { | 1179 | if (read_record) { |
1171 | mempool_free(read_record, ms->read_record_pool); | 1180 | mempool_free(read_record, ms->read_record_pool); |
1172 | map_context->ptr = NULL; | 1181 | map_context->ptr = NULL; |
1173 | } | 1182 | } |
1174 | 1183 | ||
1175 | return error; | 1184 | return error; |
1176 | } | 1185 | } |
1177 | 1186 | ||
1178 | static void mirror_presuspend(struct dm_target *ti) | 1187 | static void mirror_presuspend(struct dm_target *ti) |
1179 | { | 1188 | { |
1180 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1189 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1181 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1190 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1182 | 1191 | ||
1183 | atomic_set(&ms->suspend, 1); | 1192 | atomic_set(&ms->suspend, 1); |
1184 | 1193 | ||
1185 | /* | 1194 | /* |
1186 | * We must finish up all the work that we've | 1195 | * We must finish up all the work that we've |
1187 | * generated (i.e. recovery work). | 1196 | * generated (i.e. recovery work). |
1188 | */ | 1197 | */ |
1189 | dm_rh_stop_recovery(ms->rh); | 1198 | dm_rh_stop_recovery(ms->rh); |
1190 | 1199 | ||
1191 | wait_event(_kmirrord_recovery_stopped, | 1200 | wait_event(_kmirrord_recovery_stopped, |
1192 | !dm_rh_recovery_in_flight(ms->rh)); | 1201 | !dm_rh_recovery_in_flight(ms->rh)); |
1193 | 1202 | ||
1194 | if (log->type->presuspend && log->type->presuspend(log)) | 1203 | if (log->type->presuspend && log->type->presuspend(log)) |
1195 | /* FIXME: need better error handling */ | 1204 | /* FIXME: need better error handling */ |
1196 | DMWARN("log presuspend failed"); | 1205 | DMWARN("log presuspend failed"); |
1197 | 1206 | ||
1198 | /* | 1207 | /* |
1199 | * Now that recovery is complete/stopped and the | 1208 | * Now that recovery is complete/stopped and the |
1200 | * delayed bios are queued, we need to wait for | 1209 | * delayed bios are queued, we need to wait for |
1201 | * the worker thread to complete. This way, | 1210 | * the worker thread to complete. This way, |
1202 | * we know that all of our I/O has been pushed. | 1211 | * we know that all of our I/O has been pushed. |
1203 | */ | 1212 | */ |
1204 | flush_workqueue(ms->kmirrord_wq); | 1213 | flush_workqueue(ms->kmirrord_wq); |
1205 | } | 1214 | } |
1206 | 1215 | ||
1207 | static void mirror_postsuspend(struct dm_target *ti) | 1216 | static void mirror_postsuspend(struct dm_target *ti) |
1208 | { | 1217 | { |
1209 | struct mirror_set *ms = ti->private; | 1218 | struct mirror_set *ms = ti->private; |
1210 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1219 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1211 | 1220 | ||
1212 | if (log->type->postsuspend && log->type->postsuspend(log)) | 1221 | if (log->type->postsuspend && log->type->postsuspend(log)) |
1213 | /* FIXME: need better error handling */ | 1222 | /* FIXME: need better error handling */ |
1214 | DMWARN("log postsuspend failed"); | 1223 | DMWARN("log postsuspend failed"); |
1215 | } | 1224 | } |
1216 | 1225 | ||
1217 | static void mirror_resume(struct dm_target *ti) | 1226 | static void mirror_resume(struct dm_target *ti) |
1218 | { | 1227 | { |
1219 | struct mirror_set *ms = ti->private; | 1228 | struct mirror_set *ms = ti->private; |
1220 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1229 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1221 | 1230 | ||
1222 | atomic_set(&ms->suspend, 0); | 1231 | atomic_set(&ms->suspend, 0); |
1223 | if (log->type->resume && log->type->resume(log)) | 1232 | if (log->type->resume && log->type->resume(log)) |
1224 | /* FIXME: need better error handling */ | 1233 | /* FIXME: need better error handling */ |
1225 | DMWARN("log resume failed"); | 1234 | DMWARN("log resume failed"); |
1226 | dm_rh_start_recovery(ms->rh); | 1235 | dm_rh_start_recovery(ms->rh); |
1227 | } | 1236 | } |
1228 | 1237 | ||
1229 | /* | 1238 | /* |
1230 | * device_status_char | 1239 | * device_status_char |
1231 | * @m: mirror device/leg we want the status of | 1240 | * @m: mirror device/leg we want the status of |
1232 | * | 1241 | * |
1233 | * We return one character representing the most severe error | 1242 | * We return one character representing the most severe error |
1234 | * we have encountered. | 1243 | * we have encountered. |
1235 | * A => Alive - No failures | 1244 | * A => Alive - No failures |
1236 | * D => Dead - A write failure occurred leaving mirror out-of-sync | 1245 | * D => Dead - A write failure occurred leaving mirror out-of-sync |
1237 | * S => Sync - A synchronization failure occurred, mirror out-of-sync | 1246 | * S => Sync - A synchronization failure occurred, mirror out-of-sync |
1238 | * R => Read - A read failure occurred, mirror data unaffected | 1247 | * R => Read - A read failure occurred, mirror data unaffected |
1239 | * | 1248 | * |
1240 | * Returns: <char> | 1249 | * Returns: <char> |
1241 | */ | 1250 | */ |
1242 | static char device_status_char(struct mirror *m) | 1251 | static char device_status_char(struct mirror *m) |
1243 | { | 1252 | { |
1244 | if (!atomic_read(&(m->error_count))) | 1253 | if (!atomic_read(&(m->error_count))) |
1245 | return 'A'; | 1254 | return 'A'; |
1246 | 1255 | ||
1247 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : | 1256 | return (test_bit(DM_RAID1_WRITE_ERROR, &(m->error_type))) ? 'D' : |
1248 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : | 1257 | (test_bit(DM_RAID1_SYNC_ERROR, &(m->error_type))) ? 'S' : |
1249 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; | 1258 | (test_bit(DM_RAID1_READ_ERROR, &(m->error_type))) ? 'R' : 'U'; |
1250 | } | 1259 | } |
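For example (values invented for illustration): in a two-leg set whose first mirror has recorded a write failure while the second still has a zero error count, this function returns 'D' and then 'A', so the status buffer assembled in mirror_status() below reads "DA".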
1251 | 1260 | ||
1252 | 1261 | ||
1253 | static int mirror_status(struct dm_target *ti, status_type_t type, | 1262 | static int mirror_status(struct dm_target *ti, status_type_t type, |
1254 | char *result, unsigned int maxlen) | 1263 | char *result, unsigned int maxlen) |
1255 | { | 1264 | { |
1256 | unsigned int m, sz = 0; | 1265 | unsigned int m, sz = 0; |
1257 | struct mirror_set *ms = (struct mirror_set *) ti->private; | 1266 | struct mirror_set *ms = (struct mirror_set *) ti->private; |
1258 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); | 1267 | struct dm_dirty_log *log = dm_rh_dirty_log(ms->rh); |
1259 | char buffer[ms->nr_mirrors + 1]; | 1268 | char buffer[ms->nr_mirrors + 1]; |
1260 | 1269 | ||
1261 | switch (type) { | 1270 | switch (type) { |
1262 | case STATUSTYPE_INFO: | 1271 | case STATUSTYPE_INFO: |
1263 | DMEMIT("%d ", ms->nr_mirrors); | 1272 | DMEMIT("%d ", ms->nr_mirrors); |
1264 | for (m = 0; m < ms->nr_mirrors; m++) { | 1273 | for (m = 0; m < ms->nr_mirrors; m++) { |
1265 | DMEMIT("%s ", ms->mirror[m].dev->name); | 1274 | DMEMIT("%s ", ms->mirror[m].dev->name); |
1266 | buffer[m] = device_status_char(&(ms->mirror[m])); | 1275 | buffer[m] = device_status_char(&(ms->mirror[m])); |
1267 | } | 1276 | } |
1268 | buffer[m] = '\0'; | 1277 | buffer[m] = '\0'; |
1269 | 1278 | ||
1270 | DMEMIT("%llu/%llu 1 %s ", | 1279 | DMEMIT("%llu/%llu 1 %s ", |
1271 | (unsigned long long)log->type->get_sync_count(log), | 1280 | (unsigned long long)log->type->get_sync_count(log), |
1272 | (unsigned long long)ms->nr_regions, buffer); | 1281 | (unsigned long long)ms->nr_regions, buffer); |
1273 | 1282 | ||
1274 | sz += log->type->status(log, type, result+sz, maxlen-sz); | 1283 | sz += log->type->status(log, type, result+sz, maxlen-sz); |
1275 | 1284 | ||
1276 | break; | 1285 | break; |
1277 | 1286 | ||
1278 | case STATUSTYPE_TABLE: | 1287 | case STATUSTYPE_TABLE: |
1279 | sz = log->type->status(log, type, result, maxlen); | 1288 | sz = log->type->status(log, type, result, maxlen); |
1280 | 1289 | ||
1281 | DMEMIT("%d", ms->nr_mirrors); | 1290 | DMEMIT("%d", ms->nr_mirrors); |
1282 | for (m = 0; m < ms->nr_mirrors; m++) | 1291 | for (m = 0; m < ms->nr_mirrors; m++) |
1283 | DMEMIT(" %s %llu", ms->mirror[m].dev->name, | 1292 | DMEMIT(" %s %llu", ms->mirror[m].dev->name, |
1284 | (unsigned long long)ms->mirror[m].offset); | 1293 | (unsigned long long)ms->mirror[m].offset); |
1285 | 1294 | ||
1286 | if (ms->features & DM_RAID1_HANDLE_ERRORS) | 1295 | if (ms->features & DM_RAID1_HANDLE_ERRORS) |
1287 | DMEMIT(" 1 handle_errors"); | 1296 | DMEMIT(" 1 handle_errors"); |
1288 | } | 1297 | } |
1289 | 1298 | ||
1290 | return 0; | 1299 | return 0; |
1291 | } | 1300 | } |
1292 | 1301 | ||
1293 | static int mirror_iterate_devices(struct dm_target *ti, | 1302 | static int mirror_iterate_devices(struct dm_target *ti, |
1294 | iterate_devices_callout_fn fn, void *data) | 1303 | iterate_devices_callout_fn fn, void *data) |
1295 | { | 1304 | { |
1296 | struct mirror_set *ms = ti->private; | 1305 | struct mirror_set *ms = ti->private; |
1297 | int ret = 0; | 1306 | int ret = 0; |
1298 | unsigned i; | 1307 | unsigned i; |
1299 | 1308 | ||
1300 | for (i = 0; !ret && i < ms->nr_mirrors; i++) | 1309 | for (i = 0; !ret && i < ms->nr_mirrors; i++) |
1301 | ret = fn(ti, ms->mirror[i].dev, | 1310 | ret = fn(ti, ms->mirror[i].dev, |
1302 | ms->mirror[i].offset, ti->len, data); | 1311 | ms->mirror[i].offset, ti->len, data); |
1303 | 1312 | ||
1304 | return ret; | 1313 | return ret; |
1305 | } | 1314 | } |
1306 | 1315 | ||
1307 | static struct target_type mirror_target = { | 1316 | static struct target_type mirror_target = { |
1308 | .name = "mirror", | 1317 | .name = "mirror", |
1309 | .version = {1, 12, 0}, | 1318 | .version = {1, 12, 0}, |
1310 | .module = THIS_MODULE, | 1319 | .module = THIS_MODULE, |
1311 | .ctr = mirror_ctr, | 1320 | .ctr = mirror_ctr, |
1312 | .dtr = mirror_dtr, | 1321 | .dtr = mirror_dtr, |
1313 | .map = mirror_map, | 1322 | .map = mirror_map, |
1314 | .end_io = mirror_end_io, | 1323 | .end_io = mirror_end_io, |
1315 | .presuspend = mirror_presuspend, | 1324 | .presuspend = mirror_presuspend, |
1316 | .postsuspend = mirror_postsuspend, | 1325 | .postsuspend = mirror_postsuspend, |
1317 | .resume = mirror_resume, | 1326 | .resume = mirror_resume, |
1318 | .status = mirror_status, | 1327 | .status = mirror_status, |
1319 | .iterate_devices = mirror_iterate_devices, | 1328 | .iterate_devices = mirror_iterate_devices, |
1320 | }; | 1329 | }; |
1321 | 1330 | ||
1322 | static int __init dm_mirror_init(void) | 1331 | static int __init dm_mirror_init(void) |
1323 | { | 1332 | { |
1324 | int r; | 1333 | int r; |
1325 | 1334 | ||
1326 | _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); | 1335 | _dm_raid1_read_record_cache = KMEM_CACHE(dm_raid1_read_record, 0); |
1327 | if (!_dm_raid1_read_record_cache) { | 1336 | if (!_dm_raid1_read_record_cache) { |
1328 | DMERR("Can't allocate dm_raid1_read_record cache"); | 1337 | DMERR("Can't allocate dm_raid1_read_record cache"); |
1329 | r = -ENOMEM; | 1338 | r = -ENOMEM; |
1330 | goto bad_cache; | 1339 | goto bad_cache; |
1331 | } | 1340 | } |
1332 | 1341 | ||
1333 | r = dm_register_target(&mirror_target); | 1342 | r = dm_register_target(&mirror_target); |
1334 | if (r < 0) { | 1343 | if (r < 0) { |
1335 | DMERR("Failed to register mirror target"); | 1344 | DMERR("Failed to register mirror target"); |
1336 | goto bad_target; | 1345 | goto bad_target; |
1337 | } | 1346 | } |
1338 | 1347 | ||
1339 | return 0; | 1348 | return 0; |
1340 | 1349 | ||
1341 | bad_target: | 1350 | bad_target: |
1342 | kmem_cache_destroy(_dm_raid1_read_record_cache); | 1351 | kmem_cache_destroy(_dm_raid1_read_record_cache); |
1343 | bad_cache: | 1352 | bad_cache: |
1344 | return r; | 1353 | return r; |
1345 | } | 1354 | } |
1346 | 1355 | ||
1347 | static void __exit dm_mirror_exit(void) | 1356 | static void __exit dm_mirror_exit(void) |
1348 | { | 1357 | { |
1349 | dm_unregister_target(&mirror_target); | 1358 | dm_unregister_target(&mirror_target); |
1350 | kmem_cache_destroy(_dm_raid1_read_record_cache); | 1359 | kmem_cache_destroy(_dm_raid1_read_record_cache); |
1351 | } | 1360 | } |
1352 | 1361 | ||
1353 | /* Module hooks */ | 1362 | /* Module hooks */ |
1354 | module_init(dm_mirror_init); | 1363 | module_init(dm_mirror_init); |
1355 | module_exit(dm_mirror_exit); | 1364 | module_exit(dm_mirror_exit); |
1356 | 1365 | ||
1357 | MODULE_DESCRIPTION(DM_NAME " mirror target"); | 1366 | MODULE_DESCRIPTION(DM_NAME " mirror target"); |
1358 | MODULE_AUTHOR("Joe Thornber"); | 1367 | MODULE_AUTHOR("Joe Thornber"); |
1359 | MODULE_LICENSE("GPL"); | 1368 | MODULE_LICENSE("GPL"); |
1360 | 1369 |
drivers/md/dm-region-hash.c
1 | /* | 1 | /* |
2 | * Copyright (C) 2003 Sistina Software Limited. | 2 | * Copyright (C) 2003 Sistina Software Limited. |
3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. | 3 | * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. |
4 | * | 4 | * |
5 | * This file is released under the GPL. | 5 | * This file is released under the GPL. |
6 | */ | 6 | */ |
7 | 7 | ||
8 | #include <linux/dm-dirty-log.h> | 8 | #include <linux/dm-dirty-log.h> |
9 | #include <linux/dm-region-hash.h> | 9 | #include <linux/dm-region-hash.h> |
10 | 10 | ||
11 | #include <linux/ctype.h> | 11 | #include <linux/ctype.h> |
12 | #include <linux/init.h> | 12 | #include <linux/init.h> |
13 | #include <linux/module.h> | 13 | #include <linux/module.h> |
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | 15 | ||
16 | #include "dm.h" | 16 | #include "dm.h" |
17 | 17 | ||
18 | #define DM_MSG_PREFIX "region hash" | 18 | #define DM_MSG_PREFIX "region hash" |
19 | 19 | ||
20 | /*----------------------------------------------------------------- | 20 | /*----------------------------------------------------------------- |
21 | * Region hash | 21 | * Region hash |
22 | * | 22 | * |
23 | * The mirror splits itself up into discrete regions. Each | 23 | * The mirror splits itself up into discrete regions. Each |
24 | * region can be in one of three states: clean, dirty, | 24 | * region can be in one of three states: clean, dirty, |
25 | * nosync. There is no need to put clean regions in the hash. | 25 | * nosync. There is no need to put clean regions in the hash. |
26 | * | 26 | * |
27 | * In addition to being present in the hash table a region _may_ | 27 | * In addition to being present in the hash table a region _may_ |
28 | * be present on one of three lists. | 28 | * be present on one of three lists. |
29 | * | 29 | * |
30 | * clean_regions: Regions on this list have no io pending to | 30 | * clean_regions: Regions on this list have no io pending to |
31 | * them, they are in sync, we are no longer interested in them, | 31 | * them, they are in sync, we are no longer interested in them, |
32 | * they are dull. dm_rh_update_states() will remove them from the | 32 | * they are dull. dm_rh_update_states() will remove them from the |
33 | * hash table. | 33 | * hash table. |
34 | * | 34 | * |
35 | * quiesced_regions: These regions have been spun down, ready | 35 | * quiesced_regions: These regions have been spun down, ready |
36 | * for recovery. rh_recovery_start() will remove regions from | 36 | * for recovery. rh_recovery_start() will remove regions from |
37 | * this list and hand them to kmirrord, which will schedule the | 37 | * this list and hand them to kmirrord, which will schedule the |
38 | * recovery io with kcopyd. | 38 | * recovery io with kcopyd. |
39 | * | 39 | * |
40 | * recovered_regions: Regions that kcopyd has successfully | 40 | * recovered_regions: Regions that kcopyd has successfully |
41 | * recovered. dm_rh_update_states() will now schedule any delayed | 41 | * recovered. dm_rh_update_states() will now schedule any delayed |
42 | * io, up the recovery_count, and remove the region from the | 42 | * io, up the recovery_count, and remove the region from the |
43 | * hash. | 43 | * hash. |
44 | * | 44 | * |
45 | * There are 2 locks: | 45 | * There are 2 locks: |
46 | * A rw spin lock 'hash_lock' protects just the hash table, | 46 | * A rw spin lock 'hash_lock' protects just the hash table, |
47 | * this is never held in write mode from interrupt context, | 47 | * this is never held in write mode from interrupt context, |
48 | * which I believe means that we only have to disable irqs when | 48 | * which I believe means that we only have to disable irqs when |
49 | * doing a write lock. | 49 | * doing a write lock. |
50 | * | 50 | * |
51 | * An ordinary spin lock 'region_lock' that protects the three | 51 | * An ordinary spin lock 'region_lock' that protects the three |
52 | * lists in the region_hash, with the 'state', 'list' and | 52 | * lists in the region_hash, with the 'state', 'list' and |
53 | * 'delayed_bios' fields of the regions. This is used from irq | 53 | * 'delayed_bios' fields of the regions. This is used from irq |
54 | * context, so all other uses will have to suspend local irqs. | 54 | * context, so all other uses will have to suspend local irqs. |
55 | *---------------------------------------------------------------*/ | 55 | *---------------------------------------------------------------*/ |
56 | struct dm_region_hash { | 56 | struct dm_region_hash { |
57 | uint32_t region_size; | 57 | uint32_t region_size; |
58 | unsigned region_shift; | 58 | unsigned region_shift; |
59 | 59 | ||
60 | /* holds persistent region state */ | 60 | /* holds persistent region state */ |
61 | struct dm_dirty_log *log; | 61 | struct dm_dirty_log *log; |
62 | 62 | ||
63 | /* hash table */ | 63 | /* hash table */ |
64 | rwlock_t hash_lock; | 64 | rwlock_t hash_lock; |
65 | mempool_t *region_pool; | 65 | mempool_t *region_pool; |
66 | unsigned mask; | 66 | unsigned mask; |
67 | unsigned nr_buckets; | 67 | unsigned nr_buckets; |
68 | unsigned prime; | 68 | unsigned prime; |
69 | unsigned shift; | 69 | unsigned shift; |
70 | struct list_head *buckets; | 70 | struct list_head *buckets; |
71 | 71 | ||
72 | unsigned max_recovery; /* Max # of regions to recover in parallel */ | 72 | unsigned max_recovery; /* Max # of regions to recover in parallel */ |
73 | 73 | ||
74 | spinlock_t region_lock; | 74 | spinlock_t region_lock; |
75 | atomic_t recovery_in_flight; | 75 | atomic_t recovery_in_flight; |
76 | struct semaphore recovery_count; | 76 | struct semaphore recovery_count; |
77 | struct list_head clean_regions; | 77 | struct list_head clean_regions; |
78 | struct list_head quiesced_regions; | 78 | struct list_head quiesced_regions; |
79 | struct list_head recovered_regions; | 79 | struct list_head recovered_regions; |
80 | struct list_head failed_recovered_regions; | 80 | struct list_head failed_recovered_regions; |
81 | 81 | ||
82 | /* | ||
83 | * If there was a barrier failure no regions can be marked clean. | ||
84 | */ | ||
85 | int barrier_failure; | ||
86 | |||
82 | void *context; | 87 | void *context; |
83 | sector_t target_begin; | 88 | sector_t target_begin; |
84 | 89 | ||
85 | /* Callback function to schedule bios writes */ | 90 | /* Callback function to schedule bios writes */ |
86 | void (*dispatch_bios)(void *context, struct bio_list *bios); | 91 | void (*dispatch_bios)(void *context, struct bio_list *bios); |
87 | 92 | ||
88 | /* Callback function to wakeup callers worker thread. */ | 93 | /* Callback function to wakeup callers worker thread. */ |
89 | void (*wakeup_workers)(void *context); | 94 | void (*wakeup_workers)(void *context); |
90 | 95 | ||
91 | /* Callback function to wakeup callers recovery waiters. */ | 96 | /* Callback function to wakeup callers recovery waiters. */ |
92 | void (*wakeup_all_recovery_waiters)(void *context); | 97 | void (*wakeup_all_recovery_waiters)(void *context); |
93 | }; | 98 | }; |
94 | 99 | ||
95 | struct dm_region { | 100 | struct dm_region { |
96 | struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ | 101 | struct dm_region_hash *rh; /* FIXME: can we get rid of this ? */ |
97 | region_t key; | 102 | region_t key; |
98 | int state; | 103 | int state; |
99 | 104 | ||
100 | struct list_head hash_list; | 105 | struct list_head hash_list; |
101 | struct list_head list; | 106 | struct list_head list; |
102 | 107 | ||
103 | atomic_t pending; | 108 | atomic_t pending; |
104 | struct bio_list delayed_bios; | 109 | struct bio_list delayed_bios; |
105 | }; | 110 | }; |
106 | 111 | ||
107 | /* | 112 | /* |
108 | * Conversion fns | 113 | * Conversion fns |
109 | */ | 114 | */ |
110 | static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) | 115 | static region_t dm_rh_sector_to_region(struct dm_region_hash *rh, sector_t sector) |
111 | { | 116 | { |
112 | return sector >> rh->region_shift; | 117 | return sector >> rh->region_shift; |
113 | } | 118 | } |
114 | 119 | ||
115 | sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) | 120 | sector_t dm_rh_region_to_sector(struct dm_region_hash *rh, region_t region) |
116 | { | 121 | { |
117 | return region << rh->region_shift; | 122 | return region << rh->region_shift; |
118 | } | 123 | } |
119 | EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); | 124 | EXPORT_SYMBOL_GPL(dm_rh_region_to_sector); |
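A worked example of the two conversions above, with values invented for illustration: for a region_size of 1024 sectors, region_shift is ffs(1024) - 1 = 10 (set in dm_region_hash_create() below), so sector 5000 lands in region 5000 >> 10 = 4, and region 4 maps back to its first sector, 4 << 10 = 4096.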
120 | 125 | ||
121 | region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) | 126 | region_t dm_rh_bio_to_region(struct dm_region_hash *rh, struct bio *bio) |
122 | { | 127 | { |
123 | return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); | 128 | return dm_rh_sector_to_region(rh, bio->bi_sector - rh->target_begin); |
124 | } | 129 | } |
125 | EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); | 130 | EXPORT_SYMBOL_GPL(dm_rh_bio_to_region); |
126 | 131 | ||
127 | void *dm_rh_region_context(struct dm_region *reg) | 132 | void *dm_rh_region_context(struct dm_region *reg) |
128 | { | 133 | { |
129 | return reg->rh->context; | 134 | return reg->rh->context; |
130 | } | 135 | } |
131 | EXPORT_SYMBOL_GPL(dm_rh_region_context); | 136 | EXPORT_SYMBOL_GPL(dm_rh_region_context); |
132 | 137 | ||
133 | region_t dm_rh_get_region_key(struct dm_region *reg) | 138 | region_t dm_rh_get_region_key(struct dm_region *reg) |
134 | { | 139 | { |
135 | return reg->key; | 140 | return reg->key; |
136 | } | 141 | } |
137 | EXPORT_SYMBOL_GPL(dm_rh_get_region_key); | 142 | EXPORT_SYMBOL_GPL(dm_rh_get_region_key); |
138 | 143 | ||
139 | sector_t dm_rh_get_region_size(struct dm_region_hash *rh) | 144 | sector_t dm_rh_get_region_size(struct dm_region_hash *rh) |
140 | { | 145 | { |
141 | return rh->region_size; | 146 | return rh->region_size; |
142 | } | 147 | } |
143 | EXPORT_SYMBOL_GPL(dm_rh_get_region_size); | 148 | EXPORT_SYMBOL_GPL(dm_rh_get_region_size); |
144 | 149 | ||
145 | /* | 150 | /* |
146 | * FIXME: shall we pass in a structure instead of all these args to | 151 | * FIXME: shall we pass in a structure instead of all these args to |
147 | * dm_region_hash_create()???? | 152 | * dm_region_hash_create()???? |
148 | */ | 153 | */ |
149 | #define RH_HASH_MULT 2654435387U | 154 | #define RH_HASH_MULT 2654435387U |
150 | #define RH_HASH_SHIFT 12 | 155 | #define RH_HASH_SHIFT 12 |
151 | 156 | ||
152 | #define MIN_REGIONS 64 | 157 | #define MIN_REGIONS 64 |
153 | struct dm_region_hash *dm_region_hash_create( | 158 | struct dm_region_hash *dm_region_hash_create( |
154 | void *context, void (*dispatch_bios)(void *context, | 159 | void *context, void (*dispatch_bios)(void *context, |
155 | struct bio_list *bios), | 160 | struct bio_list *bios), |
156 | void (*wakeup_workers)(void *context), | 161 | void (*wakeup_workers)(void *context), |
157 | void (*wakeup_all_recovery_waiters)(void *context), | 162 | void (*wakeup_all_recovery_waiters)(void *context), |
158 | sector_t target_begin, unsigned max_recovery, | 163 | sector_t target_begin, unsigned max_recovery, |
159 | struct dm_dirty_log *log, uint32_t region_size, | 164 | struct dm_dirty_log *log, uint32_t region_size, |
160 | region_t nr_regions) | 165 | region_t nr_regions) |
161 | { | 166 | { |
162 | struct dm_region_hash *rh; | 167 | struct dm_region_hash *rh; |
163 | unsigned nr_buckets, max_buckets; | 168 | unsigned nr_buckets, max_buckets; |
164 | size_t i; | 169 | size_t i; |
165 | 170 | ||
166 | /* | 171 | /* |
167 | * Calculate a suitable number of buckets for our hash | 172 | * Calculate a suitable number of buckets for our hash |
168 | * table. | 173 | * table. |
169 | */ | 174 | */ |
170 | max_buckets = nr_regions >> 6; | 175 | max_buckets = nr_regions >> 6; |
171 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) | 176 | for (nr_buckets = 128u; nr_buckets < max_buckets; nr_buckets <<= 1) |
172 | ; | 177 | ; |
173 | nr_buckets >>= 1; | 178 | nr_buckets >>= 1; |
174 | 179 | ||
175 | rh = kmalloc(sizeof(*rh), GFP_KERNEL); | 180 | rh = kmalloc(sizeof(*rh), GFP_KERNEL); |
176 | if (!rh) { | 181 | if (!rh) { |
177 | DMERR("unable to allocate region hash memory"); | 182 | DMERR("unable to allocate region hash memory"); |
178 | return ERR_PTR(-ENOMEM); | 183 | return ERR_PTR(-ENOMEM); |
179 | } | 184 | } |
180 | 185 | ||
181 | rh->context = context; | 186 | rh->context = context; |
182 | rh->dispatch_bios = dispatch_bios; | 187 | rh->dispatch_bios = dispatch_bios; |
183 | rh->wakeup_workers = wakeup_workers; | 188 | rh->wakeup_workers = wakeup_workers; |
184 | rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; | 189 | rh->wakeup_all_recovery_waiters = wakeup_all_recovery_waiters; |
185 | rh->target_begin = target_begin; | 190 | rh->target_begin = target_begin; |
186 | rh->max_recovery = max_recovery; | 191 | rh->max_recovery = max_recovery; |
187 | rh->log = log; | 192 | rh->log = log; |
188 | rh->region_size = region_size; | 193 | rh->region_size = region_size; |
189 | rh->region_shift = ffs(region_size) - 1; | 194 | rh->region_shift = ffs(region_size) - 1; |
190 | rwlock_init(&rh->hash_lock); | 195 | rwlock_init(&rh->hash_lock); |
191 | rh->mask = nr_buckets - 1; | 196 | rh->mask = nr_buckets - 1; |
192 | rh->nr_buckets = nr_buckets; | 197 | rh->nr_buckets = nr_buckets; |
193 | 198 | ||
194 | rh->shift = RH_HASH_SHIFT; | 199 | rh->shift = RH_HASH_SHIFT; |
195 | rh->prime = RH_HASH_MULT; | 200 | rh->prime = RH_HASH_MULT; |
196 | 201 | ||
197 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); | 202 | rh->buckets = vmalloc(nr_buckets * sizeof(*rh->buckets)); |
198 | if (!rh->buckets) { | 203 | if (!rh->buckets) { |
199 | DMERR("unable to allocate region hash bucket memory"); | 204 | DMERR("unable to allocate region hash bucket memory"); |
200 | kfree(rh); | 205 | kfree(rh); |
201 | return ERR_PTR(-ENOMEM); | 206 | return ERR_PTR(-ENOMEM); |
202 | } | 207 | } |
203 | 208 | ||
204 | for (i = 0; i < nr_buckets; i++) | 209 | for (i = 0; i < nr_buckets; i++) |
205 | INIT_LIST_HEAD(rh->buckets + i); | 210 | INIT_LIST_HEAD(rh->buckets + i); |
206 | 211 | ||
207 | spin_lock_init(&rh->region_lock); | 212 | spin_lock_init(&rh->region_lock); |
208 | sema_init(&rh->recovery_count, 0); | 213 | sema_init(&rh->recovery_count, 0); |
209 | atomic_set(&rh->recovery_in_flight, 0); | 214 | atomic_set(&rh->recovery_in_flight, 0); |
210 | INIT_LIST_HEAD(&rh->clean_regions); | 215 | INIT_LIST_HEAD(&rh->clean_regions); |
211 | INIT_LIST_HEAD(&rh->quiesced_regions); | 216 | INIT_LIST_HEAD(&rh->quiesced_regions); |
212 | INIT_LIST_HEAD(&rh->recovered_regions); | 217 | INIT_LIST_HEAD(&rh->recovered_regions); |
213 | INIT_LIST_HEAD(&rh->failed_recovered_regions); | 218 | INIT_LIST_HEAD(&rh->failed_recovered_regions); |
219 | rh->barrier_failure = 0; | ||
214 | 220 | ||
215 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, | 221 | rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS, |
216 | sizeof(struct dm_region)); | 222 | sizeof(struct dm_region)); |
217 | if (!rh->region_pool) { | 223 | if (!rh->region_pool) { |
218 | vfree(rh->buckets); | 224 | vfree(rh->buckets); |
219 | kfree(rh); | 225 | kfree(rh); |
220 | rh = ERR_PTR(-ENOMEM); | 226 | rh = ERR_PTR(-ENOMEM); |
221 | } | 227 | } |
222 | 228 | ||
223 | return rh; | 229 | return rh; |
224 | } | 230 | } |
225 | EXPORT_SYMBOL_GPL(dm_region_hash_create); | 231 | EXPORT_SYMBOL_GPL(dm_region_hash_create); |
226 | 232 | ||
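As a usage sketch (not taken from this patch), a client target wires its worker hooks into dm_region_hash_create() roughly as follows. The stub callbacks, example_create() and example_nr_regions are hypothetical names introduced only for illustration; a real caller must also check the result with IS_ERR(), since the function returns ERR_PTR(-ENOMEM) on allocation failure.

	static void example_dispatch_bios(void *context, struct bio_list *bios)
	{
		/* a real client would queue these delayed bios for its worker */
	}

	static void example_wakeup(void *context)
	{
		/* a real client would wake its worker thread / recovery waiters */
	}

	static struct dm_region_hash *example_create(struct dm_target *ti,
						     struct dm_dirty_log *log,
						     void *context,
						     region_t example_nr_regions)
	{
		return dm_region_hash_create(context, example_dispatch_bios,
					     example_wakeup, example_wakeup,
					     ti->begin, 1 /* max_recovery */, log,
					     log->type->get_region_size(log),
					     example_nr_regions);
	}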
227 | void dm_region_hash_destroy(struct dm_region_hash *rh) | 233 | void dm_region_hash_destroy(struct dm_region_hash *rh) |
228 | { | 234 | { |
229 | unsigned h; | 235 | unsigned h; |
230 | struct dm_region *reg, *nreg; | 236 | struct dm_region *reg, *nreg; |
231 | 237 | ||
232 | BUG_ON(!list_empty(&rh->quiesced_regions)); | 238 | BUG_ON(!list_empty(&rh->quiesced_regions)); |
233 | for (h = 0; h < rh->nr_buckets; h++) { | 239 | for (h = 0; h < rh->nr_buckets; h++) { |
234 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, | 240 | list_for_each_entry_safe(reg, nreg, rh->buckets + h, |
235 | hash_list) { | 241 | hash_list) { |
236 | BUG_ON(atomic_read(&reg->pending)); | 242 | BUG_ON(atomic_read(&reg->pending)); |
237 | mempool_free(reg, rh->region_pool); | 243 | mempool_free(reg, rh->region_pool); |
238 | } | 244 | } |
239 | } | 245 | } |
240 | 246 | ||
241 | if (rh->log) | 247 | if (rh->log) |
242 | dm_dirty_log_destroy(rh->log); | 248 | dm_dirty_log_destroy(rh->log); |
243 | 249 | ||
244 | if (rh->region_pool) | 250 | if (rh->region_pool) |
245 | mempool_destroy(rh->region_pool); | 251 | mempool_destroy(rh->region_pool); |
246 | 252 | ||
247 | vfree(rh->buckets); | 253 | vfree(rh->buckets); |
248 | kfree(rh); | 254 | kfree(rh); |
249 | } | 255 | } |
250 | EXPORT_SYMBOL_GPL(dm_region_hash_destroy); | 256 | EXPORT_SYMBOL_GPL(dm_region_hash_destroy); |
251 | 257 | ||
252 | struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) | 258 | struct dm_dirty_log *dm_rh_dirty_log(struct dm_region_hash *rh) |
253 | { | 259 | { |
254 | return rh->log; | 260 | return rh->log; |
255 | } | 261 | } |
256 | EXPORT_SYMBOL_GPL(dm_rh_dirty_log); | 262 | EXPORT_SYMBOL_GPL(dm_rh_dirty_log); |
257 | 263 | ||
258 | static unsigned rh_hash(struct dm_region_hash *rh, region_t region) | 264 | static unsigned rh_hash(struct dm_region_hash *rh, region_t region) |
259 | { | 265 | { |
260 | return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; | 266 | return (unsigned) ((region * rh->prime) >> rh->shift) & rh->mask; |
261 | } | 267 | } |
262 | 268 | ||
263 | static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) | 269 | static struct dm_region *__rh_lookup(struct dm_region_hash *rh, region_t region) |
264 | { | 270 | { |
265 | struct dm_region *reg; | 271 | struct dm_region *reg; |
266 | struct list_head *bucket = rh->buckets + rh_hash(rh, region); | 272 | struct list_head *bucket = rh->buckets + rh_hash(rh, region); |
267 | 273 | ||
268 | list_for_each_entry(reg, bucket, hash_list) | 274 | list_for_each_entry(reg, bucket, hash_list) |
269 | if (reg->key == region) | 275 | if (reg->key == region) |
270 | return reg; | 276 | return reg; |
271 | 277 | ||
272 | return NULL; | 278 | return NULL; |
273 | } | 279 | } |
274 | 280 | ||
275 | static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) | 281 | static void __rh_insert(struct dm_region_hash *rh, struct dm_region *reg) |
276 | { | 282 | { |
277 | list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key)); | 283 | list_add(&reg->hash_list, rh->buckets + rh_hash(rh, reg->key)); |
278 | } | 284 | } |
279 | 285 | ||
280 | static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) | 286 | static struct dm_region *__rh_alloc(struct dm_region_hash *rh, region_t region) |
281 | { | 287 | { |
282 | struct dm_region *reg, *nreg; | 288 | struct dm_region *reg, *nreg; |
283 | 289 | ||
284 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); | 290 | nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); |
285 | if (unlikely(!nreg)) | 291 | if (unlikely(!nreg)) |
286 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); | 292 | nreg = kmalloc(sizeof(*nreg), GFP_NOIO | __GFP_NOFAIL); |
287 | 293 | ||
288 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? | 294 | nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? |
289 | DM_RH_CLEAN : DM_RH_NOSYNC; | 295 | DM_RH_CLEAN : DM_RH_NOSYNC; |
290 | nreg->rh = rh; | 296 | nreg->rh = rh; |
291 | nreg->key = region; | 297 | nreg->key = region; |
292 | INIT_LIST_HEAD(&nreg->list); | 298 | INIT_LIST_HEAD(&nreg->list); |
293 | atomic_set(&nreg->pending, 0); | 299 | atomic_set(&nreg->pending, 0); |
294 | bio_list_init(&nreg->delayed_bios); | 300 | bio_list_init(&nreg->delayed_bios); |
295 | 301 | ||
296 | write_lock_irq(&rh->hash_lock); | 302 | write_lock_irq(&rh->hash_lock); |
297 | reg = __rh_lookup(rh, region); | 303 | reg = __rh_lookup(rh, region); |
298 | if (reg) | 304 | if (reg) |
299 | /* We lost the race. */ | 305 | /* We lost the race. */ |
300 | mempool_free(nreg, rh->region_pool); | 306 | mempool_free(nreg, rh->region_pool); |
301 | else { | 307 | else { |
302 | __rh_insert(rh, nreg); | 308 | __rh_insert(rh, nreg); |
303 | if (nreg->state == DM_RH_CLEAN) { | 309 | if (nreg->state == DM_RH_CLEAN) { |
304 | spin_lock(&rh->region_lock); | 310 | spin_lock(&rh->region_lock); |
305 | list_add(&nreg->list, &rh->clean_regions); | 311 | list_add(&nreg->list, &rh->clean_regions); |
306 | spin_unlock(&rh->region_lock); | 312 | spin_unlock(&rh->region_lock); |
307 | } | 313 | } |
308 | 314 | ||
309 | reg = nreg; | 315 | reg = nreg; |
310 | } | 316 | } |
311 | write_unlock_irq(&rh->hash_lock); | 317 | write_unlock_irq(&rh->hash_lock); |
312 | 318 | ||
313 | return reg; | 319 | return reg; |
314 | } | 320 | } |
315 | 321 | ||
316 | static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) | 322 | static struct dm_region *__rh_find(struct dm_region_hash *rh, region_t region) |
317 | { | 323 | { |
318 | struct dm_region *reg; | 324 | struct dm_region *reg; |
319 | 325 | ||
320 | reg = __rh_lookup(rh, region); | 326 | reg = __rh_lookup(rh, region); |
321 | if (!reg) { | 327 | if (!reg) { |
322 | read_unlock(&rh->hash_lock); | 328 | read_unlock(&rh->hash_lock); |
323 | reg = __rh_alloc(rh, region); | 329 | reg = __rh_alloc(rh, region); |
324 | read_lock(&rh->hash_lock); | 330 | read_lock(&rh->hash_lock); |
325 | } | 331 | } |
326 | 332 | ||
327 | return reg; | 333 | return reg; |
328 | } | 334 | } |
329 | 335 | ||
330 | int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) | 336 | int dm_rh_get_state(struct dm_region_hash *rh, region_t region, int may_block) |
331 | { | 337 | { |
332 | int r; | 338 | int r; |
333 | struct dm_region *reg; | 339 | struct dm_region *reg; |
334 | 340 | ||
335 | read_lock(&rh->hash_lock); | 341 | read_lock(&rh->hash_lock); |
336 | reg = __rh_lookup(rh, region); | 342 | reg = __rh_lookup(rh, region); |
337 | read_unlock(&rh->hash_lock); | 343 | read_unlock(&rh->hash_lock); |
338 | 344 | ||
339 | if (reg) | 345 | if (reg) |
340 | return reg->state; | 346 | return reg->state; |
341 | 347 | ||
342 | /* | 348 | /* |
343 | * The region wasn't in the hash, so we fall back to the | 349 | * The region wasn't in the hash, so we fall back to the |
344 | * dirty log. | 350 | * dirty log. |
345 | */ | 351 | */ |
346 | r = rh->log->type->in_sync(rh->log, region, may_block); | 352 | r = rh->log->type->in_sync(rh->log, region, may_block); |
347 | 353 | ||
348 | /* | 354 | /* |
349 | * Any error from the dirty log (eg. -EWOULDBLOCK) gets | 355 | * Any error from the dirty log (eg. -EWOULDBLOCK) gets |
350 | * taken as a DM_RH_NOSYNC | 356 | * taken as a DM_RH_NOSYNC |
351 | */ | 357 | */ |
352 | return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; | 358 | return r == 1 ? DM_RH_CLEAN : DM_RH_NOSYNC; |
353 | } | 359 | } |
354 | EXPORT_SYMBOL_GPL(dm_rh_get_state); | 360 | EXPORT_SYMBOL_GPL(dm_rh_get_state); |
355 | 361 | ||
356 | static void complete_resync_work(struct dm_region *reg, int success) | 362 | static void complete_resync_work(struct dm_region *reg, int success) |
357 | { | 363 | { |
358 | struct dm_region_hash *rh = reg->rh; | 364 | struct dm_region_hash *rh = reg->rh; |
359 | 365 | ||
360 | rh->log->type->set_region_sync(rh->log, reg->key, success); | 366 | rh->log->type->set_region_sync(rh->log, reg->key, success); |
361 | 367 | ||
362 | /* | 368 | /* |
363 | * Dispatch the bios before we call 'wake_up_all'. | 369 | * Dispatch the bios before we call 'wake_up_all'. |
364 | * This is important because if we are suspending, | 370 | * This is important because if we are suspending, |
365 | * we want to know that recovery is complete and | 371 | * we want to know that recovery is complete and |
366 | * the work queue is flushed. If we wake_up_all | 372 | * the work queue is flushed. If we wake_up_all |
367 | * before we dispatch_bios (queue bios and call wake()), | 373 | * before we dispatch_bios (queue bios and call wake()), |
368 | * then we risk suspending before the work queue | 374 | * then we risk suspending before the work queue |
369 | * has been properly flushed. | 375 | * has been properly flushed. |
370 | */ | 376 | */ |
371 | rh->dispatch_bios(rh->context, &reg->delayed_bios); | 377 | rh->dispatch_bios(rh->context, &reg->delayed_bios); |
372 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | 378 | if (atomic_dec_and_test(&rh->recovery_in_flight)) |
373 | rh->wakeup_all_recovery_waiters(rh->context); | 379 | rh->wakeup_all_recovery_waiters(rh->context); |
374 | up(&rh->recovery_count); | 380 | up(&rh->recovery_count); |
375 | } | 381 | } |
376 | 382 | ||
377 | /* dm_rh_mark_nosync | 383 | /* dm_rh_mark_nosync |
378 | * @ms | 384 | * @ms |
379 | * @bio | 385 | * @bio |
380 | * @done | 386 | * @done |
381 | * @error | 387 | * @error |
382 | * | 388 | * |
383 | * The bio was written on some mirror(s) but failed on other mirror(s). | 389 | * The bio was written on some mirror(s) but failed on other mirror(s). |
384 | * We can successfully endio the bio but should avoid the region being | 390 | * We can successfully endio the bio but should avoid the region being |
385 | * marked clean by setting the state DM_RH_NOSYNC. | 391 | * marked clean by setting the state DM_RH_NOSYNC. |
386 | * | 392 | * |
387 | * This function is _not_ safe in interrupt context! | 393 | * This function is _not_ safe in interrupt context! |
388 | */ | 394 | */ |
389 | void dm_rh_mark_nosync(struct dm_region_hash *rh, | 395 | void dm_rh_mark_nosync(struct dm_region_hash *rh, |
390 | struct bio *bio, unsigned done, int error) | 396 | struct bio *bio, unsigned done, int error) |
391 | { | 397 | { |
392 | unsigned long flags; | 398 | unsigned long flags; |
393 | struct dm_dirty_log *log = rh->log; | 399 | struct dm_dirty_log *log = rh->log; |
394 | struct dm_region *reg; | 400 | struct dm_region *reg; |
395 | region_t region = dm_rh_bio_to_region(rh, bio); | 401 | region_t region = dm_rh_bio_to_region(rh, bio); |
396 | int recovering = 0; | 402 | int recovering = 0; |
397 | 403 | ||
404 | if (bio_empty_barrier(bio)) { | ||
405 | rh->barrier_failure = 1; | ||
406 | return; | ||
407 | } | ||
408 | |||
398 | /* We must inform the log that the sync count has changed. */ | 409 | /* We must inform the log that the sync count has changed. */ |
399 | log->type->set_region_sync(log, region, 0); | 410 | log->type->set_region_sync(log, region, 0); |
400 | 411 | ||
401 | read_lock(&rh->hash_lock); | 412 | read_lock(&rh->hash_lock); |
402 | reg = __rh_find(rh, region); | 413 | reg = __rh_find(rh, region); |
403 | read_unlock(&rh->hash_lock); | 414 | read_unlock(&rh->hash_lock); |
404 | 415 | ||
405 | /* region hash entry should exist because write was in-flight */ | 416 | /* region hash entry should exist because write was in-flight */ |
406 | BUG_ON(!reg); | 417 | BUG_ON(!reg); |
407 | BUG_ON(!list_empty(&reg->list)); | 418 | BUG_ON(!list_empty(&reg->list)); |
408 | 419 | ||
409 | spin_lock_irqsave(&rh->region_lock, flags); | 420 | spin_lock_irqsave(&rh->region_lock, flags); |
410 | /* | 421 | /* |
411 | * Possible cases: | 422 | * Possible cases: |
412 | * 1) DM_RH_DIRTY | 423 | * 1) DM_RH_DIRTY |
413 | * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed | 424 | * 2) DM_RH_NOSYNC: was dirty, other preceding writes failed |
414 | * 3) DM_RH_RECOVERING: flushing pending writes | 425 | * 3) DM_RH_RECOVERING: flushing pending writes |
415 | * In either case, the region should not have been connected to the list. | 426 | * In either case, the region should not have been connected to the list. |
416 | */ | 427 | */ |
417 | recovering = (reg->state == DM_RH_RECOVERING); | 428 | recovering = (reg->state == DM_RH_RECOVERING); |
418 | reg->state = DM_RH_NOSYNC; | 429 | reg->state = DM_RH_NOSYNC; |
419 | BUG_ON(!list_empty(&reg->list)); | 430 | BUG_ON(!list_empty(&reg->list)); |
420 | spin_unlock_irqrestore(&rh->region_lock, flags); | 431 | spin_unlock_irqrestore(&rh->region_lock, flags); |
421 | 432 | ||
422 | bio_endio(bio, error); | 433 | bio_endio(bio, error); |
423 | if (recovering) | 434 | if (recovering) |
424 | complete_resync_work(reg, 0); | 435 | complete_resync_work(reg, 0); |
425 | } | 436 | } |
426 | EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); | 437 | EXPORT_SYMBOL_GPL(dm_rh_mark_nosync); |
427 | 438 | ||
428 | void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) | 439 | void dm_rh_update_states(struct dm_region_hash *rh, int errors_handled) |
429 | { | 440 | { |
430 | struct dm_region *reg, *next; | 441 | struct dm_region *reg, *next; |
431 | 442 | ||
432 | LIST_HEAD(clean); | 443 | LIST_HEAD(clean); |
433 | LIST_HEAD(recovered); | 444 | LIST_HEAD(recovered); |
434 | LIST_HEAD(failed_recovered); | 445 | LIST_HEAD(failed_recovered); |
435 | 446 | ||
436 | /* | 447 | /* |
437 | * Quickly grab the lists. | 448 | * Quickly grab the lists. |
438 | */ | 449 | */ |
439 | write_lock_irq(&rh->hash_lock); | 450 | write_lock_irq(&rh->hash_lock); |
440 | spin_lock(&rh->region_lock); | 451 | spin_lock(&rh->region_lock); |
441 | if (!list_empty(&rh->clean_regions)) { | 452 | if (!list_empty(&rh->clean_regions)) { |
442 | list_splice_init(&rh->clean_regions, &clean); | 453 | list_splice_init(&rh->clean_regions, &clean); |
443 | 454 | ||
444 | list_for_each_entry(reg, &clean, list) | 455 | list_for_each_entry(reg, &clean, list) |
445 | list_del(&reg->hash_list); | 456 | list_del(&reg->hash_list); |
446 | } | 457 | } |
447 | 458 | ||
448 | if (!list_empty(&rh->recovered_regions)) { | 459 | if (!list_empty(&rh->recovered_regions)) { |
449 | list_splice_init(&rh->recovered_regions, &recovered); | 460 | list_splice_init(&rh->recovered_regions, &recovered); |
450 | 461 | ||
451 | list_for_each_entry(reg, &recovered, list) | 462 | list_for_each_entry(reg, &recovered, list) |
452 | list_del(&reg->hash_list); | 463 | list_del(&reg->hash_list); |
453 | } | 464 | } |
454 | 465 | ||
455 | if (!list_empty(&rh->failed_recovered_regions)) { | 466 | if (!list_empty(&rh->failed_recovered_regions)) { |
456 | list_splice_init(&rh->failed_recovered_regions, | 467 | list_splice_init(&rh->failed_recovered_regions, |
457 | &failed_recovered); | 468 | &failed_recovered); |
458 | 469 | ||
459 | list_for_each_entry(reg, &failed_recovered, list) | 470 | list_for_each_entry(reg, &failed_recovered, list) |
460 | list_del(®->hash_list); | 471 | list_del(®->hash_list); |
461 | } | 472 | } |
462 | 473 | ||
463 | spin_unlock(&rh->region_lock); | 474 | spin_unlock(&rh->region_lock); |
464 | write_unlock_irq(&rh->hash_lock); | 475 | write_unlock_irq(&rh->hash_lock); |
465 | 476 | ||
466 | /* | 477 | /* |
467 | * All the regions on the recovered and clean lists have | 478 | * All the regions on the recovered and clean lists have |
468 | * now been pulled out of the system, so no need to do | 479 | * now been pulled out of the system, so no need to do |
469 | * any more locking. | 480 | * any more locking. |
470 | */ | 481 | */ |
471 | list_for_each_entry_safe(reg, next, &recovered, list) { | 482 | list_for_each_entry_safe(reg, next, &recovered, list) { |
472 | rh->log->type->clear_region(rh->log, reg->key); | 483 | rh->log->type->clear_region(rh->log, reg->key); |
473 | complete_resync_work(reg, 1); | 484 | complete_resync_work(reg, 1); |
474 | mempool_free(reg, rh->region_pool); | 485 | mempool_free(reg, rh->region_pool); |
475 | } | 486 | } |
476 | 487 | ||
477 | list_for_each_entry_safe(reg, next, &failed_recovered, list) { | 488 | list_for_each_entry_safe(reg, next, &failed_recovered, list) { |
478 | complete_resync_work(reg, errors_handled ? 0 : 1); | 489 | complete_resync_work(reg, errors_handled ? 0 : 1); |
479 | mempool_free(reg, rh->region_pool); | 490 | mempool_free(reg, rh->region_pool); |
480 | } | 491 | } |
481 | 492 | ||
482 | list_for_each_entry_safe(reg, next, &clean, list) { | 493 | list_for_each_entry_safe(reg, next, &clean, list) { |
483 | rh->log->type->clear_region(rh->log, reg->key); | 494 | rh->log->type->clear_region(rh->log, reg->key); |
484 | mempool_free(reg, rh->region_pool); | 495 | mempool_free(reg, rh->region_pool); |
485 | } | 496 | } |
486 | 497 | ||
487 | rh->log->type->flush(rh->log); | 498 | rh->log->type->flush(rh->log); |
488 | } | 499 | } |
489 | EXPORT_SYMBOL_GPL(dm_rh_update_states); | 500 | EXPORT_SYMBOL_GPL(dm_rh_update_states); |
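A mirror worker is expected to call dm_rh_update_states() once per pass, before starting new recovery or dispatching queued I/O, so that finished regions are retired and the dirty log is flushed. A minimal sketch, not part of this commit, with a hypothetical do_mirror_work() (interfaces from <linux/dm-region-hash.h>):

static void do_mirror_work(struct dm_region_hash *rh, int errors_handled)
{
	/*
	 * Drains the clean/recovered/failed-recovered lists and
	 * flushes the dirty log, exactly as dm_rh_update_states()
	 * above does.
	 */
	dm_rh_update_states(rh, errors_handled);

	/* ... then prepare recovery and process delayed reads/writes ... */
}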
490 | 501 | ||
491 | static void rh_inc(struct dm_region_hash *rh, region_t region) | 502 | static void rh_inc(struct dm_region_hash *rh, region_t region) |
492 | { | 503 | { |
493 | struct dm_region *reg; | 504 | struct dm_region *reg; |
494 | 505 | ||
495 | read_lock(&rh->hash_lock); | 506 | read_lock(&rh->hash_lock); |
496 | reg = __rh_find(rh, region); | 507 | reg = __rh_find(rh, region); |
497 | 508 | ||
498 | spin_lock_irq(&rh->region_lock); | 509 | spin_lock_irq(&rh->region_lock); |
499 | atomic_inc(®->pending); | 510 | atomic_inc(®->pending); |
500 | 511 | ||
501 | if (reg->state == DM_RH_CLEAN) { | 512 | if (reg->state == DM_RH_CLEAN) { |
502 | reg->state = DM_RH_DIRTY; | 513 | reg->state = DM_RH_DIRTY; |
503 | list_del_init(®->list); /* take off the clean list */ | 514 | list_del_init(®->list); /* take off the clean list */ |
504 | spin_unlock_irq(&rh->region_lock); | 515 | spin_unlock_irq(&rh->region_lock); |
505 | 516 | ||
506 | rh->log->type->mark_region(rh->log, reg->key); | 517 | rh->log->type->mark_region(rh->log, reg->key); |
507 | } else | 518 | } else |
508 | spin_unlock_irq(&rh->region_lock); | 519 | spin_unlock_irq(&rh->region_lock); |
509 | 520 | ||
510 | 521 | ||
511 | read_unlock(&rh->hash_lock); | 522 | read_unlock(&rh->hash_lock); |
512 | } | 523 | } |
513 | 524 | ||
514 | void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) | 525 | void dm_rh_inc_pending(struct dm_region_hash *rh, struct bio_list *bios) |
515 | { | 526 | { |
516 | struct bio *bio; | 527 | struct bio *bio; |
517 | 528 | ||
518 | for (bio = bios->head; bio; bio = bio->bi_next) | 529 | for (bio = bios->head; bio; bio = bio->bi_next) { |
530 | if (bio_empty_barrier(bio)) | ||
531 | continue; | ||
519 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); | 532 | rh_inc(rh, dm_rh_bio_to_region(rh, bio)); |
533 | } | ||
520 | } | 534 | } |
521 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); | 535 | EXPORT_SYMBOL_GPL(dm_rh_inc_pending); |
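The new bio_empty_barrier() check above lets callers keep flush requests in the same bio_list as ordinary writes: an empty barrier carries no data and maps to no region, so it must not take a pending reference. Per the commit message, the caller instead fans such a barrier out to every mirror device via dm-io. A minimal sketch of that fan-out, not part of this commit, assuming hypothetical leg bookkeeping (legs, nr_legs, client) and the dm_io() / struct dm_io_request / struct dm_io_region interfaces from <linux/dm-io.h>:

static int flush_all_legs(struct block_device **legs, unsigned nr_legs,
			  struct dm_io_client *client)
{
	struct dm_io_region io[nr_legs];
	struct dm_io_request req = {
		.bi_rw = WRITE_BARRIER,
		.mem.type = DM_IO_KMEM,
		.mem.ptr.addr = NULL,	/* zero-length: nothing to transfer */
		.client = client,
	};
	unsigned long error_bits = 0;
	unsigned i;

	for (i = 0; i < nr_legs; i++) {
		io[i].bdev = legs[i];
		io[i].sector = 0;
		io[i].count = 0;	/* empty barrier */
	}

	/* Synchronous dm-io call; a set bit in error_bits marks a failed leg. */
	if (dm_io(&req, nr_legs, io, &error_bits))
		return -EIO;

	return 0;
}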
522 | 536 | ||
523 | void dm_rh_dec(struct dm_region_hash *rh, region_t region) | 537 | void dm_rh_dec(struct dm_region_hash *rh, region_t region) |
524 | { | 538 | { |
525 | unsigned long flags; | 539 | unsigned long flags; |
526 | struct dm_region *reg; | 540 | struct dm_region *reg; |
527 | int should_wake = 0; | 541 | int should_wake = 0; |
528 | 542 | ||
529 | read_lock(&rh->hash_lock); | 543 | read_lock(&rh->hash_lock); |
530 | reg = __rh_lookup(rh, region); | 544 | reg = __rh_lookup(rh, region); |
531 | read_unlock(&rh->hash_lock); | 545 | read_unlock(&rh->hash_lock); |
532 | 546 | ||
533 | spin_lock_irqsave(&rh->region_lock, flags); | 547 | spin_lock_irqsave(&rh->region_lock, flags); |
534 | if (atomic_dec_and_test(®->pending)) { | 548 | if (atomic_dec_and_test(®->pending)) { |
535 | /* | 549 | /* |
536 | * There is no pending I/O for this region. | 550 | * There is no pending I/O for this region. |
537 | * We can move the region to the corresponding list for the next action. | 551 | * We can move the region to the corresponding list for the next action. |
538 | * At this point, the region is not yet connected to any list. | 552 | * At this point, the region is not yet connected to any list. |
539 | * | 553 | * |
540 | * If the state is DM_RH_NOSYNC, the region should be kept off | 554 | * If the state is DM_RH_NOSYNC, the region should be kept off |
541 | * the clean list. | 555 | * the clean list. |
542 | * The hash entry for DM_RH_NOSYNC will remain in memory | 556 | * The hash entry for DM_RH_NOSYNC will remain in memory |
543 | * until the region is recovered or the map is reloaded. | 557 | * until the region is recovered or the map is reloaded. |
544 | */ | 558 | */ |
545 | 559 | ||
546 | /* do nothing for DM_RH_NOSYNC */ | 560 | /* do nothing for DM_RH_NOSYNC */ |
547 | if (reg->state == DM_RH_RECOVERING) { | 561 | if (unlikely(rh->barrier_failure)) { |
562 | /* | ||
563 | * If a write barrier failed some time ago, we | ||
564 | * don't know whether or not this write made it | ||
565 | * to the disk, so we must resync the device. | ||
566 | */ | ||
567 | reg->state = DM_RH_NOSYNC; | ||
568 | } else if (reg->state == DM_RH_RECOVERING) { | ||
548 | list_add_tail(®->list, &rh->quiesced_regions); | 569 | list_add_tail(®->list, &rh->quiesced_regions); |
549 | } else if (reg->state == DM_RH_DIRTY) { | 570 | } else if (reg->state == DM_RH_DIRTY) { |
550 | reg->state = DM_RH_CLEAN; | 571 | reg->state = DM_RH_CLEAN; |
551 | list_add(®->list, &rh->clean_regions); | 572 | list_add(®->list, &rh->clean_regions); |
552 | } | 573 | } |
553 | should_wake = 1; | 574 | should_wake = 1; |
554 | } | 575 | } |
555 | spin_unlock_irqrestore(&rh->region_lock, flags); | 576 | spin_unlock_irqrestore(&rh->region_lock, flags); |
556 | 577 | ||
557 | if (should_wake) | 578 | if (should_wake) |
558 | rh->wakeup_workers(rh->context); | 579 | rh->wakeup_workers(rh->context); |
559 | } | 580 | } |
560 | EXPORT_SYMBOL_GPL(dm_rh_dec); | 581 | EXPORT_SYMBOL_GPL(dm_rh_dec); |
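For ordinary writes, dm_rh_dec() is the matching half of dm_rh_inc_pending(): one reference is taken per bio before submission and dropped on completion, at which point the region moves to the clean, quiesced or, after a failed barrier, nosync state. A minimal sketch of that lifecycle, not part of this commit; issue_writes() and write_done() are hypothetical, and a real mirror clones each bio to every leg rather than calling generic_make_request() once:

static void issue_writes(struct dm_region_hash *rh, struct bio_list *writes)
{
	struct bio *bio;

	/* Empty barriers are skipped inside dm_rh_inc_pending(). */
	dm_rh_inc_pending(rh, writes);

	while ((bio = bio_list_pop(writes)))
		generic_make_request(bio);
}

static void write_done(struct dm_region_hash *rh, struct bio *bio, int error)
{
	/* Mirror the skip above: an empty barrier holds no reference. */
	if (!bio_empty_barrier(bio))
		dm_rh_dec(rh, dm_rh_bio_to_region(rh, bio));

	bio_endio(bio, error);
}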
561 | 582 | ||
562 | /* | 583 | /* |
563 | * Starts quiescing a region in preparation for recovery. | 584 | * Starts quiescing a region in preparation for recovery. |
564 | */ | 585 | */ |
565 | static int __rh_recovery_prepare(struct dm_region_hash *rh) | 586 | static int __rh_recovery_prepare(struct dm_region_hash *rh) |
566 | { | 587 | { |
567 | int r; | 588 | int r; |
568 | region_t region; | 589 | region_t region; |
569 | struct dm_region *reg; | 590 | struct dm_region *reg; |
570 | 591 | ||
571 | /* | 592 | /* |
572 | * Ask the dirty log what's next. | 593 | * Ask the dirty log what's next. |
573 | */ | 594 | */ |
574 | r = rh->log->type->get_resync_work(rh->log, ®ion); | 595 | r = rh->log->type->get_resync_work(rh->log, ®ion); |
575 | if (r <= 0) | 596 | if (r <= 0) |
576 | return r; | 597 | return r; |
577 | 598 | ||
578 | /* | 599 | /* |
579 | * Get this region, and start it quiescing by setting the | 600 | * Get this region, and start it quiescing by setting the |
580 | * recovering flag. | 601 | * recovering flag. |
581 | */ | 602 | */ |
582 | read_lock(&rh->hash_lock); | 603 | read_lock(&rh->hash_lock); |
583 | reg = __rh_find(rh, region); | 604 | reg = __rh_find(rh, region); |
584 | read_unlock(&rh->hash_lock); | 605 | read_unlock(&rh->hash_lock); |
585 | 606 | ||
586 | spin_lock_irq(&rh->region_lock); | 607 | spin_lock_irq(&rh->region_lock); |
587 | reg->state = DM_RH_RECOVERING; | 608 | reg->state = DM_RH_RECOVERING; |
588 | 609 | ||
589 | /* Already quiesced ? */ | 610 | /* Already quiesced ? */ |
590 | if (atomic_read(®->pending)) | 611 | if (atomic_read(®->pending)) |
591 | list_del_init(®->list); | 612 | list_del_init(®->list); |
592 | else | 613 | else |
593 | list_move(®->list, &rh->quiesced_regions); | 614 | list_move(®->list, &rh->quiesced_regions); |
594 | 615 | ||
595 | spin_unlock_irq(&rh->region_lock); | 616 | spin_unlock_irq(&rh->region_lock); |
596 | 617 | ||
597 | return 1; | 618 | return 1; |
598 | } | 619 | } |
599 | 620 | ||
600 | void dm_rh_recovery_prepare(struct dm_region_hash *rh) | 621 | void dm_rh_recovery_prepare(struct dm_region_hash *rh) |
601 | { | 622 | { |
602 | /* Extra reference to avoid race with dm_rh_stop_recovery */ | 623 | /* Extra reference to avoid race with dm_rh_stop_recovery */ |
603 | atomic_inc(&rh->recovery_in_flight); | 624 | atomic_inc(&rh->recovery_in_flight); |
604 | 625 | ||
605 | while (!down_trylock(&rh->recovery_count)) { | 626 | while (!down_trylock(&rh->recovery_count)) { |
606 | atomic_inc(&rh->recovery_in_flight); | 627 | atomic_inc(&rh->recovery_in_flight); |
607 | if (__rh_recovery_prepare(rh) <= 0) { | 628 | if (__rh_recovery_prepare(rh) <= 0) { |
608 | atomic_dec(&rh->recovery_in_flight); | 629 | atomic_dec(&rh->recovery_in_flight); |
609 | up(&rh->recovery_count); | 630 | up(&rh->recovery_count); |
610 | break; | 631 | break; |
611 | } | 632 | } |
612 | } | 633 | } |
613 | 634 | ||
614 | /* Drop the extra reference */ | 635 | /* Drop the extra reference */ |
615 | if (atomic_dec_and_test(&rh->recovery_in_flight)) | 636 | if (atomic_dec_and_test(&rh->recovery_in_flight)) |
616 | rh->wakeup_all_recovery_waiters(rh->context); | 637 | rh->wakeup_all_recovery_waiters(rh->context); |
617 | } | 638 | } |
618 | EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); | 639 | EXPORT_SYMBOL_GPL(dm_rh_recovery_prepare); |
619 | 640 | ||
620 | /* | 641 | /* |
621 | * Returns any quiesced regions. | 642 | * Returns any quiesced regions. |
622 | */ | 643 | */ |
623 | struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) | 644 | struct dm_region *dm_rh_recovery_start(struct dm_region_hash *rh) |
624 | { | 645 | { |
625 | struct dm_region *reg = NULL; | 646 | struct dm_region *reg = NULL; |
626 | 647 | ||
627 | spin_lock_irq(&rh->region_lock); | 648 | spin_lock_irq(&rh->region_lock); |
628 | if (!list_empty(&rh->quiesced_regions)) { | 649 | if (!list_empty(&rh->quiesced_regions)) { |
629 | reg = list_entry(rh->quiesced_regions.next, | 650 | reg = list_entry(rh->quiesced_regions.next, |
630 | struct dm_region, list); | 651 | struct dm_region, list); |
631 | list_del_init(®->list); /* remove from the quiesced list */ | 652 | list_del_init(®->list); /* remove from the quiesced list */ |
632 | } | 653 | } |
633 | spin_unlock_irq(&rh->region_lock); | 654 | spin_unlock_irq(&rh->region_lock); |
634 | 655 | ||
635 | return reg; | 656 | return reg; |
636 | } | 657 | } |
637 | EXPORT_SYMBOL_GPL(dm_rh_recovery_start); | 658 | EXPORT_SYMBOL_GPL(dm_rh_recovery_start); |
638 | 659 | ||
639 | void dm_rh_recovery_end(struct dm_region *reg, int success) | 660 | void dm_rh_recovery_end(struct dm_region *reg, int success) |
640 | { | 661 | { |
641 | struct dm_region_hash *rh = reg->rh; | 662 | struct dm_region_hash *rh = reg->rh; |
642 | 663 | ||
643 | spin_lock_irq(&rh->region_lock); | 664 | spin_lock_irq(&rh->region_lock); |
644 | if (success) | 665 | if (success) |
645 | list_add(®->list, ®->rh->recovered_regions); | 666 | list_add(®->list, ®->rh->recovered_regions); |
646 | else { | 667 | else { |
647 | reg->state = DM_RH_NOSYNC; | 668 | reg->state = DM_RH_NOSYNC; |
648 | list_add(®->list, ®->rh->failed_recovered_regions); | 669 | list_add(®->list, ®->rh->failed_recovered_regions); |
649 | } | 670 | } |
650 | spin_unlock_irq(&rh->region_lock); | 671 | spin_unlock_irq(&rh->region_lock); |
651 | 672 | ||
652 | rh->wakeup_workers(rh->context); | 673 | rh->wakeup_workers(rh->context); |
653 | } | 674 | } |
654 | EXPORT_SYMBOL_GPL(dm_rh_recovery_end); | 675 | EXPORT_SYMBOL_GPL(dm_rh_recovery_end); |
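Recovery follows a three-step handshake: dm_rh_recovery_prepare() flags regions as DM_RH_RECOVERING, dm_rh_recovery_start() hands back regions whose in-flight writes have drained, and dm_rh_recovery_end() files the result on the recovered or failed_recovered list. A minimal sketch of a worker driving it, not part of this commit; recover_region() is a hypothetical stand-in for the real kcopyd-based copy:

static void recover_region(struct dm_region *reg)
{
	/*
	 * A real mirror copies the region from an in-sync leg with
	 * kcopyd and calls dm_rh_recovery_end() from the completion
	 * callback; this stub just reports success immediately.
	 */
	dm_rh_recovery_end(reg, 1);
}

static void do_recovery(struct dm_region_hash *rh)
{
	struct dm_region *reg;

	/* Flag up to max_recovery regions as DM_RH_RECOVERING. */
	dm_rh_recovery_prepare(rh);

	/* Hand over regions whose pending writes have drained. */
	while ((reg = dm_rh_recovery_start(rh)))
		recover_region(reg);
}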
655 | 676 | ||
656 | /* Return recovery in flight count. */ | 677 | /* Return recovery in flight count. */ |
657 | int dm_rh_recovery_in_flight(struct dm_region_hash *rh) | 678 | int dm_rh_recovery_in_flight(struct dm_region_hash *rh) |
658 | { | 679 | { |
659 | return atomic_read(&rh->recovery_in_flight); | 680 | return atomic_read(&rh->recovery_in_flight); |
660 | } | 681 | } |
661 | EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); | 682 | EXPORT_SYMBOL_GPL(dm_rh_recovery_in_flight); |
662 | 683 | ||
663 | int dm_rh_flush(struct dm_region_hash *rh) | 684 | int dm_rh_flush(struct dm_region_hash *rh) |
664 | { | 685 | { |
665 | return rh->log->type->flush(rh->log); | 686 | return rh->log->type->flush(rh->log); |
666 | } | 687 | } |
667 | EXPORT_SYMBOL_GPL(dm_rh_flush); | 688 | EXPORT_SYMBOL_GPL(dm_rh_flush); |
668 | 689 | ||
669 | void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) | 690 | void dm_rh_delay(struct dm_region_hash *rh, struct bio *bio) |
670 | { | 691 | { |
671 | struct dm_region *reg; | 692 | struct dm_region *reg; |
672 | 693 | ||
673 | read_lock(&rh->hash_lock); | 694 | read_lock(&rh->hash_lock); |
674 | reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); | 695 | reg = __rh_find(rh, dm_rh_bio_to_region(rh, bio)); |
675 | bio_list_add(®->delayed_bios, bio); | 696 | bio_list_add(®->delayed_bios, bio); |
676 | read_unlock(&rh->hash_lock); | 697 | read_unlock(&rh->hash_lock); |
677 | } | 698 | } |
678 | EXPORT_SYMBOL_GPL(dm_rh_delay); | 699 | EXPORT_SYMBOL_GPL(dm_rh_delay); |
679 | 700 | ||
680 | void dm_rh_stop_recovery(struct dm_region_hash *rh) | 701 | void dm_rh_stop_recovery(struct dm_region_hash *rh) |
681 | { | 702 | { |
682 | int i; | 703 | int i; |
683 | 704 | ||
684 | /* wait for any recovering regions */ | 705 | /* wait for any recovering regions */ |
685 | for (i = 0; i < rh->max_recovery; i++) | 706 | for (i = 0; i < rh->max_recovery; i++) |
686 | down(&rh->recovery_count); | 707 | down(&rh->recovery_count); |
687 | } | 708 | } |
688 | EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); | 709 | EXPORT_SYMBOL_GPL(dm_rh_stop_recovery); |
689 | 710 | ||
690 | void dm_rh_start_recovery(struct dm_region_hash *rh) | 711 | void dm_rh_start_recovery(struct dm_region_hash *rh) |
691 | { | 712 | { |
692 | int i; | 713 | int i; |
693 | 714 | ||
694 | for (i = 0; i < rh->max_recovery; i++) | 715 | for (i = 0; i < rh->max_recovery; i++) |
695 | up(&rh->recovery_count); | 716 | up(&rh->recovery_count); |
696 | 717 | ||
697 | rh->wakeup_workers(rh->context); | 718 | rh->wakeup_workers(rh->context); |
698 | } | 719 | } |
699 | EXPORT_SYMBOL_GPL(dm_rh_start_recovery); | 720 | EXPORT_SYMBOL_GPL(dm_rh_start_recovery); |
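The stop/start pair is typically wired to the target's presuspend/resume hooks so that no region is mid-recovery while the device is suspended. A minimal sketch, not part of this commit, with hypothetical hook names:

static void my_presuspend(struct dm_region_hash *rh)
{
	dm_rh_stop_recovery(rh);	/* blocks until in-flight recovery drains */
}

static void my_resume(struct dm_region_hash *rh)
{
	dm_rh_start_recovery(rh);	/* re-opens the recovery_count gate */
}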
700 | 721 | ||
701 | MODULE_DESCRIPTION(DM_NAME " region hash"); | 722 | MODULE_DESCRIPTION(DM_NAME " region hash"); |
702 | MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); | 723 | MODULE_AUTHOR("Joe Thornber/Heinz Mauelshagen <dm-devel@redhat.com>"); |
703 | MODULE_LICENSE("GPL"); | 724 | MODULE_LICENSE("GPL"); |
704 | 725 |