Commit ed8b752bccf2560e305e25125721d2f0ac759e88

Authored by Mike Snitzer
Committed by Alasdair G Kergon
1 parent 772ae5f54d

dm table: set flush capability based on underlying devices

DM has always advertised both REQ_FLUSH and REQ_FUA flush capabilities
regardless of whether or not a given DM device's underlying devices
also advertised a need for them.

Block's flush-merge changes from 2.6.39 have proven to be more costly
for DM devices.  Performance regressions have been reported even when
DM's underlying devices do not advertise that they have a write cache.

Fix the performance regressions by configuring a DM device's flushing
capabilities based on the flushing capabilities of its underlying devices.

Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
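
The hunks that make up the 43 additions fall later in dm-table.c than the context
reproduced below, so here is a rough sketch only of the shape of the change the
message describes: iterate each target's underlying devices and advertise
REQ_FLUSH/REQ_FUA on the DM queue only if at least one of those devices can honour
it. The helper names (device_flush_capable, dm_table_supports_flush,
dm_table_set_flush) and their placement are assumptions for illustration, not
necessarily the exact code of this commit; the sketch leans on iterate_devices,
dm_table_get_num_targets() and dm_table_get_target() from this file, plus
blk_queue_flush() and q->flush_flags from the block layer of this era.

/*
 * Hedged sketch only: helper names and placement are illustrative.
 *
 * Per-device callback for ti->type->iterate_devices(): returns non-zero
 * if the underlying queue advertises the requested flush flag(s).
 */
static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
                                sector_t start, sector_t len, void *data)
{
        unsigned flush = *(unsigned *)data;
        struct request_queue *q = bdev_get_queue(dev->bdev);

        return q && (q->flush_flags & flush);
}

/*
 * Ask every target whether at least one of its underlying devices
 * supports the given flush capability (REQ_FLUSH and/or REQ_FUA).
 */
static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
{
        struct dm_target *ti;
        unsigned i = 0;

        while (i < dm_table_get_num_targets(t)) {
                ti = dm_table_get_target(t, i++);

                /* Targets that never issue flushes don't count. */
                if (!ti->num_flush_requests)
                        continue;

                if (ti->type->iterate_devices &&
                    ti->type->iterate_devices(ti, device_flush_capable, &flush))
                        return true;
        }

        return false;
}

/*
 * Assumed to be called from dm_table_set_restrictions(): advertise only
 * what the table's devices can honour, instead of the previous
 * unconditional REQ_FLUSH | REQ_FUA.
 */
static void dm_table_set_flush(struct dm_table *t, struct request_queue *q)
{
        unsigned flush = 0;

        if (dm_table_supports_flush(t, REQ_FLUSH)) {
                flush |= REQ_FLUSH;
                if (dm_table_supports_flush(t, REQ_FUA))
                        flush |= REQ_FUA;
        }

        blk_queue_flush(q, flush);
}

Probing each capability separately keeps REQ_FUA from being advertised on tables
whose devices only support plain flushes.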

Showing 2 changed files with 43 additions and 1 deletion

drivers/md/dm-table.c
/*
 * Copyright (C) 2001 Sistina Software (UK) Limited.
 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
 *
 * This file is released under the GPL.
 */

#include "dm.h"

#include <linux/module.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/namei.h>
#include <linux/ctype.h>
#include <linux/string.h>
#include <linux/slab.h>
#include <linux/interrupt.h>
#include <linux/mutex.h>
#include <linux/delay.h>
#include <linux/atomic.h>

#define DM_MSG_PREFIX "table"

#define MAX_DEPTH 16
#define NODE_SIZE L1_CACHE_BYTES
#define KEYS_PER_NODE (NODE_SIZE / sizeof(sector_t))
#define CHILDREN_PER_NODE (KEYS_PER_NODE + 1)

/*
 * The table has always exactly one reference from either mapped_device->map
 * or hash_cell->new_map. This reference is not counted in table->holders.
 * A pair of dm_create_table/dm_destroy_table functions is used for table
 * creation/destruction.
 *
 * Temporary references from the other code increase table->holders. A pair
 * of dm_table_get/dm_table_put functions is used to manipulate it.
 *
 * When the table is about to be destroyed, we wait for table->holders to
 * drop to zero.
 */

struct dm_table {
        struct mapped_device *md;
        atomic_t holders;
        unsigned type;

        /* btree table */
        unsigned int depth;
        unsigned int counts[MAX_DEPTH]; /* in nodes */
        sector_t *index[MAX_DEPTH];

        unsigned int num_targets;
        unsigned int num_allocated;
        sector_t *highs;
        struct dm_target *targets;

        unsigned integrity_supported:1;

        /*
         * Indicates the rw permissions for the new logical
         * device. This should be a combination of FMODE_READ
         * and FMODE_WRITE.
         */
        fmode_t mode;

        /* a list of devices used by this table */
        struct list_head devices;

        /* events get handed up using this callback */
        void (*event_fn)(void *);
        void *event_context;

        struct dm_md_mempools *mempools;

        struct list_head target_callbacks;
};

/*
 * Similar to ceiling(log_size(n))
 */
static unsigned int int_log(unsigned int n, unsigned int base)
{
        int result = 0;

        while (n > 1) {
                n = dm_div_up(n, base);
                result++;
        }

        return result;
}

/*
 * Calculate the index of the child node of the n'th node k'th key.
 */
static inline unsigned int get_child(unsigned int n, unsigned int k)
{
        return (n * CHILDREN_PER_NODE) + k;
}

/*
 * Return the n'th node of level l from table t.
 */
static inline sector_t *get_node(struct dm_table *t,
                                 unsigned int l, unsigned int n)
{
        return t->index[l] + (n * KEYS_PER_NODE);
}

/*
 * Return the highest key that you could lookup from the n'th
 * node on level l of the btree.
 */
static sector_t high(struct dm_table *t, unsigned int l, unsigned int n)
{
        for (; l < t->depth - 1; l++)
                n = get_child(n, CHILDREN_PER_NODE - 1);

        if (n >= t->counts[l])
                return (sector_t) - 1;

        return get_node(t, l, n)[KEYS_PER_NODE - 1];
}

/*
 * Fills in a level of the btree based on the highs of the level
 * below it.
 */
static int setup_btree_index(unsigned int l, struct dm_table *t)
{
        unsigned int n, k;
        sector_t *node;

        for (n = 0U; n < t->counts[l]; n++) {
                node = get_node(t, l, n);

                for (k = 0U; k < KEYS_PER_NODE; k++)
                        node[k] = high(t, l + 1, get_child(n, k));
        }

        return 0;
}

void *dm_vcalloc(unsigned long nmemb, unsigned long elem_size)
{
        unsigned long size;
        void *addr;

        /*
         * Check that we're not going to overflow.
         */
        if (nmemb > (ULONG_MAX / elem_size))
                return NULL;

        size = nmemb * elem_size;
        addr = vzalloc(size);

        return addr;
}
EXPORT_SYMBOL(dm_vcalloc);

/*
 * highs, and targets are managed as dynamic arrays during a
 * table load.
 */
static int alloc_targets(struct dm_table *t, unsigned int num)
{
        sector_t *n_highs;
        struct dm_target *n_targets;
        int n = t->num_targets;

        /*
         * Allocate both the target array and offset array at once.
         * Append an empty entry to catch sectors beyond the end of
         * the device.
         */
        n_highs = (sector_t *) dm_vcalloc(num + 1, sizeof(struct dm_target) +
                                          sizeof(sector_t));
        if (!n_highs)
                return -ENOMEM;

        n_targets = (struct dm_target *) (n_highs + num);

        if (n) {
                memcpy(n_highs, t->highs, sizeof(*n_highs) * n);
                memcpy(n_targets, t->targets, sizeof(*n_targets) * n);
        }

        memset(n_highs + n, -1, sizeof(*n_highs) * (num - n));
        vfree(t->highs);

        t->num_allocated = num;
        t->highs = n_highs;
        t->targets = n_targets;

        return 0;
}

int dm_table_create(struct dm_table **result, fmode_t mode,
                    unsigned num_targets, struct mapped_device *md)
{
        struct dm_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

        if (!t)
                return -ENOMEM;

        INIT_LIST_HEAD(&t->devices);
        INIT_LIST_HEAD(&t->target_callbacks);
        atomic_set(&t->holders, 0);

        if (!num_targets)
                num_targets = KEYS_PER_NODE;

        num_targets = dm_round_up(num_targets, KEYS_PER_NODE);

        if (alloc_targets(t, num_targets)) {
                kfree(t);
                t = NULL;
                return -ENOMEM;
        }

        t->mode = mode;
        t->md = md;
        *result = t;
        return 0;
}

static void free_devices(struct list_head *devices)
{
        struct list_head *tmp, *next;

        list_for_each_safe(tmp, next, devices) {
                struct dm_dev_internal *dd =
                    list_entry(tmp, struct dm_dev_internal, list);
                DMWARN("dm_table_destroy: dm_put_device call missing for %s",
                       dd->dm_dev.name);
                kfree(dd);
        }
}

void dm_table_destroy(struct dm_table *t)
{
        unsigned int i;

        if (!t)
                return;

        while (atomic_read(&t->holders))
                msleep(1);
        smp_mb();

        /* free the indexes */
        if (t->depth >= 2)
                vfree(t->index[t->depth - 2]);

        /* free the targets */
        for (i = 0; i < t->num_targets; i++) {
                struct dm_target *tgt = t->targets + i;

                if (tgt->type->dtr)
                        tgt->type->dtr(tgt);

                dm_put_target_type(tgt->type);
        }

        vfree(t->highs);

        /* free the device list */
        if (t->devices.next != &t->devices)
                free_devices(&t->devices);

        dm_free_md_mempools(t->mempools);

        kfree(t);
}

void dm_table_get(struct dm_table *t)
{
        atomic_inc(&t->holders);
}
EXPORT_SYMBOL(dm_table_get);

void dm_table_put(struct dm_table *t)
{
        if (!t)
                return;

        smp_mb__before_atomic_dec();
        atomic_dec(&t->holders);
}
EXPORT_SYMBOL(dm_table_put);

/*
 * Checks to see if we need to extend highs or targets.
 */
static inline int check_space(struct dm_table *t)
{
        if (t->num_targets >= t->num_allocated)
                return alloc_targets(t, t->num_allocated * 2);

        return 0;
}

/*
 * See if we've already got a device in the list.
 */
static struct dm_dev_internal *find_device(struct list_head *l, dev_t dev)
{
        struct dm_dev_internal *dd;

        list_for_each_entry (dd, l, list)
                if (dd->dm_dev.bdev->bd_dev == dev)
                        return dd;

        return NULL;
}

/*
 * Open a device so we can use it as a map destination.
 */
static int open_dev(struct dm_dev_internal *d, dev_t dev,
                    struct mapped_device *md)
{
        static char *_claim_ptr = "I belong to device-mapper";
        struct block_device *bdev;

        int r;

        BUG_ON(d->dm_dev.bdev);

        bdev = blkdev_get_by_dev(dev, d->dm_dev.mode | FMODE_EXCL, _claim_ptr);
        if (IS_ERR(bdev))
                return PTR_ERR(bdev);

        r = bd_link_disk_holder(bdev, dm_disk(md));
        if (r) {
                blkdev_put(bdev, d->dm_dev.mode | FMODE_EXCL);
                return r;
        }

        d->dm_dev.bdev = bdev;
        return 0;
}

/*
 * Close a device that we've been using.
 */
static void close_dev(struct dm_dev_internal *d, struct mapped_device *md)
{
        if (!d->dm_dev.bdev)
                return;

        bd_unlink_disk_holder(d->dm_dev.bdev, dm_disk(md));
        blkdev_put(d->dm_dev.bdev, d->dm_dev.mode | FMODE_EXCL);
        d->dm_dev.bdev = NULL;
}

/*
 * If possible, this checks an area of a destination device is invalid.
 */
static int device_area_is_invalid(struct dm_target *ti, struct dm_dev *dev,
                                  sector_t start, sector_t len, void *data)
{
        struct request_queue *q;
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
        sector_t dev_size =
                i_size_read(bdev->bd_inode) >> SECTOR_SHIFT;
        unsigned short logical_block_size_sectors =
                limits->logical_block_size >> SECTOR_SHIFT;
        char b[BDEVNAME_SIZE];

        /*
         * Some devices exist without request functions,
         * such as loop devices not yet bound to backing files.
         * Forbid the use of such devices.
         */
        q = bdev_get_queue(bdev);
        if (!q || !q->make_request_fn) {
                DMWARN("%s: %s is not yet initialised: "
                       "start=%llu, len=%llu, dev_size=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       (unsigned long long)start,
                       (unsigned long long)len,
                       (unsigned long long)dev_size);
                return 1;
        }

        if (!dev_size)
                return 0;

        if ((start >= dev_size) || (start + len > dev_size)) {
                DMWARN("%s: %s too small for target: "
                       "start=%llu, len=%llu, dev_size=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       (unsigned long long)start,
                       (unsigned long long)len,
                       (unsigned long long)dev_size);
                return 1;
        }

        if (logical_block_size_sectors <= 1)
                return 0;

        if (start & (logical_block_size_sectors - 1)) {
                DMWARN("%s: start=%llu not aligned to h/w "
                       "logical block size %u of %s",
                       dm_device_name(ti->table->md),
                       (unsigned long long)start,
                       limits->logical_block_size, bdevname(bdev, b));
                return 1;
        }

        if (len & (logical_block_size_sectors - 1)) {
                DMWARN("%s: len=%llu not aligned to h/w "
                       "logical block size %u of %s",
                       dm_device_name(ti->table->md),
                       (unsigned long long)len,
                       limits->logical_block_size, bdevname(bdev, b));
                return 1;
        }

        return 0;
}

/*
 * This upgrades the mode on an already open dm_dev, being
 * careful to leave things as they were if we fail to reopen the
 * device and not to touch the existing bdev field in case
 * it is accessed concurrently inside dm_table_any_congested().
 */
static int upgrade_mode(struct dm_dev_internal *dd, fmode_t new_mode,
                        struct mapped_device *md)
{
        int r;
        struct dm_dev_internal dd_new, dd_old;

        dd_new = dd_old = *dd;

        dd_new.dm_dev.mode |= new_mode;
        dd_new.dm_dev.bdev = NULL;

        r = open_dev(&dd_new, dd->dm_dev.bdev->bd_dev, md);
        if (r)
                return r;

        dd->dm_dev.mode |= new_mode;
        close_dev(&dd_old, md);

        return 0;
}

/*
 * Add a device to the list, or just increment the usage count if
 * it's already present.
 */
int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
                  struct dm_dev **result)
{
        int r;
        dev_t uninitialized_var(dev);
        struct dm_dev_internal *dd;
        unsigned int major, minor;
        struct dm_table *t = ti->table;

        BUG_ON(!t);

        if (sscanf(path, "%u:%u", &major, &minor) == 2) {
                /* Extract the major/minor numbers */
                dev = MKDEV(major, minor);
                if (MAJOR(dev) != major || MINOR(dev) != minor)
                        return -EOVERFLOW;
        } else {
                /* convert the path to a device */
                struct block_device *bdev = lookup_bdev(path);

                if (IS_ERR(bdev))
                        return PTR_ERR(bdev);
                dev = bdev->bd_dev;
                bdput(bdev);
        }

        dd = find_device(&t->devices, dev);
        if (!dd) {
                dd = kmalloc(sizeof(*dd), GFP_KERNEL);
                if (!dd)
                        return -ENOMEM;

                dd->dm_dev.mode = mode;
                dd->dm_dev.bdev = NULL;

                if ((r = open_dev(dd, dev, t->md))) {
                        kfree(dd);
                        return r;
                }

                format_dev_t(dd->dm_dev.name, dev);

                atomic_set(&dd->count, 0);
                list_add(&dd->list, &t->devices);

        } else if (dd->dm_dev.mode != (mode | dd->dm_dev.mode)) {
                r = upgrade_mode(dd, mode, t->md);
                if (r)
                        return r;
        }
        atomic_inc(&dd->count);

        *result = &dd->dm_dev;
        return 0;
}
EXPORT_SYMBOL(dm_get_device);

int dm_set_device_limits(struct dm_target *ti, struct dm_dev *dev,
                         sector_t start, sector_t len, void *data)
{
        struct queue_limits *limits = data;
        struct block_device *bdev = dev->bdev;
        struct request_queue *q = bdev_get_queue(bdev);
        char b[BDEVNAME_SIZE];

        if (unlikely(!q)) {
                DMWARN("%s: Cannot set limits for nonexistent device %s",
                       dm_device_name(ti->table->md), bdevname(bdev, b));
                return 0;
        }

        if (bdev_stack_limits(limits, bdev, start) < 0)
                DMWARN("%s: adding target device %s caused an alignment inconsistency: "
                       "physical_block_size=%u, logical_block_size=%u, "
                       "alignment_offset=%u, start=%llu",
                       dm_device_name(ti->table->md), bdevname(bdev, b),
                       q->limits.physical_block_size,
                       q->limits.logical_block_size,
                       q->limits.alignment_offset,
                       (unsigned long long) start << SECTOR_SHIFT);

        /*
         * Check if merge fn is supported.
         * If not we'll force DM to use PAGE_SIZE or
         * smaller I/O, just to be safe.
         */
        if (dm_queue_merge_is_compulsory(q) && !ti->type->merge)
                blk_limits_max_hw_sectors(limits,
                                          (unsigned int) (PAGE_SIZE >> 9));
        return 0;
}
EXPORT_SYMBOL_GPL(dm_set_device_limits);

/*
 * Decrement a device's use count and remove it if necessary.
 */
void dm_put_device(struct dm_target *ti, struct dm_dev *d)
{
        struct dm_dev_internal *dd = container_of(d, struct dm_dev_internal,
                                                  dm_dev);

        if (atomic_dec_and_test(&dd->count)) {
                close_dev(dd, ti->table->md);
                list_del(&dd->list);
                kfree(dd);
        }
}
EXPORT_SYMBOL(dm_put_device);

/*
 * Checks to see if the target joins onto the end of the table.
 */
static int adjoin(struct dm_table *table, struct dm_target *ti)
{
        struct dm_target *prev;

        if (!table->num_targets)
                return !ti->begin;

        prev = &table->targets[table->num_targets - 1];
        return (ti->begin == (prev->begin + prev->len));
}

/*
 * Used to dynamically allocate the arg array.
 */
static char **realloc_argv(unsigned *array_size, char **old_argv)
{
        char **argv;
        unsigned new_size;

        new_size = *array_size ? *array_size * 2 : 64;
        argv = kmalloc(new_size * sizeof(*argv), GFP_KERNEL);
        if (argv) {
                memcpy(argv, old_argv, *array_size * sizeof(*argv));
                *array_size = new_size;
        }

        kfree(old_argv);
        return argv;
}

/*
 * Destructively splits up the argument list to pass to ctr.
 */
int dm_split_args(int *argc, char ***argvp, char *input)
{
        char *start, *end = input, *out, **argv = NULL;
        unsigned array_size = 0;

        *argc = 0;

        if (!input) {
                *argvp = NULL;
                return 0;
        }

        argv = realloc_argv(&array_size, argv);
        if (!argv)
                return -ENOMEM;

        while (1) {
                /* Skip whitespace */
                start = skip_spaces(end);

                if (!*start)
                        break;  /* success, we hit the end */

                /* 'out' is used to remove any back-quotes */
                end = out = start;
                while (*end) {
                        /* Everything apart from '\0' can be quoted */
                        if (*end == '\\' && *(end + 1)) {
                                *out++ = *(end + 1);
                                end += 2;
                                continue;
                        }

                        if (isspace(*end))
                                break;  /* end of token */

                        *out++ = *end++;
                }

                /* have we already filled the array ? */
                if ((*argc + 1) > array_size) {
                        argv = realloc_argv(&array_size, argv);
                        if (!argv)
                                return -ENOMEM;
                }

                /* we know this is whitespace */
                if (*end)
                        end++;

                /* terminate the string and put it in the array */
                *out = '\0';
                argv[*argc] = start;
                (*argc)++;
        }

        *argvp = argv;
        return 0;
}

/*
 * Impose necessary and sufficient conditions on a devices's table such
 * that any incoming bio which respects its logical_block_size can be
 * processed successfully. If it falls across the boundary between
 * two or more targets, the size of each piece it gets split into must
 * be compatible with the logical_block_size of the target processing it.
 */
static int validate_hardware_logical_block_alignment(struct dm_table *table,
                                                     struct queue_limits *limits)
{
        /*
         * This function uses arithmetic modulo the logical_block_size
         * (in units of 512-byte sectors).
         */
        unsigned short device_logical_block_size_sects =
                limits->logical_block_size >> SECTOR_SHIFT;

        /*
         * Offset of the start of the next table entry, mod logical_block_size.
         */
        unsigned short next_target_start = 0;

        /*
         * Given an aligned bio that extends beyond the end of a
         * target, how many sectors must the next target handle?
         */
        unsigned short remaining = 0;

        struct dm_target *uninitialized_var(ti);
        struct queue_limits ti_limits;
        unsigned i = 0;

        /*
         * Check each entry in the table in turn.
         */
        while (i < dm_table_get_num_targets(table)) {
                ti = dm_table_get_target(table, i++);

                blk_set_default_limits(&ti_limits);

                /* combine all target devices' limits */
                if (ti->type->iterate_devices)
                        ti->type->iterate_devices(ti, dm_set_device_limits,
                                                  &ti_limits);

                /*
                 * If the remaining sectors fall entirely within this
                 * table entry are they compatible with its logical_block_size?
                 */
                if (remaining < ti->len &&
                    remaining & ((ti_limits.logical_block_size >>
                                  SECTOR_SHIFT) - 1))
                        break;  /* Error */

                next_target_start =
                    (unsigned short) ((next_target_start + ti->len) &
                                      (device_logical_block_size_sects - 1));
                remaining = next_target_start ?
                    device_logical_block_size_sects - next_target_start : 0;
        }

        if (remaining) {
                DMWARN("%s: table line %u (start sect %llu len %llu) "
                       "not aligned to h/w logical block size %u",
                       dm_device_name(table->md), i,
                       (unsigned long long) ti->begin,
                       (unsigned long long) ti->len,
                       limits->logical_block_size);
                return -EINVAL;
        }

        return 0;
}

int dm_table_add_target(struct dm_table *t, const char *type,
                        sector_t start, sector_t len, char *params)
{
        int r = -EINVAL, argc;
        char **argv;
        struct dm_target *tgt;

        if ((r = check_space(t)))
                return r;

        tgt = t->targets + t->num_targets;
        memset(tgt, 0, sizeof(*tgt));

        if (!len) {
                DMERR("%s: zero-length target", dm_device_name(t->md));
                return -EINVAL;
        }

        tgt->type = dm_get_target_type(type);
        if (!tgt->type) {
                DMERR("%s: %s: unknown target type", dm_device_name(t->md),
                      type);
                return -EINVAL;
        }

        tgt->table = t;
        tgt->begin = start;
        tgt->len = len;
        tgt->error = "Unknown error";

        /*
         * Does this target adjoin the previous one ?
         */
        if (!adjoin(t, tgt)) {
                tgt->error = "Gap in table";
                r = -EINVAL;
                goto bad;
        }

        r = dm_split_args(&argc, &argv, params);
        if (r) {
                tgt->error = "couldn't split parameters (insufficient memory)";
                goto bad;
        }

        r = tgt->type->ctr(tgt, argc, argv);
        kfree(argv);
        if (r)
                goto bad;

        t->highs[t->num_targets++] = tgt->begin + tgt->len - 1;

        if (!tgt->num_discard_requests && tgt->discards_supported)
                DMWARN("%s: %s: ignoring discards_supported because num_discard_requests is zero.",
                       dm_device_name(t->md), type);

        return 0;

 bad:
        DMERR("%s: %s: %s", dm_device_name(t->md), type, tgt->error);
        dm_put_target_type(tgt->type);
        return r;
}

/*
 * Target argument parsing helpers.
 */
static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
                             unsigned *value, char **error, unsigned grouped)
{
        const char *arg_str = dm_shift_arg(arg_set);

        if (!arg_str ||
            (sscanf(arg_str, "%u", value) != 1) ||
            (*value < arg->min) ||
            (*value > arg->max) ||
            (grouped && arg_set->argc < *value)) {
                *error = arg->error;
                return -EINVAL;
        }

        return 0;
}

int dm_read_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
                unsigned *value, char **error)
{
        return validate_next_arg(arg, arg_set, value, error, 0);
}
EXPORT_SYMBOL(dm_read_arg);

int dm_read_arg_group(struct dm_arg *arg, struct dm_arg_set *arg_set,
                      unsigned *value, char **error)
{
        return validate_next_arg(arg, arg_set, value, error, 1);
}
EXPORT_SYMBOL(dm_read_arg_group);

const char *dm_shift_arg(struct dm_arg_set *as)
{
        char *r;

        if (as->argc) {
                as->argc--;
                r = *as->argv;
                as->argv++;
                return r;
        }

        return NULL;
}
EXPORT_SYMBOL(dm_shift_arg);

void dm_consume_args(struct dm_arg_set *as, unsigned num_args)
{
        BUG_ON(as->argc < num_args);
        as->argc -= num_args;
        as->argv += num_args;
}
EXPORT_SYMBOL(dm_consume_args);

static int dm_table_set_type(struct dm_table *t)
{
        unsigned i;
        unsigned bio_based = 0, request_based = 0;
        struct dm_target *tgt;
        struct dm_dev_internal *dd;
        struct list_head *devices;

        for (i = 0; i < t->num_targets; i++) {
                tgt = t->targets + i;
                if (dm_target_request_based(tgt))
                        request_based = 1;
                else
                        bio_based = 1;

                if (bio_based && request_based) {
                        DMWARN("Inconsistent table: different target types"
                               " can't be mixed up");
                        return -EINVAL;
                }
        }

        if (bio_based) {
                /* We must use this table as bio-based */
                t->type = DM_TYPE_BIO_BASED;
                return 0;
        }

        BUG_ON(!request_based); /* No targets in this table */

        /* Non-request-stackable devices can't be used for request-based dm */
        devices = dm_table_get_devices(t);
        list_for_each_entry(dd, devices, list) {
                if (!blk_queue_stackable(bdev_get_queue(dd->dm_dev.bdev))) {
                        DMWARN("table load rejected: including"
                               " non-request-stackable devices");
                        return -EINVAL;
                }
        }

        /*
         * Request-based dm supports only tables that have a single target now.
         * To support multiple targets, request splitting support is needed,
         * and that needs lots of changes in the block-layer.
         * (e.g. request completion process for partial completion.)
         */
        if (t->num_targets > 1) {
                DMWARN("Request-based dm doesn't support multiple targets yet");
                return -EINVAL;
        }

        t->type = DM_TYPE_REQUEST_BASED;

        return 0;
}

unsigned dm_table_get_type(struct dm_table *t)
{
        return t->type;
}

bool dm_table_request_based(struct dm_table *t)
{
        return dm_table_get_type(t) == DM_TYPE_REQUEST_BASED;
}

int dm_table_alloc_md_mempools(struct dm_table *t)
{
        unsigned type = dm_table_get_type(t);

        if (unlikely(type == DM_TYPE_NONE)) {
                DMWARN("no table type is set, can't allocate mempools");
                return -EINVAL;
        }

        t->mempools = dm_alloc_md_mempools(type, t->integrity_supported);
        if (!t->mempools)
                return -ENOMEM;

        return 0;
}

void dm_table_free_md_mempools(struct dm_table *t)
{
        dm_free_md_mempools(t->mempools);
        t->mempools = NULL;
}

struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t)
{
        return t->mempools;
}

static int setup_indexes(struct dm_table *t)
{
        int i;
        unsigned int total = 0;
        sector_t *indexes;

        /* allocate the space for *all* the indexes */
        for (i = t->depth - 2; i >= 0; i--) {
                t->counts[i] = dm_div_up(t->counts[i + 1], CHILDREN_PER_NODE);
                total += t->counts[i];
        }

        indexes = (sector_t *) dm_vcalloc(total, (unsigned long) NODE_SIZE);
        if (!indexes)
                return -ENOMEM;

        /* set up internal nodes, bottom-up */
        for (i = t->depth - 2; i >= 0; i--) {
                t->index[i] = indexes;
                indexes += (KEYS_PER_NODE * t->counts[i]);
                setup_btree_index(i, t);
        }

        return 0;
}

/*
 * Builds the btree to index the map.
 */
static int dm_table_build_index(struct dm_table *t)
{
        int r = 0;
        unsigned int leaf_nodes;

        /* how many indexes will the btree have ? */
        leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
        t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);

        /* leaf layer has already been set up */
        t->counts[t->depth - 1] = leaf_nodes;
        t->index[t->depth - 1] = t->highs;

        if (t->depth >= 2)
                r = setup_indexes(t);

        return r;
}

/*
 * Get a disk whose integrity profile reflects the table's profile.
 * If %match_all is true, all devices' profiles must match.
 * If %match_all is false, all devices must at least have an
 * allocated integrity profile; but uninitialized is ok.
 * Returns NULL if integrity support was inconsistent or unavailable.
 */
static struct gendisk * dm_table_get_integrity_disk(struct dm_table *t,
                                                    bool match_all)
{
        struct list_head *devices = dm_table_get_devices(t);
        struct dm_dev_internal *dd = NULL;
        struct gendisk *prev_disk = NULL, *template_disk = NULL;

        list_for_each_entry(dd, devices, list) {
                template_disk = dd->dm_dev.bdev->bd_disk;
                if (!blk_get_integrity(template_disk))
                        goto no_integrity;
                if (!match_all && !blk_integrity_is_initialized(template_disk))
                        continue; /* skip uninitialized profiles */
                else if (prev_disk &&
                         blk_integrity_compare(prev_disk, template_disk) < 0)
                        goto no_integrity;
                prev_disk = template_disk;
        }

        return template_disk;

no_integrity:
        if (prev_disk)
                DMWARN("%s: integrity not set: %s and %s profile mismatch",
                       dm_device_name(t->md),
                       prev_disk->disk_name,
                       template_disk->disk_name);
        return NULL;
}

/*
 * Register the mapped device for blk_integrity support if
 * the underlying devices have an integrity profile. But all devices
 * may not have matching profiles (checking all devices isn't reliable
 * during table load because this table may use other DM device(s) which
 * must be resumed before they will have an initialized integity profile).
 * Stacked DM devices force a 2 stage integrity profile validation:
 * 1 - during load, validate all initialized integrity profiles match
 * 2 - during resume, validate all integrity profiles match
 */
static int dm_table_prealloc_integrity(struct dm_table *t, struct mapped_device *md)
{
        struct gendisk *template_disk = NULL;

        template_disk = dm_table_get_integrity_disk(t, false);
        if (!template_disk)
                return 0;

        if (!blk_integrity_is_initialized(dm_disk(md))) {
                t->integrity_supported = 1;
                return blk_integrity_register(dm_disk(md), NULL);
        }

        /*
         * If DM device already has an initalized integrity
         * profile the new profile should not conflict.
         */
        if (blk_integrity_is_initialized(template_disk) &&
            blk_integrity_compare(dm_disk(md), template_disk) < 0) {
                DMWARN("%s: conflict with existing integrity profile: "
                       "%s profile mismatch",
                       dm_device_name(t->md),
                       template_disk->disk_name);
                return 1;
        }

        /* Preserve existing initialized integrity profile */
        t->integrity_supported = 1;
        return 0;
}

/*
 * Prepares the table for use by building the indices,
 * setting the type, and allocating mempools.
 */
int dm_table_complete(struct dm_table *t)
{
        int r;

        r = dm_table_set_type(t);
        if (r) {
                DMERR("unable to set table type");
                return r;
        }

        r = dm_table_build_index(t);
        if (r) {
                DMERR("unable to build btrees");
                return r;
        }

        r = dm_table_prealloc_integrity(t, t->md);
        if (r) {
                DMERR("could not register integrity profile.");
                return r;
        }

        r = dm_table_alloc_md_mempools(t);
        if (r)
                DMERR("unable to allocate mempools");

        return r;
}

static DEFINE_MUTEX(_event_lock);
void dm_table_event_callback(struct dm_table *t,
                             void (*fn)(void *), void *context)
{
        mutex_lock(&_event_lock);
        t->event_fn = fn;
        t->event_context = context;
        mutex_unlock(&_event_lock);
}

void dm_table_event(struct dm_table *t)
{
        /*
         * You can no longer call dm_table_event() from interrupt
         * context, use a bottom half instead.
         */
        BUG_ON(in_interrupt());

        mutex_lock(&_event_lock);
        if (t->event_fn)
                t->event_fn(t->event_context);
        mutex_unlock(&_event_lock);
}
EXPORT_SYMBOL(dm_table_event);

sector_t dm_table_get_size(struct dm_table *t)
{
        return t->num_targets ? (t->highs[t->num_targets - 1] + 1) : 0;
}
EXPORT_SYMBOL(dm_table_get_size);

struct dm_target *dm_table_get_target(struct dm_table *t, unsigned int index)
{
        if (index >= t->num_targets)
                return NULL;

        return t->targets + index;
}

/*
 * Search the btree for the correct target.
 *
 * Caller should check returned pointer with dm_target_is_valid()
 * to trap I/O beyond end of device.
 */
struct dm_target *dm_table_find_target(struct dm_table *t, sector_t sector)
{
        unsigned int l, n = 0, k = 0;
1157 sector_t *node; 1157 sector_t *node;
1158 1158
1159 for (l = 0; l < t->depth; l++) { 1159 for (l = 0; l < t->depth; l++) {
1160 n = get_child(n, k); 1160 n = get_child(n, k);
1161 node = get_node(t, l, n); 1161 node = get_node(t, l, n);
1162 1162
1163 for (k = 0; k < KEYS_PER_NODE; k++) 1163 for (k = 0; k < KEYS_PER_NODE; k++)
1164 if (node[k] >= sector) 1164 if (node[k] >= sector)
1165 break; 1165 break;
1166 } 1166 }
1167 1167
1168 return &t->targets[(KEYS_PER_NODE * n) + k]; 1168 return &t->targets[(KEYS_PER_NODE * n) + k];
1169 } 1169 }
1170 1170
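dm_table_find_target() above descends one b-tree node per level and stops at the first key that is >= the requested sector; the keys are the targets' last ("high") sectors. The sketch below models the equivalent flat lookup that the tree accelerates; the array and helper name are illustrative, not kernel code.

#include <stdio.h>

typedef unsigned long long sector_t;

/*
 * Target 0 starts at sector 0 and targets are contiguous, so target i covers
 * (highs[i-1], highs[i]].  Return the index of the first target whose last
 * sector is >= the lookup sector, or n if the sector is past the end of the
 * device (the case dm_target_is_valid() is meant to trap).
 */
static unsigned find_target_index(const sector_t *highs, unsigned n, sector_t sector)
{
	unsigned i;

	for (i = 0; i < n; i++)
		if (highs[i] >= sector)
			break;
	return i;
}

int main(void)
{
	sector_t highs[] = { 99, 199, 1023 };	/* three targets ending at these sectors */

	printf("%u\n", find_target_index(highs, 3, 150));	/* 1 */
	printf("%u\n", find_target_index(highs, 3, 2048));	/* 3: beyond the end of the device */
	return 0;
}
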
1171 /* 1171 /*
1172 * Establish the new table's queue_limits and validate them. 1172 * Establish the new table's queue_limits and validate them.
1173 */ 1173 */
1174 int dm_calculate_queue_limits(struct dm_table *table, 1174 int dm_calculate_queue_limits(struct dm_table *table,
1175 struct queue_limits *limits) 1175 struct queue_limits *limits)
1176 { 1176 {
1177 struct dm_target *uninitialized_var(ti); 1177 struct dm_target *uninitialized_var(ti);
1178 struct queue_limits ti_limits; 1178 struct queue_limits ti_limits;
1179 unsigned i = 0; 1179 unsigned i = 0;
1180 1180
1181 blk_set_default_limits(limits); 1181 blk_set_default_limits(limits);
1182 1182
1183 while (i < dm_table_get_num_targets(table)) { 1183 while (i < dm_table_get_num_targets(table)) {
1184 blk_set_default_limits(&ti_limits); 1184 blk_set_default_limits(&ti_limits);
1185 1185
1186 ti = dm_table_get_target(table, i++); 1186 ti = dm_table_get_target(table, i++);
1187 1187
1188 if (!ti->type->iterate_devices) 1188 if (!ti->type->iterate_devices)
1189 goto combine_limits; 1189 goto combine_limits;
1190 1190
1191 /* 1191 /*
1192 * Combine queue limits of all the devices this target uses. 1192 * Combine queue limits of all the devices this target uses.
1193 */ 1193 */
1194 ti->type->iterate_devices(ti, dm_set_device_limits, 1194 ti->type->iterate_devices(ti, dm_set_device_limits,
1195 &ti_limits); 1195 &ti_limits);
1196 1196
1197 /* Set I/O hints portion of queue limits */ 1197 /* Set I/O hints portion of queue limits */
1198 if (ti->type->io_hints) 1198 if (ti->type->io_hints)
1199 ti->type->io_hints(ti, &ti_limits); 1199 ti->type->io_hints(ti, &ti_limits);
1200 1200
1201 /* 1201 /*
1202 * Check each device area is consistent with the target's 1202 * Check each device area is consistent with the target's
1203 * overall queue limits. 1203 * overall queue limits.
1204 */ 1204 */
1205 if (ti->type->iterate_devices(ti, device_area_is_invalid, 1205 if (ti->type->iterate_devices(ti, device_area_is_invalid,
1206 &ti_limits)) 1206 &ti_limits))
1207 return -EINVAL; 1207 return -EINVAL;
1208 1208
1209 combine_limits: 1209 combine_limits:
1210 /* 1210 /*
1211 * Merge this target's queue limits into the overall limits 1211 * Merge this target's queue limits into the overall limits
1212 * for the table. 1212 * for the table.
1213 */ 1213 */
1214 if (blk_stack_limits(limits, &ti_limits, 0) < 0) 1214 if (blk_stack_limits(limits, &ti_limits, 0) < 0)
1215 DMWARN("%s: adding target device " 1215 DMWARN("%s: adding target device "
1216 "(start sect %llu len %llu) " 1216 "(start sect %llu len %llu) "
1217 "caused an alignment inconsistency", 1217 "caused an alignment inconsistency",
1218 dm_device_name(table->md), 1218 dm_device_name(table->md),
1219 (unsigned long long) ti->begin, 1219 (unsigned long long) ti->begin,
1220 (unsigned long long) ti->len); 1220 (unsigned long long) ti->len);
1221 } 1221 }
1222 1222
1223 return validate_hardware_logical_block_alignment(table, limits); 1223 return validate_hardware_logical_block_alignment(table, limits);
1224 } 1224 }
1225 1225
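dm_calculate_queue_limits() above starts each target from default limits, lets the target's iterate_devices and io_hints callbacks fill them in, and then folds the result into the table-wide limits with blk_stack_limits(), which broadly keeps the most restrictive combination and flags alignment problems. Below is a rough user-space sketch of that folding with just two toy fields; the real struct queue_limits carries many more and the real helper also tracks alignment.

#include <stdio.h>

/* Toy subset of queue limits; field names are illustrative. */
struct toy_limits {
	unsigned max_sectors;		/* upper bound on I/O size: stacking takes the minimum */
	unsigned logical_block_size;	/* smallest addressable unit: stacking takes the maximum */
};

static void toy_stack_limits(struct toy_limits *t, const struct toy_limits *b)
{
	if (b->max_sectors < t->max_sectors)
		t->max_sectors = b->max_sectors;
	if (b->logical_block_size > t->logical_block_size)
		t->logical_block_size = b->logical_block_size;
}

int main(void)
{
	struct toy_limits table = { .max_sectors = ~0u, .logical_block_size = 512 };
	struct toy_limits dev_a = { .max_sectors = 1024, .logical_block_size = 512 };
	struct toy_limits dev_b = { .max_sectors = 2048, .logical_block_size = 4096 };

	toy_stack_limits(&table, &dev_a);
	toy_stack_limits(&table, &dev_b);
	printf("max_sectors=%u logical_block_size=%u\n",
	       table.max_sectors, table.logical_block_size);	/* 1024, 4096 */
	return 0;
}
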
1226 /* 1226 /*
1227 * Set the integrity profile for this device if all devices used have 1227 * Set the integrity profile for this device if all devices used have
1228 * matching profiles. We're quite deep in the resume path but still 1228 * matching profiles. We're quite deep in the resume path but still
1229 * don't know if all devices (particularly DM devices this device 1229 * don't know if all devices (particularly DM devices this device
1230 * may be stacked on) have matching profiles. Even if the profiles 1230 * may be stacked on) have matching profiles. Even if the profiles
1231 * don't match we have no way to fail (to resume) at this point. 1231 * don't match we have no way to fail (to resume) at this point.
1232 */ 1232 */
1233 static void dm_table_set_integrity(struct dm_table *t) 1233 static void dm_table_set_integrity(struct dm_table *t)
1234 { 1234 {
1235 struct gendisk *template_disk = NULL; 1235 struct gendisk *template_disk = NULL;
1236 1236
1237 if (!blk_get_integrity(dm_disk(t->md))) 1237 if (!blk_get_integrity(dm_disk(t->md)))
1238 return; 1238 return;
1239 1239
1240 template_disk = dm_table_get_integrity_disk(t, true); 1240 template_disk = dm_table_get_integrity_disk(t, true);
1241 if (!template_disk && 1241 if (!template_disk &&
1242 blk_integrity_is_initialized(dm_disk(t->md))) { 1242 blk_integrity_is_initialized(dm_disk(t->md))) {
1243 DMWARN("%s: device no longer has a valid integrity profile", 1243 DMWARN("%s: device no longer has a valid integrity profile",
1244 dm_device_name(t->md)); 1244 dm_device_name(t->md));
1245 return; 1245 return;
1246 } 1246 }
1247 blk_integrity_register(dm_disk(t->md), 1247 blk_integrity_register(dm_disk(t->md),
1248 blk_get_integrity(template_disk)); 1248 blk_get_integrity(template_disk));
1249 } 1249 }
1250 1250
1251 static int device_flush_capable(struct dm_target *ti, struct dm_dev *dev,
1252 sector_t start, sector_t len, void *data)
1253 {
1254 unsigned flush = (*(unsigned *)data);
1255 struct request_queue *q = bdev_get_queue(dev->bdev);
1256
1257 return q && (q->flush_flags & flush);
1258 }
1259
1260 static bool dm_table_supports_flush(struct dm_table *t, unsigned flush)
1261 {
1262 struct dm_target *ti;
1263 unsigned i = 0;
1264
1265 /*
1266 * Require at least one underlying device to support flushes.
1267 * t->devices includes internal dm devices such as mirror logs
1268 * so we need to use iterate_devices here, which targets
1269 * supporting flushes must provide.
1270 */
1271 while (i < dm_table_get_num_targets(t)) {
1272 ti = dm_table_get_target(t, i++);
1273
1274 if (!ti->num_flush_requests)
1275 continue;
1276
1277 if (ti->type->iterate_devices &&
1278 ti->type->iterate_devices(ti, device_flush_capable, &flush))
1279 return 1;
1280 }
1281
1282 return 0;
1283 }
1284
1251 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q, 1285 void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q,
1252 struct queue_limits *limits) 1286 struct queue_limits *limits)
1253 { 1287 {
1288 unsigned flush = 0;
1289
1254 /* 1290 /*
1255 * Copy table's limits to the DM device's request_queue 1291 * Copy table's limits to the DM device's request_queue
1256 */ 1292 */
1257 q->limits = *limits; 1293 q->limits = *limits;
1258 1294
1259 if (!dm_table_supports_discards(t)) 1295 if (!dm_table_supports_discards(t))
1260 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); 1296 queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q);
1261 else 1297 else
1262 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1298 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1299
1300 if (dm_table_supports_flush(t, REQ_FLUSH)) {
1301 flush |= REQ_FLUSH;
1302 if (dm_table_supports_flush(t, REQ_FUA))
1303 flush |= REQ_FUA;
1304 }
1305 blk_queue_flush(q, flush);
1263 1306
1264 dm_table_set_integrity(t); 1307 dm_table_set_integrity(t);
1265 1308
1266 /* 1309 /*
1267 * QUEUE_FLAG_STACKABLE must be set after all queue settings are 1310 * QUEUE_FLAG_STACKABLE must be set after all queue settings are
1268 * visible to other CPUs because, once the flag is set, incoming bios 1311 * visible to other CPUs because, once the flag is set, incoming bios
1269 * are processed by request-based dm, which refers to the queue 1312 * are processed by request-based dm, which refers to the queue
1270 * settings. 1313 * settings.
1271 * Until the flag is set, bios are passed to bio-based dm and queued to 1314 * Until the flag is set, bios are passed to bio-based dm and queued to
1272 * md->deferred where queue settings are not needed yet. 1315 * md->deferred where queue settings are not needed yet.
1273 * Those bios are passed to request-based dm at the resume time. 1316 * Those bios are passed to request-based dm at the resume time.
1274 */ 1317 */
1275 smp_mb(); 1318 smp_mb();
1276 if (dm_table_request_based(t)) 1319 if (dm_table_request_based(t))
1277 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q); 1320 queue_flag_set_unlocked(QUEUE_FLAG_STACKABLE, q);
1278 } 1321 }
1279 1322
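The new dm_table_supports_flush()/device_flush_capable() pair above asks each target to iterate its underlying devices and reports a capability as soon as one queue advertises it, and dm_table_set_restrictions() only offers REQ_FUA when REQ_FLUSH is already supported. Here is a small user-space model of that decision; the TOY_* flag values merely stand in for the block-layer bits held in q->flush_flags.

#include <stdio.h>
#include <stdbool.h>

#define TOY_FLUSH (1u << 0)	/* illustrative stand-in for REQ_FLUSH */
#define TOY_FUA   (1u << 1)	/* illustrative stand-in for REQ_FUA */

/* "At least one underlying device advertises this capability." */
static bool any_device_supports(const unsigned *dev_flags, unsigned n, unsigned cap)
{
	for (unsigned i = 0; i < n; i++)
		if (dev_flags[i] & cap)
			return true;
	return false;
}

static unsigned table_flush_flags(const unsigned *dev_flags, unsigned n)
{
	unsigned flush = 0;

	if (any_device_supports(dev_flags, n, TOY_FLUSH)) {
		flush |= TOY_FLUSH;
		if (any_device_supports(dev_flags, n, TOY_FUA))
			flush |= TOY_FUA;
	}
	return flush;
}

int main(void)
{
	unsigned no_flush[] = { 0, 0 };			/* neither queue advertises FLUSH */
	unsigned mixed[]    = { TOY_FLUSH, TOY_FLUSH | TOY_FUA };

	printf("%#x\n", table_flush_flags(no_flush, 2));	/* 0 */
	printf("%#x\n", table_flush_flags(mixed, 2));		/* FLUSH | FUA */
	return 0;
}

dm_table_supports_discards() at the end of this file applies the same iterate_devices pattern to the discard capability.
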
1280 unsigned int dm_table_get_num_targets(struct dm_table *t) 1323 unsigned int dm_table_get_num_targets(struct dm_table *t)
1281 { 1324 {
1282 return t->num_targets; 1325 return t->num_targets;
1283 } 1326 }
1284 1327
1285 struct list_head *dm_table_get_devices(struct dm_table *t) 1328 struct list_head *dm_table_get_devices(struct dm_table *t)
1286 { 1329 {
1287 return &t->devices; 1330 return &t->devices;
1288 } 1331 }
1289 1332
1290 fmode_t dm_table_get_mode(struct dm_table *t) 1333 fmode_t dm_table_get_mode(struct dm_table *t)
1291 { 1334 {
1292 return t->mode; 1335 return t->mode;
1293 } 1336 }
1294 EXPORT_SYMBOL(dm_table_get_mode); 1337 EXPORT_SYMBOL(dm_table_get_mode);
1295 1338
1296 static void suspend_targets(struct dm_table *t, unsigned postsuspend) 1339 static void suspend_targets(struct dm_table *t, unsigned postsuspend)
1297 { 1340 {
1298 int i = t->num_targets; 1341 int i = t->num_targets;
1299 struct dm_target *ti = t->targets; 1342 struct dm_target *ti = t->targets;
1300 1343
1301 while (i--) { 1344 while (i--) {
1302 if (postsuspend) { 1345 if (postsuspend) {
1303 if (ti->type->postsuspend) 1346 if (ti->type->postsuspend)
1304 ti->type->postsuspend(ti); 1347 ti->type->postsuspend(ti);
1305 } else if (ti->type->presuspend) 1348 } else if (ti->type->presuspend)
1306 ti->type->presuspend(ti); 1349 ti->type->presuspend(ti);
1307 1350
1308 ti++; 1351 ti++;
1309 } 1352 }
1310 } 1353 }
1311 1354
1312 void dm_table_presuspend_targets(struct dm_table *t) 1355 void dm_table_presuspend_targets(struct dm_table *t)
1313 { 1356 {
1314 if (!t) 1357 if (!t)
1315 return; 1358 return;
1316 1359
1317 suspend_targets(t, 0); 1360 suspend_targets(t, 0);
1318 } 1361 }
1319 1362
1320 void dm_table_postsuspend_targets(struct dm_table *t) 1363 void dm_table_postsuspend_targets(struct dm_table *t)
1321 { 1364 {
1322 if (!t) 1365 if (!t)
1323 return; 1366 return;
1324 1367
1325 suspend_targets(t, 1); 1368 suspend_targets(t, 1);
1326 } 1369 }
1327 1370
1328 int dm_table_resume_targets(struct dm_table *t) 1371 int dm_table_resume_targets(struct dm_table *t)
1329 { 1372 {
1330 int i, r = 0; 1373 int i, r = 0;
1331 1374
1332 for (i = 0; i < t->num_targets; i++) { 1375 for (i = 0; i < t->num_targets; i++) {
1333 struct dm_target *ti = t->targets + i; 1376 struct dm_target *ti = t->targets + i;
1334 1377
1335 if (!ti->type->preresume) 1378 if (!ti->type->preresume)
1336 continue; 1379 continue;
1337 1380
1338 r = ti->type->preresume(ti); 1381 r = ti->type->preresume(ti);
1339 if (r) 1382 if (r)
1340 return r; 1383 return r;
1341 } 1384 }
1342 1385
1343 for (i = 0; i < t->num_targets; i++) { 1386 for (i = 0; i < t->num_targets; i++) {
1344 struct dm_target *ti = t->targets + i; 1387 struct dm_target *ti = t->targets + i;
1345 1388
1346 if (ti->type->resume) 1389 if (ti->type->resume)
1347 ti->type->resume(ti); 1390 ti->type->resume(ti);
1348 } 1391 }
1349 1392
1350 return 0; 1393 return 0;
1351 } 1394 }
1352 1395
1353 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb) 1396 void dm_table_add_target_callbacks(struct dm_table *t, struct dm_target_callbacks *cb)
1354 { 1397 {
1355 list_add(&cb->list, &t->target_callbacks); 1398 list_add(&cb->list, &t->target_callbacks);
1356 } 1399 }
1357 EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks); 1400 EXPORT_SYMBOL_GPL(dm_table_add_target_callbacks);
1358 1401
1359 int dm_table_any_congested(struct dm_table *t, int bdi_bits) 1402 int dm_table_any_congested(struct dm_table *t, int bdi_bits)
1360 { 1403 {
1361 struct dm_dev_internal *dd; 1404 struct dm_dev_internal *dd;
1362 struct list_head *devices = dm_table_get_devices(t); 1405 struct list_head *devices = dm_table_get_devices(t);
1363 struct dm_target_callbacks *cb; 1406 struct dm_target_callbacks *cb;
1364 int r = 0; 1407 int r = 0;
1365 1408
1366 list_for_each_entry(dd, devices, list) { 1409 list_for_each_entry(dd, devices, list) {
1367 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev); 1410 struct request_queue *q = bdev_get_queue(dd->dm_dev.bdev);
1368 char b[BDEVNAME_SIZE]; 1411 char b[BDEVNAME_SIZE];
1369 1412
1370 if (likely(q)) 1413 if (likely(q))
1371 r |= bdi_congested(&q->backing_dev_info, bdi_bits); 1414 r |= bdi_congested(&q->backing_dev_info, bdi_bits);
1372 else 1415 else
1373 DMWARN_LIMIT("%s: any_congested: nonexistent device %s", 1416 DMWARN_LIMIT("%s: any_congested: nonexistent device %s",
1374 dm_device_name(t->md), 1417 dm_device_name(t->md),
1375 bdevname(dd->dm_dev.bdev, b)); 1418 bdevname(dd->dm_dev.bdev, b));
1376 } 1419 }
1377 1420
1378 list_for_each_entry(cb, &t->target_callbacks, list) 1421 list_for_each_entry(cb, &t->target_callbacks, list)
1379 if (cb->congested_fn) 1422 if (cb->congested_fn)
1380 r |= cb->congested_fn(cb, bdi_bits); 1423 r |= cb->congested_fn(cb, bdi_bits);
1381 1424
1382 return r; 1425 return r;
1383 } 1426 }
1384 1427
1385 int dm_table_any_busy_target(struct dm_table *t) 1428 int dm_table_any_busy_target(struct dm_table *t)
1386 { 1429 {
1387 unsigned i; 1430 unsigned i;
1388 struct dm_target *ti; 1431 struct dm_target *ti;
1389 1432
1390 for (i = 0; i < t->num_targets; i++) { 1433 for (i = 0; i < t->num_targets; i++) {
1391 ti = t->targets + i; 1434 ti = t->targets + i;
1392 if (ti->type->busy && ti->type->busy(ti)) 1435 if (ti->type->busy && ti->type->busy(ti))
1393 return 1; 1436 return 1;
1394 } 1437 }
1395 1438
1396 return 0; 1439 return 0;
1397 } 1440 }
1398 1441
1399 struct mapped_device *dm_table_get_md(struct dm_table *t) 1442 struct mapped_device *dm_table_get_md(struct dm_table *t)
1400 { 1443 {
1401 return t->md; 1444 return t->md;
1402 } 1445 }
1403 EXPORT_SYMBOL(dm_table_get_md); 1446 EXPORT_SYMBOL(dm_table_get_md);
1404 1447
1405 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev, 1448 static int device_discard_capable(struct dm_target *ti, struct dm_dev *dev,
1406 sector_t start, sector_t len, void *data) 1449 sector_t start, sector_t len, void *data)
1407 { 1450 {
1408 struct request_queue *q = bdev_get_queue(dev->bdev); 1451 struct request_queue *q = bdev_get_queue(dev->bdev);
1409 1452
1410 return q && blk_queue_discard(q); 1453 return q && blk_queue_discard(q);
1411 } 1454 }
1412 1455
1413 bool dm_table_supports_discards(struct dm_table *t) 1456 bool dm_table_supports_discards(struct dm_table *t)
1414 { 1457 {
1415 struct dm_target *ti; 1458 struct dm_target *ti;
1416 unsigned i = 0; 1459 unsigned i = 0;
1417 1460
1418 /* 1461 /*
1419 * Unless a target used by the table has set discards_supported, 1462 * Unless a target used by the table has set discards_supported,
1420 * require at least one underlying device to support discards. 1463 * require at least one underlying device to support discards.
1421 * t->devices includes internal dm devices such as mirror logs 1464 * t->devices includes internal dm devices such as mirror logs
1422 * so we need to use iterate_devices here, which targets 1465 * so we need to use iterate_devices here, which targets
1423 * supporting discard selectively must provide. 1466 * supporting discard selectively must provide.
1424 */ 1467 */
1425 while (i < dm_table_get_num_targets(t)) { 1468 while (i < dm_table_get_num_targets(t)) {
1426 ti = dm_table_get_target(t, i++); 1469 ti = dm_table_get_target(t, i++);
1427 1470
1428 if (!ti->num_discard_requests) 1471 if (!ti->num_discard_requests)
1429 continue; 1472 continue;
1430 1473
1431 if (ti->discards_supported) 1474 if (ti->discards_supported)
1432 return 1; 1475 return 1;
1433 1476
1434 if (ti->type->iterate_devices && 1477 if (ti->type->iterate_devices &&
1435 ti->type->iterate_devices(ti, device_discard_capable, NULL)) 1478 ti->type->iterate_devices(ti, device_discard_capable, NULL))
1436 return 1; 1479 return 1;
1437 } 1480 }
1438 1481
1439 return 0; 1482 return 0;
1440 } 1483 }
1441 1484
1 /* 1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include "dm.h" 8 #include "dm.h"
9 #include "dm-uevent.h" 9 #include "dm-uevent.h"
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/mutex.h> 13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h> 14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h> 15 #include <linux/blkpg.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/buffer_head.h> 17 #include <linux/buffer_head.h>
18 #include <linux/mempool.h> 18 #include <linux/mempool.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/idr.h> 20 #include <linux/idr.h>
21 #include <linux/hdreg.h> 21 #include <linux/hdreg.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 23
24 #include <trace/events/block.h> 24 #include <trace/events/block.h>
25 25
26 #define DM_MSG_PREFIX "core" 26 #define DM_MSG_PREFIX "core"
27 27
28 /* 28 /*
29 * Cookies are numeric values sent with CHANGE and REMOVE 29 * Cookies are numeric values sent with CHANGE and REMOVE
30 * uevents while resuming, removing or renaming the device. 30 * uevents while resuming, removing or renaming the device.
31 */ 31 */
32 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 32 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
33 #define DM_COOKIE_LENGTH 24 33 #define DM_COOKIE_LENGTH 24
34 34
35 static const char *_name = DM_NAME; 35 static const char *_name = DM_NAME;
36 36
37 static unsigned int major = 0; 37 static unsigned int major = 0;
38 static unsigned int _major = 0; 38 static unsigned int _major = 0;
39 39
40 static DEFINE_IDR(_minor_idr); 40 static DEFINE_IDR(_minor_idr);
41 41
42 static DEFINE_SPINLOCK(_minor_lock); 42 static DEFINE_SPINLOCK(_minor_lock);
43 /* 43 /*
44 * For bio-based dm. 44 * For bio-based dm.
45 * One of these is allocated per bio. 45 * One of these is allocated per bio.
46 */ 46 */
47 struct dm_io { 47 struct dm_io {
48 struct mapped_device *md; 48 struct mapped_device *md;
49 int error; 49 int error;
50 atomic_t io_count; 50 atomic_t io_count;
51 struct bio *bio; 51 struct bio *bio;
52 unsigned long start_time; 52 unsigned long start_time;
53 spinlock_t endio_lock; 53 spinlock_t endio_lock;
54 }; 54 };
55 55
56 /* 56 /*
57 * For bio-based dm. 57 * For bio-based dm.
58 * One of these is allocated per target within a bio. Hopefully 58 * One of these is allocated per target within a bio. Hopefully
59 * this will be simplified out one day. 59 * this will be simplified out one day.
60 */ 60 */
61 struct dm_target_io { 61 struct dm_target_io {
62 struct dm_io *io; 62 struct dm_io *io;
63 struct dm_target *ti; 63 struct dm_target *ti;
64 union map_info info; 64 union map_info info;
65 }; 65 };
66 66
67 /* 67 /*
68 * For request-based dm. 68 * For request-based dm.
69 * One of these is allocated per request. 69 * One of these is allocated per request.
70 */ 70 */
71 struct dm_rq_target_io { 71 struct dm_rq_target_io {
72 struct mapped_device *md; 72 struct mapped_device *md;
73 struct dm_target *ti; 73 struct dm_target *ti;
74 struct request *orig, clone; 74 struct request *orig, clone;
75 int error; 75 int error;
76 union map_info info; 76 union map_info info;
77 }; 77 };
78 78
79 /* 79 /*
80 * For request-based dm. 80 * For request-based dm.
81 * One of these is allocated per bio. 81 * One of these is allocated per bio.
82 */ 82 */
83 struct dm_rq_clone_bio_info { 83 struct dm_rq_clone_bio_info {
84 struct bio *orig; 84 struct bio *orig;
85 struct dm_rq_target_io *tio; 85 struct dm_rq_target_io *tio;
86 }; 86 };
87 87
88 union map_info *dm_get_mapinfo(struct bio *bio) 88 union map_info *dm_get_mapinfo(struct bio *bio)
89 { 89 {
90 if (bio && bio->bi_private) 90 if (bio && bio->bi_private)
91 return &((struct dm_target_io *)bio->bi_private)->info; 91 return &((struct dm_target_io *)bio->bi_private)->info;
92 return NULL; 92 return NULL;
93 } 93 }
94 94
95 union map_info *dm_get_rq_mapinfo(struct request *rq) 95 union map_info *dm_get_rq_mapinfo(struct request *rq)
96 { 96 {
97 if (rq && rq->end_io_data) 97 if (rq && rq->end_io_data)
98 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 98 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
99 return NULL; 99 return NULL;
100 } 100 }
101 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 101 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
102 102
103 #define MINOR_ALLOCED ((void *)-1) 103 #define MINOR_ALLOCED ((void *)-1)
104 104
105 /* 105 /*
106 * Bits for the md->flags field. 106 * Bits for the md->flags field.
107 */ 107 */
108 #define DMF_BLOCK_IO_FOR_SUSPEND 0 108 #define DMF_BLOCK_IO_FOR_SUSPEND 0
109 #define DMF_SUSPENDED 1 109 #define DMF_SUSPENDED 1
110 #define DMF_FROZEN 2 110 #define DMF_FROZEN 2
111 #define DMF_FREEING 3 111 #define DMF_FREEING 3
112 #define DMF_DELETING 4 112 #define DMF_DELETING 4
113 #define DMF_NOFLUSH_SUSPENDING 5 113 #define DMF_NOFLUSH_SUSPENDING 5
114 #define DMF_MERGE_IS_OPTIONAL 6 114 #define DMF_MERGE_IS_OPTIONAL 6
115 115
116 /* 116 /*
117 * Work processed by per-device workqueue. 117 * Work processed by per-device workqueue.
118 */ 118 */
119 struct mapped_device { 119 struct mapped_device {
120 struct rw_semaphore io_lock; 120 struct rw_semaphore io_lock;
121 struct mutex suspend_lock; 121 struct mutex suspend_lock;
122 rwlock_t map_lock; 122 rwlock_t map_lock;
123 atomic_t holders; 123 atomic_t holders;
124 atomic_t open_count; 124 atomic_t open_count;
125 125
126 unsigned long flags; 126 unsigned long flags;
127 127
128 struct request_queue *queue; 128 struct request_queue *queue;
129 unsigned type; 129 unsigned type;
130 /* Protect queue and type against concurrent access. */ 130 /* Protect queue and type against concurrent access. */
131 struct mutex type_lock; 131 struct mutex type_lock;
132 132
133 struct gendisk *disk; 133 struct gendisk *disk;
134 char name[16]; 134 char name[16];
135 135
136 void *interface_ptr; 136 void *interface_ptr;
137 137
138 /* 138 /*
139 * A list of ios that arrived while we were suspended. 139 * A list of ios that arrived while we were suspended.
140 */ 140 */
141 atomic_t pending[2]; 141 atomic_t pending[2];
142 wait_queue_head_t wait; 142 wait_queue_head_t wait;
143 struct work_struct work; 143 struct work_struct work;
144 struct bio_list deferred; 144 struct bio_list deferred;
145 spinlock_t deferred_lock; 145 spinlock_t deferred_lock;
146 146
147 /* 147 /*
148 * Processing queue (flush) 148 * Processing queue (flush)
149 */ 149 */
150 struct workqueue_struct *wq; 150 struct workqueue_struct *wq;
151 151
152 /* 152 /*
153 * The current mapping. 153 * The current mapping.
154 */ 154 */
155 struct dm_table *map; 155 struct dm_table *map;
156 156
157 /* 157 /*
158 * io objects are allocated from here. 158 * io objects are allocated from here.
159 */ 159 */
160 mempool_t *io_pool; 160 mempool_t *io_pool;
161 mempool_t *tio_pool; 161 mempool_t *tio_pool;
162 162
163 struct bio_set *bs; 163 struct bio_set *bs;
164 164
165 /* 165 /*
166 * Event handling. 166 * Event handling.
167 */ 167 */
168 atomic_t event_nr; 168 atomic_t event_nr;
169 wait_queue_head_t eventq; 169 wait_queue_head_t eventq;
170 atomic_t uevent_seq; 170 atomic_t uevent_seq;
171 struct list_head uevent_list; 171 struct list_head uevent_list;
172 spinlock_t uevent_lock; /* Protect access to uevent_list */ 172 spinlock_t uevent_lock; /* Protect access to uevent_list */
173 173
174 /* 174 /*
175 * freeze/thaw support requires holding onto a super block 175 * freeze/thaw support requires holding onto a super block
176 */ 176 */
177 struct super_block *frozen_sb; 177 struct super_block *frozen_sb;
178 struct block_device *bdev; 178 struct block_device *bdev;
179 179
180 /* forced geometry settings */ 180 /* forced geometry settings */
181 struct hd_geometry geometry; 181 struct hd_geometry geometry;
182 182
183 /* For saving the address of __make_request for request based dm */ 183 /* For saving the address of __make_request for request based dm */
184 make_request_fn *saved_make_request_fn; 184 make_request_fn *saved_make_request_fn;
185 185
186 /* sysfs handle */ 186 /* sysfs handle */
187 struct kobject kobj; 187 struct kobject kobj;
188 188
189 /* zero-length flush that will be cloned and submitted to targets */ 189 /* zero-length flush that will be cloned and submitted to targets */
190 struct bio flush_bio; 190 struct bio flush_bio;
191 }; 191 };
192 192
193 /* 193 /*
194 * For mempools pre-allocation at the table loading time. 194 * For mempools pre-allocation at the table loading time.
195 */ 195 */
196 struct dm_md_mempools { 196 struct dm_md_mempools {
197 mempool_t *io_pool; 197 mempool_t *io_pool;
198 mempool_t *tio_pool; 198 mempool_t *tio_pool;
199 struct bio_set *bs; 199 struct bio_set *bs;
200 }; 200 };
201 201
202 #define MIN_IOS 256 202 #define MIN_IOS 256
203 static struct kmem_cache *_io_cache; 203 static struct kmem_cache *_io_cache;
204 static struct kmem_cache *_tio_cache; 204 static struct kmem_cache *_tio_cache;
205 static struct kmem_cache *_rq_tio_cache; 205 static struct kmem_cache *_rq_tio_cache;
206 static struct kmem_cache *_rq_bio_info_cache; 206 static struct kmem_cache *_rq_bio_info_cache;
207 207
208 static int __init local_init(void) 208 static int __init local_init(void)
209 { 209 {
210 int r = -ENOMEM; 210 int r = -ENOMEM;
211 211
212 /* allocate a slab for the dm_ios */ 212 /* allocate a slab for the dm_ios */
213 _io_cache = KMEM_CACHE(dm_io, 0); 213 _io_cache = KMEM_CACHE(dm_io, 0);
214 if (!_io_cache) 214 if (!_io_cache)
215 return r; 215 return r;
216 216
217 /* allocate a slab for the target ios */ 217 /* allocate a slab for the target ios */
218 _tio_cache = KMEM_CACHE(dm_target_io, 0); 218 _tio_cache = KMEM_CACHE(dm_target_io, 0);
219 if (!_tio_cache) 219 if (!_tio_cache)
220 goto out_free_io_cache; 220 goto out_free_io_cache;
221 221
222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 222 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
223 if (!_rq_tio_cache) 223 if (!_rq_tio_cache)
224 goto out_free_tio_cache; 224 goto out_free_tio_cache;
225 225
226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 226 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
227 if (!_rq_bio_info_cache) 227 if (!_rq_bio_info_cache)
228 goto out_free_rq_tio_cache; 228 goto out_free_rq_tio_cache;
229 229
230 r = dm_uevent_init(); 230 r = dm_uevent_init();
231 if (r) 231 if (r)
232 goto out_free_rq_bio_info_cache; 232 goto out_free_rq_bio_info_cache;
233 233
234 _major = major; 234 _major = major;
235 r = register_blkdev(_major, _name); 235 r = register_blkdev(_major, _name);
236 if (r < 0) 236 if (r < 0)
237 goto out_uevent_exit; 237 goto out_uevent_exit;
238 238
239 if (!_major) 239 if (!_major)
240 _major = r; 240 _major = r;
241 241
242 return 0; 242 return 0;
243 243
244 out_uevent_exit: 244 out_uevent_exit:
245 dm_uevent_exit(); 245 dm_uevent_exit();
246 out_free_rq_bio_info_cache: 246 out_free_rq_bio_info_cache:
247 kmem_cache_destroy(_rq_bio_info_cache); 247 kmem_cache_destroy(_rq_bio_info_cache);
248 out_free_rq_tio_cache: 248 out_free_rq_tio_cache:
249 kmem_cache_destroy(_rq_tio_cache); 249 kmem_cache_destroy(_rq_tio_cache);
250 out_free_tio_cache: 250 out_free_tio_cache:
251 kmem_cache_destroy(_tio_cache); 251 kmem_cache_destroy(_tio_cache);
252 out_free_io_cache: 252 out_free_io_cache:
253 kmem_cache_destroy(_io_cache); 253 kmem_cache_destroy(_io_cache);
254 254
255 return r; 255 return r;
256 } 256 }
257 257
258 static void local_exit(void) 258 static void local_exit(void)
259 { 259 {
260 kmem_cache_destroy(_rq_bio_info_cache); 260 kmem_cache_destroy(_rq_bio_info_cache);
261 kmem_cache_destroy(_rq_tio_cache); 261 kmem_cache_destroy(_rq_tio_cache);
262 kmem_cache_destroy(_tio_cache); 262 kmem_cache_destroy(_tio_cache);
263 kmem_cache_destroy(_io_cache); 263 kmem_cache_destroy(_io_cache);
264 unregister_blkdev(_major, _name); 264 unregister_blkdev(_major, _name);
265 dm_uevent_exit(); 265 dm_uevent_exit();
266 266
267 _major = 0; 267 _major = 0;
268 268
269 DMINFO("cleaned up"); 269 DMINFO("cleaned up");
270 } 270 }
271 271
272 static int (*_inits[])(void) __initdata = { 272 static int (*_inits[])(void) __initdata = {
273 local_init, 273 local_init,
274 dm_target_init, 274 dm_target_init,
275 dm_linear_init, 275 dm_linear_init,
276 dm_stripe_init, 276 dm_stripe_init,
277 dm_io_init, 277 dm_io_init,
278 dm_kcopyd_init, 278 dm_kcopyd_init,
279 dm_interface_init, 279 dm_interface_init,
280 }; 280 };
281 281
282 static void (*_exits[])(void) = { 282 static void (*_exits[])(void) = {
283 local_exit, 283 local_exit,
284 dm_target_exit, 284 dm_target_exit,
285 dm_linear_exit, 285 dm_linear_exit,
286 dm_stripe_exit, 286 dm_stripe_exit,
287 dm_io_exit, 287 dm_io_exit,
288 dm_kcopyd_exit, 288 dm_kcopyd_exit,
289 dm_interface_exit, 289 dm_interface_exit,
290 }; 290 };
291 291
292 static int __init dm_init(void) 292 static int __init dm_init(void)
293 { 293 {
294 const int count = ARRAY_SIZE(_inits); 294 const int count = ARRAY_SIZE(_inits);
295 295
296 int r, i; 296 int r, i;
297 297
298 for (i = 0; i < count; i++) { 298 for (i = 0; i < count; i++) {
299 r = _inits[i](); 299 r = _inits[i]();
300 if (r) 300 if (r)
301 goto bad; 301 goto bad;
302 } 302 }
303 303
304 return 0; 304 return 0;
305 305
306 bad: 306 bad:
307 while (i--) 307 while (i--)
308 _exits[i](); 308 _exits[i]();
309 309
310 return r; 310 return r;
311 } 311 }
312 312
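dm_init() above runs an array of init functions and, on the first failure, unwinds by calling only the already-completed entries' exit functions in reverse order; dm_exit() simply runs the whole exit array. A self-contained sketch of that pattern with made-up subsystems:

#include <stdio.h>

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static int init_a(void) { puts("init a"); return 0; }
static int init_b(void) { puts("init b"); return -1; }	/* simulate a failure */
static void exit_a(void) { puts("exit a"); }
static void exit_b(void) { puts("exit b"); }

static int (*inits[])(void) = { init_a, init_b };
static void (*exits[])(void) = { exit_a, exit_b };

int main(void)
{
	int i, r;

	for (i = 0; i < (int)ARRAY_SIZE(inits); i++) {
		r = inits[i]();
		if (r)
			goto bad;
	}
	return 0;

bad:
	while (i--)		/* unwind only what was initialized before the failure */
		exits[i]();
	return 1;
}

Running it prints "init a", "init b", then only "exit a": exit_b is skipped because init_b never succeeded.
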
313 static void __exit dm_exit(void) 313 static void __exit dm_exit(void)
314 { 314 {
315 int i = ARRAY_SIZE(_exits); 315 int i = ARRAY_SIZE(_exits);
316 316
317 while (i--) 317 while (i--)
318 _exits[i](); 318 _exits[i]();
319 319
320 /* 320 /*
321 * Should be empty by this point. 321 * Should be empty by this point.
322 */ 322 */
323 idr_remove_all(&_minor_idr); 323 idr_remove_all(&_minor_idr);
324 idr_destroy(&_minor_idr); 324 idr_destroy(&_minor_idr);
325 } 325 }
326 326
327 /* 327 /*
328 * Block device functions 328 * Block device functions
329 */ 329 */
330 int dm_deleting_md(struct mapped_device *md) 330 int dm_deleting_md(struct mapped_device *md)
331 { 331 {
332 return test_bit(DMF_DELETING, &md->flags); 332 return test_bit(DMF_DELETING, &md->flags);
333 } 333 }
334 334
335 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 335 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
336 { 336 {
337 struct mapped_device *md; 337 struct mapped_device *md;
338 338
339 spin_lock(&_minor_lock); 339 spin_lock(&_minor_lock);
340 340
341 md = bdev->bd_disk->private_data; 341 md = bdev->bd_disk->private_data;
342 if (!md) 342 if (!md)
343 goto out; 343 goto out;
344 344
345 if (test_bit(DMF_FREEING, &md->flags) || 345 if (test_bit(DMF_FREEING, &md->flags) ||
346 dm_deleting_md(md)) { 346 dm_deleting_md(md)) {
347 md = NULL; 347 md = NULL;
348 goto out; 348 goto out;
349 } 349 }
350 350
351 dm_get(md); 351 dm_get(md);
352 atomic_inc(&md->open_count); 352 atomic_inc(&md->open_count);
353 353
354 out: 354 out:
355 spin_unlock(&_minor_lock); 355 spin_unlock(&_minor_lock);
356 356
357 return md ? 0 : -ENXIO; 357 return md ? 0 : -ENXIO;
358 } 358 }
359 359
360 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 360 static int dm_blk_close(struct gendisk *disk, fmode_t mode)
361 { 361 {
362 struct mapped_device *md = disk->private_data; 362 struct mapped_device *md = disk->private_data;
363 363
364 spin_lock(&_minor_lock); 364 spin_lock(&_minor_lock);
365 365
366 atomic_dec(&md->open_count); 366 atomic_dec(&md->open_count);
367 dm_put(md); 367 dm_put(md);
368 368
369 spin_unlock(&_minor_lock); 369 spin_unlock(&_minor_lock);
370 370
371 return 0; 371 return 0;
372 } 372 }
373 373
374 int dm_open_count(struct mapped_device *md) 374 int dm_open_count(struct mapped_device *md)
375 { 375 {
376 return atomic_read(&md->open_count); 376 return atomic_read(&md->open_count);
377 } 377 }
378 378
379 /* 379 /*
380 * Guarantees nothing is using the device before it's deleted. 380 * Guarantees nothing is using the device before it's deleted.
381 */ 381 */
382 int dm_lock_for_deletion(struct mapped_device *md) 382 int dm_lock_for_deletion(struct mapped_device *md)
383 { 383 {
384 int r = 0; 384 int r = 0;
385 385
386 spin_lock(&_minor_lock); 386 spin_lock(&_minor_lock);
387 387
388 if (dm_open_count(md)) 388 if (dm_open_count(md))
389 r = -EBUSY; 389 r = -EBUSY;
390 else 390 else
391 set_bit(DMF_DELETING, &md->flags); 391 set_bit(DMF_DELETING, &md->flags);
392 392
393 spin_unlock(&_minor_lock); 393 spin_unlock(&_minor_lock);
394 394
395 return r; 395 return r;
396 } 396 }
397 397
398 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 398 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
399 { 399 {
400 struct mapped_device *md = bdev->bd_disk->private_data; 400 struct mapped_device *md = bdev->bd_disk->private_data;
401 401
402 return dm_get_geometry(md, geo); 402 return dm_get_geometry(md, geo);
403 } 403 }
404 404
405 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 405 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
406 unsigned int cmd, unsigned long arg) 406 unsigned int cmd, unsigned long arg)
407 { 407 {
408 struct mapped_device *md = bdev->bd_disk->private_data; 408 struct mapped_device *md = bdev->bd_disk->private_data;
409 struct dm_table *map = dm_get_live_table(md); 409 struct dm_table *map = dm_get_live_table(md);
410 struct dm_target *tgt; 410 struct dm_target *tgt;
411 int r = -ENOTTY; 411 int r = -ENOTTY;
412 412
413 if (!map || !dm_table_get_size(map)) 413 if (!map || !dm_table_get_size(map))
414 goto out; 414 goto out;
415 415
416 /* We only support devices that have a single target */ 416 /* We only support devices that have a single target */
417 if (dm_table_get_num_targets(map) != 1) 417 if (dm_table_get_num_targets(map) != 1)
418 goto out; 418 goto out;
419 419
420 tgt = dm_table_get_target(map, 0); 420 tgt = dm_table_get_target(map, 0);
421 421
422 if (dm_suspended_md(md)) { 422 if (dm_suspended_md(md)) {
423 r = -EAGAIN; 423 r = -EAGAIN;
424 goto out; 424 goto out;
425 } 425 }
426 426
427 if (tgt->type->ioctl) 427 if (tgt->type->ioctl)
428 r = tgt->type->ioctl(tgt, cmd, arg); 428 r = tgt->type->ioctl(tgt, cmd, arg);
429 429
430 out: 430 out:
431 dm_table_put(map); 431 dm_table_put(map);
432 432
433 return r; 433 return r;
434 } 434 }
435 435
436 static struct dm_io *alloc_io(struct mapped_device *md) 436 static struct dm_io *alloc_io(struct mapped_device *md)
437 { 437 {
438 return mempool_alloc(md->io_pool, GFP_NOIO); 438 return mempool_alloc(md->io_pool, GFP_NOIO);
439 } 439 }
440 440
441 static void free_io(struct mapped_device *md, struct dm_io *io) 441 static void free_io(struct mapped_device *md, struct dm_io *io)
442 { 442 {
443 mempool_free(io, md->io_pool); 443 mempool_free(io, md->io_pool);
444 } 444 }
445 445
446 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 446 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
447 { 447 {
448 mempool_free(tio, md->tio_pool); 448 mempool_free(tio, md->tio_pool);
449 } 449 }
450 450
451 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 451 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
452 gfp_t gfp_mask) 452 gfp_t gfp_mask)
453 { 453 {
454 return mempool_alloc(md->tio_pool, gfp_mask); 454 return mempool_alloc(md->tio_pool, gfp_mask);
455 } 455 }
456 456
457 static void free_rq_tio(struct dm_rq_target_io *tio) 457 static void free_rq_tio(struct dm_rq_target_io *tio)
458 { 458 {
459 mempool_free(tio, tio->md->tio_pool); 459 mempool_free(tio, tio->md->tio_pool);
460 } 460 }
461 461
462 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 462 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
463 { 463 {
464 return mempool_alloc(md->io_pool, GFP_ATOMIC); 464 return mempool_alloc(md->io_pool, GFP_ATOMIC);
465 } 465 }
466 466
467 static void free_bio_info(struct dm_rq_clone_bio_info *info) 467 static void free_bio_info(struct dm_rq_clone_bio_info *info)
468 { 468 {
469 mempool_free(info, info->tio->md->io_pool); 469 mempool_free(info, info->tio->md->io_pool);
470 } 470 }
471 471
472 static int md_in_flight(struct mapped_device *md) 472 static int md_in_flight(struct mapped_device *md)
473 { 473 {
474 return atomic_read(&md->pending[READ]) + 474 return atomic_read(&md->pending[READ]) +
475 atomic_read(&md->pending[WRITE]); 475 atomic_read(&md->pending[WRITE]);
476 } 476 }
477 477
478 static void start_io_acct(struct dm_io *io) 478 static void start_io_acct(struct dm_io *io)
479 { 479 {
480 struct mapped_device *md = io->md; 480 struct mapped_device *md = io->md;
481 int cpu; 481 int cpu;
482 int rw = bio_data_dir(io->bio); 482 int rw = bio_data_dir(io->bio);
483 483
484 io->start_time = jiffies; 484 io->start_time = jiffies;
485 485
486 cpu = part_stat_lock(); 486 cpu = part_stat_lock();
487 part_round_stats(cpu, &dm_disk(md)->part0); 487 part_round_stats(cpu, &dm_disk(md)->part0);
488 part_stat_unlock(); 488 part_stat_unlock();
489 atomic_set(&dm_disk(md)->part0.in_flight[rw], 489 atomic_set(&dm_disk(md)->part0.in_flight[rw],
490 atomic_inc_return(&md->pending[rw])); 490 atomic_inc_return(&md->pending[rw]));
491 } 491 }
492 492
493 static void end_io_acct(struct dm_io *io) 493 static void end_io_acct(struct dm_io *io)
494 { 494 {
495 struct mapped_device *md = io->md; 495 struct mapped_device *md = io->md;
496 struct bio *bio = io->bio; 496 struct bio *bio = io->bio;
497 unsigned long duration = jiffies - io->start_time; 497 unsigned long duration = jiffies - io->start_time;
498 int pending, cpu; 498 int pending, cpu;
499 int rw = bio_data_dir(bio); 499 int rw = bio_data_dir(bio);
500 500
501 cpu = part_stat_lock(); 501 cpu = part_stat_lock();
502 part_round_stats(cpu, &dm_disk(md)->part0); 502 part_round_stats(cpu, &dm_disk(md)->part0);
503 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 503 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
504 part_stat_unlock(); 504 part_stat_unlock();
505 505
506 /* 506 /*
507 * After this is decremented the bio must not be touched if it is 507 * After this is decremented the bio must not be touched if it is
508 * a flush. 508 * a flush.
509 */ 509 */
510 pending = atomic_dec_return(&md->pending[rw]); 510 pending = atomic_dec_return(&md->pending[rw]);
511 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending); 511 atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
512 pending += atomic_read(&md->pending[rw^0x1]); 512 pending += atomic_read(&md->pending[rw^0x1]);
513 513
514 /* nudge anyone waiting on suspend queue */ 514 /* nudge anyone waiting on suspend queue */
515 if (!pending) 515 if (!pending)
516 wake_up(&md->wait); 516 wake_up(&md->wait);
517 } 517 }
518 518
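start_io_acct() and end_io_acct() above keep a per-direction count of bios in flight and wake the suspend waiter once both directions have drained. The sketch below models only that bookkeeping with plain counters; the real code uses atomics and per-CPU disk statistics, so this is not safe for concurrent use.

#include <stdio.h>
#include <stdbool.h>

enum { TOY_READ = 0, TOY_WRITE = 1 };

static int pending[2];		/* bios in flight per direction */

static void toy_start_io(int rw)
{
	pending[rw]++;
}

/* Returns true when the last outstanding bio finished and a suspend waiter may proceed. */
static bool toy_end_io(int rw)
{
	pending[rw]--;
	return pending[TOY_READ] + pending[TOY_WRITE] == 0;
}

int main(void)
{
	toy_start_io(TOY_WRITE);
	toy_start_io(TOY_READ);

	printf("%d\n", toy_end_io(TOY_WRITE));	/* 0: a read is still in flight */
	printf("%d\n", toy_end_io(TOY_READ));	/* 1: wake anyone waiting on suspend */
	return 0;
}
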
519 /* 519 /*
520 * Add the bio to the list of deferred io. 520 * Add the bio to the list of deferred io.
521 */ 521 */
522 static void queue_io(struct mapped_device *md, struct bio *bio) 522 static void queue_io(struct mapped_device *md, struct bio *bio)
523 { 523 {
524 unsigned long flags; 524 unsigned long flags;
525 525
526 spin_lock_irqsave(&md->deferred_lock, flags); 526 spin_lock_irqsave(&md->deferred_lock, flags);
527 bio_list_add(&md->deferred, bio); 527 bio_list_add(&md->deferred, bio);
528 spin_unlock_irqrestore(&md->deferred_lock, flags); 528 spin_unlock_irqrestore(&md->deferred_lock, flags);
529 queue_work(md->wq, &md->work); 529 queue_work(md->wq, &md->work);
530 } 530 }
531 531
532 /* 532 /*
533 * Everyone (including functions in this file), should use this 533 * Everyone (including functions in this file), should use this
534 * function to access the md->map field, and make sure they call 534 * function to access the md->map field, and make sure they call
535 * dm_table_put() when finished. 535 * dm_table_put() when finished.
536 */ 536 */
537 struct dm_table *dm_get_live_table(struct mapped_device *md) 537 struct dm_table *dm_get_live_table(struct mapped_device *md)
538 { 538 {
539 struct dm_table *t; 539 struct dm_table *t;
540 unsigned long flags; 540 unsigned long flags;
541 541
542 read_lock_irqsave(&md->map_lock, flags); 542 read_lock_irqsave(&md->map_lock, flags);
543 t = md->map; 543 t = md->map;
544 if (t) 544 if (t)
545 dm_table_get(t); 545 dm_table_get(t);
546 read_unlock_irqrestore(&md->map_lock, flags); 546 read_unlock_irqrestore(&md->map_lock, flags);
547 547
548 return t; 548 return t;
549 } 549 }
550 550
551 /* 551 /*
552 * Get the geometry associated with a dm device 552 * Get the geometry associated with a dm device
553 */ 553 */
554 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 554 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
555 { 555 {
556 *geo = md->geometry; 556 *geo = md->geometry;
557 557
558 return 0; 558 return 0;
559 } 559 }
560 560
561 /* 561 /*
562 * Set the geometry of a device. 562 * Set the geometry of a device.
563 */ 563 */
564 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 564 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
565 { 565 {
566 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 566 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
567 567
568 if (geo->start > sz) { 568 if (geo->start > sz) {
569 DMWARN("Start sector is beyond the geometry limits."); 569 DMWARN("Start sector is beyond the geometry limits.");
570 return -EINVAL; 570 return -EINVAL;
571 } 571 }
572 572
573 md->geometry = *geo; 573 md->geometry = *geo;
574 574
575 return 0; 575 return 0;
576 } 576 }
577 577
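dm_set_geometry() above derives the capacity implied by the geometry (cylinders * heads * sectors) and rejects a start sector beyond it. A tiny user-space version of the same check; the toy_geometry struct is illustrative, the kernel code uses struct hd_geometry.

#include <stdio.h>

typedef unsigned long long sector_t;

struct toy_geometry {
	unsigned cylinders, heads, sectors;
	sector_t start;
};

static int check_geometry(const struct toy_geometry *geo)
{
	sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;

	if (geo->start > sz) {
		fprintf(stderr, "start sector is beyond the geometry limits\n");
		return -1;
	}
	return 0;
}

int main(void)
{
	struct toy_geometry ok  = { .cylinders = 1024, .heads = 255, .sectors = 63, .start = 0 };
	struct toy_geometry bad = { .cylinders = 1, .heads = 1, .sectors = 1, .start = 100 };

	printf("%d %d\n", check_geometry(&ok), check_geometry(&bad));	/* 0 -1 */
	return 0;
}
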
578 /*----------------------------------------------------------------- 578 /*-----------------------------------------------------------------
579 * CRUD START: 579 * CRUD START:
580 * A more elegant soln is in the works that uses the queue 580 * A more elegant soln is in the works that uses the queue
581 * merge fn, unfortunately there are a couple of changes to 581 * merge fn, unfortunately there are a couple of changes to
582 * the block layer that I want to make for this. So in the 582 * the block layer that I want to make for this. So in the
583 * interests of getting something for people to use I give 583 * interests of getting something for people to use I give
584 * you this clearly demarcated crap. 584 * you this clearly demarcated crap.
585 *---------------------------------------------------------------*/ 585 *---------------------------------------------------------------*/
586 586
587 static int __noflush_suspending(struct mapped_device *md) 587 static int __noflush_suspending(struct mapped_device *md)
588 { 588 {
589 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 589 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
590 } 590 }
591 591
592 /* 592 /*
593 * Decrements the number of outstanding ios that a bio has been 593 * Decrements the number of outstanding ios that a bio has been
594 * cloned into, completing the original io if necessary. 594 * cloned into, completing the original io if necessary.
595 */ 595 */
596 static void dec_pending(struct dm_io *io, int error) 596 static void dec_pending(struct dm_io *io, int error)
597 { 597 {
598 unsigned long flags; 598 unsigned long flags;
599 int io_error; 599 int io_error;
600 struct bio *bio; 600 struct bio *bio;
601 struct mapped_device *md = io->md; 601 struct mapped_device *md = io->md;
602 602
603 /* Push-back supersedes any I/O errors */ 603 /* Push-back supersedes any I/O errors */
604 if (unlikely(error)) { 604 if (unlikely(error)) {
605 spin_lock_irqsave(&io->endio_lock, flags); 605 spin_lock_irqsave(&io->endio_lock, flags);
606 if (!(io->error > 0 && __noflush_suspending(md))) 606 if (!(io->error > 0 && __noflush_suspending(md)))
607 io->error = error; 607 io->error = error;
608 spin_unlock_irqrestore(&io->endio_lock, flags); 608 spin_unlock_irqrestore(&io->endio_lock, flags);
609 } 609 }
610 610
611 if (atomic_dec_and_test(&io->io_count)) { 611 if (atomic_dec_and_test(&io->io_count)) {
612 if (io->error == DM_ENDIO_REQUEUE) { 612 if (io->error == DM_ENDIO_REQUEUE) {
613 /* 613 /*
614 * Target requested pushing back the I/O. 614 * Target requested pushing back the I/O.
615 */ 615 */
616 spin_lock_irqsave(&md->deferred_lock, flags); 616 spin_lock_irqsave(&md->deferred_lock, flags);
617 if (__noflush_suspending(md)) 617 if (__noflush_suspending(md))
618 bio_list_add_head(&md->deferred, io->bio); 618 bio_list_add_head(&md->deferred, io->bio);
619 else 619 else
620 /* noflush suspend was interrupted. */ 620 /* noflush suspend was interrupted. */
621 io->error = -EIO; 621 io->error = -EIO;
622 spin_unlock_irqrestore(&md->deferred_lock, flags); 622 spin_unlock_irqrestore(&md->deferred_lock, flags);
623 } 623 }
624 624
625 io_error = io->error; 625 io_error = io->error;
626 bio = io->bio; 626 bio = io->bio;
627 end_io_acct(io); 627 end_io_acct(io);
628 free_io(md, io); 628 free_io(md, io);
629 629
630 if (io_error == DM_ENDIO_REQUEUE) 630 if (io_error == DM_ENDIO_REQUEUE)
631 return; 631 return;
632 632
633 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) { 633 if ((bio->bi_rw & REQ_FLUSH) && bio->bi_size) {
634 /* 634 /*
635 * Preflush done for flush with data, reissue 635 * Preflush done for flush with data, reissue
636 * without REQ_FLUSH. 636 * without REQ_FLUSH.
637 */ 637 */
638 bio->bi_rw &= ~REQ_FLUSH; 638 bio->bi_rw &= ~REQ_FLUSH;
639 queue_io(md, bio); 639 queue_io(md, bio);
640 } else { 640 } else {
641 /* done with normal IO or empty flush */ 641 /* done with normal IO or empty flush */
642 trace_block_bio_complete(md->queue, bio, io_error); 642 trace_block_bio_complete(md->queue, bio, io_error);
643 bio_endio(bio, io_error); 643 bio_endio(bio, io_error);
644 } 644 }
645 } 645 }
646 } 646 }
647 647
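dec_pending() above handles a flush that carries data in two passes: the bio first completes its preflush phase with REQ_FLUSH still set, then is requeued with REQ_FLUSH cleared so only the data portion is reissued; an empty flush or plain I/O completes immediately. A small model of that decision with a toy bio and flag (not the kernel bio API):

#include <stdio.h>
#include <stdbool.h>

#define TOY_REQ_FLUSH (1u << 0)	/* illustrative stand-in for REQ_FLUSH */

struct toy_bio {
	unsigned rw;		/* request flags */
	unsigned size;		/* payload size in bytes; 0 means an empty flush */
};

/* Returns true when the bio must be requeued for its data pass. */
static bool needs_data_pass(struct toy_bio *bio)
{
	if ((bio->rw & TOY_REQ_FLUSH) && bio->size) {
		bio->rw &= ~TOY_REQ_FLUSH;	/* preflush done, reissue for the data */
		return true;
	}
	return false;				/* normal I/O or empty flush: complete it */
}

int main(void)
{
	struct toy_bio flush_with_data = { .rw = TOY_REQ_FLUSH, .size = 4096 };

	while (needs_data_pass(&flush_with_data))
		puts("requeue without FLUSH");
	puts("complete");
	return 0;
}
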
648 static void clone_endio(struct bio *bio, int error) 648 static void clone_endio(struct bio *bio, int error)
649 { 649 {
650 int r = 0; 650 int r = 0;
651 struct dm_target_io *tio = bio->bi_private; 651 struct dm_target_io *tio = bio->bi_private;
652 struct dm_io *io = tio->io; 652 struct dm_io *io = tio->io;
653 struct mapped_device *md = tio->io->md; 653 struct mapped_device *md = tio->io->md;
654 dm_endio_fn endio = tio->ti->type->end_io; 654 dm_endio_fn endio = tio->ti->type->end_io;
655 655
656 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 656 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
657 error = -EIO; 657 error = -EIO;
658 658
659 if (endio) { 659 if (endio) {
660 r = endio(tio->ti, bio, error, &tio->info); 660 r = endio(tio->ti, bio, error, &tio->info);
661 if (r < 0 || r == DM_ENDIO_REQUEUE) 661 if (r < 0 || r == DM_ENDIO_REQUEUE)
662 /* 662 /*
663 * error and requeue request are handled 663 * error and requeue request are handled
664 * in dec_pending(). 664 * in dec_pending().
665 */ 665 */
666 error = r; 666 error = r;
667 else if (r == DM_ENDIO_INCOMPLETE) 667 else if (r == DM_ENDIO_INCOMPLETE)
668 /* The target will handle the io */ 668 /* The target will handle the io */
669 return; 669 return;
670 else if (r) { 670 else if (r) {
671 DMWARN("unimplemented target endio return value: %d", r); 671 DMWARN("unimplemented target endio return value: %d", r);
672 BUG(); 672 BUG();
673 } 673 }
674 } 674 }
675 675
676 /* 676 /*
677 * Store md for cleanup instead of tio which is about to get freed. 677 * Store md for cleanup instead of tio which is about to get freed.
678 */ 678 */
679 bio->bi_private = md->bs; 679 bio->bi_private = md->bs;
680 680
681 free_tio(md, tio); 681 free_tio(md, tio);
682 bio_put(bio); 682 bio_put(bio);
683 dec_pending(io, error); 683 dec_pending(io, error);
684 } 684 }
685 685
686 /* 686 /*
687 * Partial completion handling for request-based dm 687 * Partial completion handling for request-based dm
688 */ 688 */
689 static void end_clone_bio(struct bio *clone, int error) 689 static void end_clone_bio(struct bio *clone, int error)
690 { 690 {
691 struct dm_rq_clone_bio_info *info = clone->bi_private; 691 struct dm_rq_clone_bio_info *info = clone->bi_private;
692 struct dm_rq_target_io *tio = info->tio; 692 struct dm_rq_target_io *tio = info->tio;
693 struct bio *bio = info->orig; 693 struct bio *bio = info->orig;
694 unsigned int nr_bytes = info->orig->bi_size; 694 unsigned int nr_bytes = info->orig->bi_size;
695 695
696 bio_put(clone); 696 bio_put(clone);
697 697
698 if (tio->error) 698 if (tio->error)
699 /* 699 /*
700 * An error has already been detected on the request. 700 * An error has already been detected on the request.
701 * Once error occurred, just let clone->end_io() handle 701 * Once error occurred, just let clone->end_io() handle
702 * the remainder. 702 * the remainder.
703 */ 703 */
704 return; 704 return;
705 else if (error) { 705 else if (error) {
706 /* 706 /*
707 * Don't report the error to the upper layer yet. 707 * Don't report the error to the upper layer yet.
708 * The error handling decision is made by the target driver, 708 * The error handling decision is made by the target driver,
709 * when the request is completed. 709 * when the request is completed.
710 */ 710 */
711 tio->error = error; 711 tio->error = error;
712 return; 712 return;
713 } 713 }
714 714
715 /* 715 /*
716 * I/O for the bio successfully completed. 716 * I/O for the bio successfully completed.
717 * Report the data completion to the upper layer. 717 * Report the data completion to the upper layer.
718 */ 718 */
719 719
720 /* 720 /*
721 * bios are processed from the head of the list. 721 * bios are processed from the head of the list.
722 * So the completing bio should always be rq->bio. 722 * So the completing bio should always be rq->bio.
723 * If it's not, something is going wrong. 723 * If it's not, something is going wrong.
724 */ 724 */
725 if (tio->orig->bio != bio) 725 if (tio->orig->bio != bio)
726 DMERR("bio completion is going in the middle of the request"); 726 DMERR("bio completion is going in the middle of the request");
727 727
728 /* 728 /*
729 * Update the original request. 729 * Update the original request.
730 * Do not use blk_end_request() here, because it may complete 730 * Do not use blk_end_request() here, because it may complete
731 * the original request before the clone, and break the ordering. 731 * the original request before the clone, and break the ordering.
732 */ 732 */
733 blk_update_request(tio->orig, 0, nr_bytes); 733 blk_update_request(tio->orig, 0, nr_bytes);
734 } 734 }
735 735
736 /* 736 /*
737 * Don't touch any member of the md after calling this function because 737 * Don't touch any member of the md after calling this function because
738 * the md may be freed in dm_put() at the end of this function. 738 * the md may be freed in dm_put() at the end of this function.
739 * Or do dm_get() before calling this function and dm_put() later. 739 * Or do dm_get() before calling this function and dm_put() later.
740 */ 740 */
741 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 741 static void rq_completed(struct mapped_device *md, int rw, int run_queue)
742 { 742 {
743 atomic_dec(&md->pending[rw]); 743 atomic_dec(&md->pending[rw]);
744 744
745 /* nudge anyone waiting on suspend queue */ 745 /* nudge anyone waiting on suspend queue */
746 if (!md_in_flight(md)) 746 if (!md_in_flight(md))
747 wake_up(&md->wait); 747 wake_up(&md->wait);
748 748
749 if (run_queue) 749 if (run_queue)
750 blk_run_queue(md->queue); 750 blk_run_queue(md->queue);
751 751
752 /* 752 /*
753 * dm_put() must be at the end of this function. See the comment above 753 * dm_put() must be at the end of this function. See the comment above
754 */ 754 */
755 dm_put(md); 755 dm_put(md);
756 } 756 }
757 757
758 static void free_rq_clone(struct request *clone) 758 static void free_rq_clone(struct request *clone)
759 { 759 {
760 struct dm_rq_target_io *tio = clone->end_io_data; 760 struct dm_rq_target_io *tio = clone->end_io_data;
761 761
762 blk_rq_unprep_clone(clone); 762 blk_rq_unprep_clone(clone);
763 free_rq_tio(tio); 763 free_rq_tio(tio);
764 } 764 }
765 765
766 /* 766 /*
767 * Complete the clone and the original request. 767 * Complete the clone and the original request.
768 * Must be called without queue lock. 768 * Must be called without queue lock.
769 */ 769 */
770 static void dm_end_request(struct request *clone, int error) 770 static void dm_end_request(struct request *clone, int error)
771 { 771 {
772 int rw = rq_data_dir(clone); 772 int rw = rq_data_dir(clone);
773 struct dm_rq_target_io *tio = clone->end_io_data; 773 struct dm_rq_target_io *tio = clone->end_io_data;
774 struct mapped_device *md = tio->md; 774 struct mapped_device *md = tio->md;
775 struct request *rq = tio->orig; 775 struct request *rq = tio->orig;
776 776
777 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 777 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
778 rq->errors = clone->errors; 778 rq->errors = clone->errors;
779 rq->resid_len = clone->resid_len; 779 rq->resid_len = clone->resid_len;
780 780
781 if (rq->sense) 781 if (rq->sense)
782 /* 782 /*
783 * We are using the sense buffer of the original 783 * We are using the sense buffer of the original
784 * request. 784 * request.
785 * So setting the length of the sense data is enough. 785 * So setting the length of the sense data is enough.
786 */ 786 */
787 rq->sense_len = clone->sense_len; 787 rq->sense_len = clone->sense_len;
788 } 788 }
789 789
790 free_rq_clone(clone); 790 free_rq_clone(clone);
791 blk_end_request_all(rq, error); 791 blk_end_request_all(rq, error);
792 rq_completed(md, rw, true); 792 rq_completed(md, rw, true);
793 } 793 }
794 794
795 static void dm_unprep_request(struct request *rq) 795 static void dm_unprep_request(struct request *rq)
796 { 796 {
797 struct request *clone = rq->special; 797 struct request *clone = rq->special;
798 798
799 rq->special = NULL; 799 rq->special = NULL;
800 rq->cmd_flags &= ~REQ_DONTPREP; 800 rq->cmd_flags &= ~REQ_DONTPREP;
801 801
802 free_rq_clone(clone); 802 free_rq_clone(clone);
803 } 803 }
804 804
805 /* 805 /*
806 * Requeue the original request of a clone. 806 * Requeue the original request of a clone.
807 */ 807 */
808 void dm_requeue_unmapped_request(struct request *clone) 808 void dm_requeue_unmapped_request(struct request *clone)
809 { 809 {
810 int rw = rq_data_dir(clone); 810 int rw = rq_data_dir(clone);
811 struct dm_rq_target_io *tio = clone->end_io_data; 811 struct dm_rq_target_io *tio = clone->end_io_data;
812 struct mapped_device *md = tio->md; 812 struct mapped_device *md = tio->md;
813 struct request *rq = tio->orig; 813 struct request *rq = tio->orig;
814 struct request_queue *q = rq->q; 814 struct request_queue *q = rq->q;
815 unsigned long flags; 815 unsigned long flags;
816 816
817 dm_unprep_request(rq); 817 dm_unprep_request(rq);
818 818
819 spin_lock_irqsave(q->queue_lock, flags); 819 spin_lock_irqsave(q->queue_lock, flags);
820 blk_requeue_request(q, rq); 820 blk_requeue_request(q, rq);
821 spin_unlock_irqrestore(q->queue_lock, flags); 821 spin_unlock_irqrestore(q->queue_lock, flags);
822 822
823 rq_completed(md, rw, 0); 823 rq_completed(md, rw, 0);
824 } 824 }
825 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 825 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
826 826
827 static void __stop_queue(struct request_queue *q) 827 static void __stop_queue(struct request_queue *q)
828 { 828 {
829 blk_stop_queue(q); 829 blk_stop_queue(q);
830 } 830 }
831 831
832 static void stop_queue(struct request_queue *q) 832 static void stop_queue(struct request_queue *q)
833 { 833 {
834 unsigned long flags; 834 unsigned long flags;
835 835
836 spin_lock_irqsave(q->queue_lock, flags); 836 spin_lock_irqsave(q->queue_lock, flags);
837 __stop_queue(q); 837 __stop_queue(q);
838 spin_unlock_irqrestore(q->queue_lock, flags); 838 spin_unlock_irqrestore(q->queue_lock, flags);
839 } 839 }
840 840
841 static void __start_queue(struct request_queue *q) 841 static void __start_queue(struct request_queue *q)
842 { 842 {
843 if (blk_queue_stopped(q)) 843 if (blk_queue_stopped(q))
844 blk_start_queue(q); 844 blk_start_queue(q);
845 } 845 }
846 846
847 static void start_queue(struct request_queue *q) 847 static void start_queue(struct request_queue *q)
848 { 848 {
849 unsigned long flags; 849 unsigned long flags;
850 850
851 spin_lock_irqsave(q->queue_lock, flags); 851 spin_lock_irqsave(q->queue_lock, flags);
852 __start_queue(q); 852 __start_queue(q);
853 spin_unlock_irqrestore(q->queue_lock, flags); 853 spin_unlock_irqrestore(q->queue_lock, flags);
854 } 854 }
855 855
856 static void dm_done(struct request *clone, int error, bool mapped) 856 static void dm_done(struct request *clone, int error, bool mapped)
857 { 857 {
858 int r = error; 858 int r = error;
859 struct dm_rq_target_io *tio = clone->end_io_data; 859 struct dm_rq_target_io *tio = clone->end_io_data;
860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 860 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
861 861
862 if (mapped && rq_end_io) 862 if (mapped && rq_end_io)
863 r = rq_end_io(tio->ti, clone, error, &tio->info); 863 r = rq_end_io(tio->ti, clone, error, &tio->info);
864 864
865 if (r <= 0) 865 if (r <= 0)
866 /* The target wants to complete the I/O */ 866 /* The target wants to complete the I/O */
867 dm_end_request(clone, r); 867 dm_end_request(clone, r);
868 else if (r == DM_ENDIO_INCOMPLETE) 868 else if (r == DM_ENDIO_INCOMPLETE)
869 /* The target will handle the I/O */ 869 /* The target will handle the I/O */
870 return; 870 return;
871 else if (r == DM_ENDIO_REQUEUE) 871 else if (r == DM_ENDIO_REQUEUE)
872 /* The target wants to requeue the I/O */ 872 /* The target wants to requeue the I/O */
873 dm_requeue_unmapped_request(clone); 873 dm_requeue_unmapped_request(clone);
874 else { 874 else {
875 DMWARN("unimplemented target endio return value: %d", r); 875 DMWARN("unimplemented target endio return value: %d", r);
876 BUG(); 876 BUG();
877 } 877 }
878 } 878 }
879 879
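For illustration only, a request-based target's rq_end_io hook could look like the hypothetical fragment below (the name example_rq_end_io and the -EBUSY policy are invented for this sketch, not taken from any real target): returning the error unchanged (<= 0) lets dm_done() finish the I/O via dm_end_request(), while DM_ENDIO_REQUEUE asks dm to requeue the original request.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical end_io hook for a request-based target: requeue transient
 * -EBUSY failures, let everything else complete with its error code. */
static int example_rq_end_io(struct dm_target *ti, struct request *clone,
			     int error, union map_info *map_context)
{
	if (error == -EBUSY)
		return DM_ENDIO_REQUEUE;	/* dm_done() -> dm_requeue_unmapped_request() */

	return error;				/* <= 0: dm_done() -> dm_end_request() */
}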
880 /* 880 /*
881 * Request completion handler for request-based dm 881 * Request completion handler for request-based dm
882 */ 882 */
883 static void dm_softirq_done(struct request *rq) 883 static void dm_softirq_done(struct request *rq)
884 { 884 {
885 bool mapped = true; 885 bool mapped = true;
886 struct request *clone = rq->completion_data; 886 struct request *clone = rq->completion_data;
887 struct dm_rq_target_io *tio = clone->end_io_data; 887 struct dm_rq_target_io *tio = clone->end_io_data;
888 888
889 if (rq->cmd_flags & REQ_FAILED) 889 if (rq->cmd_flags & REQ_FAILED)
890 mapped = false; 890 mapped = false;
891 891
892 dm_done(clone, tio->error, mapped); 892 dm_done(clone, tio->error, mapped);
893 } 893 }
894 894
895 /* 895 /*
896 * Complete the clone and the original request with the error status 896 * Complete the clone and the original request with the error status
897 * through softirq context. 897 * through softirq context.
898 */ 898 */
899 static void dm_complete_request(struct request *clone, int error) 899 static void dm_complete_request(struct request *clone, int error)
900 { 900 {
901 struct dm_rq_target_io *tio = clone->end_io_data; 901 struct dm_rq_target_io *tio = clone->end_io_data;
902 struct request *rq = tio->orig; 902 struct request *rq = tio->orig;
903 903
904 tio->error = error; 904 tio->error = error;
905 rq->completion_data = clone; 905 rq->completion_data = clone;
906 blk_complete_request(rq); 906 blk_complete_request(rq);
907 } 907 }
908 908
909 /* 909 /*
910 * Complete the not-mapped clone and the original request with the error status 910 * Complete the not-mapped clone and the original request with the error status
911 * through softirq context. 911 * through softirq context.
912 * Target's rq_end_io() function isn't called. 912 * Target's rq_end_io() function isn't called.
913 * This may be used when the target's map_rq() function fails. 913 * This may be used when the target's map_rq() function fails.
914 */ 914 */
915 void dm_kill_unmapped_request(struct request *clone, int error) 915 void dm_kill_unmapped_request(struct request *clone, int error)
916 { 916 {
917 struct dm_rq_target_io *tio = clone->end_io_data; 917 struct dm_rq_target_io *tio = clone->end_io_data;
918 struct request *rq = tio->orig; 918 struct request *rq = tio->orig;
919 919
920 rq->cmd_flags |= REQ_FAILED; 920 rq->cmd_flags |= REQ_FAILED;
921 dm_complete_request(clone, error); 921 dm_complete_request(clone, error);
922 } 922 }
923 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 923 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
924 924
925 /* 925 /*
926 * Called with the queue lock held 926 * Called with the queue lock held
927 */ 927 */
928 static void end_clone_request(struct request *clone, int error) 928 static void end_clone_request(struct request *clone, int error)
929 { 929 {
930 /* 930 /*
931 * This just cleans up the bookkeeping of the queue in which 931 * This just cleans up the bookkeeping of the queue in which
932 * the clone was dispatched. 932 * the clone was dispatched.
933 * The clone is *NOT* actually freed here because it was allocated from 933 * The clone is *NOT* actually freed here because it was allocated from
934 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 934 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
935 */ 935 */
936 __blk_put_request(clone->q, clone); 936 __blk_put_request(clone->q, clone);
937 937
938 /* 938 /*
939 * Actual request completion is done in a softirq context which doesn't 939 * Actual request completion is done in a softirq context which doesn't
940 * hold the queue lock. Otherwise, deadlock could occur because: 940 * hold the queue lock. Otherwise, deadlock could occur because:
941 * - another request may be submitted by the upper level 941 * - another request may be submitted by the upper level
942 * stacking driver during the completion 942 * stacking driver during the completion
943 * - that submission, which requires the queue lock, may be 943 * - that submission, which requires the queue lock, may be
944 * made against this very queue 944 * made against this very queue
945 */ 945 */
946 dm_complete_request(clone, error); 946 dm_complete_request(clone, error);
947 } 947 }
948 948
949 /* 949 /*
950 * Return maximum size of I/O possible at the supplied sector up to the current 950 * Return maximum size of I/O possible at the supplied sector up to the current
951 * target boundary. 951 * target boundary.
952 */ 952 */
953 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 953 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
954 { 954 {
955 sector_t target_offset = dm_target_offset(ti, sector); 955 sector_t target_offset = dm_target_offset(ti, sector);
956 956
957 return ti->len - target_offset; 957 return ti->len - target_offset;
958 } 958 }
959 959
960 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 960 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
961 { 961 {
962 sector_t len = max_io_len_target_boundary(sector, ti); 962 sector_t len = max_io_len_target_boundary(sector, ti);
963 963
964 /* 964 /*
965 * Does the target need to split even further? 965 * Does the target need to split even further?
966 */ 966 */
967 if (ti->split_io) { 967 if (ti->split_io) {
968 sector_t boundary; 968 sector_t boundary;
969 sector_t offset = dm_target_offset(ti, sector); 969 sector_t offset = dm_target_offset(ti, sector);
970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 970 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
971 - offset; 971 - offset;
972 if (len > boundary) 972 if (len > boundary)
973 len = boundary; 973 len = boundary;
974 } 974 }
975 975
976 return len; 976 return len;
977 } 977 }
978 978
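The split_io rounding in max_io_len() above is easier to follow with concrete numbers. The snippet below is a small standalone sketch (plain userspace C, with made-up values; not part of this commit) of the same arithmetic: an I/O starting at target offset 5 with split_io = 8 gets clipped to 3 sectors so it ends exactly on the next 8-sector boundary.

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;

/* Mirrors the boundary arithmetic in max_io_len(): round the offset up to
 * the next multiple of split_io (a power of two) and return the distance. */
static sector_t sectors_to_boundary(sector_t offset, sector_t split_io)
{
	return ((offset + split_io) & ~(split_io - 1)) - offset;
}

int main(void)
{
	sector_t offset = 5, split_io = 8, len = 100;
	sector_t boundary = sectors_to_boundary(offset, split_io);

	if (len > boundary)
		len = boundary;

	/* Prints 3: the 100-sector I/O at offset 5 is clipped so it ends
	 * exactly on the 8-sector boundary. */
	printf("%llu\n", (unsigned long long)len);
	return 0;
}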
979 static void __map_bio(struct dm_target *ti, struct bio *clone, 979 static void __map_bio(struct dm_target *ti, struct bio *clone,
980 struct dm_target_io *tio) 980 struct dm_target_io *tio)
981 { 981 {
982 int r; 982 int r;
983 sector_t sector; 983 sector_t sector;
984 struct mapped_device *md; 984 struct mapped_device *md;
985 985
986 clone->bi_end_io = clone_endio; 986 clone->bi_end_io = clone_endio;
987 clone->bi_private = tio; 987 clone->bi_private = tio;
988 988
989 /* 989 /*
990 * Map the clone. If r == 0 we don't need to do 990 * Map the clone. If r == 0 we don't need to do
991 * anything, the target has assumed ownership of 991 * anything, the target has assumed ownership of
992 * this io. 992 * this io.
993 */ 993 */
994 atomic_inc(&tio->io->io_count); 994 atomic_inc(&tio->io->io_count);
995 sector = clone->bi_sector; 995 sector = clone->bi_sector;
996 r = ti->type->map(ti, clone, &tio->info); 996 r = ti->type->map(ti, clone, &tio->info);
997 if (r == DM_MAPIO_REMAPPED) { 997 if (r == DM_MAPIO_REMAPPED) {
998 /* the bio has been remapped so dispatch it */ 998 /* the bio has been remapped so dispatch it */
999 999
1000 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone, 1000 trace_block_bio_remap(bdev_get_queue(clone->bi_bdev), clone,
1001 tio->io->bio->bi_bdev->bd_dev, sector); 1001 tio->io->bio->bi_bdev->bd_dev, sector);
1002 1002
1003 generic_make_request(clone); 1003 generic_make_request(clone);
1004 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1004 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1005 /* error the io and bail out, or requeue it if needed */ 1005 /* error the io and bail out, or requeue it if needed */
1006 md = tio->io->md; 1006 md = tio->io->md;
1007 dec_pending(tio->io, r); 1007 dec_pending(tio->io, r);
1008 /* 1008 /*
1009 * Store bio_set for cleanup. 1009 * Store bio_set for cleanup.
1010 */ 1010 */
1011 clone->bi_private = md->bs; 1011 clone->bi_private = md->bs;
1012 bio_put(clone); 1012 bio_put(clone);
1013 free_tio(md, tio); 1013 free_tio(md, tio);
1014 } else if (r) { 1014 } else if (r) {
1015 DMWARN("unimplemented target map return value: %d", r); 1015 DMWARN("unimplemented target map return value: %d", r);
1016 BUG(); 1016 BUG();
1017 } 1017 }
1018 } 1018 }
1019 1019
1020 struct clone_info { 1020 struct clone_info {
1021 struct mapped_device *md; 1021 struct mapped_device *md;
1022 struct dm_table *map; 1022 struct dm_table *map;
1023 struct bio *bio; 1023 struct bio *bio;
1024 struct dm_io *io; 1024 struct dm_io *io;
1025 sector_t sector; 1025 sector_t sector;
1026 sector_t sector_count; 1026 sector_t sector_count;
1027 unsigned short idx; 1027 unsigned short idx;
1028 }; 1028 };
1029 1029
1030 static void dm_bio_destructor(struct bio *bio) 1030 static void dm_bio_destructor(struct bio *bio)
1031 { 1031 {
1032 struct bio_set *bs = bio->bi_private; 1032 struct bio_set *bs = bio->bi_private;
1033 1033
1034 bio_free(bio, bs); 1034 bio_free(bio, bs);
1035 } 1035 }
1036 1036
1037 /* 1037 /*
1038 * Creates a little bio that just does part of a bvec. 1038 * Creates a little bio that just does part of a bvec.
1039 */ 1039 */
1040 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1040 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1041 unsigned short idx, unsigned int offset, 1041 unsigned short idx, unsigned int offset,
1042 unsigned int len, struct bio_set *bs) 1042 unsigned int len, struct bio_set *bs)
1043 { 1043 {
1044 struct bio *clone; 1044 struct bio *clone;
1045 struct bio_vec *bv = bio->bi_io_vec + idx; 1045 struct bio_vec *bv = bio->bi_io_vec + idx;
1046 1046
1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1047 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1048 clone->bi_destructor = dm_bio_destructor; 1048 clone->bi_destructor = dm_bio_destructor;
1049 *clone->bi_io_vec = *bv; 1049 *clone->bi_io_vec = *bv;
1050 1050
1051 clone->bi_sector = sector; 1051 clone->bi_sector = sector;
1052 clone->bi_bdev = bio->bi_bdev; 1052 clone->bi_bdev = bio->bi_bdev;
1053 clone->bi_rw = bio->bi_rw; 1053 clone->bi_rw = bio->bi_rw;
1054 clone->bi_vcnt = 1; 1054 clone->bi_vcnt = 1;
1055 clone->bi_size = to_bytes(len); 1055 clone->bi_size = to_bytes(len);
1056 clone->bi_io_vec->bv_offset = offset; 1056 clone->bi_io_vec->bv_offset = offset;
1057 clone->bi_io_vec->bv_len = clone->bi_size; 1057 clone->bi_io_vec->bv_len = clone->bi_size;
1058 clone->bi_flags |= 1 << BIO_CLONED; 1058 clone->bi_flags |= 1 << BIO_CLONED;
1059 1059
1060 if (bio_integrity(bio)) { 1060 if (bio_integrity(bio)) {
1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1061 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1062 bio_integrity_trim(clone, 1062 bio_integrity_trim(clone,
1063 bio_sector_offset(bio, idx, offset), len); 1063 bio_sector_offset(bio, idx, offset), len);
1064 } 1064 }
1065 1065
1066 return clone; 1066 return clone;
1067 } 1067 }
1068 1068
1069 /* 1069 /*
1070 * Creates a bio that consists of a range of complete bvecs. 1070 * Creates a bio that consists of a range of complete bvecs.
1071 */ 1071 */
1072 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1072 static struct bio *clone_bio(struct bio *bio, sector_t sector,
1073 unsigned short idx, unsigned short bv_count, 1073 unsigned short idx, unsigned short bv_count,
1074 unsigned int len, struct bio_set *bs) 1074 unsigned int len, struct bio_set *bs)
1075 { 1075 {
1076 struct bio *clone; 1076 struct bio *clone;
1077 1077
1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1078 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1079 __bio_clone(clone, bio); 1079 __bio_clone(clone, bio);
1080 clone->bi_destructor = dm_bio_destructor; 1080 clone->bi_destructor = dm_bio_destructor;
1081 clone->bi_sector = sector; 1081 clone->bi_sector = sector;
1082 clone->bi_idx = idx; 1082 clone->bi_idx = idx;
1083 clone->bi_vcnt = idx + bv_count; 1083 clone->bi_vcnt = idx + bv_count;
1084 clone->bi_size = to_bytes(len); 1084 clone->bi_size = to_bytes(len);
1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1085 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1086 1086
1087 if (bio_integrity(bio)) { 1087 if (bio_integrity(bio)) {
1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1088 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1089 1089
1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1090 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1091 bio_integrity_trim(clone, 1091 bio_integrity_trim(clone,
1092 bio_sector_offset(bio, idx, 0), len); 1092 bio_sector_offset(bio, idx, 0), len);
1093 } 1093 }
1094 1094
1095 return clone; 1095 return clone;
1096 } 1096 }
1097 1097
1098 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1098 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1099 struct dm_target *ti) 1099 struct dm_target *ti)
1100 { 1100 {
1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1101 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1102 1102
1103 tio->io = ci->io; 1103 tio->io = ci->io;
1104 tio->ti = ti; 1104 tio->ti = ti;
1105 memset(&tio->info, 0, sizeof(tio->info)); 1105 memset(&tio->info, 0, sizeof(tio->info));
1106 1106
1107 return tio; 1107 return tio;
1108 } 1108 }
1109 1109
1110 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1110 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1111 unsigned request_nr, sector_t len) 1111 unsigned request_nr, sector_t len)
1112 { 1112 {
1113 struct dm_target_io *tio = alloc_tio(ci, ti); 1113 struct dm_target_io *tio = alloc_tio(ci, ti);
1114 struct bio *clone; 1114 struct bio *clone;
1115 1115
1116 tio->info.target_request_nr = request_nr; 1116 tio->info.target_request_nr = request_nr;
1117 1117
1118 /* 1118 /*
1119 * Discard requests require the bio's inline iovecs be initialized. 1119 * Discard requests require the bio's inline iovecs be initialized.
1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1120 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1121 * and discard, so no need for concern about wasted bvec allocations. 1121 * and discard, so no need for concern about wasted bvec allocations.
1122 */ 1122 */
1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1123 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1124 __bio_clone(clone, ci->bio); 1124 __bio_clone(clone, ci->bio);
1125 clone->bi_destructor = dm_bio_destructor; 1125 clone->bi_destructor = dm_bio_destructor;
1126 if (len) { 1126 if (len) {
1127 clone->bi_sector = ci->sector; 1127 clone->bi_sector = ci->sector;
1128 clone->bi_size = to_bytes(len); 1128 clone->bi_size = to_bytes(len);
1129 } 1129 }
1130 1130
1131 __map_bio(ti, clone, tio); 1131 __map_bio(ti, clone, tio);
1132 } 1132 }
1133 1133
1134 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1134 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1135 unsigned num_requests, sector_t len) 1135 unsigned num_requests, sector_t len)
1136 { 1136 {
1137 unsigned request_nr; 1137 unsigned request_nr;
1138 1138
1139 for (request_nr = 0; request_nr < num_requests; request_nr++) 1139 for (request_nr = 0; request_nr < num_requests; request_nr++)
1140 __issue_target_request(ci, ti, request_nr, len); 1140 __issue_target_request(ci, ti, request_nr, len);
1141 } 1141 }
1142 1142
1143 static int __clone_and_map_empty_flush(struct clone_info *ci) 1143 static int __clone_and_map_empty_flush(struct clone_info *ci)
1144 { 1144 {
1145 unsigned target_nr = 0; 1145 unsigned target_nr = 0;
1146 struct dm_target *ti; 1146 struct dm_target *ti;
1147 1147
1148 BUG_ON(bio_has_data(ci->bio)); 1148 BUG_ON(bio_has_data(ci->bio));
1149 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1149 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1150 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1150 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1151 1151
1152 return 0; 1152 return 0;
1153 } 1153 }
1154 1154
1155 /* 1155 /*
1156 * Perform all io with a single clone. 1156 * Perform all io with a single clone.
1157 */ 1157 */
1158 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1158 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1159 { 1159 {
1160 struct bio *clone, *bio = ci->bio; 1160 struct bio *clone, *bio = ci->bio;
1161 struct dm_target_io *tio; 1161 struct dm_target_io *tio;
1162 1162
1163 tio = alloc_tio(ci, ti); 1163 tio = alloc_tio(ci, ti);
1164 clone = clone_bio(bio, ci->sector, ci->idx, 1164 clone = clone_bio(bio, ci->sector, ci->idx,
1165 bio->bi_vcnt - ci->idx, ci->sector_count, 1165 bio->bi_vcnt - ci->idx, ci->sector_count,
1166 ci->md->bs); 1166 ci->md->bs);
1167 __map_bio(ti, clone, tio); 1167 __map_bio(ti, clone, tio);
1168 ci->sector_count = 0; 1168 ci->sector_count = 0;
1169 } 1169 }
1170 1170
1171 static int __clone_and_map_discard(struct clone_info *ci) 1171 static int __clone_and_map_discard(struct clone_info *ci)
1172 { 1172 {
1173 struct dm_target *ti; 1173 struct dm_target *ti;
1174 sector_t len; 1174 sector_t len;
1175 1175
1176 do { 1176 do {
1177 ti = dm_table_find_target(ci->map, ci->sector); 1177 ti = dm_table_find_target(ci->map, ci->sector);
1178 if (!dm_target_is_valid(ti)) 1178 if (!dm_target_is_valid(ti))
1179 return -EIO; 1179 return -EIO;
1180 1180
1181 /* 1181 /*
1182 * Even though the device advertised discard support, 1182 * Even though the device advertised discard support,
1183 * that does not mean every target supports it, and 1183 * that does not mean every target supports it, and
1184 * reconfiguration might also have changed that since the 1184 * reconfiguration might also have changed that since the
1185 * check was performed. 1185 * check was performed.
1186 */ 1186 */
1187 if (!ti->num_discard_requests) 1187 if (!ti->num_discard_requests)
1188 return -EOPNOTSUPP; 1188 return -EOPNOTSUPP;
1189 1189
1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1190 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1191 1191
1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1192 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1193 1193
1194 ci->sector += len; 1194 ci->sector += len;
1195 } while (ci->sector_count -= len); 1195 } while (ci->sector_count -= len);
1196 1196
1197 return 0; 1197 return 0;
1198 } 1198 }
1199 1199
1200 static int __clone_and_map(struct clone_info *ci) 1200 static int __clone_and_map(struct clone_info *ci)
1201 { 1201 {
1202 struct bio *clone, *bio = ci->bio; 1202 struct bio *clone, *bio = ci->bio;
1203 struct dm_target *ti; 1203 struct dm_target *ti;
1204 sector_t len = 0, max; 1204 sector_t len = 0, max;
1205 struct dm_target_io *tio; 1205 struct dm_target_io *tio;
1206 1206
1207 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1207 if (unlikely(bio->bi_rw & REQ_DISCARD))
1208 return __clone_and_map_discard(ci); 1208 return __clone_and_map_discard(ci);
1209 1209
1210 ti = dm_table_find_target(ci->map, ci->sector); 1210 ti = dm_table_find_target(ci->map, ci->sector);
1211 if (!dm_target_is_valid(ti)) 1211 if (!dm_target_is_valid(ti))
1212 return -EIO; 1212 return -EIO;
1213 1213
1214 max = max_io_len(ci->sector, ti); 1214 max = max_io_len(ci->sector, ti);
1215 1215
1216 if (ci->sector_count <= max) { 1216 if (ci->sector_count <= max) {
1217 /* 1217 /*
1218 * Optimise for the simple case where we can do all of 1218 * Optimise for the simple case where we can do all of
1219 * the remaining io with a single clone. 1219 * the remaining io with a single clone.
1220 */ 1220 */
1221 __clone_and_map_simple(ci, ti); 1221 __clone_and_map_simple(ci, ti);
1222 1222
1223 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1223 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1224 /* 1224 /*
1225 * There are some bvecs that don't span targets. 1225 * There are some bvecs that don't span targets.
1226 * Do as many of these as possible. 1226 * Do as many of these as possible.
1227 */ 1227 */
1228 int i; 1228 int i;
1229 sector_t remaining = max; 1229 sector_t remaining = max;
1230 sector_t bv_len; 1230 sector_t bv_len;
1231 1231
1232 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1232 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1233 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1233 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1234 1234
1235 if (bv_len > remaining) 1235 if (bv_len > remaining)
1236 break; 1236 break;
1237 1237
1238 remaining -= bv_len; 1238 remaining -= bv_len;
1239 len += bv_len; 1239 len += bv_len;
1240 } 1240 }
1241 1241
1242 tio = alloc_tio(ci, ti); 1242 tio = alloc_tio(ci, ti);
1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1243 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1244 ci->md->bs); 1244 ci->md->bs);
1245 __map_bio(ti, clone, tio); 1245 __map_bio(ti, clone, tio);
1246 1246
1247 ci->sector += len; 1247 ci->sector += len;
1248 ci->sector_count -= len; 1248 ci->sector_count -= len;
1249 ci->idx = i; 1249 ci->idx = i;
1250 1250
1251 } else { 1251 } else {
1252 /* 1252 /*
1253 * Handle a bvec that must be split between two or more targets. 1253 * Handle a bvec that must be split between two or more targets.
1254 */ 1254 */
1255 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1255 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1256 sector_t remaining = to_sector(bv->bv_len); 1256 sector_t remaining = to_sector(bv->bv_len);
1257 unsigned int offset = 0; 1257 unsigned int offset = 0;
1258 1258
1259 do { 1259 do {
1260 if (offset) { 1260 if (offset) {
1261 ti = dm_table_find_target(ci->map, ci->sector); 1261 ti = dm_table_find_target(ci->map, ci->sector);
1262 if (!dm_target_is_valid(ti)) 1262 if (!dm_target_is_valid(ti))
1263 return -EIO; 1263 return -EIO;
1264 1264
1265 max = max_io_len(ci->sector, ti); 1265 max = max_io_len(ci->sector, ti);
1266 } 1266 }
1267 1267
1268 len = min(remaining, max); 1268 len = min(remaining, max);
1269 1269
1270 tio = alloc_tio(ci, ti); 1270 tio = alloc_tio(ci, ti);
1271 clone = split_bvec(bio, ci->sector, ci->idx, 1271 clone = split_bvec(bio, ci->sector, ci->idx,
1272 bv->bv_offset + offset, len, 1272 bv->bv_offset + offset, len,
1273 ci->md->bs); 1273 ci->md->bs);
1274 1274
1275 __map_bio(ti, clone, tio); 1275 __map_bio(ti, clone, tio);
1276 1276
1277 ci->sector += len; 1277 ci->sector += len;
1278 ci->sector_count -= len; 1278 ci->sector_count -= len;
1279 offset += to_bytes(len); 1279 offset += to_bytes(len);
1280 } while (remaining -= len); 1280 } while (remaining -= len);
1281 1281
1282 ci->idx++; 1282 ci->idx++;
1283 } 1283 }
1284 1284
1285 return 0; 1285 return 0;
1286 } 1286 }
1287 1287
1288 /* 1288 /*
1289 * Split the bio into several clones and submit it to targets. 1289 * Split the bio into several clones and submit it to targets.
1290 */ 1290 */
1291 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1291 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1292 { 1292 {
1293 struct clone_info ci; 1293 struct clone_info ci;
1294 int error = 0; 1294 int error = 0;
1295 1295
1296 ci.map = dm_get_live_table(md); 1296 ci.map = dm_get_live_table(md);
1297 if (unlikely(!ci.map)) { 1297 if (unlikely(!ci.map)) {
1298 bio_io_error(bio); 1298 bio_io_error(bio);
1299 return; 1299 return;
1300 } 1300 }
1301 1301
1302 ci.md = md; 1302 ci.md = md;
1303 ci.io = alloc_io(md); 1303 ci.io = alloc_io(md);
1304 ci.io->error = 0; 1304 ci.io->error = 0;
1305 atomic_set(&ci.io->io_count, 1); 1305 atomic_set(&ci.io->io_count, 1);
1306 ci.io->bio = bio; 1306 ci.io->bio = bio;
1307 ci.io->md = md; 1307 ci.io->md = md;
1308 spin_lock_init(&ci.io->endio_lock); 1308 spin_lock_init(&ci.io->endio_lock);
1309 ci.sector = bio->bi_sector; 1309 ci.sector = bio->bi_sector;
1310 ci.idx = bio->bi_idx; 1310 ci.idx = bio->bi_idx;
1311 1311
1312 start_io_acct(ci.io); 1312 start_io_acct(ci.io);
1313 if (bio->bi_rw & REQ_FLUSH) { 1313 if (bio->bi_rw & REQ_FLUSH) {
1314 ci.bio = &ci.md->flush_bio; 1314 ci.bio = &ci.md->flush_bio;
1315 ci.sector_count = 0; 1315 ci.sector_count = 0;
1316 error = __clone_and_map_empty_flush(&ci); 1316 error = __clone_and_map_empty_flush(&ci);
1317 /* dec_pending submits any data associated with flush */ 1317 /* dec_pending submits any data associated with flush */
1318 } else { 1318 } else {
1319 ci.bio = bio; 1319 ci.bio = bio;
1320 ci.sector_count = bio_sectors(bio); 1320 ci.sector_count = bio_sectors(bio);
1321 while (ci.sector_count && !error) 1321 while (ci.sector_count && !error)
1322 error = __clone_and_map(&ci); 1322 error = __clone_and_map(&ci);
1323 } 1323 }
1324 1324
1325 /* drop the extra reference count */ 1325 /* drop the extra reference count */
1326 dec_pending(ci.io, error); 1326 dec_pending(ci.io, error);
1327 dm_table_put(ci.map); 1327 dm_table_put(ci.map);
1328 } 1328 }
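For context on the empty-flush path above: how many flush clones a target receives is driven by ti->num_flush_requests, which each target sets in its constructor. The fragment below is a minimal, hypothetical constructor (names are illustrative, not from this commit) showing where those counters come from.

#include <linux/device-mapper.h>

/* Hypothetical constructor fragment for a bio-based target that wants one
 * empty flush clone per REQ_FLUSH bio and one clone per discard. */
static int example_ctr(struct dm_target *ti, unsigned int argc, char **argv)
{
	ti->num_flush_requests = 1;	/* consumed by __clone_and_map_empty_flush() */
	ti->num_discard_requests = 1;	/* consumed by __clone_and_map_discard() */

	return 0;
}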
1329 /*----------------------------------------------------------------- 1329 /*-----------------------------------------------------------------
1330 * CRUD END 1330 * CRUD END
1331 *---------------------------------------------------------------*/ 1331 *---------------------------------------------------------------*/
1332 1332
1333 static int dm_merge_bvec(struct request_queue *q, 1333 static int dm_merge_bvec(struct request_queue *q,
1334 struct bvec_merge_data *bvm, 1334 struct bvec_merge_data *bvm,
1335 struct bio_vec *biovec) 1335 struct bio_vec *biovec)
1336 { 1336 {
1337 struct mapped_device *md = q->queuedata; 1337 struct mapped_device *md = q->queuedata;
1338 struct dm_table *map = dm_get_live_table(md); 1338 struct dm_table *map = dm_get_live_table(md);
1339 struct dm_target *ti; 1339 struct dm_target *ti;
1340 sector_t max_sectors; 1340 sector_t max_sectors;
1341 int max_size = 0; 1341 int max_size = 0;
1342 1342
1343 if (unlikely(!map)) 1343 if (unlikely(!map))
1344 goto out; 1344 goto out;
1345 1345
1346 ti = dm_table_find_target(map, bvm->bi_sector); 1346 ti = dm_table_find_target(map, bvm->bi_sector);
1347 if (!dm_target_is_valid(ti)) 1347 if (!dm_target_is_valid(ti))
1348 goto out_table; 1348 goto out_table;
1349 1349
1350 /* 1350 /*
1351 * Find maximum amount of I/O that won't need splitting 1351 * Find maximum amount of I/O that won't need splitting
1352 */ 1352 */
1353 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1353 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1354 (sector_t) BIO_MAX_SECTORS); 1354 (sector_t) BIO_MAX_SECTORS);
1355 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1355 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1356 if (max_size < 0) 1356 if (max_size < 0)
1357 max_size = 0; 1357 max_size = 0;
1358 1358
1359 /* 1359 /*
1360 * merge_bvec_fn() returns number of bytes 1360 * merge_bvec_fn() returns number of bytes
1361 * it can accept at this offset 1361 * it can accept at this offset
1362 * max is precomputed maximal io size 1362 * max is precomputed maximal io size
1363 */ 1363 */
1364 if (max_size && ti->type->merge) 1364 if (max_size && ti->type->merge)
1365 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1365 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1366 /* 1366 /*
1367 * If the target doesn't support merge method and some of the devices 1367 * If the target doesn't support merge method and some of the devices
1368 * provided their merge_bvec method (we know this by looking at 1368 * provided their merge_bvec method (we know this by looking at
1369 * queue_max_hw_sectors), then we can't allow bios with multiple vector 1369 * queue_max_hw_sectors), then we can't allow bios with multiple vector
1370 * entries. So always set max_size to 0, and the code below allows 1370 * entries. So always set max_size to 0, and the code below allows
1371 * just one page. 1371 * just one page.
1372 */ 1372 */
1373 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1373 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1374 1374
1375 max_size = 0; 1375 max_size = 0;
1376 1376
1377 out_table: 1377 out_table:
1378 dm_table_put(map); 1378 dm_table_put(map);
1379 1379
1380 out: 1380 out:
1381 /* 1381 /*
1382 * Always allow an entire first page 1382 * Always allow an entire first page
1383 */ 1383 */
1384 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1384 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1385 max_size = biovec->bv_len; 1385 max_size = biovec->bv_len;
1386 1386
1387 return max_size; 1387 return max_size;
1388 } 1388 }
1389 1389
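The ti->type->merge hook consulted in dm_merge_bvec() usually just forwards the query to the underlying device, much as dm-linear does. The following is a simplified sketch under the assumption of a single underlying device kept in a private per-target structure; the struct and function names are illustrative.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical per-target context: one underlying device plus its offset. */
struct example_ctx {
	struct dm_dev *dev;
	sector_t start;
};

/* Forward the bvec-merge query to the underlying queue (as dm-linear does)
 * so dm_merge_bvec() can honour the lower device's limits. */
static int example_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
			 struct bio_vec *biovec, int max_size)
{
	struct example_ctx *ec = ti->private;
	struct request_queue *q = bdev_get_queue(ec->dev->bdev);

	if (!q->merge_bvec_fn)
		return max_size;

	bvm->bi_bdev = ec->dev->bdev;
	bvm->bi_sector = ec->start + dm_target_offset(ti, bvm->bi_sector);

	return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
}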
1390 /* 1390 /*
1391 * The request function that just remaps the bio built up by 1391 * The request function that just remaps the bio built up by
1392 * dm_merge_bvec. 1392 * dm_merge_bvec.
1393 */ 1393 */
1394 static int _dm_request(struct request_queue *q, struct bio *bio) 1394 static int _dm_request(struct request_queue *q, struct bio *bio)
1395 { 1395 {
1396 int rw = bio_data_dir(bio); 1396 int rw = bio_data_dir(bio);
1397 struct mapped_device *md = q->queuedata; 1397 struct mapped_device *md = q->queuedata;
1398 int cpu; 1398 int cpu;
1399 1399
1400 down_read(&md->io_lock); 1400 down_read(&md->io_lock);
1401 1401
1402 cpu = part_stat_lock(); 1402 cpu = part_stat_lock();
1403 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1403 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1404 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1404 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1405 part_stat_unlock(); 1405 part_stat_unlock();
1406 1406
1407 /* if we're suspended, we have to queue this io for later */ 1407 /* if we're suspended, we have to queue this io for later */
1408 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) { 1408 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
1409 up_read(&md->io_lock); 1409 up_read(&md->io_lock);
1410 1410
1411 if (bio_rw(bio) != READA) 1411 if (bio_rw(bio) != READA)
1412 queue_io(md, bio); 1412 queue_io(md, bio);
1413 else 1413 else
1414 bio_io_error(bio); 1414 bio_io_error(bio);
1415 return 0; 1415 return 0;
1416 } 1416 }
1417 1417
1418 __split_and_process_bio(md, bio); 1418 __split_and_process_bio(md, bio);
1419 up_read(&md->io_lock); 1419 up_read(&md->io_lock);
1420 return 0; 1420 return 0;
1421 } 1421 }
1422 1422
1423 static int dm_make_request(struct request_queue *q, struct bio *bio) 1423 static int dm_make_request(struct request_queue *q, struct bio *bio)
1424 { 1424 {
1425 struct mapped_device *md = q->queuedata; 1425 struct mapped_device *md = q->queuedata;
1426 1426
1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1427 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1428 } 1428 }
1429 1429
1430 static int dm_request_based(struct mapped_device *md) 1430 static int dm_request_based(struct mapped_device *md)
1431 { 1431 {
1432 return blk_queue_stackable(md->queue); 1432 return blk_queue_stackable(md->queue);
1433 } 1433 }
1434 1434
1435 static int dm_request(struct request_queue *q, struct bio *bio) 1435 static int dm_request(struct request_queue *q, struct bio *bio)
1436 { 1436 {
1437 struct mapped_device *md = q->queuedata; 1437 struct mapped_device *md = q->queuedata;
1438 1438
1439 if (dm_request_based(md)) 1439 if (dm_request_based(md))
1440 return dm_make_request(q, bio); 1440 return dm_make_request(q, bio);
1441 1441
1442 return _dm_request(q, bio); 1442 return _dm_request(q, bio);
1443 } 1443 }
1444 1444
1445 void dm_dispatch_request(struct request *rq) 1445 void dm_dispatch_request(struct request *rq)
1446 { 1446 {
1447 int r; 1447 int r;
1448 1448
1449 if (blk_queue_io_stat(rq->q)) 1449 if (blk_queue_io_stat(rq->q))
1450 rq->cmd_flags |= REQ_IO_STAT; 1450 rq->cmd_flags |= REQ_IO_STAT;
1451 1451
1452 rq->start_time = jiffies; 1452 rq->start_time = jiffies;
1453 r = blk_insert_cloned_request(rq->q, rq); 1453 r = blk_insert_cloned_request(rq->q, rq);
1454 if (r) 1454 if (r)
1455 dm_complete_request(rq, r); 1455 dm_complete_request(rq, r);
1456 } 1456 }
1457 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1457 EXPORT_SYMBOL_GPL(dm_dispatch_request);
1458 1458
1459 static void dm_rq_bio_destructor(struct bio *bio) 1459 static void dm_rq_bio_destructor(struct bio *bio)
1460 { 1460 {
1461 struct dm_rq_clone_bio_info *info = bio->bi_private; 1461 struct dm_rq_clone_bio_info *info = bio->bi_private;
1462 struct mapped_device *md = info->tio->md; 1462 struct mapped_device *md = info->tio->md;
1463 1463
1464 free_bio_info(info); 1464 free_bio_info(info);
1465 bio_free(bio, md->bs); 1465 bio_free(bio, md->bs);
1466 } 1466 }
1467 1467
1468 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1468 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1469 void *data) 1469 void *data)
1470 { 1470 {
1471 struct dm_rq_target_io *tio = data; 1471 struct dm_rq_target_io *tio = data;
1472 struct mapped_device *md = tio->md; 1472 struct mapped_device *md = tio->md;
1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1473 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1474 1474
1475 if (!info) 1475 if (!info)
1476 return -ENOMEM; 1476 return -ENOMEM;
1477 1477
1478 info->orig = bio_orig; 1478 info->orig = bio_orig;
1479 info->tio = tio; 1479 info->tio = tio;
1480 bio->bi_end_io = end_clone_bio; 1480 bio->bi_end_io = end_clone_bio;
1481 bio->bi_private = info; 1481 bio->bi_private = info;
1482 bio->bi_destructor = dm_rq_bio_destructor; 1482 bio->bi_destructor = dm_rq_bio_destructor;
1483 1483
1484 return 0; 1484 return 0;
1485 } 1485 }
1486 1486
1487 static int setup_clone(struct request *clone, struct request *rq, 1487 static int setup_clone(struct request *clone, struct request *rq,
1488 struct dm_rq_target_io *tio) 1488 struct dm_rq_target_io *tio)
1489 { 1489 {
1490 int r; 1490 int r;
1491 1491
1492 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1492 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1493 dm_rq_bio_constructor, tio); 1493 dm_rq_bio_constructor, tio);
1494 if (r) 1494 if (r)
1495 return r; 1495 return r;
1496 1496
1497 clone->cmd = rq->cmd; 1497 clone->cmd = rq->cmd;
1498 clone->cmd_len = rq->cmd_len; 1498 clone->cmd_len = rq->cmd_len;
1499 clone->sense = rq->sense; 1499 clone->sense = rq->sense;
1500 clone->buffer = rq->buffer; 1500 clone->buffer = rq->buffer;
1501 clone->end_io = end_clone_request; 1501 clone->end_io = end_clone_request;
1502 clone->end_io_data = tio; 1502 clone->end_io_data = tio;
1503 1503
1504 return 0; 1504 return 0;
1505 } 1505 }
1506 1506
1507 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1507 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1508 gfp_t gfp_mask) 1508 gfp_t gfp_mask)
1509 { 1509 {
1510 struct request *clone; 1510 struct request *clone;
1511 struct dm_rq_target_io *tio; 1511 struct dm_rq_target_io *tio;
1512 1512
1513 tio = alloc_rq_tio(md, gfp_mask); 1513 tio = alloc_rq_tio(md, gfp_mask);
1514 if (!tio) 1514 if (!tio)
1515 return NULL; 1515 return NULL;
1516 1516
1517 tio->md = md; 1517 tio->md = md;
1518 tio->ti = NULL; 1518 tio->ti = NULL;
1519 tio->orig = rq; 1519 tio->orig = rq;
1520 tio->error = 0; 1520 tio->error = 0;
1521 memset(&tio->info, 0, sizeof(tio->info)); 1521 memset(&tio->info, 0, sizeof(tio->info));
1522 1522
1523 clone = &tio->clone; 1523 clone = &tio->clone;
1524 if (setup_clone(clone, rq, tio)) { 1524 if (setup_clone(clone, rq, tio)) {
1525 /* -ENOMEM */ 1525 /* -ENOMEM */
1526 free_rq_tio(tio); 1526 free_rq_tio(tio);
1527 return NULL; 1527 return NULL;
1528 } 1528 }
1529 1529
1530 return clone; 1530 return clone;
1531 } 1531 }
1532 1532
1533 /* 1533 /*
1534 * Called with the queue lock held. 1534 * Called with the queue lock held.
1535 */ 1535 */
1536 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1536 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1537 { 1537 {
1538 struct mapped_device *md = q->queuedata; 1538 struct mapped_device *md = q->queuedata;
1539 struct request *clone; 1539 struct request *clone;
1540 1540
1541 if (unlikely(rq->special)) { 1541 if (unlikely(rq->special)) {
1542 DMWARN("Already has something in rq->special."); 1542 DMWARN("Already has something in rq->special.");
1543 return BLKPREP_KILL; 1543 return BLKPREP_KILL;
1544 } 1544 }
1545 1545
1546 clone = clone_rq(rq, md, GFP_ATOMIC); 1546 clone = clone_rq(rq, md, GFP_ATOMIC);
1547 if (!clone) 1547 if (!clone)
1548 return BLKPREP_DEFER; 1548 return BLKPREP_DEFER;
1549 1549
1550 rq->special = clone; 1550 rq->special = clone;
1551 rq->cmd_flags |= REQ_DONTPREP; 1551 rq->cmd_flags |= REQ_DONTPREP;
1552 1552
1553 return BLKPREP_OK; 1553 return BLKPREP_OK;
1554 } 1554 }
1555 1555
1556 /* 1556 /*
1557 * Returns: 1557 * Returns:
1558 * 0 : the request has been processed (not requeued) 1558 * 0 : the request has been processed (not requeued)
1559 * !0 : the request has been requeued 1559 * !0 : the request has been requeued
1560 */ 1560 */
1561 static int map_request(struct dm_target *ti, struct request *clone, 1561 static int map_request(struct dm_target *ti, struct request *clone,
1562 struct mapped_device *md) 1562 struct mapped_device *md)
1563 { 1563 {
1564 int r, requeued = 0; 1564 int r, requeued = 0;
1565 struct dm_rq_target_io *tio = clone->end_io_data; 1565 struct dm_rq_target_io *tio = clone->end_io_data;
1566 1566
1567 /* 1567 /*
1568 * Hold the md reference here for the in-flight I/O. 1568 * Hold the md reference here for the in-flight I/O.
1569 * We can't rely on the reference count held by the device opener, 1569 * We can't rely on the reference count held by the device opener,
1570 * because the device may be closed during the request completion 1570 * because the device may be closed during the request completion
1571 * once all bios have completed. 1571 * once all bios have completed.
1572 * See the comment in rq_completed() too. 1572 * See the comment in rq_completed() too.
1573 */ 1573 */
1574 dm_get(md); 1574 dm_get(md);
1575 1575
1576 tio->ti = ti; 1576 tio->ti = ti;
1577 r = ti->type->map_rq(ti, clone, &tio->info); 1577 r = ti->type->map_rq(ti, clone, &tio->info);
1578 switch (r) { 1578 switch (r) {
1579 case DM_MAPIO_SUBMITTED: 1579 case DM_MAPIO_SUBMITTED:
1580 /* The target has taken the I/O to submit by itself later */ 1580 /* The target has taken the I/O to submit by itself later */
1581 break; 1581 break;
1582 case DM_MAPIO_REMAPPED: 1582 case DM_MAPIO_REMAPPED:
1583 /* The target has remapped the I/O so dispatch it */ 1583 /* The target has remapped the I/O so dispatch it */
1584 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1584 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1585 blk_rq_pos(tio->orig)); 1585 blk_rq_pos(tio->orig));
1586 dm_dispatch_request(clone); 1586 dm_dispatch_request(clone);
1587 break; 1587 break;
1588 case DM_MAPIO_REQUEUE: 1588 case DM_MAPIO_REQUEUE:
1589 /* The target wants to requeue the I/O */ 1589 /* The target wants to requeue the I/O */
1590 dm_requeue_unmapped_request(clone); 1590 dm_requeue_unmapped_request(clone);
1591 requeued = 1; 1591 requeued = 1;
1592 break; 1592 break;
1593 default: 1593 default:
1594 if (r > 0) { 1594 if (r > 0) {
1595 DMWARN("unimplemented target map return value: %d", r); 1595 DMWARN("unimplemented target map return value: %d", r);
1596 BUG(); 1596 BUG();
1597 } 1597 }
1598 1598
1599 /* The target wants to complete the I/O */ 1599 /* The target wants to complete the I/O */
1600 dm_kill_unmapped_request(clone, r); 1600 dm_kill_unmapped_request(clone, r);
1601 break; 1601 break;
1602 } 1602 }
1603 1603
1604 return requeued; 1604 return requeued;
1605 } 1605 }
1606 1606
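To make the DM_MAPIO_* cases above concrete, a request-based target's map_rq hook might look like the hypothetical fragment below, which simply points the clone at one underlying device so dm_dispatch_request() can send it down; the assumption that ti->private holds a struct dm_dev pointer is made only for this sketch.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical request-based map function: redirect the clone to a single
 * underlying device (assumed to be stored in ti->private by the ctr) and
 * let dm_request_fn() dispatch it via dm_dispatch_request(). */
static int example_map_rq(struct dm_target *ti, struct request *clone,
			  union map_info *map_context)
{
	struct dm_dev *dev = ti->private;

	clone->q = bdev_get_queue(dev->bdev);
	clone->rq_disk = dev->bdev->bd_disk;

	return DM_MAPIO_REMAPPED;
}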
1607 /* 1607 /*
1608 * q->request_fn for request-based dm. 1608 * q->request_fn for request-based dm.
1609 * Called with the queue lock held. 1609 * Called with the queue lock held.
1610 */ 1610 */
1611 static void dm_request_fn(struct request_queue *q) 1611 static void dm_request_fn(struct request_queue *q)
1612 { 1612 {
1613 struct mapped_device *md = q->queuedata; 1613 struct mapped_device *md = q->queuedata;
1614 struct dm_table *map = dm_get_live_table(md); 1614 struct dm_table *map = dm_get_live_table(md);
1615 struct dm_target *ti; 1615 struct dm_target *ti;
1616 struct request *rq, *clone; 1616 struct request *rq, *clone;
1617 sector_t pos; 1617 sector_t pos;
1618 1618
1619 /* 1619 /*
1620 * For suspend, check blk_queue_stopped() and increment 1620 * For suspend, check blk_queue_stopped() and increment
1621 * ->pending while holding a single queue_lock, so that the 1621 * ->pending while holding a single queue_lock, so that the
1622 * number of in-flight I/Os does not increase after the queue is 1622 * number of in-flight I/Os does not increase after the queue is
1623 * stopped in dm_suspend(). 1623 * stopped in dm_suspend().
1624 */ 1624 */
1625 while (!blk_queue_stopped(q)) { 1625 while (!blk_queue_stopped(q)) {
1626 rq = blk_peek_request(q); 1626 rq = blk_peek_request(q);
1627 if (!rq) 1627 if (!rq)
1628 goto delay_and_out; 1628 goto delay_and_out;
1629 1629
1630 /* always use block 0 to find the target for flushes for now */ 1630 /* always use block 0 to find the target for flushes for now */
1631 pos = 0; 1631 pos = 0;
1632 if (!(rq->cmd_flags & REQ_FLUSH)) 1632 if (!(rq->cmd_flags & REQ_FLUSH))
1633 pos = blk_rq_pos(rq); 1633 pos = blk_rq_pos(rq);
1634 1634
1635 ti = dm_table_find_target(map, pos); 1635 ti = dm_table_find_target(map, pos);
1636 BUG_ON(!dm_target_is_valid(ti)); 1636 BUG_ON(!dm_target_is_valid(ti));
1637 1637
1638 if (ti->type->busy && ti->type->busy(ti)) 1638 if (ti->type->busy && ti->type->busy(ti))
1639 goto delay_and_out; 1639 goto delay_and_out;
1640 1640
1641 blk_start_request(rq); 1641 blk_start_request(rq);
1642 clone = rq->special; 1642 clone = rq->special;
1643 atomic_inc(&md->pending[rq_data_dir(clone)]); 1643 atomic_inc(&md->pending[rq_data_dir(clone)]);
1644 1644
1645 spin_unlock(q->queue_lock); 1645 spin_unlock(q->queue_lock);
1646 if (map_request(ti, clone, md)) 1646 if (map_request(ti, clone, md))
1647 goto requeued; 1647 goto requeued;
1648 1648
1649 BUG_ON(!irqs_disabled()); 1649 BUG_ON(!irqs_disabled());
1650 spin_lock(q->queue_lock); 1650 spin_lock(q->queue_lock);
1651 } 1651 }
1652 1652
1653 goto out; 1653 goto out;
1654 1654
1655 requeued: 1655 requeued:
1656 BUG_ON(!irqs_disabled()); 1656 BUG_ON(!irqs_disabled());
1657 spin_lock(q->queue_lock); 1657 spin_lock(q->queue_lock);
1658 1658
1659 delay_and_out: 1659 delay_and_out:
1660 blk_delay_queue(q, HZ / 10); 1660 blk_delay_queue(q, HZ / 10);
1661 out: 1661 out:
1662 dm_table_put(map); 1662 dm_table_put(map);
1663 1663
1664 return; 1664 return;
1665 } 1665 }
1666 1666
1667 int dm_underlying_device_busy(struct request_queue *q) 1667 int dm_underlying_device_busy(struct request_queue *q)
1668 { 1668 {
1669 return blk_lld_busy(q); 1669 return blk_lld_busy(q);
1670 } 1670 }
1671 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1671 EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1672 1672
1673 static int dm_lld_busy(struct request_queue *q) 1673 static int dm_lld_busy(struct request_queue *q)
1674 { 1674 {
1675 int r; 1675 int r;
1676 struct mapped_device *md = q->queuedata; 1676 struct mapped_device *md = q->queuedata;
1677 struct dm_table *map = dm_get_live_table(md); 1677 struct dm_table *map = dm_get_live_table(md);
1678 1678
1679 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1679 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1680 r = 1; 1680 r = 1;
1681 else 1681 else
1682 r = dm_table_any_busy_target(map); 1682 r = dm_table_any_busy_target(map);
1683 1683
1684 dm_table_put(map); 1684 dm_table_put(map);
1685 1685
1686 return r; 1686 return r;
1687 } 1687 }
1688 1688
1689 static int dm_any_congested(void *congested_data, int bdi_bits) 1689 static int dm_any_congested(void *congested_data, int bdi_bits)
1690 { 1690 {
1691 int r = bdi_bits; 1691 int r = bdi_bits;
1692 struct mapped_device *md = congested_data; 1692 struct mapped_device *md = congested_data;
1693 struct dm_table *map; 1693 struct dm_table *map;
1694 1694
1695 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1695 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1696 map = dm_get_live_table(md); 1696 map = dm_get_live_table(md);
1697 if (map) { 1697 if (map) {
1698 /* 1698 /*
1699 * Request-based dm only cares about its own queue when 1699 * Request-based dm only cares about its own queue when
1700 * queried for the congestion status of the request_queue 1700 * queried for the congestion status of the request_queue
1701 */ 1701 */
1702 if (dm_request_based(md)) 1702 if (dm_request_based(md))
1703 r = md->queue->backing_dev_info.state & 1703 r = md->queue->backing_dev_info.state &
1704 bdi_bits; 1704 bdi_bits;
1705 else 1705 else
1706 r = dm_table_any_congested(map, bdi_bits); 1706 r = dm_table_any_congested(map, bdi_bits);
1707 1707
1708 dm_table_put(map); 1708 dm_table_put(map);
1709 } 1709 }
1710 } 1710 }
1711 1711
1712 return r; 1712 return r;
1713 } 1713 }
1714 1714
1715 /*----------------------------------------------------------------- 1715 /*-----------------------------------------------------------------
1716 * An IDR is used to keep track of allocated minor numbers. 1716 * An IDR is used to keep track of allocated minor numbers.
1717 *---------------------------------------------------------------*/ 1717 *---------------------------------------------------------------*/
1718 static void free_minor(int minor) 1718 static void free_minor(int minor)
1719 { 1719 {
1720 spin_lock(&_minor_lock); 1720 spin_lock(&_minor_lock);
1721 idr_remove(&_minor_idr, minor); 1721 idr_remove(&_minor_idr, minor);
1722 spin_unlock(&_minor_lock); 1722 spin_unlock(&_minor_lock);
1723 } 1723 }
1724 1724
1725 /* 1725 /*
1726 * See if the device with a specific minor # is free. 1726 * See if the device with a specific minor # is free.
1727 */ 1727 */
1728 static int specific_minor(int minor) 1728 static int specific_minor(int minor)
1729 { 1729 {
1730 int r, m; 1730 int r, m;
1731 1731
1732 if (minor >= (1 << MINORBITS)) 1732 if (minor >= (1 << MINORBITS))
1733 return -EINVAL; 1733 return -EINVAL;
1734 1734
1735 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1735 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1736 if (!r) 1736 if (!r)
1737 return -ENOMEM; 1737 return -ENOMEM;
1738 1738
1739 spin_lock(&_minor_lock); 1739 spin_lock(&_minor_lock);
1740 1740
1741 if (idr_find(&_minor_idr, minor)) { 1741 if (idr_find(&_minor_idr, minor)) {
1742 r = -EBUSY; 1742 r = -EBUSY;
1743 goto out; 1743 goto out;
1744 } 1744 }
1745 1745
1746 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1746 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1747 if (r) 1747 if (r)
1748 goto out; 1748 goto out;
1749 1749
1750 if (m != minor) { 1750 if (m != minor) {
1751 idr_remove(&_minor_idr, m); 1751 idr_remove(&_minor_idr, m);
1752 r = -EBUSY; 1752 r = -EBUSY;
1753 goto out; 1753 goto out;
1754 } 1754 }
1755 1755
1756 out: 1756 out:
1757 spin_unlock(&_minor_lock); 1757 spin_unlock(&_minor_lock);
1758 return r; 1758 return r;
1759 } 1759 }
1760 1760
1761 static int next_free_minor(int *minor) 1761 static int next_free_minor(int *minor)
1762 { 1762 {
1763 int r, m; 1763 int r, m;
1764 1764
1765 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1765 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1766 if (!r) 1766 if (!r)
1767 return -ENOMEM; 1767 return -ENOMEM;
1768 1768
1769 spin_lock(&_minor_lock); 1769 spin_lock(&_minor_lock);
1770 1770
1771 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1771 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1772 if (r) 1772 if (r)
1773 goto out; 1773 goto out;
1774 1774
1775 if (m >= (1 << MINORBITS)) { 1775 if (m >= (1 << MINORBITS)) {
1776 idr_remove(&_minor_idr, m); 1776 idr_remove(&_minor_idr, m);
1777 r = -ENOSPC; 1777 r = -ENOSPC;
1778 goto out; 1778 goto out;
1779 } 1779 }
1780 1780
1781 *minor = m; 1781 *minor = m;
1782 1782
1783 out: 1783 out:
1784 spin_unlock(&_minor_lock); 1784 spin_unlock(&_minor_lock);
1785 return r; 1785 return r;
1786 } 1786 }
1787 1787
1788 static const struct block_device_operations dm_blk_dops; 1788 static const struct block_device_operations dm_blk_dops;
1789 1789
1790 static void dm_wq_work(struct work_struct *work); 1790 static void dm_wq_work(struct work_struct *work);
1791 1791
1792 static void dm_init_md_queue(struct mapped_device *md) 1792 static void dm_init_md_queue(struct mapped_device *md)
1793 { 1793 {
1794 /* 1794 /*
1795 * Request-based dm devices cannot be stacked on top of bio-based dm 1795 * Request-based dm devices cannot be stacked on top of bio-based dm
1796 * devices. The type of this dm device has not been decided yet. 1796 * devices. The type of this dm device has not been decided yet.
1797 * The type is decided at the first table loading time. 1797 * The type is decided at the first table loading time.
1798 * To prevent problematic device stacking, clear the queue flag 1798 * To prevent problematic device stacking, clear the queue flag
1799 * for request stacking support until then. 1799 * for request stacking support until then.
1800 * 1800 *
1801 * This queue is new, so no concurrency on the queue_flags. 1801 * This queue is new, so no concurrency on the queue_flags.
1802 */ 1802 */
1803 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1803 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1804 1804
1805 md->queue->queuedata = md; 1805 md->queue->queuedata = md;
1806 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1806 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1807 md->queue->backing_dev_info.congested_data = md; 1807 md->queue->backing_dev_info.congested_data = md;
1808 blk_queue_make_request(md->queue, dm_request); 1808 blk_queue_make_request(md->queue, dm_request);
1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1809 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1810 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1811 blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1812 } 1811 }
1813 1812
1814 /* 1813 /*
1815 * Allocate and initialise a blank device with a given minor. 1814 * Allocate and initialise a blank device with a given minor.
1816 */ 1815 */
1817 static struct mapped_device *alloc_dev(int minor) 1816 static struct mapped_device *alloc_dev(int minor)
1818 { 1817 {
1819 int r; 1818 int r;
1820 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1819 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1821 void *old_md; 1820 void *old_md;
1822 1821
1823 if (!md) { 1822 if (!md) {
1824 DMWARN("unable to allocate device, out of memory."); 1823 DMWARN("unable to allocate device, out of memory.");
1825 return NULL; 1824 return NULL;
1826 } 1825 }
1827 1826
1828 if (!try_module_get(THIS_MODULE)) 1827 if (!try_module_get(THIS_MODULE))
1829 goto bad_module_get; 1828 goto bad_module_get;
1830 1829
1831 /* get a minor number for the dev */ 1830 /* get a minor number for the dev */
1832 if (minor == DM_ANY_MINOR) 1831 if (minor == DM_ANY_MINOR)
1833 r = next_free_minor(&minor); 1832 r = next_free_minor(&minor);
1834 else 1833 else
1835 r = specific_minor(minor); 1834 r = specific_minor(minor);
1836 if (r < 0) 1835 if (r < 0)
1837 goto bad_minor; 1836 goto bad_minor;
1838 1837
1839 md->type = DM_TYPE_NONE; 1838 md->type = DM_TYPE_NONE;
1840 init_rwsem(&md->io_lock); 1839 init_rwsem(&md->io_lock);
1841 mutex_init(&md->suspend_lock); 1840 mutex_init(&md->suspend_lock);
1842 mutex_init(&md->type_lock); 1841 mutex_init(&md->type_lock);
1843 spin_lock_init(&md->deferred_lock); 1842 spin_lock_init(&md->deferred_lock);
1844 rwlock_init(&md->map_lock); 1843 rwlock_init(&md->map_lock);
1845 atomic_set(&md->holders, 1); 1844 atomic_set(&md->holders, 1);
1846 atomic_set(&md->open_count, 0); 1845 atomic_set(&md->open_count, 0);
1847 atomic_set(&md->event_nr, 0); 1846 atomic_set(&md->event_nr, 0);
1848 atomic_set(&md->uevent_seq, 0); 1847 atomic_set(&md->uevent_seq, 0);
1849 INIT_LIST_HEAD(&md->uevent_list); 1848 INIT_LIST_HEAD(&md->uevent_list);
1850 spin_lock_init(&md->uevent_lock); 1849 spin_lock_init(&md->uevent_lock);
1851 1850
1852 md->queue = blk_alloc_queue(GFP_KERNEL); 1851 md->queue = blk_alloc_queue(GFP_KERNEL);
1853 if (!md->queue) 1852 if (!md->queue)
1854 goto bad_queue; 1853 goto bad_queue;
1855 1854
1856 dm_init_md_queue(md); 1855 dm_init_md_queue(md);
1857 1856
1858 md->disk = alloc_disk(1); 1857 md->disk = alloc_disk(1);
1859 if (!md->disk) 1858 if (!md->disk)
1860 goto bad_disk; 1859 goto bad_disk;
1861 1860
1862 atomic_set(&md->pending[0], 0); 1861 atomic_set(&md->pending[0], 0);
1863 atomic_set(&md->pending[1], 0); 1862 atomic_set(&md->pending[1], 0);
1864 init_waitqueue_head(&md->wait); 1863 init_waitqueue_head(&md->wait);
1865 INIT_WORK(&md->work, dm_wq_work); 1864 INIT_WORK(&md->work, dm_wq_work);
1866 init_waitqueue_head(&md->eventq); 1865 init_waitqueue_head(&md->eventq);
1867 1866
1868 md->disk->major = _major; 1867 md->disk->major = _major;
1869 md->disk->first_minor = minor; 1868 md->disk->first_minor = minor;
1870 md->disk->fops = &dm_blk_dops; 1869 md->disk->fops = &dm_blk_dops;
1871 md->disk->queue = md->queue; 1870 md->disk->queue = md->queue;
1872 md->disk->private_data = md; 1871 md->disk->private_data = md;
1873 sprintf(md->disk->disk_name, "dm-%d", minor); 1872 sprintf(md->disk->disk_name, "dm-%d", minor);
1874 add_disk(md->disk); 1873 add_disk(md->disk);
1875 format_dev_t(md->name, MKDEV(_major, minor)); 1874 format_dev_t(md->name, MKDEV(_major, minor));
1876 1875
1877 md->wq = alloc_workqueue("kdmflush", 1876 md->wq = alloc_workqueue("kdmflush",
1878 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0); 1877 WQ_NON_REENTRANT | WQ_MEM_RECLAIM, 0);
1879 if (!md->wq) 1878 if (!md->wq)
1880 goto bad_thread; 1879 goto bad_thread;
1881 1880
1882 md->bdev = bdget_disk(md->disk, 0); 1881 md->bdev = bdget_disk(md->disk, 0);
1883 if (!md->bdev) 1882 if (!md->bdev)
1884 goto bad_bdev; 1883 goto bad_bdev;
1885 1884
1886 bio_init(&md->flush_bio); 1885 bio_init(&md->flush_bio);
1887 md->flush_bio.bi_bdev = md->bdev; 1886 md->flush_bio.bi_bdev = md->bdev;
1888 md->flush_bio.bi_rw = WRITE_FLUSH; 1887 md->flush_bio.bi_rw = WRITE_FLUSH;
1889 1888
1890 /* Populate the mapping, nobody knows we exist yet */ 1889 /* Populate the mapping, nobody knows we exist yet */
1891 spin_lock(&_minor_lock); 1890 spin_lock(&_minor_lock);
1892 old_md = idr_replace(&_minor_idr, md, minor); 1891 old_md = idr_replace(&_minor_idr, md, minor);
1893 spin_unlock(&_minor_lock); 1892 spin_unlock(&_minor_lock);
1894 1893
1895 BUG_ON(old_md != MINOR_ALLOCED); 1894 BUG_ON(old_md != MINOR_ALLOCED);
1896 1895
1897 return md; 1896 return md;
1898 1897
1899 bad_bdev: 1898 bad_bdev:
1900 destroy_workqueue(md->wq); 1899 destroy_workqueue(md->wq);
1901 bad_thread: 1900 bad_thread:
1902 del_gendisk(md->disk); 1901 del_gendisk(md->disk);
1903 put_disk(md->disk); 1902 put_disk(md->disk);
1904 bad_disk: 1903 bad_disk:
1905 blk_cleanup_queue(md->queue); 1904 blk_cleanup_queue(md->queue);
1906 bad_queue: 1905 bad_queue:
1907 free_minor(minor); 1906 free_minor(minor);
1908 bad_minor: 1907 bad_minor:
1909 module_put(THIS_MODULE); 1908 module_put(THIS_MODULE);
1910 bad_module_get: 1909 bad_module_get:
1911 kfree(md); 1910 kfree(md);
1912 return NULL; 1911 return NULL;
1913 } 1912 }
1914 1913
1915 static void unlock_fs(struct mapped_device *md); 1914 static void unlock_fs(struct mapped_device *md);
1916 1915
1917 static void free_dev(struct mapped_device *md) 1916 static void free_dev(struct mapped_device *md)
1918 { 1917 {
1919 int minor = MINOR(disk_devt(md->disk)); 1918 int minor = MINOR(disk_devt(md->disk));
1920 1919
1921 unlock_fs(md); 1920 unlock_fs(md);
1922 bdput(md->bdev); 1921 bdput(md->bdev);
1923 destroy_workqueue(md->wq); 1922 destroy_workqueue(md->wq);
1924 if (md->tio_pool) 1923 if (md->tio_pool)
1925 mempool_destroy(md->tio_pool); 1924 mempool_destroy(md->tio_pool);
1926 if (md->io_pool) 1925 if (md->io_pool)
1927 mempool_destroy(md->io_pool); 1926 mempool_destroy(md->io_pool);
1928 if (md->bs) 1927 if (md->bs)
1929 bioset_free(md->bs); 1928 bioset_free(md->bs);
1930 blk_integrity_unregister(md->disk); 1929 blk_integrity_unregister(md->disk);
1931 del_gendisk(md->disk); 1930 del_gendisk(md->disk);
1932 free_minor(minor); 1931 free_minor(minor);
1933 1932
1934 spin_lock(&_minor_lock); 1933 spin_lock(&_minor_lock);
1935 md->disk->private_data = NULL; 1934 md->disk->private_data = NULL;
1936 spin_unlock(&_minor_lock); 1935 spin_unlock(&_minor_lock);
1937 1936
1938 put_disk(md->disk); 1937 put_disk(md->disk);
1939 blk_cleanup_queue(md->queue); 1938 blk_cleanup_queue(md->queue);
1940 module_put(THIS_MODULE); 1939 module_put(THIS_MODULE);
1941 kfree(md); 1940 kfree(md);
1942 } 1941 }
1943 1942
1944 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 1943 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
1945 { 1944 {
1946 struct dm_md_mempools *p; 1945 struct dm_md_mempools *p;
1947 1946
1948 if (md->io_pool && md->tio_pool && md->bs) 1947 if (md->io_pool && md->tio_pool && md->bs)
1949 /* the md already has necessary mempools */ 1948 /* the md already has necessary mempools */
1950 goto out; 1949 goto out;
1951 1950
1952 p = dm_table_get_md_mempools(t); 1951 p = dm_table_get_md_mempools(t);
1953 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 1952 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
1954 1953
1955 md->io_pool = p->io_pool; 1954 md->io_pool = p->io_pool;
1956 p->io_pool = NULL; 1955 p->io_pool = NULL;
1957 md->tio_pool = p->tio_pool; 1956 md->tio_pool = p->tio_pool;
1958 p->tio_pool = NULL; 1957 p->tio_pool = NULL;
1959 md->bs = p->bs; 1958 md->bs = p->bs;
1960 p->bs = NULL; 1959 p->bs = NULL;
1961 1960
1962 out: 1961 out:
1963 /* mempool bind completed, the table no longer needs any mempools */ 1962 /* mempool bind completed, the table no longer needs any mempools */
1964 dm_table_free_md_mempools(t); 1963 dm_table_free_md_mempools(t);
1965 } 1964 }
1966 1965
1967 /* 1966 /*
1968 * Bind a table to the device. 1967 * Bind a table to the device.
1969 */ 1968 */
1970 static void event_callback(void *context) 1969 static void event_callback(void *context)
1971 { 1970 {
1972 unsigned long flags; 1971 unsigned long flags;
1973 LIST_HEAD(uevents); 1972 LIST_HEAD(uevents);
1974 struct mapped_device *md = (struct mapped_device *) context; 1973 struct mapped_device *md = (struct mapped_device *) context;
1975 1974
1976 spin_lock_irqsave(&md->uevent_lock, flags); 1975 spin_lock_irqsave(&md->uevent_lock, flags);
1977 list_splice_init(&md->uevent_list, &uevents); 1976 list_splice_init(&md->uevent_list, &uevents);
1978 spin_unlock_irqrestore(&md->uevent_lock, flags); 1977 spin_unlock_irqrestore(&md->uevent_lock, flags);
1979 1978
1980 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 1979 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
1981 1980
1982 atomic_inc(&md->event_nr); 1981 atomic_inc(&md->event_nr);
1983 wake_up(&md->eventq); 1982 wake_up(&md->eventq);
1984 } 1983 }
1985 1984
1986 /* 1985 /*
1987 * Protected by md->suspend_lock obtained by dm_swap_table(). 1986 * Protected by md->suspend_lock obtained by dm_swap_table().
1988 */ 1987 */
1989 static void __set_size(struct mapped_device *md, sector_t size) 1988 static void __set_size(struct mapped_device *md, sector_t size)
1990 { 1989 {
1991 set_capacity(md->disk, size); 1990 set_capacity(md->disk, size);
1992 1991
1993 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 1992 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
1994 } 1993 }
1995 1994
1996 /* 1995 /*
1997 * Return 1 if the queue has a compulsory merge_bvec_fn function. 1996 * Return 1 if the queue has a compulsory merge_bvec_fn function.
1998 * 1997 *
1999 * If this function returns 0, then the device is either a non-dm 1998 * If this function returns 0, then the device is either a non-dm
2000 * device without a merge_bvec_fn, or it is a dm device that is 1999 * device without a merge_bvec_fn, or it is a dm device that is
2001 * able to split any bios it receives that are too big. 2000 * able to split any bios it receives that are too big.
2002 */ 2001 */
2003 int dm_queue_merge_is_compulsory(struct request_queue *q) 2002 int dm_queue_merge_is_compulsory(struct request_queue *q)
2004 { 2003 {
2005 struct mapped_device *dev_md; 2004 struct mapped_device *dev_md;
2006 2005
2007 if (!q->merge_bvec_fn) 2006 if (!q->merge_bvec_fn)
2008 return 0; 2007 return 0;
2009 2008
2010 if (q->make_request_fn == dm_request) { 2009 if (q->make_request_fn == dm_request) {
2011 dev_md = q->queuedata; 2010 dev_md = q->queuedata;
2012 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags)) 2011 if (test_bit(DMF_MERGE_IS_OPTIONAL, &dev_md->flags))
2013 return 0; 2012 return 0;
2014 } 2013 }
2015 2014
2016 return 1; 2015 return 1;
2017 } 2016 }
2018 2017
2019 static int dm_device_merge_is_compulsory(struct dm_target *ti, 2018 static int dm_device_merge_is_compulsory(struct dm_target *ti,
2020 struct dm_dev *dev, sector_t start, 2019 struct dm_dev *dev, sector_t start,
2021 sector_t len, void *data) 2020 sector_t len, void *data)
2022 { 2021 {
2023 struct block_device *bdev = dev->bdev; 2022 struct block_device *bdev = dev->bdev;
2024 struct request_queue *q = bdev_get_queue(bdev); 2023 struct request_queue *q = bdev_get_queue(bdev);
2025 2024
2026 return dm_queue_merge_is_compulsory(q); 2025 return dm_queue_merge_is_compulsory(q);
2027 } 2026 }
2028 2027
2029 /* 2028 /*
2030 * Return 1 if it is acceptable to ignore merge_bvec_fn based 2029 * Return 1 if it is acceptable to ignore merge_bvec_fn based
2031 * on the properties of the underlying devices. 2030 * on the properties of the underlying devices.
2032 */ 2031 */
2033 static int dm_table_merge_is_optional(struct dm_table *table) 2032 static int dm_table_merge_is_optional(struct dm_table *table)
2034 { 2033 {
2035 unsigned i = 0; 2034 unsigned i = 0;
2036 struct dm_target *ti; 2035 struct dm_target *ti;
2037 2036
2038 while (i < dm_table_get_num_targets(table)) { 2037 while (i < dm_table_get_num_targets(table)) {
2039 ti = dm_table_get_target(table, i++); 2038 ti = dm_table_get_target(table, i++);
2040 2039
2041 if (ti->type->iterate_devices && 2040 if (ti->type->iterate_devices &&
2042 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL)) 2041 ti->type->iterate_devices(ti, dm_device_merge_is_compulsory, NULL))
2043 return 0; 2042 return 0;
2044 } 2043 }
2045 2044
2046 return 1; 2045 return 1;
2047 } 2046 }
2048 2047
2049 /* 2048 /*
2050 * Returns old map, which caller must destroy. 2049 * Returns old map, which caller must destroy.
2051 */ 2050 */
2052 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2051 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2053 struct queue_limits *limits) 2052 struct queue_limits *limits)
2054 { 2053 {
2055 struct dm_table *old_map; 2054 struct dm_table *old_map;
2056 struct request_queue *q = md->queue; 2055 struct request_queue *q = md->queue;
2057 sector_t size; 2056 sector_t size;
2058 unsigned long flags; 2057 unsigned long flags;
2059 int merge_is_optional; 2058 int merge_is_optional;
2060 2059
2061 size = dm_table_get_size(t); 2060 size = dm_table_get_size(t);
2062 2061
2063 /* 2062 /*
2064 * Wipe any geometry if the size of the table changed. 2063 * Wipe any geometry if the size of the table changed.
2065 */ 2064 */
2066 if (size != get_capacity(md->disk)) 2065 if (size != get_capacity(md->disk))
2067 memset(&md->geometry, 0, sizeof(md->geometry)); 2066 memset(&md->geometry, 0, sizeof(md->geometry));
2068 2067
2069 __set_size(md, size); 2068 __set_size(md, size);
2070 2069
2071 dm_table_event_callback(t, event_callback, md); 2070 dm_table_event_callback(t, event_callback, md);
2072 2071
2073 /* 2072 /*
2074 * If the old table type wasn't request-based, the queue hasn't been 2073 * If the old table type wasn't request-based, the queue hasn't been
2075 * stopped yet during suspension, so stop it now to prevent I/O from 2074 * stopped yet during suspension, so stop it now to prevent I/O from
2076 * being mapped before resume. 2075 * being mapped before resume.
2077 * This must be done before setting the queue restrictions, because 2076 * This must be done before setting the queue restrictions, because
2078 * request-based dm may start running immediately after they are set. 2077 * request-based dm may start running immediately after they are set.
2079 */ 2078 */
2080 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2079 if (dm_table_request_based(t) && !blk_queue_stopped(q))
2081 stop_queue(q); 2080 stop_queue(q);
2082 2081
2083 __bind_mempools(md, t); 2082 __bind_mempools(md, t);
2084 2083
2085 merge_is_optional = dm_table_merge_is_optional(t); 2084 merge_is_optional = dm_table_merge_is_optional(t);
2086 2085
2087 write_lock_irqsave(&md->map_lock, flags); 2086 write_lock_irqsave(&md->map_lock, flags);
2088 old_map = md->map; 2087 old_map = md->map;
2089 md->map = t; 2088 md->map = t;
2090 dm_table_set_restrictions(t, q, limits); 2089 dm_table_set_restrictions(t, q, limits);
2091 if (merge_is_optional) 2090 if (merge_is_optional)
2092 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2091 set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2093 else 2092 else
2094 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags); 2093 clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
2095 write_unlock_irqrestore(&md->map_lock, flags); 2094 write_unlock_irqrestore(&md->map_lock, flags);
2096 2095
2097 return old_map; 2096 return old_map;
2098 } 2097 }
2099 2098
2100 /* 2099 /*
2101 * Returns unbound table for the caller to free. 2100 * Returns unbound table for the caller to free.
2102 */ 2101 */
2103 static struct dm_table *__unbind(struct mapped_device *md) 2102 static struct dm_table *__unbind(struct mapped_device *md)
2104 { 2103 {
2105 struct dm_table *map = md->map; 2104 struct dm_table *map = md->map;
2106 unsigned long flags; 2105 unsigned long flags;
2107 2106
2108 if (!map) 2107 if (!map)
2109 return NULL; 2108 return NULL;
2110 2109
2111 dm_table_event_callback(map, NULL, NULL); 2110 dm_table_event_callback(map, NULL, NULL);
2112 write_lock_irqsave(&md->map_lock, flags); 2111 write_lock_irqsave(&md->map_lock, flags);
2113 md->map = NULL; 2112 md->map = NULL;
2114 write_unlock_irqrestore(&md->map_lock, flags); 2113 write_unlock_irqrestore(&md->map_lock, flags);
2115 2114
2116 return map; 2115 return map;
2117 } 2116 }
2118 2117
2119 /* 2118 /*
2120 * Constructor for a new device. 2119 * Constructor for a new device.
2121 */ 2120 */
2122 int dm_create(int minor, struct mapped_device **result) 2121 int dm_create(int minor, struct mapped_device **result)
2123 { 2122 {
2124 struct mapped_device *md; 2123 struct mapped_device *md;
2125 2124
2126 md = alloc_dev(minor); 2125 md = alloc_dev(minor);
2127 if (!md) 2126 if (!md)
2128 return -ENXIO; 2127 return -ENXIO;
2129 2128
2130 dm_sysfs_init(md); 2129 dm_sysfs_init(md);
2131 2130
2132 *result = md; 2131 *result = md;
2133 return 0; 2132 return 0;
2134 } 2133 }
2135 2134
2136 /* 2135 /*
2137 * Functions to manage md->type. 2136 * Functions to manage md->type.
2138 * All are required to hold md->type_lock. 2137 * All are required to hold md->type_lock.
2139 */ 2138 */
2140 void dm_lock_md_type(struct mapped_device *md) 2139 void dm_lock_md_type(struct mapped_device *md)
2141 { 2140 {
2142 mutex_lock(&md->type_lock); 2141 mutex_lock(&md->type_lock);
2143 } 2142 }
2144 2143
2145 void dm_unlock_md_type(struct mapped_device *md) 2144 void dm_unlock_md_type(struct mapped_device *md)
2146 { 2145 {
2147 mutex_unlock(&md->type_lock); 2146 mutex_unlock(&md->type_lock);
2148 } 2147 }
2149 2148
2150 void dm_set_md_type(struct mapped_device *md, unsigned type) 2149 void dm_set_md_type(struct mapped_device *md, unsigned type)
2151 { 2150 {
2152 md->type = type; 2151 md->type = type;
2153 } 2152 }
2154 2153
2155 unsigned dm_get_md_type(struct mapped_device *md) 2154 unsigned dm_get_md_type(struct mapped_device *md)
2156 { 2155 {
2157 return md->type; 2156 return md->type;
2158 } 2157 }
2159 2158
2160 /* 2159 /*
2161 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2160 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2162 */ 2161 */
2163 static int dm_init_request_based_queue(struct mapped_device *md) 2162 static int dm_init_request_based_queue(struct mapped_device *md)
2164 { 2163 {
2165 struct request_queue *q = NULL; 2164 struct request_queue *q = NULL;
2166 2165
2167 if (md->queue->elevator) 2166 if (md->queue->elevator)
2168 return 1; 2167 return 1;
2169 2168
2170 /* Fully initialize the queue */ 2169 /* Fully initialize the queue */
2171 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2170 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2172 if (!q) 2171 if (!q)
2173 return 0; 2172 return 0;
2174 2173
2175 md->queue = q; 2174 md->queue = q;
2176 md->saved_make_request_fn = md->queue->make_request_fn; 2175 md->saved_make_request_fn = md->queue->make_request_fn;
2177 dm_init_md_queue(md); 2176 dm_init_md_queue(md);
2178 blk_queue_softirq_done(md->queue, dm_softirq_done); 2177 blk_queue_softirq_done(md->queue, dm_softirq_done);
2179 blk_queue_prep_rq(md->queue, dm_prep_fn); 2178 blk_queue_prep_rq(md->queue, dm_prep_fn);
2180 blk_queue_lld_busy(md->queue, dm_lld_busy); 2179 blk_queue_lld_busy(md->queue, dm_lld_busy);
2181 2180
2182 elv_register_queue(md->queue); 2181 elv_register_queue(md->queue);
2183 2182
2184 return 1; 2183 return 1;
2185 } 2184 }
2186 2185
2187 /* 2186 /*
2188 * Setup the DM device's queue based on md's type 2187 * Setup the DM device's queue based on md's type
2189 */ 2188 */
2190 int dm_setup_md_queue(struct mapped_device *md) 2189 int dm_setup_md_queue(struct mapped_device *md)
2191 { 2190 {
2192 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2191 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2193 !dm_init_request_based_queue(md)) { 2192 !dm_init_request_based_queue(md)) {
2194 DMWARN("Cannot initialize queue for request-based mapped device"); 2193 DMWARN("Cannot initialize queue for request-based mapped device");
2195 return -EINVAL; 2194 return -EINVAL;
2196 } 2195 }
2197 2196
2198 return 0; 2197 return 0;
2199 } 2198 }
2200 2199
2201 static struct mapped_device *dm_find_md(dev_t dev) 2200 static struct mapped_device *dm_find_md(dev_t dev)
2202 { 2201 {
2203 struct mapped_device *md; 2202 struct mapped_device *md;
2204 unsigned minor = MINOR(dev); 2203 unsigned minor = MINOR(dev);
2205 2204
2206 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2205 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2207 return NULL; 2206 return NULL;
2208 2207
2209 spin_lock(&_minor_lock); 2208 spin_lock(&_minor_lock);
2210 2209
2211 md = idr_find(&_minor_idr, minor); 2210 md = idr_find(&_minor_idr, minor);
2212 if (md && (md == MINOR_ALLOCED || 2211 if (md && (md == MINOR_ALLOCED ||
2213 (MINOR(disk_devt(dm_disk(md))) != minor) || 2212 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2214 dm_deleting_md(md) || 2213 dm_deleting_md(md) ||
2215 test_bit(DMF_FREEING, &md->flags))) { 2214 test_bit(DMF_FREEING, &md->flags))) {
2216 md = NULL; 2215 md = NULL;
2217 goto out; 2216 goto out;
2218 } 2217 }
2219 2218
2220 out: 2219 out:
2221 spin_unlock(&_minor_lock); 2220 spin_unlock(&_minor_lock);
2222 2221
2223 return md; 2222 return md;
2224 } 2223 }
2225 2224
2226 struct mapped_device *dm_get_md(dev_t dev) 2225 struct mapped_device *dm_get_md(dev_t dev)
2227 { 2226 {
2228 struct mapped_device *md = dm_find_md(dev); 2227 struct mapped_device *md = dm_find_md(dev);
2229 2228
2230 if (md) 2229 if (md)
2231 dm_get(md); 2230 dm_get(md);
2232 2231
2233 return md; 2232 return md;
2234 } 2233 }
2235 2234
2236 void *dm_get_mdptr(struct mapped_device *md) 2235 void *dm_get_mdptr(struct mapped_device *md)
2237 { 2236 {
2238 return md->interface_ptr; 2237 return md->interface_ptr;
2239 } 2238 }
2240 2239
2241 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2240 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2242 { 2241 {
2243 md->interface_ptr = ptr; 2242 md->interface_ptr = ptr;
2244 } 2243 }
2245 2244
2246 void dm_get(struct mapped_device *md) 2245 void dm_get(struct mapped_device *md)
2247 { 2246 {
2248 atomic_inc(&md->holders); 2247 atomic_inc(&md->holders);
2249 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2248 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2250 } 2249 }
2251 2250
2252 const char *dm_device_name(struct mapped_device *md) 2251 const char *dm_device_name(struct mapped_device *md)
2253 { 2252 {
2254 return md->name; 2253 return md->name;
2255 } 2254 }
2256 EXPORT_SYMBOL_GPL(dm_device_name); 2255 EXPORT_SYMBOL_GPL(dm_device_name);
2257 2256
2258 static void __dm_destroy(struct mapped_device *md, bool wait) 2257 static void __dm_destroy(struct mapped_device *md, bool wait)
2259 { 2258 {
2260 struct dm_table *map; 2259 struct dm_table *map;
2261 2260
2262 might_sleep(); 2261 might_sleep();
2263 2262
2264 spin_lock(&_minor_lock); 2263 spin_lock(&_minor_lock);
2265 map = dm_get_live_table(md); 2264 map = dm_get_live_table(md);
2266 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2265 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2267 set_bit(DMF_FREEING, &md->flags); 2266 set_bit(DMF_FREEING, &md->flags);
2268 spin_unlock(&_minor_lock); 2267 spin_unlock(&_minor_lock);
2269 2268
2270 if (!dm_suspended_md(md)) { 2269 if (!dm_suspended_md(md)) {
2271 dm_table_presuspend_targets(map); 2270 dm_table_presuspend_targets(map);
2272 dm_table_postsuspend_targets(map); 2271 dm_table_postsuspend_targets(map);
2273 } 2272 }
2274 2273
2275 /* 2274 /*
2276 * Rare, but there may still be in-flight I/O requests that need to 2275 * Rare, but there may still be in-flight I/O requests that need to
2277 * complete. Wait for all references to disappear. 2276 * complete. Wait for all references to disappear.
2278 * No one should increment the reference count of the mapped_device 2277 * No one should increment the reference count of the mapped_device
2279 * after its state becomes DMF_FREEING. 2278 * after its state becomes DMF_FREEING.
2280 */ 2279 */
2281 if (wait) 2280 if (wait)
2282 while (atomic_read(&md->holders)) 2281 while (atomic_read(&md->holders))
2283 msleep(1); 2282 msleep(1);
2284 else if (atomic_read(&md->holders)) 2283 else if (atomic_read(&md->holders))
2285 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2284 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2286 dm_device_name(md), atomic_read(&md->holders)); 2285 dm_device_name(md), atomic_read(&md->holders));
2287 2286
2288 dm_sysfs_exit(md); 2287 dm_sysfs_exit(md);
2289 dm_table_put(map); 2288 dm_table_put(map);
2290 dm_table_destroy(__unbind(md)); 2289 dm_table_destroy(__unbind(md));
2291 free_dev(md); 2290 free_dev(md);
2292 } 2291 }
2293 2292
2294 void dm_destroy(struct mapped_device *md) 2293 void dm_destroy(struct mapped_device *md)
2295 { 2294 {
2296 __dm_destroy(md, true); 2295 __dm_destroy(md, true);
2297 } 2296 }
2298 2297
2299 void dm_destroy_immediate(struct mapped_device *md) 2298 void dm_destroy_immediate(struct mapped_device *md)
2300 { 2299 {
2301 __dm_destroy(md, false); 2300 __dm_destroy(md, false);
2302 } 2301 }
2303 2302
2304 void dm_put(struct mapped_device *md) 2303 void dm_put(struct mapped_device *md)
2305 { 2304 {
2306 atomic_dec(&md->holders); 2305 atomic_dec(&md->holders);
2307 } 2306 }
2308 EXPORT_SYMBOL_GPL(dm_put); 2307 EXPORT_SYMBOL_GPL(dm_put);
2309 2308
2310 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2309 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2311 { 2310 {
2312 int r = 0; 2311 int r = 0;
2313 DECLARE_WAITQUEUE(wait, current); 2312 DECLARE_WAITQUEUE(wait, current);
2314 2313
2315 add_wait_queue(&md->wait, &wait); 2314 add_wait_queue(&md->wait, &wait);
2316 2315
2317 while (1) { 2316 while (1) {
2318 set_current_state(interruptible); 2317 set_current_state(interruptible);
2319 2318
2320 smp_mb(); 2319 smp_mb();
2321 if (!md_in_flight(md)) 2320 if (!md_in_flight(md))
2322 break; 2321 break;
2323 2322
2324 if (interruptible == TASK_INTERRUPTIBLE && 2323 if (interruptible == TASK_INTERRUPTIBLE &&
2325 signal_pending(current)) { 2324 signal_pending(current)) {
2326 r = -EINTR; 2325 r = -EINTR;
2327 break; 2326 break;
2328 } 2327 }
2329 2328
2330 io_schedule(); 2329 io_schedule();
2331 } 2330 }
2332 set_current_state(TASK_RUNNING); 2331 set_current_state(TASK_RUNNING);
2333 2332
2334 remove_wait_queue(&md->wait, &wait); 2333 remove_wait_queue(&md->wait, &wait);
2335 2334
2336 return r; 2335 return r;
2337 } 2336 }
2338 2337
2339 /* 2338 /*
2340 * Process the deferred bios 2339 * Process the deferred bios
2341 */ 2340 */
2342 static void dm_wq_work(struct work_struct *work) 2341 static void dm_wq_work(struct work_struct *work)
2343 { 2342 {
2344 struct mapped_device *md = container_of(work, struct mapped_device, 2343 struct mapped_device *md = container_of(work, struct mapped_device,
2345 work); 2344 work);
2346 struct bio *c; 2345 struct bio *c;
2347 2346
2348 down_read(&md->io_lock); 2347 down_read(&md->io_lock);
2349 2348
2350 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2349 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2351 spin_lock_irq(&md->deferred_lock); 2350 spin_lock_irq(&md->deferred_lock);
2352 c = bio_list_pop(&md->deferred); 2351 c = bio_list_pop(&md->deferred);
2353 spin_unlock_irq(&md->deferred_lock); 2352 spin_unlock_irq(&md->deferred_lock);
2354 2353
2355 if (!c) 2354 if (!c)
2356 break; 2355 break;
2357 2356
2358 up_read(&md->io_lock); 2357 up_read(&md->io_lock);
2359 2358
2360 if (dm_request_based(md)) 2359 if (dm_request_based(md))
2361 generic_make_request(c); 2360 generic_make_request(c);
2362 else 2361 else
2363 __split_and_process_bio(md, c); 2362 __split_and_process_bio(md, c);
2364 2363
2365 down_read(&md->io_lock); 2364 down_read(&md->io_lock);
2366 } 2365 }
2367 2366
2368 up_read(&md->io_lock); 2367 up_read(&md->io_lock);
2369 } 2368 }
2370 2369
2371 static void dm_queue_flush(struct mapped_device *md) 2370 static void dm_queue_flush(struct mapped_device *md)
2372 { 2371 {
2373 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2372 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2374 smp_mb__after_clear_bit(); 2373 smp_mb__after_clear_bit();
2375 queue_work(md->wq, &md->work); 2374 queue_work(md->wq, &md->work);
2376 } 2375 }
2377 2376
2378 /* 2377 /*
2379 * Swap in a new table, returning the old one for the caller to destroy. 2378 * Swap in a new table, returning the old one for the caller to destroy.
2380 */ 2379 */
2381 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2380 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2382 { 2381 {
2383 struct dm_table *map = ERR_PTR(-EINVAL); 2382 struct dm_table *map = ERR_PTR(-EINVAL);
2384 struct queue_limits limits; 2383 struct queue_limits limits;
2385 int r; 2384 int r;
2386 2385
2387 mutex_lock(&md->suspend_lock); 2386 mutex_lock(&md->suspend_lock);
2388 2387
2389 /* device must be suspended */ 2388 /* device must be suspended */
2390 if (!dm_suspended_md(md)) 2389 if (!dm_suspended_md(md))
2391 goto out; 2390 goto out;
2392 2391
2393 r = dm_calculate_queue_limits(table, &limits); 2392 r = dm_calculate_queue_limits(table, &limits);
2394 if (r) { 2393 if (r) {
2395 map = ERR_PTR(r); 2394 map = ERR_PTR(r);
2396 goto out; 2395 goto out;
2397 } 2396 }
2398 2397
2399 map = __bind(md, table, &limits); 2398 map = __bind(md, table, &limits);
2400 2399
2401 out: 2400 out:
2402 mutex_unlock(&md->suspend_lock); 2401 mutex_unlock(&md->suspend_lock);
2403 return map; 2402 return map;
2404 } 2403 }
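
dm_swap_table() requires the device to already be suspended and hands back the previous map, which the caller must destroy. As a rough illustration (the real sequence lives in the ioctl layer, and activate_new_table() is a made-up name), the exported primitives compose roughly as follows, with 'new_map' assumed to be a fully constructed table:

	/*
	 * Illustrative only: suspend, swap in the new table, free the old
	 * one, then resume.  Error handling is simplified.
	 */
	static int activate_new_table(struct mapped_device *md,
				      struct dm_table *new_map)
	{
		struct dm_table *old_map;
		int r;

		if (!dm_suspended_md(md)) {
			r = dm_suspend(md, DM_SUSPEND_LOCKFS_FLAG);
			if (r)
				return r;
		}

		old_map = dm_swap_table(md, new_map);	/* previous map or ERR_PTR */
		if (IS_ERR(old_map))
			return PTR_ERR(old_map);

		if (old_map)
			dm_table_destroy(old_map);	/* caller owns the old map */

		return dm_resume(md);
	}
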
2405 2404
2406 /* 2405 /*
2407 * Functions to lock and unlock any filesystem running on the 2406 * Functions to lock and unlock any filesystem running on the
2408 * device. 2407 * device.
2409 */ 2408 */
2410 static int lock_fs(struct mapped_device *md) 2409 static int lock_fs(struct mapped_device *md)
2411 { 2410 {
2412 int r; 2411 int r;
2413 2412
2414 WARN_ON(md->frozen_sb); 2413 WARN_ON(md->frozen_sb);
2415 2414
2416 md->frozen_sb = freeze_bdev(md->bdev); 2415 md->frozen_sb = freeze_bdev(md->bdev);
2417 if (IS_ERR(md->frozen_sb)) { 2416 if (IS_ERR(md->frozen_sb)) {
2418 r = PTR_ERR(md->frozen_sb); 2417 r = PTR_ERR(md->frozen_sb);
2419 md->frozen_sb = NULL; 2418 md->frozen_sb = NULL;
2420 return r; 2419 return r;
2421 } 2420 }
2422 2421
2423 set_bit(DMF_FROZEN, &md->flags); 2422 set_bit(DMF_FROZEN, &md->flags);
2424 2423
2425 return 0; 2424 return 0;
2426 } 2425 }
2427 2426
2428 static void unlock_fs(struct mapped_device *md) 2427 static void unlock_fs(struct mapped_device *md)
2429 { 2428 {
2430 if (!test_bit(DMF_FROZEN, &md->flags)) 2429 if (!test_bit(DMF_FROZEN, &md->flags))
2431 return; 2430 return;
2432 2431
2433 thaw_bdev(md->bdev, md->frozen_sb); 2432 thaw_bdev(md->bdev, md->frozen_sb);
2434 md->frozen_sb = NULL; 2433 md->frozen_sb = NULL;
2435 clear_bit(DMF_FROZEN, &md->flags); 2434 clear_bit(DMF_FROZEN, &md->flags);
2436 } 2435 }
2437 2436
2438 /* 2437 /*
2439 * We need to be able to change a mapping table under a mounted 2438 * We need to be able to change a mapping table under a mounted
2440 * filesystem. For example we might want to move some data in 2439 * filesystem. For example we might want to move some data in
2441 * the background. Before the table can be swapped with 2440 * the background. Before the table can be swapped with
2442 * dm_bind_table, dm_suspend must be called to flush any in-flight 2441 * dm_bind_table, dm_suspend must be called to flush any in-flight
2443 * bios and ensure that any further I/O gets deferred. 2442 * bios and ensure that any further I/O gets deferred.
2444 */ 2443 */
2445 /* 2444 /*
2446 * Suspend mechanism in request-based dm. 2445 * Suspend mechanism in request-based dm.
2447 * 2446 *
2448 * 1. Flush all I/Os by lock_fs() if needed. 2447 * 1. Flush all I/Os by lock_fs() if needed.
2449 * 2. Stop dispatching any I/O by stopping the request_queue. 2448 * 2. Stop dispatching any I/O by stopping the request_queue.
2450 * 3. Wait for all in-flight I/Os to be completed or requeued. 2449 * 3. Wait for all in-flight I/Os to be completed or requeued.
2451 * 2450 *
2452 * To abort suspend, start the request_queue. 2451 * To abort suspend, start the request_queue.
2453 */ 2452 */
2454 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2453 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2455 { 2454 {
2456 struct dm_table *map = NULL; 2455 struct dm_table *map = NULL;
2457 int r = 0; 2456 int r = 0;
2458 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2457 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2459 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 2458 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2460 2459
2461 mutex_lock(&md->suspend_lock); 2460 mutex_lock(&md->suspend_lock);
2462 2461
2463 if (dm_suspended_md(md)) { 2462 if (dm_suspended_md(md)) {
2464 r = -EINVAL; 2463 r = -EINVAL;
2465 goto out_unlock; 2464 goto out_unlock;
2466 } 2465 }
2467 2466
2468 map = dm_get_live_table(md); 2467 map = dm_get_live_table(md);
2469 2468
2470 /* 2469 /*
2471 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2470 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2472 * This flag is cleared before dm_suspend returns. 2471 * This flag is cleared before dm_suspend returns.
2473 */ 2472 */
2474 if (noflush) 2473 if (noflush)
2475 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2474 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2476 2475
2477 /* This does not get reverted if there's an error later. */ 2476 /* This does not get reverted if there's an error later. */
2478 dm_table_presuspend_targets(map); 2477 dm_table_presuspend_targets(map);
2479 2478
2480 /* 2479 /*
2481 * Flush I/O to the device. 2480 * Flush I/O to the device.
2482 * Any I/O submitted after lock_fs() may not be flushed. 2481 * Any I/O submitted after lock_fs() may not be flushed.
2483 * noflush takes precedence over do_lockfs. 2482 * noflush takes precedence over do_lockfs.
2484 * (lock_fs() flushes I/Os and waits for them to complete.) 2483 * (lock_fs() flushes I/Os and waits for them to complete.)
2485 */ 2484 */
2486 if (!noflush && do_lockfs) { 2485 if (!noflush && do_lockfs) {
2487 r = lock_fs(md); 2486 r = lock_fs(md);
2488 if (r) 2487 if (r)
2489 goto out; 2488 goto out;
2490 } 2489 }
2491 2490
2492 /* 2491 /*
2493 * Here we must make sure that no processes are submitting requests 2492 * Here we must make sure that no processes are submitting requests
2494 * to target drivers i.e. no one may be executing 2493 * to target drivers i.e. no one may be executing
2495 * __split_and_process_bio. This is called from dm_request and 2494 * __split_and_process_bio. This is called from dm_request and
2496 * dm_wq_work. 2495 * dm_wq_work.
2497 * 2496 *
2498 * To get all processes out of __split_and_process_bio in dm_request, 2497 * To get all processes out of __split_and_process_bio in dm_request,
2499 * we take the write lock. To prevent any process from reentering 2498 * we take the write lock. To prevent any process from reentering
2500 * __split_and_process_bio from dm_request and quiesce the thread 2499 * __split_and_process_bio from dm_request and quiesce the thread
2501 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call 2500 * (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND and call
2502 * flush_workqueue(md->wq). 2501 * flush_workqueue(md->wq).
2503 */ 2502 */
2504 down_write(&md->io_lock); 2503 down_write(&md->io_lock);
2505 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2504 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2506 up_write(&md->io_lock); 2505 up_write(&md->io_lock);
2507 2506
2508 /* 2507 /*
2509 * Stop md->queue before flushing md->wq in case request-based 2508 * Stop md->queue before flushing md->wq in case request-based
2510 * dm defers requests to md->wq from md->queue. 2509 * dm defers requests to md->wq from md->queue.
2511 */ 2510 */
2512 if (dm_request_based(md)) 2511 if (dm_request_based(md))
2513 stop_queue(md->queue); 2512 stop_queue(md->queue);
2514 2513
2515 flush_workqueue(md->wq); 2514 flush_workqueue(md->wq);
2516 2515
2517 /* 2516 /*
2518 * At this point no more requests are entering target request routines. 2517 * At this point no more requests are entering target request routines.
2519 * We call dm_wait_for_completion to wait for all existing requests 2518 * We call dm_wait_for_completion to wait for all existing requests
2520 * to finish. 2519 * to finish.
2521 */ 2520 */
2522 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2521 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2523 2522
2524 down_write(&md->io_lock); 2523 down_write(&md->io_lock);
2525 if (noflush) 2524 if (noflush)
2526 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2525 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2527 up_write(&md->io_lock); 2526 up_write(&md->io_lock);
2528 2527
2529 /* were we interrupted? */ 2528 /* were we interrupted? */
2530 if (r < 0) { 2529 if (r < 0) {
2531 dm_queue_flush(md); 2530 dm_queue_flush(md);
2532 2531
2533 if (dm_request_based(md)) 2532 if (dm_request_based(md))
2534 start_queue(md->queue); 2533 start_queue(md->queue);
2535 2534
2536 unlock_fs(md); 2535 unlock_fs(md);
2537 goto out; /* pushback list is already flushed, so skip flush */ 2536 goto out; /* pushback list is already flushed, so skip flush */
2538 } 2537 }
2539 2538
2540 /* 2539 /*
2541 * If dm_wait_for_completion returned 0, the device is completely 2540 * If dm_wait_for_completion returned 0, the device is completely
2542 * quiescent now. There is no request-processing activity. All new 2541 * quiescent now. There is no request-processing activity. All new
2543 * requests are being added to md->deferred list. 2542 * requests are being added to md->deferred list.
2544 */ 2543 */
2545 2544
2546 set_bit(DMF_SUSPENDED, &md->flags); 2545 set_bit(DMF_SUSPENDED, &md->flags);
2547 2546
2548 dm_table_postsuspend_targets(map); 2547 dm_table_postsuspend_targets(map);
2549 2548
2550 out: 2549 out:
2551 dm_table_put(map); 2550 dm_table_put(map);
2552 2551
2553 out_unlock: 2552 out_unlock:
2554 mutex_unlock(&md->suspend_lock); 2553 mutex_unlock(&md->suspend_lock);
2555 return r; 2554 return r;
2556 } 2555 }
2557 2556
2558 int dm_resume(struct mapped_device *md) 2557 int dm_resume(struct mapped_device *md)
2559 { 2558 {
2560 int r = -EINVAL; 2559 int r = -EINVAL;
2561 struct dm_table *map = NULL; 2560 struct dm_table *map = NULL;
2562 2561
2563 mutex_lock(&md->suspend_lock); 2562 mutex_lock(&md->suspend_lock);
2564 if (!dm_suspended_md(md)) 2563 if (!dm_suspended_md(md))
2565 goto out; 2564 goto out;
2566 2565
2567 map = dm_get_live_table(md); 2566 map = dm_get_live_table(md);
2568 if (!map || !dm_table_get_size(map)) 2567 if (!map || !dm_table_get_size(map))
2569 goto out; 2568 goto out;
2570 2569
2571 r = dm_table_resume_targets(map); 2570 r = dm_table_resume_targets(map);
2572 if (r) 2571 if (r)
2573 goto out; 2572 goto out;
2574 2573
2575 dm_queue_flush(md); 2574 dm_queue_flush(md);
2576 2575
2577 /* 2576 /*
2578 * Flushing deferred I/Os must be done after targets are resumed 2577 * Flushing deferred I/Os must be done after targets are resumed
2579 * so that mapping of targets can work correctly. 2578 * so that mapping of targets can work correctly.
2580 * Request-based dm queues the deferred I/Os in its request_queue. 2579 * Request-based dm queues the deferred I/Os in its request_queue.
2581 */ 2580 */
2582 if (dm_request_based(md)) 2581 if (dm_request_based(md))
2583 start_queue(md->queue); 2582 start_queue(md->queue);
2584 2583
2585 unlock_fs(md); 2584 unlock_fs(md);
2586 2585
2587 clear_bit(DMF_SUSPENDED, &md->flags); 2586 clear_bit(DMF_SUSPENDED, &md->flags);
2588 2587
2589 r = 0; 2588 r = 0;
2590 out: 2589 out:
2591 dm_table_put(map); 2590 dm_table_put(map);
2592 mutex_unlock(&md->suspend_lock); 2591 mutex_unlock(&md->suspend_lock);
2593 2592
2594 return r; 2593 return r;
2595 } 2594 }
2596 2595
2597 /*----------------------------------------------------------------- 2596 /*-----------------------------------------------------------------
2598 * Event notification. 2597 * Event notification.
2599 *---------------------------------------------------------------*/ 2598 *---------------------------------------------------------------*/
2600 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2599 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2601 unsigned cookie) 2600 unsigned cookie)
2602 { 2601 {
2603 char udev_cookie[DM_COOKIE_LENGTH]; 2602 char udev_cookie[DM_COOKIE_LENGTH];
2604 char *envp[] = { udev_cookie, NULL }; 2603 char *envp[] = { udev_cookie, NULL };
2605 2604
2606 if (!cookie) 2605 if (!cookie)
2607 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2606 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2608 else { 2607 else {
2609 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2608 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2610 DM_COOKIE_ENV_VAR_NAME, cookie); 2609 DM_COOKIE_ENV_VAR_NAME, cookie);
2611 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2610 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2612 action, envp); 2611 action, envp);
2613 } 2612 }
2614 } 2613 }
2615 2614
2616 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2615 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2617 { 2616 {
2618 return atomic_add_return(1, &md->uevent_seq); 2617 return atomic_add_return(1, &md->uevent_seq);
2619 } 2618 }
2620 2619
2621 uint32_t dm_get_event_nr(struct mapped_device *md) 2620 uint32_t dm_get_event_nr(struct mapped_device *md)
2622 { 2621 {
2623 return atomic_read(&md->event_nr); 2622 return atomic_read(&md->event_nr);
2624 } 2623 }
2625 2624
2626 int dm_wait_event(struct mapped_device *md, int event_nr) 2625 int dm_wait_event(struct mapped_device *md, int event_nr)
2627 { 2626 {
2628 return wait_event_interruptible(md->eventq, 2627 return wait_event_interruptible(md->eventq,
2629 (event_nr != atomic_read(&md->event_nr))); 2628 (event_nr != atomic_read(&md->event_nr)));
2630 } 2629 }
2631 2630
2632 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2631 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2633 { 2632 {
2634 unsigned long flags; 2633 unsigned long flags;
2635 2634
2636 spin_lock_irqsave(&md->uevent_lock, flags); 2635 spin_lock_irqsave(&md->uevent_lock, flags);
2637 list_add(elist, &md->uevent_list); 2636 list_add(elist, &md->uevent_list);
2638 spin_unlock_irqrestore(&md->uevent_lock, flags); 2637 spin_unlock_irqrestore(&md->uevent_lock, flags);
2639 } 2638 }
2640 2639
2641 /* 2640 /*
2642 * The gendisk is only valid as long as you have a reference 2641 * The gendisk is only valid as long as you have a reference
2643 * count on 'md'. 2642 * count on 'md'.
2644 */ 2643 */
2645 struct gendisk *dm_disk(struct mapped_device *md) 2644 struct gendisk *dm_disk(struct mapped_device *md)
2646 { 2645 {
2647 return md->disk; 2646 return md->disk;
2648 } 2647 }
2649 2648
2650 struct kobject *dm_kobject(struct mapped_device *md) 2649 struct kobject *dm_kobject(struct mapped_device *md)
2651 { 2650 {
2652 return &md->kobj; 2651 return &md->kobj;
2653 } 2652 }
2654 2653
2655 /* 2654 /*
2656 * struct mapped_device should not be exported outside of dm.c 2655 * struct mapped_device should not be exported outside of dm.c
2657 * so use this check to verify that kobj is part of the md structure. 2656 * so use this check to verify that kobj is part of the md structure.
2658 */ 2657 */
2659 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2658 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2660 { 2659 {
2661 struct mapped_device *md; 2660 struct mapped_device *md;
2662 2661
2663 md = container_of(kobj, struct mapped_device, kobj); 2662 md = container_of(kobj, struct mapped_device, kobj);
2664 if (&md->kobj != kobj) 2663 if (&md->kobj != kobj)
2665 return NULL; 2664 return NULL;
2666 2665
2667 if (test_bit(DMF_FREEING, &md->flags) || 2666 if (test_bit(DMF_FREEING, &md->flags) ||
2668 dm_deleting_md(md)) 2667 dm_deleting_md(md))
2669 return NULL; 2668 return NULL;
2670 2669
2671 dm_get(md); 2670 dm_get(md);
2672 return md; 2671 return md;
2673 } 2672 }
2674 2673
2675 int dm_suspended_md(struct mapped_device *md) 2674 int dm_suspended_md(struct mapped_device *md)
2676 { 2675 {
2677 return test_bit(DMF_SUSPENDED, &md->flags); 2676 return test_bit(DMF_SUSPENDED, &md->flags);
2678 } 2677 }
2679 2678
2680 int dm_suspended(struct dm_target *ti) 2679 int dm_suspended(struct dm_target *ti)
2681 { 2680 {
2682 return dm_suspended_md(dm_table_get_md(ti->table)); 2681 return dm_suspended_md(dm_table_get_md(ti->table));
2683 } 2682 }
2684 EXPORT_SYMBOL_GPL(dm_suspended); 2683 EXPORT_SYMBOL_GPL(dm_suspended);
2685 2684
2686 int dm_noflush_suspending(struct dm_target *ti) 2685 int dm_noflush_suspending(struct dm_target *ti)
2687 { 2686 {
2688 return __noflush_suspending(dm_table_get_md(ti->table)); 2687 return __noflush_suspending(dm_table_get_md(ti->table));
2689 } 2688 }
2690 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2689 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2691 2690
2692 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity) 2691 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type, unsigned integrity)
2693 { 2692 {
2694 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2693 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2695 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS; 2694 unsigned int pool_size = (type == DM_TYPE_BIO_BASED) ? 16 : MIN_IOS;
2696 2695
2697 if (!pools) 2696 if (!pools)
2698 return NULL; 2697 return NULL;
2699 2698
2700 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2699 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2701 mempool_create_slab_pool(MIN_IOS, _io_cache) : 2700 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2702 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); 2701 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2703 if (!pools->io_pool) 2702 if (!pools->io_pool)
2704 goto free_pools_and_out; 2703 goto free_pools_and_out;
2705 2704
2706 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 2705 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2707 mempool_create_slab_pool(MIN_IOS, _tio_cache) : 2706 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2708 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); 2707 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2709 if (!pools->tio_pool) 2708 if (!pools->tio_pool)
2710 goto free_io_pool_and_out; 2709 goto free_io_pool_and_out;
2711 2710
2712 pools->bs = bioset_create(pool_size, 0); 2711 pools->bs = bioset_create(pool_size, 0);
2713 if (!pools->bs) 2712 if (!pools->bs)
2714 goto free_tio_pool_and_out; 2713 goto free_tio_pool_and_out;
2715 2714
2716 if (integrity && bioset_integrity_create(pools->bs, pool_size)) 2715 if (integrity && bioset_integrity_create(pools->bs, pool_size))
2717 goto free_bioset_and_out; 2716 goto free_bioset_and_out;
2718 2717
2719 return pools; 2718 return pools;
2720 2719
2721 free_bioset_and_out: 2720 free_bioset_and_out:
2722 bioset_free(pools->bs); 2721 bioset_free(pools->bs);
2723 2722
2724 free_tio_pool_and_out: 2723 free_tio_pool_and_out:
2725 mempool_destroy(pools->tio_pool); 2724 mempool_destroy(pools->tio_pool);
2726 2725
2727 free_io_pool_and_out: 2726 free_io_pool_and_out:
2728 mempool_destroy(pools->io_pool); 2727 mempool_destroy(pools->io_pool);
2729 2728
2730 free_pools_and_out: 2729 free_pools_and_out:
2731 kfree(pools); 2730 kfree(pools);
2732 2731
2733 return NULL; 2732 return NULL;
2734 } 2733 }
2735 2734
2736 void dm_free_md_mempools(struct dm_md_mempools *pools) 2735 void dm_free_md_mempools(struct dm_md_mempools *pools)
2737 { 2736 {
2738 if (!pools) 2737 if (!pools)
2739 return; 2738 return;
2740 2739
2741 if (pools->io_pool) 2740 if (pools->io_pool)
2742 mempool_destroy(pools->io_pool); 2741 mempool_destroy(pools->io_pool);
2743 2742
2744 if (pools->tio_pool) 2743 if (pools->tio_pool)
2745 mempool_destroy(pools->tio_pool); 2744 mempool_destroy(pools->tio_pool);
2746 2745
2747 if (pools->bs) 2746 if (pools->bs)
2748 bioset_free(pools->bs); 2747 bioset_free(pools->bs);
2749 2748
2750 kfree(pools); 2749 kfree(pools);
2751 } 2750 }
2752 2751
2753 static const struct block_device_operations dm_blk_dops = { 2752 static const struct block_device_operations dm_blk_dops = {
2754 .open = dm_blk_open, 2753 .open = dm_blk_open,
2755 .release = dm_blk_close, 2754 .release = dm_blk_close,
2756 .ioctl = dm_blk_ioctl, 2755 .ioctl = dm_blk_ioctl,
2757 .getgeo = dm_blk_getgeo, 2756 .getgeo = dm_blk_getgeo,
2758 .owner = THIS_MODULE 2757 .owner = THIS_MODULE
2759 }; 2758 };
2760 2759
2761 EXPORT_SYMBOL(dm_get_mapinfo); 2760 EXPORT_SYMBOL(dm_get_mapinfo);
2762 2761
2763 /* 2762 /*
2764 * module hooks 2763 * module hooks
2765 */ 2764 */
2766 module_init(dm_init); 2765 module_init(dm_init);
2767 module_exit(dm_exit); 2766 module_exit(dm_exit);
2768 2767
2769 module_param(major, uint, 0); 2768 module_param(major, uint, 0);
2770 MODULE_PARM_DESC(major, "The major number of the device mapper"); 2769 MODULE_PARM_DESC(major, "The major number of the device mapper");
2771 MODULE_DESCRIPTION(DM_NAME " driver"); 2770 MODULE_DESCRIPTION(DM_NAME " driver");
2772 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2771 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2773 MODULE_LICENSE("GPL"); 2772 MODULE_LICENSE("GPL");
2774 2773