Commit d87f4c14f27dc82d215108d8392a7d26687148a1

Authored by Tejun Heo
Committed by Jens Axboe
1 parent 3a2edd0d6d

dm: implement REQ_FLUSH/FUA support for bio-based dm

This patch converts bio-based dm to support REQ_FLUSH/FUA instead of
the now deprecated REQ_HARDBARRIER.

* -EOPNOTSUPP handling logic dropped.

* Preflush is handled as before but postflush is dropped and replaced
  with passing down REQ_FUA to member request_queues.  This replaces
  one array-wide cache flush with member-specific FUA writes.
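
  As an illustration of the effect (not part of this patch): a caller
  that wants a durable write can tag the data bio itself; bio-based dm
  then issues one zero-length preflush per target followed by the data
  with REQ_FUA, rather than a second array-wide flush afterwards.  A
  minimal sketch, where my_bdev, my_page and my_end_io are hypothetical:

	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = my_bdev;		/* hypothetical backing device */
	bio->bi_sector = 0;
	bio->bi_end_io = my_end_io;	/* hypothetical completion callback */
	bio_add_page(bio, my_page, PAGE_SIZE, 0);

	/* preflush + data + FUA; no post-flush round trip needed */
	submit_bio(WRITE_FLUSH_FUA, bio);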

* __split_and_process_bio() now calls __clone_and_map_flush() directly
  for flushes.

* It's now guaranteed that all FLUSH bios which are passed onto dm
  targets are zero length.  bio_empty_barrier() tests are replaced
  with REQ_FLUSH tests.
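
  For illustration, a minimal sketch of the new convention in a target's
  map function (the target name and its context struct are hypothetical;
  the dm-crypt and dm-snap hunks below do the equivalent):

	struct example_ctx {		/* hypothetical per-target context */
		struct dm_dev *dev;
	};

	static int example_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		struct example_ctx *ec = ti->private;

		/*
		 * FLUSH bios reaching a target are guaranteed to be zero
		 * length, so there is no data to remap; just point the
		 * bio at the backing device.
		 */
		if (bio->bi_rw & REQ_FLUSH) {
			bio->bi_bdev = ec->dev->bdev;
			return DM_MAPIO_REMAPPED;
		}

		/* normal data path: remap sector/device as usual */
		bio->bi_bdev = ec->dev->bdev;
		return DM_MAPIO_REMAPPED;
	}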

* Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes.

* Dropped unlikely() around REQ_FLUSH tests.  Flushes are not unlikely
  enough to be marked with unlikely().

* The block layer now filters out REQ_FLUSH/FUA bios if the
  request_queue doesn't support cache flushing, so advertise
  REQ_FLUSH | REQ_FUA capability.
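
  A sketch of what advertising this looks like at queue-init time (the
  function name here is hypothetical; the dm.c hunk below does this for
  the bio-based queue):

	static void example_init_queue(struct request_queue *q)
	{
		/*
		 * bio-based dm can pass FLUSH/FUA down to its targets, so
		 * advertise both; the block layer strips whatever bits a
		 * member device doesn't support.
		 */
		blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
	}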

* Request-based dm isn't converted yet.  dm_init_request_based_queue()
  resets flush support to 0 for now.  To avoid disturbing request-based
  dm code, dm->flush_error is added for bio-based dm while request-based
  dm continues to use dm->barrier_error.
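
  Until then the request-based setup simply withdraws the capability
  again, matching the dm.c hunk below:

	/* request-based dm: no flush/FUA support yet, advertise nothing */
	blk_queue_flush(md->queue, 0);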

Lightly tested linear, stripe, raid1, snap and crypt targets.  Please
proceed with caution as I'm not familiar with the code base.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: dm-devel@redhat.com
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 9 changed files with 80 additions and 97 deletions

drivers/md/dm-crypt.c
... ... @@ -1278,7 +1278,7 @@
1278 1278 struct dm_crypt_io *io;
1279 1279 struct crypt_config *cc;
1280 1280  
1281   - if (unlikely(bio_empty_barrier(bio))) {
  1281 + if (bio->bi_rw & REQ_FLUSH) {
1282 1282 cc = ti->private;
1283 1283 bio->bi_bdev = cc->dev->bdev;
1284 1284 return DM_MAPIO_REMAPPED;
drivers/md/dm-io.c
... ... @@ -31,7 +31,6 @@
31 31 */
32 32 struct io {
33 33 unsigned long error_bits;
34   - unsigned long eopnotsupp_bits;
35 34 atomic_t count;
36 35 struct task_struct *sleeper;
37 36 struct dm_io_client *client;
38 37  
... ... @@ -130,11 +129,8 @@
130 129 *---------------------------------------------------------------*/
131 130 static void dec_count(struct io *io, unsigned int region, int error)
132 131 {
133   - if (error) {
  132 + if (error)
134 133 set_bit(region, &io->error_bits);
135   - if (error == -EOPNOTSUPP)
136   - set_bit(region, &io->eopnotsupp_bits);
137   - }
138 134  
139 135 if (atomic_dec_and_test(&io->count)) {
140 136 if (io->sleeper)
... ... @@ -310,8 +306,8 @@
310 306 sector_t remaining = where->count;
311 307  
312 308 /*
313   - * where->count may be zero if rw holds a write barrier and we
314   - * need to send a zero-sized barrier.
  309 + * where->count may be zero if rw holds a flush and we need to
  310 + * send a zero-sized flush.
315 311 */
316 312 do {
317 313 /*
... ... @@ -364,7 +360,7 @@
364 360 */
365 361 for (i = 0; i < num_regions; i++) {
366 362 *dp = old_pages;
367   - if (where[i].count || (rw & REQ_HARDBARRIER))
  363 + if (where[i].count || (rw & REQ_FLUSH))
368 364 do_region(rw, i, where + i, dp, io);
369 365 }
370 366  
... ... @@ -393,9 +389,7 @@
393 389 return -EIO;
394 390 }
395 391  
396   -retry:
397 392 io->error_bits = 0;
398   - io->eopnotsupp_bits = 0;
399 393 atomic_set(&io->count, 1); /* see dispatch_io() */
400 394 io->sleeper = current;
401 395 io->client = client;
... ... @@ -412,11 +406,6 @@
412 406 }
413 407 set_current_state(TASK_RUNNING);
414 408  
415   - if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
416   - rw &= ~REQ_HARDBARRIER;
417   - goto retry;
418   - }
419   -
420 409 if (error_bits)
421 410 *error_bits = io->error_bits;
422 411  
... ... @@ -437,7 +426,6 @@
437 426  
438 427 io = mempool_alloc(client->pool, GFP_NOIO);
439 428 io->error_bits = 0;
440   - io->eopnotsupp_bits = 0;
441 429 atomic_set(&io->count, 1); /* see dispatch_io() */
442 430 io->sleeper = NULL;
443 431 io->client = client;
drivers/md/dm-log.c
... ... @@ -300,7 +300,7 @@
300 300 .count = 0,
301 301 };
302 302  
303   - lc->io_req.bi_rw = WRITE_BARRIER;
  303 + lc->io_req.bi_rw = WRITE_FLUSH;
304 304  
305 305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306 306 }
drivers/md/dm-raid1.c
... ... @@ -259,7 +259,7 @@
259 259 struct dm_io_region io[ms->nr_mirrors];
260 260 struct mirror *m;
261 261 struct dm_io_request io_req = {
262   - .bi_rw = WRITE_BARRIER,
  262 + .bi_rw = WRITE_FLUSH,
263 263 .mem.type = DM_IO_KMEM,
264 264 .mem.ptr.bvec = NULL,
265 265 .client = ms->io_client,
... ... @@ -629,7 +629,7 @@
629 629 struct dm_io_region io[ms->nr_mirrors], *dest = io;
630 630 struct mirror *m;
631 631 struct dm_io_request io_req = {
632   - .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
  632 + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
633 633 .mem.type = DM_IO_BVEC,
634 634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
635 635 .notify.fn = write_callback,
... ... @@ -670,7 +670,7 @@
670 670 bio_list_init(&requeue);
671 671  
672 672 while ((bio = bio_list_pop(writes))) {
673   - if (unlikely(bio_empty_barrier(bio))) {
  673 + if (bio->bi_rw & REQ_FLUSH) {
674 674 bio_list_add(&sync, bio);
675 675 continue;
676 676 }
... ... @@ -1203,7 +1203,7 @@
1203 1203 * We need to dec pending if this was a write.
1204 1204 */
1205 1205 if (rw == WRITE) {
1206   - if (likely(!bio_empty_barrier(bio)))
  1206 + if (!(bio->bi_rw & REQ_FLUSH))
1207 1207 dm_rh_dec(ms->rh, map_context->ll);
1208 1208 return error;
1209 1209 }
drivers/md/dm-region-hash.c
... ... @@ -81,9 +81,9 @@
81 81 struct list_head failed_recovered_regions;
82 82  
83 83 /*
84   - * If there was a barrier failure no regions can be marked clean.
  84 + * If there was a flush failure no regions can be marked clean.
85 85 */
86   - int barrier_failure;
  86 + int flush_failure;
87 87  
88 88 void *context;
89 89 sector_t target_begin;
... ... @@ -217,7 +217,7 @@
217 217 INIT_LIST_HEAD(&rh->quiesced_regions);
218 218 INIT_LIST_HEAD(&rh->recovered_regions);
219 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220   - rh->barrier_failure = 0;
  220 + rh->flush_failure = 0;
221 221  
222 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
223 223 sizeof(struct dm_region));
... ... @@ -399,8 +399,8 @@
399 399 region_t region = dm_rh_bio_to_region(rh, bio);
400 400 int recovering = 0;
401 401  
402   - if (bio_empty_barrier(bio)) {
403   - rh->barrier_failure = 1;
  402 + if (bio->bi_rw & REQ_FLUSH) {
  403 + rh->flush_failure = 1;
404 404 return;
405 405 }
406 406  
... ... @@ -524,7 +524,7 @@
524 524 struct bio *bio;
525 525  
526 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527   - if (bio_empty_barrier(bio))
  527 + if (bio->bi_rw & REQ_FLUSH)
528 528 continue;
529 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 530 }
531 531  
... ... @@ -555,9 +555,9 @@
555 555 */
556 556  
557 557 /* do nothing for DM_RH_NOSYNC */
558   - if (unlikely(rh->barrier_failure)) {
  558 + if (unlikely(rh->flush_failure)) {
559 559 /*
560   - * If a write barrier failed some time ago, we
  560 + * If a write flush failed some time ago, we
561 561 * don't know whether or not this write made it
562 562 * to the disk, so we must resync the device.
563 563 */
drivers/md/dm-snap-persistent.c
... ... @@ -687,7 +687,7 @@
687 687 /*
688 688 * Commit exceptions to disk.
689 689 */
690   - if (ps->valid && area_io(ps, WRITE_BARRIER))
  690 + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
691 691 ps->valid = 0;
692 692  
693 693 /*
drivers/md/dm-snap.c
... ... @@ -1587,7 +1587,7 @@
1587 1587 chunk_t chunk;
1588 1588 struct dm_snap_pending_exception *pe = NULL;
1589 1589  
1590   - if (unlikely(bio_empty_barrier(bio))) {
  1590 + if (bio->bi_rw & REQ_FLUSH) {
1591 1591 bio->bi_bdev = s->cow->bdev;
1592 1592 return DM_MAPIO_REMAPPED;
1593 1593 }
... ... @@ -1691,7 +1691,7 @@
1691 1691 int r = DM_MAPIO_REMAPPED;
1692 1692 chunk_t chunk;
1693 1693  
1694   - if (unlikely(bio_empty_barrier(bio))) {
  1694 + if (bio->bi_rw & REQ_FLUSH) {
1695 1695 if (!map_context->target_request_nr)
1696 1696 bio->bi_bdev = s->origin->bdev;
1697 1697 else
... ... @@ -2135,7 +2135,7 @@
2135 2135 struct dm_dev *dev = ti->private;
2136 2136 bio->bi_bdev = dev->bdev;
2137 2137  
2138   - if (unlikely(bio_empty_barrier(bio)))
  2138 + if (bio->bi_rw & REQ_FLUSH)
2139 2139 return DM_MAPIO_REMAPPED;
2140 2140  
2141 2141 /* Only tell snapshots if this is a write */
drivers/md/dm-stripe.c
... ... @@ -271,7 +271,7 @@
271 271 uint32_t stripe;
272 272 unsigned target_request_nr;
273 273  
274   - if (unlikely(bio_empty_barrier(bio))) {
  274 + if (bio->bi_rw & REQ_FLUSH) {
275 275 target_request_nr = map_context->target_request_nr;
276 276 BUG_ON(target_request_nr >= sc->stripes);
277 277 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
drivers/md/dm.c
... ... @@ -144,15 +144,16 @@
144 144 spinlock_t deferred_lock;
145 145  
146 146 /*
147   - * An error from the barrier request currently being processed.
  147 + * An error from the flush request currently being processed.
148 148 */
149   - int barrier_error;
  149 + int flush_error;
150 150  
151 151 /*
152 152 * Protect barrier_error from concurrent endio processing
153 153 * in request-based dm.
154 154 */
155 155 spinlock_t barrier_error_lock;
  156 + int barrier_error;
156 157  
157 158 /*
158 159 * Processing queue (flush/barriers)
... ... @@ -200,8 +201,8 @@
200 201 /* sysfs handle */
201 202 struct kobject kobj;
202 203  
203   - /* zero-length barrier that will be cloned and submitted to targets */
204   - struct bio barrier_bio;
  204 + /* zero-length flush that will be cloned and submitted to targets */
  205 + struct bio flush_bio;
205 206 };
206 207  
207 208 /*
... ... @@ -512,7 +513,7 @@
512 513  
513 514 /*
514 515 * After this is decremented the bio must not be touched if it is
515   - * a barrier.
  516 + * a flush.
516 517 */
517 518 dm_disk(md)->part0.in_flight[rw] = pending =
518 519 atomic_dec_return(&md->pending[rw]);
... ... @@ -626,7 +627,7 @@
626 627 */
627 628 spin_lock_irqsave(&md->deferred_lock, flags);
628 629 if (__noflush_suspending(md)) {
629   - if (!(io->bio->bi_rw & REQ_HARDBARRIER))
  630 + if (!(io->bio->bi_rw & REQ_FLUSH))
630 631 bio_list_add_head(&md->deferred,
631 632 io->bio);
632 633 } else
... ... @@ -638,20 +639,14 @@
638 639 io_error = io->error;
639 640 bio = io->bio;
640 641  
641   - if (bio->bi_rw & REQ_HARDBARRIER) {
  642 + if (bio->bi_rw & REQ_FLUSH) {
642 643 /*
643   - * There can be just one barrier request so we use
  644 + * There can be just one flush request so we use
644 645 * a per-device variable for error reporting.
645 646 * Note that you can't touch the bio after end_io_acct
646   - *
647   - * We ignore -EOPNOTSUPP for empty flush reported by
648   - * underlying devices. We assume that if the device
649   - * doesn't support empty barriers, it doesn't need
650   - * cache flushing commands.
651 647 */
652   - if (!md->barrier_error &&
653   - !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
654   - md->barrier_error = io_error;
  648 + if (!md->flush_error)
  649 + md->flush_error = io_error;
655 650 end_io_acct(io);
656 651 free_io(md, io);
657 652 } else {
... ... @@ -1119,7 +1114,7 @@
1119 1114 }
1120 1115  
1121 1116 /*
1122   - * Creates a little bio that is just does part of a bvec.
  1117 + * Creates a little bio that just does part of a bvec.
1123 1118 */
1124 1119 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 1120 unsigned short idx, unsigned int offset,
... ... @@ -1134,7 +1129,7 @@
1134 1129  
1135 1130 clone->bi_sector = sector;
1136 1131 clone->bi_bdev = bio->bi_bdev;
1137   - clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
  1132 + clone->bi_rw = bio->bi_rw;
1138 1133 clone->bi_vcnt = 1;
1139 1134 clone->bi_size = to_bytes(len);
1140 1135 clone->bi_io_vec->bv_offset = offset;
... ... @@ -1161,7 +1156,6 @@
1161 1156  
1162 1157 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 1158 __bio_clone(clone, bio);
1164   - clone->bi_rw &= ~REQ_HARDBARRIER;
1165 1159 clone->bi_destructor = dm_bio_destructor;
1166 1160 clone->bi_sector = sector;
1167 1161 clone->bi_idx = idx;
... ... @@ -1225,7 +1219,7 @@
1225 1219 __issue_target_request(ci, ti, request_nr, len);
1226 1220 }
1227 1221  
1228   -static int __clone_and_map_empty_barrier(struct clone_info *ci)
  1222 +static int __clone_and_map_flush(struct clone_info *ci)
1229 1223 {
1230 1224 unsigned target_nr = 0;
1231 1225 struct dm_target *ti;
... ... @@ -1289,9 +1283,6 @@
1289 1283 sector_t len = 0, max;
1290 1284 struct dm_target_io *tio;
1291 1285  
1292   - if (unlikely(bio_empty_barrier(bio)))
1293   - return __clone_and_map_empty_barrier(ci);
1294   -
1295 1286 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 1287 return __clone_and_map_discard(ci);
1297 1288  
... ... @@ -1383,11 +1374,11 @@
1383 1374  
1384 1375 ci.map = dm_get_live_table(md);
1385 1376 if (unlikely(!ci.map)) {
1386   - if (!(bio->bi_rw & REQ_HARDBARRIER))
  1377 + if (!(bio->bi_rw & REQ_FLUSH))
1387 1378 bio_io_error(bio);
1388 1379 else
1389   - if (!md->barrier_error)
1390   - md->barrier_error = -EIO;
  1380 + if (!md->flush_error)
  1381 + md->flush_error = -EIO;
1391 1382 return;
1392 1383 }
1393 1384  
... ... @@ -1400,14 +1391,22 @@
1400 1391 ci.io->md = md;
1401 1392 spin_lock_init(&ci.io->endio_lock);
1402 1393 ci.sector = bio->bi_sector;
1403   - ci.sector_count = bio_sectors(bio);
1404   - if (unlikely(bio_empty_barrier(bio)))
  1394 + if (!(bio->bi_rw & REQ_FLUSH))
  1395 + ci.sector_count = bio_sectors(bio);
  1396 + else {
  1397 + /* all FLUSH bio's reaching here should be empty */
  1398 + WARN_ON_ONCE(bio_has_data(bio));
1405 1399 ci.sector_count = 1;
  1400 + }
1406 1401 ci.idx = bio->bi_idx;
1407 1402  
1408 1403 start_io_acct(ci.io);
1409   - while (ci.sector_count && !error)
1410   - error = __clone_and_map(&ci);
  1404 + while (ci.sector_count && !error) {
  1405 + if (!(bio->bi_rw & REQ_FLUSH))
  1406 + error = __clone_and_map(&ci);
  1407 + else
  1408 + error = __clone_and_map_flush(&ci);
  1409 + }
1411 1410  
1412 1411 /* drop the extra reference count */
1413 1412 dec_pending(ci.io, error);
... ... @@ -1492,11 +1491,11 @@
1492 1491 part_stat_unlock();
1493 1492  
1494 1493 /*
1495   - * If we're suspended or the thread is processing barriers
  1494 + * If we're suspended or the thread is processing flushes
1496 1495 * we have to queue this io for later.
1497 1496 */
1498 1497 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499   - unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
  1498 + (bio->bi_rw & REQ_FLUSH)) {
1500 1499 up_read(&md->io_lock);
1501 1500  
1502 1501 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
... ... @@ -1940,6 +1939,7 @@
1940 1939 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 1940 md->queue->unplug_fn = dm_unplug_all;
1942 1941 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
  1942 + blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1943 1943 }
1944 1944  
1945 1945 /*
... ... @@ -2245,7 +2245,8 @@
2245 2245 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 2246 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 2247 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248   - blk_queue_flush(md->queue, REQ_FLUSH);
  2248 + /* no flush support for request based dm yet */
  2249 + blk_queue_flush(md->queue, 0);
2249 2250  
2250 2251 elv_register_queue(md->queue);
2251 2252  
... ... @@ -2406,41 +2407,35 @@
2406 2407 return r;
2407 2408 }
2408 2409  
2409   -static void dm_flush(struct mapped_device *md)
  2410 +static void process_flush(struct mapped_device *md, struct bio *bio)
2410 2411 {
  2412 + md->flush_error = 0;
  2413 +
  2414 + /* handle REQ_FLUSH */
2411 2415 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2412 2416  
2413   - bio_init(&md->barrier_bio);
2414   - md->barrier_bio.bi_bdev = md->bdev;
2415   - md->barrier_bio.bi_rw = WRITE_BARRIER;
2416   - __split_and_process_bio(md, &md->barrier_bio);
  2417 + bio_init(&md->flush_bio);
  2418 + md->flush_bio.bi_bdev = md->bdev;
  2419 + md->flush_bio.bi_rw = WRITE_FLUSH;
  2420 + __split_and_process_bio(md, &md->flush_bio);
2417 2421  
2418 2422 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419   -}
2420 2423  
2421   -static void process_barrier(struct mapped_device *md, struct bio *bio)
2422   -{
2423   - md->barrier_error = 0;
2424   -
2425   - dm_flush(md);
2426   -
2427   - if (!bio_empty_barrier(bio)) {
2428   - __split_and_process_bio(md, bio);
2429   - /*
2430   - * If the request isn't supported, don't waste time with
2431   - * the second flush.
2432   - */
2433   - if (md->barrier_error != -EOPNOTSUPP)
2434   - dm_flush(md);
  2424 + /* if it's an empty flush or the preflush failed, we're done */
  2425 + if (!bio_has_data(bio) || md->flush_error) {
  2426 + if (md->flush_error != DM_ENDIO_REQUEUE)
  2427 + bio_endio(bio, md->flush_error);
  2428 + else {
  2429 + spin_lock_irq(&md->deferred_lock);
  2430 + bio_list_add_head(&md->deferred, bio);
  2431 + spin_unlock_irq(&md->deferred_lock);
  2432 + }
  2433 + return;
2435 2434 }
2436 2435  
2437   - if (md->barrier_error != DM_ENDIO_REQUEUE)
2438   - bio_endio(bio, md->barrier_error);
2439   - else {
2440   - spin_lock_irq(&md->deferred_lock);
2441   - bio_list_add_head(&md->deferred, bio);
2442   - spin_unlock_irq(&md->deferred_lock);
2443   - }
  2436 + /* issue data + REQ_FUA */
  2437 + bio->bi_rw &= ~REQ_FLUSH;
  2438 + __split_and_process_bio(md, bio);
2444 2439 }
2445 2440  
2446 2441 /*
... ... @@ -2469,8 +2464,8 @@
2469 2464 if (dm_request_based(md))
2470 2465 generic_make_request(c);
2471 2466 else {
2472   - if (c->bi_rw & REQ_HARDBARRIER)
2473   - process_barrier(md, c);
  2467 + if (c->bi_rw & REQ_FLUSH)
  2468 + process_flush(md, c);
2474 2469 else
2475 2470 __split_and_process_bio(md, c);
2476 2471 }