Commit d87f4c14f27dc82d215108d8392a7d26687148a1

Authored by Tejun Heo
Committed by Jens Axboe
1 parent 3a2edd0d6d

dm: implement REQ_FLUSH/FUA support for bio-based dm

This patch converts bio-based dm to support REQ_FLUSH/FUA instead of
the now deprecated REQ_HARDBARRIER.

* -EOPNOTSUPP handling logic dropped.

* Preflush is handled as before but postflush is dropped and replaced
  with passing down REQ_FUA to member request_queues.  This replaces
  one array-wide cache flush with member-specific FUA writes.
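
  As an illustration of the effect (not part of this patch): a caller
  that wants a durable write can tag the data bio itself; bio-based dm
  then issues one zero-length preflush per target followed by the data
  with REQ_FUA, rather than a second array-wide flush afterwards.  A
  minimal sketch, where my_bdev, my_page and my_end_io are hypothetical:

	struct bio *bio = bio_alloc(GFP_NOIO, 1);

	bio->bi_bdev = my_bdev;		/* hypothetical backing device */
	bio->bi_sector = 0;
	bio->bi_end_io = my_end_io;	/* hypothetical completion callback */
	bio_add_page(bio, my_page, PAGE_SIZE, 0);

	/* preflush + data + FUA; no post-flush round trip needed */
	submit_bio(WRITE_FLUSH_FUA, bio);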

* __split_and_process_bio() now calls __clone_and_map_flush() directly
  for flushes.

* It's now guaranteed that all FLUSH bios which are passed onto dm
  targets are zero length.  bio_empty_barrier() tests are replaced
  with REQ_FLUSH tests.
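
  For illustration, a minimal sketch of the new convention in a target's
  map function (the target name and its context struct are hypothetical;
  the dm-crypt and dm-snap hunks below do the equivalent):

	struct example_ctx {		/* hypothetical per-target context */
		struct dm_dev *dev;
	};

	static int example_map(struct dm_target *ti, struct bio *bio,
			       union map_info *map_context)
	{
		struct example_ctx *ec = ti->private;

		/*
		 * FLUSH bios reaching a target are guaranteed to be zero
		 * length, so there is no data to remap; just point the
		 * bio at the backing device.
		 */
		if (bio->bi_rw & REQ_FLUSH) {
			bio->bi_bdev = ec->dev->bdev;
			return DM_MAPIO_REMAPPED;
		}

		/* normal data path: remap sector/device as usual */
		bio->bi_bdev = ec->dev->bdev;
		return DM_MAPIO_REMAPPED;
	}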

* Empty WRITE_BARRIERs are replaced with WRITE_FLUSHes.

* Dropped unlikely() around REQ_FLUSH tests.  Flushes are not unlikely
  enough to be marked with unlikely().

* The block layer now filters out REQ_FLUSH/FUA bios if the
  request_queue doesn't support cache flushing, so advertise
  REQ_FLUSH | REQ_FUA capability.
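
  A sketch of what advertising this looks like at queue-init time (the
  function name here is hypothetical; the dm.c hunk below does this for
  the bio-based queue):

	static void example_init_queue(struct request_queue *q)
	{
		/*
		 * bio-based dm can pass FLUSH/FUA down to its targets, so
		 * advertise both; the block layer strips whatever bits a
		 * member device doesn't support.
		 */
		blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
	}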

* Request-based dm isn't converted yet.  dm_init_request_based_queue()
  resets flush support to 0 for now.  To avoid disturbing request-based
  dm code, dm->flush_error is added for bio-based dm while request-based
  dm continues to use dm->barrier_error.
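
  Until then the request-based setup simply withdraws the capability
  again, matching the dm.c hunk below:

	/* request-based dm: no flush/FUA support yet, advertise nothing */
	blk_queue_flush(md->queue, 0);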

Lightly tested linear, stripe, raid1, snap and crypt targets.  Please
proceed with caution as I'm not familiar with the code base.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: dm-devel@redhat.com
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 9 changed files with 80 additions and 97 deletions

drivers/md/dm-crypt.c
... ... @@ -1278,7 +1278,7 @@
1278 1278 struct dm_crypt_io *io;
1279 1279 struct crypt_config *cc;
1280 1280  
1281   - if (unlikely(bio_empty_barrier(bio))) {
  1281 + if (bio->bi_rw & REQ_FLUSH) {
1282 1282 cc = ti->private;
1283 1283 bio->bi_bdev = cc->dev->bdev;
1284 1284 return DM_MAPIO_REMAPPED;
drivers/md/dm-io.c
... ... @@ -31,7 +31,6 @@
31 31 */
32 32 struct io {
33 33 unsigned long error_bits;
34   - unsigned long eopnotsupp_bits;
35 34 atomic_t count;
36 35 struct task_struct *sleeper;
37 36 struct dm_io_client *client;
38 37  
... ... @@ -130,11 +129,8 @@
130 129 *---------------------------------------------------------------*/
131 130 static void dec_count(struct io *io, unsigned int region, int error)
132 131 {
133   - if (error) {
  132 + if (error)
134 133 set_bit(region, &io->error_bits);
135   - if (error == -EOPNOTSUPP)
136   - set_bit(region, &io->eopnotsupp_bits);
137   - }
138 134  
139 135 if (atomic_dec_and_test(&io->count)) {
140 136 if (io->sleeper)
... ... @@ -310,8 +306,8 @@
310 306 sector_t remaining = where->count;
311 307  
312 308 /*
313   - * where->count may be zero if rw holds a write barrier and we
314   - * need to send a zero-sized barrier.
  309 + * where->count may be zero if rw holds a flush and we need to
  310 + * send a zero-sized flush.
315 311 */
316 312 do {
317 313 /*
... ... @@ -364,7 +360,7 @@
364 360 */
365 361 for (i = 0; i < num_regions; i++) {
366 362 *dp = old_pages;
367   - if (where[i].count || (rw & REQ_HARDBARRIER))
  363 + if (where[i].count || (rw & REQ_FLUSH))
368 364 do_region(rw, i, where + i, dp, io);
369 365 }
370 366  
... ... @@ -393,9 +389,7 @@
393 389 return -EIO;
394 390 }
395 391  
396   -retry:
397 392 io->error_bits = 0;
398   - io->eopnotsupp_bits = 0;
399 393 atomic_set(&io->count, 1); /* see dispatch_io() */
400 394 io->sleeper = current;
401 395 io->client = client;
... ... @@ -412,11 +406,6 @@
412 406 }
413 407 set_current_state(TASK_RUNNING);
414 408  
415   - if (io->eopnotsupp_bits && (rw & REQ_HARDBARRIER)) {
416   - rw &= ~REQ_HARDBARRIER;
417   - goto retry;
418   - }
419   -
420 409 if (error_bits)
421 410 *error_bits = io->error_bits;
422 411  
... ... @@ -437,7 +426,6 @@
437 426  
438 427 io = mempool_alloc(client->pool, GFP_NOIO);
439 428 io->error_bits = 0;
440   - io->eopnotsupp_bits = 0;
441 429 atomic_set(&io->count, 1); /* see dispatch_io() */
442 430 io->sleeper = NULL;
443 431 io->client = client;
drivers/md/dm-log.c
... ... @@ -300,7 +300,7 @@
300 300 .count = 0,
301 301 };
302 302  
303   - lc->io_req.bi_rw = WRITE_BARRIER;
  303 + lc->io_req.bi_rw = WRITE_FLUSH;
304 304  
305 305 return dm_io(&lc->io_req, 1, &null_location, NULL);
306 306 }
drivers/md/dm-raid1.c
... ... @@ -259,7 +259,7 @@
259 259 struct dm_io_region io[ms->nr_mirrors];
260 260 struct mirror *m;
261 261 struct dm_io_request io_req = {
262   - .bi_rw = WRITE_BARRIER,
  262 + .bi_rw = WRITE_FLUSH,
263 263 .mem.type = DM_IO_KMEM,
264 264 .mem.ptr.bvec = NULL,
265 265 .client = ms->io_client,
... ... @@ -629,7 +629,7 @@
629 629 struct dm_io_region io[ms->nr_mirrors], *dest = io;
630 630 struct mirror *m;
631 631 struct dm_io_request io_req = {
632   - .bi_rw = WRITE | (bio->bi_rw & WRITE_BARRIER),
  632 + .bi_rw = WRITE | (bio->bi_rw & WRITE_FLUSH_FUA),
633 633 .mem.type = DM_IO_BVEC,
634 634 .mem.ptr.bvec = bio->bi_io_vec + bio->bi_idx,
635 635 .notify.fn = write_callback,
... ... @@ -670,7 +670,7 @@
670 670 bio_list_init(&requeue);
671 671  
672 672 while ((bio = bio_list_pop(writes))) {
673   - if (unlikely(bio_empty_barrier(bio))) {
  673 + if (bio->bi_rw & REQ_FLUSH) {
674 674 bio_list_add(&sync, bio);
675 675 continue;
676 676 }
... ... @@ -1203,7 +1203,7 @@
1203 1203 * We need to dec pending if this was a write.
1204 1204 */
1205 1205 if (rw == WRITE) {
1206   - if (likely(!bio_empty_barrier(bio)))
  1206 + if (!(bio->bi_rw & REQ_FLUSH))
1207 1207 dm_rh_dec(ms->rh, map_context->ll);
1208 1208 return error;
1209 1209 }
drivers/md/dm-region-hash.c
... ... @@ -81,9 +81,9 @@
81 81 struct list_head failed_recovered_regions;
82 82  
83 83 /*
84   - * If there was a barrier failure no regions can be marked clean.
  84 + * If there was a flush failure no regions can be marked clean.
85 85 */
86   - int barrier_failure;
  86 + int flush_failure;
87 87  
88 88 void *context;
89 89 sector_t target_begin;
... ... @@ -217,7 +217,7 @@
217 217 INIT_LIST_HEAD(&rh->quiesced_regions);
218 218 INIT_LIST_HEAD(&rh->recovered_regions);
219 219 INIT_LIST_HEAD(&rh->failed_recovered_regions);
220   - rh->barrier_failure = 0;
  220 + rh->flush_failure = 0;
221 221  
222 222 rh->region_pool = mempool_create_kmalloc_pool(MIN_REGIONS,
223 223 sizeof(struct dm_region));
... ... @@ -399,8 +399,8 @@
399 399 region_t region = dm_rh_bio_to_region(rh, bio);
400 400 int recovering = 0;
401 401  
402   - if (bio_empty_barrier(bio)) {
403   - rh->barrier_failure = 1;
  402 + if (bio->bi_rw & REQ_FLUSH) {
  403 + rh->flush_failure = 1;
404 404 return;
405 405 }
406 406  
... ... @@ -524,7 +524,7 @@
524 524 struct bio *bio;
525 525  
526 526 for (bio = bios->head; bio; bio = bio->bi_next) {
527   - if (bio_empty_barrier(bio))
  527 + if (bio->bi_rw & REQ_FLUSH)
528 528 continue;
529 529 rh_inc(rh, dm_rh_bio_to_region(rh, bio));
530 530 }
531 531  
... ... @@ -555,9 +555,9 @@
555 555 */
556 556  
557 557 /* do nothing for DM_RH_NOSYNC */
558   - if (unlikely(rh->barrier_failure)) {
  558 + if (unlikely(rh->flush_failure)) {
559 559 /*
560   - * If a write barrier failed some time ago, we
  560 + * If a write flush failed some time ago, we
561 561 * don't know whether or not this write made it
562 562 * to the disk, so we must resync the device.
563 563 */
drivers/md/dm-snap-persistent.c
... ... @@ -687,7 +687,7 @@
687 687 /*
688 688 * Commit exceptions to disk.
689 689 */
690   - if (ps->valid && area_io(ps, WRITE_BARRIER))
  690 + if (ps->valid && area_io(ps, WRITE_FLUSH_FUA))
691 691 ps->valid = 0;
692 692  
693 693 /*
drivers/md/dm-snap.c
... ... @@ -1587,7 +1587,7 @@
1587 1587 chunk_t chunk;
1588 1588 struct dm_snap_pending_exception *pe = NULL;
1589 1589  
1590   - if (unlikely(bio_empty_barrier(bio))) {
  1590 + if (bio->bi_rw & REQ_FLUSH) {
1591 1591 bio->bi_bdev = s->cow->bdev;
1592 1592 return DM_MAPIO_REMAPPED;
1593 1593 }
... ... @@ -1691,7 +1691,7 @@
1691 1691 int r = DM_MAPIO_REMAPPED;
1692 1692 chunk_t chunk;
1693 1693  
1694   - if (unlikely(bio_empty_barrier(bio))) {
  1694 + if (bio->bi_rw & REQ_FLUSH) {
1695 1695 if (!map_context->target_request_nr)
1696 1696 bio->bi_bdev = s->origin->bdev;
1697 1697 else
... ... @@ -2135,7 +2135,7 @@
2135 2135 struct dm_dev *dev = ti->private;
2136 2136 bio->bi_bdev = dev->bdev;
2137 2137  
2138   - if (unlikely(bio_empty_barrier(bio)))
  2138 + if (bio->bi_rw & REQ_FLUSH)
2139 2139 return DM_MAPIO_REMAPPED;
2140 2140  
2141 2141 /* Only tell snapshots if this is a write */
drivers/md/dm-stripe.c
... ... @@ -271,7 +271,7 @@
271 271 uint32_t stripe;
272 272 unsigned target_request_nr;
273 273  
274   - if (unlikely(bio_empty_barrier(bio))) {
  274 + if (bio->bi_rw & REQ_FLUSH) {
275 275 target_request_nr = map_context->target_request_nr;
276 276 BUG_ON(target_request_nr >= sc->stripes);
277 277 bio->bi_bdev = sc->stripe[target_request_nr].dev->bdev;
drivers/md/dm.c
... ... @@ -144,15 +144,16 @@
144 144 spinlock_t deferred_lock;
145 145  
146 146 /*
147   - * An error from the barrier request currently being processed.
  147 + * An error from the flush request currently being processed.
148 148 */
149   - int barrier_error;
  149 + int flush_error;
150 150  
151 151 /*
152 152 * Protect barrier_error from concurrent endio processing
153 153 * in request-based dm.
154 154 */
155 155 spinlock_t barrier_error_lock;
  156 + int barrier_error;
156 157  
157 158 /*
158 159 * Processing queue (flush/barriers)
... ... @@ -200,8 +201,8 @@
200 201 /* sysfs handle */
201 202 struct kobject kobj;
202 203  
203   - /* zero-length barrier that will be cloned and submitted to targets */
204   - struct bio barrier_bio;
  204 + /* zero-length flush that will be cloned and submitted to targets */
  205 + struct bio flush_bio;
205 206 };
206 207  
207 208 /*
... ... @@ -512,7 +513,7 @@
512 513  
513 514 /*
514 515 * After this is decremented the bio must not be touched if it is
515   - * a barrier.
  516 + * a flush.
516 517 */
517 518 dm_disk(md)->part0.in_flight[rw] = pending =
518 519 atomic_dec_return(&md->pending[rw]);
... ... @@ -626,7 +627,7 @@
626 627 */
627 628 spin_lock_irqsave(&md->deferred_lock, flags);
628 629 if (__noflush_suspending(md)) {
629   - if (!(io->bio->bi_rw & REQ_HARDBARRIER))
  630 + if (!(io->bio->bi_rw & REQ_FLUSH))
630 631 bio_list_add_head(&md->deferred,
631 632 io->bio);
632 633 } else
... ... @@ -638,20 +639,14 @@
638 639 io_error = io->error;
639 640 bio = io->bio;
640 641  
641   - if (bio->bi_rw & REQ_HARDBARRIER) {
  642 + if (bio->bi_rw & REQ_FLUSH) {
642 643 /*
643   - * There can be just one barrier request so we use
  644 + * There can be just one flush request so we use
644 645 * a per-device variable for error reporting.
645 646 * Note that you can't touch the bio after end_io_acct
646   - *
647   - * We ignore -EOPNOTSUPP for empty flush reported by
648   - * underlying devices. We assume that if the device
649   - * doesn't support empty barriers, it doesn't need
650   - * cache flushing commands.
651 647 */
652   - if (!md->barrier_error &&
653   - !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
654   - md->barrier_error = io_error;
  648 + if (!md->flush_error)
  649 + md->flush_error = io_error;
655 650 end_io_acct(io);
656 651 free_io(md, io);
657 652 } else {
... ... @@ -1119,7 +1114,7 @@
1119 1114 }
1120 1115  
1121 1116 /*
1122   - * Creates a little bio that is just does part of a bvec.
  1117 + * Creates a little bio that just does part of a bvec.
1123 1118 */
1124 1119 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 1120 unsigned short idx, unsigned int offset,
... ... @@ -1134,7 +1129,7 @@
1134 1129  
1135 1130 clone->bi_sector = sector;
1136 1131 clone->bi_bdev = bio->bi_bdev;
1137   - clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
  1132 + clone->bi_rw = bio->bi_rw;
1138 1133 clone->bi_vcnt = 1;
1139 1134 clone->bi_size = to_bytes(len);
1140 1135 clone->bi_io_vec->bv_offset = offset;
... ... @@ -1161,7 +1156,6 @@
1161 1156  
1162 1157 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 1158 __bio_clone(clone, bio);
1164   - clone->bi_rw &= ~REQ_HARDBARRIER;
1165 1159 clone->bi_destructor = dm_bio_destructor;
1166 1160 clone->bi_sector = sector;
1167 1161 clone->bi_idx = idx;
... ... @@ -1225,7 +1219,7 @@
1225 1219 __issue_target_request(ci, ti, request_nr, len);
1226 1220 }
1227 1221  
1228   -static int __clone_and_map_empty_barrier(struct clone_info *ci)
  1222 +static int __clone_and_map_flush(struct clone_info *ci)
1229 1223 {
1230 1224 unsigned target_nr = 0;
1231 1225 struct dm_target *ti;
... ... @@ -1289,9 +1283,6 @@
1289 1283 sector_t len = 0, max;
1290 1284 struct dm_target_io *tio;
1291 1285  
1292   - if (unlikely(bio_empty_barrier(bio)))
1293   - return __clone_and_map_empty_barrier(ci);
1294   -
1295 1286 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 1287 return __clone_and_map_discard(ci);
1297 1288  
... ... @@ -1383,11 +1374,11 @@
1383 1374  
1384 1375 ci.map = dm_get_live_table(md);
1385 1376 if (unlikely(!ci.map)) {
1386   - if (!(bio->bi_rw & REQ_HARDBARRIER))
  1377 + if (!(bio->bi_rw & REQ_FLUSH))
1387 1378 bio_io_error(bio);
1388 1379 else
1389   - if (!md->barrier_error)
1390   - md->barrier_error = -EIO;
  1380 + if (!md->flush_error)
  1381 + md->flush_error = -EIO;
1391 1382 return;
1392 1383 }
1393 1384  
... ... @@ -1400,14 +1391,22 @@
1400 1391 ci.io->md = md;
1401 1392 spin_lock_init(&ci.io->endio_lock);
1402 1393 ci.sector = bio->bi_sector;
1403   - ci.sector_count = bio_sectors(bio);
1404   - if (unlikely(bio_empty_barrier(bio)))
  1394 + if (!(bio->bi_rw & REQ_FLUSH))
  1395 + ci.sector_count = bio_sectors(bio);
  1396 + else {
  1397 + /* all FLUSH bio's reaching here should be empty */
  1398 + WARN_ON_ONCE(bio_has_data(bio));
1405 1399 ci.sector_count = 1;
  1400 + }
1406 1401 ci.idx = bio->bi_idx;
1407 1402  
1408 1403 start_io_acct(ci.io);
1409   - while (ci.sector_count && !error)
1410   - error = __clone_and_map(&ci);
  1404 + while (ci.sector_count && !error) {
  1405 + if (!(bio->bi_rw & REQ_FLUSH))
  1406 + error = __clone_and_map(&ci);
  1407 + else
  1408 + error = __clone_and_map_flush(&ci);
  1409 + }
1411 1410  
1412 1411 /* drop the extra reference count */
1413 1412 dec_pending(ci.io, error);
... ... @@ -1492,11 +1491,11 @@
1492 1491 part_stat_unlock();
1493 1492  
1494 1493 /*
1495   - * If we're suspended or the thread is processing barriers
  1494 + * If we're suspended or the thread is processing flushes
1496 1495 * we have to queue this io for later.
1497 1496 */
1498 1497 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499   - unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
  1498 + (bio->bi_rw & REQ_FLUSH)) {
1500 1499 up_read(&md->io_lock);
1501 1500  
1502 1501 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
... ... @@ -1940,6 +1939,7 @@
1940 1939 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 1940 md->queue->unplug_fn = dm_unplug_all;
1942 1941 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
  1942 + blk_queue_flush(md->queue, REQ_FLUSH | REQ_FUA);
1943 1943 }
1944 1944  
1945 1945 /*
... ... @@ -2245,7 +2245,8 @@
2245 2245 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 2246 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 2247 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248   - blk_queue_flush(md->queue, REQ_FLUSH);
  2248 + /* no flush support for request based dm yet */
  2249 + blk_queue_flush(md->queue, 0);
2249 2250  
2250 2251 elv_register_queue(md->queue);
2251 2252  
... ... @@ -2406,41 +2407,35 @@
2406 2407 return r;
2407 2408 }
2408 2409  
2409   -static void dm_flush(struct mapped_device *md)
  2410 +static void process_flush(struct mapped_device *md, struct bio *bio)
2410 2411 {
  2412 + md->flush_error = 0;
  2413 +
  2414 + /* handle REQ_FLUSH */
2411 2415 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2412 2416  
2413   - bio_init(&md->barrier_bio);
2414   - md->barrier_bio.bi_bdev = md->bdev;
2415   - md->barrier_bio.bi_rw = WRITE_BARRIER;
2416   - __split_and_process_bio(md, &md->barrier_bio);
  2417 + bio_init(&md->flush_bio);
  2418 + md->flush_bio.bi_bdev = md->bdev;
  2419 + md->flush_bio.bi_rw = WRITE_FLUSH;
  2420 + __split_and_process_bio(md, &md->flush_bio);
2417 2421  
2418 2422 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419   -}
2420 2423  
2421   -static void process_barrier(struct mapped_device *md, struct bio *bio)
2422   -{
2423   - md->barrier_error = 0;
2424   -
2425   - dm_flush(md);
2426   -
2427   - if (!bio_empty_barrier(bio)) {
2428   - __split_and_process_bio(md, bio);
2429   - /*
2430   - * If the request isn't supported, don't waste time with
2431   - * the second flush.
2432   - */
2433   - if (md->barrier_error != -EOPNOTSUPP)
2434   - dm_flush(md);
  2424 + /* if it's an empty flush or the preflush failed, we're done */
  2425 + if (!bio_has_data(bio) || md->flush_error) {
  2426 + if (md->flush_error != DM_ENDIO_REQUEUE)
  2427 + bio_endio(bio, md->flush_error);
  2428 + else {
  2429 + spin_lock_irq(&md->deferred_lock);
  2430 + bio_list_add_head(&md->deferred, bio);
  2431 + spin_unlock_irq(&md->deferred_lock);
  2432 + }
  2433 + return;
2435 2434 }
2436 2435  
2437   - if (md->barrier_error != DM_ENDIO_REQUEUE)
2438   - bio_endio(bio, md->barrier_error);
2439   - else {
2440   - spin_lock_irq(&md->deferred_lock);
2441   - bio_list_add_head(&md->deferred, bio);
2442   - spin_unlock_irq(&md->deferred_lock);
2443   - }
  2436 + /* issue data + REQ_FUA */
  2437 + bio->bi_rw &= ~REQ_FLUSH;
  2438 + __split_and_process_bio(md, bio);
2444 2439 }
2445 2440  
2446 2441 /*
... ... @@ -2469,8 +2464,8 @@
2469 2464 if (dm_request_based(md))
2470 2465 generic_make_request(c);
2471 2466 else {
2472   - if (c->bi_rw & REQ_HARDBARRIER)
2473   - process_barrier(md, c);
  2467 + if (c->bi_rw & REQ_FLUSH)
  2468 + process_flush(md, c);
2474 2469 else
2475 2470 __split_and_process_bio(md, c);
2476 2471 }