Commit: 4913efe456c987057e5d36a3f0a55422a9072cae
Author: Tejun Heo
Committer: Jens Axboe
Parent: 6958f14545

block: deprecate barrier and replace blk_queue_ordered() with blk_queue_flush()

Barrier is deemed too heavy and will soon be replaced by FLUSH/FUA
requests.  Deprecate barrier.  All REQ_HARDBARRIER requests now fail
with -EOPNOTSUPP, and blk_queue_ordered() is replaced with the simpler
blk_queue_flush().
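
Until the FLUSH/FUA conversion lands, callers that still issue
barriers have to cope with that failure; a caller-side sketch
(illustrative only, not part of this patch, assuming the current
blkdev_issue_flush() signature shown further down in the diff):

  int err = blkdev_issue_flush(bdev, GFP_KERNEL, NULL, BLKDEV_IFL_WAIT);
  if (err == -EOPNOTSUPP)
          /* queue advertises no ordering, the barrier was refused */
          err = 0;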

blk_queue_flush() takes combinations of REQ_FLUSH and REQ_FUA.  If a
device has a write cache and can flush it, it should set REQ_FLUSH.  If
the device can handle FUA writes, it should also set REQ_FUA.
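
A minimal sketch of the driver side, assuming a hypothetical setup
helper for a device whose volatile write cache can be flushed and
which also handles FUA writes:

  #include <linux/blkdev.h>

  static void example_setup_flush(struct request_queue *q)
  {
          /* write cache present and flushable -> REQ_FLUSH,
           * FUA writes handled by the device  -> REQ_FUA */
          blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
  }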

All blk_queue_ordered() users are converted.

* ORDERED_DRAIN is mapped to 0 which is the default value.
* ORDERED_DRAIN_FLUSH is mapped to REQ_FLUSH.
* ORDERED_DRAIN_FLUSH_FUA is mapped to REQ_FLUSH | REQ_FUA.
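
For instance, a driver that previously advertised drain + flush
ordering maps over as follows (illustrative only, not taken from a
specific driver in this series):

  /* before */
  blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);

  /* after */
  blk_queue_flush(q, REQ_FLUSH);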

Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Boaz Harrosh <bharrosh@panasas.com>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Nick Piggin <npiggin@kernel.dk>
Cc: Michael S. Tsirkin <mst@redhat.com>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Geert Uytterhoeven <Geert.Uytterhoeven@sonycom.com>
Cc: David S. Miller <davem@davemloft.net>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Pierre Ossman <drzeus@drzeus.cx>
Cc: Stefan Weinhuber <wein@de.ibm.com>
Signed-off-by: Jens Axboe <jaxboe@fusionio.com>

Showing 15 changed files with 67 additions and 102 deletions

block/blk-barrier.c:
1 /* 1 /*
2 * Functions related to barrier IO handling 2 * Functions related to barrier IO handling
3 */ 3 */
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/module.h> 5 #include <linux/module.h>
6 #include <linux/bio.h> 6 #include <linux/bio.h>
7 #include <linux/blkdev.h> 7 #include <linux/blkdev.h>
8 #include <linux/gfp.h> 8 #include <linux/gfp.h>
9 9
10 #include "blk.h" 10 #include "blk.h"
11 11
12 /**
13 * blk_queue_ordered - does this queue support ordered writes
14 * @q: the request queue
15 * @ordered: one of QUEUE_ORDERED_*
16 *
17 * Description:
18 * For journalled file systems, doing ordered writes on a commit
19 * block instead of explicitly doing wait_on_buffer (which is bad
20 * for performance) can be a big win. Block drivers supporting this
21 * feature should call this function and indicate so.
22 *
23 **/
24 int blk_queue_ordered(struct request_queue *q, unsigned ordered)
25 {
26 if (ordered != QUEUE_ORDERED_NONE &&
27 ordered != QUEUE_ORDERED_DRAIN &&
28 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
29 ordered != QUEUE_ORDERED_DRAIN_FUA) {
30 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
31 return -EINVAL;
32 }
33
34 q->ordered = ordered;
35 q->next_ordered = ordered;
36
37 return 0;
38 }
39 EXPORT_SYMBOL(blk_queue_ordered);
40
41 /* 12 /*
42 * Cache flushing for ordered writes handling 13 * Cache flushing for ordered writes handling
43 */ 14 */
44 unsigned blk_ordered_cur_seq(struct request_queue *q) 15 unsigned blk_ordered_cur_seq(struct request_queue *q)
45 { 16 {
46 if (!q->ordseq) 17 if (!q->ordseq)
47 return 0; 18 return 0;
48 return 1 << ffz(q->ordseq); 19 return 1 << ffz(q->ordseq);
49 } 20 }
50 21
51 unsigned blk_ordered_req_seq(struct request *rq) 22 unsigned blk_ordered_req_seq(struct request *rq)
52 { 23 {
53 struct request_queue *q = rq->q; 24 struct request_queue *q = rq->q;
54 25
55 BUG_ON(q->ordseq == 0); 26 BUG_ON(q->ordseq == 0);
56 27
57 if (rq == &q->pre_flush_rq) 28 if (rq == &q->pre_flush_rq)
58 return QUEUE_ORDSEQ_PREFLUSH; 29 return QUEUE_ORDSEQ_PREFLUSH;
59 if (rq == &q->bar_rq) 30 if (rq == &q->bar_rq)
60 return QUEUE_ORDSEQ_BAR; 31 return QUEUE_ORDSEQ_BAR;
61 if (rq == &q->post_flush_rq) 32 if (rq == &q->post_flush_rq)
62 return QUEUE_ORDSEQ_POSTFLUSH; 33 return QUEUE_ORDSEQ_POSTFLUSH;
63 34
64 /* 35 /*
65 * !fs requests don't need to follow barrier ordering. Always 36 * !fs requests don't need to follow barrier ordering. Always
66 * put them at the front. This fixes the following deadlock. 37 * put them at the front. This fixes the following deadlock.
67 * 38 *
68 * http://thread.gmane.org/gmane.linux.kernel/537473 39 * http://thread.gmane.org/gmane.linux.kernel/537473
69 */ 40 */
70 if (rq->cmd_type != REQ_TYPE_FS) 41 if (rq->cmd_type != REQ_TYPE_FS)
71 return QUEUE_ORDSEQ_DRAIN; 42 return QUEUE_ORDSEQ_DRAIN;
72 43
73 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 44 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
74 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 45 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
75 return QUEUE_ORDSEQ_DRAIN; 46 return QUEUE_ORDSEQ_DRAIN;
76 else 47 else
77 return QUEUE_ORDSEQ_DONE; 48 return QUEUE_ORDSEQ_DONE;
78 } 49 }
79 50
80 bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 51 bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
81 { 52 {
82 struct request *rq; 53 struct request *rq;
83 54
84 if (error && !q->orderr) 55 if (error && !q->orderr)
85 q->orderr = error; 56 q->orderr = error;
86 57
87 BUG_ON(q->ordseq & seq); 58 BUG_ON(q->ordseq & seq);
88 q->ordseq |= seq; 59 q->ordseq |= seq;
89 60
90 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 61 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
91 return false; 62 return false;
92 63
93 /* 64 /*
94 * Okay, sequence complete. 65 * Okay, sequence complete.
95 */ 66 */
96 q->ordseq = 0; 67 q->ordseq = 0;
97 rq = q->orig_bar_rq; 68 rq = q->orig_bar_rq;
98 __blk_end_request_all(rq, q->orderr); 69 __blk_end_request_all(rq, q->orderr);
99 return true; 70 return true;
100 } 71 }
101 72
102 static void pre_flush_end_io(struct request *rq, int error) 73 static void pre_flush_end_io(struct request *rq, int error)
103 { 74 {
104 elv_completed_request(rq->q, rq); 75 elv_completed_request(rq->q, rq);
105 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 76 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
106 } 77 }
107 78
108 static void bar_end_io(struct request *rq, int error) 79 static void bar_end_io(struct request *rq, int error)
109 { 80 {
110 elv_completed_request(rq->q, rq); 81 elv_completed_request(rq->q, rq);
111 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 82 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
112 } 83 }
113 84
114 static void post_flush_end_io(struct request *rq, int error) 85 static void post_flush_end_io(struct request *rq, int error)
115 { 86 {
116 elv_completed_request(rq->q, rq); 87 elv_completed_request(rq->q, rq);
117 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 88 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
118 } 89 }
119 90
120 static void queue_flush(struct request_queue *q, unsigned which) 91 static void queue_flush(struct request_queue *q, unsigned which)
121 { 92 {
122 struct request *rq; 93 struct request *rq;
123 rq_end_io_fn *end_io; 94 rq_end_io_fn *end_io;
124 95
125 if (which == QUEUE_ORDERED_DO_PREFLUSH) { 96 if (which == QUEUE_ORDERED_DO_PREFLUSH) {
126 rq = &q->pre_flush_rq; 97 rq = &q->pre_flush_rq;
127 end_io = pre_flush_end_io; 98 end_io = pre_flush_end_io;
128 } else { 99 } else {
129 rq = &q->post_flush_rq; 100 rq = &q->post_flush_rq;
130 end_io = post_flush_end_io; 101 end_io = post_flush_end_io;
131 } 102 }
132 103
133 blk_rq_init(q, rq); 104 blk_rq_init(q, rq);
134 rq->cmd_type = REQ_TYPE_FS; 105 rq->cmd_type = REQ_TYPE_FS;
135 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH; 106 rq->cmd_flags = REQ_HARDBARRIER | REQ_FLUSH;
136 rq->rq_disk = q->orig_bar_rq->rq_disk; 107 rq->rq_disk = q->orig_bar_rq->rq_disk;
137 rq->end_io = end_io; 108 rq->end_io = end_io;
138 109
139 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 110 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
140 } 111 }
141 112
142 static inline bool start_ordered(struct request_queue *q, struct request **rqp) 113 static inline bool start_ordered(struct request_queue *q, struct request **rqp)
143 { 114 {
144 struct request *rq = *rqp; 115 struct request *rq = *rqp;
145 unsigned skip = 0; 116 unsigned skip = 0;
146 117
147 q->orderr = 0; 118 q->orderr = 0;
148 q->ordered = q->next_ordered; 119 q->ordered = q->next_ordered;
149 q->ordseq |= QUEUE_ORDSEQ_STARTED; 120 q->ordseq |= QUEUE_ORDSEQ_STARTED;
150 121
151 /* 122 /*
152 * For an empty barrier, there's no actual BAR request, which 123 * For an empty barrier, there's no actual BAR request, which
153 * in turn makes POSTFLUSH unnecessary. Mask them off. 124 * in turn makes POSTFLUSH unnecessary. Mask them off.
154 */ 125 */
155 if (!blk_rq_sectors(rq)) 126 if (!blk_rq_sectors(rq))
156 q->ordered &= ~(QUEUE_ORDERED_DO_BAR | 127 q->ordered &= ~(QUEUE_ORDERED_DO_BAR |
157 QUEUE_ORDERED_DO_POSTFLUSH); 128 QUEUE_ORDERED_DO_POSTFLUSH);
158 129
159 /* stash away the original request */ 130 /* stash away the original request */
160 blk_dequeue_request(rq); 131 blk_dequeue_request(rq);
161 q->orig_bar_rq = rq; 132 q->orig_bar_rq = rq;
162 rq = NULL; 133 rq = NULL;
163 134
164 /* 135 /*
165 * Queue ordered sequence. As we stack them at the head, we 136 * Queue ordered sequence. As we stack them at the head, we
166 * need to queue in reverse order. Note that we rely on that 137 * need to queue in reverse order. Note that we rely on that
167 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 138 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
168 * request gets inbetween ordered sequence. 139 * request gets inbetween ordered sequence.
169 */ 140 */
170 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) { 141 if (q->ordered & QUEUE_ORDERED_DO_POSTFLUSH) {
171 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH); 142 queue_flush(q, QUEUE_ORDERED_DO_POSTFLUSH);
172 rq = &q->post_flush_rq; 143 rq = &q->post_flush_rq;
173 } else 144 } else
174 skip |= QUEUE_ORDSEQ_POSTFLUSH; 145 skip |= QUEUE_ORDSEQ_POSTFLUSH;
175 146
176 if (q->ordered & QUEUE_ORDERED_DO_BAR) { 147 if (q->ordered & QUEUE_ORDERED_DO_BAR) {
177 rq = &q->bar_rq; 148 rq = &q->bar_rq;
178 149
179 /* initialize proxy request and queue it */ 150 /* initialize proxy request and queue it */
180 blk_rq_init(q, rq); 151 blk_rq_init(q, rq);
181 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 152 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
182 rq->cmd_flags |= REQ_WRITE; 153 rq->cmd_flags |= REQ_WRITE;
183 if (q->ordered & QUEUE_ORDERED_DO_FUA) 154 if (q->ordered & QUEUE_ORDERED_DO_FUA)
184 rq->cmd_flags |= REQ_FUA; 155 rq->cmd_flags |= REQ_FUA;
185 init_request_from_bio(rq, q->orig_bar_rq->bio); 156 init_request_from_bio(rq, q->orig_bar_rq->bio);
186 rq->end_io = bar_end_io; 157 rq->end_io = bar_end_io;
187 158
188 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 159 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
189 } else 160 } else
190 skip |= QUEUE_ORDSEQ_BAR; 161 skip |= QUEUE_ORDSEQ_BAR;
191 162
192 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) { 163 if (q->ordered & QUEUE_ORDERED_DO_PREFLUSH) {
193 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH); 164 queue_flush(q, QUEUE_ORDERED_DO_PREFLUSH);
194 rq = &q->pre_flush_rq; 165 rq = &q->pre_flush_rq;
195 } else 166 } else
196 skip |= QUEUE_ORDSEQ_PREFLUSH; 167 skip |= QUEUE_ORDSEQ_PREFLUSH;
197 168
198 if (queue_in_flight(q)) 169 if (queue_in_flight(q))
199 rq = NULL; 170 rq = NULL;
200 else 171 else
201 skip |= QUEUE_ORDSEQ_DRAIN; 172 skip |= QUEUE_ORDSEQ_DRAIN;
202 173
203 *rqp = rq; 174 *rqp = rq;
204 175
205 /* 176 /*
206 * Complete skipped sequences. If whole sequence is complete, 177 * Complete skipped sequences. If whole sequence is complete,
207 * return false to tell elevator that this request is gone. 178 * return false to tell elevator that this request is gone.
208 */ 179 */
209 return !blk_ordered_complete_seq(q, skip, 0); 180 return !blk_ordered_complete_seq(q, skip, 0);
210 } 181 }
211 182
212 bool blk_do_ordered(struct request_queue *q, struct request **rqp) 183 bool blk_do_ordered(struct request_queue *q, struct request **rqp)
213 { 184 {
214 struct request *rq = *rqp; 185 struct request *rq = *rqp;
215 const int is_barrier = rq->cmd_type == REQ_TYPE_FS && 186 const int is_barrier = rq->cmd_type == REQ_TYPE_FS &&
216 (rq->cmd_flags & REQ_HARDBARRIER); 187 (rq->cmd_flags & REQ_HARDBARRIER);
217 188
218 if (!q->ordseq) { 189 if (!q->ordseq) {
219 if (!is_barrier) 190 if (!is_barrier)
220 return true; 191 return true;
221 192
222 if (q->next_ordered != QUEUE_ORDERED_NONE) 193 if (q->next_ordered != QUEUE_ORDERED_NONE)
223 return start_ordered(q, rqp); 194 return start_ordered(q, rqp);
224 else { 195 else {
225 /* 196 /*
226 * Queue ordering not supported. Terminate 197 * Queue ordering not supported. Terminate
227 * with prejudice. 198 * with prejudice.
228 */ 199 */
229 blk_dequeue_request(rq); 200 blk_dequeue_request(rq);
230 __blk_end_request_all(rq, -EOPNOTSUPP); 201 __blk_end_request_all(rq, -EOPNOTSUPP);
231 *rqp = NULL; 202 *rqp = NULL;
232 return false; 203 return false;
233 } 204 }
234 } 205 }
235 206
236 /* 207 /*
237 * Ordered sequence in progress 208 * Ordered sequence in progress
238 */ 209 */
239 210
240 /* Special requests are not subject to ordering rules. */ 211 /* Special requests are not subject to ordering rules. */
241 if (rq->cmd_type != REQ_TYPE_FS && 212 if (rq->cmd_type != REQ_TYPE_FS &&
242 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 213 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
243 return true; 214 return true;
244 215
245 /* Ordered by draining. Wait for turn. */ 216 /* Ordered by draining. Wait for turn. */
246 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 217 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
247 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 218 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
248 *rqp = NULL; 219 *rqp = NULL;
249 220
250 return true; 221 return true;
251 } 222 }
252 223
253 static void bio_end_empty_barrier(struct bio *bio, int err) 224 static void bio_end_empty_barrier(struct bio *bio, int err)
254 { 225 {
255 if (err) { 226 if (err) {
256 if (err == -EOPNOTSUPP) 227 if (err == -EOPNOTSUPP)
257 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags); 228 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
258 clear_bit(BIO_UPTODATE, &bio->bi_flags); 229 clear_bit(BIO_UPTODATE, &bio->bi_flags);
259 } 230 }
260 if (bio->bi_private) 231 if (bio->bi_private)
261 complete(bio->bi_private); 232 complete(bio->bi_private);
262 bio_put(bio); 233 bio_put(bio);
263 } 234 }
264 235
265 /** 236 /**
266 * blkdev_issue_flush - queue a flush 237 * blkdev_issue_flush - queue a flush
267 * @bdev: blockdev to issue flush for 238 * @bdev: blockdev to issue flush for
268 * @gfp_mask: memory allocation flags (for bio_alloc) 239 * @gfp_mask: memory allocation flags (for bio_alloc)
269 * @error_sector: error sector 240 * @error_sector: error sector
270 * @flags: BLKDEV_IFL_* flags to control behaviour 241 * @flags: BLKDEV_IFL_* flags to control behaviour
271 * 242 *
272 * Description: 243 * Description:
273 * Issue a flush for the block device in question. Caller can supply 244 * Issue a flush for the block device in question. Caller can supply
274 * room for storing the error offset in case of a flush error, if they 245 * room for storing the error offset in case of a flush error, if they
275 * wish to. If WAIT flag is not passed then caller may check only what 246 * wish to. If WAIT flag is not passed then caller may check only what
276 * request was pushed in some internal queue for later handling. 247 * request was pushed in some internal queue for later handling.
277 */ 248 */
278 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask, 249 int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
279 sector_t *error_sector, unsigned long flags) 250 sector_t *error_sector, unsigned long flags)
280 { 251 {
281 DECLARE_COMPLETION_ONSTACK(wait); 252 DECLARE_COMPLETION_ONSTACK(wait);
282 struct request_queue *q; 253 struct request_queue *q;
283 struct bio *bio; 254 struct bio *bio;
284 int ret = 0; 255 int ret = 0;
285 256
286 if (bdev->bd_disk == NULL) 257 if (bdev->bd_disk == NULL)
287 return -ENXIO; 258 return -ENXIO;
288 259
289 q = bdev_get_queue(bdev); 260 q = bdev_get_queue(bdev);
290 if (!q) 261 if (!q)
291 return -ENXIO; 262 return -ENXIO;
292 263
293 /* 264 /*
294 * some block devices may not have their queue correctly set up here 265 * some block devices may not have their queue correctly set up here
295 * (e.g. loop device without a backing file) and so issuing a flush 266 * (e.g. loop device without a backing file) and so issuing a flush
296 * here will panic. Ensure there is a request function before issuing 267 * here will panic. Ensure there is a request function before issuing
297 * the barrier. 268 * the barrier.
298 */ 269 */
299 if (!q->make_request_fn) 270 if (!q->make_request_fn)
300 return -ENXIO; 271 return -ENXIO;
301 272
302 bio = bio_alloc(gfp_mask, 0); 273 bio = bio_alloc(gfp_mask, 0);
303 bio->bi_end_io = bio_end_empty_barrier; 274 bio->bi_end_io = bio_end_empty_barrier;
304 bio->bi_bdev = bdev; 275 bio->bi_bdev = bdev;
305 if (test_bit(BLKDEV_WAIT, &flags)) 276 if (test_bit(BLKDEV_WAIT, &flags))
306 bio->bi_private = &wait; 277 bio->bi_private = &wait;
307 278
308 bio_get(bio); 279 bio_get(bio);
309 submit_bio(WRITE_BARRIER, bio); 280 submit_bio(WRITE_BARRIER, bio);
310 if (test_bit(BLKDEV_WAIT, &flags)) { 281 if (test_bit(BLKDEV_WAIT, &flags)) {
311 wait_for_completion(&wait); 282 wait_for_completion(&wait);
312 /* 283 /*
313 * The driver must store the error location in ->bi_sector, if 284 * The driver must store the error location in ->bi_sector, if
314 * it supports it. For non-stacked drivers, this should be 285 * it supports it. For non-stacked drivers, this should be
315 * copied from blk_rq_pos(rq). 286 * copied from blk_rq_pos(rq).
316 */ 287 */
317 if (error_sector) 288 if (error_sector)
318 *error_sector = bio->bi_sector; 289 *error_sector = bio->bi_sector;
319 } 290 }
320 291
321 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 292 if (bio_flagged(bio, BIO_EOPNOTSUPP))
322 ret = -EOPNOTSUPP; 293 ret = -EOPNOTSUPP;
323 else if (!bio_flagged(bio, BIO_UPTODATE)) 294 else if (!bio_flagged(bio, BIO_UPTODATE))
324 ret = -EIO; 295 ret = -EIO;
325 296
326 bio_put(bio); 297 bio_put(bio);
327 return ret; 298 return ret;
328 } 299 }
329 EXPORT_SYMBOL(blkdev_issue_flush); 300 EXPORT_SYMBOL(blkdev_issue_flush);
330 301
block/blk-core.c:
1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au>
7 * - July2000 7 * - July2000
8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 8 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
9 */ 9 */
10 10
11 /* 11 /*
12 * This handles all read/write requests to block devices 12 * This handles all read/write requests to block devices
13 */ 13 */
14 #include <linux/kernel.h> 14 #include <linux/kernel.h>
15 #include <linux/module.h> 15 #include <linux/module.h>
16 #include <linux/backing-dev.h> 16 #include <linux/backing-dev.h>
17 #include <linux/bio.h> 17 #include <linux/bio.h>
18 #include <linux/blkdev.h> 18 #include <linux/blkdev.h>
19 #include <linux/highmem.h> 19 #include <linux/highmem.h>
20 #include <linux/mm.h> 20 #include <linux/mm.h>
21 #include <linux/kernel_stat.h> 21 #include <linux/kernel_stat.h>
22 #include <linux/string.h> 22 #include <linux/string.h>
23 #include <linux/init.h> 23 #include <linux/init.h>
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/fault-inject.h> 29 #include <linux/fault-inject.h>
30 30
31 #define CREATE_TRACE_POINTS 31 #define CREATE_TRACE_POINTS
32 #include <trace/events/block.h> 32 #include <trace/events/block.h>
33 33
34 #include "blk.h" 34 #include "blk.h"
35 35
36 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap); 36 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
37 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); 37 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
38 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); 38 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
39 39
40 static int __make_request(struct request_queue *q, struct bio *bio); 40 static int __make_request(struct request_queue *q, struct bio *bio);
41 41
42 /* 42 /*
43 * For the allocated request tables 43 * For the allocated request tables
44 */ 44 */
45 static struct kmem_cache *request_cachep; 45 static struct kmem_cache *request_cachep;
46 46
47 /* 47 /*
48 * For queue allocation 48 * For queue allocation
49 */ 49 */
50 struct kmem_cache *blk_requestq_cachep; 50 struct kmem_cache *blk_requestq_cachep;
51 51
52 /* 52 /*
53 * Controlling structure to kblockd 53 * Controlling structure to kblockd
54 */ 54 */
55 static struct workqueue_struct *kblockd_workqueue; 55 static struct workqueue_struct *kblockd_workqueue;
56 56
57 static void drive_stat_acct(struct request *rq, int new_io) 57 static void drive_stat_acct(struct request *rq, int new_io)
58 { 58 {
59 struct hd_struct *part; 59 struct hd_struct *part;
60 int rw = rq_data_dir(rq); 60 int rw = rq_data_dir(rq);
61 int cpu; 61 int cpu;
62 62
63 if (!blk_do_io_stat(rq)) 63 if (!blk_do_io_stat(rq))
64 return; 64 return;
65 65
66 cpu = part_stat_lock(); 66 cpu = part_stat_lock();
67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); 67 part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
68 68
69 if (!new_io) 69 if (!new_io)
70 part_stat_inc(cpu, part, merges[rw]); 70 part_stat_inc(cpu, part, merges[rw]);
71 else { 71 else {
72 part_round_stats(cpu, part); 72 part_round_stats(cpu, part);
73 part_inc_in_flight(part, rw); 73 part_inc_in_flight(part, rw);
74 } 74 }
75 75
76 part_stat_unlock(); 76 part_stat_unlock();
77 } 77 }
78 78
79 void blk_queue_congestion_threshold(struct request_queue *q) 79 void blk_queue_congestion_threshold(struct request_queue *q)
80 { 80 {
81 int nr; 81 int nr;
82 82
83 nr = q->nr_requests - (q->nr_requests / 8) + 1; 83 nr = q->nr_requests - (q->nr_requests / 8) + 1;
84 if (nr > q->nr_requests) 84 if (nr > q->nr_requests)
85 nr = q->nr_requests; 85 nr = q->nr_requests;
86 q->nr_congestion_on = nr; 86 q->nr_congestion_on = nr;
87 87
88 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 88 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
89 if (nr < 1) 89 if (nr < 1)
90 nr = 1; 90 nr = 1;
91 q->nr_congestion_off = nr; 91 q->nr_congestion_off = nr;
92 } 92 }
93 93
94 /** 94 /**
95 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 95 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
96 * @bdev: device 96 * @bdev: device
97 * 97 *
98 * Locates the passed device's request queue and returns the address of its 98 * Locates the passed device's request queue and returns the address of its
99 * backing_dev_info 99 * backing_dev_info
100 * 100 *
101 * Will return NULL if the request queue cannot be located. 101 * Will return NULL if the request queue cannot be located.
102 */ 102 */
103 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 103 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
104 { 104 {
105 struct backing_dev_info *ret = NULL; 105 struct backing_dev_info *ret = NULL;
106 struct request_queue *q = bdev_get_queue(bdev); 106 struct request_queue *q = bdev_get_queue(bdev);
107 107
108 if (q) 108 if (q)
109 ret = &q->backing_dev_info; 109 ret = &q->backing_dev_info;
110 return ret; 110 return ret;
111 } 111 }
112 EXPORT_SYMBOL(blk_get_backing_dev_info); 112 EXPORT_SYMBOL(blk_get_backing_dev_info);
113 113
114 void blk_rq_init(struct request_queue *q, struct request *rq) 114 void blk_rq_init(struct request_queue *q, struct request *rq)
115 { 115 {
116 memset(rq, 0, sizeof(*rq)); 116 memset(rq, 0, sizeof(*rq));
117 117
118 INIT_LIST_HEAD(&rq->queuelist); 118 INIT_LIST_HEAD(&rq->queuelist);
119 INIT_LIST_HEAD(&rq->timeout_list); 119 INIT_LIST_HEAD(&rq->timeout_list);
120 rq->cpu = -1; 120 rq->cpu = -1;
121 rq->q = q; 121 rq->q = q;
122 rq->__sector = (sector_t) -1; 122 rq->__sector = (sector_t) -1;
123 INIT_HLIST_NODE(&rq->hash); 123 INIT_HLIST_NODE(&rq->hash);
124 RB_CLEAR_NODE(&rq->rb_node); 124 RB_CLEAR_NODE(&rq->rb_node);
125 rq->cmd = rq->__cmd; 125 rq->cmd = rq->__cmd;
126 rq->cmd_len = BLK_MAX_CDB; 126 rq->cmd_len = BLK_MAX_CDB;
127 rq->tag = -1; 127 rq->tag = -1;
128 rq->ref_count = 1; 128 rq->ref_count = 1;
129 rq->start_time = jiffies; 129 rq->start_time = jiffies;
130 set_start_time_ns(rq); 130 set_start_time_ns(rq);
131 } 131 }
132 EXPORT_SYMBOL(blk_rq_init); 132 EXPORT_SYMBOL(blk_rq_init);
133 133
134 static void req_bio_endio(struct request *rq, struct bio *bio, 134 static void req_bio_endio(struct request *rq, struct bio *bio,
135 unsigned int nbytes, int error) 135 unsigned int nbytes, int error)
136 { 136 {
137 struct request_queue *q = rq->q; 137 struct request_queue *q = rq->q;
138 138
139 if (&q->bar_rq != rq) { 139 if (&q->bar_rq != rq) {
140 if (error) 140 if (error)
141 clear_bit(BIO_UPTODATE, &bio->bi_flags); 141 clear_bit(BIO_UPTODATE, &bio->bi_flags);
142 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 142 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
143 error = -EIO; 143 error = -EIO;
144 144
145 if (unlikely(nbytes > bio->bi_size)) { 145 if (unlikely(nbytes > bio->bi_size)) {
146 printk(KERN_ERR "%s: want %u bytes done, %u left\n", 146 printk(KERN_ERR "%s: want %u bytes done, %u left\n",
147 __func__, nbytes, bio->bi_size); 147 __func__, nbytes, bio->bi_size);
148 nbytes = bio->bi_size; 148 nbytes = bio->bi_size;
149 } 149 }
150 150
151 if (unlikely(rq->cmd_flags & REQ_QUIET)) 151 if (unlikely(rq->cmd_flags & REQ_QUIET))
152 set_bit(BIO_QUIET, &bio->bi_flags); 152 set_bit(BIO_QUIET, &bio->bi_flags);
153 153
154 bio->bi_size -= nbytes; 154 bio->bi_size -= nbytes;
155 bio->bi_sector += (nbytes >> 9); 155 bio->bi_sector += (nbytes >> 9);
156 156
157 if (bio_integrity(bio)) 157 if (bio_integrity(bio))
158 bio_integrity_advance(bio, nbytes); 158 bio_integrity_advance(bio, nbytes);
159 159
160 if (bio->bi_size == 0) 160 if (bio->bi_size == 0)
161 bio_endio(bio, error); 161 bio_endio(bio, error);
162 } else { 162 } else {
163 163
164 /* 164 /*
165 * Okay, this is the barrier request in progress, just 165 * Okay, this is the barrier request in progress, just
166 * record the error; 166 * record the error;
167 */ 167 */
168 if (error && !q->orderr) 168 if (error && !q->orderr)
169 q->orderr = error; 169 q->orderr = error;
170 } 170 }
171 } 171 }
172 172
173 void blk_dump_rq_flags(struct request *rq, char *msg) 173 void blk_dump_rq_flags(struct request *rq, char *msg)
174 { 174 {
175 int bit; 175 int bit;
176 176
177 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg, 177 printk(KERN_INFO "%s: dev %s: type=%x, flags=%x\n", msg,
178 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 178 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
179 rq->cmd_flags); 179 rq->cmd_flags);
180 180
181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", 181 printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
182 (unsigned long long)blk_rq_pos(rq), 182 (unsigned long long)blk_rq_pos(rq),
183 blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); 183 blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
184 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", 184 printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n",
185 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); 185 rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq));
186 186
187 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 187 if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
188 printk(KERN_INFO " cdb: "); 188 printk(KERN_INFO " cdb: ");
189 for (bit = 0; bit < BLK_MAX_CDB; bit++) 189 for (bit = 0; bit < BLK_MAX_CDB; bit++)
190 printk("%02x ", rq->cmd[bit]); 190 printk("%02x ", rq->cmd[bit]);
191 printk("\n"); 191 printk("\n");
192 } 192 }
193 } 193 }
194 EXPORT_SYMBOL(blk_dump_rq_flags); 194 EXPORT_SYMBOL(blk_dump_rq_flags);
195 195
196 /* 196 /*
197 * "plug" the device if there are no outstanding requests: this will 197 * "plug" the device if there are no outstanding requests: this will
198 * force the transfer to start only after we have put all the requests 198 * force the transfer to start only after we have put all the requests
199 * on the list. 199 * on the list.
200 * 200 *
201 * This is called with interrupts off and no requests on the queue and 201 * This is called with interrupts off and no requests on the queue and
202 * with the queue lock held. 202 * with the queue lock held.
203 */ 203 */
204 void blk_plug_device(struct request_queue *q) 204 void blk_plug_device(struct request_queue *q)
205 { 205 {
206 WARN_ON(!irqs_disabled()); 206 WARN_ON(!irqs_disabled());
207 207
208 /* 208 /*
209 * don't plug a stopped queue, it must be paired with blk_start_queue() 209 * don't plug a stopped queue, it must be paired with blk_start_queue()
210 * which will restart the queueing 210 * which will restart the queueing
211 */ 211 */
212 if (blk_queue_stopped(q)) 212 if (blk_queue_stopped(q))
213 return; 213 return;
214 214
215 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) { 215 if (!queue_flag_test_and_set(QUEUE_FLAG_PLUGGED, q)) {
216 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 216 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
217 trace_block_plug(q); 217 trace_block_plug(q);
218 } 218 }
219 } 219 }
220 EXPORT_SYMBOL(blk_plug_device); 220 EXPORT_SYMBOL(blk_plug_device);
221 221
222 /** 222 /**
223 * blk_plug_device_unlocked - plug a device without queue lock held 223 * blk_plug_device_unlocked - plug a device without queue lock held
224 * @q: The &struct request_queue to plug 224 * @q: The &struct request_queue to plug
225 * 225 *
226 * Description: 226 * Description:
227 * Like @blk_plug_device(), but grabs the queue lock and disables 227 * Like @blk_plug_device(), but grabs the queue lock and disables
228 * interrupts. 228 * interrupts.
229 **/ 229 **/
230 void blk_plug_device_unlocked(struct request_queue *q) 230 void blk_plug_device_unlocked(struct request_queue *q)
231 { 231 {
232 unsigned long flags; 232 unsigned long flags;
233 233
234 spin_lock_irqsave(q->queue_lock, flags); 234 spin_lock_irqsave(q->queue_lock, flags);
235 blk_plug_device(q); 235 blk_plug_device(q);
236 spin_unlock_irqrestore(q->queue_lock, flags); 236 spin_unlock_irqrestore(q->queue_lock, flags);
237 } 237 }
238 EXPORT_SYMBOL(blk_plug_device_unlocked); 238 EXPORT_SYMBOL(blk_plug_device_unlocked);
239 239
240 /* 240 /*
241 * remove the queue from the plugged list, if present. called with 241 * remove the queue from the plugged list, if present. called with
242 * queue lock held and interrupts disabled. 242 * queue lock held and interrupts disabled.
243 */ 243 */
244 int blk_remove_plug(struct request_queue *q) 244 int blk_remove_plug(struct request_queue *q)
245 { 245 {
246 WARN_ON(!irqs_disabled()); 246 WARN_ON(!irqs_disabled());
247 247
248 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q)) 248 if (!queue_flag_test_and_clear(QUEUE_FLAG_PLUGGED, q))
249 return 0; 249 return 0;
250 250
251 del_timer(&q->unplug_timer); 251 del_timer(&q->unplug_timer);
252 return 1; 252 return 1;
253 } 253 }
254 EXPORT_SYMBOL(blk_remove_plug); 254 EXPORT_SYMBOL(blk_remove_plug);
255 255
256 /* 256 /*
257 * remove the plug and let it rip.. 257 * remove the plug and let it rip..
258 */ 258 */
259 void __generic_unplug_device(struct request_queue *q) 259 void __generic_unplug_device(struct request_queue *q)
260 { 260 {
261 if (unlikely(blk_queue_stopped(q))) 261 if (unlikely(blk_queue_stopped(q)))
262 return; 262 return;
263 if (!blk_remove_plug(q) && !blk_queue_nonrot(q)) 263 if (!blk_remove_plug(q) && !blk_queue_nonrot(q))
264 return; 264 return;
265 265
266 q->request_fn(q); 266 q->request_fn(q);
267 } 267 }
268 268
269 /** 269 /**
270 * generic_unplug_device - fire a request queue 270 * generic_unplug_device - fire a request queue
271 * @q: The &struct request_queue in question 271 * @q: The &struct request_queue in question
272 * 272 *
273 * Description: 273 * Description:
274 * Linux uses plugging to build bigger requests queues before letting 274 * Linux uses plugging to build bigger requests queues before letting
275 * the device have at them. If a queue is plugged, the I/O scheduler 275 * the device have at them. If a queue is plugged, the I/O scheduler
276 * is still adding and merging requests on the queue. Once the queue 276 * is still adding and merging requests on the queue. Once the queue
277 * gets unplugged, the request_fn defined for the queue is invoked and 277 * gets unplugged, the request_fn defined for the queue is invoked and
278 * transfers started. 278 * transfers started.
279 **/ 279 **/
280 void generic_unplug_device(struct request_queue *q) 280 void generic_unplug_device(struct request_queue *q)
281 { 281 {
282 if (blk_queue_plugged(q)) { 282 if (blk_queue_plugged(q)) {
283 spin_lock_irq(q->queue_lock); 283 spin_lock_irq(q->queue_lock);
284 __generic_unplug_device(q); 284 __generic_unplug_device(q);
285 spin_unlock_irq(q->queue_lock); 285 spin_unlock_irq(q->queue_lock);
286 } 286 }
287 } 287 }
288 EXPORT_SYMBOL(generic_unplug_device); 288 EXPORT_SYMBOL(generic_unplug_device);
289 289
290 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 290 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
291 struct page *page) 291 struct page *page)
292 { 292 {
293 struct request_queue *q = bdi->unplug_io_data; 293 struct request_queue *q = bdi->unplug_io_data;
294 294
295 blk_unplug(q); 295 blk_unplug(q);
296 } 296 }
297 297
298 void blk_unplug_work(struct work_struct *work) 298 void blk_unplug_work(struct work_struct *work)
299 { 299 {
300 struct request_queue *q = 300 struct request_queue *q =
301 container_of(work, struct request_queue, unplug_work); 301 container_of(work, struct request_queue, unplug_work);
302 302
303 trace_block_unplug_io(q); 303 trace_block_unplug_io(q);
304 q->unplug_fn(q); 304 q->unplug_fn(q);
305 } 305 }
306 306
307 void blk_unplug_timeout(unsigned long data) 307 void blk_unplug_timeout(unsigned long data)
308 { 308 {
309 struct request_queue *q = (struct request_queue *)data; 309 struct request_queue *q = (struct request_queue *)data;
310 310
311 trace_block_unplug_timer(q); 311 trace_block_unplug_timer(q);
312 kblockd_schedule_work(q, &q->unplug_work); 312 kblockd_schedule_work(q, &q->unplug_work);
313 } 313 }
314 314
315 void blk_unplug(struct request_queue *q) 315 void blk_unplug(struct request_queue *q)
316 { 316 {
317 /* 317 /*
318 * devices don't necessarily have an ->unplug_fn defined 318 * devices don't necessarily have an ->unplug_fn defined
319 */ 319 */
320 if (q->unplug_fn) { 320 if (q->unplug_fn) {
321 trace_block_unplug_io(q); 321 trace_block_unplug_io(q);
322 q->unplug_fn(q); 322 q->unplug_fn(q);
323 } 323 }
324 } 324 }
325 EXPORT_SYMBOL(blk_unplug); 325 EXPORT_SYMBOL(blk_unplug);
326 326
327 /** 327 /**
328 * blk_start_queue - restart a previously stopped queue 328 * blk_start_queue - restart a previously stopped queue
329 * @q: The &struct request_queue in question 329 * @q: The &struct request_queue in question
330 * 330 *
331 * Description: 331 * Description:
332 * blk_start_queue() will clear the stop flag on the queue, and call 332 * blk_start_queue() will clear the stop flag on the queue, and call
333 * the request_fn for the queue if it was in a stopped state when 333 * the request_fn for the queue if it was in a stopped state when
334 * entered. Also see blk_stop_queue(). Queue lock must be held. 334 * entered. Also see blk_stop_queue(). Queue lock must be held.
335 **/ 335 **/
336 void blk_start_queue(struct request_queue *q) 336 void blk_start_queue(struct request_queue *q)
337 { 337 {
338 WARN_ON(!irqs_disabled()); 338 WARN_ON(!irqs_disabled());
339 339
340 queue_flag_clear(QUEUE_FLAG_STOPPED, q); 340 queue_flag_clear(QUEUE_FLAG_STOPPED, q);
341 __blk_run_queue(q); 341 __blk_run_queue(q);
342 } 342 }
343 EXPORT_SYMBOL(blk_start_queue); 343 EXPORT_SYMBOL(blk_start_queue);
344 344
345 /** 345 /**
346 * blk_stop_queue - stop a queue 346 * blk_stop_queue - stop a queue
347 * @q: The &struct request_queue in question 347 * @q: The &struct request_queue in question
348 * 348 *
349 * Description: 349 * Description:
350 * The Linux block layer assumes that a block driver will consume all 350 * The Linux block layer assumes that a block driver will consume all
351 * entries on the request queue when the request_fn strategy is called. 351 * entries on the request queue when the request_fn strategy is called.
352 * Often this will not happen, because of hardware limitations (queue 352 * Often this will not happen, because of hardware limitations (queue
353 * depth settings). If a device driver gets a 'queue full' response, 353 * depth settings). If a device driver gets a 'queue full' response,
354 * or if it simply chooses not to queue more I/O at one point, it can 354 * or if it simply chooses not to queue more I/O at one point, it can
355 * call this function to prevent the request_fn from being called until 355 * call this function to prevent the request_fn from being called until
356 * the driver has signalled it's ready to go again. This happens by calling 356 * the driver has signalled it's ready to go again. This happens by calling
357 * blk_start_queue() to restart queue operations. Queue lock must be held. 357 * blk_start_queue() to restart queue operations. Queue lock must be held.
358 **/ 358 **/
359 void blk_stop_queue(struct request_queue *q) 359 void blk_stop_queue(struct request_queue *q)
360 { 360 {
361 blk_remove_plug(q); 361 blk_remove_plug(q);
362 queue_flag_set(QUEUE_FLAG_STOPPED, q); 362 queue_flag_set(QUEUE_FLAG_STOPPED, q);
363 } 363 }
364 EXPORT_SYMBOL(blk_stop_queue); 364 EXPORT_SYMBOL(blk_stop_queue);
365 365
366 /** 366 /**
367 * blk_sync_queue - cancel any pending callbacks on a queue 367 * blk_sync_queue - cancel any pending callbacks on a queue
368 * @q: the queue 368 * @q: the queue
369 * 369 *
370 * Description: 370 * Description:
371 * The block layer may perform asynchronous callback activity 371 * The block layer may perform asynchronous callback activity
372 * on a queue, such as calling the unplug function after a timeout. 372 * on a queue, such as calling the unplug function after a timeout.
373 * A block device may call blk_sync_queue to ensure that any 373 * A block device may call blk_sync_queue to ensure that any
374 * such activity is cancelled, thus allowing it to release resources 374 * such activity is cancelled, thus allowing it to release resources
375 * that the callbacks might use. The caller must already have made sure 375 * that the callbacks might use. The caller must already have made sure
376 * that its ->make_request_fn will not re-add plugging prior to calling 376 * that its ->make_request_fn will not re-add plugging prior to calling
377 * this function. 377 * this function.
378 * 378 *
379 */ 379 */
380 void blk_sync_queue(struct request_queue *q) 380 void blk_sync_queue(struct request_queue *q)
381 { 381 {
382 del_timer_sync(&q->unplug_timer); 382 del_timer_sync(&q->unplug_timer);
383 del_timer_sync(&q->timeout); 383 del_timer_sync(&q->timeout);
384 cancel_work_sync(&q->unplug_work); 384 cancel_work_sync(&q->unplug_work);
385 } 385 }
386 EXPORT_SYMBOL(blk_sync_queue); 386 EXPORT_SYMBOL(blk_sync_queue);
387 387
388 /** 388 /**
389 * __blk_run_queue - run a single device queue 389 * __blk_run_queue - run a single device queue
390 * @q: The queue to run 390 * @q: The queue to run
391 * 391 *
392 * Description: 392 * Description:
393 * See @blk_run_queue. This variant must be called with the queue lock 393 * See @blk_run_queue. This variant must be called with the queue lock
394 * held and interrupts disabled. 394 * held and interrupts disabled.
395 * 395 *
396 */ 396 */
397 void __blk_run_queue(struct request_queue *q) 397 void __blk_run_queue(struct request_queue *q)
398 { 398 {
399 blk_remove_plug(q); 399 blk_remove_plug(q);
400 400
401 if (unlikely(blk_queue_stopped(q))) 401 if (unlikely(blk_queue_stopped(q)))
402 return; 402 return;
403 403
404 if (elv_queue_empty(q)) 404 if (elv_queue_empty(q))
405 return; 405 return;
406 406
407 /* 407 /*
408 * Only recurse once to avoid overrunning the stack, let the unplug 408 * Only recurse once to avoid overrunning the stack, let the unplug
409 * handling reinvoke the handler shortly if we already got there. 409 * handling reinvoke the handler shortly if we already got there.
410 */ 410 */
411 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { 411 if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) {
412 q->request_fn(q); 412 q->request_fn(q);
413 queue_flag_clear(QUEUE_FLAG_REENTER, q); 413 queue_flag_clear(QUEUE_FLAG_REENTER, q);
414 } else { 414 } else {
415 queue_flag_set(QUEUE_FLAG_PLUGGED, q); 415 queue_flag_set(QUEUE_FLAG_PLUGGED, q);
416 kblockd_schedule_work(q, &q->unplug_work); 416 kblockd_schedule_work(q, &q->unplug_work);
417 } 417 }
418 } 418 }
419 EXPORT_SYMBOL(__blk_run_queue); 419 EXPORT_SYMBOL(__blk_run_queue);
420 420
421 /** 421 /**
422 * blk_run_queue - run a single device queue 422 * blk_run_queue - run a single device queue
423 * @q: The queue to run 423 * @q: The queue to run
424 * 424 *
425 * Description: 425 * Description:
426 * Invoke request handling on this queue, if it has pending work to do. 426 * Invoke request handling on this queue, if it has pending work to do.
427 * May be used to restart queueing when a request has completed. 427 * May be used to restart queueing when a request has completed.
428 */ 428 */
429 void blk_run_queue(struct request_queue *q) 429 void blk_run_queue(struct request_queue *q)
430 { 430 {
431 unsigned long flags; 431 unsigned long flags;
432 432
433 spin_lock_irqsave(q->queue_lock, flags); 433 spin_lock_irqsave(q->queue_lock, flags);
434 __blk_run_queue(q); 434 __blk_run_queue(q);
435 spin_unlock_irqrestore(q->queue_lock, flags); 435 spin_unlock_irqrestore(q->queue_lock, flags);
436 } 436 }
437 EXPORT_SYMBOL(blk_run_queue); 437 EXPORT_SYMBOL(blk_run_queue);
438 438
439 void blk_put_queue(struct request_queue *q) 439 void blk_put_queue(struct request_queue *q)
440 { 440 {
441 kobject_put(&q->kobj); 441 kobject_put(&q->kobj);
442 } 442 }
443 443
444 void blk_cleanup_queue(struct request_queue *q) 444 void blk_cleanup_queue(struct request_queue *q)
445 { 445 {
446 /* 446 /*
447 * We know we have process context here, so we can be a little 447 * We know we have process context here, so we can be a little
448 * cautious and ensure that pending block actions on this device 448 * cautious and ensure that pending block actions on this device
449 * are done before moving on. Going into this function, we should 449 * are done before moving on. Going into this function, we should
450 * not have processes doing IO to this device. 450 * not have processes doing IO to this device.
451 */ 451 */
452 blk_sync_queue(q); 452 blk_sync_queue(q);
453 453
454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); 454 del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer);
455 mutex_lock(&q->sysfs_lock); 455 mutex_lock(&q->sysfs_lock);
456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q); 456 queue_flag_set_unlocked(QUEUE_FLAG_DEAD, q);
457 mutex_unlock(&q->sysfs_lock); 457 mutex_unlock(&q->sysfs_lock);
458 458
459 if (q->elevator) 459 if (q->elevator)
460 elevator_exit(q->elevator); 460 elevator_exit(q->elevator);
461 461
462 blk_put_queue(q); 462 blk_put_queue(q);
463 } 463 }
464 EXPORT_SYMBOL(blk_cleanup_queue); 464 EXPORT_SYMBOL(blk_cleanup_queue);
465 465
466 static int blk_init_free_list(struct request_queue *q) 466 static int blk_init_free_list(struct request_queue *q)
467 { 467 {
468 struct request_list *rl = &q->rq; 468 struct request_list *rl = &q->rq;
469 469
470 if (unlikely(rl->rq_pool)) 470 if (unlikely(rl->rq_pool))
471 return 0; 471 return 0;
472 472
473 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0; 473 rl->count[BLK_RW_SYNC] = rl->count[BLK_RW_ASYNC] = 0;
474 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0; 474 rl->starved[BLK_RW_SYNC] = rl->starved[BLK_RW_ASYNC] = 0;
475 rl->elvpriv = 0; 475 rl->elvpriv = 0;
476 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]); 476 init_waitqueue_head(&rl->wait[BLK_RW_SYNC]);
477 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]); 477 init_waitqueue_head(&rl->wait[BLK_RW_ASYNC]);
478 478
479 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 479 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
480 mempool_free_slab, request_cachep, q->node); 480 mempool_free_slab, request_cachep, q->node);
481 481
482 if (!rl->rq_pool) 482 if (!rl->rq_pool)
483 return -ENOMEM; 483 return -ENOMEM;
484 484
485 return 0; 485 return 0;
486 } 486 }
487 487
488 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 488 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
489 { 489 {
490 return blk_alloc_queue_node(gfp_mask, -1); 490 return blk_alloc_queue_node(gfp_mask, -1);
491 } 491 }
492 EXPORT_SYMBOL(blk_alloc_queue); 492 EXPORT_SYMBOL(blk_alloc_queue);
493 493
494 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 494 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
495 { 495 {
496 struct request_queue *q; 496 struct request_queue *q;
497 int err; 497 int err;
498 498
499 q = kmem_cache_alloc_node(blk_requestq_cachep, 499 q = kmem_cache_alloc_node(blk_requestq_cachep,
500 gfp_mask | __GFP_ZERO, node_id); 500 gfp_mask | __GFP_ZERO, node_id);
501 if (!q) 501 if (!q)
502 return NULL; 502 return NULL;
503 503
504 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 504 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
505 q->backing_dev_info.unplug_io_data = q; 505 q->backing_dev_info.unplug_io_data = q;
506 q->backing_dev_info.ra_pages = 506 q->backing_dev_info.ra_pages =
507 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 507 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
508 q->backing_dev_info.state = 0; 508 q->backing_dev_info.state = 0;
509 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 509 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
510 q->backing_dev_info.name = "block"; 510 q->backing_dev_info.name = "block";
511 511
512 err = bdi_init(&q->backing_dev_info); 512 err = bdi_init(&q->backing_dev_info);
513 if (err) { 513 if (err) {
514 kmem_cache_free(blk_requestq_cachep, q); 514 kmem_cache_free(blk_requestq_cachep, q);
515 return NULL; 515 return NULL;
516 } 516 }
517 517
518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, 518 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
519 laptop_mode_timer_fn, (unsigned long) q); 519 laptop_mode_timer_fn, (unsigned long) q);
520 init_timer(&q->unplug_timer); 520 init_timer(&q->unplug_timer);
521 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); 521 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
522 INIT_LIST_HEAD(&q->timeout_list); 522 INIT_LIST_HEAD(&q->timeout_list);
523 INIT_WORK(&q->unplug_work, blk_unplug_work); 523 INIT_WORK(&q->unplug_work, blk_unplug_work);
524 524
525 kobject_init(&q->kobj, &blk_queue_ktype); 525 kobject_init(&q->kobj, &blk_queue_ktype);
526 526
527 mutex_init(&q->sysfs_lock); 527 mutex_init(&q->sysfs_lock);
528 spin_lock_init(&q->__queue_lock); 528 spin_lock_init(&q->__queue_lock);
529 529
530 return q; 530 return q;
531 } 531 }
532 EXPORT_SYMBOL(blk_alloc_queue_node); 532 EXPORT_SYMBOL(blk_alloc_queue_node);
533 533
534 /** 534 /**
535 * blk_init_queue - prepare a request queue for use with a block device 535 * blk_init_queue - prepare a request queue for use with a block device
536 * @rfn: The function to be called to process requests that have been 536 * @rfn: The function to be called to process requests that have been
537 * placed on the queue. 537 * placed on the queue.
538 * @lock: Request queue spin lock 538 * @lock: Request queue spin lock
539 * 539 *
540 * Description: 540 * Description:
541 * If a block device wishes to use the standard request handling procedures, 541 * If a block device wishes to use the standard request handling procedures,
542 * which sorts requests and coalesces adjacent requests, then it must 542 * which sorts requests and coalesces adjacent requests, then it must
543 * call blk_init_queue(). The function @rfn will be called when there 543 * call blk_init_queue(). The function @rfn will be called when there
544 * are requests on the queue that need to be processed. If the device 544 * are requests on the queue that need to be processed. If the device
545 * supports plugging, then @rfn may not be called immediately when requests 545 * supports plugging, then @rfn may not be called immediately when requests
546 * are available on the queue, but may be called at some time later instead. 546 * are available on the queue, but may be called at some time later instead.
547 * Plugged queues are generally unplugged when a buffer belonging to one 547 * Plugged queues are generally unplugged when a buffer belonging to one
548 * of the requests on the queue is needed, or due to memory pressure. 548 * of the requests on the queue is needed, or due to memory pressure.
549 * 549 *
550 * @rfn is not required, or even expected, to remove all requests off the 550 * @rfn is not required, or even expected, to remove all requests off the
551 * queue, but only as many as it can handle at a time. If it does leave 551 * queue, but only as many as it can handle at a time. If it does leave
552 * requests on the queue, it is responsible for arranging that the requests 552 * requests on the queue, it is responsible for arranging that the requests
553 * get dealt with eventually. 553 * get dealt with eventually.
554 * 554 *
555 * The queue spin lock must be held while manipulating the requests on the 555 * The queue spin lock must be held while manipulating the requests on the
556 * request queue; this lock will be taken also from interrupt context, so irq 556 * request queue; this lock will be taken also from interrupt context, so irq
557 * disabling is needed for it. 557 * disabling is needed for it.
558 * 558 *
559 * Function returns a pointer to the initialized request queue, or %NULL if 559 * Function returns a pointer to the initialized request queue, or %NULL if
560 * it didn't succeed. 560 * it didn't succeed.
561 * 561 *
562 * Note: 562 * Note:
563 * blk_init_queue() must be paired with a blk_cleanup_queue() call 563 * blk_init_queue() must be paired with a blk_cleanup_queue() call
564 * when the block device is deactivated (such as at module unload). 564 * when the block device is deactivated (such as at module unload).
565 **/ 565 **/
566 566
567 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 567 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
568 { 568 {
569 return blk_init_queue_node(rfn, lock, -1); 569 return blk_init_queue_node(rfn, lock, -1);
570 } 570 }
571 EXPORT_SYMBOL(blk_init_queue); 571 EXPORT_SYMBOL(blk_init_queue);
572 572
573 struct request_queue * 573 struct request_queue *
574 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 574 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
575 { 575 {
576 struct request_queue *uninit_q, *q; 576 struct request_queue *uninit_q, *q;
577 577
578 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id); 578 uninit_q = blk_alloc_queue_node(GFP_KERNEL, node_id);
579 if (!uninit_q) 579 if (!uninit_q)
580 return NULL; 580 return NULL;
581 581
582 q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id); 582 q = blk_init_allocated_queue_node(uninit_q, rfn, lock, node_id);
583 if (!q) 583 if (!q)
584 blk_cleanup_queue(uninit_q); 584 blk_cleanup_queue(uninit_q);
585 585
586 return q; 586 return q;
587 } 587 }
588 EXPORT_SYMBOL(blk_init_queue_node); 588 EXPORT_SYMBOL(blk_init_queue_node);
589 589
590 struct request_queue * 590 struct request_queue *
591 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, 591 blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn,
592 spinlock_t *lock) 592 spinlock_t *lock)
593 { 593 {
594 return blk_init_allocated_queue_node(q, rfn, lock, -1); 594 return blk_init_allocated_queue_node(q, rfn, lock, -1);
595 } 595 }
596 EXPORT_SYMBOL(blk_init_allocated_queue); 596 EXPORT_SYMBOL(blk_init_allocated_queue);
597 597
598 struct request_queue * 598 struct request_queue *
599 blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn, 599 blk_init_allocated_queue_node(struct request_queue *q, request_fn_proc *rfn,
600 spinlock_t *lock, int node_id) 600 spinlock_t *lock, int node_id)
601 { 601 {
602 if (!q) 602 if (!q)
603 return NULL; 603 return NULL;
604 604
605 q->node = node_id; 605 q->node = node_id;
606 if (blk_init_free_list(q)) 606 if (blk_init_free_list(q))
607 return NULL; 607 return NULL;
608 608
609 q->request_fn = rfn; 609 q->request_fn = rfn;
610 q->prep_rq_fn = NULL; 610 q->prep_rq_fn = NULL;
611 q->unprep_rq_fn = NULL; 611 q->unprep_rq_fn = NULL;
612 q->unplug_fn = generic_unplug_device; 612 q->unplug_fn = generic_unplug_device;
613 q->queue_flags = QUEUE_FLAG_DEFAULT; 613 q->queue_flags = QUEUE_FLAG_DEFAULT;
614 q->queue_lock = lock; 614 q->queue_lock = lock;
615 615
616 /* 616 /*
617 * This also sets hw/phys segments, boundary and size 617 * This also sets hw/phys segments, boundary and size
618 */ 618 */
619 blk_queue_make_request(q, __make_request); 619 blk_queue_make_request(q, __make_request);
620 620
621 q->sg_reserved_size = INT_MAX; 621 q->sg_reserved_size = INT_MAX;
622 622
623 /* 623 /*
624 * all done 624 * all done
625 */ 625 */
626 if (!elevator_init(q, NULL)) { 626 if (!elevator_init(q, NULL)) {
627 blk_queue_congestion_threshold(q); 627 blk_queue_congestion_threshold(q);
628 return q; 628 return q;
629 } 629 }
630 630
631 return NULL; 631 return NULL;
632 } 632 }
633 EXPORT_SYMBOL(blk_init_allocated_queue_node); 633 EXPORT_SYMBOL(blk_init_allocated_queue_node);
634 634
635 int blk_get_queue(struct request_queue *q) 635 int blk_get_queue(struct request_queue *q)
636 { 636 {
637 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 637 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
638 kobject_get(&q->kobj); 638 kobject_get(&q->kobj);
639 return 0; 639 return 0;
640 } 640 }
641 641
642 return 1; 642 return 1;
643 } 643 }
644 644
645 static inline void blk_free_request(struct request_queue *q, struct request *rq) 645 static inline void blk_free_request(struct request_queue *q, struct request *rq)
646 { 646 {
647 if (rq->cmd_flags & REQ_ELVPRIV) 647 if (rq->cmd_flags & REQ_ELVPRIV)
648 elv_put_request(q, rq); 648 elv_put_request(q, rq);
649 mempool_free(rq, q->rq.rq_pool); 649 mempool_free(rq, q->rq.rq_pool);
650 } 650 }
651 651
652 static struct request * 652 static struct request *
653 blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask) 653 blk_alloc_request(struct request_queue *q, int flags, int priv, gfp_t gfp_mask)
654 { 654 {
655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
656 656
657 if (!rq) 657 if (!rq)
658 return NULL; 658 return NULL;
659 659
660 blk_rq_init(q, rq); 660 blk_rq_init(q, rq);
661 661
662 rq->cmd_flags = flags | REQ_ALLOCED; 662 rq->cmd_flags = flags | REQ_ALLOCED;
663 663
664 if (priv) { 664 if (priv) {
665 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 665 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
666 mempool_free(rq, q->rq.rq_pool); 666 mempool_free(rq, q->rq.rq_pool);
667 return NULL; 667 return NULL;
668 } 668 }
669 rq->cmd_flags |= REQ_ELVPRIV; 669 rq->cmd_flags |= REQ_ELVPRIV;
670 } 670 }
671 671
672 return rq; 672 return rq;
673 } 673 }
674 674
675 /* 675 /*
676 * ioc_batching returns true if the ioc is a valid batching request and 676 * ioc_batching returns true if the ioc is a valid batching request and
677 * should be given priority access to a request. 677 * should be given priority access to a request.
678 */ 678 */
679 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 679 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
680 { 680 {
681 if (!ioc) 681 if (!ioc)
682 return 0; 682 return 0;
683 683
684 /* 684 /*
685 * Make sure the process is able to allocate at least 1 request 685 * Make sure the process is able to allocate at least 1 request
686 * even if the batch times out, otherwise we could theoretically 686 * even if the batch times out, otherwise we could theoretically
687 * lose wakeups. 687 * lose wakeups.
688 */ 688 */
689 return ioc->nr_batch_requests == q->nr_batching || 689 return ioc->nr_batch_requests == q->nr_batching ||
690 (ioc->nr_batch_requests > 0 690 (ioc->nr_batch_requests > 0
691 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 691 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
692 } 692 }
693 693
694 /* 694 /*
695 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 695 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
696 * will cause the process to be a "batcher" on all queues in the system. This 696 * will cause the process to be a "batcher" on all queues in the system. This
697 * is the behaviour we want though - once it gets a wakeup it should be given 697 * is the behaviour we want though - once it gets a wakeup it should be given
698 * a nice run. 698 * a nice run.
699 */ 699 */
700 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 700 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
701 { 701 {
702 if (!ioc || ioc_batching(q, ioc)) 702 if (!ioc || ioc_batching(q, ioc))
703 return; 703 return;
704 704
705 ioc->nr_batch_requests = q->nr_batching; 705 ioc->nr_batch_requests = q->nr_batching;
706 ioc->last_waited = jiffies; 706 ioc->last_waited = jiffies;
707 } 707 }
708 708
709 static void __freed_request(struct request_queue *q, int sync) 709 static void __freed_request(struct request_queue *q, int sync)
710 { 710 {
711 struct request_list *rl = &q->rq; 711 struct request_list *rl = &q->rq;
712 712
713 if (rl->count[sync] < queue_congestion_off_threshold(q)) 713 if (rl->count[sync] < queue_congestion_off_threshold(q))
714 blk_clear_queue_congested(q, sync); 714 blk_clear_queue_congested(q, sync);
715 715
716 if (rl->count[sync] + 1 <= q->nr_requests) { 716 if (rl->count[sync] + 1 <= q->nr_requests) {
717 if (waitqueue_active(&rl->wait[sync])) 717 if (waitqueue_active(&rl->wait[sync]))
718 wake_up(&rl->wait[sync]); 718 wake_up(&rl->wait[sync]);
719 719
720 blk_clear_queue_full(q, sync); 720 blk_clear_queue_full(q, sync);
721 } 721 }
722 } 722 }
723 723
724 /* 724 /*
725 * A request has just been released. Account for it, update the full and 725 * A request has just been released. Account for it, update the full and
726 * congestion status, wake up any waiters. Called under q->queue_lock. 726 * congestion status, wake up any waiters. Called under q->queue_lock.
727 */ 727 */
728 static void freed_request(struct request_queue *q, int sync, int priv) 728 static void freed_request(struct request_queue *q, int sync, int priv)
729 { 729 {
730 struct request_list *rl = &q->rq; 730 struct request_list *rl = &q->rq;
731 731
732 rl->count[sync]--; 732 rl->count[sync]--;
733 if (priv) 733 if (priv)
734 rl->elvpriv--; 734 rl->elvpriv--;
735 735
736 __freed_request(q, sync); 736 __freed_request(q, sync);
737 737
738 if (unlikely(rl->starved[sync ^ 1])) 738 if (unlikely(rl->starved[sync ^ 1]))
739 __freed_request(q, sync ^ 1); 739 __freed_request(q, sync ^ 1);
740 } 740 }
741 741
742 /* 742 /*
743 * Get a free request, queue_lock must be held. 743 * Get a free request, queue_lock must be held.
744 * Returns NULL on failure, with queue_lock held. 744 * Returns NULL on failure, with queue_lock held.
745 * Returns !NULL on success, with queue_lock *not held*. 745 * Returns !NULL on success, with queue_lock *not held*.
746 */ 746 */
747 static struct request *get_request(struct request_queue *q, int rw_flags, 747 static struct request *get_request(struct request_queue *q, int rw_flags,
748 struct bio *bio, gfp_t gfp_mask) 748 struct bio *bio, gfp_t gfp_mask)
749 { 749 {
750 struct request *rq = NULL; 750 struct request *rq = NULL;
751 struct request_list *rl = &q->rq; 751 struct request_list *rl = &q->rq;
752 struct io_context *ioc = NULL; 752 struct io_context *ioc = NULL;
753 const bool is_sync = rw_is_sync(rw_flags) != 0; 753 const bool is_sync = rw_is_sync(rw_flags) != 0;
754 int may_queue, priv; 754 int may_queue, priv;
755 755
756 may_queue = elv_may_queue(q, rw_flags); 756 may_queue = elv_may_queue(q, rw_flags);
757 if (may_queue == ELV_MQUEUE_NO) 757 if (may_queue == ELV_MQUEUE_NO)
758 goto rq_starved; 758 goto rq_starved;
759 759
760 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) { 760 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
761 if (rl->count[is_sync]+1 >= q->nr_requests) { 761 if (rl->count[is_sync]+1 >= q->nr_requests) {
762 ioc = current_io_context(GFP_ATOMIC, q->node); 762 ioc = current_io_context(GFP_ATOMIC, q->node);
763 /* 763 /*
764 * The queue will fill after this allocation, so set 764 * The queue will fill after this allocation, so set
765 * it as full, and mark this process as "batching". 765 * it as full, and mark this process as "batching".
766 * This process will be allowed to complete a batch of 766 * This process will be allowed to complete a batch of
767 * requests; others will be blocked. 767 * requests; others will be blocked.
768 */ 768 */
769 if (!blk_queue_full(q, is_sync)) { 769 if (!blk_queue_full(q, is_sync)) {
770 ioc_set_batching(q, ioc); 770 ioc_set_batching(q, ioc);
771 blk_set_queue_full(q, is_sync); 771 blk_set_queue_full(q, is_sync);
772 } else { 772 } else {
773 if (may_queue != ELV_MQUEUE_MUST 773 if (may_queue != ELV_MQUEUE_MUST
774 && !ioc_batching(q, ioc)) { 774 && !ioc_batching(q, ioc)) {
775 /* 775 /*
776 * The queue is full and the allocating 776 * The queue is full and the allocating
777 * process is not a "batcher", and not 777 * process is not a "batcher", and not
778 * exempted by the IO scheduler 778 * exempted by the IO scheduler
779 */ 779 */
780 goto out; 780 goto out;
781 } 781 }
782 } 782 }
783 } 783 }
784 blk_set_queue_congested(q, is_sync); 784 blk_set_queue_congested(q, is_sync);
785 } 785 }
786 786
787 /* 787 /*
788 * Only allow batching queuers to allocate up to 50% over the defined 788 * Only allow batching queuers to allocate up to 50% over the defined
789 * limit of requests, otherwise we could have thousands of requests 789 * limit of requests, otherwise we could have thousands of requests
790 * allocated with any setting of ->nr_requests 790 * allocated with any setting of ->nr_requests
791 */ 791 */
792 if (rl->count[is_sync] >= (3 * q->nr_requests / 2)) 792 if (rl->count[is_sync] >= (3 * q->nr_requests / 2))
793 goto out; 793 goto out;
794 794
795 rl->count[is_sync]++; 795 rl->count[is_sync]++;
796 rl->starved[is_sync] = 0; 796 rl->starved[is_sync] = 0;
797 797
798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 798 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
799 if (priv) 799 if (priv)
800 rl->elvpriv++; 800 rl->elvpriv++;
801 801
802 if (blk_queue_io_stat(q)) 802 if (blk_queue_io_stat(q))
803 rw_flags |= REQ_IO_STAT; 803 rw_flags |= REQ_IO_STAT;
804 spin_unlock_irq(q->queue_lock); 804 spin_unlock_irq(q->queue_lock);
805 805
806 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 806 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
807 if (unlikely(!rq)) { 807 if (unlikely(!rq)) {
808 /* 808 /*
809 * Allocation failed presumably due to memory. Undo anything 809 * Allocation failed presumably due to memory. Undo anything
810 * we might have messed up. 810 * we might have messed up.
811 * 811 *
812 * Allocating task should really be put onto the front of the 812 * Allocating task should really be put onto the front of the
813 * wait queue, but this is pretty rare. 813 * wait queue, but this is pretty rare.
814 */ 814 */
815 spin_lock_irq(q->queue_lock); 815 spin_lock_irq(q->queue_lock);
816 freed_request(q, is_sync, priv); 816 freed_request(q, is_sync, priv);
817 817
818 /* 818 /*
819 * in the very unlikely event that allocation failed and no 819 * in the very unlikely event that allocation failed and no
820 * requests for this direction were pending, mark us starved 820 * requests for this direction were pending, mark us starved
821 * so that freeing of a request in the other direction will 821 * so that freeing of a request in the other direction will
822 * notice us. another possible fix would be to split the 822 * notice us. another possible fix would be to split the
823 * rq mempool into READ and WRITE 823 * rq mempool into READ and WRITE
824 */ 824 */
825 rq_starved: 825 rq_starved:
826 if (unlikely(rl->count[is_sync] == 0)) 826 if (unlikely(rl->count[is_sync] == 0))
827 rl->starved[is_sync] = 1; 827 rl->starved[is_sync] = 1;
828 828
829 goto out; 829 goto out;
830 } 830 }
831 831
832 /* 832 /*
833 * ioc may be NULL here, and ioc_batching will be false. That's 833 * ioc may be NULL here, and ioc_batching will be false. That's
834 * OK, if the queue is under the request limit then requests need 834 * OK, if the queue is under the request limit then requests need
835 * not count toward the nr_batch_requests limit. There will always 835 * not count toward the nr_batch_requests limit. There will always
836 * be some limit enforced by BLK_BATCH_TIME. 836 * be some limit enforced by BLK_BATCH_TIME.
837 */ 837 */
838 if (ioc_batching(q, ioc)) 838 if (ioc_batching(q, ioc))
839 ioc->nr_batch_requests--; 839 ioc->nr_batch_requests--;
840 840
841 trace_block_getrq(q, bio, rw_flags & 1); 841 trace_block_getrq(q, bio, rw_flags & 1);
842 out: 842 out:
843 return rq; 843 return rq;
844 } 844 }
845 845
846 /* 846 /*
847 * No available requests for this queue, unplug the device and wait for some 847 * No available requests for this queue, unplug the device and wait for some
848 * requests to become available. 848 * requests to become available.
849 * 849 *
850 * Called with q->queue_lock held, and returns with it unlocked. 850 * Called with q->queue_lock held, and returns with it unlocked.
851 */ 851 */
852 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 852 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
853 struct bio *bio) 853 struct bio *bio)
854 { 854 {
855 const bool is_sync = rw_is_sync(rw_flags) != 0; 855 const bool is_sync = rw_is_sync(rw_flags) != 0;
856 struct request *rq; 856 struct request *rq;
857 857
858 rq = get_request(q, rw_flags, bio, GFP_NOIO); 858 rq = get_request(q, rw_flags, bio, GFP_NOIO);
859 while (!rq) { 859 while (!rq) {
860 DEFINE_WAIT(wait); 860 DEFINE_WAIT(wait);
861 struct io_context *ioc; 861 struct io_context *ioc;
862 struct request_list *rl = &q->rq; 862 struct request_list *rl = &q->rq;
863 863
864 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait, 864 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
865 TASK_UNINTERRUPTIBLE); 865 TASK_UNINTERRUPTIBLE);
866 866
867 trace_block_sleeprq(q, bio, rw_flags & 1); 867 trace_block_sleeprq(q, bio, rw_flags & 1);
868 868
869 __generic_unplug_device(q); 869 __generic_unplug_device(q);
870 spin_unlock_irq(q->queue_lock); 870 spin_unlock_irq(q->queue_lock);
871 io_schedule(); 871 io_schedule();
872 872
873 /* 873 /*
874 * After sleeping, we become a "batching" process and 874 * After sleeping, we become a "batching" process and
875 * will be able to allocate at least one request, and 875 * will be able to allocate at least one request, and
876 * up to a big batch of them for a small period of time. 876 * up to a big batch of them for a small period of time.
877 * See ioc_batching, ioc_set_batching 877 * See ioc_batching, ioc_set_batching
878 */ 878 */
879 ioc = current_io_context(GFP_NOIO, q->node); 879 ioc = current_io_context(GFP_NOIO, q->node);
880 ioc_set_batching(q, ioc); 880 ioc_set_batching(q, ioc);
881 881
882 spin_lock_irq(q->queue_lock); 882 spin_lock_irq(q->queue_lock);
883 finish_wait(&rl->wait[is_sync], &wait); 883 finish_wait(&rl->wait[is_sync], &wait);
884 884
885 rq = get_request(q, rw_flags, bio, GFP_NOIO); 885 rq = get_request(q, rw_flags, bio, GFP_NOIO);
886 }; 886 };
887 887
888 return rq; 888 return rq;
889 } 889 }
890 890
891 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 891 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
892 { 892 {
893 struct request *rq; 893 struct request *rq;
894 894
895 BUG_ON(rw != READ && rw != WRITE); 895 BUG_ON(rw != READ && rw != WRITE);
896 896
897 spin_lock_irq(q->queue_lock); 897 spin_lock_irq(q->queue_lock);
898 if (gfp_mask & __GFP_WAIT) { 898 if (gfp_mask & __GFP_WAIT) {
899 rq = get_request_wait(q, rw, NULL); 899 rq = get_request_wait(q, rw, NULL);
900 } else { 900 } else {
901 rq = get_request(q, rw, NULL, gfp_mask); 901 rq = get_request(q, rw, NULL, gfp_mask);
902 if (!rq) 902 if (!rq)
903 spin_unlock_irq(q->queue_lock); 903 spin_unlock_irq(q->queue_lock);
904 } 904 }
905 /* q->queue_lock is unlocked at this point */ 905 /* q->queue_lock is unlocked at this point */
906 906
907 return rq; 907 return rq;
908 } 908 }
909 EXPORT_SYMBOL(blk_get_request); 909 EXPORT_SYMBOL(blk_get_request);
910 910
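For context, a hedged sketch of the common pairing of blk_get_request() with blk_execute_rq() for a pass-through command; the opcode, length and timeout below are placeholders, not anything defined in this file:

#include <linux/blkdev.h>

/* Illustrative only: issue a synchronous pass-through request, then release it. */
static int issue_passthrough_cmd(struct request_queue *q, struct gendisk *disk)
{
        struct request *rq;
        int err;

        rq = blk_get_request(q, READ, GFP_KERNEL);      /* __GFP_WAIT: sleeps rather than failing */
        if (!rq)
                return -ENOMEM;         /* only reachable for non-waiting gfp masks */

        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        rq->cmd_len = 6;
        rq->cmd[0] = 0x00;              /* placeholder opcode */
        rq->timeout = 30 * HZ;

        err = blk_execute_rq(q, disk, rq, 0);   /* waits for completion */
        blk_put_request(rq);
        return err;
}
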
911 /** 911 /**
912 * blk_make_request - given a bio, allocate a corresponding struct request. 912 * blk_make_request - given a bio, allocate a corresponding struct request.
913 * @q: target request queue 913 * @q: target request queue
914 * @bio: The bio describing the memory mappings that will be submitted for IO. 914 * @bio: The bio describing the memory mappings that will be submitted for IO.
915 * It may be a chained-bio properly constructed by block/bio layer. 915 * It may be a chained-bio properly constructed by block/bio layer.
916 * @gfp_mask: gfp flags to be used for memory allocation 916 * @gfp_mask: gfp flags to be used for memory allocation
917 * 917 *
918 * blk_make_request is the parallel of generic_make_request for BLOCK_PC 918 * blk_make_request is the parallel of generic_make_request for BLOCK_PC
919 * type commands, where the struct request needs to be further initialized by 919 * type commands, where the struct request needs to be further initialized by
920 * the caller. It is passed a &struct bio, which describes the memory info of 920 * the caller. It is passed a &struct bio, which describes the memory info of
921 * the I/O transfer. 921 * the I/O transfer.
922 * 922 *
923 * The caller of blk_make_request must make sure that bi_io_vec 923 * The caller of blk_make_request must make sure that bi_io_vec
924 * are set to describe the memory buffers, and that bio_data_dir() will return 924 * are set to describe the memory buffers, and that bio_data_dir() will return
925 * the needed direction of the request (and that all bios in the passed bio chain 925 * the needed direction of the request (and that all bios in the passed bio chain
926 * are properly set accordingly). 926 * are properly set accordingly).
927 * 927 *
928 * If called under non-sleepable conditions, the mapped bio buffers must not 928 * If called under non-sleepable conditions, the mapped bio buffers must not
929 * need bouncing; allocate them with the appropriate masked or flagged allocator 929 * need bouncing; allocate them with the appropriate masked or flagged allocator
930 * suitable for the target device. Otherwise the call to blk_queue_bounce will 930 * suitable for the target device. Otherwise the call to blk_queue_bounce will
931 * BUG. 931 * BUG.
932 * 932 *
933 * WARNING: When allocating/cloning a bio-chain, careful consideration should be 933 * WARNING: When allocating/cloning a bio-chain, careful consideration should be
934 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for 934 * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for
935 * anything but the first bio in the chain. Otherwise you risk waiting for IO 935 * anything but the first bio in the chain. Otherwise you risk waiting for IO
936 * completion of a bio that hasn't been submitted yet, thus resulting in a 936 * completion of a bio that hasn't been submitted yet, thus resulting in a
937 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead 937 * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead
938 * of bio_alloc(), as that avoids the mempool deadlock. 938 * of bio_alloc(), as that avoids the mempool deadlock.
939 * If possible a big IO should be split into smaller parts when allocation 939 * If possible a big IO should be split into smaller parts when allocation
940 * fails. Partial allocation should not be an error, or you risk a live-lock. 940 * fails. Partial allocation should not be an error, or you risk a live-lock.
941 */ 941 */
942 struct request *blk_make_request(struct request_queue *q, struct bio *bio, 942 struct request *blk_make_request(struct request_queue *q, struct bio *bio,
943 gfp_t gfp_mask) 943 gfp_t gfp_mask)
944 { 944 {
945 struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); 945 struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask);
946 946
947 if (unlikely(!rq)) 947 if (unlikely(!rq))
948 return ERR_PTR(-ENOMEM); 948 return ERR_PTR(-ENOMEM);
949 949
950 for_each_bio(bio) { 950 for_each_bio(bio) {
951 struct bio *bounce_bio = bio; 951 struct bio *bounce_bio = bio;
952 int ret; 952 int ret;
953 953
954 blk_queue_bounce(q, &bounce_bio); 954 blk_queue_bounce(q, &bounce_bio);
955 ret = blk_rq_append_bio(q, rq, bounce_bio); 955 ret = blk_rq_append_bio(q, rq, bounce_bio);
956 if (unlikely(ret)) { 956 if (unlikely(ret)) {
957 blk_put_request(rq); 957 blk_put_request(rq);
958 return ERR_PTR(ret); 958 return ERR_PTR(ret);
959 } 959 }
960 } 960 }
961 961
962 return rq; 962 return rq;
963 } 963 }
964 EXPORT_SYMBOL(blk_make_request); 964 EXPORT_SYMBOL(blk_make_request);
965 965
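A hedged sketch of the call pattern described above, assuming the caller has already built the bio (or bio chain, e.g. via bio_kmalloc()) with its data direction and pages set; the helper name is illustrative:

#include <linux/blkdev.h>
#include <linux/err.h>

/* Illustrative only: wrap a prepared bio chain in a BLOCK_PC request and run it. */
static int execute_pc_bio(struct request_queue *q, struct gendisk *disk,
                          struct bio *bio)
{
        struct request *rq;
        int err;

        rq = blk_make_request(q, bio, GFP_KERNEL);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        rq->cmd_type = REQ_TYPE_BLOCK_PC;
        /* ... fill in rq->cmd[], rq->cmd_len and rq->timeout for the target ... */

        err = blk_execute_rq(q, disk, rq, 0);
        blk_put_request(rq);
        return err;
}
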
966 /** 966 /**
967 * blk_requeue_request - put a request back on queue 967 * blk_requeue_request - put a request back on queue
968 * @q: request queue where request should be inserted 968 * @q: request queue where request should be inserted
969 * @rq: request to be inserted 969 * @rq: request to be inserted
970 * 970 *
971 * Description: 971 * Description:
972 * Drivers often keep queueing requests until the hardware cannot accept 972 * Drivers often keep queueing requests until the hardware cannot accept
973 * more. When that condition happens, we need to put the request back 973 * more. When that condition happens, we need to put the request back
974 * on the queue. Must be called with queue lock held. 974 * on the queue. Must be called with queue lock held.
975 */ 975 */
976 void blk_requeue_request(struct request_queue *q, struct request *rq) 976 void blk_requeue_request(struct request_queue *q, struct request *rq)
977 { 977 {
978 blk_delete_timer(rq); 978 blk_delete_timer(rq);
979 blk_clear_rq_complete(rq); 979 blk_clear_rq_complete(rq);
980 trace_block_rq_requeue(q, rq); 980 trace_block_rq_requeue(q, rq);
981 981
982 if (blk_rq_tagged(rq)) 982 if (blk_rq_tagged(rq))
983 blk_queue_end_tag(q, rq); 983 blk_queue_end_tag(q, rq);
984 984
985 BUG_ON(blk_queued_rq(rq)); 985 BUG_ON(blk_queued_rq(rq));
986 986
987 elv_requeue_request(q, rq); 987 elv_requeue_request(q, rq);
988 } 988 }
989 EXPORT_SYMBOL(blk_requeue_request); 989 EXPORT_SYMBOL(blk_requeue_request);
990 990
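As a hedged illustration of the "hardware cannot accept more" case above, a driver's request_fn (which runs with the queue lock held) might requeue and stop the queue roughly as follows; device_busy() is a stand-in for the driver's own check:

#include <linux/blkdev.h>

static int device_busy(void)            /* placeholder for a real hardware check */
{
        return 0;
}

/* Illustrative request_fn fragment: q->queue_lock is held on entry. */
static void example_request_fn(struct request_queue *q)
{
        struct request *rq;

        while ((rq = blk_fetch_request(q)) != NULL) {
                if (device_busy()) {
                        blk_requeue_request(q, rq);     /* put it back for later */
                        blk_stop_queue(q);              /* restarted via blk_start_queue() */
                        break;
                }
                /* ... hand rq to the hardware ... */
        }
}
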
991 /** 991 /**
992 * blk_insert_request - insert a special request into a request queue 992 * blk_insert_request - insert a special request into a request queue
993 * @q: request queue where request should be inserted 993 * @q: request queue where request should be inserted
994 * @rq: request to be inserted 994 * @rq: request to be inserted
995 * @at_head: insert request at head or tail of queue 995 * @at_head: insert request at head or tail of queue
996 * @data: private data 996 * @data: private data
997 * 997 *
998 * Description: 998 * Description:
999 * Many block devices need to execute commands asynchronously, so they don't 999 * Many block devices need to execute commands asynchronously, so they don't
1000 * block the whole kernel from preemption during request execution. This is 1000 * block the whole kernel from preemption during request execution. This is
1001 * accomplished normally by inserting artificial requests tagged as 1001 * accomplished normally by inserting artificial requests tagged as
1002 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them 1002 * REQ_TYPE_SPECIAL into the corresponding request queue, and letting them
1003 * be scheduled for actual execution by the request queue. 1003 * be scheduled for actual execution by the request queue.
1004 * 1004 *
1005 * We have the option of inserting at the head or the tail of the queue. 1005 * We have the option of inserting at the head or the tail of the queue.
1006 * Typically we use the tail for new ioctls and so forth. We use the head 1006 * Typically we use the tail for new ioctls and so forth. We use the head
1007 * of the queue for things like a QUEUE_FULL message from a device, or a 1007 * of the queue for things like a QUEUE_FULL message from a device, or a
1008 * host that is unable to accept a particular command. 1008 * host that is unable to accept a particular command.
1009 */ 1009 */
1010 void blk_insert_request(struct request_queue *q, struct request *rq, 1010 void blk_insert_request(struct request_queue *q, struct request *rq,
1011 int at_head, void *data) 1011 int at_head, void *data)
1012 { 1012 {
1013 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 1013 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
1014 unsigned long flags; 1014 unsigned long flags;
1015 1015
1016 /* 1016 /*
1017 * tell I/O scheduler that this isn't a regular read/write (ie it 1017 * tell I/O scheduler that this isn't a regular read/write (ie it
1018 * must not attempt merges on this) and that it acts as a soft 1018 * must not attempt merges on this) and that it acts as a soft
1019 * barrier 1019 * barrier
1020 */ 1020 */
1021 rq->cmd_type = REQ_TYPE_SPECIAL; 1021 rq->cmd_type = REQ_TYPE_SPECIAL;
1022 1022
1023 rq->special = data; 1023 rq->special = data;
1024 1024
1025 spin_lock_irqsave(q->queue_lock, flags); 1025 spin_lock_irqsave(q->queue_lock, flags);
1026 1026
1027 /* 1027 /*
1028 * If command is tagged, release the tag 1028 * If command is tagged, release the tag
1029 */ 1029 */
1030 if (blk_rq_tagged(rq)) 1030 if (blk_rq_tagged(rq))
1031 blk_queue_end_tag(q, rq); 1031 blk_queue_end_tag(q, rq);
1032 1032
1033 drive_stat_acct(rq, 1); 1033 drive_stat_acct(rq, 1);
1034 __elv_add_request(q, rq, where, 0); 1034 __elv_add_request(q, rq, where, 0);
1035 __blk_run_queue(q); 1035 __blk_run_queue(q);
1036 spin_unlock_irqrestore(q->queue_lock, flags); 1036 spin_unlock_irqrestore(q->queue_lock, flags);
1037 } 1037 }
1038 EXPORT_SYMBOL(blk_insert_request); 1038 EXPORT_SYMBOL(blk_insert_request);
1039 1039
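A hedged sketch of how a caller might push such a special command to the head of the queue; note that blk_insert_request() itself sets REQ_TYPE_SPECIAL and rq->special, so the caller only allocates the request and supplies its private data:

#include <linux/blkdev.h>

/* Illustrative only: queue a driver-private command ahead of normal I/O.
 * The driver's request_fn is expected to recognise and complete it. */
static int kick_special_command(struct request_queue *q, void *driver_data)
{
        struct request *rq;

        rq = blk_get_request(q, WRITE, GFP_KERNEL);
        if (!rq)
                return -ENOMEM;

        blk_insert_request(q, rq, 1 /* at_head */, driver_data);
        return 0;
}
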
1040 /* 1040 /*
1041 * add-request adds a request to the linked list. 1041 * add-request adds a request to the linked list.
1042 * queue lock is held and interrupts disabled, as we muck with the 1042 * queue lock is held and interrupts disabled, as we muck with the
1043 * request queue list. 1043 * request queue list.
1044 */ 1044 */
1045 static inline void add_request(struct request_queue *q, struct request *req) 1045 static inline void add_request(struct request_queue *q, struct request *req)
1046 { 1046 {
1047 drive_stat_acct(req, 1); 1047 drive_stat_acct(req, 1);
1048 1048
1049 /* 1049 /*
1050 * elevator indicated where it wants this request to be 1050 * elevator indicated where it wants this request to be
1051 * inserted at elevator_merge time 1051 * inserted at elevator_merge time
1052 */ 1052 */
1053 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 1053 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
1054 } 1054 }
1055 1055
1056 static void part_round_stats_single(int cpu, struct hd_struct *part, 1056 static void part_round_stats_single(int cpu, struct hd_struct *part,
1057 unsigned long now) 1057 unsigned long now)
1058 { 1058 {
1059 if (now == part->stamp) 1059 if (now == part->stamp)
1060 return; 1060 return;
1061 1061
1062 if (part_in_flight(part)) { 1062 if (part_in_flight(part)) {
1063 __part_stat_add(cpu, part, time_in_queue, 1063 __part_stat_add(cpu, part, time_in_queue,
1064 part_in_flight(part) * (now - part->stamp)); 1064 part_in_flight(part) * (now - part->stamp));
1065 __part_stat_add(cpu, part, io_ticks, (now - part->stamp)); 1065 __part_stat_add(cpu, part, io_ticks, (now - part->stamp));
1066 } 1066 }
1067 part->stamp = now; 1067 part->stamp = now;
1068 } 1068 }
1069 1069
1070 /** 1070 /**
1071 * part_round_stats() - Round off the performance stats on a struct disk_stats. 1071 * part_round_stats() - Round off the performance stats on a struct disk_stats.
1072 * @cpu: cpu number for stats access 1072 * @cpu: cpu number for stats access
1073 * @part: target partition 1073 * @part: target partition
1074 * 1074 *
1075 * The average IO queue length and utilisation statistics are maintained 1075 * The average IO queue length and utilisation statistics are maintained
1076 * by observing the current state of the queue length and the amount of 1076 * by observing the current state of the queue length and the amount of
1077 * time it has been in this state for. 1077 * time it has been in this state for.
1078 * 1078 *
1079 * Normally, that accounting is done on IO completion, but that can result 1079 * Normally, that accounting is done on IO completion, but that can result
1080 * in more than a second's worth of IO being accounted for within any one 1080 * in more than a second's worth of IO being accounted for within any one
1081 * second, leading to >100% utilisation. To deal with that, we call this 1081 * second, leading to >100% utilisation. To deal with that, we call this
1082 * function to do a round-off before returning the results when reading 1082 * function to do a round-off before returning the results when reading
1083 * /proc/diskstats. This accounts immediately for all queue usage up to 1083 * /proc/diskstats. This accounts immediately for all queue usage up to
1084 * the current jiffies and restarts the counters again. 1084 * the current jiffies and restarts the counters again.
1085 */ 1085 */
1086 void part_round_stats(int cpu, struct hd_struct *part) 1086 void part_round_stats(int cpu, struct hd_struct *part)
1087 { 1087 {
1088 unsigned long now = jiffies; 1088 unsigned long now = jiffies;
1089 1089
1090 if (part->partno) 1090 if (part->partno)
1091 part_round_stats_single(cpu, &part_to_disk(part)->part0, now); 1091 part_round_stats_single(cpu, &part_to_disk(part)->part0, now);
1092 part_round_stats_single(cpu, part, now); 1092 part_round_stats_single(cpu, part, now);
1093 } 1093 }
1094 EXPORT_SYMBOL_GPL(part_round_stats); 1094 EXPORT_SYMBOL_GPL(part_round_stats);
1095 1095
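The usual call pattern brackets part_round_stats() with the per-cpu stat helpers, e.g. when a stats file is about to be read; a hedged sketch:

#include <linux/genhd.h>

/* Illustrative only: fold outstanding in-flight time into the counters now. */
static void snapshot_part_stats(struct hd_struct *part)
{
        int cpu;

        cpu = part_stat_lock();         /* rcu_read_lock() + get_cpu() */
        part_round_stats(cpu, part);
        part_stat_unlock();

        /* io_ticks/time_in_queue are now accounted up to the current jiffy */
}
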
1096 /* 1096 /*
1097 * queue lock must be held 1097 * queue lock must be held
1098 */ 1098 */
1099 void __blk_put_request(struct request_queue *q, struct request *req) 1099 void __blk_put_request(struct request_queue *q, struct request *req)
1100 { 1100 {
1101 if (unlikely(!q)) 1101 if (unlikely(!q))
1102 return; 1102 return;
1103 if (unlikely(--req->ref_count)) 1103 if (unlikely(--req->ref_count))
1104 return; 1104 return;
1105 1105
1106 elv_completed_request(q, req); 1106 elv_completed_request(q, req);
1107 1107
1108 /* this is a bio leak */ 1108 /* this is a bio leak */
1109 WARN_ON(req->bio != NULL); 1109 WARN_ON(req->bio != NULL);
1110 1110
1111 /* 1111 /*
1112 * Request may not have originated from ll_rw_blk. If not, 1112 * Request may not have originated from ll_rw_blk. If not,
1113 * it didn't come out of our reserved rq pools 1113 * it didn't come out of our reserved rq pools
1114 */ 1114 */
1115 if (req->cmd_flags & REQ_ALLOCED) { 1115 if (req->cmd_flags & REQ_ALLOCED) {
1116 int is_sync = rq_is_sync(req) != 0; 1116 int is_sync = rq_is_sync(req) != 0;
1117 int priv = req->cmd_flags & REQ_ELVPRIV; 1117 int priv = req->cmd_flags & REQ_ELVPRIV;
1118 1118
1119 BUG_ON(!list_empty(&req->queuelist)); 1119 BUG_ON(!list_empty(&req->queuelist));
1120 BUG_ON(!hlist_unhashed(&req->hash)); 1120 BUG_ON(!hlist_unhashed(&req->hash));
1121 1121
1122 blk_free_request(q, req); 1122 blk_free_request(q, req);
1123 freed_request(q, is_sync, priv); 1123 freed_request(q, is_sync, priv);
1124 } 1124 }
1125 } 1125 }
1126 EXPORT_SYMBOL_GPL(__blk_put_request); 1126 EXPORT_SYMBOL_GPL(__blk_put_request);
1127 1127
1128 void blk_put_request(struct request *req) 1128 void blk_put_request(struct request *req)
1129 { 1129 {
1130 unsigned long flags; 1130 unsigned long flags;
1131 struct request_queue *q = req->q; 1131 struct request_queue *q = req->q;
1132 1132
1133 spin_lock_irqsave(q->queue_lock, flags); 1133 spin_lock_irqsave(q->queue_lock, flags);
1134 __blk_put_request(q, req); 1134 __blk_put_request(q, req);
1135 spin_unlock_irqrestore(q->queue_lock, flags); 1135 spin_unlock_irqrestore(q->queue_lock, flags);
1136 } 1136 }
1137 EXPORT_SYMBOL(blk_put_request); 1137 EXPORT_SYMBOL(blk_put_request);
1138 1138
1139 /** 1139 /**
1140 * blk_add_request_payload - add a payload to a request 1140 * blk_add_request_payload - add a payload to a request
1141 * @rq: request to update 1141 * @rq: request to update
1142 * @page: page backing the payload 1142 * @page: page backing the payload
1143 * @len: length of the payload. 1143 * @len: length of the payload.
1144 * 1144 *
1145 * This allows a block driver to later add a payload to an already 1145 * This allows a block driver to later add a payload to an already
1146 * submitted request. The driver needs to take care of freeing the payload 1146 * submitted request. The driver needs to take care of freeing the payload
1147 * itself. 1147 * itself.
1148 * 1148 *
1149 * Note that this is a quite horrible hack and nothing but handling of 1149 * Note that this is a quite horrible hack and nothing but handling of
1150 * discard requests should ever use it. 1150 * discard requests should ever use it.
1151 */ 1151 */
1152 void blk_add_request_payload(struct request *rq, struct page *page, 1152 void blk_add_request_payload(struct request *rq, struct page *page,
1153 unsigned int len) 1153 unsigned int len)
1154 { 1154 {
1155 struct bio *bio = rq->bio; 1155 struct bio *bio = rq->bio;
1156 1156
1157 bio->bi_io_vec->bv_page = page; 1157 bio->bi_io_vec->bv_page = page;
1158 bio->bi_io_vec->bv_offset = 0; 1158 bio->bi_io_vec->bv_offset = 0;
1159 bio->bi_io_vec->bv_len = len; 1159 bio->bi_io_vec->bv_len = len;
1160 1160
1161 bio->bi_size = len; 1161 bio->bi_size = len;
1162 bio->bi_vcnt = 1; 1162 bio->bi_vcnt = 1;
1163 bio->bi_phys_segments = 1; 1163 bio->bi_phys_segments = 1;
1164 1164
1165 rq->__data_len = rq->resid_len = len; 1165 rq->__data_len = rq->resid_len = len;
1166 rq->nr_phys_segments = 1; 1166 rq->nr_phys_segments = 1;
1167 rq->buffer = bio_data(bio); 1167 rq->buffer = bio_data(bio);
1168 } 1168 }
1169 EXPORT_SYMBOL_GPL(blk_add_request_payload); 1169 EXPORT_SYMBOL_GPL(blk_add_request_payload);
1170 1170
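As a hedged example of the discard-only use case noted above, a driver prep routine might attach one zeroed page as the payload that carries its device-specific discard descriptor:

#include <linux/blkdev.h>
#include <linux/gfp.h>

/* Illustrative only: give a payload-less discard request a page to carry data. */
static int prep_discard_payload(struct request *rq, unsigned int payload_len)
{
        struct page *page;

        page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
        if (!page)
                return -ENOMEM;

        /* ... write the device-specific discard descriptor into the page ... */
        blk_add_request_payload(rq, page, payload_len);

        /* the driver, not the block layer, must free the page on completion */
        return 0;
}
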
1171 void init_request_from_bio(struct request *req, struct bio *bio) 1171 void init_request_from_bio(struct request *req, struct bio *bio)
1172 { 1172 {
1173 req->cpu = bio->bi_comp_cpu; 1173 req->cpu = bio->bi_comp_cpu;
1174 req->cmd_type = REQ_TYPE_FS; 1174 req->cmd_type = REQ_TYPE_FS;
1175 1175
1176 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK; 1176 req->cmd_flags |= bio->bi_rw & REQ_COMMON_MASK;
1177 if (bio->bi_rw & REQ_RAHEAD) 1177 if (bio->bi_rw & REQ_RAHEAD)
1178 req->cmd_flags |= REQ_FAILFAST_MASK; 1178 req->cmd_flags |= REQ_FAILFAST_MASK;
1179 1179
1180 req->errors = 0; 1180 req->errors = 0;
1181 req->__sector = bio->bi_sector; 1181 req->__sector = bio->bi_sector;
1182 req->ioprio = bio_prio(bio); 1182 req->ioprio = bio_prio(bio);
1183 blk_rq_bio_prep(req->q, req, bio); 1183 blk_rq_bio_prep(req->q, req, bio);
1184 } 1184 }
1185 1185
1186 /* 1186 /*
1187 * Only disable plugging for non-rotational devices if they also do tagging; 1187 * Only disable plugging for non-rotational devices if they also do tagging;
1188 * otherwise we do need the proper merging 1188 * otherwise we do need the proper merging
1189 */ 1189 */
1190 static inline bool queue_should_plug(struct request_queue *q) 1190 static inline bool queue_should_plug(struct request_queue *q)
1191 { 1191 {
1192 return !(blk_queue_nonrot(q) && blk_queue_tagged(q)); 1192 return !(blk_queue_nonrot(q) && blk_queue_tagged(q));
1193 } 1193 }
1194 1194
1195 static int __make_request(struct request_queue *q, struct bio *bio) 1195 static int __make_request(struct request_queue *q, struct bio *bio)
1196 { 1196 {
1197 struct request *req; 1197 struct request *req;
1198 int el_ret; 1198 int el_ret;
1199 unsigned int bytes = bio->bi_size; 1199 unsigned int bytes = bio->bi_size;
1200 const unsigned short prio = bio_prio(bio); 1200 const unsigned short prio = bio_prio(bio);
1201 const bool sync = (bio->bi_rw & REQ_SYNC); 1201 const bool sync = (bio->bi_rw & REQ_SYNC);
1202 const bool unplug = (bio->bi_rw & REQ_UNPLUG); 1202 const bool unplug = (bio->bi_rw & REQ_UNPLUG);
1203 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK; 1203 const unsigned int ff = bio->bi_rw & REQ_FAILFAST_MASK;
1204 int rw_flags; 1204 int rw_flags;
1205 1205
1206 if ((bio->bi_rw & REQ_HARDBARRIER) && 1206 /* REQ_HARDBARRIER is no more */
1207 (q->next_ordered == QUEUE_ORDERED_NONE)) { 1207 if (WARN_ONCE(bio->bi_rw & REQ_HARDBARRIER,
1208 "block: HARDBARRIER is deprecated, use FLUSH/FUA instead\n")) {
1208 bio_endio(bio, -EOPNOTSUPP); 1209 bio_endio(bio, -EOPNOTSUPP);
1209 return 0; 1210 return 0;
1210 } 1211 }
1212
1211 /* 1213 /*
1212 * low level driver can indicate that it wants pages above a 1214 * low level driver can indicate that it wants pages above a
1213 * certain limit bounced to low memory (ie for highmem, or even 1215 * certain limit bounced to low memory (ie for highmem, or even
1214 * ISA dma in theory) 1216 * ISA dma in theory)
1215 */ 1217 */
1216 blk_queue_bounce(q, &bio); 1218 blk_queue_bounce(q, &bio);
1217 1219
1218 spin_lock_irq(q->queue_lock); 1220 spin_lock_irq(q->queue_lock);
1219 1221
1220 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q)) 1222 if (unlikely((bio->bi_rw & REQ_HARDBARRIER)) || elv_queue_empty(q))
1221 goto get_rq; 1223 goto get_rq;
1222 1224
1223 el_ret = elv_merge(q, &req, bio); 1225 el_ret = elv_merge(q, &req, bio);
1224 switch (el_ret) { 1226 switch (el_ret) {
1225 case ELEVATOR_BACK_MERGE: 1227 case ELEVATOR_BACK_MERGE:
1226 BUG_ON(!rq_mergeable(req)); 1228 BUG_ON(!rq_mergeable(req));
1227 1229
1228 if (!ll_back_merge_fn(q, req, bio)) 1230 if (!ll_back_merge_fn(q, req, bio))
1229 break; 1231 break;
1230 1232
1231 trace_block_bio_backmerge(q, bio); 1233 trace_block_bio_backmerge(q, bio);
1232 1234
1233 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) 1235 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
1234 blk_rq_set_mixed_merge(req); 1236 blk_rq_set_mixed_merge(req);
1235 1237
1236 req->biotail->bi_next = bio; 1238 req->biotail->bi_next = bio;
1237 req->biotail = bio; 1239 req->biotail = bio;
1238 req->__data_len += bytes; 1240 req->__data_len += bytes;
1239 req->ioprio = ioprio_best(req->ioprio, prio); 1241 req->ioprio = ioprio_best(req->ioprio, prio);
1240 if (!blk_rq_cpu_valid(req)) 1242 if (!blk_rq_cpu_valid(req))
1241 req->cpu = bio->bi_comp_cpu; 1243 req->cpu = bio->bi_comp_cpu;
1242 drive_stat_acct(req, 0); 1244 drive_stat_acct(req, 0);
1243 elv_bio_merged(q, req, bio); 1245 elv_bio_merged(q, req, bio);
1244 if (!attempt_back_merge(q, req)) 1246 if (!attempt_back_merge(q, req))
1245 elv_merged_request(q, req, el_ret); 1247 elv_merged_request(q, req, el_ret);
1246 goto out; 1248 goto out;
1247 1249
1248 case ELEVATOR_FRONT_MERGE: 1250 case ELEVATOR_FRONT_MERGE:
1249 BUG_ON(!rq_mergeable(req)); 1251 BUG_ON(!rq_mergeable(req));
1250 1252
1251 if (!ll_front_merge_fn(q, req, bio)) 1253 if (!ll_front_merge_fn(q, req, bio))
1252 break; 1254 break;
1253 1255
1254 trace_block_bio_frontmerge(q, bio); 1256 trace_block_bio_frontmerge(q, bio);
1255 1257
1256 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) { 1258 if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) {
1257 blk_rq_set_mixed_merge(req); 1259 blk_rq_set_mixed_merge(req);
1258 req->cmd_flags &= ~REQ_FAILFAST_MASK; 1260 req->cmd_flags &= ~REQ_FAILFAST_MASK;
1259 req->cmd_flags |= ff; 1261 req->cmd_flags |= ff;
1260 } 1262 }
1261 1263
1262 bio->bi_next = req->bio; 1264 bio->bi_next = req->bio;
1263 req->bio = bio; 1265 req->bio = bio;
1264 1266
1265 /* 1267 /*
1266 * may not be valid. if the low level driver said 1268 * may not be valid. if the low level driver said
1267 * it didn't need a bounce buffer then it better 1269 * it didn't need a bounce buffer then it better
1268 * not touch req->buffer either... 1270 * not touch req->buffer either...
1269 */ 1271 */
1270 req->buffer = bio_data(bio); 1272 req->buffer = bio_data(bio);
1271 req->__sector = bio->bi_sector; 1273 req->__sector = bio->bi_sector;
1272 req->__data_len += bytes; 1274 req->__data_len += bytes;
1273 req->ioprio = ioprio_best(req->ioprio, prio); 1275 req->ioprio = ioprio_best(req->ioprio, prio);
1274 if (!blk_rq_cpu_valid(req)) 1276 if (!blk_rq_cpu_valid(req))
1275 req->cpu = bio->bi_comp_cpu; 1277 req->cpu = bio->bi_comp_cpu;
1276 drive_stat_acct(req, 0); 1278 drive_stat_acct(req, 0);
1277 elv_bio_merged(q, req, bio); 1279 elv_bio_merged(q, req, bio);
1278 if (!attempt_front_merge(q, req)) 1280 if (!attempt_front_merge(q, req))
1279 elv_merged_request(q, req, el_ret); 1281 elv_merged_request(q, req, el_ret);
1280 goto out; 1282 goto out;
1281 1283
1282 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 1284 /* ELV_NO_MERGE: elevator says don't/can't merge. */
1283 default: 1285 default:
1284 ; 1286 ;
1285 } 1287 }
1286 1288
1287 get_rq: 1289 get_rq:
1288 /* 1290 /*
1289 * This sync check and mask will be re-done in init_request_from_bio(), 1291 * This sync check and mask will be re-done in init_request_from_bio(),
1290 * but we need to set it earlier to expose the sync flag to the 1292 * but we need to set it earlier to expose the sync flag to the
1291 * rq allocator and io schedulers. 1293 * rq allocator and io schedulers.
1292 */ 1294 */
1293 rw_flags = bio_data_dir(bio); 1295 rw_flags = bio_data_dir(bio);
1294 if (sync) 1296 if (sync)
1295 rw_flags |= REQ_SYNC; 1297 rw_flags |= REQ_SYNC;
1296 1298
1297 /* 1299 /*
1298 * Grab a free request. This might sleep but cannot fail. 1300 * Grab a free request. This might sleep but cannot fail.
1299 * Returns with the queue unlocked. 1301 * Returns with the queue unlocked.
1300 */ 1302 */
1301 req = get_request_wait(q, rw_flags, bio); 1303 req = get_request_wait(q, rw_flags, bio);
1302 1304
1303 /* 1305 /*
1304 * After dropping the lock and possibly sleeping here, our request 1306 * After dropping the lock and possibly sleeping here, our request
1305 * may now be mergeable after it had proven unmergeable (above). 1307 * may now be mergeable after it had proven unmergeable (above).
1306 * We don't worry about that case for efficiency. It won't happen 1308 * We don't worry about that case for efficiency. It won't happen
1307 * often, and the elevators are able to handle it. 1309 * often, and the elevators are able to handle it.
1308 */ 1310 */
1309 init_request_from_bio(req, bio); 1311 init_request_from_bio(req, bio);
1310 1312
1311 spin_lock_irq(q->queue_lock); 1313 spin_lock_irq(q->queue_lock);
1312 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) || 1314 if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) ||
1313 bio_flagged(bio, BIO_CPU_AFFINE)) 1315 bio_flagged(bio, BIO_CPU_AFFINE))
1314 req->cpu = blk_cpu_to_group(smp_processor_id()); 1316 req->cpu = blk_cpu_to_group(smp_processor_id());
1315 if (queue_should_plug(q) && elv_queue_empty(q)) 1317 if (queue_should_plug(q) && elv_queue_empty(q))
1316 blk_plug_device(q); 1318 blk_plug_device(q);
1317 add_request(q, req); 1319 add_request(q, req);
1318 out: 1320 out:
1319 if (unplug || !queue_should_plug(q)) 1321 if (unplug || !queue_should_plug(q))
1320 __generic_unplug_device(q); 1322 __generic_unplug_device(q);
1321 spin_unlock_irq(q->queue_lock); 1323 spin_unlock_irq(q->queue_lock);
1322 return 0; 1324 return 0;
1323 } 1325 }
1324 1326
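Since __make_request() now fails any REQ_HARDBARRIER bio with -EOPNOTSUPP, drivers that relied on barrier ordering are expected to advertise their cache semantics instead; a hedged sketch, assuming the blk_queue_flush() interface that supersedes blk_queue_ordered():

#include <linux/blkdev.h>

/* Illustrative only: declare a volatile write cache that can be flushed
 * (REQ_FLUSH) and, optionally, that the device honours FUA writes (REQ_FUA). */
static void example_advertise_cache(struct request_queue *q, int has_fua)
{
        unsigned int flush = REQ_FLUSH;

        if (has_fua)
                flush |= REQ_FUA;

        blk_queue_flush(q, flush);
}
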
1325 /* 1327 /*
1326 * If bio->bi_dev is a partition, remap the location 1328 * If bio->bi_dev is a partition, remap the location
1327 */ 1329 */
1328 static inline void blk_partition_remap(struct bio *bio) 1330 static inline void blk_partition_remap(struct bio *bio)
1329 { 1331 {
1330 struct block_device *bdev = bio->bi_bdev; 1332 struct block_device *bdev = bio->bi_bdev;
1331 1333
1332 if (bio_sectors(bio) && bdev != bdev->bd_contains) { 1334 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
1333 struct hd_struct *p = bdev->bd_part; 1335 struct hd_struct *p = bdev->bd_part;
1334 1336
1335 bio->bi_sector += p->start_sect; 1337 bio->bi_sector += p->start_sect;
1336 bio->bi_bdev = bdev->bd_contains; 1338 bio->bi_bdev = bdev->bd_contains;
1337 1339
1338 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, 1340 trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
1339 bdev->bd_dev, 1341 bdev->bd_dev,
1340 bio->bi_sector - p->start_sect); 1342 bio->bi_sector - p->start_sect);
1341 } 1343 }
1342 } 1344 }
1343 1345
1344 static void handle_bad_sector(struct bio *bio) 1346 static void handle_bad_sector(struct bio *bio)
1345 { 1347 {
1346 char b[BDEVNAME_SIZE]; 1348 char b[BDEVNAME_SIZE];
1347 1349
1348 printk(KERN_INFO "attempt to access beyond end of device\n"); 1350 printk(KERN_INFO "attempt to access beyond end of device\n");
1349 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 1351 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
1350 bdevname(bio->bi_bdev, b), 1352 bdevname(bio->bi_bdev, b),
1351 bio->bi_rw, 1353 bio->bi_rw,
1352 (unsigned long long)bio->bi_sector + bio_sectors(bio), 1354 (unsigned long long)bio->bi_sector + bio_sectors(bio),
1353 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 1355 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
1354 1356
1355 set_bit(BIO_EOF, &bio->bi_flags); 1357 set_bit(BIO_EOF, &bio->bi_flags);
1356 } 1358 }
1357 1359
1358 #ifdef CONFIG_FAIL_MAKE_REQUEST 1360 #ifdef CONFIG_FAIL_MAKE_REQUEST
1359 1361
1360 static DECLARE_FAULT_ATTR(fail_make_request); 1362 static DECLARE_FAULT_ATTR(fail_make_request);
1361 1363
1362 static int __init setup_fail_make_request(char *str) 1364 static int __init setup_fail_make_request(char *str)
1363 { 1365 {
1364 return setup_fault_attr(&fail_make_request, str); 1366 return setup_fault_attr(&fail_make_request, str);
1365 } 1367 }
1366 __setup("fail_make_request=", setup_fail_make_request); 1368 __setup("fail_make_request=", setup_fail_make_request);
1367 1369
1368 static int should_fail_request(struct bio *bio) 1370 static int should_fail_request(struct bio *bio)
1369 { 1371 {
1370 struct hd_struct *part = bio->bi_bdev->bd_part; 1372 struct hd_struct *part = bio->bi_bdev->bd_part;
1371 1373
1372 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail) 1374 if (part_to_disk(part)->part0.make_it_fail || part->make_it_fail)
1373 return should_fail(&fail_make_request, bio->bi_size); 1375 return should_fail(&fail_make_request, bio->bi_size);
1374 1376
1375 return 0; 1377 return 0;
1376 } 1378 }
1377 1379
1378 static int __init fail_make_request_debugfs(void) 1380 static int __init fail_make_request_debugfs(void)
1379 { 1381 {
1380 return init_fault_attr_dentries(&fail_make_request, 1382 return init_fault_attr_dentries(&fail_make_request,
1381 "fail_make_request"); 1383 "fail_make_request");
1382 } 1384 }
1383 1385
1384 late_initcall(fail_make_request_debugfs); 1386 late_initcall(fail_make_request_debugfs);
1385 1387
1386 #else /* CONFIG_FAIL_MAKE_REQUEST */ 1388 #else /* CONFIG_FAIL_MAKE_REQUEST */
1387 1389
1388 static inline int should_fail_request(struct bio *bio) 1390 static inline int should_fail_request(struct bio *bio)
1389 { 1391 {
1390 return 0; 1392 return 0;
1391 } 1393 }
1392 1394
1393 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 1395 #endif /* CONFIG_FAIL_MAKE_REQUEST */
1394 1396
1395 /* 1397 /*
1396 * Check whether this bio extends beyond the end of the device. 1398 * Check whether this bio extends beyond the end of the device.
1397 */ 1399 */
1398 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 1400 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
1399 { 1401 {
1400 sector_t maxsector; 1402 sector_t maxsector;
1401 1403
1402 if (!nr_sectors) 1404 if (!nr_sectors)
1403 return 0; 1405 return 0;
1404 1406
1405 /* Test device or partition size, when known. */ 1407 /* Test device or partition size, when known. */
1406 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 1408 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
1407 if (maxsector) { 1409 if (maxsector) {
1408 sector_t sector = bio->bi_sector; 1410 sector_t sector = bio->bi_sector;
1409 1411
1410 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 1412 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
1411 /* 1413 /*
1412 * This may well happen - the kernel calls bread() 1414 * This may well happen - the kernel calls bread()
1413 * without checking the size of the device, e.g., when 1415 * without checking the size of the device, e.g., when
1414 * mounting a device. 1416 * mounting a device.
1415 */ 1417 */
1416 handle_bad_sector(bio); 1418 handle_bad_sector(bio);
1417 return 1; 1419 return 1;
1418 } 1420 }
1419 } 1421 }
1420 1422
1421 return 0; 1423 return 0;
1422 } 1424 }
1423 1425
1424 /** 1426 /**
1425 * generic_make_request - hand a buffer to its device driver for I/O 1427 * generic_make_request - hand a buffer to its device driver for I/O
1426 * @bio: The bio describing the location in memory and on the device. 1428 * @bio: The bio describing the location in memory and on the device.
1427 * 1429 *
1428 * generic_make_request() is used to make I/O requests of block 1430 * generic_make_request() is used to make I/O requests of block
1429 * devices. It is passed a &struct bio, which describes the I/O that needs 1431 * devices. It is passed a &struct bio, which describes the I/O that needs
1430 * to be done. 1432 * to be done.
1431 * 1433 *
1432 * generic_make_request() does not return any status. The 1434 * generic_make_request() does not return any status. The
1433 * success/failure status of the request, along with notification of 1435 * success/failure status of the request, along with notification of
1434 * completion, is delivered asynchronously through the bio->bi_end_io 1436 * completion, is delivered asynchronously through the bio->bi_end_io
1435 * function described (one day) elsewhere. 1437 * function described (one day) elsewhere.
1436 * 1438 *
1437 * The caller of generic_make_request must make sure that bi_io_vec 1439 * The caller of generic_make_request must make sure that bi_io_vec
1438 * are set to describe the memory buffer, and that bi_dev and bi_sector are 1440 * are set to describe the memory buffer, and that bi_dev and bi_sector are
1439 * set to describe the device address, and the 1441 * set to describe the device address, and the
1440 * bi_end_io and optionally bi_private are set to describe how 1442 * bi_end_io and optionally bi_private are set to describe how
1441 * completion notification should be signaled. 1443 * completion notification should be signaled.
1442 * 1444 *
1443 * generic_make_request and the drivers it calls may use bi_next if this 1445 * generic_make_request and the drivers it calls may use bi_next if this
1444 * bio happens to be merged with someone else, and may change bi_dev and 1446 * bio happens to be merged with someone else, and may change bi_dev and
1445 * bi_sector for remaps as it sees fit. So the values of these fields 1447 * bi_sector for remaps as it sees fit. So the values of these fields
1446 * should NOT be depended on after the call to generic_make_request. 1448 * should NOT be depended on after the call to generic_make_request.
1447 */ 1449 */
1448 static inline void __generic_make_request(struct bio *bio) 1450 static inline void __generic_make_request(struct bio *bio)
1449 { 1451 {
1450 struct request_queue *q; 1452 struct request_queue *q;
1451 sector_t old_sector; 1453 sector_t old_sector;
1452 int ret, nr_sectors = bio_sectors(bio); 1454 int ret, nr_sectors = bio_sectors(bio);
1453 dev_t old_dev; 1455 dev_t old_dev;
1454 int err = -EIO; 1456 int err = -EIO;
1455 1457
1456 might_sleep(); 1458 might_sleep();
1457 1459
1458 if (bio_check_eod(bio, nr_sectors)) 1460 if (bio_check_eod(bio, nr_sectors))
1459 goto end_io; 1461 goto end_io;
1460 1462
1461 /* 1463 /*
1462 * Resolve the mapping until finished. (drivers are 1464 * Resolve the mapping until finished. (drivers are
1463 * still free to implement/resolve their own stacking 1465 * still free to implement/resolve their own stacking
1464 * by explicitly returning 0) 1466 * by explicitly returning 0)
1465 * 1467 *
1466 * NOTE: we don't repeat the blk_size check for each new device. 1468 * NOTE: we don't repeat the blk_size check for each new device.
1467 * Stacking drivers are expected to know what they are doing. 1469 * Stacking drivers are expected to know what they are doing.
1468 */ 1470 */
1469 old_sector = -1; 1471 old_sector = -1;
1470 old_dev = 0; 1472 old_dev = 0;
1471 do { 1473 do {
1472 char b[BDEVNAME_SIZE]; 1474 char b[BDEVNAME_SIZE];
1473 1475
1474 q = bdev_get_queue(bio->bi_bdev); 1476 q = bdev_get_queue(bio->bi_bdev);
1475 if (unlikely(!q)) { 1477 if (unlikely(!q)) {
1476 printk(KERN_ERR 1478 printk(KERN_ERR
1477 "generic_make_request: Trying to access " 1479 "generic_make_request: Trying to access "
1478 "nonexistent block-device %s (%Lu)\n", 1480 "nonexistent block-device %s (%Lu)\n",
1479 bdevname(bio->bi_bdev, b), 1481 bdevname(bio->bi_bdev, b),
1480 (long long) bio->bi_sector); 1482 (long long) bio->bi_sector);
1481 goto end_io; 1483 goto end_io;
1482 } 1484 }
1483 1485
1484 if (unlikely(!(bio->bi_rw & REQ_DISCARD) && 1486 if (unlikely(!(bio->bi_rw & REQ_DISCARD) &&
1485 nr_sectors > queue_max_hw_sectors(q))) { 1487 nr_sectors > queue_max_hw_sectors(q))) {
1486 printk(KERN_ERR "bio too big device %s (%u > %u)\n", 1488 printk(KERN_ERR "bio too big device %s (%u > %u)\n",
1487 bdevname(bio->bi_bdev, b), 1489 bdevname(bio->bi_bdev, b),
1488 bio_sectors(bio), 1490 bio_sectors(bio),
1489 queue_max_hw_sectors(q)); 1491 queue_max_hw_sectors(q));
1490 goto end_io; 1492 goto end_io;
1491 } 1493 }
1492 1494
1493 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 1495 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
1494 goto end_io; 1496 goto end_io;
1495 1497
1496 if (should_fail_request(bio)) 1498 if (should_fail_request(bio))
1497 goto end_io; 1499 goto end_io;
1498 1500
1499 /* 1501 /*
1500 * If this device has partitions, remap block n 1502 * If this device has partitions, remap block n
1501 * of partition p to block n+start(p) of the disk. 1503 * of partition p to block n+start(p) of the disk.
1502 */ 1504 */
1503 blk_partition_remap(bio); 1505 blk_partition_remap(bio);
1504 1506
1505 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) 1507 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio))
1506 goto end_io; 1508 goto end_io;
1507 1509
1508 if (old_sector != -1) 1510 if (old_sector != -1)
1509 trace_block_remap(q, bio, old_dev, old_sector); 1511 trace_block_remap(q, bio, old_dev, old_sector);
1510 1512
1511 old_sector = bio->bi_sector; 1513 old_sector = bio->bi_sector;
1512 old_dev = bio->bi_bdev->bd_dev; 1514 old_dev = bio->bi_bdev->bd_dev;
1513 1515
1514 if (bio_check_eod(bio, nr_sectors)) 1516 if (bio_check_eod(bio, nr_sectors))
1515 goto end_io; 1517 goto end_io;
1516 1518
1517 if ((bio->bi_rw & REQ_DISCARD) && 1519 if ((bio->bi_rw & REQ_DISCARD) &&
1518 (!blk_queue_discard(q) || 1520 (!blk_queue_discard(q) ||
1519 ((bio->bi_rw & REQ_SECURE) && 1521 ((bio->bi_rw & REQ_SECURE) &&
1520 !blk_queue_secdiscard(q)))) { 1522 !blk_queue_secdiscard(q)))) {
1521 err = -EOPNOTSUPP; 1523 err = -EOPNOTSUPP;
1522 goto end_io; 1524 goto end_io;
1523 } 1525 }
1524 1526
1525 trace_block_bio_queue(q, bio); 1527 trace_block_bio_queue(q, bio);
1526 1528
1527 ret = q->make_request_fn(q, bio); 1529 ret = q->make_request_fn(q, bio);
1528 } while (ret); 1530 } while (ret);
1529 1531
1530 return; 1532 return;
1531 1533
1532 end_io: 1534 end_io:
1533 bio_endio(bio, err); 1535 bio_endio(bio, err);
1534 } 1536 }
1535 1537
1536 /* 1538 /*
1537 * We only want one ->make_request_fn to be active at a time, 1539 * We only want one ->make_request_fn to be active at a time,
1538 * else stack usage with stacked devices could be a problem. 1540 * else stack usage with stacked devices could be a problem.
1539 * So use current->bio_list to keep a list of requests 1541 * So use current->bio_list to keep a list of requests
1540 * submitted by a make_request_fn function. 1542 * submitted by a make_request_fn function.
1541 * current->bio_list is also used as a flag to say if 1543 * current->bio_list is also used as a flag to say if
1542 * generic_make_request is currently active in this task or not. 1544 * generic_make_request is currently active in this task or not.
1543 * If it is NULL, then no make_request is active. If it is non-NULL, 1545 * If it is NULL, then no make_request is active. If it is non-NULL,
1544 * then a make_request is active, and new requests should be added 1546 * then a make_request is active, and new requests should be added
1545 * at the tail 1547 * at the tail
1546 */ 1548 */
1547 void generic_make_request(struct bio *bio) 1549 void generic_make_request(struct bio *bio)
1548 { 1550 {
1549 struct bio_list bio_list_on_stack; 1551 struct bio_list bio_list_on_stack;
1550 1552
1551 if (current->bio_list) { 1553 if (current->bio_list) {
1552 /* make_request is active */ 1554 /* make_request is active */
1553 bio_list_add(current->bio_list, bio); 1555 bio_list_add(current->bio_list, bio);
1554 return; 1556 return;
1555 } 1557 }
1556 /* following loop may be a bit non-obvious, and so deserves some 1558 /* following loop may be a bit non-obvious, and so deserves some
1557 * explanation. 1559 * explanation.
1558 * Before entering the loop, bio->bi_next is NULL (as all callers 1560 * Before entering the loop, bio->bi_next is NULL (as all callers
1559 * ensure that) so we have a list with a single bio. 1561 * ensure that) so we have a list with a single bio.
1560 * We pretend that we have just taken it off a longer list, so 1562 * We pretend that we have just taken it off a longer list, so
1561 * we assign bio_list to a pointer to the bio_list_on_stack, 1563 * we assign bio_list to a pointer to the bio_list_on_stack,
1562 * thus initialising the bio_list of new bios to be 1564 * thus initialising the bio_list of new bios to be
1563 * added. __generic_make_request may indeed add some more bios 1565 * added. __generic_make_request may indeed add some more bios
1564 * through a recursive call to generic_make_request. If it 1566 * through a recursive call to generic_make_request. If it
1565 * did, we find a non-NULL value in bio_list and re-enter the loop 1567 * did, we find a non-NULL value in bio_list and re-enter the loop
1566 * from the top. In this case we really did just take the bio 1568 * from the top. In this case we really did just take the bio
1567 * off the top of the list (no pretending) and so remove it from 1569 * off the top of the list (no pretending) and so remove it from
1568 * bio_list, and call into __generic_make_request again. 1570 * bio_list, and call into __generic_make_request again.
1569 * 1571 *
1570 * The loop was structured like this to make only one call to 1572 * The loop was structured like this to make only one call to
1571 * __generic_make_request (which is important as it is large and 1573 * __generic_make_request (which is important as it is large and
1572 * inlined) and to keep the structure simple. 1574 * inlined) and to keep the structure simple.
1573 */ 1575 */
1574 BUG_ON(bio->bi_next); 1576 BUG_ON(bio->bi_next);
1575 bio_list_init(&bio_list_on_stack); 1577 bio_list_init(&bio_list_on_stack);
1576 current->bio_list = &bio_list_on_stack; 1578 current->bio_list = &bio_list_on_stack;
1577 do { 1579 do {
1578 __generic_make_request(bio); 1580 __generic_make_request(bio);
1579 bio = bio_list_pop(current->bio_list); 1581 bio = bio_list_pop(current->bio_list);
1580 } while (bio); 1582 } while (bio);
1581 current->bio_list = NULL; /* deactivate */ 1583 current->bio_list = NULL; /* deactivate */
1582 } 1584 }
1583 EXPORT_SYMBOL(generic_make_request); 1585 EXPORT_SYMBOL(generic_make_request);
1584 1586
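The do/while loop in __generic_make_request() is what lets a simple remapping driver redirect a bio without recursing: its make_request_fn may adjust bi_bdev/bi_sector and return non-zero so the loop resubmits, or do its own stacking and return 0. A hedged sketch of the former (struct and field names are illustrative):

#include <linux/blkdev.h>
#include <linux/bio.h>

/* Illustrative only: a trivial linear remapping make_request_fn. */
struct linear_map {
        struct block_device *lower_bdev;        /* device we remap onto */
        sector_t start;                         /* offset of our slice on it */
};

static int linear_make_request(struct request_queue *q, struct bio *bio)
{
        struct linear_map *lm = q->queuedata;

        bio->bi_bdev = lm->lower_bdev;
        bio->bi_sector += lm->start;

        return 1;       /* non-zero: __generic_make_request() resubmits the bio */
}
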
1585 /** 1587 /**
1586 * submit_bio - submit a bio to the block device layer for I/O 1588 * submit_bio - submit a bio to the block device layer for I/O
1587 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 1589 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
1588 * @bio: The &struct bio which describes the I/O 1590 * @bio: The &struct bio which describes the I/O
1589 * 1591 *
1590 * submit_bio() is very similar in purpose to generic_make_request(), and 1592 * submit_bio() is very similar in purpose to generic_make_request(), and
1591 * uses that function to do most of the work. Both are fairly rough 1593 * uses that function to do most of the work. Both are fairly rough
1592 * interfaces; @bio must be presetup and ready for I/O. 1594 * interfaces; @bio must be presetup and ready for I/O.
1593 * 1595 *
1594 */ 1596 */
1595 void submit_bio(int rw, struct bio *bio) 1597 void submit_bio(int rw, struct bio *bio)
1596 { 1598 {
1597 int count = bio_sectors(bio); 1599 int count = bio_sectors(bio);
1598 1600
1599 bio->bi_rw |= rw; 1601 bio->bi_rw |= rw;
1600 1602
1601 /* 1603 /*
1602 * If it's a regular read/write or a barrier with data attached, 1604 * If it's a regular read/write or a barrier with data attached,
1603 * go through the normal accounting stuff before submission. 1605 * go through the normal accounting stuff before submission.
1604 */ 1606 */
1605 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) { 1607 if (bio_has_data(bio) && !(rw & REQ_DISCARD)) {
1606 if (rw & WRITE) { 1608 if (rw & WRITE) {
1607 count_vm_events(PGPGOUT, count); 1609 count_vm_events(PGPGOUT, count);
1608 } else { 1610 } else {
1609 task_io_account_read(bio->bi_size); 1611 task_io_account_read(bio->bi_size);
1610 count_vm_events(PGPGIN, count); 1612 count_vm_events(PGPGIN, count);
1611 } 1613 }
1612 1614
1613 if (unlikely(block_dump)) { 1615 if (unlikely(block_dump)) {
1614 char b[BDEVNAME_SIZE]; 1616 char b[BDEVNAME_SIZE];
1615 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 1617 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
1616 current->comm, task_pid_nr(current), 1618 current->comm, task_pid_nr(current),
1617 (rw & WRITE) ? "WRITE" : "READ", 1619 (rw & WRITE) ? "WRITE" : "READ",
1618 (unsigned long long)bio->bi_sector, 1620 (unsigned long long)bio->bi_sector,
1619 bdevname(bio->bi_bdev, b)); 1621 bdevname(bio->bi_bdev, b));
1620 } 1622 }
1621 } 1623 }
1622 1624
1623 generic_make_request(bio); 1625 generic_make_request(bio);
1624 } 1626 }
1625 EXPORT_SYMBOL(submit_bio); 1627 EXPORT_SYMBOL(submit_bio);
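For reference, a typical caller builds a bio and hands it off as below. This is a hedged sketch only (my_end_io and my_write_page are hypothetical), relying on the bio fields already used in this file (bi_bdev, bi_sector) and the bi_end_io callback prototype of this kernel series.

	#include <linux/bio.h>
	#include <linux/blkdev.h>

	/* Hypothetical completion callback. */
	static void my_end_io(struct bio *bio, int error)
	{
		/* ... record error, wake up the waiter ... */
		bio_put(bio);
	}

	/* Hypothetical single-page write submitted through submit_bio(). */
	static int my_write_page(struct block_device *bdev, sector_t sector,
				 struct page *page)
	{
		struct bio *bio = bio_alloc(GFP_NOIO, 1);

		if (!bio)
			return -ENOMEM;
		bio->bi_bdev = bdev;
		bio->bi_sector = sector;
		bio->bi_end_io = my_end_io;
		bio_add_page(bio, page, PAGE_SIZE, 0);

		submit_bio(WRITE, bio);	/* accounting, then generic_make_request() */
		return 0;
	}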
1626 1628
1627 /** 1629 /**
1628 * blk_rq_check_limits - Helper function to check a request for the queue limit 1630 * blk_rq_check_limits - Helper function to check a request for the queue limit
1629 * @q: the queue 1631 * @q: the queue
1630 * @rq: the request being checked 1632 * @rq: the request being checked
1631 * 1633 *
1632 * Description: 1634 * Description:
1633 * @rq may have been made based on weaker limitations of upper-level queues 1635 * @rq may have been made based on weaker limitations of upper-level queues
1634 * in request stacking drivers, and it may violate the limitation of @q. 1636 * in request stacking drivers, and it may violate the limitation of @q.
1635 * Since the block layer and the underlying device driver trust @rq 1637 * Since the block layer and the underlying device driver trust @rq
1636 * after it is inserted to @q, it should be checked against @q before 1638 * after it is inserted to @q, it should be checked against @q before
1637 * the insertion using this generic function. 1639 * the insertion using this generic function.
1638 * 1640 *
1639 * This function should also be useful for request stacking drivers 1641 * This function should also be useful for request stacking drivers
1640 * in some cases below, so export this function. 1642 * in some cases below, so export this function.
1641 * Request stacking drivers like request-based dm may change the queue 1643 * Request stacking drivers like request-based dm may change the queue
1642 * limits while requests are in the queue (e.g. dm's table swapping). 1644 * limits while requests are in the queue (e.g. dm's table swapping).
1643 * Such request stacking drivers should check those requests against 1645 * Such request stacking drivers should check those requests against
1644 * the new queue limits again when they dispatch those requests, 1646 * the new queue limits again when they dispatch those requests,
1645 * although such checks are also done against the old queue limits 1647 * although such checks are also done against the old queue limits
1646 * when submitting requests. 1648 * when submitting requests.
1647 */ 1649 */
1648 int blk_rq_check_limits(struct request_queue *q, struct request *rq) 1650 int blk_rq_check_limits(struct request_queue *q, struct request *rq)
1649 { 1651 {
1650 if (rq->cmd_flags & REQ_DISCARD) 1652 if (rq->cmd_flags & REQ_DISCARD)
1651 return 0; 1653 return 0;
1652 1654
1653 if (blk_rq_sectors(rq) > queue_max_sectors(q) || 1655 if (blk_rq_sectors(rq) > queue_max_sectors(q) ||
1654 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { 1656 blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) {
1655 printk(KERN_ERR "%s: over max size limit.\n", __func__); 1657 printk(KERN_ERR "%s: over max size limit.\n", __func__);
1656 return -EIO; 1658 return -EIO;
1657 } 1659 }
1658 1660
1659 /* 1661 /*
1660 * queue's settings related to segment counting like q->bounce_pfn 1662 * queue's settings related to segment counting like q->bounce_pfn
1661 * may differ from that of other stacking queues. 1663 * may differ from that of other stacking queues.
1662 * Recalculate it to check the request correctly on this queue's 1664 * Recalculate it to check the request correctly on this queue's
1663 * limitation. 1665 * limitation.
1664 */ 1666 */
1665 blk_recalc_rq_segments(rq); 1667 blk_recalc_rq_segments(rq);
1666 if (rq->nr_phys_segments > queue_max_segments(q)) { 1668 if (rq->nr_phys_segments > queue_max_segments(q)) {
1667 printk(KERN_ERR "%s: over max segments limit.\n", __func__); 1669 printk(KERN_ERR "%s: over max segments limit.\n", __func__);
1668 return -EIO; 1670 return -EIO;
1669 } 1671 }
1670 1672
1671 return 0; 1673 return 0;
1672 } 1674 }
1673 EXPORT_SYMBOL_GPL(blk_rq_check_limits); 1675 EXPORT_SYMBOL_GPL(blk_rq_check_limits);
1674 1676
1675 /** 1677 /**
1676 * blk_insert_cloned_request - Helper for stacking drivers to submit a request 1678 * blk_insert_cloned_request - Helper for stacking drivers to submit a request
1677 * @q: the queue to submit the request 1679 * @q: the queue to submit the request
1678 * @rq: the request being queued 1680 * @rq: the request being queued
1679 */ 1681 */
1680 int blk_insert_cloned_request(struct request_queue *q, struct request *rq) 1682 int blk_insert_cloned_request(struct request_queue *q, struct request *rq)
1681 { 1683 {
1682 unsigned long flags; 1684 unsigned long flags;
1683 1685
1684 if (blk_rq_check_limits(q, rq)) 1686 if (blk_rq_check_limits(q, rq))
1685 return -EIO; 1687 return -EIO;
1686 1688
1687 #ifdef CONFIG_FAIL_MAKE_REQUEST 1689 #ifdef CONFIG_FAIL_MAKE_REQUEST
1688 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail && 1690 if (rq->rq_disk && rq->rq_disk->part0.make_it_fail &&
1689 should_fail(&fail_make_request, blk_rq_bytes(rq))) 1691 should_fail(&fail_make_request, blk_rq_bytes(rq)))
1690 return -EIO; 1692 return -EIO;
1691 #endif 1693 #endif
1692 1694
1693 spin_lock_irqsave(q->queue_lock, flags); 1695 spin_lock_irqsave(q->queue_lock, flags);
1694 1696
1695 /* 1697 /*
1696 * Submitting request must be dequeued before calling this function 1698 * Submitting request must be dequeued before calling this function
1697 * because it will be linked to another request_queue 1699 * because it will be linked to another request_queue
1698 */ 1700 */
1699 BUG_ON(blk_queued_rq(rq)); 1701 BUG_ON(blk_queued_rq(rq));
1700 1702
1701 drive_stat_acct(rq, 1); 1703 drive_stat_acct(rq, 1);
1702 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1704 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1703 1705
1704 spin_unlock_irqrestore(q->queue_lock, flags); 1706 spin_unlock_irqrestore(q->queue_lock, flags);
1705 1707
1706 return 0; 1708 return 0;
1707 } 1709 }
1708 EXPORT_SYMBOL_GPL(blk_insert_cloned_request); 1710 EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
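Request stacking drivers call this when pushing an already-prepared clone to a lower device; blk_insert_cloned_request() re-checks the clone against the lower queue via blk_rq_check_limits() before queueing it. A hedged sketch follows (my_dispatch_clone and lower_q are illustrative names):

	/* Hypothetical dispatch of a prepared clone to the lower queue. */
	static int my_dispatch_clone(struct request_queue *lower_q,
				     struct request *clone)
	{
		int ret;

		ret = blk_insert_cloned_request(lower_q, clone);
		if (ret)
			return ret;	/* e.g. -EIO: clone violates lower_q's limits */
		return 0;
	}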
1709 1711
1710 /** 1712 /**
1711 * blk_rq_err_bytes - determine number of bytes till the next failure boundary 1713 * blk_rq_err_bytes - determine number of bytes till the next failure boundary
1712 * @rq: request to examine 1714 * @rq: request to examine
1713 * 1715 *
1714 * Description: 1716 * Description:
1715 * A request could be a merge of IOs which require different failure 1717 * A request could be a merge of IOs which require different failure
1716 * handling. This function determines the number of bytes which 1718 * handling. This function determines the number of bytes which
1717 * can be failed from the beginning of the request without 1719 * can be failed from the beginning of the request without
1718 * crossing into an area which needs to be retried further. 1720 * crossing into an area which needs to be retried further.
1719 * 1721 *
1720 * Return: 1722 * Return:
1721 * The number of bytes to fail. 1723 * The number of bytes to fail.
1722 * 1724 *
1723 * Context: 1725 * Context:
1724 * queue_lock must be held. 1726 * queue_lock must be held.
1725 */ 1727 */
1726 unsigned int blk_rq_err_bytes(const struct request *rq) 1728 unsigned int blk_rq_err_bytes(const struct request *rq)
1727 { 1729 {
1728 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; 1730 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
1729 unsigned int bytes = 0; 1731 unsigned int bytes = 0;
1730 struct bio *bio; 1732 struct bio *bio;
1731 1733
1732 if (!(rq->cmd_flags & REQ_MIXED_MERGE)) 1734 if (!(rq->cmd_flags & REQ_MIXED_MERGE))
1733 return blk_rq_bytes(rq); 1735 return blk_rq_bytes(rq);
1734 1736
1735 /* 1737 /*
1736 * Currently the only 'mixing' which can happen is between 1738 * Currently the only 'mixing' which can happen is between
1737 * different fastfail types. We can safely fail portions 1739 * different fastfail types. We can safely fail portions
1738 * which have all the failfast bits that the first one has - 1740 * which have all the failfast bits that the first one has -
1739 * the ones which are at least as eager to fail as the first 1741 * the ones which are at least as eager to fail as the first
1740 * one. 1742 * one.
1741 */ 1743 */
1742 for (bio = rq->bio; bio; bio = bio->bi_next) { 1744 for (bio = rq->bio; bio; bio = bio->bi_next) {
1743 if ((bio->bi_rw & ff) != ff) 1745 if ((bio->bi_rw & ff) != ff)
1744 break; 1746 break;
1745 bytes += bio->bi_size; 1747 bytes += bio->bi_size;
1746 } 1748 }
1747 1749
1748 /* this could lead to infinite loop */ 1750 /* this could lead to infinite loop */
1749 BUG_ON(blk_rq_bytes(rq) && !bytes); 1751 BUG_ON(blk_rq_bytes(rq) && !bytes);
1750 return bytes; 1752 return bytes;
1751 } 1753 }
1752 EXPORT_SYMBOL_GPL(blk_rq_err_bytes); 1754 EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
1753 1755
1754 static void blk_account_io_completion(struct request *req, unsigned int bytes) 1756 static void blk_account_io_completion(struct request *req, unsigned int bytes)
1755 { 1757 {
1756 if (blk_do_io_stat(req)) { 1758 if (blk_do_io_stat(req)) {
1757 const int rw = rq_data_dir(req); 1759 const int rw = rq_data_dir(req);
1758 struct hd_struct *part; 1760 struct hd_struct *part;
1759 int cpu; 1761 int cpu;
1760 1762
1761 cpu = part_stat_lock(); 1763 cpu = part_stat_lock();
1762 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1764 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
1763 part_stat_add(cpu, part, sectors[rw], bytes >> 9); 1765 part_stat_add(cpu, part, sectors[rw], bytes >> 9);
1764 part_stat_unlock(); 1766 part_stat_unlock();
1765 } 1767 }
1766 } 1768 }
1767 1769
1768 static void blk_account_io_done(struct request *req) 1770 static void blk_account_io_done(struct request *req)
1769 { 1771 {
1770 /* 1772 /*
1771 * Account IO completion. bar_rq isn't accounted as a normal 1773 * Account IO completion. bar_rq isn't accounted as a normal
1772 * IO on queueing nor completion. Accounting the containing 1774 * IO on queueing nor completion. Accounting the containing
1773 * request is enough. 1775 * request is enough.
1774 */ 1776 */
1775 if (blk_do_io_stat(req) && req != &req->q->bar_rq) { 1777 if (blk_do_io_stat(req) && req != &req->q->bar_rq) {
1776 unsigned long duration = jiffies - req->start_time; 1778 unsigned long duration = jiffies - req->start_time;
1777 const int rw = rq_data_dir(req); 1779 const int rw = rq_data_dir(req);
1778 struct hd_struct *part; 1780 struct hd_struct *part;
1779 int cpu; 1781 int cpu;
1780 1782
1781 cpu = part_stat_lock(); 1783 cpu = part_stat_lock();
1782 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); 1784 part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req));
1783 1785
1784 part_stat_inc(cpu, part, ios[rw]); 1786 part_stat_inc(cpu, part, ios[rw]);
1785 part_stat_add(cpu, part, ticks[rw], duration); 1787 part_stat_add(cpu, part, ticks[rw], duration);
1786 part_round_stats(cpu, part); 1788 part_round_stats(cpu, part);
1787 part_dec_in_flight(part, rw); 1789 part_dec_in_flight(part, rw);
1788 1790
1789 part_stat_unlock(); 1791 part_stat_unlock();
1790 } 1792 }
1791 } 1793 }
1792 1794
1793 /** 1795 /**
1794 * blk_peek_request - peek at the top of a request queue 1796 * blk_peek_request - peek at the top of a request queue
1795 * @q: request queue to peek at 1797 * @q: request queue to peek at
1796 * 1798 *
1797 * Description: 1799 * Description:
1798 * Return the request at the top of @q. The returned request 1800 * Return the request at the top of @q. The returned request
1799 * should be started using blk_start_request() before LLD starts 1801 * should be started using blk_start_request() before LLD starts
1800 * processing it. 1802 * processing it.
1801 * 1803 *
1802 * Return: 1804 * Return:
1803 * Pointer to the request at the top of @q if available. Null 1805 * Pointer to the request at the top of @q if available. Null
1804 * otherwise. 1806 * otherwise.
1805 * 1807 *
1806 * Context: 1808 * Context:
1807 * queue_lock must be held. 1809 * queue_lock must be held.
1808 */ 1810 */
1809 struct request *blk_peek_request(struct request_queue *q) 1811 struct request *blk_peek_request(struct request_queue *q)
1810 { 1812 {
1811 struct request *rq; 1813 struct request *rq;
1812 int ret; 1814 int ret;
1813 1815
1814 while ((rq = __elv_next_request(q)) != NULL) { 1816 while ((rq = __elv_next_request(q)) != NULL) {
1815 if (!(rq->cmd_flags & REQ_STARTED)) { 1817 if (!(rq->cmd_flags & REQ_STARTED)) {
1816 /* 1818 /*
1817 * This is the first time the device driver 1819 * This is the first time the device driver
1818 * sees this request (possibly after 1820 * sees this request (possibly after
1819 * requeueing). Notify IO scheduler. 1821 * requeueing). Notify IO scheduler.
1820 */ 1822 */
1821 if (rq->cmd_flags & REQ_SORTED) 1823 if (rq->cmd_flags & REQ_SORTED)
1822 elv_activate_rq(q, rq); 1824 elv_activate_rq(q, rq);
1823 1825
1824 /* 1826 /*
1825 * just mark as started even if we don't start 1827 * just mark as started even if we don't start
1826 * it, a request that has been delayed should 1828 * it, a request that has been delayed should
1827 * not be passed by new incoming requests 1829 * not be passed by new incoming requests
1828 */ 1830 */
1829 rq->cmd_flags |= REQ_STARTED; 1831 rq->cmd_flags |= REQ_STARTED;
1830 trace_block_rq_issue(q, rq); 1832 trace_block_rq_issue(q, rq);
1831 } 1833 }
1832 1834
1833 if (!q->boundary_rq || q->boundary_rq == rq) { 1835 if (!q->boundary_rq || q->boundary_rq == rq) {
1834 q->end_sector = rq_end_sector(rq); 1836 q->end_sector = rq_end_sector(rq);
1835 q->boundary_rq = NULL; 1837 q->boundary_rq = NULL;
1836 } 1838 }
1837 1839
1838 if (rq->cmd_flags & REQ_DONTPREP) 1840 if (rq->cmd_flags & REQ_DONTPREP)
1839 break; 1841 break;
1840 1842
1841 if (q->dma_drain_size && blk_rq_bytes(rq)) { 1843 if (q->dma_drain_size && blk_rq_bytes(rq)) {
1842 /* 1844 /*
1843 * make sure space for the drain appears; we 1845 * make sure space for the drain appears; we
1844 * know we can do this because max_hw_segments 1846 * know we can do this because max_hw_segments
1845 * has been adjusted to be one fewer than the 1847 * has been adjusted to be one fewer than the
1846 * device can handle 1848 * device can handle
1847 */ 1849 */
1848 rq->nr_phys_segments++; 1850 rq->nr_phys_segments++;
1849 } 1851 }
1850 1852
1851 if (!q->prep_rq_fn) 1853 if (!q->prep_rq_fn)
1852 break; 1854 break;
1853 1855
1854 ret = q->prep_rq_fn(q, rq); 1856 ret = q->prep_rq_fn(q, rq);
1855 if (ret == BLKPREP_OK) { 1857 if (ret == BLKPREP_OK) {
1856 break; 1858 break;
1857 } else if (ret == BLKPREP_DEFER) { 1859 } else if (ret == BLKPREP_DEFER) {
1858 /* 1860 /*
1859 * the request may have been (partially) prepped. 1861 * the request may have been (partially) prepped.
1860 * we need to keep this request in the front to 1862 * we need to keep this request in the front to
1861 * avoid resource deadlock. REQ_STARTED will 1863 * avoid resource deadlock. REQ_STARTED will
1862 * prevent other fs requests from passing this one. 1864 * prevent other fs requests from passing this one.
1863 */ 1865 */
1864 if (q->dma_drain_size && blk_rq_bytes(rq) && 1866 if (q->dma_drain_size && blk_rq_bytes(rq) &&
1865 !(rq->cmd_flags & REQ_DONTPREP)) { 1867 !(rq->cmd_flags & REQ_DONTPREP)) {
1866 /* 1868 /*
1867 * remove the space for the drain we added 1869 * remove the space for the drain we added
1868 * so that we don't add it again 1870 * so that we don't add it again
1869 */ 1871 */
1870 --rq->nr_phys_segments; 1872 --rq->nr_phys_segments;
1871 } 1873 }
1872 1874
1873 rq = NULL; 1875 rq = NULL;
1874 break; 1876 break;
1875 } else if (ret == BLKPREP_KILL) { 1877 } else if (ret == BLKPREP_KILL) {
1876 rq->cmd_flags |= REQ_QUIET; 1878 rq->cmd_flags |= REQ_QUIET;
1877 /* 1879 /*
1878 * Mark this request as started so we don't trigger 1880 * Mark this request as started so we don't trigger
1879 * any debug logic in the end I/O path. 1881 * any debug logic in the end I/O path.
1880 */ 1882 */
1881 blk_start_request(rq); 1883 blk_start_request(rq);
1882 __blk_end_request_all(rq, -EIO); 1884 __blk_end_request_all(rq, -EIO);
1883 } else { 1885 } else {
1884 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); 1886 printk(KERN_ERR "%s: bad return=%d\n", __func__, ret);
1885 break; 1887 break;
1886 } 1888 }
1887 } 1889 }
1888 1890
1889 return rq; 1891 return rq;
1890 } 1892 }
1891 EXPORT_SYMBOL(blk_peek_request); 1893 EXPORT_SYMBOL(blk_peek_request);
1892 1894
1893 void blk_dequeue_request(struct request *rq) 1895 void blk_dequeue_request(struct request *rq)
1894 { 1896 {
1895 struct request_queue *q = rq->q; 1897 struct request_queue *q = rq->q;
1896 1898
1897 BUG_ON(list_empty(&rq->queuelist)); 1899 BUG_ON(list_empty(&rq->queuelist));
1898 BUG_ON(ELV_ON_HASH(rq)); 1900 BUG_ON(ELV_ON_HASH(rq));
1899 1901
1900 list_del_init(&rq->queuelist); 1902 list_del_init(&rq->queuelist);
1901 1903
1902 /* 1904 /*
1903 * the time frame between a request being removed from the lists 1905 * the time frame between a request being removed from the lists
1904 * and when it is freed is accounted as I/O that is in progress at 1906 * and when it is freed is accounted as I/O that is in progress at
1905 * the driver side. 1907 * the driver side.
1906 */ 1908 */
1907 if (blk_account_rq(rq)) { 1909 if (blk_account_rq(rq)) {
1908 q->in_flight[rq_is_sync(rq)]++; 1910 q->in_flight[rq_is_sync(rq)]++;
1909 set_io_start_time_ns(rq); 1911 set_io_start_time_ns(rq);
1910 } 1912 }
1911 } 1913 }
1912 1914
1913 /** 1915 /**
1914 * blk_start_request - start request processing on the driver 1916 * blk_start_request - start request processing on the driver
1915 * @req: request to dequeue 1917 * @req: request to dequeue
1916 * 1918 *
1917 * Description: 1919 * Description:
1918 * Dequeue @req and start timeout timer on it. This hands off the 1920 * Dequeue @req and start timeout timer on it. This hands off the
1919 * request to the driver. 1921 * request to the driver.
1920 * 1922 *
1921 * Block internal functions which don't want to start timer should 1923 * Block internal functions which don't want to start timer should
1922 * call blk_dequeue_request(). 1924 * call blk_dequeue_request().
1923 * 1925 *
1924 * Context: 1926 * Context:
1925 * queue_lock must be held. 1927 * queue_lock must be held.
1926 */ 1928 */
1927 void blk_start_request(struct request *req) 1929 void blk_start_request(struct request *req)
1928 { 1930 {
1929 blk_dequeue_request(req); 1931 blk_dequeue_request(req);
1930 1932
1931 /* 1933 /*
1932 * We are now handing the request to the hardware, initialize 1934 * We are now handing the request to the hardware, initialize
1933 * resid_len to full count and add the timeout handler. 1935 * resid_len to full count and add the timeout handler.
1934 */ 1936 */
1935 req->resid_len = blk_rq_bytes(req); 1937 req->resid_len = blk_rq_bytes(req);
1936 if (unlikely(blk_bidi_rq(req))) 1938 if (unlikely(blk_bidi_rq(req)))
1937 req->next_rq->resid_len = blk_rq_bytes(req->next_rq); 1939 req->next_rq->resid_len = blk_rq_bytes(req->next_rq);
1938 1940
1939 blk_add_timer(req); 1941 blk_add_timer(req);
1940 } 1942 }
1941 EXPORT_SYMBOL(blk_start_request); 1943 EXPORT_SYMBOL(blk_start_request);
1942 1944
1943 /** 1945 /**
1944 * blk_fetch_request - fetch a request from a request queue 1946 * blk_fetch_request - fetch a request from a request queue
1945 * @q: request queue to fetch a request from 1947 * @q: request queue to fetch a request from
1946 * 1948 *
1947 * Description: 1949 * Description:
1948 * Return the request at the top of @q. The request is started on 1950 * Return the request at the top of @q. The request is started on
1949 * return and LLD can start processing it immediately. 1951 * return and LLD can start processing it immediately.
1950 * 1952 *
1951 * Return: 1953 * Return:
1952 * Pointer to the request at the top of @q if available. Null 1954 * Pointer to the request at the top of @q if available. Null
1953 * otherwise. 1955 * otherwise.
1954 * 1956 *
1955 * Context: 1957 * Context:
1956 * queue_lock must be held. 1958 * queue_lock must be held.
1957 */ 1959 */
1958 struct request *blk_fetch_request(struct request_queue *q) 1960 struct request *blk_fetch_request(struct request_queue *q)
1959 { 1961 {
1960 struct request *rq; 1962 struct request *rq;
1961 1963
1962 rq = blk_peek_request(q); 1964 rq = blk_peek_request(q);
1963 if (rq) 1965 if (rq)
1964 blk_start_request(rq); 1966 blk_start_request(rq);
1965 return rq; 1967 return rq;
1966 } 1968 }
1967 EXPORT_SYMBOL(blk_fetch_request); 1969 EXPORT_SYMBOL(blk_fetch_request);
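Together, blk_peek_request(), blk_start_request() and blk_fetch_request() form the usual request_fn loop. A minimal, hypothetical strategy function is sketched below (my_transfer is made up and performs the transfer synchronously for simplicity); request_fn is entered with queue_lock held, hence the __-prefixed completion helper.

	/* Hypothetical request_fn: drain the queue, completing each request. */
	static void my_request_fn(struct request_queue *q)
	{
		struct request *rq;

		while ((rq = blk_fetch_request(q)) != NULL) {
			int error = my_transfer(rq);	/* hypothetical hardware I/O */

			__blk_end_request_all(rq, error);	/* queue_lock still held */
		}
	}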
1968 1970
1969 /** 1971 /**
1970 * blk_update_request - Special helper function for request stacking drivers 1972 * blk_update_request - Special helper function for request stacking drivers
1971 * @req: the request being processed 1973 * @req: the request being processed
1972 * @error: %0 for success, < %0 for error 1974 * @error: %0 for success, < %0 for error
1973 * @nr_bytes: number of bytes to complete @req 1975 * @nr_bytes: number of bytes to complete @req
1974 * 1976 *
1975 * Description: 1977 * Description:
1976 * Ends I/O on a number of bytes attached to @req, but doesn't complete 1978 * Ends I/O on a number of bytes attached to @req, but doesn't complete
1977 * the request structure even if @req doesn't have leftover. 1979 * the request structure even if @req doesn't have leftover.
1978 * If @req has leftover, sets it up for the next range of segments. 1980 * If @req has leftover, sets it up for the next range of segments.
1979 * 1981 *
1980 * This special helper function is only for request stacking drivers 1982 * This special helper function is only for request stacking drivers
1981 * (e.g. request-based dm) so that they can handle partial completion. 1983 * (e.g. request-based dm) so that they can handle partial completion.
1982 * Actual device drivers should use blk_end_request instead. 1984 * Actual device drivers should use blk_end_request instead.
1983 * 1985 *
1984 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 1986 * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
1985 * %false return from this function. 1987 * %false return from this function.
1986 * 1988 *
1987 * Return: 1989 * Return:
1988 * %false - this request doesn't have any more data 1990 * %false - this request doesn't have any more data
1989 * %true - this request has more data 1991 * %true - this request has more data
1990 **/ 1992 **/
1991 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) 1993 bool blk_update_request(struct request *req, int error, unsigned int nr_bytes)
1992 { 1994 {
1993 int total_bytes, bio_nbytes, next_idx = 0; 1995 int total_bytes, bio_nbytes, next_idx = 0;
1994 struct bio *bio; 1996 struct bio *bio;
1995 1997
1996 if (!req->bio) 1998 if (!req->bio)
1997 return false; 1999 return false;
1998 2000
1999 trace_block_rq_complete(req->q, req); 2001 trace_block_rq_complete(req->q, req);
2000 2002
2001 /* 2003 /*
2002 * For fs requests, rq is just a carrier of independent bios 2004 * For fs requests, rq is just a carrier of independent bios
2003 * and each partial completion should be handled separately. 2005 * and each partial completion should be handled separately.
2004 * Reset per-request error on each partial completion. 2006 * Reset per-request error on each partial completion.
2005 * 2007 *
2006 * TODO: tj: This is too subtle. It would be better to let 2008 * TODO: tj: This is too subtle. It would be better to let
2007 * low level drivers do what they see fit. 2009 * low level drivers do what they see fit.
2008 */ 2010 */
2009 if (req->cmd_type == REQ_TYPE_FS) 2011 if (req->cmd_type == REQ_TYPE_FS)
2010 req->errors = 0; 2012 req->errors = 0;
2011 2013
2012 if (error && req->cmd_type == REQ_TYPE_FS && 2014 if (error && req->cmd_type == REQ_TYPE_FS &&
2013 !(req->cmd_flags & REQ_QUIET)) { 2015 !(req->cmd_flags & REQ_QUIET)) {
2014 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", 2016 printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n",
2015 req->rq_disk ? req->rq_disk->disk_name : "?", 2017 req->rq_disk ? req->rq_disk->disk_name : "?",
2016 (unsigned long long)blk_rq_pos(req)); 2018 (unsigned long long)blk_rq_pos(req));
2017 } 2019 }
2018 2020
2019 blk_account_io_completion(req, nr_bytes); 2021 blk_account_io_completion(req, nr_bytes);
2020 2022
2021 total_bytes = bio_nbytes = 0; 2023 total_bytes = bio_nbytes = 0;
2022 while ((bio = req->bio) != NULL) { 2024 while ((bio = req->bio) != NULL) {
2023 int nbytes; 2025 int nbytes;
2024 2026
2025 if (nr_bytes >= bio->bi_size) { 2027 if (nr_bytes >= bio->bi_size) {
2026 req->bio = bio->bi_next; 2028 req->bio = bio->bi_next;
2027 nbytes = bio->bi_size; 2029 nbytes = bio->bi_size;
2028 req_bio_endio(req, bio, nbytes, error); 2030 req_bio_endio(req, bio, nbytes, error);
2029 next_idx = 0; 2031 next_idx = 0;
2030 bio_nbytes = 0; 2032 bio_nbytes = 0;
2031 } else { 2033 } else {
2032 int idx = bio->bi_idx + next_idx; 2034 int idx = bio->bi_idx + next_idx;
2033 2035
2034 if (unlikely(idx >= bio->bi_vcnt)) { 2036 if (unlikely(idx >= bio->bi_vcnt)) {
2035 blk_dump_rq_flags(req, "__end_that"); 2037 blk_dump_rq_flags(req, "__end_that");
2036 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n", 2038 printk(KERN_ERR "%s: bio idx %d >= vcnt %d\n",
2037 __func__, idx, bio->bi_vcnt); 2039 __func__, idx, bio->bi_vcnt);
2038 break; 2040 break;
2039 } 2041 }
2040 2042
2041 nbytes = bio_iovec_idx(bio, idx)->bv_len; 2043 nbytes = bio_iovec_idx(bio, idx)->bv_len;
2042 BIO_BUG_ON(nbytes > bio->bi_size); 2044 BIO_BUG_ON(nbytes > bio->bi_size);
2043 2045
2044 /* 2046 /*
2045 * not a complete bvec done 2047 * not a complete bvec done
2046 */ 2048 */
2047 if (unlikely(nbytes > nr_bytes)) { 2049 if (unlikely(nbytes > nr_bytes)) {
2048 bio_nbytes += nr_bytes; 2050 bio_nbytes += nr_bytes;
2049 total_bytes += nr_bytes; 2051 total_bytes += nr_bytes;
2050 break; 2052 break;
2051 } 2053 }
2052 2054
2053 /* 2055 /*
2054 * advance to the next vector 2056 * advance to the next vector
2055 */ 2057 */
2056 next_idx++; 2058 next_idx++;
2057 bio_nbytes += nbytes; 2059 bio_nbytes += nbytes;
2058 } 2060 }
2059 2061
2060 total_bytes += nbytes; 2062 total_bytes += nbytes;
2061 nr_bytes -= nbytes; 2063 nr_bytes -= nbytes;
2062 2064
2063 bio = req->bio; 2065 bio = req->bio;
2064 if (bio) { 2066 if (bio) {
2065 /* 2067 /*
2066 * end more in this run, or just return 'not-done' 2068 * end more in this run, or just return 'not-done'
2067 */ 2069 */
2068 if (unlikely(nr_bytes <= 0)) 2070 if (unlikely(nr_bytes <= 0))
2069 break; 2071 break;
2070 } 2072 }
2071 } 2073 }
2072 2074
2073 /* 2075 /*
2074 * completely done 2076 * completely done
2075 */ 2077 */
2076 if (!req->bio) { 2078 if (!req->bio) {
2077 /* 2079 /*
2078 * Reset counters so that the request stacking driver 2080 * Reset counters so that the request stacking driver
2079 * can find how many bytes remain in the request 2081 * can find how many bytes remain in the request
2080 * later. 2082 * later.
2081 */ 2083 */
2082 req->__data_len = 0; 2084 req->__data_len = 0;
2083 return false; 2085 return false;
2084 } 2086 }
2085 2087
2086 /* 2088 /*
2087 * if the request wasn't completed, update state 2089 * if the request wasn't completed, update state
2088 */ 2090 */
2089 if (bio_nbytes) { 2091 if (bio_nbytes) {
2090 req_bio_endio(req, bio, bio_nbytes, error); 2092 req_bio_endio(req, bio, bio_nbytes, error);
2091 bio->bi_idx += next_idx; 2093 bio->bi_idx += next_idx;
2092 bio_iovec(bio)->bv_offset += nr_bytes; 2094 bio_iovec(bio)->bv_offset += nr_bytes;
2093 bio_iovec(bio)->bv_len -= nr_bytes; 2095 bio_iovec(bio)->bv_len -= nr_bytes;
2094 } 2096 }
2095 2097
2096 req->__data_len -= total_bytes; 2098 req->__data_len -= total_bytes;
2097 req->buffer = bio_data(req->bio); 2099 req->buffer = bio_data(req->bio);
2098 2100
2099 /* update sector only for requests with clear definition of sector */ 2101 /* update sector only for requests with clear definition of sector */
2100 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD)) 2102 if (req->cmd_type == REQ_TYPE_FS || (req->cmd_flags & REQ_DISCARD))
2101 req->__sector += total_bytes >> 9; 2103 req->__sector += total_bytes >> 9;
2102 2104
2103 /* mixed attributes always follow the first bio */ 2105 /* mixed attributes always follow the first bio */
2104 if (req->cmd_flags & REQ_MIXED_MERGE) { 2106 if (req->cmd_flags & REQ_MIXED_MERGE) {
2105 req->cmd_flags &= ~REQ_FAILFAST_MASK; 2107 req->cmd_flags &= ~REQ_FAILFAST_MASK;
2106 req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK; 2108 req->cmd_flags |= req->bio->bi_rw & REQ_FAILFAST_MASK;
2107 } 2109 }
2108 2110
2109 /* 2111 /*
2110 * If total number of sectors is less than the first segment 2112 * If total number of sectors is less than the first segment
2111 * size, something has gone terribly wrong. 2113 * size, something has gone terribly wrong.
2112 */ 2114 */
2113 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 2115 if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
2114 printk(KERN_ERR "blk: request botched\n"); 2116 printk(KERN_ERR "blk: request botched\n");
2115 req->__data_len = blk_rq_cur_bytes(req); 2117 req->__data_len = blk_rq_cur_bytes(req);
2116 } 2118 }
2117 2119
2118 /* recalculate the number of segments */ 2120 /* recalculate the number of segments */
2119 blk_recalc_rq_segments(req); 2121 blk_recalc_rq_segments(req);
2120 2122
2121 return true; 2123 return true;
2122 } 2124 }
2123 EXPORT_SYMBOL_GPL(blk_update_request); 2125 EXPORT_SYMBOL_GPL(blk_update_request);
2124 2126
2125 static bool blk_update_bidi_request(struct request *rq, int error, 2127 static bool blk_update_bidi_request(struct request *rq, int error,
2126 unsigned int nr_bytes, 2128 unsigned int nr_bytes,
2127 unsigned int bidi_bytes) 2129 unsigned int bidi_bytes)
2128 { 2130 {
2129 if (blk_update_request(rq, error, nr_bytes)) 2131 if (blk_update_request(rq, error, nr_bytes))
2130 return true; 2132 return true;
2131 2133
2132 /* Bidi request must be completed as a whole */ 2134 /* Bidi request must be completed as a whole */
2133 if (unlikely(blk_bidi_rq(rq)) && 2135 if (unlikely(blk_bidi_rq(rq)) &&
2134 blk_update_request(rq->next_rq, error, bidi_bytes)) 2136 blk_update_request(rq->next_rq, error, bidi_bytes))
2135 return true; 2137 return true;
2136 2138
2137 if (blk_queue_add_random(rq->q)) 2139 if (blk_queue_add_random(rq->q))
2138 add_disk_randomness(rq->rq_disk); 2140 add_disk_randomness(rq->rq_disk);
2139 2141
2140 return false; 2142 return false;
2141 } 2143 }
2142 2144
2143 /** 2145 /**
2144 * blk_unprep_request - unprepare a request 2146 * blk_unprep_request - unprepare a request
2145 * @req: the request 2147 * @req: the request
2146 * 2148 *
2147 * This function makes a request ready for complete resubmission (or 2149 * This function makes a request ready for complete resubmission (or
2148 * completion). It happens only after all error handling is complete, 2150 * completion). It happens only after all error handling is complete,
2149 * so represents the appropriate moment to deallocate any resources 2151 * so represents the appropriate moment to deallocate any resources
2150 * that were allocated to the request in the prep_rq_fn. The queue 2152 * that were allocated to the request in the prep_rq_fn. The queue
2151 * lock is held when calling this. 2153 * lock is held when calling this.
2152 */ 2154 */
2153 void blk_unprep_request(struct request *req) 2155 void blk_unprep_request(struct request *req)
2154 { 2156 {
2155 struct request_queue *q = req->q; 2157 struct request_queue *q = req->q;
2156 2158
2157 req->cmd_flags &= ~REQ_DONTPREP; 2159 req->cmd_flags &= ~REQ_DONTPREP;
2158 if (q->unprep_rq_fn) 2160 if (q->unprep_rq_fn)
2159 q->unprep_rq_fn(q, req); 2161 q->unprep_rq_fn(q, req);
2160 } 2162 }
2161 EXPORT_SYMBOL_GPL(blk_unprep_request); 2163 EXPORT_SYMBOL_GPL(blk_unprep_request);
2162 2164
2163 /* 2165 /*
2164 * queue lock must be held 2166 * queue lock must be held
2165 */ 2167 */
2166 static void blk_finish_request(struct request *req, int error) 2168 static void blk_finish_request(struct request *req, int error)
2167 { 2169 {
2168 if (blk_rq_tagged(req)) 2170 if (blk_rq_tagged(req))
2169 blk_queue_end_tag(req->q, req); 2171 blk_queue_end_tag(req->q, req);
2170 2172
2171 BUG_ON(blk_queued_rq(req)); 2173 BUG_ON(blk_queued_rq(req));
2172 2174
2173 if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) 2175 if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS)
2174 laptop_io_completion(&req->q->backing_dev_info); 2176 laptop_io_completion(&req->q->backing_dev_info);
2175 2177
2176 blk_delete_timer(req); 2178 blk_delete_timer(req);
2177 2179
2178 if (req->cmd_flags & REQ_DONTPREP) 2180 if (req->cmd_flags & REQ_DONTPREP)
2179 blk_unprep_request(req); 2181 blk_unprep_request(req);
2180 2182
2181 2183
2182 blk_account_io_done(req); 2184 blk_account_io_done(req);
2183 2185
2184 if (req->end_io) 2186 if (req->end_io)
2185 req->end_io(req, error); 2187 req->end_io(req, error);
2186 else { 2188 else {
2187 if (blk_bidi_rq(req)) 2189 if (blk_bidi_rq(req))
2188 __blk_put_request(req->next_rq->q, req->next_rq); 2190 __blk_put_request(req->next_rq->q, req->next_rq);
2189 2191
2190 __blk_put_request(req->q, req); 2192 __blk_put_request(req->q, req);
2191 } 2193 }
2192 } 2194 }
2193 2195
2194 /** 2196 /**
2195 * blk_end_bidi_request - Complete a bidi request 2197 * blk_end_bidi_request - Complete a bidi request
2196 * @rq: the request to complete 2198 * @rq: the request to complete
2197 * @error: %0 for success, < %0 for error 2199 * @error: %0 for success, < %0 for error
2198 * @nr_bytes: number of bytes to complete @rq 2200 * @nr_bytes: number of bytes to complete @rq
2199 * @bidi_bytes: number of bytes to complete @rq->next_rq 2201 * @bidi_bytes: number of bytes to complete @rq->next_rq
2200 * 2202 *
2201 * Description: 2203 * Description:
2202 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. 2204 * Ends I/O on a number of bytes attached to @rq and @rq->next_rq.
2203 * Drivers that supports bidi can safely call this member for any 2205 * Drivers that supports bidi can safely call this member for any
2204 * type of request, bidi or uni. In the later case @bidi_bytes is 2206 * type of request, bidi or uni. In the later case @bidi_bytes is
2205 * just ignored. 2207 * just ignored.
2206 * 2208 *
2207 * Return: 2209 * Return:
2208 * %false - we are done with this request 2210 * %false - we are done with this request
2209 * %true - still buffers pending for this request 2211 * %true - still buffers pending for this request
2210 **/ 2212 **/
2211 static bool blk_end_bidi_request(struct request *rq, int error, 2213 static bool blk_end_bidi_request(struct request *rq, int error,
2212 unsigned int nr_bytes, unsigned int bidi_bytes) 2214 unsigned int nr_bytes, unsigned int bidi_bytes)
2213 { 2215 {
2214 struct request_queue *q = rq->q; 2216 struct request_queue *q = rq->q;
2215 unsigned long flags; 2217 unsigned long flags;
2216 2218
2217 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2219 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2218 return true; 2220 return true;
2219 2221
2220 spin_lock_irqsave(q->queue_lock, flags); 2222 spin_lock_irqsave(q->queue_lock, flags);
2221 blk_finish_request(rq, error); 2223 blk_finish_request(rq, error);
2222 spin_unlock_irqrestore(q->queue_lock, flags); 2224 spin_unlock_irqrestore(q->queue_lock, flags);
2223 2225
2224 return false; 2226 return false;
2225 } 2227 }
2226 2228
2227 /** 2229 /**
2228 * __blk_end_bidi_request - Complete a bidi request with queue lock held 2230 * __blk_end_bidi_request - Complete a bidi request with queue lock held
2229 * @rq: the request to complete 2231 * @rq: the request to complete
2230 * @error: %0 for success, < %0 for error 2232 * @error: %0 for success, < %0 for error
2231 * @nr_bytes: number of bytes to complete @rq 2233 * @nr_bytes: number of bytes to complete @rq
2232 * @bidi_bytes: number of bytes to complete @rq->next_rq 2234 * @bidi_bytes: number of bytes to complete @rq->next_rq
2233 * 2235 *
2234 * Description: 2236 * Description:
2235 * Identical to blk_end_bidi_request() except that queue lock is 2237 * Identical to blk_end_bidi_request() except that queue lock is
2236 * assumed to be locked on entry and remains so on return. 2238 * assumed to be locked on entry and remains so on return.
2237 * 2239 *
2238 * Return: 2240 * Return:
2239 * %false - we are done with this request 2241 * %false - we are done with this request
2240 * %true - still buffers pending for this request 2242 * %true - still buffers pending for this request
2241 **/ 2243 **/
2242 static bool __blk_end_bidi_request(struct request *rq, int error, 2244 static bool __blk_end_bidi_request(struct request *rq, int error,
2243 unsigned int nr_bytes, unsigned int bidi_bytes) 2245 unsigned int nr_bytes, unsigned int bidi_bytes)
2244 { 2246 {
2245 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) 2247 if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes))
2246 return true; 2248 return true;
2247 2249
2248 blk_finish_request(rq, error); 2250 blk_finish_request(rq, error);
2249 2251
2250 return false; 2252 return false;
2251 } 2253 }
2252 2254
2253 /** 2255 /**
2254 * blk_end_request - Helper function for drivers to complete the request. 2256 * blk_end_request - Helper function for drivers to complete the request.
2255 * @rq: the request being processed 2257 * @rq: the request being processed
2256 * @error: %0 for success, < %0 for error 2258 * @error: %0 for success, < %0 for error
2257 * @nr_bytes: number of bytes to complete 2259 * @nr_bytes: number of bytes to complete
2258 * 2260 *
2259 * Description: 2261 * Description:
2260 * Ends I/O on a number of bytes attached to @rq. 2262 * Ends I/O on a number of bytes attached to @rq.
2261 * If @rq has leftover, sets it up for the next range of segments. 2263 * If @rq has leftover, sets it up for the next range of segments.
2262 * 2264 *
2263 * Return: 2265 * Return:
2264 * %false - we are done with this request 2266 * %false - we are done with this request
2265 * %true - still buffers pending for this request 2267 * %true - still buffers pending for this request
2266 **/ 2268 **/
2267 bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 2269 bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2268 { 2270 {
2269 return blk_end_bidi_request(rq, error, nr_bytes, 0); 2271 return blk_end_bidi_request(rq, error, nr_bytes, 0);
2270 } 2272 }
2271 EXPORT_SYMBOL(blk_end_request); 2273 EXPORT_SYMBOL(blk_end_request);
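Drivers that complete a request piece by piece (for example one DMA segment per interrupt) keep calling blk_end_request() until it returns %false. A hedged sketch (my_complete_chunk and my_issue_next_chunk are hypothetical):

	/* Hypothetical per-chunk completion; 'done' bytes finished this round. */
	static void my_complete_chunk(struct request *rq, int error, unsigned int done)
	{
		if (!blk_end_request(rq, error, done))
			return;		/* request fully completed and freed */

		/* leftover remains; rq now describes the next range of segments */
		my_issue_next_chunk(rq);
	}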
2272 2274
2273 /** 2275 /**
2274 * blk_end_request_all - Helper function for drivers to finish the request. 2276 * blk_end_request_all - Helper function for drivers to finish the request.
2275 * @rq: the request to finish 2277 * @rq: the request to finish
2276 * @error: %0 for success, < %0 for error 2278 * @error: %0 for success, < %0 for error
2277 * 2279 *
2278 * Description: 2280 * Description:
2279 * Completely finish @rq. 2281 * Completely finish @rq.
2280 */ 2282 */
2281 void blk_end_request_all(struct request *rq, int error) 2283 void blk_end_request_all(struct request *rq, int error)
2282 { 2284 {
2283 bool pending; 2285 bool pending;
2284 unsigned int bidi_bytes = 0; 2286 unsigned int bidi_bytes = 0;
2285 2287
2286 if (unlikely(blk_bidi_rq(rq))) 2288 if (unlikely(blk_bidi_rq(rq)))
2287 bidi_bytes = blk_rq_bytes(rq->next_rq); 2289 bidi_bytes = blk_rq_bytes(rq->next_rq);
2288 2290
2289 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2291 pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2290 BUG_ON(pending); 2292 BUG_ON(pending);
2291 } 2293 }
2292 EXPORT_SYMBOL(blk_end_request_all); 2294 EXPORT_SYMBOL(blk_end_request_all);
2293 2295
2294 /** 2296 /**
2295 * blk_end_request_cur - Helper function to finish the current request chunk. 2297 * blk_end_request_cur - Helper function to finish the current request chunk.
2296 * @rq: the request to finish the current chunk for 2298 * @rq: the request to finish the current chunk for
2297 * @error: %0 for success, < %0 for error 2299 * @error: %0 for success, < %0 for error
2298 * 2300 *
2299 * Description: 2301 * Description:
2300 * Complete the current consecutively mapped chunk from @rq. 2302 * Complete the current consecutively mapped chunk from @rq.
2301 * 2303 *
2302 * Return: 2304 * Return:
2303 * %false - we are done with this request 2305 * %false - we are done with this request
2304 * %true - still buffers pending for this request 2306 * %true - still buffers pending for this request
2305 */ 2307 */
2306 bool blk_end_request_cur(struct request *rq, int error) 2308 bool blk_end_request_cur(struct request *rq, int error)
2307 { 2309 {
2308 return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2310 return blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2309 } 2311 }
2310 EXPORT_SYMBOL(blk_end_request_cur); 2312 EXPORT_SYMBOL(blk_end_request_cur);
2311 2313
2312 /** 2314 /**
2313 * blk_end_request_err - Finish a request till the next failure boundary. 2315 * blk_end_request_err - Finish a request till the next failure boundary.
2314 * @rq: the request to finish till the next failure boundary for 2316 * @rq: the request to finish till the next failure boundary for
2315 * @error: must be negative errno 2317 * @error: must be negative errno
2316 * 2318 *
2317 * Description: 2319 * Description:
2318 * Complete @rq till the next failure boundary. 2320 * Complete @rq till the next failure boundary.
2319 * 2321 *
2320 * Return: 2322 * Return:
2321 * %false - we are done with this request 2323 * %false - we are done with this request
2322 * %true - still buffers pending for this request 2324 * %true - still buffers pending for this request
2323 */ 2325 */
2324 bool blk_end_request_err(struct request *rq, int error) 2326 bool blk_end_request_err(struct request *rq, int error)
2325 { 2327 {
2326 WARN_ON(error >= 0); 2328 WARN_ON(error >= 0);
2327 return blk_end_request(rq, error, blk_rq_err_bytes(rq)); 2329 return blk_end_request(rq, error, blk_rq_err_bytes(rq));
2328 } 2330 }
2329 EXPORT_SYMBOL_GPL(blk_end_request_err); 2331 EXPORT_SYMBOL_GPL(blk_end_request_err);
2330 2332
2331 /** 2333 /**
2332 * __blk_end_request - Helper function for drivers to complete the request. 2334 * __blk_end_request - Helper function for drivers to complete the request.
2333 * @rq: the request being processed 2335 * @rq: the request being processed
2334 * @error: %0 for success, < %0 for error 2336 * @error: %0 for success, < %0 for error
2335 * @nr_bytes: number of bytes to complete 2337 * @nr_bytes: number of bytes to complete
2336 * 2338 *
2337 * Description: 2339 * Description:
2338 * Must be called with queue lock held unlike blk_end_request(). 2340 * Must be called with queue lock held unlike blk_end_request().
2339 * 2341 *
2340 * Return: 2342 * Return:
2341 * %false - we are done with this request 2343 * %false - we are done with this request
2342 * %true - still buffers pending for this request 2344 * %true - still buffers pending for this request
2343 **/ 2345 **/
2344 bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) 2346 bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes)
2345 { 2347 {
2346 return __blk_end_bidi_request(rq, error, nr_bytes, 0); 2348 return __blk_end_bidi_request(rq, error, nr_bytes, 0);
2347 } 2349 }
2348 EXPORT_SYMBOL(__blk_end_request); 2350 EXPORT_SYMBOL(__blk_end_request);
2349 2351
2350 /** 2352 /**
2351 * __blk_end_request_all - Helper function for drivers to finish the request. 2353 * __blk_end_request_all - Helper function for drivers to finish the request.
2352 * @rq: the request to finish 2354 * @rq: the request to finish
2353 * @error: %0 for success, < %0 for error 2355 * @error: %0 for success, < %0 for error
2354 * 2356 *
2355 * Description: 2357 * Description:
2356 * Completely finish @rq. Must be called with queue lock held. 2358 * Completely finish @rq. Must be called with queue lock held.
2357 */ 2359 */
2358 void __blk_end_request_all(struct request *rq, int error) 2360 void __blk_end_request_all(struct request *rq, int error)
2359 { 2361 {
2360 bool pending; 2362 bool pending;
2361 unsigned int bidi_bytes = 0; 2363 unsigned int bidi_bytes = 0;
2362 2364
2363 if (unlikely(blk_bidi_rq(rq))) 2365 if (unlikely(blk_bidi_rq(rq)))
2364 bidi_bytes = blk_rq_bytes(rq->next_rq); 2366 bidi_bytes = blk_rq_bytes(rq->next_rq);
2365 2367
2366 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); 2368 pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes);
2367 BUG_ON(pending); 2369 BUG_ON(pending);
2368 } 2370 }
2369 EXPORT_SYMBOL(__blk_end_request_all); 2371 EXPORT_SYMBOL(__blk_end_request_all);
2370 2372
2371 /** 2373 /**
2372 * __blk_end_request_cur - Helper function to finish the current request chunk. 2374 * __blk_end_request_cur - Helper function to finish the current request chunk.
2373 * @rq: the request to finish the current chunk for 2375 * @rq: the request to finish the current chunk for
2374 * @error: %0 for success, < %0 for error 2376 * @error: %0 for success, < %0 for error
2375 * 2377 *
2376 * Description: 2378 * Description:
2377 * Complete the current consecutively mapped chunk from @rq. Must 2379 * Complete the current consecutively mapped chunk from @rq. Must
2378 * be called with queue lock held. 2380 * be called with queue lock held.
2379 * 2381 *
2380 * Return: 2382 * Return:
2381 * %false - we are done with this request 2383 * %false - we are done with this request
2382 * %true - still buffers pending for this request 2384 * %true - still buffers pending for this request
2383 */ 2385 */
2384 bool __blk_end_request_cur(struct request *rq, int error) 2386 bool __blk_end_request_cur(struct request *rq, int error)
2385 { 2387 {
2386 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); 2388 return __blk_end_request(rq, error, blk_rq_cur_bytes(rq));
2387 } 2389 }
2388 EXPORT_SYMBOL(__blk_end_request_cur); 2390 EXPORT_SYMBOL(__blk_end_request_cur);
2389 2391
2390 /** 2392 /**
2391 * __blk_end_request_err - Finish a request till the next failure boundary. 2393 * __blk_end_request_err - Finish a request till the next failure boundary.
2392 * @rq: the request to finish till the next failure boundary for 2394 * @rq: the request to finish till the next failure boundary for
2393 * @error: must be negative errno 2395 * @error: must be negative errno
2394 * 2396 *
2395 * Description: 2397 * Description:
2396 * Complete @rq till the next failure boundary. Must be called 2398 * Complete @rq till the next failure boundary. Must be called
2397 * with queue lock held. 2399 * with queue lock held.
2398 * 2400 *
2399 * Return: 2401 * Return:
2400 * %false - we are done with this request 2402 * %false - we are done with this request
2401 * %true - still buffers pending for this request 2403 * %true - still buffers pending for this request
2402 */ 2404 */
2403 bool __blk_end_request_err(struct request *rq, int error) 2405 bool __blk_end_request_err(struct request *rq, int error)
2404 { 2406 {
2405 WARN_ON(error >= 0); 2407 WARN_ON(error >= 0);
2406 return __blk_end_request(rq, error, blk_rq_err_bytes(rq)); 2408 return __blk_end_request(rq, error, blk_rq_err_bytes(rq));
2407 } 2409 }
2408 EXPORT_SYMBOL_GPL(__blk_end_request_err); 2410 EXPORT_SYMBOL_GPL(__blk_end_request_err);
2409 2411
2410 void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 2412 void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
2411 struct bio *bio) 2413 struct bio *bio)
2412 { 2414 {
2413 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */ 2415 /* Bit 0 (R/W) is identical in rq->cmd_flags and bio->bi_rw */
2414 rq->cmd_flags |= bio->bi_rw & REQ_WRITE; 2416 rq->cmd_flags |= bio->bi_rw & REQ_WRITE;
2415 2417
2416 if (bio_has_data(bio)) { 2418 if (bio_has_data(bio)) {
2417 rq->nr_phys_segments = bio_phys_segments(q, bio); 2419 rq->nr_phys_segments = bio_phys_segments(q, bio);
2418 rq->buffer = bio_data(bio); 2420 rq->buffer = bio_data(bio);
2419 } 2421 }
2420 rq->__data_len = bio->bi_size; 2422 rq->__data_len = bio->bi_size;
2421 rq->bio = rq->biotail = bio; 2423 rq->bio = rq->biotail = bio;
2422 2424
2423 if (bio->bi_bdev) 2425 if (bio->bi_bdev)
2424 rq->rq_disk = bio->bi_bdev->bd_disk; 2426 rq->rq_disk = bio->bi_bdev->bd_disk;
2425 } 2427 }
2426 2428
2427 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 2429 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
2428 /** 2430 /**
2429 * rq_flush_dcache_pages - Helper function to flush all pages in a request 2431 * rq_flush_dcache_pages - Helper function to flush all pages in a request
2430 * @rq: the request to be flushed 2432 * @rq: the request to be flushed
2431 * 2433 *
2432 * Description: 2434 * Description:
2433 * Flush all pages in @rq. 2435 * Flush all pages in @rq.
2434 */ 2436 */
2435 void rq_flush_dcache_pages(struct request *rq) 2437 void rq_flush_dcache_pages(struct request *rq)
2436 { 2438 {
2437 struct req_iterator iter; 2439 struct req_iterator iter;
2438 struct bio_vec *bvec; 2440 struct bio_vec *bvec;
2439 2441
2440 rq_for_each_segment(bvec, rq, iter) 2442 rq_for_each_segment(bvec, rq, iter)
2441 flush_dcache_page(bvec->bv_page); 2443 flush_dcache_page(bvec->bv_page);
2442 } 2444 }
2443 EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); 2445 EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
2444 #endif 2446 #endif
2445 2447
2446 /** 2448 /**
2447 * blk_lld_busy - Check if underlying low-level drivers of a device are busy 2449 * blk_lld_busy - Check if underlying low-level drivers of a device are busy
2448 * @q : the queue of the device being checked 2450 * @q : the queue of the device being checked
2449 * 2451 *
2450 * Description: 2452 * Description:
2451 * Check if underlying low-level drivers of a device are busy. 2453 * Check if underlying low-level drivers of a device are busy.
2452 * If the drivers want to export their busy state, they must set their own 2454 * If the drivers want to export their busy state, they must set their own
2453 * exporting function using blk_queue_lld_busy() first. 2455 * exporting function using blk_queue_lld_busy() first.
2454 * 2456 *
2455 * Basically, this function is used only by request stacking drivers 2457 * Basically, this function is used only by request stacking drivers
2456 * to stop dispatching requests to underlying devices when underlying 2458 * to stop dispatching requests to underlying devices when underlying
2457 * devices are busy. This behavior helps more I/O merging on the queue 2459 * devices are busy. This behavior helps more I/O merging on the queue
2458 * of the request stacking driver and prevents I/O throughput regression 2460 * of the request stacking driver and prevents I/O throughput regression
2459 * on burst I/O load. 2461 * on burst I/O load.
2460 * 2462 *
2461 * Return: 2463 * Return:
2462 * 0 - Not busy (The request stacking driver should dispatch request) 2464 * 0 - Not busy (The request stacking driver should dispatch request)
2463 * 1 - Busy (The request stacking driver should stop dispatching request) 2465 * 1 - Busy (The request stacking driver should stop dispatching request)
2464 */ 2466 */
2465 int blk_lld_busy(struct request_queue *q) 2467 int blk_lld_busy(struct request_queue *q)
2466 { 2468 {
2467 if (q->lld_busy_fn) 2469 if (q->lld_busy_fn)
2468 return q->lld_busy_fn(q); 2470 return q->lld_busy_fn(q);
2469 2471
2470 return 0; 2472 return 0;
2471 } 2473 }
2472 EXPORT_SYMBOL_GPL(blk_lld_busy); 2474 EXPORT_SYMBOL_GPL(blk_lld_busy);
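A request stacking driver typically consults this as a dispatch gate, for example (illustrative sketch only; my_should_dispatch is a made-up name):

	/* Hypothetical dispatch gate in a request stacking driver. */
	static bool my_should_dispatch(struct request_queue *lower_q)
	{
		/* hold back while the underlying LLD reports itself busy */
		return !blk_lld_busy(lower_q);
	}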
2473 2475
2474 /** 2476 /**
2475 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request 2477 * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
2476 * @rq: the clone request to be cleaned up 2478 * @rq: the clone request to be cleaned up
2477 * 2479 *
2478 * Description: 2480 * Description:
2479 * Free all bios in @rq for a cloned request. 2481 * Free all bios in @rq for a cloned request.
2480 */ 2482 */
2481 void blk_rq_unprep_clone(struct request *rq) 2483 void blk_rq_unprep_clone(struct request *rq)
2482 { 2484 {
2483 struct bio *bio; 2485 struct bio *bio;
2484 2486
2485 while ((bio = rq->bio) != NULL) { 2487 while ((bio = rq->bio) != NULL) {
2486 rq->bio = bio->bi_next; 2488 rq->bio = bio->bi_next;
2487 2489
2488 bio_put(bio); 2490 bio_put(bio);
2489 } 2491 }
2490 } 2492 }
2491 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); 2493 EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
2492 2494
2493 /* 2495 /*
2494 * Copy attributes of the original request to the clone request. 2496 * Copy attributes of the original request to the clone request.
2495 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied. 2497 * The actual data parts (e.g. ->cmd, ->buffer, ->sense) are not copied.
2496 */ 2498 */
2497 static void __blk_rq_prep_clone(struct request *dst, struct request *src) 2499 static void __blk_rq_prep_clone(struct request *dst, struct request *src)
2498 { 2500 {
2499 dst->cpu = src->cpu; 2501 dst->cpu = src->cpu;
2500 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE); 2502 dst->cmd_flags = (rq_data_dir(src) | REQ_NOMERGE);
2501 if (src->cmd_flags & REQ_DISCARD) 2503 if (src->cmd_flags & REQ_DISCARD)
2502 dst->cmd_flags |= REQ_DISCARD; 2504 dst->cmd_flags |= REQ_DISCARD;
2503 dst->cmd_type = src->cmd_type; 2505 dst->cmd_type = src->cmd_type;
2504 dst->__sector = blk_rq_pos(src); 2506 dst->__sector = blk_rq_pos(src);
2505 dst->__data_len = blk_rq_bytes(src); 2507 dst->__data_len = blk_rq_bytes(src);
2506 dst->nr_phys_segments = src->nr_phys_segments; 2508 dst->nr_phys_segments = src->nr_phys_segments;
2507 dst->ioprio = src->ioprio; 2509 dst->ioprio = src->ioprio;
2508 dst->extra_len = src->extra_len; 2510 dst->extra_len = src->extra_len;
2509 } 2511 }
2510 2512
2511 /** 2513 /**
2512 * blk_rq_prep_clone - Helper function to setup clone request 2514 * blk_rq_prep_clone - Helper function to setup clone request
2513 * @rq: the request to be setup 2515 * @rq: the request to be setup
2514 * @rq_src: original request to be cloned 2516 * @rq_src: original request to be cloned
2515 * @bs: bio_set that bios for clone are allocated from 2517 * @bs: bio_set that bios for clone are allocated from
2516 * @gfp_mask: memory allocation mask for bio 2518 * @gfp_mask: memory allocation mask for bio
2517 * @bio_ctr: setup function to be called for each clone bio. 2519 * @bio_ctr: setup function to be called for each clone bio.
2518 * Returns %0 for success, non %0 for failure. 2520 * Returns %0 for success, non %0 for failure.
2519 * @data: private data to be passed to @bio_ctr 2521 * @data: private data to be passed to @bio_ctr
2520 * 2522 *
2521 * Description: 2523 * Description:
2522 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. 2524 * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
2523 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense) 2525 * The actual data parts of @rq_src (e.g. ->cmd, ->buffer, ->sense)
2524 * are not copied, and copying such parts is the caller's responsibility. 2526 * are not copied, and copying such parts is the caller's responsibility.
2525 * Also, pages which the original bios are pointing to are not copied 2527 * Also, pages which the original bios are pointing to are not copied
2526 * and the cloned bios just point to the same pages. 2528 * and the cloned bios just point to the same pages.
2527 * So cloned bios must be completed before original bios, which means 2529 * So cloned bios must be completed before original bios, which means
2528 * the caller must complete @rq before @rq_src. 2530 * the caller must complete @rq before @rq_src.
2529 */ 2531 */
2530 int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 2532 int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
2531 struct bio_set *bs, gfp_t gfp_mask, 2533 struct bio_set *bs, gfp_t gfp_mask,
2532 int (*bio_ctr)(struct bio *, struct bio *, void *), 2534 int (*bio_ctr)(struct bio *, struct bio *, void *),
2533 void *data) 2535 void *data)
2534 { 2536 {
2535 struct bio *bio, *bio_src; 2537 struct bio *bio, *bio_src;
2536 2538
2537 if (!bs) 2539 if (!bs)
2538 bs = fs_bio_set; 2540 bs = fs_bio_set;
2539 2541
2540 blk_rq_init(NULL, rq); 2542 blk_rq_init(NULL, rq);
2541 2543
2542 __rq_for_each_bio(bio_src, rq_src) { 2544 __rq_for_each_bio(bio_src, rq_src) {
2543 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs); 2545 bio = bio_alloc_bioset(gfp_mask, bio_src->bi_max_vecs, bs);
2544 if (!bio) 2546 if (!bio)
2545 goto free_and_out; 2547 goto free_and_out;
2546 2548
2547 __bio_clone(bio, bio_src); 2549 __bio_clone(bio, bio_src);
2548 2550
2549 if (bio_integrity(bio_src) && 2551 if (bio_integrity(bio_src) &&
2550 bio_integrity_clone(bio, bio_src, gfp_mask, bs)) 2552 bio_integrity_clone(bio, bio_src, gfp_mask, bs))
2551 goto free_and_out; 2553 goto free_and_out;
2552 2554
2553 if (bio_ctr && bio_ctr(bio, bio_src, data)) 2555 if (bio_ctr && bio_ctr(bio, bio_src, data))
2554 goto free_and_out; 2556 goto free_and_out;
2555 2557
2556 if (rq->bio) { 2558 if (rq->bio) {
2557 rq->biotail->bi_next = bio; 2559 rq->biotail->bi_next = bio;
2558 rq->biotail = bio; 2560 rq->biotail = bio;
2559 } else 2561 } else
2560 rq->bio = rq->biotail = bio; 2562 rq->bio = rq->biotail = bio;
2561 } 2563 }
2562 2564
2563 __blk_rq_prep_clone(rq, rq_src); 2565 __blk_rq_prep_clone(rq, rq_src);
2564 2566
2565 return 0; 2567 return 0;
2566 2568
2567 free_and_out: 2569 free_and_out:
2568 if (bio) 2570 if (bio)
2569 bio_free(bio, bs); 2571 bio_free(bio, bs);
2570 blk_rq_unprep_clone(rq); 2572 blk_rq_unprep_clone(rq);
2571 2573
2572 return -ENOMEM; 2574 return -ENOMEM;
2573 } 2575 }
2574 EXPORT_SYMBOL_GPL(blk_rq_prep_clone); 2576 EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
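/*
 * Illustrative sketch, not part of the patch: minimal clone setup and
 * teardown as a request-based stacking driver might do it.  The "my_"
 * names are hypothetical; passing a NULL bio_set falls back to
 * fs_bio_set and a NULL bio_ctr skips per-bio setup.
 */
static int my_setup_clone(struct request *clone, struct request *rq,
			  gfp_t gfp_mask)
{
	if (blk_rq_prep_clone(clone, rq, NULL, gfp_mask, NULL, NULL))
		return -ENOMEM;
	return 0;
}

static void my_teardown_clone(struct request *clone)
{
	blk_rq_unprep_clone(clone);	/* drops the cloned bios */
}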
2575 2577
2576 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work) 2578 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work)
2577 { 2579 {
2578 return queue_work(kblockd_workqueue, work); 2580 return queue_work(kblockd_workqueue, work);
2579 } 2581 }
2580 EXPORT_SYMBOL(kblockd_schedule_work); 2582 EXPORT_SYMBOL(kblockd_schedule_work);
2581 2583
2582 int __init blk_dev_init(void) 2584 int __init blk_dev_init(void)
2583 { 2585 {
2584 BUILD_BUG_ON(__REQ_NR_BITS > 8 * 2586 BUILD_BUG_ON(__REQ_NR_BITS > 8 *
2585 sizeof(((struct request *)0)->cmd_flags)); 2587 sizeof(((struct request *)0)->cmd_flags));
2586 2588
2587 kblockd_workqueue = create_workqueue("kblockd"); 2589 kblockd_workqueue = create_workqueue("kblockd");
2588 if (!kblockd_workqueue) 2590 if (!kblockd_workqueue)
2589 panic("Failed to create kblockd\n"); 2591 panic("Failed to create kblockd\n");
2590 2592
2591 request_cachep = kmem_cache_create("blkdev_requests", 2593 request_cachep = kmem_cache_create("blkdev_requests",
2592 sizeof(struct request), 0, SLAB_PANIC, NULL); 2594 sizeof(struct request), 0, SLAB_PANIC, NULL);
2593 2595
2594 blk_requestq_cachep = kmem_cache_create("blkdev_queue", 2596 blk_requestq_cachep = kmem_cache_create("blkdev_queue",
2595 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 2597 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
2596 2598
2597 return 0; 2599 return 0;
2598 } 2600 }
2599 2601
block/blk-settings.c
1 /* 1 /*
2 * Functions related to setting various queue properties from drivers 2 * Functions related to setting various queue properties from drivers
3 */ 3 */
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/module.h> 5 #include <linux/module.h>
6 #include <linux/init.h> 6 #include <linux/init.h>
7 #include <linux/bio.h> 7 #include <linux/bio.h>
8 #include <linux/blkdev.h> 8 #include <linux/blkdev.h>
9 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 9 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
10 #include <linux/gcd.h> 10 #include <linux/gcd.h>
11 #include <linux/lcm.h> 11 #include <linux/lcm.h>
12 #include <linux/jiffies.h> 12 #include <linux/jiffies.h>
13 #include <linux/gfp.h> 13 #include <linux/gfp.h>
14 14
15 #include "blk.h" 15 #include "blk.h"
16 16
17 unsigned long blk_max_low_pfn; 17 unsigned long blk_max_low_pfn;
18 EXPORT_SYMBOL(blk_max_low_pfn); 18 EXPORT_SYMBOL(blk_max_low_pfn);
19 19
20 unsigned long blk_max_pfn; 20 unsigned long blk_max_pfn;
21 21
22 /** 22 /**
23 * blk_queue_prep_rq - set a prepare_request function for queue 23 * blk_queue_prep_rq - set a prepare_request function for queue
24 * @q: queue 24 * @q: queue
25 * @pfn: prepare_request function 25 * @pfn: prepare_request function
26 * 26 *
27 * It's possible for a queue to register a prepare_request callback which 27 * It's possible for a queue to register a prepare_request callback which
28 * is invoked before the request is handed to the request_fn. The goal of 28 * is invoked before the request is handed to the request_fn. The goal of
29 * the function is to prepare a request for I/O, it can be used to build a 29 * the function is to prepare a request for I/O, it can be used to build a
30 * cdb from the request data for instance. 30 * cdb from the request data for instance.
31 * 31 *
32 */ 32 */
33 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 33 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
34 { 34 {
35 q->prep_rq_fn = pfn; 35 q->prep_rq_fn = pfn;
36 } 36 }
37 EXPORT_SYMBOL(blk_queue_prep_rq); 37 EXPORT_SYMBOL(blk_queue_prep_rq);
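/*
 * Illustrative sketch, not part of the patch: a driver registering a
 * prepare_request callback.  The "my_" names are hypothetical; the
 * callback returns one of the BLKPREP_* codes.
 */
static int my_prep_rq_fn(struct request_queue *q, struct request *rq)
{
	if (rq->cmd_type == REQ_TYPE_FS) {
		/* build the device command block from the request here */
		rq->cmd_flags |= REQ_DONTPREP;	/* prepare only once */
	}
	return BLKPREP_OK;
}

/* during queue setup:  blk_queue_prep_rq(q, my_prep_rq_fn); */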
38 38
39 /** 39 /**
40 * blk_queue_unprep_rq - set an unprepare_request function for queue 40 * blk_queue_unprep_rq - set an unprepare_request function for queue
41 * @q: queue 41 * @q: queue
42 * @ufn: unprepare_request function 42 * @ufn: unprepare_request function
43 * 43 *
44 * It's possible for a queue to register an unprepare_request callback 44 * It's possible for a queue to register an unprepare_request callback
45 * which is invoked before the request is finally completed. The goal 45 * which is invoked before the request is finally completed. The goal
46 * of the function is to deallocate any data that was allocated in the 46 * of the function is to deallocate any data that was allocated in the
47 * prepare_request callback. 47 * prepare_request callback.
48 * 48 *
49 */ 49 */
50 void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn) 50 void blk_queue_unprep_rq(struct request_queue *q, unprep_rq_fn *ufn)
51 { 51 {
52 q->unprep_rq_fn = ufn; 52 q->unprep_rq_fn = ufn;
53 } 53 }
54 EXPORT_SYMBOL(blk_queue_unprep_rq); 54 EXPORT_SYMBOL(blk_queue_unprep_rq);
55 55
56 /** 56 /**
57 * blk_queue_merge_bvec - set a merge_bvec function for queue 57 * blk_queue_merge_bvec - set a merge_bvec function for queue
58 * @q: queue 58 * @q: queue
59 * @mbfn: merge_bvec_fn 59 * @mbfn: merge_bvec_fn
60 * 60 *
61 * Usually queues have static limitations on the max sectors or segments that 61 * Usually queues have static limitations on the max sectors or segments that
62 * we can put in a request. Stacking drivers may have some settings that 62 * we can put in a request. Stacking drivers may have some settings that
63 * are dynamic, and thus we have to query the queue whether it is ok to 63 * are dynamic, and thus we have to query the queue whether it is ok to
64 * add a new bio_vec to a bio at a given offset or not. If the block device 64 * add a new bio_vec to a bio at a given offset or not. If the block device
65 * has such limitations, it needs to register a merge_bvec_fn to control 65 * has such limitations, it needs to register a merge_bvec_fn to control
66 * the size of bios sent to it. Note that a block device *must* allow a 66 * the size of bios sent to it. Note that a block device *must* allow a
67 * single page to be added to an empty bio. The block device driver may want 67 * single page to be added to an empty bio. The block device driver may want
68 * to use the bio_split() function to deal with these bios. By default 68 * to use the bio_split() function to deal with these bios. By default
69 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 69 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
70 * honored. 70 * honored.
71 */ 71 */
72 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 72 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
73 { 73 {
74 q->merge_bvec_fn = mbfn; 74 q->merge_bvec_fn = mbfn;
75 } 75 }
76 EXPORT_SYMBOL(blk_queue_merge_bvec); 76 EXPORT_SYMBOL(blk_queue_merge_bvec);
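/*
 * Illustrative sketch, not part of the patch: a merge_bvec_fn that keeps
 * bios from crossing a hypothetical chunk boundary (MY_CHUNK_SECTORS is
 * made up).  The callback returns how many bytes of @biovec may be added,
 * and must always accept at least one page into an empty bio.
 */
#define MY_CHUNK_SECTORS	128	/* hypothetical 64K chunk */

static int my_merge_bvec_fn(struct request_queue *q,
			    struct bvec_merge_data *bvm,
			    struct bio_vec *biovec)
{
	sector_t sector = bvm->bi_sector;
	unsigned int bio_sectors = bvm->bi_size >> 9;
	int max;

	max = (MY_CHUNK_SECTORS - ((sector & (MY_CHUNK_SECTORS - 1))
				   + bio_sectors)) << 9;
	if (max < 0)
		max = 0;
	if (max < biovec->bv_len && bio_sectors == 0)
		return biovec->bv_len;	/* one page must always fit */
	return max;
}

/* during queue setup:  blk_queue_merge_bvec(q, my_merge_bvec_fn); */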
77 77
78 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 78 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
79 { 79 {
80 q->softirq_done_fn = fn; 80 q->softirq_done_fn = fn;
81 } 81 }
82 EXPORT_SYMBOL(blk_queue_softirq_done); 82 EXPORT_SYMBOL(blk_queue_softirq_done);
83 83
84 void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) 84 void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
85 { 85 {
86 q->rq_timeout = timeout; 86 q->rq_timeout = timeout;
87 } 87 }
88 EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); 88 EXPORT_SYMBOL_GPL(blk_queue_rq_timeout);
89 89
90 void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn) 90 void blk_queue_rq_timed_out(struct request_queue *q, rq_timed_out_fn *fn)
91 { 91 {
92 q->rq_timed_out_fn = fn; 92 q->rq_timed_out_fn = fn;
93 } 93 }
94 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out); 94 EXPORT_SYMBOL_GPL(blk_queue_rq_timed_out);
95 95
96 void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn) 96 void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn)
97 { 97 {
98 q->lld_busy_fn = fn; 98 q->lld_busy_fn = fn;
99 } 99 }
100 EXPORT_SYMBOL_GPL(blk_queue_lld_busy); 100 EXPORT_SYMBOL_GPL(blk_queue_lld_busy);
101 101
102 /** 102 /**
103 * blk_set_default_limits - reset limits to default values 103 * blk_set_default_limits - reset limits to default values
104 * @lim: the queue_limits structure to reset 104 * @lim: the queue_limits structure to reset
105 * 105 *
106 * Description: 106 * Description:
107 * Returns a queue_limit struct to its default state. Can be used by 107 * Returns a queue_limit struct to its default state. Can be used by
108 * stacking drivers like DM that stage table swaps and reuse an 108 * stacking drivers like DM that stage table swaps and reuse an
109 * existing device queue. 109 * existing device queue.
110 */ 110 */
111 void blk_set_default_limits(struct queue_limits *lim) 111 void blk_set_default_limits(struct queue_limits *lim)
112 { 112 {
113 lim->max_segments = BLK_MAX_SEGMENTS; 113 lim->max_segments = BLK_MAX_SEGMENTS;
114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; 114 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 115 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
116 lim->max_sectors = BLK_DEF_MAX_SECTORS; 116 lim->max_sectors = BLK_DEF_MAX_SECTORS;
117 lim->max_hw_sectors = INT_MAX; 117 lim->max_hw_sectors = INT_MAX;
118 lim->max_discard_sectors = 0; 118 lim->max_discard_sectors = 0;
119 lim->discard_granularity = 0; 119 lim->discard_granularity = 0;
120 lim->discard_alignment = 0; 120 lim->discard_alignment = 0;
121 lim->discard_misaligned = 0; 121 lim->discard_misaligned = 0;
122 lim->discard_zeroes_data = -1; 122 lim->discard_zeroes_data = -1;
123 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; 123 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
124 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); 124 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
125 lim->alignment_offset = 0; 125 lim->alignment_offset = 0;
126 lim->io_opt = 0; 126 lim->io_opt = 0;
127 lim->misaligned = 0; 127 lim->misaligned = 0;
128 lim->no_cluster = 0; 128 lim->no_cluster = 0;
129 } 129 }
130 EXPORT_SYMBOL(blk_set_default_limits); 130 EXPORT_SYMBOL(blk_set_default_limits);
131 131
132 /** 132 /**
133 * blk_queue_make_request - define an alternate make_request function for a device 133 * blk_queue_make_request - define an alternate make_request function for a device
134 * @q: the request queue for the device to be affected 134 * @q: the request queue for the device to be affected
135 * @mfn: the alternate make_request function 135 * @mfn: the alternate make_request function
136 * 136 *
137 * Description: 137 * Description:
138 * The normal way for &struct bios to be passed to a device 138 * The normal way for &struct bios to be passed to a device
139 * driver is for them to be collected into requests on a request 139 * driver is for them to be collected into requests on a request
140 * queue, and then to allow the device driver to select requests 140 * queue, and then to allow the device driver to select requests
141 * off that queue when it is ready. This works well for many block 141 * off that queue when it is ready. This works well for many block
142 * devices. However some block devices (typically virtual devices 142 * devices. However some block devices (typically virtual devices
143 * such as md or lvm) do not benefit from the processing on the 143 * such as md or lvm) do not benefit from the processing on the
144 * request queue, and are served best by having the requests passed 144 * request queue, and are served best by having the requests passed
145 * directly to them. This can be achieved by providing a function 145 * directly to them. This can be achieved by providing a function
146 * to blk_queue_make_request(). 146 * to blk_queue_make_request().
147 * 147 *
148 * Caveat: 148 * Caveat:
149 * The driver that does this *must* be able to deal appropriately 149 * The driver that does this *must* be able to deal appropriately
150 * with buffers in "highmemory". This can be accomplished by either calling 150 * with buffers in "highmemory". This can be accomplished by either calling
151 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 151 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
152 * blk_queue_bounce() to create a buffer in normal memory. 152 * blk_queue_bounce() to create a buffer in normal memory.
153 **/ 153 **/
154 void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) 154 void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
155 { 155 {
156 /* 156 /*
157 * set defaults 157 * set defaults
158 */ 158 */
159 q->nr_requests = BLKDEV_MAX_RQ; 159 q->nr_requests = BLKDEV_MAX_RQ;
160 160
161 q->make_request_fn = mfn; 161 q->make_request_fn = mfn;
162 blk_queue_dma_alignment(q, 511); 162 blk_queue_dma_alignment(q, 511);
163 blk_queue_congestion_threshold(q); 163 blk_queue_congestion_threshold(q);
164 q->nr_batching = BLK_BATCH_REQ; 164 q->nr_batching = BLK_BATCH_REQ;
165 165
166 q->unplug_thresh = 4; /* hmm */ 166 q->unplug_thresh = 4; /* hmm */
167 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */ 167 q->unplug_delay = msecs_to_jiffies(3); /* 3 milliseconds */
168 if (q->unplug_delay == 0) 168 if (q->unplug_delay == 0)
169 q->unplug_delay = 1; 169 q->unplug_delay = 1;
170 170
171 q->unplug_timer.function = blk_unplug_timeout; 171 q->unplug_timer.function = blk_unplug_timeout;
172 q->unplug_timer.data = (unsigned long)q; 172 q->unplug_timer.data = (unsigned long)q;
173 173
174 blk_set_default_limits(&q->limits); 174 blk_set_default_limits(&q->limits);
175 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS); 175 blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
176 176
177 /* 177 /*
178 * If the caller didn't supply a lock, fall back to our embedded 178 * If the caller didn't supply a lock, fall back to our embedded
179 * per-queue locks 179 * per-queue locks
180 */ 180 */
181 if (!q->queue_lock) 181 if (!q->queue_lock)
182 q->queue_lock = &q->__queue_lock; 182 q->queue_lock = &q->__queue_lock;
183 183
184 /* 184 /*
185 * by default assume old behaviour and bounce for any highmem page 185 * by default assume old behaviour and bounce for any highmem page
186 */ 186 */
187 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 187 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
188 } 188 }
189 EXPORT_SYMBOL(blk_queue_make_request); 189 EXPORT_SYMBOL(blk_queue_make_request);
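/*
 * Illustrative sketch, not part of the patch: a bio-based virtual device
 * wiring up its own make_request function.  Names are hypothetical; a
 * real driver would remap or service the bio rather than completing it
 * immediately.
 */
static int my_make_request(struct request_queue *q, struct bio *bio)
{
	bio_endio(bio, 0);	/* pretend the I/O completed successfully */
	return 0;
}

/* at probe time:
 *	q = blk_alloc_queue(GFP_KERNEL);
 *	blk_queue_make_request(q, my_make_request);
 */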
190 190
191 /** 191 /**
192 * blk_queue_bounce_limit - set bounce buffer limit for queue 192 * blk_queue_bounce_limit - set bounce buffer limit for queue
193 * @q: the request queue for the device 193 * @q: the request queue for the device
194 * @dma_mask: the maximum address the device can handle 194 * @dma_mask: the maximum address the device can handle
195 * 195 *
196 * Description: 196 * Description:
197 * Different hardware can have different requirements as to what pages 197 * Different hardware can have different requirements as to what pages
198 * it can do I/O directly to. A low level driver can call 198 * it can do I/O directly to. A low level driver can call
199 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 199 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
200 * buffers for doing I/O to pages residing above @dma_mask. 200 * buffers for doing I/O to pages residing above @dma_mask.
201 **/ 201 **/
202 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) 202 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask)
203 { 203 {
204 unsigned long b_pfn = dma_mask >> PAGE_SHIFT; 204 unsigned long b_pfn = dma_mask >> PAGE_SHIFT;
205 int dma = 0; 205 int dma = 0;
206 206
207 q->bounce_gfp = GFP_NOIO; 207 q->bounce_gfp = GFP_NOIO;
208 #if BITS_PER_LONG == 64 208 #if BITS_PER_LONG == 64
209 /* 209 /*
210 * Assume anything <= 4GB can be handled by IOMMU. Actually 210 * Assume anything <= 4GB can be handled by IOMMU. Actually
211 * some IOMMUs can handle everything, but I don't know of a 211 * some IOMMUs can handle everything, but I don't know of a
212 * way to test this here. 212 * way to test this here.
213 */ 213 */
214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 214 if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
215 dma = 1; 215 dma = 1;
216 q->limits.bounce_pfn = max_low_pfn; 216 q->limits.bounce_pfn = max_low_pfn;
217 #else 217 #else
218 if (b_pfn < blk_max_low_pfn) 218 if (b_pfn < blk_max_low_pfn)
219 dma = 1; 219 dma = 1;
220 q->limits.bounce_pfn = b_pfn; 220 q->limits.bounce_pfn = b_pfn;
221 #endif 221 #endif
222 if (dma) { 222 if (dma) {
223 init_emergency_isa_pool(); 223 init_emergency_isa_pool();
224 q->bounce_gfp = GFP_NOIO | GFP_DMA; 224 q->bounce_gfp = GFP_NOIO | GFP_DMA;
225 q->limits.bounce_pfn = b_pfn; 225 q->limits.bounce_pfn = b_pfn;
226 } 226 }
227 } 227 }
228 EXPORT_SYMBOL(blk_queue_bounce_limit); 228 EXPORT_SYMBOL(blk_queue_bounce_limit);
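/*
 * Illustrative sketch, not part of the patch: a driver whose controller
 * can only DMA below 4GB asks for bounce buffers above that boundary.
 * The 32-bit mask is just an example value (<linux/dma-mapping.h>).
 */
static void my_set_bounce(struct request_queue *q)
{
	blk_queue_bounce_limit(q, DMA_BIT_MASK(32));
}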
229 229
230 /** 230 /**
231 * blk_queue_max_hw_sectors - set max sectors for a request for this queue 231 * blk_queue_max_hw_sectors - set max sectors for a request for this queue
232 * @q: the request queue for the device 232 * @q: the request queue for the device
233 * @max_hw_sectors: max hardware sectors in the usual 512b unit 233 * @max_hw_sectors: max hardware sectors in the usual 512b unit
234 * 234 *
235 * Description: 235 * Description:
236 * Enables a low level driver to set a hard upper limit, 236 * Enables a low level driver to set a hard upper limit,
237 * max_hw_sectors, on the size of requests. max_hw_sectors is set by 237 * max_hw_sectors, on the size of requests. max_hw_sectors is set by
238 * the device driver based upon the combined capabilities of I/O 238 * the device driver based upon the combined capabilities of I/O
239 * controller and storage device. 239 * controller and storage device.
240 * 240 *
241 * max_sectors is a soft limit imposed by the block layer for 241 * max_sectors is a soft limit imposed by the block layer for
242 * filesystem type requests. This value can be overridden on a 242 * filesystem type requests. This value can be overridden on a
243 * per-device basis in /sys/block/<device>/queue/max_sectors_kb. 243 * per-device basis in /sys/block/<device>/queue/max_sectors_kb.
244 * The soft limit can not exceed max_hw_sectors. 244 * The soft limit can not exceed max_hw_sectors.
245 **/ 245 **/
246 void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) 246 void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors)
247 { 247 {
248 if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) { 248 if ((max_hw_sectors << 9) < PAGE_CACHE_SIZE) {
249 max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 249 max_hw_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
250 printk(KERN_INFO "%s: set to minimum %d\n", 250 printk(KERN_INFO "%s: set to minimum %d\n",
251 __func__, max_hw_sectors); 251 __func__, max_hw_sectors);
252 } 252 }
253 253
254 q->limits.max_hw_sectors = max_hw_sectors; 254 q->limits.max_hw_sectors = max_hw_sectors;
255 q->limits.max_sectors = min_t(unsigned int, max_hw_sectors, 255 q->limits.max_sectors = min_t(unsigned int, max_hw_sectors,
256 BLK_DEF_MAX_SECTORS); 256 BLK_DEF_MAX_SECTORS);
257 } 257 }
258 EXPORT_SYMBOL(blk_queue_max_hw_sectors); 258 EXPORT_SYMBOL(blk_queue_max_hw_sectors);
259 259
260 /** 260 /**
261 * blk_queue_max_discard_sectors - set max sectors for a single discard 261 * blk_queue_max_discard_sectors - set max sectors for a single discard
262 * @q: the request queue for the device 262 * @q: the request queue for the device
263 * @max_discard_sectors: maximum number of sectors to discard 263 * @max_discard_sectors: maximum number of sectors to discard
264 **/ 264 **/
265 void blk_queue_max_discard_sectors(struct request_queue *q, 265 void blk_queue_max_discard_sectors(struct request_queue *q,
266 unsigned int max_discard_sectors) 266 unsigned int max_discard_sectors)
267 { 267 {
268 q->limits.max_discard_sectors = max_discard_sectors; 268 q->limits.max_discard_sectors = max_discard_sectors;
269 } 269 }
270 EXPORT_SYMBOL(blk_queue_max_discard_sectors); 270 EXPORT_SYMBOL(blk_queue_max_discard_sectors);
271 271
272 /** 272 /**
273 * blk_queue_max_segments - set max hw segments for a request for this queue 273 * blk_queue_max_segments - set max hw segments for a request for this queue
274 * @q: the request queue for the device 274 * @q: the request queue for the device
275 * @max_segments: max number of segments 275 * @max_segments: max number of segments
276 * 276 *
277 * Description: 277 * Description:
278 * Enables a low level driver to set an upper limit on the number of 278 * Enables a low level driver to set an upper limit on the number of
279 * hw data segments in a request. 279 * hw data segments in a request.
280 **/ 280 **/
281 void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) 281 void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments)
282 { 282 {
283 if (!max_segments) { 283 if (!max_segments) {
284 max_segments = 1; 284 max_segments = 1;
285 printk(KERN_INFO "%s: set to minimum %d\n", 285 printk(KERN_INFO "%s: set to minimum %d\n",
286 __func__, max_segments); 286 __func__, max_segments);
287 } 287 }
288 288
289 q->limits.max_segments = max_segments; 289 q->limits.max_segments = max_segments;
290 } 290 }
291 EXPORT_SYMBOL(blk_queue_max_segments); 291 EXPORT_SYMBOL(blk_queue_max_segments);
292 292
293 /** 293 /**
294 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 294 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
295 * @q: the request queue for the device 295 * @q: the request queue for the device
296 * @max_size: max size of segment in bytes 296 * @max_size: max size of segment in bytes
297 * 297 *
298 * Description: 298 * Description:
299 * Enables a low level driver to set an upper limit on the size of a 299 * Enables a low level driver to set an upper limit on the size of a
300 * coalesced segment 300 * coalesced segment
301 **/ 301 **/
302 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 302 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
303 { 303 {
304 if (max_size < PAGE_CACHE_SIZE) { 304 if (max_size < PAGE_CACHE_SIZE) {
305 max_size = PAGE_CACHE_SIZE; 305 max_size = PAGE_CACHE_SIZE;
306 printk(KERN_INFO "%s: set to minimum %d\n", 306 printk(KERN_INFO "%s: set to minimum %d\n",
307 __func__, max_size); 307 __func__, max_size);
308 } 308 }
309 309
310 q->limits.max_segment_size = max_size; 310 q->limits.max_segment_size = max_size;
311 } 311 }
312 EXPORT_SYMBOL(blk_queue_max_segment_size); 312 EXPORT_SYMBOL(blk_queue_max_segment_size);
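/*
 * Illustrative sketch, not part of the patch: typical transfer-limit
 * setup in a low-level driver's probe path.  The numbers are
 * hypothetical and would normally come from the controller/device.
 */
static void my_set_transfer_limits(struct request_queue *q)
{
	blk_queue_max_hw_sectors(q, 2048);	/* 1MB per request */
	blk_queue_max_segments(q, 128);		/* S/G table entries */
	blk_queue_max_segment_size(q, 65536);	/* 64K per S/G element */
}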
313 313
314 /** 314 /**
315 * blk_queue_logical_block_size - set logical block size for the queue 315 * blk_queue_logical_block_size - set logical block size for the queue
316 * @q: the request queue for the device 316 * @q: the request queue for the device
317 * @size: the logical block size, in bytes 317 * @size: the logical block size, in bytes
318 * 318 *
319 * Description: 319 * Description:
320 * This should be set to the lowest possible block size that the 320 * This should be set to the lowest possible block size that the
321 * storage device can address. The default of 512 covers most 321 * storage device can address. The default of 512 covers most
322 * hardware. 322 * hardware.
323 **/ 323 **/
324 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) 324 void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
325 { 325 {
326 q->limits.logical_block_size = size; 326 q->limits.logical_block_size = size;
327 327
328 if (q->limits.physical_block_size < size) 328 if (q->limits.physical_block_size < size)
329 q->limits.physical_block_size = size; 329 q->limits.physical_block_size = size;
330 330
331 if (q->limits.io_min < q->limits.physical_block_size) 331 if (q->limits.io_min < q->limits.physical_block_size)
332 q->limits.io_min = q->limits.physical_block_size; 332 q->limits.io_min = q->limits.physical_block_size;
333 } 333 }
334 EXPORT_SYMBOL(blk_queue_logical_block_size); 334 EXPORT_SYMBOL(blk_queue_logical_block_size);
335 335
336 /** 336 /**
337 * blk_queue_physical_block_size - set physical block size for the queue 337 * blk_queue_physical_block_size - set physical block size for the queue
338 * @q: the request queue for the device 338 * @q: the request queue for the device
339 * @size: the physical block size, in bytes 339 * @size: the physical block size, in bytes
340 * 340 *
341 * Description: 341 * Description:
342 * This should be set to the lowest possible sector size that the 342 * This should be set to the lowest possible sector size that the
343 * hardware can operate on without reverting to read-modify-write 343 * hardware can operate on without reverting to read-modify-write
344 * operations. 344 * operations.
345 */ 345 */
346 void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) 346 void blk_queue_physical_block_size(struct request_queue *q, unsigned short size)
347 { 347 {
348 q->limits.physical_block_size = size; 348 q->limits.physical_block_size = size;
349 349
350 if (q->limits.physical_block_size < q->limits.logical_block_size) 350 if (q->limits.physical_block_size < q->limits.logical_block_size)
351 q->limits.physical_block_size = q->limits.logical_block_size; 351 q->limits.physical_block_size = q->limits.logical_block_size;
352 352
353 if (q->limits.io_min < q->limits.physical_block_size) 353 if (q->limits.io_min < q->limits.physical_block_size)
354 q->limits.io_min = q->limits.physical_block_size; 354 q->limits.io_min = q->limits.physical_block_size;
355 } 355 }
356 EXPORT_SYMBOL(blk_queue_physical_block_size); 356 EXPORT_SYMBOL(blk_queue_physical_block_size);
357 357
358 /** 358 /**
359 * blk_queue_alignment_offset - set physical block alignment offset 359 * blk_queue_alignment_offset - set physical block alignment offset
360 * @q: the request queue for the device 360 * @q: the request queue for the device
361 * @offset: alignment offset in bytes 361 * @offset: alignment offset in bytes
362 * 362 *
363 * Description: 363 * Description:
364 * Some devices are naturally misaligned to compensate for things like 364 * Some devices are naturally misaligned to compensate for things like
365 * the legacy DOS partition table 63-sector offset. Low-level drivers 365 * the legacy DOS partition table 63-sector offset. Low-level drivers
366 * should call this function for devices whose first sector is not 366 * should call this function for devices whose first sector is not
367 * naturally aligned. 367 * naturally aligned.
368 */ 368 */
369 void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) 369 void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
370 { 370 {
371 q->limits.alignment_offset = 371 q->limits.alignment_offset =
372 offset & (q->limits.physical_block_size - 1); 372 offset & (q->limits.physical_block_size - 1);
373 q->limits.misaligned = 0; 373 q->limits.misaligned = 0;
374 } 374 }
375 EXPORT_SYMBOL(blk_queue_alignment_offset); 375 EXPORT_SYMBOL(blk_queue_alignment_offset);
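/*
 * Illustrative sketch, not part of the patch: a 4K-sector drive that
 * reports 512-byte logical blocks and compensates for the legacy
 * 63-sector partition offset.  Values are hypothetical.
 */
static void my_set_block_sizes(struct request_queue *q)
{
	blk_queue_logical_block_size(q, 512);	/* addressable unit */
	blk_queue_physical_block_size(q, 4096);	/* internal sector size */
	blk_queue_alignment_offset(q, 3584);	/* (63 * 512) % 4096 */
}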
376 376
377 /** 377 /**
378 * blk_limits_io_min - set minimum request size for a device 378 * blk_limits_io_min - set minimum request size for a device
379 * @limits: the queue limits 379 * @limits: the queue limits
380 * @min: smallest I/O size in bytes 380 * @min: smallest I/O size in bytes
381 * 381 *
382 * Description: 382 * Description:
383 * Some devices have an internal block size bigger than the reported 383 * Some devices have an internal block size bigger than the reported
384 * hardware sector size. This function can be used to signal the 384 * hardware sector size. This function can be used to signal the
385 * smallest I/O the device can perform without incurring a performance 385 * smallest I/O the device can perform without incurring a performance
386 * penalty. 386 * penalty.
387 */ 387 */
388 void blk_limits_io_min(struct queue_limits *limits, unsigned int min) 388 void blk_limits_io_min(struct queue_limits *limits, unsigned int min)
389 { 389 {
390 limits->io_min = min; 390 limits->io_min = min;
391 391
392 if (limits->io_min < limits->logical_block_size) 392 if (limits->io_min < limits->logical_block_size)
393 limits->io_min = limits->logical_block_size; 393 limits->io_min = limits->logical_block_size;
394 394
395 if (limits->io_min < limits->physical_block_size) 395 if (limits->io_min < limits->physical_block_size)
396 limits->io_min = limits->physical_block_size; 396 limits->io_min = limits->physical_block_size;
397 } 397 }
398 EXPORT_SYMBOL(blk_limits_io_min); 398 EXPORT_SYMBOL(blk_limits_io_min);
399 399
400 /** 400 /**
401 * blk_queue_io_min - set minimum request size for the queue 401 * blk_queue_io_min - set minimum request size for the queue
402 * @q: the request queue for the device 402 * @q: the request queue for the device
403 * @min: smallest I/O size in bytes 403 * @min: smallest I/O size in bytes
404 * 404 *
405 * Description: 405 * Description:
406 * Storage devices may report a granularity or preferred minimum I/O 406 * Storage devices may report a granularity or preferred minimum I/O
407 * size which is the smallest request the device can perform without 407 * size which is the smallest request the device can perform without
408 * incurring a performance penalty. For disk drives this is often the 408 * incurring a performance penalty. For disk drives this is often the
409 * physical block size. For RAID arrays it is often the stripe chunk 409 * physical block size. For RAID arrays it is often the stripe chunk
410 * size. A properly aligned multiple of minimum_io_size is the 410 * size. A properly aligned multiple of minimum_io_size is the
411 * preferred request size for workloads where a high number of I/O 411 * preferred request size for workloads where a high number of I/O
412 * operations is desired. 412 * operations is desired.
413 */ 413 */
414 void blk_queue_io_min(struct request_queue *q, unsigned int min) 414 void blk_queue_io_min(struct request_queue *q, unsigned int min)
415 { 415 {
416 blk_limits_io_min(&q->limits, min); 416 blk_limits_io_min(&q->limits, min);
417 } 417 }
418 EXPORT_SYMBOL(blk_queue_io_min); 418 EXPORT_SYMBOL(blk_queue_io_min);
419 419
420 /** 420 /**
421 * blk_limits_io_opt - set optimal request size for a device 421 * blk_limits_io_opt - set optimal request size for a device
422 * @limits: the queue limits 422 * @limits: the queue limits
423 * @opt: optimal request size in bytes 423 * @opt: optimal request size in bytes
424 * 424 *
425 * Description: 425 * Description:
426 * Storage devices may report an optimal I/O size, which is the 426 * Storage devices may report an optimal I/O size, which is the
427 * device's preferred unit for sustained I/O. This is rarely reported 427 * device's preferred unit for sustained I/O. This is rarely reported
428 * for disk drives. For RAID arrays it is usually the stripe width or 428 * for disk drives. For RAID arrays it is usually the stripe width or
429 * the internal track size. A properly aligned multiple of 429 * the internal track size. A properly aligned multiple of
430 * optimal_io_size is the preferred request size for workloads where 430 * optimal_io_size is the preferred request size for workloads where
431 * sustained throughput is desired. 431 * sustained throughput is desired.
432 */ 432 */
433 void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) 433 void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt)
434 { 434 {
435 limits->io_opt = opt; 435 limits->io_opt = opt;
436 } 436 }
437 EXPORT_SYMBOL(blk_limits_io_opt); 437 EXPORT_SYMBOL(blk_limits_io_opt);
438 438
439 /** 439 /**
440 * blk_queue_io_opt - set optimal request size for the queue 440 * blk_queue_io_opt - set optimal request size for the queue
441 * @q: the request queue for the device 441 * @q: the request queue for the device
442 * @opt: optimal request size in bytes 442 * @opt: optimal request size in bytes
443 * 443 *
444 * Description: 444 * Description:
445 * Storage devices may report an optimal I/O size, which is the 445 * Storage devices may report an optimal I/O size, which is the
446 * device's preferred unit for sustained I/O. This is rarely reported 446 * device's preferred unit for sustained I/O. This is rarely reported
447 * for disk drives. For RAID arrays it is usually the stripe width or 447 * for disk drives. For RAID arrays it is usually the stripe width or
448 * the internal track size. A properly aligned multiple of 448 * the internal track size. A properly aligned multiple of
449 * optimal_io_size is the preferred request size for workloads where 449 * optimal_io_size is the preferred request size for workloads where
450 * sustained throughput is desired. 450 * sustained throughput is desired.
451 */ 451 */
452 void blk_queue_io_opt(struct request_queue *q, unsigned int opt) 452 void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
453 { 453 {
454 blk_limits_io_opt(&q->limits, opt); 454 blk_limits_io_opt(&q->limits, opt);
455 } 455 }
456 EXPORT_SYMBOL(blk_queue_io_opt); 456 EXPORT_SYMBOL(blk_queue_io_opt);
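/*
 * Illustrative sketch, not part of the patch: a RAID5-style array with a
 * 64K chunk and four data disks exporting its stripe geometry.  Numbers
 * are hypothetical.
 */
static void my_set_raid_hints(struct request_queue *q)
{
	blk_queue_io_min(q, 65536);		/* one chunk */
	blk_queue_io_opt(q, 65536 * 4);		/* one full stripe */
}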
457 457
458 /* 458 /*
459 * Returns the minimum that is _not_ zero, unless both are zero. 459 * Returns the minimum that is _not_ zero, unless both are zero.
460 */ 460 */
461 #define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r))) 461 #define min_not_zero(l, r) ((l) == 0 ? (r) : ((r) == 0 ? (l) : min(l, r)))
462 462
463 /** 463 /**
464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 464 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
465 * @t: the stacking driver (top) 465 * @t: the stacking driver (top)
466 * @b: the underlying device (bottom) 466 * @b: the underlying device (bottom)
467 **/ 467 **/
468 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 468 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
469 { 469 {
470 blk_stack_limits(&t->limits, &b->limits, 0); 470 blk_stack_limits(&t->limits, &b->limits, 0);
471 471
472 if (!t->queue_lock) 472 if (!t->queue_lock)
473 WARN_ON_ONCE(1); 473 WARN_ON_ONCE(1);
474 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { 474 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
475 unsigned long flags; 475 unsigned long flags;
476 spin_lock_irqsave(t->queue_lock, flags); 476 spin_lock_irqsave(t->queue_lock, flags);
477 queue_flag_clear(QUEUE_FLAG_CLUSTER, t); 477 queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
478 spin_unlock_irqrestore(t->queue_lock, flags); 478 spin_unlock_irqrestore(t->queue_lock, flags);
479 } 479 }
480 } 480 }
481 EXPORT_SYMBOL(blk_queue_stack_limits); 481 EXPORT_SYMBOL(blk_queue_stack_limits);
482 482
483 /** 483 /**
484 * blk_stack_limits - adjust queue_limits for stacked devices 484 * blk_stack_limits - adjust queue_limits for stacked devices
485 * @t: the stacking driver limits (top device) 485 * @t: the stacking driver limits (top device)
486 * @b: the underlying queue limits (bottom, component device) 486 * @b: the underlying queue limits (bottom, component device)
487 * @start: first data sector within component device 487 * @start: first data sector within component device
488 * 488 *
489 * Description: 489 * Description:
490 * This function is used by stacking drivers like MD and DM to ensure 490 * This function is used by stacking drivers like MD and DM to ensure
491 * that all component devices have compatible block sizes and 491 * that all component devices have compatible block sizes and
492 * alignments. The stacking driver must provide a queue_limits 492 * alignments. The stacking driver must provide a queue_limits
493 * struct (top) and then iteratively call the stacking function for 493 * struct (top) and then iteratively call the stacking function for
494 * all component (bottom) devices. The stacking function will 494 * all component (bottom) devices. The stacking function will
495 * attempt to combine the values and ensure proper alignment. 495 * attempt to combine the values and ensure proper alignment.
496 * 496 *
497 * Returns 0 if the top and bottom queue_limits are compatible. The 497 * Returns 0 if the top and bottom queue_limits are compatible. The
498 * top device's block sizes and alignment offsets may be adjusted to 498 * top device's block sizes and alignment offsets may be adjusted to
499 * ensure alignment with the bottom device. If no compatible sizes 499 * ensure alignment with the bottom device. If no compatible sizes
500 * and alignments exist, -1 is returned and the resulting top 500 * and alignments exist, -1 is returned and the resulting top
501 * queue_limits will have the misaligned flag set to indicate that 501 * queue_limits will have the misaligned flag set to indicate that
502 * the alignment_offset is undefined. 502 * the alignment_offset is undefined.
503 */ 503 */
504 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 504 int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
505 sector_t start) 505 sector_t start)
506 { 506 {
507 unsigned int top, bottom, alignment, ret = 0; 507 unsigned int top, bottom, alignment, ret = 0;
508 508
509 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); 509 t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
510 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); 510 t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
511 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); 511 t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
512 512
513 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, 513 t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
514 b->seg_boundary_mask); 514 b->seg_boundary_mask);
515 515
516 t->max_segments = min_not_zero(t->max_segments, b->max_segments); 516 t->max_segments = min_not_zero(t->max_segments, b->max_segments);
517 517
518 t->max_segment_size = min_not_zero(t->max_segment_size, 518 t->max_segment_size = min_not_zero(t->max_segment_size,
519 b->max_segment_size); 519 b->max_segment_size);
520 520
521 t->misaligned |= b->misaligned; 521 t->misaligned |= b->misaligned;
522 522
523 alignment = queue_limit_alignment_offset(b, start); 523 alignment = queue_limit_alignment_offset(b, start);
524 524
525 /* Bottom device has different alignment. Check that it is 525 /* Bottom device has different alignment. Check that it is
526 * compatible with the current top alignment. 526 * compatible with the current top alignment.
527 */ 527 */
528 if (t->alignment_offset != alignment) { 528 if (t->alignment_offset != alignment) {
529 529
530 top = max(t->physical_block_size, t->io_min) 530 top = max(t->physical_block_size, t->io_min)
531 + t->alignment_offset; 531 + t->alignment_offset;
532 bottom = max(b->physical_block_size, b->io_min) + alignment; 532 bottom = max(b->physical_block_size, b->io_min) + alignment;
533 533
534 /* Verify that top and bottom intervals line up */ 534 /* Verify that top and bottom intervals line up */
535 if (max(top, bottom) & (min(top, bottom) - 1)) { 535 if (max(top, bottom) & (min(top, bottom) - 1)) {
536 t->misaligned = 1; 536 t->misaligned = 1;
537 ret = -1; 537 ret = -1;
538 } 538 }
539 } 539 }
540 540
541 t->logical_block_size = max(t->logical_block_size, 541 t->logical_block_size = max(t->logical_block_size,
542 b->logical_block_size); 542 b->logical_block_size);
543 543
544 t->physical_block_size = max(t->physical_block_size, 544 t->physical_block_size = max(t->physical_block_size,
545 b->physical_block_size); 545 b->physical_block_size);
546 546
547 t->io_min = max(t->io_min, b->io_min); 547 t->io_min = max(t->io_min, b->io_min);
548 t->io_opt = lcm(t->io_opt, b->io_opt); 548 t->io_opt = lcm(t->io_opt, b->io_opt);
549 549
550 t->no_cluster |= b->no_cluster; 550 t->no_cluster |= b->no_cluster;
551 t->discard_zeroes_data &= b->discard_zeroes_data; 551 t->discard_zeroes_data &= b->discard_zeroes_data;
552 552
553 /* Physical block size a multiple of the logical block size? */ 553 /* Physical block size a multiple of the logical block size? */
554 if (t->physical_block_size & (t->logical_block_size - 1)) { 554 if (t->physical_block_size & (t->logical_block_size - 1)) {
555 t->physical_block_size = t->logical_block_size; 555 t->physical_block_size = t->logical_block_size;
556 t->misaligned = 1; 556 t->misaligned = 1;
557 ret = -1; 557 ret = -1;
558 } 558 }
559 559
560 /* Minimum I/O a multiple of the physical block size? */ 560 /* Minimum I/O a multiple of the physical block size? */
561 if (t->io_min & (t->physical_block_size - 1)) { 561 if (t->io_min & (t->physical_block_size - 1)) {
562 t->io_min = t->physical_block_size; 562 t->io_min = t->physical_block_size;
563 t->misaligned = 1; 563 t->misaligned = 1;
564 ret = -1; 564 ret = -1;
565 } 565 }
566 566
567 /* Optimal I/O a multiple of the physical block size? */ 567 /* Optimal I/O a multiple of the physical block size? */
568 if (t->io_opt & (t->physical_block_size - 1)) { 568 if (t->io_opt & (t->physical_block_size - 1)) {
569 t->io_opt = 0; 569 t->io_opt = 0;
570 t->misaligned = 1; 570 t->misaligned = 1;
571 ret = -1; 571 ret = -1;
572 } 572 }
573 573
574 /* Find lowest common alignment_offset */ 574 /* Find lowest common alignment_offset */
575 t->alignment_offset = lcm(t->alignment_offset, alignment) 575 t->alignment_offset = lcm(t->alignment_offset, alignment)
576 & (max(t->physical_block_size, t->io_min) - 1); 576 & (max(t->physical_block_size, t->io_min) - 1);
577 577
578 /* Verify that new alignment_offset is on a logical block boundary */ 578 /* Verify that new alignment_offset is on a logical block boundary */
579 if (t->alignment_offset & (t->logical_block_size - 1)) { 579 if (t->alignment_offset & (t->logical_block_size - 1)) {
580 t->misaligned = 1; 580 t->misaligned = 1;
581 ret = -1; 581 ret = -1;
582 } 582 }
583 583
584 /* Discard alignment and granularity */ 584 /* Discard alignment and granularity */
585 if (b->discard_granularity) { 585 if (b->discard_granularity) {
586 alignment = queue_limit_discard_alignment(b, start); 586 alignment = queue_limit_discard_alignment(b, start);
587 587
588 if (t->discard_granularity != 0 && 588 if (t->discard_granularity != 0 &&
589 t->discard_alignment != alignment) { 589 t->discard_alignment != alignment) {
590 top = t->discard_granularity + t->discard_alignment; 590 top = t->discard_granularity + t->discard_alignment;
591 bottom = b->discard_granularity + alignment; 591 bottom = b->discard_granularity + alignment;
592 592
593 /* Verify that top and bottom intervals line up */ 593 /* Verify that top and bottom intervals line up */
594 if (max(top, bottom) & (min(top, bottom) - 1)) 594 if (max(top, bottom) & (min(top, bottom) - 1))
595 t->discard_misaligned = 1; 595 t->discard_misaligned = 1;
596 } 596 }
597 597
598 t->max_discard_sectors = min_not_zero(t->max_discard_sectors, 598 t->max_discard_sectors = min_not_zero(t->max_discard_sectors,
599 b->max_discard_sectors); 599 b->max_discard_sectors);
600 t->discard_granularity = max(t->discard_granularity, 600 t->discard_granularity = max(t->discard_granularity,
601 b->discard_granularity); 601 b->discard_granularity);
602 t->discard_alignment = lcm(t->discard_alignment, alignment) & 602 t->discard_alignment = lcm(t->discard_alignment, alignment) &
603 (t->discard_granularity - 1); 603 (t->discard_granularity - 1);
604 } 604 }
605 605
606 return ret; 606 return ret;
607 } 607 }
608 EXPORT_SYMBOL(blk_stack_limits); 608 EXPORT_SYMBOL(blk_stack_limits);
609 609
610 /** 610 /**
611 * bdev_stack_limits - adjust queue limits for stacked drivers 611 * bdev_stack_limits - adjust queue limits for stacked drivers
612 * @t: the stacking driver limits (top device) 612 * @t: the stacking driver limits (top device)
613 * @bdev: the component block_device (bottom) 613 * @bdev: the component block_device (bottom)
614 * @start: first data sector within component device 614 * @start: first data sector within component device
615 * 615 *
616 * Description: 616 * Description:
617 * Merges queue limits for a top device and a block_device. Returns 617 * Merges queue limits for a top device and a block_device. Returns
618 * 0 if alignment didn't change. Returns -1 if adding the bottom 618 * 0 if alignment didn't change. Returns -1 if adding the bottom
619 * device caused misalignment. 619 * device caused misalignment.
620 */ 620 */
621 int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, 621 int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
622 sector_t start) 622 sector_t start)
623 { 623 {
624 struct request_queue *bq = bdev_get_queue(bdev); 624 struct request_queue *bq = bdev_get_queue(bdev);
625 625
626 start += get_start_sect(bdev); 626 start += get_start_sect(bdev);
627 627
628 return blk_stack_limits(t, &bq->limits, start); 628 return blk_stack_limits(t, &bq->limits, start);
629 } 629 }
630 EXPORT_SYMBOL(bdev_stack_limits); 630 EXPORT_SYMBOL(bdev_stack_limits);
631 631
632 /** 632 /**
633 * disk_stack_limits - adjust queue limits for stacked drivers 633 * disk_stack_limits - adjust queue limits for stacked drivers
634 * @disk: MD/DM gendisk (top) 634 * @disk: MD/DM gendisk (top)
635 * @bdev: the underlying block device (bottom) 635 * @bdev: the underlying block device (bottom)
636 * @offset: offset to beginning of data within component device 636 * @offset: offset to beginning of data within component device
637 * 637 *
638 * Description: 638 * Description:
639 * Merges the limits for a top level gendisk and a bottom level 639 * Merges the limits for a top level gendisk and a bottom level
640 * block_device. 640 * block_device.
641 */ 641 */
642 void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 642 void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
643 sector_t offset) 643 sector_t offset)
644 { 644 {
645 struct request_queue *t = disk->queue; 645 struct request_queue *t = disk->queue;
646 struct request_queue *b = bdev_get_queue(bdev); 646 struct request_queue *b = bdev_get_queue(bdev);
647 647
648 if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { 648 if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
649 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; 649 char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
650 650
651 disk_name(disk, 0, top); 651 disk_name(disk, 0, top);
652 bdevname(bdev, bottom); 652 bdevname(bdev, bottom);
653 653
654 printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", 654 printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
655 top, bottom); 655 top, bottom);
656 } 656 }
657 657
658 if (!t->queue_lock) 658 if (!t->queue_lock)
659 WARN_ON_ONCE(1); 659 WARN_ON_ONCE(1);
660 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { 660 else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) {
661 unsigned long flags; 661 unsigned long flags;
662 662
663 spin_lock_irqsave(t->queue_lock, flags); 663 spin_lock_irqsave(t->queue_lock, flags);
664 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 664 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
665 queue_flag_clear(QUEUE_FLAG_CLUSTER, t); 665 queue_flag_clear(QUEUE_FLAG_CLUSTER, t);
666 spin_unlock_irqrestore(t->queue_lock, flags); 666 spin_unlock_irqrestore(t->queue_lock, flags);
667 } 667 }
668 } 668 }
669 EXPORT_SYMBOL(disk_stack_limits); 669 EXPORT_SYMBOL(disk_stack_limits);
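/*
 * Illustrative sketch, not part of the patch: an MD/DM-style driver
 * folding every component device's limits into the top-level gendisk.
 * "struct my_conf" and its fields are hypothetical; offsets are kept in
 * 512-byte sectors and converted to bytes for disk_stack_limits().
 */
struct my_conf {
	int nr_devs;
	struct block_device **bdev;
	sector_t *data_offset;		/* in 512b sectors */
};

static void my_stack_all_limits(struct gendisk *disk, struct my_conf *conf)
{
	int i;

	for (i = 0; i < conf->nr_devs; i++)
		disk_stack_limits(disk, conf->bdev[i],
				  conf->data_offset[i] << 9);
}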
670 670
671 /** 671 /**
672 * blk_queue_dma_pad - set pad mask 672 * blk_queue_dma_pad - set pad mask
673 * @q: the request queue for the device 673 * @q: the request queue for the device
674 * @mask: pad mask 674 * @mask: pad mask
675 * 675 *
676 * Set dma pad mask. 676 * Set dma pad mask.
677 * 677 *
678 * Appending pad buffer to a request modifies the last entry of a 678 * Appending pad buffer to a request modifies the last entry of a
679 * scatter list such that it includes the pad buffer. 679 * scatter list such that it includes the pad buffer.
680 **/ 680 **/
681 void blk_queue_dma_pad(struct request_queue *q, unsigned int mask) 681 void blk_queue_dma_pad(struct request_queue *q, unsigned int mask)
682 { 682 {
683 q->dma_pad_mask = mask; 683 q->dma_pad_mask = mask;
684 } 684 }
685 EXPORT_SYMBOL(blk_queue_dma_pad); 685 EXPORT_SYMBOL(blk_queue_dma_pad);
686 686
687 /** 687 /**
688 * blk_queue_update_dma_pad - update pad mask 688 * blk_queue_update_dma_pad - update pad mask
689 * @q: the request queue for the device 689 * @q: the request queue for the device
690 * @mask: pad mask 690 * @mask: pad mask
691 * 691 *
692 * Update dma pad mask. 692 * Update dma pad mask.
693 * 693 *
694 * Appending pad buffer to a request modifies the last entry of a 694 * Appending pad buffer to a request modifies the last entry of a
695 * scatter list such that it includes the pad buffer. 695 * scatter list such that it includes the pad buffer.
696 **/ 696 **/
697 void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) 697 void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
698 { 698 {
699 if (mask > q->dma_pad_mask) 699 if (mask > q->dma_pad_mask)
700 q->dma_pad_mask = mask; 700 q->dma_pad_mask = mask;
701 } 701 }
702 EXPORT_SYMBOL(blk_queue_update_dma_pad); 702 EXPORT_SYMBOL(blk_queue_update_dma_pad);
703 703
704 /** 704 /**
705 * blk_queue_dma_drain - Set up a drain buffer for excess dma. 705 * blk_queue_dma_drain - Set up a drain buffer for excess dma.
706 * @q: the request queue for the device 706 * @q: the request queue for the device
707 * @dma_drain_needed: fn which returns non-zero if drain is necessary 707 * @dma_drain_needed: fn which returns non-zero if drain is necessary
708 * @buf: physically contiguous buffer 708 * @buf: physically contiguous buffer
709 * @size: size of the buffer in bytes 709 * @size: size of the buffer in bytes
710 * 710 *
711 * Some devices have excess DMA problems and can't simply discard (or 711 * Some devices have excess DMA problems and can't simply discard (or
712 * zero fill) the unwanted piece of the transfer. They have to have a 712 * zero fill) the unwanted piece of the transfer. They have to have a
713 * real area of memory to transfer it into. The use case for this is 713 * real area of memory to transfer it into. The use case for this is
714 * ATAPI devices in DMA mode. If the packet command causes a transfer 714 * ATAPI devices in DMA mode. If the packet command causes a transfer
715 * bigger than the transfer size some HBAs will lock up if there 715 * bigger than the transfer size some HBAs will lock up if there
716 * aren't DMA elements to contain the excess transfer. What this API 716 * aren't DMA elements to contain the excess transfer. What this API
717 * does is adjust the queue so that the buf is always appended 717 * does is adjust the queue so that the buf is always appended
718 * silently to the scatterlist. 718 * silently to the scatterlist.
719 * 719 *
720 * Note: This routine adjusts max_hw_segments to make room for appending 720 * Note: This routine adjusts max_hw_segments to make room for appending
721 * the drain buffer. If you call blk_queue_max_segments() after calling 721 * the drain buffer. If you call blk_queue_max_segments() after calling
722 * this routine, you must set the limit to one fewer than your device 722 * this routine, you must set the limit to one fewer than your device
723 * can support otherwise there won't be room for the drain buffer. 723 * can support otherwise there won't be room for the drain buffer.
724 */ 724 */
725 int blk_queue_dma_drain(struct request_queue *q, 725 int blk_queue_dma_drain(struct request_queue *q,
726 dma_drain_needed_fn *dma_drain_needed, 726 dma_drain_needed_fn *dma_drain_needed,
727 void *buf, unsigned int size) 727 void *buf, unsigned int size)
728 { 728 {
729 if (queue_max_segments(q) < 2) 729 if (queue_max_segments(q) < 2)
730 return -EINVAL; 730 return -EINVAL;
731 /* make room for appending the drain */ 731 /* make room for appending the drain */
732 blk_queue_max_segments(q, queue_max_segments(q) - 1); 732 blk_queue_max_segments(q, queue_max_segments(q) - 1);
733 q->dma_drain_needed = dma_drain_needed; 733 q->dma_drain_needed = dma_drain_needed;
734 q->dma_drain_buffer = buf; 734 q->dma_drain_buffer = buf;
735 q->dma_drain_size = size; 735 q->dma_drain_size = size;
736 736
737 return 0; 737 return 0;
738 } 738 }
739 EXPORT_SYMBOL_GPL(blk_queue_dma_drain); 739 EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
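/*
 * Illustrative sketch, not part of the patch: an ATAPI-style driver
 * reserving a drain buffer for packet commands.  The buffer size and
 * names are hypothetical.
 */
static int my_drain_needed(struct request *rq)
{
	/* only packet (BLOCK_PC) commands can over-transfer */
	return rq->cmd_type == REQ_TYPE_BLOCK_PC;
}

static int my_setup_drain(struct request_queue *q)
{
	void *buf = kmalloc(256, GFP_KERNEL);	/* hypothetical drain size */
	int ret;

	if (!buf)
		return -ENOMEM;
	ret = blk_queue_dma_drain(q, my_drain_needed, buf, 256);
	if (ret)
		kfree(buf);
	return ret;
}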
740 740
741 /** 741 /**
742 * blk_queue_segment_boundary - set boundary rules for segment merging 742 * blk_queue_segment_boundary - set boundary rules for segment merging
743 * @q: the request queue for the device 743 * @q: the request queue for the device
744 * @mask: the memory boundary mask 744 * @mask: the memory boundary mask
745 **/ 745 **/
746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
747 { 747 {
748 if (mask < PAGE_CACHE_SIZE - 1) { 748 if (mask < PAGE_CACHE_SIZE - 1) {
749 mask = PAGE_CACHE_SIZE - 1; 749 mask = PAGE_CACHE_SIZE - 1;
750 printk(KERN_INFO "%s: set to minimum %lx\n", 750 printk(KERN_INFO "%s: set to minimum %lx\n",
751 __func__, mask); 751 __func__, mask);
752 } 752 }
753 753
754 q->limits.seg_boundary_mask = mask; 754 q->limits.seg_boundary_mask = mask;
755 } 755 }
756 EXPORT_SYMBOL(blk_queue_segment_boundary); 756 EXPORT_SYMBOL(blk_queue_segment_boundary);
757 757
758 /** 758 /**
759 * blk_queue_dma_alignment - set dma length and memory alignment 759 * blk_queue_dma_alignment - set dma length and memory alignment
760 * @q: the request queue for the device 760 * @q: the request queue for the device
761 * @mask: alignment mask 761 * @mask: alignment mask
762 * 762 *
763 * Description: 763 * Description:
764 * Set required memory and length alignment for direct DMA transactions. 764 * Set required memory and length alignment for direct DMA transactions.
765 * This is used when building direct I/O requests for the queue. 765 * This is used when building direct I/O requests for the queue.
766 * 766 *
767 **/ 767 **/
768 void blk_queue_dma_alignment(struct request_queue *q, int mask) 768 void blk_queue_dma_alignment(struct request_queue *q, int mask)
769 { 769 {
770 q->dma_alignment = mask; 770 q->dma_alignment = mask;
771 } 771 }
772 EXPORT_SYMBOL(blk_queue_dma_alignment); 772 EXPORT_SYMBOL(blk_queue_dma_alignment);
773 773
774 /** 774 /**
775 * blk_queue_update_dma_alignment - update dma length and memory alignment 775 * blk_queue_update_dma_alignment - update dma length and memory alignment
776 * @q: the request queue for the device 776 * @q: the request queue for the device
777 * @mask: alignment mask 777 * @mask: alignment mask
778 * 778 *
779 * Description: 779 * Description:
780 * Update required memory and length alignment for direct DMA transactions. 780 * Update required memory and length alignment for direct DMA transactions.
781 * If the requested alignment is larger than the current alignment, then 781 * If the requested alignment is larger than the current alignment, then
782 * the current queue alignment is updated to the new value, otherwise it 782 * the current queue alignment is updated to the new value, otherwise it
783 * is left alone. The design of this is to allow multiple objects 783 * is left alone. The design of this is to allow multiple objects
784 * (driver, device, transport etc) to set their respective 784 * (driver, device, transport etc) to set their respective
785 * alignments without having them interfere. 785 * alignments without having them interfere.
786 * 786 *
787 **/ 787 **/
788 void blk_queue_update_dma_alignment(struct request_queue *q, int mask) 788 void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
789 { 789 {
790 BUG_ON(mask > PAGE_SIZE); 790 BUG_ON(mask > PAGE_SIZE);
791 791
792 if (mask > q->dma_alignment) 792 if (mask > q->dma_alignment)
793 q->dma_alignment = mask; 793 q->dma_alignment = mask;
794 } 794 }
795 EXPORT_SYMBOL(blk_queue_update_dma_alignment); 795 EXPORT_SYMBOL(blk_queue_update_dma_alignment);
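A minimal illustration of the "only grows stricter" behaviour described above (mask values hypothetical):

	blk_queue_update_dma_alignment(q, 0x3);		/* driver asks for 4-byte alignment */
	blk_queue_update_dma_alignment(q, 0x1ff);	/* transport asks for 512-byte alignment */
	/* q->dma_alignment is now 0x1ff; a later call with a smaller mask is a no-op */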
796 796
797 /**
798 * blk_queue_flush - configure queue's cache flush capability
799 * @q: the request queue for the device
800 * @flush: 0, REQ_FLUSH or REQ_FLUSH | REQ_FUA
801 *
802 * Tell the block layer about the cache flush capability of @q. If it
803 * supports flushing, REQ_FLUSH should be set. If it supports bypassing
804 * the write cache for individual writes, REQ_FUA should be set.
805 */
806 void blk_queue_flush(struct request_queue *q, unsigned int flush)
807 {
808 WARN_ON_ONCE(flush & ~(REQ_FLUSH | REQ_FUA));
809
810 if (WARN_ON_ONCE(!(flush & REQ_FLUSH) && (flush & REQ_FUA)))
811 flush &= ~REQ_FUA;
812
813 q->flush_flags = flush & (REQ_FLUSH | REQ_FUA);
814 }
815 EXPORT_SYMBOL_GPL(blk_queue_flush);
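As a sketch of the intended usage (queue variable name illustrative, not from this patch): a device with a volatile write cache that also handles FUA writes advertises both flags, whereas a write-through or cache-less device, like the brd conversion further down in this commit, simply makes no call and keeps the default of 0.

	/* device with a flushable write cache and FUA support */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);

	/* device with a write cache but no FUA */
	blk_queue_flush(q, REQ_FLUSH);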
816
797 static int __init blk_settings_init(void) 817 static int __init blk_settings_init(void)
798 { 818 {
799 blk_max_low_pfn = max_low_pfn - 1; 819 blk_max_low_pfn = max_low_pfn - 1;
800 blk_max_pfn = max_pfn - 1; 820 blk_max_pfn = max_pfn - 1;
801 return 0; 821 return 0;
802 } 822 }
803 subsys_initcall(blk_settings_init); 823 subsys_initcall(blk_settings_init);
804 824
drivers/block/brd.c
1 /* 1 /*
2 * Ram backed block device driver. 2 * Ram backed block device driver.
3 * 3 *
4 * Copyright (C) 2007 Nick Piggin 4 * Copyright (C) 2007 Nick Piggin
5 * Copyright (C) 2007 Novell Inc. 5 * Copyright (C) 2007 Novell Inc.
6 * 6 *
7 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright 7 * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
8 * of their respective owners. 8 * of their respective owners.
9 */ 9 */
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/moduleparam.h> 13 #include <linux/moduleparam.h>
14 #include <linux/major.h> 14 #include <linux/major.h>
15 #include <linux/blkdev.h> 15 #include <linux/blkdev.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/highmem.h> 17 #include <linux/highmem.h>
18 #include <linux/smp_lock.h> 18 #include <linux/smp_lock.h>
19 #include <linux/radix-tree.h> 19 #include <linux/radix-tree.h>
20 #include <linux/buffer_head.h> /* invalidate_bh_lrus() */ 20 #include <linux/buffer_head.h> /* invalidate_bh_lrus() */
21 #include <linux/slab.h> 21 #include <linux/slab.h>
22 22
23 #include <asm/uaccess.h> 23 #include <asm/uaccess.h>
24 24
25 #define SECTOR_SHIFT 9 25 #define SECTOR_SHIFT 9
26 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT) 26 #define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
27 #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT) 27 #define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
28 28
29 /* 29 /*
30 * Each block ramdisk device has a radix_tree brd_pages of pages that stores 30 * Each block ramdisk device has a radix_tree brd_pages of pages that stores
31 * the pages containing the block device's contents. A brd page's ->index is 31 * the pages containing the block device's contents. A brd page's ->index is
32 * its offset in PAGE_SIZE units. This is similar to, but in no way connected 32 * its offset in PAGE_SIZE units. This is similar to, but in no way connected
33 * with, the kernel's pagecache or buffer cache (which sit above our block 33 * with, the kernel's pagecache or buffer cache (which sit above our block
34 * device). 34 * device).
35 */ 35 */
36 struct brd_device { 36 struct brd_device {
37 int brd_number; 37 int brd_number;
38 int brd_refcnt; 38 int brd_refcnt;
39 loff_t brd_offset; 39 loff_t brd_offset;
40 loff_t brd_sizelimit; 40 loff_t brd_sizelimit;
41 unsigned brd_blocksize; 41 unsigned brd_blocksize;
42 42
43 struct request_queue *brd_queue; 43 struct request_queue *brd_queue;
44 struct gendisk *brd_disk; 44 struct gendisk *brd_disk;
45 struct list_head brd_list; 45 struct list_head brd_list;
46 46
47 /* 47 /*
48 * Backing store of pages and lock to protect it. This is the contents 48 * Backing store of pages and lock to protect it. This is the contents
49 * of the block device. 49 * of the block device.
50 */ 50 */
51 spinlock_t brd_lock; 51 spinlock_t brd_lock;
52 struct radix_tree_root brd_pages; 52 struct radix_tree_root brd_pages;
53 }; 53 };
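As a quick worked example of the indexing scheme (assuming 4 KiB pages, so PAGE_SECTORS_SHIFT is 12 - 9 = 3): a request for sector 10000 is served from the brd page at radix-tree index 10000 >> 3 = 1250, and that page's ->index field holds 1250 as well.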
54 54
55 /* 55 /*
56 * Look up and return a brd's page for a given sector. 56 * Look up and return a brd's page for a given sector.
57 */ 57 */
58 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector) 58 static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
59 { 59 {
60 pgoff_t idx; 60 pgoff_t idx;
61 struct page *page; 61 struct page *page;
62 62
63 /* 63 /*
64 * The page lifetime is protected by the fact that we have opened the 64 * The page lifetime is protected by the fact that we have opened the
65 * device node -- brd pages will never be deleted under us, so we 65 * device node -- brd pages will never be deleted under us, so we
66 * don't need any further locking or refcounting. 66 * don't need any further locking or refcounting.
67 * 67 *
68 * This is strictly true for the radix-tree nodes as well (i.e. we 68 * This is strictly true for the radix-tree nodes as well (i.e. we
69 * don't actually need the rcu_read_lock()); however, that is not a 69 * don't actually need the rcu_read_lock()); however, that is not a
70 * documented feature of the radix-tree API so it is better to be 70 * documented feature of the radix-tree API so it is better to be
71 * safe here (we don't have total exclusion from radix tree updates 71 * safe here (we don't have total exclusion from radix tree updates
72 * here, only deletes). 72 * here, only deletes).
73 */ 73 */
74 rcu_read_lock(); 74 rcu_read_lock();
75 idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */ 75 idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
76 page = radix_tree_lookup(&brd->brd_pages, idx); 76 page = radix_tree_lookup(&brd->brd_pages, idx);
77 rcu_read_unlock(); 77 rcu_read_unlock();
78 78
79 BUG_ON(page && page->index != idx); 79 BUG_ON(page && page->index != idx);
80 80
81 return page; 81 return page;
82 } 82 }
83 83
84 /* 84 /*
85 * Look up and return a brd's page for a given sector. 85 * Look up and return a brd's page for a given sector.
86 * If one does not exist, allocate an empty page, and insert that. Then 86 * If one does not exist, allocate an empty page, and insert that. Then
87 * return it. 87 * return it.
88 */ 88 */
89 static struct page *brd_insert_page(struct brd_device *brd, sector_t sector) 89 static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
90 { 90 {
91 pgoff_t idx; 91 pgoff_t idx;
92 struct page *page; 92 struct page *page;
93 gfp_t gfp_flags; 93 gfp_t gfp_flags;
94 94
95 page = brd_lookup_page(brd, sector); 95 page = brd_lookup_page(brd, sector);
96 if (page) 96 if (page)
97 return page; 97 return page;
98 98
99 /* 99 /*
100 * Must use NOIO because we don't want to recurse back into the 100 * Must use NOIO because we don't want to recurse back into the
101 * block or filesystem layers from page reclaim. 101 * block or filesystem layers from page reclaim.
102 * 102 *
103 * Cannot support XIP and highmem, because our ->direct_access 103 * Cannot support XIP and highmem, because our ->direct_access
104 * routine for XIP must return memory that is always addressable. 104 * routine for XIP must return memory that is always addressable.
105 * If XIP was reworked to use pfns and kmap throughout, this 105 * If XIP was reworked to use pfns and kmap throughout, this
106 * restriction might be able to be lifted. 106 * restriction might be able to be lifted.
107 */ 107 */
108 gfp_flags = GFP_NOIO | __GFP_ZERO; 108 gfp_flags = GFP_NOIO | __GFP_ZERO;
109 #ifndef CONFIG_BLK_DEV_XIP 109 #ifndef CONFIG_BLK_DEV_XIP
110 gfp_flags |= __GFP_HIGHMEM; 110 gfp_flags |= __GFP_HIGHMEM;
111 #endif 111 #endif
112 page = alloc_page(gfp_flags); 112 page = alloc_page(gfp_flags);
113 if (!page) 113 if (!page)
114 return NULL; 114 return NULL;
115 115
116 if (radix_tree_preload(GFP_NOIO)) { 116 if (radix_tree_preload(GFP_NOIO)) {
117 __free_page(page); 117 __free_page(page);
118 return NULL; 118 return NULL;
119 } 119 }
120 120
121 spin_lock(&brd->brd_lock); 121 spin_lock(&brd->brd_lock);
122 idx = sector >> PAGE_SECTORS_SHIFT; 122 idx = sector >> PAGE_SECTORS_SHIFT;
123 if (radix_tree_insert(&brd->brd_pages, idx, page)) { 123 if (radix_tree_insert(&brd->brd_pages, idx, page)) {
124 __free_page(page); 124 __free_page(page);
125 page = radix_tree_lookup(&brd->brd_pages, idx); 125 page = radix_tree_lookup(&brd->brd_pages, idx);
126 BUG_ON(!page); 126 BUG_ON(!page);
127 BUG_ON(page->index != idx); 127 BUG_ON(page->index != idx);
128 } else 128 } else
129 page->index = idx; 129 page->index = idx;
130 spin_unlock(&brd->brd_lock); 130 spin_unlock(&brd->brd_lock);
131 131
132 radix_tree_preload_end(); 132 radix_tree_preload_end();
133 133
134 return page; 134 return page;
135 } 135 }
136 136
137 static void brd_free_page(struct brd_device *brd, sector_t sector) 137 static void brd_free_page(struct brd_device *brd, sector_t sector)
138 { 138 {
139 struct page *page; 139 struct page *page;
140 pgoff_t idx; 140 pgoff_t idx;
141 141
142 spin_lock(&brd->brd_lock); 142 spin_lock(&brd->brd_lock);
143 idx = sector >> PAGE_SECTORS_SHIFT; 143 idx = sector >> PAGE_SECTORS_SHIFT;
144 page = radix_tree_delete(&brd->brd_pages, idx); 144 page = radix_tree_delete(&brd->brd_pages, idx);
145 spin_unlock(&brd->brd_lock); 145 spin_unlock(&brd->brd_lock);
146 if (page) 146 if (page)
147 __free_page(page); 147 __free_page(page);
148 } 148 }
149 149
150 static void brd_zero_page(struct brd_device *brd, sector_t sector) 150 static void brd_zero_page(struct brd_device *brd, sector_t sector)
151 { 151 {
152 struct page *page; 152 struct page *page;
153 153
154 page = brd_lookup_page(brd, sector); 154 page = brd_lookup_page(brd, sector);
155 if (page) 155 if (page)
156 clear_highpage(page); 156 clear_highpage(page);
157 } 157 }
158 158
159 /* 159 /*
160 * Free all backing store pages and radix tree. This must only be called when 160 * Free all backing store pages and radix tree. This must only be called when
161 * there are no other users of the device. 161 * there are no other users of the device.
162 */ 162 */
163 #define FREE_BATCH 16 163 #define FREE_BATCH 16
164 static void brd_free_pages(struct brd_device *brd) 164 static void brd_free_pages(struct brd_device *brd)
165 { 165 {
166 unsigned long pos = 0; 166 unsigned long pos = 0;
167 struct page *pages[FREE_BATCH]; 167 struct page *pages[FREE_BATCH];
168 int nr_pages; 168 int nr_pages;
169 169
170 do { 170 do {
171 int i; 171 int i;
172 172
173 nr_pages = radix_tree_gang_lookup(&brd->brd_pages, 173 nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
174 (void **)pages, pos, FREE_BATCH); 174 (void **)pages, pos, FREE_BATCH);
175 175
176 for (i = 0; i < nr_pages; i++) { 176 for (i = 0; i < nr_pages; i++) {
177 void *ret; 177 void *ret;
178 178
179 BUG_ON(pages[i]->index < pos); 179 BUG_ON(pages[i]->index < pos);
180 pos = pages[i]->index; 180 pos = pages[i]->index;
181 ret = radix_tree_delete(&brd->brd_pages, pos); 181 ret = radix_tree_delete(&brd->brd_pages, pos);
182 BUG_ON(!ret || ret != pages[i]); 182 BUG_ON(!ret || ret != pages[i]);
183 __free_page(pages[i]); 183 __free_page(pages[i]);
184 } 184 }
185 185
186 pos++; 186 pos++;
187 187
188 /* 188 /*
189 * This assumes radix_tree_gang_lookup always returns as 189 * This assumes radix_tree_gang_lookup always returns as
190 * many pages as possible. If the radix-tree code changes, 190 * many pages as possible. If the radix-tree code changes,
191 * this will have to change as well. 191 * this will have to change as well.
192 */ 192 */
193 } while (nr_pages == FREE_BATCH); 193 } while (nr_pages == FREE_BATCH);
194 } 194 }
195 195
196 /* 196 /*
197 * copy_to_brd_setup must be called before copy_to_brd. It may sleep. 197 * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
198 */ 198 */
199 static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n) 199 static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
200 { 200 {
201 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; 201 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
202 size_t copy; 202 size_t copy;
203 203
204 copy = min_t(size_t, n, PAGE_SIZE - offset); 204 copy = min_t(size_t, n, PAGE_SIZE - offset);
205 if (!brd_insert_page(brd, sector)) 205 if (!brd_insert_page(brd, sector))
206 return -ENOMEM; 206 return -ENOMEM;
207 if (copy < n) { 207 if (copy < n) {
208 sector += copy >> SECTOR_SHIFT; 208 sector += copy >> SECTOR_SHIFT;
209 if (!brd_insert_page(brd, sector)) 209 if (!brd_insert_page(brd, sector))
210 return -ENOMEM; 210 return -ENOMEM;
211 } 211 }
212 return 0; 212 return 0;
213 } 213 }
214 214
215 static void discard_from_brd(struct brd_device *brd, 215 static void discard_from_brd(struct brd_device *brd,
216 sector_t sector, size_t n) 216 sector_t sector, size_t n)
217 { 217 {
218 while (n >= PAGE_SIZE) { 218 while (n >= PAGE_SIZE) {
219 /* 219 /*
220 * Don't want to actually discard pages here because 220 * Don't want to actually discard pages here because
221 * re-allocating the pages can result in writeback 221 * re-allocating the pages can result in writeback
222 * deadlocks under heavy load. 222 * deadlocks under heavy load.
223 */ 223 */
224 if (0) 224 if (0)
225 brd_free_page(brd, sector); 225 brd_free_page(brd, sector);
226 else 226 else
227 brd_zero_page(brd, sector); 227 brd_zero_page(brd, sector);
228 sector += PAGE_SIZE >> SECTOR_SHIFT; 228 sector += PAGE_SIZE >> SECTOR_SHIFT;
229 n -= PAGE_SIZE; 229 n -= PAGE_SIZE;
230 } 230 }
231 } 231 }
232 232
233 /* 233 /*
234 * Copy n bytes from src to the brd starting at sector. Does not sleep. 234 * Copy n bytes from src to the brd starting at sector. Does not sleep.
235 */ 235 */
236 static void copy_to_brd(struct brd_device *brd, const void *src, 236 static void copy_to_brd(struct brd_device *brd, const void *src,
237 sector_t sector, size_t n) 237 sector_t sector, size_t n)
238 { 238 {
239 struct page *page; 239 struct page *page;
240 void *dst; 240 void *dst;
241 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; 241 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
242 size_t copy; 242 size_t copy;
243 243
244 copy = min_t(size_t, n, PAGE_SIZE - offset); 244 copy = min_t(size_t, n, PAGE_SIZE - offset);
245 page = brd_lookup_page(brd, sector); 245 page = brd_lookup_page(brd, sector);
246 BUG_ON(!page); 246 BUG_ON(!page);
247 247
248 dst = kmap_atomic(page, KM_USER1); 248 dst = kmap_atomic(page, KM_USER1);
249 memcpy(dst + offset, src, copy); 249 memcpy(dst + offset, src, copy);
250 kunmap_atomic(dst, KM_USER1); 250 kunmap_atomic(dst, KM_USER1);
251 251
252 if (copy < n) { 252 if (copy < n) {
253 src += copy; 253 src += copy;
254 sector += copy >> SECTOR_SHIFT; 254 sector += copy >> SECTOR_SHIFT;
255 copy = n - copy; 255 copy = n - copy;
256 page = brd_lookup_page(brd, sector); 256 page = brd_lookup_page(brd, sector);
257 BUG_ON(!page); 257 BUG_ON(!page);
258 258
259 dst = kmap_atomic(page, KM_USER1); 259 dst = kmap_atomic(page, KM_USER1);
260 memcpy(dst, src, copy); 260 memcpy(dst, src, copy);
261 kunmap_atomic(dst, KM_USER1); 261 kunmap_atomic(dst, KM_USER1);
262 } 262 }
263 } 263 }
264 264
265 /* 265 /*
266 * Copy n bytes to dst from the brd starting at sector. Does not sleep. 266 * Copy n bytes to dst from the brd starting at sector. Does not sleep.
267 */ 267 */
268 static void copy_from_brd(void *dst, struct brd_device *brd, 268 static void copy_from_brd(void *dst, struct brd_device *brd,
269 sector_t sector, size_t n) 269 sector_t sector, size_t n)
270 { 270 {
271 struct page *page; 271 struct page *page;
272 void *src; 272 void *src;
273 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT; 273 unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
274 size_t copy; 274 size_t copy;
275 275
276 copy = min_t(size_t, n, PAGE_SIZE - offset); 276 copy = min_t(size_t, n, PAGE_SIZE - offset);
277 page = brd_lookup_page(brd, sector); 277 page = brd_lookup_page(brd, sector);
278 if (page) { 278 if (page) {
279 src = kmap_atomic(page, KM_USER1); 279 src = kmap_atomic(page, KM_USER1);
280 memcpy(dst, src + offset, copy); 280 memcpy(dst, src + offset, copy);
281 kunmap_atomic(src, KM_USER1); 281 kunmap_atomic(src, KM_USER1);
282 } else 282 } else
283 memset(dst, 0, copy); 283 memset(dst, 0, copy);
284 284
285 if (copy < n) { 285 if (copy < n) {
286 dst += copy; 286 dst += copy;
287 sector += copy >> SECTOR_SHIFT; 287 sector += copy >> SECTOR_SHIFT;
288 copy = n - copy; 288 copy = n - copy;
289 page = brd_lookup_page(brd, sector); 289 page = brd_lookup_page(brd, sector);
290 if (page) { 290 if (page) {
291 src = kmap_atomic(page, KM_USER1); 291 src = kmap_atomic(page, KM_USER1);
292 memcpy(dst, src, copy); 292 memcpy(dst, src, copy);
293 kunmap_atomic(src, KM_USER1); 293 kunmap_atomic(src, KM_USER1);
294 } else 294 } else
295 memset(dst, 0, copy); 295 memset(dst, 0, copy);
296 } 296 }
297 } 297 }
298 298
299 /* 299 /*
300 * Process a single bvec of a bio. 300 * Process a single bvec of a bio.
301 */ 301 */
302 static int brd_do_bvec(struct brd_device *brd, struct page *page, 302 static int brd_do_bvec(struct brd_device *brd, struct page *page,
303 unsigned int len, unsigned int off, int rw, 303 unsigned int len, unsigned int off, int rw,
304 sector_t sector) 304 sector_t sector)
305 { 305 {
306 void *mem; 306 void *mem;
307 int err = 0; 307 int err = 0;
308 308
309 if (rw != READ) { 309 if (rw != READ) {
310 err = copy_to_brd_setup(brd, sector, len); 310 err = copy_to_brd_setup(brd, sector, len);
311 if (err) 311 if (err)
312 goto out; 312 goto out;
313 } 313 }
314 314
315 mem = kmap_atomic(page, KM_USER0); 315 mem = kmap_atomic(page, KM_USER0);
316 if (rw == READ) { 316 if (rw == READ) {
317 copy_from_brd(mem + off, brd, sector, len); 317 copy_from_brd(mem + off, brd, sector, len);
318 flush_dcache_page(page); 318 flush_dcache_page(page);
319 } else { 319 } else {
320 flush_dcache_page(page); 320 flush_dcache_page(page);
321 copy_to_brd(brd, mem + off, sector, len); 321 copy_to_brd(brd, mem + off, sector, len);
322 } 322 }
323 kunmap_atomic(mem, KM_USER0); 323 kunmap_atomic(mem, KM_USER0);
324 324
325 out: 325 out:
326 return err; 326 return err;
327 } 327 }
328 328
329 static int brd_make_request(struct request_queue *q, struct bio *bio) 329 static int brd_make_request(struct request_queue *q, struct bio *bio)
330 { 330 {
331 struct block_device *bdev = bio->bi_bdev; 331 struct block_device *bdev = bio->bi_bdev;
332 struct brd_device *brd = bdev->bd_disk->private_data; 332 struct brd_device *brd = bdev->bd_disk->private_data;
333 int rw; 333 int rw;
334 struct bio_vec *bvec; 334 struct bio_vec *bvec;
335 sector_t sector; 335 sector_t sector;
336 int i; 336 int i;
337 int err = -EIO; 337 int err = -EIO;
338 338
339 sector = bio->bi_sector; 339 sector = bio->bi_sector;
340 if (sector + (bio->bi_size >> SECTOR_SHIFT) > 340 if (sector + (bio->bi_size >> SECTOR_SHIFT) >
341 get_capacity(bdev->bd_disk)) 341 get_capacity(bdev->bd_disk))
342 goto out; 342 goto out;
343 343
344 if (unlikely(bio->bi_rw & REQ_DISCARD)) { 344 if (unlikely(bio->bi_rw & REQ_DISCARD)) {
345 err = 0; 345 err = 0;
346 discard_from_brd(brd, sector, bio->bi_size); 346 discard_from_brd(brd, sector, bio->bi_size);
347 goto out; 347 goto out;
348 } 348 }
349 349
350 rw = bio_rw(bio); 350 rw = bio_rw(bio);
351 if (rw == READA) 351 if (rw == READA)
352 rw = READ; 352 rw = READ;
353 353
354 bio_for_each_segment(bvec, bio, i) { 354 bio_for_each_segment(bvec, bio, i) {
355 unsigned int len = bvec->bv_len; 355 unsigned int len = bvec->bv_len;
356 err = brd_do_bvec(brd, bvec->bv_page, len, 356 err = brd_do_bvec(brd, bvec->bv_page, len,
357 bvec->bv_offset, rw, sector); 357 bvec->bv_offset, rw, sector);
358 if (err) 358 if (err)
359 break; 359 break;
360 sector += len >> SECTOR_SHIFT; 360 sector += len >> SECTOR_SHIFT;
361 } 361 }
362 362
363 out: 363 out:
364 bio_endio(bio, err); 364 bio_endio(bio, err);
365 365
366 return 0; 366 return 0;
367 } 367 }
368 368
369 #ifdef CONFIG_BLK_DEV_XIP 369 #ifdef CONFIG_BLK_DEV_XIP
370 static int brd_direct_access(struct block_device *bdev, sector_t sector, 370 static int brd_direct_access(struct block_device *bdev, sector_t sector,
371 void **kaddr, unsigned long *pfn) 371 void **kaddr, unsigned long *pfn)
372 { 372 {
373 struct brd_device *brd = bdev->bd_disk->private_data; 373 struct brd_device *brd = bdev->bd_disk->private_data;
374 struct page *page; 374 struct page *page;
375 375
376 if (!brd) 376 if (!brd)
377 return -ENODEV; 377 return -ENODEV;
378 if (sector & (PAGE_SECTORS-1)) 378 if (sector & (PAGE_SECTORS-1))
379 return -EINVAL; 379 return -EINVAL;
380 if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk)) 380 if (sector + PAGE_SECTORS > get_capacity(bdev->bd_disk))
381 return -ERANGE; 381 return -ERANGE;
382 page = brd_insert_page(brd, sector); 382 page = brd_insert_page(brd, sector);
383 if (!page) 383 if (!page)
384 return -ENOMEM; 384 return -ENOMEM;
385 *kaddr = page_address(page); 385 *kaddr = page_address(page);
386 *pfn = page_to_pfn(page); 386 *pfn = page_to_pfn(page);
387 387
388 return 0; 388 return 0;
389 } 389 }
390 #endif 390 #endif
391 391
392 static int brd_ioctl(struct block_device *bdev, fmode_t mode, 392 static int brd_ioctl(struct block_device *bdev, fmode_t mode,
393 unsigned int cmd, unsigned long arg) 393 unsigned int cmd, unsigned long arg)
394 { 394 {
395 int error; 395 int error;
396 struct brd_device *brd = bdev->bd_disk->private_data; 396 struct brd_device *brd = bdev->bd_disk->private_data;
397 397
398 if (cmd != BLKFLSBUF) 398 if (cmd != BLKFLSBUF)
399 return -ENOTTY; 399 return -ENOTTY;
400 400
401 /* 401 /*
402 * ram device BLKFLSBUF has special semantics, we want to actually 402 * ram device BLKFLSBUF has special semantics, we want to actually
403 * release and destroy the ramdisk data. 403 * release and destroy the ramdisk data.
404 */ 404 */
405 lock_kernel(); 405 lock_kernel();
406 mutex_lock(&bdev->bd_mutex); 406 mutex_lock(&bdev->bd_mutex);
407 error = -EBUSY; 407 error = -EBUSY;
408 if (bdev->bd_openers <= 1) { 408 if (bdev->bd_openers <= 1) {
409 /* 409 /*
410 * Invalidate the cache first, so it isn't written 410 * Invalidate the cache first, so it isn't written
411 * back to the device. 411 * back to the device.
412 * 412 *
413 * Another thread might instantiate more buffercache here, 413 * Another thread might instantiate more buffercache here,
414 * but there is not much we can do to close that race. 414 * but there is not much we can do to close that race.
415 */ 415 */
416 invalidate_bh_lrus(); 416 invalidate_bh_lrus();
417 truncate_inode_pages(bdev->bd_inode->i_mapping, 0); 417 truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
418 brd_free_pages(brd); 418 brd_free_pages(brd);
419 error = 0; 419 error = 0;
420 } 420 }
421 mutex_unlock(&bdev->bd_mutex); 421 mutex_unlock(&bdev->bd_mutex);
422 unlock_kernel(); 422 unlock_kernel();
423 423
424 return error; 424 return error;
425 } 425 }
426 426
427 static const struct block_device_operations brd_fops = { 427 static const struct block_device_operations brd_fops = {
428 .owner = THIS_MODULE, 428 .owner = THIS_MODULE,
429 .ioctl = brd_ioctl, 429 .ioctl = brd_ioctl,
430 #ifdef CONFIG_BLK_DEV_XIP 430 #ifdef CONFIG_BLK_DEV_XIP
431 .direct_access = brd_direct_access, 431 .direct_access = brd_direct_access,
432 #endif 432 #endif
433 }; 433 };
434 434
435 /* 435 /*
436 * And now the modules code and kernel interface. 436 * And now the modules code and kernel interface.
437 */ 437 */
438 static int rd_nr; 438 static int rd_nr;
439 int rd_size = CONFIG_BLK_DEV_RAM_SIZE; 439 int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
440 static int max_part; 440 static int max_part;
441 static int part_shift; 441 static int part_shift;
442 module_param(rd_nr, int, 0); 442 module_param(rd_nr, int, 0);
443 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); 443 MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
444 module_param(rd_size, int, 0); 444 module_param(rd_size, int, 0);
445 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); 445 MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
446 module_param(max_part, int, 0); 446 module_param(max_part, int, 0);
447 MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); 447 MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk");
448 MODULE_LICENSE("GPL"); 448 MODULE_LICENSE("GPL");
449 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); 449 MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
450 MODULE_ALIAS("rd"); 450 MODULE_ALIAS("rd");
451 451
452 #ifndef MODULE 452 #ifndef MODULE
453 /* Legacy boot options - nonmodular */ 453 /* Legacy boot options - nonmodular */
454 static int __init ramdisk_size(char *str) 454 static int __init ramdisk_size(char *str)
455 { 455 {
456 rd_size = simple_strtol(str, NULL, 0); 456 rd_size = simple_strtol(str, NULL, 0);
457 return 1; 457 return 1;
458 } 458 }
459 __setup("ramdisk_size=", ramdisk_size); 459 __setup("ramdisk_size=", ramdisk_size);
460 #endif 460 #endif
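For illustration only (values made up): loading the module with rd_nr=2 and rd_size=16384 creates /dev/ram0 and /dev/ram1 of 16 MiB each and, per the comment in brd_init() below, makes 2 a hard limit on the number of devices.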
461 461
462 /* 462 /*
463 * The device scheme is derived from loop.c. Keep them in sync where possible 463 * The device scheme is derived from loop.c. Keep them in sync where possible
464 * (should share code eventually). 464 * (should share code eventually).
465 */ 465 */
466 static LIST_HEAD(brd_devices); 466 static LIST_HEAD(brd_devices);
467 static DEFINE_MUTEX(brd_devices_mutex); 467 static DEFINE_MUTEX(brd_devices_mutex);
468 468
469 static struct brd_device *brd_alloc(int i) 469 static struct brd_device *brd_alloc(int i)
470 { 470 {
471 struct brd_device *brd; 471 struct brd_device *brd;
472 struct gendisk *disk; 472 struct gendisk *disk;
473 473
474 brd = kzalloc(sizeof(*brd), GFP_KERNEL); 474 brd = kzalloc(sizeof(*brd), GFP_KERNEL);
475 if (!brd) 475 if (!brd)
476 goto out; 476 goto out;
477 brd->brd_number = i; 477 brd->brd_number = i;
478 spin_lock_init(&brd->brd_lock); 478 spin_lock_init(&brd->brd_lock);
479 INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC); 479 INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
480 480
481 brd->brd_queue = blk_alloc_queue(GFP_KERNEL); 481 brd->brd_queue = blk_alloc_queue(GFP_KERNEL);
482 if (!brd->brd_queue) 482 if (!brd->brd_queue)
483 goto out_free_dev; 483 goto out_free_dev;
484 blk_queue_make_request(brd->brd_queue, brd_make_request); 484 blk_queue_make_request(brd->brd_queue, brd_make_request);
485 blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_DRAIN);
486 blk_queue_max_hw_sectors(brd->brd_queue, 1024); 485 blk_queue_max_hw_sectors(brd->brd_queue, 1024);
487 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY); 486 blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
488 487
489 brd->brd_queue->limits.discard_granularity = PAGE_SIZE; 488 brd->brd_queue->limits.discard_granularity = PAGE_SIZE;
490 brd->brd_queue->limits.max_discard_sectors = UINT_MAX; 489 brd->brd_queue->limits.max_discard_sectors = UINT_MAX;
491 brd->brd_queue->limits.discard_zeroes_data = 1; 490 brd->brd_queue->limits.discard_zeroes_data = 1;
492 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); 491 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue);
493 492
494 disk = brd->brd_disk = alloc_disk(1 << part_shift); 493 disk = brd->brd_disk = alloc_disk(1 << part_shift);
495 if (!disk) 494 if (!disk)
496 goto out_free_queue; 495 goto out_free_queue;
497 disk->major = RAMDISK_MAJOR; 496 disk->major = RAMDISK_MAJOR;
498 disk->first_minor = i << part_shift; 497 disk->first_minor = i << part_shift;
499 disk->fops = &brd_fops; 498 disk->fops = &brd_fops;
500 disk->private_data = brd; 499 disk->private_data = brd;
501 disk->queue = brd->brd_queue; 500 disk->queue = brd->brd_queue;
502 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; 501 disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
503 sprintf(disk->disk_name, "ram%d", i); 502 sprintf(disk->disk_name, "ram%d", i);
504 set_capacity(disk, rd_size * 2); 503 set_capacity(disk, rd_size * 2);
505 504
506 return brd; 505 return brd;
507 506
508 out_free_queue: 507 out_free_queue:
509 blk_cleanup_queue(brd->brd_queue); 508 blk_cleanup_queue(brd->brd_queue);
510 out_free_dev: 509 out_free_dev:
511 kfree(brd); 510 kfree(brd);
512 out: 511 out:
513 return NULL; 512 return NULL;
514 } 513 }
515 514
516 static void brd_free(struct brd_device *brd) 515 static void brd_free(struct brd_device *brd)
517 { 516 {
518 put_disk(brd->brd_disk); 517 put_disk(brd->brd_disk);
519 blk_cleanup_queue(brd->brd_queue); 518 blk_cleanup_queue(brd->brd_queue);
520 brd_free_pages(brd); 519 brd_free_pages(brd);
521 kfree(brd); 520 kfree(brd);
522 } 521 }
523 522
524 static struct brd_device *brd_init_one(int i) 523 static struct brd_device *brd_init_one(int i)
525 { 524 {
526 struct brd_device *brd; 525 struct brd_device *brd;
527 526
528 list_for_each_entry(brd, &brd_devices, brd_list) { 527 list_for_each_entry(brd, &brd_devices, brd_list) {
529 if (brd->brd_number == i) 528 if (brd->brd_number == i)
530 goto out; 529 goto out;
531 } 530 }
532 531
533 brd = brd_alloc(i); 532 brd = brd_alloc(i);
534 if (brd) { 533 if (brd) {
535 add_disk(brd->brd_disk); 534 add_disk(brd->brd_disk);
536 list_add_tail(&brd->brd_list, &brd_devices); 535 list_add_tail(&brd->brd_list, &brd_devices);
537 } 536 }
538 out: 537 out:
539 return brd; 538 return brd;
540 } 539 }
541 540
542 static void brd_del_one(struct brd_device *brd) 541 static void brd_del_one(struct brd_device *brd)
543 { 542 {
544 list_del(&brd->brd_list); 543 list_del(&brd->brd_list);
545 del_gendisk(brd->brd_disk); 544 del_gendisk(brd->brd_disk);
546 brd_free(brd); 545 brd_free(brd);
547 } 546 }
548 547
549 static struct kobject *brd_probe(dev_t dev, int *part, void *data) 548 static struct kobject *brd_probe(dev_t dev, int *part, void *data)
550 { 549 {
551 struct brd_device *brd; 550 struct brd_device *brd;
552 struct kobject *kobj; 551 struct kobject *kobj;
553 552
554 mutex_lock(&brd_devices_mutex); 553 mutex_lock(&brd_devices_mutex);
555 brd = brd_init_one(dev & MINORMASK); 554 brd = brd_init_one(dev & MINORMASK);
556 kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); 555 kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM);
557 mutex_unlock(&brd_devices_mutex); 556 mutex_unlock(&brd_devices_mutex);
558 557
559 *part = 0; 558 *part = 0;
560 return kobj; 559 return kobj;
561 } 560 }
562 561
563 static int __init brd_init(void) 562 static int __init brd_init(void)
564 { 563 {
565 int i, nr; 564 int i, nr;
566 unsigned long range; 565 unsigned long range;
567 struct brd_device *brd, *next; 566 struct brd_device *brd, *next;
568 567
569 /* 568 /*
570 * brd module now has a feature to instantiate underlying device 569 * brd module now has a feature to instantiate underlying device
571 * structure on-demand, provided that there is an access dev node. 570 * structure on-demand, provided that there is an access dev node.
572 * However, this will not work well with user space tools that don't 571 * However, this will not work well with user space tools that don't
573 * know about such a "feature". In order not to break any existing 572 * know about such a "feature". In order not to break any existing
574 * tools, we do the following: 573 * tools, we do the following:
575 * 574 *
576 * (1) if rd_nr is specified, create that many upfront, and this 575 * (1) if rd_nr is specified, create that many upfront, and this
577 * also becomes a hard limit. 576 * also becomes a hard limit.
578 * (2) if rd_nr is not specified, create 1 rd device on module 577 * (2) if rd_nr is not specified, create 1 rd device on module
579 * load; users can further extend the brd devices by creating dev nodes 578 * load; users can further extend the brd devices by creating dev nodes
580 * themselves and have the kernel automatically instantiate the actual 579 * themselves and have the kernel automatically instantiate the actual
581 * device on-demand. 580 * device on-demand.
582 */ 581 */
583 582
584 part_shift = 0; 583 part_shift = 0;
585 if (max_part > 0) 584 if (max_part > 0)
586 part_shift = fls(max_part); 585 part_shift = fls(max_part);
587 586
588 if (rd_nr > 1UL << (MINORBITS - part_shift)) 587 if (rd_nr > 1UL << (MINORBITS - part_shift))
589 return -EINVAL; 588 return -EINVAL;
590 589
591 if (rd_nr) { 590 if (rd_nr) {
592 nr = rd_nr; 591 nr = rd_nr;
593 range = rd_nr; 592 range = rd_nr;
594 } else { 593 } else {
595 nr = CONFIG_BLK_DEV_RAM_COUNT; 594 nr = CONFIG_BLK_DEV_RAM_COUNT;
596 range = 1UL << (MINORBITS - part_shift); 595 range = 1UL << (MINORBITS - part_shift);
597 } 596 }
598 597
599 if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) 598 if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
600 return -EIO; 599 return -EIO;
601 600
602 for (i = 0; i < nr; i++) { 601 for (i = 0; i < nr; i++) {
603 brd = brd_alloc(i); 602 brd = brd_alloc(i);
604 if (!brd) 603 if (!brd)
605 goto out_free; 604 goto out_free;
606 list_add_tail(&brd->brd_list, &brd_devices); 605 list_add_tail(&brd->brd_list, &brd_devices);
607 } 606 }
608 607
609 /* point of no return */ 608 /* point of no return */
610 609
611 list_for_each_entry(brd, &brd_devices, brd_list) 610 list_for_each_entry(brd, &brd_devices, brd_list)
612 add_disk(brd->brd_disk); 611 add_disk(brd->brd_disk);
613 612
614 blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range, 613 blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
615 THIS_MODULE, brd_probe, NULL, NULL); 614 THIS_MODULE, brd_probe, NULL, NULL);
616 615
617 printk(KERN_INFO "brd: module loaded\n"); 616 printk(KERN_INFO "brd: module loaded\n");
618 return 0; 617 return 0;
619 618
620 out_free: 619 out_free:
621 list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { 620 list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
622 list_del(&brd->brd_list); 621 list_del(&brd->brd_list);
623 brd_free(brd); 622 brd_free(brd);
624 } 623 }
625 unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); 624 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
626 625
627 return -ENOMEM; 626 return -ENOMEM;
628 } 627 }
629 628
630 static void __exit brd_exit(void) 629 static void __exit brd_exit(void)
631 { 630 {
632 unsigned long range; 631 unsigned long range;
633 struct brd_device *brd, *next; 632 struct brd_device *brd, *next;
634 633
635 range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift); 634 range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift);
636 635
637 list_for_each_entry_safe(brd, next, &brd_devices, brd_list) 636 list_for_each_entry_safe(brd, next, &brd_devices, brd_list)
638 brd_del_one(brd); 637 brd_del_one(brd);
639 638
640 blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range); 639 blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
641 unregister_blkdev(RAMDISK_MAJOR, "ramdisk"); 640 unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
642 } 641 }
643 642
644 module_init(brd_init); 643 module_init(brd_init);
645 module_exit(brd_exit); 644 module_exit(brd_exit);
646 645
647 646
drivers/block/loop.c
1 /* 1 /*
2 * linux/drivers/block/loop.c 2 * linux/drivers/block/loop.c
3 * 3 *
4 * Written by Theodore Ts'o, 3/29/93 4 * Written by Theodore Ts'o, 3/29/93
5 * 5 *
6 * Copyright 1993 by Theodore Ts'o. Redistribution of this file is 6 * Copyright 1993 by Theodore Ts'o. Redistribution of this file is
7 * permitted under the GNU General Public License. 7 * permitted under the GNU General Public License.
8 * 8 *
9 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993 9 * DES encryption plus some minor changes by Werner Almesberger, 30-MAY-1993
10 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996 10 * more DES encryption plus IDEA encryption by Nicholas J. Leon, June 20, 1996
11 * 11 *
12 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994 12 * Modularized and updated for 1.1.16 kernel - Mitch Dsouza 28th May 1994
13 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996 13 * Adapted for 1.3.59 kernel - Andries Brouwer, 1 Feb 1996
14 * 14 *
15 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997 15 * Fixed do_loop_request() re-entrancy - Vincent.Renardias@waw.com Mar 20, 1997
16 * 16 *
17 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998 17 * Added devfs support - Richard Gooch <rgooch@atnf.csiro.au> 16-Jan-1998
18 * 18 *
19 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998 19 * Handle sparse backing files correctly - Kenn Humborg, Jun 28, 1998
20 * 20 *
21 * Loadable modules and other fixes by AK, 1998 21 * Loadable modules and other fixes by AK, 1998
22 * 22 *
23 * Make real block number available to downstream transfer functions, enables 23 * Make real block number available to downstream transfer functions, enables
24 * CBC (and relatives) mode encryption requiring unique IVs per data block. 24 * CBC (and relatives) mode encryption requiring unique IVs per data block.
25 * Reed H. Petty, rhp@draper.net 25 * Reed H. Petty, rhp@draper.net
26 * 26 *
27 * Maximum number of loop devices now dynamic via max_loop module parameter. 27 * Maximum number of loop devices now dynamic via max_loop module parameter.
28 * Russell Kroll <rkroll@exploits.org> 19990701 28 * Russell Kroll <rkroll@exploits.org> 19990701
29 * 29 *
30 * Maximum number of loop devices when compiled-in now selectable by passing 30 * Maximum number of loop devices when compiled-in now selectable by passing
31 * max_loop=<1-255> to the kernel on boot. 31 * max_loop=<1-255> to the kernel on boot.
32 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999 32 * Erik I. Bolsø, <eriki@himolde.no>, Oct 31, 1999
33 * 33 *
34 * Completely rewrite request handling to be make_request_fn style and 34 * Completely rewrite request handling to be make_request_fn style and
35 * non blocking, pushing work to a helper thread. Lots of fixes from 35 * non blocking, pushing work to a helper thread. Lots of fixes from
36 * Al Viro too. 36 * Al Viro too.
37 * Jens Axboe <axboe@suse.de>, Nov 2000 37 * Jens Axboe <axboe@suse.de>, Nov 2000
38 * 38 *
39 * Support up to 256 loop devices 39 * Support up to 256 loop devices
40 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002 40 * Heinz Mauelshagen <mge@sistina.com>, Feb 2002
41 * 41 *
42 * Support for falling back on the write file operation when the address space 42 * Support for falling back on the write file operation when the address space
43 * operations write_begin is not available on the backing filesystem. 43 * operations write_begin is not available on the backing filesystem.
44 * Anton Altaparmakov, 16 Feb 2005 44 * Anton Altaparmakov, 16 Feb 2005
45 * 45 *
46 * Still To Fix: 46 * Still To Fix:
47 * - Advisory locking is ignored here. 47 * - Advisory locking is ignored here.
48 * - Should use its own CAP_* category instead of CAP_SYS_ADMIN 48 * - Should use its own CAP_* category instead of CAP_SYS_ADMIN
49 * 49 *
50 */ 50 */
51 51
52 #include <linux/module.h> 52 #include <linux/module.h>
53 #include <linux/moduleparam.h> 53 #include <linux/moduleparam.h>
54 #include <linux/sched.h> 54 #include <linux/sched.h>
55 #include <linux/fs.h> 55 #include <linux/fs.h>
56 #include <linux/file.h> 56 #include <linux/file.h>
57 #include <linux/stat.h> 57 #include <linux/stat.h>
58 #include <linux/errno.h> 58 #include <linux/errno.h>
59 #include <linux/major.h> 59 #include <linux/major.h>
60 #include <linux/wait.h> 60 #include <linux/wait.h>
61 #include <linux/blkdev.h> 61 #include <linux/blkdev.h>
62 #include <linux/blkpg.h> 62 #include <linux/blkpg.h>
63 #include <linux/init.h> 63 #include <linux/init.h>
64 #include <linux/swap.h> 64 #include <linux/swap.h>
65 #include <linux/slab.h> 65 #include <linux/slab.h>
66 #include <linux/loop.h> 66 #include <linux/loop.h>
67 #include <linux/compat.h> 67 #include <linux/compat.h>
68 #include <linux/suspend.h> 68 #include <linux/suspend.h>
69 #include <linux/freezer.h> 69 #include <linux/freezer.h>
70 #include <linux/smp_lock.h> 70 #include <linux/smp_lock.h>
71 #include <linux/writeback.h> 71 #include <linux/writeback.h>
72 #include <linux/buffer_head.h> /* for invalidate_bdev() */ 72 #include <linux/buffer_head.h> /* for invalidate_bdev() */
73 #include <linux/completion.h> 73 #include <linux/completion.h>
74 #include <linux/highmem.h> 74 #include <linux/highmem.h>
75 #include <linux/kthread.h> 75 #include <linux/kthread.h>
76 #include <linux/splice.h> 76 #include <linux/splice.h>
77 77
78 #include <asm/uaccess.h> 78 #include <asm/uaccess.h>
79 79
80 static LIST_HEAD(loop_devices); 80 static LIST_HEAD(loop_devices);
81 static DEFINE_MUTEX(loop_devices_mutex); 81 static DEFINE_MUTEX(loop_devices_mutex);
82 82
83 static int max_part; 83 static int max_part;
84 static int part_shift; 84 static int part_shift;
85 85
86 /* 86 /*
87 * Transfer functions 87 * Transfer functions
88 */ 88 */
89 static int transfer_none(struct loop_device *lo, int cmd, 89 static int transfer_none(struct loop_device *lo, int cmd,
90 struct page *raw_page, unsigned raw_off, 90 struct page *raw_page, unsigned raw_off,
91 struct page *loop_page, unsigned loop_off, 91 struct page *loop_page, unsigned loop_off,
92 int size, sector_t real_block) 92 int size, sector_t real_block)
93 { 93 {
94 char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; 94 char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
95 char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; 95 char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
96 96
97 if (cmd == READ) 97 if (cmd == READ)
98 memcpy(loop_buf, raw_buf, size); 98 memcpy(loop_buf, raw_buf, size);
99 else 99 else
100 memcpy(raw_buf, loop_buf, size); 100 memcpy(raw_buf, loop_buf, size);
101 101
102 kunmap_atomic(raw_buf, KM_USER0); 102 kunmap_atomic(raw_buf, KM_USER0);
103 kunmap_atomic(loop_buf, KM_USER1); 103 kunmap_atomic(loop_buf, KM_USER1);
104 cond_resched(); 104 cond_resched();
105 return 0; 105 return 0;
106 } 106 }
107 107
108 static int transfer_xor(struct loop_device *lo, int cmd, 108 static int transfer_xor(struct loop_device *lo, int cmd,
109 struct page *raw_page, unsigned raw_off, 109 struct page *raw_page, unsigned raw_off,
110 struct page *loop_page, unsigned loop_off, 110 struct page *loop_page, unsigned loop_off,
111 int size, sector_t real_block) 111 int size, sector_t real_block)
112 { 112 {
113 char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; 113 char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
114 char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; 114 char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
115 char *in, *out, *key; 115 char *in, *out, *key;
116 int i, keysize; 116 int i, keysize;
117 117
118 if (cmd == READ) { 118 if (cmd == READ) {
119 in = raw_buf; 119 in = raw_buf;
120 out = loop_buf; 120 out = loop_buf;
121 } else { 121 } else {
122 in = loop_buf; 122 in = loop_buf;
123 out = raw_buf; 123 out = raw_buf;
124 } 124 }
125 125
126 key = lo->lo_encrypt_key; 126 key = lo->lo_encrypt_key;
127 keysize = lo->lo_encrypt_key_size; 127 keysize = lo->lo_encrypt_key_size;
128 for (i = 0; i < size; i++) 128 for (i = 0; i < size; i++)
129 *out++ = *in++ ^ key[(i & 511) % keysize]; 129 *out++ = *in++ ^ key[(i & 511) % keysize];
130 130
131 kunmap_atomic(raw_buf, KM_USER0); 131 kunmap_atomic(raw_buf, KM_USER0);
132 kunmap_atomic(loop_buf, KM_USER1); 132 kunmap_atomic(loop_buf, KM_USER1);
133 cond_resched(); 133 cond_resched();
134 return 0; 134 return 0;
135 } 135 }
136 136
137 static int xor_init(struct loop_device *lo, const struct loop_info64 *info) 137 static int xor_init(struct loop_device *lo, const struct loop_info64 *info)
138 { 138 {
139 if (unlikely(info->lo_encrypt_key_size <= 0)) 139 if (unlikely(info->lo_encrypt_key_size <= 0))
140 return -EINVAL; 140 return -EINVAL;
141 return 0; 141 return 0;
142 } 142 }
143 143
144 static struct loop_func_table none_funcs = { 144 static struct loop_func_table none_funcs = {
145 .number = LO_CRYPT_NONE, 145 .number = LO_CRYPT_NONE,
146 .transfer = transfer_none, 146 .transfer = transfer_none,
147 }; 147 };
148 148
149 static struct loop_func_table xor_funcs = { 149 static struct loop_func_table xor_funcs = {
150 .number = LO_CRYPT_XOR, 150 .number = LO_CRYPT_XOR,
151 .transfer = transfer_xor, 151 .transfer = transfer_xor,
152 .init = xor_init 152 .init = xor_init
153 }; 153 };
154 154
155 /* xfer_funcs[0] is special - its release function is never called */ 155 /* xfer_funcs[0] is special - its release function is never called */
156 static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = { 156 static struct loop_func_table *xfer_funcs[MAX_LO_CRYPT] = {
157 &none_funcs, 157 &none_funcs,
158 &xor_funcs 158 &xor_funcs
159 }; 159 };
160 160
161 static loff_t get_loop_size(struct loop_device *lo, struct file *file) 161 static loff_t get_loop_size(struct loop_device *lo, struct file *file)
162 { 162 {
163 loff_t size, offset, loopsize; 163 loff_t size, offset, loopsize;
164 164
165 /* Compute loopsize in bytes */ 165 /* Compute loopsize in bytes */
166 size = i_size_read(file->f_mapping->host); 166 size = i_size_read(file->f_mapping->host);
167 offset = lo->lo_offset; 167 offset = lo->lo_offset;
168 loopsize = size - offset; 168 loopsize = size - offset;
169 if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize) 169 if (lo->lo_sizelimit > 0 && lo->lo_sizelimit < loopsize)
170 loopsize = lo->lo_sizelimit; 170 loopsize = lo->lo_sizelimit;
171 171
172 /* 172 /*
173 * Unfortunately, if we want to do I/O on the device, 173 * Unfortunately, if we want to do I/O on the device,
174 * the number of 512-byte sectors has to fit into a sector_t. 174 * the number of 512-byte sectors has to fit into a sector_t.
175 */ 175 */
176 return loopsize >> 9; 176 return loopsize >> 9;
177 } 177 }
178 178
179 static int 179 static int
180 figure_loop_size(struct loop_device *lo) 180 figure_loop_size(struct loop_device *lo)
181 { 181 {
182 loff_t size = get_loop_size(lo, lo->lo_backing_file); 182 loff_t size = get_loop_size(lo, lo->lo_backing_file);
183 sector_t x = (sector_t)size; 183 sector_t x = (sector_t)size;
184 184
185 if (unlikely((loff_t)x != size)) 185 if (unlikely((loff_t)x != size))
186 return -EFBIG; 186 return -EFBIG;
187 187
188 set_capacity(lo->lo_disk, x); 188 set_capacity(lo->lo_disk, x);
189 return 0; 189 return 0;
190 } 190 }
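A quick worked example of the arithmetic above (numbers illustrative): with a 1 GiB backing file, lo_offset of 4096 and no lo_sizelimit, loopsize is 1073741824 - 4096 = 1073737728 bytes, so get_loop_size() returns 1073737728 >> 9 = 2097144 sectors, which figure_loop_size() then feeds to set_capacity().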
191 191
192 static inline int 192 static inline int
193 lo_do_transfer(struct loop_device *lo, int cmd, 193 lo_do_transfer(struct loop_device *lo, int cmd,
194 struct page *rpage, unsigned roffs, 194 struct page *rpage, unsigned roffs,
195 struct page *lpage, unsigned loffs, 195 struct page *lpage, unsigned loffs,
196 int size, sector_t rblock) 196 int size, sector_t rblock)
197 { 197 {
198 if (unlikely(!lo->transfer)) 198 if (unlikely(!lo->transfer))
199 return 0; 199 return 0;
200 200
201 return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock); 201 return lo->transfer(lo, cmd, rpage, roffs, lpage, loffs, size, rblock);
202 } 202 }
203 203
204 /** 204 /**
205 * do_lo_send_aops - helper for writing data to a loop device 205 * do_lo_send_aops - helper for writing data to a loop device
206 * 206 *
207 * This is the fast version for backing filesystems which implement the address 207 * This is the fast version for backing filesystems which implement the address
208 * space operations write_begin and write_end. 208 * space operations write_begin and write_end.
209 */ 209 */
210 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec, 210 static int do_lo_send_aops(struct loop_device *lo, struct bio_vec *bvec,
211 loff_t pos, struct page *unused) 211 loff_t pos, struct page *unused)
212 { 212 {
213 struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */ 213 struct file *file = lo->lo_backing_file; /* kudos to NFsckingS */
214 struct address_space *mapping = file->f_mapping; 214 struct address_space *mapping = file->f_mapping;
215 pgoff_t index; 215 pgoff_t index;
216 unsigned offset, bv_offs; 216 unsigned offset, bv_offs;
217 int len, ret; 217 int len, ret;
218 218
219 mutex_lock(&mapping->host->i_mutex); 219 mutex_lock(&mapping->host->i_mutex);
220 index = pos >> PAGE_CACHE_SHIFT; 220 index = pos >> PAGE_CACHE_SHIFT;
221 offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1); 221 offset = pos & ((pgoff_t)PAGE_CACHE_SIZE - 1);
222 bv_offs = bvec->bv_offset; 222 bv_offs = bvec->bv_offset;
223 len = bvec->bv_len; 223 len = bvec->bv_len;
224 while (len > 0) { 224 while (len > 0) {
225 sector_t IV; 225 sector_t IV;
226 unsigned size, copied; 226 unsigned size, copied;
227 int transfer_result; 227 int transfer_result;
228 struct page *page; 228 struct page *page;
229 void *fsdata; 229 void *fsdata;
230 230
231 IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9); 231 IV = ((sector_t)index << (PAGE_CACHE_SHIFT - 9))+(offset >> 9);
232 size = PAGE_CACHE_SIZE - offset; 232 size = PAGE_CACHE_SIZE - offset;
233 if (size > len) 233 if (size > len)
234 size = len; 234 size = len;
235 235
236 ret = pagecache_write_begin(file, mapping, pos, size, 0, 236 ret = pagecache_write_begin(file, mapping, pos, size, 0,
237 &page, &fsdata); 237 &page, &fsdata);
238 if (ret) 238 if (ret)
239 goto fail; 239 goto fail;
240 240
241 file_update_time(file); 241 file_update_time(file);
242 242
243 transfer_result = lo_do_transfer(lo, WRITE, page, offset, 243 transfer_result = lo_do_transfer(lo, WRITE, page, offset,
244 bvec->bv_page, bv_offs, size, IV); 244 bvec->bv_page, bv_offs, size, IV);
245 copied = size; 245 copied = size;
246 if (unlikely(transfer_result)) 246 if (unlikely(transfer_result))
247 copied = 0; 247 copied = 0;
248 248
249 ret = pagecache_write_end(file, mapping, pos, size, copied, 249 ret = pagecache_write_end(file, mapping, pos, size, copied,
250 page, fsdata); 250 page, fsdata);
251 if (ret < 0 || ret != copied) 251 if (ret < 0 || ret != copied)
252 goto fail; 252 goto fail;
253 253
254 if (unlikely(transfer_result)) 254 if (unlikely(transfer_result))
255 goto fail; 255 goto fail;
256 256
257 bv_offs += copied; 257 bv_offs += copied;
258 len -= copied; 258 len -= copied;
259 offset = 0; 259 offset = 0;
260 index++; 260 index++;
261 pos += copied; 261 pos += copied;
262 } 262 }
263 ret = 0; 263 ret = 0;
264 out: 264 out:
265 mutex_unlock(&mapping->host->i_mutex); 265 mutex_unlock(&mapping->host->i_mutex);
266 return ret; 266 return ret;
267 fail: 267 fail:
268 ret = -1; 268 ret = -1;
269 goto out; 269 goto out;
270 } 270 }
271 271
272 /** 272 /**
273 * __do_lo_send_write - helper for writing data to a loop device 273 * __do_lo_send_write - helper for writing data to a loop device
274 * 274 *
275 * This helper just factors out common code between do_lo_send_direct_write() 275 * This helper just factors out common code between do_lo_send_direct_write()
276 * and do_lo_send_write(). 276 * and do_lo_send_write().
277 */ 277 */
278 static int __do_lo_send_write(struct file *file, 278 static int __do_lo_send_write(struct file *file,
279 u8 *buf, const int len, loff_t pos) 279 u8 *buf, const int len, loff_t pos)
280 { 280 {
281 ssize_t bw; 281 ssize_t bw;
282 mm_segment_t old_fs = get_fs(); 282 mm_segment_t old_fs = get_fs();
283 283
284 set_fs(get_ds()); 284 set_fs(get_ds());
285 bw = file->f_op->write(file, buf, len, &pos); 285 bw = file->f_op->write(file, buf, len, &pos);
286 set_fs(old_fs); 286 set_fs(old_fs);
287 if (likely(bw == len)) 287 if (likely(bw == len))
288 return 0; 288 return 0;
289 printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n", 289 printk(KERN_ERR "loop: Write error at byte offset %llu, length %i.\n",
290 (unsigned long long)pos, len); 290 (unsigned long long)pos, len);
291 if (bw >= 0) 291 if (bw >= 0)
292 bw = -EIO; 292 bw = -EIO;
293 return bw; 293 return bw;
294 } 294 }
295 295
296 /** 296 /**
297 * do_lo_send_direct_write - helper for writing data to a loop device 297 * do_lo_send_direct_write - helper for writing data to a loop device
298 * 298 *
299 * This is the fast, non-transforming version for backing filesystems which do 299 * This is the fast, non-transforming version for backing filesystems which do
300 * not implement the address space operations write_begin and write_end. 300 * not implement the address space operations write_begin and write_end.
301 * It uses the write file operation which should be present on all writeable 301 * It uses the write file operation which should be present on all writeable
302 * filesystems. 302 * filesystems.
303 */ 303 */
304 static int do_lo_send_direct_write(struct loop_device *lo, 304 static int do_lo_send_direct_write(struct loop_device *lo,
305 struct bio_vec *bvec, loff_t pos, struct page *page) 305 struct bio_vec *bvec, loff_t pos, struct page *page)
306 { 306 {
307 ssize_t bw = __do_lo_send_write(lo->lo_backing_file, 307 ssize_t bw = __do_lo_send_write(lo->lo_backing_file,
308 kmap(bvec->bv_page) + bvec->bv_offset, 308 kmap(bvec->bv_page) + bvec->bv_offset,
309 bvec->bv_len, pos); 309 bvec->bv_len, pos);
310 kunmap(bvec->bv_page); 310 kunmap(bvec->bv_page);
311 cond_resched(); 311 cond_resched();
312 return bw; 312 return bw;
313 } 313 }
314 314
315 /** 315 /**
316 * do_lo_send_write - helper for writing data to a loop device 316 * do_lo_send_write - helper for writing data to a loop device
317 * 317 *
318 * This is the slow, transforming version for filesystems which do not 318 * This is the slow, transforming version for filesystems which do not
319 * implement the address space operations write_begin and write_end. It 319 * implement the address space operations write_begin and write_end. It
320 * uses the write file operation which should be present on all writeable 320 * uses the write file operation which should be present on all writeable
321 * filesystems. 321 * filesystems.
322 * 322 *
323 * Using fops->write is slower than using aops->{prepare,commit}_write in the 323 * Using fops->write is slower than using aops->{prepare,commit}_write in the
324 * transforming case because we need to double buffer the data as we cannot do 324 * transforming case because we need to double buffer the data as we cannot do
325 * the transformations in place as we do not have direct access to the 325 * the transformations in place as we do not have direct access to the
326 * destination pages of the backing file. 326 * destination pages of the backing file.
327 */ 327 */
328 static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec, 328 static int do_lo_send_write(struct loop_device *lo, struct bio_vec *bvec,
329 loff_t pos, struct page *page) 329 loff_t pos, struct page *page)
330 { 330 {
331 int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page, 331 int ret = lo_do_transfer(lo, WRITE, page, 0, bvec->bv_page,
332 bvec->bv_offset, bvec->bv_len, pos >> 9); 332 bvec->bv_offset, bvec->bv_len, pos >> 9);
333 if (likely(!ret)) 333 if (likely(!ret))
334 return __do_lo_send_write(lo->lo_backing_file, 334 return __do_lo_send_write(lo->lo_backing_file,
335 page_address(page), bvec->bv_len, 335 page_address(page), bvec->bv_len,
336 pos); 336 pos);
337 printk(KERN_ERR "loop: Transfer error at byte offset %llu, " 337 printk(KERN_ERR "loop: Transfer error at byte offset %llu, "
338 "length %i.\n", (unsigned long long)pos, bvec->bv_len); 338 "length %i.\n", (unsigned long long)pos, bvec->bv_len);
339 if (ret > 0) 339 if (ret > 0)
340 ret = -EIO; 340 ret = -EIO;
341 return ret; 341 return ret;
342 } 342 }
343 343
344 static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos) 344 static int lo_send(struct loop_device *lo, struct bio *bio, loff_t pos)
345 { 345 {
346 int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t, 346 int (*do_lo_send)(struct loop_device *, struct bio_vec *, loff_t,
347 struct page *page); 347 struct page *page);
348 struct bio_vec *bvec; 348 struct bio_vec *bvec;
349 struct page *page = NULL; 349 struct page *page = NULL;
350 int i, ret = 0; 350 int i, ret = 0;
351 351
352 do_lo_send = do_lo_send_aops; 352 do_lo_send = do_lo_send_aops;
353 if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) { 353 if (!(lo->lo_flags & LO_FLAGS_USE_AOPS)) {
354 do_lo_send = do_lo_send_direct_write; 354 do_lo_send = do_lo_send_direct_write;
355 if (lo->transfer != transfer_none) { 355 if (lo->transfer != transfer_none) {
356 page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); 356 page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
357 if (unlikely(!page)) 357 if (unlikely(!page))
358 goto fail; 358 goto fail;
359 kmap(page); 359 kmap(page);
360 do_lo_send = do_lo_send_write; 360 do_lo_send = do_lo_send_write;
361 } 361 }
362 } 362 }
363 bio_for_each_segment(bvec, bio, i) { 363 bio_for_each_segment(bvec, bio, i) {
364 ret = do_lo_send(lo, bvec, pos, page); 364 ret = do_lo_send(lo, bvec, pos, page);
365 if (ret < 0) 365 if (ret < 0)
366 break; 366 break;
367 pos += bvec->bv_len; 367 pos += bvec->bv_len;
368 } 368 }
369 if (page) { 369 if (page) {
370 kunmap(page); 370 kunmap(page);
371 __free_page(page); 371 __free_page(page);
372 } 372 }
373 out: 373 out:
374 return ret; 374 return ret;
375 fail: 375 fail:
376 printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n"); 376 printk(KERN_ERR "loop: Failed to allocate temporary page for write.\n");
377 ret = -ENOMEM; 377 ret = -ENOMEM;
378 goto out; 378 goto out;
379 } 379 }
380 380
381 struct lo_read_data { 381 struct lo_read_data {
382 struct loop_device *lo; 382 struct loop_device *lo;
383 struct page *page; 383 struct page *page;
384 unsigned offset; 384 unsigned offset;
385 int bsize; 385 int bsize;
386 }; 386 };
387 387
388 static int 388 static int
389 lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf, 389 lo_splice_actor(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
390 struct splice_desc *sd) 390 struct splice_desc *sd)
391 { 391 {
392 struct lo_read_data *p = sd->u.data; 392 struct lo_read_data *p = sd->u.data;
393 struct loop_device *lo = p->lo; 393 struct loop_device *lo = p->lo;
394 struct page *page = buf->page; 394 struct page *page = buf->page;
395 sector_t IV; 395 sector_t IV;
396 int size, ret; 396 int size, ret;
397 397
398 ret = buf->ops->confirm(pipe, buf); 398 ret = buf->ops->confirm(pipe, buf);
399 if (unlikely(ret)) 399 if (unlikely(ret))
400 return ret; 400 return ret;
401 401
402 IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) + 402 IV = ((sector_t) page->index << (PAGE_CACHE_SHIFT - 9)) +
403 (buf->offset >> 9); 403 (buf->offset >> 9);
404 size = sd->len; 404 size = sd->len;
405 if (size > p->bsize) 405 if (size > p->bsize)
406 size = p->bsize; 406 size = p->bsize;
407 407
408 if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) { 408 if (lo_do_transfer(lo, READ, page, buf->offset, p->page, p->offset, size, IV)) {
409 printk(KERN_ERR "loop: transfer error block %ld\n", 409 printk(KERN_ERR "loop: transfer error block %ld\n",
410 page->index); 410 page->index);
411 size = -EINVAL; 411 size = -EINVAL;
412 } 412 }
413 413
414 flush_dcache_page(p->page); 414 flush_dcache_page(p->page);
415 415
416 if (size > 0) 416 if (size > 0)
417 p->offset += size; 417 p->offset += size;
418 418
419 return size; 419 return size;
420 } 420 }
421 421
422 static int 422 static int
423 lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd) 423 lo_direct_splice_actor(struct pipe_inode_info *pipe, struct splice_desc *sd)
424 { 424 {
425 return __splice_from_pipe(pipe, sd, lo_splice_actor); 425 return __splice_from_pipe(pipe, sd, lo_splice_actor);
426 } 426 }
427 427
428 static int 428 static int
429 do_lo_receive(struct loop_device *lo, 429 do_lo_receive(struct loop_device *lo,
430 struct bio_vec *bvec, int bsize, loff_t pos) 430 struct bio_vec *bvec, int bsize, loff_t pos)
431 { 431 {
432 struct lo_read_data cookie; 432 struct lo_read_data cookie;
433 struct splice_desc sd; 433 struct splice_desc sd;
434 struct file *file; 434 struct file *file;
435 long retval; 435 long retval;
436 436
437 cookie.lo = lo; 437 cookie.lo = lo;
438 cookie.page = bvec->bv_page; 438 cookie.page = bvec->bv_page;
439 cookie.offset = bvec->bv_offset; 439 cookie.offset = bvec->bv_offset;
440 cookie.bsize = bsize; 440 cookie.bsize = bsize;
441 441
442 sd.len = 0; 442 sd.len = 0;
443 sd.total_len = bvec->bv_len; 443 sd.total_len = bvec->bv_len;
444 sd.flags = 0; 444 sd.flags = 0;
445 sd.pos = pos; 445 sd.pos = pos;
446 sd.u.data = &cookie; 446 sd.u.data = &cookie;
447 447
448 file = lo->lo_backing_file; 448 file = lo->lo_backing_file;
449 retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor); 449 retval = splice_direct_to_actor(file, &sd, lo_direct_splice_actor);
450 450
451 if (retval < 0) 451 if (retval < 0)
452 return retval; 452 return retval;
453 453
454 return 0; 454 return 0;
455 } 455 }
456 456
457 static int 457 static int
458 lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos) 458 lo_receive(struct loop_device *lo, struct bio *bio, int bsize, loff_t pos)
459 { 459 {
460 struct bio_vec *bvec; 460 struct bio_vec *bvec;
461 int i, ret = 0; 461 int i, ret = 0;
462 462
463 bio_for_each_segment(bvec, bio, i) { 463 bio_for_each_segment(bvec, bio, i) {
464 ret = do_lo_receive(lo, bvec, bsize, pos); 464 ret = do_lo_receive(lo, bvec, bsize, pos);
465 if (ret < 0) 465 if (ret < 0)
466 break; 466 break;
467 pos += bvec->bv_len; 467 pos += bvec->bv_len;
468 } 468 }
469 return ret; 469 return ret;
470 } 470 }
471 471
472 static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) 472 static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
473 { 473 {
474 loff_t pos; 474 loff_t pos;
475 int ret; 475 int ret;
476 476
477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset; 477 pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
478 478
479 if (bio_rw(bio) == WRITE) { 479 if (bio_rw(bio) == WRITE) {
480 bool barrier = (bio->bi_rw & REQ_HARDBARRIER); 480 bool barrier = (bio->bi_rw & REQ_HARDBARRIER);
481 struct file *file = lo->lo_backing_file; 481 struct file *file = lo->lo_backing_file;
482 482
483 if (barrier) { 483 if (barrier) {
484 if (unlikely(!file->f_op->fsync)) { 484 if (unlikely(!file->f_op->fsync)) {
485 ret = -EOPNOTSUPP; 485 ret = -EOPNOTSUPP;
486 goto out; 486 goto out;
487 } 487 }
488 488
489 ret = vfs_fsync(file, 0); 489 ret = vfs_fsync(file, 0);
490 if (unlikely(ret)) { 490 if (unlikely(ret)) {
491 ret = -EIO; 491 ret = -EIO;
492 goto out; 492 goto out;
493 } 493 }
494 } 494 }
495 495
496 ret = lo_send(lo, bio, pos); 496 ret = lo_send(lo, bio, pos);
497 497
498 if (barrier && !ret) { 498 if (barrier && !ret) {
499 ret = vfs_fsync(file, 0); 499 ret = vfs_fsync(file, 0);
500 if (unlikely(ret)) 500 if (unlikely(ret))
501 ret = -EIO; 501 ret = -EIO;
502 } 502 }
503 } else 503 } else
504 ret = lo_receive(lo, bio, lo->lo_blocksize, pos); 504 ret = lo_receive(lo, bio, lo->lo_blocksize, pos);
505 505
506 out: 506 out:
507 return ret; 507 return ret;
508 } 508 }
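
For context (not part of this patch): a minimal, hedged sketch of the kind of submission that lands in the barrier branch above. A caller tags a write bio with REQ_HARDBARRIER, and do_bio_filebacked() then emulates the ordering with the vfs_fsync() calls around lo_send(). The helper name is hypothetical, and the sketch assumes the kernel headers already pulled in by this file.

	/* hypothetical submitter sketch -- bio is assumed fully initialized */
	static void submit_barrier_write(struct bio *bio)
	{
		/* ask the driver for flush-before/flush-after semantics */
		bio->bi_rw |= REQ_HARDBARRIER;
		submit_bio(WRITE, bio);	/* submit_bio() ORs WRITE into bi_rw */
	}

REQ_HARDBARRIER itself is on the way out, so treat this purely as an illustration of the path that still exists here.
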
509 509
510 /* 510 /*
511 * Add bio to back of pending list 511 * Add bio to back of pending list
512 */ 512 */
513 static void loop_add_bio(struct loop_device *lo, struct bio *bio) 513 static void loop_add_bio(struct loop_device *lo, struct bio *bio)
514 { 514 {
515 bio_list_add(&lo->lo_bio_list, bio); 515 bio_list_add(&lo->lo_bio_list, bio);
516 } 516 }
517 517
518 /* 518 /*
519 * Grab first pending buffer 519 * Grab first pending buffer
520 */ 520 */
521 static struct bio *loop_get_bio(struct loop_device *lo) 521 static struct bio *loop_get_bio(struct loop_device *lo)
522 { 522 {
523 return bio_list_pop(&lo->lo_bio_list); 523 return bio_list_pop(&lo->lo_bio_list);
524 } 524 }
525 525
526 static int loop_make_request(struct request_queue *q, struct bio *old_bio) 526 static int loop_make_request(struct request_queue *q, struct bio *old_bio)
527 { 527 {
528 struct loop_device *lo = q->queuedata; 528 struct loop_device *lo = q->queuedata;
529 int rw = bio_rw(old_bio); 529 int rw = bio_rw(old_bio);
530 530
531 if (rw == READA) 531 if (rw == READA)
532 rw = READ; 532 rw = READ;
533 533
534 BUG_ON(!lo || (rw != READ && rw != WRITE)); 534 BUG_ON(!lo || (rw != READ && rw != WRITE));
535 535
536 spin_lock_irq(&lo->lo_lock); 536 spin_lock_irq(&lo->lo_lock);
537 if (lo->lo_state != Lo_bound) 537 if (lo->lo_state != Lo_bound)
538 goto out; 538 goto out;
539 if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY))) 539 if (unlikely(rw == WRITE && (lo->lo_flags & LO_FLAGS_READ_ONLY)))
540 goto out; 540 goto out;
541 loop_add_bio(lo, old_bio); 541 loop_add_bio(lo, old_bio);
542 wake_up(&lo->lo_event); 542 wake_up(&lo->lo_event);
543 spin_unlock_irq(&lo->lo_lock); 543 spin_unlock_irq(&lo->lo_lock);
544 return 0; 544 return 0;
545 545
546 out: 546 out:
547 spin_unlock_irq(&lo->lo_lock); 547 spin_unlock_irq(&lo->lo_lock);
548 bio_io_error(old_bio); 548 bio_io_error(old_bio);
549 return 0; 549 return 0;
550 } 550 }
551 551
552 /* 552 /*
553 * kick off io on the underlying address space 553 * kick off io on the underlying address space
554 */ 554 */
555 static void loop_unplug(struct request_queue *q) 555 static void loop_unplug(struct request_queue *q)
556 { 556 {
557 struct loop_device *lo = q->queuedata; 557 struct loop_device *lo = q->queuedata;
558 558
559 queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q); 559 queue_flag_clear_unlocked(QUEUE_FLAG_PLUGGED, q);
560 blk_run_address_space(lo->lo_backing_file->f_mapping); 560 blk_run_address_space(lo->lo_backing_file->f_mapping);
561 } 561 }
562 562
563 struct switch_request { 563 struct switch_request {
564 struct file *file; 564 struct file *file;
565 struct completion wait; 565 struct completion wait;
566 }; 566 };
567 567
568 static void do_loop_switch(struct loop_device *, struct switch_request *); 568 static void do_loop_switch(struct loop_device *, struct switch_request *);
569 569
570 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio) 570 static inline void loop_handle_bio(struct loop_device *lo, struct bio *bio)
571 { 571 {
572 if (unlikely(!bio->bi_bdev)) { 572 if (unlikely(!bio->bi_bdev)) {
573 do_loop_switch(lo, bio->bi_private); 573 do_loop_switch(lo, bio->bi_private);
574 bio_put(bio); 574 bio_put(bio);
575 } else { 575 } else {
576 int ret = do_bio_filebacked(lo, bio); 576 int ret = do_bio_filebacked(lo, bio);
577 bio_endio(bio, ret); 577 bio_endio(bio, ret);
578 } 578 }
579 } 579 }
580 580
581 /* 581 /*
582 * worker thread that handles reads/writes to file-backed loop devices, 582 * worker thread that handles reads/writes to file-backed loop devices,
583 * to avoid blocking in our make_request_fn. It also does loop decrypting 583 * to avoid blocking in our make_request_fn. It also does loop decrypting
584 * on reads for block-backed loop, as that is too heavy to do from 584 * on reads for block-backed loop, as that is too heavy to do from
585 * b_end_io context where irqs may be disabled. 585 * b_end_io context where irqs may be disabled.
586 * 586 *
587 * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before 587 * Loop explanation: loop_clr_fd() sets lo_state to Lo_rundown before
588 * calling kthread_stop(). Once kthread_should_stop() is true, 588 * calling kthread_stop(). Once kthread_should_stop() is true,
589 * make_request will not place any more requests. Therefore, once 589 * make_request will not place any more requests. Therefore, once
590 * kthread_should_stop() is true and lo_bio_list is empty, we are 590 * kthread_should_stop() is true and lo_bio_list is empty, we are
591 * done with the loop. 591 * done with the loop.
592 */ 592 */
593 static int loop_thread(void *data) 593 static int loop_thread(void *data)
594 { 594 {
595 struct loop_device *lo = data; 595 struct loop_device *lo = data;
596 struct bio *bio; 596 struct bio *bio;
597 597
598 set_user_nice(current, -20); 598 set_user_nice(current, -20);
599 599
600 while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { 600 while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) {
601 601
602 wait_event_interruptible(lo->lo_event, 602 wait_event_interruptible(lo->lo_event,
603 !bio_list_empty(&lo->lo_bio_list) || 603 !bio_list_empty(&lo->lo_bio_list) ||
604 kthread_should_stop()); 604 kthread_should_stop());
605 605
606 if (bio_list_empty(&lo->lo_bio_list)) 606 if (bio_list_empty(&lo->lo_bio_list))
607 continue; 607 continue;
608 spin_lock_irq(&lo->lo_lock); 608 spin_lock_irq(&lo->lo_lock);
609 bio = loop_get_bio(lo); 609 bio = loop_get_bio(lo);
610 spin_unlock_irq(&lo->lo_lock); 610 spin_unlock_irq(&lo->lo_lock);
611 611
612 BUG_ON(!bio); 612 BUG_ON(!bio);
613 loop_handle_bio(lo, bio); 613 loop_handle_bio(lo, bio);
614 } 614 }
615 615
616 return 0; 616 return 0;
617 } 617 }
618 618
619 /* 619 /*
620 * loop_switch performs the hard work of switching a backing store. 620 * loop_switch performs the hard work of switching a backing store.
621 * First it needs to flush existing IO; it does this by sending a magic 621 * First it needs to flush existing IO; it does this by sending a magic
622 * BIO down the pipe. The completion of this BIO does the actual switch. 622 * BIO down the pipe. The completion of this BIO does the actual switch.
623 */ 623 */
624 static int loop_switch(struct loop_device *lo, struct file *file) 624 static int loop_switch(struct loop_device *lo, struct file *file)
625 { 625 {
626 struct switch_request w; 626 struct switch_request w;
627 struct bio *bio = bio_alloc(GFP_KERNEL, 0); 627 struct bio *bio = bio_alloc(GFP_KERNEL, 0);
628 if (!bio) 628 if (!bio)
629 return -ENOMEM; 629 return -ENOMEM;
630 init_completion(&w.wait); 630 init_completion(&w.wait);
631 w.file = file; 631 w.file = file;
632 bio->bi_private = &w; 632 bio->bi_private = &w;
633 bio->bi_bdev = NULL; 633 bio->bi_bdev = NULL;
634 loop_make_request(lo->lo_queue, bio); 634 loop_make_request(lo->lo_queue, bio);
635 wait_for_completion(&w.wait); 635 wait_for_completion(&w.wait);
636 return 0; 636 return 0;
637 } 637 }
638 638
639 /* 639 /*
640 * Helper to flush the IOs in the loop device while keeping the loop thread running 640 * Helper to flush the IOs in the loop device while keeping the loop thread running
641 */ 641 */
642 static int loop_flush(struct loop_device *lo) 642 static int loop_flush(struct loop_device *lo)
643 { 643 {
644 /* loop not yet configured, no running thread, nothing to flush */ 644 /* loop not yet configured, no running thread, nothing to flush */
645 if (!lo->lo_thread) 645 if (!lo->lo_thread)
646 return 0; 646 return 0;
647 647
648 return loop_switch(lo, NULL); 648 return loop_switch(lo, NULL);
649 } 649 }
650 650
651 /* 651 /*
652 * Do the actual switch; called from the BIO completion routine 652 * Do the actual switch; called from the BIO completion routine
653 */ 653 */
654 static void do_loop_switch(struct loop_device *lo, struct switch_request *p) 654 static void do_loop_switch(struct loop_device *lo, struct switch_request *p)
655 { 655 {
656 struct file *file = p->file; 656 struct file *file = p->file;
657 struct file *old_file = lo->lo_backing_file; 657 struct file *old_file = lo->lo_backing_file;
658 struct address_space *mapping; 658 struct address_space *mapping;
659 659
660 /* if no new file, only flush of queued bios requested */ 660 /* if no new file, only flush of queued bios requested */
661 if (!file) 661 if (!file)
662 goto out; 662 goto out;
663 663
664 mapping = file->f_mapping; 664 mapping = file->f_mapping;
665 mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask); 665 mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
666 lo->lo_backing_file = file; 666 lo->lo_backing_file = file;
667 lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ? 667 lo->lo_blocksize = S_ISBLK(mapping->host->i_mode) ?
668 mapping->host->i_bdev->bd_block_size : PAGE_SIZE; 668 mapping->host->i_bdev->bd_block_size : PAGE_SIZE;
669 lo->old_gfp_mask = mapping_gfp_mask(mapping); 669 lo->old_gfp_mask = mapping_gfp_mask(mapping);
670 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 670 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
671 out: 671 out:
672 complete(&p->wait); 672 complete(&p->wait);
673 } 673 }
674 674
675 675
676 /* 676 /*
677 * loop_change_fd switches the backing store of a loopback device to 677 * loop_change_fd switches the backing store of a loopback device to
678 * a new file. This is useful for operating system installers to free up 678 * a new file. This is useful for operating system installers to free up
679 * the original file and in High Availability environments to switch to 679 * the original file and in High Availability environments to switch to
680 * an alternative location for the content in case of server meltdown. 680 * an alternative location for the content in case of server meltdown.
681 * This can only work if the loop device is used read-only, and if the 681 * This can only work if the loop device is used read-only, and if the
682 * new backing store is the same size and type as the old backing store. 682 * new backing store is the same size and type as the old backing store.
683 */ 683 */
684 static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, 684 static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
685 unsigned int arg) 685 unsigned int arg)
686 { 686 {
687 struct file *file, *old_file; 687 struct file *file, *old_file;
688 struct inode *inode; 688 struct inode *inode;
689 int error; 689 int error;
690 690
691 error = -ENXIO; 691 error = -ENXIO;
692 if (lo->lo_state != Lo_bound) 692 if (lo->lo_state != Lo_bound)
693 goto out; 693 goto out;
694 694
695 /* the loop device has to be read-only */ 695 /* the loop device has to be read-only */
696 error = -EINVAL; 696 error = -EINVAL;
697 if (!(lo->lo_flags & LO_FLAGS_READ_ONLY)) 697 if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
698 goto out; 698 goto out;
699 699
700 error = -EBADF; 700 error = -EBADF;
701 file = fget(arg); 701 file = fget(arg);
702 if (!file) 702 if (!file)
703 goto out; 703 goto out;
704 704
705 inode = file->f_mapping->host; 705 inode = file->f_mapping->host;
706 old_file = lo->lo_backing_file; 706 old_file = lo->lo_backing_file;
707 707
708 error = -EINVAL; 708 error = -EINVAL;
709 709
710 if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) 710 if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
711 goto out_putf; 711 goto out_putf;
712 712
713 /* size of the new backing store needs to be the same */ 713 /* size of the new backing store needs to be the same */
714 if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) 714 if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
715 goto out_putf; 715 goto out_putf;
716 716
717 /* and ... switch */ 717 /* and ... switch */
718 error = loop_switch(lo, file); 718 error = loop_switch(lo, file);
719 if (error) 719 if (error)
720 goto out_putf; 720 goto out_putf;
721 721
722 fput(old_file); 722 fput(old_file);
723 if (max_part > 0) 723 if (max_part > 0)
724 ioctl_by_bdev(bdev, BLKRRPART, 0); 724 ioctl_by_bdev(bdev, BLKRRPART, 0);
725 return 0; 725 return 0;
726 726
727 out_putf: 727 out_putf:
728 fput(file); 728 fput(file);
729 out: 729 out:
730 return error; 730 return error;
731 } 731 }
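
As a usage note (not part of the patch), here is a hedged user-space sketch of driving loop_change_fd() through the LOOP_CHANGE_FD ioctl. The device node and backing-file path are made up; per the checks above, the loop device must be bound read-only and the new file must have the same size as the old one.

	/* user-space sketch: swap the backing file of a read-only loop device */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/loop.h>

	int main(void)
	{
		int loop_fd = open("/dev/loop0", O_RDONLY);          /* hypothetical device */
		int new_fd = open("/mnt/new-backing.img", O_RDONLY); /* hypothetical path */

		if (loop_fd < 0 || new_fd < 0)
			return 1;
		/* kernel-side handling is loop_change_fd() above */
		if (ioctl(loop_fd, LOOP_CHANGE_FD, new_fd) < 0) {
			perror("LOOP_CHANGE_FD");
			return 1;
		}
		return 0;
	}
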
732 732
733 static inline int is_loop_device(struct file *file) 733 static inline int is_loop_device(struct file *file)
734 { 734 {
735 struct inode *i = file->f_mapping->host; 735 struct inode *i = file->f_mapping->host;
736 736
737 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; 737 return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
738 } 738 }
739 739
740 static int loop_set_fd(struct loop_device *lo, fmode_t mode, 740 static int loop_set_fd(struct loop_device *lo, fmode_t mode,
741 struct block_device *bdev, unsigned int arg) 741 struct block_device *bdev, unsigned int arg)
742 { 742 {
743 struct file *file, *f; 743 struct file *file, *f;
744 struct inode *inode; 744 struct inode *inode;
745 struct address_space *mapping; 745 struct address_space *mapping;
746 unsigned lo_blocksize; 746 unsigned lo_blocksize;
747 int lo_flags = 0; 747 int lo_flags = 0;
748 int error; 748 int error;
749 loff_t size; 749 loff_t size;
750 750
751 /* This is safe, since we have a reference from open(). */ 751 /* This is safe, since we have a reference from open(). */
752 __module_get(THIS_MODULE); 752 __module_get(THIS_MODULE);
753 753
754 error = -EBADF; 754 error = -EBADF;
755 file = fget(arg); 755 file = fget(arg);
756 if (!file) 756 if (!file)
757 goto out; 757 goto out;
758 758
759 error = -EBUSY; 759 error = -EBUSY;
760 if (lo->lo_state != Lo_unbound) 760 if (lo->lo_state != Lo_unbound)
761 goto out_putf; 761 goto out_putf;
762 762
763 /* Avoid recursion */ 763 /* Avoid recursion */
764 f = file; 764 f = file;
765 while (is_loop_device(f)) { 765 while (is_loop_device(f)) {
766 struct loop_device *l; 766 struct loop_device *l;
767 767
768 if (f->f_mapping->host->i_bdev == bdev) 768 if (f->f_mapping->host->i_bdev == bdev)
769 goto out_putf; 769 goto out_putf;
770 770
771 l = f->f_mapping->host->i_bdev->bd_disk->private_data; 771 l = f->f_mapping->host->i_bdev->bd_disk->private_data;
772 if (l->lo_state == Lo_unbound) { 772 if (l->lo_state == Lo_unbound) {
773 error = -EINVAL; 773 error = -EINVAL;
774 goto out_putf; 774 goto out_putf;
775 } 775 }
776 f = l->lo_backing_file; 776 f = l->lo_backing_file;
777 } 777 }
778 778
779 mapping = file->f_mapping; 779 mapping = file->f_mapping;
780 inode = mapping->host; 780 inode = mapping->host;
781 781
782 if (!(file->f_mode & FMODE_WRITE)) 782 if (!(file->f_mode & FMODE_WRITE))
783 lo_flags |= LO_FLAGS_READ_ONLY; 783 lo_flags |= LO_FLAGS_READ_ONLY;
784 784
785 error = -EINVAL; 785 error = -EINVAL;
786 if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { 786 if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) {
787 const struct address_space_operations *aops = mapping->a_ops; 787 const struct address_space_operations *aops = mapping->a_ops;
788 788
789 if (aops->write_begin) 789 if (aops->write_begin)
790 lo_flags |= LO_FLAGS_USE_AOPS; 790 lo_flags |= LO_FLAGS_USE_AOPS;
791 if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) 791 if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write)
792 lo_flags |= LO_FLAGS_READ_ONLY; 792 lo_flags |= LO_FLAGS_READ_ONLY;
793 793
794 lo_blocksize = S_ISBLK(inode->i_mode) ? 794 lo_blocksize = S_ISBLK(inode->i_mode) ?
795 inode->i_bdev->bd_block_size : PAGE_SIZE; 795 inode->i_bdev->bd_block_size : PAGE_SIZE;
796 796
797 error = 0; 797 error = 0;
798 } else { 798 } else {
799 goto out_putf; 799 goto out_putf;
800 } 800 }
801 801
802 size = get_loop_size(lo, file); 802 size = get_loop_size(lo, file);
803 803
804 if ((loff_t)(sector_t)size != size) { 804 if ((loff_t)(sector_t)size != size) {
805 error = -EFBIG; 805 error = -EFBIG;
806 goto out_putf; 806 goto out_putf;
807 } 807 }
808 808
809 if (!(mode & FMODE_WRITE)) 809 if (!(mode & FMODE_WRITE))
810 lo_flags |= LO_FLAGS_READ_ONLY; 810 lo_flags |= LO_FLAGS_READ_ONLY;
811 811
812 set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0); 812 set_device_ro(bdev, (lo_flags & LO_FLAGS_READ_ONLY) != 0);
813 813
814 lo->lo_blocksize = lo_blocksize; 814 lo->lo_blocksize = lo_blocksize;
815 lo->lo_device = bdev; 815 lo->lo_device = bdev;
816 lo->lo_flags = lo_flags; 816 lo->lo_flags = lo_flags;
817 lo->lo_backing_file = file; 817 lo->lo_backing_file = file;
818 lo->transfer = transfer_none; 818 lo->transfer = transfer_none;
819 lo->ioctl = NULL; 819 lo->ioctl = NULL;
820 lo->lo_sizelimit = 0; 820 lo->lo_sizelimit = 0;
821 lo->old_gfp_mask = mapping_gfp_mask(mapping); 821 lo->old_gfp_mask = mapping_gfp_mask(mapping);
822 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); 822 mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
823 823
824 bio_list_init(&lo->lo_bio_list); 824 bio_list_init(&lo->lo_bio_list);
825 825
826 /* 826 /*
827 * set queue make_request_fn, and add limits based on lower level 827 * set queue make_request_fn, and add limits based on lower level
828 * device 828 * device
829 */ 829 */
830 blk_queue_make_request(lo->lo_queue, loop_make_request); 830 blk_queue_make_request(lo->lo_queue, loop_make_request);
831 lo->lo_queue->queuedata = lo; 831 lo->lo_queue->queuedata = lo;
832 lo->lo_queue->unplug_fn = loop_unplug; 832 lo->lo_queue->unplug_fn = loop_unplug;
833 833
834 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync) 834 if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
835 blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN_FLUSH); 835 blk_queue_flush(lo->lo_queue, REQ_FLUSH);
836 836
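
For reference, a minimal sketch (hypothetical request queue q, not from this patch) of how a driver declares its flush capabilities with the new blk_queue_flush() interface; the loop case above declares only REQ_FLUSH, and only when the backing file provides an fsync method (see the condition above).

	/* volatile write cache that can be flushed (the loop case above) */
	blk_queue_flush(q, REQ_FLUSH);

	/* write cache plus support for FUA writes */
	blk_queue_flush(q, REQ_FLUSH | REQ_FUA);

	/* write-through or no cache: nothing to declare, 0 is also the default */
	blk_queue_flush(q, 0);
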
837 set_capacity(lo->lo_disk, size); 837 set_capacity(lo->lo_disk, size);
838 bd_set_size(bdev, size << 9); 838 bd_set_size(bdev, size << 9);
839 /* let user-space know about the new size */ 839 /* let user-space know about the new size */
840 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 840 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
841 841
842 set_blocksize(bdev, lo_blocksize); 842 set_blocksize(bdev, lo_blocksize);
843 843
844 lo->lo_thread = kthread_create(loop_thread, lo, "loop%d", 844 lo->lo_thread = kthread_create(loop_thread, lo, "loop%d",
845 lo->lo_number); 845 lo->lo_number);
846 if (IS_ERR(lo->lo_thread)) { 846 if (IS_ERR(lo->lo_thread)) {
847 error = PTR_ERR(lo->lo_thread); 847 error = PTR_ERR(lo->lo_thread);
848 goto out_clr; 848 goto out_clr;
849 } 849 }
850 lo->lo_state = Lo_bound; 850 lo->lo_state = Lo_bound;
851 wake_up_process(lo->lo_thread); 851 wake_up_process(lo->lo_thread);
852 if (max_part > 0) 852 if (max_part > 0)
853 ioctl_by_bdev(bdev, BLKRRPART, 0); 853 ioctl_by_bdev(bdev, BLKRRPART, 0);
854 return 0; 854 return 0;
855 855
856 out_clr: 856 out_clr:
857 lo->lo_thread = NULL; 857 lo->lo_thread = NULL;
858 lo->lo_device = NULL; 858 lo->lo_device = NULL;
859 lo->lo_backing_file = NULL; 859 lo->lo_backing_file = NULL;
860 lo->lo_flags = 0; 860 lo->lo_flags = 0;
861 set_capacity(lo->lo_disk, 0); 861 set_capacity(lo->lo_disk, 0);
862 invalidate_bdev(bdev); 862 invalidate_bdev(bdev);
863 bd_set_size(bdev, 0); 863 bd_set_size(bdev, 0);
864 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 864 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
865 mapping_set_gfp_mask(mapping, lo->old_gfp_mask); 865 mapping_set_gfp_mask(mapping, lo->old_gfp_mask);
866 lo->lo_state = Lo_unbound; 866 lo->lo_state = Lo_unbound;
867 out_putf: 867 out_putf:
868 fput(file); 868 fput(file);
869 out: 869 out:
870 /* This is safe: open() is still holding a reference. */ 870 /* This is safe: open() is still holding a reference. */
871 module_put(THIS_MODULE); 871 module_put(THIS_MODULE);
872 return error; 872 return error;
873 } 873 }
874 874
875 static int 875 static int
876 loop_release_xfer(struct loop_device *lo) 876 loop_release_xfer(struct loop_device *lo)
877 { 877 {
878 int err = 0; 878 int err = 0;
879 struct loop_func_table *xfer = lo->lo_encryption; 879 struct loop_func_table *xfer = lo->lo_encryption;
880 880
881 if (xfer) { 881 if (xfer) {
882 if (xfer->release) 882 if (xfer->release)
883 err = xfer->release(lo); 883 err = xfer->release(lo);
884 lo->transfer = NULL; 884 lo->transfer = NULL;
885 lo->lo_encryption = NULL; 885 lo->lo_encryption = NULL;
886 module_put(xfer->owner); 886 module_put(xfer->owner);
887 } 887 }
888 return err; 888 return err;
889 } 889 }
890 890
891 static int 891 static int
892 loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, 892 loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer,
893 const struct loop_info64 *i) 893 const struct loop_info64 *i)
894 { 894 {
895 int err = 0; 895 int err = 0;
896 896
897 if (xfer) { 897 if (xfer) {
898 struct module *owner = xfer->owner; 898 struct module *owner = xfer->owner;
899 899
900 if (!try_module_get(owner)) 900 if (!try_module_get(owner))
901 return -EINVAL; 901 return -EINVAL;
902 if (xfer->init) 902 if (xfer->init)
903 err = xfer->init(lo, i); 903 err = xfer->init(lo, i);
904 if (err) 904 if (err)
905 module_put(owner); 905 module_put(owner);
906 else 906 else
907 lo->lo_encryption = xfer; 907 lo->lo_encryption = xfer;
908 } 908 }
909 return err; 909 return err;
910 } 910 }
911 911
912 static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) 912 static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
913 { 913 {
914 struct file *filp = lo->lo_backing_file; 914 struct file *filp = lo->lo_backing_file;
915 gfp_t gfp = lo->old_gfp_mask; 915 gfp_t gfp = lo->old_gfp_mask;
916 916
917 if (lo->lo_state != Lo_bound) 917 if (lo->lo_state != Lo_bound)
918 return -ENXIO; 918 return -ENXIO;
919 919
920 if (lo->lo_refcnt > 1) /* we needed one fd for the ioctl */ 920 if (lo->lo_refcnt > 1) /* we needed one fd for the ioctl */
921 return -EBUSY; 921 return -EBUSY;
922 922
923 if (filp == NULL) 923 if (filp == NULL)
924 return -EINVAL; 924 return -EINVAL;
925 925
926 spin_lock_irq(&lo->lo_lock); 926 spin_lock_irq(&lo->lo_lock);
927 lo->lo_state = Lo_rundown; 927 lo->lo_state = Lo_rundown;
928 spin_unlock_irq(&lo->lo_lock); 928 spin_unlock_irq(&lo->lo_lock);
929 929
930 kthread_stop(lo->lo_thread); 930 kthread_stop(lo->lo_thread);
931 931
932 lo->lo_queue->unplug_fn = NULL; 932 lo->lo_queue->unplug_fn = NULL;
933 lo->lo_backing_file = NULL; 933 lo->lo_backing_file = NULL;
934 934
935 loop_release_xfer(lo); 935 loop_release_xfer(lo);
936 lo->transfer = NULL; 936 lo->transfer = NULL;
937 lo->ioctl = NULL; 937 lo->ioctl = NULL;
938 lo->lo_device = NULL; 938 lo->lo_device = NULL;
939 lo->lo_encryption = NULL; 939 lo->lo_encryption = NULL;
940 lo->lo_offset = 0; 940 lo->lo_offset = 0;
941 lo->lo_sizelimit = 0; 941 lo->lo_sizelimit = 0;
942 lo->lo_encrypt_key_size = 0; 942 lo->lo_encrypt_key_size = 0;
943 lo->lo_flags = 0; 943 lo->lo_flags = 0;
944 lo->lo_thread = NULL; 944 lo->lo_thread = NULL;
945 memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); 945 memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE);
946 memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); 946 memset(lo->lo_crypt_name, 0, LO_NAME_SIZE);
947 memset(lo->lo_file_name, 0, LO_NAME_SIZE); 947 memset(lo->lo_file_name, 0, LO_NAME_SIZE);
948 if (bdev) 948 if (bdev)
949 invalidate_bdev(bdev); 949 invalidate_bdev(bdev);
950 set_capacity(lo->lo_disk, 0); 950 set_capacity(lo->lo_disk, 0);
951 if (bdev) { 951 if (bdev) {
952 bd_set_size(bdev, 0); 952 bd_set_size(bdev, 0);
953 /* let user-space know about this change */ 953 /* let user-space know about this change */
954 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 954 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
955 } 955 }
956 mapping_set_gfp_mask(filp->f_mapping, gfp); 956 mapping_set_gfp_mask(filp->f_mapping, gfp);
957 lo->lo_state = Lo_unbound; 957 lo->lo_state = Lo_unbound;
958 /* This is safe: open() is still holding a reference. */ 958 /* This is safe: open() is still holding a reference. */
959 module_put(THIS_MODULE); 959 module_put(THIS_MODULE);
960 if (max_part > 0 && bdev) 960 if (max_part > 0 && bdev)
961 ioctl_by_bdev(bdev, BLKRRPART, 0); 961 ioctl_by_bdev(bdev, BLKRRPART, 0);
962 mutex_unlock(&lo->lo_ctl_mutex); 962 mutex_unlock(&lo->lo_ctl_mutex);
963 /* 963 /*
964 * Need not hold lo_ctl_mutex to fput backing file. 964 * Need not hold lo_ctl_mutex to fput backing file.
965 * Calling fput while holding lo_ctl_mutex triggers a possible 965 * Calling fput while holding lo_ctl_mutex triggers a possible
966 * circular lock dependency warning, as fput can take 966 * circular lock dependency warning, as fput can take
967 * bd_mutex, which is usually taken before lo_ctl_mutex. 967 * bd_mutex, which is usually taken before lo_ctl_mutex.
968 */ 968 */
969 fput(filp); 969 fput(filp);
970 return 0; 970 return 0;
971 } 971 }
972 972
973 static int 973 static int
974 loop_set_status(struct loop_device *lo, const struct loop_info64 *info) 974 loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
975 { 975 {
976 int err; 976 int err;
977 struct loop_func_table *xfer; 977 struct loop_func_table *xfer;
978 uid_t uid = current_uid(); 978 uid_t uid = current_uid();
979 979
980 if (lo->lo_encrypt_key_size && 980 if (lo->lo_encrypt_key_size &&
981 lo->lo_key_owner != uid && 981 lo->lo_key_owner != uid &&
982 !capable(CAP_SYS_ADMIN)) 982 !capable(CAP_SYS_ADMIN))
983 return -EPERM; 983 return -EPERM;
984 if (lo->lo_state != Lo_bound) 984 if (lo->lo_state != Lo_bound)
985 return -ENXIO; 985 return -ENXIO;
986 if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE) 986 if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
987 return -EINVAL; 987 return -EINVAL;
988 988
989 err = loop_release_xfer(lo); 989 err = loop_release_xfer(lo);
990 if (err) 990 if (err)
991 return err; 991 return err;
992 992
993 if (info->lo_encrypt_type) { 993 if (info->lo_encrypt_type) {
994 unsigned int type = info->lo_encrypt_type; 994 unsigned int type = info->lo_encrypt_type;
995 995
996 if (type >= MAX_LO_CRYPT) 996 if (type >= MAX_LO_CRYPT)
997 return -EINVAL; 997 return -EINVAL;
998 xfer = xfer_funcs[type]; 998 xfer = xfer_funcs[type];
999 if (xfer == NULL) 999 if (xfer == NULL)
1000 return -EINVAL; 1000 return -EINVAL;
1001 } else 1001 } else
1002 xfer = NULL; 1002 xfer = NULL;
1003 1003
1004 err = loop_init_xfer(lo, xfer, info); 1004 err = loop_init_xfer(lo, xfer, info);
1005 if (err) 1005 if (err)
1006 return err; 1006 return err;
1007 1007
1008 if (lo->lo_offset != info->lo_offset || 1008 if (lo->lo_offset != info->lo_offset ||
1009 lo->lo_sizelimit != info->lo_sizelimit) { 1009 lo->lo_sizelimit != info->lo_sizelimit) {
1010 lo->lo_offset = info->lo_offset; 1010 lo->lo_offset = info->lo_offset;
1011 lo->lo_sizelimit = info->lo_sizelimit; 1011 lo->lo_sizelimit = info->lo_sizelimit;
1012 if (figure_loop_size(lo)) 1012 if (figure_loop_size(lo))
1013 return -EFBIG; 1013 return -EFBIG;
1014 } 1014 }
1015 1015
1016 memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); 1016 memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
1017 memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); 1017 memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE);
1018 lo->lo_file_name[LO_NAME_SIZE-1] = 0; 1018 lo->lo_file_name[LO_NAME_SIZE-1] = 0;
1019 lo->lo_crypt_name[LO_NAME_SIZE-1] = 0; 1019 lo->lo_crypt_name[LO_NAME_SIZE-1] = 0;
1020 1020
1021 if (!xfer) 1021 if (!xfer)
1022 xfer = &none_funcs; 1022 xfer = &none_funcs;
1023 lo->transfer = xfer->transfer; 1023 lo->transfer = xfer->transfer;
1024 lo->ioctl = xfer->ioctl; 1024 lo->ioctl = xfer->ioctl;
1025 1025
1026 if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) != 1026 if ((lo->lo_flags & LO_FLAGS_AUTOCLEAR) !=
1027 (info->lo_flags & LO_FLAGS_AUTOCLEAR)) 1027 (info->lo_flags & LO_FLAGS_AUTOCLEAR))
1028 lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; 1028 lo->lo_flags ^= LO_FLAGS_AUTOCLEAR;
1029 1029
1030 lo->lo_encrypt_key_size = info->lo_encrypt_key_size; 1030 lo->lo_encrypt_key_size = info->lo_encrypt_key_size;
1031 lo->lo_init[0] = info->lo_init[0]; 1031 lo->lo_init[0] = info->lo_init[0];
1032 lo->lo_init[1] = info->lo_init[1]; 1032 lo->lo_init[1] = info->lo_init[1];
1033 if (info->lo_encrypt_key_size) { 1033 if (info->lo_encrypt_key_size) {
1034 memcpy(lo->lo_encrypt_key, info->lo_encrypt_key, 1034 memcpy(lo->lo_encrypt_key, info->lo_encrypt_key,
1035 info->lo_encrypt_key_size); 1035 info->lo_encrypt_key_size);
1036 lo->lo_key_owner = uid; 1036 lo->lo_key_owner = uid;
1037 } 1037 }
1038 1038
1039 return 0; 1039 return 0;
1040 } 1040 }
1041 1041
1042 static int 1042 static int
1043 loop_get_status(struct loop_device *lo, struct loop_info64 *info) 1043 loop_get_status(struct loop_device *lo, struct loop_info64 *info)
1044 { 1044 {
1045 struct file *file = lo->lo_backing_file; 1045 struct file *file = lo->lo_backing_file;
1046 struct kstat stat; 1046 struct kstat stat;
1047 int error; 1047 int error;
1048 1048
1049 if (lo->lo_state != Lo_bound) 1049 if (lo->lo_state != Lo_bound)
1050 return -ENXIO; 1050 return -ENXIO;
1051 error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat); 1051 error = vfs_getattr(file->f_path.mnt, file->f_path.dentry, &stat);
1052 if (error) 1052 if (error)
1053 return error; 1053 return error;
1054 memset(info, 0, sizeof(*info)); 1054 memset(info, 0, sizeof(*info));
1055 info->lo_number = lo->lo_number; 1055 info->lo_number = lo->lo_number;
1056 info->lo_device = huge_encode_dev(stat.dev); 1056 info->lo_device = huge_encode_dev(stat.dev);
1057 info->lo_inode = stat.ino; 1057 info->lo_inode = stat.ino;
1058 info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev); 1058 info->lo_rdevice = huge_encode_dev(lo->lo_device ? stat.rdev : stat.dev);
1059 info->lo_offset = lo->lo_offset; 1059 info->lo_offset = lo->lo_offset;
1060 info->lo_sizelimit = lo->lo_sizelimit; 1060 info->lo_sizelimit = lo->lo_sizelimit;
1061 info->lo_flags = lo->lo_flags; 1061 info->lo_flags = lo->lo_flags;
1062 memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE); 1062 memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
1063 memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE); 1063 memcpy(info->lo_crypt_name, lo->lo_crypt_name, LO_NAME_SIZE);
1064 info->lo_encrypt_type = 1064 info->lo_encrypt_type =
1065 lo->lo_encryption ? lo->lo_encryption->number : 0; 1065 lo->lo_encryption ? lo->lo_encryption->number : 0;
1066 if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) { 1066 if (lo->lo_encrypt_key_size && capable(CAP_SYS_ADMIN)) {
1067 info->lo_encrypt_key_size = lo->lo_encrypt_key_size; 1067 info->lo_encrypt_key_size = lo->lo_encrypt_key_size;
1068 memcpy(info->lo_encrypt_key, lo->lo_encrypt_key, 1068 memcpy(info->lo_encrypt_key, lo->lo_encrypt_key,
1069 lo->lo_encrypt_key_size); 1069 lo->lo_encrypt_key_size);
1070 } 1070 }
1071 return 0; 1071 return 0;
1072 } 1072 }
1073 1073
1074 static void 1074 static void
1075 loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64) 1075 loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
1076 { 1076 {
1077 memset(info64, 0, sizeof(*info64)); 1077 memset(info64, 0, sizeof(*info64));
1078 info64->lo_number = info->lo_number; 1078 info64->lo_number = info->lo_number;
1079 info64->lo_device = info->lo_device; 1079 info64->lo_device = info->lo_device;
1080 info64->lo_inode = info->lo_inode; 1080 info64->lo_inode = info->lo_inode;
1081 info64->lo_rdevice = info->lo_rdevice; 1081 info64->lo_rdevice = info->lo_rdevice;
1082 info64->lo_offset = info->lo_offset; 1082 info64->lo_offset = info->lo_offset;
1083 info64->lo_sizelimit = 0; 1083 info64->lo_sizelimit = 0;
1084 info64->lo_encrypt_type = info->lo_encrypt_type; 1084 info64->lo_encrypt_type = info->lo_encrypt_type;
1085 info64->lo_encrypt_key_size = info->lo_encrypt_key_size; 1085 info64->lo_encrypt_key_size = info->lo_encrypt_key_size;
1086 info64->lo_flags = info->lo_flags; 1086 info64->lo_flags = info->lo_flags;
1087 info64->lo_init[0] = info->lo_init[0]; 1087 info64->lo_init[0] = info->lo_init[0];
1088 info64->lo_init[1] = info->lo_init[1]; 1088 info64->lo_init[1] = info->lo_init[1];
1089 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1089 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1090 memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE); 1090 memcpy(info64->lo_crypt_name, info->lo_name, LO_NAME_SIZE);
1091 else 1091 else
1092 memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE); 1092 memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
1093 memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE); 1093 memcpy(info64->lo_encrypt_key, info->lo_encrypt_key, LO_KEY_SIZE);
1094 } 1094 }
1095 1095
1096 static int 1096 static int
1097 loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info) 1097 loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
1098 { 1098 {
1099 memset(info, 0, sizeof(*info)); 1099 memset(info, 0, sizeof(*info));
1100 info->lo_number = info64->lo_number; 1100 info->lo_number = info64->lo_number;
1101 info->lo_device = info64->lo_device; 1101 info->lo_device = info64->lo_device;
1102 info->lo_inode = info64->lo_inode; 1102 info->lo_inode = info64->lo_inode;
1103 info->lo_rdevice = info64->lo_rdevice; 1103 info->lo_rdevice = info64->lo_rdevice;
1104 info->lo_offset = info64->lo_offset; 1104 info->lo_offset = info64->lo_offset;
1105 info->lo_encrypt_type = info64->lo_encrypt_type; 1105 info->lo_encrypt_type = info64->lo_encrypt_type;
1106 info->lo_encrypt_key_size = info64->lo_encrypt_key_size; 1106 info->lo_encrypt_key_size = info64->lo_encrypt_key_size;
1107 info->lo_flags = info64->lo_flags; 1107 info->lo_flags = info64->lo_flags;
1108 info->lo_init[0] = info64->lo_init[0]; 1108 info->lo_init[0] = info64->lo_init[0];
1109 info->lo_init[1] = info64->lo_init[1]; 1109 info->lo_init[1] = info64->lo_init[1];
1110 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1110 if (info->lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1111 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1111 memcpy(info->lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1112 else 1112 else
1113 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE); 1113 memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
1114 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1114 memcpy(info->lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1115 1115
1116 /* error in case values were truncated */ 1116 /* error in case values were truncated */
1117 if (info->lo_device != info64->lo_device || 1117 if (info->lo_device != info64->lo_device ||
1118 info->lo_rdevice != info64->lo_rdevice || 1118 info->lo_rdevice != info64->lo_rdevice ||
1119 info->lo_inode != info64->lo_inode || 1119 info->lo_inode != info64->lo_inode ||
1120 info->lo_offset != info64->lo_offset) 1120 info->lo_offset != info64->lo_offset)
1121 return -EOVERFLOW; 1121 return -EOVERFLOW;
1122 1122
1123 return 0; 1123 return 0;
1124 } 1124 }
1125 1125
1126 static int 1126 static int
1127 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg) 1127 loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
1128 { 1128 {
1129 struct loop_info info; 1129 struct loop_info info;
1130 struct loop_info64 info64; 1130 struct loop_info64 info64;
1131 1131
1132 if (copy_from_user(&info, arg, sizeof (struct loop_info))) 1132 if (copy_from_user(&info, arg, sizeof (struct loop_info)))
1133 return -EFAULT; 1133 return -EFAULT;
1134 loop_info64_from_old(&info, &info64); 1134 loop_info64_from_old(&info, &info64);
1135 return loop_set_status(lo, &info64); 1135 return loop_set_status(lo, &info64);
1136 } 1136 }
1137 1137
1138 static int 1138 static int
1139 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg) 1139 loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
1140 { 1140 {
1141 struct loop_info64 info64; 1141 struct loop_info64 info64;
1142 1142
1143 if (copy_from_user(&info64, arg, sizeof (struct loop_info64))) 1143 if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
1144 return -EFAULT; 1144 return -EFAULT;
1145 return loop_set_status(lo, &info64); 1145 return loop_set_status(lo, &info64);
1146 } 1146 }
1147 1147
1148 static int 1148 static int
1149 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) { 1149 loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
1150 struct loop_info info; 1150 struct loop_info info;
1151 struct loop_info64 info64; 1151 struct loop_info64 info64;
1152 int err = 0; 1152 int err = 0;
1153 1153
1154 if (!arg) 1154 if (!arg)
1155 err = -EINVAL; 1155 err = -EINVAL;
1156 if (!err) 1156 if (!err)
1157 err = loop_get_status(lo, &info64); 1157 err = loop_get_status(lo, &info64);
1158 if (!err) 1158 if (!err)
1159 err = loop_info64_to_old(&info64, &info); 1159 err = loop_info64_to_old(&info64, &info);
1160 if (!err && copy_to_user(arg, &info, sizeof(info))) 1160 if (!err && copy_to_user(arg, &info, sizeof(info)))
1161 err = -EFAULT; 1161 err = -EFAULT;
1162 1162
1163 return err; 1163 return err;
1164 } 1164 }
1165 1165
1166 static int 1166 static int
1167 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) { 1167 loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
1168 struct loop_info64 info64; 1168 struct loop_info64 info64;
1169 int err = 0; 1169 int err = 0;
1170 1170
1171 if (!arg) 1171 if (!arg)
1172 err = -EINVAL; 1172 err = -EINVAL;
1173 if (!err) 1173 if (!err)
1174 err = loop_get_status(lo, &info64); 1174 err = loop_get_status(lo, &info64);
1175 if (!err && copy_to_user(arg, &info64, sizeof(info64))) 1175 if (!err && copy_to_user(arg, &info64, sizeof(info64)))
1176 err = -EFAULT; 1176 err = -EFAULT;
1177 1177
1178 return err; 1178 return err;
1179 } 1179 }
1180 1180
1181 static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev) 1181 static int loop_set_capacity(struct loop_device *lo, struct block_device *bdev)
1182 { 1182 {
1183 int err; 1183 int err;
1184 sector_t sec; 1184 sector_t sec;
1185 loff_t sz; 1185 loff_t sz;
1186 1186
1187 err = -ENXIO; 1187 err = -ENXIO;
1188 if (unlikely(lo->lo_state != Lo_bound)) 1188 if (unlikely(lo->lo_state != Lo_bound))
1189 goto out; 1189 goto out;
1190 err = figure_loop_size(lo); 1190 err = figure_loop_size(lo);
1191 if (unlikely(err)) 1191 if (unlikely(err))
1192 goto out; 1192 goto out;
1193 sec = get_capacity(lo->lo_disk); 1193 sec = get_capacity(lo->lo_disk);
1194 /* the width of sector_t may be too narrow for the bit shift */ 1194 /* the width of sector_t may be too narrow for the bit shift */
1195 sz = sec; 1195 sz = sec;
1196 sz <<= 9; 1196 sz <<= 9;
1197 mutex_lock(&bdev->bd_mutex); 1197 mutex_lock(&bdev->bd_mutex);
1198 bd_set_size(bdev, sz); 1198 bd_set_size(bdev, sz);
1199 /* let user-space know about the new size */ 1199 /* let user-space know about the new size */
1200 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE); 1200 kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
1201 mutex_unlock(&bdev->bd_mutex); 1201 mutex_unlock(&bdev->bd_mutex);
1202 1202
1203 out: 1203 out:
1204 return err; 1204 return err;
1205 } 1205 }
1206 1206
1207 static int lo_ioctl(struct block_device *bdev, fmode_t mode, 1207 static int lo_ioctl(struct block_device *bdev, fmode_t mode,
1208 unsigned int cmd, unsigned long arg) 1208 unsigned int cmd, unsigned long arg)
1209 { 1209 {
1210 struct loop_device *lo = bdev->bd_disk->private_data; 1210 struct loop_device *lo = bdev->bd_disk->private_data;
1211 int err; 1211 int err;
1212 1212
1213 mutex_lock_nested(&lo->lo_ctl_mutex, 1); 1213 mutex_lock_nested(&lo->lo_ctl_mutex, 1);
1214 switch (cmd) { 1214 switch (cmd) {
1215 case LOOP_SET_FD: 1215 case LOOP_SET_FD:
1216 err = loop_set_fd(lo, mode, bdev, arg); 1216 err = loop_set_fd(lo, mode, bdev, arg);
1217 break; 1217 break;
1218 case LOOP_CHANGE_FD: 1218 case LOOP_CHANGE_FD:
1219 err = loop_change_fd(lo, bdev, arg); 1219 err = loop_change_fd(lo, bdev, arg);
1220 break; 1220 break;
1221 case LOOP_CLR_FD: 1221 case LOOP_CLR_FD:
1222 /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ 1222 /* loop_clr_fd would have unlocked lo_ctl_mutex on success */
1223 err = loop_clr_fd(lo, bdev); 1223 err = loop_clr_fd(lo, bdev);
1224 if (!err) 1224 if (!err)
1225 goto out_unlocked; 1225 goto out_unlocked;
1226 break; 1226 break;
1227 case LOOP_SET_STATUS: 1227 case LOOP_SET_STATUS:
1228 err = loop_set_status_old(lo, (struct loop_info __user *) arg); 1228 err = loop_set_status_old(lo, (struct loop_info __user *) arg);
1229 break; 1229 break;
1230 case LOOP_GET_STATUS: 1230 case LOOP_GET_STATUS:
1231 err = loop_get_status_old(lo, (struct loop_info __user *) arg); 1231 err = loop_get_status_old(lo, (struct loop_info __user *) arg);
1232 break; 1232 break;
1233 case LOOP_SET_STATUS64: 1233 case LOOP_SET_STATUS64:
1234 err = loop_set_status64(lo, (struct loop_info64 __user *) arg); 1234 err = loop_set_status64(lo, (struct loop_info64 __user *) arg);
1235 break; 1235 break;
1236 case LOOP_GET_STATUS64: 1236 case LOOP_GET_STATUS64:
1237 err = loop_get_status64(lo, (struct loop_info64 __user *) arg); 1237 err = loop_get_status64(lo, (struct loop_info64 __user *) arg);
1238 break; 1238 break;
1239 case LOOP_SET_CAPACITY: 1239 case LOOP_SET_CAPACITY:
1240 err = -EPERM; 1240 err = -EPERM;
1241 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) 1241 if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN))
1242 err = loop_set_capacity(lo, bdev); 1242 err = loop_set_capacity(lo, bdev);
1243 break; 1243 break;
1244 default: 1244 default:
1245 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL; 1245 err = lo->ioctl ? lo->ioctl(lo, cmd, arg) : -EINVAL;
1246 } 1246 }
1247 mutex_unlock(&lo->lo_ctl_mutex); 1247 mutex_unlock(&lo->lo_ctl_mutex);
1248 1248
1249 out_unlocked: 1249 out_unlocked:
1250 return err; 1250 return err;
1251 } 1251 }
1252 1252
1253 #ifdef CONFIG_COMPAT 1253 #ifdef CONFIG_COMPAT
1254 struct compat_loop_info { 1254 struct compat_loop_info {
1255 compat_int_t lo_number; /* ioctl r/o */ 1255 compat_int_t lo_number; /* ioctl r/o */
1256 compat_dev_t lo_device; /* ioctl r/o */ 1256 compat_dev_t lo_device; /* ioctl r/o */
1257 compat_ulong_t lo_inode; /* ioctl r/o */ 1257 compat_ulong_t lo_inode; /* ioctl r/o */
1258 compat_dev_t lo_rdevice; /* ioctl r/o */ 1258 compat_dev_t lo_rdevice; /* ioctl r/o */
1259 compat_int_t lo_offset; 1259 compat_int_t lo_offset;
1260 compat_int_t lo_encrypt_type; 1260 compat_int_t lo_encrypt_type;
1261 compat_int_t lo_encrypt_key_size; /* ioctl w/o */ 1261 compat_int_t lo_encrypt_key_size; /* ioctl w/o */
1262 compat_int_t lo_flags; /* ioctl r/o */ 1262 compat_int_t lo_flags; /* ioctl r/o */
1263 char lo_name[LO_NAME_SIZE]; 1263 char lo_name[LO_NAME_SIZE];
1264 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */ 1264 unsigned char lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
1265 compat_ulong_t lo_init[2]; 1265 compat_ulong_t lo_init[2];
1266 char reserved[4]; 1266 char reserved[4];
1267 }; 1267 };
1268 1268
1269 /* 1269 /*
1270 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info 1270 * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
1271 * - noinlined to reduce stack space usage in main part of driver 1271 * - noinlined to reduce stack space usage in main part of driver
1272 */ 1272 */
1273 static noinline int 1273 static noinline int
1274 loop_info64_from_compat(const struct compat_loop_info __user *arg, 1274 loop_info64_from_compat(const struct compat_loop_info __user *arg,
1275 struct loop_info64 *info64) 1275 struct loop_info64 *info64)
1276 { 1276 {
1277 struct compat_loop_info info; 1277 struct compat_loop_info info;
1278 1278
1279 if (copy_from_user(&info, arg, sizeof(info))) 1279 if (copy_from_user(&info, arg, sizeof(info)))
1280 return -EFAULT; 1280 return -EFAULT;
1281 1281
1282 memset(info64, 0, sizeof(*info64)); 1282 memset(info64, 0, sizeof(*info64));
1283 info64->lo_number = info.lo_number; 1283 info64->lo_number = info.lo_number;
1284 info64->lo_device = info.lo_device; 1284 info64->lo_device = info.lo_device;
1285 info64->lo_inode = info.lo_inode; 1285 info64->lo_inode = info.lo_inode;
1286 info64->lo_rdevice = info.lo_rdevice; 1286 info64->lo_rdevice = info.lo_rdevice;
1287 info64->lo_offset = info.lo_offset; 1287 info64->lo_offset = info.lo_offset;
1288 info64->lo_sizelimit = 0; 1288 info64->lo_sizelimit = 0;
1289 info64->lo_encrypt_type = info.lo_encrypt_type; 1289 info64->lo_encrypt_type = info.lo_encrypt_type;
1290 info64->lo_encrypt_key_size = info.lo_encrypt_key_size; 1290 info64->lo_encrypt_key_size = info.lo_encrypt_key_size;
1291 info64->lo_flags = info.lo_flags; 1291 info64->lo_flags = info.lo_flags;
1292 info64->lo_init[0] = info.lo_init[0]; 1292 info64->lo_init[0] = info.lo_init[0];
1293 info64->lo_init[1] = info.lo_init[1]; 1293 info64->lo_init[1] = info.lo_init[1];
1294 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1294 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1295 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE); 1295 memcpy(info64->lo_crypt_name, info.lo_name, LO_NAME_SIZE);
1296 else 1296 else
1297 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE); 1297 memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
1298 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE); 1298 memcpy(info64->lo_encrypt_key, info.lo_encrypt_key, LO_KEY_SIZE);
1299 return 0; 1299 return 0;
1300 } 1300 }
1301 1301
1302 /* 1302 /*
1303 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace 1303 * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
1304 * - noinlined to reduce stack space usage in main part of driver 1304 * - noinlined to reduce stack space usage in main part of driver
1305 */ 1305 */
1306 static noinline int 1306 static noinline int
1307 loop_info64_to_compat(const struct loop_info64 *info64, 1307 loop_info64_to_compat(const struct loop_info64 *info64,
1308 struct compat_loop_info __user *arg) 1308 struct compat_loop_info __user *arg)
1309 { 1309 {
1310 struct compat_loop_info info; 1310 struct compat_loop_info info;
1311 1311
1312 memset(&info, 0, sizeof(info)); 1312 memset(&info, 0, sizeof(info));
1313 info.lo_number = info64->lo_number; 1313 info.lo_number = info64->lo_number;
1314 info.lo_device = info64->lo_device; 1314 info.lo_device = info64->lo_device;
1315 info.lo_inode = info64->lo_inode; 1315 info.lo_inode = info64->lo_inode;
1316 info.lo_rdevice = info64->lo_rdevice; 1316 info.lo_rdevice = info64->lo_rdevice;
1317 info.lo_offset = info64->lo_offset; 1317 info.lo_offset = info64->lo_offset;
1318 info.lo_encrypt_type = info64->lo_encrypt_type; 1318 info.lo_encrypt_type = info64->lo_encrypt_type;
1319 info.lo_encrypt_key_size = info64->lo_encrypt_key_size; 1319 info.lo_encrypt_key_size = info64->lo_encrypt_key_size;
1320 info.lo_flags = info64->lo_flags; 1320 info.lo_flags = info64->lo_flags;
1321 info.lo_init[0] = info64->lo_init[0]; 1321 info.lo_init[0] = info64->lo_init[0];
1322 info.lo_init[1] = info64->lo_init[1]; 1322 info.lo_init[1] = info64->lo_init[1];
1323 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI) 1323 if (info.lo_encrypt_type == LO_CRYPT_CRYPTOAPI)
1324 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE); 1324 memcpy(info.lo_name, info64->lo_crypt_name, LO_NAME_SIZE);
1325 else 1325 else
1326 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE); 1326 memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
1327 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE); 1327 memcpy(info.lo_encrypt_key, info64->lo_encrypt_key, LO_KEY_SIZE);
1328 1328
1329 /* error in case values were truncated */ 1329 /* error in case values were truncated */
1330 if (info.lo_device != info64->lo_device || 1330 if (info.lo_device != info64->lo_device ||
1331 info.lo_rdevice != info64->lo_rdevice || 1331 info.lo_rdevice != info64->lo_rdevice ||
1332 info.lo_inode != info64->lo_inode || 1332 info.lo_inode != info64->lo_inode ||
1333 info.lo_offset != info64->lo_offset || 1333 info.lo_offset != info64->lo_offset ||
1334 info.lo_init[0] != info64->lo_init[0] || 1334 info.lo_init[0] != info64->lo_init[0] ||
1335 info.lo_init[1] != info64->lo_init[1]) 1335 info.lo_init[1] != info64->lo_init[1])
1336 return -EOVERFLOW; 1336 return -EOVERFLOW;
1337 1337
1338 if (copy_to_user(arg, &info, sizeof(info))) 1338 if (copy_to_user(arg, &info, sizeof(info)))
1339 return -EFAULT; 1339 return -EFAULT;
1340 return 0; 1340 return 0;
1341 } 1341 }
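The -EOVERFLOW check above works by assign-then-compare: each wide field is copied into the narrower compat field and then compared back against the 64-bit original. A standalone sketch of the same pattern, using a hypothetical 32-bit destination field rather than the real loop structures:

#include <stdint.h>
#include <errno.h>

/* Hypothetical example: narrow a 64-bit offset to 32 bits and detect
 * truncation by comparing the stored value against the original. */
static int store_offset32(uint32_t *dst, uint64_t offset)
{
	*dst = offset;			/* implicit truncation to 32 bits */
	if (*dst != offset)		/* value did not survive the round trip */
		return -EOVERFLOW;
	return 0;
}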
1342 1342
1343 static int 1343 static int
1344 loop_set_status_compat(struct loop_device *lo, 1344 loop_set_status_compat(struct loop_device *lo,
1345 const struct compat_loop_info __user *arg) 1345 const struct compat_loop_info __user *arg)
1346 { 1346 {
1347 struct loop_info64 info64; 1347 struct loop_info64 info64;
1348 int ret; 1348 int ret;
1349 1349
1350 ret = loop_info64_from_compat(arg, &info64); 1350 ret = loop_info64_from_compat(arg, &info64);
1351 if (ret < 0) 1351 if (ret < 0)
1352 return ret; 1352 return ret;
1353 return loop_set_status(lo, &info64); 1353 return loop_set_status(lo, &info64);
1354 } 1354 }
1355 1355
1356 static int 1356 static int
1357 loop_get_status_compat(struct loop_device *lo, 1357 loop_get_status_compat(struct loop_device *lo,
1358 struct compat_loop_info __user *arg) 1358 struct compat_loop_info __user *arg)
1359 { 1359 {
1360 struct loop_info64 info64; 1360 struct loop_info64 info64;
1361 int err = 0; 1361 int err = 0;
1362 1362
1363 if (!arg) 1363 if (!arg)
1364 err = -EINVAL; 1364 err = -EINVAL;
1365 if (!err) 1365 if (!err)
1366 err = loop_get_status(lo, &info64); 1366 err = loop_get_status(lo, &info64);
1367 if (!err) 1367 if (!err)
1368 err = loop_info64_to_compat(&info64, arg); 1368 err = loop_info64_to_compat(&info64, arg);
1369 return err; 1369 return err;
1370 } 1370 }
1371 1371
1372 static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode, 1372 static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
1373 unsigned int cmd, unsigned long arg) 1373 unsigned int cmd, unsigned long arg)
1374 { 1374 {
1375 struct loop_device *lo = bdev->bd_disk->private_data; 1375 struct loop_device *lo = bdev->bd_disk->private_data;
1376 int err; 1376 int err;
1377 1377
1378 	switch (cmd) { 1378 	switch (cmd) {
1379 case LOOP_SET_STATUS: 1379 case LOOP_SET_STATUS:
1380 mutex_lock(&lo->lo_ctl_mutex); 1380 mutex_lock(&lo->lo_ctl_mutex);
1381 err = loop_set_status_compat( 1381 err = loop_set_status_compat(
1382 lo, (const struct compat_loop_info __user *) arg); 1382 lo, (const struct compat_loop_info __user *) arg);
1383 mutex_unlock(&lo->lo_ctl_mutex); 1383 mutex_unlock(&lo->lo_ctl_mutex);
1384 break; 1384 break;
1385 case LOOP_GET_STATUS: 1385 case LOOP_GET_STATUS:
1386 mutex_lock(&lo->lo_ctl_mutex); 1386 mutex_lock(&lo->lo_ctl_mutex);
1387 err = loop_get_status_compat( 1387 err = loop_get_status_compat(
1388 lo, (struct compat_loop_info __user *) arg); 1388 lo, (struct compat_loop_info __user *) arg);
1389 mutex_unlock(&lo->lo_ctl_mutex); 1389 mutex_unlock(&lo->lo_ctl_mutex);
1390 break; 1390 break;
1391 case LOOP_SET_CAPACITY: 1391 case LOOP_SET_CAPACITY:
1392 case LOOP_CLR_FD: 1392 case LOOP_CLR_FD:
1393 case LOOP_GET_STATUS64: 1393 case LOOP_GET_STATUS64:
1394 case LOOP_SET_STATUS64: 1394 case LOOP_SET_STATUS64:
1395 arg = (unsigned long) compat_ptr(arg); 1395 arg = (unsigned long) compat_ptr(arg);
1396 case LOOP_SET_FD: 1396 case LOOP_SET_FD:
1397 case LOOP_CHANGE_FD: 1397 case LOOP_CHANGE_FD:
1398 err = lo_ioctl(bdev, mode, cmd, arg); 1398 err = lo_ioctl(bdev, mode, cmd, arg);
1399 break; 1399 break;
1400 default: 1400 default:
1401 err = -ENOIOCTLCMD; 1401 err = -ENOIOCTLCMD;
1402 break; 1402 break;
1403 } 1403 }
1404 return err; 1404 return err;
1405 } 1405 }
1406 #endif 1406 #endif
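lo_compat_ioctl() above shows the usual shape of a 32-bit ioctl thunk: translate the user pointer with compat_ptr(), then reuse the native handler. A reduced sketch of that pattern, with a hypothetical command number and handler standing in for the loop-specific ones:

#include <linux/blkdev.h>
#include <linux/compat.h>

#define MYDEV_GET_INFO	0x4d00	/* hypothetical ioctl command */

/* Native handler stub; the real one would act on the (now 64-bit clean)
 * pointer carried in arg. */
static int mydev_ioctl(struct block_device *bdev, fmode_t mode,
		       unsigned int cmd, unsigned long arg)
{
	return 0;
}

#ifdef CONFIG_COMPAT
static int mydev_compat_ioctl(struct block_device *bdev, fmode_t mode,
			      unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case MYDEV_GET_INFO:
		/* 32-bit user pointer -> native representation */
		arg = (unsigned long)compat_ptr(arg);
		return mydev_ioctl(bdev, mode, cmd, arg);
	default:
		return -ENOIOCTLCMD;
	}
}
#endif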
1407 1407
1408 static int lo_open(struct block_device *bdev, fmode_t mode) 1408 static int lo_open(struct block_device *bdev, fmode_t mode)
1409 { 1409 {
1410 struct loop_device *lo = bdev->bd_disk->private_data; 1410 struct loop_device *lo = bdev->bd_disk->private_data;
1411 1411
1412 lock_kernel(); 1412 lock_kernel();
1413 mutex_lock(&lo->lo_ctl_mutex); 1413 mutex_lock(&lo->lo_ctl_mutex);
1414 lo->lo_refcnt++; 1414 lo->lo_refcnt++;
1415 mutex_unlock(&lo->lo_ctl_mutex); 1415 mutex_unlock(&lo->lo_ctl_mutex);
1416 unlock_kernel(); 1416 unlock_kernel();
1417 1417
1418 return 0; 1418 return 0;
1419 } 1419 }
1420 1420
1421 static int lo_release(struct gendisk *disk, fmode_t mode) 1421 static int lo_release(struct gendisk *disk, fmode_t mode)
1422 { 1422 {
1423 struct loop_device *lo = disk->private_data; 1423 struct loop_device *lo = disk->private_data;
1424 int err; 1424 int err;
1425 1425
1426 lock_kernel(); 1426 lock_kernel();
1427 mutex_lock(&lo->lo_ctl_mutex); 1427 mutex_lock(&lo->lo_ctl_mutex);
1428 1428
1429 if (--lo->lo_refcnt) 1429 if (--lo->lo_refcnt)
1430 goto out; 1430 goto out;
1431 1431
1432 if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) { 1432 if (lo->lo_flags & LO_FLAGS_AUTOCLEAR) {
1433 /* 1433 /*
1434 * In autoclear mode, stop the loop thread 1434 * In autoclear mode, stop the loop thread
1435 * and remove configuration after last close. 1435 * and remove configuration after last close.
1436 */ 1436 */
1437 err = loop_clr_fd(lo, NULL); 1437 err = loop_clr_fd(lo, NULL);
1438 if (!err) 1438 if (!err)
1439 goto out_unlocked; 1439 goto out_unlocked;
1440 } else { 1440 } else {
1441 /* 1441 /*
1442 * Otherwise keep thread (if running) and config, 1442 * Otherwise keep thread (if running) and config,
1443 * but flush possible ongoing bios in thread. 1443 * but flush possible ongoing bios in thread.
1444 */ 1444 */
1445 loop_flush(lo); 1445 loop_flush(lo);
1446 } 1446 }
1447 1447
1448 out: 1448 out:
1449 mutex_unlock(&lo->lo_ctl_mutex); 1449 mutex_unlock(&lo->lo_ctl_mutex);
1450 out_unlocked: 1450 out_unlocked:
1451 	unlock_kernel(); 1451 	unlock_kernel();
1452 return 0; 1452 return 0;
1453 } 1453 }
1454 1454
1455 static const struct block_device_operations lo_fops = { 1455 static const struct block_device_operations lo_fops = {
1456 .owner = THIS_MODULE, 1456 .owner = THIS_MODULE,
1457 .open = lo_open, 1457 .open = lo_open,
1458 .release = lo_release, 1458 .release = lo_release,
1459 .ioctl = lo_ioctl, 1459 .ioctl = lo_ioctl,
1460 #ifdef CONFIG_COMPAT 1460 #ifdef CONFIG_COMPAT
1461 .compat_ioctl = lo_compat_ioctl, 1461 .compat_ioctl = lo_compat_ioctl,
1462 #endif 1462 #endif
1463 }; 1463 };
1464 1464
1465 /* 1465 /*
1466 * And now the modules code and kernel interface. 1466 * And now the modules code and kernel interface.
1467 */ 1467 */
1468 static int max_loop; 1468 static int max_loop;
1469 module_param(max_loop, int, 0); 1469 module_param(max_loop, int, 0);
1470 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); 1470 MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
1471 module_param(max_part, int, 0); 1471 module_param(max_part, int, 0);
1472 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); 1472 MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
1473 MODULE_LICENSE("GPL"); 1473 MODULE_LICENSE("GPL");
1474 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); 1474 MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
1475 1475
1476 int loop_register_transfer(struct loop_func_table *funcs) 1476 int loop_register_transfer(struct loop_func_table *funcs)
1477 { 1477 {
1478 unsigned int n = funcs->number; 1478 unsigned int n = funcs->number;
1479 1479
1480 if (n >= MAX_LO_CRYPT || xfer_funcs[n]) 1480 if (n >= MAX_LO_CRYPT || xfer_funcs[n])
1481 return -EINVAL; 1481 return -EINVAL;
1482 xfer_funcs[n] = funcs; 1482 xfer_funcs[n] = funcs;
1483 return 0; 1483 return 0;
1484 } 1484 }
1485 1485
1486 int loop_unregister_transfer(int number) 1486 int loop_unregister_transfer(int number)
1487 { 1487 {
1488 unsigned int n = number; 1488 unsigned int n = number;
1489 struct loop_device *lo; 1489 struct loop_device *lo;
1490 struct loop_func_table *xfer; 1490 struct loop_func_table *xfer;
1491 1491
1492 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL) 1492 if (n == 0 || n >= MAX_LO_CRYPT || (xfer = xfer_funcs[n]) == NULL)
1493 return -EINVAL; 1493 return -EINVAL;
1494 1494
1495 xfer_funcs[n] = NULL; 1495 xfer_funcs[n] = NULL;
1496 1496
1497 list_for_each_entry(lo, &loop_devices, lo_list) { 1497 list_for_each_entry(lo, &loop_devices, lo_list) {
1498 mutex_lock(&lo->lo_ctl_mutex); 1498 mutex_lock(&lo->lo_ctl_mutex);
1499 1499
1500 if (lo->lo_encryption == xfer) 1500 if (lo->lo_encryption == xfer)
1501 loop_release_xfer(lo); 1501 loop_release_xfer(lo);
1502 1502
1503 mutex_unlock(&lo->lo_ctl_mutex); 1503 mutex_unlock(&lo->lo_ctl_mutex);
1504 } 1504 }
1505 1505
1506 return 0; 1506 return 0;
1507 } 1507 }
1508 1508
1509 EXPORT_SYMBOL(loop_register_transfer); 1509 EXPORT_SYMBOL(loop_register_transfer);
1510 EXPORT_SYMBOL(loop_unregister_transfer); 1510 EXPORT_SYMBOL(loop_unregister_transfer);
1511 1511
1512 static struct loop_device *loop_alloc(int i) 1512 static struct loop_device *loop_alloc(int i)
1513 { 1513 {
1514 struct loop_device *lo; 1514 struct loop_device *lo;
1515 struct gendisk *disk; 1515 struct gendisk *disk;
1516 1516
1517 lo = kzalloc(sizeof(*lo), GFP_KERNEL); 1517 lo = kzalloc(sizeof(*lo), GFP_KERNEL);
1518 if (!lo) 1518 if (!lo)
1519 goto out; 1519 goto out;
1520 1520
1521 lo->lo_queue = blk_alloc_queue(GFP_KERNEL); 1521 lo->lo_queue = blk_alloc_queue(GFP_KERNEL);
1522 if (!lo->lo_queue) 1522 if (!lo->lo_queue)
1523 goto out_free_dev; 1523 goto out_free_dev;
1524 1524
1525 disk = lo->lo_disk = alloc_disk(1 << part_shift); 1525 disk = lo->lo_disk = alloc_disk(1 << part_shift);
1526 if (!disk) 1526 if (!disk)
1527 goto out_free_queue; 1527 goto out_free_queue;
1528 1528
1529 mutex_init(&lo->lo_ctl_mutex); 1529 mutex_init(&lo->lo_ctl_mutex);
1530 lo->lo_number = i; 1530 lo->lo_number = i;
1531 lo->lo_thread = NULL; 1531 lo->lo_thread = NULL;
1532 init_waitqueue_head(&lo->lo_event); 1532 init_waitqueue_head(&lo->lo_event);
1533 spin_lock_init(&lo->lo_lock); 1533 spin_lock_init(&lo->lo_lock);
1534 disk->major = LOOP_MAJOR; 1534 disk->major = LOOP_MAJOR;
1535 disk->first_minor = i << part_shift; 1535 disk->first_minor = i << part_shift;
1536 disk->fops = &lo_fops; 1536 disk->fops = &lo_fops;
1537 disk->private_data = lo; 1537 disk->private_data = lo;
1538 disk->queue = lo->lo_queue; 1538 disk->queue = lo->lo_queue;
1539 sprintf(disk->disk_name, "loop%d", i); 1539 sprintf(disk->disk_name, "loop%d", i);
1540 return lo; 1540 return lo;
1541 1541
1542 out_free_queue: 1542 out_free_queue:
1543 blk_cleanup_queue(lo->lo_queue); 1543 blk_cleanup_queue(lo->lo_queue);
1544 out_free_dev: 1544 out_free_dev:
1545 kfree(lo); 1545 kfree(lo);
1546 out: 1546 out:
1547 return NULL; 1547 return NULL;
1548 } 1548 }
1549 1549
1550 static void loop_free(struct loop_device *lo) 1550 static void loop_free(struct loop_device *lo)
1551 { 1551 {
1552 blk_cleanup_queue(lo->lo_queue); 1552 blk_cleanup_queue(lo->lo_queue);
1553 put_disk(lo->lo_disk); 1553 put_disk(lo->lo_disk);
1554 list_del(&lo->lo_list); 1554 list_del(&lo->lo_list);
1555 kfree(lo); 1555 kfree(lo);
1556 } 1556 }
1557 1557
1558 static struct loop_device *loop_init_one(int i) 1558 static struct loop_device *loop_init_one(int i)
1559 { 1559 {
1560 struct loop_device *lo; 1560 struct loop_device *lo;
1561 1561
1562 list_for_each_entry(lo, &loop_devices, lo_list) { 1562 list_for_each_entry(lo, &loop_devices, lo_list) {
1563 if (lo->lo_number == i) 1563 if (lo->lo_number == i)
1564 return lo; 1564 return lo;
1565 } 1565 }
1566 1566
1567 lo = loop_alloc(i); 1567 lo = loop_alloc(i);
1568 if (lo) { 1568 if (lo) {
1569 add_disk(lo->lo_disk); 1569 add_disk(lo->lo_disk);
1570 list_add_tail(&lo->lo_list, &loop_devices); 1570 list_add_tail(&lo->lo_list, &loop_devices);
1571 } 1571 }
1572 return lo; 1572 return lo;
1573 } 1573 }
1574 1574
1575 static void loop_del_one(struct loop_device *lo) 1575 static void loop_del_one(struct loop_device *lo)
1576 { 1576 {
1577 del_gendisk(lo->lo_disk); 1577 del_gendisk(lo->lo_disk);
1578 loop_free(lo); 1578 loop_free(lo);
1579 } 1579 }
1580 1580
1581 static struct kobject *loop_probe(dev_t dev, int *part, void *data) 1581 static struct kobject *loop_probe(dev_t dev, int *part, void *data)
1582 { 1582 {
1583 struct loop_device *lo; 1583 struct loop_device *lo;
1584 struct kobject *kobj; 1584 struct kobject *kobj;
1585 1585
1586 mutex_lock(&loop_devices_mutex); 1586 mutex_lock(&loop_devices_mutex);
1587 lo = loop_init_one(dev & MINORMASK); 1587 lo = loop_init_one(dev & MINORMASK);
1588 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); 1588 kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM);
1589 mutex_unlock(&loop_devices_mutex); 1589 mutex_unlock(&loop_devices_mutex);
1590 1590
1591 *part = 0; 1591 *part = 0;
1592 return kobj; 1592 return kobj;
1593 } 1593 }
1594 1594
1595 static int __init loop_init(void) 1595 static int __init loop_init(void)
1596 { 1596 {
1597 int i, nr; 1597 int i, nr;
1598 unsigned long range; 1598 unsigned long range;
1599 struct loop_device *lo, *next; 1599 struct loop_device *lo, *next;
1600 1600
1601 /* 1601 /*
1602 * The loop module can now instantiate the underlying device 1602 * The loop module can now instantiate the underlying device
1603 * structure on demand when its dev node is accessed. 1603 * structure on demand when its dev node is accessed.
1604 * However, this does not work well with user space tools that don't 1604 * However, this does not work well with user space tools that don't
1605 * know about this feature. In order not to break any existing 1605 * know about this feature. In order not to break any existing
1606 * tools, we do the following: 1606 * tools, we do the following:
1607 * 1607 *
1608 * (1) if max_loop is specified, create that many devices upfront, and 1608 * (1) if max_loop is specified, create that many devices upfront, and
1609 * this also becomes a hard limit. 1609 * this also becomes a hard limit.
1610 * (2) if max_loop is not specified, create 8 loop devices on module 1610 * (2) if max_loop is not specified, create 8 loop devices on module
1611 * load; users can extend the set by creating dev nodes 1611 * load; users can extend the set by creating dev nodes
1612 * themselves and having the kernel instantiate the actual 1612 * themselves and having the kernel instantiate the actual
1613 * device on demand. 1613 * device on demand.
1614 */ 1614 */
1615 1615
1616 part_shift = 0; 1616 part_shift = 0;
1617 if (max_part > 0) 1617 if (max_part > 0)
1618 part_shift = fls(max_part); 1618 part_shift = fls(max_part);
1619 1619
1620 if (max_loop > 1UL << (MINORBITS - part_shift)) 1620 if (max_loop > 1UL << (MINORBITS - part_shift))
1621 return -EINVAL; 1621 return -EINVAL;
1622 1622
1623 if (max_loop) { 1623 if (max_loop) {
1624 nr = max_loop; 1624 nr = max_loop;
1625 range = max_loop; 1625 range = max_loop;
1626 } else { 1626 } else {
1627 nr = 8; 1627 nr = 8;
1628 range = 1UL << (MINORBITS - part_shift); 1628 range = 1UL << (MINORBITS - part_shift);
1629 } 1629 }
1630 1630
1631 if (register_blkdev(LOOP_MAJOR, "loop")) 1631 if (register_blkdev(LOOP_MAJOR, "loop"))
1632 return -EIO; 1632 return -EIO;
1633 1633
1634 for (i = 0; i < nr; i++) { 1634 for (i = 0; i < nr; i++) {
1635 lo = loop_alloc(i); 1635 lo = loop_alloc(i);
1636 if (!lo) 1636 if (!lo)
1637 goto Enomem; 1637 goto Enomem;
1638 list_add_tail(&lo->lo_list, &loop_devices); 1638 list_add_tail(&lo->lo_list, &loop_devices);
1639 } 1639 }
1640 1640
1641 /* point of no return */ 1641 /* point of no return */
1642 1642
1643 list_for_each_entry(lo, &loop_devices, lo_list) 1643 list_for_each_entry(lo, &loop_devices, lo_list)
1644 add_disk(lo->lo_disk); 1644 add_disk(lo->lo_disk);
1645 1645
1646 blk_register_region(MKDEV(LOOP_MAJOR, 0), range, 1646 blk_register_region(MKDEV(LOOP_MAJOR, 0), range,
1647 THIS_MODULE, loop_probe, NULL, NULL); 1647 THIS_MODULE, loop_probe, NULL, NULL);
1648 1648
1649 printk(KERN_INFO "loop: module loaded\n"); 1649 printk(KERN_INFO "loop: module loaded\n");
1650 return 0; 1650 return 0;
1651 1651
1652 Enomem: 1652 Enomem:
1653 printk(KERN_INFO "loop: out of memory\n"); 1653 printk(KERN_INFO "loop: out of memory\n");
1654 1654
1655 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1655 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1656 loop_free(lo); 1656 loop_free(lo);
1657 1657
1658 unregister_blkdev(LOOP_MAJOR, "loop"); 1658 unregister_blkdev(LOOP_MAJOR, "loop");
1659 return -ENOMEM; 1659 return -ENOMEM;
1660 } 1660 }
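loop_init() splits the 20-bit minor space between devices and partitions: fls(max_part) low bits per device, the rest for device indexes, which is also why loop_alloc() sets first_minor = i << part_shift. A small userspace sketch of that arithmetic (names are local to the example; MINORBITS is assumed to be 20 as in the kernel headers):

#include <stdio.h>

#define MINORBITS	20	/* width of the minor number space */

/* fls()-style helper: index of the highest set bit, 0 for x == 0. */
static int fls_example(unsigned int x)
{
	int r = 0;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	unsigned int max_part = 15;		/* e.g. "modprobe loop max_part=15" */
	int part_shift = fls_example(max_part);	/* 4 bits for partitions */
	unsigned long range = 1UL << (MINORBITS - part_shift);

	/* loop%d then gets first_minor = i << part_shift, and up to
	 * "range" devices fit in the remaining minor bits. */
	printf("part_shift=%d, device range=%lu\n", part_shift, range);
	return 0;
}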
1661 1661
1662 static void __exit loop_exit(void) 1662 static void __exit loop_exit(void)
1663 { 1663 {
1664 unsigned long range; 1664 unsigned long range;
1665 struct loop_device *lo, *next; 1665 struct loop_device *lo, *next;
1666 1666
1667 range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); 1667 range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift);
1668 1668
1669 list_for_each_entry_safe(lo, next, &loop_devices, lo_list) 1669 list_for_each_entry_safe(lo, next, &loop_devices, lo_list)
1670 loop_del_one(lo); 1670 loop_del_one(lo);
1671 1671
1672 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range); 1672 blk_unregister_region(MKDEV(LOOP_MAJOR, 0), range);
1673 unregister_blkdev(LOOP_MAJOR, "loop"); 1673 unregister_blkdev(LOOP_MAJOR, "loop");
1674 } 1674 }
1675 1675
1676 module_init(loop_init); 1676 module_init(loop_init);
1677 module_exit(loop_exit); 1677 module_exit(loop_exit);
1678 1678
1679 #ifndef MODULE 1679 #ifndef MODULE
1680 static int __init max_loop_setup(char *str) 1680 static int __init max_loop_setup(char *str)
1681 { 1681 {
1682 max_loop = simple_strtol(str, NULL, 0); 1682 max_loop = simple_strtol(str, NULL, 0);
1683 return 1; 1683 return 1;
1684 } 1684 }
1685 1685
1686 __setup("max_loop=", max_loop_setup); 1686 __setup("max_loop=", max_loop_setup);
1687 #endif 1687 #endif
1688 1688
drivers/block/osdblk.c
1 1
2 /* 2 /*
3 osdblk.c -- Export a single SCSI OSD object as a Linux block device 3 osdblk.c -- Export a single SCSI OSD object as a Linux block device
4 4
5 5
6 Copyright 2009 Red Hat, Inc. 6 Copyright 2009 Red Hat, Inc.
7 7
8 This program is free software; you can redistribute it and/or modify 8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by 9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation. 10 the Free Software Foundation.
11 11
12 This program is distributed in the hope that it will be useful, 12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of 13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details. 15 GNU General Public License for more details.
16 16
17 You should have received a copy of the GNU General Public License 17 You should have received a copy of the GNU General Public License
18 along with this program; see the file COPYING. If not, write to 18 along with this program; see the file COPYING. If not, write to
19 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. 19 the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
20 20
21 21
22 Instructions for use 22 Instructions for use
23 -------------------- 23 --------------------
24 24
25 1) Map a Linux block device to an existing OSD object. 25 1) Map a Linux block device to an existing OSD object.
26 26
27 In this example, we will use partition id 1234, object id 5678, 27 In this example, we will use partition id 1234, object id 5678,
28 OSD device /dev/osd1. 28 OSD device /dev/osd1.
29 29
30 $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add 30 $ echo "1234 5678 /dev/osd1" > /sys/class/osdblk/add
31 31
32 32
33 2) List all active blkdev<->object mappings. 33 2) List all active blkdev<->object mappings.
34 34
35 In this example, we have performed step #1 twice, creating two blkdevs, 35 In this example, we have performed step #1 twice, creating two blkdevs,
36 mapped to two separate OSD objects. 36 mapped to two separate OSD objects.
37 37
38 $ cat /sys/class/osdblk/list 38 $ cat /sys/class/osdblk/list
39 0 174 1234 5678 /dev/osd1 39 0 174 1234 5678 /dev/osd1
40 1 179 1994 897123 /dev/osd0 40 1 179 1994 897123 /dev/osd0
41 41
42 The columns, in order, are: 42 The columns, in order, are:
43 - blkdev unique id 43 - blkdev unique id
44 - blkdev assigned major 44 - blkdev assigned major
45 - OSD object partition id 45 - OSD object partition id
46 - OSD object id 46 - OSD object id
47 - OSD device 47 - OSD device
48 48
49 49
50 3) Remove an active blkdev<->object mapping. 50 3) Remove an active blkdev<->object mapping.
51 51
52 In this example, we remove the mapping with blkdev unique id 1. 52 In this example, we remove the mapping with blkdev unique id 1.
53 53
54 $ echo 1 > /sys/class/osdblk/remove 54 $ echo 1 > /sys/class/osdblk/remove
55 55
56 56
57 NOTE: The actual creation and deletion of OSD objects is outside the scope 57 NOTE: The actual creation and deletion of OSD objects is outside the scope
58 of this driver. 58 of this driver.
59 59
60 */ 60 */
61 61
62 #include <linux/kernel.h> 62 #include <linux/kernel.h>
63 #include <linux/device.h> 63 #include <linux/device.h>
64 #include <linux/module.h> 64 #include <linux/module.h>
65 #include <linux/fs.h> 65 #include <linux/fs.h>
66 #include <linux/slab.h> 66 #include <linux/slab.h>
67 #include <scsi/osd_initiator.h> 67 #include <scsi/osd_initiator.h>
68 #include <scsi/osd_attributes.h> 68 #include <scsi/osd_attributes.h>
69 #include <scsi/osd_sec.h> 69 #include <scsi/osd_sec.h>
70 #include <scsi/scsi_device.h> 70 #include <scsi/scsi_device.h>
71 71
72 #define DRV_NAME "osdblk" 72 #define DRV_NAME "osdblk"
73 #define PFX DRV_NAME ": " 73 #define PFX DRV_NAME ": "
74 74
75 /* #define _OSDBLK_DEBUG */ 75 /* #define _OSDBLK_DEBUG */
76 #ifdef _OSDBLK_DEBUG 76 #ifdef _OSDBLK_DEBUG
77 #define OSDBLK_DEBUG(fmt, a...) \ 77 #define OSDBLK_DEBUG(fmt, a...) \
78 printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a) 78 printk(KERN_NOTICE "osdblk @%s:%d: " fmt, __func__, __LINE__, ##a)
79 #else 79 #else
80 #define OSDBLK_DEBUG(fmt, a...) \ 80 #define OSDBLK_DEBUG(fmt, a...) \
81 do { if (0) printk(fmt, ##a); } while (0) 81 do { if (0) printk(fmt, ##a); } while (0)
82 #endif 82 #endif
83 83
84 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>"); 84 MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
85 MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko"); 85 MODULE_DESCRIPTION("block device inside an OSD object osdblk.ko");
86 MODULE_LICENSE("GPL"); 86 MODULE_LICENSE("GPL");
87 87
88 struct osdblk_device; 88 struct osdblk_device;
89 89
90 enum { 90 enum {
91 OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */ 91 OSDBLK_MINORS_PER_MAJOR = 256, /* max minors per blkdev */
92 OSDBLK_MAX_REQ = 32, /* max parallel requests */ 92 OSDBLK_MAX_REQ = 32, /* max parallel requests */
93 OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */ 93 OSDBLK_OP_TIMEOUT = 4 * 60, /* sync OSD req timeout */
94 }; 94 };
95 95
96 struct osdblk_request { 96 struct osdblk_request {
97 struct request *rq; /* blk layer request */ 97 struct request *rq; /* blk layer request */
98 struct bio *bio; /* cloned bio */ 98 struct bio *bio; /* cloned bio */
99 struct osdblk_device *osdev; /* associated blkdev */ 99 struct osdblk_device *osdev; /* associated blkdev */
100 }; 100 };
101 101
102 struct osdblk_device { 102 struct osdblk_device {
103 int id; /* blkdev unique id */ 103 int id; /* blkdev unique id */
104 104
105 int major; /* blkdev assigned major */ 105 int major; /* blkdev assigned major */
106 struct gendisk *disk; /* blkdev's gendisk and rq */ 106 struct gendisk *disk; /* blkdev's gendisk and rq */
107 struct request_queue *q; 107 struct request_queue *q;
108 108
109 struct osd_dev *osd; /* associated OSD */ 109 struct osd_dev *osd; /* associated OSD */
110 110
111 char name[32]; /* blkdev name, e.g. osdblk34 */ 111 char name[32]; /* blkdev name, e.g. osdblk34 */
112 112
113 spinlock_t lock; /* queue lock */ 113 spinlock_t lock; /* queue lock */
114 114
115 struct osd_obj_id obj; /* OSD partition, obj id */ 115 struct osd_obj_id obj; /* OSD partition, obj id */
116 uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */ 116 uint8_t obj_cred[OSD_CAP_LEN]; /* OSD cred */
117 117
118 struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */ 118 struct osdblk_request req[OSDBLK_MAX_REQ]; /* request table */
119 119
120 struct list_head node; 120 struct list_head node;
121 121
122 char osd_path[0]; /* OSD device path */ 122 char osd_path[0]; /* OSD device path */
123 }; 123 };
124 124
125 static struct class *class_osdblk; /* /sys/class/osdblk */ 125 static struct class *class_osdblk; /* /sys/class/osdblk */
126 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */ 126 static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
127 static LIST_HEAD(osdblkdev_list); 127 static LIST_HEAD(osdblkdev_list);
128 128
129 static const struct block_device_operations osdblk_bd_ops = { 129 static const struct block_device_operations osdblk_bd_ops = {
130 .owner = THIS_MODULE, 130 .owner = THIS_MODULE,
131 }; 131 };
132 132
133 static const struct osd_attr g_attr_logical_length = ATTR_DEF( 133 static const struct osd_attr g_attr_logical_length = ATTR_DEF(
134 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8); 134 OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
135 135
136 static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN], 136 static void osdblk_make_credential(u8 cred_a[OSD_CAP_LEN],
137 const struct osd_obj_id *obj) 137 const struct osd_obj_id *obj)
138 { 138 {
139 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true); 139 osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
140 } 140 }
141 141
142 /* copied from exofs; move to libosd? */ 142 /* copied from exofs; move to libosd? */
143 /* 143 /*
144 * Perform a synchronous OSD operation. copied from exofs; move to libosd? 144 * Perform a synchronous OSD operation. copied from exofs; move to libosd?
145 */ 145 */
146 static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential) 146 static int osd_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
147 { 147 {
148 int ret; 148 int ret;
149 149
150 or->timeout = timeout; 150 or->timeout = timeout;
151 ret = osd_finalize_request(or, 0, credential, NULL); 151 ret = osd_finalize_request(or, 0, credential, NULL);
152 if (ret) 152 if (ret)
153 return ret; 153 return ret;
154 154
155 ret = osd_execute_request(or); 155 ret = osd_execute_request(or);
156 156
157 /* osd_req_decode_sense(or, ret); */ 157 /* osd_req_decode_sense(or, ret); */
158 return ret; 158 return ret;
159 } 159 }
160 160
161 /* 161 /*
162 * Perform an asynchronous OSD operation. copied from exofs; move to libosd? 162 * Perform an asynchronous OSD operation. copied from exofs; move to libosd?
163 */ 163 */
164 static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done, 164 static int osd_async_op(struct osd_request *or, osd_req_done_fn *async_done,
165 void *caller_context, u8 *cred) 165 void *caller_context, u8 *cred)
166 { 166 {
167 int ret; 167 int ret;
168 168
169 ret = osd_finalize_request(or, 0, cred, NULL); 169 ret = osd_finalize_request(or, 0, cred, NULL);
170 if (ret) 170 if (ret)
171 return ret; 171 return ret;
172 172
173 ret = osd_execute_request_async(or, async_done, caller_context); 173 ret = osd_execute_request_async(or, async_done, caller_context);
174 174
175 return ret; 175 return ret;
176 } 176 }
177 177
178 /* copied from exofs; move to libosd? */ 178 /* copied from exofs; move to libosd? */
179 static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr) 179 static int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
180 { 180 {
181 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */ 181 struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
182 void *iter = NULL; 182 void *iter = NULL;
183 int nelem; 183 int nelem;
184 184
185 do { 185 do {
186 nelem = 1; 186 nelem = 1;
187 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter); 187 osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
188 if ((cur_attr.attr_page == attr->attr_page) && 188 if ((cur_attr.attr_page == attr->attr_page) &&
189 (cur_attr.attr_id == attr->attr_id)) { 189 (cur_attr.attr_id == attr->attr_id)) {
190 attr->len = cur_attr.len; 190 attr->len = cur_attr.len;
191 attr->val_ptr = cur_attr.val_ptr; 191 attr->val_ptr = cur_attr.val_ptr;
192 return 0; 192 return 0;
193 } 193 }
194 } while (iter); 194 } while (iter);
195 195
196 return -EIO; 196 return -EIO;
197 } 197 }
198 198
199 static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out) 199 static int osdblk_get_obj_size(struct osdblk_device *osdev, u64 *size_out)
200 { 200 {
201 struct osd_request *or; 201 struct osd_request *or;
202 struct osd_attr attr; 202 struct osd_attr attr;
203 int ret; 203 int ret;
204 204
205 /* start request */ 205 /* start request */
206 or = osd_start_request(osdev->osd, GFP_KERNEL); 206 or = osd_start_request(osdev->osd, GFP_KERNEL);
207 if (!or) 207 if (!or)
208 return -ENOMEM; 208 return -ENOMEM;
209 209
210 /* create a get-attributes(length) request */ 210 /* create a get-attributes(length) request */
211 osd_req_get_attributes(or, &osdev->obj); 211 osd_req_get_attributes(or, &osdev->obj);
212 212
213 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1); 213 osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
214 214
215 /* execute op synchronously */ 215 /* execute op synchronously */
216 ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred); 216 ret = osd_sync_op(or, OSDBLK_OP_TIMEOUT, osdev->obj_cred);
217 if (ret) 217 if (ret)
218 goto out; 218 goto out;
219 219
220 /* extract length from returned attribute info */ 220 /* extract length from returned attribute info */
221 attr = g_attr_logical_length; 221 attr = g_attr_logical_length;
222 ret = extract_attr_from_req(or, &attr); 222 ret = extract_attr_from_req(or, &attr);
223 if (ret) 223 if (ret)
224 goto out; 224 goto out;
225 225
226 *size_out = get_unaligned_be64(attr.val_ptr); 226 *size_out = get_unaligned_be64(attr.val_ptr);
227 227
228 out: 228 out:
229 osd_end_request(or); 229 osd_end_request(or);
230 return ret; 230 return ret;
231 231
232 } 232 }
233 233
234 static void osdblk_osd_complete(struct osd_request *or, void *private) 234 static void osdblk_osd_complete(struct osd_request *or, void *private)
235 { 235 {
236 struct osdblk_request *orq = private; 236 struct osdblk_request *orq = private;
237 struct osd_sense_info osi; 237 struct osd_sense_info osi;
238 int ret = osd_req_decode_sense(or, &osi); 238 int ret = osd_req_decode_sense(or, &osi);
239 239
240 if (ret) { 240 if (ret) {
241 ret = -EIO; 241 ret = -EIO;
242 OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret); 242 OSDBLK_DEBUG("osdblk_osd_complete with err=%d\n", ret);
243 } 243 }
244 244
245 /* complete OSD request */ 245 /* complete OSD request */
246 osd_end_request(or); 246 osd_end_request(or);
247 247
248 /* complete request passed to osdblk by block layer */ 248 /* complete request passed to osdblk by block layer */
249 __blk_end_request_all(orq->rq, ret); 249 __blk_end_request_all(orq->rq, ret);
250 } 250 }
251 251
252 static void bio_chain_put(struct bio *chain) 252 static void bio_chain_put(struct bio *chain)
253 { 253 {
254 struct bio *tmp; 254 struct bio *tmp;
255 255
256 while (chain) { 256 while (chain) {
257 tmp = chain; 257 tmp = chain;
258 chain = chain->bi_next; 258 chain = chain->bi_next;
259 259
260 bio_put(tmp); 260 bio_put(tmp);
261 } 261 }
262 } 262 }
263 263
264 static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask) 264 static struct bio *bio_chain_clone(struct bio *old_chain, gfp_t gfpmask)
265 { 265 {
266 struct bio *tmp, *new_chain = NULL, *tail = NULL; 266 struct bio *tmp, *new_chain = NULL, *tail = NULL;
267 267
268 while (old_chain) { 268 while (old_chain) {
269 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs); 269 tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
270 if (!tmp) 270 if (!tmp)
271 goto err_out; 271 goto err_out;
272 272
273 __bio_clone(tmp, old_chain); 273 __bio_clone(tmp, old_chain);
274 tmp->bi_bdev = NULL; 274 tmp->bi_bdev = NULL;
275 gfpmask &= ~__GFP_WAIT; 275 gfpmask &= ~__GFP_WAIT;
276 tmp->bi_next = NULL; 276 tmp->bi_next = NULL;
277 277
278 if (!new_chain) 278 if (!new_chain)
279 new_chain = tail = tmp; 279 new_chain = tail = tmp;
280 else { 280 else {
281 tail->bi_next = tmp; 281 tail->bi_next = tmp;
282 tail = tmp; 282 tail = tmp;
283 } 283 }
284 284
285 old_chain = old_chain->bi_next; 285 old_chain = old_chain->bi_next;
286 } 286 }
287 287
288 return new_chain; 288 return new_chain;
289 289
290 err_out: 290 err_out:
291 OSDBLK_DEBUG("bio_chain_clone with err\n"); 291 OSDBLK_DEBUG("bio_chain_clone with err\n");
292 bio_chain_put(new_chain); 292 bio_chain_put(new_chain);
293 return NULL; 293 return NULL;
294 } 294 }
295 295
296 static void osdblk_rq_fn(struct request_queue *q) 296 static void osdblk_rq_fn(struct request_queue *q)
297 { 297 {
298 struct osdblk_device *osdev = q->queuedata; 298 struct osdblk_device *osdev = q->queuedata;
299 299
300 while (1) { 300 while (1) {
301 struct request *rq; 301 struct request *rq;
302 struct osdblk_request *orq; 302 struct osdblk_request *orq;
303 struct osd_request *or; 303 struct osd_request *or;
304 struct bio *bio; 304 struct bio *bio;
305 bool do_write, do_flush; 305 bool do_write, do_flush;
306 306
307 /* peek at request from block layer */ 307 /* peek at request from block layer */
308 rq = blk_fetch_request(q); 308 rq = blk_fetch_request(q);
309 if (!rq) 309 if (!rq)
310 break; 310 break;
311 311
312 /* filter out block requests we don't understand */ 312 /* filter out block requests we don't understand */
313 if (rq->cmd_type != REQ_TYPE_FS && 313 if (rq->cmd_type != REQ_TYPE_FS &&
314 !(rq->cmd_flags & REQ_HARDBARRIER)) { 314 !(rq->cmd_flags & REQ_HARDBARRIER)) {
315 blk_end_request_all(rq, 0); 315 blk_end_request_all(rq, 0);
316 continue; 316 continue;
317 } 317 }
318 318
319 /* deduce our operation (read, write, flush) */ 319 /* deduce our operation (read, write, flush) */
320 /* I wish the block layer simplified cmd_type/cmd_flags/cmd[] 320 /* I wish the block layer simplified cmd_type/cmd_flags/cmd[]
321 * into a clearly defined set of RPC commands: 321 * into a clearly defined set of RPC commands:
322 * read, write, flush, scsi command, power mgmt req, 322 * read, write, flush, scsi command, power mgmt req,
323 * driver-specific, etc. 323 * driver-specific, etc.
324 */ 324 */
325 325
326 do_flush = rq->cmd_flags & REQ_FLUSH; 326 do_flush = rq->cmd_flags & REQ_FLUSH;
327 do_write = (rq_data_dir(rq) == WRITE); 327 do_write = (rq_data_dir(rq) == WRITE);
328 328
329 if (!do_flush) { /* osd_flush does not use a bio */ 329 if (!do_flush) { /* osd_flush does not use a bio */
330 /* a bio clone to be passed down to OSD request */ 330 /* a bio clone to be passed down to OSD request */
331 bio = bio_chain_clone(rq->bio, GFP_ATOMIC); 331 bio = bio_chain_clone(rq->bio, GFP_ATOMIC);
332 if (!bio) 332 if (!bio)
333 break; 333 break;
334 } else 334 } else
335 bio = NULL; 335 bio = NULL;
336 336
337 /* alloc internal OSD request, for OSD command execution */ 337 /* alloc internal OSD request, for OSD command execution */
338 or = osd_start_request(osdev->osd, GFP_ATOMIC); 338 or = osd_start_request(osdev->osd, GFP_ATOMIC);
339 if (!or) { 339 if (!or) {
340 bio_chain_put(bio); 340 bio_chain_put(bio);
341 OSDBLK_DEBUG("osd_start_request with err\n"); 341 OSDBLK_DEBUG("osd_start_request with err\n");
342 break; 342 break;
343 } 343 }
344 344
345 orq = &osdev->req[rq->tag]; 345 orq = &osdev->req[rq->tag];
346 orq->rq = rq; 346 orq->rq = rq;
347 orq->bio = bio; 347 orq->bio = bio;
348 orq->osdev = osdev; 348 orq->osdev = osdev;
349 349
350 /* init OSD command: flush, write or read */ 350 /* init OSD command: flush, write or read */
351 if (do_flush) 351 if (do_flush)
352 osd_req_flush_object(or, &osdev->obj, 352 osd_req_flush_object(or, &osdev->obj,
353 OSD_CDB_FLUSH_ALL, 0, 0); 353 OSD_CDB_FLUSH_ALL, 0, 0);
354 else if (do_write) 354 else if (do_write)
355 osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL, 355 osd_req_write(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
356 bio, blk_rq_bytes(rq)); 356 bio, blk_rq_bytes(rq));
357 else 357 else
358 osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL, 358 osd_req_read(or, &osdev->obj, blk_rq_pos(rq) * 512ULL,
359 bio, blk_rq_bytes(rq)); 359 bio, blk_rq_bytes(rq));
360 360
361 OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n", 361 OSDBLK_DEBUG("%s 0x%x bytes at 0x%llx\n",
362 do_flush ? "flush" : do_write ? 362 do_flush ? "flush" : do_write ?
363 "write" : "read", blk_rq_bytes(rq), 363 "write" : "read", blk_rq_bytes(rq),
364 blk_rq_pos(rq) * 512ULL); 364 blk_rq_pos(rq) * 512ULL);
365 365
366 /* begin OSD command execution */ 366 /* begin OSD command execution */
367 if (osd_async_op(or, osdblk_osd_complete, orq, 367 if (osd_async_op(or, osdblk_osd_complete, orq,
368 osdev->obj_cred)) { 368 osdev->obj_cred)) {
369 osd_end_request(or); 369 osd_end_request(or);
370 blk_requeue_request(q, rq); 370 blk_requeue_request(q, rq);
371 bio_chain_put(bio); 371 bio_chain_put(bio);
372 OSDBLK_DEBUG("osd_execute_request_async with err\n"); 372 OSDBLK_DEBUG("osd_execute_request_async with err\n");
373 break; 373 break;
374 } 374 }
375 375
376 /* remove the special 'flush' marker, now that the command 376 /* remove the special 'flush' marker, now that the command
377 * is executing 377 * is executing
378 */ 378 */
379 rq->special = NULL; 379 rq->special = NULL;
380 } 380 }
381 } 381 }
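osdblk_rq_fn() above keys its dispatch off REQ_FLUSH in rq->cmd_flags: a flush request carries no bio payload, everything else is a read or a write. A stripped-down sketch of that decision in a generic request_fn, with hypothetical issue_flush()/issue_rw() helpers standing in for the OSD calls:

#include <linux/blkdev.h>

/* Hypothetical backends; the real driver builds an OSD flush or a
 * read/write on a cloned bio chain and completes the request from its
 * async callback rather than immediately. */
static void issue_flush(struct request *rq)
{
	__blk_end_request_all(rq, 0);	/* sketch: complete right away */
}

static void issue_rw(struct request *rq, bool is_write)
{
	__blk_end_request_all(rq, 0);	/* sketch: complete right away */
}

static void example_rq_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_fetch_request(q)) != NULL) {
		if (rq->cmd_flags & REQ_FLUSH)
			issue_flush(rq);	/* empty cache-flush request */
		else
			issue_rw(rq, rq_data_dir(rq) == WRITE);
	}
}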
382 382
383 static void osdblk_free_disk(struct osdblk_device *osdev) 383 static void osdblk_free_disk(struct osdblk_device *osdev)
384 { 384 {
385 struct gendisk *disk = osdev->disk; 385 struct gendisk *disk = osdev->disk;
386 386
387 if (!disk) 387 if (!disk)
388 return; 388 return;
389 389
390 if (disk->flags & GENHD_FL_UP) 390 if (disk->flags & GENHD_FL_UP)
391 del_gendisk(disk); 391 del_gendisk(disk);
392 if (disk->queue) 392 if (disk->queue)
393 blk_cleanup_queue(disk->queue); 393 blk_cleanup_queue(disk->queue);
394 put_disk(disk); 394 put_disk(disk);
395 } 395 }
396 396
397 static int osdblk_init_disk(struct osdblk_device *osdev) 397 static int osdblk_init_disk(struct osdblk_device *osdev)
398 { 398 {
399 struct gendisk *disk; 399 struct gendisk *disk;
400 struct request_queue *q; 400 struct request_queue *q;
401 int rc; 401 int rc;
402 u64 obj_size = 0; 402 u64 obj_size = 0;
403 403
404 /* contact OSD, request size info about the object being mapped */ 404 /* contact OSD, request size info about the object being mapped */
405 rc = osdblk_get_obj_size(osdev, &obj_size); 405 rc = osdblk_get_obj_size(osdev, &obj_size);
406 if (rc) 406 if (rc)
407 return rc; 407 return rc;
408 408
409 /* create gendisk info */ 409 /* create gendisk info */
410 disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR); 410 disk = alloc_disk(OSDBLK_MINORS_PER_MAJOR);
411 if (!disk) 411 if (!disk)
412 return -ENOMEM; 412 return -ENOMEM;
413 413
414 sprintf(disk->disk_name, DRV_NAME "%d", osdev->id); 414 sprintf(disk->disk_name, DRV_NAME "%d", osdev->id);
415 disk->major = osdev->major; 415 disk->major = osdev->major;
416 disk->first_minor = 0; 416 disk->first_minor = 0;
417 disk->fops = &osdblk_bd_ops; 417 disk->fops = &osdblk_bd_ops;
418 disk->private_data = osdev; 418 disk->private_data = osdev;
419 419
420 /* init rq */ 420 /* init rq */
421 q = blk_init_queue(osdblk_rq_fn, &osdev->lock); 421 q = blk_init_queue(osdblk_rq_fn, &osdev->lock);
422 if (!q) { 422 if (!q) {
423 put_disk(disk); 423 put_disk(disk);
424 return -ENOMEM; 424 return -ENOMEM;
425 } 425 }
426 426
427 /* switch queue to TCQ mode; allocate tag map */ 427 /* switch queue to TCQ mode; allocate tag map */
428 rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL); 428 rc = blk_queue_init_tags(q, OSDBLK_MAX_REQ, NULL);
429 if (rc) { 429 if (rc) {
430 blk_cleanup_queue(q); 430 blk_cleanup_queue(q);
431 put_disk(disk); 431 put_disk(disk);
432 return rc; 432 return rc;
433 } 433 }
434 434
435 /* Set our limits to the lower device limits, because osdblk cannot 435 /* Set our limits to the lower device limits, because osdblk cannot
436 * sleep when allocating a lower-request and therefore cannot be 436 * sleep when allocating a lower-request and therefore cannot be
437 * bouncing. 437 * bouncing.
438 */ 438 */
439 blk_queue_stack_limits(q, osd_request_queue(osdev->osd)); 439 blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
440 440
441 blk_queue_prep_rq(q, blk_queue_start_tag); 441 blk_queue_prep_rq(q, blk_queue_start_tag);
442 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); 442 blk_queue_flush(q, REQ_FLUSH);
443 443
444 disk->queue = q; 444 disk->queue = q;
445 445
446 q->queuedata = osdev; 446 q->queuedata = osdev;
447 447
448 osdev->disk = disk; 448 osdev->disk = disk;
449 osdev->q = q; 449 osdev->q = q;
450 450
451 /* finally, announce the disk to the world */ 451 /* finally, announce the disk to the world */
452 set_capacity(disk, obj_size / 512ULL); 452 set_capacity(disk, obj_size / 512ULL);
453 add_disk(disk); 453 add_disk(disk);
454 454
455 	printk(KERN_INFO "%s: added, size 0x%llx\n", 455 	printk(KERN_INFO "%s: added, size 0x%llx\n",
456 disk->disk_name, (unsigned long long)obj_size); 456 disk->disk_name, (unsigned long long)obj_size);
457 457
458 return 0; 458 return 0;
459 } 459 }
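The hunk above is the osdblk side of the interface change: blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH) becomes blk_queue_flush(q, REQ_FLUSH), which advertises cache-flush support for the exported object. A minimal sketch of queue bring-up with the new call; everything except the block-layer functions is hypothetical:

#include <linux/blkdev.h>
#include <linux/spinlock.h>

/* Hypothetical driver state; only the queue lock matters here. */
struct mydrv {
	spinlock_t lock;
};

static void mydrv_rq_fn(struct request_queue *q)
{
	/* request handling elided; see osdblk_rq_fn() above for the shape */
}

static struct request_queue *mydrv_alloc_queue(struct mydrv *drv)
{
	struct request_queue *q;

	q = blk_init_queue(mydrv_rq_fn, &drv->lock);
	if (!q)
		return NULL;

	/* Declare cache-flush support; a device that also honours FUA
	 * writes would pass REQ_FLUSH | REQ_FUA instead. */
	blk_queue_flush(q, REQ_FLUSH);
	return q;
}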
460 460
461 /******************************************************************** 461 /********************************************************************
462 * /sys/class/osdblk/ 462 * /sys/class/osdblk/
463 * add map OSD object to blkdev 463 * add map OSD object to blkdev
464 * remove unmap OSD object 464 * remove unmap OSD object
465 * list show mappings 465 * list show mappings
466 *******************************************************************/ 466 *******************************************************************/
467 467
468 static void class_osdblk_release(struct class *cls) 468 static void class_osdblk_release(struct class *cls)
469 { 469 {
470 kfree(cls); 470 kfree(cls);
471 } 471 }
472 472
473 static ssize_t class_osdblk_list(struct class *c, 473 static ssize_t class_osdblk_list(struct class *c,
474 struct class_attribute *attr, 474 struct class_attribute *attr,
475 char *data) 475 char *data)
476 { 476 {
477 int n = 0; 477 int n = 0;
478 struct list_head *tmp; 478 struct list_head *tmp;
479 479
480 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 480 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
481 481
482 list_for_each(tmp, &osdblkdev_list) { 482 list_for_each(tmp, &osdblkdev_list) {
483 struct osdblk_device *osdev; 483 struct osdblk_device *osdev;
484 484
485 osdev = list_entry(tmp, struct osdblk_device, node); 485 osdev = list_entry(tmp, struct osdblk_device, node);
486 486
487 n += sprintf(data+n, "%d %d %llu %llu %s\n", 487 n += sprintf(data+n, "%d %d %llu %llu %s\n",
488 osdev->id, 488 osdev->id,
489 osdev->major, 489 osdev->major,
490 osdev->obj.partition, 490 osdev->obj.partition,
491 osdev->obj.id, 491 osdev->obj.id,
492 osdev->osd_path); 492 osdev->osd_path);
493 } 493 }
494 494
495 mutex_unlock(&ctl_mutex); 495 mutex_unlock(&ctl_mutex);
496 return n; 496 return n;
497 } 497 }
498 498
499 static ssize_t class_osdblk_add(struct class *c, 499 static ssize_t class_osdblk_add(struct class *c,
500 struct class_attribute *attr, 500 struct class_attribute *attr,
501 const char *buf, size_t count) 501 const char *buf, size_t count)
502 { 502 {
503 struct osdblk_device *osdev; 503 struct osdblk_device *osdev;
504 ssize_t rc; 504 ssize_t rc;
505 int irc, new_id = 0; 505 int irc, new_id = 0;
506 struct list_head *tmp; 506 struct list_head *tmp;
507 507
508 if (!try_module_get(THIS_MODULE)) 508 if (!try_module_get(THIS_MODULE))
509 return -ENODEV; 509 return -ENODEV;
510 510
511 /* new osdblk_device object */ 511 /* new osdblk_device object */
512 osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL); 512 osdev = kzalloc(sizeof(*osdev) + strlen(buf) + 1, GFP_KERNEL);
513 if (!osdev) { 513 if (!osdev) {
514 rc = -ENOMEM; 514 rc = -ENOMEM;
515 goto err_out_mod; 515 goto err_out_mod;
516 } 516 }
517 517
518 /* static osdblk_device initialization */ 518 /* static osdblk_device initialization */
519 spin_lock_init(&osdev->lock); 519 spin_lock_init(&osdev->lock);
520 INIT_LIST_HEAD(&osdev->node); 520 INIT_LIST_HEAD(&osdev->node);
521 521
522 /* generate unique id: find highest unique id, add one */ 522 /* generate unique id: find highest unique id, add one */
523 523
524 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 524 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
525 525
526 list_for_each(tmp, &osdblkdev_list) { 526 list_for_each(tmp, &osdblkdev_list) {
527 struct osdblk_device *osdev; 527 struct osdblk_device *osdev;
528 528
529 osdev = list_entry(tmp, struct osdblk_device, node); 529 osdev = list_entry(tmp, struct osdblk_device, node);
530 if (osdev->id > new_id) 530 if (osdev->id > new_id)
531 new_id = osdev->id + 1; 531 new_id = osdev->id + 1;
532 } 532 }
533 533
534 osdev->id = new_id; 534 osdev->id = new_id;
535 535
536 /* add to global list */ 536 /* add to global list */
537 list_add_tail(&osdev->node, &osdblkdev_list); 537 list_add_tail(&osdev->node, &osdblkdev_list);
538 538
539 mutex_unlock(&ctl_mutex); 539 mutex_unlock(&ctl_mutex);
540 540
541 /* parse add command */ 541 /* parse add command */
542 if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id, 542 if (sscanf(buf, "%llu %llu %s", &osdev->obj.partition, &osdev->obj.id,
543 osdev->osd_path) != 3) { 543 osdev->osd_path) != 3) {
544 rc = -EINVAL; 544 rc = -EINVAL;
545 goto err_out_slot; 545 goto err_out_slot;
546 } 546 }
547 547
548 /* initialize rest of new object */ 548 /* initialize rest of new object */
549 sprintf(osdev->name, DRV_NAME "%d", osdev->id); 549 sprintf(osdev->name, DRV_NAME "%d", osdev->id);
550 550
551 /* contact requested OSD */ 551 /* contact requested OSD */
552 osdev->osd = osduld_path_lookup(osdev->osd_path); 552 osdev->osd = osduld_path_lookup(osdev->osd_path);
553 if (IS_ERR(osdev->osd)) { 553 if (IS_ERR(osdev->osd)) {
554 rc = PTR_ERR(osdev->osd); 554 rc = PTR_ERR(osdev->osd);
555 goto err_out_slot; 555 goto err_out_slot;
556 } 556 }
557 557
558 /* build OSD credential */ 558 /* build OSD credential */
559 osdblk_make_credential(osdev->obj_cred, &osdev->obj); 559 osdblk_make_credential(osdev->obj_cred, &osdev->obj);
560 560
561 /* register our block device */ 561 /* register our block device */
562 irc = register_blkdev(0, osdev->name); 562 irc = register_blkdev(0, osdev->name);
563 if (irc < 0) { 563 if (irc < 0) {
564 rc = irc; 564 rc = irc;
565 goto err_out_osd; 565 goto err_out_osd;
566 } 566 }
567 567
568 osdev->major = irc; 568 osdev->major = irc;
569 569
570 /* set up and announce blkdev mapping */ 570 /* set up and announce blkdev mapping */
571 rc = osdblk_init_disk(osdev); 571 rc = osdblk_init_disk(osdev);
572 if (rc) 572 if (rc)
573 goto err_out_blkdev; 573 goto err_out_blkdev;
574 574
575 return count; 575 return count;
576 576
577 err_out_blkdev: 577 err_out_blkdev:
578 unregister_blkdev(osdev->major, osdev->name); 578 unregister_blkdev(osdev->major, osdev->name);
579 err_out_osd: 579 err_out_osd:
580 osduld_put_device(osdev->osd); 580 osduld_put_device(osdev->osd);
581 err_out_slot: 581 err_out_slot:
582 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 582 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
583 list_del_init(&osdev->node); 583 list_del_init(&osdev->node);
584 mutex_unlock(&ctl_mutex); 584 mutex_unlock(&ctl_mutex);
585 585
586 kfree(osdev); 586 kfree(osdev);
587 err_out_mod: 587 err_out_mod:
588 OSDBLK_DEBUG("Error adding device %s\n", buf); 588 OSDBLK_DEBUG("Error adding device %s\n", buf);
589 module_put(THIS_MODULE); 589 module_put(THIS_MODULE);
590 return rc; 590 return rc;
591 } 591 }
592 592
593 static ssize_t class_osdblk_remove(struct class *c, 593 static ssize_t class_osdblk_remove(struct class *c,
594 struct class_attribute *attr, 594 struct class_attribute *attr,
595 const char *buf, 595 const char *buf,
596 size_t count) 596 size_t count)
597 { 597 {
598 struct osdblk_device *osdev = NULL; 598 struct osdblk_device *osdev = NULL;
599 int target_id, rc; 599 int target_id, rc;
600 unsigned long ul; 600 unsigned long ul;
601 struct list_head *tmp; 601 struct list_head *tmp;
602 602
603 rc = strict_strtoul(buf, 10, &ul); 603 rc = strict_strtoul(buf, 10, &ul);
604 if (rc) 604 if (rc)
605 return rc; 605 return rc;
606 606
607 /* convert to int; abort if we lost anything in the conversion */ 607 /* convert to int; abort if we lost anything in the conversion */
608 target_id = (int) ul; 608 target_id = (int) ul;
609 if (target_id != ul) 609 if (target_id != ul)
610 return -EINVAL; 610 return -EINVAL;
611 611
612 /* remove object from list immediately */ 612 /* remove object from list immediately */
613 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); 613 mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
614 614
615 list_for_each(tmp, &osdblkdev_list) { 615 list_for_each(tmp, &osdblkdev_list) {
616 osdev = list_entry(tmp, struct osdblk_device, node); 616 osdev = list_entry(tmp, struct osdblk_device, node);
617 if (osdev->id == target_id) { 617 if (osdev->id == target_id) {
618 list_del_init(&osdev->node); 618 list_del_init(&osdev->node);
619 break; 619 break;
620 } 620 }
621 osdev = NULL; 621 osdev = NULL;
622 } 622 }
623 623
624 mutex_unlock(&ctl_mutex); 624 mutex_unlock(&ctl_mutex);
625 625
626 if (!osdev) 626 if (!osdev)
627 return -ENOENT; 627 return -ENOENT;
628 628
629 /* clean up and free blkdev and associated OSD connection */ 629 /* clean up and free blkdev and associated OSD connection */
630 osdblk_free_disk(osdev); 630 osdblk_free_disk(osdev);
631 unregister_blkdev(osdev->major, osdev->name); 631 unregister_blkdev(osdev->major, osdev->name);
632 osduld_put_device(osdev->osd); 632 osduld_put_device(osdev->osd);
633 kfree(osdev); 633 kfree(osdev);
634 634
635 /* release module ref */ 635 /* release module ref */
636 module_put(THIS_MODULE); 636 module_put(THIS_MODULE);
637 637
638 return count; 638 return count;
639 } 639 }
640 640
641 static struct class_attribute class_osdblk_attrs[] = { 641 static struct class_attribute class_osdblk_attrs[] = {
642 __ATTR(add, 0200, NULL, class_osdblk_add), 642 __ATTR(add, 0200, NULL, class_osdblk_add),
643 __ATTR(remove, 0200, NULL, class_osdblk_remove), 643 __ATTR(remove, 0200, NULL, class_osdblk_remove),
644 __ATTR(list, 0444, class_osdblk_list, NULL), 644 __ATTR(list, 0444, class_osdblk_list, NULL),
645 __ATTR_NULL 645 __ATTR_NULL
646 }; 646 };
647 647
648 static int osdblk_sysfs_init(void) 648 static int osdblk_sysfs_init(void)
649 { 649 {
650 int ret = 0; 650 int ret = 0;
651 651
652 /* 652 /*
653 * create control files in sysfs 653 * create control files in sysfs
654 * /sys/class/osdblk/... 654 * /sys/class/osdblk/...
655 */ 655 */
656 class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL); 656 class_osdblk = kzalloc(sizeof(*class_osdblk), GFP_KERNEL);
657 if (!class_osdblk) 657 if (!class_osdblk)
658 return -ENOMEM; 658 return -ENOMEM;
659 659
660 class_osdblk->name = DRV_NAME; 660 class_osdblk->name = DRV_NAME;
661 class_osdblk->owner = THIS_MODULE; 661 class_osdblk->owner = THIS_MODULE;
662 class_osdblk->class_release = class_osdblk_release; 662 class_osdblk->class_release = class_osdblk_release;
663 class_osdblk->class_attrs = class_osdblk_attrs; 663 class_osdblk->class_attrs = class_osdblk_attrs;
664 664
665 ret = class_register(class_osdblk); 665 ret = class_register(class_osdblk);
666 if (ret) { 666 if (ret) {
667 kfree(class_osdblk); 667 kfree(class_osdblk);
668 class_osdblk = NULL; 668 class_osdblk = NULL;
669 printk(PFX "failed to create class osdblk\n"); 669 printk(PFX "failed to create class osdblk\n");
670 return ret; 670 return ret;
671 } 671 }
672 672
673 return 0; 673 return 0;
674 } 674 }
675 675
676 static void osdblk_sysfs_cleanup(void) 676 static void osdblk_sysfs_cleanup(void)
677 { 677 {
678 if (class_osdblk) 678 if (class_osdblk)
679 class_destroy(class_osdblk); 679 class_destroy(class_osdblk);
680 class_osdblk = NULL; 680 class_osdblk = NULL;
681 } 681 }
682 682
683 static int __init osdblk_init(void) 683 static int __init osdblk_init(void)
684 { 684 {
685 int rc; 685 int rc;
686 686
687 rc = osdblk_sysfs_init(); 687 rc = osdblk_sysfs_init();
688 if (rc) 688 if (rc)
689 return rc; 689 return rc;
690 690
691 return 0; 691 return 0;
692 } 692 }
693 693
694 static void __exit osdblk_exit(void) 694 static void __exit osdblk_exit(void)
695 { 695 {
696 osdblk_sysfs_cleanup(); 696 osdblk_sysfs_cleanup();
697 } 697 }
698 698
699 module_init(osdblk_init); 699 module_init(osdblk_init);
700 module_exit(osdblk_exit); 700 module_exit(osdblk_exit);
701 701
702 702
drivers/block/ps3disk.c
1 /* 1 /*
2 * PS3 Disk Storage Driver 2 * PS3 Disk Storage Driver
3 * 3 *
4 * Copyright (C) 2007 Sony Computer Entertainment Inc. 4 * Copyright (C) 2007 Sony Computer Entertainment Inc.
5 * Copyright 2007 Sony Corp. 5 * Copyright 2007 Sony Corp.
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify it 7 * This program is free software; you can redistribute it and/or modify it
8 * under the terms of the GNU General Public License as published 8 * under the terms of the GNU General Public License as published
9 * by the Free Software Foundation; version 2 of the License. 9 * by the Free Software Foundation; version 2 of the License.
10 * 10 *
11 * This program is distributed in the hope that it will be useful, but 11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of 12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * General Public License for more details. 14 * General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public License along 16 * You should have received a copy of the GNU General Public License along
17 * with this program; if not, write to the Free Software Foundation, Inc., 17 * with this program; if not, write to the Free Software Foundation, Inc.,
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
19 */ 19 */
20 20
21 #include <linux/ata.h> 21 #include <linux/ata.h>
22 #include <linux/blkdev.h> 22 #include <linux/blkdev.h>
23 #include <linux/slab.h> 23 #include <linux/slab.h>
24 24
25 #include <asm/lv1call.h> 25 #include <asm/lv1call.h>
26 #include <asm/ps3stor.h> 26 #include <asm/ps3stor.h>
27 #include <asm/firmware.h> 27 #include <asm/firmware.h>
28 28
29 29
30 #define DEVICE_NAME "ps3disk" 30 #define DEVICE_NAME "ps3disk"
31 31
32 #define BOUNCE_SIZE (64*1024) 32 #define BOUNCE_SIZE (64*1024)
33 33
34 #define PS3DISK_MAX_DISKS 16 34 #define PS3DISK_MAX_DISKS 16
35 #define PS3DISK_MINORS 16 35 #define PS3DISK_MINORS 16
36 36
37 37
38 #define PS3DISK_NAME "ps3d%c" 38 #define PS3DISK_NAME "ps3d%c"
39 39
40 40
41 struct ps3disk_private { 41 struct ps3disk_private {
42 spinlock_t lock; /* Request queue spinlock */ 42 spinlock_t lock; /* Request queue spinlock */
43 struct request_queue *queue; 43 struct request_queue *queue;
44 struct gendisk *gendisk; 44 struct gendisk *gendisk;
45 unsigned int blocking_factor; 45 unsigned int blocking_factor;
46 struct request *req; 46 struct request *req;
47 u64 raw_capacity; 47 u64 raw_capacity;
48 unsigned char model[ATA_ID_PROD_LEN+1]; 48 unsigned char model[ATA_ID_PROD_LEN+1];
49 }; 49 };
50 50
51 51
52 #define LV1_STORAGE_SEND_ATA_COMMAND (2) 52 #define LV1_STORAGE_SEND_ATA_COMMAND (2)
53 #define LV1_STORAGE_ATA_HDDOUT (0x23) 53 #define LV1_STORAGE_ATA_HDDOUT (0x23)
54 54
55 struct lv1_ata_cmnd_block { 55 struct lv1_ata_cmnd_block {
56 u16 features; 56 u16 features;
57 u16 sector_count; 57 u16 sector_count;
58 u16 LBA_low; 58 u16 LBA_low;
59 u16 LBA_mid; 59 u16 LBA_mid;
60 u16 LBA_high; 60 u16 LBA_high;
61 u8 device; 61 u8 device;
62 u8 command; 62 u8 command;
63 u32 is_ext; 63 u32 is_ext;
64 u32 proto; 64 u32 proto;
65 u32 in_out; 65 u32 in_out;
66 u32 size; 66 u32 size;
67 u64 buffer; 67 u64 buffer;
68 u32 arglen; 68 u32 arglen;
69 }; 69 };
70 70
71 enum lv1_ata_proto { 71 enum lv1_ata_proto {
72 NON_DATA_PROTO = 0, 72 NON_DATA_PROTO = 0,
73 PIO_DATA_IN_PROTO = 1, 73 PIO_DATA_IN_PROTO = 1,
74 PIO_DATA_OUT_PROTO = 2, 74 PIO_DATA_OUT_PROTO = 2,
75 DMA_PROTO = 3 75 DMA_PROTO = 3
76 }; 76 };
77 77
78 enum lv1_ata_in_out { 78 enum lv1_ata_in_out {
79 DIR_WRITE = 0, /* memory -> device */ 79 DIR_WRITE = 0, /* memory -> device */
80 DIR_READ = 1 /* device -> memory */ 80 DIR_READ = 1 /* device -> memory */
81 }; 81 };
82 82
83 static int ps3disk_major; 83 static int ps3disk_major;
84 84
85 85
86 static const struct block_device_operations ps3disk_fops = { 86 static const struct block_device_operations ps3disk_fops = {
87 .owner = THIS_MODULE, 87 .owner = THIS_MODULE,
88 }; 88 };
89 89
90 90
91 static void ps3disk_scatter_gather(struct ps3_storage_device *dev, 91 static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
92 struct request *req, int gather) 92 struct request *req, int gather)
93 { 93 {
94 unsigned int offset = 0; 94 unsigned int offset = 0;
95 struct req_iterator iter; 95 struct req_iterator iter;
96 struct bio_vec *bvec; 96 struct bio_vec *bvec;
97 unsigned int i = 0; 97 unsigned int i = 0;
98 size_t size; 98 size_t size;
99 void *buf; 99 void *buf;
100 100
101 rq_for_each_segment(bvec, req, iter) { 101 rq_for_each_segment(bvec, req, iter) {
102 unsigned long flags; 102 unsigned long flags;
103 dev_dbg(&dev->sbd.core, 103 dev_dbg(&dev->sbd.core,
104 "%s:%u: bio %u: %u segs %u sectors from %lu\n", 104 "%s:%u: bio %u: %u segs %u sectors from %lu\n",
105 __func__, __LINE__, i, bio_segments(iter.bio), 105 __func__, __LINE__, i, bio_segments(iter.bio),
106 bio_sectors(iter.bio), iter.bio->bi_sector); 106 bio_sectors(iter.bio), iter.bio->bi_sector);
107 107
108 size = bvec->bv_len; 108 size = bvec->bv_len;
109 buf = bvec_kmap_irq(bvec, &flags); 109 buf = bvec_kmap_irq(bvec, &flags);
110 if (gather) 110 if (gather)
111 memcpy(dev->bounce_buf+offset, buf, size); 111 memcpy(dev->bounce_buf+offset, buf, size);
112 else 112 else
113 memcpy(buf, dev->bounce_buf+offset, size); 113 memcpy(buf, dev->bounce_buf+offset, size);
114 offset += size; 114 offset += size;
115 flush_kernel_dcache_page(bvec->bv_page); 115 flush_kernel_dcache_page(bvec->bv_page);
116 bvec_kunmap_irq(bvec, &flags); 116 bvec_kunmap_irq(bvec, &flags);
117 i++; 117 i++;
118 } 118 }
119 } 119 }
120 120
121 static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, 121 static int ps3disk_submit_request_sg(struct ps3_storage_device *dev,
122 struct request *req) 122 struct request *req)
123 { 123 {
124 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); 124 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
125 int write = rq_data_dir(req), res; 125 int write = rq_data_dir(req), res;
126 const char *op = write ? "write" : "read"; 126 const char *op = write ? "write" : "read";
127 u64 start_sector, sectors; 127 u64 start_sector, sectors;
128 unsigned int region_id = dev->regions[dev->region_idx].id; 128 unsigned int region_id = dev->regions[dev->region_idx].id;
129 129
130 #ifdef DEBUG 130 #ifdef DEBUG
131 unsigned int n = 0; 131 unsigned int n = 0;
132 struct bio_vec *bv; 132 struct bio_vec *bv;
133 struct req_iterator iter; 133 struct req_iterator iter;
134 134
135 rq_for_each_segment(bv, req, iter) 135 rq_for_each_segment(bv, req, iter)
136 n++; 136 n++;
137 dev_dbg(&dev->sbd.core, 137 dev_dbg(&dev->sbd.core,
138 "%s:%u: %s req has %u bvecs for %u sectors\n", 138 "%s:%u: %s req has %u bvecs for %u sectors\n",
139 __func__, __LINE__, op, n, blk_rq_sectors(req)); 139 __func__, __LINE__, op, n, blk_rq_sectors(req));
140 #endif 140 #endif
141 141
142 start_sector = blk_rq_pos(req) * priv->blocking_factor; 142 start_sector = blk_rq_pos(req) * priv->blocking_factor;
143 sectors = blk_rq_sectors(req) * priv->blocking_factor; 143 sectors = blk_rq_sectors(req) * priv->blocking_factor;
144 dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n", 144 dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
145 __func__, __LINE__, op, sectors, start_sector); 145 __func__, __LINE__, op, sectors, start_sector);
146 146
147 if (write) { 147 if (write) {
148 ps3disk_scatter_gather(dev, req, 1); 148 ps3disk_scatter_gather(dev, req, 1);
149 149
150 res = lv1_storage_write(dev->sbd.dev_id, region_id, 150 res = lv1_storage_write(dev->sbd.dev_id, region_id,
151 start_sector, sectors, 0, 151 start_sector, sectors, 0,
152 dev->bounce_lpar, &dev->tag); 152 dev->bounce_lpar, &dev->tag);
153 } else { 153 } else {
154 res = lv1_storage_read(dev->sbd.dev_id, region_id, 154 res = lv1_storage_read(dev->sbd.dev_id, region_id,
155 start_sector, sectors, 0, 155 start_sector, sectors, 0,
156 dev->bounce_lpar, &dev->tag); 156 dev->bounce_lpar, &dev->tag);
157 } 157 }
158 if (res) { 158 if (res) {
159 dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, 159 dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
160 __LINE__, op, res); 160 __LINE__, op, res);
161 __blk_end_request_all(req, -EIO); 161 __blk_end_request_all(req, -EIO);
162 return 0; 162 return 0;
163 } 163 }
164 164
165 priv->req = req; 165 priv->req = req;
166 return 1; 166 return 1;
167 } 167 }
168 168
169 static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, 169 static int ps3disk_submit_flush_request(struct ps3_storage_device *dev,
170 struct request *req) 170 struct request *req)
171 { 171 {
172 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); 172 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
173 u64 res; 173 u64 res;
174 174
175 dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__); 175 dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__);
176 176
177 res = lv1_storage_send_device_command(dev->sbd.dev_id, 177 res = lv1_storage_send_device_command(dev->sbd.dev_id,
178 LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 178 LV1_STORAGE_ATA_HDDOUT, 0, 0, 0,
179 0, &dev->tag); 179 0, &dev->tag);
180 if (res) { 180 if (res) {
181 dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", 181 dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
182 __func__, __LINE__, res); 182 __func__, __LINE__, res);
183 __blk_end_request_all(req, -EIO); 183 __blk_end_request_all(req, -EIO);
184 return 0; 184 return 0;
185 } 185 }
186 186
187 priv->req = req; 187 priv->req = req;
188 return 1; 188 return 1;
189 } 189 }
190 190
191 static void ps3disk_do_request(struct ps3_storage_device *dev, 191 static void ps3disk_do_request(struct ps3_storage_device *dev,
192 struct request_queue *q) 192 struct request_queue *q)
193 { 193 {
194 struct request *req; 194 struct request *req;
195 195
196 dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); 196 dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
197 197
198 while ((req = blk_fetch_request(q))) { 198 while ((req = blk_fetch_request(q))) {
199 if (req->cmd_flags & REQ_FLUSH) { 199 if (req->cmd_flags & REQ_FLUSH) {
200 if (ps3disk_submit_flush_request(dev, req)) 200 if (ps3disk_submit_flush_request(dev, req))
201 break; 201 break;
202 } else if (req->cmd_type == REQ_TYPE_FS) { 202 } else if (req->cmd_type == REQ_TYPE_FS) {
203 if (ps3disk_submit_request_sg(dev, req)) 203 if (ps3disk_submit_request_sg(dev, req))
204 break; 204 break;
205 } else { 205 } else {
206 blk_dump_rq_flags(req, DEVICE_NAME " bad request"); 206 blk_dump_rq_flags(req, DEVICE_NAME " bad request");
207 __blk_end_request_all(req, -EIO); 207 __blk_end_request_all(req, -EIO);
208 continue; 208 continue;
209 } 209 }
210 } 210 }
211 } 211 }
212 212
213 static void ps3disk_request(struct request_queue *q) 213 static void ps3disk_request(struct request_queue *q)
214 { 214 {
215 struct ps3_storage_device *dev = q->queuedata; 215 struct ps3_storage_device *dev = q->queuedata;
216 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); 216 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
217 217
218 if (priv->req) { 218 if (priv->req) {
219 dev_dbg(&dev->sbd.core, "%s:%u busy\n", __func__, __LINE__); 219 dev_dbg(&dev->sbd.core, "%s:%u busy\n", __func__, __LINE__);
220 return; 220 return;
221 } 221 }
222 222
223 ps3disk_do_request(dev, q); 223 ps3disk_do_request(dev, q);
224 } 224 }
225 225
226 static irqreturn_t ps3disk_interrupt(int irq, void *data) 226 static irqreturn_t ps3disk_interrupt(int irq, void *data)
227 { 227 {
228 struct ps3_storage_device *dev = data; 228 struct ps3_storage_device *dev = data;
229 struct ps3disk_private *priv; 229 struct ps3disk_private *priv;
230 struct request *req; 230 struct request *req;
231 int res, read, error; 231 int res, read, error;
232 u64 tag, status; 232 u64 tag, status;
233 const char *op; 233 const char *op;
234 234
235 res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status); 235 res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status);
236 236
237 if (tag != dev->tag) 237 if (tag != dev->tag)
238 dev_err(&dev->sbd.core, 238 dev_err(&dev->sbd.core,
239 "%s:%u: tag mismatch, got %llx, expected %llx\n", 239 "%s:%u: tag mismatch, got %llx, expected %llx\n",
240 __func__, __LINE__, tag, dev->tag); 240 __func__, __LINE__, tag, dev->tag);
241 241
242 if (res) { 242 if (res) {
243 dev_err(&dev->sbd.core, "%s:%u: res=%d status=0x%llx\n", 243 dev_err(&dev->sbd.core, "%s:%u: res=%d status=0x%llx\n",
244 __func__, __LINE__, res, status); 244 __func__, __LINE__, res, status);
245 return IRQ_HANDLED; 245 return IRQ_HANDLED;
246 } 246 }
247 247
248 priv = ps3_system_bus_get_drvdata(&dev->sbd); 248 priv = ps3_system_bus_get_drvdata(&dev->sbd);
249 req = priv->req; 249 req = priv->req;
250 if (!req) { 250 if (!req) {
251 dev_dbg(&dev->sbd.core, 251 dev_dbg(&dev->sbd.core,
252 "%s:%u non-block layer request completed\n", __func__, 252 "%s:%u non-block layer request completed\n", __func__,
253 __LINE__); 253 __LINE__);
254 dev->lv1_status = status; 254 dev->lv1_status = status;
255 complete(&dev->done); 255 complete(&dev->done);
256 return IRQ_HANDLED; 256 return IRQ_HANDLED;
257 } 257 }
258 258
259 if (req->cmd_flags & REQ_FLUSH) { 259 if (req->cmd_flags & REQ_FLUSH) {
260 read = 0; 260 read = 0;
261 op = "flush"; 261 op = "flush";
262 } else { 262 } else {
263 read = !rq_data_dir(req); 263 read = !rq_data_dir(req);
264 op = read ? "read" : "write"; 264 op = read ? "read" : "write";
265 } 265 }
266 if (status) { 266 if (status) {
267 dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__, 267 dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__,
268 __LINE__, op, status); 268 __LINE__, op, status);
269 error = -EIO; 269 error = -EIO;
270 } else { 270 } else {
271 dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__, 271 dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__,
272 __LINE__, op); 272 __LINE__, op);
273 error = 0; 273 error = 0;
274 if (read) 274 if (read)
275 ps3disk_scatter_gather(dev, req, 0); 275 ps3disk_scatter_gather(dev, req, 0);
276 } 276 }
277 277
278 spin_lock(&priv->lock); 278 spin_lock(&priv->lock);
279 __blk_end_request_all(req, error); 279 __blk_end_request_all(req, error);
280 priv->req = NULL; 280 priv->req = NULL;
281 ps3disk_do_request(dev, priv->queue); 281 ps3disk_do_request(dev, priv->queue);
282 spin_unlock(&priv->lock); 282 spin_unlock(&priv->lock);
283 283
284 return IRQ_HANDLED; 284 return IRQ_HANDLED;
285 } 285 }
286 286
287 static int ps3disk_sync_cache(struct ps3_storage_device *dev) 287 static int ps3disk_sync_cache(struct ps3_storage_device *dev)
288 { 288 {
289 u64 res; 289 u64 res;
290 290
291 dev_dbg(&dev->sbd.core, "%s:%u: sync cache\n", __func__, __LINE__); 291 dev_dbg(&dev->sbd.core, "%s:%u: sync cache\n", __func__, __LINE__);
292 292
293 res = ps3stor_send_command(dev, LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 0); 293 res = ps3stor_send_command(dev, LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 0);
294 if (res) { 294 if (res) {
295 dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", 295 dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
296 __func__, __LINE__, res); 296 __func__, __LINE__, res);
297 return -EIO; 297 return -EIO;
298 } 298 }
299 return 0; 299 return 0;
300 } 300 }
301 301
302 302
303 /* ATA helpers copied from drivers/ata/libata-core.c */ 303 /* ATA helpers copied from drivers/ata/libata-core.c */
304 304
305 static void swap_buf_le16(u16 *buf, unsigned int buf_words) 305 static void swap_buf_le16(u16 *buf, unsigned int buf_words)
306 { 306 {
307 #ifdef __BIG_ENDIAN 307 #ifdef __BIG_ENDIAN
308 unsigned int i; 308 unsigned int i;
309 309
310 for (i = 0; i < buf_words; i++) 310 for (i = 0; i < buf_words; i++)
311 buf[i] = le16_to_cpu(buf[i]); 311 buf[i] = le16_to_cpu(buf[i]);
312 #endif /* __BIG_ENDIAN */ 312 #endif /* __BIG_ENDIAN */
313 } 313 }
314 314
315 static u64 ata_id_n_sectors(const u16 *id) 315 static u64 ata_id_n_sectors(const u16 *id)
316 { 316 {
317 if (ata_id_has_lba(id)) { 317 if (ata_id_has_lba(id)) {
318 if (ata_id_has_lba48(id)) 318 if (ata_id_has_lba48(id))
319 return ata_id_u64(id, 100); 319 return ata_id_u64(id, 100);
320 else 320 else
321 return ata_id_u32(id, 60); 321 return ata_id_u32(id, 60);
322 } else { 322 } else {
323 if (ata_id_current_chs_valid(id)) 323 if (ata_id_current_chs_valid(id))
324 return ata_id_u32(id, 57); 324 return ata_id_u32(id, 57);
325 else 325 else
326 return id[1] * id[3] * id[6]; 326 return id[1] * id[3] * id[6];
327 } 327 }
328 } 328 }
329 329
330 static void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs, 330 static void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs,
331 unsigned int len) 331 unsigned int len)
332 { 332 {
333 unsigned int c; 333 unsigned int c;
334 334
335 while (len > 0) { 335 while (len > 0) {
336 c = id[ofs] >> 8; 336 c = id[ofs] >> 8;
337 *s = c; 337 *s = c;
338 s++; 338 s++;
339 339
340 c = id[ofs] & 0xff; 340 c = id[ofs] & 0xff;
341 *s = c; 341 *s = c;
342 s++; 342 s++;
343 343
344 ofs++; 344 ofs++;
345 len -= 2; 345 len -= 2;
346 } 346 }
347 } 347 }
348 348
349 static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs, 349 static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs,
350 unsigned int len) 350 unsigned int len)
351 { 351 {
352 unsigned char *p; 352 unsigned char *p;
353 353
354 WARN_ON(!(len & 1)); 354 WARN_ON(!(len & 1));
355 355
356 ata_id_string(id, s, ofs, len - 1); 356 ata_id_string(id, s, ofs, len - 1);
357 357
358 p = s + strnlen(s, len - 1); 358 p = s + strnlen(s, len - 1);
359 while (p > s && p[-1] == ' ') 359 while (p > s && p[-1] == ' ')
360 p--; 360 p--;
361 *p = '\0'; 361 *p = '\0';
362 } 362 }
363 363
364 static int ps3disk_identify(struct ps3_storage_device *dev) 364 static int ps3disk_identify(struct ps3_storage_device *dev)
365 { 365 {
366 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); 366 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
367 struct lv1_ata_cmnd_block ata_cmnd; 367 struct lv1_ata_cmnd_block ata_cmnd;
368 u16 *id = dev->bounce_buf; 368 u16 *id = dev->bounce_buf;
369 u64 res; 369 u64 res;
370 370
371 dev_dbg(&dev->sbd.core, "%s:%u: identify disk\n", __func__, __LINE__); 371 dev_dbg(&dev->sbd.core, "%s:%u: identify disk\n", __func__, __LINE__);
372 372
373 memset(&ata_cmnd, 0, sizeof(struct lv1_ata_cmnd_block)); 373 memset(&ata_cmnd, 0, sizeof(struct lv1_ata_cmnd_block));
374 ata_cmnd.command = ATA_CMD_ID_ATA; 374 ata_cmnd.command = ATA_CMD_ID_ATA;
375 ata_cmnd.sector_count = 1; 375 ata_cmnd.sector_count = 1;
376 ata_cmnd.size = ata_cmnd.arglen = ATA_ID_WORDS * 2; 376 ata_cmnd.size = ata_cmnd.arglen = ATA_ID_WORDS * 2;
377 ata_cmnd.buffer = dev->bounce_lpar; 377 ata_cmnd.buffer = dev->bounce_lpar;
378 ata_cmnd.proto = PIO_DATA_IN_PROTO; 378 ata_cmnd.proto = PIO_DATA_IN_PROTO;
379 ata_cmnd.in_out = DIR_READ; 379 ata_cmnd.in_out = DIR_READ;
380 380
381 res = ps3stor_send_command(dev, LV1_STORAGE_SEND_ATA_COMMAND, 381 res = ps3stor_send_command(dev, LV1_STORAGE_SEND_ATA_COMMAND,
382 ps3_mm_phys_to_lpar(__pa(&ata_cmnd)), 382 ps3_mm_phys_to_lpar(__pa(&ata_cmnd)),
383 sizeof(ata_cmnd), ata_cmnd.buffer, 383 sizeof(ata_cmnd), ata_cmnd.buffer,
384 ata_cmnd.arglen); 384 ata_cmnd.arglen);
385 if (res) { 385 if (res) {
386 dev_err(&dev->sbd.core, "%s:%u: identify disk failed 0x%llx\n", 386 dev_err(&dev->sbd.core, "%s:%u: identify disk failed 0x%llx\n",
387 __func__, __LINE__, res); 387 __func__, __LINE__, res);
388 return -EIO; 388 return -EIO;
389 } 389 }
390 390
391 swap_buf_le16(id, ATA_ID_WORDS); 391 swap_buf_le16(id, ATA_ID_WORDS);
392 392
393 /* All we're interested in are raw capacity and model name */ 393 /* All we're interested in are raw capacity and model name */
394 priv->raw_capacity = ata_id_n_sectors(id); 394 priv->raw_capacity = ata_id_n_sectors(id);
395 ata_id_c_string(id, priv->model, ATA_ID_PROD, sizeof(priv->model)); 395 ata_id_c_string(id, priv->model, ATA_ID_PROD, sizeof(priv->model));
396 return 0; 396 return 0;
397 } 397 }
398 398
399 static unsigned long ps3disk_mask; 399 static unsigned long ps3disk_mask;
400 400
401 static DEFINE_MUTEX(ps3disk_mask_mutex); 401 static DEFINE_MUTEX(ps3disk_mask_mutex);
402 402
403 static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) 403 static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
404 { 404 {
405 struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core); 405 struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
406 struct ps3disk_private *priv; 406 struct ps3disk_private *priv;
407 int error; 407 int error;
408 unsigned int devidx; 408 unsigned int devidx;
409 struct request_queue *queue; 409 struct request_queue *queue;
410 struct gendisk *gendisk; 410 struct gendisk *gendisk;
411 411
412 if (dev->blk_size < 512) { 412 if (dev->blk_size < 512) {
413 dev_err(&dev->sbd.core, 413 dev_err(&dev->sbd.core,
414 "%s:%u: cannot handle block size %llu\n", __func__, 414 "%s:%u: cannot handle block size %llu\n", __func__,
415 __LINE__, dev->blk_size); 415 __LINE__, dev->blk_size);
416 return -EINVAL; 416 return -EINVAL;
417 } 417 }
418 418
419 BUILD_BUG_ON(PS3DISK_MAX_DISKS > BITS_PER_LONG); 419 BUILD_BUG_ON(PS3DISK_MAX_DISKS > BITS_PER_LONG);
420 mutex_lock(&ps3disk_mask_mutex); 420 mutex_lock(&ps3disk_mask_mutex);
421 devidx = find_first_zero_bit(&ps3disk_mask, PS3DISK_MAX_DISKS); 421 devidx = find_first_zero_bit(&ps3disk_mask, PS3DISK_MAX_DISKS);
422 if (devidx >= PS3DISK_MAX_DISKS) { 422 if (devidx >= PS3DISK_MAX_DISKS) {
423 dev_err(&dev->sbd.core, "%s:%u: Too many disks\n", __func__, 423 dev_err(&dev->sbd.core, "%s:%u: Too many disks\n", __func__,
424 __LINE__); 424 __LINE__);
425 mutex_unlock(&ps3disk_mask_mutex); 425 mutex_unlock(&ps3disk_mask_mutex);
426 return -ENOSPC; 426 return -ENOSPC;
427 } 427 }
428 __set_bit(devidx, &ps3disk_mask); 428 __set_bit(devidx, &ps3disk_mask);
429 mutex_unlock(&ps3disk_mask_mutex); 429 mutex_unlock(&ps3disk_mask_mutex);
430 430
431 priv = kzalloc(sizeof(*priv), GFP_KERNEL); 431 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
432 if (!priv) { 432 if (!priv) {
433 error = -ENOMEM; 433 error = -ENOMEM;
434 goto fail; 434 goto fail;
435 } 435 }
436 436
437 ps3_system_bus_set_drvdata(_dev, priv); 437 ps3_system_bus_set_drvdata(_dev, priv);
438 spin_lock_init(&priv->lock); 438 spin_lock_init(&priv->lock);
439 439
440 dev->bounce_size = BOUNCE_SIZE; 440 dev->bounce_size = BOUNCE_SIZE;
441 dev->bounce_buf = kmalloc(BOUNCE_SIZE, GFP_DMA); 441 dev->bounce_buf = kmalloc(BOUNCE_SIZE, GFP_DMA);
442 if (!dev->bounce_buf) { 442 if (!dev->bounce_buf) {
443 error = -ENOMEM; 443 error = -ENOMEM;
444 goto fail_free_priv; 444 goto fail_free_priv;
445 } 445 }
446 446
447 error = ps3stor_setup(dev, ps3disk_interrupt); 447 error = ps3stor_setup(dev, ps3disk_interrupt);
448 if (error) 448 if (error)
449 goto fail_free_bounce; 449 goto fail_free_bounce;
450 450
451 ps3disk_identify(dev); 451 ps3disk_identify(dev);
452 452
453 queue = blk_init_queue(ps3disk_request, &priv->lock); 453 queue = blk_init_queue(ps3disk_request, &priv->lock);
454 if (!queue) { 454 if (!queue) {
455 dev_err(&dev->sbd.core, "%s:%u: blk_init_queue failed\n", 455 dev_err(&dev->sbd.core, "%s:%u: blk_init_queue failed\n",
456 __func__, __LINE__); 456 __func__, __LINE__);
457 error = -ENOMEM; 457 error = -ENOMEM;
458 goto fail_teardown; 458 goto fail_teardown;
459 } 459 }
460 460
461 priv->queue = queue; 461 priv->queue = queue;
462 queue->queuedata = dev; 462 queue->queuedata = dev;
463 463
464 blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH); 464 blk_queue_bounce_limit(queue, BLK_BOUNCE_HIGH);
465 465
466 blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9); 466 blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
467 blk_queue_segment_boundary(queue, -1UL); 467 blk_queue_segment_boundary(queue, -1UL);
468 blk_queue_dma_alignment(queue, dev->blk_size-1); 468 blk_queue_dma_alignment(queue, dev->blk_size-1);
469 blk_queue_logical_block_size(queue, dev->blk_size); 469 blk_queue_logical_block_size(queue, dev->blk_size);
470 470
471 blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH); 471 blk_queue_flush(queue, REQ_FLUSH);
472 472
473 blk_queue_max_segments(queue, -1); 473 blk_queue_max_segments(queue, -1);
474 blk_queue_max_segment_size(queue, dev->bounce_size); 474 blk_queue_max_segment_size(queue, dev->bounce_size);
475 475
476 gendisk = alloc_disk(PS3DISK_MINORS); 476 gendisk = alloc_disk(PS3DISK_MINORS);
477 if (!gendisk) { 477 if (!gendisk) {
478 dev_err(&dev->sbd.core, "%s:%u: alloc_disk failed\n", __func__, 478 dev_err(&dev->sbd.core, "%s:%u: alloc_disk failed\n", __func__,
479 __LINE__); 479 __LINE__);
480 error = -ENOMEM; 480 error = -ENOMEM;
481 goto fail_cleanup_queue; 481 goto fail_cleanup_queue;
482 } 482 }
483 483
484 priv->gendisk = gendisk; 484 priv->gendisk = gendisk;
485 gendisk->major = ps3disk_major; 485 gendisk->major = ps3disk_major;
486 gendisk->first_minor = devidx * PS3DISK_MINORS; 486 gendisk->first_minor = devidx * PS3DISK_MINORS;
487 gendisk->fops = &ps3disk_fops; 487 gendisk->fops = &ps3disk_fops;
488 gendisk->queue = queue; 488 gendisk->queue = queue;
489 gendisk->private_data = dev; 489 gendisk->private_data = dev;
490 gendisk->driverfs_dev = &dev->sbd.core; 490 gendisk->driverfs_dev = &dev->sbd.core;
491 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, 491 snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
492 devidx+'a'); 492 devidx+'a');
493 priv->blocking_factor = dev->blk_size >> 9; 493 priv->blocking_factor = dev->blk_size >> 9;
494 set_capacity(gendisk, 494 set_capacity(gendisk,
495 dev->regions[dev->region_idx].size*priv->blocking_factor); 495 dev->regions[dev->region_idx].size*priv->blocking_factor);
496 496
497 dev_info(&dev->sbd.core, 497 dev_info(&dev->sbd.core,
498 "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n", 498 "%s is a %s (%llu MiB total, %lu MiB for OtherOS)\n",
499 gendisk->disk_name, priv->model, priv->raw_capacity >> 11, 499 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
500 get_capacity(gendisk) >> 11); 500 get_capacity(gendisk) >> 11);
501 501
502 add_disk(gendisk); 502 add_disk(gendisk);
503 return 0; 503 return 0;
504 504
505 fail_cleanup_queue: 505 fail_cleanup_queue:
506 blk_cleanup_queue(queue); 506 blk_cleanup_queue(queue);
507 fail_teardown: 507 fail_teardown:
508 ps3stor_teardown(dev); 508 ps3stor_teardown(dev);
509 fail_free_bounce: 509 fail_free_bounce:
510 kfree(dev->bounce_buf); 510 kfree(dev->bounce_buf);
511 fail_free_priv: 511 fail_free_priv:
512 kfree(priv); 512 kfree(priv);
513 ps3_system_bus_set_drvdata(_dev, NULL); 513 ps3_system_bus_set_drvdata(_dev, NULL);
514 fail: 514 fail:
515 mutex_lock(&ps3disk_mask_mutex); 515 mutex_lock(&ps3disk_mask_mutex);
516 __clear_bit(devidx, &ps3disk_mask); 516 __clear_bit(devidx, &ps3disk_mask);
517 mutex_unlock(&ps3disk_mask_mutex); 517 mutex_unlock(&ps3disk_mask_mutex);
518 return error; 518 return error;
519 } 519 }
520 520
521 static int ps3disk_remove(struct ps3_system_bus_device *_dev) 521 static int ps3disk_remove(struct ps3_system_bus_device *_dev)
522 { 522 {
523 struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core); 523 struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
524 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd); 524 struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
525 525
526 mutex_lock(&ps3disk_mask_mutex); 526 mutex_lock(&ps3disk_mask_mutex);
527 __clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS, 527 __clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS,
528 &ps3disk_mask); 528 &ps3disk_mask);
529 mutex_unlock(&ps3disk_mask_mutex); 529 mutex_unlock(&ps3disk_mask_mutex);
530 del_gendisk(priv->gendisk); 530 del_gendisk(priv->gendisk);
531 blk_cleanup_queue(priv->queue); 531 blk_cleanup_queue(priv->queue);
532 put_disk(priv->gendisk); 532 put_disk(priv->gendisk);
533 dev_notice(&dev->sbd.core, "Synchronizing disk cache\n"); 533 dev_notice(&dev->sbd.core, "Synchronizing disk cache\n");
534 ps3disk_sync_cache(dev); 534 ps3disk_sync_cache(dev);
535 ps3stor_teardown(dev); 535 ps3stor_teardown(dev);
536 kfree(dev->bounce_buf); 536 kfree(dev->bounce_buf);
537 kfree(priv); 537 kfree(priv);
538 ps3_system_bus_set_drvdata(_dev, NULL); 538 ps3_system_bus_set_drvdata(_dev, NULL);
539 return 0; 539 return 0;
540 } 540 }
541 541
542 static struct ps3_system_bus_driver ps3disk = { 542 static struct ps3_system_bus_driver ps3disk = {
543 .match_id = PS3_MATCH_ID_STOR_DISK, 543 .match_id = PS3_MATCH_ID_STOR_DISK,
544 .core.name = DEVICE_NAME, 544 .core.name = DEVICE_NAME,
545 .core.owner = THIS_MODULE, 545 .core.owner = THIS_MODULE,
546 .probe = ps3disk_probe, 546 .probe = ps3disk_probe,
547 .remove = ps3disk_remove, 547 .remove = ps3disk_remove,
548 .shutdown = ps3disk_remove, 548 .shutdown = ps3disk_remove,
549 }; 549 };
550 550
551 551
552 static int __init ps3disk_init(void) 552 static int __init ps3disk_init(void)
553 { 553 {
554 int error; 554 int error;
555 555
556 if (!firmware_has_feature(FW_FEATURE_PS3_LV1)) 556 if (!firmware_has_feature(FW_FEATURE_PS3_LV1))
557 return -ENODEV; 557 return -ENODEV;
558 558
559 error = register_blkdev(0, DEVICE_NAME); 559 error = register_blkdev(0, DEVICE_NAME);
560 if (error <= 0) { 560 if (error <= 0) {
561 printk(KERN_ERR "%s:%u: register_blkdev failed %d\n", __func__, 561 printk(KERN_ERR "%s:%u: register_blkdev failed %d\n", __func__,
562 __LINE__, error); 562 __LINE__, error);
563 return error; 563 return error;
564 } 564 }
565 ps3disk_major = error; 565 ps3disk_major = error;
566 566
567 pr_info("%s:%u: registered block device major %d\n", __func__, 567 pr_info("%s:%u: registered block device major %d\n", __func__,
568 __LINE__, ps3disk_major); 568 __LINE__, ps3disk_major);
569 569
570 error = ps3_system_bus_driver_register(&ps3disk); 570 error = ps3_system_bus_driver_register(&ps3disk);
571 if (error) 571 if (error)
572 unregister_blkdev(ps3disk_major, DEVICE_NAME); 572 unregister_blkdev(ps3disk_major, DEVICE_NAME);
573 573
574 return error; 574 return error;
575 } 575 }
576 576
577 static void __exit ps3disk_exit(void) 577 static void __exit ps3disk_exit(void)
578 { 578 {
579 ps3_system_bus_driver_unregister(&ps3disk); 579 ps3_system_bus_driver_unregister(&ps3disk);
580 unregister_blkdev(ps3disk_major, DEVICE_NAME); 580 unregister_blkdev(ps3disk_major, DEVICE_NAME);
581 } 581 }
582 582
583 module_init(ps3disk_init); 583 module_init(ps3disk_init);
584 module_exit(ps3disk_exit); 584 module_exit(ps3disk_exit);
585 585
586 MODULE_LICENSE("GPL"); 586 MODULE_LICENSE("GPL");
587 MODULE_DESCRIPTION("PS3 Disk Storage Driver"); 587 MODULE_DESCRIPTION("PS3 Disk Storage Driver");
588 MODULE_AUTHOR("Sony Corporation"); 588 MODULE_AUTHOR("Sony Corporation");
589 MODULE_ALIAS(PS3_MODULE_ALIAS_STOR_DISK); 589 MODULE_ALIAS(PS3_MODULE_ALIAS_STOR_DISK);
590 590
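The ps3disk changes above are representative of the pattern applied throughout this series: a driver that used to declare QUEUE_ORDERED_DRAIN_FLUSH now simply advertises a flushable write cache with blk_queue_flush(), and its request function dispatches on REQ_FLUSH in cmd_flags. The following is a minimal sketch of that pattern only; the helper names mydrv_issue_flush() and mydrv_issue_rw() are illustrative placeholders, not functions from this patch.

static void mydrv_request(struct request_queue *q)
{
	struct request *req;

	while ((req = blk_fetch_request(q))) {
		if (req->cmd_flags & REQ_FLUSH)
			mydrv_issue_flush(req);		/* flush the device write cache */
		else if (req->cmd_type == REQ_TYPE_FS)
			mydrv_issue_rw(req);		/* ordinary read/write */
		else
			__blk_end_request_all(req, -EIO);
	}
}

static void mydrv_setup_queue(struct request_queue *q)
{
	/* The device has a volatile write cache it can flush; no FUA support. */
	blk_queue_flush(q, REQ_FLUSH);
}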
drivers/block/virtio_blk.c
1 //#define DEBUG 1 //#define DEBUG
2 #include <linux/spinlock.h> 2 #include <linux/spinlock.h>
3 #include <linux/slab.h> 3 #include <linux/slab.h>
4 #include <linux/blkdev.h> 4 #include <linux/blkdev.h>
5 #include <linux/smp_lock.h> 5 #include <linux/smp_lock.h>
6 #include <linux/hdreg.h> 6 #include <linux/hdreg.h>
7 #include <linux/virtio.h> 7 #include <linux/virtio.h>
8 #include <linux/virtio_blk.h> 8 #include <linux/virtio_blk.h>
9 #include <linux/scatterlist.h> 9 #include <linux/scatterlist.h>
10 10
11 #define PART_BITS 4 11 #define PART_BITS 4
12 12
13 static int major, index; 13 static int major, index;
14 14
15 struct virtio_blk 15 struct virtio_blk
16 { 16 {
17 spinlock_t lock; 17 spinlock_t lock;
18 18
19 struct virtio_device *vdev; 19 struct virtio_device *vdev;
20 struct virtqueue *vq; 20 struct virtqueue *vq;
21 21
22 /* The disk structure for the kernel. */ 22 /* The disk structure for the kernel. */
23 struct gendisk *disk; 23 struct gendisk *disk;
24 24
25 /* Request tracking. */ 25 /* Request tracking. */
26 struct list_head reqs; 26 struct list_head reqs;
27 27
28 mempool_t *pool; 28 mempool_t *pool;
29 29
30 /* What host tells us, plus 2 for header & trailer. */ 30 /* What host tells us, plus 2 for header & trailer. */
31 unsigned int sg_elems; 31 unsigned int sg_elems;
32 32
33 /* Scatterlist: can be too big for stack. */ 33 /* Scatterlist: can be too big for stack. */
34 struct scatterlist sg[/*sg_elems*/]; 34 struct scatterlist sg[/*sg_elems*/];
35 }; 35 };
36 36
37 struct virtblk_req 37 struct virtblk_req
38 { 38 {
39 struct list_head list; 39 struct list_head list;
40 struct request *req; 40 struct request *req;
41 struct virtio_blk_outhdr out_hdr; 41 struct virtio_blk_outhdr out_hdr;
42 struct virtio_scsi_inhdr in_hdr; 42 struct virtio_scsi_inhdr in_hdr;
43 u8 status; 43 u8 status;
44 }; 44 };
45 45
46 static void blk_done(struct virtqueue *vq) 46 static void blk_done(struct virtqueue *vq)
47 { 47 {
48 struct virtio_blk *vblk = vq->vdev->priv; 48 struct virtio_blk *vblk = vq->vdev->priv;
49 struct virtblk_req *vbr; 49 struct virtblk_req *vbr;
50 unsigned int len; 50 unsigned int len;
51 unsigned long flags; 51 unsigned long flags;
52 52
53 spin_lock_irqsave(&vblk->lock, flags); 53 spin_lock_irqsave(&vblk->lock, flags);
54 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { 54 while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) {
55 int error; 55 int error;
56 56
57 switch (vbr->status) { 57 switch (vbr->status) {
58 case VIRTIO_BLK_S_OK: 58 case VIRTIO_BLK_S_OK:
59 error = 0; 59 error = 0;
60 break; 60 break;
61 case VIRTIO_BLK_S_UNSUPP: 61 case VIRTIO_BLK_S_UNSUPP:
62 error = -ENOTTY; 62 error = -ENOTTY;
63 break; 63 break;
64 default: 64 default:
65 error = -EIO; 65 error = -EIO;
66 break; 66 break;
67 } 67 }
68 68
69 switch (vbr->req->cmd_type) { 69 switch (vbr->req->cmd_type) {
70 case REQ_TYPE_BLOCK_PC: 70 case REQ_TYPE_BLOCK_PC:
71 vbr->req->resid_len = vbr->in_hdr.residual; 71 vbr->req->resid_len = vbr->in_hdr.residual;
72 vbr->req->sense_len = vbr->in_hdr.sense_len; 72 vbr->req->sense_len = vbr->in_hdr.sense_len;
73 vbr->req->errors = vbr->in_hdr.errors; 73 vbr->req->errors = vbr->in_hdr.errors;
74 break; 74 break;
75 case REQ_TYPE_SPECIAL: 75 case REQ_TYPE_SPECIAL:
76 vbr->req->errors = (error != 0); 76 vbr->req->errors = (error != 0);
77 break; 77 break;
78 default: 78 default:
79 break; 79 break;
80 } 80 }
81 81
82 __blk_end_request_all(vbr->req, error); 82 __blk_end_request_all(vbr->req, error);
83 list_del(&vbr->list); 83 list_del(&vbr->list);
84 mempool_free(vbr, vblk->pool); 84 mempool_free(vbr, vblk->pool);
85 } 85 }
86 /* In case queue is stopped waiting for more buffers. */ 86 /* In case queue is stopped waiting for more buffers. */
87 blk_start_queue(vblk->disk->queue); 87 blk_start_queue(vblk->disk->queue);
88 spin_unlock_irqrestore(&vblk->lock, flags); 88 spin_unlock_irqrestore(&vblk->lock, flags);
89 } 89 }
90 90
91 static bool do_req(struct request_queue *q, struct virtio_blk *vblk, 91 static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
92 struct request *req) 92 struct request *req)
93 { 93 {
94 unsigned long num, out = 0, in = 0; 94 unsigned long num, out = 0, in = 0;
95 struct virtblk_req *vbr; 95 struct virtblk_req *vbr;
96 96
97 vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); 97 vbr = mempool_alloc(vblk->pool, GFP_ATOMIC);
98 if (!vbr) 98 if (!vbr)
99 /* When another request finishes we'll try again. */ 99 /* When another request finishes we'll try again. */
100 return false; 100 return false;
101 101
102 vbr->req = req; 102 vbr->req = req;
103 103
104 if (req->cmd_flags & REQ_FLUSH) { 104 if (req->cmd_flags & REQ_FLUSH) {
105 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH; 105 vbr->out_hdr.type = VIRTIO_BLK_T_FLUSH;
106 vbr->out_hdr.sector = 0; 106 vbr->out_hdr.sector = 0;
107 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); 107 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
108 } else { 108 } else {
109 switch (req->cmd_type) { 109 switch (req->cmd_type) {
110 case REQ_TYPE_FS: 110 case REQ_TYPE_FS:
111 vbr->out_hdr.type = 0; 111 vbr->out_hdr.type = 0;
112 vbr->out_hdr.sector = blk_rq_pos(vbr->req); 112 vbr->out_hdr.sector = blk_rq_pos(vbr->req);
113 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); 113 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
114 break; 114 break;
115 case REQ_TYPE_BLOCK_PC: 115 case REQ_TYPE_BLOCK_PC:
116 vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; 116 vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD;
117 vbr->out_hdr.sector = 0; 117 vbr->out_hdr.sector = 0;
118 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); 118 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
119 break; 119 break;
120 case REQ_TYPE_SPECIAL: 120 case REQ_TYPE_SPECIAL:
121 vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID; 121 vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID;
122 vbr->out_hdr.sector = 0; 122 vbr->out_hdr.sector = 0;
123 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); 123 vbr->out_hdr.ioprio = req_get_ioprio(vbr->req);
124 break; 124 break;
125 default: 125 default:
126 /* We don't put anything else in the queue. */ 126 /* We don't put anything else in the queue. */
127 BUG(); 127 BUG();
128 } 128 }
129 } 129 }
130 130
131 if (vbr->req->cmd_flags & REQ_HARDBARRIER) 131 if (vbr->req->cmd_flags & REQ_HARDBARRIER)
132 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; 132 vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
133 133
134 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); 134 sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
135 135
136 /* 136 /*
137 * If this is a packet command we need a couple of additional headers. 137 * If this is a packet command we need a couple of additional headers.
138 * Behind the normal outhdr we put a segment with the scsi command 138 * Behind the normal outhdr we put a segment with the scsi command
139 * block, and before the normal inhdr we put the sense data and the 139 * block, and before the normal inhdr we put the sense data and the
140 * inhdr with additional status information before the normal inhdr. 140 * inhdr with additional status information before the normal inhdr.
141 */ 141 */
142 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) 142 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC)
143 sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); 143 sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len);
144 144
145 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); 145 num = blk_rq_map_sg(q, vbr->req, vblk->sg + out);
146 146
147 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { 147 if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) {
148 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); 148 sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96);
149 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, 149 sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr,
150 sizeof(vbr->in_hdr)); 150 sizeof(vbr->in_hdr));
151 } 151 }
152 152
153 sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, 153 sg_set_buf(&vblk->sg[num + out + in++], &vbr->status,
154 sizeof(vbr->status)); 154 sizeof(vbr->status));
155 155
156 if (num) { 156 if (num) {
157 if (rq_data_dir(vbr->req) == WRITE) { 157 if (rq_data_dir(vbr->req) == WRITE) {
158 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; 158 vbr->out_hdr.type |= VIRTIO_BLK_T_OUT;
159 out += num; 159 out += num;
160 } else { 160 } else {
161 vbr->out_hdr.type |= VIRTIO_BLK_T_IN; 161 vbr->out_hdr.type |= VIRTIO_BLK_T_IN;
162 in += num; 162 in += num;
163 } 163 }
164 } 164 }
165 165
166 if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) { 166 if (virtqueue_add_buf(vblk->vq, vblk->sg, out, in, vbr) < 0) {
167 mempool_free(vbr, vblk->pool); 167 mempool_free(vbr, vblk->pool);
168 return false; 168 return false;
169 } 169 }
170 170
171 list_add_tail(&vbr->list, &vblk->reqs); 171 list_add_tail(&vbr->list, &vblk->reqs);
172 return true; 172 return true;
173 } 173 }
174 174
175 static void do_virtblk_request(struct request_queue *q) 175 static void do_virtblk_request(struct request_queue *q)
176 { 176 {
177 struct virtio_blk *vblk = q->queuedata; 177 struct virtio_blk *vblk = q->queuedata;
178 struct request *req; 178 struct request *req;
179 unsigned int issued = 0; 179 unsigned int issued = 0;
180 180
181 while ((req = blk_peek_request(q)) != NULL) { 181 while ((req = blk_peek_request(q)) != NULL) {
182 BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); 182 BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems);
183 183
184 /* If this request fails, stop queue and wait for something to 184 /* If this request fails, stop queue and wait for something to
185 finish to restart it. */ 185 finish to restart it. */
186 if (!do_req(q, vblk, req)) { 186 if (!do_req(q, vblk, req)) {
187 blk_stop_queue(q); 187 blk_stop_queue(q);
188 break; 188 break;
189 } 189 }
190 blk_start_request(req); 190 blk_start_request(req);
191 issued++; 191 issued++;
192 } 192 }
193 193
194 if (issued) 194 if (issued)
195 virtqueue_kick(vblk->vq); 195 virtqueue_kick(vblk->vq);
196 } 196 }
197 197
198 /* return id (s/n) string for *disk to *id_str 198 /* return id (s/n) string for *disk to *id_str
199 */ 199 */
200 static int virtblk_get_id(struct gendisk *disk, char *id_str) 200 static int virtblk_get_id(struct gendisk *disk, char *id_str)
201 { 201 {
202 struct virtio_blk *vblk = disk->private_data; 202 struct virtio_blk *vblk = disk->private_data;
203 struct request *req; 203 struct request *req;
204 struct bio *bio; 204 struct bio *bio;
205 205
206 bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, 206 bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES,
207 GFP_KERNEL); 207 GFP_KERNEL);
208 if (IS_ERR(bio)) 208 if (IS_ERR(bio))
209 return PTR_ERR(bio); 209 return PTR_ERR(bio);
210 210
211 req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL); 211 req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL);
212 if (IS_ERR(req)) { 212 if (IS_ERR(req)) {
213 bio_put(bio); 213 bio_put(bio);
214 return PTR_ERR(req); 214 return PTR_ERR(req);
215 } 215 }
216 216
217 req->cmd_type = REQ_TYPE_SPECIAL; 217 req->cmd_type = REQ_TYPE_SPECIAL;
218 return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); 218 return blk_execute_rq(vblk->disk->queue, vblk->disk, req, false);
219 } 219 }
220 220
221 static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode, 221 static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
222 unsigned cmd, unsigned long data) 222 unsigned cmd, unsigned long data)
223 { 223 {
224 struct gendisk *disk = bdev->bd_disk; 224 struct gendisk *disk = bdev->bd_disk;
225 struct virtio_blk *vblk = disk->private_data; 225 struct virtio_blk *vblk = disk->private_data;
226 226
227 /* 227 /*
228 * Only allow the generic SCSI ioctls if the host can support it. 228 * Only allow the generic SCSI ioctls if the host can support it.
229 */ 229 */
230 if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) 230 if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI))
231 return -ENOTTY; 231 return -ENOTTY;
232 232
233 return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, 233 return scsi_cmd_ioctl(disk->queue, disk, mode, cmd,
234 (void __user *)data); 234 (void __user *)data);
235 } 235 }
236 236
237 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, 237 static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
238 unsigned int cmd, unsigned long param) 238 unsigned int cmd, unsigned long param)
239 { 239 {
240 int ret; 240 int ret;
241 241
242 lock_kernel(); 242 lock_kernel();
243 ret = virtblk_locked_ioctl(bdev, mode, cmd, param); 243 ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
244 unlock_kernel(); 244 unlock_kernel();
245 245
246 return ret; 246 return ret;
247 } 247 }
248 248
249 /* We provide getgeo only to please some old bootloader/partitioning tools */ 249 /* We provide getgeo only to please some old bootloader/partitioning tools */
250 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo) 250 static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
251 { 251 {
252 struct virtio_blk *vblk = bd->bd_disk->private_data; 252 struct virtio_blk *vblk = bd->bd_disk->private_data;
253 struct virtio_blk_geometry vgeo; 253 struct virtio_blk_geometry vgeo;
254 int err; 254 int err;
255 255
256 /* see if the host passed in geometry config */ 256 /* see if the host passed in geometry config */
257 err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY, 257 err = virtio_config_val(vblk->vdev, VIRTIO_BLK_F_GEOMETRY,
258 offsetof(struct virtio_blk_config, geometry), 258 offsetof(struct virtio_blk_config, geometry),
259 &vgeo); 259 &vgeo);
260 260
261 if (!err) { 261 if (!err) {
262 geo->heads = vgeo.heads; 262 geo->heads = vgeo.heads;
263 geo->sectors = vgeo.sectors; 263 geo->sectors = vgeo.sectors;
264 geo->cylinders = vgeo.cylinders; 264 geo->cylinders = vgeo.cylinders;
265 } else { 265 } else {
266 /* some standard values, similar to sd */ 266 /* some standard values, similar to sd */
267 geo->heads = 1 << 6; 267 geo->heads = 1 << 6;
268 geo->sectors = 1 << 5; 268 geo->sectors = 1 << 5;
269 geo->cylinders = get_capacity(bd->bd_disk) >> 11; 269 geo->cylinders = get_capacity(bd->bd_disk) >> 11;
270 } 270 }
271 return 0; 271 return 0;
272 } 272 }
273 273
274 static const struct block_device_operations virtblk_fops = { 274 static const struct block_device_operations virtblk_fops = {
275 .ioctl = virtblk_ioctl, 275 .ioctl = virtblk_ioctl,
276 .owner = THIS_MODULE, 276 .owner = THIS_MODULE,
277 .getgeo = virtblk_getgeo, 277 .getgeo = virtblk_getgeo,
278 }; 278 };
279 279
280 static int index_to_minor(int index) 280 static int index_to_minor(int index)
281 { 281 {
282 return index << PART_BITS; 282 return index << PART_BITS;
283 } 283 }
284 284
285 static ssize_t virtblk_serial_show(struct device *dev, 285 static ssize_t virtblk_serial_show(struct device *dev,
286 struct device_attribute *attr, char *buf) 286 struct device_attribute *attr, char *buf)
287 { 287 {
288 struct gendisk *disk = dev_to_disk(dev); 288 struct gendisk *disk = dev_to_disk(dev);
289 int err; 289 int err;
290 290
291 /* sysfs gives us a PAGE_SIZE buffer */ 291 /* sysfs gives us a PAGE_SIZE buffer */
292 BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES); 292 BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
293 293
294 buf[VIRTIO_BLK_ID_BYTES] = '\0'; 294 buf[VIRTIO_BLK_ID_BYTES] = '\0';
295 err = virtblk_get_id(disk, buf); 295 err = virtblk_get_id(disk, buf);
296 if (!err) 296 if (!err)
297 return strlen(buf); 297 return strlen(buf);
298 298
299 if (err == -EIO) /* Unsupported? Make it empty. */ 299 if (err == -EIO) /* Unsupported? Make it empty. */
300 return 0; 300 return 0;
301 301
302 return err; 302 return err;
303 } 303 }
304 DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); 304 DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL);
305 305
306 static int __devinit virtblk_probe(struct virtio_device *vdev) 306 static int __devinit virtblk_probe(struct virtio_device *vdev)
307 { 307 {
308 struct virtio_blk *vblk; 308 struct virtio_blk *vblk;
309 struct request_queue *q; 309 struct request_queue *q;
310 int err; 310 int err;
311 u64 cap; 311 u64 cap;
312 u32 v, blk_size, sg_elems, opt_io_size; 312 u32 v, blk_size, sg_elems, opt_io_size;
313 u16 min_io_size; 313 u16 min_io_size;
314 u8 physical_block_exp, alignment_offset; 314 u8 physical_block_exp, alignment_offset;
315 315
316 if (index_to_minor(index) >= 1 << MINORBITS) 316 if (index_to_minor(index) >= 1 << MINORBITS)
317 return -ENOSPC; 317 return -ENOSPC;
318 318
319 /* We need to know how many segments before we allocate. */ 319 /* We need to know how many segments before we allocate. */
320 err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX, 320 err = virtio_config_val(vdev, VIRTIO_BLK_F_SEG_MAX,
321 offsetof(struct virtio_blk_config, seg_max), 321 offsetof(struct virtio_blk_config, seg_max),
322 &sg_elems); 322 &sg_elems);
323 323
324 /* We need at least one SG element, whatever they say. */ 324 /* We need at least one SG element, whatever they say. */
325 if (err || !sg_elems) 325 if (err || !sg_elems)
326 sg_elems = 1; 326 sg_elems = 1;
327 327
328 /* We need extra sg elements at head and tail. */ 328 /* We need extra sg elements at head and tail. */
329 sg_elems += 2; 329 sg_elems += 2;
330 vdev->priv = vblk = kmalloc(sizeof(*vblk) + 330 vdev->priv = vblk = kmalloc(sizeof(*vblk) +
331 sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL); 331 sizeof(vblk->sg[0]) * sg_elems, GFP_KERNEL);
332 if (!vblk) { 332 if (!vblk) {
333 err = -ENOMEM; 333 err = -ENOMEM;
334 goto out; 334 goto out;
335 } 335 }
336 336
337 INIT_LIST_HEAD(&vblk->reqs); 337 INIT_LIST_HEAD(&vblk->reqs);
338 spin_lock_init(&vblk->lock); 338 spin_lock_init(&vblk->lock);
339 vblk->vdev = vdev; 339 vblk->vdev = vdev;
340 vblk->sg_elems = sg_elems; 340 vblk->sg_elems = sg_elems;
341 sg_init_table(vblk->sg, vblk->sg_elems); 341 sg_init_table(vblk->sg, vblk->sg_elems);
342 342
343 /* We expect one virtqueue, for output. */ 343 /* We expect one virtqueue, for output. */
344 vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); 344 vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests");
345 if (IS_ERR(vblk->vq)) { 345 if (IS_ERR(vblk->vq)) {
346 err = PTR_ERR(vblk->vq); 346 err = PTR_ERR(vblk->vq);
347 goto out_free_vblk; 347 goto out_free_vblk;
348 } 348 }
349 349
350 vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req)); 350 vblk->pool = mempool_create_kmalloc_pool(1,sizeof(struct virtblk_req));
351 if (!vblk->pool) { 351 if (!vblk->pool) {
352 err = -ENOMEM; 352 err = -ENOMEM;
353 goto out_free_vq; 353 goto out_free_vq;
354 } 354 }
355 355
356 /* FIXME: How many partitions? How long is a piece of string? */ 356 /* FIXME: How many partitions? How long is a piece of string? */
357 vblk->disk = alloc_disk(1 << PART_BITS); 357 vblk->disk = alloc_disk(1 << PART_BITS);
358 if (!vblk->disk) { 358 if (!vblk->disk) {
359 err = -ENOMEM; 359 err = -ENOMEM;
360 goto out_mempool; 360 goto out_mempool;
361 } 361 }
362 362
363 q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); 363 q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock);
364 if (!q) { 364 if (!q) {
365 err = -ENOMEM; 365 err = -ENOMEM;
366 goto out_put_disk; 366 goto out_put_disk;
367 } 367 }
368 368
369 q->queuedata = vblk; 369 q->queuedata = vblk;
370 370
371 if (index < 26) { 371 if (index < 26) {
372 sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); 372 sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
373 } else if (index < (26 + 1) * 26) { 373 } else if (index < (26 + 1) * 26) {
374 sprintf(vblk->disk->disk_name, "vd%c%c", 374 sprintf(vblk->disk->disk_name, "vd%c%c",
375 'a' + index / 26 - 1, 'a' + index % 26); 375 'a' + index / 26 - 1, 'a' + index % 26);
376 } else { 376 } else {
377 const unsigned int m1 = (index / 26 - 1) / 26 - 1; 377 const unsigned int m1 = (index / 26 - 1) / 26 - 1;
378 const unsigned int m2 = (index / 26 - 1) % 26; 378 const unsigned int m2 = (index / 26 - 1) % 26;
379 const unsigned int m3 = index % 26; 379 const unsigned int m3 = index % 26;
380 sprintf(vblk->disk->disk_name, "vd%c%c%c", 380 sprintf(vblk->disk->disk_name, "vd%c%c%c",
381 'a' + m1, 'a' + m2, 'a' + m3); 381 'a' + m1, 'a' + m2, 'a' + m3);
382 } 382 }
383 383
384 vblk->disk->major = major; 384 vblk->disk->major = major;
385 vblk->disk->first_minor = index_to_minor(index); 385 vblk->disk->first_minor = index_to_minor(index);
386 vblk->disk->private_data = vblk; 386 vblk->disk->private_data = vblk;
387 vblk->disk->fops = &virtblk_fops; 387 vblk->disk->fops = &virtblk_fops;
388 vblk->disk->driverfs_dev = &vdev->dev; 388 vblk->disk->driverfs_dev = &vdev->dev;
389 index++; 389 index++;
390 390
391 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) { 391 /*
392 /* 392 * If the FLUSH feature is supported we do have support for
393 * If the FLUSH feature is supported we do have support for 393 * flushing a volatile write cache on the host. Use that to
394 * flushing a volatile write cache on the host. Use that 394 * implement write barrier support; otherwise, we must assume
395 * to implement write barrier support. 395 * that the host does not perform any kind of volatile write
396 */ 396 * caching.
397 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH); 397 */
398 } else { 398 if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
399 /* 399 blk_queue_flush(q, REQ_FLUSH);
400 * If the FLUSH feature is not supported we must assume that
401 * the host does not perform any kind of volatile write
402 * caching. We still need to drain the queue to provider
403 * proper barrier semantics.
404 */
405 blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
406 }
407 400
408 /* If disk is read-only in the host, the guest should obey */ 401 /* If disk is read-only in the host, the guest should obey */
409 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) 402 if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
410 set_disk_ro(vblk->disk, 1); 403 set_disk_ro(vblk->disk, 1);
411 404
412 /* Host must always specify the capacity. */ 405 /* Host must always specify the capacity. */
413 vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), 406 vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity),
414 &cap, sizeof(cap)); 407 &cap, sizeof(cap));
415 408
416 /* If capacity is too big, truncate with warning. */ 409 /* If capacity is too big, truncate with warning. */
417 if ((sector_t)cap != cap) { 410 if ((sector_t)cap != cap) {
418 dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", 411 dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n",
419 (unsigned long long)cap); 412 (unsigned long long)cap);
420 cap = (sector_t)-1; 413 cap = (sector_t)-1;
421 } 414 }
422 set_capacity(vblk->disk, cap); 415 set_capacity(vblk->disk, cap);
423 416
424 /* We can handle whatever the host told us to handle. */ 417 /* We can handle whatever the host told us to handle. */
425 blk_queue_max_segments(q, vblk->sg_elems-2); 418 blk_queue_max_segments(q, vblk->sg_elems-2);
426 419
427 /* No need to bounce any requests */ 420 /* No need to bounce any requests */
428 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); 421 blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
429 422
430 /* No real sector limit. */ 423 /* No real sector limit. */
431 blk_queue_max_hw_sectors(q, -1U); 424 blk_queue_max_hw_sectors(q, -1U);
432 425
433 /* Host can optionally specify maximum segment size and number of 426 /* Host can optionally specify maximum segment size and number of
434 * segments. */ 427 * segments. */
435 err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX, 428 err = virtio_config_val(vdev, VIRTIO_BLK_F_SIZE_MAX,
436 offsetof(struct virtio_blk_config, size_max), 429 offsetof(struct virtio_blk_config, size_max),
437 &v); 430 &v);
438 if (!err) 431 if (!err)
439 blk_queue_max_segment_size(q, v); 432 blk_queue_max_segment_size(q, v);
440 else 433 else
441 blk_queue_max_segment_size(q, -1U); 434 blk_queue_max_segment_size(q, -1U);
442 435
443 /* Host can optionally specify the block size of the device */ 436 /* Host can optionally specify the block size of the device */
444 err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE, 437 err = virtio_config_val(vdev, VIRTIO_BLK_F_BLK_SIZE,
445 offsetof(struct virtio_blk_config, blk_size), 438 offsetof(struct virtio_blk_config, blk_size),
446 &blk_size); 439 &blk_size);
447 if (!err) 440 if (!err)
448 blk_queue_logical_block_size(q, blk_size); 441 blk_queue_logical_block_size(q, blk_size);
449 else 442 else
450 blk_size = queue_logical_block_size(q); 443 blk_size = queue_logical_block_size(q);
451 444
452 /* Use topology information if available */ 445 /* Use topology information if available */
453 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, 446 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
454 offsetof(struct virtio_blk_config, physical_block_exp), 447 offsetof(struct virtio_blk_config, physical_block_exp),
455 &physical_block_exp); 448 &physical_block_exp);
456 if (!err && physical_block_exp) 449 if (!err && physical_block_exp)
457 blk_queue_physical_block_size(q, 450 blk_queue_physical_block_size(q,
458 blk_size * (1 << physical_block_exp)); 451 blk_size * (1 << physical_block_exp));
459 452
460 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, 453 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
461 offsetof(struct virtio_blk_config, alignment_offset), 454 offsetof(struct virtio_blk_config, alignment_offset),
462 &alignment_offset); 455 &alignment_offset);
463 if (!err && alignment_offset) 456 if (!err && alignment_offset)
464 blk_queue_alignment_offset(q, blk_size * alignment_offset); 457 blk_queue_alignment_offset(q, blk_size * alignment_offset);
465 458
466 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, 459 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
467 offsetof(struct virtio_blk_config, min_io_size), 460 offsetof(struct virtio_blk_config, min_io_size),
468 &min_io_size); 461 &min_io_size);
469 if (!err && min_io_size) 462 if (!err && min_io_size)
470 blk_queue_io_min(q, blk_size * min_io_size); 463 blk_queue_io_min(q, blk_size * min_io_size);
471 464
472 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY, 465 err = virtio_config_val(vdev, VIRTIO_BLK_F_TOPOLOGY,
473 offsetof(struct virtio_blk_config, opt_io_size), 466 offsetof(struct virtio_blk_config, opt_io_size),
474 &opt_io_size); 467 &opt_io_size);
475 if (!err && opt_io_size) 468 if (!err && opt_io_size)
476 blk_queue_io_opt(q, blk_size * opt_io_size); 469 blk_queue_io_opt(q, blk_size * opt_io_size);
477 470
478 471
479 add_disk(vblk->disk); 472 add_disk(vblk->disk);
480 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); 473 err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial);
481 if (err) 474 if (err)
482 goto out_del_disk; 475 goto out_del_disk;
483 476
484 return 0; 477 return 0;
485 478
486 out_del_disk: 479 out_del_disk:
487 del_gendisk(vblk->disk); 480 del_gendisk(vblk->disk);
488 blk_cleanup_queue(vblk->disk->queue); 481 blk_cleanup_queue(vblk->disk->queue);
489 out_put_disk: 482 out_put_disk:
490 put_disk(vblk->disk); 483 put_disk(vblk->disk);
491 out_mempool: 484 out_mempool:
492 mempool_destroy(vblk->pool); 485 mempool_destroy(vblk->pool);
493 out_free_vq: 486 out_free_vq:
494 vdev->config->del_vqs(vdev); 487 vdev->config->del_vqs(vdev);
495 out_free_vblk: 488 out_free_vblk:
496 kfree(vblk); 489 kfree(vblk);
497 out: 490 out:
498 return err; 491 return err;
499 } 492 }
500 493
501 static void __devexit virtblk_remove(struct virtio_device *vdev) 494 static void __devexit virtblk_remove(struct virtio_device *vdev)
502 { 495 {
503 struct virtio_blk *vblk = vdev->priv; 496 struct virtio_blk *vblk = vdev->priv;
504 497
505 /* Nothing should be pending. */ 498 /* Nothing should be pending. */
506 BUG_ON(!list_empty(&vblk->reqs)); 499 BUG_ON(!list_empty(&vblk->reqs));
507 500
508 /* Stop all the virtqueues. */ 501 /* Stop all the virtqueues. */
509 vdev->config->reset(vdev); 502 vdev->config->reset(vdev);
510 503
511 del_gendisk(vblk->disk); 504 del_gendisk(vblk->disk);
512 blk_cleanup_queue(vblk->disk->queue); 505 blk_cleanup_queue(vblk->disk->queue);
513 put_disk(vblk->disk); 506 put_disk(vblk->disk);
514 mempool_destroy(vblk->pool); 507 mempool_destroy(vblk->pool);
515 vdev->config->del_vqs(vdev); 508 vdev->config->del_vqs(vdev);
516 kfree(vblk); 509 kfree(vblk);
517 } 510 }
518 511
519 static const struct virtio_device_id id_table[] = { 512 static const struct virtio_device_id id_table[] = {
520 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID }, 513 { VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
521 { 0 }, 514 { 0 },
522 }; 515 };
523 516
524 static unsigned int features[] = { 517 static unsigned int features[] = {
525 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, 518 VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
526 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, 519 VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
527 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY 520 VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
528 }; 521 };
529 522
530 /* 523 /*
531 * virtio_blk causes spurious section mismatch warning by 524 * virtio_blk causes spurious section mismatch warning by
532 * simultaneously referring to a __devinit and a __devexit function. 525 * simultaneously referring to a __devinit and a __devexit function.
533 * Use __refdata to avoid this warning. 526 * Use __refdata to avoid this warning.
534 */ 527 */
535 static struct virtio_driver __refdata virtio_blk = { 528 static struct virtio_driver __refdata virtio_blk = {
536 .feature_table = features, 529 .feature_table = features,
537 .feature_table_size = ARRAY_SIZE(features), 530 .feature_table_size = ARRAY_SIZE(features),
538 .driver.name = KBUILD_MODNAME, 531 .driver.name = KBUILD_MODNAME,
539 .driver.owner = THIS_MODULE, 532 .driver.owner = THIS_MODULE,
540 .id_table = id_table, 533 .id_table = id_table,
541 .probe = virtblk_probe, 534 .probe = virtblk_probe,
542 .remove = __devexit_p(virtblk_remove), 535 .remove = __devexit_p(virtblk_remove),
543 }; 536 };
544 537
545 static int __init init(void) 538 static int __init init(void)
546 { 539 {
547 major = register_blkdev(0, "virtblk"); 540 major = register_blkdev(0, "virtblk");
548 if (major < 0) 541 if (major < 0)
549 return major; 542 return major;
550 return register_virtio_driver(&virtio_blk); 543 return register_virtio_driver(&virtio_blk);
551 } 544 }
552 545
553 static void __exit fini(void) 546 static void __exit fini(void)
554 { 547 {
555 unregister_blkdev(major, "virtblk"); 548 unregister_blkdev(major, "virtblk");
556 unregister_virtio_driver(&virtio_blk); 549 unregister_virtio_driver(&virtio_blk);
557 } 550 }
558 module_init(init); 551 module_init(init);
559 module_exit(fini); 552 module_exit(fini);
560 553
561 MODULE_DEVICE_TABLE(virtio, id_table); 554 MODULE_DEVICE_TABLE(virtio, id_table);
562 MODULE_DESCRIPTION("Virtio block driver"); 555 MODULE_DESCRIPTION("Virtio block driver");
563 MODULE_LICENSE("GPL"); 556 MODULE_LICENSE("GPL");
564 557
drivers/block/xen-blkfront.c
1 /* 1 /*
2 * blkfront.c 2 * blkfront.c
3 * 3 *
4 * XenLinux virtual block device driver. 4 * XenLinux virtual block device driver.
5 * 5 *
6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand 6 * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge 7 * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
8 * Copyright (c) 2004, Christian Limpach 8 * Copyright (c) 2004, Christian Limpach
9 * Copyright (c) 2004, Andrew Warfield 9 * Copyright (c) 2004, Andrew Warfield
10 * Copyright (c) 2005, Christopher Clark 10 * Copyright (c) 2005, Christopher Clark
11 * Copyright (c) 2005, XenSource Ltd 11 * Copyright (c) 2005, XenSource Ltd
12 * 12 *
13 * This program is free software; you can redistribute it and/or 13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License version 2 14 * modify it under the terms of the GNU General Public License version 2
15 * as published by the Free Software Foundation; or, when distributed 15 * as published by the Free Software Foundation; or, when distributed
16 * separately from the Linux kernel or incorporated into other 16 * separately from the Linux kernel or incorporated into other
17 * software packages, subject to the following license: 17 * software packages, subject to the following license:
18 * 18 *
19 * Permission is hereby granted, free of charge, to any person obtaining a copy 19 * Permission is hereby granted, free of charge, to any person obtaining a copy
20 * of this source file (the "Software"), to deal in the Software without 20 * of this source file (the "Software"), to deal in the Software without
21 * restriction, including without limitation the rights to use, copy, modify, 21 * restriction, including without limitation the rights to use, copy, modify,
22 * merge, publish, distribute, sublicense, and/or sell copies of the Software, 22 * merge, publish, distribute, sublicense, and/or sell copies of the Software,
23 * and to permit persons to whom the Software is furnished to do so, subject to 23 * and to permit persons to whom the Software is furnished to do so, subject to
24 * the following conditions: 24 * the following conditions:
25 * 25 *
26 * The above copyright notice and this permission notice shall be included in 26 * The above copyright notice and this permission notice shall be included in
27 * all copies or substantial portions of the Software. 27 * all copies or substantial portions of the Software.
28 * 28 *
29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 29 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 30 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 31 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 32 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING 33 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS 34 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
35 * IN THE SOFTWARE. 35 * IN THE SOFTWARE.
36 */ 36 */
37 37
38 #include <linux/interrupt.h> 38 #include <linux/interrupt.h>
39 #include <linux/blkdev.h> 39 #include <linux/blkdev.h>
40 #include <linux/hdreg.h> 40 #include <linux/hdreg.h>
41 #include <linux/cdrom.h> 41 #include <linux/cdrom.h>
42 #include <linux/module.h> 42 #include <linux/module.h>
43 #include <linux/slab.h> 43 #include <linux/slab.h>
44 #include <linux/smp_lock.h> 44 #include <linux/smp_lock.h>
45 #include <linux/scatterlist.h> 45 #include <linux/scatterlist.h>
46 46
47 #include <xen/xen.h> 47 #include <xen/xen.h>
48 #include <xen/xenbus.h> 48 #include <xen/xenbus.h>
49 #include <xen/grant_table.h> 49 #include <xen/grant_table.h>
50 #include <xen/events.h> 50 #include <xen/events.h>
51 #include <xen/page.h> 51 #include <xen/page.h>
52 #include <xen/platform_pci.h> 52 #include <xen/platform_pci.h>
53 53
54 #include <xen/interface/grant_table.h> 54 #include <xen/interface/grant_table.h>
55 #include <xen/interface/io/blkif.h> 55 #include <xen/interface/io/blkif.h>
56 #include <xen/interface/io/protocols.h> 56 #include <xen/interface/io/protocols.h>
57 57
58 #include <asm/xen/hypervisor.h> 58 #include <asm/xen/hypervisor.h>
59 59
60 enum blkif_state { 60 enum blkif_state {
61 BLKIF_STATE_DISCONNECTED, 61 BLKIF_STATE_DISCONNECTED,
62 BLKIF_STATE_CONNECTED, 62 BLKIF_STATE_CONNECTED,
63 BLKIF_STATE_SUSPENDED, 63 BLKIF_STATE_SUSPENDED,
64 }; 64 };
65 65
66 struct blk_shadow { 66 struct blk_shadow {
67 struct blkif_request req; 67 struct blkif_request req;
68 unsigned long request; 68 unsigned long request;
69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 69 unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
70 }; 70 };
71 71
72 static const struct block_device_operations xlvbd_block_fops; 72 static const struct block_device_operations xlvbd_block_fops;
73 73
74 #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE) 74 #define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
75 75
76 /* 76 /*
77 * We have one of these per vbd, whether ide, scsi or 'other'. They 77 * We have one of these per vbd, whether ide, scsi or 'other'. They
78 * hang in private_data off the gendisk structure. We may end up 78 * hang in private_data off the gendisk structure. We may end up
79 * putting all kinds of interesting stuff here :-) 79 * putting all kinds of interesting stuff here :-)
80 */ 80 */
81 struct blkfront_info 81 struct blkfront_info
82 { 82 {
83 struct mutex mutex; 83 struct mutex mutex;
84 struct xenbus_device *xbdev; 84 struct xenbus_device *xbdev;
85 struct gendisk *gd; 85 struct gendisk *gd;
86 int vdevice; 86 int vdevice;
87 blkif_vdev_t handle; 87 blkif_vdev_t handle;
88 enum blkif_state connected; 88 enum blkif_state connected;
89 int ring_ref; 89 int ring_ref;
90 struct blkif_front_ring ring; 90 struct blkif_front_ring ring;
91 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; 91 struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
92 unsigned int evtchn, irq; 92 unsigned int evtchn, irq;
93 struct request_queue *rq; 93 struct request_queue *rq;
94 struct work_struct work; 94 struct work_struct work;
95 struct gnttab_free_callback callback; 95 struct gnttab_free_callback callback;
96 struct blk_shadow shadow[BLK_RING_SIZE]; 96 struct blk_shadow shadow[BLK_RING_SIZE];
97 unsigned long shadow_free; 97 unsigned long shadow_free;
98 int feature_barrier; 98 unsigned int feature_flush;
99 int is_ready; 99 int is_ready;
100 }; 100 };
101 101
102 static DEFINE_SPINLOCK(blkif_io_lock); 102 static DEFINE_SPINLOCK(blkif_io_lock);
103 103
104 static unsigned int nr_minors; 104 static unsigned int nr_minors;
105 static unsigned long *minors; 105 static unsigned long *minors;
106 static DEFINE_SPINLOCK(minor_lock); 106 static DEFINE_SPINLOCK(minor_lock);
107 107
108 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \ 108 #define MAXIMUM_OUTSTANDING_BLOCK_REQS \
109 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) 109 (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE)
110 #define GRANT_INVALID_REF 0 110 #define GRANT_INVALID_REF 0
111 111
112 #define PARTS_PER_DISK 16 112 #define PARTS_PER_DISK 16
113 #define PARTS_PER_EXT_DISK 256 113 #define PARTS_PER_EXT_DISK 256
114 114
115 #define BLKIF_MAJOR(dev) ((dev)>>8) 115 #define BLKIF_MAJOR(dev) ((dev)>>8)
116 #define BLKIF_MINOR(dev) ((dev) & 0xff) 116 #define BLKIF_MINOR(dev) ((dev) & 0xff)
117 117
118 #define EXT_SHIFT 28 118 #define EXT_SHIFT 28
119 #define EXTENDED (1<<EXT_SHIFT) 119 #define EXTENDED (1<<EXT_SHIFT)
120 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED)) 120 #define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
121 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED)) 121 #define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
122 122
123 #define DEV_NAME "xvd" /* name in /dev */ 123 #define DEV_NAME "xvd" /* name in /dev */
124 124
125 static int get_id_from_freelist(struct blkfront_info *info) 125 static int get_id_from_freelist(struct blkfront_info *info)
126 { 126 {
127 unsigned long free = info->shadow_free; 127 unsigned long free = info->shadow_free;
128 BUG_ON(free >= BLK_RING_SIZE); 128 BUG_ON(free >= BLK_RING_SIZE);
129 info->shadow_free = info->shadow[free].req.id; 129 info->shadow_free = info->shadow[free].req.id;
130 info->shadow[free].req.id = 0x0fffffee; /* debug */ 130 info->shadow[free].req.id = 0x0fffffee; /* debug */
131 return free; 131 return free;
132 } 132 }
133 133
134 static void add_id_to_freelist(struct blkfront_info *info, 134 static void add_id_to_freelist(struct blkfront_info *info,
135 unsigned long id) 135 unsigned long id)
136 { 136 {
137 info->shadow[id].req.id = info->shadow_free; 137 info->shadow[id].req.id = info->shadow_free;
138 info->shadow[id].request = 0; 138 info->shadow[id].request = 0;
139 info->shadow_free = id; 139 info->shadow_free = id;
140 } 140 }
141 141
142 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) 142 static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
143 { 143 {
144 unsigned int end = minor + nr; 144 unsigned int end = minor + nr;
145 int rc; 145 int rc;
146 146
147 if (end > nr_minors) { 147 if (end > nr_minors) {
148 unsigned long *bitmap, *old; 148 unsigned long *bitmap, *old;
149 149
150 bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap), 150 bitmap = kzalloc(BITS_TO_LONGS(end) * sizeof(*bitmap),
151 GFP_KERNEL); 151 GFP_KERNEL);
152 if (bitmap == NULL) 152 if (bitmap == NULL)
153 return -ENOMEM; 153 return -ENOMEM;
154 154
155 spin_lock(&minor_lock); 155 spin_lock(&minor_lock);
156 if (end > nr_minors) { 156 if (end > nr_minors) {
157 old = minors; 157 old = minors;
158 memcpy(bitmap, minors, 158 memcpy(bitmap, minors,
159 BITS_TO_LONGS(nr_minors) * sizeof(*bitmap)); 159 BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
160 minors = bitmap; 160 minors = bitmap;
161 nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG; 161 nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
162 } else 162 } else
163 old = bitmap; 163 old = bitmap;
164 spin_unlock(&minor_lock); 164 spin_unlock(&minor_lock);
165 kfree(old); 165 kfree(old);
166 } 166 }
167 167
168 spin_lock(&minor_lock); 168 spin_lock(&minor_lock);
169 if (find_next_bit(minors, end, minor) >= end) { 169 if (find_next_bit(minors, end, minor) >= end) {
170 for (; minor < end; ++minor) 170 for (; minor < end; ++minor)
171 __set_bit(minor, minors); 171 __set_bit(minor, minors);
172 rc = 0; 172 rc = 0;
173 } else 173 } else
174 rc = -EBUSY; 174 rc = -EBUSY;
175 spin_unlock(&minor_lock); 175 spin_unlock(&minor_lock);
176 176
177 return rc; 177 return rc;
178 } 178 }
179 179
180 static void xlbd_release_minors(unsigned int minor, unsigned int nr) 180 static void xlbd_release_minors(unsigned int minor, unsigned int nr)
181 { 181 {
182 unsigned int end = minor + nr; 182 unsigned int end = minor + nr;
183 183
184 BUG_ON(end > nr_minors); 184 BUG_ON(end > nr_minors);
185 spin_lock(&minor_lock); 185 spin_lock(&minor_lock);
186 for (; minor < end; ++minor) 186 for (; minor < end; ++minor)
187 __clear_bit(minor, minors); 187 __clear_bit(minor, minors);
188 spin_unlock(&minor_lock); 188 spin_unlock(&minor_lock);
189 } 189 }
190 190
191 static void blkif_restart_queue_callback(void *arg) 191 static void blkif_restart_queue_callback(void *arg)
192 { 192 {
193 struct blkfront_info *info = (struct blkfront_info *)arg; 193 struct blkfront_info *info = (struct blkfront_info *)arg;
194 schedule_work(&info->work); 194 schedule_work(&info->work);
195 } 195 }
196 196
197 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) 197 static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
198 { 198 {
199 /* We don't have real geometry info, but let's at least return 199 /* We don't have real geometry info, but let's at least return
200 values consistent with the size of the device */ 200 values consistent with the size of the device */
201 sector_t nsect = get_capacity(bd->bd_disk); 201 sector_t nsect = get_capacity(bd->bd_disk);
202 sector_t cylinders = nsect; 202 sector_t cylinders = nsect;
203 203
204 hg->heads = 0xff; 204 hg->heads = 0xff;
205 hg->sectors = 0x3f; 205 hg->sectors = 0x3f;
206 sector_div(cylinders, hg->heads * hg->sectors); 206 sector_div(cylinders, hg->heads * hg->sectors);
207 hg->cylinders = cylinders; 207 hg->cylinders = cylinders;
208 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) 208 if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
209 hg->cylinders = 0xffff; 209 hg->cylinders = 0xffff;
210 return 0; 210 return 0;
211 } 211 }
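As a worked example of the made-up geometry above (illustrative numbers, not taken from the source): a 16 GiB disk reports nsect = 33554432 sectors, so cylinders = 33554432 / (0xff * 0x3f) = 33554432 / 16065, which sector_div() truncates to 2088; since (2088 + 1) * 255 * 63 = 33559785 already covers all 33554432 sectors, the 0xffff clamp does not trigger.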
212 212
213 static int blkif_ioctl(struct block_device *bdev, fmode_t mode, 213 static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
214 unsigned command, unsigned long argument) 214 unsigned command, unsigned long argument)
215 { 215 {
216 struct blkfront_info *info = bdev->bd_disk->private_data; 216 struct blkfront_info *info = bdev->bd_disk->private_data;
217 int i; 217 int i;
218 218
219 dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n", 219 dev_dbg(&info->xbdev->dev, "command: 0x%x, argument: 0x%lx\n",
220 command, (long)argument); 220 command, (long)argument);
221 221
222 switch (command) { 222 switch (command) {
223 case CDROMMULTISESSION: 223 case CDROMMULTISESSION:
224 dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n"); 224 dev_dbg(&info->xbdev->dev, "FIXME: support multisession CDs later\n");
225 for (i = 0; i < sizeof(struct cdrom_multisession); i++) 225 for (i = 0; i < sizeof(struct cdrom_multisession); i++)
226 if (put_user(0, (char __user *)(argument + i))) 226 if (put_user(0, (char __user *)(argument + i)))
227 return -EFAULT; 227 return -EFAULT;
228 return 0; 228 return 0;
229 229
230 case CDROM_GET_CAPABILITY: { 230 case CDROM_GET_CAPABILITY: {
231 struct gendisk *gd = info->gd; 231 struct gendisk *gd = info->gd;
232 if (gd->flags & GENHD_FL_CD) 232 if (gd->flags & GENHD_FL_CD)
233 return 0; 233 return 0;
234 return -EINVAL; 234 return -EINVAL;
235 } 235 }
236 236
237 default: 237 default:
238 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", 238 /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n",
239 command);*/ 239 command);*/
240 return -EINVAL; /* same return as native Linux */ 240 return -EINVAL; /* same return as native Linux */
241 } 241 }
242 242
243 return 0; 243 return 0;
244 } 244 }
245 245
246 /* 246 /*
247 * blkif_queue_request 247 * blkif_queue_request
248 * 248 *
249 * request block io 249 * request block io
250 * 250 *
251 * id: for guest use only. 251 * id: for guest use only.
252 * operation: BLKIF_OP_{READ,WRITE,PROBE} 252 * operation: BLKIF_OP_{READ,WRITE,PROBE}
253 * buffer: buffer to read/write into. this should be a 253 * buffer: buffer to read/write into. this should be a
254 * virtual address in the guest os. 254 * virtual address in the guest os.
255 */ 255 */
256 static int blkif_queue_request(struct request *req) 256 static int blkif_queue_request(struct request *req)
257 { 257 {
258 struct blkfront_info *info = req->rq_disk->private_data; 258 struct blkfront_info *info = req->rq_disk->private_data;
259 unsigned long buffer_mfn; 259 unsigned long buffer_mfn;
260 struct blkif_request *ring_req; 260 struct blkif_request *ring_req;
261 unsigned long id; 261 unsigned long id;
262 unsigned int fsect, lsect; 262 unsigned int fsect, lsect;
263 int i, ref; 263 int i, ref;
264 grant_ref_t gref_head; 264 grant_ref_t gref_head;
265 struct scatterlist *sg; 265 struct scatterlist *sg;
266 266
267 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) 267 if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
268 return 1; 268 return 1;
269 269
270 if (gnttab_alloc_grant_references( 270 if (gnttab_alloc_grant_references(
271 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { 271 BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) {
272 gnttab_request_free_callback( 272 gnttab_request_free_callback(
273 &info->callback, 273 &info->callback,
274 blkif_restart_queue_callback, 274 blkif_restart_queue_callback,
275 info, 275 info,
276 BLKIF_MAX_SEGMENTS_PER_REQUEST); 276 BLKIF_MAX_SEGMENTS_PER_REQUEST);
277 return 1; 277 return 1;
278 } 278 }
279 279
280 /* Fill out a communications ring structure. */ 280 /* Fill out a communications ring structure. */
281 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 281 ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
282 id = get_id_from_freelist(info); 282 id = get_id_from_freelist(info);
283 info->shadow[id].request = (unsigned long)req; 283 info->shadow[id].request = (unsigned long)req;
284 284
285 ring_req->id = id; 285 ring_req->id = id;
286 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); 286 ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req);
287 ring_req->handle = info->handle; 287 ring_req->handle = info->handle;
288 288
289 ring_req->operation = rq_data_dir(req) ? 289 ring_req->operation = rq_data_dir(req) ?
290 BLKIF_OP_WRITE : BLKIF_OP_READ; 290 BLKIF_OP_WRITE : BLKIF_OP_READ;
291 if (req->cmd_flags & REQ_HARDBARRIER) 291 if (req->cmd_flags & REQ_HARDBARRIER)
292 ring_req->operation = BLKIF_OP_WRITE_BARRIER; 292 ring_req->operation = BLKIF_OP_WRITE_BARRIER;
293 293
294 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); 294 ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg);
295 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); 295 BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
296 296
297 for_each_sg(info->sg, sg, ring_req->nr_segments, i) { 297 for_each_sg(info->sg, sg, ring_req->nr_segments, i) {
298 buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); 298 buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg)));
299 fsect = sg->offset >> 9; 299 fsect = sg->offset >> 9;
300 lsect = fsect + (sg->length >> 9) - 1; 300 lsect = fsect + (sg->length >> 9) - 1;
301 /* install a grant reference. */ 301 /* install a grant reference. */
302 ref = gnttab_claim_grant_reference(&gref_head); 302 ref = gnttab_claim_grant_reference(&gref_head);
303 BUG_ON(ref == -ENOSPC); 303 BUG_ON(ref == -ENOSPC);
304 304
305 gnttab_grant_foreign_access_ref( 305 gnttab_grant_foreign_access_ref(
306 ref, 306 ref,
307 info->xbdev->otherend_id, 307 info->xbdev->otherend_id,
308 buffer_mfn, 308 buffer_mfn,
309 rq_data_dir(req) ); 309 rq_data_dir(req) );
310 310
311 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); 311 info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn);
312 ring_req->seg[i] = 312 ring_req->seg[i] =
313 (struct blkif_request_segment) { 313 (struct blkif_request_segment) {
314 .gref = ref, 314 .gref = ref,
315 .first_sect = fsect, 315 .first_sect = fsect,
316 .last_sect = lsect }; 316 .last_sect = lsect };
317 } 317 }
318 318
319 info->ring.req_prod_pvt++; 319 info->ring.req_prod_pvt++;
320 320
321 /* Keep a private copy so we can reissue requests when recovering. */ 321 /* Keep a private copy so we can reissue requests when recovering. */
322 info->shadow[id].req = *ring_req; 322 info->shadow[id].req = *ring_req;
323 323
324 gnttab_free_grant_references(gref_head); 324 gnttab_free_grant_references(gref_head);
325 325
326 return 0; 326 return 0;
327 } 327 }
328 328
329 329
330 static inline void flush_requests(struct blkfront_info *info) 330 static inline void flush_requests(struct blkfront_info *info)
331 { 331 {
332 int notify; 332 int notify;
333 333
334 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); 334 RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify);
335 335
336 if (notify) 336 if (notify)
337 notify_remote_via_irq(info->irq); 337 notify_remote_via_irq(info->irq);
338 } 338 }
339 339
340 /* 340 /*
341 * do_blkif_request 341 * do_blkif_request
342 * read a block; request is in a request queue 342 * read a block; request is in a request queue
343 */ 343 */
344 static void do_blkif_request(struct request_queue *rq) 344 static void do_blkif_request(struct request_queue *rq)
345 { 345 {
346 struct blkfront_info *info = NULL; 346 struct blkfront_info *info = NULL;
347 struct request *req; 347 struct request *req;
348 int queued; 348 int queued;
349 349
350 pr_debug("Entered do_blkif_request\n"); 350 pr_debug("Entered do_blkif_request\n");
351 351
352 queued = 0; 352 queued = 0;
353 353
354 while ((req = blk_peek_request(rq)) != NULL) { 354 while ((req = blk_peek_request(rq)) != NULL) {
355 info = req->rq_disk->private_data; 355 info = req->rq_disk->private_data;
356 356
357 if (RING_FULL(&info->ring)) 357 if (RING_FULL(&info->ring))
358 goto wait; 358 goto wait;
359 359
360 blk_start_request(req); 360 blk_start_request(req);
361 361
362 if (req->cmd_type != REQ_TYPE_FS) { 362 if (req->cmd_type != REQ_TYPE_FS) {
363 __blk_end_request_all(req, -EIO); 363 __blk_end_request_all(req, -EIO);
364 continue; 364 continue;
365 } 365 }
366 366
367 pr_debug("do_blk_req %p: cmd %p, sec %lx, " 367 pr_debug("do_blk_req %p: cmd %p, sec %lx, "
368 "(%u/%u) buffer:%p [%s]\n", 368 "(%u/%u) buffer:%p [%s]\n",
369 req, req->cmd, (unsigned long)blk_rq_pos(req), 369 req, req->cmd, (unsigned long)blk_rq_pos(req),
370 blk_rq_cur_sectors(req), blk_rq_sectors(req), 370 blk_rq_cur_sectors(req), blk_rq_sectors(req),
371 req->buffer, rq_data_dir(req) ? "write" : "read"); 371 req->buffer, rq_data_dir(req) ? "write" : "read");
372 372
373 if (blkif_queue_request(req)) { 373 if (blkif_queue_request(req)) {
374 blk_requeue_request(rq, req); 374 blk_requeue_request(rq, req);
375 wait: 375 wait:
376 /* Avoid pointless unplugs. */ 376 /* Avoid pointless unplugs. */
377 blk_stop_queue(rq); 377 blk_stop_queue(rq);
378 break; 378 break;
379 } 379 }
380 380
381 queued++; 381 queued++;
382 } 382 }
383 383
384 if (queued != 0) 384 if (queued != 0)
385 flush_requests(info); 385 flush_requests(info);
386 } 386 }
387 387
388 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) 388 static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
389 { 389 {
390 struct request_queue *rq; 390 struct request_queue *rq;
391 391
392 rq = blk_init_queue(do_blkif_request, &blkif_io_lock); 392 rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
393 if (rq == NULL) 393 if (rq == NULL)
394 return -1; 394 return -1;
395 395
396 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); 396 queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq);
397 397
398 /* Hard sector size and max sectors impersonate the equiv. hardware. */ 398 /* Hard sector size and max sectors impersonate the equiv. hardware. */
399 blk_queue_logical_block_size(rq, sector_size); 399 blk_queue_logical_block_size(rq, sector_size);
400 blk_queue_max_hw_sectors(rq, 512); 400 blk_queue_max_hw_sectors(rq, 512);
401 401
402 /* Each segment in a request is up to an aligned page in size. */ 402 /* Each segment in a request is up to an aligned page in size. */
403 blk_queue_segment_boundary(rq, PAGE_SIZE - 1); 403 blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
404 blk_queue_max_segment_size(rq, PAGE_SIZE); 404 blk_queue_max_segment_size(rq, PAGE_SIZE);
405 405
406 /* Ensure a merged request will fit in a single I/O ring slot. */ 406 /* Ensure a merged request will fit in a single I/O ring slot. */
407 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); 407 blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
408 408
409 /* Make sure buffer addresses are sector-aligned. */ 409 /* Make sure buffer addresses are sector-aligned. */
410 blk_queue_dma_alignment(rq, 511); 410 blk_queue_dma_alignment(rq, 511);
411 411
412 /* Make sure we don't use bounce buffers. */ 412 /* Make sure we don't use bounce buffers. */
413 blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY); 413 blk_queue_bounce_limit(rq, BLK_BOUNCE_ANY);
414 414
415 gd->queue = rq; 415 gd->queue = rq;
416 416
417 return 0; 417 return 0;
418 } 418 }
419 419
420 420
421 static int xlvbd_barrier(struct blkfront_info *info) 421 static void xlvbd_flush(struct blkfront_info *info)
422 { 422 {
423 int err; 423 blk_queue_flush(info->rq, info->feature_flush);
424 const char *barrier;
425
426 switch (info->feature_barrier) {
427 case QUEUE_ORDERED_DRAIN: barrier = "enabled"; break;
428 case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
429 default: return -EINVAL;
430 }
431
432 err = blk_queue_ordered(info->rq, info->feature_barrier);
433
434 if (err)
435 return err;
436
437 printk(KERN_INFO "blkfront: %s: barriers %s\n", 424 printk(KERN_INFO "blkfront: %s: barriers %s\n",
438 info->gd->disk_name, barrier); 425 info->gd->disk_name,
439 return 0; 426 info->feature_flush ? "enabled" : "disabled");
440 } 427 }
441 428
442 429
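A minimal sketch of the queue-side interface this hunk switches to, assuming only what the blk_queue_flush() call above implies; the helper name and its parameters are illustrative, not part of the diff:

/*
 * Illustrative sketch only: a driver with a volatile write cache
 * describes its capabilities with blk_queue_flush(), which accepts
 * any combination of REQ_FLUSH and REQ_FUA.  This helper and its
 * arguments are hypothetical, not taken from this commit.
 */
static void example_setup_flush(struct request_queue *q,
				bool has_cache, bool has_fua)
{
	unsigned int flush = 0;

	if (has_cache)
		flush |= REQ_FLUSH;	/* write cache present and flushable */
	if (has_cache && has_fua)
		flush |= REQ_FUA;	/* device honours FUA writes */

	blk_queue_flush(q, flush);	/* 0 means no cache, nothing to do */
}

Passing 0 leaves the queue in its default state, which matches the behaviour of the old QUEUE_ORDERED_DRAIN setting.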
443 static int xlvbd_alloc_gendisk(blkif_sector_t capacity, 430 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
444 struct blkfront_info *info, 431 struct blkfront_info *info,
445 u16 vdisk_info, u16 sector_size) 432 u16 vdisk_info, u16 sector_size)
446 { 433 {
447 struct gendisk *gd; 434 struct gendisk *gd;
448 int nr_minors = 1; 435 int nr_minors = 1;
449 int err = -ENODEV; 436 int err = -ENODEV;
450 unsigned int offset; 437 unsigned int offset;
451 int minor; 438 int minor;
452 int nr_parts; 439 int nr_parts;
453 440
454 BUG_ON(info->gd != NULL); 441 BUG_ON(info->gd != NULL);
455 BUG_ON(info->rq != NULL); 442 BUG_ON(info->rq != NULL);
456 443
457 if ((info->vdevice>>EXT_SHIFT) > 1) { 444 if ((info->vdevice>>EXT_SHIFT) > 1) {
458 /* this is above the extended range; something is wrong */ 445 /* this is above the extended range; something is wrong */
459 printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice); 446 printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
460 return -ENODEV; 447 return -ENODEV;
461 } 448 }
462 449
463 if (!VDEV_IS_EXTENDED(info->vdevice)) { 450 if (!VDEV_IS_EXTENDED(info->vdevice)) {
464 minor = BLKIF_MINOR(info->vdevice); 451 minor = BLKIF_MINOR(info->vdevice);
465 nr_parts = PARTS_PER_DISK; 452 nr_parts = PARTS_PER_DISK;
466 } else { 453 } else {
467 minor = BLKIF_MINOR_EXT(info->vdevice); 454 minor = BLKIF_MINOR_EXT(info->vdevice);
468 nr_parts = PARTS_PER_EXT_DISK; 455 nr_parts = PARTS_PER_EXT_DISK;
469 } 456 }
470 457
471 if ((minor % nr_parts) == 0) 458 if ((minor % nr_parts) == 0)
472 nr_minors = nr_parts; 459 nr_minors = nr_parts;
473 460
474 err = xlbd_reserve_minors(minor, nr_minors); 461 err = xlbd_reserve_minors(minor, nr_minors);
475 if (err) 462 if (err)
476 goto out; 463 goto out;
477 err = -ENODEV; 464 err = -ENODEV;
478 465
479 gd = alloc_disk(nr_minors); 466 gd = alloc_disk(nr_minors);
480 if (gd == NULL) 467 if (gd == NULL)
481 goto release; 468 goto release;
482 469
483 offset = minor / nr_parts; 470 offset = minor / nr_parts;
484 471
485 if (nr_minors > 1) { 472 if (nr_minors > 1) {
486 if (offset < 26) 473 if (offset < 26)
487 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); 474 sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
488 else 475 else
489 sprintf(gd->disk_name, "%s%c%c", DEV_NAME, 476 sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
490 'a' + ((offset / 26)-1), 'a' + (offset % 26)); 477 'a' + ((offset / 26)-1), 'a' + (offset % 26));
491 } else { 478 } else {
492 if (offset < 26) 479 if (offset < 26)
493 sprintf(gd->disk_name, "%s%c%d", DEV_NAME, 480 sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
494 'a' + offset, 481 'a' + offset,
495 minor & (nr_parts - 1)); 482 minor & (nr_parts - 1));
496 else 483 else
497 sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME, 484 sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
498 'a' + ((offset / 26) - 1), 485 'a' + ((offset / 26) - 1),
499 'a' + (offset % 26), 486 'a' + (offset % 26),
500 minor & (nr_parts - 1)); 487 minor & (nr_parts - 1));
501 } 488 }
502 489
503 gd->major = XENVBD_MAJOR; 490 gd->major = XENVBD_MAJOR;
504 gd->first_minor = minor; 491 gd->first_minor = minor;
505 gd->fops = &xlvbd_block_fops; 492 gd->fops = &xlvbd_block_fops;
506 gd->private_data = info; 493 gd->private_data = info;
507 gd->driverfs_dev = &(info->xbdev->dev); 494 gd->driverfs_dev = &(info->xbdev->dev);
508 set_capacity(gd, capacity); 495 set_capacity(gd, capacity);
509 496
510 if (xlvbd_init_blk_queue(gd, sector_size)) { 497 if (xlvbd_init_blk_queue(gd, sector_size)) {
511 del_gendisk(gd); 498 del_gendisk(gd);
512 goto release; 499 goto release;
513 } 500 }
514 501
515 info->rq = gd->queue; 502 info->rq = gd->queue;
516 info->gd = gd; 503 info->gd = gd;
517 504
518 xlvbd_barrier(info); 505 xlvbd_flush(info);
519 506
520 if (vdisk_info & VDISK_READONLY) 507 if (vdisk_info & VDISK_READONLY)
521 set_disk_ro(gd, 1); 508 set_disk_ro(gd, 1);
522 509
523 if (vdisk_info & VDISK_REMOVABLE) 510 if (vdisk_info & VDISK_REMOVABLE)
524 gd->flags |= GENHD_FL_REMOVABLE; 511 gd->flags |= GENHD_FL_REMOVABLE;
525 512
526 if (vdisk_info & VDISK_CDROM) 513 if (vdisk_info & VDISK_CDROM)
527 gd->flags |= GENHD_FL_CD; 514 gd->flags |= GENHD_FL_CD;
528 515
529 return 0; 516 return 0;
530 517
531 release: 518 release:
532 xlbd_release_minors(minor, nr_minors); 519 xlbd_release_minors(minor, nr_minors);
533 out: 520 out:
534 return err; 521 return err;
535 } 522 }
536 523
537 static void xlvbd_release_gendisk(struct blkfront_info *info) 524 static void xlvbd_release_gendisk(struct blkfront_info *info)
538 { 525 {
539 unsigned int minor, nr_minors; 526 unsigned int minor, nr_minors;
540 unsigned long flags; 527 unsigned long flags;
541 528
542 if (info->rq == NULL) 529 if (info->rq == NULL)
543 return; 530 return;
544 531
545 spin_lock_irqsave(&blkif_io_lock, flags); 532 spin_lock_irqsave(&blkif_io_lock, flags);
546 533
547 /* No more blkif_request(). */ 534 /* No more blkif_request(). */
548 blk_stop_queue(info->rq); 535 blk_stop_queue(info->rq);
549 536
550 /* No more gnttab callback work. */ 537 /* No more gnttab callback work. */
551 gnttab_cancel_free_callback(&info->callback); 538 gnttab_cancel_free_callback(&info->callback);
552 spin_unlock_irqrestore(&blkif_io_lock, flags); 539 spin_unlock_irqrestore(&blkif_io_lock, flags);
553 540
554 /* Flush gnttab callback work. Must be done with no locks held. */ 541 /* Flush gnttab callback work. Must be done with no locks held. */
555 flush_scheduled_work(); 542 flush_scheduled_work();
556 543
557 del_gendisk(info->gd); 544 del_gendisk(info->gd);
558 545
559 minor = info->gd->first_minor; 546 minor = info->gd->first_minor;
560 nr_minors = info->gd->minors; 547 nr_minors = info->gd->minors;
561 xlbd_release_minors(minor, nr_minors); 548 xlbd_release_minors(minor, nr_minors);
562 549
563 blk_cleanup_queue(info->rq); 550 blk_cleanup_queue(info->rq);
564 info->rq = NULL; 551 info->rq = NULL;
565 552
566 put_disk(info->gd); 553 put_disk(info->gd);
567 info->gd = NULL; 554 info->gd = NULL;
568 } 555 }
569 556
570 static void kick_pending_request_queues(struct blkfront_info *info) 557 static void kick_pending_request_queues(struct blkfront_info *info)
571 { 558 {
572 if (!RING_FULL(&info->ring)) { 559 if (!RING_FULL(&info->ring)) {
573 /* Re-enable calldowns. */ 560 /* Re-enable calldowns. */
574 blk_start_queue(info->rq); 561 blk_start_queue(info->rq);
575 /* Kick things off immediately. */ 562 /* Kick things off immediately. */
576 do_blkif_request(info->rq); 563 do_blkif_request(info->rq);
577 } 564 }
578 } 565 }
579 566
580 static void blkif_restart_queue(struct work_struct *work) 567 static void blkif_restart_queue(struct work_struct *work)
581 { 568 {
582 struct blkfront_info *info = container_of(work, struct blkfront_info, work); 569 struct blkfront_info *info = container_of(work, struct blkfront_info, work);
583 570
584 spin_lock_irq(&blkif_io_lock); 571 spin_lock_irq(&blkif_io_lock);
585 if (info->connected == BLKIF_STATE_CONNECTED) 572 if (info->connected == BLKIF_STATE_CONNECTED)
586 kick_pending_request_queues(info); 573 kick_pending_request_queues(info);
587 spin_unlock_irq(&blkif_io_lock); 574 spin_unlock_irq(&blkif_io_lock);
588 } 575 }
589 576
590 static void blkif_free(struct blkfront_info *info, int suspend) 577 static void blkif_free(struct blkfront_info *info, int suspend)
591 { 578 {
592 /* Prevent new requests being issued until we fix things up. */ 579 /* Prevent new requests being issued until we fix things up. */
593 spin_lock_irq(&blkif_io_lock); 580 spin_lock_irq(&blkif_io_lock);
594 info->connected = suspend ? 581 info->connected = suspend ?
595 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; 582 BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
596 /* No more blkif_request(). */ 583 /* No more blkif_request(). */
597 if (info->rq) 584 if (info->rq)
598 blk_stop_queue(info->rq); 585 blk_stop_queue(info->rq);
599 /* No more gnttab callback work. */ 586 /* No more gnttab callback work. */
600 gnttab_cancel_free_callback(&info->callback); 587 gnttab_cancel_free_callback(&info->callback);
601 spin_unlock_irq(&blkif_io_lock); 588 spin_unlock_irq(&blkif_io_lock);
602 589
603 /* Flush gnttab callback work. Must be done with no locks held. */ 590 /* Flush gnttab callback work. Must be done with no locks held. */
604 flush_scheduled_work(); 591 flush_scheduled_work();
605 592
606 /* Free resources associated with old device channel. */ 593 /* Free resources associated with old device channel. */
607 if (info->ring_ref != GRANT_INVALID_REF) { 594 if (info->ring_ref != GRANT_INVALID_REF) {
608 gnttab_end_foreign_access(info->ring_ref, 0, 595 gnttab_end_foreign_access(info->ring_ref, 0,
609 (unsigned long)info->ring.sring); 596 (unsigned long)info->ring.sring);
610 info->ring_ref = GRANT_INVALID_REF; 597 info->ring_ref = GRANT_INVALID_REF;
611 info->ring.sring = NULL; 598 info->ring.sring = NULL;
612 } 599 }
613 if (info->irq) 600 if (info->irq)
614 unbind_from_irqhandler(info->irq, info); 601 unbind_from_irqhandler(info->irq, info);
615 info->evtchn = info->irq = 0; 602 info->evtchn = info->irq = 0;
616 603
617 } 604 }
618 605
619 static void blkif_completion(struct blk_shadow *s) 606 static void blkif_completion(struct blk_shadow *s)
620 { 607 {
621 int i; 608 int i;
622 for (i = 0; i < s->req.nr_segments; i++) 609 for (i = 0; i < s->req.nr_segments; i++)
623 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); 610 gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL);
624 } 611 }
625 612
626 static irqreturn_t blkif_interrupt(int irq, void *dev_id) 613 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
627 { 614 {
628 struct request *req; 615 struct request *req;
629 struct blkif_response *bret; 616 struct blkif_response *bret;
630 RING_IDX i, rp; 617 RING_IDX i, rp;
631 unsigned long flags; 618 unsigned long flags;
632 struct blkfront_info *info = (struct blkfront_info *)dev_id; 619 struct blkfront_info *info = (struct blkfront_info *)dev_id;
633 int error; 620 int error;
634 621
635 spin_lock_irqsave(&blkif_io_lock, flags); 622 spin_lock_irqsave(&blkif_io_lock, flags);
636 623
637 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { 624 if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
638 spin_unlock_irqrestore(&blkif_io_lock, flags); 625 spin_unlock_irqrestore(&blkif_io_lock, flags);
639 return IRQ_HANDLED; 626 return IRQ_HANDLED;
640 } 627 }
641 628
642 again: 629 again:
643 rp = info->ring.sring->rsp_prod; 630 rp = info->ring.sring->rsp_prod;
644 rmb(); /* Ensure we see queued responses up to 'rp'. */ 631 rmb(); /* Ensure we see queued responses up to 'rp'. */
645 632
646 for (i = info->ring.rsp_cons; i != rp; i++) { 633 for (i = info->ring.rsp_cons; i != rp; i++) {
647 unsigned long id; 634 unsigned long id;
648 635
649 bret = RING_GET_RESPONSE(&info->ring, i); 636 bret = RING_GET_RESPONSE(&info->ring, i);
650 id = bret->id; 637 id = bret->id;
651 req = (struct request *)info->shadow[id].request; 638 req = (struct request *)info->shadow[id].request;
652 639
653 blkif_completion(&info->shadow[id]); 640 blkif_completion(&info->shadow[id]);
654 641
655 add_id_to_freelist(info, id); 642 add_id_to_freelist(info, id);
656 643
657 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; 644 error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO;
658 switch (bret->operation) { 645 switch (bret->operation) {
659 case BLKIF_OP_WRITE_BARRIER: 646 case BLKIF_OP_WRITE_BARRIER:
660 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { 647 if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) {
661 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", 648 printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
662 info->gd->disk_name); 649 info->gd->disk_name);
663 error = -EOPNOTSUPP; 650 error = -EOPNOTSUPP;
664 info->feature_barrier = QUEUE_ORDERED_NONE; 651 info->feature_flush = 0;
665 xlvbd_barrier(info); 652 xlvbd_flush(info);
666 } 653 }
667 /* fall through */ 654 /* fall through */
668 case BLKIF_OP_READ: 655 case BLKIF_OP_READ:
669 case BLKIF_OP_WRITE: 656 case BLKIF_OP_WRITE:
670 if (unlikely(bret->status != BLKIF_RSP_OKAY)) 657 if (unlikely(bret->status != BLKIF_RSP_OKAY))
671 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " 658 dev_dbg(&info->xbdev->dev, "Bad return from blkdev data "
672 "request: %x\n", bret->status); 659 "request: %x\n", bret->status);
673 660
674 __blk_end_request_all(req, error); 661 __blk_end_request_all(req, error);
675 break; 662 break;
676 default: 663 default:
677 BUG(); 664 BUG();
678 } 665 }
679 } 666 }
680 667
681 info->ring.rsp_cons = i; 668 info->ring.rsp_cons = i;
682 669
683 if (i != info->ring.req_prod_pvt) { 670 if (i != info->ring.req_prod_pvt) {
684 int more_to_do; 671 int more_to_do;
685 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); 672 RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do);
686 if (more_to_do) 673 if (more_to_do)
687 goto again; 674 goto again;
688 } else 675 } else
689 info->ring.sring->rsp_event = i + 1; 676 info->ring.sring->rsp_event = i + 1;
690 677
691 kick_pending_request_queues(info); 678 kick_pending_request_queues(info);
692 679
693 spin_unlock_irqrestore(&blkif_io_lock, flags); 680 spin_unlock_irqrestore(&blkif_io_lock, flags);
694 681
695 return IRQ_HANDLED; 682 return IRQ_HANDLED;
696 } 683 }
697 684
698 685
699 static int setup_blkring(struct xenbus_device *dev, 686 static int setup_blkring(struct xenbus_device *dev,
700 struct blkfront_info *info) 687 struct blkfront_info *info)
701 { 688 {
702 struct blkif_sring *sring; 689 struct blkif_sring *sring;
703 int err; 690 int err;
704 691
705 info->ring_ref = GRANT_INVALID_REF; 692 info->ring_ref = GRANT_INVALID_REF;
706 693
707 sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH); 694 sring = (struct blkif_sring *)__get_free_page(GFP_NOIO | __GFP_HIGH);
708 if (!sring) { 695 if (!sring) {
709 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); 696 xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring");
710 return -ENOMEM; 697 return -ENOMEM;
711 } 698 }
712 SHARED_RING_INIT(sring); 699 SHARED_RING_INIT(sring);
713 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); 700 FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
714 701
715 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST); 702 sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
716 703
717 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); 704 err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
718 if (err < 0) { 705 if (err < 0) {
719 free_page((unsigned long)sring); 706 free_page((unsigned long)sring);
720 info->ring.sring = NULL; 707 info->ring.sring = NULL;
721 goto fail; 708 goto fail;
722 } 709 }
723 info->ring_ref = err; 710 info->ring_ref = err;
724 711
725 err = xenbus_alloc_evtchn(dev, &info->evtchn); 712 err = xenbus_alloc_evtchn(dev, &info->evtchn);
726 if (err) 713 if (err)
727 goto fail; 714 goto fail;
728 715
729 err = bind_evtchn_to_irqhandler(info->evtchn, 716 err = bind_evtchn_to_irqhandler(info->evtchn,
730 blkif_interrupt, 717 blkif_interrupt,
731 IRQF_SAMPLE_RANDOM, "blkif", info); 718 IRQF_SAMPLE_RANDOM, "blkif", info);
732 if (err <= 0) { 719 if (err <= 0) {
733 xenbus_dev_fatal(dev, err, 720 xenbus_dev_fatal(dev, err,
734 "bind_evtchn_to_irqhandler failed"); 721 "bind_evtchn_to_irqhandler failed");
735 goto fail; 722 goto fail;
736 } 723 }
737 info->irq = err; 724 info->irq = err;
738 725
739 return 0; 726 return 0;
740 fail: 727 fail:
741 blkif_free(info, 0); 728 blkif_free(info, 0);
742 return err; 729 return err;
743 } 730 }
744 731
745 732
746 /* Common code used when first setting up, and when resuming. */ 733 /* Common code used when first setting up, and when resuming. */
747 static int talk_to_blkback(struct xenbus_device *dev, 734 static int talk_to_blkback(struct xenbus_device *dev,
748 struct blkfront_info *info) 735 struct blkfront_info *info)
749 { 736 {
750 const char *message = NULL; 737 const char *message = NULL;
751 struct xenbus_transaction xbt; 738 struct xenbus_transaction xbt;
752 int err; 739 int err;
753 740
754 /* Create shared ring, alloc event channel. */ 741 /* Create shared ring, alloc event channel. */
755 err = setup_blkring(dev, info); 742 err = setup_blkring(dev, info);
756 if (err) 743 if (err)
757 goto out; 744 goto out;
758 745
759 again: 746 again:
760 err = xenbus_transaction_start(&xbt); 747 err = xenbus_transaction_start(&xbt);
761 if (err) { 748 if (err) {
762 xenbus_dev_fatal(dev, err, "starting transaction"); 749 xenbus_dev_fatal(dev, err, "starting transaction");
763 goto destroy_blkring; 750 goto destroy_blkring;
764 } 751 }
765 752
766 err = xenbus_printf(xbt, dev->nodename, 753 err = xenbus_printf(xbt, dev->nodename,
767 "ring-ref", "%u", info->ring_ref); 754 "ring-ref", "%u", info->ring_ref);
768 if (err) { 755 if (err) {
769 message = "writing ring-ref"; 756 message = "writing ring-ref";
770 goto abort_transaction; 757 goto abort_transaction;
771 } 758 }
772 err = xenbus_printf(xbt, dev->nodename, 759 err = xenbus_printf(xbt, dev->nodename,
773 "event-channel", "%u", info->evtchn); 760 "event-channel", "%u", info->evtchn);
774 if (err) { 761 if (err) {
775 message = "writing event-channel"; 762 message = "writing event-channel";
776 goto abort_transaction; 763 goto abort_transaction;
777 } 764 }
778 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", 765 err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
779 XEN_IO_PROTO_ABI_NATIVE); 766 XEN_IO_PROTO_ABI_NATIVE);
780 if (err) { 767 if (err) {
781 message = "writing protocol"; 768 message = "writing protocol";
782 goto abort_transaction; 769 goto abort_transaction;
783 } 770 }
784 771
785 err = xenbus_transaction_end(xbt, 0); 772 err = xenbus_transaction_end(xbt, 0);
786 if (err) { 773 if (err) {
787 if (err == -EAGAIN) 774 if (err == -EAGAIN)
788 goto again; 775 goto again;
789 xenbus_dev_fatal(dev, err, "completing transaction"); 776 xenbus_dev_fatal(dev, err, "completing transaction");
790 goto destroy_blkring; 777 goto destroy_blkring;
791 } 778 }
792 779
793 xenbus_switch_state(dev, XenbusStateInitialised); 780 xenbus_switch_state(dev, XenbusStateInitialised);
794 781
795 return 0; 782 return 0;
796 783
797 abort_transaction: 784 abort_transaction:
798 xenbus_transaction_end(xbt, 1); 785 xenbus_transaction_end(xbt, 1);
799 if (message) 786 if (message)
800 xenbus_dev_fatal(dev, err, "%s", message); 787 xenbus_dev_fatal(dev, err, "%s", message);
801 destroy_blkring: 788 destroy_blkring:
802 blkif_free(info, 0); 789 blkif_free(info, 0);
803 out: 790 out:
804 return err; 791 return err;
805 } 792 }
806 793
807 /** 794 /**
808 * Entry point to this code when a new device is created. Allocate the basic 795 * Entry point to this code when a new device is created. Allocate the basic
809 * structures and the ring buffer for communication with the backend, and 796 * structures and the ring buffer for communication with the backend, and
810 * inform the backend of the appropriate details for those. Switch to 797 * inform the backend of the appropriate details for those. Switch to
811 * Initialised state. 798 * Initialised state.
812 */ 799 */
813 static int blkfront_probe(struct xenbus_device *dev, 800 static int blkfront_probe(struct xenbus_device *dev,
814 const struct xenbus_device_id *id) 801 const struct xenbus_device_id *id)
815 { 802 {
816 int err, vdevice, i; 803 int err, vdevice, i;
817 struct blkfront_info *info; 804 struct blkfront_info *info;
818 805
819 /* FIXME: Use dynamic device id if this is not set. */ 806 /* FIXME: Use dynamic device id if this is not set. */
820 err = xenbus_scanf(XBT_NIL, dev->nodename, 807 err = xenbus_scanf(XBT_NIL, dev->nodename,
821 "virtual-device", "%i", &vdevice); 808 "virtual-device", "%i", &vdevice);
822 if (err != 1) { 809 if (err != 1) {
823 /* go looking in the extended area instead */ 810 /* go looking in the extended area instead */
824 err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext", 811 err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
825 "%i", &vdevice); 812 "%i", &vdevice);
826 if (err != 1) { 813 if (err != 1) {
827 xenbus_dev_fatal(dev, err, "reading virtual-device"); 814 xenbus_dev_fatal(dev, err, "reading virtual-device");
828 return err; 815 return err;
829 } 816 }
830 } 817 }
831 818
832 if (xen_hvm_domain()) { 819 if (xen_hvm_domain()) {
833 char *type; 820 char *type;
834 int len; 821 int len;
835 /* no unplug has been done: do not hook devices != xen vbds */ 822 /* no unplug has been done: do not hook devices != xen vbds */
836 if (xen_platform_pci_unplug & XEN_UNPLUG_IGNORE) { 823 if (xen_platform_pci_unplug & XEN_UNPLUG_IGNORE) {
837 int major; 824 int major;
838 825
839 if (!VDEV_IS_EXTENDED(vdevice)) 826 if (!VDEV_IS_EXTENDED(vdevice))
840 major = BLKIF_MAJOR(vdevice); 827 major = BLKIF_MAJOR(vdevice);
841 else 828 else
842 major = XENVBD_MAJOR; 829 major = XENVBD_MAJOR;
843 830
844 if (major != XENVBD_MAJOR) { 831 if (major != XENVBD_MAJOR) {
845 printk(KERN_INFO 832 printk(KERN_INFO
846 "%s: HVM does not support vbd %d as xen block device\n", 833 "%s: HVM does not support vbd %d as xen block device\n",
847 __FUNCTION__, vdevice); 834 __FUNCTION__, vdevice);
848 return -ENODEV; 835 return -ENODEV;
849 } 836 }
850 } 837 }
851 /* do not create a PV cdrom device if we are an HVM guest */ 838 /* do not create a PV cdrom device if we are an HVM guest */
852 type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len); 839 type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
853 if (IS_ERR(type)) 840 if (IS_ERR(type))
854 return -ENODEV; 841 return -ENODEV;
855 if (strncmp(type, "cdrom", 5) == 0) { 842 if (strncmp(type, "cdrom", 5) == 0) {
856 kfree(type); 843 kfree(type);
857 return -ENODEV; 844 return -ENODEV;
858 } 845 }
859 kfree(type); 846 kfree(type);
860 } 847 }
861 info = kzalloc(sizeof(*info), GFP_KERNEL); 848 info = kzalloc(sizeof(*info), GFP_KERNEL);
862 if (!info) { 849 if (!info) {
863 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); 850 xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
864 return -ENOMEM; 851 return -ENOMEM;
865 } 852 }
866 853
867 mutex_init(&info->mutex); 854 mutex_init(&info->mutex);
868 info->xbdev = dev; 855 info->xbdev = dev;
869 info->vdevice = vdevice; 856 info->vdevice = vdevice;
870 info->connected = BLKIF_STATE_DISCONNECTED; 857 info->connected = BLKIF_STATE_DISCONNECTED;
871 INIT_WORK(&info->work, blkif_restart_queue); 858 INIT_WORK(&info->work, blkif_restart_queue);
872 859
873 for (i = 0; i < BLK_RING_SIZE; i++) 860 for (i = 0; i < BLK_RING_SIZE; i++)
874 info->shadow[i].req.id = i+1; 861 info->shadow[i].req.id = i+1;
875 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; 862 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
876 863
877 /* Front end dir is a number, which is used as the id. */ 864 /* Front end dir is a number, which is used as the id. */
878 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); 865 info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
879 dev_set_drvdata(&dev->dev, info); 866 dev_set_drvdata(&dev->dev, info);
880 867
881 err = talk_to_blkback(dev, info); 868 err = talk_to_blkback(dev, info);
882 if (err) { 869 if (err) {
883 kfree(info); 870 kfree(info);
884 dev_set_drvdata(&dev->dev, NULL); 871 dev_set_drvdata(&dev->dev, NULL);
885 return err; 872 return err;
886 } 873 }
887 874
888 return 0; 875 return 0;
889 } 876 }
890 877
891 878
892 static int blkif_recover(struct blkfront_info *info) 879 static int blkif_recover(struct blkfront_info *info)
893 { 880 {
894 int i; 881 int i;
895 struct blkif_request *req; 882 struct blkif_request *req;
896 struct blk_shadow *copy; 883 struct blk_shadow *copy;
897 int j; 884 int j;
898 885
899 /* Stage 1: Make a safe copy of the shadow state. */ 886 /* Stage 1: Make a safe copy of the shadow state. */
900 copy = kmalloc(sizeof(info->shadow), 887 copy = kmalloc(sizeof(info->shadow),
901 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); 888 GFP_NOIO | __GFP_REPEAT | __GFP_HIGH);
902 if (!copy) 889 if (!copy)
903 return -ENOMEM; 890 return -ENOMEM;
904 memcpy(copy, info->shadow, sizeof(info->shadow)); 891 memcpy(copy, info->shadow, sizeof(info->shadow));
905 892
906 /* Stage 2: Set up free list. */ 893 /* Stage 2: Set up free list. */
907 memset(&info->shadow, 0, sizeof(info->shadow)); 894 memset(&info->shadow, 0, sizeof(info->shadow));
908 for (i = 0; i < BLK_RING_SIZE; i++) 895 for (i = 0; i < BLK_RING_SIZE; i++)
909 info->shadow[i].req.id = i+1; 896 info->shadow[i].req.id = i+1;
910 info->shadow_free = info->ring.req_prod_pvt; 897 info->shadow_free = info->ring.req_prod_pvt;
911 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; 898 info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff;
912 899
913 /* Stage 3: Find pending requests and requeue them. */ 900 /* Stage 3: Find pending requests and requeue them. */
914 for (i = 0; i < BLK_RING_SIZE; i++) { 901 for (i = 0; i < BLK_RING_SIZE; i++) {
915 /* Not in use? */ 902 /* Not in use? */
916 if (copy[i].request == 0) 903 if (copy[i].request == 0)
917 continue; 904 continue;
918 905
919 /* Grab a request slot and copy shadow state into it. */ 906 /* Grab a request slot and copy shadow state into it. */
920 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); 907 req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
921 *req = copy[i].req; 908 *req = copy[i].req;
922 909
923 /* We get a new request id, and must reset the shadow state. */ 910 /* We get a new request id, and must reset the shadow state. */
924 req->id = get_id_from_freelist(info); 911 req->id = get_id_from_freelist(info);
925 memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i])); 912 memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
926 913
927 /* Rewrite any grant references invalidated by susp/resume. */ 914 /* Rewrite any grant references invalidated by susp/resume. */
928 for (j = 0; j < req->nr_segments; j++) 915 for (j = 0; j < req->nr_segments; j++)
929 gnttab_grant_foreign_access_ref( 916 gnttab_grant_foreign_access_ref(
930 req->seg[j].gref, 917 req->seg[j].gref,
931 info->xbdev->otherend_id, 918 info->xbdev->otherend_id,
932 pfn_to_mfn(info->shadow[req->id].frame[j]), 919 pfn_to_mfn(info->shadow[req->id].frame[j]),
933 rq_data_dir( 920 rq_data_dir(
934 (struct request *) 921 (struct request *)
935 info->shadow[req->id].request)); 922 info->shadow[req->id].request));
936 info->shadow[req->id].req = *req; 923 info->shadow[req->id].req = *req;
937 924
938 info->ring.req_prod_pvt++; 925 info->ring.req_prod_pvt++;
939 } 926 }
940 927
941 kfree(copy); 928 kfree(copy);
942 929
943 xenbus_switch_state(info->xbdev, XenbusStateConnected); 930 xenbus_switch_state(info->xbdev, XenbusStateConnected);
944 931
945 spin_lock_irq(&blkif_io_lock); 932 spin_lock_irq(&blkif_io_lock);
946 933
947 /* Now safe for us to use the shared ring */ 934 /* Now safe for us to use the shared ring */
948 info->connected = BLKIF_STATE_CONNECTED; 935 info->connected = BLKIF_STATE_CONNECTED;
949 936
950 /* Send off requeued requests */ 937 /* Send off requeued requests */
951 flush_requests(info); 938 flush_requests(info);
952 939
953 /* Kick any other new requests queued since we resumed */ 940 /* Kick any other new requests queued since we resumed */
954 kick_pending_request_queues(info); 941 kick_pending_request_queues(info);
955 942
956 spin_unlock_irq(&blkif_io_lock); 943 spin_unlock_irq(&blkif_io_lock);
957 944
958 return 0; 945 return 0;
959 } 946 }
960 947
961 /** 948 /**
962 * We are reconnecting to the backend, due to a suspend/resume, or a backend 949 * We are reconnecting to the backend, due to a suspend/resume, or a backend
963 * driver restart. We tear down our blkif structure and recreate it, but 950 * driver restart. We tear down our blkif structure and recreate it, but
964 * leave the device-layer structures intact so that this is transparent to the 951 * leave the device-layer structures intact so that this is transparent to the
965 * rest of the kernel. 952 * rest of the kernel.
966 */ 953 */
967 static int blkfront_resume(struct xenbus_device *dev) 954 static int blkfront_resume(struct xenbus_device *dev)
968 { 955 {
969 struct blkfront_info *info = dev_get_drvdata(&dev->dev); 956 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
970 int err; 957 int err;
971 958
972 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); 959 dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
973 960
974 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); 961 blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
975 962
976 err = talk_to_blkback(dev, info); 963 err = talk_to_blkback(dev, info);
977 if (info->connected == BLKIF_STATE_SUSPENDED && !err) 964 if (info->connected == BLKIF_STATE_SUSPENDED && !err)
978 err = blkif_recover(info); 965 err = blkif_recover(info);
979 966
980 return err; 967 return err;
981 } 968 }
982 969
983 static void 970 static void
984 blkfront_closing(struct blkfront_info *info) 971 blkfront_closing(struct blkfront_info *info)
985 { 972 {
986 struct xenbus_device *xbdev = info->xbdev; 973 struct xenbus_device *xbdev = info->xbdev;
987 struct block_device *bdev = NULL; 974 struct block_device *bdev = NULL;
988 975
989 mutex_lock(&info->mutex); 976 mutex_lock(&info->mutex);
990 977
991 if (xbdev->state == XenbusStateClosing) { 978 if (xbdev->state == XenbusStateClosing) {
992 mutex_unlock(&info->mutex); 979 mutex_unlock(&info->mutex);
993 return; 980 return;
994 } 981 }
995 982
996 if (info->gd) 983 if (info->gd)
997 bdev = bdget_disk(info->gd, 0); 984 bdev = bdget_disk(info->gd, 0);
998 985
999 mutex_unlock(&info->mutex); 986 mutex_unlock(&info->mutex);
1000 987
1001 if (!bdev) { 988 if (!bdev) {
1002 xenbus_frontend_closed(xbdev); 989 xenbus_frontend_closed(xbdev);
1003 return; 990 return;
1004 } 991 }
1005 992
1006 mutex_lock(&bdev->bd_mutex); 993 mutex_lock(&bdev->bd_mutex);
1007 994
1008 if (bdev->bd_openers) { 995 if (bdev->bd_openers) {
1009 xenbus_dev_error(xbdev, -EBUSY, 996 xenbus_dev_error(xbdev, -EBUSY,
1010 "Device in use; refusing to close"); 997 "Device in use; refusing to close");
1011 xenbus_switch_state(xbdev, XenbusStateClosing); 998 xenbus_switch_state(xbdev, XenbusStateClosing);
1012 } else { 999 } else {
1013 xlvbd_release_gendisk(info); 1000 xlvbd_release_gendisk(info);
1014 xenbus_frontend_closed(xbdev); 1001 xenbus_frontend_closed(xbdev);
1015 } 1002 }
1016 1003
1017 mutex_unlock(&bdev->bd_mutex); 1004 mutex_unlock(&bdev->bd_mutex);
1018 bdput(bdev); 1005 bdput(bdev);
1019 } 1006 }
1020 1007
1021 /* 1008 /*
1022 * Invoked when the backend is finally 'ready' (and has produced 1009 * Invoked when the backend is finally 'ready' (and has produced
1023 * the details about the physical device - #sectors, size, etc). 1010 * the details about the physical device - #sectors, size, etc).
1024 */ 1011 */
1025 static void blkfront_connect(struct blkfront_info *info) 1012 static void blkfront_connect(struct blkfront_info *info)
1026 { 1013 {
1027 unsigned long long sectors; 1014 unsigned long long sectors;
1028 unsigned long sector_size; 1015 unsigned long sector_size;
1029 unsigned int binfo; 1016 unsigned int binfo;
1030 int err; 1017 int err;
1031 int barrier; 1018 int barrier;
1032 1019
1033 switch (info->connected) { 1020 switch (info->connected) {
1034 case BLKIF_STATE_CONNECTED: 1021 case BLKIF_STATE_CONNECTED:
1035 /* 1022 /*
1036 * Potentially, the back-end may be signalling 1023 * Potentially, the back-end may be signalling
1037 * a capacity change; update the capacity. 1024 * a capacity change; update the capacity.
1038 */ 1025 */
1039 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, 1026 err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
1040 "sectors", "%Lu", &sectors); 1027 "sectors", "%Lu", &sectors);
1041 if (XENBUS_EXIST_ERR(err)) 1028 if (XENBUS_EXIST_ERR(err))
1042 return; 1029 return;
1043 printk(KERN_INFO "Setting capacity to %Lu\n", 1030 printk(KERN_INFO "Setting capacity to %Lu\n",
1044 sectors); 1031 sectors);
1045 set_capacity(info->gd, sectors); 1032 set_capacity(info->gd, sectors);
1046 revalidate_disk(info->gd); 1033 revalidate_disk(info->gd);
1047 1034
1048 /* fall through */ 1035 /* fall through */
1049 case BLKIF_STATE_SUSPENDED: 1036 case BLKIF_STATE_SUSPENDED:
1050 return; 1037 return;
1051 1038
1052 default: 1039 default:
1053 break; 1040 break;
1054 } 1041 }
1055 1042
1056 dev_dbg(&info->xbdev->dev, "%s:%s.\n", 1043 dev_dbg(&info->xbdev->dev, "%s:%s.\n",
1057 __func__, info->xbdev->otherend); 1044 __func__, info->xbdev->otherend);
1058 1045
1059 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1046 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1060 "sectors", "%llu", &sectors, 1047 "sectors", "%llu", &sectors,
1061 "info", "%u", &binfo, 1048 "info", "%u", &binfo,
1062 "sector-size", "%lu", &sector_size, 1049 "sector-size", "%lu", &sector_size,
1063 NULL); 1050 NULL);
1064 if (err) { 1051 if (err) {
1065 xenbus_dev_fatal(info->xbdev, err, 1052 xenbus_dev_fatal(info->xbdev, err,
1066 "reading backend fields at %s", 1053 "reading backend fields at %s",
1067 info->xbdev->otherend); 1054 info->xbdev->otherend);
1068 return; 1055 return;
1069 } 1056 }
1070 1057
1071 err = xenbus_gather(XBT_NIL, info->xbdev->otherend, 1058 err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
1072 "feature-barrier", "%lu", &barrier, 1059 "feature-barrier", "%lu", &barrier,
1073 NULL); 1060 NULL);
1074 1061
1075 /* 1062 /*
1076 * If there's no "feature-barrier" defined, then it means 1063 * If there's no "feature-barrier" defined, then it means
1077 * we're dealing with a very old backend which writes 1064 * we're dealing with a very old backend which writes
1078 * synchronously; draining will do what needs to get done. 1065 * synchronously; nothing to do.
1079 * 1066 *
1080 * If there are barriers, then we use flush. 1067 * If there are barriers, then we use flush.
1081 *
1082 * If barriers are not supported, then there's no much we can
1083 * do, so just set ordering to NONE.
1084 */ 1068 */
1085 if (err) 1069 info->feature_flush = 0;
1086 info->feature_barrier = QUEUE_ORDERED_DRAIN; 1070 if (!err && barrier)
1087 else if (barrier) 1071 info->feature_flush = REQ_FLUSH;
1088 info->feature_barrier = QUEUE_ORDERED_DRAIN_FLUSH;
1089 else
1090 info->feature_barrier = QUEUE_ORDERED_NONE;
1091 1072
1092 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); 1073 err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
1093 if (err) { 1074 if (err) {
1094 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", 1075 xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
1095 info->xbdev->otherend); 1076 info->xbdev->otherend);
1096 return; 1077 return;
1097 } 1078 }
1098 1079
1099 xenbus_switch_state(info->xbdev, XenbusStateConnected); 1080 xenbus_switch_state(info->xbdev, XenbusStateConnected);
1100 1081
1101 /* Kick pending requests. */ 1082 /* Kick pending requests. */
1102 spin_lock_irq(&blkif_io_lock); 1083 spin_lock_irq(&blkif_io_lock);
1103 info->connected = BLKIF_STATE_CONNECTED; 1084 info->connected = BLKIF_STATE_CONNECTED;
1104 kick_pending_request_queues(info); 1085 kick_pending_request_queues(info);
1105 spin_unlock_irq(&blkif_io_lock); 1086 spin_unlock_irq(&blkif_io_lock);
1106 1087
1107 add_disk(info->gd); 1088 add_disk(info->gd);
1108 1089
1109 info->is_ready = 1; 1090 info->is_ready = 1;
1110 } 1091 }
1111 1092
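As a hedged sketch of how the feature_flush value gathered above reaches the block layer: the request-queue setup for the new gendisk is outside this hunk, so the wrapper name and wiring below are illustrative placeholders rather than code from the patch.

	static void example_advertise_flush(struct request_queue *rq,
					    struct blkfront_info *info)
	{
		/* 0 = no cache management; REQ_FLUSH = backend can flush its write cache */
		blk_queue_flush(rq, info->feature_flush);
	}

The driver no longer picks an ordered mode; it only states which cache-control operation it can honour, and the block layer does the rest.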
1112 /** 1093 /**
1113 * Callback received when the backend's state changes. 1094 * Callback received when the backend's state changes.
1114 */ 1095 */
1115 static void blkback_changed(struct xenbus_device *dev, 1096 static void blkback_changed(struct xenbus_device *dev,
1116 enum xenbus_state backend_state) 1097 enum xenbus_state backend_state)
1117 { 1098 {
1118 struct blkfront_info *info = dev_get_drvdata(&dev->dev); 1099 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1119 1100
1120 dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state); 1101 dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
1121 1102
1122 switch (backend_state) { 1103 switch (backend_state) {
1123 case XenbusStateInitialising: 1104 case XenbusStateInitialising:
1124 case XenbusStateInitWait: 1105 case XenbusStateInitWait:
1125 case XenbusStateInitialised: 1106 case XenbusStateInitialised:
1126 case XenbusStateUnknown: 1107 case XenbusStateUnknown:
1127 case XenbusStateClosed: 1108 case XenbusStateClosed:
1128 break; 1109 break;
1129 1110
1130 case XenbusStateConnected: 1111 case XenbusStateConnected:
1131 blkfront_connect(info); 1112 blkfront_connect(info);
1132 break; 1113 break;
1133 1114
1134 case XenbusStateClosing: 1115 case XenbusStateClosing:
1135 blkfront_closing(info); 1116 blkfront_closing(info);
1136 break; 1117 break;
1137 } 1118 }
1138 } 1119 }
1139 1120
1140 static int blkfront_remove(struct xenbus_device *xbdev) 1121 static int blkfront_remove(struct xenbus_device *xbdev)
1141 { 1122 {
1142 struct blkfront_info *info = dev_get_drvdata(&xbdev->dev); 1123 struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
1143 struct block_device *bdev = NULL; 1124 struct block_device *bdev = NULL;
1144 struct gendisk *disk; 1125 struct gendisk *disk;
1145 1126
1146 dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename); 1127 dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
1147 1128
1148 blkif_free(info, 0); 1129 blkif_free(info, 0);
1149 1130
1150 mutex_lock(&info->mutex); 1131 mutex_lock(&info->mutex);
1151 1132
1152 disk = info->gd; 1133 disk = info->gd;
1153 if (disk) 1134 if (disk)
1154 bdev = bdget_disk(disk, 0); 1135 bdev = bdget_disk(disk, 0);
1155 1136
1156 info->xbdev = NULL; 1137 info->xbdev = NULL;
1157 mutex_unlock(&info->mutex); 1138 mutex_unlock(&info->mutex);
1158 1139
1159 if (!bdev) { 1140 if (!bdev) {
1160 kfree(info); 1141 kfree(info);
1161 return 0; 1142 return 0;
1162 } 1143 }
1163 1144
1164 /* 1145 /*
1165 * The xbdev was removed before we reached the Closed 1146 * The xbdev was removed before we reached the Closed
1166 * state. See if it's safe to remove the disk. If the bdev 1147 * state. See if it's safe to remove the disk. If the bdev
1167 * isn't closed yet, we let release take care of it. 1148 * isn't closed yet, we let release take care of it.
1168 */ 1149 */
1169 1150
1170 mutex_lock(&bdev->bd_mutex); 1151 mutex_lock(&bdev->bd_mutex);
1171 info = disk->private_data; 1152 info = disk->private_data;
1172 1153
1173 dev_warn(disk_to_dev(disk), 1154 dev_warn(disk_to_dev(disk),
1174 "%s was hot-unplugged, %d stale handles\n", 1155 "%s was hot-unplugged, %d stale handles\n",
1175 xbdev->nodename, bdev->bd_openers); 1156 xbdev->nodename, bdev->bd_openers);
1176 1157
1177 if (info && !bdev->bd_openers) { 1158 if (info && !bdev->bd_openers) {
1178 xlvbd_release_gendisk(info); 1159 xlvbd_release_gendisk(info);
1179 disk->private_data = NULL; 1160 disk->private_data = NULL;
1180 kfree(info); 1161 kfree(info);
1181 } 1162 }
1182 1163
1183 mutex_unlock(&bdev->bd_mutex); 1164 mutex_unlock(&bdev->bd_mutex);
1184 bdput(bdev); 1165 bdput(bdev);
1185 1166
1186 return 0; 1167 return 0;
1187 } 1168 }
1188 1169
1189 static int blkfront_is_ready(struct xenbus_device *dev) 1170 static int blkfront_is_ready(struct xenbus_device *dev)
1190 { 1171 {
1191 struct blkfront_info *info = dev_get_drvdata(&dev->dev); 1172 struct blkfront_info *info = dev_get_drvdata(&dev->dev);
1192 1173
1193 return info->is_ready && info->xbdev; 1174 return info->is_ready && info->xbdev;
1194 } 1175 }
1195 1176
1196 static int blkif_open(struct block_device *bdev, fmode_t mode) 1177 static int blkif_open(struct block_device *bdev, fmode_t mode)
1197 { 1178 {
1198 struct gendisk *disk = bdev->bd_disk; 1179 struct gendisk *disk = bdev->bd_disk;
1199 struct blkfront_info *info; 1180 struct blkfront_info *info;
1200 int err = 0; 1181 int err = 0;
1201 1182
1202 lock_kernel(); 1183 lock_kernel();
1203 1184
1204 info = disk->private_data; 1185 info = disk->private_data;
1205 if (!info) { 1186 if (!info) {
1206 /* xbdev gone */ 1187 /* xbdev gone */
1207 err = -ERESTARTSYS; 1188 err = -ERESTARTSYS;
1208 goto out; 1189 goto out;
1209 } 1190 }
1210 1191
1211 mutex_lock(&info->mutex); 1192 mutex_lock(&info->mutex);
1212 1193
1213 if (!info->gd) 1194 if (!info->gd)
1214 /* xbdev is closed */ 1195 /* xbdev is closed */
1215 err = -ERESTARTSYS; 1196 err = -ERESTARTSYS;
1216 1197
1217 mutex_unlock(&info->mutex); 1198 mutex_unlock(&info->mutex);
1218 1199
1219 out: 1200 out:
1220 unlock_kernel(); 1201 unlock_kernel();
1221 return err; 1202 return err;
1222 } 1203 }
1223 1204
1224 static int blkif_release(struct gendisk *disk, fmode_t mode) 1205 static int blkif_release(struct gendisk *disk, fmode_t mode)
1225 { 1206 {
1226 struct blkfront_info *info = disk->private_data; 1207 struct blkfront_info *info = disk->private_data;
1227 struct block_device *bdev; 1208 struct block_device *bdev;
1228 struct xenbus_device *xbdev; 1209 struct xenbus_device *xbdev;
1229 1210
1230 lock_kernel(); 1211 lock_kernel();
1231 1212
1232 bdev = bdget_disk(disk, 0); 1213 bdev = bdget_disk(disk, 0);
1233 bdput(bdev); 1214 bdput(bdev);
1234 1215
1235 if (bdev->bd_openers) 1216 if (bdev->bd_openers)
1236 goto out; 1217 goto out;
1237 1218
1238 /* 1219 /*
1239 * Check if we have been instructed to close. We will have 1220 * Check if we have been instructed to close. We will have
1240 * deferred this request, because the bdev was still open. 1221 * deferred this request, because the bdev was still open.
1241 */ 1222 */
1242 1223
1243 mutex_lock(&info->mutex); 1224 mutex_lock(&info->mutex);
1244 xbdev = info->xbdev; 1225 xbdev = info->xbdev;
1245 1226
1246 if (xbdev && xbdev->state == XenbusStateClosing) { 1227 if (xbdev && xbdev->state == XenbusStateClosing) {
1247 /* pending switch to state closed */ 1228 /* pending switch to state closed */
1248 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); 1229 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
1249 xlvbd_release_gendisk(info); 1230 xlvbd_release_gendisk(info);
1250 xenbus_frontend_closed(info->xbdev); 1231 xenbus_frontend_closed(info->xbdev);
1251 } 1232 }
1252 1233
1253 mutex_unlock(&info->mutex); 1234 mutex_unlock(&info->mutex);
1254 1235
1255 if (!xbdev) { 1236 if (!xbdev) {
1256 /* sudden device removal */ 1237 /* sudden device removal */
1257 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n"); 1238 dev_info(disk_to_dev(bdev->bd_disk), "releasing disk\n");
1258 xlvbd_release_gendisk(info); 1239 xlvbd_release_gendisk(info);
1259 disk->private_data = NULL; 1240 disk->private_data = NULL;
1260 kfree(info); 1241 kfree(info);
1261 } 1242 }
1262 1243
1263 out: 1244 out:
1264 unlock_kernel(); 1245 unlock_kernel();
1265 return 0; 1246 return 0;
1266 } 1247 }
1267 1248
1268 static const struct block_device_operations xlvbd_block_fops = 1249 static const struct block_device_operations xlvbd_block_fops =
1269 { 1250 {
1270 .owner = THIS_MODULE, 1251 .owner = THIS_MODULE,
1271 .open = blkif_open, 1252 .open = blkif_open,
1272 .release = blkif_release, 1253 .release = blkif_release,
1273 .getgeo = blkif_getgeo, 1254 .getgeo = blkif_getgeo,
1274 .ioctl = blkif_ioctl, 1255 .ioctl = blkif_ioctl,
1275 }; 1256 };
1276 1257
1277 1258
1278 static const struct xenbus_device_id blkfront_ids[] = { 1259 static const struct xenbus_device_id blkfront_ids[] = {
1279 { "vbd" }, 1260 { "vbd" },
1280 { "" } 1261 { "" }
1281 }; 1262 };
1282 1263
1283 static struct xenbus_driver blkfront = { 1264 static struct xenbus_driver blkfront = {
1284 .name = "vbd", 1265 .name = "vbd",
1285 .owner = THIS_MODULE, 1266 .owner = THIS_MODULE,
1286 .ids = blkfront_ids, 1267 .ids = blkfront_ids,
1287 .probe = blkfront_probe, 1268 .probe = blkfront_probe,
1288 .remove = blkfront_remove, 1269 .remove = blkfront_remove,
1289 .resume = blkfront_resume, 1270 .resume = blkfront_resume,
1290 .otherend_changed = blkback_changed, 1271 .otherend_changed = blkback_changed,
1291 .is_ready = blkfront_is_ready, 1272 .is_ready = blkfront_is_ready,
1292 }; 1273 };
1293 1274
1294 static int __init xlblk_init(void) 1275 static int __init xlblk_init(void)
1295 { 1276 {
1296 if (!xen_domain()) 1277 if (!xen_domain())
1297 return -ENODEV; 1278 return -ENODEV;
1298 1279
1299 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { 1280 if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
1300 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", 1281 printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
1301 XENVBD_MAJOR, DEV_NAME); 1282 XENVBD_MAJOR, DEV_NAME);
1302 return -ENODEV; 1283 return -ENODEV;
1303 } 1284 }
1304 1285
1305 return xenbus_register_frontend(&blkfront); 1286 return xenbus_register_frontend(&blkfront);
1306 } 1287 }
1307 module_init(xlblk_init); 1288 module_init(xlblk_init);
1308 1289
1309 1290
1310 static void __exit xlblk_exit(void) 1291 static void __exit xlblk_exit(void)
1311 { 1292 {
1312 return xenbus_unregister_driver(&blkfront); 1293 return xenbus_unregister_driver(&blkfront);
1313 } 1294 }
1314 module_exit(xlblk_exit); 1295 module_exit(xlblk_exit);
1315 1296
1316 MODULE_DESCRIPTION("Xen virtual block device frontend"); 1297 MODULE_DESCRIPTION("Xen virtual block device frontend");
1317 MODULE_LICENSE("GPL"); 1298 MODULE_LICENSE("GPL");
1318 MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR); 1299 MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
1319 MODULE_ALIAS("xen:vbd"); 1300 MODULE_ALIAS("xen:vbd");
1320 MODULE_ALIAS("xenblk"); 1301 MODULE_ALIAS("xenblk");
1321 1302
drivers/ide/ide-disk.c
1 /* 1 /*
2 * Copyright (C) 1994-1998 Linus Torvalds & authors (see below) 2 * Copyright (C) 1994-1998 Linus Torvalds & authors (see below)
3 * Copyright (C) 1998-2002 Linux ATA Development 3 * Copyright (C) 1998-2002 Linux ATA Development
4 * Andre Hedrick <andre@linux-ide.org> 4 * Andre Hedrick <andre@linux-ide.org>
5 * Copyright (C) 2003 Red Hat 5 * Copyright (C) 2003 Red Hat
6 * Copyright (C) 2003-2005, 2007 Bartlomiej Zolnierkiewicz 6 * Copyright (C) 2003-2005, 2007 Bartlomiej Zolnierkiewicz
7 */ 7 */
8 8
9 /* 9 /*
10 * Mostly written by Mark Lord <mlord@pobox.com> 10 * Mostly written by Mark Lord <mlord@pobox.com>
11 * and Gadi Oxman <gadio@netvision.net.il> 11 * and Gadi Oxman <gadio@netvision.net.il>
12 * and Andre Hedrick <andre@linux-ide.org> 12 * and Andre Hedrick <andre@linux-ide.org>
13 * 13 *
14 * This is the IDE/ATA disk driver, as evolved from hd.c and ide.c. 14 * This is the IDE/ATA disk driver, as evolved from hd.c and ide.c.
15 */ 15 */
16 16
17 #include <linux/types.h> 17 #include <linux/types.h>
18 #include <linux/string.h> 18 #include <linux/string.h>
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/timer.h> 20 #include <linux/timer.h>
21 #include <linux/mm.h> 21 #include <linux/mm.h>
22 #include <linux/interrupt.h> 22 #include <linux/interrupt.h>
23 #include <linux/major.h> 23 #include <linux/major.h>
24 #include <linux/errno.h> 24 #include <linux/errno.h>
25 #include <linux/genhd.h> 25 #include <linux/genhd.h>
26 #include <linux/slab.h> 26 #include <linux/slab.h>
27 #include <linux/delay.h> 27 #include <linux/delay.h>
28 #include <linux/mutex.h> 28 #include <linux/mutex.h>
29 #include <linux/leds.h> 29 #include <linux/leds.h>
30 #include <linux/ide.h> 30 #include <linux/ide.h>
31 31
32 #include <asm/byteorder.h> 32 #include <asm/byteorder.h>
33 #include <asm/irq.h> 33 #include <asm/irq.h>
34 #include <asm/uaccess.h> 34 #include <asm/uaccess.h>
35 #include <asm/io.h> 35 #include <asm/io.h>
36 #include <asm/div64.h> 36 #include <asm/div64.h>
37 37
38 #include "ide-disk.h" 38 #include "ide-disk.h"
39 39
40 static const u8 ide_rw_cmds[] = { 40 static const u8 ide_rw_cmds[] = {
41 ATA_CMD_READ_MULTI, 41 ATA_CMD_READ_MULTI,
42 ATA_CMD_WRITE_MULTI, 42 ATA_CMD_WRITE_MULTI,
43 ATA_CMD_READ_MULTI_EXT, 43 ATA_CMD_READ_MULTI_EXT,
44 ATA_CMD_WRITE_MULTI_EXT, 44 ATA_CMD_WRITE_MULTI_EXT,
45 ATA_CMD_PIO_READ, 45 ATA_CMD_PIO_READ,
46 ATA_CMD_PIO_WRITE, 46 ATA_CMD_PIO_WRITE,
47 ATA_CMD_PIO_READ_EXT, 47 ATA_CMD_PIO_READ_EXT,
48 ATA_CMD_PIO_WRITE_EXT, 48 ATA_CMD_PIO_WRITE_EXT,
49 ATA_CMD_READ, 49 ATA_CMD_READ,
50 ATA_CMD_WRITE, 50 ATA_CMD_WRITE,
51 ATA_CMD_READ_EXT, 51 ATA_CMD_READ_EXT,
52 ATA_CMD_WRITE_EXT, 52 ATA_CMD_WRITE_EXT,
53 }; 53 };
54 54
55 static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma) 55 static void ide_tf_set_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 dma)
56 { 56 {
57 u8 index, lba48, write; 57 u8 index, lba48, write;
58 58
59 lba48 = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0; 59 lba48 = (cmd->tf_flags & IDE_TFLAG_LBA48) ? 2 : 0;
60 write = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0; 60 write = (cmd->tf_flags & IDE_TFLAG_WRITE) ? 1 : 0;
61 61
62 if (dma) { 62 if (dma) {
63 cmd->protocol = ATA_PROT_DMA; 63 cmd->protocol = ATA_PROT_DMA;
64 index = 8; 64 index = 8;
65 } else { 65 } else {
66 cmd->protocol = ATA_PROT_PIO; 66 cmd->protocol = ATA_PROT_PIO;
67 if (drive->mult_count) { 67 if (drive->mult_count) {
68 cmd->tf_flags |= IDE_TFLAG_MULTI_PIO; 68 cmd->tf_flags |= IDE_TFLAG_MULTI_PIO;
69 index = 0; 69 index = 0;
70 } else 70 } else
71 index = 4; 71 index = 4;
72 } 72 }
73 73
74 cmd->tf.command = ide_rw_cmds[index + lba48 + write]; 74 cmd->tf.command = ide_rw_cmds[index + lba48 + write];
75 } 75 }
76 76
77 /* 77 /*
78 * __ide_do_rw_disk() issues READ and WRITE commands to a disk, 78 * __ide_do_rw_disk() issues READ and WRITE commands to a disk,
79 * using LBA if supported, or CHS otherwise, to address sectors. 79 * using LBA if supported, or CHS otherwise, to address sectors.
80 */ 80 */
81 static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, 81 static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
82 sector_t block) 82 sector_t block)
83 { 83 {
84 ide_hwif_t *hwif = drive->hwif; 84 ide_hwif_t *hwif = drive->hwif;
85 u16 nsectors = (u16)blk_rq_sectors(rq); 85 u16 nsectors = (u16)blk_rq_sectors(rq);
86 u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48); 86 u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48);
87 u8 dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); 87 u8 dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
88 struct ide_cmd cmd; 88 struct ide_cmd cmd;
89 struct ide_taskfile *tf = &cmd.tf; 89 struct ide_taskfile *tf = &cmd.tf;
90 ide_startstop_t rc; 90 ide_startstop_t rc;
91 91
92 if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) { 92 if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) {
93 if (block + blk_rq_sectors(rq) > 1ULL << 28) 93 if (block + blk_rq_sectors(rq) > 1ULL << 28)
94 dma = 0; 94 dma = 0;
95 else 95 else
96 lba48 = 0; 96 lba48 = 0;
97 } 97 }
98 98
99 memset(&cmd, 0, sizeof(cmd)); 99 memset(&cmd, 0, sizeof(cmd));
100 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 100 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
101 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 101 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
102 102
103 if (drive->dev_flags & IDE_DFLAG_LBA) { 103 if (drive->dev_flags & IDE_DFLAG_LBA) {
104 if (lba48) { 104 if (lba48) {
105 pr_debug("%s: LBA=0x%012llx\n", drive->name, 105 pr_debug("%s: LBA=0x%012llx\n", drive->name,
106 (unsigned long long)block); 106 (unsigned long long)block);
107 107
108 tf->nsect = nsectors & 0xff; 108 tf->nsect = nsectors & 0xff;
109 tf->lbal = (u8) block; 109 tf->lbal = (u8) block;
110 tf->lbam = (u8)(block >> 8); 110 tf->lbam = (u8)(block >> 8);
111 tf->lbah = (u8)(block >> 16); 111 tf->lbah = (u8)(block >> 16);
112 tf->device = ATA_LBA; 112 tf->device = ATA_LBA;
113 113
114 tf = &cmd.hob; 114 tf = &cmd.hob;
115 tf->nsect = (nsectors >> 8) & 0xff; 115 tf->nsect = (nsectors >> 8) & 0xff;
116 tf->lbal = (u8)(block >> 24); 116 tf->lbal = (u8)(block >> 24);
117 if (sizeof(block) != 4) { 117 if (sizeof(block) != 4) {
118 tf->lbam = (u8)((u64)block >> 32); 118 tf->lbam = (u8)((u64)block >> 32);
119 tf->lbah = (u8)((u64)block >> 40); 119 tf->lbah = (u8)((u64)block >> 40);
120 } 120 }
121 121
122 cmd.valid.out.hob = IDE_VALID_OUT_HOB; 122 cmd.valid.out.hob = IDE_VALID_OUT_HOB;
123 cmd.valid.in.hob = IDE_VALID_IN_HOB; 123 cmd.valid.in.hob = IDE_VALID_IN_HOB;
124 cmd.tf_flags |= IDE_TFLAG_LBA48; 124 cmd.tf_flags |= IDE_TFLAG_LBA48;
125 } else { 125 } else {
126 tf->nsect = nsectors & 0xff; 126 tf->nsect = nsectors & 0xff;
127 tf->lbal = block; 127 tf->lbal = block;
128 tf->lbam = block >>= 8; 128 tf->lbam = block >>= 8;
129 tf->lbah = block >>= 8; 129 tf->lbah = block >>= 8;
130 tf->device = ((block >> 8) & 0xf) | ATA_LBA; 130 tf->device = ((block >> 8) & 0xf) | ATA_LBA;
131 } 131 }
132 } else { 132 } else {
133 unsigned int sect, head, cyl, track; 133 unsigned int sect, head, cyl, track;
134 134
135 track = (int)block / drive->sect; 135 track = (int)block / drive->sect;
136 sect = (int)block % drive->sect + 1; 136 sect = (int)block % drive->sect + 1;
137 head = track % drive->head; 137 head = track % drive->head;
138 cyl = track / drive->head; 138 cyl = track / drive->head;
139 139
140 pr_debug("%s: CHS=%u/%u/%u\n", drive->name, cyl, head, sect); 140 pr_debug("%s: CHS=%u/%u/%u\n", drive->name, cyl, head, sect);
141 141
142 tf->nsect = nsectors & 0xff; 142 tf->nsect = nsectors & 0xff;
143 tf->lbal = sect; 143 tf->lbal = sect;
144 tf->lbam = cyl; 144 tf->lbam = cyl;
145 tf->lbah = cyl >> 8; 145 tf->lbah = cyl >> 8;
146 tf->device = head; 146 tf->device = head;
147 } 147 }
148 148
149 cmd.tf_flags |= IDE_TFLAG_FS; 149 cmd.tf_flags |= IDE_TFLAG_FS;
150 150
151 if (rq_data_dir(rq)) 151 if (rq_data_dir(rq))
152 cmd.tf_flags |= IDE_TFLAG_WRITE; 152 cmd.tf_flags |= IDE_TFLAG_WRITE;
153 153
154 ide_tf_set_cmd(drive, &cmd, dma); 154 ide_tf_set_cmd(drive, &cmd, dma);
155 cmd.rq = rq; 155 cmd.rq = rq;
156 156
157 if (dma == 0) { 157 if (dma == 0) {
158 ide_init_sg_cmd(&cmd, nsectors << 9); 158 ide_init_sg_cmd(&cmd, nsectors << 9);
159 ide_map_sg(drive, &cmd); 159 ide_map_sg(drive, &cmd);
160 } 160 }
161 161
162 rc = do_rw_taskfile(drive, &cmd); 162 rc = do_rw_taskfile(drive, &cmd);
163 163
164 if (rc == ide_stopped && dma) { 164 if (rc == ide_stopped && dma) {
165 /* fallback to PIO */ 165 /* fallback to PIO */
166 cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK; 166 cmd.tf_flags |= IDE_TFLAG_DMA_PIO_FALLBACK;
167 ide_tf_set_cmd(drive, &cmd, 0); 167 ide_tf_set_cmd(drive, &cmd, 0);
168 ide_init_sg_cmd(&cmd, nsectors << 9); 168 ide_init_sg_cmd(&cmd, nsectors << 9);
169 rc = do_rw_taskfile(drive, &cmd); 169 rc = do_rw_taskfile(drive, &cmd);
170 } 170 }
171 171
172 return rc; 172 return rc;
173 } 173 }
174 174
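A worked example of the LBA48 split performed above (values chosen for illustration only): for block = 0x0123456789AB and nsectors = 0x0120, the assignments yield

	tf.nsect = 0x20     hob.nsect = 0x01
	tf.lbal  = 0xAB     hob.lbal  = 0x45
	tf.lbam  = 0x89     hob.lbam  = 0x23
	tf.lbah  = 0x67     hob.lbah  = 0x01

so the low three LBA bytes and the low sector-count byte land in the base taskfile, while the upper bytes go into the HOB (high order byte) registers used by 48-bit commands.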
175 /* 175 /*
176 * 268435455 == 137439 MB or 28bit limit 176 * 268435455 == 137439 MB or 28bit limit
177 * 320173056 == 163929 MB or 48bit addressing 177 * 320173056 == 163929 MB or 48bit addressing
178 * 1073741822 == 549756 MB or 48bit addressing fake drive 178 * 1073741822 == 549756 MB or 48bit addressing fake drive
179 */ 179 */
180 180
181 static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, 181 static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq,
182 sector_t block) 182 sector_t block)
183 { 183 {
184 ide_hwif_t *hwif = drive->hwif; 184 ide_hwif_t *hwif = drive->hwif;
185 185
186 BUG_ON(drive->dev_flags & IDE_DFLAG_BLOCKED); 186 BUG_ON(drive->dev_flags & IDE_DFLAG_BLOCKED);
187 BUG_ON(rq->cmd_type != REQ_TYPE_FS); 187 BUG_ON(rq->cmd_type != REQ_TYPE_FS);
188 188
189 ledtrig_ide_activity(); 189 ledtrig_ide_activity();
190 190
191 pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", 191 pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n",
192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ", 192 drive->name, rq_data_dir(rq) == READ ? "read" : "writ",
193 (unsigned long long)block, blk_rq_sectors(rq), 193 (unsigned long long)block, blk_rq_sectors(rq),
194 (unsigned long)rq->buffer); 194 (unsigned long)rq->buffer);
195 195
196 if (hwif->rw_disk) 196 if (hwif->rw_disk)
197 hwif->rw_disk(drive, rq); 197 hwif->rw_disk(drive, rq);
198 198
199 return __ide_do_rw_disk(drive, rq, block); 199 return __ide_do_rw_disk(drive, rq, block);
200 } 200 }
201 201
202 /* 202 /*
203 * Queries for true maximum capacity of the drive. 203 * Queries for true maximum capacity of the drive.
204 * Returns maximum LBA address (> 0) of the drive, 0 if failed. 204 * Returns maximum LBA address (> 0) of the drive, 0 if failed.
205 */ 205 */
206 static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48) 206 static u64 idedisk_read_native_max_address(ide_drive_t *drive, int lba48)
207 { 207 {
208 struct ide_cmd cmd; 208 struct ide_cmd cmd;
209 struct ide_taskfile *tf = &cmd.tf; 209 struct ide_taskfile *tf = &cmd.tf;
210 u64 addr = 0; 210 u64 addr = 0;
211 211
212 memset(&cmd, 0, sizeof(cmd)); 212 memset(&cmd, 0, sizeof(cmd));
213 if (lba48) 213 if (lba48)
214 tf->command = ATA_CMD_READ_NATIVE_MAX_EXT; 214 tf->command = ATA_CMD_READ_NATIVE_MAX_EXT;
215 else 215 else
216 tf->command = ATA_CMD_READ_NATIVE_MAX; 216 tf->command = ATA_CMD_READ_NATIVE_MAX;
217 tf->device = ATA_LBA; 217 tf->device = ATA_LBA;
218 218
219 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 219 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
220 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 220 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
221 if (lba48) { 221 if (lba48) {
222 cmd.valid.out.hob = IDE_VALID_OUT_HOB; 222 cmd.valid.out.hob = IDE_VALID_OUT_HOB;
223 cmd.valid.in.hob = IDE_VALID_IN_HOB; 223 cmd.valid.in.hob = IDE_VALID_IN_HOB;
224 cmd.tf_flags = IDE_TFLAG_LBA48; 224 cmd.tf_flags = IDE_TFLAG_LBA48;
225 } 225 }
226 226
227 ide_no_data_taskfile(drive, &cmd); 227 ide_no_data_taskfile(drive, &cmd);
228 228
229 /* if OK, compute maximum address value */ 229 /* if OK, compute maximum address value */
230 if (!(tf->status & ATA_ERR)) 230 if (!(tf->status & ATA_ERR))
231 addr = ide_get_lba_addr(&cmd, lba48) + 1; 231 addr = ide_get_lba_addr(&cmd, lba48) + 1;
232 232
233 return addr; 233 return addr;
234 } 234 }
235 235
236 /* 236 /*
237 * Sets maximum virtual LBA address of the drive. 237 * Sets maximum virtual LBA address of the drive.
238 * Returns new maximum virtual LBA address (> 0) or 0 on failure. 238 * Returns new maximum virtual LBA address (> 0) or 0 on failure.
239 */ 239 */
240 static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48) 240 static u64 idedisk_set_max_address(ide_drive_t *drive, u64 addr_req, int lba48)
241 { 241 {
242 struct ide_cmd cmd; 242 struct ide_cmd cmd;
243 struct ide_taskfile *tf = &cmd.tf; 243 struct ide_taskfile *tf = &cmd.tf;
244 u64 addr_set = 0; 244 u64 addr_set = 0;
245 245
246 addr_req--; 246 addr_req--;
247 247
248 memset(&cmd, 0, sizeof(cmd)); 248 memset(&cmd, 0, sizeof(cmd));
249 tf->lbal = (addr_req >> 0) & 0xff; 249 tf->lbal = (addr_req >> 0) & 0xff;
250 tf->lbam = (addr_req >>= 8) & 0xff; 250 tf->lbam = (addr_req >>= 8) & 0xff;
251 tf->lbah = (addr_req >>= 8) & 0xff; 251 tf->lbah = (addr_req >>= 8) & 0xff;
252 if (lba48) { 252 if (lba48) {
253 cmd.hob.lbal = (addr_req >>= 8) & 0xff; 253 cmd.hob.lbal = (addr_req >>= 8) & 0xff;
254 cmd.hob.lbam = (addr_req >>= 8) & 0xff; 254 cmd.hob.lbam = (addr_req >>= 8) & 0xff;
255 cmd.hob.lbah = (addr_req >>= 8) & 0xff; 255 cmd.hob.lbah = (addr_req >>= 8) & 0xff;
256 tf->command = ATA_CMD_SET_MAX_EXT; 256 tf->command = ATA_CMD_SET_MAX_EXT;
257 } else { 257 } else {
258 tf->device = (addr_req >>= 8) & 0x0f; 258 tf->device = (addr_req >>= 8) & 0x0f;
259 tf->command = ATA_CMD_SET_MAX; 259 tf->command = ATA_CMD_SET_MAX;
260 } 260 }
261 tf->device |= ATA_LBA; 261 tf->device |= ATA_LBA;
262 262
263 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 263 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
264 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 264 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
265 if (lba48) { 265 if (lba48) {
266 cmd.valid.out.hob = IDE_VALID_OUT_HOB; 266 cmd.valid.out.hob = IDE_VALID_OUT_HOB;
267 cmd.valid.in.hob = IDE_VALID_IN_HOB; 267 cmd.valid.in.hob = IDE_VALID_IN_HOB;
268 cmd.tf_flags = IDE_TFLAG_LBA48; 268 cmd.tf_flags = IDE_TFLAG_LBA48;
269 } 269 }
270 270
271 ide_no_data_taskfile(drive, &cmd); 271 ide_no_data_taskfile(drive, &cmd);
272 272
273 /* if OK, compute maximum address value */ 273 /* if OK, compute maximum address value */
274 if (!(tf->status & ATA_ERR)) 274 if (!(tf->status & ATA_ERR))
275 addr_set = ide_get_lba_addr(&cmd, lba48) + 1; 275 addr_set = ide_get_lba_addr(&cmd, lba48) + 1;
276 276
277 return addr_set; 277 return addr_set;
278 } 278 }
279 279
280 static unsigned long long sectors_to_MB(unsigned long long n) 280 static unsigned long long sectors_to_MB(unsigned long long n)
281 { 281 {
282 n <<= 9; /* make it bytes */ 282 n <<= 9; /* make it bytes */
283 do_div(n, 1000000); /* make it MB */ 283 do_div(n, 1000000); /* make it MB */
284 return n; 284 return n;
285 } 285 }
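sectors_to_MB() converts a sector count to decimal megabytes: the shift by 9 multiplies by the 512-byte sector size, and do_div() truncates to whole millions of bytes. An illustrative check: 2,097,152 sectors × 512 = 1,073,741,824 bytes, and 1,073,741,824 / 1,000,000 = 1073 MB, with the fractional part dropped.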
286 286
287 /* 287 /*
288 * Some disks report total number of sectors instead of 288 * Some disks report total number of sectors instead of
289 * maximum sector address. We list them here. 289 * maximum sector address. We list them here.
290 */ 290 */
291 static const struct drive_list_entry hpa_list[] = { 291 static const struct drive_list_entry hpa_list[] = {
292 { "ST340823A", NULL }, 292 { "ST340823A", NULL },
293 { "ST320413A", NULL }, 293 { "ST320413A", NULL },
294 { "ST310211A", NULL }, 294 { "ST310211A", NULL },
295 { NULL, NULL } 295 { NULL, NULL }
296 }; 296 };
297 297
298 static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48) 298 static u64 ide_disk_hpa_get_native_capacity(ide_drive_t *drive, int lba48)
299 { 299 {
300 u64 capacity, set_max; 300 u64 capacity, set_max;
301 301
302 capacity = drive->capacity64; 302 capacity = drive->capacity64;
303 set_max = idedisk_read_native_max_address(drive, lba48); 303 set_max = idedisk_read_native_max_address(drive, lba48);
304 304
305 if (ide_in_drive_list(drive->id, hpa_list)) { 305 if (ide_in_drive_list(drive->id, hpa_list)) {
306 /* 306 /*
307 * Since we are inclusive wrt to firmware revisions do this 307 * Since we are inclusive wrt to firmware revisions do this
308 * extra check and apply the workaround only when needed. 308 * extra check and apply the workaround only when needed.
309 */ 309 */
310 if (set_max == capacity + 1) 310 if (set_max == capacity + 1)
311 set_max--; 311 set_max--;
312 } 312 }
313 313
314 return set_max; 314 return set_max;
315 } 315 }
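The off-by-one handling above follows from the ATA convention: READ NATIVE MAX returns the highest addressable LBA, so idedisk_read_native_max_address() adds 1 to turn it into a sector count. A drive on the hpa_list quirk table reports the count itself, which after the +1 comes out exactly one too large. An illustrative case: a well-behaved drive returning LBA 0x0FFFFFFF yields 268,435,456 sectors, while a quirky drive with the same capacity returns the count 0x10000000, yields 268,435,457, and is corrected back by the set_max-- above.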
316 316
317 static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48) 317 static u64 ide_disk_hpa_set_capacity(ide_drive_t *drive, u64 set_max, int lba48)
318 { 318 {
319 set_max = idedisk_set_max_address(drive, set_max, lba48); 319 set_max = idedisk_set_max_address(drive, set_max, lba48);
320 if (set_max) 320 if (set_max)
321 drive->capacity64 = set_max; 321 drive->capacity64 = set_max;
322 322
323 return set_max; 323 return set_max;
324 } 324 }
325 325
326 static void idedisk_check_hpa(ide_drive_t *drive) 326 static void idedisk_check_hpa(ide_drive_t *drive)
327 { 327 {
328 u64 capacity, set_max; 328 u64 capacity, set_max;
329 int lba48 = ata_id_lba48_enabled(drive->id); 329 int lba48 = ata_id_lba48_enabled(drive->id);
330 330
331 capacity = drive->capacity64; 331 capacity = drive->capacity64;
332 set_max = ide_disk_hpa_get_native_capacity(drive, lba48); 332 set_max = ide_disk_hpa_get_native_capacity(drive, lba48);
333 333
334 if (set_max <= capacity) 334 if (set_max <= capacity)
335 return; 335 return;
336 336
337 drive->probed_capacity = set_max; 337 drive->probed_capacity = set_max;
338 338
339 printk(KERN_INFO "%s: Host Protected Area detected.\n" 339 printk(KERN_INFO "%s: Host Protected Area detected.\n"
340 "\tcurrent capacity is %llu sectors (%llu MB)\n" 340 "\tcurrent capacity is %llu sectors (%llu MB)\n"
341 "\tnative capacity is %llu sectors (%llu MB)\n", 341 "\tnative capacity is %llu sectors (%llu MB)\n",
342 drive->name, 342 drive->name,
343 capacity, sectors_to_MB(capacity), 343 capacity, sectors_to_MB(capacity),
344 set_max, sectors_to_MB(set_max)); 344 set_max, sectors_to_MB(set_max));
345 345
346 if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0) 346 if ((drive->dev_flags & IDE_DFLAG_NOHPA) == 0)
347 return; 347 return;
348 348
349 set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48); 349 set_max = ide_disk_hpa_set_capacity(drive, set_max, lba48);
350 if (set_max) 350 if (set_max)
351 printk(KERN_INFO "%s: Host Protected Area disabled.\n", 351 printk(KERN_INFO "%s: Host Protected Area disabled.\n",
352 drive->name); 352 drive->name);
353 } 353 }
354 354
355 static int ide_disk_get_capacity(ide_drive_t *drive) 355 static int ide_disk_get_capacity(ide_drive_t *drive)
356 { 356 {
357 u16 *id = drive->id; 357 u16 *id = drive->id;
358 int lba; 358 int lba;
359 359
360 if (ata_id_lba48_enabled(id)) { 360 if (ata_id_lba48_enabled(id)) {
361 /* drive speaks 48-bit LBA */ 361 /* drive speaks 48-bit LBA */
362 lba = 1; 362 lba = 1;
363 drive->capacity64 = ata_id_u64(id, ATA_ID_LBA_CAPACITY_2); 363 drive->capacity64 = ata_id_u64(id, ATA_ID_LBA_CAPACITY_2);
364 } else if (ata_id_has_lba(id) && ata_id_is_lba_capacity_ok(id)) { 364 } else if (ata_id_has_lba(id) && ata_id_is_lba_capacity_ok(id)) {
365 /* drive speaks 28-bit LBA */ 365 /* drive speaks 28-bit LBA */
366 lba = 1; 366 lba = 1;
367 drive->capacity64 = ata_id_u32(id, ATA_ID_LBA_CAPACITY); 367 drive->capacity64 = ata_id_u32(id, ATA_ID_LBA_CAPACITY);
368 } else { 368 } else {
369 /* drive speaks boring old 28-bit CHS */ 369 /* drive speaks boring old 28-bit CHS */
370 lba = 0; 370 lba = 0;
371 drive->capacity64 = drive->cyl * drive->head * drive->sect; 371 drive->capacity64 = drive->cyl * drive->head * drive->sect;
372 } 372 }
373 373
374 drive->probed_capacity = drive->capacity64; 374 drive->probed_capacity = drive->capacity64;
375 375
376 if (lba) { 376 if (lba) {
377 drive->dev_flags |= IDE_DFLAG_LBA; 377 drive->dev_flags |= IDE_DFLAG_LBA;
378 378
379 /* 379 /*
380 * If this device supports the Host Protected Area feature set, 380 * If this device supports the Host Protected Area feature set,
381 * then we may need to change our opinion about its capacity. 381 * then we may need to change our opinion about its capacity.
382 */ 382 */
383 if (ata_id_hpa_enabled(id)) 383 if (ata_id_hpa_enabled(id))
384 idedisk_check_hpa(drive); 384 idedisk_check_hpa(drive);
385 } 385 }
386 386
387 /* limit drive capacity to 137GB if LBA48 cannot be used */ 387 /* limit drive capacity to 137GB if LBA48 cannot be used */
388 if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 && 388 if ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 &&
389 drive->capacity64 > 1ULL << 28) { 389 drive->capacity64 > 1ULL << 28) {
390 printk(KERN_WARNING "%s: cannot use LBA48 - full capacity " 390 printk(KERN_WARNING "%s: cannot use LBA48 - full capacity "
391 "%llu sectors (%llu MB)\n", 391 "%llu sectors (%llu MB)\n",
392 drive->name, (unsigned long long)drive->capacity64, 392 drive->name, (unsigned long long)drive->capacity64,
393 sectors_to_MB(drive->capacity64)); 393 sectors_to_MB(drive->capacity64));
394 drive->probed_capacity = drive->capacity64 = 1ULL << 28; 394 drive->probed_capacity = drive->capacity64 = 1ULL << 28;
395 } 395 }
396 396
397 if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && 397 if ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) &&
398 (drive->dev_flags & IDE_DFLAG_LBA48)) { 398 (drive->dev_flags & IDE_DFLAG_LBA48)) {
399 if (drive->capacity64 > 1ULL << 28) { 399 if (drive->capacity64 > 1ULL << 28) {
400 printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode" 400 printk(KERN_INFO "%s: cannot use LBA48 DMA - PIO mode"
401 " will be used for accessing sectors " 401 " will be used for accessing sectors "
402 "> %u\n", drive->name, 1 << 28); 402 "> %u\n", drive->name, 1 << 28);
403 } else 403 } else
404 drive->dev_flags &= ~IDE_DFLAG_LBA48; 404 drive->dev_flags &= ~IDE_DFLAG_LBA48;
405 } 405 }
406 406
407 return 0; 407 return 0;
408 } 408 }
409 409
410 static void ide_disk_unlock_native_capacity(ide_drive_t *drive) 410 static void ide_disk_unlock_native_capacity(ide_drive_t *drive)
411 { 411 {
412 u16 *id = drive->id; 412 u16 *id = drive->id;
413 int lba48 = ata_id_lba48_enabled(id); 413 int lba48 = ata_id_lba48_enabled(id);
414 414
415 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 || 415 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 ||
416 ata_id_hpa_enabled(id) == 0) 416 ata_id_hpa_enabled(id) == 0)
417 return; 417 return;
418 418
419 /* 419 /*
420 * according to the spec the SET MAX ADDRESS command shall be 420 * according to the spec the SET MAX ADDRESS command shall be
421 * immediately preceded by a READ NATIVE MAX ADDRESS command 421 * immediately preceded by a READ NATIVE MAX ADDRESS command
422 */ 422 */
423 if (!ide_disk_hpa_get_native_capacity(drive, lba48)) 423 if (!ide_disk_hpa_get_native_capacity(drive, lba48))
424 return; 424 return;
425 425
426 if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48)) 426 if (ide_disk_hpa_set_capacity(drive, drive->probed_capacity, lba48))
427 drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */ 427 drive->dev_flags |= IDE_DFLAG_NOHPA; /* disable HPA on resume */
428 } 428 }
429 429
430 static int idedisk_prep_fn(struct request_queue *q, struct request *rq) 430 static int idedisk_prep_fn(struct request_queue *q, struct request *rq)
431 { 431 {
432 ide_drive_t *drive = q->queuedata; 432 ide_drive_t *drive = q->queuedata;
433 struct ide_cmd *cmd; 433 struct ide_cmd *cmd;
434 434
435 if (!(rq->cmd_flags & REQ_FLUSH)) 435 if (!(rq->cmd_flags & REQ_FLUSH))
436 return BLKPREP_OK; 436 return BLKPREP_OK;
437 437
438 cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC); 438 cmd = kmalloc(sizeof(*cmd), GFP_ATOMIC);
439 439
440 /* FIXME: map struct ide_taskfile on rq->cmd[] */ 440 /* FIXME: map struct ide_taskfile on rq->cmd[] */
441 BUG_ON(cmd == NULL); 441 BUG_ON(cmd == NULL);
442 442
443 memset(cmd, 0, sizeof(*cmd)); 443 memset(cmd, 0, sizeof(*cmd));
444 if (ata_id_flush_ext_enabled(drive->id) && 444 if (ata_id_flush_ext_enabled(drive->id) &&
445 (drive->capacity64 >= (1UL << 28))) 445 (drive->capacity64 >= (1UL << 28)))
446 cmd->tf.command = ATA_CMD_FLUSH_EXT; 446 cmd->tf.command = ATA_CMD_FLUSH_EXT;
447 else 447 else
448 cmd->tf.command = ATA_CMD_FLUSH; 448 cmd->tf.command = ATA_CMD_FLUSH;
449 cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 449 cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
450 cmd->tf_flags = IDE_TFLAG_DYN; 450 cmd->tf_flags = IDE_TFLAG_DYN;
451 cmd->protocol = ATA_PROT_NODATA; 451 cmd->protocol = ATA_PROT_NODATA;
452 452
453 rq->cmd_type = REQ_TYPE_ATA_TASKFILE; 453 rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
454 rq->special = cmd; 454 rq->special = cmd;
455 cmd->rq = rq; 455 cmd->rq = rq;
456 456
457 return BLKPREP_OK; 457 return BLKPREP_OK;
458 } 458 }
459 459
460 ide_devset_get(multcount, mult_count); 460 ide_devset_get(multcount, mult_count);
461 461
462 /* 462 /*
463 * This is tightly woven into the driver's do_special handling and cannot be touched. 463 * This is tightly woven into the driver's do_special handling and cannot be touched.
464 * DON'T do it again until a total personality rewrite is committed. 464 * DON'T do it again until a total personality rewrite is committed.
465 */ 465 */
466 static int set_multcount(ide_drive_t *drive, int arg) 466 static int set_multcount(ide_drive_t *drive, int arg)
467 { 467 {
468 struct request *rq; 468 struct request *rq;
469 int error; 469 int error;
470 470
471 if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff)) 471 if (arg < 0 || arg > (drive->id[ATA_ID_MAX_MULTSECT] & 0xff))
472 return -EINVAL; 472 return -EINVAL;
473 473
474 if (drive->special_flags & IDE_SFLAG_SET_MULTMODE) 474 if (drive->special_flags & IDE_SFLAG_SET_MULTMODE)
475 return -EBUSY; 475 return -EBUSY;
476 476
477 rq = blk_get_request(drive->queue, READ, __GFP_WAIT); 477 rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
478 rq->cmd_type = REQ_TYPE_ATA_TASKFILE; 478 rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
479 479
480 drive->mult_req = arg; 480 drive->mult_req = arg;
481 drive->special_flags |= IDE_SFLAG_SET_MULTMODE; 481 drive->special_flags |= IDE_SFLAG_SET_MULTMODE;
482 error = blk_execute_rq(drive->queue, NULL, rq, 0); 482 error = blk_execute_rq(drive->queue, NULL, rq, 0);
483 blk_put_request(rq); 483 blk_put_request(rq);
484 484
485 return (drive->mult_count == arg) ? 0 : -EIO; 485 return (drive->mult_count == arg) ? 0 : -EIO;
486 } 486 }
487 487
488 ide_devset_get_flag(nowerr, IDE_DFLAG_NOWERR); 488 ide_devset_get_flag(nowerr, IDE_DFLAG_NOWERR);
489 489
490 static int set_nowerr(ide_drive_t *drive, int arg) 490 static int set_nowerr(ide_drive_t *drive, int arg)
491 { 491 {
492 if (arg < 0 || arg > 1) 492 if (arg < 0 || arg > 1)
493 return -EINVAL; 493 return -EINVAL;
494 494
495 if (arg) 495 if (arg)
496 drive->dev_flags |= IDE_DFLAG_NOWERR; 496 drive->dev_flags |= IDE_DFLAG_NOWERR;
497 else 497 else
498 drive->dev_flags &= ~IDE_DFLAG_NOWERR; 498 drive->dev_flags &= ~IDE_DFLAG_NOWERR;
499 499
500 drive->bad_wstat = arg ? BAD_R_STAT : BAD_W_STAT; 500 drive->bad_wstat = arg ? BAD_R_STAT : BAD_W_STAT;
501 501
502 return 0; 502 return 0;
503 } 503 }
504 504
505 static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect) 505 static int ide_do_setfeature(ide_drive_t *drive, u8 feature, u8 nsect)
506 { 506 {
507 struct ide_cmd cmd; 507 struct ide_cmd cmd;
508 508
509 memset(&cmd, 0, sizeof(cmd)); 509 memset(&cmd, 0, sizeof(cmd));
510 cmd.tf.feature = feature; 510 cmd.tf.feature = feature;
511 cmd.tf.nsect = nsect; 511 cmd.tf.nsect = nsect;
512 cmd.tf.command = ATA_CMD_SET_FEATURES; 512 cmd.tf.command = ATA_CMD_SET_FEATURES;
513 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 513 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
514 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 514 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
515 515
516 return ide_no_data_taskfile(drive, &cmd); 516 return ide_no_data_taskfile(drive, &cmd);
517 } 517 }
518 518
519 static void update_ordered(ide_drive_t *drive) 519 static void update_flush(ide_drive_t *drive)
520 { 520 {
521 u16 *id = drive->id; 521 u16 *id = drive->id;
522 unsigned ordered = QUEUE_ORDERED_NONE; 522 unsigned flush = 0;
523 523
524 if (drive->dev_flags & IDE_DFLAG_WCACHE) { 524 if (drive->dev_flags & IDE_DFLAG_WCACHE) {
525 unsigned long long capacity; 525 unsigned long long capacity;
526 int barrier; 526 int barrier;
527 /* 527 /*
528 * We must avoid issuing commands a drive does not 528 * We must avoid issuing commands a drive does not
529 * understand or we may crash it. We check flush cache 529 * understand or we may crash it. We check flush cache
530 * is supported. We also check we have the LBA48 flush 530 * is supported. We also check we have the LBA48 flush
531 * cache if the drive capacity is too large. By this 531 * cache if the drive capacity is too large. By this
532 * time we have trimmed the drive capacity if LBA48 is 532 * time we have trimmed the drive capacity if LBA48 is
533 * not available so we don't need to recheck that. 533 * not available so we don't need to recheck that.
534 */ 534 */
535 capacity = ide_gd_capacity(drive); 535 capacity = ide_gd_capacity(drive);
536 barrier = ata_id_flush_enabled(id) && 536 barrier = ata_id_flush_enabled(id) &&
537 (drive->dev_flags & IDE_DFLAG_NOFLUSH) == 0 && 537 (drive->dev_flags & IDE_DFLAG_NOFLUSH) == 0 &&
538 ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 || 538 ((drive->dev_flags & IDE_DFLAG_LBA48) == 0 ||
539 capacity <= (1ULL << 28) || 539 capacity <= (1ULL << 28) ||
540 ata_id_flush_ext_enabled(id)); 540 ata_id_flush_ext_enabled(id));
541 541
542 printk(KERN_INFO "%s: cache flushes %ssupported\n", 542 printk(KERN_INFO "%s: cache flushes %ssupported\n",
543 drive->name, barrier ? "" : "not "); 543 drive->name, barrier ? "" : "not ");
544 544
545 if (barrier) { 545 if (barrier) {
546 ordered = QUEUE_ORDERED_DRAIN_FLUSH; 546 flush = REQ_FLUSH;
547 blk_queue_prep_rq(drive->queue, idedisk_prep_fn); 547 blk_queue_prep_rq(drive->queue, idedisk_prep_fn);
548 } 548 }
549 } else 549 }
550 ordered = QUEUE_ORDERED_DRAIN;
551 550
552 blk_queue_ordered(drive->queue, ordered); 551 blk_queue_flush(drive->queue, flush);
553 } 552 }
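Taken together with idedisk_prep_fn() above, the conversion in update_flush() covers both halves of the new scheme: the driver advertises its capability once, and flushes later arrive as ordinary requests carrying REQ_FLUSH. A hedged sketch of that contract for a generic driver (the function name and predicate are placeholders, not code from the patch):

	static void my_setup_cache_control(struct request_queue *q, bool my_can_flush)
	{
		/* advertise REQ_FLUSH only if the write cache can really be flushed */
		blk_queue_flush(q, my_can_flush ? REQ_FLUSH : 0);
	}

Requests with REQ_FLUSH set are then rewritten by the prep hook into the hardware's cache-flush command, exactly as idedisk_prep_fn() turns them into ATA_CMD_FLUSH or ATA_CMD_FLUSH_EXT.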
554 553
555 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE); 554 ide_devset_get_flag(wcache, IDE_DFLAG_WCACHE);
556 555
557 static int set_wcache(ide_drive_t *drive, int arg) 556 static int set_wcache(ide_drive_t *drive, int arg)
558 { 557 {
559 int err = 1; 558 int err = 1;
560 559
561 if (arg < 0 || arg > 1) 560 if (arg < 0 || arg > 1)
562 return -EINVAL; 561 return -EINVAL;
563 562
564 if (ata_id_flush_enabled(drive->id)) { 563 if (ata_id_flush_enabled(drive->id)) {
565 err = ide_do_setfeature(drive, 564 err = ide_do_setfeature(drive,
566 arg ? SETFEATURES_WC_ON : SETFEATURES_WC_OFF, 0); 565 arg ? SETFEATURES_WC_ON : SETFEATURES_WC_OFF, 0);
567 if (err == 0) { 566 if (err == 0) {
568 if (arg) 567 if (arg)
569 drive->dev_flags |= IDE_DFLAG_WCACHE; 568 drive->dev_flags |= IDE_DFLAG_WCACHE;
570 else 569 else
571 drive->dev_flags &= ~IDE_DFLAG_WCACHE; 570 drive->dev_flags &= ~IDE_DFLAG_WCACHE;
572 } 571 }
573 } 572 }
574 573
575 update_ordered(drive); 574 update_flush(drive);
576 575
577 return err; 576 return err;
578 } 577 }
579 578
580 static int do_idedisk_flushcache(ide_drive_t *drive) 579 static int do_idedisk_flushcache(ide_drive_t *drive)
581 { 580 {
582 struct ide_cmd cmd; 581 struct ide_cmd cmd;
583 582
584 memset(&cmd, 0, sizeof(cmd)); 583 memset(&cmd, 0, sizeof(cmd));
585 if (ata_id_flush_ext_enabled(drive->id)) 584 if (ata_id_flush_ext_enabled(drive->id))
586 cmd.tf.command = ATA_CMD_FLUSH_EXT; 585 cmd.tf.command = ATA_CMD_FLUSH_EXT;
587 else 586 else
588 cmd.tf.command = ATA_CMD_FLUSH; 587 cmd.tf.command = ATA_CMD_FLUSH;
589 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 588 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
590 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 589 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
591 590
592 return ide_no_data_taskfile(drive, &cmd); 591 return ide_no_data_taskfile(drive, &cmd);
593 } 592 }
594 593
595 ide_devset_get(acoustic, acoustic); 594 ide_devset_get(acoustic, acoustic);
596 595
597 static int set_acoustic(ide_drive_t *drive, int arg) 596 static int set_acoustic(ide_drive_t *drive, int arg)
598 { 597 {
599 if (arg < 0 || arg > 254) 598 if (arg < 0 || arg > 254)
600 return -EINVAL; 599 return -EINVAL;
601 600
602 ide_do_setfeature(drive, 601 ide_do_setfeature(drive,
603 arg ? SETFEATURES_AAM_ON : SETFEATURES_AAM_OFF, arg); 602 arg ? SETFEATURES_AAM_ON : SETFEATURES_AAM_OFF, arg);
604 603
605 drive->acoustic = arg; 604 drive->acoustic = arg;
606 605
607 return 0; 606 return 0;
608 } 607 }
609 608
610 ide_devset_get_flag(addressing, IDE_DFLAG_LBA48); 609 ide_devset_get_flag(addressing, IDE_DFLAG_LBA48);
611 610
612 /* 611 /*
613 * drive->addressing: 612 * drive->addressing:
614 * 0: 28-bit 613 * 0: 28-bit
615 * 1: 48-bit 614 * 1: 48-bit
616 * 2: 48-bit capable doing 28-bit 615 * 2: 48-bit capable doing 28-bit
617 */ 616 */
618 static int set_addressing(ide_drive_t *drive, int arg) 617 static int set_addressing(ide_drive_t *drive, int arg)
619 { 618 {
620 if (arg < 0 || arg > 2) 619 if (arg < 0 || arg > 2)
621 return -EINVAL; 620 return -EINVAL;
622 621
623 if (arg && ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48) || 622 if (arg && ((drive->hwif->host_flags & IDE_HFLAG_NO_LBA48) ||
624 ata_id_lba48_enabled(drive->id) == 0)) 623 ata_id_lba48_enabled(drive->id) == 0))
625 return -EIO; 624 return -EIO;
626 625
627 if (arg == 2) 626 if (arg == 2)
628 arg = 0; 627 arg = 0;
629 628
630 if (arg) 629 if (arg)
631 drive->dev_flags |= IDE_DFLAG_LBA48; 630 drive->dev_flags |= IDE_DFLAG_LBA48;
632 else 631 else
633 drive->dev_flags &= ~IDE_DFLAG_LBA48; 632 drive->dev_flags &= ~IDE_DFLAG_LBA48;
634 633
635 return 0; 634 return 0;
636 } 635 }
637 636
638 ide_ext_devset_rw(acoustic, acoustic); 637 ide_ext_devset_rw(acoustic, acoustic);
639 ide_ext_devset_rw(address, addressing); 638 ide_ext_devset_rw(address, addressing);
640 ide_ext_devset_rw(multcount, multcount); 639 ide_ext_devset_rw(multcount, multcount);
641 ide_ext_devset_rw(wcache, wcache); 640 ide_ext_devset_rw(wcache, wcache);
642 641
643 ide_ext_devset_rw_sync(nowerr, nowerr); 642 ide_ext_devset_rw_sync(nowerr, nowerr);
644 643
645 static int ide_disk_check(ide_drive_t *drive, const char *s) 644 static int ide_disk_check(ide_drive_t *drive, const char *s)
646 { 645 {
647 return 1; 646 return 1;
648 } 647 }
649 648
650 static void ide_disk_setup(ide_drive_t *drive) 649 static void ide_disk_setup(ide_drive_t *drive)
651 { 650 {
652 struct ide_disk_obj *idkp = drive->driver_data; 651 struct ide_disk_obj *idkp = drive->driver_data;
653 struct request_queue *q = drive->queue; 652 struct request_queue *q = drive->queue;
654 ide_hwif_t *hwif = drive->hwif; 653 ide_hwif_t *hwif = drive->hwif;
655 u16 *id = drive->id; 654 u16 *id = drive->id;
656 char *m = (char *)&id[ATA_ID_PROD]; 655 char *m = (char *)&id[ATA_ID_PROD];
657 unsigned long long capacity; 656 unsigned long long capacity;
658 657
659 ide_proc_register_driver(drive, idkp->driver); 658 ide_proc_register_driver(drive, idkp->driver);
660 659
661 if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0) 660 if ((drive->dev_flags & IDE_DFLAG_ID_READ) == 0)
662 return; 661 return;
663 662
664 if (drive->dev_flags & IDE_DFLAG_REMOVABLE) { 663 if (drive->dev_flags & IDE_DFLAG_REMOVABLE) {
665 /* 664 /*
666 * Removable disks (e.g. SyQuest); ignore 'WD' drives 665 * Removable disks (e.g. SyQuest); ignore 'WD' drives
667 */ 666 */
668 if (m[0] != 'W' || m[1] != 'D') 667 if (m[0] != 'W' || m[1] != 'D')
669 drive->dev_flags |= IDE_DFLAG_DOORLOCKING; 668 drive->dev_flags |= IDE_DFLAG_DOORLOCKING;
670 } 669 }
671 670
672 (void)set_addressing(drive, 1); 671 (void)set_addressing(drive, 1);
673 672
674 if (drive->dev_flags & IDE_DFLAG_LBA48) { 673 if (drive->dev_flags & IDE_DFLAG_LBA48) {
675 int max_s = 2048; 674 int max_s = 2048;
676 675
677 if (max_s > hwif->rqsize) 676 if (max_s > hwif->rqsize)
678 max_s = hwif->rqsize; 677 max_s = hwif->rqsize;
679 678
680 blk_queue_max_hw_sectors(q, max_s); 679 blk_queue_max_hw_sectors(q, max_s);
681 } 680 }
682 681
683 printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name, 682 printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name,
684 queue_max_sectors(q) / 2); 683 queue_max_sectors(q) / 2);
685 684
686 if (ata_id_is_ssd(id)) 685 if (ata_id_is_ssd(id))
687 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); 686 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
688 687
689 /* calculate drive capacity, and select LBA if possible */ 688 /* calculate drive capacity, and select LBA if possible */
690 ide_disk_get_capacity(drive); 689 ide_disk_get_capacity(drive);
691 690
692 /* 691 /*
693 * if possible, give fdisk access to more of the drive, 692 * if possible, give fdisk access to more of the drive,
694 * by correcting bios_cyls: 693 * by correcting bios_cyls:
695 */ 694 */
696 capacity = ide_gd_capacity(drive); 695 capacity = ide_gd_capacity(drive);
697 696
698 if ((drive->dev_flags & IDE_DFLAG_FORCED_GEOM) == 0) { 697 if ((drive->dev_flags & IDE_DFLAG_FORCED_GEOM) == 0) {
699 if (ata_id_lba48_enabled(drive->id)) { 698 if (ata_id_lba48_enabled(drive->id)) {
700 /* compatibility */ 699 /* compatibility */
701 drive->bios_sect = 63; 700 drive->bios_sect = 63;
702 drive->bios_head = 255; 701 drive->bios_head = 255;
703 } 702 }
704 703
705 if (drive->bios_sect && drive->bios_head) { 704 if (drive->bios_sect && drive->bios_head) {
706 unsigned int cap0 = capacity; /* truncate to 32 bits */ 705 unsigned int cap0 = capacity; /* truncate to 32 bits */
707 unsigned int cylsz, cyl; 706 unsigned int cylsz, cyl;
708 707
709 if (cap0 != capacity) 708 if (cap0 != capacity)
710 drive->bios_cyl = 65535; 709 drive->bios_cyl = 65535;
711 else { 710 else {
712 cylsz = drive->bios_sect * drive->bios_head; 711 cylsz = drive->bios_sect * drive->bios_head;
713 cyl = cap0 / cylsz; 712 cyl = cap0 / cylsz;
714 if (cyl > 65535) 713 if (cyl > 65535)
715 cyl = 65535; 714 cyl = 65535;
716 if (cyl > drive->bios_cyl) 715 if (cyl > drive->bios_cyl)
717 drive->bios_cyl = cyl; 716 drive->bios_cyl = cyl;
718 } 717 }
719 } 718 }
720 } 719 }
721 printk(KERN_INFO "%s: %llu sectors (%llu MB)", 720 printk(KERN_INFO "%s: %llu sectors (%llu MB)",
722 drive->name, capacity, sectors_to_MB(capacity)); 721 drive->name, capacity, sectors_to_MB(capacity));
723 722
724 /* Only print cache size when it was specified */ 723 /* Only print cache size when it was specified */
725 if (id[ATA_ID_BUF_SIZE]) 724 if (id[ATA_ID_BUF_SIZE])
726 printk(KERN_CONT " w/%dKiB Cache", id[ATA_ID_BUF_SIZE] / 2); 725 printk(KERN_CONT " w/%dKiB Cache", id[ATA_ID_BUF_SIZE] / 2);
727 726
728 printk(KERN_CONT ", CHS=%d/%d/%d\n", 727 printk(KERN_CONT ", CHS=%d/%d/%d\n",
729 drive->bios_cyl, drive->bios_head, drive->bios_sect); 728 drive->bios_cyl, drive->bios_head, drive->bios_sect);
730 729
731 /* write cache enabled? */ 730 /* write cache enabled? */
732 if ((id[ATA_ID_CSFO] & 1) || ata_id_wcache_enabled(id)) 731 if ((id[ATA_ID_CSFO] & 1) || ata_id_wcache_enabled(id))
733 drive->dev_flags |= IDE_DFLAG_WCACHE; 732 drive->dev_flags |= IDE_DFLAG_WCACHE;
734 733
735 set_wcache(drive, 1); 734 set_wcache(drive, 1);
736 735
737 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 && 736 if ((drive->dev_flags & IDE_DFLAG_LBA) == 0 &&
738 (drive->head == 0 || drive->head > 16)) { 737 (drive->head == 0 || drive->head > 16)) {
739 printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n", 738 printk(KERN_ERR "%s: invalid geometry: %d physical heads?\n",
740 drive->name, drive->head); 739 drive->name, drive->head);
741 drive->dev_flags &= ~IDE_DFLAG_ATTACH; 740 drive->dev_flags &= ~IDE_DFLAG_ATTACH;
742 } else 741 } else
743 drive->dev_flags |= IDE_DFLAG_ATTACH; 742 drive->dev_flags |= IDE_DFLAG_ATTACH;
744 } 743 }
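A worked example of the bios_cyl correction in ide_disk_setup() (numbers are illustrative): with the compatibility geometry of 255 heads and 63 sectors, cylsz = 255 × 63 = 16,065 sectors per cylinder, so a 4,194,304-sector disk gets cyl = 4,194,304 / 16,065 = 261 cylinders. Any capacity above 65,535 × 16,065 = 1,052,819,775 sectors, or one that no longer fits after the 32-bit cap0 truncation, is clamped to 65,535 cylinders; and the correction only ever raises the existing bios_cyl, never lowers it.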
745 744
746 static void ide_disk_flush(ide_drive_t *drive) 745 static void ide_disk_flush(ide_drive_t *drive)
747 { 746 {
748 if (ata_id_flush_enabled(drive->id) == 0 || 747 if (ata_id_flush_enabled(drive->id) == 0 ||
749 (drive->dev_flags & IDE_DFLAG_WCACHE) == 0) 748 (drive->dev_flags & IDE_DFLAG_WCACHE) == 0)
750 return; 749 return;
751 750
752 if (do_idedisk_flushcache(drive)) 751 if (do_idedisk_flushcache(drive))
753 printk(KERN_INFO "%s: wcache flush failed!\n", drive->name); 752 printk(KERN_INFO "%s: wcache flush failed!\n", drive->name);
754 } 753 }
755 754
756 static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk) 755 static int ide_disk_init_media(ide_drive_t *drive, struct gendisk *disk)
757 { 756 {
758 return 0; 757 return 0;
759 } 758 }
760 759
761 static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk, 760 static int ide_disk_set_doorlock(ide_drive_t *drive, struct gendisk *disk,
762 int on) 761 int on)
763 { 762 {
764 struct ide_cmd cmd; 763 struct ide_cmd cmd;
765 int ret; 764 int ret;
766 765
767 if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0) 766 if ((drive->dev_flags & IDE_DFLAG_DOORLOCKING) == 0)
768 return 0; 767 return 0;
769 768
770 memset(&cmd, 0, sizeof(cmd)); 769 memset(&cmd, 0, sizeof(cmd));
771 cmd.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK; 770 cmd.tf.command = on ? ATA_CMD_MEDIA_LOCK : ATA_CMD_MEDIA_UNLOCK;
772 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; 771 cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
773 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; 772 cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE;
774 773
775 ret = ide_no_data_taskfile(drive, &cmd); 774 ret = ide_no_data_taskfile(drive, &cmd);
776 775
777 if (ret) 776 if (ret)
778 drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING; 777 drive->dev_flags &= ~IDE_DFLAG_DOORLOCKING;
779 778
780 return ret; 779 return ret;
781 } 780 }
782 781
783 const struct ide_disk_ops ide_ata_disk_ops = { 782 const struct ide_disk_ops ide_ata_disk_ops = {
784 .check = ide_disk_check, 783 .check = ide_disk_check,
785 .unlock_native_capacity = ide_disk_unlock_native_capacity, 784 .unlock_native_capacity = ide_disk_unlock_native_capacity,
786 .get_capacity = ide_disk_get_capacity, 785 .get_capacity = ide_disk_get_capacity,
787 .setup = ide_disk_setup, 786 .setup = ide_disk_setup,
788 .flush = ide_disk_flush, 787 .flush = ide_disk_flush,
789 .init_media = ide_disk_init_media, 788 .init_media = ide_disk_init_media,
790 .set_doorlock = ide_disk_set_doorlock, 789 .set_doorlock = ide_disk_set_doorlock,
791 .do_request = ide_do_rw_disk, 790 .do_request = ide_do_rw_disk,
792 .ioctl = ide_disk_ioctl, 791 .ioctl = ide_disk_ioctl,
793 }; 792 };
794 793
1 /* 1 /*
2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited. 2 * Copyright (C) 2001, 2002 Sistina Software (UK) Limited.
3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. 3 * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved.
4 * 4 *
5 * This file is released under the GPL. 5 * This file is released under the GPL.
6 */ 6 */
7 7
8 #include "dm.h" 8 #include "dm.h"
9 #include "dm-uevent.h" 9 #include "dm-uevent.h"
10 10
11 #include <linux/init.h> 11 #include <linux/init.h>
12 #include <linux/module.h> 12 #include <linux/module.h>
13 #include <linux/mutex.h> 13 #include <linux/mutex.h>
14 #include <linux/moduleparam.h> 14 #include <linux/moduleparam.h>
15 #include <linux/blkpg.h> 15 #include <linux/blkpg.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/buffer_head.h> 17 #include <linux/buffer_head.h>
18 #include <linux/smp_lock.h> 18 #include <linux/smp_lock.h>
19 #include <linux/mempool.h> 19 #include <linux/mempool.h>
20 #include <linux/slab.h> 20 #include <linux/slab.h>
21 #include <linux/idr.h> 21 #include <linux/idr.h>
22 #include <linux/hdreg.h> 22 #include <linux/hdreg.h>
23 #include <linux/delay.h> 23 #include <linux/delay.h>
24 24
25 #include <trace/events/block.h> 25 #include <trace/events/block.h>
26 26
27 #define DM_MSG_PREFIX "core" 27 #define DM_MSG_PREFIX "core"
28 28
29 /* 29 /*
30 * Cookies are numeric values sent with CHANGE and REMOVE 30 * Cookies are numeric values sent with CHANGE and REMOVE
31 * uevents while resuming, removing or renaming the device. 31 * uevents while resuming, removing or renaming the device.
32 */ 32 */
33 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE" 33 #define DM_COOKIE_ENV_VAR_NAME "DM_COOKIE"
34 #define DM_COOKIE_LENGTH 24 34 #define DM_COOKIE_LENGTH 24
35 35
36 static const char *_name = DM_NAME; 36 static const char *_name = DM_NAME;
37 37
38 static unsigned int major = 0; 38 static unsigned int major = 0;
39 static unsigned int _major = 0; 39 static unsigned int _major = 0;
40 40
41 static DEFINE_SPINLOCK(_minor_lock); 41 static DEFINE_SPINLOCK(_minor_lock);
42 /* 42 /*
43 * For bio-based dm. 43 * For bio-based dm.
44 * One of these is allocated per bio. 44 * One of these is allocated per bio.
45 */ 45 */
46 struct dm_io { 46 struct dm_io {
47 struct mapped_device *md; 47 struct mapped_device *md;
48 int error; 48 int error;
49 atomic_t io_count; 49 atomic_t io_count;
50 struct bio *bio; 50 struct bio *bio;
51 unsigned long start_time; 51 unsigned long start_time;
52 spinlock_t endio_lock; 52 spinlock_t endio_lock;
53 }; 53 };
54 54
55 /* 55 /*
56 * For bio-based dm. 56 * For bio-based dm.
57 * One of these is allocated per target within a bio. Hopefully 57 * One of these is allocated per target within a bio. Hopefully
58 * this will be simplified out one day. 58 * this will be simplified out one day.
59 */ 59 */
60 struct dm_target_io { 60 struct dm_target_io {
61 struct dm_io *io; 61 struct dm_io *io;
62 struct dm_target *ti; 62 struct dm_target *ti;
63 union map_info info; 63 union map_info info;
64 }; 64 };
65 65
66 /* 66 /*
67 * For request-based dm. 67 * For request-based dm.
68 * One of these is allocated per request. 68 * One of these is allocated per request.
69 */ 69 */
70 struct dm_rq_target_io { 70 struct dm_rq_target_io {
71 struct mapped_device *md; 71 struct mapped_device *md;
72 struct dm_target *ti; 72 struct dm_target *ti;
73 struct request *orig, clone; 73 struct request *orig, clone;
74 int error; 74 int error;
75 union map_info info; 75 union map_info info;
76 }; 76 };
77 77
78 /* 78 /*
79 * For request-based dm. 79 * For request-based dm.
80 * One of these is allocated per bio. 80 * One of these is allocated per bio.
81 */ 81 */
82 struct dm_rq_clone_bio_info { 82 struct dm_rq_clone_bio_info {
83 struct bio *orig; 83 struct bio *orig;
84 struct dm_rq_target_io *tio; 84 struct dm_rq_target_io *tio;
85 }; 85 };
86 86
87 union map_info *dm_get_mapinfo(struct bio *bio) 87 union map_info *dm_get_mapinfo(struct bio *bio)
88 { 88 {
89 if (bio && bio->bi_private) 89 if (bio && bio->bi_private)
90 return &((struct dm_target_io *)bio->bi_private)->info; 90 return &((struct dm_target_io *)bio->bi_private)->info;
91 return NULL; 91 return NULL;
92 } 92 }
93 93
94 union map_info *dm_get_rq_mapinfo(struct request *rq) 94 union map_info *dm_get_rq_mapinfo(struct request *rq)
95 { 95 {
96 if (rq && rq->end_io_data) 96 if (rq && rq->end_io_data)
97 return &((struct dm_rq_target_io *)rq->end_io_data)->info; 97 return &((struct dm_rq_target_io *)rq->end_io_data)->info;
98 return NULL; 98 return NULL;
99 } 99 }
100 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); 100 EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
101 101
102 #define MINOR_ALLOCED ((void *)-1) 102 #define MINOR_ALLOCED ((void *)-1)
103 103
104 /* 104 /*
105 * Bits for the md->flags field. 105 * Bits for the md->flags field.
106 */ 106 */
107 #define DMF_BLOCK_IO_FOR_SUSPEND 0 107 #define DMF_BLOCK_IO_FOR_SUSPEND 0
108 #define DMF_SUSPENDED 1 108 #define DMF_SUSPENDED 1
109 #define DMF_FROZEN 2 109 #define DMF_FROZEN 2
110 #define DMF_FREEING 3 110 #define DMF_FREEING 3
111 #define DMF_DELETING 4 111 #define DMF_DELETING 4
112 #define DMF_NOFLUSH_SUSPENDING 5 112 #define DMF_NOFLUSH_SUSPENDING 5
113 #define DMF_QUEUE_IO_TO_THREAD 6 113 #define DMF_QUEUE_IO_TO_THREAD 6
114 114
115 /* 115 /*
116 * Work processed by per-device workqueue. 116 * Work processed by per-device workqueue.
117 */ 117 */
118 struct mapped_device { 118 struct mapped_device {
119 struct rw_semaphore io_lock; 119 struct rw_semaphore io_lock;
120 struct mutex suspend_lock; 120 struct mutex suspend_lock;
121 rwlock_t map_lock; 121 rwlock_t map_lock;
122 atomic_t holders; 122 atomic_t holders;
123 atomic_t open_count; 123 atomic_t open_count;
124 124
125 unsigned long flags; 125 unsigned long flags;
126 126
127 struct request_queue *queue; 127 struct request_queue *queue;
128 unsigned type; 128 unsigned type;
129 /* Protect queue and type against concurrent access. */ 129 /* Protect queue and type against concurrent access. */
130 struct mutex type_lock; 130 struct mutex type_lock;
131 131
132 struct gendisk *disk; 132 struct gendisk *disk;
133 char name[16]; 133 char name[16];
134 134
135 void *interface_ptr; 135 void *interface_ptr;
136 136
137 /* 137 /*
138 * A list of ios that arrived while we were suspended. 138 * A list of ios that arrived while we were suspended.
139 */ 139 */
140 atomic_t pending[2]; 140 atomic_t pending[2];
141 wait_queue_head_t wait; 141 wait_queue_head_t wait;
142 struct work_struct work; 142 struct work_struct work;
143 struct bio_list deferred; 143 struct bio_list deferred;
144 spinlock_t deferred_lock; 144 spinlock_t deferred_lock;
145 145
146 /* 146 /*
147 * An error from the barrier request currently being processed. 147 * An error from the barrier request currently being processed.
148 */ 148 */
149 int barrier_error; 149 int barrier_error;
150 150
151 /* 151 /*
152 * Protect barrier_error from concurrent endio processing 152 * Protect barrier_error from concurrent endio processing
153 * in request-based dm. 153 * in request-based dm.
154 */ 154 */
155 spinlock_t barrier_error_lock; 155 spinlock_t barrier_error_lock;
156 156
157 /* 157 /*
158 * Processing queue (flush/barriers) 158 * Processing queue (flush/barriers)
159 */ 159 */
160 struct workqueue_struct *wq; 160 struct workqueue_struct *wq;
161 struct work_struct barrier_work; 161 struct work_struct barrier_work;
162 162
163 /* A pointer to the currently processing pre/post flush request */ 163 /* A pointer to the currently processing pre/post flush request */
164 struct request *flush_request; 164 struct request *flush_request;
165 165
166 /* 166 /*
167 * The current mapping. 167 * The current mapping.
168 */ 168 */
169 struct dm_table *map; 169 struct dm_table *map;
170 170
171 /* 171 /*
172 * io objects are allocated from here. 172 * io objects are allocated from here.
173 */ 173 */
174 mempool_t *io_pool; 174 mempool_t *io_pool;
175 mempool_t *tio_pool; 175 mempool_t *tio_pool;
176 176
177 struct bio_set *bs; 177 struct bio_set *bs;
178 178
179 /* 179 /*
180 * Event handling. 180 * Event handling.
181 */ 181 */
182 atomic_t event_nr; 182 atomic_t event_nr;
183 wait_queue_head_t eventq; 183 wait_queue_head_t eventq;
184 atomic_t uevent_seq; 184 atomic_t uevent_seq;
185 struct list_head uevent_list; 185 struct list_head uevent_list;
186 spinlock_t uevent_lock; /* Protect access to uevent_list */ 186 spinlock_t uevent_lock; /* Protect access to uevent_list */
187 187
188 /* 188 /*
189 * freeze/thaw support requires holding onto a super block 189 * freeze/thaw support requires holding onto a super block
190 */ 190 */
191 struct super_block *frozen_sb; 191 struct super_block *frozen_sb;
192 struct block_device *bdev; 192 struct block_device *bdev;
193 193
194 /* forced geometry settings */ 194 /* forced geometry settings */
195 struct hd_geometry geometry; 195 struct hd_geometry geometry;
196 196
197 /* For saving the address of __make_request for request based dm */ 197 /* For saving the address of __make_request for request based dm */
198 make_request_fn *saved_make_request_fn; 198 make_request_fn *saved_make_request_fn;
199 199
200 /* sysfs handle */ 200 /* sysfs handle */
201 struct kobject kobj; 201 struct kobject kobj;
202 202
203 /* zero-length barrier that will be cloned and submitted to targets */ 203 /* zero-length barrier that will be cloned and submitted to targets */
204 struct bio barrier_bio; 204 struct bio barrier_bio;
205 }; 205 };
206 206
207 /* 207 /*
208 * For mempool pre-allocation at table loading time. 208 * For mempool pre-allocation at table loading time.
209 */ 209 */
210 struct dm_md_mempools { 210 struct dm_md_mempools {
211 mempool_t *io_pool; 211 mempool_t *io_pool;
212 mempool_t *tio_pool; 212 mempool_t *tio_pool;
213 struct bio_set *bs; 213 struct bio_set *bs;
214 }; 214 };
215 215
216 #define MIN_IOS 256 216 #define MIN_IOS 256
217 static struct kmem_cache *_io_cache; 217 static struct kmem_cache *_io_cache;
218 static struct kmem_cache *_tio_cache; 218 static struct kmem_cache *_tio_cache;
219 static struct kmem_cache *_rq_tio_cache; 219 static struct kmem_cache *_rq_tio_cache;
220 static struct kmem_cache *_rq_bio_info_cache; 220 static struct kmem_cache *_rq_bio_info_cache;
221 221
222 static int __init local_init(void) 222 static int __init local_init(void)
223 { 223 {
224 int r = -ENOMEM; 224 int r = -ENOMEM;
225 225
226 /* allocate a slab for the dm_ios */ 226 /* allocate a slab for the dm_ios */
227 _io_cache = KMEM_CACHE(dm_io, 0); 227 _io_cache = KMEM_CACHE(dm_io, 0);
228 if (!_io_cache) 228 if (!_io_cache)
229 return r; 229 return r;
230 230
231 /* allocate a slab for the target ios */ 231 /* allocate a slab for the target ios */
232 _tio_cache = KMEM_CACHE(dm_target_io, 0); 232 _tio_cache = KMEM_CACHE(dm_target_io, 0);
233 if (!_tio_cache) 233 if (!_tio_cache)
234 goto out_free_io_cache; 234 goto out_free_io_cache;
235 235
236 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0); 236 _rq_tio_cache = KMEM_CACHE(dm_rq_target_io, 0);
237 if (!_rq_tio_cache) 237 if (!_rq_tio_cache)
238 goto out_free_tio_cache; 238 goto out_free_tio_cache;
239 239
240 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0); 240 _rq_bio_info_cache = KMEM_CACHE(dm_rq_clone_bio_info, 0);
241 if (!_rq_bio_info_cache) 241 if (!_rq_bio_info_cache)
242 goto out_free_rq_tio_cache; 242 goto out_free_rq_tio_cache;
243 243
244 r = dm_uevent_init(); 244 r = dm_uevent_init();
245 if (r) 245 if (r)
246 goto out_free_rq_bio_info_cache; 246 goto out_free_rq_bio_info_cache;
247 247
248 _major = major; 248 _major = major;
249 r = register_blkdev(_major, _name); 249 r = register_blkdev(_major, _name);
250 if (r < 0) 250 if (r < 0)
251 goto out_uevent_exit; 251 goto out_uevent_exit;
252 252
253 if (!_major) 253 if (!_major)
254 _major = r; 254 _major = r;
255 255
256 return 0; 256 return 0;
257 257
258 out_uevent_exit: 258 out_uevent_exit:
259 dm_uevent_exit(); 259 dm_uevent_exit();
260 out_free_rq_bio_info_cache: 260 out_free_rq_bio_info_cache:
261 kmem_cache_destroy(_rq_bio_info_cache); 261 kmem_cache_destroy(_rq_bio_info_cache);
262 out_free_rq_tio_cache: 262 out_free_rq_tio_cache:
263 kmem_cache_destroy(_rq_tio_cache); 263 kmem_cache_destroy(_rq_tio_cache);
264 out_free_tio_cache: 264 out_free_tio_cache:
265 kmem_cache_destroy(_tio_cache); 265 kmem_cache_destroy(_tio_cache);
266 out_free_io_cache: 266 out_free_io_cache:
267 kmem_cache_destroy(_io_cache); 267 kmem_cache_destroy(_io_cache);
268 268
269 return r; 269 return r;
270 } 270 }
271 271
272 static void local_exit(void) 272 static void local_exit(void)
273 { 273 {
274 kmem_cache_destroy(_rq_bio_info_cache); 274 kmem_cache_destroy(_rq_bio_info_cache);
275 kmem_cache_destroy(_rq_tio_cache); 275 kmem_cache_destroy(_rq_tio_cache);
276 kmem_cache_destroy(_tio_cache); 276 kmem_cache_destroy(_tio_cache);
277 kmem_cache_destroy(_io_cache); 277 kmem_cache_destroy(_io_cache);
278 unregister_blkdev(_major, _name); 278 unregister_blkdev(_major, _name);
279 dm_uevent_exit(); 279 dm_uevent_exit();
280 280
281 _major = 0; 281 _major = 0;
282 282
283 DMINFO("cleaned up"); 283 DMINFO("cleaned up");
284 } 284 }
285 285
286 static int (*_inits[])(void) __initdata = { 286 static int (*_inits[])(void) __initdata = {
287 local_init, 287 local_init,
288 dm_target_init, 288 dm_target_init,
289 dm_linear_init, 289 dm_linear_init,
290 dm_stripe_init, 290 dm_stripe_init,
291 dm_io_init, 291 dm_io_init,
292 dm_kcopyd_init, 292 dm_kcopyd_init,
293 dm_interface_init, 293 dm_interface_init,
294 }; 294 };
295 295
296 static void (*_exits[])(void) = { 296 static void (*_exits[])(void) = {
297 local_exit, 297 local_exit,
298 dm_target_exit, 298 dm_target_exit,
299 dm_linear_exit, 299 dm_linear_exit,
300 dm_stripe_exit, 300 dm_stripe_exit,
301 dm_io_exit, 301 dm_io_exit,
302 dm_kcopyd_exit, 302 dm_kcopyd_exit,
303 dm_interface_exit, 303 dm_interface_exit,
304 }; 304 };
305 305
306 static int __init dm_init(void) 306 static int __init dm_init(void)
307 { 307 {
308 const int count = ARRAY_SIZE(_inits); 308 const int count = ARRAY_SIZE(_inits);
309 309
310 int r, i; 310 int r, i;
311 311
312 for (i = 0; i < count; i++) { 312 for (i = 0; i < count; i++) {
313 r = _inits[i](); 313 r = _inits[i]();
314 if (r) 314 if (r)
315 goto bad; 315 goto bad;
316 } 316 }
317 317
318 return 0; 318 return 0;
319 319
320 bad: 320 bad:
321 while (i--) 321 while (i--)
322 _exits[i](); 322 _exits[i]();
323 323
324 return r; 324 return r;
325 } 325 }
326 326
327 static void __exit dm_exit(void) 327 static void __exit dm_exit(void)
328 { 328 {
329 int i = ARRAY_SIZE(_exits); 329 int i = ARRAY_SIZE(_exits);
330 330
331 while (i--) 331 while (i--)
332 _exits[i](); 332 _exits[i]();
333 } 333 }
334 334
335 /* 335 /*
336 * Block device functions 336 * Block device functions
337 */ 337 */
338 int dm_deleting_md(struct mapped_device *md) 338 int dm_deleting_md(struct mapped_device *md)
339 { 339 {
340 return test_bit(DMF_DELETING, &md->flags); 340 return test_bit(DMF_DELETING, &md->flags);
341 } 341 }
342 342
343 static int dm_blk_open(struct block_device *bdev, fmode_t mode) 343 static int dm_blk_open(struct block_device *bdev, fmode_t mode)
344 { 344 {
345 struct mapped_device *md; 345 struct mapped_device *md;
346 346
347 lock_kernel(); 347 lock_kernel();
348 spin_lock(&_minor_lock); 348 spin_lock(&_minor_lock);
349 349
350 md = bdev->bd_disk->private_data; 350 md = bdev->bd_disk->private_data;
351 if (!md) 351 if (!md)
352 goto out; 352 goto out;
353 353
354 if (test_bit(DMF_FREEING, &md->flags) || 354 if (test_bit(DMF_FREEING, &md->flags) ||
355 dm_deleting_md(md)) { 355 dm_deleting_md(md)) {
356 md = NULL; 356 md = NULL;
357 goto out; 357 goto out;
358 } 358 }
359 359
360 dm_get(md); 360 dm_get(md);
361 atomic_inc(&md->open_count); 361 atomic_inc(&md->open_count);
362 362
363 out: 363 out:
364 spin_unlock(&_minor_lock); 364 spin_unlock(&_minor_lock);
365 unlock_kernel(); 365 unlock_kernel();
366 366
367 return md ? 0 : -ENXIO; 367 return md ? 0 : -ENXIO;
368 } 368 }
369 369
370 static int dm_blk_close(struct gendisk *disk, fmode_t mode) 370 static int dm_blk_close(struct gendisk *disk, fmode_t mode)
371 { 371 {
372 struct mapped_device *md = disk->private_data; 372 struct mapped_device *md = disk->private_data;
373 373
374 lock_kernel(); 374 lock_kernel();
375 atomic_dec(&md->open_count); 375 atomic_dec(&md->open_count);
376 dm_put(md); 376 dm_put(md);
377 unlock_kernel(); 377 unlock_kernel();
378 378
379 return 0; 379 return 0;
380 } 380 }
381 381
382 int dm_open_count(struct mapped_device *md) 382 int dm_open_count(struct mapped_device *md)
383 { 383 {
384 return atomic_read(&md->open_count); 384 return atomic_read(&md->open_count);
385 } 385 }
386 386
387 /* 387 /*
388 * Guarantees nothing is using the device before it's deleted. 388 * Guarantees nothing is using the device before it's deleted.
389 */ 389 */
390 int dm_lock_for_deletion(struct mapped_device *md) 390 int dm_lock_for_deletion(struct mapped_device *md)
391 { 391 {
392 int r = 0; 392 int r = 0;
393 393
394 spin_lock(&_minor_lock); 394 spin_lock(&_minor_lock);
395 395
396 if (dm_open_count(md)) 396 if (dm_open_count(md))
397 r = -EBUSY; 397 r = -EBUSY;
398 else 398 else
399 set_bit(DMF_DELETING, &md->flags); 399 set_bit(DMF_DELETING, &md->flags);
400 400
401 spin_unlock(&_minor_lock); 401 spin_unlock(&_minor_lock);
402 402
403 return r; 403 return r;
404 } 404 }
405 405
406 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) 406 static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
407 { 407 {
408 struct mapped_device *md = bdev->bd_disk->private_data; 408 struct mapped_device *md = bdev->bd_disk->private_data;
409 409
410 return dm_get_geometry(md, geo); 410 return dm_get_geometry(md, geo);
411 } 411 }
412 412
413 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, 413 static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
414 unsigned int cmd, unsigned long arg) 414 unsigned int cmd, unsigned long arg)
415 { 415 {
416 struct mapped_device *md = bdev->bd_disk->private_data; 416 struct mapped_device *md = bdev->bd_disk->private_data;
417 struct dm_table *map = dm_get_live_table(md); 417 struct dm_table *map = dm_get_live_table(md);
418 struct dm_target *tgt; 418 struct dm_target *tgt;
419 int r = -ENOTTY; 419 int r = -ENOTTY;
420 420
421 if (!map || !dm_table_get_size(map)) 421 if (!map || !dm_table_get_size(map))
422 goto out; 422 goto out;
423 423
424 /* We only support devices that have a single target */ 424 /* We only support devices that have a single target */
425 if (dm_table_get_num_targets(map) != 1) 425 if (dm_table_get_num_targets(map) != 1)
426 goto out; 426 goto out;
427 427
428 tgt = dm_table_get_target(map, 0); 428 tgt = dm_table_get_target(map, 0);
429 429
430 if (dm_suspended_md(md)) { 430 if (dm_suspended_md(md)) {
431 r = -EAGAIN; 431 r = -EAGAIN;
432 goto out; 432 goto out;
433 } 433 }
434 434
435 if (tgt->type->ioctl) 435 if (tgt->type->ioctl)
436 r = tgt->type->ioctl(tgt, cmd, arg); 436 r = tgt->type->ioctl(tgt, cmd, arg);
437 437
438 out: 438 out:
439 dm_table_put(map); 439 dm_table_put(map);
440 440
441 return r; 441 return r;
442 } 442 }
443 443
444 static struct dm_io *alloc_io(struct mapped_device *md) 444 static struct dm_io *alloc_io(struct mapped_device *md)
445 { 445 {
446 return mempool_alloc(md->io_pool, GFP_NOIO); 446 return mempool_alloc(md->io_pool, GFP_NOIO);
447 } 447 }
448 448
449 static void free_io(struct mapped_device *md, struct dm_io *io) 449 static void free_io(struct mapped_device *md, struct dm_io *io)
450 { 450 {
451 mempool_free(io, md->io_pool); 451 mempool_free(io, md->io_pool);
452 } 452 }
453 453
454 static void free_tio(struct mapped_device *md, struct dm_target_io *tio) 454 static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
455 { 455 {
456 mempool_free(tio, md->tio_pool); 456 mempool_free(tio, md->tio_pool);
457 } 457 }
458 458
459 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, 459 static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
460 gfp_t gfp_mask) 460 gfp_t gfp_mask)
461 { 461 {
462 return mempool_alloc(md->tio_pool, gfp_mask); 462 return mempool_alloc(md->tio_pool, gfp_mask);
463 } 463 }
464 464
465 static void free_rq_tio(struct dm_rq_target_io *tio) 465 static void free_rq_tio(struct dm_rq_target_io *tio)
466 { 466 {
467 mempool_free(tio, tio->md->tio_pool); 467 mempool_free(tio, tio->md->tio_pool);
468 } 468 }
469 469
470 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md) 470 static struct dm_rq_clone_bio_info *alloc_bio_info(struct mapped_device *md)
471 { 471 {
472 return mempool_alloc(md->io_pool, GFP_ATOMIC); 472 return mempool_alloc(md->io_pool, GFP_ATOMIC);
473 } 473 }
474 474
475 static void free_bio_info(struct dm_rq_clone_bio_info *info) 475 static void free_bio_info(struct dm_rq_clone_bio_info *info)
476 { 476 {
477 mempool_free(info, info->tio->md->io_pool); 477 mempool_free(info, info->tio->md->io_pool);
478 } 478 }
479 479
480 static int md_in_flight(struct mapped_device *md) 480 static int md_in_flight(struct mapped_device *md)
481 { 481 {
482 return atomic_read(&md->pending[READ]) + 482 return atomic_read(&md->pending[READ]) +
483 atomic_read(&md->pending[WRITE]); 483 atomic_read(&md->pending[WRITE]);
484 } 484 }
485 485
486 static void start_io_acct(struct dm_io *io) 486 static void start_io_acct(struct dm_io *io)
487 { 487 {
488 struct mapped_device *md = io->md; 488 struct mapped_device *md = io->md;
489 int cpu; 489 int cpu;
490 int rw = bio_data_dir(io->bio); 490 int rw = bio_data_dir(io->bio);
491 491
492 io->start_time = jiffies; 492 io->start_time = jiffies;
493 493
494 cpu = part_stat_lock(); 494 cpu = part_stat_lock();
495 part_round_stats(cpu, &dm_disk(md)->part0); 495 part_round_stats(cpu, &dm_disk(md)->part0);
496 part_stat_unlock(); 496 part_stat_unlock();
497 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]); 497 dm_disk(md)->part0.in_flight[rw] = atomic_inc_return(&md->pending[rw]);
498 } 498 }
499 499
500 static void end_io_acct(struct dm_io *io) 500 static void end_io_acct(struct dm_io *io)
501 { 501 {
502 struct mapped_device *md = io->md; 502 struct mapped_device *md = io->md;
503 struct bio *bio = io->bio; 503 struct bio *bio = io->bio;
504 unsigned long duration = jiffies - io->start_time; 504 unsigned long duration = jiffies - io->start_time;
505 int pending, cpu; 505 int pending, cpu;
506 int rw = bio_data_dir(bio); 506 int rw = bio_data_dir(bio);
507 507
508 cpu = part_stat_lock(); 508 cpu = part_stat_lock();
509 part_round_stats(cpu, &dm_disk(md)->part0); 509 part_round_stats(cpu, &dm_disk(md)->part0);
510 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration); 510 part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
511 part_stat_unlock(); 511 part_stat_unlock();
512 512
513 /* 513 /*
514 * After this is decremented the bio must not be touched if it is 514 * After this is decremented the bio must not be touched if it is
515 * a barrier. 515 * a barrier.
516 */ 516 */
517 dm_disk(md)->part0.in_flight[rw] = pending = 517 dm_disk(md)->part0.in_flight[rw] = pending =
518 atomic_dec_return(&md->pending[rw]); 518 atomic_dec_return(&md->pending[rw]);
519 pending += atomic_read(&md->pending[rw^0x1]); 519 pending += atomic_read(&md->pending[rw^0x1]);
520 520
521 /* nudge anyone waiting on suspend queue */ 521 /* nudge anyone waiting on suspend queue */
522 if (!pending) 522 if (!pending)
523 wake_up(&md->wait); 523 wake_up(&md->wait);
524 } 524 }
525 525
526 /* 526 /*
527 * Add the bio to the list of deferred io. 527 * Add the bio to the list of deferred io.
528 */ 528 */
529 static void queue_io(struct mapped_device *md, struct bio *bio) 529 static void queue_io(struct mapped_device *md, struct bio *bio)
530 { 530 {
531 down_write(&md->io_lock); 531 down_write(&md->io_lock);
532 532
533 spin_lock_irq(&md->deferred_lock); 533 spin_lock_irq(&md->deferred_lock);
534 bio_list_add(&md->deferred, bio); 534 bio_list_add(&md->deferred, bio);
535 spin_unlock_irq(&md->deferred_lock); 535 spin_unlock_irq(&md->deferred_lock);
536 536
537 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) 537 if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
538 queue_work(md->wq, &md->work); 538 queue_work(md->wq, &md->work);
539 539
540 up_write(&md->io_lock); 540 up_write(&md->io_lock);
541 } 541 }
542 542
543 /* 543 /*
544 * Everyone (including functions in this file) should use this 544 * Everyone (including functions in this file) should use this
545 * function to access the md->map field, and make sure they call 545 * function to access the md->map field, and make sure they call
546 * dm_table_put() when finished. 546 * dm_table_put() when finished.
547 */ 547 */
548 struct dm_table *dm_get_live_table(struct mapped_device *md) 548 struct dm_table *dm_get_live_table(struct mapped_device *md)
549 { 549 {
550 struct dm_table *t; 550 struct dm_table *t;
551 unsigned long flags; 551 unsigned long flags;
552 552
553 read_lock_irqsave(&md->map_lock, flags); 553 read_lock_irqsave(&md->map_lock, flags);
554 t = md->map; 554 t = md->map;
555 if (t) 555 if (t)
556 dm_table_get(t); 556 dm_table_get(t);
557 read_unlock_irqrestore(&md->map_lock, flags); 557 read_unlock_irqrestore(&md->map_lock, flags);
558 558
559 return t; 559 return t;
560 } 560 }
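A minimal sketch of the access pattern requested in the comment above dm_get_live_table(); the caller below is hypothetical and not part of the patch, with dm_table_get_size() standing in for any read of the table:

	static sector_t example_live_table_size(struct mapped_device *md)
	{
		struct dm_table *map = dm_get_live_table(md);	/* takes a reference on the live table */
		sector_t size = 0;

		if (map) {
			size = dm_table_get_size(map);		/* safe to use while the reference is held */
			dm_table_put(map);			/* drop the reference when finished */
		}

		return size;
	}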
561 561
562 /* 562 /*
563 * Get the geometry associated with a dm device 563 * Get the geometry associated with a dm device
564 */ 564 */
565 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo) 565 int dm_get_geometry(struct mapped_device *md, struct hd_geometry *geo)
566 { 566 {
567 *geo = md->geometry; 567 *geo = md->geometry;
568 568
569 return 0; 569 return 0;
570 } 570 }
571 571
572 /* 572 /*
573 * Set the geometry of a device. 573 * Set the geometry of a device.
574 */ 574 */
575 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo) 575 int dm_set_geometry(struct mapped_device *md, struct hd_geometry *geo)
576 { 576 {
577 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors; 577 sector_t sz = (sector_t)geo->cylinders * geo->heads * geo->sectors;
578 578
579 if (geo->start > sz) { 579 if (geo->start > sz) {
580 DMWARN("Start sector is beyond the geometry limits."); 580 DMWARN("Start sector is beyond the geometry limits.");
581 return -EINVAL; 581 return -EINVAL;
582 } 582 }
583 583
584 md->geometry = *geo; 584 md->geometry = *geo;
585 585
586 return 0; 586 return 0;
587 } 587 }
588 588
589 /*----------------------------------------------------------------- 589 /*-----------------------------------------------------------------
590 * CRUD START: 590 * CRUD START:
591 * A more elegant soln is in the works that uses the queue 591 * A more elegant soln is in the works that uses the queue
592 * merge fn, unfortunately there are a couple of changes to 592 * merge fn, unfortunately there are a couple of changes to
593 * the block layer that I want to make for this. So in the 593 * the block layer that I want to make for this. So in the
594 * interests of getting something for people to use I give 594 * interests of getting something for people to use I give
595 * you this clearly demarcated crap. 595 * you this clearly demarcated crap.
596 *---------------------------------------------------------------*/ 596 *---------------------------------------------------------------*/
597 597
598 static int __noflush_suspending(struct mapped_device *md) 598 static int __noflush_suspending(struct mapped_device *md)
599 { 599 {
600 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 600 return test_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
601 } 601 }
602 602
603 /* 603 /*
604 * Decrements the number of outstanding ios that a bio has been 604 * Decrements the number of outstanding ios that a bio has been
605 * cloned into, completing the original io if necessary. 605 * cloned into, completing the original io if necessary.
606 */ 606 */
607 static void dec_pending(struct dm_io *io, int error) 607 static void dec_pending(struct dm_io *io, int error)
608 { 608 {
609 unsigned long flags; 609 unsigned long flags;
610 int io_error; 610 int io_error;
611 struct bio *bio; 611 struct bio *bio;
612 struct mapped_device *md = io->md; 612 struct mapped_device *md = io->md;
613 613
614 /* Push-back supersedes any I/O errors */ 614 /* Push-back supersedes any I/O errors */
615 if (unlikely(error)) { 615 if (unlikely(error)) {
616 spin_lock_irqsave(&io->endio_lock, flags); 616 spin_lock_irqsave(&io->endio_lock, flags);
617 if (!(io->error > 0 && __noflush_suspending(md))) 617 if (!(io->error > 0 && __noflush_suspending(md)))
618 io->error = error; 618 io->error = error;
619 spin_unlock_irqrestore(&io->endio_lock, flags); 619 spin_unlock_irqrestore(&io->endio_lock, flags);
620 } 620 }
621 621
622 if (atomic_dec_and_test(&io->io_count)) { 622 if (atomic_dec_and_test(&io->io_count)) {
623 if (io->error == DM_ENDIO_REQUEUE) { 623 if (io->error == DM_ENDIO_REQUEUE) {
624 /* 624 /*
625 * Target requested pushing back the I/O. 625 * Target requested pushing back the I/O.
626 */ 626 */
627 spin_lock_irqsave(&md->deferred_lock, flags); 627 spin_lock_irqsave(&md->deferred_lock, flags);
628 if (__noflush_suspending(md)) { 628 if (__noflush_suspending(md)) {
629 if (!(io->bio->bi_rw & REQ_HARDBARRIER)) 629 if (!(io->bio->bi_rw & REQ_HARDBARRIER))
630 bio_list_add_head(&md->deferred, 630 bio_list_add_head(&md->deferred,
631 io->bio); 631 io->bio);
632 } else 632 } else
633 /* noflush suspend was interrupted. */ 633 /* noflush suspend was interrupted. */
634 io->error = -EIO; 634 io->error = -EIO;
635 spin_unlock_irqrestore(&md->deferred_lock, flags); 635 spin_unlock_irqrestore(&md->deferred_lock, flags);
636 } 636 }
637 637
638 io_error = io->error; 638 io_error = io->error;
639 bio = io->bio; 639 bio = io->bio;
640 640
641 if (bio->bi_rw & REQ_HARDBARRIER) { 641 if (bio->bi_rw & REQ_HARDBARRIER) {
642 /* 642 /*
643 * There can be just one barrier request so we use 643 * There can be just one barrier request so we use
644 * a per-device variable for error reporting. 644 * a per-device variable for error reporting.
645 * Note that you can't touch the bio after end_io_acct 645 * Note that you can't touch the bio after end_io_acct
646 * 646 *
647 * We ignore -EOPNOTSUPP for empty flush reported by 647 * We ignore -EOPNOTSUPP for empty flush reported by
648 * underlying devices. We assume that if the device 648 * underlying devices. We assume that if the device
649 * doesn't support empty barriers, it doesn't need 649 * doesn't support empty barriers, it doesn't need
650 * cache flushing commands. 650 * cache flushing commands.
651 */ 651 */
652 if (!md->barrier_error && 652 if (!md->barrier_error &&
653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP)) 653 !(bio_empty_barrier(bio) && io_error == -EOPNOTSUPP))
654 md->barrier_error = io_error; 654 md->barrier_error = io_error;
655 end_io_acct(io); 655 end_io_acct(io);
656 free_io(md, io); 656 free_io(md, io);
657 } else { 657 } else {
658 end_io_acct(io); 658 end_io_acct(io);
659 free_io(md, io); 659 free_io(md, io);
660 660
661 if (io_error != DM_ENDIO_REQUEUE) { 661 if (io_error != DM_ENDIO_REQUEUE) {
662 trace_block_bio_complete(md->queue, bio); 662 trace_block_bio_complete(md->queue, bio);
663 663
664 bio_endio(bio, io_error); 664 bio_endio(bio, io_error);
665 } 665 }
666 } 666 }
667 } 667 }
668 } 668 }
669 669
670 static void clone_endio(struct bio *bio, int error) 670 static void clone_endio(struct bio *bio, int error)
671 { 671 {
672 int r = 0; 672 int r = 0;
673 struct dm_target_io *tio = bio->bi_private; 673 struct dm_target_io *tio = bio->bi_private;
674 struct dm_io *io = tio->io; 674 struct dm_io *io = tio->io;
675 struct mapped_device *md = tio->io->md; 675 struct mapped_device *md = tio->io->md;
676 dm_endio_fn endio = tio->ti->type->end_io; 676 dm_endio_fn endio = tio->ti->type->end_io;
677 677
678 if (!bio_flagged(bio, BIO_UPTODATE) && !error) 678 if (!bio_flagged(bio, BIO_UPTODATE) && !error)
679 error = -EIO; 679 error = -EIO;
680 680
681 if (endio) { 681 if (endio) {
682 r = endio(tio->ti, bio, error, &tio->info); 682 r = endio(tio->ti, bio, error, &tio->info);
683 if (r < 0 || r == DM_ENDIO_REQUEUE) 683 if (r < 0 || r == DM_ENDIO_REQUEUE)
684 /* 684 /*
685 * error and requeue request are handled 685 * error and requeue request are handled
686 * in dec_pending(). 686 * in dec_pending().
687 */ 687 */
688 error = r; 688 error = r;
689 else if (r == DM_ENDIO_INCOMPLETE) 689 else if (r == DM_ENDIO_INCOMPLETE)
690 /* The target will handle the io */ 690 /* The target will handle the io */
691 return; 691 return;
692 else if (r) { 692 else if (r) {
693 DMWARN("unimplemented target endio return value: %d", r); 693 DMWARN("unimplemented target endio return value: %d", r);
694 BUG(); 694 BUG();
695 } 695 }
696 } 696 }
697 697
698 /* 698 /*
699 * Store md for cleanup instead of tio which is about to get freed. 699 * Store md for cleanup instead of tio which is about to get freed.
700 */ 700 */
701 bio->bi_private = md->bs; 701 bio->bi_private = md->bs;
702 702
703 free_tio(md, tio); 703 free_tio(md, tio);
704 bio_put(bio); 704 bio_put(bio);
705 dec_pending(io, error); 705 dec_pending(io, error);
706 } 706 }
707 707
708 /* 708 /*
709 * Partial completion handling for request-based dm 709 * Partial completion handling for request-based dm
710 */ 710 */
711 static void end_clone_bio(struct bio *clone, int error) 711 static void end_clone_bio(struct bio *clone, int error)
712 { 712 {
713 struct dm_rq_clone_bio_info *info = clone->bi_private; 713 struct dm_rq_clone_bio_info *info = clone->bi_private;
714 struct dm_rq_target_io *tio = info->tio; 714 struct dm_rq_target_io *tio = info->tio;
715 struct bio *bio = info->orig; 715 struct bio *bio = info->orig;
716 unsigned int nr_bytes = info->orig->bi_size; 716 unsigned int nr_bytes = info->orig->bi_size;
717 717
718 bio_put(clone); 718 bio_put(clone);
719 719
720 if (tio->error) 720 if (tio->error)
721 /* 721 /*
722 * An error has already been detected on the request. 722 * An error has already been detected on the request.
723 * Once an error has occurred, just let clone->end_io() handle 723 * Once an error has occurred, just let clone->end_io() handle
724 * the remainder. 724 * the remainder.
725 */ 725 */
726 return; 726 return;
727 else if (error) { 727 else if (error) {
728 /* 728 /*
729 * Don't report the error to the upper layer yet. 729 * Don't report the error to the upper layer yet.
730 * The error handling decision is made by the target driver 730 * The error handling decision is made by the target driver
731 * when the request is completed. 731 * when the request is completed.
732 */ 732 */
733 tio->error = error; 733 tio->error = error;
734 return; 734 return;
735 } 735 }
736 736
737 /* 737 /*
738 * I/O for the bio successfully completed. 738 * I/O for the bio successfully completed.
739 * Report the data completion to the upper layer. 739 * Report the data completion to the upper layer.
740 */ 740 */
741 741
742 /* 742 /*
743 * bios are processed from the head of the list. 743 * bios are processed from the head of the list.
744 * So the completing bio should always be rq->bio. 744 * So the completing bio should always be rq->bio.
745 * If it's not, something is wrong. 745 * If it's not, something is wrong.
746 */ 746 */
747 if (tio->orig->bio != bio) 747 if (tio->orig->bio != bio)
748 DMERR("bio completion is going in the middle of the request"); 748 DMERR("bio completion is going in the middle of the request");
749 749
750 /* 750 /*
751 * Update the original request. 751 * Update the original request.
752 * Do not use blk_end_request() here, because it may complete 752 * Do not use blk_end_request() here, because it may complete
753 * the original request before the clone, and break the ordering. 753 * the original request before the clone, and break the ordering.
754 */ 754 */
755 blk_update_request(tio->orig, 0, nr_bytes); 755 blk_update_request(tio->orig, 0, nr_bytes);
756 } 756 }
757 757
758 static void store_barrier_error(struct mapped_device *md, int error) 758 static void store_barrier_error(struct mapped_device *md, int error)
759 { 759 {
760 unsigned long flags; 760 unsigned long flags;
761 761
762 spin_lock_irqsave(&md->barrier_error_lock, flags); 762 spin_lock_irqsave(&md->barrier_error_lock, flags);
763 /* 763 /*
764 * Basically, the first error is taken, but: 764 * Basically, the first error is taken, but:
765 * -EOPNOTSUPP supersedes any I/O error. 765 * -EOPNOTSUPP supersedes any I/O error.
766 * Requeue request supersedes any I/O error but -EOPNOTSUPP. 766 * Requeue request supersedes any I/O error but -EOPNOTSUPP.
767 */ 767 */
768 if (!md->barrier_error || error == -EOPNOTSUPP || 768 if (!md->barrier_error || error == -EOPNOTSUPP ||
769 (md->barrier_error != -EOPNOTSUPP && 769 (md->barrier_error != -EOPNOTSUPP &&
770 error == DM_ENDIO_REQUEUE)) 770 error == DM_ENDIO_REQUEUE))
771 md->barrier_error = error; 771 md->barrier_error = error;
772 spin_unlock_irqrestore(&md->barrier_error_lock, flags); 772 spin_unlock_irqrestore(&md->barrier_error_lock, flags);
773 } 773 }
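A worked trace of the precedence described in the comment above; the call sequence is illustrative only:

	store_barrier_error(md, -EIO);			/* first error recorded: barrier_error = -EIO */
	store_barrier_error(md, DM_ENDIO_REQUEUE);	/* requeue supersedes a plain I/O error */
	store_barrier_error(md, -EOPNOTSUPP);		/* -EOPNOTSUPP supersedes everything */

Once -EOPNOTSUPP has been stored, neither a later requeue nor a later I/O error replaces it.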
774 774
775 /* 775 /*
776 * Don't touch any member of the md after calling this function because 776 * Don't touch any member of the md after calling this function because
777 * the md may be freed in dm_put() at the end of this function. 777 * the md may be freed in dm_put() at the end of this function.
778 * Or do dm_get() before calling this function and dm_put() later. 778 * Or do dm_get() before calling this function and dm_put() later.
779 */ 779 */
780 static void rq_completed(struct mapped_device *md, int rw, int run_queue) 780 static void rq_completed(struct mapped_device *md, int rw, int run_queue)
781 { 781 {
782 atomic_dec(&md->pending[rw]); 782 atomic_dec(&md->pending[rw]);
783 783
784 /* nudge anyone waiting on suspend queue */ 784 /* nudge anyone waiting on suspend queue */
785 if (!md_in_flight(md)) 785 if (!md_in_flight(md))
786 wake_up(&md->wait); 786 wake_up(&md->wait);
787 787
788 if (run_queue) 788 if (run_queue)
789 blk_run_queue(md->queue); 789 blk_run_queue(md->queue);
790 790
791 /* 791 /*
792 * dm_put() must be at the end of this function. See the comment above 792 * dm_put() must be at the end of this function. See the comment above
793 */ 793 */
794 dm_put(md); 794 dm_put(md);
795 } 795 }
796 796
797 static void free_rq_clone(struct request *clone) 797 static void free_rq_clone(struct request *clone)
798 { 798 {
799 struct dm_rq_target_io *tio = clone->end_io_data; 799 struct dm_rq_target_io *tio = clone->end_io_data;
800 800
801 blk_rq_unprep_clone(clone); 801 blk_rq_unprep_clone(clone);
802 free_rq_tio(tio); 802 free_rq_tio(tio);
803 } 803 }
804 804
805 /* 805 /*
806 * Complete the clone and the original request. 806 * Complete the clone and the original request.
807 * Must be called without queue lock. 807 * Must be called without queue lock.
808 */ 808 */
809 static void dm_end_request(struct request *clone, int error) 809 static void dm_end_request(struct request *clone, int error)
810 { 810 {
811 int rw = rq_data_dir(clone); 811 int rw = rq_data_dir(clone);
812 int run_queue = 1; 812 int run_queue = 1;
813 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER; 813 bool is_barrier = clone->cmd_flags & REQ_HARDBARRIER;
814 struct dm_rq_target_io *tio = clone->end_io_data; 814 struct dm_rq_target_io *tio = clone->end_io_data;
815 struct mapped_device *md = tio->md; 815 struct mapped_device *md = tio->md;
816 struct request *rq = tio->orig; 816 struct request *rq = tio->orig;
817 817
818 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) { 818 if (rq->cmd_type == REQ_TYPE_BLOCK_PC && !is_barrier) {
819 rq->errors = clone->errors; 819 rq->errors = clone->errors;
820 rq->resid_len = clone->resid_len; 820 rq->resid_len = clone->resid_len;
821 821
822 if (rq->sense) 822 if (rq->sense)
823 /* 823 /*
824 * We are using the sense buffer of the original 824 * We are using the sense buffer of the original
825 * request. 825 * request.
826 * So setting the length of the sense data is enough. 826 * So setting the length of the sense data is enough.
827 */ 827 */
828 rq->sense_len = clone->sense_len; 828 rq->sense_len = clone->sense_len;
829 } 829 }
830 830
831 free_rq_clone(clone); 831 free_rq_clone(clone);
832 832
833 if (unlikely(is_barrier)) { 833 if (unlikely(is_barrier)) {
834 if (unlikely(error)) 834 if (unlikely(error))
835 store_barrier_error(md, error); 835 store_barrier_error(md, error);
836 run_queue = 0; 836 run_queue = 0;
837 } else 837 } else
838 blk_end_request_all(rq, error); 838 blk_end_request_all(rq, error);
839 839
840 rq_completed(md, rw, run_queue); 840 rq_completed(md, rw, run_queue);
841 } 841 }
842 842
843 static void dm_unprep_request(struct request *rq) 843 static void dm_unprep_request(struct request *rq)
844 { 844 {
845 struct request *clone = rq->special; 845 struct request *clone = rq->special;
846 846
847 rq->special = NULL; 847 rq->special = NULL;
848 rq->cmd_flags &= ~REQ_DONTPREP; 848 rq->cmd_flags &= ~REQ_DONTPREP;
849 849
850 free_rq_clone(clone); 850 free_rq_clone(clone);
851 } 851 }
852 852
853 /* 853 /*
854 * Requeue the original request of a clone. 854 * Requeue the original request of a clone.
855 */ 855 */
856 void dm_requeue_unmapped_request(struct request *clone) 856 void dm_requeue_unmapped_request(struct request *clone)
857 { 857 {
858 int rw = rq_data_dir(clone); 858 int rw = rq_data_dir(clone);
859 struct dm_rq_target_io *tio = clone->end_io_data; 859 struct dm_rq_target_io *tio = clone->end_io_data;
860 struct mapped_device *md = tio->md; 860 struct mapped_device *md = tio->md;
861 struct request *rq = tio->orig; 861 struct request *rq = tio->orig;
862 struct request_queue *q = rq->q; 862 struct request_queue *q = rq->q;
863 unsigned long flags; 863 unsigned long flags;
864 864
865 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 865 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
866 /* 866 /*
867 * Barrier clones share an original request. 867 * Barrier clones share an original request.
868 * Leave it to dm_end_request(), which handles this special 868 * Leave it to dm_end_request(), which handles this special
869 * case. 869 * case.
870 */ 870 */
871 dm_end_request(clone, DM_ENDIO_REQUEUE); 871 dm_end_request(clone, DM_ENDIO_REQUEUE);
872 return; 872 return;
873 } 873 }
874 874
875 dm_unprep_request(rq); 875 dm_unprep_request(rq);
876 876
877 spin_lock_irqsave(q->queue_lock, flags); 877 spin_lock_irqsave(q->queue_lock, flags);
878 if (elv_queue_empty(q)) 878 if (elv_queue_empty(q))
879 blk_plug_device(q); 879 blk_plug_device(q);
880 blk_requeue_request(q, rq); 880 blk_requeue_request(q, rq);
881 spin_unlock_irqrestore(q->queue_lock, flags); 881 spin_unlock_irqrestore(q->queue_lock, flags);
882 882
883 rq_completed(md, rw, 0); 883 rq_completed(md, rw, 0);
884 } 884 }
885 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request); 885 EXPORT_SYMBOL_GPL(dm_requeue_unmapped_request);
886 886
887 static void __stop_queue(struct request_queue *q) 887 static void __stop_queue(struct request_queue *q)
888 { 888 {
889 blk_stop_queue(q); 889 blk_stop_queue(q);
890 } 890 }
891 891
892 static void stop_queue(struct request_queue *q) 892 static void stop_queue(struct request_queue *q)
893 { 893 {
894 unsigned long flags; 894 unsigned long flags;
895 895
896 spin_lock_irqsave(q->queue_lock, flags); 896 spin_lock_irqsave(q->queue_lock, flags);
897 __stop_queue(q); 897 __stop_queue(q);
898 spin_unlock_irqrestore(q->queue_lock, flags); 898 spin_unlock_irqrestore(q->queue_lock, flags);
899 } 899 }
900 900
901 static void __start_queue(struct request_queue *q) 901 static void __start_queue(struct request_queue *q)
902 { 902 {
903 if (blk_queue_stopped(q)) 903 if (blk_queue_stopped(q))
904 blk_start_queue(q); 904 blk_start_queue(q);
905 } 905 }
906 906
907 static void start_queue(struct request_queue *q) 907 static void start_queue(struct request_queue *q)
908 { 908 {
909 unsigned long flags; 909 unsigned long flags;
910 910
911 spin_lock_irqsave(q->queue_lock, flags); 911 spin_lock_irqsave(q->queue_lock, flags);
912 __start_queue(q); 912 __start_queue(q);
913 spin_unlock_irqrestore(q->queue_lock, flags); 913 spin_unlock_irqrestore(q->queue_lock, flags);
914 } 914 }
915 915
916 static void dm_done(struct request *clone, int error, bool mapped) 916 static void dm_done(struct request *clone, int error, bool mapped)
917 { 917 {
918 int r = error; 918 int r = error;
919 struct dm_rq_target_io *tio = clone->end_io_data; 919 struct dm_rq_target_io *tio = clone->end_io_data;
920 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io; 920 dm_request_endio_fn rq_end_io = tio->ti->type->rq_end_io;
921 921
922 if (mapped && rq_end_io) 922 if (mapped && rq_end_io)
923 r = rq_end_io(tio->ti, clone, error, &tio->info); 923 r = rq_end_io(tio->ti, clone, error, &tio->info);
924 924
925 if (r <= 0) 925 if (r <= 0)
926 /* The target wants to complete the I/O */ 926 /* The target wants to complete the I/O */
927 dm_end_request(clone, r); 927 dm_end_request(clone, r);
928 else if (r == DM_ENDIO_INCOMPLETE) 928 else if (r == DM_ENDIO_INCOMPLETE)
929 /* The target will handle the I/O */ 929 /* The target will handle the I/O */
930 return; 930 return;
931 else if (r == DM_ENDIO_REQUEUE) 931 else if (r == DM_ENDIO_REQUEUE)
932 /* The target wants to requeue the I/O */ 932 /* The target wants to requeue the I/O */
933 dm_requeue_unmapped_request(clone); 933 dm_requeue_unmapped_request(clone);
934 else { 934 else {
935 DMWARN("unimplemented target endio return value: %d", r); 935 DMWARN("unimplemented target endio return value: %d", r);
936 BUG(); 936 BUG();
937 } 937 }
938 } 938 }
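A hedged sketch of a target rq_end_io hook exercising the branches above; the target and its requeue policy are hypothetical, and the parameter order simply mirrors the call made in dm_done():

	static int example_rq_end_io(struct dm_target *ti, struct request *clone,
				     int error, union map_info *info)
	{
		if (error == -EBUSY)
			return DM_ENDIO_REQUEUE;	/* ask dm to requeue the original request */

		return error;				/* <= 0: dm_done() completes the I/O with this status */
	}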
939 939
940 /* 940 /*
941 * Request completion handler for request-based dm 941 * Request completion handler for request-based dm
942 */ 942 */
943 static void dm_softirq_done(struct request *rq) 943 static void dm_softirq_done(struct request *rq)
944 { 944 {
945 bool mapped = true; 945 bool mapped = true;
946 struct request *clone = rq->completion_data; 946 struct request *clone = rq->completion_data;
947 struct dm_rq_target_io *tio = clone->end_io_data; 947 struct dm_rq_target_io *tio = clone->end_io_data;
948 948
949 if (rq->cmd_flags & REQ_FAILED) 949 if (rq->cmd_flags & REQ_FAILED)
950 mapped = false; 950 mapped = false;
951 951
952 dm_done(clone, tio->error, mapped); 952 dm_done(clone, tio->error, mapped);
953 } 953 }
954 954
955 /* 955 /*
956 * Complete the clone and the original request with the error status 956 * Complete the clone and the original request with the error status
957 * through softirq context. 957 * through softirq context.
958 */ 958 */
959 static void dm_complete_request(struct request *clone, int error) 959 static void dm_complete_request(struct request *clone, int error)
960 { 960 {
961 struct dm_rq_target_io *tio = clone->end_io_data; 961 struct dm_rq_target_io *tio = clone->end_io_data;
962 struct request *rq = tio->orig; 962 struct request *rq = tio->orig;
963 963
964 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 964 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
965 /* 965 /*
966 * Barrier clones share an original request. So can't use 966 * Barrier clones share an original request. So can't use
967 * softirq_done with the original. 967 * softirq_done with the original.
968 * Pass the clone to dm_done() directly in this special case. 968 * Pass the clone to dm_done() directly in this special case.
969 * It is safe (even if clone->q->queue_lock is held here) 969 * It is safe (even if clone->q->queue_lock is held here)
970 * because there is no I/O dispatching during the completion 970 * because there is no I/O dispatching during the completion
971 * of barrier clone. 971 * of barrier clone.
972 */ 972 */
973 dm_done(clone, error, true); 973 dm_done(clone, error, true);
974 return; 974 return;
975 } 975 }
976 976
977 tio->error = error; 977 tio->error = error;
978 rq->completion_data = clone; 978 rq->completion_data = clone;
979 blk_complete_request(rq); 979 blk_complete_request(rq);
980 } 980 }
981 981
982 /* 982 /*
983 * Complete the not-mapped clone and the original request with the error status 983 * Complete the not-mapped clone and the original request with the error status
984 * through softirq context. 984 * through softirq context.
985 * Target's rq_end_io() function isn't called. 985 * Target's rq_end_io() function isn't called.
986 * This may be used when the target's map_rq() function fails. 986 * This may be used when the target's map_rq() function fails.
987 */ 987 */
988 void dm_kill_unmapped_request(struct request *clone, int error) 988 void dm_kill_unmapped_request(struct request *clone, int error)
989 { 989 {
990 struct dm_rq_target_io *tio = clone->end_io_data; 990 struct dm_rq_target_io *tio = clone->end_io_data;
991 struct request *rq = tio->orig; 991 struct request *rq = tio->orig;
992 992
993 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) { 993 if (unlikely(clone->cmd_flags & REQ_HARDBARRIER)) {
994 /* 994 /*
995 * Barrier clones share an original request. 995 * Barrier clones share an original request.
996 * Leave it to dm_end_request(), which handles this special 996 * Leave it to dm_end_request(), which handles this special
997 * case. 997 * case.
998 */ 998 */
999 BUG_ON(error > 0); 999 BUG_ON(error > 0);
1000 dm_end_request(clone, error); 1000 dm_end_request(clone, error);
1001 return; 1001 return;
1002 } 1002 }
1003 1003
1004 rq->cmd_flags |= REQ_FAILED; 1004 rq->cmd_flags |= REQ_FAILED;
1005 dm_complete_request(clone, error); 1005 dm_complete_request(clone, error);
1006 } 1006 }
1007 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request); 1007 EXPORT_SYMBOL_GPL(dm_kill_unmapped_request);
1008 1008
1009 /* 1009 /*
1010 * Called with the queue lock held 1010 * Called with the queue lock held
1011 */ 1011 */
1012 static void end_clone_request(struct request *clone, int error) 1012 static void end_clone_request(struct request *clone, int error)
1013 { 1013 {
1014 /* 1014 /*
1015 * This just cleans up the bookkeeping of the queue in which 1015 * This just cleans up the bookkeeping of the queue in which
1016 * the clone was dispatched. 1016 * the clone was dispatched.
1017 * The clone is *NOT* actually freed here because it is allocated from 1017 * The clone is *NOT* actually freed here because it is allocated from
1018 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags. 1018 * dm's own mempool and REQ_ALLOCED isn't set in clone->cmd_flags.
1019 */ 1019 */
1020 __blk_put_request(clone->q, clone); 1020 __blk_put_request(clone->q, clone);
1021 1021
1022 /* 1022 /*
1023 * Actual request completion is done in a softirq context which doesn't 1023 * Actual request completion is done in a softirq context which doesn't
1024 * hold the queue lock. Otherwise, deadlock could occur because: 1024 * hold the queue lock. Otherwise, deadlock could occur because:
1025 * - another request may be submitted by the upper-level driver 1025 * - another request may be submitted by the upper-level driver
1026 * of the stack during the completion 1026 * of the stack during the completion
1027 * - the submission which requires queue lock may be done 1027 * - the submission which requires queue lock may be done
1028 * against this queue 1028 * against this queue
1029 */ 1029 */
1030 dm_complete_request(clone, error); 1030 dm_complete_request(clone, error);
1031 } 1031 }
1032 1032
1033 /* 1033 /*
1034 * Return maximum size of I/O possible at the supplied sector up to the current 1034 * Return maximum size of I/O possible at the supplied sector up to the current
1035 * target boundary. 1035 * target boundary.
1036 */ 1036 */
1037 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti) 1037 static sector_t max_io_len_target_boundary(sector_t sector, struct dm_target *ti)
1038 { 1038 {
1039 sector_t target_offset = dm_target_offset(ti, sector); 1039 sector_t target_offset = dm_target_offset(ti, sector);
1040 1040
1041 return ti->len - target_offset; 1041 return ti->len - target_offset;
1042 } 1042 }
1043 1043
1044 static sector_t max_io_len(sector_t sector, struct dm_target *ti) 1044 static sector_t max_io_len(sector_t sector, struct dm_target *ti)
1045 { 1045 {
1046 sector_t len = max_io_len_target_boundary(sector, ti); 1046 sector_t len = max_io_len_target_boundary(sector, ti);
1047 1047
1048 /* 1048 /*
1049 * Does the target need to split even further? 1049 * Does the target need to split even further?
1050 */ 1050 */
1051 if (ti->split_io) { 1051 if (ti->split_io) {
1052 sector_t boundary; 1052 sector_t boundary;
1053 sector_t offset = dm_target_offset(ti, sector); 1053 sector_t offset = dm_target_offset(ti, sector);
1054 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1)) 1054 boundary = ((offset + ti->split_io) & ~(ti->split_io - 1))
1055 - offset; 1055 - offset;
1056 if (len > boundary) 1056 if (len > boundary)
1057 len = boundary; 1057 len = boundary;
1058 } 1058 }
1059 1059
1060 return len; 1060 return len;
1061 } 1061 }
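A worked example of the split_io clamp above, assuming ti->split_io is a power of two (which the mask arithmetic relies on): with split_io = 8 sectors and offset = 5,

	boundary = ((5 + 8) & ~(8 - 1)) - 5 = (13 & ~7) - 5 = 8 - 5 = 3

so max_io_len() returns at most 3 sectors and the I/O is cut at the next 8-sector-aligned boundary within the target.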
1062 1062
1063 static void __map_bio(struct dm_target *ti, struct bio *clone, 1063 static void __map_bio(struct dm_target *ti, struct bio *clone,
1064 struct dm_target_io *tio) 1064 struct dm_target_io *tio)
1065 { 1065 {
1066 int r; 1066 int r;
1067 sector_t sector; 1067 sector_t sector;
1068 struct mapped_device *md; 1068 struct mapped_device *md;
1069 1069
1070 clone->bi_end_io = clone_endio; 1070 clone->bi_end_io = clone_endio;
1071 clone->bi_private = tio; 1071 clone->bi_private = tio;
1072 1072
1073 /* 1073 /*
1074 * Map the clone. If r == 0 we don't need to do 1074 * Map the clone. If r == 0 we don't need to do
1075 * anything; the target has assumed ownership of 1075 * anything; the target has assumed ownership of
1076 * this io. 1076 * this io.
1077 */ 1077 */
1078 atomic_inc(&tio->io->io_count); 1078 atomic_inc(&tio->io->io_count);
1079 sector = clone->bi_sector; 1079 sector = clone->bi_sector;
1080 r = ti->type->map(ti, clone, &tio->info); 1080 r = ti->type->map(ti, clone, &tio->info);
1081 if (r == DM_MAPIO_REMAPPED) { 1081 if (r == DM_MAPIO_REMAPPED) {
1082 /* the bio has been remapped so dispatch it */ 1082 /* the bio has been remapped so dispatch it */
1083 1083
1084 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, 1084 trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
1085 tio->io->bio->bi_bdev->bd_dev, sector); 1085 tio->io->bio->bi_bdev->bd_dev, sector);
1086 1086
1087 generic_make_request(clone); 1087 generic_make_request(clone);
1088 } else if (r < 0 || r == DM_MAPIO_REQUEUE) { 1088 } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
1089 /* error the io and bail out, or requeue it if needed */ 1089 /* error the io and bail out, or requeue it if needed */
1090 md = tio->io->md; 1090 md = tio->io->md;
1091 dec_pending(tio->io, r); 1091 dec_pending(tio->io, r);
1092 /* 1092 /*
1093 * Store bio_set for cleanup. 1093 * Store bio_set for cleanup.
1094 */ 1094 */
1095 clone->bi_private = md->bs; 1095 clone->bi_private = md->bs;
1096 bio_put(clone); 1096 bio_put(clone);
1097 free_tio(md, tio); 1097 free_tio(md, tio);
1098 } else if (r) { 1098 } else if (r) {
1099 DMWARN("unimplemented target map return value: %d", r); 1099 DMWARN("unimplemented target map return value: %d", r);
1100 BUG(); 1100 BUG();
1101 } 1101 }
1102 } 1102 }
1103 1103
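For context on the DM_MAPIO_* return values handled in __map_bio() above, here is a hedged sketch of what a bio-based target's ->map method can look like, loosely modelled on a linear remapping target. struct my_linear_ctx and its fields are hypothetical; only the dm_target/map_info types, dm_target_offset() and DM_MAPIO_REMAPPED come from the device-mapper API.

#include <linux/device-mapper.h>

/* Hypothetical per-target context; not part of this commit. */
struct my_linear_ctx {
	struct dm_dev *dev;	/* underlying device */
	sector_t start;		/* offset into that device */
};

static int my_linear_map(struct dm_target *ti, struct bio *bio,
			 union map_info *map_context)
{
	struct my_linear_ctx *mc = ti->private;

	/* Point the clone at the underlying device ... */
	bio->bi_bdev = mc->dev->bdev;
	/* ... at the remapped sector within it. */
	bio->bi_sector = mc->start + dm_target_offset(ti, bio->bi_sector);

	/* Ask __map_bio() to dispatch the remapped clone for us. */
	return DM_MAPIO_REMAPPED;
}

Returning DM_MAPIO_SUBMITTED instead would mean the target has taken ownership of the clone and will submit it itself, and a negative value or DM_MAPIO_REQUEUE takes the error/requeue branch shown above.
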
1104 struct clone_info { 1104 struct clone_info {
1105 struct mapped_device *md; 1105 struct mapped_device *md;
1106 struct dm_table *map; 1106 struct dm_table *map;
1107 struct bio *bio; 1107 struct bio *bio;
1108 struct dm_io *io; 1108 struct dm_io *io;
1109 sector_t sector; 1109 sector_t sector;
1110 sector_t sector_count; 1110 sector_t sector_count;
1111 unsigned short idx; 1111 unsigned short idx;
1112 }; 1112 };
1113 1113
1114 static void dm_bio_destructor(struct bio *bio) 1114 static void dm_bio_destructor(struct bio *bio)
1115 { 1115 {
1116 struct bio_set *bs = bio->bi_private; 1116 struct bio_set *bs = bio->bi_private;
1117 1117
1118 bio_free(bio, bs); 1118 bio_free(bio, bs);
1119 } 1119 }
1120 1120
1121 /* 1121 /*
1122 * Creates a little bio that just does part of a bvec. 1122 * Creates a little bio that just does part of a bvec.
1123 */ 1123 */
1124 static struct bio *split_bvec(struct bio *bio, sector_t sector, 1124 static struct bio *split_bvec(struct bio *bio, sector_t sector,
1125 unsigned short idx, unsigned int offset, 1125 unsigned short idx, unsigned int offset,
1126 unsigned int len, struct bio_set *bs) 1126 unsigned int len, struct bio_set *bs)
1127 { 1127 {
1128 struct bio *clone; 1128 struct bio *clone;
1129 struct bio_vec *bv = bio->bi_io_vec + idx; 1129 struct bio_vec *bv = bio->bi_io_vec + idx;
1130 1130
1131 clone = bio_alloc_bioset(GFP_NOIO, 1, bs); 1131 clone = bio_alloc_bioset(GFP_NOIO, 1, bs);
1132 clone->bi_destructor = dm_bio_destructor; 1132 clone->bi_destructor = dm_bio_destructor;
1133 *clone->bi_io_vec = *bv; 1133 *clone->bi_io_vec = *bv;
1134 1134
1135 clone->bi_sector = sector; 1135 clone->bi_sector = sector;
1136 clone->bi_bdev = bio->bi_bdev; 1136 clone->bi_bdev = bio->bi_bdev;
1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER; 1137 clone->bi_rw = bio->bi_rw & ~REQ_HARDBARRIER;
1138 clone->bi_vcnt = 1; 1138 clone->bi_vcnt = 1;
1139 clone->bi_size = to_bytes(len); 1139 clone->bi_size = to_bytes(len);
1140 clone->bi_io_vec->bv_offset = offset; 1140 clone->bi_io_vec->bv_offset = offset;
1141 clone->bi_io_vec->bv_len = clone->bi_size; 1141 clone->bi_io_vec->bv_len = clone->bi_size;
1142 clone->bi_flags |= 1 << BIO_CLONED; 1142 clone->bi_flags |= 1 << BIO_CLONED;
1143 1143
1144 if (bio_integrity(bio)) { 1144 if (bio_integrity(bio)) {
1145 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1145 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1146 bio_integrity_trim(clone, 1146 bio_integrity_trim(clone,
1147 bio_sector_offset(bio, idx, offset), len); 1147 bio_sector_offset(bio, idx, offset), len);
1148 } 1148 }
1149 1149
1150 return clone; 1150 return clone;
1151 } 1151 }
1152 1152
1153 /* 1153 /*
1154 * Creates a bio that consists of a range of complete bvecs. 1154 * Creates a bio that consists of a range of complete bvecs.
1155 */ 1155 */
1156 static struct bio *clone_bio(struct bio *bio, sector_t sector, 1156 static struct bio *clone_bio(struct bio *bio, sector_t sector,
1157 unsigned short idx, unsigned short bv_count, 1157 unsigned short idx, unsigned short bv_count,
1158 unsigned int len, struct bio_set *bs) 1158 unsigned int len, struct bio_set *bs)
1159 { 1159 {
1160 struct bio *clone; 1160 struct bio *clone;
1161 1161
1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs); 1162 clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
1163 __bio_clone(clone, bio); 1163 __bio_clone(clone, bio);
1164 clone->bi_rw &= ~REQ_HARDBARRIER; 1164 clone->bi_rw &= ~REQ_HARDBARRIER;
1165 clone->bi_destructor = dm_bio_destructor; 1165 clone->bi_destructor = dm_bio_destructor;
1166 clone->bi_sector = sector; 1166 clone->bi_sector = sector;
1167 clone->bi_idx = idx; 1167 clone->bi_idx = idx;
1168 clone->bi_vcnt = idx + bv_count; 1168 clone->bi_vcnt = idx + bv_count;
1169 clone->bi_size = to_bytes(len); 1169 clone->bi_size = to_bytes(len);
1170 clone->bi_flags &= ~(1 << BIO_SEG_VALID); 1170 clone->bi_flags &= ~(1 << BIO_SEG_VALID);
1171 1171
1172 if (bio_integrity(bio)) { 1172 if (bio_integrity(bio)) {
1173 bio_integrity_clone(clone, bio, GFP_NOIO, bs); 1173 bio_integrity_clone(clone, bio, GFP_NOIO, bs);
1174 1174
1175 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size) 1175 if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
1176 bio_integrity_trim(clone, 1176 bio_integrity_trim(clone,
1177 bio_sector_offset(bio, idx, 0), len); 1177 bio_sector_offset(bio, idx, 0), len);
1178 } 1178 }
1179 1179
1180 return clone; 1180 return clone;
1181 } 1181 }
1182 1182
1183 static struct dm_target_io *alloc_tio(struct clone_info *ci, 1183 static struct dm_target_io *alloc_tio(struct clone_info *ci,
1184 struct dm_target *ti) 1184 struct dm_target *ti)
1185 { 1185 {
1186 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO); 1186 struct dm_target_io *tio = mempool_alloc(ci->md->tio_pool, GFP_NOIO);
1187 1187
1188 tio->io = ci->io; 1188 tio->io = ci->io;
1189 tio->ti = ti; 1189 tio->ti = ti;
1190 memset(&tio->info, 0, sizeof(tio->info)); 1190 memset(&tio->info, 0, sizeof(tio->info));
1191 1191
1192 return tio; 1192 return tio;
1193 } 1193 }
1194 1194
1195 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti, 1195 static void __issue_target_request(struct clone_info *ci, struct dm_target *ti,
1196 unsigned request_nr, sector_t len) 1196 unsigned request_nr, sector_t len)
1197 { 1197 {
1198 struct dm_target_io *tio = alloc_tio(ci, ti); 1198 struct dm_target_io *tio = alloc_tio(ci, ti);
1199 struct bio *clone; 1199 struct bio *clone;
1200 1200
1201 tio->info.target_request_nr = request_nr; 1201 tio->info.target_request_nr = request_nr;
1202 1202
1203 /* 1203 /*
1204 * Discard requests require the bio's inline iovecs be initialized. 1204 * Discard requests require the bio's inline iovecs be initialized.
1205 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush 1205 * ci->bio->bi_max_vecs is BIO_INLINE_VECS anyway, for both flush
1206 * and discard, so no need for concern about wasted bvec allocations. 1206 * and discard, so no need for concern about wasted bvec allocations.
1207 */ 1207 */
1208 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs); 1208 clone = bio_alloc_bioset(GFP_NOIO, ci->bio->bi_max_vecs, ci->md->bs);
1209 __bio_clone(clone, ci->bio); 1209 __bio_clone(clone, ci->bio);
1210 clone->bi_destructor = dm_bio_destructor; 1210 clone->bi_destructor = dm_bio_destructor;
1211 if (len) { 1211 if (len) {
1212 clone->bi_sector = ci->sector; 1212 clone->bi_sector = ci->sector;
1213 clone->bi_size = to_bytes(len); 1213 clone->bi_size = to_bytes(len);
1214 } 1214 }
1215 1215
1216 __map_bio(ti, clone, tio); 1216 __map_bio(ti, clone, tio);
1217 } 1217 }
1218 1218
1219 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti, 1219 static void __issue_target_requests(struct clone_info *ci, struct dm_target *ti,
1220 unsigned num_requests, sector_t len) 1220 unsigned num_requests, sector_t len)
1221 { 1221 {
1222 unsigned request_nr; 1222 unsigned request_nr;
1223 1223
1224 for (request_nr = 0; request_nr < num_requests; request_nr++) 1224 for (request_nr = 0; request_nr < num_requests; request_nr++)
1225 __issue_target_request(ci, ti, request_nr, len); 1225 __issue_target_request(ci, ti, request_nr, len);
1226 } 1226 }
1227 1227
1228 static int __clone_and_map_empty_barrier(struct clone_info *ci) 1228 static int __clone_and_map_empty_barrier(struct clone_info *ci)
1229 { 1229 {
1230 unsigned target_nr = 0; 1230 unsigned target_nr = 0;
1231 struct dm_target *ti; 1231 struct dm_target *ti;
1232 1232
1233 while ((ti = dm_table_get_target(ci->map, target_nr++))) 1233 while ((ti = dm_table_get_target(ci->map, target_nr++)))
1234 __issue_target_requests(ci, ti, ti->num_flush_requests, 0); 1234 __issue_target_requests(ci, ti, ti->num_flush_requests, 0);
1235 1235
1236 ci->sector_count = 0; 1236 ci->sector_count = 0;
1237 1237
1238 return 0; 1238 return 0;
1239 } 1239 }
1240 1240
1241 /* 1241 /*
1242 * Perform all io with a single clone. 1242 * Perform all io with a single clone.
1243 */ 1243 */
1244 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti) 1244 static void __clone_and_map_simple(struct clone_info *ci, struct dm_target *ti)
1245 { 1245 {
1246 struct bio *clone, *bio = ci->bio; 1246 struct bio *clone, *bio = ci->bio;
1247 struct dm_target_io *tio; 1247 struct dm_target_io *tio;
1248 1248
1249 tio = alloc_tio(ci, ti); 1249 tio = alloc_tio(ci, ti);
1250 clone = clone_bio(bio, ci->sector, ci->idx, 1250 clone = clone_bio(bio, ci->sector, ci->idx,
1251 bio->bi_vcnt - ci->idx, ci->sector_count, 1251 bio->bi_vcnt - ci->idx, ci->sector_count,
1252 ci->md->bs); 1252 ci->md->bs);
1253 __map_bio(ti, clone, tio); 1253 __map_bio(ti, clone, tio);
1254 ci->sector_count = 0; 1254 ci->sector_count = 0;
1255 } 1255 }
1256 1256
1257 static int __clone_and_map_discard(struct clone_info *ci) 1257 static int __clone_and_map_discard(struct clone_info *ci)
1258 { 1258 {
1259 struct dm_target *ti; 1259 struct dm_target *ti;
1260 sector_t len; 1260 sector_t len;
1261 1261
1262 do { 1262 do {
1263 ti = dm_table_find_target(ci->map, ci->sector); 1263 ti = dm_table_find_target(ci->map, ci->sector);
1264 if (!dm_target_is_valid(ti)) 1264 if (!dm_target_is_valid(ti))
1265 return -EIO; 1265 return -EIO;
1266 1266
1267 /* 1267 /*
1268 * Even though the device advertised discard support, 1268 * Even though the device advertised discard support,
1269 * reconfiguration might have changed that since the 1269 * reconfiguration might have changed that since the
1270 * check was performed. 1270 * check was performed.
1271 */ 1271 */
1272 if (!ti->num_discard_requests) 1272 if (!ti->num_discard_requests)
1273 return -EOPNOTSUPP; 1273 return -EOPNOTSUPP;
1274 1274
1275 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti)); 1275 len = min(ci->sector_count, max_io_len_target_boundary(ci->sector, ti));
1276 1276
1277 __issue_target_requests(ci, ti, ti->num_discard_requests, len); 1277 __issue_target_requests(ci, ti, ti->num_discard_requests, len);
1278 1278
1279 ci->sector += len; 1279 ci->sector += len;
1280 } while (ci->sector_count -= len); 1280 } while (ci->sector_count -= len);
1281 1281
1282 return 0; 1282 return 0;
1283 } 1283 }
1284 1284
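To make the do/while splitting in __clone_and_map_discard() above concrete, here is a user-space sketch of how a discard that crosses target boundaries gets chopped up. The 100-sector target size and the sample range are invented for illustration only.

#include <stdio.h>

typedef unsigned long long sector_t;

#define TARGET_LEN 100ULL	/* hypothetical: every target is 100 sectors */

/* Same idea as max_io_len_target_boundary(): sectors left in this target. */
static sector_t to_target_boundary(sector_t sector)
{
	return TARGET_LEN - (sector % TARGET_LEN);
}

int main(void)
{
	sector_t sector = 250, count = 180;	/* spans three targets */
	sector_t len;

	do {
		sector_t boundary = to_target_boundary(sector);

		len = count < boundary ? count : boundary;
		/* prints 250..299, 300..399, 400..429 */
		printf("discard %llu..%llu\n", sector, sector + len - 1);
		sector += len;
	} while (count -= len);

	return 0;
}
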
1285 static int __clone_and_map(struct clone_info *ci) 1285 static int __clone_and_map(struct clone_info *ci)
1286 { 1286 {
1287 struct bio *clone, *bio = ci->bio; 1287 struct bio *clone, *bio = ci->bio;
1288 struct dm_target *ti; 1288 struct dm_target *ti;
1289 sector_t len = 0, max; 1289 sector_t len = 0, max;
1290 struct dm_target_io *tio; 1290 struct dm_target_io *tio;
1291 1291
1292 if (unlikely(bio_empty_barrier(bio))) 1292 if (unlikely(bio_empty_barrier(bio)))
1293 return __clone_and_map_empty_barrier(ci); 1293 return __clone_and_map_empty_barrier(ci);
1294 1294
1295 if (unlikely(bio->bi_rw & REQ_DISCARD)) 1295 if (unlikely(bio->bi_rw & REQ_DISCARD))
1296 return __clone_and_map_discard(ci); 1296 return __clone_and_map_discard(ci);
1297 1297
1298 ti = dm_table_find_target(ci->map, ci->sector); 1298 ti = dm_table_find_target(ci->map, ci->sector);
1299 if (!dm_target_is_valid(ti)) 1299 if (!dm_target_is_valid(ti))
1300 return -EIO; 1300 return -EIO;
1301 1301
1302 max = max_io_len(ci->sector, ti); 1302 max = max_io_len(ci->sector, ti);
1303 1303
1304 if (ci->sector_count <= max) { 1304 if (ci->sector_count <= max) {
1305 /* 1305 /*
1306 * Optimise for the simple case where we can do all of 1306 * Optimise for the simple case where we can do all of
1307 * the remaining io with a single clone. 1307 * the remaining io with a single clone.
1308 */ 1308 */
1309 __clone_and_map_simple(ci, ti); 1309 __clone_and_map_simple(ci, ti);
1310 1310
1311 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) { 1311 } else if (to_sector(bio->bi_io_vec[ci->idx].bv_len) <= max) {
1312 /* 1312 /*
1313 * There are some bvecs that don't span targets. 1313 * There are some bvecs that don't span targets.
1314 * Do as many of these as possible. 1314 * Do as many of these as possible.
1315 */ 1315 */
1316 int i; 1316 int i;
1317 sector_t remaining = max; 1317 sector_t remaining = max;
1318 sector_t bv_len; 1318 sector_t bv_len;
1319 1319
1320 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) { 1320 for (i = ci->idx; remaining && (i < bio->bi_vcnt); i++) {
1321 bv_len = to_sector(bio->bi_io_vec[i].bv_len); 1321 bv_len = to_sector(bio->bi_io_vec[i].bv_len);
1322 1322
1323 if (bv_len > remaining) 1323 if (bv_len > remaining)
1324 break; 1324 break;
1325 1325
1326 remaining -= bv_len; 1326 remaining -= bv_len;
1327 len += bv_len; 1327 len += bv_len;
1328 } 1328 }
1329 1329
1330 tio = alloc_tio(ci, ti); 1330 tio = alloc_tio(ci, ti);
1331 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len, 1331 clone = clone_bio(bio, ci->sector, ci->idx, i - ci->idx, len,
1332 ci->md->bs); 1332 ci->md->bs);
1333 __map_bio(ti, clone, tio); 1333 __map_bio(ti, clone, tio);
1334 1334
1335 ci->sector += len; 1335 ci->sector += len;
1336 ci->sector_count -= len; 1336 ci->sector_count -= len;
1337 ci->idx = i; 1337 ci->idx = i;
1338 1338
1339 } else { 1339 } else {
1340 /* 1340 /*
1341 * Handle a bvec that must be split between two or more targets. 1341 * Handle a bvec that must be split between two or more targets.
1342 */ 1342 */
1343 struct bio_vec *bv = bio->bi_io_vec + ci->idx; 1343 struct bio_vec *bv = bio->bi_io_vec + ci->idx;
1344 sector_t remaining = to_sector(bv->bv_len); 1344 sector_t remaining = to_sector(bv->bv_len);
1345 unsigned int offset = 0; 1345 unsigned int offset = 0;
1346 1346
1347 do { 1347 do {
1348 if (offset) { 1348 if (offset) {
1349 ti = dm_table_find_target(ci->map, ci->sector); 1349 ti = dm_table_find_target(ci->map, ci->sector);
1350 if (!dm_target_is_valid(ti)) 1350 if (!dm_target_is_valid(ti))
1351 return -EIO; 1351 return -EIO;
1352 1352
1353 max = max_io_len(ci->sector, ti); 1353 max = max_io_len(ci->sector, ti);
1354 } 1354 }
1355 1355
1356 len = min(remaining, max); 1356 len = min(remaining, max);
1357 1357
1358 tio = alloc_tio(ci, ti); 1358 tio = alloc_tio(ci, ti);
1359 clone = split_bvec(bio, ci->sector, ci->idx, 1359 clone = split_bvec(bio, ci->sector, ci->idx,
1360 bv->bv_offset + offset, len, 1360 bv->bv_offset + offset, len,
1361 ci->md->bs); 1361 ci->md->bs);
1362 1362
1363 __map_bio(ti, clone, tio); 1363 __map_bio(ti, clone, tio);
1364 1364
1365 ci->sector += len; 1365 ci->sector += len;
1366 ci->sector_count -= len; 1366 ci->sector_count -= len;
1367 offset += to_bytes(len); 1367 offset += to_bytes(len);
1368 } while (remaining -= len); 1368 } while (remaining -= len);
1369 1369
1370 ci->idx++; 1370 ci->idx++;
1371 } 1371 }
1372 1372
1373 return 0; 1373 return 0;
1374 } 1374 }
1375 1375
1376 /* 1376 /*
1377 * Split the bio into several clones and submit it to targets. 1377 * Split the bio into several clones and submit it to targets.
1378 */ 1378 */
1379 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio) 1379 static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
1380 { 1380 {
1381 struct clone_info ci; 1381 struct clone_info ci;
1382 int error = 0; 1382 int error = 0;
1383 1383
1384 ci.map = dm_get_live_table(md); 1384 ci.map = dm_get_live_table(md);
1385 if (unlikely(!ci.map)) { 1385 if (unlikely(!ci.map)) {
1386 if (!(bio->bi_rw & REQ_HARDBARRIER)) 1386 if (!(bio->bi_rw & REQ_HARDBARRIER))
1387 bio_io_error(bio); 1387 bio_io_error(bio);
1388 else 1388 else
1389 if (!md->barrier_error) 1389 if (!md->barrier_error)
1390 md->barrier_error = -EIO; 1390 md->barrier_error = -EIO;
1391 return; 1391 return;
1392 } 1392 }
1393 1393
1394 ci.md = md; 1394 ci.md = md;
1395 ci.bio = bio; 1395 ci.bio = bio;
1396 ci.io = alloc_io(md); 1396 ci.io = alloc_io(md);
1397 ci.io->error = 0; 1397 ci.io->error = 0;
1398 atomic_set(&ci.io->io_count, 1); 1398 atomic_set(&ci.io->io_count, 1);
1399 ci.io->bio = bio; 1399 ci.io->bio = bio;
1400 ci.io->md = md; 1400 ci.io->md = md;
1401 spin_lock_init(&ci.io->endio_lock); 1401 spin_lock_init(&ci.io->endio_lock);
1402 ci.sector = bio->bi_sector; 1402 ci.sector = bio->bi_sector;
1403 ci.sector_count = bio_sectors(bio); 1403 ci.sector_count = bio_sectors(bio);
1404 if (unlikely(bio_empty_barrier(bio))) 1404 if (unlikely(bio_empty_barrier(bio)))
1405 ci.sector_count = 1; 1405 ci.sector_count = 1;
1406 ci.idx = bio->bi_idx; 1406 ci.idx = bio->bi_idx;
1407 1407
1408 start_io_acct(ci.io); 1408 start_io_acct(ci.io);
1409 while (ci.sector_count && !error) 1409 while (ci.sector_count && !error)
1410 error = __clone_and_map(&ci); 1410 error = __clone_and_map(&ci);
1411 1411
1412 /* drop the extra reference count */ 1412 /* drop the extra reference count */
1413 dec_pending(ci.io, error); 1413 dec_pending(ci.io, error);
1414 dm_table_put(ci.map); 1414 dm_table_put(ci.map);
1415 } 1415 }
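The "extra reference count" dropped at the end of __split_and_process_bio() is the one taken by atomic_set(&ci.io->io_count, 1); each clone mapped by __map_bio() adds another, and the io only completes when the count falls to zero. A minimal user-space sketch of that pattern follows; all names here are ours.

#include <stdio.h>

struct demo_io {
	int io_count;
	int error;
};

static void demo_dec_pending(struct demo_io *io, int error)
{
	if (error)
		io->error = error;
	if (--io->io_count == 0)
		printf("all clones done, completing with error=%d\n", io->error);
}

int main(void)
{
	struct demo_io io = { .io_count = 1, .error = 0 };	/* submitter's ref */
	int clones = 3, i;

	for (i = 0; i < clones; i++)
		io.io_count++;			/* one ref per mapped clone */
	for (i = 0; i < clones; i++)
		demo_dec_pending(&io, 0);	/* each clone's endio drops one */

	demo_dec_pending(&io, 0);		/* drop the submitter's extra ref */
	return 0;
}
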
1416 /*----------------------------------------------------------------- 1416 /*-----------------------------------------------------------------
1417 * CRUD END 1417 * CRUD END
1418 *---------------------------------------------------------------*/ 1418 *---------------------------------------------------------------*/
1419 1419
1420 static int dm_merge_bvec(struct request_queue *q, 1420 static int dm_merge_bvec(struct request_queue *q,
1421 struct bvec_merge_data *bvm, 1421 struct bvec_merge_data *bvm,
1422 struct bio_vec *biovec) 1422 struct bio_vec *biovec)
1423 { 1423 {
1424 struct mapped_device *md = q->queuedata; 1424 struct mapped_device *md = q->queuedata;
1425 struct dm_table *map = dm_get_live_table(md); 1425 struct dm_table *map = dm_get_live_table(md);
1426 struct dm_target *ti; 1426 struct dm_target *ti;
1427 sector_t max_sectors; 1427 sector_t max_sectors;
1428 int max_size = 0; 1428 int max_size = 0;
1429 1429
1430 if (unlikely(!map)) 1430 if (unlikely(!map))
1431 goto out; 1431 goto out;
1432 1432
1433 ti = dm_table_find_target(map, bvm->bi_sector); 1433 ti = dm_table_find_target(map, bvm->bi_sector);
1434 if (!dm_target_is_valid(ti)) 1434 if (!dm_target_is_valid(ti))
1435 goto out_table; 1435 goto out_table;
1436 1436
1437 /* 1437 /*
1438 * Find maximum amount of I/O that won't need splitting 1438 * Find maximum amount of I/O that won't need splitting
1439 */ 1439 */
1440 max_sectors = min(max_io_len(bvm->bi_sector, ti), 1440 max_sectors = min(max_io_len(bvm->bi_sector, ti),
1441 (sector_t) BIO_MAX_SECTORS); 1441 (sector_t) BIO_MAX_SECTORS);
1442 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size; 1442 max_size = (max_sectors << SECTOR_SHIFT) - bvm->bi_size;
1443 if (max_size < 0) 1443 if (max_size < 0)
1444 max_size = 0; 1444 max_size = 0;
1445 1445
1446 /* 1446 /*
1447 * merge_bvec_fn() returns the number of bytes 1447 * merge_bvec_fn() returns the number of bytes
1448 * it can accept at this offset. 1448 * it can accept at this offset.
1449 * max is the precomputed maximal io size. 1449 * max is the precomputed maximal io size.
1450 */ 1450 */
1451 if (max_size && ti->type->merge) 1451 if (max_size && ti->type->merge)
1452 max_size = ti->type->merge(ti, bvm, biovec, max_size); 1452 max_size = ti->type->merge(ti, bvm, biovec, max_size);
1453 /* 1453 /*
1454 * If the target doesn't support the merge method and some of the 1454 * If the target doesn't support the merge method and some of the
1455 * devices provide their own merge_bvec method (we know this by looking 1455 * devices provide their own merge_bvec method (we know this by looking
1456 * at queue_max_hw_sectors), then we can't allow bios with multiple 1456 * at queue_max_hw_sectors), then we can't allow bios with multiple
1457 * vector entries. So always set max_size to 0, and the code below 1457 * vector entries. So always set max_size to 0, and the code below
1458 * allows just one page. 1458 * allows just one page.
1459 */ 1459 */
1460 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9) 1460 else if (queue_max_hw_sectors(q) <= PAGE_SIZE >> 9)
1461 1461
1462 max_size = 0; 1462 max_size = 0;
1463 1463
1464 out_table: 1464 out_table:
1465 dm_table_put(map); 1465 dm_table_put(map);
1466 1466
1467 out: 1467 out:
1468 /* 1468 /*
1469 * Always allow an entire first page 1469 * Always allow an entire first page
1470 */ 1470 */
1471 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT)) 1471 if (max_size <= biovec->bv_len && !(bvm->bi_size >> SECTOR_SHIFT))
1472 max_size = biovec->bv_len; 1472 max_size = biovec->bv_len;
1473 1473
1474 return max_size; 1474 return max_size;
1475 } 1475 }
1476 1476
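The byte arithmetic in dm_merge_bvec() deserves a worked example: with hypothetical numbers, 8 mergeable sectors and 2048 bytes already in the bio leave room for exactly 2048 more bytes at this offset.

#include <stdio.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors */

int main(void)
{
	unsigned long long max_sectors = 8;	/* pretend max_io_len() said 8 */
	unsigned int bi_size = 2048;		/* bytes already in the bio */
	long long max_size;

	max_size = (long long)(max_sectors << SECTOR_SHIFT) - bi_size;
	if (max_size < 0)
		max_size = 0;

	printf("can merge %lld more bytes at this offset\n", max_size);	/* 2048 */
	return 0;
}
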
1477 /* 1477 /*
1478 * The request function that just remaps the bio built up by 1478 * The request function that just remaps the bio built up by
1479 * dm_merge_bvec. 1479 * dm_merge_bvec.
1480 */ 1480 */
1481 static int _dm_request(struct request_queue *q, struct bio *bio) 1481 static int _dm_request(struct request_queue *q, struct bio *bio)
1482 { 1482 {
1483 int rw = bio_data_dir(bio); 1483 int rw = bio_data_dir(bio);
1484 struct mapped_device *md = q->queuedata; 1484 struct mapped_device *md = q->queuedata;
1485 int cpu; 1485 int cpu;
1486 1486
1487 down_read(&md->io_lock); 1487 down_read(&md->io_lock);
1488 1488
1489 cpu = part_stat_lock(); 1489 cpu = part_stat_lock();
1490 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]); 1490 part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
1491 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio)); 1491 part_stat_add(cpu, &dm_disk(md)->part0, sectors[rw], bio_sectors(bio));
1492 part_stat_unlock(); 1492 part_stat_unlock();
1493 1493
1494 /* 1494 /*
1495 * If we're suspended or the thread is processing barriers, 1495 * If we're suspended or the thread is processing barriers,
1496 * we have to queue this io for later. 1496 * we have to queue this io for later.
1497 */ 1497 */
1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) || 1498 if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) { 1499 unlikely(bio->bi_rw & REQ_HARDBARRIER)) {
1500 up_read(&md->io_lock); 1500 up_read(&md->io_lock);
1501 1501
1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) && 1502 if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
1503 bio_rw(bio) == READA) { 1503 bio_rw(bio) == READA) {
1504 bio_io_error(bio); 1504 bio_io_error(bio);
1505 return 0; 1505 return 0;
1506 } 1506 }
1507 1507
1508 queue_io(md, bio); 1508 queue_io(md, bio);
1509 1509
1510 return 0; 1510 return 0;
1511 } 1511 }
1512 1512
1513 __split_and_process_bio(md, bio); 1513 __split_and_process_bio(md, bio);
1514 up_read(&md->io_lock); 1514 up_read(&md->io_lock);
1515 return 0; 1515 return 0;
1516 } 1516 }
1517 1517
1518 static int dm_make_request(struct request_queue *q, struct bio *bio) 1518 static int dm_make_request(struct request_queue *q, struct bio *bio)
1519 { 1519 {
1520 struct mapped_device *md = q->queuedata; 1520 struct mapped_device *md = q->queuedata;
1521 1521
1522 return md->saved_make_request_fn(q, bio); /* call __make_request() */ 1522 return md->saved_make_request_fn(q, bio); /* call __make_request() */
1523 } 1523 }
1524 1524
1525 static int dm_request_based(struct mapped_device *md) 1525 static int dm_request_based(struct mapped_device *md)
1526 { 1526 {
1527 return blk_queue_stackable(md->queue); 1527 return blk_queue_stackable(md->queue);
1528 } 1528 }
1529 1529
1530 static int dm_request(struct request_queue *q, struct bio *bio) 1530 static int dm_request(struct request_queue *q, struct bio *bio)
1531 { 1531 {
1532 struct mapped_device *md = q->queuedata; 1532 struct mapped_device *md = q->queuedata;
1533 1533
1534 if (dm_request_based(md)) 1534 if (dm_request_based(md))
1535 return dm_make_request(q, bio); 1535 return dm_make_request(q, bio);
1536 1536
1537 return _dm_request(q, bio); 1537 return _dm_request(q, bio);
1538 } 1538 }
1539 1539
1540 static bool dm_rq_is_flush_request(struct request *rq) 1540 static bool dm_rq_is_flush_request(struct request *rq)
1541 { 1541 {
1542 if (rq->cmd_flags & REQ_FLUSH) 1542 if (rq->cmd_flags & REQ_FLUSH)
1543 return true; 1543 return true;
1544 else 1544 else
1545 return false; 1545 return false;
1546 } 1546 }
1547 1547
1548 void dm_dispatch_request(struct request *rq) 1548 void dm_dispatch_request(struct request *rq)
1549 { 1549 {
1550 int r; 1550 int r;
1551 1551
1552 if (blk_queue_io_stat(rq->q)) 1552 if (blk_queue_io_stat(rq->q))
1553 rq->cmd_flags |= REQ_IO_STAT; 1553 rq->cmd_flags |= REQ_IO_STAT;
1554 1554
1555 rq->start_time = jiffies; 1555 rq->start_time = jiffies;
1556 r = blk_insert_cloned_request(rq->q, rq); 1556 r = blk_insert_cloned_request(rq->q, rq);
1557 if (r) 1557 if (r)
1558 dm_complete_request(rq, r); 1558 dm_complete_request(rq, r);
1559 } 1559 }
1560 EXPORT_SYMBOL_GPL(dm_dispatch_request); 1560 EXPORT_SYMBOL_GPL(dm_dispatch_request);
1561 1561
1562 static void dm_rq_bio_destructor(struct bio *bio) 1562 static void dm_rq_bio_destructor(struct bio *bio)
1563 { 1563 {
1564 struct dm_rq_clone_bio_info *info = bio->bi_private; 1564 struct dm_rq_clone_bio_info *info = bio->bi_private;
1565 struct mapped_device *md = info->tio->md; 1565 struct mapped_device *md = info->tio->md;
1566 1566
1567 free_bio_info(info); 1567 free_bio_info(info);
1568 bio_free(bio, md->bs); 1568 bio_free(bio, md->bs);
1569 } 1569 }
1570 1570
1571 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig, 1571 static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
1572 void *data) 1572 void *data)
1573 { 1573 {
1574 struct dm_rq_target_io *tio = data; 1574 struct dm_rq_target_io *tio = data;
1575 struct mapped_device *md = tio->md; 1575 struct mapped_device *md = tio->md;
1576 struct dm_rq_clone_bio_info *info = alloc_bio_info(md); 1576 struct dm_rq_clone_bio_info *info = alloc_bio_info(md);
1577 1577
1578 if (!info) 1578 if (!info)
1579 return -ENOMEM; 1579 return -ENOMEM;
1580 1580
1581 info->orig = bio_orig; 1581 info->orig = bio_orig;
1582 info->tio = tio; 1582 info->tio = tio;
1583 bio->bi_end_io = end_clone_bio; 1583 bio->bi_end_io = end_clone_bio;
1584 bio->bi_private = info; 1584 bio->bi_private = info;
1585 bio->bi_destructor = dm_rq_bio_destructor; 1585 bio->bi_destructor = dm_rq_bio_destructor;
1586 1586
1587 return 0; 1587 return 0;
1588 } 1588 }
1589 1589
1590 static int setup_clone(struct request *clone, struct request *rq, 1590 static int setup_clone(struct request *clone, struct request *rq,
1591 struct dm_rq_target_io *tio) 1591 struct dm_rq_target_io *tio)
1592 { 1592 {
1593 int r; 1593 int r;
1594 1594
1595 if (dm_rq_is_flush_request(rq)) { 1595 if (dm_rq_is_flush_request(rq)) {
1596 blk_rq_init(NULL, clone); 1596 blk_rq_init(NULL, clone);
1597 clone->cmd_type = REQ_TYPE_FS; 1597 clone->cmd_type = REQ_TYPE_FS;
1598 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE); 1598 clone->cmd_flags |= (REQ_HARDBARRIER | WRITE);
1599 } else { 1599 } else {
1600 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC, 1600 r = blk_rq_prep_clone(clone, rq, tio->md->bs, GFP_ATOMIC,
1601 dm_rq_bio_constructor, tio); 1601 dm_rq_bio_constructor, tio);
1602 if (r) 1602 if (r)
1603 return r; 1603 return r;
1604 1604
1605 clone->cmd = rq->cmd; 1605 clone->cmd = rq->cmd;
1606 clone->cmd_len = rq->cmd_len; 1606 clone->cmd_len = rq->cmd_len;
1607 clone->sense = rq->sense; 1607 clone->sense = rq->sense;
1608 clone->buffer = rq->buffer; 1608 clone->buffer = rq->buffer;
1609 } 1609 }
1610 1610
1611 clone->end_io = end_clone_request; 1611 clone->end_io = end_clone_request;
1612 clone->end_io_data = tio; 1612 clone->end_io_data = tio;
1613 1613
1614 return 0; 1614 return 0;
1615 } 1615 }
1616 1616
1617 static struct request *clone_rq(struct request *rq, struct mapped_device *md, 1617 static struct request *clone_rq(struct request *rq, struct mapped_device *md,
1618 gfp_t gfp_mask) 1618 gfp_t gfp_mask)
1619 { 1619 {
1620 struct request *clone; 1620 struct request *clone;
1621 struct dm_rq_target_io *tio; 1621 struct dm_rq_target_io *tio;
1622 1622
1623 tio = alloc_rq_tio(md, gfp_mask); 1623 tio = alloc_rq_tio(md, gfp_mask);
1624 if (!tio) 1624 if (!tio)
1625 return NULL; 1625 return NULL;
1626 1626
1627 tio->md = md; 1627 tio->md = md;
1628 tio->ti = NULL; 1628 tio->ti = NULL;
1629 tio->orig = rq; 1629 tio->orig = rq;
1630 tio->error = 0; 1630 tio->error = 0;
1631 memset(&tio->info, 0, sizeof(tio->info)); 1631 memset(&tio->info, 0, sizeof(tio->info));
1632 1632
1633 clone = &tio->clone; 1633 clone = &tio->clone;
1634 if (setup_clone(clone, rq, tio)) { 1634 if (setup_clone(clone, rq, tio)) {
1635 /* -ENOMEM */ 1635 /* -ENOMEM */
1636 free_rq_tio(tio); 1636 free_rq_tio(tio);
1637 return NULL; 1637 return NULL;
1638 } 1638 }
1639 1639
1640 return clone; 1640 return clone;
1641 } 1641 }
1642 1642
1643 /* 1643 /*
1644 * Called with the queue lock held. 1644 * Called with the queue lock held.
1645 */ 1645 */
1646 static int dm_prep_fn(struct request_queue *q, struct request *rq) 1646 static int dm_prep_fn(struct request_queue *q, struct request *rq)
1647 { 1647 {
1648 struct mapped_device *md = q->queuedata; 1648 struct mapped_device *md = q->queuedata;
1649 struct request *clone; 1649 struct request *clone;
1650 1650
1651 if (unlikely(dm_rq_is_flush_request(rq))) 1651 if (unlikely(dm_rq_is_flush_request(rq)))
1652 return BLKPREP_OK; 1652 return BLKPREP_OK;
1653 1653
1654 if (unlikely(rq->special)) { 1654 if (unlikely(rq->special)) {
1655 DMWARN("Already has something in rq->special."); 1655 DMWARN("Already has something in rq->special.");
1656 return BLKPREP_KILL; 1656 return BLKPREP_KILL;
1657 } 1657 }
1658 1658
1659 clone = clone_rq(rq, md, GFP_ATOMIC); 1659 clone = clone_rq(rq, md, GFP_ATOMIC);
1660 if (!clone) 1660 if (!clone)
1661 return BLKPREP_DEFER; 1661 return BLKPREP_DEFER;
1662 1662
1663 rq->special = clone; 1663 rq->special = clone;
1664 rq->cmd_flags |= REQ_DONTPREP; 1664 rq->cmd_flags |= REQ_DONTPREP;
1665 1665
1666 return BLKPREP_OK; 1666 return BLKPREP_OK;
1667 } 1667 }
1668 1668
1669 /* 1669 /*
1670 * Returns: 1670 * Returns:
1671 * 0 : the request has been processed (not requeued) 1671 * 0 : the request has been processed (not requeued)
1672 * !0 : the request has been requeued 1672 * !0 : the request has been requeued
1673 */ 1673 */
1674 static int map_request(struct dm_target *ti, struct request *clone, 1674 static int map_request(struct dm_target *ti, struct request *clone,
1675 struct mapped_device *md) 1675 struct mapped_device *md)
1676 { 1676 {
1677 int r, requeued = 0; 1677 int r, requeued = 0;
1678 struct dm_rq_target_io *tio = clone->end_io_data; 1678 struct dm_rq_target_io *tio = clone->end_io_data;
1679 1679
1680 /* 1680 /*
1681 * Hold the md reference here for the in-flight I/O. 1681 * Hold the md reference here for the in-flight I/O.
1682 * We can't rely on the reference count held by the device opener, 1682 * We can't rely on the reference count held by the device opener,
1683 * because the device may be closed during the request completion 1683 * because the device may be closed during the request completion
1684 * when all bios are completed. 1684 * when all bios are completed.
1685 * See the comment in rq_completed() too. 1685 * See the comment in rq_completed() too.
1686 */ 1686 */
1687 dm_get(md); 1687 dm_get(md);
1688 1688
1689 tio->ti = ti; 1689 tio->ti = ti;
1690 r = ti->type->map_rq(ti, clone, &tio->info); 1690 r = ti->type->map_rq(ti, clone, &tio->info);
1691 switch (r) { 1691 switch (r) {
1692 case DM_MAPIO_SUBMITTED: 1692 case DM_MAPIO_SUBMITTED:
1693 /* The target has taken the I/O to submit by itself later */ 1693 /* The target has taken the I/O to submit by itself later */
1694 break; 1694 break;
1695 case DM_MAPIO_REMAPPED: 1695 case DM_MAPIO_REMAPPED:
1696 /* The target has remapped the I/O so dispatch it */ 1696 /* The target has remapped the I/O so dispatch it */
1697 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)), 1697 trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
1698 blk_rq_pos(tio->orig)); 1698 blk_rq_pos(tio->orig));
1699 dm_dispatch_request(clone); 1699 dm_dispatch_request(clone);
1700 break; 1700 break;
1701 case DM_MAPIO_REQUEUE: 1701 case DM_MAPIO_REQUEUE:
1702 /* The target wants to requeue the I/O */ 1702 /* The target wants to requeue the I/O */
1703 dm_requeue_unmapped_request(clone); 1703 dm_requeue_unmapped_request(clone);
1704 requeued = 1; 1704 requeued = 1;
1705 break; 1705 break;
1706 default: 1706 default:
1707 if (r > 0) { 1707 if (r > 0) {
1708 DMWARN("unimplemented target map return value: %d", r); 1708 DMWARN("unimplemented target map return value: %d", r);
1709 BUG(); 1709 BUG();
1710 } 1710 }
1711 1711
1712 /* The target wants to complete the I/O */ 1712 /* The target wants to complete the I/O */
1713 dm_kill_unmapped_request(clone, r); 1713 dm_kill_unmapped_request(clone, r);
1714 break; 1714 break;
1715 } 1715 }
1716 1716
1717 return requeued; 1717 return requeued;
1718 } 1718 }
1719 1719
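As a hedged sketch of the request-based side, this is roughly what a ->map_rq method called by map_request() above might do, loosely modelled on a multipath-style target. my_choose_bdev() is a hypothetical path selector, not a real API; the request fields and DM_MAPIO_* values are from the block and device-mapper layers.

#include <linux/blkdev.h>
#include <linux/device-mapper.h>

/* Hypothetical helper picking an underlying device for this I/O. */
struct block_device *my_choose_bdev(struct dm_target *ti);

static int my_map_rq(struct dm_target *ti, struct request *clone,
		     union map_info *map_context)
{
	struct block_device *bdev = my_choose_bdev(ti);

	if (!bdev)
		return DM_MAPIO_REQUEUE;	/* no usable path right now */

	/* Redirect the clone to the chosen device's queue. */
	clone->q = bdev_get_queue(bdev);
	clone->rq_disk = bdev->bd_disk;

	return DM_MAPIO_REMAPPED;	/* map_request() dispatches the clone */
}
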
1720 /* 1720 /*
1721 * q->request_fn for request-based dm. 1721 * q->request_fn for request-based dm.
1722 * Called with the queue lock held. 1722 * Called with the queue lock held.
1723 */ 1723 */
1724 static void dm_request_fn(struct request_queue *q) 1724 static void dm_request_fn(struct request_queue *q)
1725 { 1725 {
1726 struct mapped_device *md = q->queuedata; 1726 struct mapped_device *md = q->queuedata;
1727 struct dm_table *map = dm_get_live_table(md); 1727 struct dm_table *map = dm_get_live_table(md);
1728 struct dm_target *ti; 1728 struct dm_target *ti;
1729 struct request *rq, *clone; 1729 struct request *rq, *clone;
1730 1730
1731 /* 1731 /*
1732 * For suspend, check blk_queue_stopped() and increment 1732 * For suspend, check blk_queue_stopped() and increment
1733 * ->pending within a single queue_lock so that the number of 1733 * ->pending within a single queue_lock so that the number of
1734 * in-flight I/Os is not incremented after the queue is stopped 1734 * in-flight I/Os is not incremented after the queue is stopped
1735 * in dm_suspend(). 1735 * in dm_suspend().
1736 */ 1736 */
1737 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) { 1737 while (!blk_queue_plugged(q) && !blk_queue_stopped(q)) {
1738 rq = blk_peek_request(q); 1738 rq = blk_peek_request(q);
1739 if (!rq) 1739 if (!rq)
1740 goto plug_and_out; 1740 goto plug_and_out;
1741 1741
1742 if (unlikely(dm_rq_is_flush_request(rq))) { 1742 if (unlikely(dm_rq_is_flush_request(rq))) {
1743 BUG_ON(md->flush_request); 1743 BUG_ON(md->flush_request);
1744 md->flush_request = rq; 1744 md->flush_request = rq;
1745 blk_start_request(rq); 1745 blk_start_request(rq);
1746 queue_work(md->wq, &md->barrier_work); 1746 queue_work(md->wq, &md->barrier_work);
1747 goto out; 1747 goto out;
1748 } 1748 }
1749 1749
1750 ti = dm_table_find_target(map, blk_rq_pos(rq)); 1750 ti = dm_table_find_target(map, blk_rq_pos(rq));
1751 if (ti->type->busy && ti->type->busy(ti)) 1751 if (ti->type->busy && ti->type->busy(ti))
1752 goto plug_and_out; 1752 goto plug_and_out;
1753 1753
1754 blk_start_request(rq); 1754 blk_start_request(rq);
1755 clone = rq->special; 1755 clone = rq->special;
1756 atomic_inc(&md->pending[rq_data_dir(clone)]); 1756 atomic_inc(&md->pending[rq_data_dir(clone)]);
1757 1757
1758 spin_unlock(q->queue_lock); 1758 spin_unlock(q->queue_lock);
1759 if (map_request(ti, clone, md)) 1759 if (map_request(ti, clone, md))
1760 goto requeued; 1760 goto requeued;
1761 1761
1762 spin_lock_irq(q->queue_lock); 1762 spin_lock_irq(q->queue_lock);
1763 } 1763 }
1764 1764
1765 goto out; 1765 goto out;
1766 1766
1767 requeued: 1767 requeued:
1768 spin_lock_irq(q->queue_lock); 1768 spin_lock_irq(q->queue_lock);
1769 1769
1770 plug_and_out: 1770 plug_and_out:
1771 if (!elv_queue_empty(q)) 1771 if (!elv_queue_empty(q))
1772 /* Some requests still remain, retry later */ 1772 /* Some requests still remain, retry later */
1773 blk_plug_device(q); 1773 blk_plug_device(q);
1774 1774
1775 out: 1775 out:
1776 dm_table_put(map); 1776 dm_table_put(map);
1777 1777
1778 return; 1778 return;
1779 } 1779 }
1780 1780
1781 int dm_underlying_device_busy(struct request_queue *q) 1781 int dm_underlying_device_busy(struct request_queue *q)
1782 { 1782 {
1783 return blk_lld_busy(q); 1783 return blk_lld_busy(q);
1784 } 1784 }
1785 EXPORT_SYMBOL_GPL(dm_underlying_device_busy); 1785 EXPORT_SYMBOL_GPL(dm_underlying_device_busy);
1786 1786
1787 static int dm_lld_busy(struct request_queue *q) 1787 static int dm_lld_busy(struct request_queue *q)
1788 { 1788 {
1789 int r; 1789 int r;
1790 struct mapped_device *md = q->queuedata; 1790 struct mapped_device *md = q->queuedata;
1791 struct dm_table *map = dm_get_live_table(md); 1791 struct dm_table *map = dm_get_live_table(md);
1792 1792
1793 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) 1793 if (!map || test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))
1794 r = 1; 1794 r = 1;
1795 else 1795 else
1796 r = dm_table_any_busy_target(map); 1796 r = dm_table_any_busy_target(map);
1797 1797
1798 dm_table_put(map); 1798 dm_table_put(map);
1799 1799
1800 return r; 1800 return r;
1801 } 1801 }
1802 1802
1803 static void dm_unplug_all(struct request_queue *q) 1803 static void dm_unplug_all(struct request_queue *q)
1804 { 1804 {
1805 struct mapped_device *md = q->queuedata; 1805 struct mapped_device *md = q->queuedata;
1806 struct dm_table *map = dm_get_live_table(md); 1806 struct dm_table *map = dm_get_live_table(md);
1807 1807
1808 if (map) { 1808 if (map) {
1809 if (dm_request_based(md)) 1809 if (dm_request_based(md))
1810 generic_unplug_device(q); 1810 generic_unplug_device(q);
1811 1811
1812 dm_table_unplug_all(map); 1812 dm_table_unplug_all(map);
1813 dm_table_put(map); 1813 dm_table_put(map);
1814 } 1814 }
1815 } 1815 }
1816 1816
1817 static int dm_any_congested(void *congested_data, int bdi_bits) 1817 static int dm_any_congested(void *congested_data, int bdi_bits)
1818 { 1818 {
1819 int r = bdi_bits; 1819 int r = bdi_bits;
1820 struct mapped_device *md = congested_data; 1820 struct mapped_device *md = congested_data;
1821 struct dm_table *map; 1821 struct dm_table *map;
1822 1822
1823 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 1823 if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
1824 map = dm_get_live_table(md); 1824 map = dm_get_live_table(md);
1825 if (map) { 1825 if (map) {
1826 /* 1826 /*
1827 * Request-based dm cares only about its own queue when 1827 * Request-based dm cares only about its own queue when
1828 * queried about the congestion status of the request_queue 1828 * queried about the congestion status of the request_queue
1829 */ 1829 */
1830 if (dm_request_based(md)) 1830 if (dm_request_based(md))
1831 r = md->queue->backing_dev_info.state & 1831 r = md->queue->backing_dev_info.state &
1832 bdi_bits; 1832 bdi_bits;
1833 else 1833 else
1834 r = dm_table_any_congested(map, bdi_bits); 1834 r = dm_table_any_congested(map, bdi_bits);
1835 1835
1836 dm_table_put(map); 1836 dm_table_put(map);
1837 } 1837 }
1838 } 1838 }
1839 1839
1840 return r; 1840 return r;
1841 } 1841 }
1842 1842
1843 /*----------------------------------------------------------------- 1843 /*-----------------------------------------------------------------
1844 * An IDR is used to keep track of allocated minor numbers. 1844 * An IDR is used to keep track of allocated minor numbers.
1845 *---------------------------------------------------------------*/ 1845 *---------------------------------------------------------------*/
1846 static DEFINE_IDR(_minor_idr); 1846 static DEFINE_IDR(_minor_idr);
1847 1847
1848 static void free_minor(int minor) 1848 static void free_minor(int minor)
1849 { 1849 {
1850 spin_lock(&_minor_lock); 1850 spin_lock(&_minor_lock);
1851 idr_remove(&_minor_idr, minor); 1851 idr_remove(&_minor_idr, minor);
1852 spin_unlock(&_minor_lock); 1852 spin_unlock(&_minor_lock);
1853 } 1853 }
1854 1854
1855 /* 1855 /*
1856 * See if the device with a specific minor # is free. 1856 * See if the device with a specific minor # is free.
1857 */ 1857 */
1858 static int specific_minor(int minor) 1858 static int specific_minor(int minor)
1859 { 1859 {
1860 int r, m; 1860 int r, m;
1861 1861
1862 if (minor >= (1 << MINORBITS)) 1862 if (minor >= (1 << MINORBITS))
1863 return -EINVAL; 1863 return -EINVAL;
1864 1864
1865 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1865 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1866 if (!r) 1866 if (!r)
1867 return -ENOMEM; 1867 return -ENOMEM;
1868 1868
1869 spin_lock(&_minor_lock); 1869 spin_lock(&_minor_lock);
1870 1870
1871 if (idr_find(&_minor_idr, minor)) { 1871 if (idr_find(&_minor_idr, minor)) {
1872 r = -EBUSY; 1872 r = -EBUSY;
1873 goto out; 1873 goto out;
1874 } 1874 }
1875 1875
1876 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m); 1876 r = idr_get_new_above(&_minor_idr, MINOR_ALLOCED, minor, &m);
1877 if (r) 1877 if (r)
1878 goto out; 1878 goto out;
1879 1879
1880 if (m != minor) { 1880 if (m != minor) {
1881 idr_remove(&_minor_idr, m); 1881 idr_remove(&_minor_idr, m);
1882 r = -EBUSY; 1882 r = -EBUSY;
1883 goto out; 1883 goto out;
1884 } 1884 }
1885 1885
1886 out: 1886 out:
1887 spin_unlock(&_minor_lock); 1887 spin_unlock(&_minor_lock);
1888 return r; 1888 return r;
1889 } 1889 }
1890 1890
1891 static int next_free_minor(int *minor) 1891 static int next_free_minor(int *minor)
1892 { 1892 {
1893 int r, m; 1893 int r, m;
1894 1894
1895 r = idr_pre_get(&_minor_idr, GFP_KERNEL); 1895 r = idr_pre_get(&_minor_idr, GFP_KERNEL);
1896 if (!r) 1896 if (!r)
1897 return -ENOMEM; 1897 return -ENOMEM;
1898 1898
1899 spin_lock(&_minor_lock); 1899 spin_lock(&_minor_lock);
1900 1900
1901 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m); 1901 r = idr_get_new(&_minor_idr, MINOR_ALLOCED, &m);
1902 if (r) 1902 if (r)
1903 goto out; 1903 goto out;
1904 1904
1905 if (m >= (1 << MINORBITS)) { 1905 if (m >= (1 << MINORBITS)) {
1906 idr_remove(&_minor_idr, m); 1906 idr_remove(&_minor_idr, m);
1907 r = -ENOSPC; 1907 r = -ENOSPC;
1908 goto out; 1908 goto out;
1909 } 1909 }
1910 1910
1911 *minor = m; 1911 *minor = m;
1912 1912
1913 out: 1913 out:
1914 spin_unlock(&_minor_lock); 1914 spin_unlock(&_minor_lock);
1915 return r; 1915 return r;
1916 } 1916 }
1917 1917
1918 static const struct block_device_operations dm_blk_dops; 1918 static const struct block_device_operations dm_blk_dops;
1919 1919
1920 static void dm_wq_work(struct work_struct *work); 1920 static void dm_wq_work(struct work_struct *work);
1921 static void dm_rq_barrier_work(struct work_struct *work); 1921 static void dm_rq_barrier_work(struct work_struct *work);
1922 1922
1923 static void dm_init_md_queue(struct mapped_device *md) 1923 static void dm_init_md_queue(struct mapped_device *md)
1924 { 1924 {
1925 /* 1925 /*
1926 * Request-based dm devices cannot be stacked on top of bio-based dm 1926 * Request-based dm devices cannot be stacked on top of bio-based dm
1927 * devices. The type of this dm device has not been decided yet. 1927 * devices. The type of this dm device has not been decided yet.
1928 * The type is decided at the first table loading time. 1928 * The type is decided at the first table loading time.
1929 * To prevent problematic device stacking, clear the queue flag 1929 * To prevent problematic device stacking, clear the queue flag
1930 * for request stacking support until then. 1930 * for request stacking support until then.
1931 * 1931 *
1932 * This queue is new, so no concurrency on the queue_flags. 1932 * This queue is new, so no concurrency on the queue_flags.
1933 */ 1933 */
1934 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue); 1934 queue_flag_clear_unlocked(QUEUE_FLAG_STACKABLE, md->queue);
1935 1935
1936 md->queue->queuedata = md; 1936 md->queue->queuedata = md;
1937 md->queue->backing_dev_info.congested_fn = dm_any_congested; 1937 md->queue->backing_dev_info.congested_fn = dm_any_congested;
1938 md->queue->backing_dev_info.congested_data = md; 1938 md->queue->backing_dev_info.congested_data = md;
1939 blk_queue_make_request(md->queue, dm_request); 1939 blk_queue_make_request(md->queue, dm_request);
1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); 1940 blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY);
1941 md->queue->unplug_fn = dm_unplug_all; 1941 md->queue->unplug_fn = dm_unplug_all;
1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec); 1942 blk_queue_merge_bvec(md->queue, dm_merge_bvec);
1943 } 1943 }
1944 1944
1945 /* 1945 /*
1946 * Allocate and initialise a blank device with a given minor. 1946 * Allocate and initialise a blank device with a given minor.
1947 */ 1947 */
1948 static struct mapped_device *alloc_dev(int minor) 1948 static struct mapped_device *alloc_dev(int minor)
1949 { 1949 {
1950 int r; 1950 int r;
1951 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); 1951 struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
1952 void *old_md; 1952 void *old_md;
1953 1953
1954 if (!md) { 1954 if (!md) {
1955 DMWARN("unable to allocate device, out of memory."); 1955 DMWARN("unable to allocate device, out of memory.");
1956 return NULL; 1956 return NULL;
1957 } 1957 }
1958 1958
1959 if (!try_module_get(THIS_MODULE)) 1959 if (!try_module_get(THIS_MODULE))
1960 goto bad_module_get; 1960 goto bad_module_get;
1961 1961
1962 /* get a minor number for the dev */ 1962 /* get a minor number for the dev */
1963 if (minor == DM_ANY_MINOR) 1963 if (minor == DM_ANY_MINOR)
1964 r = next_free_minor(&minor); 1964 r = next_free_minor(&minor);
1965 else 1965 else
1966 r = specific_minor(minor); 1966 r = specific_minor(minor);
1967 if (r < 0) 1967 if (r < 0)
1968 goto bad_minor; 1968 goto bad_minor;
1969 1969
1970 md->type = DM_TYPE_NONE; 1970 md->type = DM_TYPE_NONE;
1971 init_rwsem(&md->io_lock); 1971 init_rwsem(&md->io_lock);
1972 mutex_init(&md->suspend_lock); 1972 mutex_init(&md->suspend_lock);
1973 mutex_init(&md->type_lock); 1973 mutex_init(&md->type_lock);
1974 spin_lock_init(&md->deferred_lock); 1974 spin_lock_init(&md->deferred_lock);
1975 spin_lock_init(&md->barrier_error_lock); 1975 spin_lock_init(&md->barrier_error_lock);
1976 rwlock_init(&md->map_lock); 1976 rwlock_init(&md->map_lock);
1977 atomic_set(&md->holders, 1); 1977 atomic_set(&md->holders, 1);
1978 atomic_set(&md->open_count, 0); 1978 atomic_set(&md->open_count, 0);
1979 atomic_set(&md->event_nr, 0); 1979 atomic_set(&md->event_nr, 0);
1980 atomic_set(&md->uevent_seq, 0); 1980 atomic_set(&md->uevent_seq, 0);
1981 INIT_LIST_HEAD(&md->uevent_list); 1981 INIT_LIST_HEAD(&md->uevent_list);
1982 spin_lock_init(&md->uevent_lock); 1982 spin_lock_init(&md->uevent_lock);
1983 1983
1984 md->queue = blk_alloc_queue(GFP_KERNEL); 1984 md->queue = blk_alloc_queue(GFP_KERNEL);
1985 if (!md->queue) 1985 if (!md->queue)
1986 goto bad_queue; 1986 goto bad_queue;
1987 1987
1988 dm_init_md_queue(md); 1988 dm_init_md_queue(md);
1989 1989
1990 md->disk = alloc_disk(1); 1990 md->disk = alloc_disk(1);
1991 if (!md->disk) 1991 if (!md->disk)
1992 goto bad_disk; 1992 goto bad_disk;
1993 1993
1994 atomic_set(&md->pending[0], 0); 1994 atomic_set(&md->pending[0], 0);
1995 atomic_set(&md->pending[1], 0); 1995 atomic_set(&md->pending[1], 0);
1996 init_waitqueue_head(&md->wait); 1996 init_waitqueue_head(&md->wait);
1997 INIT_WORK(&md->work, dm_wq_work); 1997 INIT_WORK(&md->work, dm_wq_work);
1998 INIT_WORK(&md->barrier_work, dm_rq_barrier_work); 1998 INIT_WORK(&md->barrier_work, dm_rq_barrier_work);
1999 init_waitqueue_head(&md->eventq); 1999 init_waitqueue_head(&md->eventq);
2000 2000
2001 md->disk->major = _major; 2001 md->disk->major = _major;
2002 md->disk->first_minor = minor; 2002 md->disk->first_minor = minor;
2003 md->disk->fops = &dm_blk_dops; 2003 md->disk->fops = &dm_blk_dops;
2004 md->disk->queue = md->queue; 2004 md->disk->queue = md->queue;
2005 md->disk->private_data = md; 2005 md->disk->private_data = md;
2006 sprintf(md->disk->disk_name, "dm-%d", minor); 2006 sprintf(md->disk->disk_name, "dm-%d", minor);
2007 add_disk(md->disk); 2007 add_disk(md->disk);
2008 format_dev_t(md->name, MKDEV(_major, minor)); 2008 format_dev_t(md->name, MKDEV(_major, minor));
2009 2009
2010 md->wq = create_singlethread_workqueue("kdmflush"); 2010 md->wq = create_singlethread_workqueue("kdmflush");
2011 if (!md->wq) 2011 if (!md->wq)
2012 goto bad_thread; 2012 goto bad_thread;
2013 2013
2014 md->bdev = bdget_disk(md->disk, 0); 2014 md->bdev = bdget_disk(md->disk, 0);
2015 if (!md->bdev) 2015 if (!md->bdev)
2016 goto bad_bdev; 2016 goto bad_bdev;
2017 2017
2018 /* Populate the mapping, nobody knows we exist yet */ 2018 /* Populate the mapping, nobody knows we exist yet */
2019 spin_lock(&_minor_lock); 2019 spin_lock(&_minor_lock);
2020 old_md = idr_replace(&_minor_idr, md, minor); 2020 old_md = idr_replace(&_minor_idr, md, minor);
2021 spin_unlock(&_minor_lock); 2021 spin_unlock(&_minor_lock);
2022 2022
2023 BUG_ON(old_md != MINOR_ALLOCED); 2023 BUG_ON(old_md != MINOR_ALLOCED);
2024 2024
2025 return md; 2025 return md;
2026 2026
2027 bad_bdev: 2027 bad_bdev:
2028 destroy_workqueue(md->wq); 2028 destroy_workqueue(md->wq);
2029 bad_thread: 2029 bad_thread:
2030 del_gendisk(md->disk); 2030 del_gendisk(md->disk);
2031 put_disk(md->disk); 2031 put_disk(md->disk);
2032 bad_disk: 2032 bad_disk:
2033 blk_cleanup_queue(md->queue); 2033 blk_cleanup_queue(md->queue);
2034 bad_queue: 2034 bad_queue:
2035 free_minor(minor); 2035 free_minor(minor);
2036 bad_minor: 2036 bad_minor:
2037 module_put(THIS_MODULE); 2037 module_put(THIS_MODULE);
2038 bad_module_get: 2038 bad_module_get:
2039 kfree(md); 2039 kfree(md);
2040 return NULL; 2040 return NULL;
2041 } 2041 }
2042 2042
2043 static void unlock_fs(struct mapped_device *md); 2043 static void unlock_fs(struct mapped_device *md);
2044 2044
2045 static void free_dev(struct mapped_device *md) 2045 static void free_dev(struct mapped_device *md)
2046 { 2046 {
2047 int minor = MINOR(disk_devt(md->disk)); 2047 int minor = MINOR(disk_devt(md->disk));
2048 2048
2049 unlock_fs(md); 2049 unlock_fs(md);
2050 bdput(md->bdev); 2050 bdput(md->bdev);
2051 destroy_workqueue(md->wq); 2051 destroy_workqueue(md->wq);
2052 if (md->tio_pool) 2052 if (md->tio_pool)
2053 mempool_destroy(md->tio_pool); 2053 mempool_destroy(md->tio_pool);
2054 if (md->io_pool) 2054 if (md->io_pool)
2055 mempool_destroy(md->io_pool); 2055 mempool_destroy(md->io_pool);
2056 if (md->bs) 2056 if (md->bs)
2057 bioset_free(md->bs); 2057 bioset_free(md->bs);
2058 blk_integrity_unregister(md->disk); 2058 blk_integrity_unregister(md->disk);
2059 del_gendisk(md->disk); 2059 del_gendisk(md->disk);
2060 free_minor(minor); 2060 free_minor(minor);
2061 2061
2062 spin_lock(&_minor_lock); 2062 spin_lock(&_minor_lock);
2063 md->disk->private_data = NULL; 2063 md->disk->private_data = NULL;
2064 spin_unlock(&_minor_lock); 2064 spin_unlock(&_minor_lock);
2065 2065
2066 put_disk(md->disk); 2066 put_disk(md->disk);
2067 blk_cleanup_queue(md->queue); 2067 blk_cleanup_queue(md->queue);
2068 module_put(THIS_MODULE); 2068 module_put(THIS_MODULE);
2069 kfree(md); 2069 kfree(md);
2070 } 2070 }
2071 2071
2072 static void __bind_mempools(struct mapped_device *md, struct dm_table *t) 2072 static void __bind_mempools(struct mapped_device *md, struct dm_table *t)
2073 { 2073 {
2074 struct dm_md_mempools *p; 2074 struct dm_md_mempools *p;
2075 2075
2076 if (md->io_pool && md->tio_pool && md->bs) 2076 if (md->io_pool && md->tio_pool && md->bs)
2077 /* the md already has necessary mempools */ 2077 /* the md already has necessary mempools */
2078 goto out; 2078 goto out;
2079 2079
2080 p = dm_table_get_md_mempools(t); 2080 p = dm_table_get_md_mempools(t);
2081 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs); 2081 BUG_ON(!p || md->io_pool || md->tio_pool || md->bs);
2082 2082
2083 md->io_pool = p->io_pool; 2083 md->io_pool = p->io_pool;
2084 p->io_pool = NULL; 2084 p->io_pool = NULL;
2085 md->tio_pool = p->tio_pool; 2085 md->tio_pool = p->tio_pool;
2086 p->tio_pool = NULL; 2086 p->tio_pool = NULL;
2087 md->bs = p->bs; 2087 md->bs = p->bs;
2088 p->bs = NULL; 2088 p->bs = NULL;
2089 2089
2090 out: 2090 out:
2091 /* mempool bind completed; the table no longer needs its mempools */ 2091 /* mempool bind completed; the table no longer needs its mempools */
2092 dm_table_free_md_mempools(t); 2092 dm_table_free_md_mempools(t);
2093 } 2093 }
2094 2094
2095 /* 2095 /*
2096 * Bind a table to the device. 2096 * Bind a table to the device.
2097 */ 2097 */
2098 static void event_callback(void *context) 2098 static void event_callback(void *context)
2099 { 2099 {
2100 unsigned long flags; 2100 unsigned long flags;
2101 LIST_HEAD(uevents); 2101 LIST_HEAD(uevents);
2102 struct mapped_device *md = (struct mapped_device *) context; 2102 struct mapped_device *md = (struct mapped_device *) context;
2103 2103
2104 spin_lock_irqsave(&md->uevent_lock, flags); 2104 spin_lock_irqsave(&md->uevent_lock, flags);
2105 list_splice_init(&md->uevent_list, &uevents); 2105 list_splice_init(&md->uevent_list, &uevents);
2106 spin_unlock_irqrestore(&md->uevent_lock, flags); 2106 spin_unlock_irqrestore(&md->uevent_lock, flags);
2107 2107
2108 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj); 2108 dm_send_uevents(&uevents, &disk_to_dev(md->disk)->kobj);
2109 2109
2110 atomic_inc(&md->event_nr); 2110 atomic_inc(&md->event_nr);
2111 wake_up(&md->eventq); 2111 wake_up(&md->eventq);
2112 } 2112 }
2113 2113
2114 static void __set_size(struct mapped_device *md, sector_t size) 2114 static void __set_size(struct mapped_device *md, sector_t size)
2115 { 2115 {
2116 set_capacity(md->disk, size); 2116 set_capacity(md->disk, size);
2117 2117
2118 mutex_lock(&md->bdev->bd_inode->i_mutex); 2118 mutex_lock(&md->bdev->bd_inode->i_mutex);
2119 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT); 2119 i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
2120 mutex_unlock(&md->bdev->bd_inode->i_mutex); 2120 mutex_unlock(&md->bdev->bd_inode->i_mutex);
2121 } 2121 }
2122 2122
2123 /* 2123 /*
2124 * Returns old map, which caller must destroy. 2124 * Returns old map, which caller must destroy.
2125 */ 2125 */
2126 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, 2126 static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
2127 struct queue_limits *limits) 2127 struct queue_limits *limits)
2128 { 2128 {
2129 struct dm_table *old_map; 2129 struct dm_table *old_map;
2130 struct request_queue *q = md->queue; 2130 struct request_queue *q = md->queue;
2131 sector_t size; 2131 sector_t size;
2132 unsigned long flags; 2132 unsigned long flags;
2133 2133
2134 size = dm_table_get_size(t); 2134 size = dm_table_get_size(t);
2135 2135
2136 /* 2136 /*
2137 * Wipe any geometry if the size of the table changed. 2137 * Wipe any geometry if the size of the table changed.
2138 */ 2138 */
2139 if (size != get_capacity(md->disk)) 2139 if (size != get_capacity(md->disk))
2140 memset(&md->geometry, 0, sizeof(md->geometry)); 2140 memset(&md->geometry, 0, sizeof(md->geometry));
2141 2141
2142 __set_size(md, size); 2142 __set_size(md, size);
2143 2143
2144 dm_table_event_callback(t, event_callback, md); 2144 dm_table_event_callback(t, event_callback, md);
2145 2145
2146 /* 2146 /*
2147 * The queue hasn't been stopped yet if the old table type wasn't 2147 * The queue hasn't been stopped yet if the old table type wasn't
2148 * request-based during suspension, so stop it to prevent I/O 2148 * request-based during suspension, so stop it to prevent I/O
2149 * from being mapped before resume. 2149 * from being mapped before resume.
2150 * This must be done before setting the queue restrictions, 2150 * This must be done before setting the queue restrictions,
2151 * because request-based dm may be run just after the setting. 2151 * because request-based dm may be run just after the setting.
2152 */ 2152 */
2153 if (dm_table_request_based(t) && !blk_queue_stopped(q)) 2153 if (dm_table_request_based(t) && !blk_queue_stopped(q))
2154 stop_queue(q); 2154 stop_queue(q);
2155 2155
2156 __bind_mempools(md, t); 2156 __bind_mempools(md, t);
2157 2157
2158 write_lock_irqsave(&md->map_lock, flags); 2158 write_lock_irqsave(&md->map_lock, flags);
2159 old_map = md->map; 2159 old_map = md->map;
2160 md->map = t; 2160 md->map = t;
2161 dm_table_set_restrictions(t, q, limits); 2161 dm_table_set_restrictions(t, q, limits);
2162 write_unlock_irqrestore(&md->map_lock, flags); 2162 write_unlock_irqrestore(&md->map_lock, flags);
2163 2163
2164 return old_map; 2164 return old_map;
2165 } 2165 }
2166 2166
2167 /* 2167 /*
2168 * Returns unbound table for the caller to free. 2168 * Returns unbound table for the caller to free.
2169 */ 2169 */
2170 static struct dm_table *__unbind(struct mapped_device *md) 2170 static struct dm_table *__unbind(struct mapped_device *md)
2171 { 2171 {
2172 struct dm_table *map = md->map; 2172 struct dm_table *map = md->map;
2173 unsigned long flags; 2173 unsigned long flags;
2174 2174
2175 if (!map) 2175 if (!map)
2176 return NULL; 2176 return NULL;
2177 2177
2178 dm_table_event_callback(map, NULL, NULL); 2178 dm_table_event_callback(map, NULL, NULL);
2179 write_lock_irqsave(&md->map_lock, flags); 2179 write_lock_irqsave(&md->map_lock, flags);
2180 md->map = NULL; 2180 md->map = NULL;
2181 write_unlock_irqrestore(&md->map_lock, flags); 2181 write_unlock_irqrestore(&md->map_lock, flags);
2182 2182
2183 return map; 2183 return map;
2184 } 2184 }
2185 2185
2186 /* 2186 /*
2187 * Constructor for a new device. 2187 * Constructor for a new device.
2188 */ 2188 */
2189 int dm_create(int minor, struct mapped_device **result) 2189 int dm_create(int minor, struct mapped_device **result)
2190 { 2190 {
2191 struct mapped_device *md; 2191 struct mapped_device *md;
2192 2192
2193 md = alloc_dev(minor); 2193 md = alloc_dev(minor);
2194 if (!md) 2194 if (!md)
2195 return -ENXIO; 2195 return -ENXIO;
2196 2196
2197 dm_sysfs_init(md); 2197 dm_sysfs_init(md);
2198 2198
2199 *result = md; 2199 *result = md;
2200 return 0; 2200 return 0;
2201 } 2201 }
2202 2202
2203 /* 2203 /*
2204 * Functions to manage md->type. 2204 * Functions to manage md->type.
2205 * All are required to hold md->type_lock. 2205 * All are required to hold md->type_lock.
2206 */ 2206 */
2207 void dm_lock_md_type(struct mapped_device *md) 2207 void dm_lock_md_type(struct mapped_device *md)
2208 { 2208 {
2209 mutex_lock(&md->type_lock); 2209 mutex_lock(&md->type_lock);
2210 } 2210 }
2211 2211
2212 void dm_unlock_md_type(struct mapped_device *md) 2212 void dm_unlock_md_type(struct mapped_device *md)
2213 { 2213 {
2214 mutex_unlock(&md->type_lock); 2214 mutex_unlock(&md->type_lock);
2215 } 2215 }
2216 2216
2217 void dm_set_md_type(struct mapped_device *md, unsigned type) 2217 void dm_set_md_type(struct mapped_device *md, unsigned type)
2218 { 2218 {
2219 md->type = type; 2219 md->type = type;
2220 } 2220 }
2221 2221
2222 unsigned dm_get_md_type(struct mapped_device *md) 2222 unsigned dm_get_md_type(struct mapped_device *md)
2223 { 2223 {
2224 return md->type; 2224 return md->type;
2225 } 2225 }
2226 2226
2227 /* 2227 /*
2228 * Fully initialize a request-based queue (->elevator, ->request_fn, etc). 2228 * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
2229 */ 2229 */
2230 static int dm_init_request_based_queue(struct mapped_device *md) 2230 static int dm_init_request_based_queue(struct mapped_device *md)
2231 { 2231 {
2232 struct request_queue *q = NULL; 2232 struct request_queue *q = NULL;
2233 2233
2234 if (md->queue->elevator) 2234 if (md->queue->elevator)
2235 return 1; 2235 return 1;
2236 2236
2237 /* Fully initialize the queue */ 2237 /* Fully initialize the queue */
2238 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); 2238 q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
2239 if (!q) 2239 if (!q)
2240 return 0; 2240 return 0;
2241 2241
2242 md->queue = q; 2242 md->queue = q;
2243 md->saved_make_request_fn = md->queue->make_request_fn; 2243 md->saved_make_request_fn = md->queue->make_request_fn;
2244 dm_init_md_queue(md); 2244 dm_init_md_queue(md);
2245 blk_queue_softirq_done(md->queue, dm_softirq_done); 2245 blk_queue_softirq_done(md->queue, dm_softirq_done);
2246 blk_queue_prep_rq(md->queue, dm_prep_fn); 2246 blk_queue_prep_rq(md->queue, dm_prep_fn);
2247 blk_queue_lld_busy(md->queue, dm_lld_busy); 2247 blk_queue_lld_busy(md->queue, dm_lld_busy);
2248 blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH); 2248 blk_queue_flush(md->queue, REQ_FLUSH);
2249 2249
2250 elv_register_queue(md->queue); 2250 elv_register_queue(md->queue);
2251 2251
2252 return 1; 2252 return 1;
2253 } 2253 }
2254 2254
2255 /* 2255 /*
2256 * Setup the DM device's queue based on md's type 2256 * Setup the DM device's queue based on md's type
2257 */ 2257 */
2258 int dm_setup_md_queue(struct mapped_device *md) 2258 int dm_setup_md_queue(struct mapped_device *md)
2259 { 2259 {
2260 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) && 2260 if ((dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) &&
2261 !dm_init_request_based_queue(md)) { 2261 !dm_init_request_based_queue(md)) {
2262 DMWARN("Cannot initialize queue for request-based mapped device"); 2262 DMWARN("Cannot initialize queue for request-based mapped device");
2263 return -EINVAL; 2263 return -EINVAL;
2264 } 2264 }
2265 2265
2266 return 0; 2266 return 0;
2267 } 2267 }
2268 2268
2269 static struct mapped_device *dm_find_md(dev_t dev) 2269 static struct mapped_device *dm_find_md(dev_t dev)
2270 { 2270 {
2271 struct mapped_device *md; 2271 struct mapped_device *md;
2272 unsigned minor = MINOR(dev); 2272 unsigned minor = MINOR(dev);
2273 2273
2274 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS)) 2274 if (MAJOR(dev) != _major || minor >= (1 << MINORBITS))
2275 return NULL; 2275 return NULL;
2276 2276
2277 spin_lock(&_minor_lock); 2277 spin_lock(&_minor_lock);
2278 2278
2279 md = idr_find(&_minor_idr, minor); 2279 md = idr_find(&_minor_idr, minor);
2280 if (md && (md == MINOR_ALLOCED || 2280 if (md && (md == MINOR_ALLOCED ||
2281 (MINOR(disk_devt(dm_disk(md))) != minor) || 2281 (MINOR(disk_devt(dm_disk(md))) != minor) ||
2282 dm_deleting_md(md) || 2282 dm_deleting_md(md) ||
2283 test_bit(DMF_FREEING, &md->flags))) { 2283 test_bit(DMF_FREEING, &md->flags))) {
2284 md = NULL; 2284 md = NULL;
2285 goto out; 2285 goto out;
2286 } 2286 }
2287 2287
2288 out: 2288 out:
2289 spin_unlock(&_minor_lock); 2289 spin_unlock(&_minor_lock);
2290 2290
2291 return md; 2291 return md;
2292 } 2292 }
2293 2293
2294 struct mapped_device *dm_get_md(dev_t dev) 2294 struct mapped_device *dm_get_md(dev_t dev)
2295 { 2295 {
2296 struct mapped_device *md = dm_find_md(dev); 2296 struct mapped_device *md = dm_find_md(dev);
2297 2297
2298 if (md) 2298 if (md)
2299 dm_get(md); 2299 dm_get(md);
2300 2300
2301 return md; 2301 return md;
2302 } 2302 }
2303 2303
2304 void *dm_get_mdptr(struct mapped_device *md) 2304 void *dm_get_mdptr(struct mapped_device *md)
2305 { 2305 {
2306 return md->interface_ptr; 2306 return md->interface_ptr;
2307 } 2307 }
2308 2308
2309 void dm_set_mdptr(struct mapped_device *md, void *ptr) 2309 void dm_set_mdptr(struct mapped_device *md, void *ptr)
2310 { 2310 {
2311 md->interface_ptr = ptr; 2311 md->interface_ptr = ptr;
2312 } 2312 }
2313 2313
2314 void dm_get(struct mapped_device *md) 2314 void dm_get(struct mapped_device *md)
2315 { 2315 {
2316 atomic_inc(&md->holders); 2316 atomic_inc(&md->holders);
2317 BUG_ON(test_bit(DMF_FREEING, &md->flags)); 2317 BUG_ON(test_bit(DMF_FREEING, &md->flags));
2318 } 2318 }
2319 2319
2320 const char *dm_device_name(struct mapped_device *md) 2320 const char *dm_device_name(struct mapped_device *md)
2321 { 2321 {
2322 return md->name; 2322 return md->name;
2323 } 2323 }
2324 EXPORT_SYMBOL_GPL(dm_device_name); 2324 EXPORT_SYMBOL_GPL(dm_device_name);
2325 2325
2326 static void __dm_destroy(struct mapped_device *md, bool wait) 2326 static void __dm_destroy(struct mapped_device *md, bool wait)
2327 { 2327 {
2328 struct dm_table *map; 2328 struct dm_table *map;
2329 2329
2330 might_sleep(); 2330 might_sleep();
2331 2331
2332 spin_lock(&_minor_lock); 2332 spin_lock(&_minor_lock);
2333 map = dm_get_live_table(md); 2333 map = dm_get_live_table(md);
2334 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md)))); 2334 idr_replace(&_minor_idr, MINOR_ALLOCED, MINOR(disk_devt(dm_disk(md))));
2335 set_bit(DMF_FREEING, &md->flags); 2335 set_bit(DMF_FREEING, &md->flags);
2336 spin_unlock(&_minor_lock); 2336 spin_unlock(&_minor_lock);
2337 2337
2338 if (!dm_suspended_md(md)) { 2338 if (!dm_suspended_md(md)) {
2339 dm_table_presuspend_targets(map); 2339 dm_table_presuspend_targets(map);
2340 dm_table_postsuspend_targets(map); 2340 dm_table_postsuspend_targets(map);
2341 } 2341 }
2342 2342
2343 /* 2343 /*
2344 * Rare, but there may still be I/O requests waiting to complete, 2344 * Rare, but there may still be I/O requests waiting to complete,
2345 * for example. Wait for all references to disappear. 2345 * for example. Wait for all references to disappear.
2346 * No one should increment the reference count of the mapped_device 2346 * No one should increment the reference count of the mapped_device
2347 * after its state becomes DMF_FREEING. 2347 * after its state becomes DMF_FREEING.
2348 */ 2348 */
2349 if (wait) 2349 if (wait)
2350 while (atomic_read(&md->holders)) 2350 while (atomic_read(&md->holders))
2351 msleep(1); 2351 msleep(1);
2352 else if (atomic_read(&md->holders)) 2352 else if (atomic_read(&md->holders))
2353 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)", 2353 DMWARN("%s: Forcibly removing mapped_device still in use! (%d users)",
2354 dm_device_name(md), atomic_read(&md->holders)); 2354 dm_device_name(md), atomic_read(&md->holders));
2355 2355
2356 dm_sysfs_exit(md); 2356 dm_sysfs_exit(md);
2357 dm_table_put(map); 2357 dm_table_put(map);
2358 dm_table_destroy(__unbind(md)); 2358 dm_table_destroy(__unbind(md));
2359 free_dev(md); 2359 free_dev(md);
2360 } 2360 }
2361 2361
2362 void dm_destroy(struct mapped_device *md) 2362 void dm_destroy(struct mapped_device *md)
2363 { 2363 {
2364 __dm_destroy(md, true); 2364 __dm_destroy(md, true);
2365 } 2365 }
2366 2366
2367 void dm_destroy_immediate(struct mapped_device *md) 2367 void dm_destroy_immediate(struct mapped_device *md)
2368 { 2368 {
2369 __dm_destroy(md, false); 2369 __dm_destroy(md, false);
2370 } 2370 }
2371 2371
2372 void dm_put(struct mapped_device *md) 2372 void dm_put(struct mapped_device *md)
2373 { 2373 {
2374 atomic_dec(&md->holders); 2374 atomic_dec(&md->holders);
2375 } 2375 }
2376 EXPORT_SYMBOL_GPL(dm_put); 2376 EXPORT_SYMBOL_GPL(dm_put);
2377 2377
2378 static int dm_wait_for_completion(struct mapped_device *md, int interruptible) 2378 static int dm_wait_for_completion(struct mapped_device *md, int interruptible)
2379 { 2379 {
2380 int r = 0; 2380 int r = 0;
2381 DECLARE_WAITQUEUE(wait, current); 2381 DECLARE_WAITQUEUE(wait, current);
2382 2382
2383 dm_unplug_all(md->queue); 2383 dm_unplug_all(md->queue);
2384 2384
2385 add_wait_queue(&md->wait, &wait); 2385 add_wait_queue(&md->wait, &wait);
2386 2386
2387 while (1) { 2387 while (1) {
2388 set_current_state(interruptible); 2388 set_current_state(interruptible);
2389 2389
2390 smp_mb(); 2390 smp_mb();
2391 if (!md_in_flight(md)) 2391 if (!md_in_flight(md))
2392 break; 2392 break;
2393 2393
2394 if (interruptible == TASK_INTERRUPTIBLE && 2394 if (interruptible == TASK_INTERRUPTIBLE &&
2395 signal_pending(current)) { 2395 signal_pending(current)) {
2396 r = -EINTR; 2396 r = -EINTR;
2397 break; 2397 break;
2398 } 2398 }
2399 2399
2400 io_schedule(); 2400 io_schedule();
2401 } 2401 }
2402 set_current_state(TASK_RUNNING); 2402 set_current_state(TASK_RUNNING);
2403 2403
2404 remove_wait_queue(&md->wait, &wait); 2404 remove_wait_queue(&md->wait, &wait);
2405 2405
2406 return r; 2406 return r;
2407 } 2407 }
2408 2408
2409 static void dm_flush(struct mapped_device *md) 2409 static void dm_flush(struct mapped_device *md)
2410 { 2410 {
2411 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2411 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2412 2412
2413 bio_init(&md->barrier_bio); 2413 bio_init(&md->barrier_bio);
2414 md->barrier_bio.bi_bdev = md->bdev; 2414 md->barrier_bio.bi_bdev = md->bdev;
2415 md->barrier_bio.bi_rw = WRITE_BARRIER; 2415 md->barrier_bio.bi_rw = WRITE_BARRIER;
2416 __split_and_process_bio(md, &md->barrier_bio); 2416 __split_and_process_bio(md, &md->barrier_bio);
2417 2417
2418 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2418 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2419 } 2419 }
2420 2420
2421 static void process_barrier(struct mapped_device *md, struct bio *bio) 2421 static void process_barrier(struct mapped_device *md, struct bio *bio)
2422 { 2422 {
2423 md->barrier_error = 0; 2423 md->barrier_error = 0;
2424 2424
2425 dm_flush(md); 2425 dm_flush(md);
2426 2426
2427 if (!bio_empty_barrier(bio)) { 2427 if (!bio_empty_barrier(bio)) {
2428 __split_and_process_bio(md, bio); 2428 __split_and_process_bio(md, bio);
2429 /* 2429 /*
2430 * If the request isn't supported, don't waste time with 2430 * If the request isn't supported, don't waste time with
2431 * the second flush. 2431 * the second flush.
2432 */ 2432 */
2433 if (md->barrier_error != -EOPNOTSUPP) 2433 if (md->barrier_error != -EOPNOTSUPP)
2434 dm_flush(md); 2434 dm_flush(md);
2435 } 2435 }
2436 2436
2437 if (md->barrier_error != DM_ENDIO_REQUEUE) 2437 if (md->barrier_error != DM_ENDIO_REQUEUE)
2438 bio_endio(bio, md->barrier_error); 2438 bio_endio(bio, md->barrier_error);
2439 else { 2439 else {
2440 spin_lock_irq(&md->deferred_lock); 2440 spin_lock_irq(&md->deferred_lock);
2441 bio_list_add_head(&md->deferred, bio); 2441 bio_list_add_head(&md->deferred, bio);
2442 spin_unlock_irq(&md->deferred_lock); 2442 spin_unlock_irq(&md->deferred_lock);
2443 } 2443 }
2444 } 2444 }
2445 2445
2446 /* 2446 /*
2447 * Process the deferred bios 2447 * Process the deferred bios
2448 */ 2448 */
2449 static void dm_wq_work(struct work_struct *work) 2449 static void dm_wq_work(struct work_struct *work)
2450 { 2450 {
2451 struct mapped_device *md = container_of(work, struct mapped_device, 2451 struct mapped_device *md = container_of(work, struct mapped_device,
2452 work); 2452 work);
2453 struct bio *c; 2453 struct bio *c;
2454 2454
2455 down_write(&md->io_lock); 2455 down_write(&md->io_lock);
2456 2456
2457 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { 2457 while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
2458 spin_lock_irq(&md->deferred_lock); 2458 spin_lock_irq(&md->deferred_lock);
2459 c = bio_list_pop(&md->deferred); 2459 c = bio_list_pop(&md->deferred);
2460 spin_unlock_irq(&md->deferred_lock); 2460 spin_unlock_irq(&md->deferred_lock);
2461 2461
2462 if (!c) { 2462 if (!c) {
2463 clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2463 clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2464 break; 2464 break;
2465 } 2465 }
2466 2466
2467 up_write(&md->io_lock); 2467 up_write(&md->io_lock);
2468 2468
2469 if (dm_request_based(md)) 2469 if (dm_request_based(md))
2470 generic_make_request(c); 2470 generic_make_request(c);
2471 else { 2471 else {
2472 if (c->bi_rw & REQ_HARDBARRIER) 2472 if (c->bi_rw & REQ_HARDBARRIER)
2473 process_barrier(md, c); 2473 process_barrier(md, c);
2474 else 2474 else
2475 __split_and_process_bio(md, c); 2475 __split_and_process_bio(md, c);
2476 } 2476 }
2477 2477
2478 down_write(&md->io_lock); 2478 down_write(&md->io_lock);
2479 } 2479 }
2480 2480
2481 up_write(&md->io_lock); 2481 up_write(&md->io_lock);
2482 } 2482 }
2483 2483
2484 static void dm_queue_flush(struct mapped_device *md) 2484 static void dm_queue_flush(struct mapped_device *md)
2485 { 2485 {
2486 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2486 clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2487 smp_mb__after_clear_bit(); 2487 smp_mb__after_clear_bit();
2488 queue_work(md->wq, &md->work); 2488 queue_work(md->wq, &md->work);
2489 } 2489 }
2490 2490
2491 static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr) 2491 static void dm_rq_set_target_request_nr(struct request *clone, unsigned request_nr)
2492 { 2492 {
2493 struct dm_rq_target_io *tio = clone->end_io_data; 2493 struct dm_rq_target_io *tio = clone->end_io_data;
2494 2494
2495 tio->info.target_request_nr = request_nr; 2495 tio->info.target_request_nr = request_nr;
2496 } 2496 }
2497 2497
2498 /* Issue barrier requests to targets and wait for their completion. */ 2498 /* Issue barrier requests to targets and wait for their completion. */
2499 static int dm_rq_barrier(struct mapped_device *md) 2499 static int dm_rq_barrier(struct mapped_device *md)
2500 { 2500 {
2501 int i, j; 2501 int i, j;
2502 struct dm_table *map = dm_get_live_table(md); 2502 struct dm_table *map = dm_get_live_table(md);
2503 unsigned num_targets = dm_table_get_num_targets(map); 2503 unsigned num_targets = dm_table_get_num_targets(map);
2504 struct dm_target *ti; 2504 struct dm_target *ti;
2505 struct request *clone; 2505 struct request *clone;
2506 2506
2507 md->barrier_error = 0; 2507 md->barrier_error = 0;
2508 2508
2509 for (i = 0; i < num_targets; i++) { 2509 for (i = 0; i < num_targets; i++) {
2510 ti = dm_table_get_target(map, i); 2510 ti = dm_table_get_target(map, i);
2511 for (j = 0; j < ti->num_flush_requests; j++) { 2511 for (j = 0; j < ti->num_flush_requests; j++) {
2512 clone = clone_rq(md->flush_request, md, GFP_NOIO); 2512 clone = clone_rq(md->flush_request, md, GFP_NOIO);
2513 dm_rq_set_target_request_nr(clone, j); 2513 dm_rq_set_target_request_nr(clone, j);
2514 atomic_inc(&md->pending[rq_data_dir(clone)]); 2514 atomic_inc(&md->pending[rq_data_dir(clone)]);
2515 map_request(ti, clone, md); 2515 map_request(ti, clone, md);
2516 } 2516 }
2517 } 2517 }
2518 2518
2519 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE); 2519 dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
2520 dm_table_put(map); 2520 dm_table_put(map);
2521 2521
2522 return md->barrier_error; 2522 return md->barrier_error;
2523 } 2523 }
2524 2524
2525 static void dm_rq_barrier_work(struct work_struct *work) 2525 static void dm_rq_barrier_work(struct work_struct *work)
2526 { 2526 {
2527 int error; 2527 int error;
2528 struct mapped_device *md = container_of(work, struct mapped_device, 2528 struct mapped_device *md = container_of(work, struct mapped_device,
2529 barrier_work); 2529 barrier_work);
2530 struct request_queue *q = md->queue; 2530 struct request_queue *q = md->queue;
2531 struct request *rq; 2531 struct request *rq;
2532 unsigned long flags; 2532 unsigned long flags;
2533 2533
2534 /* 2534 /*
2535 * Hold the md reference here and release it only at the end so that 2535 * Hold the md reference here and release it only at the end so that
2536 * the md can't be deleted by the device opener while the barrier 2536 * the md can't be deleted by the device opener while the barrier
2537 * request completes. 2537 * request completes.
2538 */ 2538 */
2539 dm_get(md); 2539 dm_get(md);
2540 2540
2541 error = dm_rq_barrier(md); 2541 error = dm_rq_barrier(md);
2542 2542
2543 rq = md->flush_request; 2543 rq = md->flush_request;
2544 md->flush_request = NULL; 2544 md->flush_request = NULL;
2545 2545
2546 if (error == DM_ENDIO_REQUEUE) { 2546 if (error == DM_ENDIO_REQUEUE) {
2547 spin_lock_irqsave(q->queue_lock, flags); 2547 spin_lock_irqsave(q->queue_lock, flags);
2548 blk_requeue_request(q, rq); 2548 blk_requeue_request(q, rq);
2549 spin_unlock_irqrestore(q->queue_lock, flags); 2549 spin_unlock_irqrestore(q->queue_lock, flags);
2550 } else 2550 } else
2551 blk_end_request_all(rq, error); 2551 blk_end_request_all(rq, error);
2552 2552
2553 blk_run_queue(q); 2553 blk_run_queue(q);
2554 2554
2555 dm_put(md); 2555 dm_put(md);
2556 } 2556 }
2557 2557
2558 /* 2558 /*
2559 * Swap in a new table, returning the old one for the caller to destroy. 2559 * Swap in a new table, returning the old one for the caller to destroy.
2560 */ 2560 */
2561 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table) 2561 struct dm_table *dm_swap_table(struct mapped_device *md, struct dm_table *table)
2562 { 2562 {
2563 struct dm_table *map = ERR_PTR(-EINVAL); 2563 struct dm_table *map = ERR_PTR(-EINVAL);
2564 struct queue_limits limits; 2564 struct queue_limits limits;
2565 int r; 2565 int r;
2566 2566
2567 mutex_lock(&md->suspend_lock); 2567 mutex_lock(&md->suspend_lock);
2568 2568
2569 /* device must be suspended */ 2569 /* device must be suspended */
2570 if (!dm_suspended_md(md)) 2570 if (!dm_suspended_md(md))
2571 goto out; 2571 goto out;
2572 2572
2573 r = dm_calculate_queue_limits(table, &limits); 2573 r = dm_calculate_queue_limits(table, &limits);
2574 if (r) { 2574 if (r) {
2575 map = ERR_PTR(r); 2575 map = ERR_PTR(r);
2576 goto out; 2576 goto out;
2577 } 2577 }
2578 2578
2579 map = __bind(md, table, &limits); 2579 map = __bind(md, table, &limits);
2580 2580
2581 out: 2581 out:
2582 mutex_unlock(&md->suspend_lock); 2582 mutex_unlock(&md->suspend_lock);
2583 return map; 2583 return map;
2584 } 2584 }
2585 2585
2586 /* 2586 /*
2587 * Functions to lock and unlock any filesystem running on the 2587 * Functions to lock and unlock any filesystem running on the
2588 * device. 2588 * device.
2589 */ 2589 */
2590 static int lock_fs(struct mapped_device *md) 2590 static int lock_fs(struct mapped_device *md)
2591 { 2591 {
2592 int r; 2592 int r;
2593 2593
2594 WARN_ON(md->frozen_sb); 2594 WARN_ON(md->frozen_sb);
2595 2595
2596 md->frozen_sb = freeze_bdev(md->bdev); 2596 md->frozen_sb = freeze_bdev(md->bdev);
2597 if (IS_ERR(md->frozen_sb)) { 2597 if (IS_ERR(md->frozen_sb)) {
2598 r = PTR_ERR(md->frozen_sb); 2598 r = PTR_ERR(md->frozen_sb);
2599 md->frozen_sb = NULL; 2599 md->frozen_sb = NULL;
2600 return r; 2600 return r;
2601 } 2601 }
2602 2602
2603 set_bit(DMF_FROZEN, &md->flags); 2603 set_bit(DMF_FROZEN, &md->flags);
2604 2604
2605 return 0; 2605 return 0;
2606 } 2606 }
2607 2607
2608 static void unlock_fs(struct mapped_device *md) 2608 static void unlock_fs(struct mapped_device *md)
2609 { 2609 {
2610 if (!test_bit(DMF_FROZEN, &md->flags)) 2610 if (!test_bit(DMF_FROZEN, &md->flags))
2611 return; 2611 return;
2612 2612
2613 thaw_bdev(md->bdev, md->frozen_sb); 2613 thaw_bdev(md->bdev, md->frozen_sb);
2614 md->frozen_sb = NULL; 2614 md->frozen_sb = NULL;
2615 clear_bit(DMF_FROZEN, &md->flags); 2615 clear_bit(DMF_FROZEN, &md->flags);
2616 } 2616 }
2617 2617
2618 /* 2618 /*
2619 * We need to be able to change a mapping table under a mounted 2619 * We need to be able to change a mapping table under a mounted
2620 * filesystem. For example we might want to move some data in 2620 * filesystem. For example we might want to move some data in
2621 * the background. Before the table can be swapped with 2621 * the background. Before the table can be swapped with
2622 * dm_bind_table, dm_suspend must be called to flush any in 2622 * dm_bind_table, dm_suspend must be called to flush any in
2623 * flight bios and ensure that any further io gets deferred. 2623 * flight bios and ensure that any further io gets deferred.
2624 */ 2624 */
2625 /* 2625 /*
2626 * Suspend mechanism in request-based dm. 2626 * Suspend mechanism in request-based dm.
2627 * 2627 *
2628 * 1. Flush all I/Os by lock_fs() if needed. 2628 * 1. Flush all I/Os by lock_fs() if needed.
2629 * 2. Stop dispatching any I/O by stopping the request_queue. 2629 * 2. Stop dispatching any I/O by stopping the request_queue.
2630 * 3. Wait for all in-flight I/Os to be completed or requeued. 2630 * 3. Wait for all in-flight I/Os to be completed or requeued.
2631 * 2631 *
2632 * To abort suspend, start the request_queue. 2632 * To abort suspend, start the request_queue.
2633 */ 2633 */
2634 int dm_suspend(struct mapped_device *md, unsigned suspend_flags) 2634 int dm_suspend(struct mapped_device *md, unsigned suspend_flags)
2635 { 2635 {
2636 struct dm_table *map = NULL; 2636 struct dm_table *map = NULL;
2637 int r = 0; 2637 int r = 0;
2638 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0; 2638 int do_lockfs = suspend_flags & DM_SUSPEND_LOCKFS_FLAG ? 1 : 0;
2639 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0; 2639 int noflush = suspend_flags & DM_SUSPEND_NOFLUSH_FLAG ? 1 : 0;
2640 2640
2641 mutex_lock(&md->suspend_lock); 2641 mutex_lock(&md->suspend_lock);
2642 2642
2643 if (dm_suspended_md(md)) { 2643 if (dm_suspended_md(md)) {
2644 r = -EINVAL; 2644 r = -EINVAL;
2645 goto out_unlock; 2645 goto out_unlock;
2646 } 2646 }
2647 2647
2648 map = dm_get_live_table(md); 2648 map = dm_get_live_table(md);
2649 2649
2650 /* 2650 /*
2651 * DMF_NOFLUSH_SUSPENDING must be set before presuspend. 2651 * DMF_NOFLUSH_SUSPENDING must be set before presuspend.
2652 * This flag is cleared before dm_suspend returns. 2652 * This flag is cleared before dm_suspend returns.
2653 */ 2653 */
2654 if (noflush) 2654 if (noflush)
2655 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2655 set_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2656 2656
2657 /* This does not get reverted if there's an error later. */ 2657 /* This does not get reverted if there's an error later. */
2658 dm_table_presuspend_targets(map); 2658 dm_table_presuspend_targets(map);
2659 2659
2660 /* 2660 /*
2661 * Flush I/O to the device. 2661 * Flush I/O to the device.
2662 * Any I/O submitted after lock_fs() may not be flushed. 2662 * Any I/O submitted after lock_fs() may not be flushed.
2663 * noflush takes precedence over do_lockfs. 2663 * noflush takes precedence over do_lockfs.
2664 * (lock_fs() flushes I/Os and waits for them to complete.) 2664 * (lock_fs() flushes I/Os and waits for them to complete.)
2665 */ 2665 */
2666 if (!noflush && do_lockfs) { 2666 if (!noflush && do_lockfs) {
2667 r = lock_fs(md); 2667 r = lock_fs(md);
2668 if (r) 2668 if (r)
2669 goto out; 2669 goto out;
2670 } 2670 }
2671 2671
2672 /* 2672 /*
2673 * Here we must make sure that no processes are submitting requests 2673 * Here we must make sure that no processes are submitting requests
2674 * to target drivers i.e. no one may be executing 2674 * to target drivers i.e. no one may be executing
2675 * __split_and_process_bio. This is called from dm_request and 2675 * __split_and_process_bio. This is called from dm_request and
2676 * dm_wq_work. 2676 * dm_wq_work.
2677 * 2677 *
2678 * To get all processes out of __split_and_process_bio in dm_request, 2678 * To get all processes out of __split_and_process_bio in dm_request,
2679 * we take the write lock. To prevent any process from reentering 2679 * we take the write lock. To prevent any process from reentering
2680 * __split_and_process_bio from dm_request, we set 2680 * __split_and_process_bio from dm_request, we set
2681 * DMF_QUEUE_IO_TO_THREAD. 2681 * DMF_QUEUE_IO_TO_THREAD.
2682 * 2682 *
2683 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND 2683 * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
2684 * and call flush_workqueue(md->wq). flush_workqueue will wait until 2684 * and call flush_workqueue(md->wq). flush_workqueue will wait until
2685 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any 2685 * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
2686 * further calls to __split_and_process_bio from dm_wq_work. 2686 * further calls to __split_and_process_bio from dm_wq_work.
2687 */ 2687 */
2688 down_write(&md->io_lock); 2688 down_write(&md->io_lock);
2689 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags); 2689 set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
2690 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags); 2690 set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
2691 up_write(&md->io_lock); 2691 up_write(&md->io_lock);
2692 2692
2693 /* 2693 /*
2694 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which 2694 * Request-based dm uses md->wq for barrier (dm_rq_barrier_work) which
2695 * can be kicked until md->queue is stopped. So stop md->queue before 2695 * can be kicked until md->queue is stopped. So stop md->queue before
2696 * flushing md->wq. 2696 * flushing md->wq.
2697 */ 2697 */
2698 if (dm_request_based(md)) 2698 if (dm_request_based(md))
2699 stop_queue(md->queue); 2699 stop_queue(md->queue);
2700 2700
2701 flush_workqueue(md->wq); 2701 flush_workqueue(md->wq);
2702 2702
2703 /* 2703 /*
2704 * At this point no more requests are entering target request routines. 2704 * At this point no more requests are entering target request routines.
2705 * We call dm_wait_for_completion to wait for all existing requests 2705 * We call dm_wait_for_completion to wait for all existing requests
2706 * to finish. 2706 * to finish.
2707 */ 2707 */
2708 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE); 2708 r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
2709 2709
2710 down_write(&md->io_lock); 2710 down_write(&md->io_lock);
2711 if (noflush) 2711 if (noflush)
2712 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags); 2712 clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
2713 up_write(&md->io_lock); 2713 up_write(&md->io_lock);
2714 2714
2715 /* were we interrupted ? */ 2715 /* were we interrupted ? */
2716 if (r < 0) { 2716 if (r < 0) {
2717 dm_queue_flush(md); 2717 dm_queue_flush(md);
2718 2718
2719 if (dm_request_based(md)) 2719 if (dm_request_based(md))
2720 start_queue(md->queue); 2720 start_queue(md->queue);
2721 2721
2722 unlock_fs(md); 2722 unlock_fs(md);
2723 goto out; /* pushback list is already flushed, so skip flush */ 2723 goto out; /* pushback list is already flushed, so skip flush */
2724 } 2724 }
2725 2725
2726 /* 2726 /*
2727 * If dm_wait_for_completion returned 0, the device is completely 2727 * If dm_wait_for_completion returned 0, the device is completely
2728 * quiescent now. There is no request-processing activity. All new 2728 * quiescent now. There is no request-processing activity. All new
2729 * requests are being added to md->deferred list. 2729 * requests are being added to md->deferred list.
2730 */ 2730 */
2731 2731
2732 set_bit(DMF_SUSPENDED, &md->flags); 2732 set_bit(DMF_SUSPENDED, &md->flags);
2733 2733
2734 dm_table_postsuspend_targets(map); 2734 dm_table_postsuspend_targets(map);
2735 2735
2736 out: 2736 out:
2737 dm_table_put(map); 2737 dm_table_put(map);
2738 2738
2739 out_unlock: 2739 out_unlock:
2740 mutex_unlock(&md->suspend_lock); 2740 mutex_unlock(&md->suspend_lock);
2741 return r; 2741 return r;
2742 } 2742 }
2743 2743
2744 int dm_resume(struct mapped_device *md) 2744 int dm_resume(struct mapped_device *md)
2745 { 2745 {
2746 int r = -EINVAL; 2746 int r = -EINVAL;
2747 struct dm_table *map = NULL; 2747 struct dm_table *map = NULL;
2748 2748
2749 mutex_lock(&md->suspend_lock); 2749 mutex_lock(&md->suspend_lock);
2750 if (!dm_suspended_md(md)) 2750 if (!dm_suspended_md(md))
2751 goto out; 2751 goto out;
2752 2752
2753 map = dm_get_live_table(md); 2753 map = dm_get_live_table(md);
2754 if (!map || !dm_table_get_size(map)) 2754 if (!map || !dm_table_get_size(map))
2755 goto out; 2755 goto out;
2756 2756
2757 r = dm_table_resume_targets(map); 2757 r = dm_table_resume_targets(map);
2758 if (r) 2758 if (r)
2759 goto out; 2759 goto out;
2760 2760
2761 dm_queue_flush(md); 2761 dm_queue_flush(md);
2762 2762
2763 /* 2763 /*
2764 * Flushing deferred I/Os must be done after targets are resumed 2764 * Flushing deferred I/Os must be done after targets are resumed
2765 * so that mapping of targets can work correctly. 2765 * so that mapping of targets can work correctly.
2766 * Request-based dm is queueing the deferred I/Os in its request_queue. 2766 * Request-based dm is queueing the deferred I/Os in its request_queue.
2767 */ 2767 */
2768 if (dm_request_based(md)) 2768 if (dm_request_based(md))
2769 start_queue(md->queue); 2769 start_queue(md->queue);
2770 2770
2771 unlock_fs(md); 2771 unlock_fs(md);
2772 2772
2773 clear_bit(DMF_SUSPENDED, &md->flags); 2773 clear_bit(DMF_SUSPENDED, &md->flags);
2774 2774
2775 dm_table_unplug_all(map); 2775 dm_table_unplug_all(map);
2776 r = 0; 2776 r = 0;
2777 out: 2777 out:
2778 dm_table_put(map); 2778 dm_table_put(map);
2779 mutex_unlock(&md->suspend_lock); 2779 mutex_unlock(&md->suspend_lock);
2780 2780
2781 return r; 2781 return r;
2782 } 2782 }
2783 2783
2784 /*----------------------------------------------------------------- 2784 /*-----------------------------------------------------------------
2785 * Event notification. 2785 * Event notification.
2786 *---------------------------------------------------------------*/ 2786 *---------------------------------------------------------------*/
2787 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action, 2787 int dm_kobject_uevent(struct mapped_device *md, enum kobject_action action,
2788 unsigned cookie) 2788 unsigned cookie)
2789 { 2789 {
2790 char udev_cookie[DM_COOKIE_LENGTH]; 2790 char udev_cookie[DM_COOKIE_LENGTH];
2791 char *envp[] = { udev_cookie, NULL }; 2791 char *envp[] = { udev_cookie, NULL };
2792 2792
2793 if (!cookie) 2793 if (!cookie)
2794 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action); 2794 return kobject_uevent(&disk_to_dev(md->disk)->kobj, action);
2795 else { 2795 else {
2796 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u", 2796 snprintf(udev_cookie, DM_COOKIE_LENGTH, "%s=%u",
2797 DM_COOKIE_ENV_VAR_NAME, cookie); 2797 DM_COOKIE_ENV_VAR_NAME, cookie);
2798 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj, 2798 return kobject_uevent_env(&disk_to_dev(md->disk)->kobj,
2799 action, envp); 2799 action, envp);
2800 } 2800 }
2801 } 2801 }
2802 2802
2803 uint32_t dm_next_uevent_seq(struct mapped_device *md) 2803 uint32_t dm_next_uevent_seq(struct mapped_device *md)
2804 { 2804 {
2805 return atomic_add_return(1, &md->uevent_seq); 2805 return atomic_add_return(1, &md->uevent_seq);
2806 } 2806 }
2807 2807
2808 uint32_t dm_get_event_nr(struct mapped_device *md) 2808 uint32_t dm_get_event_nr(struct mapped_device *md)
2809 { 2809 {
2810 return atomic_read(&md->event_nr); 2810 return atomic_read(&md->event_nr);
2811 } 2811 }
2812 2812
2813 int dm_wait_event(struct mapped_device *md, int event_nr) 2813 int dm_wait_event(struct mapped_device *md, int event_nr)
2814 { 2814 {
2815 return wait_event_interruptible(md->eventq, 2815 return wait_event_interruptible(md->eventq,
2816 (event_nr != atomic_read(&md->event_nr))); 2816 (event_nr != atomic_read(&md->event_nr)));
2817 } 2817 }
2818 2818
2819 void dm_uevent_add(struct mapped_device *md, struct list_head *elist) 2819 void dm_uevent_add(struct mapped_device *md, struct list_head *elist)
2820 { 2820 {
2821 unsigned long flags; 2821 unsigned long flags;
2822 2822
2823 spin_lock_irqsave(&md->uevent_lock, flags); 2823 spin_lock_irqsave(&md->uevent_lock, flags);
2824 list_add(elist, &md->uevent_list); 2824 list_add(elist, &md->uevent_list);
2825 spin_unlock_irqrestore(&md->uevent_lock, flags); 2825 spin_unlock_irqrestore(&md->uevent_lock, flags);
2826 } 2826 }
2827 2827
2828 /* 2828 /*
2829 * The gendisk is only valid as long as you have a reference 2829 * The gendisk is only valid as long as you have a reference
2830 * count on 'md'. 2830 * count on 'md'.
2831 */ 2831 */
2832 struct gendisk *dm_disk(struct mapped_device *md) 2832 struct gendisk *dm_disk(struct mapped_device *md)
2833 { 2833 {
2834 return md->disk; 2834 return md->disk;
2835 } 2835 }
2836 2836
2837 struct kobject *dm_kobject(struct mapped_device *md) 2837 struct kobject *dm_kobject(struct mapped_device *md)
2838 { 2838 {
2839 return &md->kobj; 2839 return &md->kobj;
2840 } 2840 }
2841 2841
2842 /* 2842 /*
2843 * struct mapped_device should not be exported outside of dm.c 2843 * struct mapped_device should not be exported outside of dm.c
2844 * so use this check to verify that kobj is part of md structure 2844 * so use this check to verify that kobj is part of md structure
2845 */ 2845 */
2846 struct mapped_device *dm_get_from_kobject(struct kobject *kobj) 2846 struct mapped_device *dm_get_from_kobject(struct kobject *kobj)
2847 { 2847 {
2848 struct mapped_device *md; 2848 struct mapped_device *md;
2849 2849
2850 md = container_of(kobj, struct mapped_device, kobj); 2850 md = container_of(kobj, struct mapped_device, kobj);
2851 if (&md->kobj != kobj) 2851 if (&md->kobj != kobj)
2852 return NULL; 2852 return NULL;
2853 2853
2854 if (test_bit(DMF_FREEING, &md->flags) || 2854 if (test_bit(DMF_FREEING, &md->flags) ||
2855 dm_deleting_md(md)) 2855 dm_deleting_md(md))
2856 return NULL; 2856 return NULL;
2857 2857
2858 dm_get(md); 2858 dm_get(md);
2859 return md; 2859 return md;
2860 } 2860 }
2861 2861
2862 int dm_suspended_md(struct mapped_device *md) 2862 int dm_suspended_md(struct mapped_device *md)
2863 { 2863 {
2864 return test_bit(DMF_SUSPENDED, &md->flags); 2864 return test_bit(DMF_SUSPENDED, &md->flags);
2865 } 2865 }
2866 2866
2867 int dm_suspended(struct dm_target *ti) 2867 int dm_suspended(struct dm_target *ti)
2868 { 2868 {
2869 return dm_suspended_md(dm_table_get_md(ti->table)); 2869 return dm_suspended_md(dm_table_get_md(ti->table));
2870 } 2870 }
2871 EXPORT_SYMBOL_GPL(dm_suspended); 2871 EXPORT_SYMBOL_GPL(dm_suspended);
2872 2872
2873 int dm_noflush_suspending(struct dm_target *ti) 2873 int dm_noflush_suspending(struct dm_target *ti)
2874 { 2874 {
2875 return __noflush_suspending(dm_table_get_md(ti->table)); 2875 return __noflush_suspending(dm_table_get_md(ti->table));
2876 } 2876 }
2877 EXPORT_SYMBOL_GPL(dm_noflush_suspending); 2877 EXPORT_SYMBOL_GPL(dm_noflush_suspending);
2878 2878
2879 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type) 2879 struct dm_md_mempools *dm_alloc_md_mempools(unsigned type)
2880 { 2880 {
2881 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL); 2881 struct dm_md_mempools *pools = kmalloc(sizeof(*pools), GFP_KERNEL);
2882 2882
2883 if (!pools) 2883 if (!pools)
2884 return NULL; 2884 return NULL;
2885 2885
2886 pools->io_pool = (type == DM_TYPE_BIO_BASED) ? 2886 pools->io_pool = (type == DM_TYPE_BIO_BASED) ?
2887 mempool_create_slab_pool(MIN_IOS, _io_cache) : 2887 mempool_create_slab_pool(MIN_IOS, _io_cache) :
2888 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache); 2888 mempool_create_slab_pool(MIN_IOS, _rq_bio_info_cache);
2889 if (!pools->io_pool) 2889 if (!pools->io_pool)
2890 goto free_pools_and_out; 2890 goto free_pools_and_out;
2891 2891
2892 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ? 2892 pools->tio_pool = (type == DM_TYPE_BIO_BASED) ?
2893 mempool_create_slab_pool(MIN_IOS, _tio_cache) : 2893 mempool_create_slab_pool(MIN_IOS, _tio_cache) :
2894 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache); 2894 mempool_create_slab_pool(MIN_IOS, _rq_tio_cache);
2895 if (!pools->tio_pool) 2895 if (!pools->tio_pool)
2896 goto free_io_pool_and_out; 2896 goto free_io_pool_and_out;
2897 2897
2898 pools->bs = (type == DM_TYPE_BIO_BASED) ? 2898 pools->bs = (type == DM_TYPE_BIO_BASED) ?
2899 bioset_create(16, 0) : bioset_create(MIN_IOS, 0); 2899 bioset_create(16, 0) : bioset_create(MIN_IOS, 0);
2900 if (!pools->bs) 2900 if (!pools->bs)
2901 goto free_tio_pool_and_out; 2901 goto free_tio_pool_and_out;
2902 2902
2903 return pools; 2903 return pools;
2904 2904
2905 free_tio_pool_and_out: 2905 free_tio_pool_and_out:
2906 mempool_destroy(pools->tio_pool); 2906 mempool_destroy(pools->tio_pool);
2907 2907
2908 free_io_pool_and_out: 2908 free_io_pool_and_out:
2909 mempool_destroy(pools->io_pool); 2909 mempool_destroy(pools->io_pool);
2910 2910
2911 free_pools_and_out: 2911 free_pools_and_out:
2912 kfree(pools); 2912 kfree(pools);
2913 2913
2914 return NULL; 2914 return NULL;
2915 } 2915 }
2916 2916
2917 void dm_free_md_mempools(struct dm_md_mempools *pools) 2917 void dm_free_md_mempools(struct dm_md_mempools *pools)
2918 { 2918 {
2919 if (!pools) 2919 if (!pools)
2920 return; 2920 return;
2921 2921
2922 if (pools->io_pool) 2922 if (pools->io_pool)
2923 mempool_destroy(pools->io_pool); 2923 mempool_destroy(pools->io_pool);
2924 2924
2925 if (pools->tio_pool) 2925 if (pools->tio_pool)
2926 mempool_destroy(pools->tio_pool); 2926 mempool_destroy(pools->tio_pool);
2927 2927
2928 if (pools->bs) 2928 if (pools->bs)
2929 bioset_free(pools->bs); 2929 bioset_free(pools->bs);
2930 2930
2931 kfree(pools); 2931 kfree(pools);
2932 } 2932 }
2933 2933
2934 static const struct block_device_operations dm_blk_dops = { 2934 static const struct block_device_operations dm_blk_dops = {
2935 .open = dm_blk_open, 2935 .open = dm_blk_open,
2936 .release = dm_blk_close, 2936 .release = dm_blk_close,
2937 .ioctl = dm_blk_ioctl, 2937 .ioctl = dm_blk_ioctl,
2938 .getgeo = dm_blk_getgeo, 2938 .getgeo = dm_blk_getgeo,
2939 .owner = THIS_MODULE 2939 .owner = THIS_MODULE
2940 }; 2940 };
2941 2941
2942 EXPORT_SYMBOL(dm_get_mapinfo); 2942 EXPORT_SYMBOL(dm_get_mapinfo);
2943 2943
2944 /* 2944 /*
2945 * module hooks 2945 * module hooks
2946 */ 2946 */
2947 module_init(dm_init); 2947 module_init(dm_init);
2948 module_exit(dm_exit); 2948 module_exit(dm_exit);
2949 2949
2950 module_param(major, uint, 0); 2950 module_param(major, uint, 0);
2951 MODULE_PARM_DESC(major, "The major number of the device mapper"); 2951 MODULE_PARM_DESC(major, "The major number of the device mapper");
2952 MODULE_DESCRIPTION(DM_NAME " driver"); 2952 MODULE_DESCRIPTION(DM_NAME " driver");
2953 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); 2953 MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
2954 MODULE_LICENSE("GPL"); 2954 MODULE_LICENSE("GPL");
2955 2955
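The dm.c hunk above (new line 2248) shows the conversion for a queue whose device has a flushable write cache: blk_queue_ordered(md->queue, QUEUE_ORDERED_DRAIN_FLUSH) becomes blk_queue_flush(md->queue, REQ_FLUSH). As a minimal sketch of how a driver would build the flag word under the new interface (the example_* names are hypothetical and not part of this commit):

#include <linux/blkdev.h>

/*
 * Illustrative sketch only, not taken from this commit: a hypothetical
 * driver advertises its cache capabilities once at queue setup time.
 */
static void example_setup_flush(struct request_queue *q,
				bool has_write_cache, bool supports_fua)
{
	unsigned int flush = 0;

	if (has_write_cache)			/* cache present and flushable */
		flush |= REQ_FLUSH;
	if (has_write_cache && supports_fua)	/* FUA writes also handled */
		flush |= REQ_FUA;

	/* passing 0 presumably leaves the queue at its no-flush default */
	blk_queue_flush(q, flush);
}

A device with no volatile write cache advertises nothing, which is what the mmc queue below ends up doing.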
drivers/mmc/card/queue.c
1 /* 1 /*
2 * linux/drivers/mmc/card/queue.c 2 * linux/drivers/mmc/card/queue.c
3 * 3 *
4 * Copyright (C) 2003 Russell King, All Rights Reserved. 4 * Copyright (C) 2003 Russell King, All Rights Reserved.
5 * Copyright 2006-2007 Pierre Ossman 5 * Copyright 2006-2007 Pierre Ossman
6 * 6 *
7 * This program is free software; you can redistribute it and/or modify 7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License version 2 as 8 * it under the terms of the GNU General Public License version 2 as
9 * published by the Free Software Foundation. 9 * published by the Free Software Foundation.
10 * 10 *
11 */ 11 */
12 #include <linux/slab.h> 12 #include <linux/slab.h>
13 #include <linux/module.h> 13 #include <linux/module.h>
14 #include <linux/blkdev.h> 14 #include <linux/blkdev.h>
15 #include <linux/freezer.h> 15 #include <linux/freezer.h>
16 #include <linux/kthread.h> 16 #include <linux/kthread.h>
17 #include <linux/scatterlist.h> 17 #include <linux/scatterlist.h>
18 18
19 #include <linux/mmc/card.h> 19 #include <linux/mmc/card.h>
20 #include <linux/mmc/host.h> 20 #include <linux/mmc/host.h>
21 #include "queue.h" 21 #include "queue.h"
22 22
23 #define MMC_QUEUE_BOUNCESZ 65536 23 #define MMC_QUEUE_BOUNCESZ 65536
24 24
25 #define MMC_QUEUE_SUSPENDED (1 << 0) 25 #define MMC_QUEUE_SUSPENDED (1 << 0)
26 26
27 /* 27 /*
28 * Prepare a MMC request. This just filters out odd stuff. 28 * Prepare a MMC request. This just filters out odd stuff.
29 */ 29 */
30 static int mmc_prep_request(struct request_queue *q, struct request *req) 30 static int mmc_prep_request(struct request_queue *q, struct request *req)
31 { 31 {
32 /* 32 /*
33 * We only like normal block requests and discards. 33 * We only like normal block requests and discards.
34 */ 34 */
35 if (req->cmd_type != REQ_TYPE_FS && !(req->cmd_flags & REQ_DISCARD)) { 35 if (req->cmd_type != REQ_TYPE_FS && !(req->cmd_flags & REQ_DISCARD)) {
36 blk_dump_rq_flags(req, "MMC bad request"); 36 blk_dump_rq_flags(req, "MMC bad request");
37 return BLKPREP_KILL; 37 return BLKPREP_KILL;
38 } 38 }
39 39
40 req->cmd_flags |= REQ_DONTPREP; 40 req->cmd_flags |= REQ_DONTPREP;
41 41
42 return BLKPREP_OK; 42 return BLKPREP_OK;
43 } 43 }
44 44
45 static int mmc_queue_thread(void *d) 45 static int mmc_queue_thread(void *d)
46 { 46 {
47 struct mmc_queue *mq = d; 47 struct mmc_queue *mq = d;
48 struct request_queue *q = mq->queue; 48 struct request_queue *q = mq->queue;
49 49
50 current->flags |= PF_MEMALLOC; 50 current->flags |= PF_MEMALLOC;
51 51
52 down(&mq->thread_sem); 52 down(&mq->thread_sem);
53 do { 53 do {
54 struct request *req = NULL; 54 struct request *req = NULL;
55 55
56 spin_lock_irq(q->queue_lock); 56 spin_lock_irq(q->queue_lock);
57 set_current_state(TASK_INTERRUPTIBLE); 57 set_current_state(TASK_INTERRUPTIBLE);
58 if (!blk_queue_plugged(q)) 58 if (!blk_queue_plugged(q))
59 req = blk_fetch_request(q); 59 req = blk_fetch_request(q);
60 mq->req = req; 60 mq->req = req;
61 spin_unlock_irq(q->queue_lock); 61 spin_unlock_irq(q->queue_lock);
62 62
63 if (!req) { 63 if (!req) {
64 if (kthread_should_stop()) { 64 if (kthread_should_stop()) {
65 set_current_state(TASK_RUNNING); 65 set_current_state(TASK_RUNNING);
66 break; 66 break;
67 } 67 }
68 up(&mq->thread_sem); 68 up(&mq->thread_sem);
69 schedule(); 69 schedule();
70 down(&mq->thread_sem); 70 down(&mq->thread_sem);
71 continue; 71 continue;
72 } 72 }
73 set_current_state(TASK_RUNNING); 73 set_current_state(TASK_RUNNING);
74 74
75 mq->issue_fn(mq, req); 75 mq->issue_fn(mq, req);
76 } while (1); 76 } while (1);
77 up(&mq->thread_sem); 77 up(&mq->thread_sem);
78 78
79 return 0; 79 return 0;
80 } 80 }
81 81
82 /* 82 /*
83 * Generic MMC request handler. This is called for any queue on a 83 * Generic MMC request handler. This is called for any queue on a
84 * particular host. When the host is not busy, we look for a request 84 * particular host. When the host is not busy, we look for a request
85 * on any queue on this host, and attempt to issue it. This may 85 * on any queue on this host, and attempt to issue it. This may
86 * not be the queue we were asked to process. 86 * not be the queue we were asked to process.
87 */ 87 */
88 static void mmc_request(struct request_queue *q) 88 static void mmc_request(struct request_queue *q)
89 { 89 {
90 struct mmc_queue *mq = q->queuedata; 90 struct mmc_queue *mq = q->queuedata;
91 struct request *req; 91 struct request *req;
92 92
93 if (!mq) { 93 if (!mq) {
94 while ((req = blk_fetch_request(q)) != NULL) { 94 while ((req = blk_fetch_request(q)) != NULL) {
95 req->cmd_flags |= REQ_QUIET; 95 req->cmd_flags |= REQ_QUIET;
96 __blk_end_request_all(req, -EIO); 96 __blk_end_request_all(req, -EIO);
97 } 97 }
98 return; 98 return;
99 } 99 }
100 100
101 if (!mq->req) 101 if (!mq->req)
102 wake_up_process(mq->thread); 102 wake_up_process(mq->thread);
103 } 103 }
104 104
105 /** 105 /**
106 * mmc_init_queue - initialise a queue structure. 106 * mmc_init_queue - initialise a queue structure.
107 * @mq: mmc queue 107 * @mq: mmc queue
108 * @card: mmc card to attach this queue 108 * @card: mmc card to attach this queue
109 * @lock: queue lock 109 * @lock: queue lock
110 * 110 *
111 * Initialise a MMC card request queue. 111 * Initialise a MMC card request queue.
112 */ 112 */
113 int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock) 113 int mmc_init_queue(struct mmc_queue *mq, struct mmc_card *card, spinlock_t *lock)
114 { 114 {
115 struct mmc_host *host = card->host; 115 struct mmc_host *host = card->host;
116 u64 limit = BLK_BOUNCE_HIGH; 116 u64 limit = BLK_BOUNCE_HIGH;
117 int ret; 117 int ret;
118 118
119 if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask) 119 if (mmc_dev(host)->dma_mask && *mmc_dev(host)->dma_mask)
120 limit = *mmc_dev(host)->dma_mask; 120 limit = *mmc_dev(host)->dma_mask;
121 121
122 mq->card = card; 122 mq->card = card;
123 mq->queue = blk_init_queue(mmc_request, lock); 123 mq->queue = blk_init_queue(mmc_request, lock);
124 if (!mq->queue) 124 if (!mq->queue)
125 return -ENOMEM; 125 return -ENOMEM;
126 126
127 mq->queue->queuedata = mq; 127 mq->queue->queuedata = mq;
128 mq->req = NULL; 128 mq->req = NULL;
129 129
130 blk_queue_prep_rq(mq->queue, mmc_prep_request); 130 blk_queue_prep_rq(mq->queue, mmc_prep_request);
131 blk_queue_ordered(mq->queue, QUEUE_ORDERED_DRAIN);
132 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue); 131 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, mq->queue);
133 if (mmc_can_erase(card)) { 132 if (mmc_can_erase(card)) {
134 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue); 133 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mq->queue);
135 mq->queue->limits.max_discard_sectors = UINT_MAX; 134 mq->queue->limits.max_discard_sectors = UINT_MAX;
136 if (card->erased_byte == 0) 135 if (card->erased_byte == 0)
137 mq->queue->limits.discard_zeroes_data = 1; 136 mq->queue->limits.discard_zeroes_data = 1;
138 if (!mmc_can_trim(card) && is_power_of_2(card->erase_size)) { 137 if (!mmc_can_trim(card) && is_power_of_2(card->erase_size)) {
139 mq->queue->limits.discard_granularity = 138 mq->queue->limits.discard_granularity =
140 card->erase_size << 9; 139 card->erase_size << 9;
141 mq->queue->limits.discard_alignment = 140 mq->queue->limits.discard_alignment =
142 card->erase_size << 9; 141 card->erase_size << 9;
143 } 142 }
144 if (mmc_can_secure_erase_trim(card)) 143 if (mmc_can_secure_erase_trim(card))
145 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, 144 queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD,
146 mq->queue); 145 mq->queue);
147 } 146 }
148 147
149 #ifdef CONFIG_MMC_BLOCK_BOUNCE 148 #ifdef CONFIG_MMC_BLOCK_BOUNCE
150 if (host->max_hw_segs == 1) { 149 if (host->max_hw_segs == 1) {
151 unsigned int bouncesz; 150 unsigned int bouncesz;
152 151
153 bouncesz = MMC_QUEUE_BOUNCESZ; 152 bouncesz = MMC_QUEUE_BOUNCESZ;
154 153
155 if (bouncesz > host->max_req_size) 154 if (bouncesz > host->max_req_size)
156 bouncesz = host->max_req_size; 155 bouncesz = host->max_req_size;
157 if (bouncesz > host->max_seg_size) 156 if (bouncesz > host->max_seg_size)
158 bouncesz = host->max_seg_size; 157 bouncesz = host->max_seg_size;
159 if (bouncesz > (host->max_blk_count * 512)) 158 if (bouncesz > (host->max_blk_count * 512))
160 bouncesz = host->max_blk_count * 512; 159 bouncesz = host->max_blk_count * 512;
161 160
162 if (bouncesz > 512) { 161 if (bouncesz > 512) {
163 mq->bounce_buf = kmalloc(bouncesz, GFP_KERNEL); 162 mq->bounce_buf = kmalloc(bouncesz, GFP_KERNEL);
164 if (!mq->bounce_buf) { 163 if (!mq->bounce_buf) {
165 printk(KERN_WARNING "%s: unable to " 164 printk(KERN_WARNING "%s: unable to "
166 "allocate bounce buffer\n", 165 "allocate bounce buffer\n",
167 mmc_card_name(card)); 166 mmc_card_name(card));
168 } 167 }
169 } 168 }
170 169
171 if (mq->bounce_buf) { 170 if (mq->bounce_buf) {
172 blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY); 171 blk_queue_bounce_limit(mq->queue, BLK_BOUNCE_ANY);
173 blk_queue_max_hw_sectors(mq->queue, bouncesz / 512); 172 blk_queue_max_hw_sectors(mq->queue, bouncesz / 512);
174 blk_queue_max_segments(mq->queue, bouncesz / 512); 173 blk_queue_max_segments(mq->queue, bouncesz / 512);
175 blk_queue_max_segment_size(mq->queue, bouncesz); 174 blk_queue_max_segment_size(mq->queue, bouncesz);
176 175
177 mq->sg = kmalloc(sizeof(struct scatterlist), 176 mq->sg = kmalloc(sizeof(struct scatterlist),
178 GFP_KERNEL); 177 GFP_KERNEL);
179 if (!mq->sg) { 178 if (!mq->sg) {
180 ret = -ENOMEM; 179 ret = -ENOMEM;
181 goto cleanup_queue; 180 goto cleanup_queue;
182 } 181 }
183 sg_init_table(mq->sg, 1); 182 sg_init_table(mq->sg, 1);
184 183
185 mq->bounce_sg = kmalloc(sizeof(struct scatterlist) * 184 mq->bounce_sg = kmalloc(sizeof(struct scatterlist) *
186 bouncesz / 512, GFP_KERNEL); 185 bouncesz / 512, GFP_KERNEL);
187 if (!mq->bounce_sg) { 186 if (!mq->bounce_sg) {
188 ret = -ENOMEM; 187 ret = -ENOMEM;
189 goto cleanup_queue; 188 goto cleanup_queue;
190 } 189 }
191 sg_init_table(mq->bounce_sg, bouncesz / 512); 190 sg_init_table(mq->bounce_sg, bouncesz / 512);
192 } 191 }
193 } 192 }
194 #endif 193 #endif
195 194
196 if (!mq->bounce_buf) { 195 if (!mq->bounce_buf) {
197 blk_queue_bounce_limit(mq->queue, limit); 196 blk_queue_bounce_limit(mq->queue, limit);
198 blk_queue_max_hw_sectors(mq->queue, 197 blk_queue_max_hw_sectors(mq->queue,
199 min(host->max_blk_count, host->max_req_size / 512)); 198 min(host->max_blk_count, host->max_req_size / 512));
200 blk_queue_max_segments(mq->queue, host->max_hw_segs); 199 blk_queue_max_segments(mq->queue, host->max_hw_segs);
201 blk_queue_max_segment_size(mq->queue, host->max_seg_size); 200 blk_queue_max_segment_size(mq->queue, host->max_seg_size);
202 201
203 mq->sg = kmalloc(sizeof(struct scatterlist) * 202 mq->sg = kmalloc(sizeof(struct scatterlist) *
204 host->max_phys_segs, GFP_KERNEL); 203 host->max_phys_segs, GFP_KERNEL);
205 if (!mq->sg) { 204 if (!mq->sg) {
206 ret = -ENOMEM; 205 ret = -ENOMEM;
207 goto cleanup_queue; 206 goto cleanup_queue;
208 } 207 }
209 sg_init_table(mq->sg, host->max_phys_segs); 208 sg_init_table(mq->sg, host->max_phys_segs);
210 } 209 }
211 210
212 init_MUTEX(&mq->thread_sem); 211 init_MUTEX(&mq->thread_sem);
213 212
214 mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd"); 213 mq->thread = kthread_run(mmc_queue_thread, mq, "mmcqd");
215 if (IS_ERR(mq->thread)) { 214 if (IS_ERR(mq->thread)) {
216 ret = PTR_ERR(mq->thread); 215 ret = PTR_ERR(mq->thread);
217 goto free_bounce_sg; 216 goto free_bounce_sg;
218 } 217 }
219 218
220 return 0; 219 return 0;
221 free_bounce_sg: 220 free_bounce_sg:
222 if (mq->bounce_sg) 221 if (mq->bounce_sg)
223 kfree(mq->bounce_sg); 222 kfree(mq->bounce_sg);
224 mq->bounce_sg = NULL; 223 mq->bounce_sg = NULL;
225 cleanup_queue: 224 cleanup_queue:
226 if (mq->sg) 225 if (mq->sg)
227 kfree(mq->sg); 226 kfree(mq->sg);
228 mq->sg = NULL; 227 mq->sg = NULL;
229 if (mq->bounce_buf) 228 if (mq->bounce_buf)
230 kfree(mq->bounce_buf); 229 kfree(mq->bounce_buf);
231 mq->bounce_buf = NULL; 230 mq->bounce_buf = NULL;
232 blk_cleanup_queue(mq->queue); 231 blk_cleanup_queue(mq->queue);
233 return ret; 232 return ret;
234 } 233 }
235 234
236 void mmc_cleanup_queue(struct mmc_queue *mq) 235 void mmc_cleanup_queue(struct mmc_queue *mq)
237 { 236 {
238 struct request_queue *q = mq->queue; 237 struct request_queue *q = mq->queue;
239 unsigned long flags; 238 unsigned long flags;
240 239
241 /* Make sure the queue isn't suspended, as that will deadlock */ 240 /* Make sure the queue isn't suspended, as that will deadlock */
242 mmc_queue_resume(mq); 241 mmc_queue_resume(mq);
243 242
244 /* Then terminate our worker thread */ 243 /* Then terminate our worker thread */
245 kthread_stop(mq->thread); 244 kthread_stop(mq->thread);
246 245
247 /* Empty the queue */ 246 /* Empty the queue */
248 spin_lock_irqsave(q->queue_lock, flags); 247 spin_lock_irqsave(q->queue_lock, flags);
249 q->queuedata = NULL; 248 q->queuedata = NULL;
250 blk_start_queue(q); 249 blk_start_queue(q);
251 spin_unlock_irqrestore(q->queue_lock, flags); 250 spin_unlock_irqrestore(q->queue_lock, flags);
252 251
253 if (mq->bounce_sg) 252 if (mq->bounce_sg)
254 kfree(mq->bounce_sg); 253 kfree(mq->bounce_sg);
255 mq->bounce_sg = NULL; 254 mq->bounce_sg = NULL;
256 255
257 kfree(mq->sg); 256 kfree(mq->sg);
258 mq->sg = NULL; 257 mq->sg = NULL;
259 258
260 if (mq->bounce_buf) 259 if (mq->bounce_buf)
261 kfree(mq->bounce_buf); 260 kfree(mq->bounce_buf);
262 mq->bounce_buf = NULL; 261 mq->bounce_buf = NULL;
263 262
264 mq->card = NULL; 263 mq->card = NULL;
265 } 264 }
266 EXPORT_SYMBOL(mmc_cleanup_queue); 265 EXPORT_SYMBOL(mmc_cleanup_queue);
267 266
268 /** 267 /**
269 * mmc_queue_suspend - suspend a MMC request queue 268 * mmc_queue_suspend - suspend a MMC request queue
270 * @mq: MMC queue to suspend 269 * @mq: MMC queue to suspend
271 * 270 *
272 * Stop the block request queue, and wait for our thread to 271 * Stop the block request queue, and wait for our thread to
273 * complete any outstanding requests. This ensures that we 272 * complete any outstanding requests. This ensures that we
274 * won't suspend while a request is being processed. 273 * won't suspend while a request is being processed.
275 */ 274 */
276 void mmc_queue_suspend(struct mmc_queue *mq) 275 void mmc_queue_suspend(struct mmc_queue *mq)
277 { 276 {
278 struct request_queue *q = mq->queue; 277 struct request_queue *q = mq->queue;
279 unsigned long flags; 278 unsigned long flags;
280 279
281 if (!(mq->flags & MMC_QUEUE_SUSPENDED)) { 280 if (!(mq->flags & MMC_QUEUE_SUSPENDED)) {
282 mq->flags |= MMC_QUEUE_SUSPENDED; 281 mq->flags |= MMC_QUEUE_SUSPENDED;
283 282
284 spin_lock_irqsave(q->queue_lock, flags); 283 spin_lock_irqsave(q->queue_lock, flags);
285 blk_stop_queue(q); 284 blk_stop_queue(q);
286 spin_unlock_irqrestore(q->queue_lock, flags); 285 spin_unlock_irqrestore(q->queue_lock, flags);
287 286
288 down(&mq->thread_sem); 287 down(&mq->thread_sem);
289 } 288 }
290 } 289 }
291 290
292 /** 291 /**
293 * mmc_queue_resume - resume a previously suspended MMC request queue 292 * mmc_queue_resume - resume a previously suspended MMC request queue
294 * @mq: MMC queue to resume 293 * @mq: MMC queue to resume
295 */ 294 */
296 void mmc_queue_resume(struct mmc_queue *mq) 295 void mmc_queue_resume(struct mmc_queue *mq)
297 { 296 {
298 struct request_queue *q = mq->queue; 297 struct request_queue *q = mq->queue;
299 unsigned long flags; 298 unsigned long flags;
300 299
301 if (mq->flags & MMC_QUEUE_SUSPENDED) { 300 if (mq->flags & MMC_QUEUE_SUSPENDED) {
302 mq->flags &= ~MMC_QUEUE_SUSPENDED; 301 mq->flags &= ~MMC_QUEUE_SUSPENDED;
303 302
304 up(&mq->thread_sem); 303 up(&mq->thread_sem);
305 304
306 spin_lock_irqsave(q->queue_lock, flags); 305 spin_lock_irqsave(q->queue_lock, flags);
307 blk_start_queue(q); 306 blk_start_queue(q);
308 spin_unlock_irqrestore(q->queue_lock, flags); 307 spin_unlock_irqrestore(q->queue_lock, flags);
309 } 308 }
310 } 309 }
311 310
312 /* 311 /*
313 * Prepare the sg list(s) to be handed off to the host driver 312 * Prepare the sg list(s) to be handed off to the host driver
314 */ 313 */
315 unsigned int mmc_queue_map_sg(struct mmc_queue *mq) 314 unsigned int mmc_queue_map_sg(struct mmc_queue *mq)
316 { 315 {
317 unsigned int sg_len; 316 unsigned int sg_len;
318 size_t buflen; 317 size_t buflen;
319 struct scatterlist *sg; 318 struct scatterlist *sg;
320 int i; 319 int i;
321 320
322 if (!mq->bounce_buf) 321 if (!mq->bounce_buf)
323 return blk_rq_map_sg(mq->queue, mq->req, mq->sg); 322 return blk_rq_map_sg(mq->queue, mq->req, mq->sg);
324 323
325 BUG_ON(!mq->bounce_sg); 324 BUG_ON(!mq->bounce_sg);
326 325
327 sg_len = blk_rq_map_sg(mq->queue, mq->req, mq->bounce_sg); 326 sg_len = blk_rq_map_sg(mq->queue, mq->req, mq->bounce_sg);
328 327
329 mq->bounce_sg_len = sg_len; 328 mq->bounce_sg_len = sg_len;
330 329
331 buflen = 0; 330 buflen = 0;
332 for_each_sg(mq->bounce_sg, sg, sg_len, i) 331 for_each_sg(mq->bounce_sg, sg, sg_len, i)
333 buflen += sg->length; 332 buflen += sg->length;
334 333
335 sg_init_one(mq->sg, mq->bounce_buf, buflen); 334 sg_init_one(mq->sg, mq->bounce_buf, buflen);
336 335
337 return 1; 336 return 1;
338 } 337 }
339 338
340 /* 339 /*
341 * If writing, bounce the data to the buffer before the request 340 * If writing, bounce the data to the buffer before the request
342 * is sent to the host driver 341 * is sent to the host driver
343 */ 342 */
344 void mmc_queue_bounce_pre(struct mmc_queue *mq) 343 void mmc_queue_bounce_pre(struct mmc_queue *mq)
345 { 344 {
346 unsigned long flags; 345 unsigned long flags;
347 346
348 if (!mq->bounce_buf) 347 if (!mq->bounce_buf)
349 return; 348 return;
350 349
351 if (rq_data_dir(mq->req) != WRITE) 350 if (rq_data_dir(mq->req) != WRITE)
352 return; 351 return;
353 352
354 local_irq_save(flags); 353 local_irq_save(flags);
355 sg_copy_to_buffer(mq->bounce_sg, mq->bounce_sg_len, 354 sg_copy_to_buffer(mq->bounce_sg, mq->bounce_sg_len,
356 mq->bounce_buf, mq->sg[0].length); 355 mq->bounce_buf, mq->sg[0].length);
357 local_irq_restore(flags); 356 local_irq_restore(flags);
358 } 357 }
359 358
360 /* 359 /*
361 * If reading, bounce the data from the buffer after the request 360 * If reading, bounce the data from the buffer after the request
362 * has been handled by the host driver 361 * has been handled by the host driver
363 */ 362 */
364 void mmc_queue_bounce_post(struct mmc_queue *mq) 363 void mmc_queue_bounce_post(struct mmc_queue *mq)
365 { 364 {
366 unsigned long flags; 365 unsigned long flags;
367 366
368 if (!mq->bounce_buf) 367 if (!mq->bounce_buf)
369 return; 368 return;
370 369
371 if (rq_data_dir(mq->req) != READ) 370 if (rq_data_dir(mq->req) != READ)
372 return; 371 return;
373 372
374 local_irq_save(flags); 373 local_irq_save(flags);
375 sg_copy_from_buffer(mq->bounce_sg, mq->bounce_sg_len, 374 sg_copy_from_buffer(mq->bounce_sg, mq->bounce_sg_len,
376 mq->bounce_buf, mq->sg[0].length); 375 mq->bounce_buf, mq->sg[0].length);
377 local_irq_restore(flags); 376 local_irq_restore(flags);
378 } 377 }
379 378
380 379
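
The scatter/gather and bounce-buffer helpers above are normally called in a fixed order from the queue thread's issue path. A hedged sketch of that sequence follows; my_issue_data_request and the placement of the host-controller call are assumptions, not code from this file.

#include <linux/blkdev.h>
#include <linux/kernel.h>

#include "queue.h"

static int my_issue_data_request(struct mmc_queue *mq, struct request *req)
{
        unsigned int sg_len;

        mq->req = req;

        /* Map the request; collapses to a single entry when bouncing. */
        sg_len = mmc_queue_map_sg(mq);

        /* For writes, copy the payload into the bounce buffer first. */
        mmc_queue_bounce_pre(mq);

        /* A real driver programs the host controller with mq->sg here. */
        pr_debug("issuing request with %u sg segment(s)\n", sg_len);

        /* After the transfer: for reads, copy the data back out. */
        mmc_queue_bounce_post(mq);

        return 0;
}
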
drivers/s390/block/dasd.c
1 /* 1 /*
2 * File...........: linux/drivers/s390/block/dasd.c 2 * File...........: linux/drivers/s390/block/dasd.c
3 * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com> 3 * Author(s)......: Holger Smolinski <Holger.Smolinski@de.ibm.com>
4 * Horst Hummel <Horst.Hummel@de.ibm.com> 4 * Horst Hummel <Horst.Hummel@de.ibm.com>
5 * Carsten Otte <Cotte@de.ibm.com> 5 * Carsten Otte <Cotte@de.ibm.com>
6 * Martin Schwidefsky <schwidefsky@de.ibm.com> 6 * Martin Schwidefsky <schwidefsky@de.ibm.com>
7 * Bugreports.to..: <Linux390@de.ibm.com> 7 * Bugreports.to..: <Linux390@de.ibm.com>
8 * Copyright IBM Corp. 1999, 2009 8 * Copyright IBM Corp. 1999, 2009
9 */ 9 */
10 10
11 #define KMSG_COMPONENT "dasd" 11 #define KMSG_COMPONENT "dasd"
12 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt 12 #define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
13 13
14 #include <linux/kmod.h> 14 #include <linux/kmod.h>
15 #include <linux/init.h> 15 #include <linux/init.h>
16 #include <linux/interrupt.h> 16 #include <linux/interrupt.h>
17 #include <linux/ctype.h> 17 #include <linux/ctype.h>
18 #include <linux/major.h> 18 #include <linux/major.h>
19 #include <linux/slab.h> 19 #include <linux/slab.h>
20 #include <linux/buffer_head.h> 20 #include <linux/buffer_head.h>
21 #include <linux/hdreg.h> 21 #include <linux/hdreg.h>
22 #include <linux/async.h> 22 #include <linux/async.h>
23 #include <linux/mutex.h> 23 #include <linux/mutex.h>
24 #include <linux/smp_lock.h> 24 #include <linux/smp_lock.h>
25 25
26 #include <asm/ccwdev.h> 26 #include <asm/ccwdev.h>
27 #include <asm/ebcdic.h> 27 #include <asm/ebcdic.h>
28 #include <asm/idals.h> 28 #include <asm/idals.h>
29 #include <asm/itcw.h> 29 #include <asm/itcw.h>
30 #include <asm/diag.h> 30 #include <asm/diag.h>
31 31
32 /* This is ugly... */ 32 /* This is ugly... */
33 #define PRINTK_HEADER "dasd:" 33 #define PRINTK_HEADER "dasd:"
34 34
35 #include "dasd_int.h" 35 #include "dasd_int.h"
36 /* 36 /*
37 * SECTION: Constant definitions to be used within this file 37 * SECTION: Constant definitions to be used within this file
38 */ 38 */
39 #define DASD_CHANQ_MAX_SIZE 4 39 #define DASD_CHANQ_MAX_SIZE 4
40 40
41 #define DASD_SLEEPON_START_TAG (void *) 1 41 #define DASD_SLEEPON_START_TAG (void *) 1
42 #define DASD_SLEEPON_END_TAG (void *) 2 42 #define DASD_SLEEPON_END_TAG (void *) 2
43 43
44 /* 44 /*
45 * SECTION: exported variables of dasd.c 45 * SECTION: exported variables of dasd.c
46 */ 46 */
47 debug_info_t *dasd_debug_area; 47 debug_info_t *dasd_debug_area;
48 struct dasd_discipline *dasd_diag_discipline_pointer; 48 struct dasd_discipline *dasd_diag_discipline_pointer;
49 void dasd_int_handler(struct ccw_device *, unsigned long, struct irb *); 49 void dasd_int_handler(struct ccw_device *, unsigned long, struct irb *);
50 50
51 MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>"); 51 MODULE_AUTHOR("Holger Smolinski <Holger.Smolinski@de.ibm.com>");
52 MODULE_DESCRIPTION("Linux on S/390 DASD device driver," 52 MODULE_DESCRIPTION("Linux on S/390 DASD device driver,"
53 " Copyright 2000 IBM Corporation"); 53 " Copyright 2000 IBM Corporation");
54 MODULE_SUPPORTED_DEVICE("dasd"); 54 MODULE_SUPPORTED_DEVICE("dasd");
55 MODULE_LICENSE("GPL"); 55 MODULE_LICENSE("GPL");
56 56
57 /* 57 /*
58 * SECTION: prototypes for static functions of dasd.c 58 * SECTION: prototypes for static functions of dasd.c
59 */ 59 */
60 static int dasd_alloc_queue(struct dasd_block *); 60 static int dasd_alloc_queue(struct dasd_block *);
61 static void dasd_setup_queue(struct dasd_block *); 61 static void dasd_setup_queue(struct dasd_block *);
62 static void dasd_free_queue(struct dasd_block *); 62 static void dasd_free_queue(struct dasd_block *);
63 static void dasd_flush_request_queue(struct dasd_block *); 63 static void dasd_flush_request_queue(struct dasd_block *);
64 static int dasd_flush_block_queue(struct dasd_block *); 64 static int dasd_flush_block_queue(struct dasd_block *);
65 static void dasd_device_tasklet(struct dasd_device *); 65 static void dasd_device_tasklet(struct dasd_device *);
66 static void dasd_block_tasklet(struct dasd_block *); 66 static void dasd_block_tasklet(struct dasd_block *);
67 static void do_kick_device(struct work_struct *); 67 static void do_kick_device(struct work_struct *);
68 static void do_restore_device(struct work_struct *); 68 static void do_restore_device(struct work_struct *);
69 static void do_reload_device(struct work_struct *); 69 static void do_reload_device(struct work_struct *);
70 static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *); 70 static void dasd_return_cqr_cb(struct dasd_ccw_req *, void *);
71 static void dasd_device_timeout(unsigned long); 71 static void dasd_device_timeout(unsigned long);
72 static void dasd_block_timeout(unsigned long); 72 static void dasd_block_timeout(unsigned long);
73 static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *); 73 static void __dasd_process_erp(struct dasd_device *, struct dasd_ccw_req *);
74 74
75 /* 75 /*
76 * SECTION: Operations on the device structure. 76 * SECTION: Operations on the device structure.
77 */ 77 */
78 static wait_queue_head_t dasd_init_waitq; 78 static wait_queue_head_t dasd_init_waitq;
79 static wait_queue_head_t dasd_flush_wq; 79 static wait_queue_head_t dasd_flush_wq;
80 static wait_queue_head_t generic_waitq; 80 static wait_queue_head_t generic_waitq;
81 81
82 /* 82 /*
83 * Allocate memory for a new device structure. 83 * Allocate memory for a new device structure.
84 */ 84 */
85 struct dasd_device *dasd_alloc_device(void) 85 struct dasd_device *dasd_alloc_device(void)
86 { 86 {
87 struct dasd_device *device; 87 struct dasd_device *device;
88 88
89 device = kzalloc(sizeof(struct dasd_device), GFP_ATOMIC); 89 device = kzalloc(sizeof(struct dasd_device), GFP_ATOMIC);
90 if (!device) 90 if (!device)
91 return ERR_PTR(-ENOMEM); 91 return ERR_PTR(-ENOMEM);
92 92
93 /* Get two pages for normal block device operations. */ 93 /* Get two pages for normal block device operations. */
94 device->ccw_mem = (void *) __get_free_pages(GFP_ATOMIC | GFP_DMA, 1); 94 device->ccw_mem = (void *) __get_free_pages(GFP_ATOMIC | GFP_DMA, 1);
95 if (!device->ccw_mem) { 95 if (!device->ccw_mem) {
96 kfree(device); 96 kfree(device);
97 return ERR_PTR(-ENOMEM); 97 return ERR_PTR(-ENOMEM);
98 } 98 }
99 /* Get one page for error recovery. */ 99 /* Get one page for error recovery. */
100 device->erp_mem = (void *) get_zeroed_page(GFP_ATOMIC | GFP_DMA); 100 device->erp_mem = (void *) get_zeroed_page(GFP_ATOMIC | GFP_DMA);
101 if (!device->erp_mem) { 101 if (!device->erp_mem) {
102 free_pages((unsigned long) device->ccw_mem, 1); 102 free_pages((unsigned long) device->ccw_mem, 1);
103 kfree(device); 103 kfree(device);
104 return ERR_PTR(-ENOMEM); 104 return ERR_PTR(-ENOMEM);
105 } 105 }
106 106
107 dasd_init_chunklist(&device->ccw_chunks, device->ccw_mem, PAGE_SIZE*2); 107 dasd_init_chunklist(&device->ccw_chunks, device->ccw_mem, PAGE_SIZE*2);
108 dasd_init_chunklist(&device->erp_chunks, device->erp_mem, PAGE_SIZE); 108 dasd_init_chunklist(&device->erp_chunks, device->erp_mem, PAGE_SIZE);
109 spin_lock_init(&device->mem_lock); 109 spin_lock_init(&device->mem_lock);
110 atomic_set(&device->tasklet_scheduled, 0); 110 atomic_set(&device->tasklet_scheduled, 0);
111 tasklet_init(&device->tasklet, 111 tasklet_init(&device->tasklet,
112 (void (*)(unsigned long)) dasd_device_tasklet, 112 (void (*)(unsigned long)) dasd_device_tasklet,
113 (unsigned long) device); 113 (unsigned long) device);
114 INIT_LIST_HEAD(&device->ccw_queue); 114 INIT_LIST_HEAD(&device->ccw_queue);
115 init_timer(&device->timer); 115 init_timer(&device->timer);
116 device->timer.function = dasd_device_timeout; 116 device->timer.function = dasd_device_timeout;
117 device->timer.data = (unsigned long) device; 117 device->timer.data = (unsigned long) device;
118 INIT_WORK(&device->kick_work, do_kick_device); 118 INIT_WORK(&device->kick_work, do_kick_device);
119 INIT_WORK(&device->restore_device, do_restore_device); 119 INIT_WORK(&device->restore_device, do_restore_device);
120 INIT_WORK(&device->reload_device, do_reload_device); 120 INIT_WORK(&device->reload_device, do_reload_device);
121 device->state = DASD_STATE_NEW; 121 device->state = DASD_STATE_NEW;
122 device->target = DASD_STATE_NEW; 122 device->target = DASD_STATE_NEW;
123 mutex_init(&device->state_mutex); 123 mutex_init(&device->state_mutex);
124 124
125 return device; 125 return device;
126 } 126 }
127 127
128 /* 128 /*
129 * Free memory of a device structure. 129 * Free memory of a device structure.
130 */ 130 */
131 void dasd_free_device(struct dasd_device *device) 131 void dasd_free_device(struct dasd_device *device)
132 { 132 {
133 kfree(device->private); 133 kfree(device->private);
134 free_page((unsigned long) device->erp_mem); 134 free_page((unsigned long) device->erp_mem);
135 free_pages((unsigned long) device->ccw_mem, 1); 135 free_pages((unsigned long) device->ccw_mem, 1);
136 kfree(device); 136 kfree(device);
137 } 137 }
138 138
139 /* 139 /*
140 * Allocate memory for a new device structure. 140 * Allocate memory for a new device structure.
141 */ 141 */
142 struct dasd_block *dasd_alloc_block(void) 142 struct dasd_block *dasd_alloc_block(void)
143 { 143 {
144 struct dasd_block *block; 144 struct dasd_block *block;
145 145
146 block = kzalloc(sizeof(*block), GFP_ATOMIC); 146 block = kzalloc(sizeof(*block), GFP_ATOMIC);
147 if (!block) 147 if (!block)
148 return ERR_PTR(-ENOMEM); 148 return ERR_PTR(-ENOMEM);
149 /* open_count = 0 means device online but not in use */ 149 /* open_count = 0 means device online but not in use */
150 atomic_set(&block->open_count, -1); 150 atomic_set(&block->open_count, -1);
151 151
152 spin_lock_init(&block->request_queue_lock); 152 spin_lock_init(&block->request_queue_lock);
153 atomic_set(&block->tasklet_scheduled, 0); 153 atomic_set(&block->tasklet_scheduled, 0);
154 tasklet_init(&block->tasklet, 154 tasklet_init(&block->tasklet,
155 (void (*)(unsigned long)) dasd_block_tasklet, 155 (void (*)(unsigned long)) dasd_block_tasklet,
156 (unsigned long) block); 156 (unsigned long) block);
157 INIT_LIST_HEAD(&block->ccw_queue); 157 INIT_LIST_HEAD(&block->ccw_queue);
158 spin_lock_init(&block->queue_lock); 158 spin_lock_init(&block->queue_lock);
159 init_timer(&block->timer); 159 init_timer(&block->timer);
160 block->timer.function = dasd_block_timeout; 160 block->timer.function = dasd_block_timeout;
161 block->timer.data = (unsigned long) block; 161 block->timer.data = (unsigned long) block;
162 162
163 return block; 163 return block;
164 } 164 }
165 165
166 /* 166 /*
167 * Free memory of a device structure. 167 * Free memory of a device structure.
168 */ 168 */
169 void dasd_free_block(struct dasd_block *block) 169 void dasd_free_block(struct dasd_block *block)
170 { 170 {
171 kfree(block); 171 kfree(block);
172 } 172 }
173 173
174 /* 174 /*
175 * Make a new device known to the system. 175 * Make a new device known to the system.
176 */ 176 */
177 static int dasd_state_new_to_known(struct dasd_device *device) 177 static int dasd_state_new_to_known(struct dasd_device *device)
178 { 178 {
179 int rc; 179 int rc;
180 180
181 /* 181 /*
182 * As long as the device is not in state DASD_STATE_NEW we want to 182 * As long as the device is not in state DASD_STATE_NEW we want to
183 * keep the reference count > 0. 183 * keep the reference count > 0.
184 */ 184 */
185 dasd_get_device(device); 185 dasd_get_device(device);
186 186
187 if (device->block) { 187 if (device->block) {
188 rc = dasd_alloc_queue(device->block); 188 rc = dasd_alloc_queue(device->block);
189 if (rc) { 189 if (rc) {
190 dasd_put_device(device); 190 dasd_put_device(device);
191 return rc; 191 return rc;
192 } 192 }
193 } 193 }
194 device->state = DASD_STATE_KNOWN; 194 device->state = DASD_STATE_KNOWN;
195 return 0; 195 return 0;
196 } 196 }
197 197
198 /* 198 /*
199 * Let the system forget about a device. 199 * Let the system forget about a device.
200 */ 200 */
201 static int dasd_state_known_to_new(struct dasd_device *device) 201 static int dasd_state_known_to_new(struct dasd_device *device)
202 { 202 {
203 /* Disable extended error reporting for this device. */ 203 /* Disable extended error reporting for this device. */
204 dasd_eer_disable(device); 204 dasd_eer_disable(device);
205 /* Forget the discipline information. */ 205 /* Forget the discipline information. */
206 if (device->discipline) { 206 if (device->discipline) {
207 if (device->discipline->uncheck_device) 207 if (device->discipline->uncheck_device)
208 device->discipline->uncheck_device(device); 208 device->discipline->uncheck_device(device);
209 module_put(device->discipline->owner); 209 module_put(device->discipline->owner);
210 } 210 }
211 device->discipline = NULL; 211 device->discipline = NULL;
212 if (device->base_discipline) 212 if (device->base_discipline)
213 module_put(device->base_discipline->owner); 213 module_put(device->base_discipline->owner);
214 device->base_discipline = NULL; 214 device->base_discipline = NULL;
215 device->state = DASD_STATE_NEW; 215 device->state = DASD_STATE_NEW;
216 216
217 if (device->block) 217 if (device->block)
218 dasd_free_queue(device->block); 218 dasd_free_queue(device->block);
219 219
220 /* Give up reference we took in dasd_state_new_to_known. */ 220 /* Give up reference we took in dasd_state_new_to_known. */
221 dasd_put_device(device); 221 dasd_put_device(device);
222 return 0; 222 return 0;
223 } 223 }
224 224
225 /* 225 /*
226 * Request the irq line for the device. 226 * Request the irq line for the device.
227 */ 227 */
228 static int dasd_state_known_to_basic(struct dasd_device *device) 228 static int dasd_state_known_to_basic(struct dasd_device *device)
229 { 229 {
230 int rc; 230 int rc;
231 231
232 /* Allocate and register gendisk structure. */ 232 /* Allocate and register gendisk structure. */
233 if (device->block) { 233 if (device->block) {
234 rc = dasd_gendisk_alloc(device->block); 234 rc = dasd_gendisk_alloc(device->block);
235 if (rc) 235 if (rc)
236 return rc; 236 return rc;
237 } 237 }
238 /* register 'device' debug area, used for all DBF_DEV_XXX calls */ 238 /* register 'device' debug area, used for all DBF_DEV_XXX calls */
239 device->debug_area = debug_register(dev_name(&device->cdev->dev), 4, 1, 239 device->debug_area = debug_register(dev_name(&device->cdev->dev), 4, 1,
240 8 * sizeof(long)); 240 8 * sizeof(long));
241 debug_register_view(device->debug_area, &debug_sprintf_view); 241 debug_register_view(device->debug_area, &debug_sprintf_view);
242 debug_set_level(device->debug_area, DBF_WARNING); 242 debug_set_level(device->debug_area, DBF_WARNING);
243 DBF_DEV_EVENT(DBF_EMERG, device, "%s", "debug area created"); 243 DBF_DEV_EVENT(DBF_EMERG, device, "%s", "debug area created");
244 244
245 device->state = DASD_STATE_BASIC; 245 device->state = DASD_STATE_BASIC;
246 return 0; 246 return 0;
247 } 247 }
248 248
249 /* 249 /*
250 * Release the irq line for the device. Terminate any running i/o. 250 * Release the irq line for the device. Terminate any running i/o.
251 */ 251 */
252 static int dasd_state_basic_to_known(struct dasd_device *device) 252 static int dasd_state_basic_to_known(struct dasd_device *device)
253 { 253 {
254 int rc; 254 int rc;
255 if (device->block) { 255 if (device->block) {
256 dasd_gendisk_free(device->block); 256 dasd_gendisk_free(device->block);
257 dasd_block_clear_timer(device->block); 257 dasd_block_clear_timer(device->block);
258 } 258 }
259 rc = dasd_flush_device_queue(device); 259 rc = dasd_flush_device_queue(device);
260 if (rc) 260 if (rc)
261 return rc; 261 return rc;
262 dasd_device_clear_timer(device); 262 dasd_device_clear_timer(device);
263 263
264 DBF_DEV_EVENT(DBF_EMERG, device, "%p debug area deleted", device); 264 DBF_DEV_EVENT(DBF_EMERG, device, "%p debug area deleted", device);
265 if (device->debug_area != NULL) { 265 if (device->debug_area != NULL) {
266 debug_unregister(device->debug_area); 266 debug_unregister(device->debug_area);
267 device->debug_area = NULL; 267 device->debug_area = NULL;
268 } 268 }
269 device->state = DASD_STATE_KNOWN; 269 device->state = DASD_STATE_KNOWN;
270 return 0; 270 return 0;
271 } 271 }
272 272
273 /* 273 /*
274 * Do the initial analysis. The do_analysis function may return 274 * Do the initial analysis. The do_analysis function may return
275 * -EAGAIN in which case the device keeps the state DASD_STATE_BASIC 275 * -EAGAIN in which case the device keeps the state DASD_STATE_BASIC
276 * until the discipline decides to continue the startup sequence 276 * until the discipline decides to continue the startup sequence
277 * by calling the function dasd_change_state. The eckd discipline 277 * by calling the function dasd_change_state. The eckd discipline
278 * uses this to start a ccw that detects the format. The completion 278 * uses this to start a ccw that detects the format. The completion
279 * interrupt for this detection ccw uses the kernel event daemon to 279 * interrupt for this detection ccw uses the kernel event daemon to
280 * trigger the call to dasd_change_state. All this is done in the 280 * trigger the call to dasd_change_state. All this is done in the
281 * discipline code, see dasd_eckd.c. 281 * discipline code, see dasd_eckd.c.
282 * After the analysis ccw is done (do_analysis returned 0) the block 282 * After the analysis ccw is done (do_analysis returned 0) the block
283 * device is setup. 283 * device is setup.
284 * In case the analysis returns an error, the device setup is stopped 284 * In case the analysis returns an error, the device setup is stopped
285 * (a fake disk was already added to allow formatting). 285 * (a fake disk was already added to allow formatting).
286 */ 286 */
287 static int dasd_state_basic_to_ready(struct dasd_device *device) 287 static int dasd_state_basic_to_ready(struct dasd_device *device)
288 { 288 {
289 int rc; 289 int rc;
290 struct dasd_block *block; 290 struct dasd_block *block;
291 291
292 rc = 0; 292 rc = 0;
293 block = device->block; 293 block = device->block;
294 /* make disk known with correct capacity */ 294 /* make disk known with correct capacity */
295 if (block) { 295 if (block) {
296 if (block->base->discipline->do_analysis != NULL) 296 if (block->base->discipline->do_analysis != NULL)
297 rc = block->base->discipline->do_analysis(block); 297 rc = block->base->discipline->do_analysis(block);
298 if (rc) { 298 if (rc) {
299 if (rc != -EAGAIN) 299 if (rc != -EAGAIN)
300 device->state = DASD_STATE_UNFMT; 300 device->state = DASD_STATE_UNFMT;
301 return rc; 301 return rc;
302 } 302 }
303 dasd_setup_queue(block); 303 dasd_setup_queue(block);
304 set_capacity(block->gdp, 304 set_capacity(block->gdp,
305 block->blocks << block->s2b_shift); 305 block->blocks << block->s2b_shift);
306 device->state = DASD_STATE_READY; 306 device->state = DASD_STATE_READY;
307 rc = dasd_scan_partitions(block); 307 rc = dasd_scan_partitions(block);
308 if (rc) 308 if (rc)
309 device->state = DASD_STATE_BASIC; 309 device->state = DASD_STATE_BASIC;
310 } else { 310 } else {
311 device->state = DASD_STATE_READY; 311 device->state = DASD_STATE_READY;
312 } 312 }
313 return rc; 313 return rc;
314 } 314 }
315 315
316 /* 316 /*
317 * Remove device from block device layer. Destroy dirty buffers. 317 * Remove device from block device layer. Destroy dirty buffers.
318 * Forget format information. Check if the target level is basic 318 * Forget format information. Check if the target level is basic
319 * and if it is create fake disk for formatting. 319 * and if it is create fake disk for formatting.
320 */ 320 */
321 static int dasd_state_ready_to_basic(struct dasd_device *device) 321 static int dasd_state_ready_to_basic(struct dasd_device *device)
322 { 322 {
323 int rc; 323 int rc;
324 324
325 device->state = DASD_STATE_BASIC; 325 device->state = DASD_STATE_BASIC;
326 if (device->block) { 326 if (device->block) {
327 struct dasd_block *block = device->block; 327 struct dasd_block *block = device->block;
328 rc = dasd_flush_block_queue(block); 328 rc = dasd_flush_block_queue(block);
329 if (rc) { 329 if (rc) {
330 device->state = DASD_STATE_READY; 330 device->state = DASD_STATE_READY;
331 return rc; 331 return rc;
332 } 332 }
333 dasd_flush_request_queue(block); 333 dasd_flush_request_queue(block);
334 dasd_destroy_partitions(block); 334 dasd_destroy_partitions(block);
335 block->blocks = 0; 335 block->blocks = 0;
336 block->bp_block = 0; 336 block->bp_block = 0;
337 block->s2b_shift = 0; 337 block->s2b_shift = 0;
338 } 338 }
339 return 0; 339 return 0;
340 } 340 }
341 341
342 /* 342 /*
343 * Back to basic. 343 * Back to basic.
344 */ 344 */
345 static int dasd_state_unfmt_to_basic(struct dasd_device *device) 345 static int dasd_state_unfmt_to_basic(struct dasd_device *device)
346 { 346 {
347 device->state = DASD_STATE_BASIC; 347 device->state = DASD_STATE_BASIC;
348 return 0; 348 return 0;
349 } 349 }
350 350
351 /* 351 /*
352 * Make the device online and schedule the bottom half to start 352 * Make the device online and schedule the bottom half to start
353 * the requeueing of requests from the linux request queue to the 353 * the requeueing of requests from the linux request queue to the
354 * ccw queue. 354 * ccw queue.
355 */ 355 */
356 static int 356 static int
357 dasd_state_ready_to_online(struct dasd_device * device) 357 dasd_state_ready_to_online(struct dasd_device * device)
358 { 358 {
359 int rc; 359 int rc;
360 struct gendisk *disk; 360 struct gendisk *disk;
361 struct disk_part_iter piter; 361 struct disk_part_iter piter;
362 struct hd_struct *part; 362 struct hd_struct *part;
363 363
364 if (device->discipline->ready_to_online) { 364 if (device->discipline->ready_to_online) {
365 rc = device->discipline->ready_to_online(device); 365 rc = device->discipline->ready_to_online(device);
366 if (rc) 366 if (rc)
367 return rc; 367 return rc;
368 } 368 }
369 device->state = DASD_STATE_ONLINE; 369 device->state = DASD_STATE_ONLINE;
370 if (device->block) { 370 if (device->block) {
371 dasd_schedule_block_bh(device->block); 371 dasd_schedule_block_bh(device->block);
372 disk = device->block->bdev->bd_disk; 372 disk = device->block->bdev->bd_disk;
373 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); 373 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
374 while ((part = disk_part_iter_next(&piter))) 374 while ((part = disk_part_iter_next(&piter)))
375 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); 375 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
376 disk_part_iter_exit(&piter); 376 disk_part_iter_exit(&piter);
377 } 377 }
378 return 0; 378 return 0;
379 } 379 }
380 380
381 /* 381 /*
382 * Stop the requeueing of requests again. 382 * Stop the requeueing of requests again.
383 */ 383 */
384 static int dasd_state_online_to_ready(struct dasd_device *device) 384 static int dasd_state_online_to_ready(struct dasd_device *device)
385 { 385 {
386 int rc; 386 int rc;
387 struct gendisk *disk; 387 struct gendisk *disk;
388 struct disk_part_iter piter; 388 struct disk_part_iter piter;
389 struct hd_struct *part; 389 struct hd_struct *part;
390 390
391 if (device->discipline->online_to_ready) { 391 if (device->discipline->online_to_ready) {
392 rc = device->discipline->online_to_ready(device); 392 rc = device->discipline->online_to_ready(device);
393 if (rc) 393 if (rc)
394 return rc; 394 return rc;
395 } 395 }
396 device->state = DASD_STATE_READY; 396 device->state = DASD_STATE_READY;
397 if (device->block) { 397 if (device->block) {
398 disk = device->block->bdev->bd_disk; 398 disk = device->block->bdev->bd_disk;
399 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); 399 disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
400 while ((part = disk_part_iter_next(&piter))) 400 while ((part = disk_part_iter_next(&piter)))
401 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE); 401 kobject_uevent(&part_to_dev(part)->kobj, KOBJ_CHANGE);
402 disk_part_iter_exit(&piter); 402 disk_part_iter_exit(&piter);
403 } 403 }
404 return 0; 404 return 0;
405 } 405 }
406 406
407 /* 407 /*
408 * Device startup state changes. 408 * Device startup state changes.
409 */ 409 */
410 static int dasd_increase_state(struct dasd_device *device) 410 static int dasd_increase_state(struct dasd_device *device)
411 { 411 {
412 int rc; 412 int rc;
413 413
414 rc = 0; 414 rc = 0;
415 if (device->state == DASD_STATE_NEW && 415 if (device->state == DASD_STATE_NEW &&
416 device->target >= DASD_STATE_KNOWN) 416 device->target >= DASD_STATE_KNOWN)
417 rc = dasd_state_new_to_known(device); 417 rc = dasd_state_new_to_known(device);
418 418
419 if (!rc && 419 if (!rc &&
420 device->state == DASD_STATE_KNOWN && 420 device->state == DASD_STATE_KNOWN &&
421 device->target >= DASD_STATE_BASIC) 421 device->target >= DASD_STATE_BASIC)
422 rc = dasd_state_known_to_basic(device); 422 rc = dasd_state_known_to_basic(device);
423 423
424 if (!rc && 424 if (!rc &&
425 device->state == DASD_STATE_BASIC && 425 device->state == DASD_STATE_BASIC &&
426 device->target >= DASD_STATE_READY) 426 device->target >= DASD_STATE_READY)
427 rc = dasd_state_basic_to_ready(device); 427 rc = dasd_state_basic_to_ready(device);
428 428
429 if (!rc && 429 if (!rc &&
430 device->state == DASD_STATE_UNFMT && 430 device->state == DASD_STATE_UNFMT &&
431 device->target > DASD_STATE_UNFMT) 431 device->target > DASD_STATE_UNFMT)
432 rc = -EPERM; 432 rc = -EPERM;
433 433
434 if (!rc && 434 if (!rc &&
435 device->state == DASD_STATE_READY && 435 device->state == DASD_STATE_READY &&
436 device->target >= DASD_STATE_ONLINE) 436 device->target >= DASD_STATE_ONLINE)
437 rc = dasd_state_ready_to_online(device); 437 rc = dasd_state_ready_to_online(device);
438 438
439 return rc; 439 return rc;
440 } 440 }
441 441
442 /* 442 /*
443 * Device shutdown state changes. 443 * Device shutdown state changes.
444 */ 444 */
445 static int dasd_decrease_state(struct dasd_device *device) 445 static int dasd_decrease_state(struct dasd_device *device)
446 { 446 {
447 int rc; 447 int rc;
448 448
449 rc = 0; 449 rc = 0;
450 if (device->state == DASD_STATE_ONLINE && 450 if (device->state == DASD_STATE_ONLINE &&
451 device->target <= DASD_STATE_READY) 451 device->target <= DASD_STATE_READY)
452 rc = dasd_state_online_to_ready(device); 452 rc = dasd_state_online_to_ready(device);
453 453
454 if (!rc && 454 if (!rc &&
455 device->state == DASD_STATE_READY && 455 device->state == DASD_STATE_READY &&
456 device->target <= DASD_STATE_BASIC) 456 device->target <= DASD_STATE_BASIC)
457 rc = dasd_state_ready_to_basic(device); 457 rc = dasd_state_ready_to_basic(device);
458 458
459 if (!rc && 459 if (!rc &&
460 device->state == DASD_STATE_UNFMT && 460 device->state == DASD_STATE_UNFMT &&
461 device->target <= DASD_STATE_BASIC) 461 device->target <= DASD_STATE_BASIC)
462 rc = dasd_state_unfmt_to_basic(device); 462 rc = dasd_state_unfmt_to_basic(device);
463 463
464 if (!rc && 464 if (!rc &&
465 device->state == DASD_STATE_BASIC && 465 device->state == DASD_STATE_BASIC &&
466 device->target <= DASD_STATE_KNOWN) 466 device->target <= DASD_STATE_KNOWN)
467 rc = dasd_state_basic_to_known(device); 467 rc = dasd_state_basic_to_known(device);
468 468
469 if (!rc && 469 if (!rc &&
470 device->state == DASD_STATE_KNOWN && 470 device->state == DASD_STATE_KNOWN &&
471 device->target <= DASD_STATE_NEW) 471 device->target <= DASD_STATE_NEW)
472 rc = dasd_state_known_to_new(device); 472 rc = dasd_state_known_to_new(device);
473 473
474 return rc; 474 return rc;
475 } 475 }
476 476
477 /* 477 /*
478 * This is the main startup/shutdown routine. 478 * This is the main startup/shutdown routine.
479 */ 479 */
480 static void dasd_change_state(struct dasd_device *device) 480 static void dasd_change_state(struct dasd_device *device)
481 { 481 {
482 int rc; 482 int rc;
483 483
484 if (device->state == device->target) 484 if (device->state == device->target)
485 /* Already where we want to go today... */ 485 /* Already where we want to go today... */
486 return; 486 return;
487 if (device->state < device->target) 487 if (device->state < device->target)
488 rc = dasd_increase_state(device); 488 rc = dasd_increase_state(device);
489 else 489 else
490 rc = dasd_decrease_state(device); 490 rc = dasd_decrease_state(device);
491 if (rc == -EAGAIN) 491 if (rc == -EAGAIN)
492 return; 492 return;
493 if (rc) 493 if (rc)
494 device->target = device->state; 494 device->target = device->state;
495 495
496 if (device->state == device->target) 496 if (device->state == device->target)
497 wake_up(&dasd_init_waitq); 497 wake_up(&dasd_init_waitq);
498 498
499 /* let user-space know that the device status changed */ 499 /* let user-space know that the device status changed */
500 kobject_uevent(&device->cdev->dev.kobj, KOBJ_CHANGE); 500 kobject_uevent(&device->cdev->dev.kobj, KOBJ_CHANGE);
501 } 501 }
502 502
503 /* 503 /*
504 * Kick starter for devices that did not complete the startup/shutdown 504 * Kick starter for devices that did not complete the startup/shutdown
505 * procedure or were sleeping because of a pending state. 505 * procedure or were sleeping because of a pending state.
506 * dasd_kick_device will schedule a call to do_kick_device via the kernel 506 * dasd_kick_device will schedule a call to do_kick_device via the kernel
507 * event daemon. 507 * event daemon.
508 */ 508 */
509 static void do_kick_device(struct work_struct *work) 509 static void do_kick_device(struct work_struct *work)
510 { 510 {
511 struct dasd_device *device = container_of(work, struct dasd_device, kick_work); 511 struct dasd_device *device = container_of(work, struct dasd_device, kick_work);
512 mutex_lock(&device->state_mutex); 512 mutex_lock(&device->state_mutex);
513 dasd_change_state(device); 513 dasd_change_state(device);
514 mutex_unlock(&device->state_mutex); 514 mutex_unlock(&device->state_mutex);
515 dasd_schedule_device_bh(device); 515 dasd_schedule_device_bh(device);
516 dasd_put_device(device); 516 dasd_put_device(device);
517 } 517 }
518 518
519 void dasd_kick_device(struct dasd_device *device) 519 void dasd_kick_device(struct dasd_device *device)
520 { 520 {
521 dasd_get_device(device); 521 dasd_get_device(device);
522 /* queue call to dasd_kick_device to the kernel event daemon. */ 522 /* queue call to dasd_kick_device to the kernel event daemon. */
523 schedule_work(&device->kick_work); 523 schedule_work(&device->kick_work);
524 } 524 }
525 525
526 /* 526 /*
527 * dasd_reload_device will schedule a call to do_reload_device via the kernel 527 * dasd_reload_device will schedule a call to do_reload_device via the kernel
528 * event daemon. 528 * event daemon.
529 */ 529 */
530 static void do_reload_device(struct work_struct *work) 530 static void do_reload_device(struct work_struct *work)
531 { 531 {
532 struct dasd_device *device = container_of(work, struct dasd_device, 532 struct dasd_device *device = container_of(work, struct dasd_device,
533 reload_device); 533 reload_device);
534 device->discipline->reload(device); 534 device->discipline->reload(device);
535 dasd_put_device(device); 535 dasd_put_device(device);
536 } 536 }
537 537
538 void dasd_reload_device(struct dasd_device *device) 538 void dasd_reload_device(struct dasd_device *device)
539 { 539 {
540 dasd_get_device(device); 540 dasd_get_device(device);
541 /* queue call to dasd_reload_device to the kernel event daemon. */ 541 /* queue call to dasd_reload_device to the kernel event daemon. */
542 schedule_work(&device->reload_device); 542 schedule_work(&device->reload_device);
543 } 543 }
544 EXPORT_SYMBOL(dasd_reload_device); 544 EXPORT_SYMBOL(dasd_reload_device);
545 545
546 /* 546 /*
547 * dasd_restore_device will schedule a call to do_restore_device via the kernel 547 * dasd_restore_device will schedule a call to do_restore_device via the kernel
548 * event daemon. 548 * event daemon.
549 */ 549 */
550 static void do_restore_device(struct work_struct *work) 550 static void do_restore_device(struct work_struct *work)
551 { 551 {
552 struct dasd_device *device = container_of(work, struct dasd_device, 552 struct dasd_device *device = container_of(work, struct dasd_device,
553 restore_device); 553 restore_device);
554 device->cdev->drv->restore(device->cdev); 554 device->cdev->drv->restore(device->cdev);
555 dasd_put_device(device); 555 dasd_put_device(device);
556 } 556 }
557 557
558 void dasd_restore_device(struct dasd_device *device) 558 void dasd_restore_device(struct dasd_device *device)
559 { 559 {
560 dasd_get_device(device); 560 dasd_get_device(device);
561 /* queue call to dasd_restore_device to the kernel event daemon. */ 561 /* queue call to dasd_restore_device to the kernel event daemon. */
562 schedule_work(&device->restore_device); 562 schedule_work(&device->restore_device);
563 } 563 }
564 564
565 /* 565 /*
566 * Set the target state for a device and start the state change. 566 * Set the target state for a device and start the state change.
567 */ 567 */
568 void dasd_set_target_state(struct dasd_device *device, int target) 568 void dasd_set_target_state(struct dasd_device *device, int target)
569 { 569 {
570 dasd_get_device(device); 570 dasd_get_device(device);
571 mutex_lock(&device->state_mutex); 571 mutex_lock(&device->state_mutex);
572 /* If we are in probeonly mode stop at DASD_STATE_READY. */ 572 /* If we are in probeonly mode stop at DASD_STATE_READY. */
573 if (dasd_probeonly && target > DASD_STATE_READY) 573 if (dasd_probeonly && target > DASD_STATE_READY)
574 target = DASD_STATE_READY; 574 target = DASD_STATE_READY;
575 if (device->target != target) { 575 if (device->target != target) {
576 if (device->state == target) 576 if (device->state == target)
577 wake_up(&dasd_init_waitq); 577 wake_up(&dasd_init_waitq);
578 device->target = target; 578 device->target = target;
579 } 579 }
580 if (device->state != device->target) 580 if (device->state != device->target)
581 dasd_change_state(device); 581 dasd_change_state(device);
582 mutex_unlock(&device->state_mutex); 582 mutex_unlock(&device->state_mutex);
583 dasd_put_device(device); 583 dasd_put_device(device);
584 } 584 }
585 585
586 /* 586 /*
587 * Enable devices with device numbers in [from..to]. 587 * Enable devices with device numbers in [from..to].
588 */ 588 */
589 static inline int _wait_for_device(struct dasd_device *device) 589 static inline int _wait_for_device(struct dasd_device *device)
590 { 590 {
591 return (device->state == device->target); 591 return (device->state == device->target);
592 } 592 }
593 593
594 void dasd_enable_device(struct dasd_device *device) 594 void dasd_enable_device(struct dasd_device *device)
595 { 595 {
596 dasd_set_target_state(device, DASD_STATE_ONLINE); 596 dasd_set_target_state(device, DASD_STATE_ONLINE);
597 if (device->state <= DASD_STATE_KNOWN) 597 if (device->state <= DASD_STATE_KNOWN)
598 /* No discipline for device found. */ 598 /* No discipline for device found. */
599 dasd_set_target_state(device, DASD_STATE_NEW); 599 dasd_set_target_state(device, DASD_STATE_NEW);
600 /* Now wait for the devices to come up. */ 600 /* Now wait for the devices to come up. */
601 wait_event(dasd_init_waitq, _wait_for_device(device)); 601 wait_event(dasd_init_waitq, _wait_for_device(device));
602 } 602 }
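
dasd_set_target_state() and dasd_change_state() above walk the device through one state transition per step until device->state reaches device->target. A small illustrative sketch of driving the ladder in both directions; my_cycle_device is an assumed caller, not part of the driver.

#include "dasd_int.h"

static void my_cycle_device(struct dasd_device *device)
{
        /* Up:   NEW -> KNOWN -> BASIC -> READY -> ONLINE
         * (an analysis failure parks the device in UNFMT instead of READY). */
        dasd_set_target_state(device, DASD_STATE_ONLINE);

        /* Down: ONLINE -> READY -> BASIC -> KNOWN -> NEW */
        dasd_set_target_state(device, DASD_STATE_NEW);
}
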
603 603
604 /* 604 /*
605 * SECTION: device operation (interrupt handler, start i/o, term i/o ...) 605 * SECTION: device operation (interrupt handler, start i/o, term i/o ...)
606 */ 606 */
607 #ifdef CONFIG_DASD_PROFILE 607 #ifdef CONFIG_DASD_PROFILE
608 608
609 struct dasd_profile_info_t dasd_global_profile; 609 struct dasd_profile_info_t dasd_global_profile;
610 unsigned int dasd_profile_level = DASD_PROFILE_OFF; 610 unsigned int dasd_profile_level = DASD_PROFILE_OFF;
611 611
612 /* 612 /*
613 * Increments counter in global and local profiling structures. 613 * Increments counter in global and local profiling structures.
614 */ 614 */
615 #define dasd_profile_counter(value, counter, block) \ 615 #define dasd_profile_counter(value, counter, block) \
616 { \ 616 { \
617 int index; \ 617 int index; \
618 for (index = 0; index < 31 && value >> (2+index); index++); \ 618 for (index = 0; index < 31 && value >> (2+index); index++); \
619 dasd_global_profile.counter[index]++; \ 619 dasd_global_profile.counter[index]++; \
620 block->profile.counter[index]++; \ 620 block->profile.counter[index]++; \
621 } 621 }
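
The macro above drops a sample into a logarithmic histogram: the bucket index is the smallest index (capped at 31) for which value >> (2 + index) becomes zero, i.e. the buckets are [0..3], [4..7], [8..15] and so on. An equivalent standalone helper, for illustration only:

static inline int my_profile_bucket(unsigned long value)
{
        int index;

        /* Smallest index with value < (4UL << index), capped at 31. */
        for (index = 0; index < 31 && value >> (2 + index); index++)
                ;
        return index;   /* e.g. value 100 -> bucket 5, since 64 <= 100 < 128 */
}
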
622 622
623 /* 623 /*
624 * Add profiling information for cqr before execution. 624 * Add profiling information for cqr before execution.
625 */ 625 */
626 static void dasd_profile_start(struct dasd_block *block, 626 static void dasd_profile_start(struct dasd_block *block,
627 struct dasd_ccw_req *cqr, 627 struct dasd_ccw_req *cqr,
628 struct request *req) 628 struct request *req)
629 { 629 {
630 struct list_head *l; 630 struct list_head *l;
631 unsigned int counter; 631 unsigned int counter;
632 632
633 if (dasd_profile_level != DASD_PROFILE_ON) 633 if (dasd_profile_level != DASD_PROFILE_ON)
634 return; 634 return;
635 635
636 /* count the length of the chanq for statistics */ 636 /* count the length of the chanq for statistics */
637 counter = 0; 637 counter = 0;
638 list_for_each(l, &block->ccw_queue) 638 list_for_each(l, &block->ccw_queue)
639 if (++counter >= 31) 639 if (++counter >= 31)
640 break; 640 break;
641 dasd_global_profile.dasd_io_nr_req[counter]++; 641 dasd_global_profile.dasd_io_nr_req[counter]++;
642 block->profile.dasd_io_nr_req[counter]++; 642 block->profile.dasd_io_nr_req[counter]++;
643 } 643 }
644 644
645 /* 645 /*
646 * Add profiling information for cqr after execution. 646 * Add profiling information for cqr after execution.
647 */ 647 */
648 static void dasd_profile_end(struct dasd_block *block, 648 static void dasd_profile_end(struct dasd_block *block,
649 struct dasd_ccw_req *cqr, 649 struct dasd_ccw_req *cqr,
650 struct request *req) 650 struct request *req)
651 { 651 {
652 long strtime, irqtime, endtime, tottime; /* in microseconds */ 652 long strtime, irqtime, endtime, tottime; /* in microseconds */
653 long tottimeps, sectors; 653 long tottimeps, sectors;
654 654
655 if (dasd_profile_level != DASD_PROFILE_ON) 655 if (dasd_profile_level != DASD_PROFILE_ON)
656 return; 656 return;
657 657
658 sectors = blk_rq_sectors(req); 658 sectors = blk_rq_sectors(req);
659 if (!cqr->buildclk || !cqr->startclk || 659 if (!cqr->buildclk || !cqr->startclk ||
660 !cqr->stopclk || !cqr->endclk || 660 !cqr->stopclk || !cqr->endclk ||
661 !sectors) 661 !sectors)
662 return; 662 return;
663 663
664 strtime = ((cqr->startclk - cqr->buildclk) >> 12); 664 strtime = ((cqr->startclk - cqr->buildclk) >> 12);
665 irqtime = ((cqr->stopclk - cqr->startclk) >> 12); 665 irqtime = ((cqr->stopclk - cqr->startclk) >> 12);
666 endtime = ((cqr->endclk - cqr->stopclk) >> 12); 666 endtime = ((cqr->endclk - cqr->stopclk) >> 12);
667 tottime = ((cqr->endclk - cqr->buildclk) >> 12); 667 tottime = ((cqr->endclk - cqr->buildclk) >> 12);
668 tottimeps = tottime / sectors; 668 tottimeps = tottime / sectors;
669 669
670 if (!dasd_global_profile.dasd_io_reqs) 670 if (!dasd_global_profile.dasd_io_reqs)
671 memset(&dasd_global_profile, 0, 671 memset(&dasd_global_profile, 0,
672 sizeof(struct dasd_profile_info_t)); 672 sizeof(struct dasd_profile_info_t));
673 dasd_global_profile.dasd_io_reqs++; 673 dasd_global_profile.dasd_io_reqs++;
674 dasd_global_profile.dasd_io_sects += sectors; 674 dasd_global_profile.dasd_io_sects += sectors;
675 675
676 if (!block->profile.dasd_io_reqs) 676 if (!block->profile.dasd_io_reqs)
677 memset(&block->profile, 0, 677 memset(&block->profile, 0,
678 sizeof(struct dasd_profile_info_t)); 678 sizeof(struct dasd_profile_info_t));
679 block->profile.dasd_io_reqs++; 679 block->profile.dasd_io_reqs++;
680 block->profile.dasd_io_sects += sectors; 680 block->profile.dasd_io_sects += sectors;
681 681
682 dasd_profile_counter(sectors, dasd_io_secs, block); 682 dasd_profile_counter(sectors, dasd_io_secs, block);
683 dasd_profile_counter(tottime, dasd_io_times, block); 683 dasd_profile_counter(tottime, dasd_io_times, block);
684 dasd_profile_counter(tottimeps, dasd_io_timps, block); 684 dasd_profile_counter(tottimeps, dasd_io_timps, block);
685 dasd_profile_counter(strtime, dasd_io_time1, block); 685 dasd_profile_counter(strtime, dasd_io_time1, block);
686 dasd_profile_counter(irqtime, dasd_io_time2, block); 686 dasd_profile_counter(irqtime, dasd_io_time2, block);
687 dasd_profile_counter(irqtime / sectors, dasd_io_time2ps, block); 687 dasd_profile_counter(irqtime / sectors, dasd_io_time2ps, block);
688 dasd_profile_counter(endtime, dasd_io_time3, block); 688 dasd_profile_counter(endtime, dasd_io_time3, block);
689 } 689 }
690 #else 690 #else
691 #define dasd_profile_start(block, cqr, req) do {} while (0) 691 #define dasd_profile_start(block, cqr, req) do {} while (0)
692 #define dasd_profile_end(block, cqr, req) do {} while (0) 692 #define dasd_profile_end(block, cqr, req) do {} while (0)
693 #endif /* CONFIG_DASD_PROFILE */ 693 #endif /* CONFIG_DASD_PROFILE */
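
The microsecond values computed in dasd_profile_end() rely on the z/Architecture TOD clock format returned by get_clock(): bit 51 ticks once per microsecond, so one microsecond corresponds to 4096 (2^12) TOD units and a right shift by 12 converts a TOD delta to microseconds. A small illustrative helper:

static inline unsigned long long my_tod_to_usecs(unsigned long long tod_delta)
{
        /* 1 microsecond == 2^12 TOD clock units (TOD bit 51). */
        return tod_delta >> 12;
}
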
694 694
695 /* 695 /*
696 * Allocate memory for a channel program with 'cplength' channel 696 * Allocate memory for a channel program with 'cplength' channel
697 * command words and 'datasize' additional space. There are two 697 * command words and 'datasize' additional space. There are two
698 * variants: 1) dasd_kmalloc_request uses kmalloc to get the needed 698 * variants: 1) dasd_kmalloc_request uses kmalloc to get the needed
699 * memory and 2) dasd_smalloc_request uses the static ccw memory 699 * memory and 2) dasd_smalloc_request uses the static ccw memory
700 * that gets allocated for each device. 700 * that gets allocated for each device.
701 */ 701 */
702 struct dasd_ccw_req *dasd_kmalloc_request(int magic, int cplength, 702 struct dasd_ccw_req *dasd_kmalloc_request(int magic, int cplength,
703 int datasize, 703 int datasize,
704 struct dasd_device *device) 704 struct dasd_device *device)
705 { 705 {
706 struct dasd_ccw_req *cqr; 706 struct dasd_ccw_req *cqr;
707 707
708 /* Sanity checks */ 708 /* Sanity checks */
709 BUG_ON(datasize > PAGE_SIZE || 709 BUG_ON(datasize > PAGE_SIZE ||
710 (cplength*sizeof(struct ccw1)) > PAGE_SIZE); 710 (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
711 711
712 cqr = kzalloc(sizeof(struct dasd_ccw_req), GFP_ATOMIC); 712 cqr = kzalloc(sizeof(struct dasd_ccw_req), GFP_ATOMIC);
713 if (cqr == NULL) 713 if (cqr == NULL)
714 return ERR_PTR(-ENOMEM); 714 return ERR_PTR(-ENOMEM);
715 cqr->cpaddr = NULL; 715 cqr->cpaddr = NULL;
716 if (cplength > 0) { 716 if (cplength > 0) {
717 cqr->cpaddr = kcalloc(cplength, sizeof(struct ccw1), 717 cqr->cpaddr = kcalloc(cplength, sizeof(struct ccw1),
718 GFP_ATOMIC | GFP_DMA); 718 GFP_ATOMIC | GFP_DMA);
719 if (cqr->cpaddr == NULL) { 719 if (cqr->cpaddr == NULL) {
720 kfree(cqr); 720 kfree(cqr);
721 return ERR_PTR(-ENOMEM); 721 return ERR_PTR(-ENOMEM);
722 } 722 }
723 } 723 }
724 cqr->data = NULL; 724 cqr->data = NULL;
725 if (datasize > 0) { 725 if (datasize > 0) {
726 cqr->data = kzalloc(datasize, GFP_ATOMIC | GFP_DMA); 726 cqr->data = kzalloc(datasize, GFP_ATOMIC | GFP_DMA);
727 if (cqr->data == NULL) { 727 if (cqr->data == NULL) {
728 kfree(cqr->cpaddr); 728 kfree(cqr->cpaddr);
729 kfree(cqr); 729 kfree(cqr);
730 return ERR_PTR(-ENOMEM); 730 return ERR_PTR(-ENOMEM);
731 } 731 }
732 } 732 }
733 cqr->magic = magic; 733 cqr->magic = magic;
734 set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); 734 set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
735 dasd_get_device(device); 735 dasd_get_device(device);
736 return cqr; 736 return cqr;
737 } 737 }
738 738
739 struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength, 739 struct dasd_ccw_req *dasd_smalloc_request(int magic, int cplength,
740 int datasize, 740 int datasize,
741 struct dasd_device *device) 741 struct dasd_device *device)
742 { 742 {
743 unsigned long flags; 743 unsigned long flags;
744 struct dasd_ccw_req *cqr; 744 struct dasd_ccw_req *cqr;
745 char *data; 745 char *data;
746 int size; 746 int size;
747 747
748 /* Sanity checks */ 748 /* Sanity checks */
749 BUG_ON(datasize > PAGE_SIZE || 749 BUG_ON(datasize > PAGE_SIZE ||
750 (cplength*sizeof(struct ccw1)) > PAGE_SIZE); 750 (cplength*sizeof(struct ccw1)) > PAGE_SIZE);
751 751
752 size = (sizeof(struct dasd_ccw_req) + 7L) & -8L; 752 size = (sizeof(struct dasd_ccw_req) + 7L) & -8L;
753 if (cplength > 0) 753 if (cplength > 0)
754 size += cplength * sizeof(struct ccw1); 754 size += cplength * sizeof(struct ccw1);
755 if (datasize > 0) 755 if (datasize > 0)
756 size += datasize; 756 size += datasize;
757 spin_lock_irqsave(&device->mem_lock, flags); 757 spin_lock_irqsave(&device->mem_lock, flags);
758 cqr = (struct dasd_ccw_req *) 758 cqr = (struct dasd_ccw_req *)
759 dasd_alloc_chunk(&device->ccw_chunks, size); 759 dasd_alloc_chunk(&device->ccw_chunks, size);
760 spin_unlock_irqrestore(&device->mem_lock, flags); 760 spin_unlock_irqrestore(&device->mem_lock, flags);
761 if (cqr == NULL) 761 if (cqr == NULL)
762 return ERR_PTR(-ENOMEM); 762 return ERR_PTR(-ENOMEM);
763 memset(cqr, 0, sizeof(struct dasd_ccw_req)); 763 memset(cqr, 0, sizeof(struct dasd_ccw_req));
764 data = (char *) cqr + ((sizeof(struct dasd_ccw_req) + 7L) & -8L); 764 data = (char *) cqr + ((sizeof(struct dasd_ccw_req) + 7L) & -8L);
765 cqr->cpaddr = NULL; 765 cqr->cpaddr = NULL;
766 if (cplength > 0) { 766 if (cplength > 0) {
767 cqr->cpaddr = (struct ccw1 *) data; 767 cqr->cpaddr = (struct ccw1 *) data;
768 data += cplength*sizeof(struct ccw1); 768 data += cplength*sizeof(struct ccw1);
769 memset(cqr->cpaddr, 0, cplength*sizeof(struct ccw1)); 769 memset(cqr->cpaddr, 0, cplength*sizeof(struct ccw1));
770 } 770 }
771 cqr->data = NULL; 771 cqr->data = NULL;
772 if (datasize > 0) { 772 if (datasize > 0) {
773 cqr->data = data; 773 cqr->data = data;
774 memset(cqr->data, 0, datasize); 774 memset(cqr->data, 0, datasize);
775 } 775 }
776 cqr->magic = magic; 776 cqr->magic = magic;
777 set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags); 777 set_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags);
778 dasd_get_device(device); 778 dasd_get_device(device);
779 return cqr; 779 return cqr;
780 } 780 }
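
Both allocators above carve the ccw array and the data area out of one region behind struct dasd_ccw_req. The expression (sizeof(struct dasd_ccw_req) + 7L) & -8L rounds the header size up to the next multiple of 8 so the channel program starts doubleword aligned. A sketch of the arithmetic and the resulting layout, for illustration only:

static inline unsigned long my_round_up_8(unsigned long n)
{
        return (n + 7UL) & ~7UL;        /* identical to (n + 7L) & -8L */
}

/*
 * Chunk layout produced by dasd_smalloc_request():
 *
 *   | struct dasd_ccw_req (padded to 8) | cplength * struct ccw1 | datasize |
 *     ^cqr                                ^cqr->cpaddr             ^cqr->data
 */
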
781 781
782 /* 782 /*
783 * Free memory of a channel program. This function needs to free all the 783 * Free memory of a channel program. This function needs to free all the
784 * idal lists that might have been created by dasd_set_cda and the 784 * idal lists that might have been created by dasd_set_cda and the
785 * struct dasd_ccw_req itself. 785 * struct dasd_ccw_req itself.
786 */ 786 */
787 void dasd_kfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device) 787 void dasd_kfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device)
788 { 788 {
789 #ifdef CONFIG_64BIT 789 #ifdef CONFIG_64BIT
790 struct ccw1 *ccw; 790 struct ccw1 *ccw;
791 791
792 /* Clear any idals used for the request. */ 792 /* Clear any idals used for the request. */
793 ccw = cqr->cpaddr; 793 ccw = cqr->cpaddr;
794 do { 794 do {
795 clear_normalized_cda(ccw); 795 clear_normalized_cda(ccw);
796 } while (ccw++->flags & (CCW_FLAG_CC | CCW_FLAG_DC)); 796 } while (ccw++->flags & (CCW_FLAG_CC | CCW_FLAG_DC));
797 #endif 797 #endif
798 kfree(cqr->cpaddr); 798 kfree(cqr->cpaddr);
799 kfree(cqr->data); 799 kfree(cqr->data);
800 kfree(cqr); 800 kfree(cqr);
801 dasd_put_device(device); 801 dasd_put_device(device);
802 } 802 }
803 803
804 void dasd_sfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device) 804 void dasd_sfree_request(struct dasd_ccw_req *cqr, struct dasd_device *device)
805 { 805 {
806 unsigned long flags; 806 unsigned long flags;
807 807
808 spin_lock_irqsave(&device->mem_lock, flags); 808 spin_lock_irqsave(&device->mem_lock, flags);
809 dasd_free_chunk(&device->ccw_chunks, cqr); 809 dasd_free_chunk(&device->ccw_chunks, cqr);
810 spin_unlock_irqrestore(&device->mem_lock, flags); 810 spin_unlock_irqrestore(&device->mem_lock, flags);
811 dasd_put_device(device); 811 dasd_put_device(device);
812 } 812 }
813 813
814 /* 814 /*
815 * Check discipline magic in cqr. 815 * Check discipline magic in cqr.
816 */ 816 */
817 static inline int dasd_check_cqr(struct dasd_ccw_req *cqr) 817 static inline int dasd_check_cqr(struct dasd_ccw_req *cqr)
818 { 818 {
819 struct dasd_device *device; 819 struct dasd_device *device;
820 820
821 if (cqr == NULL) 821 if (cqr == NULL)
822 return -EINVAL; 822 return -EINVAL;
823 device = cqr->startdev; 823 device = cqr->startdev;
824 if (strncmp((char *) &cqr->magic, device->discipline->ebcname, 4)) { 824 if (strncmp((char *) &cqr->magic, device->discipline->ebcname, 4)) {
825 DBF_DEV_EVENT(DBF_WARNING, device, 825 DBF_DEV_EVENT(DBF_WARNING, device,
826 " dasd_ccw_req 0x%08x magic doesn't match" 826 " dasd_ccw_req 0x%08x magic doesn't match"
827 " discipline 0x%08x", 827 " discipline 0x%08x",
828 cqr->magic, 828 cqr->magic,
829 *(unsigned int *) device->discipline->name); 829 *(unsigned int *) device->discipline->name);
830 return -EINVAL; 830 return -EINVAL;
831 } 831 }
832 return 0; 832 return 0;
833 } 833 }
834 834
835 /* 835 /*
836 * Terminate the current i/o and set the request to clear_pending. 836 * Terminate the current i/o and set the request to clear_pending.
837 * Timer keeps device running. 837 * Timer keeps device running.
838 * ccw_device_clear can fail if the i/o subsystem 838 * ccw_device_clear can fail if the i/o subsystem
839 * is in a bad mood. 839 * is in a bad mood.
840 */ 840 */
841 int dasd_term_IO(struct dasd_ccw_req *cqr) 841 int dasd_term_IO(struct dasd_ccw_req *cqr)
842 { 842 {
843 struct dasd_device *device; 843 struct dasd_device *device;
844 int retries, rc; 844 int retries, rc;
845 char errorstring[ERRORLENGTH]; 845 char errorstring[ERRORLENGTH];
846 846
847 /* Check the cqr */ 847 /* Check the cqr */
848 rc = dasd_check_cqr(cqr); 848 rc = dasd_check_cqr(cqr);
849 if (rc) 849 if (rc)
850 return rc; 850 return rc;
851 retries = 0; 851 retries = 0;
852 device = (struct dasd_device *) cqr->startdev; 852 device = (struct dasd_device *) cqr->startdev;
853 while ((retries < 5) && (cqr->status == DASD_CQR_IN_IO)) { 853 while ((retries < 5) && (cqr->status == DASD_CQR_IN_IO)) {
854 rc = ccw_device_clear(device->cdev, (long) cqr); 854 rc = ccw_device_clear(device->cdev, (long) cqr);
855 switch (rc) { 855 switch (rc) {
856 case 0: /* termination successful */ 856 case 0: /* termination successful */
857 cqr->retries--; 857 cqr->retries--;
858 cqr->status = DASD_CQR_CLEAR_PENDING; 858 cqr->status = DASD_CQR_CLEAR_PENDING;
859 cqr->stopclk = get_clock(); 859 cqr->stopclk = get_clock();
860 cqr->starttime = 0; 860 cqr->starttime = 0;
861 DBF_DEV_EVENT(DBF_DEBUG, device, 861 DBF_DEV_EVENT(DBF_DEBUG, device,
862 "terminate cqr %p successful", 862 "terminate cqr %p successful",
863 cqr); 863 cqr);
864 break; 864 break;
865 case -ENODEV: 865 case -ENODEV:
866 DBF_DEV_EVENT(DBF_ERR, device, "%s", 866 DBF_DEV_EVENT(DBF_ERR, device, "%s",
867 "device gone, retry"); 867 "device gone, retry");
868 break; 868 break;
869 case -EIO: 869 case -EIO:
870 DBF_DEV_EVENT(DBF_ERR, device, "%s", 870 DBF_DEV_EVENT(DBF_ERR, device, "%s",
871 "I/O error, retry"); 871 "I/O error, retry");
872 break; 872 break;
873 case -EINVAL: 873 case -EINVAL:
874 case -EBUSY: 874 case -EBUSY:
875 DBF_DEV_EVENT(DBF_ERR, device, "%s", 875 DBF_DEV_EVENT(DBF_ERR, device, "%s",
876 "device busy, retry later"); 876 "device busy, retry later");
877 break; 877 break;
878 default: 878 default:
879 /* internal error 10 - unknown rc*/ 879 /* internal error 10 - unknown rc*/
880 snprintf(errorstring, ERRORLENGTH, "10 %d", rc); 880 snprintf(errorstring, ERRORLENGTH, "10 %d", rc);
881 dev_err(&device->cdev->dev, "An error occurred in the " 881 dev_err(&device->cdev->dev, "An error occurred in the "
882 "DASD device driver, reason=%s\n", errorstring); 882 "DASD device driver, reason=%s\n", errorstring);
883 BUG(); 883 BUG();
884 break; 884 break;
885 } 885 }
886 retries++; 886 retries++;
887 } 887 }
888 dasd_schedule_device_bh(device); 888 dasd_schedule_device_bh(device);
889 return rc; 889 return rc;
890 } 890 }
891 891
892 /* 892 /*
893 * Start the i/o. This start_IO can fail if the channel is really busy. 893 * Start the i/o. This start_IO can fail if the channel is really busy.
894 * In that case set up a timer to start the request later. 894 * In that case set up a timer to start the request later.
895 */ 895 */
896 int dasd_start_IO(struct dasd_ccw_req *cqr) 896 int dasd_start_IO(struct dasd_ccw_req *cqr)
897 { 897 {
898 struct dasd_device *device; 898 struct dasd_device *device;
899 int rc; 899 int rc;
900 char errorstring[ERRORLENGTH]; 900 char errorstring[ERRORLENGTH];
901 901
902 /* Check the cqr */ 902 /* Check the cqr */
903 rc = dasd_check_cqr(cqr); 903 rc = dasd_check_cqr(cqr);
904 if (rc) { 904 if (rc) {
905 cqr->intrc = rc; 905 cqr->intrc = rc;
906 return rc; 906 return rc;
907 } 907 }
908 device = (struct dasd_device *) cqr->startdev; 908 device = (struct dasd_device *) cqr->startdev;
909 if (cqr->retries < 0) { 909 if (cqr->retries < 0) {
910 /* internal error 14 - start_IO run out of retries */ 910 /* internal error 14 - start_IO run out of retries */
911 sprintf(errorstring, "14 %p", cqr); 911 sprintf(errorstring, "14 %p", cqr);
912 dev_err(&device->cdev->dev, "An error occurred in the DASD " 912 dev_err(&device->cdev->dev, "An error occurred in the DASD "
913 "device driver, reason=%s\n", errorstring); 913 "device driver, reason=%s\n", errorstring);
914 cqr->status = DASD_CQR_ERROR; 914 cqr->status = DASD_CQR_ERROR;
915 return -EIO; 915 return -EIO;
916 } 916 }
917 cqr->startclk = get_clock(); 917 cqr->startclk = get_clock();
918 cqr->starttime = jiffies; 918 cqr->starttime = jiffies;
919 cqr->retries--; 919 cqr->retries--;
920 if (cqr->cpmode == 1) { 920 if (cqr->cpmode == 1) {
921 rc = ccw_device_tm_start(device->cdev, cqr->cpaddr, 921 rc = ccw_device_tm_start(device->cdev, cqr->cpaddr,
922 (long) cqr, cqr->lpm); 922 (long) cqr, cqr->lpm);
923 } else { 923 } else {
924 rc = ccw_device_start(device->cdev, cqr->cpaddr, 924 rc = ccw_device_start(device->cdev, cqr->cpaddr,
925 (long) cqr, cqr->lpm, 0); 925 (long) cqr, cqr->lpm, 0);
926 } 926 }
927 switch (rc) { 927 switch (rc) {
928 case 0: 928 case 0:
929 cqr->status = DASD_CQR_IN_IO; 929 cqr->status = DASD_CQR_IN_IO;
930 break; 930 break;
931 case -EBUSY: 931 case -EBUSY:
932 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 932 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
933 "start_IO: device busy, retry later"); 933 "start_IO: device busy, retry later");
934 break; 934 break;
935 case -ETIMEDOUT: 935 case -ETIMEDOUT:
936 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 936 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
937 "start_IO: request timeout, retry later"); 937 "start_IO: request timeout, retry later");
938 break; 938 break;
939 case -EACCES: 939 case -EACCES:
940 /* -EACCES indicates that the request used only a 940 /* -EACCES indicates that the request used only a
941 * subset of the available paths and all these 941 * subset of the available paths and all these
942 * paths are gone. 942 * paths are gone.
943 * Do a retry with all available paths. 943 * Do a retry with all available paths.
944 */ 944 */
945 cqr->lpm = LPM_ANYPATH; 945 cqr->lpm = LPM_ANYPATH;
946 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 946 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
947 "start_IO: selected pathes gone," 947 "start_IO: selected pathes gone,"
948 " retry on all pathes"); 948 " retry on all pathes");
949 break; 949 break;
950 case -ENODEV: 950 case -ENODEV:
951 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 951 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
952 "start_IO: -ENODEV device gone, retry"); 952 "start_IO: -ENODEV device gone, retry");
953 break; 953 break;
954 case -EIO: 954 case -EIO:
955 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 955 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
956 "start_IO: -EIO device gone, retry"); 956 "start_IO: -EIO device gone, retry");
957 break; 957 break;
958 case -EINVAL: 958 case -EINVAL:
959 /* most likely caused in power management context */ 959 /* most likely caused in power management context */
960 DBF_DEV_EVENT(DBF_DEBUG, device, "%s", 960 DBF_DEV_EVENT(DBF_DEBUG, device, "%s",
961 "start_IO: -EINVAL device currently " 961 "start_IO: -EINVAL device currently "
962 "not accessible"); 962 "not accessible");
963 break; 963 break;
964 default: 964 default:
965 /* internal error 11 - unknown rc */ 965 /* internal error 11 - unknown rc */
966 snprintf(errorstring, ERRORLENGTH, "11 %d", rc); 966 snprintf(errorstring, ERRORLENGTH, "11 %d", rc);
967 dev_err(&device->cdev->dev, 967 dev_err(&device->cdev->dev,
968 "An error occurred in the DASD device driver, " 968 "An error occurred in the DASD device driver, "
969 "reason=%s\n", errorstring); 969 "reason=%s\n", errorstring);
970 BUG(); 970 BUG();
971 break; 971 break;
972 } 972 }
973 cqr->intrc = rc; 973 cqr->intrc = rc;
974 return rc; 974 return rc;
975 } 975 }
976 976
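The comment above dasd_start_IO() promises that a busy channel leads to a delayed retry rather than a hard failure. For illustration only, the caller-side handling of the return code looks roughly like the sketch below; it mirrors __dasd_device_start_head() further down in this file, and the wrapper name example_try_start() is hypothetical.

/*
 * Editorial sketch, not part of the patch: react to dasd_start_IO().
 * On success the request's expiry timer is armed; on a temporary
 * failure the request simply stays queued and a short retry timer
 * is set instead.
 */
static void example_try_start(struct dasd_device *device,
                              struct dasd_ccw_req *cqr)
{
        int rc;

        rc = device->discipline->start_IO(cqr);
        if (rc == 0)
                /* started - guard against a missing interrupt */
                dasd_device_set_timer(device, cqr->expires);
        else
                /* e.g. -EBUSY or -ETIMEDOUT - try again in half a second */
                dasd_device_set_timer(device, HZ / 2);
}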
977 /* 977 /*
978 * Timeout function for dasd devices. This is used for different purposes 978 * Timeout function for dasd devices. This is used for different purposes
979 * 1) missing interrupt handler for normal operation 979 * 1) missing interrupt handler for normal operation
980 * 2) delayed start of request where start_IO failed with -EBUSY 980 * 2) delayed start of request where start_IO failed with -EBUSY
981 * 3) timeout for missing state change interrupts 981 * 3) timeout for missing state change interrupts
982 * The head of the ccw queue will have status DASD_CQR_IN_IO for 1), 982 * The head of the ccw queue will have status DASD_CQR_IN_IO for 1),
983 * DASD_CQR_QUEUED for 2) and 3). 983 * DASD_CQR_QUEUED for 2) and 3).
984 */ 984 */
985 static void dasd_device_timeout(unsigned long ptr) 985 static void dasd_device_timeout(unsigned long ptr)
986 { 986 {
987 unsigned long flags; 987 unsigned long flags;
988 struct dasd_device *device; 988 struct dasd_device *device;
989 989
990 device = (struct dasd_device *) ptr; 990 device = (struct dasd_device *) ptr;
991 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); 991 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
992 /* re-activate request queue */ 992 /* re-activate request queue */
993 dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); 993 dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING);
994 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); 994 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
995 dasd_schedule_device_bh(device); 995 dasd_schedule_device_bh(device);
996 } 996 }
997 997
998 /* 998 /*
999 * Setup timeout for a device in jiffies. 999 * Setup timeout for a device in jiffies.
1000 */ 1000 */
1001 void dasd_device_set_timer(struct dasd_device *device, int expires) 1001 void dasd_device_set_timer(struct dasd_device *device, int expires)
1002 { 1002 {
1003 if (expires == 0) 1003 if (expires == 0)
1004 del_timer(&device->timer); 1004 del_timer(&device->timer);
1005 else 1005 else
1006 mod_timer(&device->timer, jiffies + expires); 1006 mod_timer(&device->timer, jiffies + expires);
1007 } 1007 }
1008 1008
1009 /* 1009 /*
1010 * Clear timeout for a device. 1010 * Clear timeout for a device.
1011 */ 1011 */
1012 void dasd_device_clear_timer(struct dasd_device *device) 1012 void dasd_device_clear_timer(struct dasd_device *device)
1013 { 1013 {
1014 del_timer(&device->timer); 1014 del_timer(&device->timer);
1015 } 1015 }
1016 1016
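dasd_device_set_timer() and dasd_device_clear_timer() assume that device->timer has already been initialised to call dasd_device_timeout() with the device as its argument. A minimal sketch of that one-time wiring, assuming the pre-timer_setup() timer API of this kernel generation (the helper name is hypothetical):

#include <linux/timer.h>

/* Editorial sketch: one-time initialisation of the per-device timer. */
static void example_init_device_timer(struct dasd_device *device)
{
        init_timer(&device->timer);
        device->timer.function = dasd_device_timeout;
        device->timer.data = (unsigned long) device;
}

After that, dasd_device_set_timer(device, 5 * HZ) arms a five-second timeout, and passing 0 (or calling dasd_device_clear_timer()) cancels it again.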
1017 static void dasd_handle_killed_request(struct ccw_device *cdev, 1017 static void dasd_handle_killed_request(struct ccw_device *cdev,
1018 unsigned long intparm) 1018 unsigned long intparm)
1019 { 1019 {
1020 struct dasd_ccw_req *cqr; 1020 struct dasd_ccw_req *cqr;
1021 struct dasd_device *device; 1021 struct dasd_device *device;
1022 1022
1023 if (!intparm) 1023 if (!intparm)
1024 return; 1024 return;
1025 cqr = (struct dasd_ccw_req *) intparm; 1025 cqr = (struct dasd_ccw_req *) intparm;
1026 if (cqr->status != DASD_CQR_IN_IO) { 1026 if (cqr->status != DASD_CQR_IN_IO) {
1027 DBF_EVENT_DEVID(DBF_DEBUG, cdev, 1027 DBF_EVENT_DEVID(DBF_DEBUG, cdev,
1028 "invalid status in handle_killed_request: " 1028 "invalid status in handle_killed_request: "
1029 "%02x", cqr->status); 1029 "%02x", cqr->status);
1030 return; 1030 return;
1031 } 1031 }
1032 1032
1033 device = dasd_device_from_cdev_locked(cdev); 1033 device = dasd_device_from_cdev_locked(cdev);
1034 if (IS_ERR(device)) { 1034 if (IS_ERR(device)) {
1035 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s", 1035 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s",
1036 "unable to get device from cdev"); 1036 "unable to get device from cdev");
1037 return; 1037 return;
1038 } 1038 }
1039 1039
1040 if (!cqr->startdev || 1040 if (!cqr->startdev ||
1041 device != cqr->startdev || 1041 device != cqr->startdev ||
1042 strncmp(cqr->startdev->discipline->ebcname, 1042 strncmp(cqr->startdev->discipline->ebcname,
1043 (char *) &cqr->magic, 4)) { 1043 (char *) &cqr->magic, 4)) {
1044 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s", 1044 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s",
1045 "invalid device in request"); 1045 "invalid device in request");
1046 dasd_put_device(device); 1046 dasd_put_device(device);
1047 return; 1047 return;
1048 } 1048 }
1049 1049
1050 /* Schedule request to be retried. */ 1050 /* Schedule request to be retried. */
1051 cqr->status = DASD_CQR_QUEUED; 1051 cqr->status = DASD_CQR_QUEUED;
1052 1052
1053 dasd_device_clear_timer(device); 1053 dasd_device_clear_timer(device);
1054 dasd_schedule_device_bh(device); 1054 dasd_schedule_device_bh(device);
1055 dasd_put_device(device); 1055 dasd_put_device(device);
1056 } 1056 }
1057 1057
1058 void dasd_generic_handle_state_change(struct dasd_device *device) 1058 void dasd_generic_handle_state_change(struct dasd_device *device)
1059 { 1059 {
1060 /* First of all start sense subsystem status request. */ 1060 /* First of all start sense subsystem status request. */
1061 dasd_eer_snss(device); 1061 dasd_eer_snss(device);
1062 1062
1063 dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING); 1063 dasd_device_remove_stop_bits(device, DASD_STOPPED_PENDING);
1064 dasd_schedule_device_bh(device); 1064 dasd_schedule_device_bh(device);
1065 if (device->block) 1065 if (device->block)
1066 dasd_schedule_block_bh(device->block); 1066 dasd_schedule_block_bh(device->block);
1067 } 1067 }
1068 1068
1069 /* 1069 /*
1070 * Interrupt handler for "normal" ssch-io based dasd devices. 1070 * Interrupt handler for "normal" ssch-io based dasd devices.
1071 */ 1071 */
1072 void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm, 1072 void dasd_int_handler(struct ccw_device *cdev, unsigned long intparm,
1073 struct irb *irb) 1073 struct irb *irb)
1074 { 1074 {
1075 struct dasd_ccw_req *cqr, *next; 1075 struct dasd_ccw_req *cqr, *next;
1076 struct dasd_device *device; 1076 struct dasd_device *device;
1077 unsigned long long now; 1077 unsigned long long now;
1078 int expires; 1078 int expires;
1079 1079
1080 if (IS_ERR(irb)) { 1080 if (IS_ERR(irb)) {
1081 switch (PTR_ERR(irb)) { 1081 switch (PTR_ERR(irb)) {
1082 case -EIO: 1082 case -EIO:
1083 break; 1083 break;
1084 case -ETIMEDOUT: 1084 case -ETIMEDOUT:
1085 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s: " 1085 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s: "
1086 "request timed out\n", __func__); 1086 "request timed out\n", __func__);
1087 break; 1087 break;
1088 default: 1088 default:
1089 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s: " 1089 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s: "
1090 "unknown error %ld\n", __func__, 1090 "unknown error %ld\n", __func__,
1091 PTR_ERR(irb)); 1091 PTR_ERR(irb));
1092 } 1092 }
1093 dasd_handle_killed_request(cdev, intparm); 1093 dasd_handle_killed_request(cdev, intparm);
1094 return; 1094 return;
1095 } 1095 }
1096 1096
1097 now = get_clock(); 1097 now = get_clock();
1098 1098
1099 /* check for unsolicited interrupts */ 1099 /* check for unsolicited interrupts */
1100 cqr = (struct dasd_ccw_req *) intparm; 1100 cqr = (struct dasd_ccw_req *) intparm;
1101 if (!cqr || ((scsw_cc(&irb->scsw) == 1) && 1101 if (!cqr || ((scsw_cc(&irb->scsw) == 1) &&
1102 (scsw_fctl(&irb->scsw) & SCSW_FCTL_START_FUNC) && 1102 (scsw_fctl(&irb->scsw) & SCSW_FCTL_START_FUNC) &&
1103 (scsw_stctl(&irb->scsw) & SCSW_STCTL_STATUS_PEND))) { 1103 (scsw_stctl(&irb->scsw) & SCSW_STCTL_STATUS_PEND))) {
1104 if (cqr && cqr->status == DASD_CQR_IN_IO) 1104 if (cqr && cqr->status == DASD_CQR_IN_IO)
1105 cqr->status = DASD_CQR_QUEUED; 1105 cqr->status = DASD_CQR_QUEUED;
1106 device = dasd_device_from_cdev_locked(cdev); 1106 device = dasd_device_from_cdev_locked(cdev);
1107 if (!IS_ERR(device)) { 1107 if (!IS_ERR(device)) {
1108 dasd_device_clear_timer(device); 1108 dasd_device_clear_timer(device);
1109 device->discipline->handle_unsolicited_interrupt(device, 1109 device->discipline->handle_unsolicited_interrupt(device,
1110 irb); 1110 irb);
1111 dasd_put_device(device); 1111 dasd_put_device(device);
1112 } 1112 }
1113 return; 1113 return;
1114 } 1114 }
1115 1115
1116 device = (struct dasd_device *) cqr->startdev; 1116 device = (struct dasd_device *) cqr->startdev;
1117 if (!device || 1117 if (!device ||
1118 strncmp(device->discipline->ebcname, (char *) &cqr->magic, 4)) { 1118 strncmp(device->discipline->ebcname, (char *) &cqr->magic, 4)) {
1119 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s", 1119 DBF_EVENT_DEVID(DBF_DEBUG, cdev, "%s",
1120 "invalid device in request"); 1120 "invalid device in request");
1121 return; 1121 return;
1122 } 1122 }
1123 1123
1124 /* Check for clear pending */ 1124 /* Check for clear pending */
1125 if (cqr->status == DASD_CQR_CLEAR_PENDING && 1125 if (cqr->status == DASD_CQR_CLEAR_PENDING &&
1126 scsw_fctl(&irb->scsw) & SCSW_FCTL_CLEAR_FUNC) { 1126 scsw_fctl(&irb->scsw) & SCSW_FCTL_CLEAR_FUNC) {
1127 cqr->status = DASD_CQR_CLEARED; 1127 cqr->status = DASD_CQR_CLEARED;
1128 dasd_device_clear_timer(device); 1128 dasd_device_clear_timer(device);
1129 wake_up(&dasd_flush_wq); 1129 wake_up(&dasd_flush_wq);
1130 dasd_schedule_device_bh(device); 1130 dasd_schedule_device_bh(device);
1131 return; 1131 return;
1132 } 1132 }
1133 1133
1134 /* check status - the request might have been killed by dyn detach */ 1134 /* check status - the request might have been killed by dyn detach */
1135 if (cqr->status != DASD_CQR_IN_IO) { 1135 if (cqr->status != DASD_CQR_IN_IO) {
1136 DBF_DEV_EVENT(DBF_DEBUG, device, "invalid status: bus_id %s, " 1136 DBF_DEV_EVENT(DBF_DEBUG, device, "invalid status: bus_id %s, "
1137 "status %02x", dev_name(&cdev->dev), cqr->status); 1137 "status %02x", dev_name(&cdev->dev), cqr->status);
1138 return; 1138 return;
1139 } 1139 }
1140 1140
1141 next = NULL; 1141 next = NULL;
1142 expires = 0; 1142 expires = 0;
1143 if (scsw_dstat(&irb->scsw) == (DEV_STAT_CHN_END | DEV_STAT_DEV_END) && 1143 if (scsw_dstat(&irb->scsw) == (DEV_STAT_CHN_END | DEV_STAT_DEV_END) &&
1144 scsw_cstat(&irb->scsw) == 0) { 1144 scsw_cstat(&irb->scsw) == 0) {
1145 /* request was completed successfully */ 1145 /* request was completed successfully */
1146 cqr->status = DASD_CQR_SUCCESS; 1146 cqr->status = DASD_CQR_SUCCESS;
1147 cqr->stopclk = now; 1147 cqr->stopclk = now;
1148 /* Start first request on queue if possible -> fast_io. */ 1148 /* Start first request on queue if possible -> fast_io. */
1149 if (cqr->devlist.next != &device->ccw_queue) { 1149 if (cqr->devlist.next != &device->ccw_queue) {
1150 next = list_entry(cqr->devlist.next, 1150 next = list_entry(cqr->devlist.next,
1151 struct dasd_ccw_req, devlist); 1151 struct dasd_ccw_req, devlist);
1152 } 1152 }
1153 } else { /* error */ 1153 } else { /* error */
1154 memcpy(&cqr->irb, irb, sizeof(struct irb)); 1154 memcpy(&cqr->irb, irb, sizeof(struct irb));
1155 /* log sense for every failed I/O to s390 debugfeature */ 1155 /* log sense for every failed I/O to s390 debugfeature */
1156 dasd_log_sense_dbf(cqr, irb); 1156 dasd_log_sense_dbf(cqr, irb);
1157 if (device->features & DASD_FEATURE_ERPLOG) { 1157 if (device->features & DASD_FEATURE_ERPLOG) {
1158 dasd_log_sense(cqr, irb); 1158 dasd_log_sense(cqr, irb);
1159 } 1159 }
1160 1160
1161 /* 1161 /*
1162 * If we don't want complex ERP for this request, then just 1162 * If we don't want complex ERP for this request, then just
1163 * reset this and retry it in the fastpath 1163 * reset this and retry it in the fastpath
1164 */ 1164 */
1165 if (!test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags) && 1165 if (!test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags) &&
1166 cqr->retries > 0) { 1166 cqr->retries > 0) {
1167 if (cqr->lpm == LPM_ANYPATH) 1167 if (cqr->lpm == LPM_ANYPATH)
1168 DBF_DEV_EVENT(DBF_DEBUG, device, 1168 DBF_DEV_EVENT(DBF_DEBUG, device,
1169 "default ERP in fastpath " 1169 "default ERP in fastpath "
1170 "(%i retries left)", 1170 "(%i retries left)",
1171 cqr->retries); 1171 cqr->retries);
1172 cqr->lpm = LPM_ANYPATH; 1172 cqr->lpm = LPM_ANYPATH;
1173 cqr->status = DASD_CQR_QUEUED; 1173 cqr->status = DASD_CQR_QUEUED;
1174 next = cqr; 1174 next = cqr;
1175 } else 1175 } else
1176 cqr->status = DASD_CQR_ERROR; 1176 cqr->status = DASD_CQR_ERROR;
1177 } 1177 }
1178 if (next && (next->status == DASD_CQR_QUEUED) && 1178 if (next && (next->status == DASD_CQR_QUEUED) &&
1179 (!device->stopped)) { 1179 (!device->stopped)) {
1180 if (device->discipline->start_IO(next) == 0) 1180 if (device->discipline->start_IO(next) == 0)
1181 expires = next->expires; 1181 expires = next->expires;
1182 } 1182 }
1183 if (expires != 0) 1183 if (expires != 0)
1184 dasd_device_set_timer(device, expires); 1184 dasd_device_set_timer(device, expires);
1185 else 1185 else
1186 dasd_device_clear_timer(device); 1186 dasd_device_clear_timer(device);
1187 dasd_schedule_device_bh(device); 1187 dasd_schedule_device_bh(device);
1188 } 1188 }
1189 1189
1190 enum uc_todo dasd_generic_uc_handler(struct ccw_device *cdev, struct irb *irb) 1190 enum uc_todo dasd_generic_uc_handler(struct ccw_device *cdev, struct irb *irb)
1191 { 1191 {
1192 struct dasd_device *device; 1192 struct dasd_device *device;
1193 1193
1194 device = dasd_device_from_cdev_locked(cdev); 1194 device = dasd_device_from_cdev_locked(cdev);
1195 1195
1196 if (IS_ERR(device)) 1196 if (IS_ERR(device))
1197 goto out; 1197 goto out;
1198 if (test_bit(DASD_FLAG_OFFLINE, &device->flags) || 1198 if (test_bit(DASD_FLAG_OFFLINE, &device->flags) ||
1199 device->state != device->target || 1199 device->state != device->target ||
1200 !device->discipline->handle_unsolicited_interrupt){ 1200 !device->discipline->handle_unsolicited_interrupt){
1201 dasd_put_device(device); 1201 dasd_put_device(device);
1202 goto out; 1202 goto out;
1203 } 1203 }
1204 1204
1205 dasd_device_clear_timer(device); 1205 dasd_device_clear_timer(device);
1206 device->discipline->handle_unsolicited_interrupt(device, irb); 1206 device->discipline->handle_unsolicited_interrupt(device, irb);
1207 dasd_put_device(device); 1207 dasd_put_device(device);
1208 out: 1208 out:
1209 return UC_TODO_RETRY; 1209 return UC_TODO_RETRY;
1210 } 1210 }
1211 EXPORT_SYMBOL_GPL(dasd_generic_uc_handler); 1211 EXPORT_SYMBOL_GPL(dasd_generic_uc_handler);
1212 1212
1213 /* 1213 /*
1214 * If we have an error on a dasd_block layer request then we cancel 1214 * If we have an error on a dasd_block layer request then we cancel
1215 * and return all further requests from the same dasd_block as well. 1215 * and return all further requests from the same dasd_block as well.
1216 */ 1216 */
1217 static void __dasd_device_recovery(struct dasd_device *device, 1217 static void __dasd_device_recovery(struct dasd_device *device,
1218 struct dasd_ccw_req *ref_cqr) 1218 struct dasd_ccw_req *ref_cqr)
1219 { 1219 {
1220 struct list_head *l, *n; 1220 struct list_head *l, *n;
1221 struct dasd_ccw_req *cqr; 1221 struct dasd_ccw_req *cqr;
1222 1222
1223 /* 1223 /*
1224 * only requeue requests that came from the dasd_block layer 1224 * only requeue requests that came from the dasd_block layer
1225 */ 1225 */
1226 if (!ref_cqr->block) 1226 if (!ref_cqr->block)
1227 return; 1227 return;
1228 1228
1229 list_for_each_safe(l, n, &device->ccw_queue) { 1229 list_for_each_safe(l, n, &device->ccw_queue) {
1230 cqr = list_entry(l, struct dasd_ccw_req, devlist); 1230 cqr = list_entry(l, struct dasd_ccw_req, devlist);
1231 if (cqr->status == DASD_CQR_QUEUED && 1231 if (cqr->status == DASD_CQR_QUEUED &&
1232 ref_cqr->block == cqr->block) { 1232 ref_cqr->block == cqr->block) {
1233 cqr->status = DASD_CQR_CLEARED; 1233 cqr->status = DASD_CQR_CLEARED;
1234 } 1234 }
1235 } 1235 }
1236 } 1236 }
1237 1237
1238 /* 1238 /*
1239 * Remove those ccw requests from the queue that need to be returned 1239 * Remove those ccw requests from the queue that need to be returned
1240 * to the upper layer. 1240 * to the upper layer.
1241 */ 1241 */
1242 static void __dasd_device_process_ccw_queue(struct dasd_device *device, 1242 static void __dasd_device_process_ccw_queue(struct dasd_device *device,
1243 struct list_head *final_queue) 1243 struct list_head *final_queue)
1244 { 1244 {
1245 struct list_head *l, *n; 1245 struct list_head *l, *n;
1246 struct dasd_ccw_req *cqr; 1246 struct dasd_ccw_req *cqr;
1247 1247
1248 /* Process request with final status. */ 1248 /* Process request with final status. */
1249 list_for_each_safe(l, n, &device->ccw_queue) { 1249 list_for_each_safe(l, n, &device->ccw_queue) {
1250 cqr = list_entry(l, struct dasd_ccw_req, devlist); 1250 cqr = list_entry(l, struct dasd_ccw_req, devlist);
1251 1251
1252 /* Stop list processing at the first non-final request. */ 1252 /* Stop list processing at the first non-final request. */
1253 if (cqr->status == DASD_CQR_QUEUED || 1253 if (cqr->status == DASD_CQR_QUEUED ||
1254 cqr->status == DASD_CQR_IN_IO || 1254 cqr->status == DASD_CQR_IN_IO ||
1255 cqr->status == DASD_CQR_CLEAR_PENDING) 1255 cqr->status == DASD_CQR_CLEAR_PENDING)
1256 break; 1256 break;
1257 if (cqr->status == DASD_CQR_ERROR) { 1257 if (cqr->status == DASD_CQR_ERROR) {
1258 __dasd_device_recovery(device, cqr); 1258 __dasd_device_recovery(device, cqr);
1259 } 1259 }
1260 /* Rechain finished requests to final queue */ 1260 /* Rechain finished requests to final queue */
1261 list_move_tail(&cqr->devlist, final_queue); 1261 list_move_tail(&cqr->devlist, final_queue);
1262 } 1262 }
1263 } 1263 }
1264 1264
1265 /* 1265 /*
1266 * the cqrs from the final queue are returned to the upper layer 1266 * the cqrs from the final queue are returned to the upper layer
1267 * by setting a dasd_block state and calling the callback function 1267 * by setting a dasd_block state and calling the callback function
1268 */ 1268 */
1269 static void __dasd_device_process_final_queue(struct dasd_device *device, 1269 static void __dasd_device_process_final_queue(struct dasd_device *device,
1270 struct list_head *final_queue) 1270 struct list_head *final_queue)
1271 { 1271 {
1272 struct list_head *l, *n; 1272 struct list_head *l, *n;
1273 struct dasd_ccw_req *cqr; 1273 struct dasd_ccw_req *cqr;
1274 struct dasd_block *block; 1274 struct dasd_block *block;
1275 void (*callback)(struct dasd_ccw_req *, void *data); 1275 void (*callback)(struct dasd_ccw_req *, void *data);
1276 void *callback_data; 1276 void *callback_data;
1277 char errorstring[ERRORLENGTH]; 1277 char errorstring[ERRORLENGTH];
1278 1278
1279 list_for_each_safe(l, n, final_queue) { 1279 list_for_each_safe(l, n, final_queue) {
1280 cqr = list_entry(l, struct dasd_ccw_req, devlist); 1280 cqr = list_entry(l, struct dasd_ccw_req, devlist);
1281 list_del_init(&cqr->devlist); 1281 list_del_init(&cqr->devlist);
1282 block = cqr->block; 1282 block = cqr->block;
1283 callback = cqr->callback; 1283 callback = cqr->callback;
1284 callback_data = cqr->callback_data; 1284 callback_data = cqr->callback_data;
1285 if (block) 1285 if (block)
1286 spin_lock_bh(&block->queue_lock); 1286 spin_lock_bh(&block->queue_lock);
1287 switch (cqr->status) { 1287 switch (cqr->status) {
1288 case DASD_CQR_SUCCESS: 1288 case DASD_CQR_SUCCESS:
1289 cqr->status = DASD_CQR_DONE; 1289 cqr->status = DASD_CQR_DONE;
1290 break; 1290 break;
1291 case DASD_CQR_ERROR: 1291 case DASD_CQR_ERROR:
1292 cqr->status = DASD_CQR_NEED_ERP; 1292 cqr->status = DASD_CQR_NEED_ERP;
1293 break; 1293 break;
1294 case DASD_CQR_CLEARED: 1294 case DASD_CQR_CLEARED:
1295 cqr->status = DASD_CQR_TERMINATED; 1295 cqr->status = DASD_CQR_TERMINATED;
1296 break; 1296 break;
1297 default: 1297 default:
1298 /* internal error 12 - wrong cqr status*/ 1298 /* internal error 12 - wrong cqr status*/
1299 snprintf(errorstring, ERRORLENGTH, "12 %p %02x", cqr, cqr->status); 1299 snprintf(errorstring, ERRORLENGTH, "12 %p %02x", cqr, cqr->status);
1300 dev_err(&device->cdev->dev, 1300 dev_err(&device->cdev->dev,
1301 "An error occurred in the DASD device driver, " 1301 "An error occurred in the DASD device driver, "
1302 "reason=%s\n", errorstring); 1302 "reason=%s\n", errorstring);
1303 BUG(); 1303 BUG();
1304 } 1304 }
1305 if (cqr->callback != NULL) 1305 if (cqr->callback != NULL)
1306 (callback)(cqr, callback_data); 1306 (callback)(cqr, callback_data);
1307 if (block) 1307 if (block)
1308 spin_unlock_bh(&block->queue_lock); 1308 spin_unlock_bh(&block->queue_lock);
1309 } 1309 }
1310 } 1310 }
1311 1311
1312 /* 1312 /*
1313 * Take a look at the first request on the ccw queue and check 1313 * Take a look at the first request on the ccw queue and check
1314 * if it reached its expire time. If so, terminate the IO. 1314 * if it reached its expire time. If so, terminate the IO.
1315 */ 1315 */
1316 static void __dasd_device_check_expire(struct dasd_device *device) 1316 static void __dasd_device_check_expire(struct dasd_device *device)
1317 { 1317 {
1318 struct dasd_ccw_req *cqr; 1318 struct dasd_ccw_req *cqr;
1319 1319
1320 if (list_empty(&device->ccw_queue)) 1320 if (list_empty(&device->ccw_queue))
1321 return; 1321 return;
1322 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist); 1322 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist);
1323 if ((cqr->status == DASD_CQR_IN_IO && cqr->expires != 0) && 1323 if ((cqr->status == DASD_CQR_IN_IO && cqr->expires != 0) &&
1324 (time_after_eq(jiffies, cqr->expires + cqr->starttime))) { 1324 (time_after_eq(jiffies, cqr->expires + cqr->starttime))) {
1325 if (device->discipline->term_IO(cqr) != 0) { 1325 if (device->discipline->term_IO(cqr) != 0) {
1326 /* Hmpf, try again in 5 sec */ 1326 /* Hmpf, try again in 5 sec */
1327 dev_err(&device->cdev->dev, 1327 dev_err(&device->cdev->dev,
1328 "cqr %p timed out (%lus) but cannot be " 1328 "cqr %p timed out (%lus) but cannot be "
1329 "ended, retrying in 5 s\n", 1329 "ended, retrying in 5 s\n",
1330 cqr, (cqr->expires/HZ)); 1330 cqr, (cqr->expires/HZ));
1331 cqr->expires += 5*HZ; 1331 cqr->expires += 5*HZ;
1332 dasd_device_set_timer(device, 5*HZ); 1332 dasd_device_set_timer(device, 5*HZ);
1333 } else { 1333 } else {
1334 dev_err(&device->cdev->dev, 1334 dev_err(&device->cdev->dev,
1335 "cqr %p timed out (%lus), %i retries " 1335 "cqr %p timed out (%lus), %i retries "
1336 "remaining\n", cqr, (cqr->expires/HZ), 1336 "remaining\n", cqr, (cqr->expires/HZ),
1337 cqr->retries); 1337 cqr->retries);
1338 } 1338 }
1339 } 1339 }
1340 } 1340 }
1341 1341
1342 /* 1342 /*
1343 * Take a look at the first request on the ccw queue and check 1343 * Take a look at the first request on the ccw queue and check
1344 * if it needs to be started. 1344 * if it needs to be started.
1345 */ 1345 */
1346 static void __dasd_device_start_head(struct dasd_device *device) 1346 static void __dasd_device_start_head(struct dasd_device *device)
1347 { 1347 {
1348 struct dasd_ccw_req *cqr; 1348 struct dasd_ccw_req *cqr;
1349 int rc; 1349 int rc;
1350 1350
1351 if (list_empty(&device->ccw_queue)) 1351 if (list_empty(&device->ccw_queue))
1352 return; 1352 return;
1353 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist); 1353 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist);
1354 if (cqr->status != DASD_CQR_QUEUED) 1354 if (cqr->status != DASD_CQR_QUEUED)
1355 return; 1355 return;
1356 /* when device is stopped, return request to previous layer */ 1356 /* when device is stopped, return request to previous layer */
1357 if (device->stopped) { 1357 if (device->stopped) {
1358 cqr->status = DASD_CQR_CLEARED; 1358 cqr->status = DASD_CQR_CLEARED;
1359 dasd_schedule_device_bh(device); 1359 dasd_schedule_device_bh(device);
1360 return; 1360 return;
1361 } 1361 }
1362 1362
1363 rc = device->discipline->start_IO(cqr); 1363 rc = device->discipline->start_IO(cqr);
1364 if (rc == 0) 1364 if (rc == 0)
1365 dasd_device_set_timer(device, cqr->expires); 1365 dasd_device_set_timer(device, cqr->expires);
1366 else if (rc == -EACCES) { 1366 else if (rc == -EACCES) {
1367 dasd_schedule_device_bh(device); 1367 dasd_schedule_device_bh(device);
1368 } else 1368 } else
1369 /* Hmpf, try again in 1/2 sec */ 1369 /* Hmpf, try again in 1/2 sec */
1370 dasd_device_set_timer(device, 50); 1370 dasd_device_set_timer(device, 50);
1371 } 1371 }
1372 1372
1373 /* 1373 /*
1374 * Go through all requests on the dasd_device request queue, 1374 * Go through all requests on the dasd_device request queue,
1375 * terminate them on the cdev if necessary, and return them to the 1375 * terminate them on the cdev if necessary, and return them to the
1376 * submitting layer via callback. 1376 * submitting layer via callback.
1377 * Note: 1377 * Note:
1378 * Make sure that all 'submitting layers' still exist when 1378 * Make sure that all 'submitting layers' still exist when
1379 * this function is called! In other words, when 'device' is a base 1379 * this function is called! In other words, when 'device' is a base
1380 * device then all block layer requests must have been removed beforehand 1380 * device then all block layer requests must have been removed beforehand
1381 * via dasd_flush_block_queue. 1381 * via dasd_flush_block_queue.
1382 */ 1382 */
1383 int dasd_flush_device_queue(struct dasd_device *device) 1383 int dasd_flush_device_queue(struct dasd_device *device)
1384 { 1384 {
1385 struct dasd_ccw_req *cqr, *n; 1385 struct dasd_ccw_req *cqr, *n;
1386 int rc; 1386 int rc;
1387 struct list_head flush_queue; 1387 struct list_head flush_queue;
1388 1388
1389 INIT_LIST_HEAD(&flush_queue); 1389 INIT_LIST_HEAD(&flush_queue);
1390 spin_lock_irq(get_ccwdev_lock(device->cdev)); 1390 spin_lock_irq(get_ccwdev_lock(device->cdev));
1391 rc = 0; 1391 rc = 0;
1392 list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) { 1392 list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) {
1393 /* Check status and move request to flush_queue */ 1393 /* Check status and move request to flush_queue */
1394 switch (cqr->status) { 1394 switch (cqr->status) {
1395 case DASD_CQR_IN_IO: 1395 case DASD_CQR_IN_IO:
1396 rc = device->discipline->term_IO(cqr); 1396 rc = device->discipline->term_IO(cqr);
1397 if (rc) { 1397 if (rc) {
1398 /* unable to terminate request */ 1398 /* unable to terminate request */
1399 dev_err(&device->cdev->dev, 1399 dev_err(&device->cdev->dev,
1400 "Flushing the DASD request queue " 1400 "Flushing the DASD request queue "
1401 "failed for request %p\n", cqr); 1401 "failed for request %p\n", cqr);
1402 /* stop flush processing */ 1402 /* stop flush processing */
1403 goto finished; 1403 goto finished;
1404 } 1404 }
1405 break; 1405 break;
1406 case DASD_CQR_QUEUED: 1406 case DASD_CQR_QUEUED:
1407 cqr->stopclk = get_clock(); 1407 cqr->stopclk = get_clock();
1408 cqr->status = DASD_CQR_CLEARED; 1408 cqr->status = DASD_CQR_CLEARED;
1409 break; 1409 break;
1410 default: /* no need to modify the others */ 1410 default: /* no need to modify the others */
1411 break; 1411 break;
1412 } 1412 }
1413 list_move_tail(&cqr->devlist, &flush_queue); 1413 list_move_tail(&cqr->devlist, &flush_queue);
1414 } 1414 }
1415 finished: 1415 finished:
1416 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1416 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1417 /* 1417 /*
1418 * After this point all requests must be in state CLEAR_PENDING, 1418 * After this point all requests must be in state CLEAR_PENDING,
1419 * CLEARED, SUCCESS or ERROR. Now wait for CLEAR_PENDING to become 1419 * CLEARED, SUCCESS or ERROR. Now wait for CLEAR_PENDING to become
1420 * one of the others. 1420 * one of the others.
1421 */ 1421 */
1422 list_for_each_entry_safe(cqr, n, &flush_queue, devlist) 1422 list_for_each_entry_safe(cqr, n, &flush_queue, devlist)
1423 wait_event(dasd_flush_wq, 1423 wait_event(dasd_flush_wq,
1424 (cqr->status != DASD_CQR_CLEAR_PENDING)); 1424 (cqr->status != DASD_CQR_CLEAR_PENDING));
1425 /* 1425 /*
1426 * Now set each request back to TERMINATED, DONE or NEED_ERP 1426 * Now set each request back to TERMINATED, DONE or NEED_ERP
1427 * and call the callback function of flushed requests 1427 * and call the callback function of flushed requests
1428 */ 1428 */
1429 __dasd_device_process_final_queue(device, &flush_queue); 1429 __dasd_device_process_final_queue(device, &flush_queue);
1430 return rc; 1430 return rc;
1431 } 1431 }
1432 1432
1433 /* 1433 /*
1434 * Acquire the device lock and process queues for the device. 1434 * Acquire the device lock and process queues for the device.
1435 */ 1435 */
1436 static void dasd_device_tasklet(struct dasd_device *device) 1436 static void dasd_device_tasklet(struct dasd_device *device)
1437 { 1437 {
1438 struct list_head final_queue; 1438 struct list_head final_queue;
1439 1439
1440 atomic_set (&device->tasklet_scheduled, 0); 1440 atomic_set (&device->tasklet_scheduled, 0);
1441 INIT_LIST_HEAD(&final_queue); 1441 INIT_LIST_HEAD(&final_queue);
1442 spin_lock_irq(get_ccwdev_lock(device->cdev)); 1442 spin_lock_irq(get_ccwdev_lock(device->cdev));
1443 /* Check expire time of first request on the ccw queue. */ 1443 /* Check expire time of first request on the ccw queue. */
1444 __dasd_device_check_expire(device); 1444 __dasd_device_check_expire(device);
1445 /* find final requests on ccw queue */ 1445 /* find final requests on ccw queue */
1446 __dasd_device_process_ccw_queue(device, &final_queue); 1446 __dasd_device_process_ccw_queue(device, &final_queue);
1447 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1447 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1448 /* Now call the callback function of requests with final status */ 1448 /* Now call the callback function of requests with final status */
1449 __dasd_device_process_final_queue(device, &final_queue); 1449 __dasd_device_process_final_queue(device, &final_queue);
1450 spin_lock_irq(get_ccwdev_lock(device->cdev)); 1450 spin_lock_irq(get_ccwdev_lock(device->cdev));
1451 /* Now check if the head of the ccw queue needs to be started. */ 1451 /* Now check if the head of the ccw queue needs to be started. */
1452 __dasd_device_start_head(device); 1452 __dasd_device_start_head(device);
1453 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1453 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1454 dasd_put_device(device); 1454 dasd_put_device(device);
1455 } 1455 }
1456 1456
1457 /* 1457 /*
1458 * Schedules a call to dasd_tasklet over the device tasklet. 1458 * Schedules a call to dasd_tasklet over the device tasklet.
1459 */ 1459 */
1460 void dasd_schedule_device_bh(struct dasd_device *device) 1460 void dasd_schedule_device_bh(struct dasd_device *device)
1461 { 1461 {
1462 /* Protect against rescheduling. */ 1462 /* Protect against rescheduling. */
1463 if (atomic_cmpxchg (&device->tasklet_scheduled, 0, 1) != 0) 1463 if (atomic_cmpxchg (&device->tasklet_scheduled, 0, 1) != 0)
1464 return; 1464 return;
1465 dasd_get_device(device); 1465 dasd_get_device(device);
1466 tasklet_hi_schedule(&device->tasklet); 1466 tasklet_hi_schedule(&device->tasklet);
1467 } 1467 }
1468 1468
1469 void dasd_device_set_stop_bits(struct dasd_device *device, int bits) 1469 void dasd_device_set_stop_bits(struct dasd_device *device, int bits)
1470 { 1470 {
1471 device->stopped |= bits; 1471 device->stopped |= bits;
1472 } 1472 }
1473 EXPORT_SYMBOL_GPL(dasd_device_set_stop_bits); 1473 EXPORT_SYMBOL_GPL(dasd_device_set_stop_bits);
1474 1474
1475 void dasd_device_remove_stop_bits(struct dasd_device *device, int bits) 1475 void dasd_device_remove_stop_bits(struct dasd_device *device, int bits)
1476 { 1476 {
1477 device->stopped &= ~bits; 1477 device->stopped &= ~bits;
1478 if (!device->stopped) 1478 if (!device->stopped)
1479 wake_up(&generic_waitq); 1479 wake_up(&generic_waitq);
1480 } 1480 }
1481 EXPORT_SYMBOL_GPL(dasd_device_remove_stop_bits); 1481 EXPORT_SYMBOL_GPL(dasd_device_remove_stop_bits);
1482 1482
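The stop bits above are always manipulated under the ccw device lock, and a temporary stop is usually paired with a timer so that dasd_device_timeout() lifts it again. A purely illustrative sketch of that pattern (analogous to the -EAGAIN handling in __dasd_process_request_queue() below, which uses the block-level timer; the helper name is hypothetical):

/* Editorial sketch: pause a device briefly and let the timeout resume it. */
static void example_pause_device(struct dasd_device *device)
{
        unsigned long flags;

        spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
        dasd_device_set_stop_bits(device, DASD_STOPPED_PENDING);
        spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
        /* dasd_device_timeout() removes the bit and reschedules the bh */
        dasd_device_set_timer(device, HZ / 2);
}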
1483 /* 1483 /*
1484 * Queue a request to the head of the device ccw_queue. 1484 * Queue a request to the head of the device ccw_queue.
1485 * Start the I/O if possible. 1485 * Start the I/O if possible.
1486 */ 1486 */
1487 void dasd_add_request_head(struct dasd_ccw_req *cqr) 1487 void dasd_add_request_head(struct dasd_ccw_req *cqr)
1488 { 1488 {
1489 struct dasd_device *device; 1489 struct dasd_device *device;
1490 unsigned long flags; 1490 unsigned long flags;
1491 1491
1492 device = cqr->startdev; 1492 device = cqr->startdev;
1493 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); 1493 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
1494 cqr->status = DASD_CQR_QUEUED; 1494 cqr->status = DASD_CQR_QUEUED;
1495 list_add(&cqr->devlist, &device->ccw_queue); 1495 list_add(&cqr->devlist, &device->ccw_queue);
1496 /* let the bh start the request to keep them in order */ 1496 /* let the bh start the request to keep them in order */
1497 dasd_schedule_device_bh(device); 1497 dasd_schedule_device_bh(device);
1498 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); 1498 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
1499 } 1499 }
1500 1500
1501 /* 1501 /*
1502 * Queue a request to the tail of the device ccw_queue. 1502 * Queue a request to the tail of the device ccw_queue.
1503 * Start the I/O if possible. 1503 * Start the I/O if possible.
1504 */ 1504 */
1505 void dasd_add_request_tail(struct dasd_ccw_req *cqr) 1505 void dasd_add_request_tail(struct dasd_ccw_req *cqr)
1506 { 1506 {
1507 struct dasd_device *device; 1507 struct dasd_device *device;
1508 unsigned long flags; 1508 unsigned long flags;
1509 1509
1510 device = cqr->startdev; 1510 device = cqr->startdev;
1511 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); 1511 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
1512 cqr->status = DASD_CQR_QUEUED; 1512 cqr->status = DASD_CQR_QUEUED;
1513 list_add_tail(&cqr->devlist, &device->ccw_queue); 1513 list_add_tail(&cqr->devlist, &device->ccw_queue);
1514 /* let the bh start the request to keep them in order */ 1514 /* let the bh start the request to keep them in order */
1515 dasd_schedule_device_bh(device); 1515 dasd_schedule_device_bh(device);
1516 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); 1516 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
1517 } 1517 }
1518 1518
1519 /* 1519 /*
1520 * Wakeup helper for the 'sleep_on' functions. 1520 * Wakeup helper for the 'sleep_on' functions.
1521 */ 1521 */
1522 static void dasd_wakeup_cb(struct dasd_ccw_req *cqr, void *data) 1522 static void dasd_wakeup_cb(struct dasd_ccw_req *cqr, void *data)
1523 { 1523 {
1524 spin_lock_irq(get_ccwdev_lock(cqr->startdev->cdev)); 1524 spin_lock_irq(get_ccwdev_lock(cqr->startdev->cdev));
1525 cqr->callback_data = DASD_SLEEPON_END_TAG; 1525 cqr->callback_data = DASD_SLEEPON_END_TAG;
1526 spin_unlock_irq(get_ccwdev_lock(cqr->startdev->cdev)); 1526 spin_unlock_irq(get_ccwdev_lock(cqr->startdev->cdev));
1527 wake_up(&generic_waitq); 1527 wake_up(&generic_waitq);
1528 } 1528 }
1529 1529
1530 static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr) 1530 static inline int _wait_for_wakeup(struct dasd_ccw_req *cqr)
1531 { 1531 {
1532 struct dasd_device *device; 1532 struct dasd_device *device;
1533 int rc; 1533 int rc;
1534 1534
1535 device = cqr->startdev; 1535 device = cqr->startdev;
1536 spin_lock_irq(get_ccwdev_lock(device->cdev)); 1536 spin_lock_irq(get_ccwdev_lock(device->cdev));
1537 rc = (cqr->callback_data == DASD_SLEEPON_END_TAG); 1537 rc = (cqr->callback_data == DASD_SLEEPON_END_TAG);
1538 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1538 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1539 return rc; 1539 return rc;
1540 } 1540 }
1541 1541
1542 /* 1542 /*
1543 * checks if error recovery is necessary, returns 1 if yes, 0 otherwise. 1543 * checks if error recovery is necessary, returns 1 if yes, 0 otherwise.
1544 */ 1544 */
1545 static int __dasd_sleep_on_erp(struct dasd_ccw_req *cqr) 1545 static int __dasd_sleep_on_erp(struct dasd_ccw_req *cqr)
1546 { 1546 {
1547 struct dasd_device *device; 1547 struct dasd_device *device;
1548 dasd_erp_fn_t erp_fn; 1548 dasd_erp_fn_t erp_fn;
1549 1549
1550 if (cqr->status == DASD_CQR_FILLED) 1550 if (cqr->status == DASD_CQR_FILLED)
1551 return 0; 1551 return 0;
1552 device = cqr->startdev; 1552 device = cqr->startdev;
1553 if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { 1553 if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) {
1554 if (cqr->status == DASD_CQR_TERMINATED) { 1554 if (cqr->status == DASD_CQR_TERMINATED) {
1555 device->discipline->handle_terminated_request(cqr); 1555 device->discipline->handle_terminated_request(cqr);
1556 return 1; 1556 return 1;
1557 } 1557 }
1558 if (cqr->status == DASD_CQR_NEED_ERP) { 1558 if (cqr->status == DASD_CQR_NEED_ERP) {
1559 erp_fn = device->discipline->erp_action(cqr); 1559 erp_fn = device->discipline->erp_action(cqr);
1560 erp_fn(cqr); 1560 erp_fn(cqr);
1561 return 1; 1561 return 1;
1562 } 1562 }
1563 if (cqr->status == DASD_CQR_FAILED) 1563 if (cqr->status == DASD_CQR_FAILED)
1564 dasd_log_sense(cqr, &cqr->irb); 1564 dasd_log_sense(cqr, &cqr->irb);
1565 if (cqr->refers) { 1565 if (cqr->refers) {
1566 __dasd_process_erp(device, cqr); 1566 __dasd_process_erp(device, cqr);
1567 return 1; 1567 return 1;
1568 } 1568 }
1569 } 1569 }
1570 return 0; 1570 return 0;
1571 } 1571 }
1572 1572
1573 static int __dasd_sleep_on_loop_condition(struct dasd_ccw_req *cqr) 1573 static int __dasd_sleep_on_loop_condition(struct dasd_ccw_req *cqr)
1574 { 1574 {
1575 if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) { 1575 if (test_bit(DASD_CQR_FLAGS_USE_ERP, &cqr->flags)) {
1576 if (cqr->refers) /* erp is not done yet */ 1576 if (cqr->refers) /* erp is not done yet */
1577 return 1; 1577 return 1;
1578 return ((cqr->status != DASD_CQR_DONE) && 1578 return ((cqr->status != DASD_CQR_DONE) &&
1579 (cqr->status != DASD_CQR_FAILED)); 1579 (cqr->status != DASD_CQR_FAILED));
1580 } else 1580 } else
1581 return (cqr->status == DASD_CQR_FILLED); 1581 return (cqr->status == DASD_CQR_FILLED);
1582 } 1582 }
1583 1583
1584 static int _dasd_sleep_on(struct dasd_ccw_req *maincqr, int interruptible) 1584 static int _dasd_sleep_on(struct dasd_ccw_req *maincqr, int interruptible)
1585 { 1585 {
1586 struct dasd_device *device; 1586 struct dasd_device *device;
1587 int rc; 1587 int rc;
1588 struct list_head ccw_queue; 1588 struct list_head ccw_queue;
1589 struct dasd_ccw_req *cqr; 1589 struct dasd_ccw_req *cqr;
1590 1590
1591 INIT_LIST_HEAD(&ccw_queue); 1591 INIT_LIST_HEAD(&ccw_queue);
1592 maincqr->status = DASD_CQR_FILLED; 1592 maincqr->status = DASD_CQR_FILLED;
1593 device = maincqr->startdev; 1593 device = maincqr->startdev;
1594 list_add(&maincqr->blocklist, &ccw_queue); 1594 list_add(&maincqr->blocklist, &ccw_queue);
1595 for (cqr = maincqr; __dasd_sleep_on_loop_condition(cqr); 1595 for (cqr = maincqr; __dasd_sleep_on_loop_condition(cqr);
1596 cqr = list_first_entry(&ccw_queue, 1596 cqr = list_first_entry(&ccw_queue,
1597 struct dasd_ccw_req, blocklist)) { 1597 struct dasd_ccw_req, blocklist)) {
1598 1598
1599 if (__dasd_sleep_on_erp(cqr)) 1599 if (__dasd_sleep_on_erp(cqr))
1600 continue; 1600 continue;
1601 if (cqr->status != DASD_CQR_FILLED) /* could be failed */ 1601 if (cqr->status != DASD_CQR_FILLED) /* could be failed */
1602 continue; 1602 continue;
1603 1603
1604 /* Non-temporary stop condition will trigger fail fast */ 1604 /* Non-temporary stop condition will trigger fail fast */
1605 if (device->stopped & ~DASD_STOPPED_PENDING && 1605 if (device->stopped & ~DASD_STOPPED_PENDING &&
1606 test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) && 1606 test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) &&
1607 (!dasd_eer_enabled(device))) { 1607 (!dasd_eer_enabled(device))) {
1608 cqr->status = DASD_CQR_FAILED; 1608 cqr->status = DASD_CQR_FAILED;
1609 continue; 1609 continue;
1610 } 1610 }
1611 1611
1612 /* Don't try to start requests if device is stopped */ 1612 /* Don't try to start requests if device is stopped */
1613 if (interruptible) { 1613 if (interruptible) {
1614 rc = wait_event_interruptible( 1614 rc = wait_event_interruptible(
1615 generic_waitq, !(device->stopped)); 1615 generic_waitq, !(device->stopped));
1616 if (rc == -ERESTARTSYS) { 1616 if (rc == -ERESTARTSYS) {
1617 cqr->status = DASD_CQR_FAILED; 1617 cqr->status = DASD_CQR_FAILED;
1618 maincqr->intrc = rc; 1618 maincqr->intrc = rc;
1619 continue; 1619 continue;
1620 } 1620 }
1621 } else 1621 } else
1622 wait_event(generic_waitq, !(device->stopped)); 1622 wait_event(generic_waitq, !(device->stopped));
1623 1623
1624 cqr->callback = dasd_wakeup_cb; 1624 cqr->callback = dasd_wakeup_cb;
1625 cqr->callback_data = DASD_SLEEPON_START_TAG; 1625 cqr->callback_data = DASD_SLEEPON_START_TAG;
1626 dasd_add_request_tail(cqr); 1626 dasd_add_request_tail(cqr);
1627 if (interruptible) { 1627 if (interruptible) {
1628 rc = wait_event_interruptible( 1628 rc = wait_event_interruptible(
1629 generic_waitq, _wait_for_wakeup(cqr)); 1629 generic_waitq, _wait_for_wakeup(cqr));
1630 if (rc == -ERESTARTSYS) { 1630 if (rc == -ERESTARTSYS) {
1631 dasd_cancel_req(cqr); 1631 dasd_cancel_req(cqr);
1632 /* wait (non-interruptible) for final status */ 1632 /* wait (non-interruptible) for final status */
1633 wait_event(generic_waitq, 1633 wait_event(generic_waitq,
1634 _wait_for_wakeup(cqr)); 1634 _wait_for_wakeup(cqr));
1635 cqr->status = DASD_CQR_FAILED; 1635 cqr->status = DASD_CQR_FAILED;
1636 maincqr->intrc = rc; 1636 maincqr->intrc = rc;
1637 continue; 1637 continue;
1638 } 1638 }
1639 } else 1639 } else
1640 wait_event(generic_waitq, _wait_for_wakeup(cqr)); 1640 wait_event(generic_waitq, _wait_for_wakeup(cqr));
1641 } 1641 }
1642 1642
1643 maincqr->endclk = get_clock(); 1643 maincqr->endclk = get_clock();
1644 if ((maincqr->status != DASD_CQR_DONE) && 1644 if ((maincqr->status != DASD_CQR_DONE) &&
1645 (maincqr->intrc != -ERESTARTSYS)) 1645 (maincqr->intrc != -ERESTARTSYS))
1646 dasd_log_sense(maincqr, &maincqr->irb); 1646 dasd_log_sense(maincqr, &maincqr->irb);
1647 if (maincqr->status == DASD_CQR_DONE) 1647 if (maincqr->status == DASD_CQR_DONE)
1648 rc = 0; 1648 rc = 0;
1649 else if (maincqr->intrc) 1649 else if (maincqr->intrc)
1650 rc = maincqr->intrc; 1650 rc = maincqr->intrc;
1651 else 1651 else
1652 rc = -EIO; 1652 rc = -EIO;
1653 return rc; 1653 return rc;
1654 } 1654 }
1655 1655
1656 /* 1656 /*
1657 * Queue a request to the tail of the device ccw_queue and wait for 1657 * Queue a request to the tail of the device ccw_queue and wait for
1658 * its completion. 1658 * its completion.
1659 */ 1659 */
1660 int dasd_sleep_on(struct dasd_ccw_req *cqr) 1660 int dasd_sleep_on(struct dasd_ccw_req *cqr)
1661 { 1661 {
1662 return _dasd_sleep_on(cqr, 0); 1662 return _dasd_sleep_on(cqr, 0);
1663 } 1663 }
1664 1664
1665 /* 1665 /*
1666 * Queue a request to the tail of the device ccw_queue and wait 1666 * Queue a request to the tail of the device ccw_queue and wait
1667 * interruptibly for its completion. 1667 * interruptibly for its completion.
1668 */ 1668 */
1669 int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr) 1669 int dasd_sleep_on_interruptible(struct dasd_ccw_req *cqr)
1670 { 1670 {
1671 return _dasd_sleep_on(cqr, 1); 1671 return _dasd_sleep_on(cqr, 1);
1672 } 1672 }
1673 1673
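For illustration, a caller-side sketch of synchronous submission: allocation and channel-program setup of the cqr are elided, only the fields that the sleep_on machinery above actually consumes are shown, and the function name is hypothetical.

/* Editorial sketch: submit an already built ccw request and wait for it. */
static int example_submit_sync(struct dasd_device *device,
                               struct dasd_ccw_req *cqr)
{
        cqr->startdev = device;   /* device that will execute the I/O */
        cqr->retries = 2;         /* allows the fastpath retry in dasd_int_handler() */
        cqr->expires = 10 * HZ;   /* honoured by __dasd_device_check_expire() */

        /* _dasd_sleep_on() sets DASD_CQR_FILLED, queues the request at the
         * tail, runs ERP if needed and blocks until a final status. */
        return dasd_sleep_on(cqr);
}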
1674 /* 1674 /*
1675 * Whoa nelly now it gets really hairy. For some functions (e.g. steal lock 1675 * Whoa nelly now it gets really hairy. For some functions (e.g. steal lock
1676 * for eckd devices) the currently running request has to be terminated 1676 * for eckd devices) the currently running request has to be terminated
1677 * and be put back to status queued, before the special request is added 1677 * and be put back to status queued, before the special request is added
1678 * to the head of the queue. Then the special request is waited on normally. 1678 * to the head of the queue. Then the special request is waited on normally.
1679 */ 1679 */
1680 static inline int _dasd_term_running_cqr(struct dasd_device *device) 1680 static inline int _dasd_term_running_cqr(struct dasd_device *device)
1681 { 1681 {
1682 struct dasd_ccw_req *cqr; 1682 struct dasd_ccw_req *cqr;
1683 1683
1684 if (list_empty(&device->ccw_queue)) 1684 if (list_empty(&device->ccw_queue))
1685 return 0; 1685 return 0;
1686 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist); 1686 cqr = list_entry(device->ccw_queue.next, struct dasd_ccw_req, devlist);
1687 return device->discipline->term_IO(cqr); 1687 return device->discipline->term_IO(cqr);
1688 } 1688 }
1689 1689
1690 int dasd_sleep_on_immediatly(struct dasd_ccw_req *cqr) 1690 int dasd_sleep_on_immediatly(struct dasd_ccw_req *cqr)
1691 { 1691 {
1692 struct dasd_device *device; 1692 struct dasd_device *device;
1693 int rc; 1693 int rc;
1694 1694
1695 device = cqr->startdev; 1695 device = cqr->startdev;
1696 spin_lock_irq(get_ccwdev_lock(device->cdev)); 1696 spin_lock_irq(get_ccwdev_lock(device->cdev));
1697 rc = _dasd_term_running_cqr(device); 1697 rc = _dasd_term_running_cqr(device);
1698 if (rc) { 1698 if (rc) {
1699 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1699 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1700 return rc; 1700 return rc;
1701 } 1701 }
1702 1702
1703 cqr->callback = dasd_wakeup_cb; 1703 cqr->callback = dasd_wakeup_cb;
1704 cqr->callback_data = DASD_SLEEPON_START_TAG; 1704 cqr->callback_data = DASD_SLEEPON_START_TAG;
1705 cqr->status = DASD_CQR_QUEUED; 1705 cqr->status = DASD_CQR_QUEUED;
1706 list_add(&cqr->devlist, &device->ccw_queue); 1706 list_add(&cqr->devlist, &device->ccw_queue);
1707 1707
1708 /* let the bh start the request to keep them in order */ 1708 /* let the bh start the request to keep them in order */
1709 dasd_schedule_device_bh(device); 1709 dasd_schedule_device_bh(device);
1710 1710
1711 spin_unlock_irq(get_ccwdev_lock(device->cdev)); 1711 spin_unlock_irq(get_ccwdev_lock(device->cdev));
1712 1712
1713 wait_event(generic_waitq, _wait_for_wakeup(cqr)); 1713 wait_event(generic_waitq, _wait_for_wakeup(cqr));
1714 1714
1715 if (cqr->status == DASD_CQR_DONE) 1715 if (cqr->status == DASD_CQR_DONE)
1716 rc = 0; 1716 rc = 0;
1717 else if (cqr->intrc) 1717 else if (cqr->intrc)
1718 rc = cqr->intrc; 1718 rc = cqr->intrc;
1719 else 1719 else
1720 rc = -EIO; 1720 rc = -EIO;
1721 return rc; 1721 return rc;
1722 } 1722 }
1723 1723
1724 /* 1724 /*
1725 * Cancels a request that was started with dasd_sleep_on_req. 1725 * Cancels a request that was started with dasd_sleep_on_req.
1726 * This is useful for timing out requests. The request will be 1726 * This is useful for timing out requests. The request will be
1727 * terminated if it is currently in i/o. 1727 * terminated if it is currently in i/o.
1728 * Returns 1 if the request has been terminated. 1728 * Returns 1 if the request has been terminated.
1729 * 0 if there was no need to terminate the request (not started yet) 1729 * 0 if there was no need to terminate the request (not started yet)
1730 * negative error code if termination failed 1730 * negative error code if termination failed
1731 * Cancellation of a request is an asynchronous operation! The calling 1731 * Cancellation of a request is an asynchronous operation! The calling
1732 * function has to wait until the request is properly returned via callback. 1732 * function has to wait until the request is properly returned via callback.
1733 */ 1733 */
1734 int dasd_cancel_req(struct dasd_ccw_req *cqr) 1734 int dasd_cancel_req(struct dasd_ccw_req *cqr)
1735 { 1735 {
1736 struct dasd_device *device = cqr->startdev; 1736 struct dasd_device *device = cqr->startdev;
1737 unsigned long flags; 1737 unsigned long flags;
1738 int rc; 1738 int rc;
1739 1739
1740 rc = 0; 1740 rc = 0;
1741 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags); 1741 spin_lock_irqsave(get_ccwdev_lock(device->cdev), flags);
1742 switch (cqr->status) { 1742 switch (cqr->status) {
1743 case DASD_CQR_QUEUED: 1743 case DASD_CQR_QUEUED:
1744 /* request was not started - just set to cleared */ 1744 /* request was not started - just set to cleared */
1745 cqr->status = DASD_CQR_CLEARED; 1745 cqr->status = DASD_CQR_CLEARED;
1746 break; 1746 break;
1747 case DASD_CQR_IN_IO: 1747 case DASD_CQR_IN_IO:
1748 /* request in IO - terminate IO and release again */ 1748 /* request in IO - terminate IO and release again */
1749 rc = device->discipline->term_IO(cqr); 1749 rc = device->discipline->term_IO(cqr);
1750 if (rc) { 1750 if (rc) {
1751 dev_err(&device->cdev->dev, 1751 dev_err(&device->cdev->dev,
1752 "Cancelling request %p failed with rc=%d\n", 1752 "Cancelling request %p failed with rc=%d\n",
1753 cqr, rc); 1753 cqr, rc);
1754 } else { 1754 } else {
1755 cqr->stopclk = get_clock(); 1755 cqr->stopclk = get_clock();
1756 } 1756 }
1757 break; 1757 break;
1758 default: /* already finished or clear pending - do nothing */ 1758 default: /* already finished or clear pending - do nothing */
1759 break; 1759 break;
1760 } 1760 }
1761 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags); 1761 spin_unlock_irqrestore(get_ccwdev_lock(device->cdev), flags);
1762 dasd_schedule_device_bh(device); 1762 dasd_schedule_device_bh(device);
1763 return rc; 1763 return rc;
1764 } 1764 }
1765 1765
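Because cancellation is asynchronous, a timeout path must still wait for the request's final callback after calling dasd_cancel_req(), exactly as the -ERESTARTSYS branch of _dasd_sleep_on() does above. A purely illustrative sketch (the function name is hypothetical, cqr->startdev is assumed to be set already, and error returns from dasd_cancel_req() are ignored for brevity):

/* Editorial sketch: bounded wait for a queued request, cancelling on timeout. */
static int example_sleep_on_timeout(struct dasd_ccw_req *cqr, long timeout)
{
        cqr->callback = dasd_wakeup_cb;
        cqr->callback_data = DASD_SLEEPON_START_TAG;
        dasd_add_request_tail(cqr);

        if (!wait_event_timeout(generic_waitq, _wait_for_wakeup(cqr), timeout)) {
                dasd_cancel_req(cqr);
                /* cancellation is asynchronous - wait for the final callback */
                wait_event(generic_waitq, _wait_for_wakeup(cqr));
        }
        return (cqr->status == DASD_CQR_DONE) ? 0 : -EIO;
}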
1766 1766
1767 /* 1767 /*
1768 * SECTION: Operations of the dasd_block layer. 1768 * SECTION: Operations of the dasd_block layer.
1769 */ 1769 */
1770 1770
1771 /* 1771 /*
1772 * Timeout function for dasd_block. This is used when the block layer 1772 * Timeout function for dasd_block. This is used when the block layer
1773 * is waiting for something that may not come reliably, (e.g. a state 1773 * is waiting for something that may not come reliably, (e.g. a state
1774 * change interrupt) 1774 * change interrupt)
1775 */ 1775 */
1776 static void dasd_block_timeout(unsigned long ptr) 1776 static void dasd_block_timeout(unsigned long ptr)
1777 { 1777 {
1778 unsigned long flags; 1778 unsigned long flags;
1779 struct dasd_block *block; 1779 struct dasd_block *block;
1780 1780
1781 block = (struct dasd_block *) ptr; 1781 block = (struct dasd_block *) ptr;
1782 spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags); 1782 spin_lock_irqsave(get_ccwdev_lock(block->base->cdev), flags);
1783 /* re-activate request queue */ 1783 /* re-activate request queue */
1784 dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING); 1784 dasd_device_remove_stop_bits(block->base, DASD_STOPPED_PENDING);
1785 spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags); 1785 spin_unlock_irqrestore(get_ccwdev_lock(block->base->cdev), flags);
1786 dasd_schedule_block_bh(block); 1786 dasd_schedule_block_bh(block);
1787 } 1787 }
1788 1788
1789 /* 1789 /*
1790 * Setup timeout for a dasd_block in jiffies. 1790 * Setup timeout for a dasd_block in jiffies.
1791 */ 1791 */
1792 void dasd_block_set_timer(struct dasd_block *block, int expires) 1792 void dasd_block_set_timer(struct dasd_block *block, int expires)
1793 { 1793 {
1794 if (expires == 0) 1794 if (expires == 0)
1795 del_timer(&block->timer); 1795 del_timer(&block->timer);
1796 else 1796 else
1797 mod_timer(&block->timer, jiffies + expires); 1797 mod_timer(&block->timer, jiffies + expires);
1798 } 1798 }
1799 1799
1800 /* 1800 /*
1801 * Clear timeout for a dasd_block. 1801 * Clear timeout for a dasd_block.
1802 */ 1802 */
1803 void dasd_block_clear_timer(struct dasd_block *block) 1803 void dasd_block_clear_timer(struct dasd_block *block)
1804 { 1804 {
1805 del_timer(&block->timer); 1805 del_timer(&block->timer);
1806 } 1806 }
1807 1807
1808 /* 1808 /*
1809 * Process finished error recovery ccw. 1809 * Process finished error recovery ccw.
1810 */ 1810 */
1811 static void __dasd_process_erp(struct dasd_device *device, 1811 static void __dasd_process_erp(struct dasd_device *device,
1812 struct dasd_ccw_req *cqr) 1812 struct dasd_ccw_req *cqr)
1813 { 1813 {
1814 dasd_erp_fn_t erp_fn; 1814 dasd_erp_fn_t erp_fn;
1815 1815
1816 if (cqr->status == DASD_CQR_DONE) 1816 if (cqr->status == DASD_CQR_DONE)
1817 DBF_DEV_EVENT(DBF_NOTICE, device, "%s", "ERP successful"); 1817 DBF_DEV_EVENT(DBF_NOTICE, device, "%s", "ERP successful");
1818 else 1818 else
1819 dev_err(&device->cdev->dev, "ERP failed for the DASD\n"); 1819 dev_err(&device->cdev->dev, "ERP failed for the DASD\n");
1820 erp_fn = device->discipline->erp_postaction(cqr); 1820 erp_fn = device->discipline->erp_postaction(cqr);
1821 erp_fn(cqr); 1821 erp_fn(cqr);
1822 } 1822 }
1823 1823
1824 /* 1824 /*
1825 * Fetch requests from the block device queue. 1825 * Fetch requests from the block device queue.
1826 */ 1826 */
1827 static void __dasd_process_request_queue(struct dasd_block *block) 1827 static void __dasd_process_request_queue(struct dasd_block *block)
1828 { 1828 {
1829 struct request_queue *queue; 1829 struct request_queue *queue;
1830 struct request *req; 1830 struct request *req;
1831 struct dasd_ccw_req *cqr; 1831 struct dasd_ccw_req *cqr;
1832 struct dasd_device *basedev; 1832 struct dasd_device *basedev;
1833 unsigned long flags; 1833 unsigned long flags;
1834 queue = block->request_queue; 1834 queue = block->request_queue;
1835 basedev = block->base; 1835 basedev = block->base;
1836 /* No queue? Then there is nothing to do. */ 1836 /* No queue? Then there is nothing to do. */
1837 if (queue == NULL) 1837 if (queue == NULL)
1838 return; 1838 return;
1839 1839
1840 /* 1840 /*
1841 * We requeue requests from the block device queue to the ccw 1841 * We requeue requests from the block device queue to the ccw
1842 * queue only in two states. In state DASD_STATE_READY the 1842 * queue only in two states. In state DASD_STATE_READY the
1843 * partition detection is done and we need to requeue requests 1843 * partition detection is done and we need to requeue requests
1844 * for that. State DASD_STATE_ONLINE is normal block device 1844 * for that. State DASD_STATE_ONLINE is normal block device
1845 * operation. 1845 * operation.
1846 */ 1846 */
1847 if (basedev->state < DASD_STATE_READY) { 1847 if (basedev->state < DASD_STATE_READY) {
1848 while ((req = blk_fetch_request(block->request_queue))) 1848 while ((req = blk_fetch_request(block->request_queue)))
1849 __blk_end_request_all(req, -EIO); 1849 __blk_end_request_all(req, -EIO);
1850 return; 1850 return;
1851 } 1851 }
1852 /* Now we try to fetch requests from the request queue */ 1852 /* Now we try to fetch requests from the request queue */
1853 while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) { 1853 while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) {
1854 if (basedev->features & DASD_FEATURE_READONLY && 1854 if (basedev->features & DASD_FEATURE_READONLY &&
1855 rq_data_dir(req) == WRITE) { 1855 rq_data_dir(req) == WRITE) {
1856 DBF_DEV_EVENT(DBF_ERR, basedev, 1856 DBF_DEV_EVENT(DBF_ERR, basedev,
1857 "Rejecting write request %p", 1857 "Rejecting write request %p",
1858 req); 1858 req);
1859 blk_start_request(req); 1859 blk_start_request(req);
1860 __blk_end_request_all(req, -EIO); 1860 __blk_end_request_all(req, -EIO);
1861 continue; 1861 continue;
1862 } 1862 }
1863 cqr = basedev->discipline->build_cp(basedev, block, req); 1863 cqr = basedev->discipline->build_cp(basedev, block, req);
1864 if (IS_ERR(cqr)) { 1864 if (IS_ERR(cqr)) {
1865 if (PTR_ERR(cqr) == -EBUSY) 1865 if (PTR_ERR(cqr) == -EBUSY)
1866 break; /* normal end condition */ 1866 break; /* normal end condition */
1867 if (PTR_ERR(cqr) == -ENOMEM) 1867 if (PTR_ERR(cqr) == -ENOMEM)
1868 break; /* terminate request queue loop */ 1868 break; /* terminate request queue loop */
1869 if (PTR_ERR(cqr) == -EAGAIN) { 1869 if (PTR_ERR(cqr) == -EAGAIN) {
1870 /* 1870 /*
1871 * The current request cannot be built right 1871 * The current request cannot be built right
1872 * now; we have to try later. If this request 1872 * now; we have to try later. If this request
1873 * is the head-of-queue we stop the device 1873 * is the head-of-queue we stop the device
1874 * for 1/2 second. 1874 * for 1/2 second.
1875 */ 1875 */
1876 if (!list_empty(&block->ccw_queue)) 1876 if (!list_empty(&block->ccw_queue))
1877 break; 1877 break;
1878 spin_lock_irqsave( 1878 spin_lock_irqsave(
1879 get_ccwdev_lock(basedev->cdev), flags); 1879 get_ccwdev_lock(basedev->cdev), flags);
1880 dasd_device_set_stop_bits(basedev, 1880 dasd_device_set_stop_bits(basedev,
1881 DASD_STOPPED_PENDING); 1881 DASD_STOPPED_PENDING);
1882 spin_unlock_irqrestore( 1882 spin_unlock_irqrestore(
1883 get_ccwdev_lock(basedev->cdev), flags); 1883 get_ccwdev_lock(basedev->cdev), flags);
1884 dasd_block_set_timer(block, HZ/2); 1884 dasd_block_set_timer(block, HZ/2);
1885 break; 1885 break;
1886 } 1886 }
1887 DBF_DEV_EVENT(DBF_ERR, basedev, 1887 DBF_DEV_EVENT(DBF_ERR, basedev,
1888 "CCW creation failed (rc=%ld) " 1888 "CCW creation failed (rc=%ld) "
1889 "on request %p", 1889 "on request %p",
1890 PTR_ERR(cqr), req); 1890 PTR_ERR(cqr), req);
1891 blk_start_request(req); 1891 blk_start_request(req);
1892 __blk_end_request_all(req, -EIO); 1892 __blk_end_request_all(req, -EIO);
1893 continue; 1893 continue;
1894 } 1894 }
1895 /* 1895 /*
1896 * Note: callback is set to dasd_return_cqr_cb in 1896 * Note: callback is set to dasd_return_cqr_cb in
1897 * __dasd_block_start_head to cover erp requests as well 1897 * __dasd_block_start_head to cover erp requests as well
1898 */ 1898 */
1899 cqr->callback_data = (void *) req; 1899 cqr->callback_data = (void *) req;
1900 cqr->status = DASD_CQR_FILLED; 1900 cqr->status = DASD_CQR_FILLED;
1901 blk_start_request(req); 1901 blk_start_request(req);
1902 list_add_tail(&cqr->blocklist, &block->ccw_queue); 1902 list_add_tail(&cqr->blocklist, &block->ccw_queue);
1903 dasd_profile_start(block, cqr, req); 1903 dasd_profile_start(block, cqr, req);
1904 } 1904 }
1905 } 1905 }
1906 1906
1907 static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) 1907 static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr)
1908 { 1908 {
1909 struct request *req; 1909 struct request *req;
1910 int status; 1910 int status;
1911 int error = 0; 1911 int error = 0;
1912 1912
1913 req = (struct request *) cqr->callback_data; 1913 req = (struct request *) cqr->callback_data;
1914 dasd_profile_end(cqr->block, cqr, req); 1914 dasd_profile_end(cqr->block, cqr, req);
1915 status = cqr->block->base->discipline->free_cp(cqr, req); 1915 status = cqr->block->base->discipline->free_cp(cqr, req);
1916 if (status <= 0) 1916 if (status <= 0)
1917 error = status ? status : -EIO; 1917 error = status ? status : -EIO;
1918 __blk_end_request_all(req, error); 1918 __blk_end_request_all(req, error);
1919 } 1919 }
1920 1920
1921 /* 1921 /*
1922 * Process ccw request queue. 1922 * Process ccw request queue.
1923 */ 1923 */
1924 static void __dasd_process_block_ccw_queue(struct dasd_block *block, 1924 static void __dasd_process_block_ccw_queue(struct dasd_block *block,
1925 struct list_head *final_queue) 1925 struct list_head *final_queue)
1926 { 1926 {
1927 struct list_head *l, *n; 1927 struct list_head *l, *n;
1928 struct dasd_ccw_req *cqr; 1928 struct dasd_ccw_req *cqr;
1929 dasd_erp_fn_t erp_fn; 1929 dasd_erp_fn_t erp_fn;
1930 unsigned long flags; 1930 unsigned long flags;
1931 struct dasd_device *base = block->base; 1931 struct dasd_device *base = block->base;
1932 1932
1933 restart: 1933 restart:
1934 /* Process requests with final status. */ 1934 /* Process requests with final status. */
1935 list_for_each_safe(l, n, &block->ccw_queue) { 1935 list_for_each_safe(l, n, &block->ccw_queue) {
1936 cqr = list_entry(l, struct dasd_ccw_req, blocklist); 1936 cqr = list_entry(l, struct dasd_ccw_req, blocklist);
1937 if (cqr->status != DASD_CQR_DONE && 1937 if (cqr->status != DASD_CQR_DONE &&
1938 cqr->status != DASD_CQR_FAILED && 1938 cqr->status != DASD_CQR_FAILED &&
1939 cqr->status != DASD_CQR_NEED_ERP && 1939 cqr->status != DASD_CQR_NEED_ERP &&
1940 cqr->status != DASD_CQR_TERMINATED) 1940 cqr->status != DASD_CQR_TERMINATED)
1941 continue; 1941 continue;
1942 1942
1943 if (cqr->status == DASD_CQR_TERMINATED) { 1943 if (cqr->status == DASD_CQR_TERMINATED) {
1944 base->discipline->handle_terminated_request(cqr); 1944 base->discipline->handle_terminated_request(cqr);
1945 goto restart; 1945 goto restart;
1946 } 1946 }
1947 1947
1948 /* Process requests that may be recovered */ 1948 /* Process requests that may be recovered */
1949 if (cqr->status == DASD_CQR_NEED_ERP) { 1949 if (cqr->status == DASD_CQR_NEED_ERP) {
1950 erp_fn = base->discipline->erp_action(cqr); 1950 erp_fn = base->discipline->erp_action(cqr);
1951 if (IS_ERR(erp_fn(cqr))) 1951 if (IS_ERR(erp_fn(cqr)))
1952 continue; 1952 continue;
1953 goto restart; 1953 goto restart;
1954 } 1954 }
1955 1955
1956 /* log sense for fatal error */ 1956 /* log sense for fatal error */
1957 if (cqr->status == DASD_CQR_FAILED) { 1957 if (cqr->status == DASD_CQR_FAILED) {
1958 dasd_log_sense(cqr, &cqr->irb); 1958 dasd_log_sense(cqr, &cqr->irb);
1959 } 1959 }
1960 1960
1961 /* First of all call extended error reporting. */ 1961 /* First of all call extended error reporting. */
1962 if (dasd_eer_enabled(base) && 1962 if (dasd_eer_enabled(base) &&
1963 cqr->status == DASD_CQR_FAILED) { 1963 cqr->status == DASD_CQR_FAILED) {
1964 dasd_eer_write(base, cqr, DASD_EER_FATALERROR); 1964 dasd_eer_write(base, cqr, DASD_EER_FATALERROR);
1965 1965
1966 /* restart request */ 1966 /* restart request */
1967 cqr->status = DASD_CQR_FILLED; 1967 cqr->status = DASD_CQR_FILLED;
1968 cqr->retries = 255; 1968 cqr->retries = 255;
1969 spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags); 1969 spin_lock_irqsave(get_ccwdev_lock(base->cdev), flags);
1970 dasd_device_set_stop_bits(base, DASD_STOPPED_QUIESCE); 1970 dasd_device_set_stop_bits(base, DASD_STOPPED_QUIESCE);
1971 spin_unlock_irqrestore(get_ccwdev_lock(base->cdev), 1971 spin_unlock_irqrestore(get_ccwdev_lock(base->cdev),
1972 flags); 1972 flags);
1973 goto restart; 1973 goto restart;
1974 } 1974 }
1975 1975
1976 /* Process finished ERP request. */ 1976 /* Process finished ERP request. */
1977 if (cqr->refers) { 1977 if (cqr->refers) {
1978 __dasd_process_erp(base, cqr); 1978 __dasd_process_erp(base, cqr);
1979 goto restart; 1979 goto restart;
1980 } 1980 }
1981 1981
1982 /* Rechain finished requests to final queue */ 1982 /* Rechain finished requests to final queue */
1983 cqr->endclk = get_clock(); 1983 cqr->endclk = get_clock();
1984 list_move_tail(&cqr->blocklist, final_queue); 1984 list_move_tail(&cqr->blocklist, final_queue);
1985 } 1985 }
1986 } 1986 }
1987 1987
1988 static void dasd_return_cqr_cb(struct dasd_ccw_req *cqr, void *data) 1988 static void dasd_return_cqr_cb(struct dasd_ccw_req *cqr, void *data)
1989 { 1989 {
1990 dasd_schedule_block_bh(cqr->block); 1990 dasd_schedule_block_bh(cqr->block);
1991 } 1991 }
1992 1992
1993 static void __dasd_block_start_head(struct dasd_block *block) 1993 static void __dasd_block_start_head(struct dasd_block *block)
1994 { 1994 {
1995 struct dasd_ccw_req *cqr; 1995 struct dasd_ccw_req *cqr;
1996 1996
1997 if (list_empty(&block->ccw_queue)) 1997 if (list_empty(&block->ccw_queue))
1998 return; 1998 return;
1999 /* We always begin with the first requests on the queue, as some 1999 /* We always begin with the first requests on the queue, as some
2000 * of the previously started requests have to be enqueued on a 2000 * of the previously started requests have to be enqueued on a
2001 * dasd_device again for error recovery. 2001 * dasd_device again for error recovery.
2002 */ 2002 */
2003 list_for_each_entry(cqr, &block->ccw_queue, blocklist) { 2003 list_for_each_entry(cqr, &block->ccw_queue, blocklist) {
2004 if (cqr->status != DASD_CQR_FILLED) 2004 if (cqr->status != DASD_CQR_FILLED)
2005 continue; 2005 continue;
2006 /* Non-temporary stop condition will trigger fail fast */ 2006 /* Non-temporary stop condition will trigger fail fast */
2007 if (block->base->stopped & ~DASD_STOPPED_PENDING && 2007 if (block->base->stopped & ~DASD_STOPPED_PENDING &&
2008 test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) && 2008 test_bit(DASD_CQR_FLAGS_FAILFAST, &cqr->flags) &&
2009 (!dasd_eer_enabled(block->base))) { 2009 (!dasd_eer_enabled(block->base))) {
2010 cqr->status = DASD_CQR_FAILED; 2010 cqr->status = DASD_CQR_FAILED;
2011 dasd_schedule_block_bh(block); 2011 dasd_schedule_block_bh(block);
2012 continue; 2012 continue;
2013 } 2013 }
2014 /* Don't try to start requests if device is stopped */ 2014 /* Don't try to start requests if device is stopped */
2015 if (block->base->stopped) 2015 if (block->base->stopped)
2016 return; 2016 return;
2017 2017
2018 /* just a fail safe check, should not happen */ 2018 /* just a fail safe check, should not happen */
2019 if (!cqr->startdev) 2019 if (!cqr->startdev)
2020 cqr->startdev = block->base; 2020 cqr->startdev = block->base;
2021 2021
2022 /* make sure that the requests we submit find their way back */ 2022 /* make sure that the requests we submit find their way back */
2023 cqr->callback = dasd_return_cqr_cb; 2023 cqr->callback = dasd_return_cqr_cb;
2024 2024
2025 dasd_add_request_tail(cqr); 2025 dasd_add_request_tail(cqr);
2026 } 2026 }
2027 } 2027 }
2028 2028
2029 /* 2029 /*
2030 * Central dasd_block layer routine. Takes requests from the generic 2030 * Central dasd_block layer routine. Takes requests from the generic
2031 * block layer request queue, creates ccw requests, enqueues them on 2031 * block layer request queue, creates ccw requests, enqueues them on
2032 * a dasd_device and processes ccw requests that have been returned. 2032 * a dasd_device and processes ccw requests that have been returned.
2033 */ 2033 */
2034 static void dasd_block_tasklet(struct dasd_block *block) 2034 static void dasd_block_tasklet(struct dasd_block *block)
2035 { 2035 {
2036 struct list_head final_queue; 2036 struct list_head final_queue;
2037 struct list_head *l, *n; 2037 struct list_head *l, *n;
2038 struct dasd_ccw_req *cqr; 2038 struct dasd_ccw_req *cqr;
2039 2039
2040 atomic_set(&block->tasklet_scheduled, 0); 2040 atomic_set(&block->tasklet_scheduled, 0);
2041 INIT_LIST_HEAD(&final_queue); 2041 INIT_LIST_HEAD(&final_queue);
2042 spin_lock(&block->queue_lock); 2042 spin_lock(&block->queue_lock);
2043 /* Finish off requests on ccw queue */ 2043 /* Finish off requests on ccw queue */
2044 __dasd_process_block_ccw_queue(block, &final_queue); 2044 __dasd_process_block_ccw_queue(block, &final_queue);
2045 spin_unlock(&block->queue_lock); 2045 spin_unlock(&block->queue_lock);
2046 /* Now call the callback function of requests with final status */ 2046 /* Now call the callback function of requests with final status */
2047 spin_lock_irq(&block->request_queue_lock); 2047 spin_lock_irq(&block->request_queue_lock);
2048 list_for_each_safe(l, n, &final_queue) { 2048 list_for_each_safe(l, n, &final_queue) {
2049 cqr = list_entry(l, struct dasd_ccw_req, blocklist); 2049 cqr = list_entry(l, struct dasd_ccw_req, blocklist);
2050 list_del_init(&cqr->blocklist); 2050 list_del_init(&cqr->blocklist);
2051 __dasd_cleanup_cqr(cqr); 2051 __dasd_cleanup_cqr(cqr);
2052 } 2052 }
2053 spin_lock(&block->queue_lock); 2053 spin_lock(&block->queue_lock);
2054 /* Get new request from the block device request queue */ 2054 /* Get new request from the block device request queue */
2055 __dasd_process_request_queue(block); 2055 __dasd_process_request_queue(block);
2056 /* Now check if the head of the ccw queue needs to be started. */ 2056 /* Now check if the head of the ccw queue needs to be started. */
2057 __dasd_block_start_head(block); 2057 __dasd_block_start_head(block);
2058 spin_unlock(&block->queue_lock); 2058 spin_unlock(&block->queue_lock);
2059 spin_unlock_irq(&block->request_queue_lock); 2059 spin_unlock_irq(&block->request_queue_lock);
2060 dasd_put_device(block->base); 2060 dasd_put_device(block->base);
2061 } 2061 }
2062 2062
2063 static void _dasd_wake_block_flush_cb(struct dasd_ccw_req *cqr, void *data) 2063 static void _dasd_wake_block_flush_cb(struct dasd_ccw_req *cqr, void *data)
2064 { 2064 {
2065 wake_up(&dasd_flush_wq); 2065 wake_up(&dasd_flush_wq);
2066 } 2066 }
2067 2067
2068 /* 2068 /*
2069 * Go through all requests on the dasd_block request queue, cancel them 2069 * Go through all requests on the dasd_block request queue, cancel them
2070 * on the respective dasd_device, and return them to the generic 2070 * on the respective dasd_device, and return them to the generic
2071 * block layer. 2071 * block layer.
2072 */ 2072 */
2073 static int dasd_flush_block_queue(struct dasd_block *block) 2073 static int dasd_flush_block_queue(struct dasd_block *block)
2074 { 2074 {
2075 struct dasd_ccw_req *cqr, *n; 2075 struct dasd_ccw_req *cqr, *n;
2076 int rc, i; 2076 int rc, i;
2077 struct list_head flush_queue; 2077 struct list_head flush_queue;
2078 2078
2079 INIT_LIST_HEAD(&flush_queue); 2079 INIT_LIST_HEAD(&flush_queue);
2080 spin_lock_bh(&block->queue_lock); 2080 spin_lock_bh(&block->queue_lock);
2081 rc = 0; 2081 rc = 0;
2082 restart: 2082 restart:
2083 list_for_each_entry_safe(cqr, n, &block->ccw_queue, blocklist) { 2083 list_for_each_entry_safe(cqr, n, &block->ccw_queue, blocklist) {
2084 /* if this request is currently owned by a dasd_device, cancel it */ 2084 /* if this request is currently owned by a dasd_device, cancel it */
2085 if (cqr->status >= DASD_CQR_QUEUED) 2085 if (cqr->status >= DASD_CQR_QUEUED)
2086 rc = dasd_cancel_req(cqr); 2086 rc = dasd_cancel_req(cqr);
2087 if (rc < 0) 2087 if (rc < 0)
2088 break; 2088 break;
2089 /* Rechain request (including erp chain) so it won't be 2089 /* Rechain request (including erp chain) so it won't be
2090 * touched by the dasd_block_tasklet anymore. 2090 * touched by the dasd_block_tasklet anymore.
2091 * Replace the callback so we notice when the request 2091 * Replace the callback so we notice when the request
2092 * is returned from the dasd_device layer. 2092 * is returned from the dasd_device layer.
2093 */ 2093 */
2094 cqr->callback = _dasd_wake_block_flush_cb; 2094 cqr->callback = _dasd_wake_block_flush_cb;
2095 for (i = 0; cqr != NULL; cqr = cqr->refers, i++) 2095 for (i = 0; cqr != NULL; cqr = cqr->refers, i++)
2096 list_move_tail(&cqr->blocklist, &flush_queue); 2096 list_move_tail(&cqr->blocklist, &flush_queue);
2097 if (i > 1) 2097 if (i > 1)
2098 /* moved more than one request - need to restart */ 2098 /* moved more than one request - need to restart */
2099 goto restart; 2099 goto restart;
2100 } 2100 }
2101 spin_unlock_bh(&block->queue_lock); 2101 spin_unlock_bh(&block->queue_lock);
2102 /* Now call the callback function of flushed requests */ 2102 /* Now call the callback function of flushed requests */
2103 restart_cb: 2103 restart_cb:
2104 list_for_each_entry_safe(cqr, n, &flush_queue, blocklist) { 2104 list_for_each_entry_safe(cqr, n, &flush_queue, blocklist) {
2105 wait_event(dasd_flush_wq, (cqr->status < DASD_CQR_QUEUED)); 2105 wait_event(dasd_flush_wq, (cqr->status < DASD_CQR_QUEUED));
2106 /* Process finished ERP request. */ 2106 /* Process finished ERP request. */
2107 if (cqr->refers) { 2107 if (cqr->refers) {
2108 spin_lock_bh(&block->queue_lock); 2108 spin_lock_bh(&block->queue_lock);
2109 __dasd_process_erp(block->base, cqr); 2109 __dasd_process_erp(block->base, cqr);
2110 spin_unlock_bh(&block->queue_lock); 2110 spin_unlock_bh(&block->queue_lock);
2111 /* restart list_for_xx loop since dasd_process_erp 2111 /* restart list_for_xx loop since dasd_process_erp
2112 * might remove multiple elements */ 2112 * might remove multiple elements */
2113 goto restart_cb; 2113 goto restart_cb;
2114 } 2114 }
2115 /* call the callback function */ 2115 /* call the callback function */
2116 spin_lock_irq(&block->request_queue_lock); 2116 spin_lock_irq(&block->request_queue_lock);
2117 cqr->endclk = get_clock(); 2117 cqr->endclk = get_clock();
2118 list_del_init(&cqr->blocklist); 2118 list_del_init(&cqr->blocklist);
2119 __dasd_cleanup_cqr(cqr); 2119 __dasd_cleanup_cqr(cqr);
2120 spin_unlock_irq(&block->request_queue_lock); 2120 spin_unlock_irq(&block->request_queue_lock);
2121 } 2121 }
2122 return rc; 2122 return rc;
2123 } 2123 }
2124 2124
2125 /* 2125 /*
2126 * Schedules a call to dasd_block_tasklet over the block's tasklet. 2126 * Schedules a call to dasd_block_tasklet over the block's tasklet.
2127 */ 2127 */
2128 void dasd_schedule_block_bh(struct dasd_block *block) 2128 void dasd_schedule_block_bh(struct dasd_block *block)
2129 { 2129 {
2130 /* Protect against rescheduling. */ 2130 /* Protect against rescheduling. */
2131 if (atomic_cmpxchg(&block->tasklet_scheduled, 0, 1) != 0) 2131 if (atomic_cmpxchg(&block->tasklet_scheduled, 0, 1) != 0)
2132 return; 2132 return;
2133 /* life cycle of block is bound to its base device */ 2133 /* life cycle of block is bound to its base device */
2134 dasd_get_device(block->base); 2134 dasd_get_device(block->base);
2135 tasklet_hi_schedule(&block->tasklet); 2135 tasklet_hi_schedule(&block->tasklet);
2136 } 2136 }
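/*
 * Hedged sketch (assumed wiring, not shown in this hunk): the tasklet that
 * dasd_schedule_block_bh() raises is expected to have been initialized when
 * the dasd_block was set up, roughly along these lines:
 */
tasklet_init(&block->tasklet,
	     (void (*)(unsigned long)) dasd_block_tasklet,
	     (unsigned long) block);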
2137 2137
2138 2138
2139 /* 2139 /*
2140 * SECTION: external block device operations 2140 * SECTION: external block device operations
2141 * (request queue handling, open, release, etc.) 2141 * (request queue handling, open, release, etc.)
2142 */ 2142 */
2143 2143
2144 /* 2144 /*
2145 * Dasd request queue function. Called from ll_rw_blk.c 2145 * Dasd request queue function. Called from ll_rw_blk.c
2146 */ 2146 */
2147 static void do_dasd_request(struct request_queue *queue) 2147 static void do_dasd_request(struct request_queue *queue)
2148 { 2148 {
2149 struct dasd_block *block; 2149 struct dasd_block *block;
2150 2150
2151 block = queue->queuedata; 2151 block = queue->queuedata;
2152 spin_lock(&block->queue_lock); 2152 spin_lock(&block->queue_lock);
2153 /* Get new request from the block device request queue */ 2153 /* Get new request from the block device request queue */
2154 __dasd_process_request_queue(block); 2154 __dasd_process_request_queue(block);
2155 /* Now check if the head of the ccw queue needs to be started. */ 2155 /* Now check if the head of the ccw queue needs to be started. */
2156 __dasd_block_start_head(block); 2156 __dasd_block_start_head(block);
2157 spin_unlock(&block->queue_lock); 2157 spin_unlock(&block->queue_lock);
2158 } 2158 }
2159 2159
2160 /* 2160 /*
2161 * Allocate and initialize request queue and default I/O scheduler. 2161 * Allocate and initialize request queue and default I/O scheduler.
2162 */ 2162 */
2163 static int dasd_alloc_queue(struct dasd_block *block) 2163 static int dasd_alloc_queue(struct dasd_block *block)
2164 { 2164 {
2165 int rc; 2165 int rc;
2166 2166
2167 block->request_queue = blk_init_queue(do_dasd_request, 2167 block->request_queue = blk_init_queue(do_dasd_request,
2168 &block->request_queue_lock); 2168 &block->request_queue_lock);
2169 if (block->request_queue == NULL) 2169 if (block->request_queue == NULL)
2170 return -ENOMEM; 2170 return -ENOMEM;
2171 2171
2172 block->request_queue->queuedata = block; 2172 block->request_queue->queuedata = block;
2173 2173
2174 elevator_exit(block->request_queue->elevator); 2174 elevator_exit(block->request_queue->elevator);
2175 block->request_queue->elevator = NULL; 2175 block->request_queue->elevator = NULL;
2176 rc = elevator_init(block->request_queue, "deadline"); 2176 rc = elevator_init(block->request_queue, "deadline");
2177 if (rc) { 2177 if (rc) {
2178 blk_cleanup_queue(block->request_queue); 2178 blk_cleanup_queue(block->request_queue);
2179 return rc; 2179 return rc;
2180 } 2180 }
2181 return 0; 2181 return 0;
2182 } 2182 }
2183 2183
2184 /* 2184 /*
2185 * Allocate and initialize request queue. 2185 * Allocate and initialize request queue.
2186 */ 2186 */
2187 static void dasd_setup_queue(struct dasd_block *block) 2187 static void dasd_setup_queue(struct dasd_block *block)
2188 { 2188 {
2189 int max; 2189 int max;
2190 2190
2191 blk_queue_logical_block_size(block->request_queue, block->bp_block); 2191 blk_queue_logical_block_size(block->request_queue, block->bp_block);
2192 max = block->base->discipline->max_blocks << block->s2b_shift; 2192 max = block->base->discipline->max_blocks << block->s2b_shift;
2193 blk_queue_max_hw_sectors(block->request_queue, max); 2193 blk_queue_max_hw_sectors(block->request_queue, max);
2194 blk_queue_max_segments(block->request_queue, -1L); 2194 blk_queue_max_segments(block->request_queue, -1L);
2195 /* with page-sized segments we can translate each segment into 2195 /* with page-sized segments we can translate each segment into
2196 * one idaw/tidaw 2196 * one idaw/tidaw
2197 */ 2197 */
2198 blk_queue_max_segment_size(block->request_queue, PAGE_SIZE); 2198 blk_queue_max_segment_size(block->request_queue, PAGE_SIZE);
2199 blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1); 2199 blk_queue_segment_boundary(block->request_queue, PAGE_SIZE - 1);
2200 blk_queue_ordered(block->request_queue, QUEUE_ORDERED_DRAIN);
2201 } 2200 }
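/*
 * Hedged sketch (illustrative only, not part of this commit): DASD does not
 * advertise a volatile write cache, so the blk_queue_ordered() call removed
 * above needs no counterpart in dasd_setup_queue().  A driver whose device
 * does cache writes would now describe its capabilities with the replacement
 * helper on its own request queue q (name assumed here), e.g.:
 */
blk_queue_flush(q, REQ_FLUSH);			/* flush-capable write cache */
blk_queue_flush(q, REQ_FLUSH | REQ_FUA);	/* or: flush-capable and FUA-capable */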
2202 2201
2203 /* 2202 /*
2204 * Deactivate and free request queue. 2203 * Deactivate and free request queue.
2205 */ 2204 */
2206 static void dasd_free_queue(struct dasd_block *block) 2205 static void dasd_free_queue(struct dasd_block *block)
2207 { 2206 {
2208 if (block->request_queue) { 2207 if (block->request_queue) {
2209 blk_cleanup_queue(block->request_queue); 2208 blk_cleanup_queue(block->request_queue);
2210 block->request_queue = NULL; 2209 block->request_queue = NULL;
2211 } 2210 }
2212 } 2211 }
2213 2212
2214 /* 2213 /*
2215 * Flush request on the request queue. 2214 * Flush request on the request queue.
2216 */ 2215 */
2217 static void dasd_flush_request_queue(struct dasd_block *block) 2216 static void dasd_flush_request_queue(struct dasd_block *block)
2218 { 2217 {
2219 struct request *req; 2218 struct request *req;
2220 2219
2221 if (!block->request_queue) 2220 if (!block->request_queue)
2222 return; 2221 return;
2223 2222
2224 spin_lock_irq(&block->request_queue_lock); 2223 spin_lock_irq(&block->request_queue_lock);
2225 while ((req = blk_fetch_request(block->request_queue))) 2224 while ((req = blk_fetch_request(block->request_queue)))
2226 __blk_end_request_all(req, -EIO); 2225 __blk_end_request_all(req, -EIO);
2227 spin_unlock_irq(&block->request_queue_lock); 2226 spin_unlock_irq(&block->request_queue_lock);
2228 } 2227 }
2229 2228
2230 static int dasd_open(struct block_device *bdev, fmode_t mode) 2229 static int dasd_open(struct block_device *bdev, fmode_t mode)
2231 { 2230 {
2232 struct dasd_block *block = bdev->bd_disk->private_data; 2231 struct dasd_block *block = bdev->bd_disk->private_data;
2233 struct dasd_device *base; 2232 struct dasd_device *base;
2234 int rc; 2233 int rc;
2235 2234
2236 if (!block) 2235 if (!block)
2237 return -ENODEV; 2236 return -ENODEV;
2238 2237
2239 lock_kernel(); 2238 lock_kernel();
2240 base = block->base; 2239 base = block->base;
2241 atomic_inc(&block->open_count); 2240 atomic_inc(&block->open_count);
2242 if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) { 2241 if (test_bit(DASD_FLAG_OFFLINE, &base->flags)) {
2243 rc = -ENODEV; 2242 rc = -ENODEV;
2244 goto unlock; 2243 goto unlock;
2245 } 2244 }
2246 2245
2247 if (!try_module_get(base->discipline->owner)) { 2246 if (!try_module_get(base->discipline->owner)) {
2248 rc = -EINVAL; 2247 rc = -EINVAL;
2249 goto unlock; 2248 goto unlock;
2250 } 2249 }
2251 2250
2252 if (dasd_probeonly) { 2251 if (dasd_probeonly) {
2253 dev_info(&base->cdev->dev, 2252 dev_info(&base->cdev->dev,
2254 "Accessing the DASD failed because it is in " 2253 "Accessing the DASD failed because it is in "
2255 "probeonly mode\n"); 2254 "probeonly mode\n");
2256 rc = -EPERM; 2255 rc = -EPERM;
2257 goto out; 2256 goto out;
2258 } 2257 }
2259 2258
2260 if (base->state <= DASD_STATE_BASIC) { 2259 if (base->state <= DASD_STATE_BASIC) {
2261 DBF_DEV_EVENT(DBF_ERR, base, " %s", 2260 DBF_DEV_EVENT(DBF_ERR, base, " %s",
2262 " Cannot open unrecognized device"); 2261 " Cannot open unrecognized device");
2263 rc = -ENODEV; 2262 rc = -ENODEV;
2264 goto out; 2263 goto out;
2265 } 2264 }
2266 2265
2267 if ((mode & FMODE_WRITE) && 2266 if ((mode & FMODE_WRITE) &&
2268 (test_bit(DASD_FLAG_DEVICE_RO, &base->flags) || 2267 (test_bit(DASD_FLAG_DEVICE_RO, &base->flags) ||
2269 (base->features & DASD_FEATURE_READONLY))) { 2268 (base->features & DASD_FEATURE_READONLY))) {
2270 rc = -EROFS; 2269 rc = -EROFS;
2271 goto out; 2270 goto out;
2272 } 2271 }
2273 2272
2274 unlock_kernel(); 2273 unlock_kernel();
2275 return 0; 2274 return 0;
2276 2275
2277 out: 2276 out:
2278 module_put(base->discipline->owner); 2277 module_put(base->discipline->owner);
2279 unlock: 2278 unlock:
2280 atomic_dec(&block->open_count); 2279 atomic_dec(&block->open_count);
2281 unlock_kernel(); 2280 unlock_kernel();
2282 return rc; 2281 return rc;
2283 } 2282 }
2284 2283
2285 static int dasd_release(struct gendisk *disk, fmode_t mode) 2284 static int dasd_release(struct gendisk *disk, fmode_t mode)
2286 { 2285 {
2287 struct dasd_block *block = disk->private_data; 2286 struct dasd_block *block = disk->private_data;
2288 2287
2289 lock_kernel(); 2288 lock_kernel();
2290 atomic_dec(&block->open_count); 2289 atomic_dec(&block->open_count);
2291 module_put(block->base->discipline->owner); 2290 module_put(block->base->discipline->owner);
2292 unlock_kernel(); 2291 unlock_kernel();
2293 return 0; 2292 return 0;
2294 } 2293 }
2295 2294
2296 /* 2295 /*
2297 * Return disk geometry. 2296 * Return disk geometry.
2298 */ 2297 */
2299 static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) 2298 static int dasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
2300 { 2299 {
2301 struct dasd_block *block; 2300 struct dasd_block *block;
2302 struct dasd_device *base; 2301 struct dasd_device *base;
2303 2302
2304 block = bdev->bd_disk->private_data; 2303 block = bdev->bd_disk->private_data;
2305 if (!block) 2304 if (!block)
2306 return -ENODEV; 2305 return -ENODEV;
2307 base = block->base; 2306 base = block->base;
2308 2307
2309 if (!base->discipline || 2308 if (!base->discipline ||
2310 !base->discipline->fill_geometry) 2309 !base->discipline->fill_geometry)
2311 return -EINVAL; 2310 return -EINVAL;
2312 2311
2313 base->discipline->fill_geometry(block, geo); 2312 base->discipline->fill_geometry(block, geo);
2314 geo->start = get_start_sect(bdev) >> block->s2b_shift; 2313 geo->start = get_start_sect(bdev) >> block->s2b_shift;
2315 return 0; 2314 return 0;
2316 } 2315 }
2317 2316
2318 const struct block_device_operations 2317 const struct block_device_operations
2319 dasd_device_operations = { 2318 dasd_device_operations = {
2320 .owner = THIS_MODULE, 2319 .owner = THIS_MODULE,
2321 .open = dasd_open, 2320 .open = dasd_open,
2322 .release = dasd_release, 2321 .release = dasd_release,
2323 .ioctl = dasd_ioctl, 2322 .ioctl = dasd_ioctl,
2324 .compat_ioctl = dasd_ioctl, 2323 .compat_ioctl = dasd_ioctl,
2325 .getgeo = dasd_getgeo, 2324 .getgeo = dasd_getgeo,
2326 }; 2325 };
2327 2326
2328 /******************************************************************************* 2327 /*******************************************************************************
2329 * end of block device operations 2328 * end of block device operations
2330 */ 2329 */
2331 2330
2332 static void 2331 static void
2333 dasd_exit(void) 2332 dasd_exit(void)
2334 { 2333 {
2335 #ifdef CONFIG_PROC_FS 2334 #ifdef CONFIG_PROC_FS
2336 dasd_proc_exit(); 2335 dasd_proc_exit();
2337 #endif 2336 #endif
2338 dasd_eer_exit(); 2337 dasd_eer_exit();
2339 if (dasd_page_cache != NULL) { 2338 if (dasd_page_cache != NULL) {
2340 kmem_cache_destroy(dasd_page_cache); 2339 kmem_cache_destroy(dasd_page_cache);
2341 dasd_page_cache = NULL; 2340 dasd_page_cache = NULL;
2342 } 2341 }
2343 dasd_gendisk_exit(); 2342 dasd_gendisk_exit();
2344 dasd_devmap_exit(); 2343 dasd_devmap_exit();
2345 if (dasd_debug_area != NULL) { 2344 if (dasd_debug_area != NULL) {
2346 debug_unregister(dasd_debug_area); 2345 debug_unregister(dasd_debug_area);
2347 dasd_debug_area = NULL; 2346 dasd_debug_area = NULL;
2348 } 2347 }
2349 } 2348 }
2350 2349
2351 /* 2350 /*
2352 * SECTION: common functions for ccw_driver use 2351 * SECTION: common functions for ccw_driver use
2353 */ 2352 */
2354 2353
2355 /* 2354 /*
2356 * Is the device read-only? 2355 * Is the device read-only?
2357 * Note that this function does not report the setting of the 2356 * Note that this function does not report the setting of the
2358 * readonly device attribute, but how it is configured in z/VM. 2357 * readonly device attribute, but how it is configured in z/VM.
2359 */ 2358 */
2360 int dasd_device_is_ro(struct dasd_device *device) 2359 int dasd_device_is_ro(struct dasd_device *device)
2361 { 2360 {
2362 struct ccw_dev_id dev_id; 2361 struct ccw_dev_id dev_id;
2363 struct diag210 diag_data; 2362 struct diag210 diag_data;
2364 int rc; 2363 int rc;
2365 2364
2366 if (!MACHINE_IS_VM) 2365 if (!MACHINE_IS_VM)
2367 return 0; 2366 return 0;
2368 ccw_device_get_id(device->cdev, &dev_id); 2367 ccw_device_get_id(device->cdev, &dev_id);
2369 memset(&diag_data, 0, sizeof(diag_data)); 2368 memset(&diag_data, 0, sizeof(diag_data));
2370 diag_data.vrdcdvno = dev_id.devno; 2369 diag_data.vrdcdvno = dev_id.devno;
2371 diag_data.vrdclen = sizeof(diag_data); 2370 diag_data.vrdclen = sizeof(diag_data);
2372 rc = diag210(&diag_data); 2371 rc = diag210(&diag_data);
2373 if (rc == 0 || rc == 2) { 2372 if (rc == 0 || rc == 2) {
2374 return diag_data.vrdcvfla & 0x80; 2373 return diag_data.vrdcvfla & 0x80;
2375 } else { 2374 } else {
2376 DBF_EVENT(DBF_WARNING, "diag210 failed for dev=%04x with rc=%d", 2375 DBF_EVENT(DBF_WARNING, "diag210 failed for dev=%04x with rc=%d",
2377 dev_id.devno, rc); 2376 dev_id.devno, rc);
2378 return 0; 2377 return 0;
2379 } 2378 }
2380 } 2379 }
2381 EXPORT_SYMBOL_GPL(dasd_device_is_ro); 2380 EXPORT_SYMBOL_GPL(dasd_device_is_ro);
2382 2381
2383 static void dasd_generic_auto_online(void *data, async_cookie_t cookie) 2382 static void dasd_generic_auto_online(void *data, async_cookie_t cookie)
2384 { 2383 {
2385 struct ccw_device *cdev = data; 2384 struct ccw_device *cdev = data;
2386 int ret; 2385 int ret;
2387 2386
2388 ret = ccw_device_set_online(cdev); 2387 ret = ccw_device_set_online(cdev);
2389 if (ret) 2388 if (ret)
2390 pr_warning("%s: Setting the DASD online failed with rc=%d\n", 2389 pr_warning("%s: Setting the DASD online failed with rc=%d\n",
2391 dev_name(&cdev->dev), ret); 2390 dev_name(&cdev->dev), ret);
2392 } 2391 }
2393 2392
2394 /* 2393 /*
2395 * Initial attempt at a probe function. This can be simplified once 2394 * Initial attempt at a probe function. This can be simplified once
2396 * the other detection code is gone. 2395 * the other detection code is gone.
2397 */ 2396 */
2398 int dasd_generic_probe(struct ccw_device *cdev, 2397 int dasd_generic_probe(struct ccw_device *cdev,
2399 struct dasd_discipline *discipline) 2398 struct dasd_discipline *discipline)
2400 { 2399 {
2401 int ret; 2400 int ret;
2402 2401
2403 ret = dasd_add_sysfs_files(cdev); 2402 ret = dasd_add_sysfs_files(cdev);
2404 if (ret) { 2403 if (ret) {
2405 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s", 2404 DBF_EVENT_DEVID(DBF_WARNING, cdev, "%s",
2406 "dasd_generic_probe: could not add " 2405 "dasd_generic_probe: could not add "
2407 "sysfs entries"); 2406 "sysfs entries");
2408 return ret; 2407 return ret;
2409 } 2408 }
2410 cdev->handler = &dasd_int_handler; 2409 cdev->handler = &dasd_int_handler;
2411 2410
2412 /* 2411 /*
2413 * Automatically online either all dasd devices (dasd_autodetect) 2412 * Automatically online either all dasd devices (dasd_autodetect)
2414 * or all devices specified with dasd= parameters during 2413 * or all devices specified with dasd= parameters during
2415 * initial probe. 2414 * initial probe.
2416 */ 2415 */
2417 if ((dasd_get_feature(cdev, DASD_FEATURE_INITIAL_ONLINE) > 0 ) || 2416 if ((dasd_get_feature(cdev, DASD_FEATURE_INITIAL_ONLINE) > 0 ) ||
2418 (dasd_autodetect && dasd_busid_known(dev_name(&cdev->dev)) != 0)) 2417 (dasd_autodetect && dasd_busid_known(dev_name(&cdev->dev)) != 0))
2419 async_schedule(dasd_generic_auto_online, cdev); 2418 async_schedule(dasd_generic_auto_online, cdev);
2420 return 0; 2419 return 0;
2421 } 2420 }
2422 2421
2423 /* 2422 /*
2424 * This will one day be called from a global not_oper handler. 2423 * This will one day be called from a global not_oper handler.
2425 * It is also used by driver_unregister during module unload. 2424 * It is also used by driver_unregister during module unload.
2426 */ 2425 */
2427 void dasd_generic_remove(struct ccw_device *cdev) 2426 void dasd_generic_remove(struct ccw_device *cdev)
2428 { 2427 {
2429 struct dasd_device *device; 2428 struct dasd_device *device;
2430 struct dasd_block *block; 2429 struct dasd_block *block;
2431 2430
2432 cdev->handler = NULL; 2431 cdev->handler = NULL;
2433 2432
2434 dasd_remove_sysfs_files(cdev); 2433 dasd_remove_sysfs_files(cdev);
2435 device = dasd_device_from_cdev(cdev); 2434 device = dasd_device_from_cdev(cdev);
2436 if (IS_ERR(device)) 2435 if (IS_ERR(device))
2437 return; 2436 return;
2438 if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) { 2437 if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) {
2439 /* Already doing offline processing */ 2438 /* Already doing offline processing */
2440 dasd_put_device(device); 2439 dasd_put_device(device);
2441 return; 2440 return;
2442 } 2441 }
2443 /* 2442 /*
2444 * This device is removed unconditionally. Set offline 2443 * This device is removed unconditionally. Set offline
2445 * flag to prevent dasd_open from opening it while it is 2444 * flag to prevent dasd_open from opening it while it is
2446 * not quite down yet. 2445 * not quite down yet.
2447 */ 2446 */
2448 dasd_set_target_state(device, DASD_STATE_NEW); 2447 dasd_set_target_state(device, DASD_STATE_NEW);
2449 /* dasd_delete_device destroys the device reference. */ 2448 /* dasd_delete_device destroys the device reference. */
2450 block = device->block; 2449 block = device->block;
2451 device->block = NULL; 2450 device->block = NULL;
2452 dasd_delete_device(device); 2451 dasd_delete_device(device);
2453 /* 2452 /*
2454 * life cycle of block is bound to device, so delete it after 2453 * life cycle of block is bound to device, so delete it after
2455 * device was safely removed 2454 * device was safely removed
2456 */ 2455 */
2457 if (block) 2456 if (block)
2458 dasd_free_block(block); 2457 dasd_free_block(block);
2459 } 2458 }
2460 2459
2461 /* 2460 /*
2462 * Activate a device. This is called from dasd_{eckd,fba}_probe() when either 2461 * Activate a device. This is called from dasd_{eckd,fba}_probe() when either
2463 * the device is detected for the first time and is supposed to be used 2462 * the device is detected for the first time and is supposed to be used
2464 * or the user has started activation through sysfs. 2463 * or the user has started activation through sysfs.
2465 */ 2464 */
2466 int dasd_generic_set_online(struct ccw_device *cdev, 2465 int dasd_generic_set_online(struct ccw_device *cdev,
2467 struct dasd_discipline *base_discipline) 2466 struct dasd_discipline *base_discipline)
2468 { 2467 {
2469 struct dasd_discipline *discipline; 2468 struct dasd_discipline *discipline;
2470 struct dasd_device *device; 2469 struct dasd_device *device;
2471 int rc; 2470 int rc;
2472 2471
2473 /* first online clears initial online feature flag */ 2472 /* first online clears initial online feature flag */
2474 dasd_set_feature(cdev, DASD_FEATURE_INITIAL_ONLINE, 0); 2473 dasd_set_feature(cdev, DASD_FEATURE_INITIAL_ONLINE, 0);
2475 device = dasd_create_device(cdev); 2474 device = dasd_create_device(cdev);
2476 if (IS_ERR(device)) 2475 if (IS_ERR(device))
2477 return PTR_ERR(device); 2476 return PTR_ERR(device);
2478 2477
2479 discipline = base_discipline; 2478 discipline = base_discipline;
2480 if (device->features & DASD_FEATURE_USEDIAG) { 2479 if (device->features & DASD_FEATURE_USEDIAG) {
2481 if (!dasd_diag_discipline_pointer) { 2480 if (!dasd_diag_discipline_pointer) {
2482 pr_warning("%s Setting the DASD online failed because " 2481 pr_warning("%s Setting the DASD online failed because "
2483 "of missing DIAG discipline\n", 2482 "of missing DIAG discipline\n",
2484 dev_name(&cdev->dev)); 2483 dev_name(&cdev->dev));
2485 dasd_delete_device(device); 2484 dasd_delete_device(device);
2486 return -ENODEV; 2485 return -ENODEV;
2487 } 2486 }
2488 discipline = dasd_diag_discipline_pointer; 2487 discipline = dasd_diag_discipline_pointer;
2489 } 2488 }
2490 if (!try_module_get(base_discipline->owner)) { 2489 if (!try_module_get(base_discipline->owner)) {
2491 dasd_delete_device(device); 2490 dasd_delete_device(device);
2492 return -EINVAL; 2491 return -EINVAL;
2493 } 2492 }
2494 if (!try_module_get(discipline->owner)) { 2493 if (!try_module_get(discipline->owner)) {
2495 module_put(base_discipline->owner); 2494 module_put(base_discipline->owner);
2496 dasd_delete_device(device); 2495 dasd_delete_device(device);
2497 return -EINVAL; 2496 return -EINVAL;
2498 } 2497 }
2499 device->base_discipline = base_discipline; 2498 device->base_discipline = base_discipline;
2500 device->discipline = discipline; 2499 device->discipline = discipline;
2501 2500
2502 /* check_device will allocate block device if necessary */ 2501 /* check_device will allocate block device if necessary */
2503 rc = discipline->check_device(device); 2502 rc = discipline->check_device(device);
2504 if (rc) { 2503 if (rc) {
2505 pr_warning("%s Setting the DASD online with discipline %s " 2504 pr_warning("%s Setting the DASD online with discipline %s "
2506 "failed with rc=%i\n", 2505 "failed with rc=%i\n",
2507 dev_name(&cdev->dev), discipline->name, rc); 2506 dev_name(&cdev->dev), discipline->name, rc);
2508 module_put(discipline->owner); 2507 module_put(discipline->owner);
2509 module_put(base_discipline->owner); 2508 module_put(base_discipline->owner);
2510 dasd_delete_device(device); 2509 dasd_delete_device(device);
2511 return rc; 2510 return rc;
2512 } 2511 }
2513 2512
2514 dasd_set_target_state(device, DASD_STATE_ONLINE); 2513 dasd_set_target_state(device, DASD_STATE_ONLINE);
2515 if (device->state <= DASD_STATE_KNOWN) { 2514 if (device->state <= DASD_STATE_KNOWN) {
2516 pr_warning("%s Setting the DASD online failed because of a " 2515 pr_warning("%s Setting the DASD online failed because of a "
2517 "missing discipline\n", dev_name(&cdev->dev)); 2516 "missing discipline\n", dev_name(&cdev->dev));
2518 rc = -ENODEV; 2517 rc = -ENODEV;
2519 dasd_set_target_state(device, DASD_STATE_NEW); 2518 dasd_set_target_state(device, DASD_STATE_NEW);
2520 if (device->block) 2519 if (device->block)
2521 dasd_free_block(device->block); 2520 dasd_free_block(device->block);
2522 dasd_delete_device(device); 2521 dasd_delete_device(device);
2523 } else 2522 } else
2524 pr_debug("dasd_generic device %s found\n", 2523 pr_debug("dasd_generic device %s found\n",
2525 dev_name(&cdev->dev)); 2524 dev_name(&cdev->dev));
2526 2525
2527 wait_event(dasd_init_waitq, _wait_for_device(device)); 2526 wait_event(dasd_init_waitq, _wait_for_device(device));
2528 2527
2529 dasd_put_device(device); 2528 dasd_put_device(device);
2530 return rc; 2529 return rc;
2531 } 2530 }
2532 2531
2533 int dasd_generic_set_offline(struct ccw_device *cdev) 2532 int dasd_generic_set_offline(struct ccw_device *cdev)
2534 { 2533 {
2535 struct dasd_device *device; 2534 struct dasd_device *device;
2536 struct dasd_block *block; 2535 struct dasd_block *block;
2537 int max_count, open_count; 2536 int max_count, open_count;
2538 2537
2539 device = dasd_device_from_cdev(cdev); 2538 device = dasd_device_from_cdev(cdev);
2540 if (IS_ERR(device)) 2539 if (IS_ERR(device))
2541 return PTR_ERR(device); 2540 return PTR_ERR(device);
2542 if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) { 2541 if (test_and_set_bit(DASD_FLAG_OFFLINE, &device->flags)) {
2543 /* Already doing offline processing */ 2542 /* Already doing offline processing */
2544 dasd_put_device(device); 2543 dasd_put_device(device);
2545 return 0; 2544 return 0;
2546 } 2545 }
2547 /* 2546 /*
2548 * We must make sure that this device is currently not in use. 2547 * We must make sure that this device is currently not in use.
2549 * The open_count is increased for every opener; that includes 2548 * The open_count is increased for every opener; that includes
2550 * the blkdev_get in dasd_scan_partitions. We are only interested 2549 * the blkdev_get in dasd_scan_partitions. We are only interested
2551 * in the other openers. 2550 * in the other openers.
2552 */ 2551 */
2553 if (device->block) { 2552 if (device->block) {
2554 max_count = device->block->bdev ? 0 : -1; 2553 max_count = device->block->bdev ? 0 : -1;
2555 open_count = atomic_read(&device->block->open_count); 2554 open_count = atomic_read(&device->block->open_count);
2556 if (open_count > max_count) { 2555 if (open_count > max_count) {
2557 if (open_count > 0) 2556 if (open_count > 0)
2558 pr_warning("%s: The DASD cannot be set offline " 2557 pr_warning("%s: The DASD cannot be set offline "
2559 "with open count %i\n", 2558 "with open count %i\n",
2560 dev_name(&cdev->dev), open_count); 2559 dev_name(&cdev->dev), open_count);
2561 else 2560 else
2562 pr_warning("%s: The DASD cannot be set offline " 2561 pr_warning("%s: The DASD cannot be set offline "
2563 "while it is in use\n", 2562 "while it is in use\n",
2564 dev_name(&cdev->dev)); 2563 dev_name(&cdev->dev));
2565 clear_bit(DASD_FLAG_OFFLINE, &device->flags); 2564 clear_bit(DASD_FLAG_OFFLINE, &device->flags);
2566 dasd_put_device(device); 2565 dasd_put_device(device);
2567 return -EBUSY; 2566 return -EBUSY;
2568 } 2567 }
2569 } 2568 }
2570 dasd_set_target_state(device, DASD_STATE_NEW); 2569 dasd_set_target_state(device, DASD_STATE_NEW);
2571 /* dasd_delete_device destroys the device reference. */ 2570 /* dasd_delete_device destroys the device reference. */
2572 block = device->block; 2571 block = device->block;
2573 device->block = NULL; 2572 device->block = NULL;
2574 dasd_delete_device(device); 2573 dasd_delete_device(device);
2575 /* 2574 /*
2576 * life cycle of block is bound to device, so delete it after 2575 * life cycle of block is bound to device, so delete it after
2577 * device was safely removed 2576 * device was safely removed
2578 */ 2577 */
2579 if (block) 2578 if (block)
2580 dasd_free_block(block); 2579 dasd_free_block(block);
2581 return 0; 2580 return 0;
2582 } 2581 }
2583 2582
2584 int dasd_generic_notify(struct ccw_device *cdev, int event) 2583 int dasd_generic_notify(struct ccw_device *cdev, int event)
2585 { 2584 {
2586 struct dasd_device *device; 2585 struct dasd_device *device;
2587 struct dasd_ccw_req *cqr; 2586 struct dasd_ccw_req *cqr;
2588 int ret; 2587 int ret;
2589 2588
2590 device = dasd_device_from_cdev_locked(cdev); 2589 device = dasd_device_from_cdev_locked(cdev);
2591 if (IS_ERR(device)) 2590 if (IS_ERR(device))
2592 return 0; 2591 return 0;
2593 ret = 0; 2592 ret = 0;
2594 switch (event) { 2593 switch (event) {
2595 case CIO_GONE: 2594 case CIO_GONE:
2596 case CIO_BOXED: 2595 case CIO_BOXED:
2597 case CIO_NO_PATH: 2596 case CIO_NO_PATH:
2598 /* First of all call extended error reporting. */ 2597 /* First of all call extended error reporting. */
2599 dasd_eer_write(device, NULL, DASD_EER_NOPATH); 2598 dasd_eer_write(device, NULL, DASD_EER_NOPATH);
2600 2599
2601 if (device->state < DASD_STATE_BASIC) 2600 if (device->state < DASD_STATE_BASIC)
2602 break; 2601 break;
2603 /* Device is active. We want to keep it. */ 2602 /* Device is active. We want to keep it. */
2604 list_for_each_entry(cqr, &device->ccw_queue, devlist) 2603 list_for_each_entry(cqr, &device->ccw_queue, devlist)
2605 if (cqr->status == DASD_CQR_IN_IO) { 2604 if (cqr->status == DASD_CQR_IN_IO) {
2606 cqr->status = DASD_CQR_QUEUED; 2605 cqr->status = DASD_CQR_QUEUED;
2607 cqr->retries++; 2606 cqr->retries++;
2608 } 2607 }
2609 dasd_device_set_stop_bits(device, DASD_STOPPED_DC_WAIT); 2608 dasd_device_set_stop_bits(device, DASD_STOPPED_DC_WAIT);
2610 dasd_device_clear_timer(device); 2609 dasd_device_clear_timer(device);
2611 dasd_schedule_device_bh(device); 2610 dasd_schedule_device_bh(device);
2612 ret = 1; 2611 ret = 1;
2613 break; 2612 break;
2614 case CIO_OPER: 2613 case CIO_OPER:
2615 /* FIXME: add a sanity check. */ 2614 /* FIXME: add a sanity check. */
2616 dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT); 2615 dasd_device_remove_stop_bits(device, DASD_STOPPED_DC_WAIT);
2617 if (device->stopped & DASD_UNRESUMED_PM) { 2616 if (device->stopped & DASD_UNRESUMED_PM) {
2618 dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM); 2617 dasd_device_remove_stop_bits(device, DASD_UNRESUMED_PM);
2619 dasd_restore_device(device); 2618 dasd_restore_device(device);
2620 ret = 1; 2619 ret = 1;
2621 break; 2620 break;
2622 } 2621 }
2623 dasd_schedule_device_bh(device); 2622 dasd_schedule_device_bh(device);
2624 if (device->block) 2623 if (device->block)
2625 dasd_schedule_block_bh(device->block); 2624 dasd_schedule_block_bh(device->block);
2626 ret = 1; 2625 ret = 1;
2627 break; 2626 break;
2628 } 2627 }
2629 dasd_put_device(device); 2628 dasd_put_device(device);
2630 return ret; 2629 return ret;
2631 } 2630 }
2632 2631
2633 int dasd_generic_pm_freeze(struct ccw_device *cdev) 2632 int dasd_generic_pm_freeze(struct ccw_device *cdev)
2634 { 2633 {
2635 struct dasd_ccw_req *cqr, *n; 2634 struct dasd_ccw_req *cqr, *n;
2636 int rc; 2635 int rc;
2637 struct list_head freeze_queue; 2636 struct list_head freeze_queue;
2638 struct dasd_device *device = dasd_device_from_cdev(cdev); 2637 struct dasd_device *device = dasd_device_from_cdev(cdev);
2639 2638
2640 if (IS_ERR(device)) 2639 if (IS_ERR(device))
2641 return PTR_ERR(device); 2640 return PTR_ERR(device);
2642 /* disallow new I/O */ 2641 /* disallow new I/O */
2643 dasd_device_set_stop_bits(device, DASD_STOPPED_PM); 2642 dasd_device_set_stop_bits(device, DASD_STOPPED_PM);
2644 /* clear active requests */ 2643 /* clear active requests */
2645 INIT_LIST_HEAD(&freeze_queue); 2644 INIT_LIST_HEAD(&freeze_queue);
2646 spin_lock_irq(get_ccwdev_lock(cdev)); 2645 spin_lock_irq(get_ccwdev_lock(cdev));
2647 rc = 0; 2646 rc = 0;
2648 list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) { 2647 list_for_each_entry_safe(cqr, n, &device->ccw_queue, devlist) {
2649 /* Check status and move request to flush_queue */ 2648 /* Check status and move request to flush_queue */
2650 if (cqr->status == DASD_CQR_IN_IO) { 2649 if (cqr->status == DASD_CQR_IN_IO) {
2651 rc = device->discipline->term_IO(cqr); 2650 rc = device->discipline->term_IO(cqr);
2652 if (rc) { 2651 if (rc) {
2653 /* unable to terminate request */ 2652 /* unable to terminate request */
2654 dev_err(&device->cdev->dev, 2653 dev_err(&device->cdev->dev,
2655 "Unable to terminate request %p " 2654 "Unable to terminate request %p "
2656 "on suspend\n", cqr); 2655 "on suspend\n", cqr);
2657 spin_unlock_irq(get_ccwdev_lock(cdev)); 2656 spin_unlock_irq(get_ccwdev_lock(cdev));
2658 dasd_put_device(device); 2657 dasd_put_device(device);
2659 return rc; 2658 return rc;
2660 } 2659 }
2661 } 2660 }
2662 list_move_tail(&cqr->devlist, &freeze_queue); 2661 list_move_tail(&cqr->devlist, &freeze_queue);
2663 } 2662 }
2664 2663
2665 spin_unlock_irq(get_ccwdev_lock(cdev)); 2664 spin_unlock_irq(get_ccwdev_lock(cdev));
2666 2665
2667 list_for_each_entry_safe(cqr, n, &freeze_queue, devlist) { 2666 list_for_each_entry_safe(cqr, n, &freeze_queue, devlist) {
2668 wait_event(dasd_flush_wq, 2667 wait_event(dasd_flush_wq,
2669 (cqr->status != DASD_CQR_CLEAR_PENDING)); 2668 (cqr->status != DASD_CQR_CLEAR_PENDING));
2670 if (cqr->status == DASD_CQR_CLEARED) 2669 if (cqr->status == DASD_CQR_CLEARED)
2671 cqr->status = DASD_CQR_QUEUED; 2670 cqr->status = DASD_CQR_QUEUED;
2672 } 2671 }
2673 /* move freeze_queue to start of the ccw_queue */ 2672 /* move freeze_queue to start of the ccw_queue */
2674 spin_lock_irq(get_ccwdev_lock(cdev)); 2673 spin_lock_irq(get_ccwdev_lock(cdev));
2675 list_splice_tail(&freeze_queue, &device->ccw_queue); 2674 list_splice_tail(&freeze_queue, &device->ccw_queue);
2676 spin_unlock_irq(get_ccwdev_lock(cdev)); 2675 spin_unlock_irq(get_ccwdev_lock(cdev));
2677 2676
2678 if (device->discipline->freeze) 2677 if (device->discipline->freeze)
2679 rc = device->discipline->freeze(device); 2678 rc = device->discipline->freeze(device);
2680 2679
2681 dasd_put_device(device); 2680 dasd_put_device(device);
2682 return rc; 2681 return rc;
2683 } 2682 }
2684 EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze); 2683 EXPORT_SYMBOL_GPL(dasd_generic_pm_freeze);
2685 2684
2686 int dasd_generic_restore_device(struct ccw_device *cdev) 2685 int dasd_generic_restore_device(struct ccw_device *cdev)
2687 { 2686 {
2688 struct dasd_device *device = dasd_device_from_cdev(cdev); 2687 struct dasd_device *device = dasd_device_from_cdev(cdev);
2689 int rc = 0; 2688 int rc = 0;
2690 2689
2691 if (IS_ERR(device)) 2690 if (IS_ERR(device))
2692 return PTR_ERR(device); 2691 return PTR_ERR(device);
2693 2692
2694 /* allow new IO again */ 2693 /* allow new IO again */
2695 dasd_device_remove_stop_bits(device, 2694 dasd_device_remove_stop_bits(device,
2696 (DASD_STOPPED_PM | DASD_UNRESUMED_PM)); 2695 (DASD_STOPPED_PM | DASD_UNRESUMED_PM));
2697 2696
2698 dasd_schedule_device_bh(device); 2697 dasd_schedule_device_bh(device);
2699 2698
2700 /* 2699 /*
2701 * call the discipline restore function; 2700 * call the discipline restore function;
2702 * if the device is stopped, do nothing (e.g. for disconnected devices) 2701 * if the device is stopped, do nothing (e.g. for disconnected devices)
2703 */ 2702 */
2704 if (device->discipline->restore && !(device->stopped)) 2703 if (device->discipline->restore && !(device->stopped))
2705 rc = device->discipline->restore(device); 2704 rc = device->discipline->restore(device);
2706 if (rc || device->stopped) 2705 if (rc || device->stopped)
2707 /* 2706 /*
2708 * if the resume failed for the DASD we put it in 2707 * if the resume failed for the DASD we put it in
2709 * an UNRESUMED stop state 2708 * an UNRESUMED stop state
2710 */ 2709 */
2711 device->stopped |= DASD_UNRESUMED_PM; 2710 device->stopped |= DASD_UNRESUMED_PM;
2712 2711
2713 if (device->block) 2712 if (device->block)
2714 dasd_schedule_block_bh(device->block); 2713 dasd_schedule_block_bh(device->block);
2715 2714
2716 dasd_put_device(device); 2715 dasd_put_device(device);
2717 return 0; 2716 return 0;
2718 } 2717 }
2719 EXPORT_SYMBOL_GPL(dasd_generic_restore_device); 2718 EXPORT_SYMBOL_GPL(dasd_generic_restore_device);
2720 2719
2721 static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device, 2720 static struct dasd_ccw_req *dasd_generic_build_rdc(struct dasd_device *device,
2722 void *rdc_buffer, 2721 void *rdc_buffer,
2723 int rdc_buffer_size, 2722 int rdc_buffer_size,
2724 int magic) 2723 int magic)
2725 { 2724 {
2726 struct dasd_ccw_req *cqr; 2725 struct dasd_ccw_req *cqr;
2727 struct ccw1 *ccw; 2726 struct ccw1 *ccw;
2728 unsigned long *idaw; 2727 unsigned long *idaw;
2729 2728
2730 cqr = dasd_smalloc_request(magic, 1 /* RDC */, rdc_buffer_size, device); 2729 cqr = dasd_smalloc_request(magic, 1 /* RDC */, rdc_buffer_size, device);
2731 2730
2732 if (IS_ERR(cqr)) { 2731 if (IS_ERR(cqr)) {
2733 /* internal error 13 - Allocating the RDC request failed */ 2732 /* internal error 13 - Allocating the RDC request failed */
2734 dev_err(&device->cdev->dev, 2733 dev_err(&device->cdev->dev,
2735 "An error occurred in the DASD device driver, " 2734 "An error occurred in the DASD device driver, "
2736 "reason=%s\n", "13"); 2735 "reason=%s\n", "13");
2737 return cqr; 2736 return cqr;
2738 } 2737 }
2739 2738
2740 ccw = cqr->cpaddr; 2739 ccw = cqr->cpaddr;
2741 ccw->cmd_code = CCW_CMD_RDC; 2740 ccw->cmd_code = CCW_CMD_RDC;
2742 if (idal_is_needed(rdc_buffer, rdc_buffer_size)) { 2741 if (idal_is_needed(rdc_buffer, rdc_buffer_size)) {
2743 idaw = (unsigned long *) (cqr->data); 2742 idaw = (unsigned long *) (cqr->data);
2744 ccw->cda = (__u32)(addr_t) idaw; 2743 ccw->cda = (__u32)(addr_t) idaw;
2745 ccw->flags = CCW_FLAG_IDA; 2744 ccw->flags = CCW_FLAG_IDA;
2746 idaw = idal_create_words(idaw, rdc_buffer, rdc_buffer_size); 2745 idaw = idal_create_words(idaw, rdc_buffer, rdc_buffer_size);
2747 } else { 2746 } else {
2748 ccw->cda = (__u32)(addr_t) rdc_buffer; 2747 ccw->cda = (__u32)(addr_t) rdc_buffer;
2749 ccw->flags = 0; 2748 ccw->flags = 0;
2750 } 2749 }
2751 2750
2752 ccw->count = rdc_buffer_size; 2751 ccw->count = rdc_buffer_size;
2753 cqr->startdev = device; 2752 cqr->startdev = device;
2754 cqr->memdev = device; 2753 cqr->memdev = device;
2755 cqr->expires = 10*HZ; 2754 cqr->expires = 10*HZ;
2756 cqr->retries = 256; 2755 cqr->retries = 256;
2757 cqr->buildclk = get_clock(); 2756 cqr->buildclk = get_clock();
2758 cqr->status = DASD_CQR_FILLED; 2757 cqr->status = DASD_CQR_FILLED;
2759 return cqr; 2758 return cqr;
2760 } 2759 }
2761 2760
2762 2761
2763 int dasd_generic_read_dev_chars(struct dasd_device *device, int magic, 2762 int dasd_generic_read_dev_chars(struct dasd_device *device, int magic,
2764 void *rdc_buffer, int rdc_buffer_size) 2763 void *rdc_buffer, int rdc_buffer_size)
2765 { 2764 {
2766 int ret; 2765 int ret;
2767 struct dasd_ccw_req *cqr; 2766 struct dasd_ccw_req *cqr;
2768 2767
2769 cqr = dasd_generic_build_rdc(device, rdc_buffer, rdc_buffer_size, 2768 cqr = dasd_generic_build_rdc(device, rdc_buffer, rdc_buffer_size,
2770 magic); 2769 magic);
2771 if (IS_ERR(cqr)) 2770 if (IS_ERR(cqr))
2772 return PTR_ERR(cqr); 2771 return PTR_ERR(cqr);
2773 2772
2774 ret = dasd_sleep_on(cqr); 2773 ret = dasd_sleep_on(cqr);
2775 dasd_sfree_request(cqr, cqr->memdev); 2774 dasd_sfree_request(cqr, cqr->memdev);
2776 return ret; 2775 return ret;
2777 } 2776 }
2778 EXPORT_SYMBOL_GPL(dasd_generic_read_dev_chars); 2777 EXPORT_SYMBOL_GPL(dasd_generic_read_dev_chars);
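/*
 * Hedged usage sketch (names below are illustrative assumptions, not this
 * commit's code): a discipline would typically pull the device
 * characteristics into its private data while checking the device:
 */
struct my_private *private = device->private;	/* hypothetical private struct */
int rc;

rc = dasd_generic_read_dev_chars(device, DASD_ECKD_MAGIC,
				 &private->rdc_data,
				 sizeof(private->rdc_data));
if (rc)
	DBF_EVENT_DEVID(DBF_WARNING, device->cdev,
			"Reading device characteristics failed, rc=%d", rc);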
2779 2778
2780 /* 2779 /*
2781 * In command mode and transport mode we need to look for sense 2780 * In command mode and transport mode we need to look for sense
2782 * data in different places. The sense data itself is always 2781 * data in different places. The sense data itself is always
2783 * an array of 32 bytes, so we can unify the sense data access 2782 * an array of 32 bytes, so we can unify the sense data access
2784 * for both modes. 2783 * for both modes.
2785 */ 2784 */
2786 char *dasd_get_sense(struct irb *irb) 2785 char *dasd_get_sense(struct irb *irb)
2787 { 2786 {
2788 struct tsb *tsb = NULL; 2787 struct tsb *tsb = NULL;
2789 char *sense = NULL; 2788 char *sense = NULL;
2790 2789
2791 if (scsw_is_tm(&irb->scsw) && (irb->scsw.tm.fcxs == 0x01)) { 2790 if (scsw_is_tm(&irb->scsw) && (irb->scsw.tm.fcxs == 0x01)) {
2792 if (irb->scsw.tm.tcw) 2791 if (irb->scsw.tm.tcw)
2793 tsb = tcw_get_tsb((struct tcw *)(unsigned long) 2792 tsb = tcw_get_tsb((struct tcw *)(unsigned long)
2794 irb->scsw.tm.tcw); 2793 irb->scsw.tm.tcw);
2795 if (tsb && tsb->length == 64 && tsb->flags) 2794 if (tsb && tsb->length == 64 && tsb->flags)
2796 switch (tsb->flags & 0x07) { 2795 switch (tsb->flags & 0x07) {
2797 case 1: /* tsa_iostat */ 2796 case 1: /* tsa_iostat */
2798 sense = tsb->tsa.iostat.sense; 2797 sense = tsb->tsa.iostat.sense;
2799 break; 2798 break;
2800 case 2: /* tsa_ddpc */ 2799 case 2: /* tsa_ddpc */
2801 sense = tsb->tsa.ddpc.sense; 2800 sense = tsb->tsa.ddpc.sense;
2802 break; 2801 break;
2803 default: 2802 default:
2804 /* currently we don't use interrogate data */ 2803 /* currently we don't use interrogate data */
2805 break; 2804 break;
2806 } 2805 }
2807 } else if (irb->esw.esw0.erw.cons) { 2806 } else if (irb->esw.esw0.erw.cons) {
2808 sense = irb->ecw; 2807 sense = irb->ecw;
2809 } 2808 }
2810 return sense; 2809 return sense;
2811 } 2810 }
2812 EXPORT_SYMBOL_GPL(dasd_get_sense); 2811 EXPORT_SYMBOL_GPL(dasd_get_sense);
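/*
 * Hedged usage sketch (assumed caller, not part of this commit): an interrupt
 * handler can consume the unified sense pointer without caring whether the
 * interrupt came in via command or transport mode:
 */
char *sense = dasd_get_sense(irb);
if (sense)
	/* all 32 sense bytes are valid here, for both modes */
	handle_sense(device, sense);	/* handle_sense() is a hypothetical helper */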
2813 2812
2814 static int __init dasd_init(void) 2813 static int __init dasd_init(void)
2815 { 2814 {
2816 int rc; 2815 int rc;
2817 2816
2818 init_waitqueue_head(&dasd_init_waitq); 2817 init_waitqueue_head(&dasd_init_waitq);
2819 init_waitqueue_head(&dasd_flush_wq); 2818 init_waitqueue_head(&dasd_flush_wq);
2820 init_waitqueue_head(&generic_waitq); 2819 init_waitqueue_head(&generic_waitq);
2821 2820
2822 /* register 'common' DASD debug area, used for all DBF_XXX calls */ 2821 /* register 'common' DASD debug area, used for all DBF_XXX calls */
2823 dasd_debug_area = debug_register("dasd", 1, 1, 8 * sizeof(long)); 2822 dasd_debug_area = debug_register("dasd", 1, 1, 8 * sizeof(long));
2824 if (dasd_debug_area == NULL) { 2823 if (dasd_debug_area == NULL) {
2825 rc = -ENOMEM; 2824 rc = -ENOMEM;
2826 goto failed; 2825 goto failed;
2827 } 2826 }
2828 debug_register_view(dasd_debug_area, &debug_sprintf_view); 2827 debug_register_view(dasd_debug_area, &debug_sprintf_view);
2829 debug_set_level(dasd_debug_area, DBF_WARNING); 2828 debug_set_level(dasd_debug_area, DBF_WARNING);
2830 2829
2831 DBF_EVENT(DBF_EMERG, "%s", "debug area created"); 2830 DBF_EVENT(DBF_EMERG, "%s", "debug area created");
2832 2831
2833 dasd_diag_discipline_pointer = NULL; 2832 dasd_diag_discipline_pointer = NULL;
2834 2833
2835 rc = dasd_devmap_init(); 2834 rc = dasd_devmap_init();
2836 if (rc) 2835 if (rc)
2837 goto failed; 2836 goto failed;
2838 rc = dasd_gendisk_init(); 2837 rc = dasd_gendisk_init();
2839 if (rc) 2838 if (rc)
2840 goto failed; 2839 goto failed;
2841 rc = dasd_parse(); 2840 rc = dasd_parse();
2842 if (rc) 2841 if (rc)
2843 goto failed; 2842 goto failed;
2844 rc = dasd_eer_init(); 2843 rc = dasd_eer_init();
2845 if (rc) 2844 if (rc)
2846 goto failed; 2845 goto failed;
2847 #ifdef CONFIG_PROC_FS 2846 #ifdef CONFIG_PROC_FS
2848 rc = dasd_proc_init(); 2847 rc = dasd_proc_init();
2849 if (rc) 2848 if (rc)
2850 goto failed; 2849 goto failed;
2851 #endif 2850 #endif
2852 2851
2853 return 0; 2852 return 0;
2854 failed: 2853 failed:
2855 pr_info("The DASD device driver could not be initialized\n"); 2854 pr_info("The DASD device driver could not be initialized\n");
2856 dasd_exit(); 2855 dasd_exit();
2857 return rc; 2856 return rc;
2858 } 2857 }
2859 2858
2860 module_init(dasd_init); 2859 module_init(dasd_init);
2861 module_exit(dasd_exit); 2860 module_exit(dasd_exit);
2862 2861
2863 EXPORT_SYMBOL(dasd_debug_area); 2862 EXPORT_SYMBOL(dasd_debug_area);
2864 EXPORT_SYMBOL(dasd_diag_discipline_pointer); 2863 EXPORT_SYMBOL(dasd_diag_discipline_pointer);
2865 2864
2866 EXPORT_SYMBOL(dasd_add_request_head); 2865 EXPORT_SYMBOL(dasd_add_request_head);
2867 EXPORT_SYMBOL(dasd_add_request_tail); 2866 EXPORT_SYMBOL(dasd_add_request_tail);
2868 EXPORT_SYMBOL(dasd_cancel_req); 2867 EXPORT_SYMBOL(dasd_cancel_req);
2869 EXPORT_SYMBOL(dasd_device_clear_timer); 2868 EXPORT_SYMBOL(dasd_device_clear_timer);
2870 EXPORT_SYMBOL(dasd_block_clear_timer); 2869 EXPORT_SYMBOL(dasd_block_clear_timer);
2871 EXPORT_SYMBOL(dasd_enable_device); 2870 EXPORT_SYMBOL(dasd_enable_device);
2872 EXPORT_SYMBOL(dasd_int_handler); 2871 EXPORT_SYMBOL(dasd_int_handler);
2873 EXPORT_SYMBOL(dasd_kfree_request); 2872 EXPORT_SYMBOL(dasd_kfree_request);
2874 EXPORT_SYMBOL(dasd_kick_device); 2873 EXPORT_SYMBOL(dasd_kick_device);
2875 EXPORT_SYMBOL(dasd_kmalloc_request); 2874 EXPORT_SYMBOL(dasd_kmalloc_request);
2876 EXPORT_SYMBOL(dasd_schedule_device_bh); 2875 EXPORT_SYMBOL(dasd_schedule_device_bh);
2877 EXPORT_SYMBOL(dasd_schedule_block_bh); 2876 EXPORT_SYMBOL(dasd_schedule_block_bh);
2878 EXPORT_SYMBOL(dasd_set_target_state); 2877 EXPORT_SYMBOL(dasd_set_target_state);
2879 EXPORT_SYMBOL(dasd_device_set_timer); 2878 EXPORT_SYMBOL(dasd_device_set_timer);
2880 EXPORT_SYMBOL(dasd_block_set_timer); 2879 EXPORT_SYMBOL(dasd_block_set_timer);
2881 EXPORT_SYMBOL(dasd_sfree_request); 2880 EXPORT_SYMBOL(dasd_sfree_request);
2882 EXPORT_SYMBOL(dasd_sleep_on); 2881 EXPORT_SYMBOL(dasd_sleep_on);
2883 EXPORT_SYMBOL(dasd_sleep_on_immediatly); 2882 EXPORT_SYMBOL(dasd_sleep_on_immediatly);
2884 EXPORT_SYMBOL(dasd_sleep_on_interruptible); 2883 EXPORT_SYMBOL(dasd_sleep_on_interruptible);
2885 EXPORT_SYMBOL(dasd_smalloc_request); 2884 EXPORT_SYMBOL(dasd_smalloc_request);
2886 EXPORT_SYMBOL(dasd_start_IO); 2885 EXPORT_SYMBOL(dasd_start_IO);
2887 EXPORT_SYMBOL(dasd_term_IO); 2886 EXPORT_SYMBOL(dasd_term_IO);
2888 2887
2889 EXPORT_SYMBOL_GPL(dasd_generic_probe); 2888 EXPORT_SYMBOL_GPL(dasd_generic_probe);
2890 EXPORT_SYMBOL_GPL(dasd_generic_remove); 2889 EXPORT_SYMBOL_GPL(dasd_generic_remove);
2891 EXPORT_SYMBOL_GPL(dasd_generic_notify); 2890 EXPORT_SYMBOL_GPL(dasd_generic_notify);
2892 EXPORT_SYMBOL_GPL(dasd_generic_set_online); 2891 EXPORT_SYMBOL_GPL(dasd_generic_set_online);
2893 EXPORT_SYMBOL_GPL(dasd_generic_set_offline); 2892 EXPORT_SYMBOL_GPL(dasd_generic_set_offline);
2894 EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change); 2893 EXPORT_SYMBOL_GPL(dasd_generic_handle_state_change);
2895 EXPORT_SYMBOL_GPL(dasd_flush_device_queue); 2894 EXPORT_SYMBOL_GPL(dasd_flush_device_queue);
2896 EXPORT_SYMBOL_GPL(dasd_alloc_block); 2895 EXPORT_SYMBOL_GPL(dasd_alloc_block);
2897 EXPORT_SYMBOL_GPL(dasd_free_block); 2896 EXPORT_SYMBOL_GPL(dasd_free_block);
2898 2897
1 /* 1 /*
2 * sd.c Copyright (C) 1992 Drew Eckhardt 2 * sd.c Copyright (C) 1992 Drew Eckhardt
3 * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale 3 * Copyright (C) 1993, 1994, 1995, 1999 Eric Youngdale
4 * 4 *
5 * Linux scsi disk driver 5 * Linux scsi disk driver
6 * Initial versions: Drew Eckhardt 6 * Initial versions: Drew Eckhardt
7 * Subsequent revisions: Eric Youngdale 7 * Subsequent revisions: Eric Youngdale
8 * Modification history: 8 * Modification history:
9 * - Drew Eckhardt <drew@colorado.edu> original 9 * - Drew Eckhardt <drew@colorado.edu> original
10 * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple 10 * - Eric Youngdale <eric@andante.org> add scatter-gather, multiple
11 * outstanding request, and other enhancements. 11 * outstanding request, and other enhancements.
12 * Support loadable low-level scsi drivers. 12 * Support loadable low-level scsi drivers.
13 * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using 13 * - Jirka Hanika <geo@ff.cuni.cz> support more scsi disks using
14 * eight major numbers. 14 * eight major numbers.
15 * - Richard Gooch <rgooch@atnf.csiro.au> support devfs. 15 * - Richard Gooch <rgooch@atnf.csiro.au> support devfs.
16 * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in 16 * - Torben Mathiasen <tmm@image.dk> Resource allocation fixes in
17 * sd_init and cleanups. 17 * sd_init and cleanups.
18 * - Alex Davis <letmein@erols.com> Fix problem where partition info 18 * - Alex Davis <letmein@erols.com> Fix problem where partition info
19 * not being read in sd_open. Fix problem where removable media 19 * not being read in sd_open. Fix problem where removable media
20 * could be ejected after sd_open. 20 * could be ejected after sd_open.
21 * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x 21 * - Douglas Gilbert <dgilbert@interlog.com> cleanup for lk 2.5.x
22 * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox 22 * - Badari Pulavarty <pbadari@us.ibm.com>, Matthew Wilcox
23 * <willy@debian.org>, Kurt Garloff <garloff@suse.de>: 23 * <willy@debian.org>, Kurt Garloff <garloff@suse.de>:
24 * Support 32k/1M disks. 24 * Support 32k/1M disks.
25 * 25 *
26 * Logging policy (needs CONFIG_SCSI_LOGGING defined): 26 * Logging policy (needs CONFIG_SCSI_LOGGING defined):
27 * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2 27 * - setting up transfer: SCSI_LOG_HLQUEUE levels 1 and 2
28 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1 28 * - end of transfer (bh + scsi_lib): SCSI_LOG_HLCOMPLETE level 1
29 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1 29 * - entering sd_ioctl: SCSI_LOG_IOCTL level 1
30 * - entering other commands: SCSI_LOG_HLQUEUE level 3 30 * - entering other commands: SCSI_LOG_HLQUEUE level 3
31 * Note: when the logging level is set by the user, it must be greater 31 * Note: when the logging level is set by the user, it must be greater
32 * than the level indicated above to trigger output. 32 * than the level indicated above to trigger output.
33 */ 33 */
34 34
35 #include <linux/module.h> 35 #include <linux/module.h>
36 #include <linux/fs.h> 36 #include <linux/fs.h>
37 #include <linux/kernel.h> 37 #include <linux/kernel.h>
38 #include <linux/mm.h> 38 #include <linux/mm.h>
39 #include <linux/bio.h> 39 #include <linux/bio.h>
40 #include <linux/genhd.h> 40 #include <linux/genhd.h>
41 #include <linux/hdreg.h> 41 #include <linux/hdreg.h>
42 #include <linux/errno.h> 42 #include <linux/errno.h>
43 #include <linux/idr.h> 43 #include <linux/idr.h>
44 #include <linux/interrupt.h> 44 #include <linux/interrupt.h>
45 #include <linux/init.h> 45 #include <linux/init.h>
46 #include <linux/blkdev.h> 46 #include <linux/blkdev.h>
47 #include <linux/blkpg.h> 47 #include <linux/blkpg.h>
48 #include <linux/delay.h> 48 #include <linux/delay.h>
49 #include <linux/smp_lock.h> 49 #include <linux/smp_lock.h>
50 #include <linux/mutex.h> 50 #include <linux/mutex.h>
51 #include <linux/string_helpers.h> 51 #include <linux/string_helpers.h>
52 #include <linux/async.h> 52 #include <linux/async.h>
53 #include <linux/slab.h> 53 #include <linux/slab.h>
54 #include <asm/uaccess.h> 54 #include <asm/uaccess.h>
55 #include <asm/unaligned.h> 55 #include <asm/unaligned.h>
56 56
57 #include <scsi/scsi.h> 57 #include <scsi/scsi.h>
58 #include <scsi/scsi_cmnd.h> 58 #include <scsi/scsi_cmnd.h>
59 #include <scsi/scsi_dbg.h> 59 #include <scsi/scsi_dbg.h>
60 #include <scsi/scsi_device.h> 60 #include <scsi/scsi_device.h>
61 #include <scsi/scsi_driver.h> 61 #include <scsi/scsi_driver.h>
62 #include <scsi/scsi_eh.h> 62 #include <scsi/scsi_eh.h>
63 #include <scsi/scsi_host.h> 63 #include <scsi/scsi_host.h>
64 #include <scsi/scsi_ioctl.h> 64 #include <scsi/scsi_ioctl.h>
65 #include <scsi/scsicam.h> 65 #include <scsi/scsicam.h>
66 66
67 #include "sd.h" 67 #include "sd.h"
68 #include "scsi_logging.h" 68 #include "scsi_logging.h"
69 69
70 MODULE_AUTHOR("Eric Youngdale"); 70 MODULE_AUTHOR("Eric Youngdale");
71 MODULE_DESCRIPTION("SCSI disk (sd) driver"); 71 MODULE_DESCRIPTION("SCSI disk (sd) driver");
72 MODULE_LICENSE("GPL"); 72 MODULE_LICENSE("GPL");
73 73
74 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR); 74 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK0_MAJOR);
75 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR); 75 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK1_MAJOR);
76 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR); 76 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK2_MAJOR);
77 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR); 77 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK3_MAJOR);
78 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR); 78 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK4_MAJOR);
79 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR); 79 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK5_MAJOR);
80 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR); 80 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK6_MAJOR);
81 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR); 81 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK7_MAJOR);
82 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR); 82 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK8_MAJOR);
83 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR); 83 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK9_MAJOR);
84 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR); 84 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK10_MAJOR);
85 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR); 85 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK11_MAJOR);
86 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR); 86 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK12_MAJOR);
87 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR); 87 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK13_MAJOR);
88 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR); 88 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK14_MAJOR);
89 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR); 89 MODULE_ALIAS_BLOCKDEV_MAJOR(SCSI_DISK15_MAJOR);
90 MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK); 90 MODULE_ALIAS_SCSI_DEVICE(TYPE_DISK);
91 MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD); 91 MODULE_ALIAS_SCSI_DEVICE(TYPE_MOD);
92 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC); 92 MODULE_ALIAS_SCSI_DEVICE(TYPE_RBC);
93 93
94 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT) 94 #if !defined(CONFIG_DEBUG_BLOCK_EXT_DEVT)
95 #define SD_MINORS 16 95 #define SD_MINORS 16
96 #else 96 #else
97 #define SD_MINORS 0 97 #define SD_MINORS 0
98 #endif 98 #endif
99 99
100 static int sd_revalidate_disk(struct gendisk *); 100 static int sd_revalidate_disk(struct gendisk *);
101 static void sd_unlock_native_capacity(struct gendisk *disk); 101 static void sd_unlock_native_capacity(struct gendisk *disk);
102 static int sd_probe(struct device *); 102 static int sd_probe(struct device *);
103 static int sd_remove(struct device *); 103 static int sd_remove(struct device *);
104 static void sd_shutdown(struct device *); 104 static void sd_shutdown(struct device *);
105 static int sd_suspend(struct device *, pm_message_t state); 105 static int sd_suspend(struct device *, pm_message_t state);
106 static int sd_resume(struct device *); 106 static int sd_resume(struct device *);
107 static void sd_rescan(struct device *); 107 static void sd_rescan(struct device *);
108 static int sd_done(struct scsi_cmnd *); 108 static int sd_done(struct scsi_cmnd *);
109 static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer); 109 static void sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer);
110 static void scsi_disk_release(struct device *cdev); 110 static void scsi_disk_release(struct device *cdev);
111 static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *); 111 static void sd_print_sense_hdr(struct scsi_disk *, struct scsi_sense_hdr *);
112 static void sd_print_result(struct scsi_disk *, int); 112 static void sd_print_result(struct scsi_disk *, int);
113 113
114 static DEFINE_SPINLOCK(sd_index_lock); 114 static DEFINE_SPINLOCK(sd_index_lock);
115 static DEFINE_IDA(sd_index_ida); 115 static DEFINE_IDA(sd_index_ida);
116 116
117 /* This mutex is used to mediate the 0->1 reference get in the 117 /* This mutex is used to mediate the 0->1 reference get in the
118 * face of object destruction (i.e. we can't allow a get on an 118 * face of object destruction (i.e. we can't allow a get on an
119 * object after last put) */ 119 * object after last put) */
120 static DEFINE_MUTEX(sd_ref_mutex); 120 static DEFINE_MUTEX(sd_ref_mutex);
121 121
122 static struct kmem_cache *sd_cdb_cache; 122 static struct kmem_cache *sd_cdb_cache;
123 static mempool_t *sd_cdb_pool; 123 static mempool_t *sd_cdb_pool;
124 124
125 static const char *sd_cache_types[] = { 125 static const char *sd_cache_types[] = {
126 "write through", "none", "write back", 126 "write through", "none", "write back",
127 "write back, no read (daft)" 127 "write back, no read (daft)"
128 }; 128 };
129 129
130 static ssize_t 130 static ssize_t
131 sd_store_cache_type(struct device *dev, struct device_attribute *attr, 131 sd_store_cache_type(struct device *dev, struct device_attribute *attr,
132 const char *buf, size_t count) 132 const char *buf, size_t count)
133 { 133 {
134 int i, ct = -1, rcd, wce, sp; 134 int i, ct = -1, rcd, wce, sp;
135 struct scsi_disk *sdkp = to_scsi_disk(dev); 135 struct scsi_disk *sdkp = to_scsi_disk(dev);
136 struct scsi_device *sdp = sdkp->device; 136 struct scsi_device *sdp = sdkp->device;
137 char buffer[64]; 137 char buffer[64];
138 char *buffer_data; 138 char *buffer_data;
139 struct scsi_mode_data data; 139 struct scsi_mode_data data;
140 struct scsi_sense_hdr sshdr; 140 struct scsi_sense_hdr sshdr;
141 int len; 141 int len;
142 142
143 if (sdp->type != TYPE_DISK) 143 if (sdp->type != TYPE_DISK)
144 /* no cache control on RBC devices; theoretically they 144 /* no cache control on RBC devices; theoretically they
145 * can do it, but there are probably so many exceptions 145 * can do it, but there are probably so many exceptions
146 * it's not worth the risk */ 146 * it's not worth the risk */
147 return -EINVAL; 147 return -EINVAL;
148 148
149 for (i = 0; i < ARRAY_SIZE(sd_cache_types); i++) { 149 for (i = 0; i < ARRAY_SIZE(sd_cache_types); i++) {
150 len = strlen(sd_cache_types[i]); 150 len = strlen(sd_cache_types[i]);
151 if (strncmp(sd_cache_types[i], buf, len) == 0 && 151 if (strncmp(sd_cache_types[i], buf, len) == 0 &&
152 buf[len] == '\n') { 152 buf[len] == '\n') {
153 ct = i; 153 ct = i;
154 break; 154 break;
155 } 155 }
156 } 156 }
157 if (ct < 0) 157 if (ct < 0)
158 return -EINVAL; 158 return -EINVAL;
159 rcd = ct & 0x01 ? 1 : 0; 159 rcd = ct & 0x01 ? 1 : 0;
160 wce = ct & 0x02 ? 1 : 0; 160 wce = ct & 0x02 ? 1 : 0;
161 if (scsi_mode_sense(sdp, 0x08, 8, buffer, sizeof(buffer), SD_TIMEOUT, 161 if (scsi_mode_sense(sdp, 0x08, 8, buffer, sizeof(buffer), SD_TIMEOUT,
162 SD_MAX_RETRIES, &data, NULL)) 162 SD_MAX_RETRIES, &data, NULL))
163 return -EINVAL; 163 return -EINVAL;
164 len = min_t(size_t, sizeof(buffer), data.length - data.header_length - 164 len = min_t(size_t, sizeof(buffer), data.length - data.header_length -
165 data.block_descriptor_length); 165 data.block_descriptor_length);
166 buffer_data = buffer + data.header_length + 166 buffer_data = buffer + data.header_length +
167 data.block_descriptor_length; 167 data.block_descriptor_length;
168 buffer_data[2] &= ~0x05; 168 buffer_data[2] &= ~0x05;
169 buffer_data[2] |= wce << 2 | rcd; 169 buffer_data[2] |= wce << 2 | rcd;
170 sp = buffer_data[0] & 0x80 ? 1 : 0; 170 sp = buffer_data[0] & 0x80 ? 1 : 0;
171 171
172 if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT, 172 if (scsi_mode_select(sdp, 1, sp, 8, buffer_data, len, SD_TIMEOUT,
173 SD_MAX_RETRIES, &data, &sshdr)) { 173 SD_MAX_RETRIES, &data, &sshdr)) {
174 if (scsi_sense_valid(&sshdr)) 174 if (scsi_sense_valid(&sshdr))
175 sd_print_sense_hdr(sdkp, &sshdr); 175 sd_print_sense_hdr(sdkp, &sshdr);
176 return -EINVAL; 176 return -EINVAL;
177 } 177 }
178 revalidate_disk(sdkp->disk); 178 revalidate_disk(sdkp->disk);
179 return count; 179 return count;
180 } 180 }
181 181
182 static ssize_t 182 static ssize_t
183 sd_store_manage_start_stop(struct device *dev, struct device_attribute *attr, 183 sd_store_manage_start_stop(struct device *dev, struct device_attribute *attr,
184 const char *buf, size_t count) 184 const char *buf, size_t count)
185 { 185 {
186 struct scsi_disk *sdkp = to_scsi_disk(dev); 186 struct scsi_disk *sdkp = to_scsi_disk(dev);
187 struct scsi_device *sdp = sdkp->device; 187 struct scsi_device *sdp = sdkp->device;
188 188
189 if (!capable(CAP_SYS_ADMIN)) 189 if (!capable(CAP_SYS_ADMIN))
190 return -EACCES; 190 return -EACCES;
191 191
192 sdp->manage_start_stop = simple_strtoul(buf, NULL, 10); 192 sdp->manage_start_stop = simple_strtoul(buf, NULL, 10);
193 193
194 return count; 194 return count;
195 } 195 }
196 196
197 static ssize_t 197 static ssize_t
198 sd_store_allow_restart(struct device *dev, struct device_attribute *attr, 198 sd_store_allow_restart(struct device *dev, struct device_attribute *attr,
199 const char *buf, size_t count) 199 const char *buf, size_t count)
200 { 200 {
201 struct scsi_disk *sdkp = to_scsi_disk(dev); 201 struct scsi_disk *sdkp = to_scsi_disk(dev);
202 struct scsi_device *sdp = sdkp->device; 202 struct scsi_device *sdp = sdkp->device;
203 203
204 if (!capable(CAP_SYS_ADMIN)) 204 if (!capable(CAP_SYS_ADMIN))
205 return -EACCES; 205 return -EACCES;
206 206
207 if (sdp->type != TYPE_DISK) 207 if (sdp->type != TYPE_DISK)
208 return -EINVAL; 208 return -EINVAL;
209 209
210 sdp->allow_restart = simple_strtoul(buf, NULL, 10); 210 sdp->allow_restart = simple_strtoul(buf, NULL, 10);
211 211
212 return count; 212 return count;
213 } 213 }
214 214
215 static ssize_t 215 static ssize_t
216 sd_show_cache_type(struct device *dev, struct device_attribute *attr, 216 sd_show_cache_type(struct device *dev, struct device_attribute *attr,
217 char *buf) 217 char *buf)
218 { 218 {
219 struct scsi_disk *sdkp = to_scsi_disk(dev); 219 struct scsi_disk *sdkp = to_scsi_disk(dev);
220 int ct = sdkp->RCD + 2*sdkp->WCE; 220 int ct = sdkp->RCD + 2*sdkp->WCE;
221 221
222 return snprintf(buf, 40, "%s\n", sd_cache_types[ct]); 222 return snprintf(buf, 40, "%s\n", sd_cache_types[ct]);
223 } 223 }
224 224
225 static ssize_t 225 static ssize_t
226 sd_show_fua(struct device *dev, struct device_attribute *attr, char *buf) 226 sd_show_fua(struct device *dev, struct device_attribute *attr, char *buf)
227 { 227 {
228 struct scsi_disk *sdkp = to_scsi_disk(dev); 228 struct scsi_disk *sdkp = to_scsi_disk(dev);
229 229
230 return snprintf(buf, 20, "%u\n", sdkp->DPOFUA); 230 return snprintf(buf, 20, "%u\n", sdkp->DPOFUA);
231 } 231 }
232 232
233 static ssize_t 233 static ssize_t
234 sd_show_manage_start_stop(struct device *dev, struct device_attribute *attr, 234 sd_show_manage_start_stop(struct device *dev, struct device_attribute *attr,
235 char *buf) 235 char *buf)
236 { 236 {
237 struct scsi_disk *sdkp = to_scsi_disk(dev); 237 struct scsi_disk *sdkp = to_scsi_disk(dev);
238 struct scsi_device *sdp = sdkp->device; 238 struct scsi_device *sdp = sdkp->device;
239 239
240 return snprintf(buf, 20, "%u\n", sdp->manage_start_stop); 240 return snprintf(buf, 20, "%u\n", sdp->manage_start_stop);
241 } 241 }
242 242
243 static ssize_t 243 static ssize_t
244 sd_show_allow_restart(struct device *dev, struct device_attribute *attr, 244 sd_show_allow_restart(struct device *dev, struct device_attribute *attr,
245 char *buf) 245 char *buf)
246 { 246 {
247 struct scsi_disk *sdkp = to_scsi_disk(dev); 247 struct scsi_disk *sdkp = to_scsi_disk(dev);
248 248
249 return snprintf(buf, 40, "%d\n", sdkp->device->allow_restart); 249 return snprintf(buf, 40, "%d\n", sdkp->device->allow_restart);
250 } 250 }
251 251
252 static ssize_t 252 static ssize_t
253 sd_show_protection_type(struct device *dev, struct device_attribute *attr, 253 sd_show_protection_type(struct device *dev, struct device_attribute *attr,
254 char *buf) 254 char *buf)
255 { 255 {
256 struct scsi_disk *sdkp = to_scsi_disk(dev); 256 struct scsi_disk *sdkp = to_scsi_disk(dev);
257 257
258 return snprintf(buf, 20, "%u\n", sdkp->protection_type); 258 return snprintf(buf, 20, "%u\n", sdkp->protection_type);
259 } 259 }
260 260
261 static ssize_t 261 static ssize_t
262 sd_show_app_tag_own(struct device *dev, struct device_attribute *attr, 262 sd_show_app_tag_own(struct device *dev, struct device_attribute *attr,
263 char *buf) 263 char *buf)
264 { 264 {
265 struct scsi_disk *sdkp = to_scsi_disk(dev); 265 struct scsi_disk *sdkp = to_scsi_disk(dev);
266 266
267 return snprintf(buf, 20, "%u\n", sdkp->ATO); 267 return snprintf(buf, 20, "%u\n", sdkp->ATO);
268 } 268 }
269 269
270 static ssize_t 270 static ssize_t
271 sd_show_thin_provisioning(struct device *dev, struct device_attribute *attr, 271 sd_show_thin_provisioning(struct device *dev, struct device_attribute *attr,
272 char *buf) 272 char *buf)
273 { 273 {
274 struct scsi_disk *sdkp = to_scsi_disk(dev); 274 struct scsi_disk *sdkp = to_scsi_disk(dev);
275 275
276 return snprintf(buf, 20, "%u\n", sdkp->thin_provisioning); 276 return snprintf(buf, 20, "%u\n", sdkp->thin_provisioning);
277 } 277 }
278 278
279 static struct device_attribute sd_disk_attrs[] = { 279 static struct device_attribute sd_disk_attrs[] = {
280 __ATTR(cache_type, S_IRUGO|S_IWUSR, sd_show_cache_type, 280 __ATTR(cache_type, S_IRUGO|S_IWUSR, sd_show_cache_type,
281 sd_store_cache_type), 281 sd_store_cache_type),
282 __ATTR(FUA, S_IRUGO, sd_show_fua, NULL), 282 __ATTR(FUA, S_IRUGO, sd_show_fua, NULL),
283 __ATTR(allow_restart, S_IRUGO|S_IWUSR, sd_show_allow_restart, 283 __ATTR(allow_restart, S_IRUGO|S_IWUSR, sd_show_allow_restart,
284 sd_store_allow_restart), 284 sd_store_allow_restart),
285 __ATTR(manage_start_stop, S_IRUGO|S_IWUSR, sd_show_manage_start_stop, 285 __ATTR(manage_start_stop, S_IRUGO|S_IWUSR, sd_show_manage_start_stop,
286 sd_store_manage_start_stop), 286 sd_store_manage_start_stop),
287 __ATTR(protection_type, S_IRUGO, sd_show_protection_type, NULL), 287 __ATTR(protection_type, S_IRUGO, sd_show_protection_type, NULL),
288 __ATTR(app_tag_own, S_IRUGO, sd_show_app_tag_own, NULL), 288 __ATTR(app_tag_own, S_IRUGO, sd_show_app_tag_own, NULL),
289 __ATTR(thin_provisioning, S_IRUGO, sd_show_thin_provisioning, NULL), 289 __ATTR(thin_provisioning, S_IRUGO, sd_show_thin_provisioning, NULL),
290 __ATTR_NULL, 290 __ATTR_NULL,
291 }; 291 };
292 292
293 static struct class sd_disk_class = { 293 static struct class sd_disk_class = {
294 .name = "scsi_disk", 294 .name = "scsi_disk",
295 .owner = THIS_MODULE, 295 .owner = THIS_MODULE,
296 .dev_release = scsi_disk_release, 296 .dev_release = scsi_disk_release,
297 .dev_attrs = sd_disk_attrs, 297 .dev_attrs = sd_disk_attrs,
298 }; 298 };
299 299
300 static struct scsi_driver sd_template = { 300 static struct scsi_driver sd_template = {
301 .owner = THIS_MODULE, 301 .owner = THIS_MODULE,
302 .gendrv = { 302 .gendrv = {
303 .name = "sd", 303 .name = "sd",
304 .probe = sd_probe, 304 .probe = sd_probe,
305 .remove = sd_remove, 305 .remove = sd_remove,
306 .suspend = sd_suspend, 306 .suspend = sd_suspend,
307 .resume = sd_resume, 307 .resume = sd_resume,
308 .shutdown = sd_shutdown, 308 .shutdown = sd_shutdown,
309 }, 309 },
310 .rescan = sd_rescan, 310 .rescan = sd_rescan,
311 .done = sd_done, 311 .done = sd_done,
312 }; 312 };
313 313
314 /* 314 /*
315 * Device no to disk mapping: 315 * Device no to disk mapping:
316 * 316 *
317 * major disc2 disc p1 317 * major disc2 disc p1
318 * |............|.............|....|....| <- dev_t 318 * |............|.............|....|....| <- dev_t
319 * 31 20 19 8 7 4 3 0 319 * 31 20 19 8 7 4 3 0
320 * 320 *
321 * Inside a major, we have 16k disks, however mapped non- 321 * Inside a major, we have 16k disks, however mapped non-
322 * contiguously. The first 16 disks are for major0, the next 322 * contiguously. The first 16 disks are for major0, the next
323 * ones with major1, ... Disk 256 is for major0 again, disk 272 323 * ones with major1, ... Disk 256 is for major0 again, disk 272
324 * for major1, ... 324 * for major1, ...
325 * As we stay compatible with our numbering scheme, we can reuse 325 * As we stay compatible with our numbering scheme, we can reuse
326 * the well-known SCSI majors 8, 65--71, 136--143. 326 * the well-known SCSI majors 8, 65--71, 136--143.
327 */ 327 */
328 static int sd_major(int major_idx) 328 static int sd_major(int major_idx)
329 { 329 {
330 switch (major_idx) { 330 switch (major_idx) {
331 case 0: 331 case 0:
332 return SCSI_DISK0_MAJOR; 332 return SCSI_DISK0_MAJOR;
333 case 1 ... 7: 333 case 1 ... 7:
334 return SCSI_DISK1_MAJOR + major_idx - 1; 334 return SCSI_DISK1_MAJOR + major_idx - 1;
335 case 8 ... 15: 335 case 8 ... 15:
336 return SCSI_DISK8_MAJOR + major_idx - 8; 336 return SCSI_DISK8_MAJOR + major_idx - 8;
337 default: 337 default:
338 BUG(); 338 BUG();
339 return 0; /* shut up gcc */ 339 return 0; /* shut up gcc */
340 } 340 }
341 } 341 }
342 342
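As a rough sketch of the numbering scheme described in the comment above (valid only for the first 256 disks, which stay out of the extended disc2 range), the forward mapping from a zero-based disk index to major index and first minor looks like this; it is an illustration, not the driver's probe code.

/* Illustration only: 16 disks per major before wrapping back to major 0,
 * 16 minors (partitions) per disk. Valid for disk indexes 0..255. */
static void example_map_index(int index, int *major_idx, int *first_minor)
{
	*major_idx = (index >> 4) & 0xf;	/* 0..15 -> major0, 16..31 -> major1, ... */
	*first_minor = (index & 0xf) << 4;	/* slot inside that major, partitions follow */
}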
343 static struct scsi_disk *__scsi_disk_get(struct gendisk *disk) 343 static struct scsi_disk *__scsi_disk_get(struct gendisk *disk)
344 { 344 {
345 struct scsi_disk *sdkp = NULL; 345 struct scsi_disk *sdkp = NULL;
346 346
347 if (disk->private_data) { 347 if (disk->private_data) {
348 sdkp = scsi_disk(disk); 348 sdkp = scsi_disk(disk);
349 if (scsi_device_get(sdkp->device) == 0) 349 if (scsi_device_get(sdkp->device) == 0)
350 get_device(&sdkp->dev); 350 get_device(&sdkp->dev);
351 else 351 else
352 sdkp = NULL; 352 sdkp = NULL;
353 } 353 }
354 return sdkp; 354 return sdkp;
355 } 355 }
356 356
357 static struct scsi_disk *scsi_disk_get(struct gendisk *disk) 357 static struct scsi_disk *scsi_disk_get(struct gendisk *disk)
358 { 358 {
359 struct scsi_disk *sdkp; 359 struct scsi_disk *sdkp;
360 360
361 mutex_lock(&sd_ref_mutex); 361 mutex_lock(&sd_ref_mutex);
362 sdkp = __scsi_disk_get(disk); 362 sdkp = __scsi_disk_get(disk);
363 mutex_unlock(&sd_ref_mutex); 363 mutex_unlock(&sd_ref_mutex);
364 return sdkp; 364 return sdkp;
365 } 365 }
366 366
367 static struct scsi_disk *scsi_disk_get_from_dev(struct device *dev) 367 static struct scsi_disk *scsi_disk_get_from_dev(struct device *dev)
368 { 368 {
369 struct scsi_disk *sdkp; 369 struct scsi_disk *sdkp;
370 370
371 mutex_lock(&sd_ref_mutex); 371 mutex_lock(&sd_ref_mutex);
372 sdkp = dev_get_drvdata(dev); 372 sdkp = dev_get_drvdata(dev);
373 if (sdkp) 373 if (sdkp)
374 sdkp = __scsi_disk_get(sdkp->disk); 374 sdkp = __scsi_disk_get(sdkp->disk);
375 mutex_unlock(&sd_ref_mutex); 375 mutex_unlock(&sd_ref_mutex);
376 return sdkp; 376 return sdkp;
377 } 377 }
378 378
379 static void scsi_disk_put(struct scsi_disk *sdkp) 379 static void scsi_disk_put(struct scsi_disk *sdkp)
380 { 380 {
381 struct scsi_device *sdev = sdkp->device; 381 struct scsi_device *sdev = sdkp->device;
382 382
383 mutex_lock(&sd_ref_mutex); 383 mutex_lock(&sd_ref_mutex);
384 put_device(&sdkp->dev); 384 put_device(&sdkp->dev);
385 scsi_device_put(sdev); 385 scsi_device_put(sdev);
386 mutex_unlock(&sd_ref_mutex); 386 mutex_unlock(&sd_ref_mutex);
387 } 387 }
388 388
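A minimal usage sketch of the reference helpers above (hypothetical caller within this file): every successful scsi_disk_get() must be balanced by a scsi_disk_put(), which drops both the scsi_device and the sdkp->dev reference under sd_ref_mutex.

/* Hypothetical caller: pin the scsi_disk behind a gendisk while using it. */
static void example_use_disk(struct gendisk *disk)
{
	struct scsi_disk *sdkp = scsi_disk_get(disk);

	if (!sdkp)
		return;		/* device is already going away */

	/* ... sdkp and sdkp->device may be dereferenced safely here ... */

	scsi_disk_put(sdkp);
}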
389 static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif) 389 static void sd_prot_op(struct scsi_cmnd *scmd, unsigned int dif)
390 { 390 {
391 unsigned int prot_op = SCSI_PROT_NORMAL; 391 unsigned int prot_op = SCSI_PROT_NORMAL;
392 unsigned int dix = scsi_prot_sg_count(scmd); 392 unsigned int dix = scsi_prot_sg_count(scmd);
393 393
394 if (scmd->sc_data_direction == DMA_FROM_DEVICE) { 394 if (scmd->sc_data_direction == DMA_FROM_DEVICE) {
395 if (dif && dix) 395 if (dif && dix)
396 prot_op = SCSI_PROT_READ_PASS; 396 prot_op = SCSI_PROT_READ_PASS;
397 else if (dif && !dix) 397 else if (dif && !dix)
398 prot_op = SCSI_PROT_READ_STRIP; 398 prot_op = SCSI_PROT_READ_STRIP;
399 else if (!dif && dix) 399 else if (!dif && dix)
400 prot_op = SCSI_PROT_READ_INSERT; 400 prot_op = SCSI_PROT_READ_INSERT;
401 } else { 401 } else {
402 if (dif && dix) 402 if (dif && dix)
403 prot_op = SCSI_PROT_WRITE_PASS; 403 prot_op = SCSI_PROT_WRITE_PASS;
404 else if (dif && !dix) 404 else if (dif && !dix)
405 prot_op = SCSI_PROT_WRITE_INSERT; 405 prot_op = SCSI_PROT_WRITE_INSERT;
406 else if (!dif && dix) 406 else if (!dif && dix)
407 prot_op = SCSI_PROT_WRITE_STRIP; 407 prot_op = SCSI_PROT_WRITE_STRIP;
408 } 408 }
409 409
410 scsi_set_prot_op(scmd, prot_op); 410 scsi_set_prot_op(scmd, prot_op);
411 scsi_set_prot_type(scmd, dif); 411 scsi_set_prot_type(scmd, dif);
412 } 412 }
413 413
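For reference, the dif/dix combinations handled above reduce to the following table; it is a restatement of the code, with short descriptions that follow common DIF/DIX usage and are explanatory only.

/*
 * sd_prot_op() summary:
 *
 *   direction  dif dix  prot_op
 *   READ        1   1   SCSI_PROT_READ_PASS     PI verified and passed to host
 *   READ        1   0   SCSI_PROT_READ_STRIP    target PI stripped by the HBA
 *   READ        0   1   SCSI_PROT_READ_INSERT   HBA generates PI for the host
 *   WRITE       1   1   SCSI_PROT_WRITE_PASS    PI passed through to the target
 *   WRITE       1   0   SCSI_PROT_WRITE_INSERT  HBA generates PI for the target
 *   WRITE       0   1   SCSI_PROT_WRITE_STRIP   host PI stripped before the wire
 *   any         0   0   SCSI_PROT_NORMAL        no protection information
 */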
414 /** 414 /**
415 * scsi_setup_discard_cmnd - unmap blocks on thinly provisioned device 415 * scsi_setup_discard_cmnd - unmap blocks on thinly provisioned device
416 * @sdp: scsi device to operate on 416 * @sdp: scsi device to operate on
417 * @rq: Request to prepare 417 * @rq: Request to prepare
418 * 418 *
419 * Will issue either UNMAP or WRITE SAME(16) depending on preference 419 * Will issue either UNMAP or WRITE SAME(16) depending on preference
420 * indicated by target device. 420 * indicated by target device.
421 **/ 421 **/
422 static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq) 422 static int scsi_setup_discard_cmnd(struct scsi_device *sdp, struct request *rq)
423 { 423 {
424 struct scsi_disk *sdkp = scsi_disk(rq->rq_disk); 424 struct scsi_disk *sdkp = scsi_disk(rq->rq_disk);
425 struct bio *bio = rq->bio; 425 struct bio *bio = rq->bio;
426 sector_t sector = bio->bi_sector; 426 sector_t sector = bio->bi_sector;
427 unsigned int nr_sectors = bio_sectors(bio); 427 unsigned int nr_sectors = bio_sectors(bio);
428 unsigned int len; 428 unsigned int len;
429 int ret; 429 int ret;
430 struct page *page; 430 struct page *page;
431 431
432 if (sdkp->device->sector_size == 4096) { 432 if (sdkp->device->sector_size == 4096) {
433 sector >>= 3; 433 sector >>= 3;
434 nr_sectors >>= 3; 434 nr_sectors >>= 3;
435 } 435 }
436 436
437 rq->timeout = SD_TIMEOUT; 437 rq->timeout = SD_TIMEOUT;
438 438
439 memset(rq->cmd, 0, rq->cmd_len); 439 memset(rq->cmd, 0, rq->cmd_len);
440 440
441 page = alloc_page(GFP_ATOMIC | __GFP_ZERO); 441 page = alloc_page(GFP_ATOMIC | __GFP_ZERO);
442 if (!page) 442 if (!page)
443 return BLKPREP_DEFER; 443 return BLKPREP_DEFER;
444 444
445 if (sdkp->unmap) { 445 if (sdkp->unmap) {
446 char *buf = page_address(page); 446 char *buf = page_address(page);
447 447
448 rq->cmd_len = 10; 448 rq->cmd_len = 10;
449 rq->cmd[0] = UNMAP; 449 rq->cmd[0] = UNMAP;
450 rq->cmd[8] = 24; 450 rq->cmd[8] = 24;
451 451
452 put_unaligned_be16(6 + 16, &buf[0]); 452 put_unaligned_be16(6 + 16, &buf[0]);
453 put_unaligned_be16(16, &buf[2]); 453 put_unaligned_be16(16, &buf[2]);
454 put_unaligned_be64(sector, &buf[8]); 454 put_unaligned_be64(sector, &buf[8]);
455 put_unaligned_be32(nr_sectors, &buf[16]); 455 put_unaligned_be32(nr_sectors, &buf[16]);
456 456
457 len = 24; 457 len = 24;
458 } else { 458 } else {
459 rq->cmd_len = 16; 459 rq->cmd_len = 16;
460 rq->cmd[0] = WRITE_SAME_16; 460 rq->cmd[0] = WRITE_SAME_16;
461 rq->cmd[1] = 0x8; /* UNMAP */ 461 rq->cmd[1] = 0x8; /* UNMAP */
462 put_unaligned_be64(sector, &rq->cmd[2]); 462 put_unaligned_be64(sector, &rq->cmd[2]);
463 put_unaligned_be32(nr_sectors, &rq->cmd[10]); 463 put_unaligned_be32(nr_sectors, &rq->cmd[10]);
464 464
465 len = sdkp->device->sector_size; 465 len = sdkp->device->sector_size;
466 } 466 }
467 467
468 blk_add_request_payload(rq, page, len); 468 blk_add_request_payload(rq, page, len);
469 ret = scsi_setup_blk_pc_cmnd(sdp, rq); 469 ret = scsi_setup_blk_pc_cmnd(sdp, rq);
470 rq->buffer = page_address(page); 470 rq->buffer = page_address(page);
471 if (ret != BLKPREP_OK) { 471 if (ret != BLKPREP_OK) {
472 __free_page(page); 472 __free_page(page);
473 rq->buffer = NULL; 473 rq->buffer = NULL;
474 } 474 }
475 return ret; 475 return ret;
476 } 476 }
477 477
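For reference, the 24-byte UNMAP parameter list assembled in the branch above lays out as follows; the offsets come straight from the put_unaligned_be*() calls, while the field names follow common SBC terminology and are descriptive only.

/*
 * UNMAP payload built above (one block descriptor):
 *
 *   buf[0..1]   = 22   UNMAP data length (6 header bytes + one 16-byte descriptor)
 *   buf[2..3]   = 16   block descriptor data length
 *   buf[4..7]   = 0    reserved
 *   buf[8..15]  = starting LBA      (big-endian 64 bit)
 *   buf[16..19] = number of LBAs    (big-endian 32 bit)
 *   buf[20..23] = 0    reserved
 *
 * rq->cmd[8] = 24 carries the same parameter list length in the CDB.
 */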
478 static int scsi_setup_flush_cmnd(struct scsi_device *sdp, struct request *rq) 478 static int scsi_setup_flush_cmnd(struct scsi_device *sdp, struct request *rq)
479 { 479 {
480 rq->timeout = SD_TIMEOUT; 480 rq->timeout = SD_TIMEOUT;
481 rq->retries = SD_MAX_RETRIES; 481 rq->retries = SD_MAX_RETRIES;
482 rq->cmd[0] = SYNCHRONIZE_CACHE; 482 rq->cmd[0] = SYNCHRONIZE_CACHE;
483 rq->cmd_len = 10; 483 rq->cmd_len = 10;
484 484
485 return scsi_setup_blk_pc_cmnd(sdp, rq); 485 return scsi_setup_blk_pc_cmnd(sdp, rq);
486 } 486 }
487 487
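The resulting flush CDB is a bare SYNCHRONIZE CACHE(10), sketched below under the assumption that the request's cmd[] array starts out zero-filled, so every field other than the opcode stays 0.

/*
 * CDB emitted by scsi_setup_flush_cmnd():
 *
 *   cmd[0]    = SYNCHRONIZE_CACHE (0x35)
 *   cmd[1..9] = 0   -> LBA 0, number of blocks 0: flush the whole cache
 *
 * The request itself runs with SD_TIMEOUT and SD_MAX_RETRIES.
 */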
488 static void sd_unprep_fn(struct request_queue *q, struct request *rq) 488 static void sd_unprep_fn(struct request_queue *q, struct request *rq)
489 { 489 {
490 if (rq->cmd_flags & REQ_DISCARD) { 490 if (rq->cmd_flags & REQ_DISCARD) {
491 free_page((unsigned long)rq->buffer); 491 free_page((unsigned long)rq->buffer);
492 rq->buffer = NULL; 492 rq->buffer = NULL;
493 } 493 }
494 } 494 }
495 495
496 /** 496 /**
497 * sd_prep_fn - build a scsi (read or write) command from 497 * sd_prep_fn - build a scsi (read or write) command from
498 * information in the request structure. 498 * information in the request structure.
499 * @q: request queue the request has been queued on 499 * @q: request queue the request has been queued on
500 * @rq: request to prepare a command for 500 * @rq: request to prepare a command for
501 * 501 *
502 * Returns a BLKPREP_* value (BLKPREP_OK when the command is ready). 502 * Returns a BLKPREP_* value (BLKPREP_OK when the command is ready).
503 **/ 503 **/
504 static int sd_prep_fn(struct request_queue *q, struct request *rq) 504 static int sd_prep_fn(struct request_queue *q, struct request *rq)
505 { 505 {
506 struct scsi_cmnd *SCpnt; 506 struct scsi_cmnd *SCpnt;
507 struct scsi_device *sdp = q->queuedata; 507 struct scsi_device *sdp = q->queuedata;
508 struct gendisk *disk = rq->rq_disk; 508 struct gendisk *disk = rq->rq_disk;
509 struct scsi_disk *sdkp; 509 struct scsi_disk *sdkp;
510 sector_t block = blk_rq_pos(rq); 510 sector_t block = blk_rq_pos(rq);
511 sector_t threshold; 511 sector_t threshold;
512 unsigned int this_count = blk_rq_sectors(rq); 512 unsigned int this_count = blk_rq_sectors(rq);
513 int ret, host_dif; 513 int ret, host_dif;
514 unsigned char protect; 514 unsigned char protect;
515 515
516 /* 516 /*
517 * Discard requests come in as REQ_TYPE_FS but we turn them into 517 * Discard requests come in as REQ_TYPE_FS but we turn them into
518 * block PC requests to make life easier. 518 * block PC requests to make life easier.
519 */ 519 */
520 if (rq->cmd_flags & REQ_DISCARD) { 520 if (rq->cmd_flags & REQ_DISCARD) {
521 ret = scsi_setup_discard_cmnd(sdp, rq); 521 ret = scsi_setup_discard_cmnd(sdp, rq);
522 goto out; 522 goto out;
523 } else if (rq->cmd_flags & REQ_FLUSH) { 523 } else if (rq->cmd_flags & REQ_FLUSH) {
524 ret = scsi_setup_flush_cmnd(sdp, rq); 524 ret = scsi_setup_flush_cmnd(sdp, rq);
525 goto out; 525 goto out;
526 } else if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { 526 } else if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
527 ret = scsi_setup_blk_pc_cmnd(sdp, rq); 527 ret = scsi_setup_blk_pc_cmnd(sdp, rq);
528 goto out; 528 goto out;
529 } else if (rq->cmd_type != REQ_TYPE_FS) { 529 } else if (rq->cmd_type != REQ_TYPE_FS) {
530 ret = BLKPREP_KILL; 530 ret = BLKPREP_KILL;
531 goto out; 531 goto out;
532 } 532 }
533 ret = scsi_setup_fs_cmnd(sdp, rq); 533 ret = scsi_setup_fs_cmnd(sdp, rq);
534 if (ret != BLKPREP_OK) 534 if (ret != BLKPREP_OK)
535 goto out; 535 goto out;
536 SCpnt = rq->special; 536 SCpnt = rq->special;
537 sdkp = scsi_disk(disk); 537 sdkp = scsi_disk(disk);
538 538
539 /* from here on until we're complete, any goto out 539 /* from here on until we're complete, any goto out
540 * is used for a killable error condition */ 540 * is used for a killable error condition */
541 ret = BLKPREP_KILL; 541 ret = BLKPREP_KILL;
542 542
543 SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt, 543 SCSI_LOG_HLQUEUE(1, scmd_printk(KERN_INFO, SCpnt,
544 "sd_init_command: block=%llu, " 544 "sd_init_command: block=%llu, "
545 "count=%d\n", 545 "count=%d\n",
546 (unsigned long long)block, 546 (unsigned long long)block,
547 this_count)); 547 this_count));
548 548
549 if (!sdp || !scsi_device_online(sdp) || 549 if (!sdp || !scsi_device_online(sdp) ||
550 block + blk_rq_sectors(rq) > get_capacity(disk)) { 550 block + blk_rq_sectors(rq) > get_capacity(disk)) {
551 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, 551 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
552 "Finishing %u sectors\n", 552 "Finishing %u sectors\n",
553 blk_rq_sectors(rq))); 553 blk_rq_sectors(rq)));
554 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, 554 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
555 "Retry with 0x%p\n", SCpnt)); 555 "Retry with 0x%p\n", SCpnt));
556 goto out; 556 goto out;
557 } 557 }
558 558
559 if (sdp->changed) { 559 if (sdp->changed) {
560 /* 560 /*
561 * quietly refuse to do anything to a changed disc until 561 * quietly refuse to do anything to a changed disc until
562 * the changed bit has been reset 562 * the changed bit has been reset
563 */ 563 */
564 /* printk("SCSI disk has been changed. Prohibiting further I/O.\n"); */ 564 /* printk("SCSI disk has been changed. Prohibiting further I/O.\n"); */
565 goto out; 565 goto out;
566 } 566 }
567 567
568 /* 568 /*
569 * Some SD card readers can't handle multi-sector accesses which touch 569 * Some SD card readers can't handle multi-sector accesses which touch
570 * the last one or two hardware sectors. Split accesses as needed. 570 * the last one or two hardware sectors. Split accesses as needed.
571 */ 571 */
572 threshold = get_capacity(disk) - SD_LAST_BUGGY_SECTORS * 572 threshold = get_capacity(disk) - SD_LAST_BUGGY_SECTORS *
573 (sdp->sector_size / 512); 573 (sdp->sector_size / 512);
574 574
575 if (unlikely(sdp->last_sector_bug && block + this_count > threshold)) { 575 if (unlikely(sdp->last_sector_bug && block + this_count > threshold)) {
576 if (block < threshold) { 576 if (block < threshold) {
577 /* Access up to the threshold but not beyond */ 577 /* Access up to the threshold but not beyond */
578 this_count = threshold - block; 578 this_count = threshold - block;
579 } else { 579 } else {
580 /* Access only a single hardware sector */ 580 /* Access only a single hardware sector */
581 this_count = sdp->sector_size / 512; 581 this_count = sdp->sector_size / 512;
582 } 582 }
583 } 583 }
584 584
585 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "block=%llu\n", 585 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "block=%llu\n",
586 (unsigned long long)block)); 586 (unsigned long long)block));
587 587
588 /* 588 /*
589 * If we have a 1K hardware sectorsize, prevent access to single 589 * If we have a 1K hardware sectorsize, prevent access to single
590 * 512 byte sectors. In theory we could handle this - in fact 590 * 512 byte sectors. In theory we could handle this - in fact
591 * the scsi cdrom driver must be able to handle this because 591 * the scsi cdrom driver must be able to handle this because
592 * we typically use 1K blocksizes, and cdroms typically have 592 * we typically use 1K blocksizes, and cdroms typically have
593 * 2K hardware sectorsizes. Of course, things are simpler 593 * 2K hardware sectorsizes. Of course, things are simpler
594 * with the cdrom, since it is read-only. For performance 594 * with the cdrom, since it is read-only. For performance
595 * reasons, the filesystems should be able to handle this 595 * reasons, the filesystems should be able to handle this
596 * and not force the scsi disk driver to use bounce buffers 596 * and not force the scsi disk driver to use bounce buffers
597 * for this. 597 * for this.
598 */ 598 */
599 if (sdp->sector_size == 1024) { 599 if (sdp->sector_size == 1024) {
600 if ((block & 1) || (blk_rq_sectors(rq) & 1)) { 600 if ((block & 1) || (blk_rq_sectors(rq) & 1)) {
601 scmd_printk(KERN_ERR, SCpnt, 601 scmd_printk(KERN_ERR, SCpnt,
602 "Bad block number requested\n"); 602 "Bad block number requested\n");
603 goto out; 603 goto out;
604 } else { 604 } else {
605 block = block >> 1; 605 block = block >> 1;
606 this_count = this_count >> 1; 606 this_count = this_count >> 1;
607 } 607 }
608 } 608 }
609 if (sdp->sector_size == 2048) { 609 if (sdp->sector_size == 2048) {
610 if ((block & 3) || (blk_rq_sectors(rq) & 3)) { 610 if ((block & 3) || (blk_rq_sectors(rq) & 3)) {
611 scmd_printk(KERN_ERR, SCpnt, 611 scmd_printk(KERN_ERR, SCpnt,
612 "Bad block number requested\n"); 612 "Bad block number requested\n");
613 goto out; 613 goto out;
614 } else { 614 } else {
615 block = block >> 2; 615 block = block >> 2;
616 this_count = this_count >> 2; 616 this_count = this_count >> 2;
617 } 617 }
618 } 618 }
619 if (sdp->sector_size == 4096) { 619 if (sdp->sector_size == 4096) {
620 if ((block & 7) || (blk_rq_sectors(rq) & 7)) { 620 if ((block & 7) || (blk_rq_sectors(rq) & 7)) {
621 scmd_printk(KERN_ERR, SCpnt, 621 scmd_printk(KERN_ERR, SCpnt,
622 "Bad block number requested\n"); 622 "Bad block number requested\n");
623 goto out; 623 goto out;
624 } else { 624 } else {
625 block = block >> 3; 625 block = block >> 3;
626 this_count = this_count >> 3; 626 this_count = this_count >> 3;
627 } 627 }
628 } 628 }
629 if (rq_data_dir(rq) == WRITE) { 629 if (rq_data_dir(rq) == WRITE) {
630 if (!sdp->writeable) { 630 if (!sdp->writeable) {
631 goto out; 631 goto out;
632 } 632 }
633 SCpnt->cmnd[0] = WRITE_6; 633 SCpnt->cmnd[0] = WRITE_6;
634 SCpnt->sc_data_direction = DMA_TO_DEVICE; 634 SCpnt->sc_data_direction = DMA_TO_DEVICE;
635 635
636 if (blk_integrity_rq(rq) && 636 if (blk_integrity_rq(rq) &&
637 sd_dif_prepare(rq, block, sdp->sector_size) == -EIO) 637 sd_dif_prepare(rq, block, sdp->sector_size) == -EIO)
638 goto out; 638 goto out;
639 639
640 } else if (rq_data_dir(rq) == READ) { 640 } else if (rq_data_dir(rq) == READ) {
641 SCpnt->cmnd[0] = READ_6; 641 SCpnt->cmnd[0] = READ_6;
642 SCpnt->sc_data_direction = DMA_FROM_DEVICE; 642 SCpnt->sc_data_direction = DMA_FROM_DEVICE;
643 } else { 643 } else {
644 scmd_printk(KERN_ERR, SCpnt, "Unknown command %x\n", rq->cmd_flags); 644 scmd_printk(KERN_ERR, SCpnt, "Unknown command %x\n", rq->cmd_flags);
645 goto out; 645 goto out;
646 } 646 }
647 647
648 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, 648 SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt,
649 "%s %d/%u 512 byte blocks.\n", 649 "%s %d/%u 512 byte blocks.\n",
650 (rq_data_dir(rq) == WRITE) ? 650 (rq_data_dir(rq) == WRITE) ?
651 "writing" : "reading", this_count, 651 "writing" : "reading", this_count,
652 blk_rq_sectors(rq))); 652 blk_rq_sectors(rq)));
653 653
654 /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */ 654 /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */
655 host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); 655 host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type);
656 if (host_dif) 656 if (host_dif)
657 protect = 1 << 5; 657 protect = 1 << 5;
658 else 658 else
659 protect = 0; 659 protect = 0;
660 660
661 if (host_dif == SD_DIF_TYPE2_PROTECTION) { 661 if (host_dif == SD_DIF_TYPE2_PROTECTION) {
662 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC); 662 SCpnt->cmnd = mempool_alloc(sd_cdb_pool, GFP_ATOMIC);
663 663
664 if (unlikely(SCpnt->cmnd == NULL)) { 664 if (unlikely(SCpnt->cmnd == NULL)) {
665 ret = BLKPREP_DEFER; 665 ret = BLKPREP_DEFER;
666 goto out; 666 goto out;
667 } 667 }
668 668
669 SCpnt->cmd_len = SD_EXT_CDB_SIZE; 669 SCpnt->cmd_len = SD_EXT_CDB_SIZE;
670 memset(SCpnt->cmnd, 0, SCpnt->cmd_len); 670 memset(SCpnt->cmnd, 0, SCpnt->cmd_len);
671 SCpnt->cmnd[0] = VARIABLE_LENGTH_CMD; 671 SCpnt->cmnd[0] = VARIABLE_LENGTH_CMD;
672 SCpnt->cmnd[7] = 0x18; 672 SCpnt->cmnd[7] = 0x18;
673 SCpnt->cmnd[9] = (rq_data_dir(rq) == READ) ? READ_32 : WRITE_32; 673 SCpnt->cmnd[9] = (rq_data_dir(rq) == READ) ? READ_32 : WRITE_32;
674 SCpnt->cmnd[10] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); 674 SCpnt->cmnd[10] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
675 675
676 /* LBA */ 676 /* LBA */
677 SCpnt->cmnd[12] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; 677 SCpnt->cmnd[12] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
678 SCpnt->cmnd[13] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; 678 SCpnt->cmnd[13] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
679 SCpnt->cmnd[14] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; 679 SCpnt->cmnd[14] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
680 SCpnt->cmnd[15] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0; 680 SCpnt->cmnd[15] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0;
681 SCpnt->cmnd[16] = (unsigned char) (block >> 24) & 0xff; 681 SCpnt->cmnd[16] = (unsigned char) (block >> 24) & 0xff;
682 SCpnt->cmnd[17] = (unsigned char) (block >> 16) & 0xff; 682 SCpnt->cmnd[17] = (unsigned char) (block >> 16) & 0xff;
683 SCpnt->cmnd[18] = (unsigned char) (block >> 8) & 0xff; 683 SCpnt->cmnd[18] = (unsigned char) (block >> 8) & 0xff;
684 SCpnt->cmnd[19] = (unsigned char) block & 0xff; 684 SCpnt->cmnd[19] = (unsigned char) block & 0xff;
685 685
686 /* Expected Indirect LBA */ 686 /* Expected Indirect LBA */
687 SCpnt->cmnd[20] = (unsigned char) (block >> 24) & 0xff; 687 SCpnt->cmnd[20] = (unsigned char) (block >> 24) & 0xff;
688 SCpnt->cmnd[21] = (unsigned char) (block >> 16) & 0xff; 688 SCpnt->cmnd[21] = (unsigned char) (block >> 16) & 0xff;
689 SCpnt->cmnd[22] = (unsigned char) (block >> 8) & 0xff; 689 SCpnt->cmnd[22] = (unsigned char) (block >> 8) & 0xff;
690 SCpnt->cmnd[23] = (unsigned char) block & 0xff; 690 SCpnt->cmnd[23] = (unsigned char) block & 0xff;
691 691
692 /* Transfer length */ 692 /* Transfer length */
693 SCpnt->cmnd[28] = (unsigned char) (this_count >> 24) & 0xff; 693 SCpnt->cmnd[28] = (unsigned char) (this_count >> 24) & 0xff;
694 SCpnt->cmnd[29] = (unsigned char) (this_count >> 16) & 0xff; 694 SCpnt->cmnd[29] = (unsigned char) (this_count >> 16) & 0xff;
695 SCpnt->cmnd[30] = (unsigned char) (this_count >> 8) & 0xff; 695 SCpnt->cmnd[30] = (unsigned char) (this_count >> 8) & 0xff;
696 SCpnt->cmnd[31] = (unsigned char) this_count & 0xff; 696 SCpnt->cmnd[31] = (unsigned char) this_count & 0xff;
697 } else if (block > 0xffffffff) { 697 } else if (block > 0xffffffff) {
698 SCpnt->cmnd[0] += READ_16 - READ_6; 698 SCpnt->cmnd[0] += READ_16 - READ_6;
699 SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); 699 SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
700 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0; 700 SCpnt->cmnd[2] = sizeof(block) > 4 ? (unsigned char) (block >> 56) & 0xff : 0;
701 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0; 701 SCpnt->cmnd[3] = sizeof(block) > 4 ? (unsigned char) (block >> 48) & 0xff : 0;
702 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0; 702 SCpnt->cmnd[4] = sizeof(block) > 4 ? (unsigned char) (block >> 40) & 0xff : 0;
703 SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0; 703 SCpnt->cmnd[5] = sizeof(block) > 4 ? (unsigned char) (block >> 32) & 0xff : 0;
704 SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff; 704 SCpnt->cmnd[6] = (unsigned char) (block >> 24) & 0xff;
705 SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff; 705 SCpnt->cmnd[7] = (unsigned char) (block >> 16) & 0xff;
706 SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff; 706 SCpnt->cmnd[8] = (unsigned char) (block >> 8) & 0xff;
707 SCpnt->cmnd[9] = (unsigned char) block & 0xff; 707 SCpnt->cmnd[9] = (unsigned char) block & 0xff;
708 SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff; 708 SCpnt->cmnd[10] = (unsigned char) (this_count >> 24) & 0xff;
709 SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff; 709 SCpnt->cmnd[11] = (unsigned char) (this_count >> 16) & 0xff;
710 SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff; 710 SCpnt->cmnd[12] = (unsigned char) (this_count >> 8) & 0xff;
711 SCpnt->cmnd[13] = (unsigned char) this_count & 0xff; 711 SCpnt->cmnd[13] = (unsigned char) this_count & 0xff;
712 SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0; 712 SCpnt->cmnd[14] = SCpnt->cmnd[15] = 0;
713 } else if ((this_count > 0xff) || (block > 0x1fffff) || 713 } else if ((this_count > 0xff) || (block > 0x1fffff) ||
714 scsi_device_protection(SCpnt->device) || 714 scsi_device_protection(SCpnt->device) ||
715 SCpnt->device->use_10_for_rw) { 715 SCpnt->device->use_10_for_rw) {
716 if (this_count > 0xffff) 716 if (this_count > 0xffff)
717 this_count = 0xffff; 717 this_count = 0xffff;
718 718
719 SCpnt->cmnd[0] += READ_10 - READ_6; 719 SCpnt->cmnd[0] += READ_10 - READ_6;
720 SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0); 720 SCpnt->cmnd[1] = protect | ((rq->cmd_flags & REQ_FUA) ? 0x8 : 0);
721 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff; 721 SCpnt->cmnd[2] = (unsigned char) (block >> 24) & 0xff;
722 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff; 722 SCpnt->cmnd[3] = (unsigned char) (block >> 16) & 0xff;
723 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff; 723 SCpnt->cmnd[4] = (unsigned char) (block >> 8) & 0xff;
724 SCpnt->cmnd[5] = (unsigned char) block & 0xff; 724 SCpnt->cmnd[5] = (unsigned char) block & 0xff;
725 SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0; 725 SCpnt->cmnd[6] = SCpnt->cmnd[9] = 0;
726 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff; 726 SCpnt->cmnd[7] = (unsigned char) (this_count >> 8) & 0xff;
727 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff; 727 SCpnt->cmnd[8] = (unsigned char) this_count & 0xff;
728 } else { 728 } else {
729 if (unlikely(rq->cmd_flags & REQ_FUA)) { 729 if (unlikely(rq->cmd_flags & REQ_FUA)) {
730 /* 730 /*
731 * This happens only if this drive failed 731 * This happens only if this drive failed
732 * 10byte rw command with ILLEGAL_REQUEST 732 * 10byte rw command with ILLEGAL_REQUEST
733 * during operation and thus turned off 733 * during operation and thus turned off
734 * use_10_for_rw. 734 * use_10_for_rw.
735 */ 735 */
736 scmd_printk(KERN_ERR, SCpnt, 736 scmd_printk(KERN_ERR, SCpnt,
737 "FUA write on READ/WRITE(6) drive\n"); 737 "FUA write on READ/WRITE(6) drive\n");
738 goto out; 738 goto out;
739 } 739 }
740 740
741 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f); 741 SCpnt->cmnd[1] |= (unsigned char) ((block >> 16) & 0x1f);
742 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff); 742 SCpnt->cmnd[2] = (unsigned char) ((block >> 8) & 0xff);
743 SCpnt->cmnd[3] = (unsigned char) block & 0xff; 743 SCpnt->cmnd[3] = (unsigned char) block & 0xff;
744 SCpnt->cmnd[4] = (unsigned char) this_count; 744 SCpnt->cmnd[4] = (unsigned char) this_count;
745 SCpnt->cmnd[5] = 0; 745 SCpnt->cmnd[5] = 0;
746 } 746 }
747 SCpnt->sdb.length = this_count * sdp->sector_size; 747 SCpnt->sdb.length = this_count * sdp->sector_size;
748 748
749 /* If DIF or DIX is enabled, tell HBA how to handle request */ 749 /* If DIF or DIX is enabled, tell HBA how to handle request */
750 if (host_dif || scsi_prot_sg_count(SCpnt)) 750 if (host_dif || scsi_prot_sg_count(SCpnt))
751 sd_prot_op(SCpnt, host_dif); 751 sd_prot_op(SCpnt, host_dif);
752 752
753 /* 753 /*
754 * We shouldn't disconnect in the middle of a sector, so with a dumb 754 * We shouldn't disconnect in the middle of a sector, so with a dumb
755 * host adapter, it's safe to assume that we can at least transfer 755 * host adapter, it's safe to assume that we can at least transfer
756 * this many bytes between each connect / disconnect. 756 * this many bytes between each connect / disconnect.
757 */ 757 */
758 SCpnt->transfersize = sdp->sector_size; 758 SCpnt->transfersize = sdp->sector_size;
759 SCpnt->underflow = this_count << 9; 759 SCpnt->underflow = this_count << 9;
760 SCpnt->allowed = SD_MAX_RETRIES; 760 SCpnt->allowed = SD_MAX_RETRIES;
761 761
762 /* 762 /*
763 * This indicates that the command is ready from our end to be 763 * This indicates that the command is ready from our end to be
764 * queued. 764 * queued.
765 */ 765 */
766 ret = BLKPREP_OK; 766 ret = BLKPREP_OK;
767 out: 767 out:
768 return scsi_prep_return(q, rq, ret); 768 return scsi_prep_return(q, rq, ret);
769 } 769 }
770 770
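To keep the read/write branch above readable at a glance, the CDB variant is selected as summarized below; this is only a restatement of the if/else ladder, not new behaviour.

/*
 * CDB selection in sd_prep_fn(), in the order the checks are made:
 *
 *   host_dif == SD_DIF_TYPE2_PROTECTION            -> READ(32)/WRITE(32)
 *   block > 0xffffffff                             -> READ(16)/WRITE(16)
 *   this_count > 0xff || block > 0x1fffff ||
 *     device protection || use_10_for_rw           -> READ(10)/WRITE(10)
 *   otherwise                                      -> READ(6)/WRITE(6)
 *                                                     (REQ_FUA is rejected here)
 */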
771 /** 771 /**
772 * sd_open - open a scsi disk device 772 * sd_open - open a scsi disk device
773 * @inode: only i_rdev member may be used 773 * @inode: only i_rdev member may be used
774 * @filp: only f_mode and f_flags may be used 774 * @filp: only f_mode and f_flags may be used
775 * 775 *
776 * Returns 0 if successful. Returns a negated errno value in case 776 * Returns 0 if successful. Returns a negated errno value in case
777 * of error. 777 * of error.
778 * 778 *
779 * Note: This can be called from a user context (e.g. fsck(1) ) 779 * Note: This can be called from a user context (e.g. fsck(1) )
780 * or from within the kernel (e.g. as a result of a mount(1) ). 780 * or from within the kernel (e.g. as a result of a mount(1) ).
781 * In the latter case @inode and @filp carry an abridged amount 781 * In the latter case @inode and @filp carry an abridged amount
782 * of information as noted above. 782 * of information as noted above.
783 * 783 *
784 * Locking: called with bdev->bd_mutex held. 784 * Locking: called with bdev->bd_mutex held.
785 **/ 785 **/
786 static int sd_open(struct block_device *bdev, fmode_t mode) 786 static int sd_open(struct block_device *bdev, fmode_t mode)
787 { 787 {
788 struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk); 788 struct scsi_disk *sdkp = scsi_disk_get(bdev->bd_disk);
789 struct scsi_device *sdev; 789 struct scsi_device *sdev;
790 int retval; 790 int retval;
791 791
792 if (!sdkp) 792 if (!sdkp)
793 return -ENXIO; 793 return -ENXIO;
794 794
795 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n")); 795 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_open\n"));
796 796
797 sdev = sdkp->device; 797 sdev = sdkp->device;
798 798
799 retval = scsi_autopm_get_device(sdev); 799 retval = scsi_autopm_get_device(sdev);
800 if (retval) 800 if (retval)
801 goto error_autopm; 801 goto error_autopm;
802 802
803 /* 803 /*
804 * If the device is in error recovery, wait until it is done. 804 * If the device is in error recovery, wait until it is done.
805 * If the device is offline, then disallow any access to it. 805 * If the device is offline, then disallow any access to it.
806 */ 806 */
807 retval = -ENXIO; 807 retval = -ENXIO;
808 if (!scsi_block_when_processing_errors(sdev)) 808 if (!scsi_block_when_processing_errors(sdev))
809 goto error_out; 809 goto error_out;
810 810
811 if (sdev->removable || sdkp->write_prot) 811 if (sdev->removable || sdkp->write_prot)
812 check_disk_change(bdev); 812 check_disk_change(bdev);
813 813
814 /* 814 /*
815 * If the drive is empty, just let the open fail. 815 * If the drive is empty, just let the open fail.
816 */ 816 */
817 retval = -ENOMEDIUM; 817 retval = -ENOMEDIUM;
818 if (sdev->removable && !sdkp->media_present && !(mode & FMODE_NDELAY)) 818 if (sdev->removable && !sdkp->media_present && !(mode & FMODE_NDELAY))
819 goto error_out; 819 goto error_out;
820 820
821 /* 821 /*
822 * If the device has the write protect tab set, have the open fail 822 * If the device has the write protect tab set, have the open fail
823 * if the user expects to be able to write to the thing. 823 * if the user expects to be able to write to the thing.
824 */ 824 */
825 retval = -EROFS; 825 retval = -EROFS;
826 if (sdkp->write_prot && (mode & FMODE_WRITE)) 826 if (sdkp->write_prot && (mode & FMODE_WRITE))
827 goto error_out; 827 goto error_out;
828 828
829 /* 829 /*
830 * It is possible that the disk changing stuff resulted in 830 * It is possible that the disk changing stuff resulted in
831 * the device being taken offline. If this is the case, 831 * the device being taken offline. If this is the case,
832 * report this to the user, and don't pretend that the 832 * report this to the user, and don't pretend that the
833 * open actually succeeded. 833 * open actually succeeded.
834 */ 834 */
835 retval = -ENXIO; 835 retval = -ENXIO;
836 if (!scsi_device_online(sdev)) 836 if (!scsi_device_online(sdev))
837 goto error_out; 837 goto error_out;
838 838
839 if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) { 839 if ((atomic_inc_return(&sdkp->openers) == 1) && sdev->removable) {
840 if (scsi_block_when_processing_errors(sdev)) 840 if (scsi_block_when_processing_errors(sdev))
841 scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT); 841 scsi_set_medium_removal(sdev, SCSI_REMOVAL_PREVENT);
842 } 842 }
843 843
844 return 0; 844 return 0;
845 845
846 error_out: 846 error_out:
847 scsi_autopm_put_device(sdev); 847 scsi_autopm_put_device(sdev);
848 error_autopm: 848 error_autopm:
849 scsi_disk_put(sdkp); 849 scsi_disk_put(sdkp);
850 return retval; 850 return retval;
851 } 851 }
852 852
853 /** 853 /**
854 * sd_release - invoked when the (last) close(2) is called on this 854 * sd_release - invoked when the (last) close(2) is called on this
855 * scsi disk. 855 * scsi disk.
856 * @inode: only i_rdev member may be used 856 * @inode: only i_rdev member may be used
857 * @filp: only f_mode and f_flags may be used 857 * @filp: only f_mode and f_flags may be used
858 * 858 *
859 * Returns 0. 859 * Returns 0.
860 * 860 *
861 * Note: may block (uninterruptible) if error recovery is underway 861 * Note: may block (uninterruptible) if error recovery is underway
862 * on this disk. 862 * on this disk.
863 * 863 *
864 * Locking: called with bdev->bd_mutex held. 864 * Locking: called with bdev->bd_mutex held.
865 **/ 865 **/
866 static int sd_release(struct gendisk *disk, fmode_t mode) 866 static int sd_release(struct gendisk *disk, fmode_t mode)
867 { 867 {
868 struct scsi_disk *sdkp = scsi_disk(disk); 868 struct scsi_disk *sdkp = scsi_disk(disk);
869 struct scsi_device *sdev = sdkp->device; 869 struct scsi_device *sdev = sdkp->device;
870 870
871 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n")); 871 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_release\n"));
872 872
873 if (atomic_dec_return(&sdkp->openers) && sdev->removable) { 873 if (atomic_dec_return(&sdkp->openers) && sdev->removable) {
874 if (scsi_block_when_processing_errors(sdev)) 874 if (scsi_block_when_processing_errors(sdev))
875 scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW); 875 scsi_set_medium_removal(sdev, SCSI_REMOVAL_ALLOW);
876 } 876 }
877 877
878 /* 878 /*
879 * XXX and what if there are packets in flight and this close() 879 * XXX and what if there are packets in flight and this close()
880 * XXX is followed by a "rmmod sd_mod"? 880 * XXX is followed by a "rmmod sd_mod"?
881 */ 881 */
882 882
883 scsi_autopm_put_device(sdev); 883 scsi_autopm_put_device(sdev);
884 scsi_disk_put(sdkp); 884 scsi_disk_put(sdkp);
885 return 0; 885 return 0;
886 } 886 }
887 887
888 static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo) 888 static int sd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
889 { 889 {
890 struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk); 890 struct scsi_disk *sdkp = scsi_disk(bdev->bd_disk);
891 struct scsi_device *sdp = sdkp->device; 891 struct scsi_device *sdp = sdkp->device;
892 struct Scsi_Host *host = sdp->host; 892 struct Scsi_Host *host = sdp->host;
893 int diskinfo[4]; 893 int diskinfo[4];
894 894
895 /* default to most commonly used values */ 895 /* default to most commonly used values */
896 diskinfo[0] = 0x40; /* 1 << 6 */ 896 diskinfo[0] = 0x40; /* 1 << 6 */
897 diskinfo[1] = 0x20; /* 1 << 5 */ 897 diskinfo[1] = 0x20; /* 1 << 5 */
898 diskinfo[2] = sdkp->capacity >> 11; 898 diskinfo[2] = sdkp->capacity >> 11;
899 899
900 /* override with calculated, extended default, or driver values */ 900 /* override with calculated, extended default, or driver values */
901 if (host->hostt->bios_param) 901 if (host->hostt->bios_param)
902 host->hostt->bios_param(sdp, bdev, sdkp->capacity, diskinfo); 902 host->hostt->bios_param(sdp, bdev, sdkp->capacity, diskinfo);
903 else 903 else
904 scsicam_bios_param(bdev, sdkp->capacity, diskinfo); 904 scsicam_bios_param(bdev, sdkp->capacity, diskinfo);
905 905
906 geo->heads = diskinfo[0]; 906 geo->heads = diskinfo[0];
907 geo->sectors = diskinfo[1]; 907 geo->sectors = diskinfo[1];
908 geo->cylinders = diskinfo[2]; 908 geo->cylinders = diskinfo[2];
909 return 0; 909 return 0;
910 } 910 }
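
A note on the defaults above: 64 heads (0x40) and 32 sectors per track (0x20) give 2048 sectors per cylinder, which is why the fallback cylinder count is capacity >> 11. A minimal userspace sketch of that arithmetic, with an arbitrary example capacity:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t capacity = 156301488;       /* 512-byte sectors, example value only */
        unsigned heads = 0x40;               /* 64 */
        unsigned sectors = 0x20;             /* 32 per track */
        uint64_t cylinders = capacity >> 11; /* 64 * 32 = 2048 sectors per cylinder */

        printf("C/H/S = %llu/%u/%u\n",
               (unsigned long long)cylinders, heads, sectors);
        return 0;
}
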
911 911
912 /** 912 /**
913 * sd_ioctl - process an ioctl 913 * sd_ioctl - process an ioctl
914 * @inode: only i_rdev/i_bdev members may be used 914 * @inode: only i_rdev/i_bdev members may be used
915 * @filp: only f_mode and f_flags may be used 915 * @filp: only f_mode and f_flags may be used
916 * @cmd: ioctl command number 916 * @cmd: ioctl command number
917 * @arg: this is third argument given to ioctl(2) system call. 917 * @arg: this is third argument given to ioctl(2) system call.
918 * Often contains a pointer. 918 * Often contains a pointer.
919 * 919 *
920 * Returns 0 if successful (some ioctls return positive numbers on 920 * Returns 0 if successful (some ioctls return positive numbers on
921 * success as well). Returns a negated errno value in case of error. 921 * success as well). Returns a negated errno value in case of error.
922 * 922 *
923 * Note: most ioctls are forwarded onto the block subsystem or further 923 * Note: most ioctls are forwarded onto the block subsystem or further
924 * down in the scsi subsystem. 924 * down in the scsi subsystem.
925 **/ 925 **/
926 static int sd_ioctl(struct block_device *bdev, fmode_t mode, 926 static int sd_ioctl(struct block_device *bdev, fmode_t mode,
927 unsigned int cmd, unsigned long arg) 927 unsigned int cmd, unsigned long arg)
928 { 928 {
929 struct gendisk *disk = bdev->bd_disk; 929 struct gendisk *disk = bdev->bd_disk;
930 struct scsi_device *sdp = scsi_disk(disk)->device; 930 struct scsi_device *sdp = scsi_disk(disk)->device;
931 void __user *p = (void __user *)arg; 931 void __user *p = (void __user *)arg;
932 int error; 932 int error;
933 933
934 SCSI_LOG_IOCTL(1, printk("sd_ioctl: disk=%s, cmd=0x%x\n", 934 SCSI_LOG_IOCTL(1, printk("sd_ioctl: disk=%s, cmd=0x%x\n",
935 disk->disk_name, cmd)); 935 disk->disk_name, cmd));
936 936
937 /* 937 /*
938 * If we are in the middle of error recovery, don't let anyone 938 * If we are in the middle of error recovery, don't let anyone
939 * else try and use this device. Also, if error recovery fails, it 939 * else try and use this device. Also, if error recovery fails, it
940 * may try and take the device offline, in which case all further 940 * may try and take the device offline, in which case all further
941 * access to the device is prohibited. 941 * access to the device is prohibited.
942 */ 942 */
943 error = scsi_nonblockable_ioctl(sdp, cmd, p, 943 error = scsi_nonblockable_ioctl(sdp, cmd, p,
944 (mode & FMODE_NDELAY) != 0); 944 (mode & FMODE_NDELAY) != 0);
945 if (!scsi_block_when_processing_errors(sdp) || !error) 945 if (!scsi_block_when_processing_errors(sdp) || !error)
946 goto out; 946 goto out;
947 947
948 /* 948 /*
949 * Send SCSI addressing ioctls directly to mid level, send other 949 * Send SCSI addressing ioctls directly to mid level, send other
950 * ioctls to block level and then onto mid level if they can't be 950 * ioctls to block level and then onto mid level if they can't be
951 * resolved. 951 * resolved.
952 */ 952 */
953 switch (cmd) { 953 switch (cmd) {
954 case SCSI_IOCTL_GET_IDLUN: 954 case SCSI_IOCTL_GET_IDLUN:
955 case SCSI_IOCTL_GET_BUS_NUMBER: 955 case SCSI_IOCTL_GET_BUS_NUMBER:
956 error = scsi_ioctl(sdp, cmd, p); 956 error = scsi_ioctl(sdp, cmd, p);
957 break; 957 break;
958 default: 958 default:
959 error = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, p); 959 error = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, p);
960 if (error != -ENOTTY) 960 if (error != -ENOTTY)
961 break; 961 break;
962 error = scsi_ioctl(sdp, cmd, p); 962 error = scsi_ioctl(sdp, cmd, p);
963 break; 963 break;
964 } 964 }
965 out: 965 out:
966 return error; 966 return error;
967 } 967 }
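
The switch above sends the two SCSI addressing ioctls straight to the mid layer and routes everything else through scsi_cmd_ioctl() first, falling back to scsi_ioctl() only when the block layer answers -ENOTTY. A tiny userspace sketch of that fall-through dispatch shape (the handlers and the command value are stand-ins, not kernel APIs):

#include <errno.h>
#include <stdio.h>

static int block_layer_ioctl(unsigned int cmd)
{
        return cmd == 0x1234 ? 0 : -ENOTTY;  /* pretend one command is generic */
}

static int mid_layer_ioctl(unsigned int cmd)
{
        (void)cmd;
        return 0;                            /* pretend the mid layer handles the rest */
}

static int dispatch(unsigned int cmd)
{
        int error = block_layer_ioctl(cmd);  /* try the generic layer first */
        if (error != -ENOTTY)
                return error;                /* handled (or genuinely failed) there */
        return mid_layer_ioctl(cmd);         /* otherwise pass it further down */
}

int main(void)
{
        printf("%d %d\n", dispatch(0x1234), dispatch(0x9999));
        return 0;
}
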
968 968
969 static void set_media_not_present(struct scsi_disk *sdkp) 969 static void set_media_not_present(struct scsi_disk *sdkp)
970 { 970 {
971 sdkp->media_present = 0; 971 sdkp->media_present = 0;
972 sdkp->capacity = 0; 972 sdkp->capacity = 0;
973 sdkp->device->changed = 1; 973 sdkp->device->changed = 1;
974 } 974 }
975 975
976 /** 976 /**
977 * sd_media_changed - check if our medium changed 977 * sd_media_changed - check if our medium changed
978 * @disk: kernel device descriptor 978 * @disk: kernel device descriptor
979 * 979 *
980 * Returns 0 if not applicable or no change; 1 if change 980 * Returns 0 if not applicable or no change; 1 if change
981 * 981 *
982 * Note: this function is invoked from the block subsystem. 982 * Note: this function is invoked from the block subsystem.
983 **/ 983 **/
984 static int sd_media_changed(struct gendisk *disk) 984 static int sd_media_changed(struct gendisk *disk)
985 { 985 {
986 struct scsi_disk *sdkp = scsi_disk(disk); 986 struct scsi_disk *sdkp = scsi_disk(disk);
987 struct scsi_device *sdp = sdkp->device; 987 struct scsi_device *sdp = sdkp->device;
988 struct scsi_sense_hdr *sshdr = NULL; 988 struct scsi_sense_hdr *sshdr = NULL;
989 int retval; 989 int retval;
990 990
991 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_media_changed\n")); 991 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, "sd_media_changed\n"));
992 992
993 if (!sdp->removable) 993 if (!sdp->removable)
994 return 0; 994 return 0;
995 995
996 /* 996 /*
997 * If the device is offline, don't send any commands - just pretend as 997 * If the device is offline, don't send any commands - just pretend as
998 * if the command failed. If the device ever comes back online, we 998 * if the command failed. If the device ever comes back online, we
999 * can deal with it then. It is only because of unrecoverable errors 999 * can deal with it then. It is only because of unrecoverable errors
1000 * that we would ever take a device offline in the first place. 1000 * that we would ever take a device offline in the first place.
1001 */ 1001 */
1002 if (!scsi_device_online(sdp)) { 1002 if (!scsi_device_online(sdp)) {
1003 set_media_not_present(sdkp); 1003 set_media_not_present(sdkp);
1004 retval = 1; 1004 retval = 1;
1005 goto out; 1005 goto out;
1006 } 1006 }
1007 1007
1008 /* 1008 /*
1009 * Using TEST_UNIT_READY enables differentiation among a drive with 1009 * Using TEST_UNIT_READY enables differentiation among a drive with
1010 * no cartridge loaded - NOT READY, a drive with a changed cartridge - 1010 * no cartridge loaded - NOT READY, a drive with a changed cartridge -
1011 * UNIT ATTENTION, and one with the same cartridge - GOOD STATUS. 1011 * UNIT ATTENTION, and one with the same cartridge - GOOD STATUS.
1012 * 1012 *
1013 * Drives that auto spin down, e.g. the iomega jaz 1G, will be started 1013 * Drives that auto spin down, e.g. the iomega jaz 1G, will be started
1014 * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever 1014 * by sd_spinup_disk() from sd_revalidate_disk(), which happens whenever
1015 * the disk is revalidated. 1015 * the disk is revalidated.
1016 */ 1016 */
1017 retval = -ENODEV; 1017 retval = -ENODEV;
1018 1018
1019 if (scsi_block_when_processing_errors(sdp)) { 1019 if (scsi_block_when_processing_errors(sdp)) {
1020 sshdr = kzalloc(sizeof(*sshdr), GFP_KERNEL); 1020 sshdr = kzalloc(sizeof(*sshdr), GFP_KERNEL);
1021 retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, SD_MAX_RETRIES, 1021 retval = scsi_test_unit_ready(sdp, SD_TIMEOUT, SD_MAX_RETRIES,
1022 sshdr); 1022 sshdr);
1023 } 1023 }
1024 1024
1025 /* 1025 /*
1026 * Unable to test, unit probably not ready. This usually 1026 * Unable to test, unit probably not ready. This usually
1027 * means there is no disc in the drive. Mark as changed, 1027 * means there is no disc in the drive. Mark as changed,
1028 * and we will figure it out later once the drive is 1028 * and we will figure it out later once the drive is
1029 * available again. 1029 * available again.
1030 */ 1030 */
1031 if (retval || (scsi_sense_valid(sshdr) && 1031 if (retval || (scsi_sense_valid(sshdr) &&
1032 /* 0x3a is medium not present */ 1032 /* 0x3a is medium not present */
1033 sshdr->asc == 0x3a)) { 1033 sshdr->asc == 0x3a)) {
1034 set_media_not_present(sdkp); 1034 set_media_not_present(sdkp);
1035 retval = 1; 1035 retval = 1;
1036 goto out; 1036 goto out;
1037 } 1037 }
1038 1038
1039 /* 1039 /*
1040 * For removable scsi disks we have to recognise the presence 1040 * For removable scsi disks we have to recognise the presence
1041 * of a disk in the drive. This is kept in struct scsi_disk 1041 * of a disk in the drive. This is kept in struct scsi_disk
1042 * and tested at open! Daniel Roche (dan@lectra.fr) 1042 * and tested at open! Daniel Roche (dan@lectra.fr)
1043 */ 1043 */
1044 sdkp->media_present = 1; 1044 sdkp->media_present = 1;
1045 1045
1046 retval = sdp->changed; 1046 retval = sdp->changed;
1047 sdp->changed = 0; 1047 sdp->changed = 0;
1048 out: 1048 out:
1049 if (retval != sdkp->previous_state) 1049 if (retval != sdkp->previous_state)
1050 sdev_evt_send_simple(sdp, SDEV_EVT_MEDIA_CHANGE, GFP_KERNEL); 1050 sdev_evt_send_simple(sdp, SDEV_EVT_MEDIA_CHANGE, GFP_KERNEL);
1051 sdkp->previous_state = retval; 1051 sdkp->previous_state = retval;
1052 kfree(sshdr); 1052 kfree(sshdr);
1053 return retval; 1053 return retval;
1054 } 1054 }
1055 1055
1056 static int sd_sync_cache(struct scsi_disk *sdkp) 1056 static int sd_sync_cache(struct scsi_disk *sdkp)
1057 { 1057 {
1058 int retries, res; 1058 int retries, res;
1059 struct scsi_device *sdp = sdkp->device; 1059 struct scsi_device *sdp = sdkp->device;
1060 struct scsi_sense_hdr sshdr; 1060 struct scsi_sense_hdr sshdr;
1061 1061
1062 if (!scsi_device_online(sdp)) 1062 if (!scsi_device_online(sdp))
1063 return -ENODEV; 1063 return -ENODEV;
1064 1064
1065 1065
1066 for (retries = 3; retries > 0; --retries) { 1066 for (retries = 3; retries > 0; --retries) {
1067 unsigned char cmd[10] = { 0 }; 1067 unsigned char cmd[10] = { 0 };
1068 1068
1069 cmd[0] = SYNCHRONIZE_CACHE; 1069 cmd[0] = SYNCHRONIZE_CACHE;
1070 /* 1070 /*
1071 * Leave the rest of the command zero to indicate 1071 * Leave the rest of the command zero to indicate
1072 * flush everything. 1072 * flush everything.
1073 */ 1073 */
1074 res = scsi_execute_req(sdp, cmd, DMA_NONE, NULL, 0, &sshdr, 1074 res = scsi_execute_req(sdp, cmd, DMA_NONE, NULL, 0, &sshdr,
1075 SD_TIMEOUT, SD_MAX_RETRIES, NULL); 1075 SD_TIMEOUT, SD_MAX_RETRIES, NULL);
1076 if (res == 0) 1076 if (res == 0)
1077 break; 1077 break;
1078 } 1078 }
1079 1079
1080 if (res) { 1080 if (res) {
1081 sd_print_result(sdkp, res); 1081 sd_print_result(sdkp, res);
1082 if (driver_byte(res) & DRIVER_SENSE) 1082 if (driver_byte(res) & DRIVER_SENSE)
1083 sd_print_sense_hdr(sdkp, &sshdr); 1083 sd_print_sense_hdr(sdkp, &sshdr);
1084 } 1084 }
1085 1085
1086 if (res) 1086 if (res)
1087 return -EIO; 1087 return -EIO;
1088 return 0; 1088 return 0;
1089 } 1089 }
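
sd_sync_cache() above builds a 10-byte SYNCHRONIZE CACHE CDB with every field after the opcode left zero, which asks the target to flush its entire cache, and retries the command up to three times. A small userspace sketch of that CDB layout (0x35 is the standard SYNCHRONIZE CACHE(10) opcode; the rest is purely illustrative):

#include <stdio.h>
#include <string.h>

#define SYNCHRONIZE_CACHE 0x35   /* SYNCHRONIZE CACHE(10) opcode */

int main(void)
{
        unsigned char cdb[10];

        memset(cdb, 0, sizeof(cdb)); /* LBA and block count 0: flush everything */
        cdb[0] = SYNCHRONIZE_CACHE;

        for (int i = 0; i < (int)sizeof(cdb); i++)
                printf("%02x ", cdb[i]);
        printf("\n");
        return 0;
}
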
1090 1090
1091 static void sd_rescan(struct device *dev) 1091 static void sd_rescan(struct device *dev)
1092 { 1092 {
1093 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); 1093 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev);
1094 1094
1095 if (sdkp) { 1095 if (sdkp) {
1096 revalidate_disk(sdkp->disk); 1096 revalidate_disk(sdkp->disk);
1097 scsi_disk_put(sdkp); 1097 scsi_disk_put(sdkp);
1098 } 1098 }
1099 } 1099 }
1100 1100
1101 1101
1102 #ifdef CONFIG_COMPAT 1102 #ifdef CONFIG_COMPAT
1103 /* 1103 /*
1104 * This gets directly called from VFS. When the ioctl 1104 * This gets directly called from VFS. When the ioctl
1105 * is not recognized we go back to the other translation paths. 1105 * is not recognized we go back to the other translation paths.
1106 */ 1106 */
1107 static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode, 1107 static int sd_compat_ioctl(struct block_device *bdev, fmode_t mode,
1108 unsigned int cmd, unsigned long arg) 1108 unsigned int cmd, unsigned long arg)
1109 { 1109 {
1110 struct scsi_device *sdev = scsi_disk(bdev->bd_disk)->device; 1110 struct scsi_device *sdev = scsi_disk(bdev->bd_disk)->device;
1111 1111
1112 /* 1112 /*
1113 * If we are in the middle of error recovery, don't let anyone 1113 * If we are in the middle of error recovery, don't let anyone
1114 * else try and use this device. Also, if error recovery fails, it 1114 * else try and use this device. Also, if error recovery fails, it
1115 * may try and take the device offline, in which case all further 1115 * may try and take the device offline, in which case all further
1116 * access to the device is prohibited. 1116 * access to the device is prohibited.
1117 */ 1117 */
1118 if (!scsi_block_when_processing_errors(sdev)) 1118 if (!scsi_block_when_processing_errors(sdev))
1119 return -ENODEV; 1119 return -ENODEV;
1120 1120
1121 if (sdev->host->hostt->compat_ioctl) { 1121 if (sdev->host->hostt->compat_ioctl) {
1122 int ret; 1122 int ret;
1123 1123
1124 ret = sdev->host->hostt->compat_ioctl(sdev, cmd, (void __user *)arg); 1124 ret = sdev->host->hostt->compat_ioctl(sdev, cmd, (void __user *)arg);
1125 1125
1126 return ret; 1126 return ret;
1127 } 1127 }
1128 1128
1129 /* 1129 /*
1130 * Let the static ioctl translation table take care of it. 1130 * Let the static ioctl translation table take care of it.
1131 */ 1131 */
1132 return -ENOIOCTLCMD; 1132 return -ENOIOCTLCMD;
1133 } 1133 }
1134 #endif 1134 #endif
1135 1135
1136 static const struct block_device_operations sd_fops = { 1136 static const struct block_device_operations sd_fops = {
1137 .owner = THIS_MODULE, 1137 .owner = THIS_MODULE,
1138 .open = sd_open, 1138 .open = sd_open,
1139 .release = sd_release, 1139 .release = sd_release,
1140 .ioctl = sd_ioctl, 1140 .ioctl = sd_ioctl,
1141 .getgeo = sd_getgeo, 1141 .getgeo = sd_getgeo,
1142 #ifdef CONFIG_COMPAT 1142 #ifdef CONFIG_COMPAT
1143 .compat_ioctl = sd_compat_ioctl, 1143 .compat_ioctl = sd_compat_ioctl,
1144 #endif 1144 #endif
1145 .media_changed = sd_media_changed, 1145 .media_changed = sd_media_changed,
1146 .revalidate_disk = sd_revalidate_disk, 1146 .revalidate_disk = sd_revalidate_disk,
1147 .unlock_native_capacity = sd_unlock_native_capacity, 1147 .unlock_native_capacity = sd_unlock_native_capacity,
1148 }; 1148 };
1149 1149
1150 static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) 1150 static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd)
1151 { 1151 {
1152 u64 start_lba = blk_rq_pos(scmd->request); 1152 u64 start_lba = blk_rq_pos(scmd->request);
1153 u64 end_lba = blk_rq_pos(scmd->request) + (scsi_bufflen(scmd) / 512); 1153 u64 end_lba = blk_rq_pos(scmd->request) + (scsi_bufflen(scmd) / 512);
1154 u64 bad_lba; 1154 u64 bad_lba;
1155 int info_valid; 1155 int info_valid;
1156 1156
1157 if (scmd->request->cmd_type != REQ_TYPE_FS) 1157 if (scmd->request->cmd_type != REQ_TYPE_FS)
1158 return 0; 1158 return 0;
1159 1159
1160 info_valid = scsi_get_sense_info_fld(scmd->sense_buffer, 1160 info_valid = scsi_get_sense_info_fld(scmd->sense_buffer,
1161 SCSI_SENSE_BUFFERSIZE, 1161 SCSI_SENSE_BUFFERSIZE,
1162 &bad_lba); 1162 &bad_lba);
1163 if (!info_valid) 1163 if (!info_valid)
1164 return 0; 1164 return 0;
1165 1165
1166 if (scsi_bufflen(scmd) <= scmd->device->sector_size) 1166 if (scsi_bufflen(scmd) <= scmd->device->sector_size)
1167 return 0; 1167 return 0;
1168 1168
1169 if (scmd->device->sector_size < 512) { 1169 if (scmd->device->sector_size < 512) {
1170 /* only legitimate sector_size here is 256 */ 1170 /* only legitimate sector_size here is 256 */
1171 start_lba <<= 1; 1171 start_lba <<= 1;
1172 end_lba <<= 1; 1172 end_lba <<= 1;
1173 } else { 1173 } else {
1174 /* be careful ... don't want any overflows */ 1174 /* be careful ... don't want any overflows */
1175 u64 factor = scmd->device->sector_size / 512; 1175 u64 factor = scmd->device->sector_size / 512;
1176 do_div(start_lba, factor); 1176 do_div(start_lba, factor);
1177 do_div(end_lba, factor); 1177 do_div(end_lba, factor);
1178 } 1178 }
1179 1179
1180 /* The bad lba was reported incorrectly; we have no idea where 1180 /* The bad lba was reported incorrectly; we have no idea where
1181 * the error is. 1181 * the error is.
1182 */ 1182 */
1183 if (bad_lba < start_lba || bad_lba >= end_lba) 1183 if (bad_lba < start_lba || bad_lba >= end_lba)
1184 return 0; 1184 return 0;
1185 1185
1186 /* This computation should always be done in terms of 1186 /* This computation should always be done in terms of
1187 * the resolution of the device's medium. 1187 * the resolution of the device's medium.
1188 */ 1188 */
1189 return (bad_lba - start_lba) * scmd->device->sector_size; 1189 return (bad_lba - start_lba) * scmd->device->sector_size;
1190 } 1190 }
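
sd_completed_bytes() above converts the first bad LBA reported in the sense data into the number of bytes that completed before the error, after rescaling the request's 512-byte start/end positions to the device's logical block size. A runnable userspace sketch of the same arithmetic for a hypothetical 4096-byte-sector device (all values are made up):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned sector_size = 4096;      /* device logical block size */
        uint64_t start_lba = 1000;        /* request start, in 512-byte units */
        unsigned bufflen = 32 * 4096;     /* bytes in the transfer */
        uint64_t end_lba = start_lba + bufflen / 512;
        uint64_t bad_lba = 135;           /* sense INFORMATION field, device blocks */

        /* bring the request range into the medium's resolution, as the driver does */
        uint64_t factor = sector_size / 512;
        start_lba /= factor;              /* 1000 / 8 = 125 */
        end_lba /= factor;                /* 1256 / 8 = 157 */

        if (bad_lba < start_lba || bad_lba >= end_lba) {
                printf("bad LBA outside the request: 0 good bytes\n");
                return 0;
        }
        /* (135 - 125) * 4096 = 40960 bytes completed before the error */
        printf("good bytes = %llu\n",
               (unsigned long long)((bad_lba - start_lba) * sector_size));
        return 0;
}
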
1191 1191
1192 /** 1192 /**
1193 * sd_done - bottom half handler: called when the lower level 1193 * sd_done - bottom half handler: called when the lower level
1194 * driver has completed (successfully or otherwise) a scsi command. 1194 * driver has completed (successfully or otherwise) a scsi command.
1195 * @SCpnt: mid-level's per command structure. 1195 * @SCpnt: mid-level's per command structure.
1196 * 1196 *
1197 * Note: potentially run from within an ISR. Must not block. 1197 * Note: potentially run from within an ISR. Must not block.
1198 **/ 1198 **/
1199 static int sd_done(struct scsi_cmnd *SCpnt) 1199 static int sd_done(struct scsi_cmnd *SCpnt)
1200 { 1200 {
1201 int result = SCpnt->result; 1201 int result = SCpnt->result;
1202 unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt); 1202 unsigned int good_bytes = result ? 0 : scsi_bufflen(SCpnt);
1203 struct scsi_sense_hdr sshdr; 1203 struct scsi_sense_hdr sshdr;
1204 struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk); 1204 struct scsi_disk *sdkp = scsi_disk(SCpnt->request->rq_disk);
1205 int sense_valid = 0; 1205 int sense_valid = 0;
1206 int sense_deferred = 0; 1206 int sense_deferred = 0;
1207 1207
1208 if (SCpnt->request->cmd_flags & REQ_DISCARD) { 1208 if (SCpnt->request->cmd_flags & REQ_DISCARD) {
1209 if (!result) 1209 if (!result)
1210 scsi_set_resid(SCpnt, 0); 1210 scsi_set_resid(SCpnt, 0);
1211 return good_bytes; 1211 return good_bytes;
1212 } 1212 }
1213 1213
1214 if (result) { 1214 if (result) {
1215 sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr); 1215 sense_valid = scsi_command_normalize_sense(SCpnt, &sshdr);
1216 if (sense_valid) 1216 if (sense_valid)
1217 sense_deferred = scsi_sense_is_deferred(&sshdr); 1217 sense_deferred = scsi_sense_is_deferred(&sshdr);
1218 } 1218 }
1219 #ifdef CONFIG_SCSI_LOGGING 1219 #ifdef CONFIG_SCSI_LOGGING
1220 SCSI_LOG_HLCOMPLETE(1, scsi_print_result(SCpnt)); 1220 SCSI_LOG_HLCOMPLETE(1, scsi_print_result(SCpnt));
1221 if (sense_valid) { 1221 if (sense_valid) {
1222 SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt, 1222 SCSI_LOG_HLCOMPLETE(1, scmd_printk(KERN_INFO, SCpnt,
1223 "sd_done: sb[respc,sk,asc," 1223 "sd_done: sb[respc,sk,asc,"
1224 "ascq]=%x,%x,%x,%x\n", 1224 "ascq]=%x,%x,%x,%x\n",
1225 sshdr.response_code, 1225 sshdr.response_code,
1226 sshdr.sense_key, sshdr.asc, 1226 sshdr.sense_key, sshdr.asc,
1227 sshdr.ascq)); 1227 sshdr.ascq));
1228 } 1228 }
1229 #endif 1229 #endif
1230 if (driver_byte(result) != DRIVER_SENSE && 1230 if (driver_byte(result) != DRIVER_SENSE &&
1231 (!sense_valid || sense_deferred)) 1231 (!sense_valid || sense_deferred))
1232 goto out; 1232 goto out;
1233 1233
1234 switch (sshdr.sense_key) { 1234 switch (sshdr.sense_key) {
1235 case HARDWARE_ERROR: 1235 case HARDWARE_ERROR:
1236 case MEDIUM_ERROR: 1236 case MEDIUM_ERROR:
1237 good_bytes = sd_completed_bytes(SCpnt); 1237 good_bytes = sd_completed_bytes(SCpnt);
1238 break; 1238 break;
1239 case RECOVERED_ERROR: 1239 case RECOVERED_ERROR:
1240 good_bytes = scsi_bufflen(SCpnt); 1240 good_bytes = scsi_bufflen(SCpnt);
1241 break; 1241 break;
1242 case NO_SENSE: 1242 case NO_SENSE:
1243 /* This indicates a false check condition, so ignore it. An 1243 /* This indicates a false check condition, so ignore it. An
1244 * unknown amount of data was transferred so treat it as an 1244 * unknown amount of data was transferred so treat it as an
1245 * error. 1245 * error.
1246 */ 1246 */
1247 scsi_print_sense("sd", SCpnt); 1247 scsi_print_sense("sd", SCpnt);
1248 SCpnt->result = 0; 1248 SCpnt->result = 0;
1249 memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); 1249 memset(SCpnt->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE);
1250 break; 1250 break;
1251 case ABORTED_COMMAND: /* DIF: Target detected corruption */ 1251 case ABORTED_COMMAND: /* DIF: Target detected corruption */
1252 case ILLEGAL_REQUEST: /* DIX: Host detected corruption */ 1252 case ILLEGAL_REQUEST: /* DIX: Host detected corruption */
1253 if (sshdr.asc == 0x10) 1253 if (sshdr.asc == 0x10)
1254 good_bytes = sd_completed_bytes(SCpnt); 1254 good_bytes = sd_completed_bytes(SCpnt);
1255 break; 1255 break;
1256 default: 1256 default:
1257 break; 1257 break;
1258 } 1258 }
1259 out: 1259 out:
1260 if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt)) 1260 if (rq_data_dir(SCpnt->request) == READ && scsi_prot_sg_count(SCpnt))
1261 sd_dif_complete(SCpnt, good_bytes); 1261 sd_dif_complete(SCpnt, good_bytes);
1262 1262
1263 if (scsi_host_dif_capable(sdkp->device->host, sdkp->protection_type) 1263 if (scsi_host_dif_capable(sdkp->device->host, sdkp->protection_type)
1264 == SD_DIF_TYPE2_PROTECTION && SCpnt->cmnd != SCpnt->request->cmd) { 1264 == SD_DIF_TYPE2_PROTECTION && SCpnt->cmnd != SCpnt->request->cmd) {
1265 1265
1266 /* We have to print a failed command here as the 1266 /* We have to print a failed command here as the
1267 * extended CDB gets freed before scsi_io_completion() 1267 * extended CDB gets freed before scsi_io_completion()
1268 * is called. 1268 * is called.
1269 */ 1269 */
1270 if (result) 1270 if (result)
1271 scsi_print_command(SCpnt); 1271 scsi_print_command(SCpnt);
1272 1272
1273 mempool_free(SCpnt->cmnd, sd_cdb_pool); 1273 mempool_free(SCpnt->cmnd, sd_cdb_pool);
1274 SCpnt->cmnd = NULL; 1274 SCpnt->cmnd = NULL;
1275 SCpnt->cmd_len = 0; 1275 SCpnt->cmd_len = 0;
1276 } 1276 }
1277 1277
1278 return good_bytes; 1278 return good_bytes;
1279 } 1279 }
1280 1280
1281 static int media_not_present(struct scsi_disk *sdkp, 1281 static int media_not_present(struct scsi_disk *sdkp,
1282 struct scsi_sense_hdr *sshdr) 1282 struct scsi_sense_hdr *sshdr)
1283 { 1283 {
1284 1284
1285 if (!scsi_sense_valid(sshdr)) 1285 if (!scsi_sense_valid(sshdr))
1286 return 0; 1286 return 0;
1287 /* not invoked for commands that could return deferred errors */ 1287 /* not invoked for commands that could return deferred errors */
1288 if (sshdr->sense_key != NOT_READY && 1288 if (sshdr->sense_key != NOT_READY &&
1289 sshdr->sense_key != UNIT_ATTENTION) 1289 sshdr->sense_key != UNIT_ATTENTION)
1290 return 0; 1290 return 0;
1291 if (sshdr->asc != 0x3A) /* medium not present */ 1291 if (sshdr->asc != 0x3A) /* medium not present */
1292 return 0; 1292 return 0;
1293 1293
1294 set_media_not_present(sdkp); 1294 set_media_not_present(sdkp);
1295 return 1; 1295 return 1;
1296 } 1296 }
1297 1297
1298 /* 1298 /*
1299 * spinup disk - called only in sd_revalidate_disk() 1299 * spinup disk - called only in sd_revalidate_disk()
1300 */ 1300 */
1301 static void 1301 static void
1302 sd_spinup_disk(struct scsi_disk *sdkp) 1302 sd_spinup_disk(struct scsi_disk *sdkp)
1303 { 1303 {
1304 unsigned char cmd[10]; 1304 unsigned char cmd[10];
1305 unsigned long spintime_expire = 0; 1305 unsigned long spintime_expire = 0;
1306 int retries, spintime; 1306 int retries, spintime;
1307 unsigned int the_result; 1307 unsigned int the_result;
1308 struct scsi_sense_hdr sshdr; 1308 struct scsi_sense_hdr sshdr;
1309 int sense_valid = 0; 1309 int sense_valid = 0;
1310 1310
1311 spintime = 0; 1311 spintime = 0;
1312 1312
1313 /* Spin up drives, as required. Only do this at boot time */ 1313 /* Spin up drives, as required. Only do this at boot time */
1314 /* Spinup needs to be done for module loads too. */ 1314 /* Spinup needs to be done for module loads too. */
1315 do { 1315 do {
1316 retries = 0; 1316 retries = 0;
1317 1317
1318 do { 1318 do {
1319 cmd[0] = TEST_UNIT_READY; 1319 cmd[0] = TEST_UNIT_READY;
1320 memset((void *) &cmd[1], 0, 9); 1320 memset((void *) &cmd[1], 0, 9);
1321 1321
1322 the_result = scsi_execute_req(sdkp->device, cmd, 1322 the_result = scsi_execute_req(sdkp->device, cmd,
1323 DMA_NONE, NULL, 0, 1323 DMA_NONE, NULL, 0,
1324 &sshdr, SD_TIMEOUT, 1324 &sshdr, SD_TIMEOUT,
1325 SD_MAX_RETRIES, NULL); 1325 SD_MAX_RETRIES, NULL);
1326 1326
1327 /* 1327 /*
1328 * If the drive has indicated to us that it 1328 * If the drive has indicated to us that it
1329 * doesn't have any media in it, don't bother 1329 * doesn't have any media in it, don't bother
1330 * with any more polling. 1330 * with any more polling.
1331 */ 1331 */
1332 if (media_not_present(sdkp, &sshdr)) 1332 if (media_not_present(sdkp, &sshdr))
1333 return; 1333 return;
1334 1334
1335 if (the_result) 1335 if (the_result)
1336 sense_valid = scsi_sense_valid(&sshdr); 1336 sense_valid = scsi_sense_valid(&sshdr);
1337 retries++; 1337 retries++;
1338 } while (retries < 3 && 1338 } while (retries < 3 &&
1339 (!scsi_status_is_good(the_result) || 1339 (!scsi_status_is_good(the_result) ||
1340 ((driver_byte(the_result) & DRIVER_SENSE) && 1340 ((driver_byte(the_result) & DRIVER_SENSE) &&
1341 sense_valid && sshdr.sense_key == UNIT_ATTENTION))); 1341 sense_valid && sshdr.sense_key == UNIT_ATTENTION)));
1342 1342
1343 if ((driver_byte(the_result) & DRIVER_SENSE) == 0) { 1343 if ((driver_byte(the_result) & DRIVER_SENSE) == 0) {
1344 /* no sense, TUR either succeeded or failed 1344 /* no sense, TUR either succeeded or failed
1345 * with a status error */ 1345 * with a status error */
1346 if (!spintime && !scsi_status_is_good(the_result)) { 1346 if (!spintime && !scsi_status_is_good(the_result)) {
1347 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); 1347 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
1348 sd_print_result(sdkp, the_result); 1348 sd_print_result(sdkp, the_result);
1349 } 1349 }
1350 break; 1350 break;
1351 } 1351 }
1352 1352
1353 /* 1353 /*
1354 * The device does not want the automatic start to be issued. 1354 * The device does not want the automatic start to be issued.
1355 */ 1355 */
1356 if (sdkp->device->no_start_on_add) 1356 if (sdkp->device->no_start_on_add)
1357 break; 1357 break;
1358 1358
1359 if (sense_valid && sshdr.sense_key == NOT_READY) { 1359 if (sense_valid && sshdr.sense_key == NOT_READY) {
1360 if (sshdr.asc == 4 && sshdr.ascq == 3) 1360 if (sshdr.asc == 4 && sshdr.ascq == 3)
1361 break; /* manual intervention required */ 1361 break; /* manual intervention required */
1362 if (sshdr.asc == 4 && sshdr.ascq == 0xb) 1362 if (sshdr.asc == 4 && sshdr.ascq == 0xb)
1363 break; /* standby */ 1363 break; /* standby */
1364 if (sshdr.asc == 4 && sshdr.ascq == 0xc) 1364 if (sshdr.asc == 4 && sshdr.ascq == 0xc)
1365 break; /* unavailable */ 1365 break; /* unavailable */
1366 /* 1366 /*
1367 * Issue command to spin up drive when not ready 1367 * Issue command to spin up drive when not ready
1368 */ 1368 */
1369 if (!spintime) { 1369 if (!spintime) {
1370 sd_printk(KERN_NOTICE, sdkp, "Spinning up disk..."); 1370 sd_printk(KERN_NOTICE, sdkp, "Spinning up disk...");
1371 cmd[0] = START_STOP; 1371 cmd[0] = START_STOP;
1372 cmd[1] = 1; /* Return immediately */ 1372 cmd[1] = 1; /* Return immediately */
1373 memset((void *) &cmd[2], 0, 8); 1373 memset((void *) &cmd[2], 0, 8);
1374 cmd[4] = 1; /* Start spin cycle */ 1374 cmd[4] = 1; /* Start spin cycle */
1375 if (sdkp->device->start_stop_pwr_cond) 1375 if (sdkp->device->start_stop_pwr_cond)
1376 cmd[4] |= 1 << 4; 1376 cmd[4] |= 1 << 4;
1377 scsi_execute_req(sdkp->device, cmd, DMA_NONE, 1377 scsi_execute_req(sdkp->device, cmd, DMA_NONE,
1378 NULL, 0, &sshdr, 1378 NULL, 0, &sshdr,
1379 SD_TIMEOUT, SD_MAX_RETRIES, 1379 SD_TIMEOUT, SD_MAX_RETRIES,
1380 NULL); 1380 NULL);
1381 spintime_expire = jiffies + 100 * HZ; 1381 spintime_expire = jiffies + 100 * HZ;
1382 spintime = 1; 1382 spintime = 1;
1383 } 1383 }
1384 /* Wait 1 second for next try */ 1384 /* Wait 1 second for next try */
1385 msleep(1000); 1385 msleep(1000);
1386 printk("."); 1386 printk(".");
1387 1387
1388 /* 1388 /*
1389 * Wait for USB flash devices with slow firmware. 1389 * Wait for USB flash devices with slow firmware.
1390 * Yes, this sense key/ASC combination shouldn't 1390 * Yes, this sense key/ASC combination shouldn't
1391 * occur here. It's characteristic of these devices. 1391 * occur here. It's characteristic of these devices.
1392 */ 1392 */
1393 } else if (sense_valid && 1393 } else if (sense_valid &&
1394 sshdr.sense_key == UNIT_ATTENTION && 1394 sshdr.sense_key == UNIT_ATTENTION &&
1395 sshdr.asc == 0x28) { 1395 sshdr.asc == 0x28) {
1396 if (!spintime) { 1396 if (!spintime) {
1397 spintime_expire = jiffies + 5 * HZ; 1397 spintime_expire = jiffies + 5 * HZ;
1398 spintime = 1; 1398 spintime = 1;
1399 } 1399 }
1400 /* Wait 1 second for next try */ 1400 /* Wait 1 second for next try */
1401 msleep(1000); 1401 msleep(1000);
1402 } else { 1402 } else {
1403 /* we don't understand the sense code, so it's 1403 /* we don't understand the sense code, so it's
1404 * probably pointless to loop */ 1404 * probably pointless to loop */
1405 if (!spintime) { 1405 if (!spintime) {
1406 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n"); 1406 sd_printk(KERN_NOTICE, sdkp, "Unit Not Ready\n");
1407 sd_print_sense_hdr(sdkp, &sshdr); 1407 sd_print_sense_hdr(sdkp, &sshdr);
1408 } 1408 }
1409 break; 1409 break;
1410 } 1410 }
1411 1411
1412 } while (spintime && time_before_eq(jiffies, spintime_expire)); 1412 } while (spintime && time_before_eq(jiffies, spintime_expire));
1413 1413
1414 if (spintime) { 1414 if (spintime) {
1415 if (scsi_status_is_good(the_result)) 1415 if (scsi_status_is_good(the_result))
1416 printk("ready\n"); 1416 printk("ready\n");
1417 else 1417 else
1418 printk("not responding...\n"); 1418 printk("not responding...\n");
1419 } 1419 }
1420 } 1420 }
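
The spin-up path above issues START STOP UNIT with the "return immediately" bit in byte 1 and the start bit in byte 4, additionally ORing 1 << 4 into byte 4 when the device sets start_stop_pwr_cond. A minimal userspace sketch of that 6-byte CDB (0x1b is the standard START STOP UNIT opcode; the flag variable is a stand-in for the device field):

#include <stdio.h>
#include <string.h>

#define START_STOP 0x1b          /* START STOP UNIT opcode */

int main(void)
{
        int start_stop_pwr_cond = 0;   /* stand-in for the device's start_stop_pwr_cond */
        unsigned char cdb[6];

        memset(cdb, 0, sizeof(cdb));
        cdb[0] = START_STOP;
        cdb[1] = 1;                    /* return immediately, don't wait for spin-up */
        cdb[4] = 1;                    /* start the media */
        if (start_stop_pwr_cond)
                cdb[4] |= 1 << 4;      /* power-condition variant used by the driver */

        for (int i = 0; i < (int)sizeof(cdb); i++)
                printf("%02x ", cdb[i]);
        printf("\n");
        return 0;
}
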
1421 1421
1422 1422
1423 /* 1423 /*
1424 * Determine whether disk supports Data Integrity Field. 1424 * Determine whether disk supports Data Integrity Field.
1425 */ 1425 */
1426 static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer) 1426 static void sd_read_protection_type(struct scsi_disk *sdkp, unsigned char *buffer)
1427 { 1427 {
1428 struct scsi_device *sdp = sdkp->device; 1428 struct scsi_device *sdp = sdkp->device;
1429 u8 type; 1429 u8 type;
1430 1430
1431 if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0) 1431 if (scsi_device_protection(sdp) == 0 || (buffer[12] & 1) == 0)
1432 return; 1432 return;
1433 1433
1434 type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */ 1434 type = ((buffer[12] >> 1) & 7) + 1; /* P_TYPE 0 = Type 1 */
1435 1435
1436 if (type == sdkp->protection_type || !sdkp->first_scan) 1436 if (type == sdkp->protection_type || !sdkp->first_scan)
1437 return; 1437 return;
1438 1438
1439 sdkp->protection_type = type; 1439 sdkp->protection_type = type;
1440 1440
1441 if (type > SD_DIF_TYPE3_PROTECTION) { 1441 if (type > SD_DIF_TYPE3_PROTECTION) {
1442 sd_printk(KERN_ERR, sdkp, "formatted with unsupported " \ 1442 sd_printk(KERN_ERR, sdkp, "formatted with unsupported " \
1443 "protection type %u. Disabling disk!\n", type); 1443 "protection type %u. Disabling disk!\n", type);
1444 sdkp->capacity = 0; 1444 sdkp->capacity = 0;
1445 return; 1445 return;
1446 } 1446 }
1447 1447
1448 if (scsi_host_dif_capable(sdp->host, type)) 1448 if (scsi_host_dif_capable(sdp->host, type))
1449 sd_printk(KERN_NOTICE, sdkp, 1449 sd_printk(KERN_NOTICE, sdkp,
1450 "Enabling DIF Type %u protection\n", type); 1450 "Enabling DIF Type %u protection\n", type);
1451 else 1451 else
1452 sd_printk(KERN_NOTICE, sdkp, 1452 sd_printk(KERN_NOTICE, sdkp,
1453 "Disabling DIF Type %u protection\n", type); 1453 "Disabling DIF Type %u protection\n", type);
1454 } 1454 }
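
The check above reads byte 12 of the READ CAPACITY(16) data: bit 0 says whether protection is enabled at all, and the driver derives the DIF type by adding one to the P_TYPE field in bits 3-1. A runnable userspace sketch of that decoding (the sample byte is invented):

#include <stdio.h>

int main(void)
{
        unsigned char byte12 = 0x03;              /* example: enabled, P_TYPE = 1 */

        if ((byte12 & 1) == 0) {
                printf("protection not enabled\n");
                return 0;
        }
        unsigned type = ((byte12 >> 1) & 7) + 1;  /* P_TYPE 0 maps to Type 1 */
        printf("DIF type %u\n", type);            /* prints 2 for this sample byte */
        return 0;
}
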
1455 1455
1456 static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp, 1456 static void read_capacity_error(struct scsi_disk *sdkp, struct scsi_device *sdp,
1457 struct scsi_sense_hdr *sshdr, int sense_valid, 1457 struct scsi_sense_hdr *sshdr, int sense_valid,
1458 int the_result) 1458 int the_result)
1459 { 1459 {
1460 sd_print_result(sdkp, the_result); 1460 sd_print_result(sdkp, the_result);
1461 if (driver_byte(the_result) & DRIVER_SENSE) 1461 if (driver_byte(the_result) & DRIVER_SENSE)
1462 sd_print_sense_hdr(sdkp, sshdr); 1462 sd_print_sense_hdr(sdkp, sshdr);
1463 else 1463 else
1464 sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n"); 1464 sd_printk(KERN_NOTICE, sdkp, "Sense not available.\n");
1465 1465
1466 /* 1466 /*
1467 * Set dirty bit for removable devices if not ready - 1467 * Set dirty bit for removable devices if not ready -
1468 * sometimes drives will not report this properly. 1468 * sometimes drives will not report this properly.
1469 */ 1469 */
1470 if (sdp->removable && 1470 if (sdp->removable &&
1471 sense_valid && sshdr->sense_key == NOT_READY) 1471 sense_valid && sshdr->sense_key == NOT_READY)
1472 sdp->changed = 1; 1472 sdp->changed = 1;
1473 1473
1474 /* 1474 /*
1475 * We used to set media_present to 0 here to indicate no media 1475 * We used to set media_present to 0 here to indicate no media
1476 * in the drive, but some drives fail read capacity even with 1476 * in the drive, but some drives fail read capacity even with
1477 * media present, so we can't do that. 1477 * media present, so we can't do that.
1478 */ 1478 */
1479 sdkp->capacity = 0; /* unknown mapped to zero - as usual */ 1479 sdkp->capacity = 0; /* unknown mapped to zero - as usual */
1480 } 1480 }
1481 1481
1482 #define RC16_LEN 32 1482 #define RC16_LEN 32
1483 #if RC16_LEN > SD_BUF_SIZE 1483 #if RC16_LEN > SD_BUF_SIZE
1484 #error RC16_LEN must not be more than SD_BUF_SIZE 1484 #error RC16_LEN must not be more than SD_BUF_SIZE
1485 #endif 1485 #endif
1486 1486
1487 #define READ_CAPACITY_RETRIES_ON_RESET 10 1487 #define READ_CAPACITY_RETRIES_ON_RESET 10
1488 1488
1489 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp, 1489 static int read_capacity_16(struct scsi_disk *sdkp, struct scsi_device *sdp,
1490 unsigned char *buffer) 1490 unsigned char *buffer)
1491 { 1491 {
1492 unsigned char cmd[16]; 1492 unsigned char cmd[16];
1493 struct scsi_sense_hdr sshdr; 1493 struct scsi_sense_hdr sshdr;
1494 int sense_valid = 0; 1494 int sense_valid = 0;
1495 int the_result; 1495 int the_result;
1496 int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; 1496 int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET;
1497 unsigned int alignment; 1497 unsigned int alignment;
1498 unsigned long long lba; 1498 unsigned long long lba;
1499 unsigned sector_size; 1499 unsigned sector_size;
1500 1500
1501 do { 1501 do {
1502 memset(cmd, 0, 16); 1502 memset(cmd, 0, 16);
1503 cmd[0] = SERVICE_ACTION_IN; 1503 cmd[0] = SERVICE_ACTION_IN;
1504 cmd[1] = SAI_READ_CAPACITY_16; 1504 cmd[1] = SAI_READ_CAPACITY_16;
1505 cmd[13] = RC16_LEN; 1505 cmd[13] = RC16_LEN;
1506 memset(buffer, 0, RC16_LEN); 1506 memset(buffer, 0, RC16_LEN);
1507 1507
1508 the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, 1508 the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
1509 buffer, RC16_LEN, &sshdr, 1509 buffer, RC16_LEN, &sshdr,
1510 SD_TIMEOUT, SD_MAX_RETRIES, NULL); 1510 SD_TIMEOUT, SD_MAX_RETRIES, NULL);
1511 1511
1512 if (media_not_present(sdkp, &sshdr)) 1512 if (media_not_present(sdkp, &sshdr))
1513 return -ENODEV; 1513 return -ENODEV;
1514 1514
1515 if (the_result) { 1515 if (the_result) {
1516 sense_valid = scsi_sense_valid(&sshdr); 1516 sense_valid = scsi_sense_valid(&sshdr);
1517 if (sense_valid && 1517 if (sense_valid &&
1518 sshdr.sense_key == ILLEGAL_REQUEST && 1518 sshdr.sense_key == ILLEGAL_REQUEST &&
1519 (sshdr.asc == 0x20 || sshdr.asc == 0x24) && 1519 (sshdr.asc == 0x20 || sshdr.asc == 0x24) &&
1520 sshdr.ascq == 0x00) 1520 sshdr.ascq == 0x00)
1521 /* Invalid Command Operation Code or 1521 /* Invalid Command Operation Code or
1522 * Invalid Field in CDB, just retry 1522 * Invalid Field in CDB, just retry
1523 * silently with RC10 */ 1523 * silently with RC10 */
1524 return -EINVAL; 1524 return -EINVAL;
1525 if (sense_valid && 1525 if (sense_valid &&
1526 sshdr.sense_key == UNIT_ATTENTION && 1526 sshdr.sense_key == UNIT_ATTENTION &&
1527 sshdr.asc == 0x29 && sshdr.ascq == 0x00) 1527 sshdr.asc == 0x29 && sshdr.ascq == 0x00)
1528 /* Device reset might occur several times, 1528 /* Device reset might occur several times,
1529 * give it one more chance */ 1529 * give it one more chance */
1530 if (--reset_retries > 0) 1530 if (--reset_retries > 0)
1531 continue; 1531 continue;
1532 } 1532 }
1533 retries--; 1533 retries--;
1534 1534
1535 } while (the_result && retries); 1535 } while (the_result && retries);
1536 1536
1537 if (the_result) { 1537 if (the_result) {
1538 sd_printk(KERN_NOTICE, sdkp, "READ CAPACITY(16) failed\n"); 1538 sd_printk(KERN_NOTICE, sdkp, "READ CAPACITY(16) failed\n");
1539 read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); 1539 read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result);
1540 return -EINVAL; 1540 return -EINVAL;
1541 } 1541 }
1542 1542
1543 sector_size = get_unaligned_be32(&buffer[8]); 1543 sector_size = get_unaligned_be32(&buffer[8]);
1544 lba = get_unaligned_be64(&buffer[0]); 1544 lba = get_unaligned_be64(&buffer[0]);
1545 1545
1546 sd_read_protection_type(sdkp, buffer); 1546 sd_read_protection_type(sdkp, buffer);
1547 1547
1548 if ((sizeof(sdkp->capacity) == 4) && (lba >= 0xffffffffULL)) { 1548 if ((sizeof(sdkp->capacity) == 4) && (lba >= 0xffffffffULL)) {
1549 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a " 1549 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
1550 "kernel compiled with support for large block " 1550 "kernel compiled with support for large block "
1551 "devices.\n"); 1551 "devices.\n");
1552 sdkp->capacity = 0; 1552 sdkp->capacity = 0;
1553 return -EOVERFLOW; 1553 return -EOVERFLOW;
1554 } 1554 }
1555 1555
1556 /* Logical blocks per physical block exponent */ 1556 /* Logical blocks per physical block exponent */
1557 sdkp->hw_sector_size = (1 << (buffer[13] & 0xf)) * sector_size; 1557 sdkp->hw_sector_size = (1 << (buffer[13] & 0xf)) * sector_size;
1558 1558
1559 /* Lowest aligned logical block */ 1559 /* Lowest aligned logical block */
1560 alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size; 1560 alignment = ((buffer[14] & 0x3f) << 8 | buffer[15]) * sector_size;
1561 blk_queue_alignment_offset(sdp->request_queue, alignment); 1561 blk_queue_alignment_offset(sdp->request_queue, alignment);
1562 if (alignment && sdkp->first_scan) 1562 if (alignment && sdkp->first_scan)
1563 sd_printk(KERN_NOTICE, sdkp, 1563 sd_printk(KERN_NOTICE, sdkp,
1564 "physical block alignment offset: %u\n", alignment); 1564 "physical block alignment offset: %u\n", alignment);
1565 1565
1566 if (buffer[14] & 0x80) { /* TPE */ 1566 if (buffer[14] & 0x80) { /* TPE */
1567 struct request_queue *q = sdp->request_queue; 1567 struct request_queue *q = sdp->request_queue;
1568 1568
1569 sdkp->thin_provisioning = 1; 1569 sdkp->thin_provisioning = 1;
1570 q->limits.discard_granularity = sdkp->hw_sector_size; 1570 q->limits.discard_granularity = sdkp->hw_sector_size;
1571 q->limits.max_discard_sectors = 0xffffffff; 1571 q->limits.max_discard_sectors = 0xffffffff;
1572 1572
1573 if (buffer[14] & 0x40) /* TPRZ */ 1573 if (buffer[14] & 0x40) /* TPRZ */
1574 q->limits.discard_zeroes_data = 1; 1574 q->limits.discard_zeroes_data = 1;
1575 1575
1576 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); 1576 queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q);
1577 } 1577 }
1578 1578
1579 sdkp->capacity = lba + 1; 1579 sdkp->capacity = lba + 1;
1580 return sector_size; 1580 return sector_size;
1581 } 1581 }
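
read_capacity_16() above decodes the response as: last LBA in bytes 0-7 (big-endian), logical block length in bytes 8-11, the logical-blocks-per-physical-block exponent in the low nibble of byte 13, and the lowest aligned LBA in the low 14 bits of bytes 14-15. A standalone userspace sketch of that decoding on a fabricated buffer (the helpers stand in for get_unaligned_be64()/be32()):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t be64(const unsigned char *p)
{
        uint64_t v = 0;
        for (int i = 0; i < 8; i++)
                v = (v << 8) | p[i];
        return v;
}

static uint32_t be32(const unsigned char *p)
{
        return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
               ((uint32_t)p[2] << 8) | p[3];
}

int main(void)
{
        unsigned char buf[32];

        memset(buf, 0, sizeof(buf));
        /* fabricated response: last LBA 0x1d1c5bff, 512-byte logical blocks,
         * 8 logical blocks per physical block (2^3), alignment offset 0 */
        buf[4] = 0x1d; buf[5] = 0x1c; buf[6] = 0x5b; buf[7] = 0xff;
        buf[10] = 0x02;                    /* block length 0x200 = 512 */
        buf[13] = 0x03;                    /* exponent nibble */

        uint64_t lba = be64(&buf[0]);
        uint32_t sector_size = be32(&buf[8]);
        unsigned physical = (1u << (buf[13] & 0xf)) * sector_size;
        unsigned alignment = (((buf[14] & 0x3f) << 8) | buf[15]) * sector_size;

        printf("capacity=%llu blocks, logical=%u, physical=%u, align=%u\n",
               (unsigned long long)(lba + 1), (unsigned)sector_size,
               physical, alignment);
        return 0;
}
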
1582 1582
1583 static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp, 1583 static int read_capacity_10(struct scsi_disk *sdkp, struct scsi_device *sdp,
1584 unsigned char *buffer) 1584 unsigned char *buffer)
1585 { 1585 {
1586 unsigned char cmd[16]; 1586 unsigned char cmd[16];
1587 struct scsi_sense_hdr sshdr; 1587 struct scsi_sense_hdr sshdr;
1588 int sense_valid = 0; 1588 int sense_valid = 0;
1589 int the_result; 1589 int the_result;
1590 int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET; 1590 int retries = 3, reset_retries = READ_CAPACITY_RETRIES_ON_RESET;
1591 sector_t lba; 1591 sector_t lba;
1592 unsigned sector_size; 1592 unsigned sector_size;
1593 1593
1594 do { 1594 do {
1595 cmd[0] = READ_CAPACITY; 1595 cmd[0] = READ_CAPACITY;
1596 memset(&cmd[1], 0, 9); 1596 memset(&cmd[1], 0, 9);
1597 memset(buffer, 0, 8); 1597 memset(buffer, 0, 8);
1598 1598
1599 the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE, 1599 the_result = scsi_execute_req(sdp, cmd, DMA_FROM_DEVICE,
1600 buffer, 8, &sshdr, 1600 buffer, 8, &sshdr,
1601 SD_TIMEOUT, SD_MAX_RETRIES, NULL); 1601 SD_TIMEOUT, SD_MAX_RETRIES, NULL);
1602 1602
1603 if (media_not_present(sdkp, &sshdr)) 1603 if (media_not_present(sdkp, &sshdr))
1604 return -ENODEV; 1604 return -ENODEV;
1605 1605
1606 if (the_result) { 1606 if (the_result) {
1607 sense_valid = scsi_sense_valid(&sshdr); 1607 sense_valid = scsi_sense_valid(&sshdr);
1608 if (sense_valid && 1608 if (sense_valid &&
1609 sshdr.sense_key == UNIT_ATTENTION && 1609 sshdr.sense_key == UNIT_ATTENTION &&
1610 sshdr.asc == 0x29 && sshdr.ascq == 0x00) 1610 sshdr.asc == 0x29 && sshdr.ascq == 0x00)
1611 /* Device reset might occur several times, 1611 /* Device reset might occur several times,
1612 * give it one more chance */ 1612 * give it one more chance */
1613 if (--reset_retries > 0) 1613 if (--reset_retries > 0)
1614 continue; 1614 continue;
1615 } 1615 }
1616 retries--; 1616 retries--;
1617 1617
1618 } while (the_result && retries); 1618 } while (the_result && retries);
1619 1619
1620 if (the_result) { 1620 if (the_result) {
1621 sd_printk(KERN_NOTICE, sdkp, "READ CAPACITY failed\n"); 1621 sd_printk(KERN_NOTICE, sdkp, "READ CAPACITY failed\n");
1622 read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result); 1622 read_capacity_error(sdkp, sdp, &sshdr, sense_valid, the_result);
1623 return -EINVAL; 1623 return -EINVAL;
1624 } 1624 }
1625 1625
1626 sector_size = get_unaligned_be32(&buffer[4]); 1626 sector_size = get_unaligned_be32(&buffer[4]);
1627 lba = get_unaligned_be32(&buffer[0]); 1627 lba = get_unaligned_be32(&buffer[0]);
1628 1628
1629 if ((sizeof(sdkp->capacity) == 4) && (lba == 0xffffffff)) { 1629 if ((sizeof(sdkp->capacity) == 4) && (lba == 0xffffffff)) {
1630 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a " 1630 sd_printk(KERN_ERR, sdkp, "Too big for this kernel. Use a "
1631 "kernel compiled with support for large block " 1631 "kernel compiled with support for large block "
1632 "devices.\n"); 1632 "devices.\n");
1633 sdkp->capacity = 0; 1633 sdkp->capacity = 0;
1634 return -EOVERFLOW; 1634 return -EOVERFLOW;
1635 } 1635 }
1636 1636
1637 sdkp->capacity = lba + 1; 1637 sdkp->capacity = lba + 1;
1638 sdkp->hw_sector_size = sector_size; 1638 sdkp->hw_sector_size = sector_size;
1639 return sector_size; 1639 return sector_size;
1640 } 1640 }
1641 1641
1642 static int sd_try_rc16_first(struct scsi_device *sdp) 1642 static int sd_try_rc16_first(struct scsi_device *sdp)
1643 { 1643 {
1644 if (sdp->host->max_cmd_len < 16) 1644 if (sdp->host->max_cmd_len < 16)
1645 return 0; 1645 return 0;
1646 if (sdp->scsi_level > SCSI_SPC_2) 1646 if (sdp->scsi_level > SCSI_SPC_2)
1647 return 1; 1647 return 1;
1648 if (scsi_device_protection(sdp)) 1648 if (scsi_device_protection(sdp))
1649 return 1; 1649 return 1;
1650 return 0; 1650 return 0;
1651 } 1651 }
1652 1652
1653 /* 1653 /*
1654 * read disk capacity 1654 * read disk capacity
1655 */ 1655 */
1656 static void 1656 static void
1657 sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer) 1657 sd_read_capacity(struct scsi_disk *sdkp, unsigned char *buffer)
1658 { 1658 {
1659 int sector_size; 1659 int sector_size;
1660 struct scsi_device *sdp = sdkp->device; 1660 struct scsi_device *sdp = sdkp->device;
1661 sector_t old_capacity = sdkp->capacity; 1661 sector_t old_capacity = sdkp->capacity;
1662 1662
1663 if (sd_try_rc16_first(sdp)) { 1663 if (sd_try_rc16_first(sdp)) {
1664 sector_size = read_capacity_16(sdkp, sdp, buffer); 1664 sector_size = read_capacity_16(sdkp, sdp, buffer);
1665 if (sector_size == -EOVERFLOW) 1665 if (sector_size == -EOVERFLOW)
1666 goto got_data; 1666 goto got_data;
1667 if (sector_size == -ENODEV) 1667 if (sector_size == -ENODEV)
1668 return; 1668 return;
1669 if (sector_size < 0) 1669 if (sector_size < 0)
1670 sector_size = read_capacity_10(sdkp, sdp, buffer); 1670 sector_size = read_capacity_10(sdkp, sdp, buffer);
1671 if (sector_size < 0) 1671 if (sector_size < 0)
1672 return; 1672 return;
1673 } else { 1673 } else {
1674 sector_size = read_capacity_10(sdkp, sdp, buffer); 1674 sector_size = read_capacity_10(sdkp, sdp, buffer);
1675 if (sector_size == -EOVERFLOW) 1675 if (sector_size == -EOVERFLOW)
1676 goto got_data; 1676 goto got_data;
1677 if (sector_size < 0) 1677 if (sector_size < 0)
1678 return; 1678 return;
1679 if ((sizeof(sdkp->capacity) > 4) && 1679 if ((sizeof(sdkp->capacity) > 4) &&
1680 (sdkp->capacity > 0xffffffffULL)) { 1680 (sdkp->capacity > 0xffffffffULL)) {
1681 int old_sector_size = sector_size; 1681 int old_sector_size = sector_size;
1682 sd_printk(KERN_NOTICE, sdkp, "Very big device. " 1682 sd_printk(KERN_NOTICE, sdkp, "Very big device. "
1683 "Trying to use READ CAPACITY(16).\n"); 1683 "Trying to use READ CAPACITY(16).\n");
1684 sector_size = read_capacity_16(sdkp, sdp, buffer); 1684 sector_size = read_capacity_16(sdkp, sdp, buffer);
1685 if (sector_size < 0) { 1685 if (sector_size < 0) {
1686 sd_printk(KERN_NOTICE, sdkp, 1686 sd_printk(KERN_NOTICE, sdkp,
1687 "Using 0xffffffff as device size\n"); 1687 "Using 0xffffffff as device size\n");
1688 sdkp->capacity = 1 + (sector_t) 0xffffffff; 1688 sdkp->capacity = 1 + (sector_t) 0xffffffff;
1689 sector_size = old_sector_size; 1689 sector_size = old_sector_size;
1690 goto got_data; 1690 goto got_data;
1691 } 1691 }
1692 } 1692 }
1693 } 1693 }
1694 1694
1695 /* Some devices are known to return the total number of blocks, 1695 /* Some devices are known to return the total number of blocks,
1696 * not the highest block number. Some devices have versions 1696 * not the highest block number. Some devices have versions
1697 * which do this and others which do not. Some devices we might 1697 * which do this and others which do not. Some devices we might
1698 * suspect of doing this but we don't know for certain. 1698 * suspect of doing this but we don't know for certain.
1699 * 1699 *
1700 * If we know the reported capacity is wrong, decrement it. If 1700 * If we know the reported capacity is wrong, decrement it. If
1701 * we can only guess, then assume the number of blocks is even 1701 * we can only guess, then assume the number of blocks is even
1702 * (usually true but not always) and err on the side of lowering 1702 * (usually true but not always) and err on the side of lowering
1703 * the capacity. 1703 * the capacity.
1704 */ 1704 */
1705 if (sdp->fix_capacity || 1705 if (sdp->fix_capacity ||
1706 (sdp->guess_capacity && (sdkp->capacity & 0x01))) { 1706 (sdp->guess_capacity && (sdkp->capacity & 0x01))) {
1707 sd_printk(KERN_INFO, sdkp, "Adjusting the sector count " 1707 sd_printk(KERN_INFO, sdkp, "Adjusting the sector count "
1708 "from its reported value: %llu\n", 1708 "from its reported value: %llu\n",
1709 (unsigned long long) sdkp->capacity); 1709 (unsigned long long) sdkp->capacity);
1710 --sdkp->capacity; 1710 --sdkp->capacity;
1711 } 1711 }
1712 1712
1713 got_data: 1713 got_data:
1714 if (sector_size == 0) { 1714 if (sector_size == 0) {
1715 sector_size = 512; 1715 sector_size = 512;
1716 sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, " 1716 sd_printk(KERN_NOTICE, sdkp, "Sector size 0 reported, "
1717 "assuming 512.\n"); 1717 "assuming 512.\n");
1718 } 1718 }
1719 1719
1720 if (sector_size != 512 && 1720 if (sector_size != 512 &&
1721 sector_size != 1024 && 1721 sector_size != 1024 &&
1722 sector_size != 2048 && 1722 sector_size != 2048 &&
1723 sector_size != 4096 && 1723 sector_size != 4096 &&
1724 sector_size != 256) { 1724 sector_size != 256) {
1725 sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n", 1725 sd_printk(KERN_NOTICE, sdkp, "Unsupported sector size %d.\n",
1726 sector_size); 1726 sector_size);
1727 /* 1727 /*
1728 * The user might want to re-format the drive with 1728 * The user might want to re-format the drive with
1729 * a supported sector size. Once this happens, it 1729 * a supported sector size. Once this happens, it
1730 * would be relatively trivial to set the thing up. 1730 * would be relatively trivial to set the thing up.
1731 * For this reason, we leave the thing in the table. 1731 * For this reason, we leave the thing in the table.
1732 */ 1732 */
1733 sdkp->capacity = 0; 1733 sdkp->capacity = 0;
1734 /* 1734 /*
1735 * set a bogus sector size so the normal read/write 1735 * set a bogus sector size so the normal read/write
1736 * logic in the block layer will eventually refuse any 1736 * logic in the block layer will eventually refuse any
1737 * request on this device without tripping over power 1737 * request on this device without tripping over power
1738 * of two sector size assumptions 1738 * of two sector size assumptions
1739 */ 1739 */
1740 sector_size = 512; 1740 sector_size = 512;
1741 } 1741 }
1742 blk_queue_logical_block_size(sdp->request_queue, sector_size); 1742 blk_queue_logical_block_size(sdp->request_queue, sector_size);
1743 1743
1744 { 1744 {
1745 char cap_str_2[10], cap_str_10[10]; 1745 char cap_str_2[10], cap_str_10[10];
1746 u64 sz = (u64)sdkp->capacity << ilog2(sector_size); 1746 u64 sz = (u64)sdkp->capacity << ilog2(sector_size);
1747 1747
1748 string_get_size(sz, STRING_UNITS_2, cap_str_2, 1748 string_get_size(sz, STRING_UNITS_2, cap_str_2,
1749 sizeof(cap_str_2)); 1749 sizeof(cap_str_2));
1750 string_get_size(sz, STRING_UNITS_10, cap_str_10, 1750 string_get_size(sz, STRING_UNITS_10, cap_str_10,
1751 sizeof(cap_str_10)); 1751 sizeof(cap_str_10));
1752 1752
1753 if (sdkp->first_scan || old_capacity != sdkp->capacity) { 1753 if (sdkp->first_scan || old_capacity != sdkp->capacity) {
1754 sd_printk(KERN_NOTICE, sdkp, 1754 sd_printk(KERN_NOTICE, sdkp,
1755 "%llu %d-byte logical blocks: (%s/%s)\n", 1755 "%llu %d-byte logical blocks: (%s/%s)\n",
1756 (unsigned long long)sdkp->capacity, 1756 (unsigned long long)sdkp->capacity,
1757 sector_size, cap_str_10, cap_str_2); 1757 sector_size, cap_str_10, cap_str_2);
1758 1758
1759 if (sdkp->hw_sector_size != sector_size) 1759 if (sdkp->hw_sector_size != sector_size)
1760 sd_printk(KERN_NOTICE, sdkp, 1760 sd_printk(KERN_NOTICE, sdkp,
1761 "%u-byte physical blocks\n", 1761 "%u-byte physical blocks\n",
1762 sdkp->hw_sector_size); 1762 sdkp->hw_sector_size);
1763 } 1763 }
1764 } 1764 }
1765 1765
1766 /* Rescale capacity to 512-byte units */ 1766 /* Rescale capacity to 512-byte units */
1767 if (sector_size == 4096) 1767 if (sector_size == 4096)
1768 sdkp->capacity <<= 3; 1768 sdkp->capacity <<= 3;
1769 else if (sector_size == 2048) 1769 else if (sector_size == 2048)
1770 sdkp->capacity <<= 2; 1770 sdkp->capacity <<= 2;
1771 else if (sector_size == 1024) 1771 else if (sector_size == 1024)
1772 sdkp->capacity <<= 1; 1772 sdkp->capacity <<= 1;
1773 else if (sector_size == 256) 1773 else if (sector_size == 256)
1774 sdkp->capacity >>= 1; 1774 sdkp->capacity >>= 1;
1775 1775
1776 blk_queue_physical_block_size(sdp->request_queue, sdkp->hw_sector_size); 1776 blk_queue_physical_block_size(sdp->request_queue, sdkp->hw_sector_size);
1777 sdkp->device->sector_size = sector_size; 1777 sdkp->device->sector_size = sector_size;
1778 } 1778 }
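The rescale above converts the capacity reported in native logical blocks into 512-byte units, which is what set_capacity() expects further down. As a minimal, self-contained sketch of the same arithmetic (userspace C; the helper name is hypothetical), the shifts are equivalent to multiplying by sector_size/512 for the power-of-two sizes the driver accepts:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Hypothetical helper mirroring the shift-based rescale in the driver above. */
static uint64_t capacity_in_512b_units(uint64_t blocks, unsigned sector_size)
{
	switch (sector_size) {
	case 4096: return blocks << 3;
	case 2048: return blocks << 2;
	case 1024: return blocks << 1;
	case  512: return blocks;
	case  256: return blocks >> 1;
	default:   return 0;	/* unsupported, as in the driver */
	}
}

int main(void)
{
	/* 1 GiB worth of 4096-byte blocks -> 2097152 sectors of 512 bytes. */
	uint64_t blocks = (1ULL << 30) / 4096;

	assert(capacity_in_512b_units(blocks, 4096) == blocks * (4096 / 512));
	printf("%llu 512-byte sectors\n",
	       (unsigned long long)capacity_in_512b_units(blocks, 4096));
	return 0;
}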
1779 1779
1780 /* called with buffer of length 512 */ 1780 /* called with buffer of length 512 */
1781 static inline int 1781 static inline int
1782 sd_do_mode_sense(struct scsi_device *sdp, int dbd, int modepage, 1782 sd_do_mode_sense(struct scsi_device *sdp, int dbd, int modepage,
1783 unsigned char *buffer, int len, struct scsi_mode_data *data, 1783 unsigned char *buffer, int len, struct scsi_mode_data *data,
1784 struct scsi_sense_hdr *sshdr) 1784 struct scsi_sense_hdr *sshdr)
1785 { 1785 {
1786 return scsi_mode_sense(sdp, dbd, modepage, buffer, len, 1786 return scsi_mode_sense(sdp, dbd, modepage, buffer, len,
1787 SD_TIMEOUT, SD_MAX_RETRIES, data, 1787 SD_TIMEOUT, SD_MAX_RETRIES, data,
1788 sshdr); 1788 sshdr);
1789 } 1789 }
1790 1790
1791 /* 1791 /*
1792 * read write protect setting, if possible - called only in sd_revalidate_disk() 1792 * read write protect setting, if possible - called only in sd_revalidate_disk()
1793 * called with buffer of length SD_BUF_SIZE 1793 * called with buffer of length SD_BUF_SIZE
1794 */ 1794 */
1795 static void 1795 static void
1796 sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer) 1796 sd_read_write_protect_flag(struct scsi_disk *sdkp, unsigned char *buffer)
1797 { 1797 {
1798 int res; 1798 int res;
1799 struct scsi_device *sdp = sdkp->device; 1799 struct scsi_device *sdp = sdkp->device;
1800 struct scsi_mode_data data; 1800 struct scsi_mode_data data;
1801 int old_wp = sdkp->write_prot; 1801 int old_wp = sdkp->write_prot;
1802 1802
1803 set_disk_ro(sdkp->disk, 0); 1803 set_disk_ro(sdkp->disk, 0);
1804 if (sdp->skip_ms_page_3f) { 1804 if (sdp->skip_ms_page_3f) {
1805 sd_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n"); 1805 sd_printk(KERN_NOTICE, sdkp, "Assuming Write Enabled\n");
1806 return; 1806 return;
1807 } 1807 }
1808 1808
1809 if (sdp->use_192_bytes_for_3f) { 1809 if (sdp->use_192_bytes_for_3f) {
1810 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 192, &data, NULL); 1810 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 192, &data, NULL);
1811 } else { 1811 } else {
1812 /* 1812 /*
1813 * First attempt: ask for all pages (0x3F), but only 4 bytes. 1813 * First attempt: ask for all pages (0x3F), but only 4 bytes.
1814 * We have to start carefully: some devices hang if we ask 1814 * We have to start carefully: some devices hang if we ask
1815 * for more than is available. 1815 * for more than is available.
1816 */ 1816 */
1817 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 4, &data, NULL); 1817 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 4, &data, NULL);
1818 1818
1819 /* 1819 /*
1820 * Second attempt: ask for page 0. When only page 0 is 1820 * Second attempt: ask for page 0. When only page 0 is
1821 * implemented, a request for page 3F may return Sense Key 1821 * implemented, a request for page 3F may return Sense Key
1822 * 5: Illegal Request, Sense Code 24: Invalid field in 1822 * 5: Illegal Request, Sense Code 24: Invalid field in
1823 * CDB. 1823 * CDB.
1824 */ 1824 */
1825 if (!scsi_status_is_good(res)) 1825 if (!scsi_status_is_good(res))
1826 res = sd_do_mode_sense(sdp, 0, 0, buffer, 4, &data, NULL); 1826 res = sd_do_mode_sense(sdp, 0, 0, buffer, 4, &data, NULL);
1827 1827
1828 /* 1828 /*
1829 * Third attempt: ask for 255 bytes, as we did earlier. 1829 * Third attempt: ask for 255 bytes, as we did earlier.
1830 */ 1830 */
1831 if (!scsi_status_is_good(res)) 1831 if (!scsi_status_is_good(res))
1832 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 255, 1832 res = sd_do_mode_sense(sdp, 0, 0x3F, buffer, 255,
1833 &data, NULL); 1833 &data, NULL);
1834 } 1834 }
1835 1835
1836 if (!scsi_status_is_good(res)) { 1836 if (!scsi_status_is_good(res)) {
1837 sd_printk(KERN_WARNING, sdkp, 1837 sd_printk(KERN_WARNING, sdkp,
1838 "Test WP failed, assume Write Enabled\n"); 1838 "Test WP failed, assume Write Enabled\n");
1839 } else { 1839 } else {
1840 sdkp->write_prot = ((data.device_specific & 0x80) != 0); 1840 sdkp->write_prot = ((data.device_specific & 0x80) != 0);
1841 set_disk_ro(sdkp->disk, sdkp->write_prot); 1841 set_disk_ro(sdkp->disk, sdkp->write_prot);
1842 if (sdkp->first_scan || old_wp != sdkp->write_prot) { 1842 if (sdkp->first_scan || old_wp != sdkp->write_prot) {
1843 sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n", 1843 sd_printk(KERN_NOTICE, sdkp, "Write Protect is %s\n",
1844 sdkp->write_prot ? "on" : "off"); 1844 sdkp->write_prot ? "on" : "off");
1845 sd_printk(KERN_DEBUG, sdkp, 1845 sd_printk(KERN_DEBUG, sdkp,
1846 "Mode Sense: %02x %02x %02x %02x\n", 1846 "Mode Sense: %02x %02x %02x %02x\n",
1847 buffer[0], buffer[1], buffer[2], buffer[3]); 1847 buffer[0], buffer[1], buffer[2], buffer[3]);
1848 } 1848 }
1849 } 1849 }
1850 } 1850 }
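The probing strategy above starts with the smallest possible MODE SENSE and only widens the request if the device rejects it, because some devices hang when asked for more data than they have. A rough userspace sketch of that escalation pattern follows; the probe callback is a stand-in for scsi_mode_sense(), not the real thing:

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for a MODE SENSE issuer: page code and length in, success out. */
typedef bool (*mode_probe_fn)(unsigned page, unsigned len);

/* Try the cheapest request first, then fall back, mirroring the driver. */
static bool probe_write_protect(mode_probe_fn probe)
{
	if (probe(0x3f, 4))		/* all pages, 4 bytes only */
		return true;
	if (probe(0x00, 4))		/* page 0 only */
		return true;
	return probe(0x3f, 255);	/* last resort: ask for everything */
}

static bool fake_probe(unsigned page, unsigned len)
{
	(void)len;
	/* Pretend the device only answers the page-0 form. */
	return page == 0x00;
}

int main(void)
{
	printf("probe %s\n",
	       probe_write_protect(fake_probe) ? "succeeded" : "failed");
	return 0;
}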
1851 1851
1852 /* 1852 /*
1853 * sd_read_cache_type - called only from sd_revalidate_disk() 1853 * sd_read_cache_type - called only from sd_revalidate_disk()
1854 * called with buffer of length SD_BUF_SIZE 1854 * called with buffer of length SD_BUF_SIZE
1855 */ 1855 */
1856 static void 1856 static void
1857 sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer) 1857 sd_read_cache_type(struct scsi_disk *sdkp, unsigned char *buffer)
1858 { 1858 {
1859 int len = 0, res; 1859 int len = 0, res;
1860 struct scsi_device *sdp = sdkp->device; 1860 struct scsi_device *sdp = sdkp->device;
1861 1861
1862 int dbd; 1862 int dbd;
1863 int modepage; 1863 int modepage;
1864 struct scsi_mode_data data; 1864 struct scsi_mode_data data;
1865 struct scsi_sense_hdr sshdr; 1865 struct scsi_sense_hdr sshdr;
1866 int old_wce = sdkp->WCE; 1866 int old_wce = sdkp->WCE;
1867 int old_rcd = sdkp->RCD; 1867 int old_rcd = sdkp->RCD;
1868 int old_dpofua = sdkp->DPOFUA; 1868 int old_dpofua = sdkp->DPOFUA;
1869 1869
1870 if (sdp->skip_ms_page_8) 1870 if (sdp->skip_ms_page_8)
1871 goto defaults; 1871 goto defaults;
1872 1872
1873 if (sdp->type == TYPE_RBC) { 1873 if (sdp->type == TYPE_RBC) {
1874 modepage = 6; 1874 modepage = 6;
1875 dbd = 8; 1875 dbd = 8;
1876 } else { 1876 } else {
1877 modepage = 8; 1877 modepage = 8;
1878 dbd = 0; 1878 dbd = 0;
1879 } 1879 }
1880 1880
1881 /* cautiously ask */ 1881 /* cautiously ask */
1882 res = sd_do_mode_sense(sdp, dbd, modepage, buffer, 4, &data, &sshdr); 1882 res = sd_do_mode_sense(sdp, dbd, modepage, buffer, 4, &data, &sshdr);
1883 1883
1884 if (!scsi_status_is_good(res)) 1884 if (!scsi_status_is_good(res))
1885 goto bad_sense; 1885 goto bad_sense;
1886 1886
1887 if (!data.header_length) { 1887 if (!data.header_length) {
1888 modepage = 6; 1888 modepage = 6;
1889 sd_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n"); 1889 sd_printk(KERN_ERR, sdkp, "Missing header in MODE_SENSE response\n");
1890 } 1890 }
1891 1891
1892 /* that went OK, now ask for the proper length */ 1892 /* that went OK, now ask for the proper length */
1893 len = data.length; 1893 len = data.length;
1894 1894
1895 /* 1895 /*
1896 * We're only interested in the first three bytes, actually. 1896 * We're only interested in the first three bytes, actually.
1897 * But the data cache page is defined for the first 20. 1897 * But the data cache page is defined for the first 20.
1898 */ 1898 */
1899 if (len < 3) 1899 if (len < 3)
1900 goto bad_sense; 1900 goto bad_sense;
1901 if (len > 20) 1901 if (len > 20)
1902 len = 20; 1902 len = 20;
1903 1903
1904 /* Take headers and block descriptors into account */ 1904 /* Take headers and block descriptors into account */
1905 len += data.header_length + data.block_descriptor_length; 1905 len += data.header_length + data.block_descriptor_length;
1906 if (len > SD_BUF_SIZE) 1906 if (len > SD_BUF_SIZE)
1907 goto bad_sense; 1907 goto bad_sense;
1908 1908
1909 /* Get the data */ 1909 /* Get the data */
1910 res = sd_do_mode_sense(sdp, dbd, modepage, buffer, len, &data, &sshdr); 1910 res = sd_do_mode_sense(sdp, dbd, modepage, buffer, len, &data, &sshdr);
1911 1911
1912 if (scsi_status_is_good(res)) { 1912 if (scsi_status_is_good(res)) {
1913 int offset = data.header_length + data.block_descriptor_length; 1913 int offset = data.header_length + data.block_descriptor_length;
1914 1914
1915 if (offset >= SD_BUF_SIZE - 2) { 1915 if (offset >= SD_BUF_SIZE - 2) {
1916 sd_printk(KERN_ERR, sdkp, "Malformed MODE SENSE response\n"); 1916 sd_printk(KERN_ERR, sdkp, "Malformed MODE SENSE response\n");
1917 goto defaults; 1917 goto defaults;
1918 } 1918 }
1919 1919
1920 if ((buffer[offset] & 0x3f) != modepage) { 1920 if ((buffer[offset] & 0x3f) != modepage) {
1921 sd_printk(KERN_ERR, sdkp, "Got wrong page\n"); 1921 sd_printk(KERN_ERR, sdkp, "Got wrong page\n");
1922 goto defaults; 1922 goto defaults;
1923 } 1923 }
1924 1924
1925 if (modepage == 8) { 1925 if (modepage == 8) {
1926 sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0); 1926 sdkp->WCE = ((buffer[offset + 2] & 0x04) != 0);
1927 sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0); 1927 sdkp->RCD = ((buffer[offset + 2] & 0x01) != 0);
1928 } else { 1928 } else {
1929 sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0); 1929 sdkp->WCE = ((buffer[offset + 2] & 0x01) == 0);
1930 sdkp->RCD = 0; 1930 sdkp->RCD = 0;
1931 } 1931 }
1932 1932
1933 sdkp->DPOFUA = (data.device_specific & 0x10) != 0; 1933 sdkp->DPOFUA = (data.device_specific & 0x10) != 0;
1934 if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) { 1934 if (sdkp->DPOFUA && !sdkp->device->use_10_for_rw) {
1935 sd_printk(KERN_NOTICE, sdkp, 1935 sd_printk(KERN_NOTICE, sdkp,
1936 "Uses READ/WRITE(6), disabling FUA\n"); 1936 "Uses READ/WRITE(6), disabling FUA\n");
1937 sdkp->DPOFUA = 0; 1937 sdkp->DPOFUA = 0;
1938 } 1938 }
1939 1939
1940 if (sdkp->first_scan || old_wce != sdkp->WCE || 1940 if (sdkp->first_scan || old_wce != sdkp->WCE ||
1941 old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA) 1941 old_rcd != sdkp->RCD || old_dpofua != sdkp->DPOFUA)
1942 sd_printk(KERN_NOTICE, sdkp, 1942 sd_printk(KERN_NOTICE, sdkp,
1943 "Write cache: %s, read cache: %s, %s\n", 1943 "Write cache: %s, read cache: %s, %s\n",
1944 sdkp->WCE ? "enabled" : "disabled", 1944 sdkp->WCE ? "enabled" : "disabled",
1945 sdkp->RCD ? "disabled" : "enabled", 1945 sdkp->RCD ? "disabled" : "enabled",
1946 sdkp->DPOFUA ? "supports DPO and FUA" 1946 sdkp->DPOFUA ? "supports DPO and FUA"
1947 : "doesn't support DPO or FUA"); 1947 : "doesn't support DPO or FUA");
1948 1948
1949 return; 1949 return;
1950 } 1950 }
1951 1951
1952 bad_sense: 1952 bad_sense:
1953 if (scsi_sense_valid(&sshdr) && 1953 if (scsi_sense_valid(&sshdr) &&
1954 sshdr.sense_key == ILLEGAL_REQUEST && 1954 sshdr.sense_key == ILLEGAL_REQUEST &&
1955 sshdr.asc == 0x24 && sshdr.ascq == 0x0) 1955 sshdr.asc == 0x24 && sshdr.ascq == 0x0)
1956 /* Invalid field in CDB */ 1956 /* Invalid field in CDB */
1957 sd_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n"); 1957 sd_printk(KERN_NOTICE, sdkp, "Cache data unavailable\n");
1958 else 1958 else
1959 sd_printk(KERN_ERR, sdkp, "Asking for cache data failed\n"); 1959 sd_printk(KERN_ERR, sdkp, "Asking for cache data failed\n");
1960 1960
1961 defaults: 1961 defaults:
1962 sd_printk(KERN_ERR, sdkp, "Assuming drive cache: write through\n"); 1962 sd_printk(KERN_ERR, sdkp, "Assuming drive cache: write through\n");
1963 sdkp->WCE = 0; 1963 sdkp->WCE = 0;
1964 sdkp->RCD = 0; 1964 sdkp->RCD = 0;
1965 sdkp->DPOFUA = 0; 1965 sdkp->DPOFUA = 0;
1966 } 1966 }
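After the second MODE SENSE, the interesting bits live at a variable offset: the mode parameter header plus any block descriptors precede the caching page itself. A small standalone sketch of that decode is below; the buffer contents are fabricated for illustration, only the bit positions match the driver code above:

#include <stdbool.h>
#include <stdio.h>

struct cache_flags { bool wce, rcd, dpofua; };

/*
 * Decode write-cache-enable (WCE), read-cache-disable (RCD) and the
 * DPOFUA capability bit the same way sd_read_cache_type() does above.
 */
static struct cache_flags decode_caching_page(const unsigned char *buf,
					      unsigned header_len,
					      unsigned blk_desc_len,
					      unsigned char device_specific)
{
	unsigned off = header_len + blk_desc_len;
	struct cache_flags f = {
		.wce    = (buf[off + 2] & 0x04) != 0,
		.rcd    = (buf[off + 2] & 0x01) != 0,
		.dpofua = (device_specific & 0x10) != 0,
	};
	return f;
}

int main(void)
{
	/* 4-byte header, no block descriptors, then a caching page (0x08). */
	unsigned char buf[32] = { 0 };
	buf[4] = 0x08;		/* page code */
	buf[6] = 0x04;		/* WCE set, RCD clear */

	struct cache_flags f = decode_caching_page(buf, 4, 0, 0x10);
	printf("WCE=%d RCD=%d DPOFUA=%d\n", f.wce, f.rcd, f.dpofua);
	return 0;
}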
1967 1967
1968 /* 1968 /*
1969 * The ATO bit indicates whether the DIF application tag is available 1969 * The ATO bit indicates whether the DIF application tag is available
1970 * for use by the operating system. 1970 * for use by the operating system.
1971 */ 1971 */
1972 static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer) 1972 static void sd_read_app_tag_own(struct scsi_disk *sdkp, unsigned char *buffer)
1973 { 1973 {
1974 int res, offset; 1974 int res, offset;
1975 struct scsi_device *sdp = sdkp->device; 1975 struct scsi_device *sdp = sdkp->device;
1976 struct scsi_mode_data data; 1976 struct scsi_mode_data data;
1977 struct scsi_sense_hdr sshdr; 1977 struct scsi_sense_hdr sshdr;
1978 1978
1979 if (sdp->type != TYPE_DISK) 1979 if (sdp->type != TYPE_DISK)
1980 return; 1980 return;
1981 1981
1982 if (sdkp->protection_type == 0) 1982 if (sdkp->protection_type == 0)
1983 return; 1983 return;
1984 1984
1985 res = scsi_mode_sense(sdp, 1, 0x0a, buffer, 36, SD_TIMEOUT, 1985 res = scsi_mode_sense(sdp, 1, 0x0a, buffer, 36, SD_TIMEOUT,
1986 SD_MAX_RETRIES, &data, &sshdr); 1986 SD_MAX_RETRIES, &data, &sshdr);
1987 1987
1988 if (!scsi_status_is_good(res) || !data.header_length || 1988 if (!scsi_status_is_good(res) || !data.header_length ||
1989 data.length < 6) { 1989 data.length < 6) {
1990 sd_printk(KERN_WARNING, sdkp, 1990 sd_printk(KERN_WARNING, sdkp,
1991 "getting Control mode page failed, assume no ATO\n"); 1991 "getting Control mode page failed, assume no ATO\n");
1992 1992
1993 if (scsi_sense_valid(&sshdr)) 1993 if (scsi_sense_valid(&sshdr))
1994 sd_print_sense_hdr(sdkp, &sshdr); 1994 sd_print_sense_hdr(sdkp, &sshdr);
1995 1995
1996 return; 1996 return;
1997 } 1997 }
1998 1998
1999 offset = data.header_length + data.block_descriptor_length; 1999 offset = data.header_length + data.block_descriptor_length;
2000 2000
2001 if ((buffer[offset] & 0x3f) != 0x0a) { 2001 if ((buffer[offset] & 0x3f) != 0x0a) {
2002 sd_printk(KERN_ERR, sdkp, "ATO Got wrong page\n"); 2002 sd_printk(KERN_ERR, sdkp, "ATO Got wrong page\n");
2003 return; 2003 return;
2004 } 2004 }
2005 2005
2006 if ((buffer[offset + 5] & 0x80) == 0) 2006 if ((buffer[offset + 5] & 0x80) == 0)
2007 return; 2007 return;
2008 2008
2009 sdkp->ATO = 1; 2009 sdkp->ATO = 1;
2010 2010
2011 return; 2011 return;
2012 } 2012 }
2013 2013
2014 /** 2014 /**
2015 * sd_read_block_limits - Query disk device for preferred I/O sizes. 2015 * sd_read_block_limits - Query disk device for preferred I/O sizes.
2016 * @disk: disk to query 2016 * @disk: disk to query
2017 */ 2017 */
2018 static void sd_read_block_limits(struct scsi_disk *sdkp) 2018 static void sd_read_block_limits(struct scsi_disk *sdkp)
2019 { 2019 {
2020 struct request_queue *q = sdkp->disk->queue; 2020 struct request_queue *q = sdkp->disk->queue;
2021 unsigned int sector_sz = sdkp->device->sector_size; 2021 unsigned int sector_sz = sdkp->device->sector_size;
2022 const int vpd_len = 64; 2022 const int vpd_len = 64;
2023 unsigned char *buffer = kmalloc(vpd_len, GFP_KERNEL); 2023 unsigned char *buffer = kmalloc(vpd_len, GFP_KERNEL);
2024 2024
2025 if (!buffer || 2025 if (!buffer ||
2026 /* Block Limits VPD */ 2026 /* Block Limits VPD */
2027 scsi_get_vpd_page(sdkp->device, 0xb0, buffer, vpd_len)) 2027 scsi_get_vpd_page(sdkp->device, 0xb0, buffer, vpd_len))
2028 goto out; 2028 goto out;
2029 2029
2030 blk_queue_io_min(sdkp->disk->queue, 2030 blk_queue_io_min(sdkp->disk->queue,
2031 get_unaligned_be16(&buffer[6]) * sector_sz); 2031 get_unaligned_be16(&buffer[6]) * sector_sz);
2032 blk_queue_io_opt(sdkp->disk->queue, 2032 blk_queue_io_opt(sdkp->disk->queue,
2033 get_unaligned_be32(&buffer[12]) * sector_sz); 2033 get_unaligned_be32(&buffer[12]) * sector_sz);
2034 2034
2035 /* Thin provisioning enabled and page length indicates TP support */ 2035 /* Thin provisioning enabled and page length indicates TP support */
2036 if (sdkp->thin_provisioning && buffer[3] == 0x3c) { 2036 if (sdkp->thin_provisioning && buffer[3] == 0x3c) {
2037 unsigned int lba_count, desc_count, granularity; 2037 unsigned int lba_count, desc_count, granularity;
2038 2038
2039 lba_count = get_unaligned_be32(&buffer[20]); 2039 lba_count = get_unaligned_be32(&buffer[20]);
2040 desc_count = get_unaligned_be32(&buffer[24]); 2040 desc_count = get_unaligned_be32(&buffer[24]);
2041 2041
2042 if (lba_count) { 2042 if (lba_count) {
2043 q->limits.max_discard_sectors = 2043 q->limits.max_discard_sectors =
2044 lba_count * sector_sz >> 9; 2044 lba_count * sector_sz >> 9;
2045 2045
2046 if (desc_count) 2046 if (desc_count)
2047 sdkp->unmap = 1; 2047 sdkp->unmap = 1;
2048 } 2048 }
2049 2049
2050 granularity = get_unaligned_be32(&buffer[28]); 2050 granularity = get_unaligned_be32(&buffer[28]);
2051 2051
2052 if (granularity) 2052 if (granularity)
2053 q->limits.discard_granularity = granularity * sector_sz; 2053 q->limits.discard_granularity = granularity * sector_sz;
2054 2054
2055 if (buffer[32] & 0x80) 2055 if (buffer[32] & 0x80)
2056 q->limits.discard_alignment = 2056 q->limits.discard_alignment =
2057 get_unaligned_be32(&buffer[32]) & ~(1 << 31); 2057 get_unaligned_be32(&buffer[32]) & ~(1 << 31);
2058 } 2058 }
2059 2059
2060 out: 2060 out:
2061 kfree(buffer); 2061 kfree(buffer);
2062 } 2062 }
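The Block Limits VPD page is a big-endian byte blob, so the driver pulls each field out with get_unaligned_be16/be32 and scales it by the logical block size before handing it to the queue limits. A self-contained sketch of the same extraction, using plain userspace reimplementations of the byte helpers (not the kernel's) and made-up page contents:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-ins for the kernel's get_unaligned_be{16,32}(). */
static uint16_t be16(const unsigned char *p) { return (p[0] << 8) | p[1]; }
static uint32_t be32(const unsigned char *p)
{
	return ((uint32_t)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3];
}

int main(void)
{
	unsigned sector_sz = 512;
	unsigned char vpd[64] = { 0 };

	/* Fabricated page contents: granularity at offset 6, optimal length at 12. */
	vpd[6]  = 0x00; vpd[7]  = 0x08;				/* io_min = 8 blocks */
	vpd[12] = 0x00; vpd[13] = 0x00; vpd[14] = 0x04; vpd[15] = 0x00; /* io_opt = 1024 blocks */

	printf("io_min = %u bytes\n", be16(&vpd[6]) * sector_sz);
	printf("io_opt = %u bytes\n", be32(&vpd[12]) * sector_sz);
	return 0;
}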
2063 2063
2064 /** 2064 /**
2065 * sd_read_block_characteristics - Query block dev. characteristics 2065 * sd_read_block_characteristics - Query block dev. characteristics
2066 * @disk: disk to query 2066 * @disk: disk to query
2067 */ 2067 */
2068 static void sd_read_block_characteristics(struct scsi_disk *sdkp) 2068 static void sd_read_block_characteristics(struct scsi_disk *sdkp)
2069 { 2069 {
2070 unsigned char *buffer; 2070 unsigned char *buffer;
2071 u16 rot; 2071 u16 rot;
2072 const int vpd_len = 64; 2072 const int vpd_len = 64;
2073 2073
2074 buffer = kmalloc(vpd_len, GFP_KERNEL); 2074 buffer = kmalloc(vpd_len, GFP_KERNEL);
2075 2075
2076 if (!buffer || 2076 if (!buffer ||
2077 /* Block Device Characteristics VPD */ 2077 /* Block Device Characteristics VPD */
2078 scsi_get_vpd_page(sdkp->device, 0xb1, buffer, vpd_len)) 2078 scsi_get_vpd_page(sdkp->device, 0xb1, buffer, vpd_len))
2079 goto out; 2079 goto out;
2080 2080
2081 rot = get_unaligned_be16(&buffer[4]); 2081 rot = get_unaligned_be16(&buffer[4]);
2082 2082
2083 if (rot == 1) 2083 if (rot == 1)
2084 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue); 2084 queue_flag_set_unlocked(QUEUE_FLAG_NONROT, sdkp->disk->queue);
2085 2085
2086 out: 2086 out:
2087 kfree(buffer); 2087 kfree(buffer);
2088 } 2088 }
2089 2089
2090 static int sd_try_extended_inquiry(struct scsi_device *sdp) 2090 static int sd_try_extended_inquiry(struct scsi_device *sdp)
2091 { 2091 {
2092 /* 2092 /*
2093 * Although VPD inquiries can go to SCSI-2 type devices, 2093 * Although VPD inquiries can go to SCSI-2 type devices,
2094 * some USB ones crash on receiving them, and the pages 2094 * some USB ones crash on receiving them, and the pages
2095 * we currently ask for are for SPC-3 and beyond 2095 * we currently ask for are for SPC-3 and beyond
2096 */ 2096 */
2097 if (sdp->scsi_level > SCSI_SPC_2) 2097 if (sdp->scsi_level > SCSI_SPC_2)
2098 return 1; 2098 return 1;
2099 return 0; 2099 return 0;
2100 } 2100 }
2101 2101
2102 /** 2102 /**
2103 * sd_revalidate_disk - called the first time a new disk is seen, 2103 * sd_revalidate_disk - called the first time a new disk is seen,
2104 * performs disk spin up, read_capacity, etc. 2104 * performs disk spin up, read_capacity, etc.
2105 * @disk: struct gendisk we care about 2105 * @disk: struct gendisk we care about
2106 **/ 2106 **/
2107 static int sd_revalidate_disk(struct gendisk *disk) 2107 static int sd_revalidate_disk(struct gendisk *disk)
2108 { 2108 {
2109 struct scsi_disk *sdkp = scsi_disk(disk); 2109 struct scsi_disk *sdkp = scsi_disk(disk);
2110 struct scsi_device *sdp = sdkp->device; 2110 struct scsi_device *sdp = sdkp->device;
2111 unsigned char *buffer; 2111 unsigned char *buffer;
2112 unsigned ordered; 2112 unsigned flush = 0;
2113 2113
2114 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp, 2114 SCSI_LOG_HLQUEUE(3, sd_printk(KERN_INFO, sdkp,
2115 "sd_revalidate_disk\n")); 2115 "sd_revalidate_disk\n"));
2116 2116
2117 /* 2117 /*
2118 * If the device is offline, don't try and read capacity or any 2118 * If the device is offline, don't try and read capacity or any
2119 * of the other niceties. 2119 * of the other niceties.
2120 */ 2120 */
2121 if (!scsi_device_online(sdp)) 2121 if (!scsi_device_online(sdp))
2122 goto out; 2122 goto out;
2123 2123
2124 buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL); 2124 buffer = kmalloc(SD_BUF_SIZE, GFP_KERNEL);
2125 if (!buffer) { 2125 if (!buffer) {
2126 sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory " 2126 sd_printk(KERN_WARNING, sdkp, "sd_revalidate_disk: Memory "
2127 "allocation failure.\n"); 2127 "allocation failure.\n");
2128 goto out; 2128 goto out;
2129 } 2129 }
2130 2130
2131 sd_spinup_disk(sdkp); 2131 sd_spinup_disk(sdkp);
2132 2132
2133 /* 2133 /*
2134 * Without media there is no reason to ask; moreover, some devices 2134 * Without media there is no reason to ask; moreover, some devices
2135 * react badly if we do. 2135 * react badly if we do.
2136 */ 2136 */
2137 if (sdkp->media_present) { 2137 if (sdkp->media_present) {
2138 sd_read_capacity(sdkp, buffer); 2138 sd_read_capacity(sdkp, buffer);
2139 2139
2140 if (sd_try_extended_inquiry(sdp)) { 2140 if (sd_try_extended_inquiry(sdp)) {
2141 sd_read_block_limits(sdkp); 2141 sd_read_block_limits(sdkp);
2142 sd_read_block_characteristics(sdkp); 2142 sd_read_block_characteristics(sdkp);
2143 } 2143 }
2144 2144
2145 sd_read_write_protect_flag(sdkp, buffer); 2145 sd_read_write_protect_flag(sdkp, buffer);
2146 sd_read_cache_type(sdkp, buffer); 2146 sd_read_cache_type(sdkp, buffer);
2147 sd_read_app_tag_own(sdkp, buffer); 2147 sd_read_app_tag_own(sdkp, buffer);
2148 } 2148 }
2149 2149
2150 sdkp->first_scan = 0; 2150 sdkp->first_scan = 0;
2151 2151
2152 /* 2152 /*
2153 * We now have all cache related info, determine how we deal 2153 * We now have all cache related info, determine how we deal
2154 * with ordered requests. 2154 * with flush requests.
2155 */ 2155 */
2156 if (sdkp->WCE) 2156 if (sdkp->WCE) {
2157 ordered = sdkp->DPOFUA 2157 flush |= REQ_FLUSH;
2158 ? QUEUE_ORDERED_DRAIN_FUA : QUEUE_ORDERED_DRAIN_FLUSH; 2158 if (sdkp->DPOFUA)
2159 else 2159 flush |= REQ_FUA;
2160 ordered = QUEUE_ORDERED_DRAIN; 2160 }
2161 2161
2162 blk_queue_ordered(sdkp->disk->queue, ordered); 2162 blk_queue_flush(sdkp->disk->queue, flush);
2163 2163
2164 set_capacity(disk, sdkp->capacity); 2164 set_capacity(disk, sdkp->capacity);
2165 kfree(buffer); 2165 kfree(buffer);
2166 2166
2167 out: 2167 out:
2168 return 0; 2168 return 0;
2169 } 2169 }
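This hunk is where the sd conversion lands: instead of picking a QUEUE_ORDERED_* mode, the driver now reports only which of REQ_FLUSH and REQ_FUA the device can make use of. A toy sketch of that decision table follows; the flag values are illustrative placeholders, not the block layer's:

#include <stdio.h>

/* Illustrative flag values; the real ones come from the block layer. */
#define EX_REQ_FLUSH (1u << 0)
#define EX_REQ_FUA   (1u << 1)

/* WCE: volatile write cache present; DPOFUA: device honours FUA writes. */
static unsigned flush_flags(int wce, int dpofua)
{
	unsigned flush = 0;

	if (wce) {
		flush |= EX_REQ_FLUSH;
		if (dpofua)
			flush |= EX_REQ_FUA;
	}
	return flush;
}

int main(void)
{
	printf("no cache      -> 0x%x\n", flush_flags(0, 0));
	printf("cache, no FUA -> 0x%x\n", flush_flags(1, 0));
	printf("cache and FUA -> 0x%x\n", flush_flags(1, 1));
	return 0;
}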
2170 2170
2171 /** 2171 /**
2172 * sd_unlock_native_capacity - unlock native capacity 2172 * sd_unlock_native_capacity - unlock native capacity
2173 * @disk: struct gendisk to set capacity for 2173 * @disk: struct gendisk to set capacity for
2174 * 2174 *
2175 * Block layer calls this function if it detects that partitions 2175 * Block layer calls this function if it detects that partitions
2176 * on @disk reach beyond the end of the device. If the SCSI host 2176 * on @disk reach beyond the end of the device. If the SCSI host
2177 * implements ->unlock_native_capacity() method, it's invoked to 2177 * implements ->unlock_native_capacity() method, it's invoked to
2178 * give it a chance to adjust the device capacity. 2178 * give it a chance to adjust the device capacity.
2179 * 2179 *
2180 * CONTEXT: 2180 * CONTEXT:
2181 * Defined by block layer. Might sleep. 2181 * Defined by block layer. Might sleep.
2182 */ 2182 */
2183 static void sd_unlock_native_capacity(struct gendisk *disk) 2183 static void sd_unlock_native_capacity(struct gendisk *disk)
2184 { 2184 {
2185 struct scsi_device *sdev = scsi_disk(disk)->device; 2185 struct scsi_device *sdev = scsi_disk(disk)->device;
2186 2186
2187 if (sdev->host->hostt->unlock_native_capacity) 2187 if (sdev->host->hostt->unlock_native_capacity)
2188 sdev->host->hostt->unlock_native_capacity(sdev); 2188 sdev->host->hostt->unlock_native_capacity(sdev);
2189 } 2189 }
2190 2190
2191 /** 2191 /**
2192 * sd_format_disk_name - format disk name 2192 * sd_format_disk_name - format disk name
2193 * @prefix: name prefix - ie. "sd" for SCSI disks 2193 * @prefix: name prefix - ie. "sd" for SCSI disks
2194 * @index: index of the disk to format name for 2194 * @index: index of the disk to format name for
2195 * @buf: output buffer 2195 * @buf: output buffer
2196 * @buflen: length of the output buffer 2196 * @buflen: length of the output buffer
2197 * 2197 *
2198 * SCSI disk names start at sda. The 26th device is sdz and the 2198 * SCSI disk names start at sda. The 26th device is sdz and the
2199 * 27th is sdaa. The last one for two lettered suffix is sdzz 2199 * 27th is sdaa. The last one for two lettered suffix is sdzz
2200 * which is followed by sdaaa. 2200 * which is followed by sdaaa.
2201 * 2201 *
2202 * This is basically base-26 counting with one extra 'nil' entry 2202 * This is basically base-26 counting with one extra 'nil' entry
2203 * at the beginning from the second digit on, and can be 2203 * at the beginning from the second digit on, and can be
2204 * computed using the same method as a base-26 conversion with 2204 * computed using the same method as a base-26 conversion with
2205 * the index shifted by -1 after each digit is computed. 2205 * the index shifted by -1 after each digit is computed.
2206 * 2206 *
2207 * CONTEXT: 2207 * CONTEXT:
2208 * Don't care. 2208 * Don't care.
2209 * 2209 *
2210 * RETURNS: 2210 * RETURNS:
2211 * 0 on success, -errno on failure. 2211 * 0 on success, -errno on failure.
2212 */ 2212 */
2213 static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen) 2213 static int sd_format_disk_name(char *prefix, int index, char *buf, int buflen)
2214 { 2214 {
2215 const int base = 'z' - 'a' + 1; 2215 const int base = 'z' - 'a' + 1;
2216 char *begin = buf + strlen(prefix); 2216 char *begin = buf + strlen(prefix);
2217 char *end = buf + buflen; 2217 char *end = buf + buflen;
2218 char *p; 2218 char *p;
2219 int unit; 2219 int unit;
2220 2220
2221 p = end - 1; 2221 p = end - 1;
2222 *p = '\0'; 2222 *p = '\0';
2223 unit = base; 2223 unit = base;
2224 do { 2224 do {
2225 if (p == begin) 2225 if (p == begin)
2226 return -EINVAL; 2226 return -EINVAL;
2227 *--p = 'a' + (index % unit); 2227 *--p = 'a' + (index % unit);
2228 index = (index / unit) - 1; 2228 index = (index / unit) - 1;
2229 } while (index >= 0); 2229 } while (index >= 0);
2230 2230
2231 memmove(begin, p, end - p); 2231 memmove(begin, p, end - p);
2232 memcpy(buf, prefix, strlen(prefix)); 2232 memcpy(buf, prefix, strlen(prefix));
2233 2233
2234 return 0; 2234 return 0;
2235 } 2235 }
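The comment above describes the naming scheme as base-26 counting with a phantom 'nil' digit, and the loop implements it by shifting the index down by one after each digit. The routine is self-contained enough that a lightly adapted copy runs in userspace, which makes the scheme easy to check (sda, sdz, sdaa, sdzz, sdaaa):

#include <stdio.h>
#include <string.h>

/* Adapted copy of sd_format_disk_name() for illustration only. */
static int format_disk_name(const char *prefix, int index, char *buf, int buflen)
{
	const int base = 'z' - 'a' + 1;
	char *begin = buf + strlen(prefix);
	char *end = buf + buflen;
	char *p = end - 1;

	*p = '\0';
	do {
		if (p == begin)
			return -1;
		*--p = 'a' + (index % base);
		index = (index / base) - 1;
	} while (index >= 0);

	memmove(begin, p, end - p);
	memcpy(buf, prefix, strlen(prefix));
	return 0;
}

int main(void)
{
	int samples[] = { 0, 25, 26, 701, 702 };	/* sda, sdz, sdaa, sdzz, sdaaa */
	char name[32];

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		format_disk_name("sd", samples[i], name, sizeof(name));
		printf("%d -> %s\n", samples[i], name);
	}
	return 0;
}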
2236 2236
2237 /* 2237 /*
2238 * The asynchronous part of sd_probe 2238 * The asynchronous part of sd_probe
2239 */ 2239 */
2240 static void sd_probe_async(void *data, async_cookie_t cookie) 2240 static void sd_probe_async(void *data, async_cookie_t cookie)
2241 { 2241 {
2242 struct scsi_disk *sdkp = data; 2242 struct scsi_disk *sdkp = data;
2243 struct scsi_device *sdp; 2243 struct scsi_device *sdp;
2244 struct gendisk *gd; 2244 struct gendisk *gd;
2245 u32 index; 2245 u32 index;
2246 struct device *dev; 2246 struct device *dev;
2247 2247
2248 sdp = sdkp->device; 2248 sdp = sdkp->device;
2249 gd = sdkp->disk; 2249 gd = sdkp->disk;
2250 index = sdkp->index; 2250 index = sdkp->index;
2251 dev = &sdp->sdev_gendev; 2251 dev = &sdp->sdev_gendev;
2252 2252
2253 if (index < SD_MAX_DISKS) { 2253 if (index < SD_MAX_DISKS) {
2254 gd->major = sd_major((index & 0xf0) >> 4); 2254 gd->major = sd_major((index & 0xf0) >> 4);
2255 gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00); 2255 gd->first_minor = ((index & 0xf) << 4) | (index & 0xfff00);
2256 gd->minors = SD_MINORS; 2256 gd->minors = SD_MINORS;
2257 } 2257 }
2258 gd->fops = &sd_fops; 2258 gd->fops = &sd_fops;
2259 gd->private_data = &sdkp->driver; 2259 gd->private_data = &sdkp->driver;
2260 gd->queue = sdkp->device->request_queue; 2260 gd->queue = sdkp->device->request_queue;
2261 2261
2262 /* defaults, until the device tells us otherwise */ 2262 /* defaults, until the device tells us otherwise */
2263 sdp->sector_size = 512; 2263 sdp->sector_size = 512;
2264 sdkp->capacity = 0; 2264 sdkp->capacity = 0;
2265 sdkp->media_present = 1; 2265 sdkp->media_present = 1;
2266 sdkp->write_prot = 0; 2266 sdkp->write_prot = 0;
2267 sdkp->WCE = 0; 2267 sdkp->WCE = 0;
2268 sdkp->RCD = 0; 2268 sdkp->RCD = 0;
2269 sdkp->ATO = 0; 2269 sdkp->ATO = 0;
2270 sdkp->first_scan = 1; 2270 sdkp->first_scan = 1;
2271 2271
2272 sd_revalidate_disk(gd); 2272 sd_revalidate_disk(gd);
2273 2273
2274 blk_queue_prep_rq(sdp->request_queue, sd_prep_fn); 2274 blk_queue_prep_rq(sdp->request_queue, sd_prep_fn);
2275 blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn); 2275 blk_queue_unprep_rq(sdp->request_queue, sd_unprep_fn);
2276 2276
2277 gd->driverfs_dev = &sdp->sdev_gendev; 2277 gd->driverfs_dev = &sdp->sdev_gendev;
2278 gd->flags = GENHD_FL_EXT_DEVT; 2278 gd->flags = GENHD_FL_EXT_DEVT;
2279 if (sdp->removable) 2279 if (sdp->removable)
2280 gd->flags |= GENHD_FL_REMOVABLE; 2280 gd->flags |= GENHD_FL_REMOVABLE;
2281 2281
2282 add_disk(gd); 2282 add_disk(gd);
2283 sd_dif_config_host(sdkp); 2283 sd_dif_config_host(sdkp);
2284 2284
2285 sd_revalidate_disk(gd); 2285 sd_revalidate_disk(gd);
2286 2286
2287 sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n", 2287 sd_printk(KERN_NOTICE, sdkp, "Attached SCSI %sdisk\n",
2288 sdp->removable ? "removable " : ""); 2288 sdp->removable ? "removable " : "");
2289 scsi_autopm_put_device(sdp); 2289 scsi_autopm_put_device(sdp);
2290 put_device(&sdkp->dev); 2290 put_device(&sdkp->dev);
2291 } 2291 }
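The index-to-device-number mapping in sd_probe_async() above is pure bit slicing: bits 4-7 of the index pick one of the sd block majors (through sd_major(), whose mapping is not shown here), bits 0-3 pick the slot within that major, shifted left by four because each disk owns a block of 16 minors, and anything above bit 7 is folded into the minor number for the extended dev_t range enabled by GENHD_FL_EXT_DEVT. A small sketch of just the arithmetic, with major_slot standing in for the sd_major() argument:

#include <stdio.h>

struct placement { unsigned major_slot; unsigned first_minor; };

/* Mirror the bit slicing used in sd_probe_async() above. */
static struct placement place_disk(unsigned index)
{
	struct placement p = {
		.major_slot  = (index & 0xf0) >> 4,
		.first_minor = ((index & 0xf) << 4) | (index & 0xfff00),
	};
	return p;
}

int main(void)
{
	unsigned samples[] = { 0, 1, 15, 16, 255, 256 };

	for (unsigned i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		struct placement p = place_disk(samples[i]);
		printf("index %3u -> major slot %2u, first minor %u\n",
		       samples[i], p.major_slot, p.first_minor);
	}
	return 0;
}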
2292 2292
2293 /** 2293 /**
2294 * sd_probe - called during driver initialization and whenever a 2294 * sd_probe - called during driver initialization and whenever a
2295 * new scsi device is attached to the system. It is called once 2295 * new scsi device is attached to the system. It is called once
2296 * for each scsi device (not just disks) present. 2296 * for each scsi device (not just disks) present.
2297 * @dev: pointer to device object 2297 * @dev: pointer to device object
2298 * 2298 *
2299 * Returns 0 if successful (or not interested in this scsi device 2299 * Returns 0 if successful (or not interested in this scsi device
2300 * (e.g. scanner)); 1 when there is an error. 2300 * (e.g. scanner)); 1 when there is an error.
2301 * 2301 *
2302 * Note: this function is invoked from the scsi mid-level. 2302 * Note: this function is invoked from the scsi mid-level.
2303 * This function sets up the mapping between a given 2303 * This function sets up the mapping between a given
2304 * <host,channel,id,lun> (found in sdp) and new device name 2304 * <host,channel,id,lun> (found in sdp) and new device name
2305 * (e.g. /dev/sda). More precisely it is the block device major 2305 * (e.g. /dev/sda). More precisely it is the block device major
2306 * and minor number that is chosen here. 2306 * and minor number that is chosen here.
2307 * 2307 *
2308 * Assume sd_attach is not re-entrant (for the time being). 2308 * Assume sd_attach is not re-entrant (for the time being).
2309 * Also think about sd_attach() and sd_remove() running concurrently. 2309 * Also think about sd_attach() and sd_remove() running concurrently.
2310 **/ 2310 **/
2311 static int sd_probe(struct device *dev) 2311 static int sd_probe(struct device *dev)
2312 { 2312 {
2313 struct scsi_device *sdp = to_scsi_device(dev); 2313 struct scsi_device *sdp = to_scsi_device(dev);
2314 struct scsi_disk *sdkp; 2314 struct scsi_disk *sdkp;
2315 struct gendisk *gd; 2315 struct gendisk *gd;
2316 int index; 2316 int index;
2317 int error; 2317 int error;
2318 2318
2319 error = -ENODEV; 2319 error = -ENODEV;
2320 if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC) 2320 if (sdp->type != TYPE_DISK && sdp->type != TYPE_MOD && sdp->type != TYPE_RBC)
2321 goto out; 2321 goto out;
2322 2322
2323 SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp, 2323 SCSI_LOG_HLQUEUE(3, sdev_printk(KERN_INFO, sdp,
2324 "sd_attach\n")); 2324 "sd_attach\n"));
2325 2325
2326 error = -ENOMEM; 2326 error = -ENOMEM;
2327 sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL); 2327 sdkp = kzalloc(sizeof(*sdkp), GFP_KERNEL);
2328 if (!sdkp) 2328 if (!sdkp)
2329 goto out; 2329 goto out;
2330 2330
2331 gd = alloc_disk(SD_MINORS); 2331 gd = alloc_disk(SD_MINORS);
2332 if (!gd) 2332 if (!gd)
2333 goto out_free; 2333 goto out_free;
2334 2334
2335 do { 2335 do {
2336 if (!ida_pre_get(&sd_index_ida, GFP_KERNEL)) 2336 if (!ida_pre_get(&sd_index_ida, GFP_KERNEL))
2337 goto out_put; 2337 goto out_put;
2338 2338
2339 spin_lock(&sd_index_lock); 2339 spin_lock(&sd_index_lock);
2340 error = ida_get_new(&sd_index_ida, &index); 2340 error = ida_get_new(&sd_index_ida, &index);
2341 spin_unlock(&sd_index_lock); 2341 spin_unlock(&sd_index_lock);
2342 } while (error == -EAGAIN); 2342 } while (error == -EAGAIN);
2343 2343
2344 if (error) 2344 if (error)
2345 goto out_put; 2345 goto out_put;
2346 2346
2347 error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN); 2347 error = sd_format_disk_name("sd", index, gd->disk_name, DISK_NAME_LEN);
2348 if (error) 2348 if (error)
2349 goto out_free_index; 2349 goto out_free_index;
2350 2350
2351 sdkp->device = sdp; 2351 sdkp->device = sdp;
2352 sdkp->driver = &sd_template; 2352 sdkp->driver = &sd_template;
2353 sdkp->disk = gd; 2353 sdkp->disk = gd;
2354 sdkp->index = index; 2354 sdkp->index = index;
2355 atomic_set(&sdkp->openers, 0); 2355 atomic_set(&sdkp->openers, 0);
2356 sdkp->previous_state = 1; 2356 sdkp->previous_state = 1;
2357 2357
2358 if (!sdp->request_queue->rq_timeout) { 2358 if (!sdp->request_queue->rq_timeout) {
2359 if (sdp->type != TYPE_MOD) 2359 if (sdp->type != TYPE_MOD)
2360 blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT); 2360 blk_queue_rq_timeout(sdp->request_queue, SD_TIMEOUT);
2361 else 2361 else
2362 blk_queue_rq_timeout(sdp->request_queue, 2362 blk_queue_rq_timeout(sdp->request_queue,
2363 SD_MOD_TIMEOUT); 2363 SD_MOD_TIMEOUT);
2364 } 2364 }
2365 2365
2366 device_initialize(&sdkp->dev); 2366 device_initialize(&sdkp->dev);
2367 sdkp->dev.parent = dev; 2367 sdkp->dev.parent = dev;
2368 sdkp->dev.class = &sd_disk_class; 2368 sdkp->dev.class = &sd_disk_class;
2369 dev_set_name(&sdkp->dev, dev_name(dev)); 2369 dev_set_name(&sdkp->dev, dev_name(dev));
2370 2370
2371 if (device_add(&sdkp->dev)) 2371 if (device_add(&sdkp->dev))
2372 goto out_free_index; 2372 goto out_free_index;
2373 2373
2374 get_device(dev); 2374 get_device(dev);
2375 dev_set_drvdata(dev, sdkp); 2375 dev_set_drvdata(dev, sdkp);
2376 2376
2377 get_device(&sdkp->dev); /* prevent release before async_schedule */ 2377 get_device(&sdkp->dev); /* prevent release before async_schedule */
2378 async_schedule(sd_probe_async, sdkp); 2378 async_schedule(sd_probe_async, sdkp);
2379 2379
2380 return 0; 2380 return 0;
2381 2381
2382 out_free_index: 2382 out_free_index:
2383 spin_lock(&sd_index_lock); 2383 spin_lock(&sd_index_lock);
2384 ida_remove(&sd_index_ida, index); 2384 ida_remove(&sd_index_ida, index);
2385 spin_unlock(&sd_index_lock); 2385 spin_unlock(&sd_index_lock);
2386 out_put: 2386 out_put:
2387 put_disk(gd); 2387 put_disk(gd);
2388 out_free: 2388 out_free:
2389 kfree(sdkp); 2389 kfree(sdkp);
2390 out: 2390 out:
2391 return error; 2391 return error;
2392 } 2392 }
2393 2393
2394 /** 2394 /**
2395 * sd_remove - called whenever a scsi disk (previously recognized by 2395 * sd_remove - called whenever a scsi disk (previously recognized by
2396 * sd_probe) is detached from the system. It is called (potentially 2396 * sd_probe) is detached from the system. It is called (potentially
2397 * multiple times) during sd module unload. 2397 * multiple times) during sd module unload.
2398 * @sdp: pointer to mid level scsi device object 2398 * @sdp: pointer to mid level scsi device object
2399 * 2399 *
2400 * Note: this function is invoked from the scsi mid-level. 2400 * Note: this function is invoked from the scsi mid-level.
2401 * This function potentially frees up a device name (e.g. /dev/sdc) 2401 * This function potentially frees up a device name (e.g. /dev/sdc)
2402 * that could be re-used by a subsequent sd_probe(). 2402 * that could be re-used by a subsequent sd_probe().
2403 * This function is not called when the built-in sd driver is "exit-ed". 2403 * This function is not called when the built-in sd driver is "exit-ed".
2404 **/ 2404 **/
2405 static int sd_remove(struct device *dev) 2405 static int sd_remove(struct device *dev)
2406 { 2406 {
2407 struct scsi_disk *sdkp; 2407 struct scsi_disk *sdkp;
2408 2408
2409 sdkp = dev_get_drvdata(dev); 2409 sdkp = dev_get_drvdata(dev);
2410 scsi_autopm_get_device(sdkp->device); 2410 scsi_autopm_get_device(sdkp->device);
2411 2411
2412 async_synchronize_full(); 2412 async_synchronize_full();
2413 blk_queue_prep_rq(sdkp->device->request_queue, scsi_prep_fn); 2413 blk_queue_prep_rq(sdkp->device->request_queue, scsi_prep_fn);
2414 blk_queue_unprep_rq(sdkp->device->request_queue, NULL); 2414 blk_queue_unprep_rq(sdkp->device->request_queue, NULL);
2415 device_del(&sdkp->dev); 2415 device_del(&sdkp->dev);
2416 del_gendisk(sdkp->disk); 2416 del_gendisk(sdkp->disk);
2417 sd_shutdown(dev); 2417 sd_shutdown(dev);
2418 2418
2419 mutex_lock(&sd_ref_mutex); 2419 mutex_lock(&sd_ref_mutex);
2420 dev_set_drvdata(dev, NULL); 2420 dev_set_drvdata(dev, NULL);
2421 put_device(&sdkp->dev); 2421 put_device(&sdkp->dev);
2422 mutex_unlock(&sd_ref_mutex); 2422 mutex_unlock(&sd_ref_mutex);
2423 2423
2424 return 0; 2424 return 0;
2425 } 2425 }
2426 2426
2427 /** 2427 /**
2428 * scsi_disk_release - Called to free the scsi_disk structure 2428 * scsi_disk_release - Called to free the scsi_disk structure
2429 * @dev: pointer to embedded class device 2429 * @dev: pointer to embedded class device
2430 * 2430 *
2431 * sd_ref_mutex must be held entering this routine. Because it is 2431 * sd_ref_mutex must be held entering this routine. Because it is
2432 * called on last put, you should always use the scsi_disk_get() 2432 * called on last put, you should always use the scsi_disk_get()
2433 * scsi_disk_put() helpers which manipulate the semaphore directly 2433 * scsi_disk_put() helpers which manipulate the semaphore directly
2434 * and never do a direct put_device. 2434 * and never do a direct put_device.
2435 **/ 2435 **/
2436 static void scsi_disk_release(struct device *dev) 2436 static void scsi_disk_release(struct device *dev)
2437 { 2437 {
2438 struct scsi_disk *sdkp = to_scsi_disk(dev); 2438 struct scsi_disk *sdkp = to_scsi_disk(dev);
2439 struct gendisk *disk = sdkp->disk; 2439 struct gendisk *disk = sdkp->disk;
2440 2440
2441 spin_lock(&sd_index_lock); 2441 spin_lock(&sd_index_lock);
2442 ida_remove(&sd_index_ida, sdkp->index); 2442 ida_remove(&sd_index_ida, sdkp->index);
2443 spin_unlock(&sd_index_lock); 2443 spin_unlock(&sd_index_lock);
2444 2444
2445 disk->private_data = NULL; 2445 disk->private_data = NULL;
2446 put_disk(disk); 2446 put_disk(disk);
2447 put_device(&sdkp->device->sdev_gendev); 2447 put_device(&sdkp->device->sdev_gendev);
2448 2448
2449 kfree(sdkp); 2449 kfree(sdkp);
2450 } 2450 }
2451 2451
2452 static int sd_start_stop_device(struct scsi_disk *sdkp, int start) 2452 static int sd_start_stop_device(struct scsi_disk *sdkp, int start)
2453 { 2453 {
2454 unsigned char cmd[6] = { START_STOP }; /* START_VALID */ 2454 unsigned char cmd[6] = { START_STOP }; /* START_VALID */
2455 struct scsi_sense_hdr sshdr; 2455 struct scsi_sense_hdr sshdr;
2456 struct scsi_device *sdp = sdkp->device; 2456 struct scsi_device *sdp = sdkp->device;
2457 int res; 2457 int res;
2458 2458
2459 if (start) 2459 if (start)
2460 cmd[4] |= 1; /* START */ 2460 cmd[4] |= 1; /* START */
2461 2461
2462 if (sdp->start_stop_pwr_cond) 2462 if (sdp->start_stop_pwr_cond)
2463 cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */ 2463 cmd[4] |= start ? 1 << 4 : 3 << 4; /* Active or Standby */
2464 2464
2465 if (!scsi_device_online(sdp)) 2465 if (!scsi_device_online(sdp))
2466 return -ENODEV; 2466 return -ENODEV;
2467 2467
2468 res = scsi_execute_req(sdp, cmd, DMA_NONE, NULL, 0, &sshdr, 2468 res = scsi_execute_req(sdp, cmd, DMA_NONE, NULL, 0, &sshdr,
2469 SD_TIMEOUT, SD_MAX_RETRIES, NULL); 2469 SD_TIMEOUT, SD_MAX_RETRIES, NULL);
2470 if (res) { 2470 if (res) {
2471 sd_printk(KERN_WARNING, sdkp, "START_STOP FAILED\n"); 2471 sd_printk(KERN_WARNING, sdkp, "START_STOP FAILED\n");
2472 sd_print_result(sdkp, res); 2472 sd_print_result(sdkp, res);
2473 if (driver_byte(res) & DRIVER_SENSE) 2473 if (driver_byte(res) & DRIVER_SENSE)
2474 sd_print_sense_hdr(sdkp, &sshdr); 2474 sd_print_sense_hdr(sdkp, &sshdr);
2475 } 2475 }
2476 2476
2477 return res; 2477 return res;
2478 } 2478 }
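sd_start_stop_device() builds a six-byte START STOP UNIT CDB: bit 0 of byte 4 is the START bit, and when the device supports power conditions the same byte carries ACTIVE (0x1) or STANDBY (0x3) in its upper nibble. A sketch of just the CDB construction (opcode value taken from SPC; no I/O is issued):

#include <stdio.h>

#define START_STOP_OPCODE 0x1b	/* START STOP UNIT (SPC) */

/* Build the CDB the same way the driver does; does not issue any command. */
static void build_start_stop_cdb(unsigned char cmd[6], int start, int use_power_cond)
{
	for (int i = 0; i < 6; i++)
		cmd[i] = 0;
	cmd[0] = START_STOP_OPCODE;
	if (start)
		cmd[4] |= 1;				/* START bit */
	if (use_power_cond)
		cmd[4] |= start ? 1 << 4 : 3 << 4;	/* ACTIVE or STANDBY */
}

int main(void)
{
	unsigned char cdb[6];

	build_start_stop_cdb(cdb, 1, 1);
	printf("start, power cond: byte4=0x%02x\n", cdb[4]);
	build_start_stop_cdb(cdb, 0, 1);
	printf("stop,  power cond: byte4=0x%02x\n", cdb[4]);
	return 0;
}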
2479 2479
2480 /* 2480 /*
2481 * Send a SYNCHRONIZE CACHE instruction down to the device through 2481 * Send a SYNCHRONIZE CACHE instruction down to the device through
2482 * the normal SCSI command structure. Wait for the command to 2482 * the normal SCSI command structure. Wait for the command to
2483 * complete. 2483 * complete.
2484 */ 2484 */
2485 static void sd_shutdown(struct device *dev) 2485 static void sd_shutdown(struct device *dev)
2486 { 2486 {
2487 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); 2487 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev);
2488 2488
2489 if (!sdkp) 2489 if (!sdkp)
2490 return; /* this can happen */ 2490 return; /* this can happen */
2491 2491
2492 if (sdkp->WCE) { 2492 if (sdkp->WCE) {
2493 sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); 2493 sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
2494 sd_sync_cache(sdkp); 2494 sd_sync_cache(sdkp);
2495 } 2495 }
2496 2496
2497 if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) { 2497 if (system_state != SYSTEM_RESTART && sdkp->device->manage_start_stop) {
2498 sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); 2498 sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
2499 sd_start_stop_device(sdkp, 0); 2499 sd_start_stop_device(sdkp, 0);
2500 } 2500 }
2501 2501
2502 scsi_disk_put(sdkp); 2502 scsi_disk_put(sdkp);
2503 } 2503 }
2504 2504
2505 static int sd_suspend(struct device *dev, pm_message_t mesg) 2505 static int sd_suspend(struct device *dev, pm_message_t mesg)
2506 { 2506 {
2507 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); 2507 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev);
2508 int ret = 0; 2508 int ret = 0;
2509 2509
2510 if (!sdkp) 2510 if (!sdkp)
2511 return 0; /* this can happen */ 2511 return 0; /* this can happen */
2512 2512
2513 if (sdkp->WCE) { 2513 if (sdkp->WCE) {
2514 sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n"); 2514 sd_printk(KERN_NOTICE, sdkp, "Synchronizing SCSI cache\n");
2515 ret = sd_sync_cache(sdkp); 2515 ret = sd_sync_cache(sdkp);
2516 if (ret) 2516 if (ret)
2517 goto done; 2517 goto done;
2518 } 2518 }
2519 2519
2520 if ((mesg.event & PM_EVENT_SLEEP) && sdkp->device->manage_start_stop) { 2520 if ((mesg.event & PM_EVENT_SLEEP) && sdkp->device->manage_start_stop) {
2521 sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n"); 2521 sd_printk(KERN_NOTICE, sdkp, "Stopping disk\n");
2522 ret = sd_start_stop_device(sdkp, 0); 2522 ret = sd_start_stop_device(sdkp, 0);
2523 } 2523 }
2524 2524
2525 done: 2525 done:
2526 scsi_disk_put(sdkp); 2526 scsi_disk_put(sdkp);
2527 return ret; 2527 return ret;
2528 } 2528 }
2529 2529
2530 static int sd_resume(struct device *dev) 2530 static int sd_resume(struct device *dev)
2531 { 2531 {
2532 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev); 2532 struct scsi_disk *sdkp = scsi_disk_get_from_dev(dev);
2533 int ret = 0; 2533 int ret = 0;
2534 2534
2535 if (!sdkp->device->manage_start_stop) 2535 if (!sdkp->device->manage_start_stop)
2536 goto done; 2536 goto done;
2537 2537
2538 sd_printk(KERN_NOTICE, sdkp, "Starting disk\n"); 2538 sd_printk(KERN_NOTICE, sdkp, "Starting disk\n");
2539 ret = sd_start_stop_device(sdkp, 1); 2539 ret = sd_start_stop_device(sdkp, 1);
2540 2540
2541 done: 2541 done:
2542 scsi_disk_put(sdkp); 2542 scsi_disk_put(sdkp);
2543 return ret; 2543 return ret;
2544 } 2544 }
2545 2545
2546 /** 2546 /**
2547 * init_sd - entry point for this driver (both when built in or when 2547 * init_sd - entry point for this driver (both when built in or when
2548 * a module). 2548 * a module).
2549 * 2549 *
2550 * Note: this function registers this driver with the scsi mid-level. 2550 * Note: this function registers this driver with the scsi mid-level.
2551 **/ 2551 **/
2552 static int __init init_sd(void) 2552 static int __init init_sd(void)
2553 { 2553 {
2554 int majors = 0, i, err; 2554 int majors = 0, i, err;
2555 2555
2556 SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n")); 2556 SCSI_LOG_HLQUEUE(3, printk("init_sd: sd driver entry point\n"));
2557 2557
2558 for (i = 0; i < SD_MAJORS; i++) 2558 for (i = 0; i < SD_MAJORS; i++)
2559 if (register_blkdev(sd_major(i), "sd") == 0) 2559 if (register_blkdev(sd_major(i), "sd") == 0)
2560 majors++; 2560 majors++;
2561 2561
2562 if (!majors) 2562 if (!majors)
2563 return -ENODEV; 2563 return -ENODEV;
2564 2564
2565 err = class_register(&sd_disk_class); 2565 err = class_register(&sd_disk_class);
2566 if (err) 2566 if (err)
2567 goto err_out; 2567 goto err_out;
2568 2568
2569 err = scsi_register_driver(&sd_template.gendrv); 2569 err = scsi_register_driver(&sd_template.gendrv);
2570 if (err) 2570 if (err)
2571 goto err_out_class; 2571 goto err_out_class;
2572 2572
2573 sd_cdb_cache = kmem_cache_create("sd_ext_cdb", SD_EXT_CDB_SIZE, 2573 sd_cdb_cache = kmem_cache_create("sd_ext_cdb", SD_EXT_CDB_SIZE,
2574 0, 0, NULL); 2574 0, 0, NULL);
2575 if (!sd_cdb_cache) { 2575 if (!sd_cdb_cache) {
2576 printk(KERN_ERR "sd: can't init extended cdb cache\n"); 2576 printk(KERN_ERR "sd: can't init extended cdb cache\n");
2577 goto err_out_class; 2577 goto err_out_class;
2578 } 2578 }
2579 2579
2580 sd_cdb_pool = mempool_create_slab_pool(SD_MEMPOOL_SIZE, sd_cdb_cache); 2580 sd_cdb_pool = mempool_create_slab_pool(SD_MEMPOOL_SIZE, sd_cdb_cache);
2581 if (!sd_cdb_pool) { 2581 if (!sd_cdb_pool) {
2582 printk(KERN_ERR "sd: can't init extended cdb pool\n"); 2582 printk(KERN_ERR "sd: can't init extended cdb pool\n");
2583 goto err_out_cache; 2583 goto err_out_cache;
2584 } 2584 }
2585 2585
2586 return 0; 2586 return 0;
2587 2587
2588 err_out_cache: 2588 err_out_cache:
2589 kmem_cache_destroy(sd_cdb_cache); 2589 kmem_cache_destroy(sd_cdb_cache);
2590 2590
2591 err_out_class: 2591 err_out_class:
2592 class_unregister(&sd_disk_class); 2592 class_unregister(&sd_disk_class);
2593 err_out: 2593 err_out:
2594 for (i = 0; i < SD_MAJORS; i++) 2594 for (i = 0; i < SD_MAJORS; i++)
2595 unregister_blkdev(sd_major(i), "sd"); 2595 unregister_blkdev(sd_major(i), "sd");
2596 return err; 2596 return err;
2597 } 2597 }
2598 2598
2599 /** 2599 /**
2600 * exit_sd - exit point for this driver (when it is a module). 2600 * exit_sd - exit point for this driver (when it is a module).
2601 * 2601 *
2602 * Note: this function unregisters this driver from the scsi mid-level. 2602 * Note: this function unregisters this driver from the scsi mid-level.
2603 **/ 2603 **/
2604 static void __exit exit_sd(void) 2604 static void __exit exit_sd(void)
2605 { 2605 {
2606 int i; 2606 int i;
2607 2607
2608 SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n")); 2608 SCSI_LOG_HLQUEUE(3, printk("exit_sd: exiting sd driver\n"));
2609 2609
2610 mempool_destroy(sd_cdb_pool); 2610 mempool_destroy(sd_cdb_pool);
2611 kmem_cache_destroy(sd_cdb_cache); 2611 kmem_cache_destroy(sd_cdb_cache);
2612 2612
2613 scsi_unregister_driver(&sd_template.gendrv); 2613 scsi_unregister_driver(&sd_template.gendrv);
2614 class_unregister(&sd_disk_class); 2614 class_unregister(&sd_disk_class);
2615 2615
2616 for (i = 0; i < SD_MAJORS; i++) 2616 for (i = 0; i < SD_MAJORS; i++)
2617 unregister_blkdev(sd_major(i), "sd"); 2617 unregister_blkdev(sd_major(i), "sd");
2618 } 2618 }
2619 2619
2620 module_init(init_sd); 2620 module_init(init_sd);
2621 module_exit(exit_sd); 2621 module_exit(exit_sd);
2622 2622
2623 static void sd_print_sense_hdr(struct scsi_disk *sdkp, 2623 static void sd_print_sense_hdr(struct scsi_disk *sdkp,
2624 struct scsi_sense_hdr *sshdr) 2624 struct scsi_sense_hdr *sshdr)
2625 { 2625 {
2626 sd_printk(KERN_INFO, sdkp, ""); 2626 sd_printk(KERN_INFO, sdkp, "");
2627 scsi_show_sense_hdr(sshdr); 2627 scsi_show_sense_hdr(sshdr);
2628 sd_printk(KERN_INFO, sdkp, ""); 2628 sd_printk(KERN_INFO, sdkp, "");
2629 scsi_show_extd_sense(sshdr->asc, sshdr->ascq); 2629 scsi_show_extd_sense(sshdr->asc, sshdr->ascq);
2630 } 2630 }
2631 2631
2632 static void sd_print_result(struct scsi_disk *sdkp, int result) 2632 static void sd_print_result(struct scsi_disk *sdkp, int result)
2633 { 2633 {
2634 sd_printk(KERN_INFO, sdkp, ""); 2634 sd_printk(KERN_INFO, sdkp, "");
2635 scsi_show_result(result); 2635 scsi_show_result(result);
2636 } 2636 }
2637 2637
2638 2638
include/linux/blkdev.h
1 #ifndef _LINUX_BLKDEV_H 1 #ifndef _LINUX_BLKDEV_H
2 #define _LINUX_BLKDEV_H 2 #define _LINUX_BLKDEV_H
3 3
4 #ifdef CONFIG_BLOCK 4 #ifdef CONFIG_BLOCK
5 5
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/major.h> 7 #include <linux/major.h>
8 #include <linux/genhd.h> 8 #include <linux/genhd.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/timer.h> 10 #include <linux/timer.h>
11 #include <linux/workqueue.h> 11 #include <linux/workqueue.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <linux/mempool.h> 15 #include <linux/mempool.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/stringify.h> 18 #include <linux/stringify.h>
19 #include <linux/gfp.h> 19 #include <linux/gfp.h>
20 #include <linux/bsg.h> 20 #include <linux/bsg.h>
21 #include <linux/smp.h> 21 #include <linux/smp.h>
22 22
23 #include <asm/scatterlist.h> 23 #include <asm/scatterlist.h>
24 24
25 struct scsi_ioctl_command; 25 struct scsi_ioctl_command;
26 26
27 struct request_queue; 27 struct request_queue;
28 struct elevator_queue; 28 struct elevator_queue;
29 struct request_pm_state; 29 struct request_pm_state;
30 struct blk_trace; 30 struct blk_trace;
31 struct request; 31 struct request;
32 struct sg_io_hdr; 32 struct sg_io_hdr;
33 33
34 #define BLKDEV_MIN_RQ 4 34 #define BLKDEV_MIN_RQ 4
35 #define BLKDEV_MAX_RQ 128 /* Default maximum */ 35 #define BLKDEV_MAX_RQ 128 /* Default maximum */
36 36
37 struct request; 37 struct request;
38 typedef void (rq_end_io_fn)(struct request *, int); 38 typedef void (rq_end_io_fn)(struct request *, int);
39 39
40 struct request_list { 40 struct request_list {
41 /* 41 /*
42 * count[], starved[], and wait[] are indexed by 42 * count[], starved[], and wait[] are indexed by
43 * BLK_RW_SYNC/BLK_RW_ASYNC 43 * BLK_RW_SYNC/BLK_RW_ASYNC
44 */ 44 */
45 int count[2]; 45 int count[2];
46 int starved[2]; 46 int starved[2];
47 int elvpriv; 47 int elvpriv;
48 mempool_t *rq_pool; 48 mempool_t *rq_pool;
49 wait_queue_head_t wait[2]; 49 wait_queue_head_t wait[2];
50 }; 50 };
51 51
52 /* 52 /*
53 * request command types 53 * request command types
54 */ 54 */
55 enum rq_cmd_type_bits { 55 enum rq_cmd_type_bits {
56 REQ_TYPE_FS = 1, /* fs request */ 56 REQ_TYPE_FS = 1, /* fs request */
57 REQ_TYPE_BLOCK_PC, /* scsi command */ 57 REQ_TYPE_BLOCK_PC, /* scsi command */
58 REQ_TYPE_SENSE, /* sense request */ 58 REQ_TYPE_SENSE, /* sense request */
59 REQ_TYPE_PM_SUSPEND, /* suspend request */ 59 REQ_TYPE_PM_SUSPEND, /* suspend request */
60 REQ_TYPE_PM_RESUME, /* resume request */ 60 REQ_TYPE_PM_RESUME, /* resume request */
61 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ 61 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */
62 REQ_TYPE_SPECIAL, /* driver defined type */ 62 REQ_TYPE_SPECIAL, /* driver defined type */
63 /* 63 /*
64 * for ATA/ATAPI devices. this really doesn't belong here, ide should 64 * for ATA/ATAPI devices. this really doesn't belong here, ide should
65 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver 65 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
66 * private REQ_LB opcodes to differentiate what type of request this is 66 * private REQ_LB opcodes to differentiate what type of request this is
67 */ 67 */
68 REQ_TYPE_ATA_TASKFILE, 68 REQ_TYPE_ATA_TASKFILE,
69 REQ_TYPE_ATA_PC, 69 REQ_TYPE_ATA_PC,
70 }; 70 };
71 71
72 #define BLK_MAX_CDB 16 72 #define BLK_MAX_CDB 16
73 73
74 /* 74 /*
75 * try to put the fields that are referenced together in the same cacheline. 75 * try to put the fields that are referenced together in the same cacheline.
76 * if you modify this structure, be sure to check block/blk-core.c:rq_init() 76 * if you modify this structure, be sure to check block/blk-core.c:rq_init()
77 * as well! 77 * as well!
78 */ 78 */
79 struct request { 79 struct request {
80 struct list_head queuelist; 80 struct list_head queuelist;
81 struct call_single_data csd; 81 struct call_single_data csd;
82 82
83 struct request_queue *q; 83 struct request_queue *q;
84 84
85 unsigned int cmd_flags; 85 unsigned int cmd_flags;
86 enum rq_cmd_type_bits cmd_type; 86 enum rq_cmd_type_bits cmd_type;
87 unsigned long atomic_flags; 87 unsigned long atomic_flags;
88 88
89 int cpu; 89 int cpu;
90 90
91 /* the following two fields are internal, NEVER access directly */ 91 /* the following two fields are internal, NEVER access directly */
92 unsigned int __data_len; /* total data len */ 92 unsigned int __data_len; /* total data len */
93 sector_t __sector; /* sector cursor */ 93 sector_t __sector; /* sector cursor */
94 94
95 struct bio *bio; 95 struct bio *bio;
96 struct bio *biotail; 96 struct bio *biotail;
97 97
98 struct hlist_node hash; /* merge hash */ 98 struct hlist_node hash; /* merge hash */
99 /* 99 /*
100 * The rb_node is only used inside the io scheduler, requests 100 * The rb_node is only used inside the io scheduler, requests
101 * are pruned when moved to the dispatch queue. So let the 101 * are pruned when moved to the dispatch queue. So let the
102 * completion_data share space with the rb_node. 102 * completion_data share space with the rb_node.
103 */ 103 */
104 union { 104 union {
105 struct rb_node rb_node; /* sort/lookup */ 105 struct rb_node rb_node; /* sort/lookup */
106 void *completion_data; 106 void *completion_data;
107 }; 107 };
108 108
109 /* 109 /*
110 * Three pointers are available for the IO schedulers, if they need 110 * Three pointers are available for the IO schedulers, if they need
111 * more they have to dynamically allocate it. 111 * more they have to dynamically allocate it.
112 */ 112 */
113 void *elevator_private; 113 void *elevator_private;
114 void *elevator_private2; 114 void *elevator_private2;
115 void *elevator_private3; 115 void *elevator_private3;
116 116
117 struct gendisk *rq_disk; 117 struct gendisk *rq_disk;
118 unsigned long start_time; 118 unsigned long start_time;
119 #ifdef CONFIG_BLK_CGROUP 119 #ifdef CONFIG_BLK_CGROUP
120 unsigned long long start_time_ns; 120 unsigned long long start_time_ns;
121 unsigned long long io_start_time_ns; /* when passed to hardware */ 121 unsigned long long io_start_time_ns; /* when passed to hardware */
122 #endif 122 #endif
123 /* Number of scatter-gather DMA addr+len pairs after 123 /* Number of scatter-gather DMA addr+len pairs after
124 * physical address coalescing is performed. 124 * physical address coalescing is performed.
125 */ 125 */
126 unsigned short nr_phys_segments; 126 unsigned short nr_phys_segments;
127 127
128 unsigned short ioprio; 128 unsigned short ioprio;
129 129
130 int ref_count; 130 int ref_count;
131 131
132 void *special; /* opaque pointer available for LLD use */ 132 void *special; /* opaque pointer available for LLD use */
133 char *buffer; /* kaddr of the current segment if available */ 133 char *buffer; /* kaddr of the current segment if available */
134 134
135 int tag; 135 int tag;
136 int errors; 136 int errors;
137 137
138 /* 138 /*
139 * when request is used as a packet command carrier 139 * when request is used as a packet command carrier
140 */ 140 */
141 unsigned char __cmd[BLK_MAX_CDB]; 141 unsigned char __cmd[BLK_MAX_CDB];
142 unsigned char *cmd; 142 unsigned char *cmd;
143 unsigned short cmd_len; 143 unsigned short cmd_len;
144 144
145 unsigned int extra_len; /* length of alignment and padding */ 145 unsigned int extra_len; /* length of alignment and padding */
146 unsigned int sense_len; 146 unsigned int sense_len;
147 unsigned int resid_len; /* residual count */ 147 unsigned int resid_len; /* residual count */
148 void *sense; 148 void *sense;
149 149
150 unsigned long deadline; 150 unsigned long deadline;
151 struct list_head timeout_list; 151 struct list_head timeout_list;
152 unsigned int timeout; 152 unsigned int timeout;
153 int retries; 153 int retries;
154 154
155 /* 155 /*
156 * completion callback. 156 * completion callback.
157 */ 157 */
158 rq_end_io_fn *end_io; 158 rq_end_io_fn *end_io;
159 void *end_io_data; 159 void *end_io_data;
160 160
161 /* for bidi */ 161 /* for bidi */
162 struct request *next_rq; 162 struct request *next_rq;
163 }; 163 };
164 164
165 static inline unsigned short req_get_ioprio(struct request *req) 165 static inline unsigned short req_get_ioprio(struct request *req)
166 { 166 {
167 return req->ioprio; 167 return req->ioprio;
168 } 168 }
169 169
170 /* 170 /*
171 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME 171 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
172 * requests. Some step values could eventually be made generic. 172 * requests. Some step values could eventually be made generic.
173 */ 173 */
174 struct request_pm_state 174 struct request_pm_state
175 { 175 {
176 /* PM state machine step value, currently driver specific */ 176 /* PM state machine step value, currently driver specific */
177 int pm_step; 177 int pm_step;
178 /* requested PM state value (S1, S2, S3, S4, ...) */ 178 /* requested PM state value (S1, S2, S3, S4, ...) */
179 u32 pm_state; 179 u32 pm_state;
180 void* data; /* for driver use */ 180 void* data; /* for driver use */
181 }; 181 };
182 182
183 #include <linux/elevator.h> 183 #include <linux/elevator.h>
184 184
185 typedef void (request_fn_proc) (struct request_queue *q); 185 typedef void (request_fn_proc) (struct request_queue *q);
186 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); 186 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
187 typedef int (prep_rq_fn) (struct request_queue *, struct request *); 187 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
188 typedef void (unprep_rq_fn) (struct request_queue *, struct request *); 188 typedef void (unprep_rq_fn) (struct request_queue *, struct request *);
189 typedef void (unplug_fn) (struct request_queue *); 189 typedef void (unplug_fn) (struct request_queue *);
190 190
191 struct bio_vec; 191 struct bio_vec;
192 struct bvec_merge_data { 192 struct bvec_merge_data {
193 struct block_device *bi_bdev; 193 struct block_device *bi_bdev;
194 sector_t bi_sector; 194 sector_t bi_sector;
195 unsigned bi_size; 195 unsigned bi_size;
196 unsigned long bi_rw; 196 unsigned long bi_rw;
197 }; 197 };
198 typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *, 198 typedef int (merge_bvec_fn) (struct request_queue *, struct bvec_merge_data *,
199 struct bio_vec *); 199 struct bio_vec *);
200 typedef void (softirq_done_fn)(struct request *); 200 typedef void (softirq_done_fn)(struct request *);
201 typedef int (dma_drain_needed_fn)(struct request *); 201 typedef int (dma_drain_needed_fn)(struct request *);
202 typedef int (lld_busy_fn) (struct request_queue *q); 202 typedef int (lld_busy_fn) (struct request_queue *q);
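Of these hooks, merge_bvec_fn is the one stacking and odd-geometry drivers use to stop bio_add_page() from growing a bio across a boundary they cannot service in a single request. A minimal sketch for a hypothetical driver limited to 64 KiB windows (TOY_WINDOW_SECTORS and toy_merge_bvec are invented; the convention that returning fewer bytes than bvec->bv_len rejects the page follows existing users such as dm and md):

#include <linux/blkdev.h>
#include <linux/kernel.h>

#define TOY_WINDOW_SECTORS	128	/* hypothetical 64 KiB per-request window */

static int toy_merge_bvec(struct request_queue *q,
			  struct bvec_merge_data *bvm,
			  struct bio_vec *bvec)
{
	sector_t boundary = round_up(bvm->bi_sector + 1, TOY_WINDOW_SECTORS);
	unsigned int avail = (boundary - bvm->bi_sector) << 9;	/* bytes to boundary */

	if (bvm->bi_size >= avail)
		return 0;			/* bio already fills the window */

	/* returning less than bvec->bv_len tells bio_add_page() to stop here */
	return min_t(unsigned int, avail - bvm->bi_size, bvec->bv_len);
}

/* registered once at init time with blk_queue_merge_bvec(q, toy_merge_bvec) */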
203 203
204 enum blk_eh_timer_return { 204 enum blk_eh_timer_return {
205 BLK_EH_NOT_HANDLED, 205 BLK_EH_NOT_HANDLED,
206 BLK_EH_HANDLED, 206 BLK_EH_HANDLED,
207 BLK_EH_RESET_TIMER, 207 BLK_EH_RESET_TIMER,
208 }; 208 };
209 209
210 typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *); 210 typedef enum blk_eh_timer_return (rq_timed_out_fn)(struct request *);
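A driver installs such a handler with blk_queue_rq_timed_out() and picks the deadline with blk_queue_rq_timeout() (both declared further down); the handler decides whether a request that blew its deadline should be given more time, completed as failed by the block layer, or kept by the driver for its own recovery. A minimal sketch, assuming a hypothetical toy_dev hung off q->queuedata:

#include <linux/blkdev.h>

struct toy_dev {				/* hypothetical per-device state */
	int hw_busy;
};

static enum blk_eh_timer_return toy_timed_out(struct request *rq)
{
	struct toy_dev *dev = rq->q->queuedata;

	if (dev->hw_busy)
		return BLK_EH_RESET_TIMER;	/* re-arm the timer, keep waiting */

	/* give up: the block layer will now complete the request for us */
	return BLK_EH_HANDLED;
}

/* at init:
 *	blk_queue_rq_timed_out(q, toy_timed_out);
 *	blk_queue_rq_timeout(q, 30 * HZ);
 */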
211 211
212 enum blk_queue_state { 212 enum blk_queue_state {
213 Queue_down, 213 Queue_down,
214 Queue_up, 214 Queue_up,
215 }; 215 };
216 216
217 struct blk_queue_tag { 217 struct blk_queue_tag {
218 struct request **tag_index; /* map of busy tags */ 218 struct request **tag_index; /* map of busy tags */
219 unsigned long *tag_map; /* bit map of free/busy tags */ 219 unsigned long *tag_map; /* bit map of free/busy tags */
220 int busy; /* current depth */ 220 int busy; /* current depth */
221 int max_depth; /* what we will send to device */ 221 int max_depth; /* what we will send to device */
222 int real_max_depth; /* what the array can hold */ 222 int real_max_depth; /* what the array can hold */
223 atomic_t refcnt; /* map can be shared */ 223 atomic_t refcnt; /* map can be shared */
224 }; 224 };
225 225
226 #define BLK_SCSI_MAX_CMDS (256) 226 #define BLK_SCSI_MAX_CMDS (256)
227 #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8)) 227 #define BLK_SCSI_CMD_PER_LONG (BLK_SCSI_MAX_CMDS / (sizeof(long) * 8))
228 228
229 struct queue_limits { 229 struct queue_limits {
230 unsigned long bounce_pfn; 230 unsigned long bounce_pfn;
231 unsigned long seg_boundary_mask; 231 unsigned long seg_boundary_mask;
232 232
233 unsigned int max_hw_sectors; 233 unsigned int max_hw_sectors;
234 unsigned int max_sectors; 234 unsigned int max_sectors;
235 unsigned int max_segment_size; 235 unsigned int max_segment_size;
236 unsigned int physical_block_size; 236 unsigned int physical_block_size;
237 unsigned int alignment_offset; 237 unsigned int alignment_offset;
238 unsigned int io_min; 238 unsigned int io_min;
239 unsigned int io_opt; 239 unsigned int io_opt;
240 unsigned int max_discard_sectors; 240 unsigned int max_discard_sectors;
241 unsigned int discard_granularity; 241 unsigned int discard_granularity;
242 unsigned int discard_alignment; 242 unsigned int discard_alignment;
243 243
244 unsigned short logical_block_size; 244 unsigned short logical_block_size;
245 unsigned short max_segments; 245 unsigned short max_segments;
246 246
247 unsigned char misaligned; 247 unsigned char misaligned;
248 unsigned char discard_misaligned; 248 unsigned char discard_misaligned;
249 unsigned char no_cluster; 249 unsigned char no_cluster;
250 signed char discard_zeroes_data; 250 signed char discard_zeroes_data;
251 }; 251 };
252 252
253 struct request_queue 253 struct request_queue
254 { 254 {
255 /* 255 /*
256 * Together with queue_head for cacheline sharing 256 * Together with queue_head for cacheline sharing
257 */ 257 */
258 struct list_head queue_head; 258 struct list_head queue_head;
259 struct request *last_merge; 259 struct request *last_merge;
260 struct elevator_queue *elevator; 260 struct elevator_queue *elevator;
261 261
262 /* 262 /*
263 * the queue request freelist, one for reads and one for writes 263 * the queue request freelist, one for reads and one for writes
264 */ 264 */
265 struct request_list rq; 265 struct request_list rq;
266 266
267 request_fn_proc *request_fn; 267 request_fn_proc *request_fn;
268 make_request_fn *make_request_fn; 268 make_request_fn *make_request_fn;
269 prep_rq_fn *prep_rq_fn; 269 prep_rq_fn *prep_rq_fn;
270 unprep_rq_fn *unprep_rq_fn; 270 unprep_rq_fn *unprep_rq_fn;
271 unplug_fn *unplug_fn; 271 unplug_fn *unplug_fn;
272 merge_bvec_fn *merge_bvec_fn; 272 merge_bvec_fn *merge_bvec_fn;
273 softirq_done_fn *softirq_done_fn; 273 softirq_done_fn *softirq_done_fn;
274 rq_timed_out_fn *rq_timed_out_fn; 274 rq_timed_out_fn *rq_timed_out_fn;
275 dma_drain_needed_fn *dma_drain_needed; 275 dma_drain_needed_fn *dma_drain_needed;
276 lld_busy_fn *lld_busy_fn; 276 lld_busy_fn *lld_busy_fn;
277 277
278 /* 278 /*
279 * Dispatch queue sorting 279 * Dispatch queue sorting
280 */ 280 */
281 sector_t end_sector; 281 sector_t end_sector;
282 struct request *boundary_rq; 282 struct request *boundary_rq;
283 283
284 /* 284 /*
285 * Auto-unplugging state 285 * Auto-unplugging state
286 */ 286 */
287 struct timer_list unplug_timer; 287 struct timer_list unplug_timer;
288 int unplug_thresh; /* After this many requests */ 288 int unplug_thresh; /* After this many requests */
289 unsigned long unplug_delay; /* After this many jiffies */ 289 unsigned long unplug_delay; /* After this many jiffies */
290 struct work_struct unplug_work; 290 struct work_struct unplug_work;
291 291
292 struct backing_dev_info backing_dev_info; 292 struct backing_dev_info backing_dev_info;
293 293
294 /* 294 /*
295 * The queue owner gets to use this for whatever they like. 295 * The queue owner gets to use this for whatever they like.
296 * ll_rw_blk doesn't touch it. 296 * ll_rw_blk doesn't touch it.
297 */ 297 */
298 void *queuedata; 298 void *queuedata;
299 299
300 /* 300 /*
301 * queue needs bounce pages for pages above this limit 301 * queue needs bounce pages for pages above this limit
302 */ 302 */
303 gfp_t bounce_gfp; 303 gfp_t bounce_gfp;
304 304
305 /* 305 /*
306 * various queue flags, see QUEUE_* below 306 * various queue flags, see QUEUE_* below
307 */ 307 */
308 unsigned long queue_flags; 308 unsigned long queue_flags;
309 309
310 /* 310 /*
311 * protects queue structures from reentrancy. ->__queue_lock should 311 * protects queue structures from reentrancy. ->__queue_lock should
312 * _never_ be used directly, it is queue private. always use 312 * _never_ be used directly, it is queue private. always use
313 * ->queue_lock. 313 * ->queue_lock.
314 */ 314 */
315 spinlock_t __queue_lock; 315 spinlock_t __queue_lock;
316 spinlock_t *queue_lock; 316 spinlock_t *queue_lock;
317 317
318 /* 318 /*
319 * queue kobject 319 * queue kobject
320 */ 320 */
321 struct kobject kobj; 321 struct kobject kobj;
322 322
323 /* 323 /*
324 * queue settings 324 * queue settings
325 */ 325 */
326 unsigned long nr_requests; /* Max # of requests */ 326 unsigned long nr_requests; /* Max # of requests */
327 unsigned int nr_congestion_on; 327 unsigned int nr_congestion_on;
328 unsigned int nr_congestion_off; 328 unsigned int nr_congestion_off;
329 unsigned int nr_batching; 329 unsigned int nr_batching;
330 330
331 void *dma_drain_buffer; 331 void *dma_drain_buffer;
332 unsigned int dma_drain_size; 332 unsigned int dma_drain_size;
333 unsigned int dma_pad_mask; 333 unsigned int dma_pad_mask;
334 unsigned int dma_alignment; 334 unsigned int dma_alignment;
335 335
336 struct blk_queue_tag *queue_tags; 336 struct blk_queue_tag *queue_tags;
337 struct list_head tag_busy_list; 337 struct list_head tag_busy_list;
338 338
339 unsigned int nr_sorted; 339 unsigned int nr_sorted;
340 unsigned int in_flight[2]; 340 unsigned int in_flight[2];
341 341
342 unsigned int rq_timeout; 342 unsigned int rq_timeout;
343 struct timer_list timeout; 343 struct timer_list timeout;
344 struct list_head timeout_list; 344 struct list_head timeout_list;
345 345
346 struct queue_limits limits; 346 struct queue_limits limits;
347 347
348 /* 348 /*
349 * sg stuff 349 * sg stuff
350 */ 350 */
351 unsigned int sg_timeout; 351 unsigned int sg_timeout;
352 unsigned int sg_reserved_size; 352 unsigned int sg_reserved_size;
353 int node; 353 int node;
354 #ifdef CONFIG_BLK_DEV_IO_TRACE 354 #ifdef CONFIG_BLK_DEV_IO_TRACE
355 struct blk_trace *blk_trace; 355 struct blk_trace *blk_trace;
356 #endif 356 #endif
357 /* 357 /*
358 * reserved for flush operations 358 * for flush operations
359 */ 359 */
360 unsigned int flush_flags;
361
360 unsigned int ordered, next_ordered, ordseq; 362 unsigned int ordered, next_ordered, ordseq;
361 int orderr, ordcolor; 363 int orderr, ordcolor;
362 struct request pre_flush_rq, bar_rq, post_flush_rq; 364 struct request pre_flush_rq, bar_rq, post_flush_rq;
363 struct request *orig_bar_rq; 365 struct request *orig_bar_rq;
364 366
365 struct mutex sysfs_lock; 367 struct mutex sysfs_lock;
366 368
367 #if defined(CONFIG_BLK_DEV_BSG) 369 #if defined(CONFIG_BLK_DEV_BSG)
368 struct bsg_class_device bsg_dev; 370 struct bsg_class_device bsg_dev;
369 #endif 371 #endif
370 }; 372 };
371 373
372 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 374 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
373 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 375 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
374 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ 376 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */
375 #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */ 377 #define QUEUE_FLAG_SYNCFULL 3 /* read queue has been filled */
376 #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */ 378 #define QUEUE_FLAG_ASYNCFULL 4 /* write queue has been filled */
377 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ 379 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */
378 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ 380 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
379 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ 381 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
380 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ 382 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
381 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ 383 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
382 #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */ 384 #define QUEUE_FLAG_NOMERGES 10 /* disable merge attempts */
383 #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */ 385 #define QUEUE_FLAG_SAME_COMP 11 /* force complete on same CPU */
384 #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ 386 #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */
385 #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ 387 #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */
386 #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ 388 #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */
387 #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ 389 #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */
388 #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ 390 #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */
389 #define QUEUE_FLAG_DISCARD 16 /* supports DISCARD */ 391 #define QUEUE_FLAG_DISCARD 16 /* supports DISCARD */
390 #define QUEUE_FLAG_NOXMERGES 17 /* No extended merges */ 392 #define QUEUE_FLAG_NOXMERGES 17 /* No extended merges */
391 #define QUEUE_FLAG_ADD_RANDOM 18 /* Contributes to random pool */ 393 #define QUEUE_FLAG_ADD_RANDOM 18 /* Contributes to random pool */
392 #define QUEUE_FLAG_SECDISCARD 19 /* supports SECDISCARD */ 394 #define QUEUE_FLAG_SECDISCARD 19 /* supports SECDISCARD */
393 395
394 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ 396 #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
395 (1 << QUEUE_FLAG_CLUSTER) | \ 397 (1 << QUEUE_FLAG_CLUSTER) | \
396 (1 << QUEUE_FLAG_STACKABLE) | \ 398 (1 << QUEUE_FLAG_STACKABLE) | \
397 (1 << QUEUE_FLAG_SAME_COMP) | \ 399 (1 << QUEUE_FLAG_SAME_COMP) | \
398 (1 << QUEUE_FLAG_ADD_RANDOM)) 400 (1 << QUEUE_FLAG_ADD_RANDOM))
399 401
400 static inline int queue_is_locked(struct request_queue *q) 402 static inline int queue_is_locked(struct request_queue *q)
401 { 403 {
402 #ifdef CONFIG_SMP 404 #ifdef CONFIG_SMP
403 spinlock_t *lock = q->queue_lock; 405 spinlock_t *lock = q->queue_lock;
404 return lock && spin_is_locked(lock); 406 return lock && spin_is_locked(lock);
405 #else 407 #else
406 return 1; 408 return 1;
407 #endif 409 #endif
408 } 410 }
409 411
410 static inline void queue_flag_set_unlocked(unsigned int flag, 412 static inline void queue_flag_set_unlocked(unsigned int flag,
411 struct request_queue *q) 413 struct request_queue *q)
412 { 414 {
413 __set_bit(flag, &q->queue_flags); 415 __set_bit(flag, &q->queue_flags);
414 } 416 }
415 417
416 static inline int queue_flag_test_and_clear(unsigned int flag, 418 static inline int queue_flag_test_and_clear(unsigned int flag,
417 struct request_queue *q) 419 struct request_queue *q)
418 { 420 {
419 WARN_ON_ONCE(!queue_is_locked(q)); 421 WARN_ON_ONCE(!queue_is_locked(q));
420 422
421 if (test_bit(flag, &q->queue_flags)) { 423 if (test_bit(flag, &q->queue_flags)) {
422 __clear_bit(flag, &q->queue_flags); 424 __clear_bit(flag, &q->queue_flags);
423 return 1; 425 return 1;
424 } 426 }
425 427
426 return 0; 428 return 0;
427 } 429 }
428 430
429 static inline int queue_flag_test_and_set(unsigned int flag, 431 static inline int queue_flag_test_and_set(unsigned int flag,
430 struct request_queue *q) 432 struct request_queue *q)
431 { 433 {
432 WARN_ON_ONCE(!queue_is_locked(q)); 434 WARN_ON_ONCE(!queue_is_locked(q));
433 435
434 if (!test_bit(flag, &q->queue_flags)) { 436 if (!test_bit(flag, &q->queue_flags)) {
435 __set_bit(flag, &q->queue_flags); 437 __set_bit(flag, &q->queue_flags);
436 return 0; 438 return 0;
437 } 439 }
438 440
439 return 1; 441 return 1;
440 } 442 }
441 443
442 static inline void queue_flag_set(unsigned int flag, struct request_queue *q) 444 static inline void queue_flag_set(unsigned int flag, struct request_queue *q)
443 { 445 {
444 WARN_ON_ONCE(!queue_is_locked(q)); 446 WARN_ON_ONCE(!queue_is_locked(q));
445 __set_bit(flag, &q->queue_flags); 447 __set_bit(flag, &q->queue_flags);
446 } 448 }
447 449
448 static inline void queue_flag_clear_unlocked(unsigned int flag, 450 static inline void queue_flag_clear_unlocked(unsigned int flag,
449 struct request_queue *q) 451 struct request_queue *q)
450 { 452 {
451 __clear_bit(flag, &q->queue_flags); 453 __clear_bit(flag, &q->queue_flags);
452 } 454 }
453 455
454 static inline int queue_in_flight(struct request_queue *q) 456 static inline int queue_in_flight(struct request_queue *q)
455 { 457 {
456 return q->in_flight[0] + q->in_flight[1]; 458 return q->in_flight[0] + q->in_flight[1];
457 } 459 }
458 460
459 static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) 461 static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
460 { 462 {
461 WARN_ON_ONCE(!queue_is_locked(q)); 463 WARN_ON_ONCE(!queue_is_locked(q));
462 __clear_bit(flag, &q->queue_flags); 464 __clear_bit(flag, &q->queue_flags);
463 } 465 }
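The only difference between the plain and *_unlocked flag helpers above is the WARN_ON_ONCE(!queue_is_locked(q)) check: the plain ones expect q->queue_lock to be held, the unlocked ones are for setup paths where nobody else can see the queue yet. A small usage sketch (toy_* names and the error-recovery policy are invented):

#include <linux/blkdev.h>
#include <linux/spinlock.h>

/* probe time, queue not visible yet: mark a hypothetical SSD non-rotational */
static void toy_tune_queue(struct request_queue *q)
{
	queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q);
	queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, q);
}

/* run time: the locked variants are used under q->queue_lock */
static void toy_disable_merging(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	queue_flag_set(QUEUE_FLAG_NOMERGES, q);	/* e.g. while recovering errors */
	spin_unlock_irqrestore(q->queue_lock, flags);
}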
464 466
465 enum { 467 enum {
466 /* 468 /*
467 * Hardbarrier is supported with one of the following methods. 469 * Hardbarrier is supported with one of the following methods.
468 * 470 *
469 * NONE : hardbarrier unsupported 471 * NONE : hardbarrier unsupported
470 * DRAIN : ordering by draining is enough 472 * DRAIN : ordering by draining is enough
471 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes 473 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
472 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write 474 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
473 */ 475 */
474 QUEUE_ORDERED_DO_PREFLUSH = 0x10, 476 QUEUE_ORDERED_DO_PREFLUSH = 0x10,
475 QUEUE_ORDERED_DO_BAR = 0x20, 477 QUEUE_ORDERED_DO_BAR = 0x20,
476 QUEUE_ORDERED_DO_POSTFLUSH = 0x40, 478 QUEUE_ORDERED_DO_POSTFLUSH = 0x40,
477 QUEUE_ORDERED_DO_FUA = 0x80, 479 QUEUE_ORDERED_DO_FUA = 0x80,
478 480
479 QUEUE_ORDERED_NONE = 0x00, 481 QUEUE_ORDERED_NONE = 0x00,
480 482
481 QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_DO_BAR, 483 QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_DO_BAR,
482 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | 484 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
483 QUEUE_ORDERED_DO_PREFLUSH | 485 QUEUE_ORDERED_DO_PREFLUSH |
484 QUEUE_ORDERED_DO_POSTFLUSH, 486 QUEUE_ORDERED_DO_POSTFLUSH,
485 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | 487 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
486 QUEUE_ORDERED_DO_PREFLUSH | 488 QUEUE_ORDERED_DO_PREFLUSH |
487 QUEUE_ORDERED_DO_FUA, 489 QUEUE_ORDERED_DO_FUA,
488 490
489 /* 491 /*
490 * Ordered operation sequence 492 * Ordered operation sequence
491 */ 493 */
492 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */ 494 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
493 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */ 495 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
494 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */ 496 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
495 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */ 497 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
496 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */ 498 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
497 QUEUE_ORDSEQ_DONE = 0x20, 499 QUEUE_ORDSEQ_DONE = 0x20,
498 }; 500 };
499 501
500 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 502 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
501 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 503 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
502 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 504 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
503 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags) 505 #define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
504 #define blk_queue_noxmerges(q) \ 506 #define blk_queue_noxmerges(q) \
505 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags) 507 test_bit(QUEUE_FLAG_NOXMERGES, &(q)->queue_flags)
506 #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags) 508 #define blk_queue_nonrot(q) test_bit(QUEUE_FLAG_NONROT, &(q)->queue_flags)
507 #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags) 509 #define blk_queue_io_stat(q) test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
508 #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags) 510 #define blk_queue_add_random(q) test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
509 #define blk_queue_stackable(q) \ 511 #define blk_queue_stackable(q) \
510 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) 512 test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
511 #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) 513 #define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
512 #define blk_queue_secdiscard(q) (blk_queue_discard(q) && \ 514 #define blk_queue_secdiscard(q) (blk_queue_discard(q) && \
513 test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags)) 515 test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
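These predicates are how upper layers discover queue capabilities before relying on them; blk_queue_discard(), for instance, gates whether issuing a discard makes sense at all. A hedged sketch (toy_trim_range is an invented helper; bdev_get_queue() and blkdev_issue_discard() are declared elsewhere in this header):

#include <linux/blkdev.h>

static int toy_trim_range(struct block_device *bdev, sector_t start,
			  sector_t nr_sects)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (!q || !blk_queue_discard(q))
		return -EOPNOTSUPP;		/* device never advertised discard */

	return blkdev_issue_discard(bdev, start, nr_sects, GFP_KERNEL,
				    BLKDEV_IFL_WAIT);
}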
514 516
515 #define blk_noretry_request(rq) \ 517 #define blk_noretry_request(rq) \
516 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \ 518 ((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
517 REQ_FAILFAST_DRIVER)) 519 REQ_FAILFAST_DRIVER))
518 520
519 #define blk_account_rq(rq) \ 521 #define blk_account_rq(rq) \
520 (((rq)->cmd_flags & REQ_STARTED) && \ 522 (((rq)->cmd_flags & REQ_STARTED) && \
521 ((rq)->cmd_type == REQ_TYPE_FS || \ 523 ((rq)->cmd_type == REQ_TYPE_FS || \
522 ((rq)->cmd_flags & REQ_DISCARD))) 524 ((rq)->cmd_flags & REQ_DISCARD)))
523 525
524 #define blk_pm_request(rq) \ 526 #define blk_pm_request(rq) \
525 ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \ 527 ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND || \
526 (rq)->cmd_type == REQ_TYPE_PM_RESUME) 528 (rq)->cmd_type == REQ_TYPE_PM_RESUME)
527 529
528 #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1) 530 #define blk_rq_cpu_valid(rq) ((rq)->cpu != -1)
529 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) 531 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
530 /* rq->queuelist of dequeued request must be list_empty() */ 532 /* rq->queuelist of dequeued request must be list_empty() */
531 #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) 533 #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist))
532 534
533 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 535 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
534 536
535 #define rq_data_dir(rq) ((rq)->cmd_flags & 1) 537 #define rq_data_dir(rq) ((rq)->cmd_flags & 1)
536 538
537 /* 539 /*
538 * We regard a request as sync if it is either a read or a sync write 540 * We regard a request as sync if it is either a read or a sync write
539 */ 541 */
540 static inline bool rw_is_sync(unsigned int rw_flags) 542 static inline bool rw_is_sync(unsigned int rw_flags)
541 { 543 {
542 return !(rw_flags & REQ_WRITE) || (rw_flags & REQ_SYNC); 544 return !(rw_flags & REQ_WRITE) || (rw_flags & REQ_SYNC);
543 } 545 }
544 546
545 static inline bool rq_is_sync(struct request *rq) 547 static inline bool rq_is_sync(struct request *rq)
546 { 548 {
547 return rw_is_sync(rq->cmd_flags); 549 return rw_is_sync(rq->cmd_flags);
548 } 550 }
549 551
550 static inline int blk_queue_full(struct request_queue *q, int sync) 552 static inline int blk_queue_full(struct request_queue *q, int sync)
551 { 553 {
552 if (sync) 554 if (sync)
553 return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags); 555 return test_bit(QUEUE_FLAG_SYNCFULL, &q->queue_flags);
554 return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags); 556 return test_bit(QUEUE_FLAG_ASYNCFULL, &q->queue_flags);
555 } 557 }
556 558
557 static inline void blk_set_queue_full(struct request_queue *q, int sync) 559 static inline void blk_set_queue_full(struct request_queue *q, int sync)
558 { 560 {
559 if (sync) 561 if (sync)
560 queue_flag_set(QUEUE_FLAG_SYNCFULL, q); 562 queue_flag_set(QUEUE_FLAG_SYNCFULL, q);
561 else 563 else
562 queue_flag_set(QUEUE_FLAG_ASYNCFULL, q); 564 queue_flag_set(QUEUE_FLAG_ASYNCFULL, q);
563 } 565 }
564 566
565 static inline void blk_clear_queue_full(struct request_queue *q, int sync) 567 static inline void blk_clear_queue_full(struct request_queue *q, int sync)
566 { 568 {
567 if (sync) 569 if (sync)
568 queue_flag_clear(QUEUE_FLAG_SYNCFULL, q); 570 queue_flag_clear(QUEUE_FLAG_SYNCFULL, q);
569 else 571 else
570 queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q); 572 queue_flag_clear(QUEUE_FLAG_ASYNCFULL, q);
571 } 573 }
572 574
573 575
574 /* 576 /*
575 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may 577 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
576 * it already be started by driver. 578 * it already be started by driver.
577 */ 579 */
578 #define RQ_NOMERGE_FLAGS \ 580 #define RQ_NOMERGE_FLAGS \
579 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) 581 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
580 #define rq_mergeable(rq) \ 582 #define rq_mergeable(rq) \
581 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \ 583 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && \
582 (((rq)->cmd_flags & REQ_DISCARD) || \ 584 (((rq)->cmd_flags & REQ_DISCARD) || \
583 (rq)->cmd_type == REQ_TYPE_FS)) 585 (rq)->cmd_type == REQ_TYPE_FS))
584 586
585 /* 587 /*
586 * q->prep_rq_fn return values 588 * q->prep_rq_fn return values
587 */ 589 */
588 #define BLKPREP_OK 0 /* serve it */ 590 #define BLKPREP_OK 0 /* serve it */
589 #define BLKPREP_KILL 1 /* fatal error, kill */ 591 #define BLKPREP_KILL 1 /* fatal error, kill */
590 #define BLKPREP_DEFER 2 /* leave on queue */ 592 #define BLKPREP_DEFER 2 /* leave on queue */
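A prep_rq_fn (installed with blk_queue_prep_rq(), declared further down) runs just before a request is handed to the driver and answers with one of the three codes above. A sketch for a hypothetical device that attaches a command descriptor to each filesystem request (struct toy_cmd, toy_cmd_cache and toy_build_cmd are invented):

#include <linux/blkdev.h>
#include <linux/genhd.h>
#include <linux/slab.h>

struct toy_cmd { u8 cdb[16]; };			/* hypothetical descriptor */
static struct kmem_cache *toy_cmd_cache;	/* created at module init */

static void toy_build_cmd(struct toy_cmd *cmd, struct request *rq)
{
	/* fill in the hardware-specific descriptor from rq (omitted) */
}

static int toy_prep_rq(struct request_queue *q, struct request *rq)
{
	struct toy_cmd *cmd;

	if (rq->cmd_type != REQ_TYPE_FS)
		return BLKPREP_OK;		/* only prepare fs requests here */
	if (rq->special)
		return BLKPREP_OK;		/* already prepared on an earlier pass */

	if (blk_rq_pos(rq) + blk_rq_sectors(rq) > get_capacity(rq->rq_disk))
		return BLKPREP_KILL;		/* past end of device: fail the request */

	cmd = kmem_cache_alloc(toy_cmd_cache, GFP_ATOMIC);
	if (!cmd)
		return BLKPREP_DEFER;		/* out of resources, leave it queued */

	toy_build_cmd(cmd, rq);
	rq->special = cmd;
	return BLKPREP_OK;
}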
591 593
592 extern unsigned long blk_max_low_pfn, blk_max_pfn; 594 extern unsigned long blk_max_low_pfn, blk_max_pfn;
593 595
594 /* 596 /*
595 * standard bounce addresses: 597 * standard bounce addresses:
596 * 598 *
597 * BLK_BOUNCE_HIGH : bounce all highmem pages 599 * BLK_BOUNCE_HIGH : bounce all highmem pages
598 * BLK_BOUNCE_ANY : don't bounce anything 600 * BLK_BOUNCE_ANY : don't bounce anything
599 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary 601 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary
600 */ 602 */
601 603
602 #if BITS_PER_LONG == 32 604 #if BITS_PER_LONG == 32
603 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) 605 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT)
604 #else 606 #else
605 #define BLK_BOUNCE_HIGH -1ULL 607 #define BLK_BOUNCE_HIGH -1ULL
606 #endif 608 #endif
607 #define BLK_BOUNCE_ANY (-1ULL) 609 #define BLK_BOUNCE_ANY (-1ULL)
608 #define BLK_BOUNCE_ISA (DMA_BIT_MASK(24)) 610 #define BLK_BOUNCE_ISA (DMA_BIT_MASK(24))
609 611
610 /* 612 /*
611 * default timeout for SG_IO if none specified 613 * default timeout for SG_IO if none specified
612 */ 614 */
613 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) 615 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
614 #define BLK_MIN_SG_TIMEOUT (7 * HZ) 616 #define BLK_MIN_SG_TIMEOUT (7 * HZ)
615 617
616 #ifdef CONFIG_BOUNCE 618 #ifdef CONFIG_BOUNCE
617 extern int init_emergency_isa_pool(void); 619 extern int init_emergency_isa_pool(void);
618 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); 620 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
619 #else 621 #else
620 static inline int init_emergency_isa_pool(void) 622 static inline int init_emergency_isa_pool(void)
621 { 623 {
622 return 0; 624 return 0;
623 } 625 }
624 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) 626 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
625 { 627 {
626 } 628 }
627 #endif /* CONFIG_BOUNCE */ 629 #endif /* CONFIG_BOUNCE */
628 630
629 struct rq_map_data { 631 struct rq_map_data {
630 struct page **pages; 632 struct page **pages;
631 int page_order; 633 int page_order;
632 int nr_entries; 634 int nr_entries;
633 unsigned long offset; 635 unsigned long offset;
634 int null_mapped; 636 int null_mapped;
635 int from_user; 637 int from_user;
636 }; 638 };
637 639
638 struct req_iterator { 640 struct req_iterator {
639 int i; 641 int i;
640 struct bio *bio; 642 struct bio *bio;
641 }; 643 };
642 644
643 /* This should not be used directly - use rq_for_each_segment */ 645 /* This should not be used directly - use rq_for_each_segment */
644 #define for_each_bio(_bio) \ 646 #define for_each_bio(_bio) \
645 for (; _bio; _bio = _bio->bi_next) 647 for (; _bio; _bio = _bio->bi_next)
646 #define __rq_for_each_bio(_bio, rq) \ 648 #define __rq_for_each_bio(_bio, rq) \
647 if ((rq->bio)) \ 649 if ((rq->bio)) \
648 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) 650 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
649 651
650 #define rq_for_each_segment(bvl, _rq, _iter) \ 652 #define rq_for_each_segment(bvl, _rq, _iter) \
651 __rq_for_each_bio(_iter.bio, _rq) \ 653 __rq_for_each_bio(_iter.bio, _rq) \
652 bio_for_each_segment(bvl, _iter.bio, _iter.i) 654 bio_for_each_segment(bvl, _iter.bio, _iter.i)
653 655
654 #define rq_iter_last(rq, _iter) \ 656 #define rq_iter_last(rq, _iter) \
655 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 657 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
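rq_for_each_segment() visits every bio_vec of every bio attached to a request, which is what drivers that program data transfers by hand (instead of calling blk_rq_map_sg()) iterate over. A trivial sketch that just sums the payload:

#include <linux/bio.h>
#include <linux/blkdev.h>

static unsigned int toy_count_payload(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec *bvec;
	unsigned int bytes = 0;

	rq_for_each_segment(bvec, rq, iter)
		bytes += bvec->bv_len;

	return bytes;	/* for a filesystem request this matches blk_rq_bytes(rq) */
}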
656 658
657 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 659 #ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
658 # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" 660 # error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform"
659 #endif 661 #endif
660 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 662 #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
661 extern void rq_flush_dcache_pages(struct request *rq); 663 extern void rq_flush_dcache_pages(struct request *rq);
662 #else 664 #else
663 static inline void rq_flush_dcache_pages(struct request *rq) 665 static inline void rq_flush_dcache_pages(struct request *rq)
664 { 666 {
665 } 667 }
666 #endif 668 #endif
667 669
668 extern int blk_register_queue(struct gendisk *disk); 670 extern int blk_register_queue(struct gendisk *disk);
669 extern void blk_unregister_queue(struct gendisk *disk); 671 extern void blk_unregister_queue(struct gendisk *disk);
670 extern void register_disk(struct gendisk *dev); 672 extern void register_disk(struct gendisk *dev);
671 extern void generic_make_request(struct bio *bio); 673 extern void generic_make_request(struct bio *bio);
672 extern void blk_rq_init(struct request_queue *q, struct request *rq); 674 extern void blk_rq_init(struct request_queue *q, struct request *rq);
673 extern void blk_put_request(struct request *); 675 extern void blk_put_request(struct request *);
674 extern void __blk_put_request(struct request_queue *, struct request *); 676 extern void __blk_put_request(struct request_queue *, struct request *);
675 extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 677 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
676 extern struct request *blk_make_request(struct request_queue *, struct bio *, 678 extern struct request *blk_make_request(struct request_queue *, struct bio *,
677 gfp_t); 679 gfp_t);
678 extern void blk_insert_request(struct request_queue *, struct request *, int, void *); 680 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
679 extern void blk_requeue_request(struct request_queue *, struct request *); 681 extern void blk_requeue_request(struct request_queue *, struct request *);
680 extern void blk_add_request_payload(struct request *rq, struct page *page, 682 extern void blk_add_request_payload(struct request *rq, struct page *page,
681 unsigned int len); 683 unsigned int len);
682 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); 684 extern int blk_rq_check_limits(struct request_queue *q, struct request *rq);
683 extern int blk_lld_busy(struct request_queue *q); 685 extern int blk_lld_busy(struct request_queue *q);
684 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src, 686 extern int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
685 struct bio_set *bs, gfp_t gfp_mask, 687 struct bio_set *bs, gfp_t gfp_mask,
686 int (*bio_ctr)(struct bio *, struct bio *, void *), 688 int (*bio_ctr)(struct bio *, struct bio *, void *),
687 void *data); 689 void *data);
688 extern void blk_rq_unprep_clone(struct request *rq); 690 extern void blk_rq_unprep_clone(struct request *rq);
689 extern int blk_insert_cloned_request(struct request_queue *q, 691 extern int blk_insert_cloned_request(struct request_queue *q,
690 struct request *rq); 692 struct request *rq);
691 extern void blk_plug_device(struct request_queue *); 693 extern void blk_plug_device(struct request_queue *);
692 extern void blk_plug_device_unlocked(struct request_queue *); 694 extern void blk_plug_device_unlocked(struct request_queue *);
693 extern int blk_remove_plug(struct request_queue *); 695 extern int blk_remove_plug(struct request_queue *);
694 extern void blk_recount_segments(struct request_queue *, struct bio *); 696 extern void blk_recount_segments(struct request_queue *, struct bio *);
695 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, 697 extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t,
696 unsigned int, void __user *); 698 unsigned int, void __user *);
697 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, 699 extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t,
698 struct scsi_ioctl_command __user *); 700 struct scsi_ioctl_command __user *);
699 701
700 /* 702 /*
701 * A queue has just exited congestion. Note this in the global counter of 703 * A queue has just exited congestion. Note this in the global counter of
702 * congested queues, and wake up anyone who was waiting for requests to be 704 * congested queues, and wake up anyone who was waiting for requests to be
703 * put back. 705 * put back.
704 */ 706 */
705 static inline void blk_clear_queue_congested(struct request_queue *q, int sync) 707 static inline void blk_clear_queue_congested(struct request_queue *q, int sync)
706 { 708 {
707 clear_bdi_congested(&q->backing_dev_info, sync); 709 clear_bdi_congested(&q->backing_dev_info, sync);
708 } 710 }
709 711
710 /* 712 /*
711 * A queue has just entered congestion. Flag that in the queue's VM-visible 713 * A queue has just entered congestion. Flag that in the queue's VM-visible
712 * state flags and increment the global counter of congested queues. 714 * state flags and increment the global counter of congested queues.
713 */ 715 */
714 static inline void blk_set_queue_congested(struct request_queue *q, int sync) 716 static inline void blk_set_queue_congested(struct request_queue *q, int sync)
715 { 717 {
716 set_bdi_congested(&q->backing_dev_info, sync); 718 set_bdi_congested(&q->backing_dev_info, sync);
717 } 719 }
718 720
719 extern void blk_start_queue(struct request_queue *q); 721 extern void blk_start_queue(struct request_queue *q);
720 extern void blk_stop_queue(struct request_queue *q); 722 extern void blk_stop_queue(struct request_queue *q);
721 extern void blk_sync_queue(struct request_queue *q); 723 extern void blk_sync_queue(struct request_queue *q);
722 extern void __blk_stop_queue(struct request_queue *q); 724 extern void __blk_stop_queue(struct request_queue *q);
723 extern void __blk_run_queue(struct request_queue *); 725 extern void __blk_run_queue(struct request_queue *);
724 extern void blk_run_queue(struct request_queue *); 726 extern void blk_run_queue(struct request_queue *);
725 extern int blk_rq_map_user(struct request_queue *, struct request *, 727 extern int blk_rq_map_user(struct request_queue *, struct request *,
726 struct rq_map_data *, void __user *, unsigned long, 728 struct rq_map_data *, void __user *, unsigned long,
727 gfp_t); 729 gfp_t);
728 extern int blk_rq_unmap_user(struct bio *); 730 extern int blk_rq_unmap_user(struct bio *);
729 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); 731 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
730 extern int blk_rq_map_user_iov(struct request_queue *, struct request *, 732 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
731 struct rq_map_data *, struct sg_iovec *, int, 733 struct rq_map_data *, struct sg_iovec *, int,
732 unsigned int, gfp_t); 734 unsigned int, gfp_t);
733 extern int blk_execute_rq(struct request_queue *, struct gendisk *, 735 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
734 struct request *, int); 736 struct request *, int);
735 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, 737 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
736 struct request *, int, rq_end_io_fn *); 738 struct request *, int, rq_end_io_fn *);
737 extern void blk_unplug(struct request_queue *q); 739 extern void blk_unplug(struct request_queue *q);
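blk_get_request(), blk_rq_map_kern() and blk_execute_rq() together are the usual way for a driver to push its own command through the queue and wait for it. A hedged sketch sending a made-up 6-byte packet command with a small kernel buffer (the opcode, lengths and error handling are purely illustrative):

#include <linux/blkdev.h>

static int toy_send_cmd(struct request_queue *q, struct gendisk *disk,
			void *buf, unsigned int len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->cmd_len  = 6;
	rq->cmd[0]   = 0x12;			/* hypothetical opcode */
	rq->timeout  = BLK_DEFAULT_SG_TIMEOUT;

	err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);	/* sleeps until completion */

	blk_put_request(rq);
	return err;
}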
738 740
739 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 741 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
740 { 742 {
741 return bdev->bd_disk->queue; 743 return bdev->bd_disk->queue;
742 } 744 }
743 745
744 /* 746 /*
745 * blk_rq_pos() : the current sector 747 * blk_rq_pos() : the current sector
746 * blk_rq_bytes() : bytes left in the entire request 748 * blk_rq_bytes() : bytes left in the entire request
747 * blk_rq_cur_bytes() : bytes left in the current segment 749 * blk_rq_cur_bytes() : bytes left in the current segment
748 * blk_rq_err_bytes() : bytes left till the next error boundary 750 * blk_rq_err_bytes() : bytes left till the next error boundary
749 * blk_rq_sectors() : sectors left in the entire request 751 * blk_rq_sectors() : sectors left in the entire request
750 * blk_rq_cur_sectors() : sectors left in the current segment 752 * blk_rq_cur_sectors() : sectors left in the current segment
751 */ 753 */
752 static inline sector_t blk_rq_pos(const struct request *rq) 754 static inline sector_t blk_rq_pos(const struct request *rq)
753 { 755 {
754 return rq->__sector; 756 return rq->__sector;
755 } 757 }
756 758
757 static inline unsigned int blk_rq_bytes(const struct request *rq) 759 static inline unsigned int blk_rq_bytes(const struct request *rq)
758 { 760 {
759 return rq->__data_len; 761 return rq->__data_len;
760 } 762 }
761 763
762 static inline int blk_rq_cur_bytes(const struct request *rq) 764 static inline int blk_rq_cur_bytes(const struct request *rq)
763 { 765 {
764 return rq->bio ? bio_cur_bytes(rq->bio) : 0; 766 return rq->bio ? bio_cur_bytes(rq->bio) : 0;
765 } 767 }
766 768
767 extern unsigned int blk_rq_err_bytes(const struct request *rq); 769 extern unsigned int blk_rq_err_bytes(const struct request *rq);
768 770
769 static inline unsigned int blk_rq_sectors(const struct request *rq) 771 static inline unsigned int blk_rq_sectors(const struct request *rq)
770 { 772 {
771 return blk_rq_bytes(rq) >> 9; 773 return blk_rq_bytes(rq) >> 9;
772 } 774 }
773 775
774 static inline unsigned int blk_rq_cur_sectors(const struct request *rq) 776 static inline unsigned int blk_rq_cur_sectors(const struct request *rq)
775 { 777 {
776 return blk_rq_cur_bytes(rq) >> 9; 778 return blk_rq_cur_bytes(rq) >> 9;
777 } 779 }
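As a concrete reading of the helpers above: a 4 KiB write at byte offset 1 MiB starts out with blk_rq_pos() == 2048, blk_rq_sectors() == 8 and blk_rq_bytes() == 4096; after the driver completes the first 1 KiB, the position has advanced to 2050 and 3072 bytes remain, because the accessors always describe what is left, not the original request. A minimal request_fn fetch loop using them (toy_request_fn is invented; requests are assumed to be completed later from the interrupt handler):

#include <linux/blkdev.h>

static void toy_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_fetch_request(q)) != NULL) {
		pr_debug("toy: %s sector %llu, %u sectors (%u bytes)\n",
			 rq_data_dir(rq) ? "write" : "read",
			 (unsigned long long)blk_rq_pos(rq),
			 blk_rq_sectors(rq), blk_rq_bytes(rq));
		/* hand rq to the hardware here and complete it on interrupt */
	}
}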
778 780
779 /* 781 /*
780 * Request issue related functions. 782 * Request issue related functions.
781 */ 783 */
782 extern struct request *blk_peek_request(struct request_queue *q); 784 extern struct request *blk_peek_request(struct request_queue *q);
783 extern void blk_start_request(struct request *rq); 785 extern void blk_start_request(struct request *rq);
784 extern struct request *blk_fetch_request(struct request_queue *q); 786 extern struct request *blk_fetch_request(struct request_queue *q);
785 787
786 /* 788 /*
787 * Request completion related functions. 789 * Request completion related functions.
788 * 790 *
789 * blk_update_request() completes given number of bytes and updates 791 * blk_update_request() completes given number of bytes and updates
790 * the request without completing it. 792 * the request without completing it.
791 * 793 *
792 * blk_end_request() and friends. __blk_end_request() must be called 794 * blk_end_request() and friends. __blk_end_request() must be called
793 * with the request queue spinlock acquired. 795 * with the request queue spinlock acquired.
794 * 796 *
795 * Several drivers define their own end_request and call 797 * Several drivers define their own end_request and call
796 * blk_end_request() for parts of the original function. 798 * blk_end_request() for parts of the original function.
797 * This prevents code duplication in drivers. 799 * This prevents code duplication in drivers.
798 */ 800 */
799 extern bool blk_update_request(struct request *rq, int error, 801 extern bool blk_update_request(struct request *rq, int error,
800 unsigned int nr_bytes); 802 unsigned int nr_bytes);
801 extern bool blk_end_request(struct request *rq, int error, 803 extern bool blk_end_request(struct request *rq, int error,
802 unsigned int nr_bytes); 804 unsigned int nr_bytes);
803 extern void blk_end_request_all(struct request *rq, int error); 805 extern void blk_end_request_all(struct request *rq, int error);
804 extern bool blk_end_request_cur(struct request *rq, int error); 806 extern bool blk_end_request_cur(struct request *rq, int error);
805 extern bool blk_end_request_err(struct request *rq, int error); 807 extern bool blk_end_request_err(struct request *rq, int error);
806 extern bool __blk_end_request(struct request *rq, int error, 808 extern bool __blk_end_request(struct request *rq, int error,
807 unsigned int nr_bytes); 809 unsigned int nr_bytes);
808 extern void __blk_end_request_all(struct request *rq, int error); 810 extern void __blk_end_request_all(struct request *rq, int error);
809 extern bool __blk_end_request_cur(struct request *rq, int error); 811 extern bool __blk_end_request_cur(struct request *rq, int error);
810 extern bool __blk_end_request_err(struct request *rq, int error); 812 extern bool __blk_end_request_err(struct request *rq, int error);
811 813
812 extern void blk_complete_request(struct request *); 814 extern void blk_complete_request(struct request *);
813 extern void __blk_complete_request(struct request *); 815 extern void __blk_complete_request(struct request *);
814 extern void blk_abort_request(struct request *); 816 extern void blk_abort_request(struct request *);
815 extern void blk_abort_queue(struct request_queue *); 817 extern void blk_abort_queue(struct request_queue *);
816 extern void blk_unprep_request(struct request *); 818 extern void blk_unprep_request(struct request *);
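The leading-underscore completion variants exist purely for locking: they assume the caller already holds q->queue_lock (typically an interrupt path that took it anyway), while the plain ones acquire it themselves. A sketch of a per-chunk completion path under those assumptions (toy_segment_done is an invented name):

#include <linux/blkdev.h>
#include <linux/spinlock.h>

/* hardware finished 'bytes' of the current request */
static void toy_segment_done(struct request_queue *q, struct request *rq,
			     int error, unsigned int bytes)
{
	unsigned long flags;
	bool more;

	spin_lock_irqsave(q->queue_lock, flags);
	more = __blk_end_request(rq, error, bytes);	/* true: data still pending */
	if (!more)
		__blk_run_queue(q);	/* request done, kick the next one */
	spin_unlock_irqrestore(q->queue_lock, flags);
}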
817 819
818 /* 820 /*
819 * Access functions for manipulating queue properties 821 * Access functions for manipulating queue properties
820 */ 822 */
821 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, 823 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
822 spinlock_t *lock, int node_id); 824 spinlock_t *lock, int node_id);
823 extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *, 825 extern struct request_queue *blk_init_allocated_queue_node(struct request_queue *,
824 request_fn_proc *, 826 request_fn_proc *,
825 spinlock_t *, int node_id); 827 spinlock_t *, int node_id);
826 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); 828 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
827 extern struct request_queue *blk_init_allocated_queue(struct request_queue *, 829 extern struct request_queue *blk_init_allocated_queue(struct request_queue *,
828 request_fn_proc *, spinlock_t *); 830 request_fn_proc *, spinlock_t *);
829 extern void blk_cleanup_queue(struct request_queue *); 831 extern void blk_cleanup_queue(struct request_queue *);
830 extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 832 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
831 extern void blk_queue_bounce_limit(struct request_queue *, u64); 833 extern void blk_queue_bounce_limit(struct request_queue *, u64);
832 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); 834 extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int);
833 extern void blk_queue_max_segments(struct request_queue *, unsigned short); 835 extern void blk_queue_max_segments(struct request_queue *, unsigned short);
834 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); 836 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
835 extern void blk_queue_max_discard_sectors(struct request_queue *q, 837 extern void blk_queue_max_discard_sectors(struct request_queue *q,
836 unsigned int max_discard_sectors); 838 unsigned int max_discard_sectors);
837 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); 839 extern void blk_queue_logical_block_size(struct request_queue *, unsigned short);
838 extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); 840 extern void blk_queue_physical_block_size(struct request_queue *, unsigned short);
839 extern void blk_queue_alignment_offset(struct request_queue *q, 841 extern void blk_queue_alignment_offset(struct request_queue *q,
840 unsigned int alignment); 842 unsigned int alignment);
841 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min); 843 extern void blk_limits_io_min(struct queue_limits *limits, unsigned int min);
842 extern void blk_queue_io_min(struct request_queue *q, unsigned int min); 844 extern void blk_queue_io_min(struct request_queue *q, unsigned int min);
843 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt); 845 extern void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt);
844 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); 846 extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt);
845 extern void blk_set_default_limits(struct queue_limits *lim); 847 extern void blk_set_default_limits(struct queue_limits *lim);
846 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, 848 extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
847 sector_t offset); 849 sector_t offset);
848 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, 850 extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
849 sector_t offset); 851 sector_t offset);
850 extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, 852 extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
851 sector_t offset); 853 sector_t offset);
852 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); 854 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
853 extern void blk_queue_dma_pad(struct request_queue *, unsigned int); 855 extern void blk_queue_dma_pad(struct request_queue *, unsigned int);
854 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); 856 extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int);
855 extern int blk_queue_dma_drain(struct request_queue *q, 857 extern int blk_queue_dma_drain(struct request_queue *q,
856 dma_drain_needed_fn *dma_drain_needed, 858 dma_drain_needed_fn *dma_drain_needed,
857 void *buf, unsigned int size); 859 void *buf, unsigned int size);
858 extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn); 860 extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
859 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 861 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
860 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); 862 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
861 extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn); 863 extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
862 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); 864 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
863 extern void blk_queue_dma_alignment(struct request_queue *, int); 865 extern void blk_queue_dma_alignment(struct request_queue *, int);
864 extern void blk_queue_update_dma_alignment(struct request_queue *, int); 866 extern void blk_queue_update_dma_alignment(struct request_queue *, int);
865 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); 867 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
866 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); 868 extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *);
867 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); 869 extern void blk_queue_rq_timeout(struct request_queue *, unsigned int);
870 extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
868 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 871 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
869 extern int blk_queue_ordered(struct request_queue *, unsigned);
870 extern bool blk_do_ordered(struct request_queue *, struct request **); 872 extern bool blk_do_ordered(struct request_queue *, struct request **);
871 extern unsigned blk_ordered_cur_seq(struct request_queue *); 873 extern unsigned blk_ordered_cur_seq(struct request_queue *);
872 extern unsigned blk_ordered_req_seq(struct request *); 874 extern unsigned blk_ordered_req_seq(struct request *);
873 extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); 875 extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int);
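The newly added blk_queue_flush() replaces the removed blk_queue_ordered() extern shown above: instead of picking a QUEUE_ORDERED_* drain mode, a driver now passes the combination of REQ_FLUSH and REQ_FUA that matches what its hardware can actually do (a flushable volatile write cache, and optionally FUA writes). A plausible driver-side conversion sketch (toy_setup_flush and the two capability flags are invented):

#include <linux/blkdev.h>

static void toy_setup_flush(struct request_queue *q, bool has_wcache,
			    bool has_fua)
{
	unsigned int flush = 0;

	if (has_wcache)
		flush |= REQ_FLUSH;		/* cache exists and can be flushed */
	if (has_wcache && has_fua)
		flush |= REQ_FUA;		/* FUA writes are honoured */

	/* previously something like blk_queue_ordered(q, QUEUE_ORDERED_...) */
	blk_queue_flush(q, flush);
}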
874 876
875 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 877 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
876 extern void blk_dump_rq_flags(struct request *, char *); 878 extern void blk_dump_rq_flags(struct request *, char *);
877 extern void generic_unplug_device(struct request_queue *); 879 extern void generic_unplug_device(struct request_queue *);
878 extern long nr_blockdev_pages(void); 880 extern long nr_blockdev_pages(void);
879 881
880 int blk_get_queue(struct request_queue *); 882 int blk_get_queue(struct request_queue *);
881 struct request_queue *blk_alloc_queue(gfp_t); 883 struct request_queue *blk_alloc_queue(gfp_t);
882 struct request_queue *blk_alloc_queue_node(gfp_t, int); 884 struct request_queue *blk_alloc_queue_node(gfp_t, int);
883 extern void blk_put_queue(struct request_queue *); 885 extern void blk_put_queue(struct request_queue *);
884 886
885 /* 887 /*
886 * tag stuff 888 * tag stuff
887 */ 889 */
888 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 890 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED)
889 extern int blk_queue_start_tag(struct request_queue *, struct request *); 891 extern int blk_queue_start_tag(struct request_queue *, struct request *);
890 extern struct request *blk_queue_find_tag(struct request_queue *, int); 892 extern struct request *blk_queue_find_tag(struct request_queue *, int);
891 extern void blk_queue_end_tag(struct request_queue *, struct request *); 893 extern void blk_queue_end_tag(struct request_queue *, struct request *);
892 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 894 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
893 extern void blk_queue_free_tags(struct request_queue *); 895 extern void blk_queue_free_tags(struct request_queue *);
894 extern int blk_queue_resize_tags(struct request_queue *, int); 896 extern int blk_queue_resize_tags(struct request_queue *, int);
895 extern void blk_queue_invalidate_tags(struct request_queue *); 897 extern void blk_queue_invalidate_tags(struct request_queue *);
896 extern struct blk_queue_tag *blk_init_tags(int); 898 extern struct blk_queue_tag *blk_init_tags(int);
897 extern void blk_free_tags(struct blk_queue_tag *); 899 extern void blk_free_tags(struct blk_queue_tag *);
898 900
899 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, 901 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
900 int tag) 902 int tag)
901 { 903 {
902 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 904 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
903 return NULL; 905 return NULL;
904 return bqt->tag_index[tag]; 906 return bqt->tag_index[tag];
905 } 907 }
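For controllers with their own command slots, the tag helpers above map requests onto slot numbers: blk_queue_start_tag() picks a free tag (and starts the request), blk_queue_end_tag() releases it, and blk_queue_find_tag() turns a completed hardware tag back into its request. A rough pairing of the two halves, assuming a hypothetical controller with 32 slots (toy_hw_submit is invented; both paths run under q->queue_lock):

#include <linux/blkdev.h>

static void toy_hw_submit(void *hw, int tag, struct request *rq);	/* invented */

static void toy_tagged_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = blk_peek_request(q)) != NULL) {
		if (blk_queue_start_tag(q, rq))
			break;			/* all 32 tags busy, retry later */
		toy_hw_submit(q->queuedata, rq->tag, rq);
	}
}

/* interrupt side: 'tag' reported by the hardware */
static void toy_complete_tag(struct request_queue *q, int tag, int error)
{
	struct request *rq = blk_queue_find_tag(q, tag);

	if (rq) {
		blk_queue_end_tag(q, rq);	/* frees the slot */
		__blk_end_request_all(rq, error);
	}
}

/* at init: blk_queue_init_tags(q, 32, NULL); */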
906 enum{ 908 enum{
907 BLKDEV_WAIT, /* wait for completion */ 909 BLKDEV_WAIT, /* wait for completion */
908 BLKDEV_BARRIER, /* issue request with barrier */ 910 BLKDEV_BARRIER, /* issue request with barrier */
909 BLKDEV_SECURE, /* secure discard */ 911 BLKDEV_SECURE, /* secure discard */
910 }; 912 };
911 #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT) 913 #define BLKDEV_IFL_WAIT (1 << BLKDEV_WAIT)
912 #define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER) 914 #define BLKDEV_IFL_BARRIER (1 << BLKDEV_BARRIER)
913 #define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE) 915 #define BLKDEV_IFL_SECURE (1 << BLKDEV_SECURE)
914 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *, 916 extern int blkdev_issue_flush(struct block_device *, gfp_t, sector_t *,
915 unsigned long); 917 unsigned long);
916 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector, 918 extern int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
917 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 919 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
918 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, 920 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
919 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags); 921 sector_t nr_sects, gfp_t gfp_mask, unsigned long flags);
920 static inline int sb_issue_discard(struct super_block *sb, 922 static inline int sb_issue_discard(struct super_block *sb,
921 sector_t block, sector_t nr_blocks) 923 sector_t block, sector_t nr_blocks)
922 { 924 {
923 block <<= (sb->s_blocksize_bits - 9); 925 block <<= (sb->s_blocksize_bits - 9);
924 nr_blocks <<= (sb->s_blocksize_bits - 9); 926 nr_blocks <<= (sb->s_blocksize_bits - 9);
925 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS, 927 return blkdev_issue_discard(sb->s_bdev, block, nr_blocks, GFP_NOFS,
926 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER); 928 BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
927 } 929 }
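blkdev_issue_flush() is the wrapper filesystems use to push the device's volatile write cache out at commit time; with BLKDEV_IFL_WAIT it only returns once the flush has completed (compare sb_issue_discard() just above, which uses the same flag convention for discards). A short sketch (toy_commit_done is an invented caller):

#include <linux/blkdev.h>
#include <linux/fs.h>

static int toy_commit_done(struct super_block *sb)
{
	/* the commit block has already been written and waited on ... */

	/* ... now make sure everything is on stable storage */
	return blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL,
				  BLKDEV_IFL_WAIT);
}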
928 930
929 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm); 931 extern int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm);
930 932
931 enum blk_default_limits { 933 enum blk_default_limits {
932 BLK_MAX_SEGMENTS = 128, 934 BLK_MAX_SEGMENTS = 128,
933 BLK_SAFE_MAX_SECTORS = 255, 935 BLK_SAFE_MAX_SECTORS = 255,
934 BLK_DEF_MAX_SECTORS = 1024, 936 BLK_DEF_MAX_SECTORS = 1024,
935 BLK_MAX_SEGMENT_SIZE = 65536, 937 BLK_MAX_SEGMENT_SIZE = 65536,
936 BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL, 938 BLK_SEG_BOUNDARY_MASK = 0xFFFFFFFFUL,
937 }; 939 };
938 940
939 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) 941 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
940 942
941 static inline unsigned long queue_bounce_pfn(struct request_queue *q) 943 static inline unsigned long queue_bounce_pfn(struct request_queue *q)
942 { 944 {
943 return q->limits.bounce_pfn; 945 return q->limits.bounce_pfn;
944 } 946 }
945 947
946 static inline unsigned long queue_segment_boundary(struct request_queue *q) 948 static inline unsigned long queue_segment_boundary(struct request_queue *q)
947 { 949 {
948 return q->limits.seg_boundary_mask; 950 return q->limits.seg_boundary_mask;
949 } 951 }
950 952
951 static inline unsigned int queue_max_sectors(struct request_queue *q) 953 static inline unsigned int queue_max_sectors(struct request_queue *q)
952 { 954 {
953 return q->limits.max_sectors; 955 return q->limits.max_sectors;
954 } 956 }
955 957
956 static inline unsigned int queue_max_hw_sectors(struct request_queue *q) 958 static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
957 { 959 {
958 return q->limits.max_hw_sectors; 960 return q->limits.max_hw_sectors;
959 } 961 }
960 962
961 static inline unsigned short queue_max_segments(struct request_queue *q) 963 static inline unsigned short queue_max_segments(struct request_queue *q)
962 { 964 {
963 return q->limits.max_segments; 965 return q->limits.max_segments;
964 } 966 }
965 967
966 static inline unsigned int queue_max_segment_size(struct request_queue *q) 968 static inline unsigned int queue_max_segment_size(struct request_queue *q)
967 { 969 {
968 return q->limits.max_segment_size; 970 return q->limits.max_segment_size;
969 } 971 }
970 972
971 static inline unsigned short queue_logical_block_size(struct request_queue *q) 973 static inline unsigned short queue_logical_block_size(struct request_queue *q)
972 { 974 {
973 int retval = 512; 975 int retval = 512;
974 976
975 if (q && q->limits.logical_block_size) 977 if (q && q->limits.logical_block_size)
976 retval = q->limits.logical_block_size; 978 retval = q->limits.logical_block_size;
977 979
978 return retval; 980 return retval;
979 } 981 }
980 982
981 static inline unsigned short bdev_logical_block_size(struct block_device *bdev) 983 static inline unsigned short bdev_logical_block_size(struct block_device *bdev)
982 { 984 {
983 return queue_logical_block_size(bdev_get_queue(bdev)); 985 return queue_logical_block_size(bdev_get_queue(bdev));
984 } 986 }
985 987
986 static inline unsigned int queue_physical_block_size(struct request_queue *q) 988 static inline unsigned int queue_physical_block_size(struct request_queue *q)
987 { 989 {
988 return q->limits.physical_block_size; 990 return q->limits.physical_block_size;
989 } 991 }
990 992
991 static inline int bdev_physical_block_size(struct block_device *bdev) 993 static inline int bdev_physical_block_size(struct block_device *bdev)
992 { 994 {
993 return queue_physical_block_size(bdev_get_queue(bdev)); 995 return queue_physical_block_size(bdev_get_queue(bdev));
994 } 996 }
995 997
996 static inline unsigned int queue_io_min(struct request_queue *q) 998 static inline unsigned int queue_io_min(struct request_queue *q)
997 { 999 {
998 return q->limits.io_min; 1000 return q->limits.io_min;
999 } 1001 }
1000 1002
1001 static inline int bdev_io_min(struct block_device *bdev) 1003 static inline int bdev_io_min(struct block_device *bdev)
1002 { 1004 {
1003 return queue_io_min(bdev_get_queue(bdev)); 1005 return queue_io_min(bdev_get_queue(bdev));
1004 } 1006 }
1005 1007
1006 static inline unsigned int queue_io_opt(struct request_queue *q) 1008 static inline unsigned int queue_io_opt(struct request_queue *q)
1007 { 1009 {
1008 return q->limits.io_opt; 1010 return q->limits.io_opt;
1009 } 1011 }
1010 1012
1011 static inline int bdev_io_opt(struct block_device *bdev) 1013 static inline int bdev_io_opt(struct block_device *bdev)
1012 { 1014 {
1013 return queue_io_opt(bdev_get_queue(bdev)); 1015 return queue_io_opt(bdev_get_queue(bdev));
1014 } 1016 }
1015 1017
1016 static inline int queue_alignment_offset(struct request_queue *q) 1018 static inline int queue_alignment_offset(struct request_queue *q)
1017 { 1019 {
1018 if (q->limits.misaligned) 1020 if (q->limits.misaligned)
1019 return -1; 1021 return -1;
1020 1022
1021 return q->limits.alignment_offset; 1023 return q->limits.alignment_offset;
1022 } 1024 }
1023 1025
1024 static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector) 1026 static inline int queue_limit_alignment_offset(struct queue_limits *lim, sector_t sector)
1025 { 1027 {
1026 unsigned int granularity = max(lim->physical_block_size, lim->io_min); 1028 unsigned int granularity = max(lim->physical_block_size, lim->io_min);
1027 unsigned int alignment = (sector << 9) & (granularity - 1); 1029 unsigned int alignment = (sector << 9) & (granularity - 1);
1028 1030
1029 return (granularity + lim->alignment_offset - alignment) 1031 return (granularity + lim->alignment_offset - alignment)
1030 & (granularity - 1); 1032 & (granularity - 1);
1031 } 1033 }
1032 1034
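To make the formula above concrete, a worked example with illustrative values (not taken from this commit):

/*
 * physical_block_size = io_min = 4096, alignment_offset = 0, sector = 7:
 *   granularity = max(4096, 4096)           = 4096
 *   alignment   = (7 << 9) & (4096 - 1)     = 3584
 *   result      = (4096 + 0 - 3584) & 4095  = 512
 * i.e. an I/O starting at sector 7 (byte 3584) falls 512 bytes short of
 * the next naturally aligned boundary at byte 4096.
 */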
1033 static inline int bdev_alignment_offset(struct block_device *bdev) 1035 static inline int bdev_alignment_offset(struct block_device *bdev)
1034 { 1036 {
1035 struct request_queue *q = bdev_get_queue(bdev); 1037 struct request_queue *q = bdev_get_queue(bdev);
1036 1038
1037 if (q->limits.misaligned) 1039 if (q->limits.misaligned)
1038 return -1; 1040 return -1;
1039 1041
1040 if (bdev != bdev->bd_contains) 1042 if (bdev != bdev->bd_contains)
1041 return bdev->bd_part->alignment_offset; 1043 return bdev->bd_part->alignment_offset;
1042 1044
1043 return q->limits.alignment_offset; 1045 return q->limits.alignment_offset;
1044 } 1046 }
1045 1047
1046 static inline int queue_discard_alignment(struct request_queue *q) 1048 static inline int queue_discard_alignment(struct request_queue *q)
1047 { 1049 {
1048 if (q->limits.discard_misaligned) 1050 if (q->limits.discard_misaligned)
1049 return -1; 1051 return -1;
1050 1052
1051 return q->limits.discard_alignment; 1053 return q->limits.discard_alignment;
1052 } 1054 }
1053 1055
1054 static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector) 1056 static inline int queue_limit_discard_alignment(struct queue_limits *lim, sector_t sector)
1055 { 1057 {
1056 unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); 1058 unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1);
1057 1059
1058 return (lim->discard_granularity + lim->discard_alignment - alignment) 1060 return (lim->discard_granularity + lim->discard_alignment - alignment)
1059 & (lim->discard_granularity - 1); 1061 & (lim->discard_granularity - 1);
1060 } 1062 }
1061 1063
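The same arithmetic applies to the discard limits; again with illustrative values only:

/*
 * discard_granularity = 4096, discard_alignment = 0, sector = 9:
 *   alignment = (9 << 9) & 4095          = 512
 *   result    = (4096 + 0 - 512) & 4095  = 3584
 * i.e. a discard starting at sector 9 (byte 4608) is 3584 bytes short of
 * the next discard granule boundary at byte 8192.
 */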
1062 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) 1064 static inline unsigned int queue_discard_zeroes_data(struct request_queue *q)
1063 { 1065 {
1064 if (q->limits.discard_zeroes_data == 1) 1066 if (q->limits.discard_zeroes_data == 1)
1065 return 1; 1067 return 1;
1066 1068
1067 return 0; 1069 return 0;
1068 } 1070 }
1069 1071
1070 static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) 1072 static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev)
1071 { 1073 {
1072 return queue_discard_zeroes_data(bdev_get_queue(bdev)); 1074 return queue_discard_zeroes_data(bdev_get_queue(bdev));
1073 } 1075 }
1074 1076
1075 static inline int queue_dma_alignment(struct request_queue *q) 1077 static inline int queue_dma_alignment(struct request_queue *q)
1076 { 1078 {
1077 return q ? q->dma_alignment : 511; 1079 return q ? q->dma_alignment : 511;
1078 } 1080 }
1079 1081
1080 static inline int blk_rq_aligned(struct request_queue *q, void *addr, 1082 static inline int blk_rq_aligned(struct request_queue *q, void *addr,
1081 unsigned int len) 1083 unsigned int len)
1082 { 1084 {
1083 unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask; 1085 unsigned int alignment = queue_dma_alignment(q) | q->dma_pad_mask;
1084 return !((unsigned long)addr & alignment) && !(len & alignment); 1086 return !((unsigned long)addr & alignment) && !(len & alignment);
1085 } 1087 }
1086 1088
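An illustrative reading of the check above (the values are examples, not from this commit):

/*
 * With queue_dma_alignment(q) == 511 (the default) and dma_pad_mask == 0,
 * the combined mask is 511: a buffer starting on a 512-byte boundary with
 * a 512-byte-multiple length passes, while shifting the start by 4 bytes
 * (or using a length of 4100) makes blk_rq_aligned() return 0.
 */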
1087 /* assumes size > 256 */ 1089 /* assumes size > 256 */
1088 static inline unsigned int blksize_bits(unsigned int size) 1090 static inline unsigned int blksize_bits(unsigned int size)
1089 { 1091 {
1090 unsigned int bits = 8; 1092 unsigned int bits = 8;
1091 do { 1093 do {
1092 bits++; 1094 bits++;
1093 size >>= 1; 1095 size >>= 1;
1094 } while (size > 256); 1096 } while (size > 256);
1095 return bits; 1097 return bits;
1096 } 1098 }
1097 1099
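For reference, a few sample values of the helper above, easy to verify by hand (sizes must be powers of two greater than 256, per the comment):

/*
 * blksize_bits(512)  ==  9
 * blksize_bits(1024) == 10
 * blksize_bits(4096) == 12
 */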
1098 static inline unsigned int block_size(struct block_device *bdev) 1100 static inline unsigned int block_size(struct block_device *bdev)
1099 { 1101 {
1100 return bdev->bd_block_size; 1102 return bdev->bd_block_size;
1101 } 1103 }
1102 1104
1103 typedef struct {struct page *v;} Sector; 1105 typedef struct {struct page *v;} Sector;
1104 1106
1105 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); 1107 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
1106 1108
1107 static inline void put_dev_sector(Sector p) 1109 static inline void put_dev_sector(Sector p)
1108 { 1110 {
1109 page_cache_release(p.v); 1111 page_cache_release(p.v);
1110 } 1112 }
1111 1113
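A hedged usage sketch for the pair above (the function name is hypothetical): read one 512-byte sector, inspect it, then drop the page reference.

static int example_peek_sector(struct block_device *bdev, sector_t n)
{
	Sector sect;
	unsigned char *data;

	data = read_dev_sector(bdev, n, &sect);
	if (!data)
		return -EIO;

	/* ... examine data[0..511] here ... */

	put_dev_sector(sect);	/* releases the page taken by read_dev_sector() */
	return 0;
}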
1112 struct work_struct; 1114 struct work_struct;
1113 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); 1115 int kblockd_schedule_work(struct request_queue *q, struct work_struct *work);
1114 1116
1115 #ifdef CONFIG_BLK_CGROUP 1117 #ifdef CONFIG_BLK_CGROUP
1116 /* 1118 /*
1117 * This should not be using sched_clock(). A real patch is in progress 1119 * This should not be using sched_clock(). A real patch is in progress
1118 * to fix this up, until that is in place we need to disable preemption 1120 * to fix this up, until that is in place we need to disable preemption
1119 * around sched_clock() in this function and set_io_start_time_ns(). 1121 * around sched_clock() in this function and set_io_start_time_ns().
1120 */ 1122 */
1121 static inline void set_start_time_ns(struct request *req) 1123 static inline void set_start_time_ns(struct request *req)
1122 { 1124 {
1123 preempt_disable(); 1125 preempt_disable();
1124 req->start_time_ns = sched_clock(); 1126 req->start_time_ns = sched_clock();
1125 preempt_enable(); 1127 preempt_enable();
1126 } 1128 }
1127 1129
1128 static inline void set_io_start_time_ns(struct request *req) 1130 static inline void set_io_start_time_ns(struct request *req)
1129 { 1131 {
1130 preempt_disable(); 1132 preempt_disable();
1131 req->io_start_time_ns = sched_clock(); 1133 req->io_start_time_ns = sched_clock();
1132 preempt_enable(); 1134 preempt_enable();
1133 } 1135 }
1134 1136
1135 static inline uint64_t rq_start_time_ns(struct request *req) 1137 static inline uint64_t rq_start_time_ns(struct request *req)
1136 { 1138 {
1137 return req->start_time_ns; 1139 return req->start_time_ns;
1138 } 1140 }
1139 1141
1140 static inline uint64_t rq_io_start_time_ns(struct request *req) 1142 static inline uint64_t rq_io_start_time_ns(struct request *req)
1141 { 1143 {
1142 return req->io_start_time_ns; 1144 return req->io_start_time_ns;
1143 } 1145 }
1144 #else 1146 #else
1145 static inline void set_start_time_ns(struct request *req) {} 1147 static inline void set_start_time_ns(struct request *req) {}
1146 static inline void set_io_start_time_ns(struct request *req) {} 1148 static inline void set_io_start_time_ns(struct request *req) {}
1147 static inline uint64_t rq_start_time_ns(struct request *req) 1149 static inline uint64_t rq_start_time_ns(struct request *req)
1148 { 1150 {
1149 return 0; 1151 return 0;
1150 } 1152 }
1151 static inline uint64_t rq_io_start_time_ns(struct request *req) 1153 static inline uint64_t rq_io_start_time_ns(struct request *req)
1152 { 1154 {
1153 return 0; 1155 return 0;
1154 } 1156 }
1155 #endif 1157 #endif
1156 1158
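Illustrative only: the two accessors compose into a queue-wait measurement; with CONFIG_BLK_CGROUP disabled both return 0, so the difference harmlessly degrades to 0 as well. The helper name is hypothetical.

static inline uint64_t example_rq_queue_wait_ns(struct request *req)
{
	/* Time between entering the block layer and being dispatched. */
	return rq_io_start_time_ns(req) - rq_start_time_ns(req);
}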
1157 #define MODULE_ALIAS_BLOCKDEV(major,minor) \ 1159 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
1158 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 1160 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
1159 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 1161 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
1160 MODULE_ALIAS("block-major-" __stringify(major) "-*") 1162 MODULE_ALIAS("block-major-" __stringify(major) "-*")
1161 1163
1162 #if defined(CONFIG_BLK_DEV_INTEGRITY) 1164 #if defined(CONFIG_BLK_DEV_INTEGRITY)
1163 1165
1164 #define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */ 1166 #define INTEGRITY_FLAG_READ 2 /* verify data integrity on read */
1165 #define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */ 1167 #define INTEGRITY_FLAG_WRITE 4 /* generate data integrity on write */
1166 1168
1167 struct blk_integrity_exchg { 1169 struct blk_integrity_exchg {
1168 void *prot_buf; 1170 void *prot_buf;
1169 void *data_buf; 1171 void *data_buf;
1170 sector_t sector; 1172 sector_t sector;
1171 unsigned int data_size; 1173 unsigned int data_size;
1172 unsigned short sector_size; 1174 unsigned short sector_size;
1173 const char *disk_name; 1175 const char *disk_name;
1174 }; 1176 };
1175 1177
1176 typedef void (integrity_gen_fn) (struct blk_integrity_exchg *); 1178 typedef void (integrity_gen_fn) (struct blk_integrity_exchg *);
1177 typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *); 1179 typedef int (integrity_vrfy_fn) (struct blk_integrity_exchg *);
1178 typedef void (integrity_set_tag_fn) (void *, void *, unsigned int); 1180 typedef void (integrity_set_tag_fn) (void *, void *, unsigned int);
1179 typedef void (integrity_get_tag_fn) (void *, void *, unsigned int); 1181 typedef void (integrity_get_tag_fn) (void *, void *, unsigned int);
1180 1182
1181 struct blk_integrity { 1183 struct blk_integrity {
1182 integrity_gen_fn *generate_fn; 1184 integrity_gen_fn *generate_fn;
1183 integrity_vrfy_fn *verify_fn; 1185 integrity_vrfy_fn *verify_fn;
1184 integrity_set_tag_fn *set_tag_fn; 1186 integrity_set_tag_fn *set_tag_fn;
1185 integrity_get_tag_fn *get_tag_fn; 1187 integrity_get_tag_fn *get_tag_fn;
1186 1188
1187 unsigned short flags; 1189 unsigned short flags;
1188 unsigned short tuple_size; 1190 unsigned short tuple_size;
1189 unsigned short sector_size; 1191 unsigned short sector_size;
1190 unsigned short tag_size; 1192 unsigned short tag_size;
1191 1193
1192 const char *name; 1194 const char *name;
1193 1195
1194 struct kobject kobj; 1196 struct kobject kobj;
1195 }; 1197 };
1196 1198
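A hedged driver-side sketch of filling in and registering the profile above; the callback functions, the profile name, and the 8-byte tuple size are assumptions for illustration, not taken from this commit.

static struct blk_integrity example_integrity = {
	.name		= "EXAMPLE-DIF",
	.generate_fn	= example_generate_fn,	/* fill prot_buf from data_buf */
	.verify_fn	= example_verify_fn,	/* check prot_buf, 0 or -EIO */
	.tuple_size	= 8,			/* metadata bytes per sector */
	.sector_size	= 512,
};

static int example_enable_integrity(struct gendisk *disk)
{
	return blk_integrity_register(disk, &example_integrity);
}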
1197 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *); 1199 extern int blk_integrity_register(struct gendisk *, struct blk_integrity *);
1198 extern void blk_integrity_unregister(struct gendisk *); 1200 extern void blk_integrity_unregister(struct gendisk *);
1199 extern int blk_integrity_compare(struct gendisk *, struct gendisk *); 1201 extern int blk_integrity_compare(struct gendisk *, struct gendisk *);
1200 extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *); 1202 extern int blk_rq_map_integrity_sg(struct request *, struct scatterlist *);
1201 extern int blk_rq_count_integrity_sg(struct request *); 1203 extern int blk_rq_count_integrity_sg(struct request *);
1202 1204
1203 static inline 1205 static inline
1204 struct blk_integrity *bdev_get_integrity(struct block_device *bdev) 1206 struct blk_integrity *bdev_get_integrity(struct block_device *bdev)
1205 { 1207 {
1206 return bdev->bd_disk->integrity; 1208 return bdev->bd_disk->integrity;
1207 } 1209 }
1208 1210
1209 static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk) 1211 static inline struct blk_integrity *blk_get_integrity(struct gendisk *disk)
1210 { 1212 {
1211 return disk->integrity; 1213 return disk->integrity;
1212 } 1214 }
1213 1215
1214 static inline int blk_integrity_rq(struct request *rq) 1216 static inline int blk_integrity_rq(struct request *rq)
1215 { 1217 {
1216 if (rq->bio == NULL) 1218 if (rq->bio == NULL)
1217 return 0; 1219 return 0;
1218 1220
1219 return bio_integrity(rq->bio); 1221 return bio_integrity(rq->bio);
1220 } 1222 }
1221 1223
1222 #else /* CONFIG_BLK_DEV_INTEGRITY */ 1224 #else /* CONFIG_BLK_DEV_INTEGRITY */
1223 1225
1224 #define blk_integrity_rq(rq) (0) 1226 #define blk_integrity_rq(rq) (0)
1225 #define blk_rq_count_integrity_sg(a) (0) 1227 #define blk_rq_count_integrity_sg(a) (0)
1226 #define blk_rq_map_integrity_sg(a, b) (0) 1228 #define blk_rq_map_integrity_sg(a, b) (0)
1227 #define bdev_get_integrity(a) (0) 1229 #define bdev_get_integrity(a) (0)
1228 #define blk_get_integrity(a) (0) 1230 #define blk_get_integrity(a) (0)
1229 #define blk_integrity_compare(a, b) (0) 1231 #define blk_integrity_compare(a, b) (0)
1230 #define blk_integrity_register(a, b) (0) 1232 #define blk_integrity_register(a, b) (0)
1231 #define blk_integrity_unregister(a) do { } while (0); 1233 #define blk_integrity_unregister(a) do { } while (0);
1232 1234
1233 #endif /* CONFIG_BLK_DEV_INTEGRITY */ 1235 #endif /* CONFIG_BLK_DEV_INTEGRITY */
1234 1236
1235 struct block_device_operations { 1237 struct block_device_operations {
1236 int (*open) (struct block_device *, fmode_t); 1238 int (*open) (struct block_device *, fmode_t);
1237 int (*release) (struct gendisk *, fmode_t); 1239 int (*release) (struct gendisk *, fmode_t);
1238 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1240 int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1239 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long); 1241 int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
1240 int (*direct_access) (struct block_device *, sector_t, 1242 int (*direct_access) (struct block_device *, sector_t,
1241 void **, unsigned long *); 1243 void **, unsigned long *);
1242 int (*media_changed) (struct gendisk *); 1244 int (*media_changed) (struct gendisk *);
1243 void (*unlock_native_capacity) (struct gendisk *); 1245 void (*unlock_native_capacity) (struct gendisk *);
1244 int (*revalidate_disk) (struct gendisk *); 1246 int (*revalidate_disk) (struct gendisk *);
1245 int (*getgeo)(struct block_device *, struct hd_geometry *); 1247 int (*getgeo)(struct block_device *, struct hd_geometry *);
1246 /* this callback is with swap_lock and sometimes page table lock held */ 1248 /* this callback is with swap_lock and sometimes page table lock held */
1247 void (*swap_slot_free_notify) (struct block_device *, unsigned long); 1249 void (*swap_slot_free_notify) (struct block_device *, unsigned long);
1248 struct module *owner; 1250 struct module *owner;
1249 }; 1251 };
1250 1252
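Illustrative only: a simple driver typically populates just a few of the methods above and points disk->fops at the table before add_disk(); the example_* handlers are assumed to exist elsewhere in such a driver.

static const struct block_device_operations example_fops = {
	.owner		= THIS_MODULE,
	.open		= example_open,
	.release	= example_release,
	.getgeo		= example_getgeo,	/* report a synthetic CHS geometry */
};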
1251 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int, 1253 extern int __blkdev_driver_ioctl(struct block_device *, fmode_t, unsigned int,
1252 unsigned long); 1254 unsigned long);
1253 #else /* CONFIG_BLOCK */ 1255 #else /* CONFIG_BLOCK */
1254 /* 1256 /*
1255 * stubs for when the block layer is configured out 1257 * stubs for when the block layer is configured out
1256 */ 1258 */
1257 #define buffer_heads_over_limit 0 1259 #define buffer_heads_over_limit 0
1258 1260
1259 static inline long nr_blockdev_pages(void) 1261 static inline long nr_blockdev_pages(void)
1260 { 1262 {
1261 return 0; 1263 return 0;
1262 } 1264 }
1263 1265
1264 #endif /* CONFIG_BLOCK */ 1266 #endif /* CONFIG_BLOCK */
1265 1267
1266 #endif 1268 #endif