Commit 6f0d7a9eb60d70f22d71f00b2c762e255881ab31

Authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.dk/linux-block

Pull block layer fixes from Jens Axboe:
 "Four small fixes that should be merged for the current 3.18-rc series.
  This pull request contains:

   - a minor bugfix for computation of best IO priority given two
     merging requests.  From Jan Kara.

   - the final (final) merge count issue that has been plaguing
     virtio-blk.  From Ming Lei.

   - enable parallel reinit notify for blk-mq queues, to combine the
     cost of an RCU grace period across lots of devices.  From Tejun
     Heo.

   - an error handling fix for the SCSI_IOCTL_SEND_COMMAND ioctl.  From
     Tony Battersby"

* 'for-linus' of git://git.kernel.dk/linux-block:
  block: blk-merge: fix blk_recount_segments()
  scsi: Fix more error handling in SCSI_IOCTL_SEND_COMMAND
  blk-mq: make mq_queue_reinit_notify() freeze queues in parallel
  block: Fix computation of merged request priority

Showing 4 changed files (inline diff)

1 /* 1 /*
2 * Functions related to segment and merge handling 2 * Functions related to segment and merge handling
3 */ 3 */
4 #include <linux/kernel.h> 4 #include <linux/kernel.h>
5 #include <linux/module.h> 5 #include <linux/module.h>
6 #include <linux/bio.h> 6 #include <linux/bio.h>
7 #include <linux/blkdev.h> 7 #include <linux/blkdev.h>
8 #include <linux/scatterlist.h> 8 #include <linux/scatterlist.h>
9 9
10 #include "blk.h" 10 #include "blk.h"
11 11
12 static unsigned int __blk_recalc_rq_segments(struct request_queue *q, 12 static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
13 struct bio *bio, 13 struct bio *bio,
14 bool no_sg_merge) 14 bool no_sg_merge)
15 { 15 {
16 struct bio_vec bv, bvprv = { NULL }; 16 struct bio_vec bv, bvprv = { NULL };
17 int cluster, high, highprv = 1; 17 int cluster, high, highprv = 1;
18 unsigned int seg_size, nr_phys_segs; 18 unsigned int seg_size, nr_phys_segs;
19 struct bio *fbio, *bbio; 19 struct bio *fbio, *bbio;
20 struct bvec_iter iter; 20 struct bvec_iter iter;
21 21
22 if (!bio) 22 if (!bio)
23 return 0; 23 return 0;
24 24
25 /* 25 /*
26 * This should probably be returning 0, but blk_add_request_payload() 26 * This should probably be returning 0, but blk_add_request_payload()
27 * (Christoph!!!!) 27 * (Christoph!!!!)
28 */ 28 */
29 if (bio->bi_rw & REQ_DISCARD) 29 if (bio->bi_rw & REQ_DISCARD)
30 return 1; 30 return 1;
31 31
32 if (bio->bi_rw & REQ_WRITE_SAME) 32 if (bio->bi_rw & REQ_WRITE_SAME)
33 return 1; 33 return 1;
34 34
35 fbio = bio; 35 fbio = bio;
36 cluster = blk_queue_cluster(q); 36 cluster = blk_queue_cluster(q);
37 seg_size = 0; 37 seg_size = 0;
38 nr_phys_segs = 0; 38 nr_phys_segs = 0;
39 high = 0; 39 high = 0;
40 for_each_bio(bio) { 40 for_each_bio(bio) {
41 bio_for_each_segment(bv, bio, iter) { 41 bio_for_each_segment(bv, bio, iter) {
42 /* 42 /*
43 * If SG merging is disabled, each bio vector is 43 * If SG merging is disabled, each bio vector is
44 * a segment 44 * a segment
45 */ 45 */
46 if (no_sg_merge) 46 if (no_sg_merge)
47 goto new_segment; 47 goto new_segment;
48 48
49 /* 49 /*
50 * the trick here is making sure that a high page is 50 * the trick here is making sure that a high page is
51 * never considered part of another segment, since 51 * never considered part of another segment, since
52 * that might change with the bounce page. 52 * that might change with the bounce page.
53 */ 53 */
54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q); 54 high = page_to_pfn(bv.bv_page) > queue_bounce_pfn(q);
55 if (!high && !highprv && cluster) { 55 if (!high && !highprv && cluster) {
56 if (seg_size + bv.bv_len 56 if (seg_size + bv.bv_len
57 > queue_max_segment_size(q)) 57 > queue_max_segment_size(q))
58 goto new_segment; 58 goto new_segment;
59 if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv)) 59 if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
60 goto new_segment; 60 goto new_segment;
61 if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv)) 61 if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
62 goto new_segment; 62 goto new_segment;
63 63
64 seg_size += bv.bv_len; 64 seg_size += bv.bv_len;
65 bvprv = bv; 65 bvprv = bv;
66 continue; 66 continue;
67 } 67 }
68 new_segment: 68 new_segment:
69 if (nr_phys_segs == 1 && seg_size > 69 if (nr_phys_segs == 1 && seg_size >
70 fbio->bi_seg_front_size) 70 fbio->bi_seg_front_size)
71 fbio->bi_seg_front_size = seg_size; 71 fbio->bi_seg_front_size = seg_size;
72 72
73 nr_phys_segs++; 73 nr_phys_segs++;
74 bvprv = bv; 74 bvprv = bv;
75 seg_size = bv.bv_len; 75 seg_size = bv.bv_len;
76 highprv = high; 76 highprv = high;
77 } 77 }
78 bbio = bio; 78 bbio = bio;
79 } 79 }
80 80
81 if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size) 81 if (nr_phys_segs == 1 && seg_size > fbio->bi_seg_front_size)
82 fbio->bi_seg_front_size = seg_size; 82 fbio->bi_seg_front_size = seg_size;
83 if (seg_size > bbio->bi_seg_back_size) 83 if (seg_size > bbio->bi_seg_back_size)
84 bbio->bi_seg_back_size = seg_size; 84 bbio->bi_seg_back_size = seg_size;
85 85
86 return nr_phys_segs; 86 return nr_phys_segs;
87 } 87 }
88 88
89 void blk_recalc_rq_segments(struct request *rq) 89 void blk_recalc_rq_segments(struct request *rq)
90 { 90 {
91 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE, 91 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
92 &rq->q->queue_flags); 92 &rq->q->queue_flags);
93 93
94 rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio, 94 rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio,
95 no_sg_merge); 95 no_sg_merge);
96 } 96 }
97 97
98 void blk_recount_segments(struct request_queue *q, struct bio *bio) 98 void blk_recount_segments(struct request_queue *q, struct bio *bio)
99 { 99 {
-100 bool no_sg_merge = !!test_bit(QUEUE_FLAG_NO_SG_MERGE,
-101 &q->queue_flags);
-102 bool merge_not_need = bio->bi_vcnt < queue_max_segments(q);
-103
-104 if (no_sg_merge && !bio_flagged(bio, BIO_CLONED) &&
-105 merge_not_need)
-106 bio->bi_phys_segments = bio->bi_vcnt;
+100 unsigned short seg_cnt;
+101
+102 /* estimate segment number by bi_vcnt for non-cloned bio */
+103 if (bio_flagged(bio, BIO_CLONED))
+104 seg_cnt = bio_segments(bio);
+105 else
+106 seg_cnt = bio->bi_vcnt;
+107
+108 if (test_bit(QUEUE_FLAG_NO_SG_MERGE, &q->queue_flags) &&
+109 (seg_cnt < queue_max_segments(q)))
+110 bio->bi_phys_segments = seg_cnt;
107 else { 111 else {
108 struct bio *nxt = bio->bi_next; 112 struct bio *nxt = bio->bi_next;
109 113
110 bio->bi_next = NULL; 114 bio->bi_next = NULL;
-111 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio,
-112 no_sg_merge && merge_not_need);
+115 bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio, false);
113 bio->bi_next = nxt; 116 bio->bi_next = nxt;
114 } 117 }
115 118
116 bio->bi_flags |= (1 << BIO_SEG_VALID); 119 bio->bi_flags |= (1 << BIO_SEG_VALID);
117 } 120 }
118 EXPORT_SYMBOL(blk_recount_segments); 121 EXPORT_SYMBOL(blk_recount_segments);
119 122
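
The blk_recount_segments() change above is the "block: blk-merge: fix blk_recount_segments()" entry from the shortlog, i.e. the virtio-blk merge-count fix from Ming Lei named in the pull description: a cloned bio shares its parent's bio_vec array and covers only part of it through its iterator, so bi_vcnt cannot be trusted as its segment count and bio_segments() must be used instead. The standalone sketch below models only that estimate; fake_bio, vec and seg_estimate are hypothetical names, not the kernel's struct bio or API.

    #include <stdio.h>

    struct vec { unsigned int len; };

    struct fake_bio {
            struct vec *bv;         /* bio_vec array, shared with the parent when cloned */
            unsigned int vcnt;      /* entries in that array (like bi_vcnt) */
            unsigned int iter_cnt;  /* entries this bio actually covers via its iterator */
            int cloned;             /* like BIO_CLONED */
    };

    /* mirrors the fixed estimate: trust vcnt only for a non-cloned bio,
     * otherwise count what the iterator covers (like bio_segments()) */
    static unsigned int seg_estimate(const struct fake_bio *bio)
    {
            return bio->cloned ? bio->iter_cnt : bio->vcnt;
    }

    int main(void)
    {
            struct vec pages[4] = { {4096}, {4096}, {4096}, {4096} };
            struct fake_bio parent = { pages, 4, 4, 0 };
            struct fake_bio clone  = { pages, 4, 2, 1 };  /* covers only half the vector */

            printf("parent estimate: %u\n", seg_estimate(&parent)); /* 4 */
            printf("clone estimate:  %u\n", seg_estimate(&clone));  /* 2, not 4 */
            return 0;
    }
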
120 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 123 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
121 struct bio *nxt) 124 struct bio *nxt)
122 { 125 {
123 struct bio_vec end_bv = { NULL }, nxt_bv; 126 struct bio_vec end_bv = { NULL }, nxt_bv;
124 struct bvec_iter iter; 127 struct bvec_iter iter;
125 128
126 if (!blk_queue_cluster(q)) 129 if (!blk_queue_cluster(q))
127 return 0; 130 return 0;
128 131
129 if (bio->bi_seg_back_size + nxt->bi_seg_front_size > 132 if (bio->bi_seg_back_size + nxt->bi_seg_front_size >
130 queue_max_segment_size(q)) 133 queue_max_segment_size(q))
131 return 0; 134 return 0;
132 135
133 if (!bio_has_data(bio)) 136 if (!bio_has_data(bio))
134 return 1; 137 return 1;
135 138
136 bio_for_each_segment(end_bv, bio, iter) 139 bio_for_each_segment(end_bv, bio, iter)
137 if (end_bv.bv_len == iter.bi_size) 140 if (end_bv.bv_len == iter.bi_size)
138 break; 141 break;
139 142
140 nxt_bv = bio_iovec(nxt); 143 nxt_bv = bio_iovec(nxt);
141 144
142 if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv)) 145 if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
143 return 0; 146 return 0;
144 147
145 /* 148 /*
146 * bio and nxt are contiguous in memory; check if the queue allows 149 * bio and nxt are contiguous in memory; check if the queue allows
147 * these two to be merged into one 150 * these two to be merged into one
148 */ 151 */
149 if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv)) 152 if (BIOVEC_SEG_BOUNDARY(q, &end_bv, &nxt_bv))
150 return 1; 153 return 1;
151 154
152 return 0; 155 return 0;
153 } 156 }
154 157
155 static inline void 158 static inline void
156 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec, 159 __blk_segment_map_sg(struct request_queue *q, struct bio_vec *bvec,
157 struct scatterlist *sglist, struct bio_vec *bvprv, 160 struct scatterlist *sglist, struct bio_vec *bvprv,
158 struct scatterlist **sg, int *nsegs, int *cluster) 161 struct scatterlist **sg, int *nsegs, int *cluster)
159 { 162 {
160 163
161 int nbytes = bvec->bv_len; 164 int nbytes = bvec->bv_len;
162 165
163 if (*sg && *cluster) { 166 if (*sg && *cluster) {
164 if ((*sg)->length + nbytes > queue_max_segment_size(q)) 167 if ((*sg)->length + nbytes > queue_max_segment_size(q))
165 goto new_segment; 168 goto new_segment;
166 169
167 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 170 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
168 goto new_segment; 171 goto new_segment;
169 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 172 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
170 goto new_segment; 173 goto new_segment;
171 174
172 (*sg)->length += nbytes; 175 (*sg)->length += nbytes;
173 } else { 176 } else {
174 new_segment: 177 new_segment:
175 if (!*sg) 178 if (!*sg)
176 *sg = sglist; 179 *sg = sglist;
177 else { 180 else {
178 /* 181 /*
179 * If the driver previously mapped a shorter 182 * If the driver previously mapped a shorter
180 * list, we could see a termination bit 183 * list, we could see a termination bit
181 * prematurely unless it fully inits the sg 184 * prematurely unless it fully inits the sg
182 * table on each mapping. We KNOW that there 185 * table on each mapping. We KNOW that there
183 * must be more entries here or the driver 186 * must be more entries here or the driver
184 * would be buggy, so force clear the 187 * would be buggy, so force clear the
185 * termination bit to avoid doing a full 188 * termination bit to avoid doing a full
186 * sg_init_table() in drivers for each command. 189 * sg_init_table() in drivers for each command.
187 */ 190 */
188 sg_unmark_end(*sg); 191 sg_unmark_end(*sg);
189 *sg = sg_next(*sg); 192 *sg = sg_next(*sg);
190 } 193 }
191 194
192 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset); 195 sg_set_page(*sg, bvec->bv_page, nbytes, bvec->bv_offset);
193 (*nsegs)++; 196 (*nsegs)++;
194 } 197 }
195 *bvprv = *bvec; 198 *bvprv = *bvec;
196 } 199 }
197 200
198 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, 201 static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
199 struct scatterlist *sglist, 202 struct scatterlist *sglist,
200 struct scatterlist **sg) 203 struct scatterlist **sg)
201 { 204 {
202 struct bio_vec bvec, bvprv = { NULL }; 205 struct bio_vec bvec, bvprv = { NULL };
203 struct bvec_iter iter; 206 struct bvec_iter iter;
204 int nsegs, cluster; 207 int nsegs, cluster;
205 208
206 nsegs = 0; 209 nsegs = 0;
207 cluster = blk_queue_cluster(q); 210 cluster = blk_queue_cluster(q);
208 211
209 if (bio->bi_rw & REQ_DISCARD) { 212 if (bio->bi_rw & REQ_DISCARD) {
210 /* 213 /*
211 * This is a hack - drivers should be neither modifying the 214 * This is a hack - drivers should be neither modifying the
212 * biovec, nor relying on bi_vcnt - but because of 215 * biovec, nor relying on bi_vcnt - but because of
213 * blk_add_request_payload(), a discard bio may or may not have 216 * blk_add_request_payload(), a discard bio may or may not have
214 * a payload we need to set up here (thank you Christoph) and 217 * a payload we need to set up here (thank you Christoph) and
215 * bi_vcnt is really the only way of telling if we need to. 218 * bi_vcnt is really the only way of telling if we need to.
216 */ 219 */
217 220
218 if (bio->bi_vcnt) 221 if (bio->bi_vcnt)
219 goto single_segment; 222 goto single_segment;
220 223
221 return 0; 224 return 0;
222 } 225 }
223 226
224 if (bio->bi_rw & REQ_WRITE_SAME) { 227 if (bio->bi_rw & REQ_WRITE_SAME) {
225 single_segment: 228 single_segment:
226 *sg = sglist; 229 *sg = sglist;
227 bvec = bio_iovec(bio); 230 bvec = bio_iovec(bio);
228 sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); 231 sg_set_page(*sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
229 return 1; 232 return 1;
230 } 233 }
231 234
232 for_each_bio(bio) 235 for_each_bio(bio)
233 bio_for_each_segment(bvec, bio, iter) 236 bio_for_each_segment(bvec, bio, iter)
234 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg, 237 __blk_segment_map_sg(q, &bvec, sglist, &bvprv, sg,
235 &nsegs, &cluster); 238 &nsegs, &cluster);
236 239
237 return nsegs; 240 return nsegs;
238 } 241 }
239 242
240 /* 243 /*
241 * map a request to scatterlist, return number of sg entries setup. Caller 244 * map a request to scatterlist, return number of sg entries setup. Caller
242 * must make sure sg can hold rq->nr_phys_segments entries 245 * must make sure sg can hold rq->nr_phys_segments entries
243 */ 246 */
244 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 247 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
245 struct scatterlist *sglist) 248 struct scatterlist *sglist)
246 { 249 {
247 struct scatterlist *sg = NULL; 250 struct scatterlist *sg = NULL;
248 int nsegs = 0; 251 int nsegs = 0;
249 252
250 if (rq->bio) 253 if (rq->bio)
251 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg); 254 nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
252 255
253 if (unlikely(rq->cmd_flags & REQ_COPY_USER) && 256 if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
254 (blk_rq_bytes(rq) & q->dma_pad_mask)) { 257 (blk_rq_bytes(rq) & q->dma_pad_mask)) {
255 unsigned int pad_len = 258 unsigned int pad_len =
256 (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; 259 (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
257 260
258 sg->length += pad_len; 261 sg->length += pad_len;
259 rq->extra_len += pad_len; 262 rq->extra_len += pad_len;
260 } 263 }
261 264
262 if (q->dma_drain_size && q->dma_drain_needed(rq)) { 265 if (q->dma_drain_size && q->dma_drain_needed(rq)) {
263 if (rq->cmd_flags & REQ_WRITE) 266 if (rq->cmd_flags & REQ_WRITE)
264 memset(q->dma_drain_buffer, 0, q->dma_drain_size); 267 memset(q->dma_drain_buffer, 0, q->dma_drain_size);
265 268
266 sg->page_link &= ~0x02; 269 sg->page_link &= ~0x02;
267 sg = sg_next(sg); 270 sg = sg_next(sg);
268 sg_set_page(sg, virt_to_page(q->dma_drain_buffer), 271 sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
269 q->dma_drain_size, 272 q->dma_drain_size,
270 ((unsigned long)q->dma_drain_buffer) & 273 ((unsigned long)q->dma_drain_buffer) &
271 (PAGE_SIZE - 1)); 274 (PAGE_SIZE - 1));
272 nsegs++; 275 nsegs++;
273 rq->extra_len += q->dma_drain_size; 276 rq->extra_len += q->dma_drain_size;
274 } 277 }
275 278
276 if (sg) 279 if (sg)
277 sg_mark_end(sg); 280 sg_mark_end(sg);
278 281
279 return nsegs; 282 return nsegs;
280 } 283 }
281 EXPORT_SYMBOL(blk_rq_map_sg); 284 EXPORT_SYMBOL(blk_rq_map_sg);
282 285
283 /** 286 /**
284 * blk_bio_map_sg - map a bio to a scatterlist 287 * blk_bio_map_sg - map a bio to a scatterlist
285 * @q: request_queue in question 288 * @q: request_queue in question
286 * @bio: bio being mapped 289 * @bio: bio being mapped
287 * @sglist: scatterlist being mapped 290 * @sglist: scatterlist being mapped
288 * 291 *
289 * Note: 292 * Note:
290 * Caller must make sure sg can hold bio->bi_phys_segments entries 293 * Caller must make sure sg can hold bio->bi_phys_segments entries
291 * 294 *
292 * Will return the number of sg entries setup 295 * Will return the number of sg entries setup
293 */ 296 */
294 int blk_bio_map_sg(struct request_queue *q, struct bio *bio, 297 int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
295 struct scatterlist *sglist) 298 struct scatterlist *sglist)
296 { 299 {
297 struct scatterlist *sg = NULL; 300 struct scatterlist *sg = NULL;
298 int nsegs; 301 int nsegs;
299 struct bio *next = bio->bi_next; 302 struct bio *next = bio->bi_next;
300 bio->bi_next = NULL; 303 bio->bi_next = NULL;
301 304
302 nsegs = __blk_bios_map_sg(q, bio, sglist, &sg); 305 nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
303 bio->bi_next = next; 306 bio->bi_next = next;
304 if (sg) 307 if (sg)
305 sg_mark_end(sg); 308 sg_mark_end(sg);
306 309
307 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments); 310 BUG_ON(bio->bi_phys_segments && nsegs > bio->bi_phys_segments);
308 return nsegs; 311 return nsegs;
309 } 312 }
310 EXPORT_SYMBOL(blk_bio_map_sg); 313 EXPORT_SYMBOL(blk_bio_map_sg);
311 314
312 static inline int ll_new_hw_segment(struct request_queue *q, 315 static inline int ll_new_hw_segment(struct request_queue *q,
313 struct request *req, 316 struct request *req,
314 struct bio *bio) 317 struct bio *bio)
315 { 318 {
316 int nr_phys_segs = bio_phys_segments(q, bio); 319 int nr_phys_segs = bio_phys_segments(q, bio);
317 320
318 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q)) 321 if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
319 goto no_merge; 322 goto no_merge;
320 323
321 if (blk_integrity_merge_bio(q, req, bio) == false) 324 if (blk_integrity_merge_bio(q, req, bio) == false)
322 goto no_merge; 325 goto no_merge;
323 326
324 /* 327 /*
325 * This will form the start of a new hw segment. Bump both 328 * This will form the start of a new hw segment. Bump both
326 * counters. 329 * counters.
327 */ 330 */
328 req->nr_phys_segments += nr_phys_segs; 331 req->nr_phys_segments += nr_phys_segs;
329 return 1; 332 return 1;
330 333
331 no_merge: 334 no_merge:
332 req->cmd_flags |= REQ_NOMERGE; 335 req->cmd_flags |= REQ_NOMERGE;
333 if (req == q->last_merge) 336 if (req == q->last_merge)
334 q->last_merge = NULL; 337 q->last_merge = NULL;
335 return 0; 338 return 0;
336 } 339 }
337 340
338 int ll_back_merge_fn(struct request_queue *q, struct request *req, 341 int ll_back_merge_fn(struct request_queue *q, struct request *req,
339 struct bio *bio) 342 struct bio *bio)
340 { 343 {
341 if (blk_rq_sectors(req) + bio_sectors(bio) > 344 if (blk_rq_sectors(req) + bio_sectors(bio) >
342 blk_rq_get_max_sectors(req)) { 345 blk_rq_get_max_sectors(req)) {
343 req->cmd_flags |= REQ_NOMERGE; 346 req->cmd_flags |= REQ_NOMERGE;
344 if (req == q->last_merge) 347 if (req == q->last_merge)
345 q->last_merge = NULL; 348 q->last_merge = NULL;
346 return 0; 349 return 0;
347 } 350 }
348 if (!bio_flagged(req->biotail, BIO_SEG_VALID)) 351 if (!bio_flagged(req->biotail, BIO_SEG_VALID))
349 blk_recount_segments(q, req->biotail); 352 blk_recount_segments(q, req->biotail);
350 if (!bio_flagged(bio, BIO_SEG_VALID)) 353 if (!bio_flagged(bio, BIO_SEG_VALID))
351 blk_recount_segments(q, bio); 354 blk_recount_segments(q, bio);
352 355
353 return ll_new_hw_segment(q, req, bio); 356 return ll_new_hw_segment(q, req, bio);
354 } 357 }
355 358
356 int ll_front_merge_fn(struct request_queue *q, struct request *req, 359 int ll_front_merge_fn(struct request_queue *q, struct request *req,
357 struct bio *bio) 360 struct bio *bio)
358 { 361 {
359 if (blk_rq_sectors(req) + bio_sectors(bio) > 362 if (blk_rq_sectors(req) + bio_sectors(bio) >
360 blk_rq_get_max_sectors(req)) { 363 blk_rq_get_max_sectors(req)) {
361 req->cmd_flags |= REQ_NOMERGE; 364 req->cmd_flags |= REQ_NOMERGE;
362 if (req == q->last_merge) 365 if (req == q->last_merge)
363 q->last_merge = NULL; 366 q->last_merge = NULL;
364 return 0; 367 return 0;
365 } 368 }
366 if (!bio_flagged(bio, BIO_SEG_VALID)) 369 if (!bio_flagged(bio, BIO_SEG_VALID))
367 blk_recount_segments(q, bio); 370 blk_recount_segments(q, bio);
368 if (!bio_flagged(req->bio, BIO_SEG_VALID)) 371 if (!bio_flagged(req->bio, BIO_SEG_VALID))
369 blk_recount_segments(q, req->bio); 372 blk_recount_segments(q, req->bio);
370 373
371 return ll_new_hw_segment(q, req, bio); 374 return ll_new_hw_segment(q, req, bio);
372 } 375 }
373 376
374 /* 377 /*
375 * blk-mq uses req->special to carry normal driver per-request payload, it 378 * blk-mq uses req->special to carry normal driver per-request payload, it
376 * does not indicate a prepared command that we cannot merge with. 379 * does not indicate a prepared command that we cannot merge with.
377 */ 380 */
378 static bool req_no_special_merge(struct request *req) 381 static bool req_no_special_merge(struct request *req)
379 { 382 {
380 struct request_queue *q = req->q; 383 struct request_queue *q = req->q;
381 384
382 return !q->mq_ops && req->special; 385 return !q->mq_ops && req->special;
383 } 386 }
384 387
385 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 388 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
386 struct request *next) 389 struct request *next)
387 { 390 {
388 int total_phys_segments; 391 int total_phys_segments;
389 unsigned int seg_size = 392 unsigned int seg_size =
390 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size; 393 req->biotail->bi_seg_back_size + next->bio->bi_seg_front_size;
391 394
392 /* 395 /*
393 * First check if the either of the requests are re-queued 396 * First check if the either of the requests are re-queued
394 * requests. Can't merge them if they are. 397 * requests. Can't merge them if they are.
395 */ 398 */
396 if (req_no_special_merge(req) || req_no_special_merge(next)) 399 if (req_no_special_merge(req) || req_no_special_merge(next))
397 return 0; 400 return 0;
398 401
399 /* 402 /*
400 * Will it become too large? 403 * Will it become too large?
401 */ 404 */
402 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > 405 if ((blk_rq_sectors(req) + blk_rq_sectors(next)) >
403 blk_rq_get_max_sectors(req)) 406 blk_rq_get_max_sectors(req))
404 return 0; 407 return 0;
405 408
406 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 409 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
407 if (blk_phys_contig_segment(q, req->biotail, next->bio)) { 410 if (blk_phys_contig_segment(q, req->biotail, next->bio)) {
408 if (req->nr_phys_segments == 1) 411 if (req->nr_phys_segments == 1)
409 req->bio->bi_seg_front_size = seg_size; 412 req->bio->bi_seg_front_size = seg_size;
410 if (next->nr_phys_segments == 1) 413 if (next->nr_phys_segments == 1)
411 next->biotail->bi_seg_back_size = seg_size; 414 next->biotail->bi_seg_back_size = seg_size;
412 total_phys_segments--; 415 total_phys_segments--;
413 } 416 }
414 417
415 if (total_phys_segments > queue_max_segments(q)) 418 if (total_phys_segments > queue_max_segments(q))
416 return 0; 419 return 0;
417 420
418 if (blk_integrity_merge_rq(q, req, next) == false) 421 if (blk_integrity_merge_rq(q, req, next) == false)
419 return 0; 422 return 0;
420 423
421 /* Merge is OK... */ 424 /* Merge is OK... */
422 req->nr_phys_segments = total_phys_segments; 425 req->nr_phys_segments = total_phys_segments;
423 return 1; 426 return 1;
424 } 427 }
425 428
426 /** 429 /**
427 * blk_rq_set_mixed_merge - mark a request as mixed merge 430 * blk_rq_set_mixed_merge - mark a request as mixed merge
428 * @rq: request to mark as mixed merge 431 * @rq: request to mark as mixed merge
429 * 432 *
430 * Description: 433 * Description:
431 * @rq is about to be mixed merged. Make sure the attributes 434 * @rq is about to be mixed merged. Make sure the attributes
432 * which can be mixed are set in each bio and mark @rq as mixed 435 * which can be mixed are set in each bio and mark @rq as mixed
433 * merged. 436 * merged.
434 */ 437 */
435 void blk_rq_set_mixed_merge(struct request *rq) 438 void blk_rq_set_mixed_merge(struct request *rq)
436 { 439 {
437 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; 440 unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
438 struct bio *bio; 441 struct bio *bio;
439 442
440 if (rq->cmd_flags & REQ_MIXED_MERGE) 443 if (rq->cmd_flags & REQ_MIXED_MERGE)
441 return; 444 return;
442 445
443 /* 446 /*
444 * @rq will no longer represent mixable attributes for all the 447 * @rq will no longer represent mixable attributes for all the
445 * contained bios. It will just track those of the first one. 448 * contained bios. It will just track those of the first one.
446 * Distributes the attributs to each bio. 449 * Distributes the attributs to each bio.
447 */ 450 */
448 for (bio = rq->bio; bio; bio = bio->bi_next) { 451 for (bio = rq->bio; bio; bio = bio->bi_next) {
449 WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) && 452 WARN_ON_ONCE((bio->bi_rw & REQ_FAILFAST_MASK) &&
450 (bio->bi_rw & REQ_FAILFAST_MASK) != ff); 453 (bio->bi_rw & REQ_FAILFAST_MASK) != ff);
451 bio->bi_rw |= ff; 454 bio->bi_rw |= ff;
452 } 455 }
453 rq->cmd_flags |= REQ_MIXED_MERGE; 456 rq->cmd_flags |= REQ_MIXED_MERGE;
454 } 457 }
455 458
456 static void blk_account_io_merge(struct request *req) 459 static void blk_account_io_merge(struct request *req)
457 { 460 {
458 if (blk_do_io_stat(req)) { 461 if (blk_do_io_stat(req)) {
459 struct hd_struct *part; 462 struct hd_struct *part;
460 int cpu; 463 int cpu;
461 464
462 cpu = part_stat_lock(); 465 cpu = part_stat_lock();
463 part = req->part; 466 part = req->part;
464 467
465 part_round_stats(cpu, part); 468 part_round_stats(cpu, part);
466 part_dec_in_flight(part, rq_data_dir(req)); 469 part_dec_in_flight(part, rq_data_dir(req));
467 470
468 hd_struct_put(part); 471 hd_struct_put(part);
469 part_stat_unlock(); 472 part_stat_unlock();
470 } 473 }
471 } 474 }
472 475
473 /* 476 /*
474 * Has to be called with the request spinlock acquired 477 * Has to be called with the request spinlock acquired
475 */ 478 */
476 static int attempt_merge(struct request_queue *q, struct request *req, 479 static int attempt_merge(struct request_queue *q, struct request *req,
477 struct request *next) 480 struct request *next)
478 { 481 {
479 if (!rq_mergeable(req) || !rq_mergeable(next)) 482 if (!rq_mergeable(req) || !rq_mergeable(next))
480 return 0; 483 return 0;
481 484
482 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags)) 485 if (!blk_check_merge_flags(req->cmd_flags, next->cmd_flags))
483 return 0; 486 return 0;
484 487
485 /* 488 /*
486 * not contiguous 489 * not contiguous
487 */ 490 */
488 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) 491 if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next))
489 return 0; 492 return 0;
490 493
491 if (rq_data_dir(req) != rq_data_dir(next) 494 if (rq_data_dir(req) != rq_data_dir(next)
492 || req->rq_disk != next->rq_disk 495 || req->rq_disk != next->rq_disk
493 || req_no_special_merge(next)) 496 || req_no_special_merge(next))
494 return 0; 497 return 0;
495 498
496 if (req->cmd_flags & REQ_WRITE_SAME && 499 if (req->cmd_flags & REQ_WRITE_SAME &&
497 !blk_write_same_mergeable(req->bio, next->bio)) 500 !blk_write_same_mergeable(req->bio, next->bio))
498 return 0; 501 return 0;
499 502
500 /* 503 /*
501 * If we are allowed to merge, then append bio list 504 * If we are allowed to merge, then append bio list
502 * from next to rq and release next. merge_requests_fn 505 * from next to rq and release next. merge_requests_fn
503 * will have updated segment counts, update sector 506 * will have updated segment counts, update sector
504 * counts here. 507 * counts here.
505 */ 508 */
506 if (!ll_merge_requests_fn(q, req, next)) 509 if (!ll_merge_requests_fn(q, req, next))
507 return 0; 510 return 0;
508 511
509 /* 512 /*
510 * If failfast settings disagree or any of the two is already 513 * If failfast settings disagree or any of the two is already
511 * a mixed merge, mark both as mixed before proceeding. This 514 * a mixed merge, mark both as mixed before proceeding. This
512 * makes sure that all involved bios have mixable attributes 515 * makes sure that all involved bios have mixable attributes
513 * set properly. 516 * set properly.
514 */ 517 */
515 if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE || 518 if ((req->cmd_flags | next->cmd_flags) & REQ_MIXED_MERGE ||
516 (req->cmd_flags & REQ_FAILFAST_MASK) != 519 (req->cmd_flags & REQ_FAILFAST_MASK) !=
517 (next->cmd_flags & REQ_FAILFAST_MASK)) { 520 (next->cmd_flags & REQ_FAILFAST_MASK)) {
518 blk_rq_set_mixed_merge(req); 521 blk_rq_set_mixed_merge(req);
519 blk_rq_set_mixed_merge(next); 522 blk_rq_set_mixed_merge(next);
520 } 523 }
521 524
522 /* 525 /*
523 * At this point we have either done a back merge 526 * At this point we have either done a back merge
524 * or front merge. We need the smaller start_time of 527 * or front merge. We need the smaller start_time of
525 * the merged requests to be the current request 528 * the merged requests to be the current request
526 * for accounting purposes. 529 * for accounting purposes.
527 */ 530 */
528 if (time_after(req->start_time, next->start_time)) 531 if (time_after(req->start_time, next->start_time))
529 req->start_time = next->start_time; 532 req->start_time = next->start_time;
530 533
531 req->biotail->bi_next = next->bio; 534 req->biotail->bi_next = next->bio;
532 req->biotail = next->biotail; 535 req->biotail = next->biotail;
533 536
534 req->__data_len += blk_rq_bytes(next); 537 req->__data_len += blk_rq_bytes(next);
535 538
536 elv_merge_requests(q, req, next); 539 elv_merge_requests(q, req, next);
537 540
538 /* 541 /*
539 * 'next' is going away, so update stats accordingly 542 * 'next' is going away, so update stats accordingly
540 */ 543 */
541 blk_account_io_merge(next); 544 blk_account_io_merge(next);
542 545
543 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 546 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
544 if (blk_rq_cpu_valid(next)) 547 if (blk_rq_cpu_valid(next))
545 req->cpu = next->cpu; 548 req->cpu = next->cpu;
546 549
547 /* owner-ship of bio passed from next to req */ 550 /* owner-ship of bio passed from next to req */
548 next->bio = NULL; 551 next->bio = NULL;
549 __blk_put_request(q, next); 552 __blk_put_request(q, next);
550 return 1; 553 return 1;
551 } 554 }
552 555
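
The call just above, req->ioprio = ioprio_best(req->ioprio, next->ioprio), is where a merged request's priority is computed; the "Fix computation of merged request priority" change from Jan Kara mentioned in the pull description lands in ioprio_best() itself, outside this excerpt. Below is a standalone model of the intended semantics only, with hypothetical names and a made-up priority encoding (not the kernel's IOPRIO values): a request whose priority was never set is treated as the best-effort default before comparing, so it cannot "win" over an explicitly set priority.

    #include <stdio.h>

    /* hypothetical classes, smaller value = stronger class; NONE = never set */
    enum prio_class { CLASS_NONE, CLASS_RT, CLASS_BE, CLASS_IDLE };

    struct prio { enum prio_class class; int level; /* lower level = higher prio */ };

    /* pick the effective priority for a merged request: map an unset priority
     * to the best-effort default first, then compare class, then level */
    static struct prio prio_best(struct prio a, struct prio b)
    {
            const struct prio def = { CLASS_BE, 4 };

            if (a.class == CLASS_NONE)
                    a = def;
            if (b.class == CLASS_NONE)
                    b = def;

            if (a.class != b.class)
                    return a.class < b.class ? a : b;
            return a.level <= b.level ? a : b;
    }

    int main(void)
    {
            struct prio unset = { CLASS_NONE, 0 };
            struct prio rt0   = { CLASS_RT, 0 };
            struct prio best  = prio_best(unset, rt0);

            /* the explicit realtime priority wins over the unset one */
            printf("class %d level %d\n", best.class, best.level);
            return 0;
    }
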
553 int attempt_back_merge(struct request_queue *q, struct request *rq) 556 int attempt_back_merge(struct request_queue *q, struct request *rq)
554 { 557 {
555 struct request *next = elv_latter_request(q, rq); 558 struct request *next = elv_latter_request(q, rq);
556 559
557 if (next) 560 if (next)
558 return attempt_merge(q, rq, next); 561 return attempt_merge(q, rq, next);
559 562
560 return 0; 563 return 0;
561 } 564 }
562 565
563 int attempt_front_merge(struct request_queue *q, struct request *rq) 566 int attempt_front_merge(struct request_queue *q, struct request *rq)
564 { 567 {
565 struct request *prev = elv_former_request(q, rq); 568 struct request *prev = elv_former_request(q, rq);
566 569
567 if (prev) 570 if (prev)
568 return attempt_merge(q, prev, rq); 571 return attempt_merge(q, prev, rq);
569 572
570 return 0; 573 return 0;
571 } 574 }
572 575
573 int blk_attempt_req_merge(struct request_queue *q, struct request *rq, 576 int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
574 struct request *next) 577 struct request *next)
575 { 578 {
576 return attempt_merge(q, rq, next); 579 return attempt_merge(q, rq, next);
577 } 580 }
578 581
579 bool blk_rq_merge_ok(struct request *rq, struct bio *bio) 582 bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
580 { 583 {
581 struct request_queue *q = rq->q; 584 struct request_queue *q = rq->q;
582 585
583 if (!rq_mergeable(rq) || !bio_mergeable(bio)) 586 if (!rq_mergeable(rq) || !bio_mergeable(bio))
584 return false; 587 return false;
585 588
586 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw)) 589 if (!blk_check_merge_flags(rq->cmd_flags, bio->bi_rw))
587 return false; 590 return false;
588 591
589 /* different data direction or already started, don't merge */ 592 /* different data direction or already started, don't merge */
590 if (bio_data_dir(bio) != rq_data_dir(rq)) 593 if (bio_data_dir(bio) != rq_data_dir(rq))
591 return false; 594 return false;
592 595
593 /* must be same device and not a special request */ 596 /* must be same device and not a special request */
594 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq)) 597 if (rq->rq_disk != bio->bi_bdev->bd_disk || req_no_special_merge(rq))
595 return false; 598 return false;
596 599
597 /* only merge integrity protected bio into ditto rq */ 600 /* only merge integrity protected bio into ditto rq */
598 if (blk_integrity_merge_bio(rq->q, rq, bio) == false) 601 if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
599 return false; 602 return false;
600 603
601 /* must be using the same buffer */ 604 /* must be using the same buffer */
602 if (rq->cmd_flags & REQ_WRITE_SAME && 605 if (rq->cmd_flags & REQ_WRITE_SAME &&
603 !blk_write_same_mergeable(rq->bio, bio)) 606 !blk_write_same_mergeable(rq->bio, bio))
604 return false; 607 return false;
605 608
606 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) { 609 if (q->queue_flags & (1 << QUEUE_FLAG_SG_GAPS)) {
607 struct bio_vec *bprev; 610 struct bio_vec *bprev;
608 611
609 bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1]; 612 bprev = &rq->biotail->bi_io_vec[bio->bi_vcnt - 1];
610 if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset)) 613 if (bvec_gap_to_prev(bprev, bio->bi_io_vec[0].bv_offset))
611 return false; 614 return false;
612 } 615 }
613 616
614 return true; 617 return true;
615 } 618 }
616 619
617 int blk_try_merge(struct request *rq, struct bio *bio) 620 int blk_try_merge(struct request *rq, struct bio *bio)
618 { 621 {
619 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) 622 if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector)
620 return ELEVATOR_BACK_MERGE; 623 return ELEVATOR_BACK_MERGE;
621 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) 624 else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector)
622 return ELEVATOR_FRONT_MERGE; 625 return ELEVATOR_FRONT_MERGE;
1 /* 1 /*
2 * Block multiqueue core code 2 * Block multiqueue core code
3 * 3 *
4 * Copyright (C) 2013-2014 Jens Axboe 4 * Copyright (C) 2013-2014 Jens Axboe
5 * Copyright (C) 2013-2014 Christoph Hellwig 5 * Copyright (C) 2013-2014 Christoph Hellwig
6 */ 6 */
7 #include <linux/kernel.h> 7 #include <linux/kernel.h>
8 #include <linux/module.h> 8 #include <linux/module.h>
9 #include <linux/backing-dev.h> 9 #include <linux/backing-dev.h>
10 #include <linux/bio.h> 10 #include <linux/bio.h>
11 #include <linux/blkdev.h> 11 #include <linux/blkdev.h>
12 #include <linux/mm.h> 12 #include <linux/mm.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/slab.h> 14 #include <linux/slab.h>
15 #include <linux/workqueue.h> 15 #include <linux/workqueue.h>
16 #include <linux/smp.h> 16 #include <linux/smp.h>
17 #include <linux/llist.h> 17 #include <linux/llist.h>
18 #include <linux/list_sort.h> 18 #include <linux/list_sort.h>
19 #include <linux/cpu.h> 19 #include <linux/cpu.h>
20 #include <linux/cache.h> 20 #include <linux/cache.h>
21 #include <linux/sched/sysctl.h> 21 #include <linux/sched/sysctl.h>
22 #include <linux/delay.h> 22 #include <linux/delay.h>
23 #include <linux/crash_dump.h> 23 #include <linux/crash_dump.h>
24 24
25 #include <trace/events/block.h> 25 #include <trace/events/block.h>
26 26
27 #include <linux/blk-mq.h> 27 #include <linux/blk-mq.h>
28 #include "blk.h" 28 #include "blk.h"
29 #include "blk-mq.h" 29 #include "blk-mq.h"
30 #include "blk-mq-tag.h" 30 #include "blk-mq-tag.h"
31 31
32 static DEFINE_MUTEX(all_q_mutex); 32 static DEFINE_MUTEX(all_q_mutex);
33 static LIST_HEAD(all_q_list); 33 static LIST_HEAD(all_q_list);
34 34
35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);
36 36
37 /* 37 /*
38 * Check if any of the ctx's have pending work in this hardware queue 38 * Check if any of the ctx's have pending work in this hardware queue
39 */ 39 */
40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 40 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
41 { 41 {
42 unsigned int i; 42 unsigned int i;
43 43
44 for (i = 0; i < hctx->ctx_map.map_size; i++) 44 for (i = 0; i < hctx->ctx_map.map_size; i++)
45 if (hctx->ctx_map.map[i].word) 45 if (hctx->ctx_map.map[i].word)
46 return true; 46 return true;
47 47
48 return false; 48 return false;
49 } 49 }
50 50
51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, 51 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx,
52 struct blk_mq_ctx *ctx) 52 struct blk_mq_ctx *ctx)
53 { 53 {
54 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; 54 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word];
55 } 55 }
56 56
57 #define CTX_TO_BIT(hctx, ctx) \ 57 #define CTX_TO_BIT(hctx, ctx) \
58 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) 58 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1))
59 59
60 /* 60 /*
61 * Mark this ctx as having pending work in this hardware queue 61 * Mark this ctx as having pending work in this hardware queue
62 */ 62 */
63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 63 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
64 struct blk_mq_ctx *ctx) 64 struct blk_mq_ctx *ctx)
65 { 65 {
66 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 66 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
67 67
68 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) 68 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word))
69 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 69 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
70 } 70 }
71 71
72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 72 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
73 struct blk_mq_ctx *ctx) 73 struct blk_mq_ctx *ctx)
74 { 74 {
75 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 75 struct blk_align_bitmap *bm = get_bm(hctx, ctx);
76 76
77 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 77 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word);
78 } 78 }
79 79
80 static int blk_mq_queue_enter(struct request_queue *q) 80 static int blk_mq_queue_enter(struct request_queue *q)
81 { 81 {
82 while (true) { 82 while (true) {
83 int ret; 83 int ret;
84 84
85 if (percpu_ref_tryget_live(&q->mq_usage_counter)) 85 if (percpu_ref_tryget_live(&q->mq_usage_counter))
86 return 0; 86 return 0;
87 87
88 ret = wait_event_interruptible(q->mq_freeze_wq, 88 ret = wait_event_interruptible(q->mq_freeze_wq,
89 !q->mq_freeze_depth || blk_queue_dying(q)); 89 !q->mq_freeze_depth || blk_queue_dying(q));
90 if (blk_queue_dying(q)) 90 if (blk_queue_dying(q))
91 return -ENODEV; 91 return -ENODEV;
92 if (ret) 92 if (ret)
93 return ret; 93 return ret;
94 } 94 }
95 } 95 }
96 96
97 static void blk_mq_queue_exit(struct request_queue *q) 97 static void blk_mq_queue_exit(struct request_queue *q)
98 { 98 {
99 percpu_ref_put(&q->mq_usage_counter); 99 percpu_ref_put(&q->mq_usage_counter);
100 } 100 }
101 101
102 static void blk_mq_usage_counter_release(struct percpu_ref *ref) 102 static void blk_mq_usage_counter_release(struct percpu_ref *ref)
103 { 103 {
104 struct request_queue *q = 104 struct request_queue *q =
105 container_of(ref, struct request_queue, mq_usage_counter); 105 container_of(ref, struct request_queue, mq_usage_counter);
106 106
107 wake_up_all(&q->mq_freeze_wq); 107 wake_up_all(&q->mq_freeze_wq);
108 } 108 }
109 109
-110 /*
-111 * Guarantee no request is in use, so we can change any data structure of
-112 * the queue afterward.
-113 */
-114 void blk_mq_freeze_queue(struct request_queue *q)
+110 static void blk_mq_freeze_queue_start(struct request_queue *q)
115 { 111 {
116 bool freeze; 112 bool freeze;
117 113
118 spin_lock_irq(q->queue_lock); 114 spin_lock_irq(q->queue_lock);
119 freeze = !q->mq_freeze_depth++; 115 freeze = !q->mq_freeze_depth++;
120 spin_unlock_irq(q->queue_lock); 116 spin_unlock_irq(q->queue_lock);
121 117
122 if (freeze) { 118 if (freeze) {
123 percpu_ref_kill(&q->mq_usage_counter); 119 percpu_ref_kill(&q->mq_usage_counter);
124 blk_mq_run_queues(q, false); 120 blk_mq_run_queues(q, false);
125 } 121 }
+122 }
+123
+124 static void blk_mq_freeze_queue_wait(struct request_queue *q)
+125 {
126 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); 126 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter));
127 } 127 }
128 128
+129 /*
+130 * Guarantee no request is in use, so we can change any data structure of
+131 * the queue afterward.
+132 */
+133 void blk_mq_freeze_queue(struct request_queue *q)
+134 {
+135 blk_mq_freeze_queue_start(q);
+136 blk_mq_freeze_queue_wait(q);
+137 }
+138
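
The hunk above splits blk_mq_freeze_queue() into a cheap start step and an expensive wait step, which is what allows the "freeze queues in parallel" change from Tejun Heo described in the pull request: start every freeze first, then wait for each, so the RCU-grace-period-sized cost is shared across all devices instead of being paid once per queue. The standalone model below uses hypothetical queue and function names, not the kernel API, and only illustrates the loop restructuring.

    #include <stdio.h>

    #define NQUEUES 4

    struct queue { int id; };

    /* cheap part: mark the queue as draining (the kernel kills a percpu ref here) */
    static void freeze_start(struct queue *q)
    {
            printf("queue %d: freeze started\n", q->id);
    }

    /* expensive part: wait for in-flight requests to drain; in the kernel this
     * is where the grace-period-sized wait happens */
    static void freeze_wait(struct queue *q)
    {
            printf("queue %d: freeze finished\n", q->id);
    }

    int main(void)
    {
            struct queue qs[NQUEUES];
            int i;

            for (i = 0; i < NQUEUES; i++)
                    qs[i].id = i;

            /* start every freeze first, then wait for each one: the expensive
             * waits overlap instead of running NQUEUES times in series */
            for (i = 0; i < NQUEUES; i++)
                    freeze_start(&qs[i]);
            for (i = 0; i < NQUEUES; i++)
                    freeze_wait(&qs[i]);

            return 0;
    }
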
129 static void blk_mq_unfreeze_queue(struct request_queue *q) 139 static void blk_mq_unfreeze_queue(struct request_queue *q)
130 { 140 {
131 bool wake; 141 bool wake;
132 142
133 spin_lock_irq(q->queue_lock); 143 spin_lock_irq(q->queue_lock);
134 wake = !--q->mq_freeze_depth; 144 wake = !--q->mq_freeze_depth;
135 WARN_ON_ONCE(q->mq_freeze_depth < 0); 145 WARN_ON_ONCE(q->mq_freeze_depth < 0);
136 spin_unlock_irq(q->queue_lock); 146 spin_unlock_irq(q->queue_lock);
137 if (wake) { 147 if (wake) {
138 percpu_ref_reinit(&q->mq_usage_counter); 148 percpu_ref_reinit(&q->mq_usage_counter);
139 wake_up_all(&q->mq_freeze_wq); 149 wake_up_all(&q->mq_freeze_wq);
140 } 150 }
141 } 151 }
142 152
143 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 153 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
144 { 154 {
145 return blk_mq_has_free_tags(hctx->tags); 155 return blk_mq_has_free_tags(hctx->tags);
146 } 156 }
147 EXPORT_SYMBOL(blk_mq_can_queue); 157 EXPORT_SYMBOL(blk_mq_can_queue);
148 158
149 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 159 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
150 struct request *rq, unsigned int rw_flags) 160 struct request *rq, unsigned int rw_flags)
151 { 161 {
152 if (blk_queue_io_stat(q)) 162 if (blk_queue_io_stat(q))
153 rw_flags |= REQ_IO_STAT; 163 rw_flags |= REQ_IO_STAT;
154 164
155 INIT_LIST_HEAD(&rq->queuelist); 165 INIT_LIST_HEAD(&rq->queuelist);
156 /* csd/requeue_work/fifo_time is initialized before use */ 166 /* csd/requeue_work/fifo_time is initialized before use */
157 rq->q = q; 167 rq->q = q;
158 rq->mq_ctx = ctx; 168 rq->mq_ctx = ctx;
159 rq->cmd_flags |= rw_flags; 169 rq->cmd_flags |= rw_flags;
160 /* do not touch atomic flags, it needs atomic ops against the timer */ 170 /* do not touch atomic flags, it needs atomic ops against the timer */
161 rq->cpu = -1; 171 rq->cpu = -1;
162 INIT_HLIST_NODE(&rq->hash); 172 INIT_HLIST_NODE(&rq->hash);
163 RB_CLEAR_NODE(&rq->rb_node); 173 RB_CLEAR_NODE(&rq->rb_node);
164 rq->rq_disk = NULL; 174 rq->rq_disk = NULL;
165 rq->part = NULL; 175 rq->part = NULL;
166 rq->start_time = jiffies; 176 rq->start_time = jiffies;
167 #ifdef CONFIG_BLK_CGROUP 177 #ifdef CONFIG_BLK_CGROUP
168 rq->rl = NULL; 178 rq->rl = NULL;
169 set_start_time_ns(rq); 179 set_start_time_ns(rq);
170 rq->io_start_time_ns = 0; 180 rq->io_start_time_ns = 0;
171 #endif 181 #endif
172 rq->nr_phys_segments = 0; 182 rq->nr_phys_segments = 0;
173 #if defined(CONFIG_BLK_DEV_INTEGRITY) 183 #if defined(CONFIG_BLK_DEV_INTEGRITY)
174 rq->nr_integrity_segments = 0; 184 rq->nr_integrity_segments = 0;
175 #endif 185 #endif
176 rq->special = NULL; 186 rq->special = NULL;
177 /* tag was already set */ 187 /* tag was already set */
178 rq->errors = 0; 188 rq->errors = 0;
179 189
180 rq->cmd = rq->__cmd; 190 rq->cmd = rq->__cmd;
181 191
182 rq->extra_len = 0; 192 rq->extra_len = 0;
183 rq->sense_len = 0; 193 rq->sense_len = 0;
184 rq->resid_len = 0; 194 rq->resid_len = 0;
185 rq->sense = NULL; 195 rq->sense = NULL;
186 196
187 INIT_LIST_HEAD(&rq->timeout_list); 197 INIT_LIST_HEAD(&rq->timeout_list);
188 rq->timeout = 0; 198 rq->timeout = 0;
189 199
190 rq->end_io = NULL; 200 rq->end_io = NULL;
191 rq->end_io_data = NULL; 201 rq->end_io_data = NULL;
192 rq->next_rq = NULL; 202 rq->next_rq = NULL;
193 203
194 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 204 ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
195 } 205 }
196 206
197 static struct request * 207 static struct request *
198 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) 208 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw)
199 { 209 {
200 struct request *rq; 210 struct request *rq;
201 unsigned int tag; 211 unsigned int tag;
202 212
203 tag = blk_mq_get_tag(data); 213 tag = blk_mq_get_tag(data);
204 if (tag != BLK_MQ_TAG_FAIL) { 214 if (tag != BLK_MQ_TAG_FAIL) {
205 rq = data->hctx->tags->rqs[tag]; 215 rq = data->hctx->tags->rqs[tag];
206 216
207 if (blk_mq_tag_busy(data->hctx)) { 217 if (blk_mq_tag_busy(data->hctx)) {
208 rq->cmd_flags = REQ_MQ_INFLIGHT; 218 rq->cmd_flags = REQ_MQ_INFLIGHT;
209 atomic_inc(&data->hctx->nr_active); 219 atomic_inc(&data->hctx->nr_active);
210 } 220 }
211 221
212 rq->tag = tag; 222 rq->tag = tag;
213 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); 223 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw);
214 return rq; 224 return rq;
215 } 225 }
216 226
217 return NULL; 227 return NULL;
218 } 228 }
219 229
220 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, 230 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp,
221 bool reserved) 231 bool reserved)
222 { 232 {
223 struct blk_mq_ctx *ctx; 233 struct blk_mq_ctx *ctx;
224 struct blk_mq_hw_ctx *hctx; 234 struct blk_mq_hw_ctx *hctx;
225 struct request *rq; 235 struct request *rq;
226 struct blk_mq_alloc_data alloc_data; 236 struct blk_mq_alloc_data alloc_data;
227 int ret; 237 int ret;
228 238
229 ret = blk_mq_queue_enter(q); 239 ret = blk_mq_queue_enter(q);
230 if (ret) 240 if (ret)
231 return ERR_PTR(ret); 241 return ERR_PTR(ret);
232 242
233 ctx = blk_mq_get_ctx(q); 243 ctx = blk_mq_get_ctx(q);
234 hctx = q->mq_ops->map_queue(q, ctx->cpu); 244 hctx = q->mq_ops->map_queue(q, ctx->cpu);
235 blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, 245 blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT,
236 reserved, ctx, hctx); 246 reserved, ctx, hctx);
237 247
238 rq = __blk_mq_alloc_request(&alloc_data, rw); 248 rq = __blk_mq_alloc_request(&alloc_data, rw);
239 if (!rq && (gfp & __GFP_WAIT)) { 249 if (!rq && (gfp & __GFP_WAIT)) {
240 __blk_mq_run_hw_queue(hctx); 250 __blk_mq_run_hw_queue(hctx);
241 blk_mq_put_ctx(ctx); 251 blk_mq_put_ctx(ctx);
242 252
243 ctx = blk_mq_get_ctx(q); 253 ctx = blk_mq_get_ctx(q);
244 hctx = q->mq_ops->map_queue(q, ctx->cpu); 254 hctx = q->mq_ops->map_queue(q, ctx->cpu);
245 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, 255 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx,
246 hctx); 256 hctx);
247 rq = __blk_mq_alloc_request(&alloc_data, rw); 257 rq = __blk_mq_alloc_request(&alloc_data, rw);
248 ctx = alloc_data.ctx; 258 ctx = alloc_data.ctx;
249 } 259 }
250 blk_mq_put_ctx(ctx); 260 blk_mq_put_ctx(ctx);
251 if (!rq) 261 if (!rq)
252 return ERR_PTR(-EWOULDBLOCK); 262 return ERR_PTR(-EWOULDBLOCK);
253 return rq; 263 return rq;
254 } 264 }
255 EXPORT_SYMBOL(blk_mq_alloc_request); 265 EXPORT_SYMBOL(blk_mq_alloc_request);
256 266
257 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 267 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
258 struct blk_mq_ctx *ctx, struct request *rq) 268 struct blk_mq_ctx *ctx, struct request *rq)
259 { 269 {
260 const int tag = rq->tag; 270 const int tag = rq->tag;
261 struct request_queue *q = rq->q; 271 struct request_queue *q = rq->q;
262 272
263 if (rq->cmd_flags & REQ_MQ_INFLIGHT) 273 if (rq->cmd_flags & REQ_MQ_INFLIGHT)
264 atomic_dec(&hctx->nr_active); 274 atomic_dec(&hctx->nr_active);
265 rq->cmd_flags = 0; 275 rq->cmd_flags = 0;
266 276
267 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 277 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
268 blk_mq_put_tag(hctx, tag, &ctx->last_tag); 278 blk_mq_put_tag(hctx, tag, &ctx->last_tag);
269 blk_mq_queue_exit(q); 279 blk_mq_queue_exit(q);
270 } 280 }
271 281
272 void blk_mq_free_request(struct request *rq) 282 void blk_mq_free_request(struct request *rq)
273 { 283 {
274 struct blk_mq_ctx *ctx = rq->mq_ctx; 284 struct blk_mq_ctx *ctx = rq->mq_ctx;
275 struct blk_mq_hw_ctx *hctx; 285 struct blk_mq_hw_ctx *hctx;
276 struct request_queue *q = rq->q; 286 struct request_queue *q = rq->q;
277 287
278 ctx->rq_completed[rq_is_sync(rq)]++; 288 ctx->rq_completed[rq_is_sync(rq)]++;
279 289
280 hctx = q->mq_ops->map_queue(q, ctx->cpu); 290 hctx = q->mq_ops->map_queue(q, ctx->cpu);
281 __blk_mq_free_request(hctx, ctx, rq); 291 __blk_mq_free_request(hctx, ctx, rq);
282 } 292 }
283 293
284 inline void __blk_mq_end_request(struct request *rq, int error) 294 inline void __blk_mq_end_request(struct request *rq, int error)
285 { 295 {
286 blk_account_io_done(rq); 296 blk_account_io_done(rq);
287 297
288 if (rq->end_io) { 298 if (rq->end_io) {
289 rq->end_io(rq, error); 299 rq->end_io(rq, error);
290 } else { 300 } else {
291 if (unlikely(blk_bidi_rq(rq))) 301 if (unlikely(blk_bidi_rq(rq)))
292 blk_mq_free_request(rq->next_rq); 302 blk_mq_free_request(rq->next_rq);
293 blk_mq_free_request(rq); 303 blk_mq_free_request(rq);
294 } 304 }
295 } 305 }
296 EXPORT_SYMBOL(__blk_mq_end_request); 306 EXPORT_SYMBOL(__blk_mq_end_request);
297 307
298 void blk_mq_end_request(struct request *rq, int error) 308 void blk_mq_end_request(struct request *rq, int error)
299 { 309 {
300 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 310 if (blk_update_request(rq, error, blk_rq_bytes(rq)))
301 BUG(); 311 BUG();
302 __blk_mq_end_request(rq, error); 312 __blk_mq_end_request(rq, error);
303 } 313 }
304 EXPORT_SYMBOL(blk_mq_end_request); 314 EXPORT_SYMBOL(blk_mq_end_request);
305 315
306 static void __blk_mq_complete_request_remote(void *data) 316 static void __blk_mq_complete_request_remote(void *data)
307 { 317 {
308 struct request *rq = data; 318 struct request *rq = data;
309 319
310 rq->q->softirq_done_fn(rq); 320 rq->q->softirq_done_fn(rq);
311 } 321 }
312 322
313 static void blk_mq_ipi_complete_request(struct request *rq) 323 static void blk_mq_ipi_complete_request(struct request *rq)
314 { 324 {
315 struct blk_mq_ctx *ctx = rq->mq_ctx; 325 struct blk_mq_ctx *ctx = rq->mq_ctx;
316 bool shared = false; 326 bool shared = false;
317 int cpu; 327 int cpu;
318 328
319 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 329 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) {
320 rq->q->softirq_done_fn(rq); 330 rq->q->softirq_done_fn(rq);
321 return; 331 return;
322 } 332 }
323 333
324 cpu = get_cpu(); 334 cpu = get_cpu();
325 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 335 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags))
326 shared = cpus_share_cache(cpu, ctx->cpu); 336 shared = cpus_share_cache(cpu, ctx->cpu);
327 337
328 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 338 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
329 rq->csd.func = __blk_mq_complete_request_remote; 339 rq->csd.func = __blk_mq_complete_request_remote;
330 rq->csd.info = rq; 340 rq->csd.info = rq;
331 rq->csd.flags = 0; 341 rq->csd.flags = 0;
332 smp_call_function_single_async(ctx->cpu, &rq->csd); 342 smp_call_function_single_async(ctx->cpu, &rq->csd);
333 } else { 343 } else {
334 rq->q->softirq_done_fn(rq); 344 rq->q->softirq_done_fn(rq);
335 } 345 }
336 put_cpu(); 346 put_cpu();
337 } 347 }
338 348
339 void __blk_mq_complete_request(struct request *rq) 349 void __blk_mq_complete_request(struct request *rq)
340 { 350 {
341 struct request_queue *q = rq->q; 351 struct request_queue *q = rq->q;
342 352
343 if (!q->softirq_done_fn) 353 if (!q->softirq_done_fn)
344 blk_mq_end_request(rq, rq->errors); 354 blk_mq_end_request(rq, rq->errors);
345 else 355 else
346 blk_mq_ipi_complete_request(rq); 356 blk_mq_ipi_complete_request(rq);
347 } 357 }
348 358
349 /** 359 /**
350 * blk_mq_complete_request - end I/O on a request 360 * blk_mq_complete_request - end I/O on a request
351 * @rq: the request being processed 361 * @rq: the request being processed
352 * 362 *
353 * Description: 363 * Description:
354 * Ends all I/O on a request. It does not handle partial completions. 364 * Ends all I/O on a request. It does not handle partial completions.
355 * The actual completion happens out-of-order, through a IPI handler. 365 * The actual completion happens out-of-order, through a IPI handler.
356 **/ 366 **/
357 void blk_mq_complete_request(struct request *rq) 367 void blk_mq_complete_request(struct request *rq)
358 { 368 {
359 struct request_queue *q = rq->q; 369 struct request_queue *q = rq->q;
360 370
361 if (unlikely(blk_should_fake_timeout(q))) 371 if (unlikely(blk_should_fake_timeout(q)))
362 return; 372 return;
363 if (!blk_mark_rq_complete(rq)) 373 if (!blk_mark_rq_complete(rq))
364 __blk_mq_complete_request(rq); 374 __blk_mq_complete_request(rq);
365 } 375 }
366 EXPORT_SYMBOL(blk_mq_complete_request); 376 EXPORT_SYMBOL(blk_mq_complete_request);
367 377
368 void blk_mq_start_request(struct request *rq) 378 void blk_mq_start_request(struct request *rq)
369 { 379 {
370 struct request_queue *q = rq->q; 380 struct request_queue *q = rq->q;
371 381
372 trace_block_rq_issue(q, rq); 382 trace_block_rq_issue(q, rq);
373 383
374 rq->resid_len = blk_rq_bytes(rq); 384 rq->resid_len = blk_rq_bytes(rq);
375 if (unlikely(blk_bidi_rq(rq))) 385 if (unlikely(blk_bidi_rq(rq)))
376 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); 386 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq);
377 387
378 blk_add_timer(rq); 388 blk_add_timer(rq);
379 389
380 /* 390 /*
381 * Ensure that ->deadline is visible before set the started 391 * Ensure that ->deadline is visible before set the started
382 * flag and clear the completed flag. 392 * flag and clear the completed flag.
383 */ 393 */
384 smp_mb__before_atomic(); 394 smp_mb__before_atomic();
385 395
386 /* 396 /*
387 * Mark us as started and clear complete. Complete might have been 397 * Mark us as started and clear complete. Complete might have been
388 * set if requeue raced with timeout, which then marked it as 398 * set if requeue raced with timeout, which then marked it as
389 * complete. So be sure to clear complete again when we start 399 * complete. So be sure to clear complete again when we start
390 * the request, otherwise we'll ignore the completion event. 400 * the request, otherwise we'll ignore the completion event.
391 */ 401 */
392 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 402 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
393 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 403 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
394 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 404 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
395 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 405 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags);
396 406
397 if (q->dma_drain_size && blk_rq_bytes(rq)) { 407 if (q->dma_drain_size && blk_rq_bytes(rq)) {
398 /* 408 /*
399 * Make sure space for the drain appears. We know we can do 409 * Make sure space for the drain appears. We know we can do
400 * this because max_hw_segments has been adjusted to be one 410 * this because max_hw_segments has been adjusted to be one
401 * fewer than the device can handle. 411 * fewer than the device can handle.
402 */ 412 */
403 rq->nr_phys_segments++; 413 rq->nr_phys_segments++;
404 } 414 }
405 } 415 }
406 EXPORT_SYMBOL(blk_mq_start_request); 416 EXPORT_SYMBOL(blk_mq_start_request);
407 417
408 static void __blk_mq_requeue_request(struct request *rq) 418 static void __blk_mq_requeue_request(struct request *rq)
409 { 419 {
410 struct request_queue *q = rq->q; 420 struct request_queue *q = rq->q;
411 421
412 trace_block_rq_requeue(q, rq); 422 trace_block_rq_requeue(q, rq);
413 423
414 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 424 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
415 if (q->dma_drain_size && blk_rq_bytes(rq)) 425 if (q->dma_drain_size && blk_rq_bytes(rq))
416 rq->nr_phys_segments--; 426 rq->nr_phys_segments--;
417 } 427 }
418 } 428 }
419 429
420 void blk_mq_requeue_request(struct request *rq) 430 void blk_mq_requeue_request(struct request *rq)
421 { 431 {
422 __blk_mq_requeue_request(rq); 432 __blk_mq_requeue_request(rq);
423 433
424 BUG_ON(blk_queued_rq(rq)); 434 BUG_ON(blk_queued_rq(rq));
425 blk_mq_add_to_requeue_list(rq, true); 435 blk_mq_add_to_requeue_list(rq, true);
426 } 436 }
427 EXPORT_SYMBOL(blk_mq_requeue_request); 437 EXPORT_SYMBOL(blk_mq_requeue_request);
428 438
429 static void blk_mq_requeue_work(struct work_struct *work) 439 static void blk_mq_requeue_work(struct work_struct *work)
430 { 440 {
431 struct request_queue *q = 441 struct request_queue *q =
432 container_of(work, struct request_queue, requeue_work); 442 container_of(work, struct request_queue, requeue_work);
433 LIST_HEAD(rq_list); 443 LIST_HEAD(rq_list);
434 struct request *rq, *next; 444 struct request *rq, *next;
435 unsigned long flags; 445 unsigned long flags;
436 446
437 spin_lock_irqsave(&q->requeue_lock, flags); 447 spin_lock_irqsave(&q->requeue_lock, flags);
438 list_splice_init(&q->requeue_list, &rq_list); 448 list_splice_init(&q->requeue_list, &rq_list);
439 spin_unlock_irqrestore(&q->requeue_lock, flags); 449 spin_unlock_irqrestore(&q->requeue_lock, flags);
440 450
441 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 451 list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
442 if (!(rq->cmd_flags & REQ_SOFTBARRIER)) 452 if (!(rq->cmd_flags & REQ_SOFTBARRIER))
443 continue; 453 continue;
444 454
445 rq->cmd_flags &= ~REQ_SOFTBARRIER; 455 rq->cmd_flags &= ~REQ_SOFTBARRIER;
446 list_del_init(&rq->queuelist); 456 list_del_init(&rq->queuelist);
447 blk_mq_insert_request(rq, true, false, false); 457 blk_mq_insert_request(rq, true, false, false);
448 } 458 }
449 459
450 while (!list_empty(&rq_list)) { 460 while (!list_empty(&rq_list)) {
451 rq = list_entry(rq_list.next, struct request, queuelist); 461 rq = list_entry(rq_list.next, struct request, queuelist);
452 list_del_init(&rq->queuelist); 462 list_del_init(&rq->queuelist);
453 blk_mq_insert_request(rq, false, false, false); 463 blk_mq_insert_request(rq, false, false, false);
454 } 464 }
455 465
456 /* 466 /*
457 * Use the start variant of queue running here, so that running 467 * Use the start variant of queue running here, so that running
458 * the requeue work will kick stopped queues. 468 * the requeue work will kick stopped queues.
459 */ 469 */
460 blk_mq_start_hw_queues(q); 470 blk_mq_start_hw_queues(q);
461 } 471 }
462 472
463 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) 473 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head)
464 { 474 {
465 struct request_queue *q = rq->q; 475 struct request_queue *q = rq->q;
466 unsigned long flags; 476 unsigned long flags;
467 477
468 /* 478 /*
469 * We abuse this flag that is otherwise used by the I/O scheduler to 479 * We abuse this flag that is otherwise used by the I/O scheduler to
470 * request head insertion from the workqueue. 480 * request head insertion from the workqueue.
471 */ 481 */
472 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); 482 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER);
473 483
474 spin_lock_irqsave(&q->requeue_lock, flags); 484 spin_lock_irqsave(&q->requeue_lock, flags);
475 if (at_head) { 485 if (at_head) {
476 rq->cmd_flags |= REQ_SOFTBARRIER; 486 rq->cmd_flags |= REQ_SOFTBARRIER;
477 list_add(&rq->queuelist, &q->requeue_list); 487 list_add(&rq->queuelist, &q->requeue_list);
478 } else { 488 } else {
479 list_add_tail(&rq->queuelist, &q->requeue_list); 489 list_add_tail(&rq->queuelist, &q->requeue_list);
480 } 490 }
481 spin_unlock_irqrestore(&q->requeue_lock, flags); 491 spin_unlock_irqrestore(&q->requeue_lock, flags);
482 } 492 }
483 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 493 EXPORT_SYMBOL(blk_mq_add_to_requeue_list);
484 494
485 void blk_mq_kick_requeue_list(struct request_queue *q) 495 void blk_mq_kick_requeue_list(struct request_queue *q)
486 { 496 {
487 kblockd_schedule_work(&q->requeue_work); 497 kblockd_schedule_work(&q->requeue_work);
488 } 498 }
489 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 499 EXPORT_SYMBOL(blk_mq_kick_requeue_list);
490 500
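Illustrative note, not part of this patch: the requeue interface above is expected to be driven from a driver's completion/retry path roughly as below; mydrv_requeue() is a hypothetical name.

	static void mydrv_requeue(struct request *rq)
	{
		/*
		 * blk_mq_requeue_request() clears STARTED and parks rq at the
		 * head of q->requeue_list (REQ_SOFTBARRIER marks the head add);
		 * kicking the list schedules blk_mq_requeue_work() via kblockd.
		 */
		blk_mq_requeue_request(rq);
		blk_mq_kick_requeue_list(rq->q);
	}
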
491 static inline bool is_flush_request(struct request *rq, 501 static inline bool is_flush_request(struct request *rq,
492 struct blk_flush_queue *fq, unsigned int tag) 502 struct blk_flush_queue *fq, unsigned int tag)
493 { 503 {
494 return ((rq->cmd_flags & REQ_FLUSH_SEQ) && 504 return ((rq->cmd_flags & REQ_FLUSH_SEQ) &&
495 fq->flush_rq->tag == tag); 505 fq->flush_rq->tag == tag);
496 } 506 }
497 507
498 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 508 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
499 { 509 {
500 struct request *rq = tags->rqs[tag]; 510 struct request *rq = tags->rqs[tag];
501 /* mq_ctx of flush rq is always cloned from the corresponding req */ 511 /* mq_ctx of flush rq is always cloned from the corresponding req */
502 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); 512 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx);
503 513
504 if (!is_flush_request(rq, fq, tag)) 514 if (!is_flush_request(rq, fq, tag))
505 return rq; 515 return rq;
506 516
507 return fq->flush_rq; 517 return fq->flush_rq;
508 } 518 }
509 EXPORT_SYMBOL(blk_mq_tag_to_rq); 519 EXPORT_SYMBOL(blk_mq_tag_to_rq);
510 520
511 struct blk_mq_timeout_data { 521 struct blk_mq_timeout_data {
512 unsigned long next; 522 unsigned long next;
513 unsigned int next_set; 523 unsigned int next_set;
514 }; 524 };
515 525
516 void blk_mq_rq_timed_out(struct request *req, bool reserved) 526 void blk_mq_rq_timed_out(struct request *req, bool reserved)
517 { 527 {
518 struct blk_mq_ops *ops = req->q->mq_ops; 528 struct blk_mq_ops *ops = req->q->mq_ops;
519 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 529 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER;
520 530
521 /* 531 /*
522 * We know that complete is set at this point. If STARTED isn't set 532 * We know that complete is set at this point. If STARTED isn't set
523 * anymore, then the request isn't active and the "timeout" should 533 * anymore, then the request isn't active and the "timeout" should
524 * just be ignored. This can happen due to the bitflag ordering. 534 * just be ignored. This can happen due to the bitflag ordering.
525 * Timeout first checks if STARTED is set, and if it is, assumes 535 * Timeout first checks if STARTED is set, and if it is, assumes
526 * the request is active. But if we race with completion, then 536 * the request is active. But if we race with completion, then
527 * both flags will get cleared. So check here again, and ignore 537 * both flags will get cleared. So check here again, and ignore
528 * a timeout event with a request that isn't active. 538 * a timeout event with a request that isn't active.
529 */ 539 */
530 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 540 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags))
531 return; 541 return;
532 542
533 if (ops->timeout) 543 if (ops->timeout)
534 ret = ops->timeout(req, reserved); 544 ret = ops->timeout(req, reserved);
535 545
536 switch (ret) { 546 switch (ret) {
537 case BLK_EH_HANDLED: 547 case BLK_EH_HANDLED:
538 __blk_mq_complete_request(req); 548 __blk_mq_complete_request(req);
539 break; 549 break;
540 case BLK_EH_RESET_TIMER: 550 case BLK_EH_RESET_TIMER:
541 blk_add_timer(req); 551 blk_add_timer(req);
542 blk_clear_rq_complete(req); 552 blk_clear_rq_complete(req);
543 break; 553 break;
544 case BLK_EH_NOT_HANDLED: 554 case BLK_EH_NOT_HANDLED:
545 break; 555 break;
546 default: 556 default:
547 printk(KERN_ERR "block: bad eh return: %d\n", ret); 557 printk(KERN_ERR "block: bad eh return: %d\n", ret);
548 break; 558 break;
549 } 559 }
550 } 560 }
551 561
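Illustrative only, not from this patch: a minimal sketch of a driver ->timeout callback, showing where the return values handled by the switch above come from; the mydrv_* names are assumptions.

	static enum blk_eh_timer_return mydrv_timeout(struct request *rq, bool reserved)
	{
		/* blk_mq_rq_to_pdu() returns the per-request driver payload */
		if (mydrv_abort_cmd(blk_mq_rq_to_pdu(rq)))
			return BLK_EH_HANDLED;		/* core completes the request */

		return BLK_EH_RESET_TIMER;		/* core re-arms the timer and clears complete */
	}
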
552 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 562 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
553 struct request *rq, void *priv, bool reserved) 563 struct request *rq, void *priv, bool reserved)
554 { 564 {
555 struct blk_mq_timeout_data *data = priv; 565 struct blk_mq_timeout_data *data = priv;
556 566
557 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 567 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
558 return; 568 return;
559 569
560 if (time_after_eq(jiffies, rq->deadline)) { 570 if (time_after_eq(jiffies, rq->deadline)) {
561 if (!blk_mark_rq_complete(rq)) 571 if (!blk_mark_rq_complete(rq))
562 blk_mq_rq_timed_out(rq, reserved); 572 blk_mq_rq_timed_out(rq, reserved);
563 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 573 } else if (!data->next_set || time_after(data->next, rq->deadline)) {
564 data->next = rq->deadline; 574 data->next = rq->deadline;
565 data->next_set = 1; 575 data->next_set = 1;
566 } 576 }
567 } 577 }
568 578
569 static void blk_mq_rq_timer(unsigned long priv) 579 static void blk_mq_rq_timer(unsigned long priv)
570 { 580 {
571 struct request_queue *q = (struct request_queue *)priv; 581 struct request_queue *q = (struct request_queue *)priv;
572 struct blk_mq_timeout_data data = { 582 struct blk_mq_timeout_data data = {
573 .next = 0, 583 .next = 0,
574 .next_set = 0, 584 .next_set = 0,
575 }; 585 };
576 struct blk_mq_hw_ctx *hctx; 586 struct blk_mq_hw_ctx *hctx;
577 int i; 587 int i;
578 588
579 queue_for_each_hw_ctx(q, hctx, i) { 589 queue_for_each_hw_ctx(q, hctx, i) {
580 /* 590 /*
581 * If no software queues are currently mapped to this 591 * If no software queues are currently mapped to this
582 * hardware queue, there's nothing to check 592 * hardware queue, there's nothing to check
583 */ 593 */
584 if (!hctx->nr_ctx || !hctx->tags) 594 if (!hctx->nr_ctx || !hctx->tags)
585 continue; 595 continue;
586 596
587 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); 597 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data);
588 } 598 }
589 599
590 if (data.next_set) { 600 if (data.next_set) {
591 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 601 data.next = blk_rq_timeout(round_jiffies_up(data.next));
592 mod_timer(&q->timeout, data.next); 602 mod_timer(&q->timeout, data.next);
593 } else { 603 } else {
594 queue_for_each_hw_ctx(q, hctx, i) 604 queue_for_each_hw_ctx(q, hctx, i)
595 blk_mq_tag_idle(hctx); 605 blk_mq_tag_idle(hctx);
596 } 606 }
597 } 607 }
598 608
599 /* 609 /*
600 * Reverse check our software queue for entries that we could potentially 610 * Reverse check our software queue for entries that we could potentially
601 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 611 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
602 * too much time checking for merges. 612 * too much time checking for merges.
603 */ 613 */
604 static bool blk_mq_attempt_merge(struct request_queue *q, 614 static bool blk_mq_attempt_merge(struct request_queue *q,
605 struct blk_mq_ctx *ctx, struct bio *bio) 615 struct blk_mq_ctx *ctx, struct bio *bio)
606 { 616 {
607 struct request *rq; 617 struct request *rq;
608 int checked = 8; 618 int checked = 8;
609 619
610 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 620 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
611 int el_ret; 621 int el_ret;
612 622
613 if (!checked--) 623 if (!checked--)
614 break; 624 break;
615 625
616 if (!blk_rq_merge_ok(rq, bio)) 626 if (!blk_rq_merge_ok(rq, bio))
617 continue; 627 continue;
618 628
619 el_ret = blk_try_merge(rq, bio); 629 el_ret = blk_try_merge(rq, bio);
620 if (el_ret == ELEVATOR_BACK_MERGE) { 630 if (el_ret == ELEVATOR_BACK_MERGE) {
621 if (bio_attempt_back_merge(q, rq, bio)) { 631 if (bio_attempt_back_merge(q, rq, bio)) {
622 ctx->rq_merged++; 632 ctx->rq_merged++;
623 return true; 633 return true;
624 } 634 }
625 break; 635 break;
626 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 636 } else if (el_ret == ELEVATOR_FRONT_MERGE) {
627 if (bio_attempt_front_merge(q, rq, bio)) { 637 if (bio_attempt_front_merge(q, rq, bio)) {
628 ctx->rq_merged++; 638 ctx->rq_merged++;
629 return true; 639 return true;
630 } 640 }
631 break; 641 break;
632 } 642 }
633 } 643 }
634 644
635 return false; 645 return false;
636 } 646 }
637 647
638 /* 648 /*
639 * Process software queues that have been marked busy, splicing them 649 * Process software queues that have been marked busy, splicing them
640 * to the for-dispatch list. 650 * to the for-dispatch list.
641 */ 651 */
642 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 652 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list)
643 { 653 {
644 struct blk_mq_ctx *ctx; 654 struct blk_mq_ctx *ctx;
645 int i; 655 int i;
646 656
647 for (i = 0; i < hctx->ctx_map.map_size; i++) { 657 for (i = 0; i < hctx->ctx_map.map_size; i++) {
648 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; 658 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i];
649 unsigned int off, bit; 659 unsigned int off, bit;
650 660
651 if (!bm->word) 661 if (!bm->word)
652 continue; 662 continue;
653 663
654 bit = 0; 664 bit = 0;
655 off = i * hctx->ctx_map.bits_per_word; 665 off = i * hctx->ctx_map.bits_per_word;
656 do { 666 do {
657 bit = find_next_bit(&bm->word, bm->depth, bit); 667 bit = find_next_bit(&bm->word, bm->depth, bit);
658 if (bit >= bm->depth) 668 if (bit >= bm->depth)
659 break; 669 break;
660 670
661 ctx = hctx->ctxs[bit + off]; 671 ctx = hctx->ctxs[bit + off];
662 clear_bit(bit, &bm->word); 672 clear_bit(bit, &bm->word);
663 spin_lock(&ctx->lock); 673 spin_lock(&ctx->lock);
664 list_splice_tail_init(&ctx->rq_list, list); 674 list_splice_tail_init(&ctx->rq_list, list);
665 spin_unlock(&ctx->lock); 675 spin_unlock(&ctx->lock);
666 676
667 bit++; 677 bit++;
668 } while (1); 678 } while (1);
669 } 679 }
670 } 680 }
671 681
672 /* 682 /*
673 * Run this hardware queue, pulling any software queues mapped to it in. 683 * Run this hardware queue, pulling any software queues mapped to it in.
674 * Note that this function currently has various problems around ordering 684 * Note that this function currently has various problems around ordering
675 * of IO. In particular, we'd like FIFO behaviour on handling existing 685 * of IO. In particular, we'd like FIFO behaviour on handling existing
676 * items on the hctx->dispatch list. Ignore that for now. 686 * items on the hctx->dispatch list. Ignore that for now.
677 */ 687 */
678 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 688 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
679 { 689 {
680 struct request_queue *q = hctx->queue; 690 struct request_queue *q = hctx->queue;
681 struct request *rq; 691 struct request *rq;
682 LIST_HEAD(rq_list); 692 LIST_HEAD(rq_list);
683 int queued; 693 int queued;
684 694
685 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); 695 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask));
686 696
687 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 697 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
688 return; 698 return;
689 699
690 hctx->run++; 700 hctx->run++;
691 701
692 /* 702 /*
693 * Touch any software queue that has pending entries. 703 * Touch any software queue that has pending entries.
694 */ 704 */
695 flush_busy_ctxs(hctx, &rq_list); 705 flush_busy_ctxs(hctx, &rq_list);
696 706
697 /* 707 /*
698 * If we have previous entries on our dispatch list, grab them 708 * If we have previous entries on our dispatch list, grab them
699 * and stuff them at the front for more fair dispatch. 709 * and stuff them at the front for more fair dispatch.
700 */ 710 */
701 if (!list_empty_careful(&hctx->dispatch)) { 711 if (!list_empty_careful(&hctx->dispatch)) {
702 spin_lock(&hctx->lock); 712 spin_lock(&hctx->lock);
703 if (!list_empty(&hctx->dispatch)) 713 if (!list_empty(&hctx->dispatch))
704 list_splice_init(&hctx->dispatch, &rq_list); 714 list_splice_init(&hctx->dispatch, &rq_list);
705 spin_unlock(&hctx->lock); 715 spin_unlock(&hctx->lock);
706 } 716 }
707 717
708 /* 718 /*
709 * Now process all the entries, sending them to the driver. 719 * Now process all the entries, sending them to the driver.
710 */ 720 */
711 queued = 0; 721 queued = 0;
712 while (!list_empty(&rq_list)) { 722 while (!list_empty(&rq_list)) {
713 int ret; 723 int ret;
714 724
715 rq = list_first_entry(&rq_list, struct request, queuelist); 725 rq = list_first_entry(&rq_list, struct request, queuelist);
716 list_del_init(&rq->queuelist); 726 list_del_init(&rq->queuelist);
717 727
718 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list)); 728 ret = q->mq_ops->queue_rq(hctx, rq, list_empty(&rq_list));
719 switch (ret) { 729 switch (ret) {
720 case BLK_MQ_RQ_QUEUE_OK: 730 case BLK_MQ_RQ_QUEUE_OK:
721 queued++; 731 queued++;
722 continue; 732 continue;
723 case BLK_MQ_RQ_QUEUE_BUSY: 733 case BLK_MQ_RQ_QUEUE_BUSY:
724 list_add(&rq->queuelist, &rq_list); 734 list_add(&rq->queuelist, &rq_list);
725 __blk_mq_requeue_request(rq); 735 __blk_mq_requeue_request(rq);
726 break; 736 break;
727 default: 737 default:
728 pr_err("blk-mq: bad return on queue: %d\n", ret); 738 pr_err("blk-mq: bad return on queue: %d\n", ret);
729 case BLK_MQ_RQ_QUEUE_ERROR: 739 case BLK_MQ_RQ_QUEUE_ERROR:
730 rq->errors = -EIO; 740 rq->errors = -EIO;
731 blk_mq_end_request(rq, rq->errors); 741 blk_mq_end_request(rq, rq->errors);
732 break; 742 break;
733 } 743 }
734 744
735 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 745 if (ret == BLK_MQ_RQ_QUEUE_BUSY)
736 break; 746 break;
737 } 747 }
738 748
739 if (!queued) 749 if (!queued)
740 hctx->dispatched[0]++; 750 hctx->dispatched[0]++;
741 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 751 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
742 hctx->dispatched[ilog2(queued) + 1]++; 752 hctx->dispatched[ilog2(queued) + 1]++;
743 753
744 /* 754 /*
745 * Any items that need requeuing? Stuff them into hctx->dispatch, 755 * Any items that need requeuing? Stuff them into hctx->dispatch,
746 * that is where we will continue on the next queue run. 756 * that is where we will continue on the next queue run.
747 */ 757 */
748 if (!list_empty(&rq_list)) { 758 if (!list_empty(&rq_list)) {
749 spin_lock(&hctx->lock); 759 spin_lock(&hctx->lock);
750 list_splice(&rq_list, &hctx->dispatch); 760 list_splice(&rq_list, &hctx->dispatch);
751 spin_unlock(&hctx->lock); 761 spin_unlock(&hctx->lock);
752 } 762 }
753 } 763 }
754 764
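Illustrative only, not from this patch: a minimal, hypothetical ->queue_rq implementation (mydrv_* names are assumptions), to show where the three return values dispatched on above originate; note that in this kernel the driver itself calls the exported blk_mq_start_request().

	static int mydrv_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, bool last)
	{
		struct mydrv_hw *hw = hctx->driver_data;	/* set up in ->init_hctx */

		if (!mydrv_can_queue(hw))
			return BLK_MQ_RQ_QUEUE_BUSY;	/* rq is put back and dispatch stops */

		blk_mq_start_request(rq);		/* arms the timeout, marks STARTED */

		if (mydrv_issue(hw, rq))
			return BLK_MQ_RQ_QUEUE_ERROR;	/* rq->errors = -EIO, request is ended */

		return BLK_MQ_RQ_QUEUE_OK;
	}

A driver returning BLK_MQ_RQ_QUEUE_BUSY may additionally stop the queue or use blk_mq_delay_queue() (further down) to make sure the hardware queue is run again later.
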
755 /* 765 /*
756 * It'd be great if the workqueue API had a way to pass 766 * It'd be great if the workqueue API had a way to pass
757 * in a mask and had some smarts for more clever placement. 767 * in a mask and had some smarts for more clever placement.
758 * For now we just round-robin here, switching for every 768 * For now we just round-robin here, switching for every
759 * BLK_MQ_CPU_WORK_BATCH queued items. 769 * BLK_MQ_CPU_WORK_BATCH queued items.
760 */ 770 */
761 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 771 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx)
762 { 772 {
763 int cpu = hctx->next_cpu; 773 int cpu = hctx->next_cpu;
764 774
765 if (--hctx->next_cpu_batch <= 0) { 775 if (--hctx->next_cpu_batch <= 0) {
766 int next_cpu; 776 int next_cpu;
767 777
768 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 778 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask);
769 if (next_cpu >= nr_cpu_ids) 779 if (next_cpu >= nr_cpu_ids)
770 next_cpu = cpumask_first(hctx->cpumask); 780 next_cpu = cpumask_first(hctx->cpumask);
771 781
772 hctx->next_cpu = next_cpu; 782 hctx->next_cpu = next_cpu;
773 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 783 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
774 } 784 }
775 785
776 return cpu; 786 return cpu;
777 } 787 }
778 788
779 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 789 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
780 { 790 {
781 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 791 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
782 return; 792 return;
783 793
784 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask)) 794 if (!async && cpumask_test_cpu(smp_processor_id(), hctx->cpumask))
785 __blk_mq_run_hw_queue(hctx); 795 __blk_mq_run_hw_queue(hctx);
786 else if (hctx->queue->nr_hw_queues == 1) 796 else if (hctx->queue->nr_hw_queues == 1)
787 kblockd_schedule_delayed_work(&hctx->run_work, 0); 797 kblockd_schedule_delayed_work(&hctx->run_work, 0);
788 else { 798 else {
789 unsigned int cpu; 799 unsigned int cpu;
790 800
791 cpu = blk_mq_hctx_next_cpu(hctx); 801 cpu = blk_mq_hctx_next_cpu(hctx);
792 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0); 802 kblockd_schedule_delayed_work_on(cpu, &hctx->run_work, 0);
793 } 803 }
794 } 804 }
795 805
796 void blk_mq_run_queues(struct request_queue *q, bool async) 806 void blk_mq_run_queues(struct request_queue *q, bool async)
797 { 807 {
798 struct blk_mq_hw_ctx *hctx; 808 struct blk_mq_hw_ctx *hctx;
799 int i; 809 int i;
800 810
801 queue_for_each_hw_ctx(q, hctx, i) { 811 queue_for_each_hw_ctx(q, hctx, i) {
802 if ((!blk_mq_hctx_has_pending(hctx) && 812 if ((!blk_mq_hctx_has_pending(hctx) &&
803 list_empty_careful(&hctx->dispatch)) || 813 list_empty_careful(&hctx->dispatch)) ||
804 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 814 test_bit(BLK_MQ_S_STOPPED, &hctx->state))
805 continue; 815 continue;
806 816
807 preempt_disable(); 817 preempt_disable();
808 blk_mq_run_hw_queue(hctx, async); 818 blk_mq_run_hw_queue(hctx, async);
809 preempt_enable(); 819 preempt_enable();
810 } 820 }
811 } 821 }
812 EXPORT_SYMBOL(blk_mq_run_queues); 822 EXPORT_SYMBOL(blk_mq_run_queues);
813 823
814 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 824 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
815 { 825 {
816 cancel_delayed_work(&hctx->run_work); 826 cancel_delayed_work(&hctx->run_work);
817 cancel_delayed_work(&hctx->delay_work); 827 cancel_delayed_work(&hctx->delay_work);
818 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 828 set_bit(BLK_MQ_S_STOPPED, &hctx->state);
819 } 829 }
820 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 830 EXPORT_SYMBOL(blk_mq_stop_hw_queue);
821 831
822 void blk_mq_stop_hw_queues(struct request_queue *q) 832 void blk_mq_stop_hw_queues(struct request_queue *q)
823 { 833 {
824 struct blk_mq_hw_ctx *hctx; 834 struct blk_mq_hw_ctx *hctx;
825 int i; 835 int i;
826 836
827 queue_for_each_hw_ctx(q, hctx, i) 837 queue_for_each_hw_ctx(q, hctx, i)
828 blk_mq_stop_hw_queue(hctx); 838 blk_mq_stop_hw_queue(hctx);
829 } 839 }
830 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 840 EXPORT_SYMBOL(blk_mq_stop_hw_queues);
831 841
832 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 842 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
833 { 843 {
834 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 844 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
835 845
836 preempt_disable(); 846 preempt_disable();
837 blk_mq_run_hw_queue(hctx, false); 847 blk_mq_run_hw_queue(hctx, false);
838 preempt_enable(); 848 preempt_enable();
839 } 849 }
840 EXPORT_SYMBOL(blk_mq_start_hw_queue); 850 EXPORT_SYMBOL(blk_mq_start_hw_queue);
841 851
842 void blk_mq_start_hw_queues(struct request_queue *q) 852 void blk_mq_start_hw_queues(struct request_queue *q)
843 { 853 {
844 struct blk_mq_hw_ctx *hctx; 854 struct blk_mq_hw_ctx *hctx;
845 int i; 855 int i;
846 856
847 queue_for_each_hw_ctx(q, hctx, i) 857 queue_for_each_hw_ctx(q, hctx, i)
848 blk_mq_start_hw_queue(hctx); 858 blk_mq_start_hw_queue(hctx);
849 } 859 }
850 EXPORT_SYMBOL(blk_mq_start_hw_queues); 860 EXPORT_SYMBOL(blk_mq_start_hw_queues);
851 861
852 862
853 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 863 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
854 { 864 {
855 struct blk_mq_hw_ctx *hctx; 865 struct blk_mq_hw_ctx *hctx;
856 int i; 866 int i;
857 867
858 queue_for_each_hw_ctx(q, hctx, i) { 868 queue_for_each_hw_ctx(q, hctx, i) {
859 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 869 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
860 continue; 870 continue;
861 871
862 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 872 clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
863 preempt_disable(); 873 preempt_disable();
864 blk_mq_run_hw_queue(hctx, async); 874 blk_mq_run_hw_queue(hctx, async);
865 preempt_enable(); 875 preempt_enable();
866 } 876 }
867 } 877 }
868 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 878 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
869 879
870 static void blk_mq_run_work_fn(struct work_struct *work) 880 static void blk_mq_run_work_fn(struct work_struct *work)
871 { 881 {
872 struct blk_mq_hw_ctx *hctx; 882 struct blk_mq_hw_ctx *hctx;
873 883
874 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 884 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
875 885
876 __blk_mq_run_hw_queue(hctx); 886 __blk_mq_run_hw_queue(hctx);
877 } 887 }
878 888
879 static void blk_mq_delay_work_fn(struct work_struct *work) 889 static void blk_mq_delay_work_fn(struct work_struct *work)
880 { 890 {
881 struct blk_mq_hw_ctx *hctx; 891 struct blk_mq_hw_ctx *hctx;
882 892
883 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 893 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work);
884 894
885 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 895 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state))
886 __blk_mq_run_hw_queue(hctx); 896 __blk_mq_run_hw_queue(hctx);
887 } 897 }
888 898
889 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 899 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
890 { 900 {
891 unsigned long tmo = msecs_to_jiffies(msecs); 901 unsigned long tmo = msecs_to_jiffies(msecs);
892 902
893 if (hctx->queue->nr_hw_queues == 1) 903 if (hctx->queue->nr_hw_queues == 1)
894 kblockd_schedule_delayed_work(&hctx->delay_work, tmo); 904 kblockd_schedule_delayed_work(&hctx->delay_work, tmo);
895 else { 905 else {
896 unsigned int cpu; 906 unsigned int cpu;
897 907
898 cpu = blk_mq_hctx_next_cpu(hctx); 908 cpu = blk_mq_hctx_next_cpu(hctx);
899 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo); 909 kblockd_schedule_delayed_work_on(cpu, &hctx->delay_work, tmo);
900 } 910 }
901 } 911 }
902 EXPORT_SYMBOL(blk_mq_delay_queue); 912 EXPORT_SYMBOL(blk_mq_delay_queue);
903 913
904 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 914 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
905 struct request *rq, bool at_head) 915 struct request *rq, bool at_head)
906 { 916 {
907 struct blk_mq_ctx *ctx = rq->mq_ctx; 917 struct blk_mq_ctx *ctx = rq->mq_ctx;
908 918
909 trace_block_rq_insert(hctx->queue, rq); 919 trace_block_rq_insert(hctx->queue, rq);
910 920
911 if (at_head) 921 if (at_head)
912 list_add(&rq->queuelist, &ctx->rq_list); 922 list_add(&rq->queuelist, &ctx->rq_list);
913 else 923 else
914 list_add_tail(&rq->queuelist, &ctx->rq_list); 924 list_add_tail(&rq->queuelist, &ctx->rq_list);
915 925
916 blk_mq_hctx_mark_pending(hctx, ctx); 926 blk_mq_hctx_mark_pending(hctx, ctx);
917 } 927 }
918 928
919 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 929 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
920 bool async) 930 bool async)
921 { 931 {
922 struct request_queue *q = rq->q; 932 struct request_queue *q = rq->q;
923 struct blk_mq_hw_ctx *hctx; 933 struct blk_mq_hw_ctx *hctx;
924 struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; 934 struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;
925 935
926 current_ctx = blk_mq_get_ctx(q); 936 current_ctx = blk_mq_get_ctx(q);
927 if (!cpu_online(ctx->cpu)) 937 if (!cpu_online(ctx->cpu))
928 rq->mq_ctx = ctx = current_ctx; 938 rq->mq_ctx = ctx = current_ctx;
929 939
930 hctx = q->mq_ops->map_queue(q, ctx->cpu); 940 hctx = q->mq_ops->map_queue(q, ctx->cpu);
931 941
932 spin_lock(&ctx->lock); 942 spin_lock(&ctx->lock);
933 __blk_mq_insert_request(hctx, rq, at_head); 943 __blk_mq_insert_request(hctx, rq, at_head);
934 spin_unlock(&ctx->lock); 944 spin_unlock(&ctx->lock);
935 945
936 if (run_queue) 946 if (run_queue)
937 blk_mq_run_hw_queue(hctx, async); 947 blk_mq_run_hw_queue(hctx, async);
938 948
939 blk_mq_put_ctx(current_ctx); 949 blk_mq_put_ctx(current_ctx);
940 } 950 }
941 951
942 static void blk_mq_insert_requests(struct request_queue *q, 952 static void blk_mq_insert_requests(struct request_queue *q,
943 struct blk_mq_ctx *ctx, 953 struct blk_mq_ctx *ctx,
944 struct list_head *list, 954 struct list_head *list,
945 int depth, 955 int depth,
946 bool from_schedule) 956 bool from_schedule)
947 957
948 { 958 {
949 struct blk_mq_hw_ctx *hctx; 959 struct blk_mq_hw_ctx *hctx;
950 struct blk_mq_ctx *current_ctx; 960 struct blk_mq_ctx *current_ctx;
951 961
952 trace_block_unplug(q, depth, !from_schedule); 962 trace_block_unplug(q, depth, !from_schedule);
953 963
954 current_ctx = blk_mq_get_ctx(q); 964 current_ctx = blk_mq_get_ctx(q);
955 965
956 if (!cpu_online(ctx->cpu)) 966 if (!cpu_online(ctx->cpu))
957 ctx = current_ctx; 967 ctx = current_ctx;
958 hctx = q->mq_ops->map_queue(q, ctx->cpu); 968 hctx = q->mq_ops->map_queue(q, ctx->cpu);
959 969
960 /* 970 /*
961 * Preemption doesn't flush the plug list, so it's possible ctx->cpu is 971 * Preemption doesn't flush the plug list, so it's possible ctx->cpu is
962 * offline now. 972 * offline now.
963 */ 973 */
964 spin_lock(&ctx->lock); 974 spin_lock(&ctx->lock);
965 while (!list_empty(list)) { 975 while (!list_empty(list)) {
966 struct request *rq; 976 struct request *rq;
967 977
968 rq = list_first_entry(list, struct request, queuelist); 978 rq = list_first_entry(list, struct request, queuelist);
969 list_del_init(&rq->queuelist); 979 list_del_init(&rq->queuelist);
970 rq->mq_ctx = ctx; 980 rq->mq_ctx = ctx;
971 __blk_mq_insert_request(hctx, rq, false); 981 __blk_mq_insert_request(hctx, rq, false);
972 } 982 }
973 spin_unlock(&ctx->lock); 983 spin_unlock(&ctx->lock);
974 984
975 blk_mq_run_hw_queue(hctx, from_schedule); 985 blk_mq_run_hw_queue(hctx, from_schedule);
976 blk_mq_put_ctx(current_ctx); 986 blk_mq_put_ctx(current_ctx);
977 } 987 }
978 988
979 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 989 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
980 { 990 {
981 struct request *rqa = container_of(a, struct request, queuelist); 991 struct request *rqa = container_of(a, struct request, queuelist);
982 struct request *rqb = container_of(b, struct request, queuelist); 992 struct request *rqb = container_of(b, struct request, queuelist);
983 993
984 return !(rqa->mq_ctx < rqb->mq_ctx || 994 return !(rqa->mq_ctx < rqb->mq_ctx ||
985 (rqa->mq_ctx == rqb->mq_ctx && 995 (rqa->mq_ctx == rqb->mq_ctx &&
986 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 996 blk_rq_pos(rqa) < blk_rq_pos(rqb)));
987 } 997 }
988 998
989 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 999 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
990 { 1000 {
991 struct blk_mq_ctx *this_ctx; 1001 struct blk_mq_ctx *this_ctx;
992 struct request_queue *this_q; 1002 struct request_queue *this_q;
993 struct request *rq; 1003 struct request *rq;
994 LIST_HEAD(list); 1004 LIST_HEAD(list);
995 LIST_HEAD(ctx_list); 1005 LIST_HEAD(ctx_list);
996 unsigned int depth; 1006 unsigned int depth;
997 1007
998 list_splice_init(&plug->mq_list, &list); 1008 list_splice_init(&plug->mq_list, &list);
999 1009
1000 list_sort(NULL, &list, plug_ctx_cmp); 1010 list_sort(NULL, &list, plug_ctx_cmp);
1001 1011
1002 this_q = NULL; 1012 this_q = NULL;
1003 this_ctx = NULL; 1013 this_ctx = NULL;
1004 depth = 0; 1014 depth = 0;
1005 1015
1006 while (!list_empty(&list)) { 1016 while (!list_empty(&list)) {
1007 rq = list_entry_rq(list.next); 1017 rq = list_entry_rq(list.next);
1008 list_del_init(&rq->queuelist); 1018 list_del_init(&rq->queuelist);
1009 BUG_ON(!rq->q); 1019 BUG_ON(!rq->q);
1010 if (rq->mq_ctx != this_ctx) { 1020 if (rq->mq_ctx != this_ctx) {
1011 if (this_ctx) { 1021 if (this_ctx) {
1012 blk_mq_insert_requests(this_q, this_ctx, 1022 blk_mq_insert_requests(this_q, this_ctx,
1013 &ctx_list, depth, 1023 &ctx_list, depth,
1014 from_schedule); 1024 from_schedule);
1015 } 1025 }
1016 1026
1017 this_ctx = rq->mq_ctx; 1027 this_ctx = rq->mq_ctx;
1018 this_q = rq->q; 1028 this_q = rq->q;
1019 depth = 0; 1029 depth = 0;
1020 } 1030 }
1021 1031
1022 depth++; 1032 depth++;
1023 list_add_tail(&rq->queuelist, &ctx_list); 1033 list_add_tail(&rq->queuelist, &ctx_list);
1024 } 1034 }
1025 1035
1026 /* 1036 /*
1027 * If 'this_ctx' is set, we know we have entries to complete 1037 * If 'this_ctx' is set, we know we have entries to complete
1028 * on 'ctx_list'. Do those. 1038 * on 'ctx_list'. Do those.
1029 */ 1039 */
1030 if (this_ctx) { 1040 if (this_ctx) {
1031 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1041 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
1032 from_schedule); 1042 from_schedule);
1033 } 1043 }
1034 } 1044 }
1035 1045
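Illustrative only, not from this patch: the mq_list flushed above is filled by submitters that plug around their I/O, roughly as below (bio is assumed to be an already-built struct bio *); for blk-mq queues blk_finish_plug() reaches this code via blk_flush_plug_list().

	struct blk_plug plug;

	blk_start_plug(&plug);
	submit_bio(WRITE, bio);		/* blk_sq_make_request() parks the request on plug->mq_list */
	blk_finish_plug(&plug);		/* flushes the plug, ending up in blk_mq_flush_plug_list() */
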
1036 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1046 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
1037 { 1047 {
1038 init_request_from_bio(rq, bio); 1048 init_request_from_bio(rq, bio);
1039 1049
1040 if (blk_do_io_stat(rq)) 1050 if (blk_do_io_stat(rq))
1041 blk_account_io_start(rq, 1); 1051 blk_account_io_start(rq, 1);
1042 } 1052 }
1043 1053
1044 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1054 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx)
1045 { 1055 {
1046 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1056 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
1047 !blk_queue_nomerges(hctx->queue); 1057 !blk_queue_nomerges(hctx->queue);
1048 } 1058 }
1049 1059
1050 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1060 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx,
1051 struct blk_mq_ctx *ctx, 1061 struct blk_mq_ctx *ctx,
1052 struct request *rq, struct bio *bio) 1062 struct request *rq, struct bio *bio)
1053 { 1063 {
1054 if (!hctx_allow_merges(hctx)) { 1064 if (!hctx_allow_merges(hctx)) {
1055 blk_mq_bio_to_request(rq, bio); 1065 blk_mq_bio_to_request(rq, bio);
1056 spin_lock(&ctx->lock); 1066 spin_lock(&ctx->lock);
1057 insert_rq: 1067 insert_rq:
1058 __blk_mq_insert_request(hctx, rq, false); 1068 __blk_mq_insert_request(hctx, rq, false);
1059 spin_unlock(&ctx->lock); 1069 spin_unlock(&ctx->lock);
1060 return false; 1070 return false;
1061 } else { 1071 } else {
1062 struct request_queue *q = hctx->queue; 1072 struct request_queue *q = hctx->queue;
1063 1073
1064 spin_lock(&ctx->lock); 1074 spin_lock(&ctx->lock);
1065 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1075 if (!blk_mq_attempt_merge(q, ctx, bio)) {
1066 blk_mq_bio_to_request(rq, bio); 1076 blk_mq_bio_to_request(rq, bio);
1067 goto insert_rq; 1077 goto insert_rq;
1068 } 1078 }
1069 1079
1070 spin_unlock(&ctx->lock); 1080 spin_unlock(&ctx->lock);
1071 __blk_mq_free_request(hctx, ctx, rq); 1081 __blk_mq_free_request(hctx, ctx, rq);
1072 return true; 1082 return true;
1073 } 1083 }
1074 } 1084 }
1075 1085
1076 struct blk_map_ctx { 1086 struct blk_map_ctx {
1077 struct blk_mq_hw_ctx *hctx; 1087 struct blk_mq_hw_ctx *hctx;
1078 struct blk_mq_ctx *ctx; 1088 struct blk_mq_ctx *ctx;
1079 }; 1089 };
1080 1090
1081 static struct request *blk_mq_map_request(struct request_queue *q, 1091 static struct request *blk_mq_map_request(struct request_queue *q,
1082 struct bio *bio, 1092 struct bio *bio,
1083 struct blk_map_ctx *data) 1093 struct blk_map_ctx *data)
1084 { 1094 {
1085 struct blk_mq_hw_ctx *hctx; 1095 struct blk_mq_hw_ctx *hctx;
1086 struct blk_mq_ctx *ctx; 1096 struct blk_mq_ctx *ctx;
1087 struct request *rq; 1097 struct request *rq;
1088 int rw = bio_data_dir(bio); 1098 int rw = bio_data_dir(bio);
1089 struct blk_mq_alloc_data alloc_data; 1099 struct blk_mq_alloc_data alloc_data;
1090 1100
1091 if (unlikely(blk_mq_queue_enter(q))) { 1101 if (unlikely(blk_mq_queue_enter(q))) {
1092 bio_endio(bio, -EIO); 1102 bio_endio(bio, -EIO);
1093 return NULL; 1103 return NULL;
1094 } 1104 }
1095 1105
1096 ctx = blk_mq_get_ctx(q); 1106 ctx = blk_mq_get_ctx(q);
1097 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1107 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1098 1108
1099 if (rw_is_sync(bio->bi_rw)) 1109 if (rw_is_sync(bio->bi_rw))
1100 rw |= REQ_SYNC; 1110 rw |= REQ_SYNC;
1101 1111
1102 trace_block_getrq(q, bio, rw); 1112 trace_block_getrq(q, bio, rw);
1103 blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, 1113 blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx,
1104 hctx); 1114 hctx);
1105 rq = __blk_mq_alloc_request(&alloc_data, rw); 1115 rq = __blk_mq_alloc_request(&alloc_data, rw);
1106 if (unlikely(!rq)) { 1116 if (unlikely(!rq)) {
1107 __blk_mq_run_hw_queue(hctx); 1117 __blk_mq_run_hw_queue(hctx);
1108 blk_mq_put_ctx(ctx); 1118 blk_mq_put_ctx(ctx);
1109 trace_block_sleeprq(q, bio, rw); 1119 trace_block_sleeprq(q, bio, rw);
1110 1120
1111 ctx = blk_mq_get_ctx(q); 1121 ctx = blk_mq_get_ctx(q);
1112 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1122 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1113 blk_mq_set_alloc_data(&alloc_data, q, 1123 blk_mq_set_alloc_data(&alloc_data, q,
1114 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); 1124 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx);
1115 rq = __blk_mq_alloc_request(&alloc_data, rw); 1125 rq = __blk_mq_alloc_request(&alloc_data, rw);
1116 ctx = alloc_data.ctx; 1126 ctx = alloc_data.ctx;
1117 hctx = alloc_data.hctx; 1127 hctx = alloc_data.hctx;
1118 } 1128 }
1119 1129
1120 hctx->queued++; 1130 hctx->queued++;
1121 data->hctx = hctx; 1131 data->hctx = hctx;
1122 data->ctx = ctx; 1132 data->ctx = ctx;
1123 return rq; 1133 return rq;
1124 } 1134 }
1125 1135
1126 /* 1136 /*
1127 * Multiple hardware queue variant. This will not use per-process plugs, 1137 * Multiple hardware queue variant. This will not use per-process plugs,
1128 * but will attempt to bypass the hctx queueing if we can go straight to 1138 * but will attempt to bypass the hctx queueing if we can go straight to
1129 * hardware for SYNC IO. 1139 * hardware for SYNC IO.
1130 */ 1140 */
1131 static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1141 static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
1132 { 1142 {
1133 const int is_sync = rw_is_sync(bio->bi_rw); 1143 const int is_sync = rw_is_sync(bio->bi_rw);
1134 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1144 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1135 struct blk_map_ctx data; 1145 struct blk_map_ctx data;
1136 struct request *rq; 1146 struct request *rq;
1137 1147
1138 blk_queue_bounce(q, &bio); 1148 blk_queue_bounce(q, &bio);
1139 1149
1140 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1150 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1141 bio_endio(bio, -EIO); 1151 bio_endio(bio, -EIO);
1142 return; 1152 return;
1143 } 1153 }
1144 1154
1145 rq = blk_mq_map_request(q, bio, &data); 1155 rq = blk_mq_map_request(q, bio, &data);
1146 if (unlikely(!rq)) 1156 if (unlikely(!rq))
1147 return; 1157 return;
1148 1158
1149 if (unlikely(is_flush_fua)) { 1159 if (unlikely(is_flush_fua)) {
1150 blk_mq_bio_to_request(rq, bio); 1160 blk_mq_bio_to_request(rq, bio);
1151 blk_insert_flush(rq); 1161 blk_insert_flush(rq);
1152 goto run_queue; 1162 goto run_queue;
1153 } 1163 }
1154 1164
1155 if (is_sync) { 1165 if (is_sync) {
1156 int ret; 1166 int ret;
1157 1167
1158 blk_mq_bio_to_request(rq, bio); 1168 blk_mq_bio_to_request(rq, bio);
1159 1169
1160 /* 1170 /*
1161 * For an OK return we are done; for an error, kill it. For any other 1171 * For an OK return we are done; for an error, kill it. For any other
1162 * return (busy), just add it to our list as we previously 1172 * return (busy), just add it to our list as we previously
1163 * would have done. 1173 * would have done.
1164 */ 1174 */
1165 ret = q->mq_ops->queue_rq(data.hctx, rq, true); 1175 ret = q->mq_ops->queue_rq(data.hctx, rq, true);
1166 if (ret == BLK_MQ_RQ_QUEUE_OK) 1176 if (ret == BLK_MQ_RQ_QUEUE_OK)
1167 goto done; 1177 goto done;
1168 else { 1178 else {
1169 __blk_mq_requeue_request(rq); 1179 __blk_mq_requeue_request(rq);
1170 1180
1171 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1181 if (ret == BLK_MQ_RQ_QUEUE_ERROR) {
1172 rq->errors = -EIO; 1182 rq->errors = -EIO;
1173 blk_mq_end_request(rq, rq->errors); 1183 blk_mq_end_request(rq, rq->errors);
1174 goto done; 1184 goto done;
1175 } 1185 }
1176 } 1186 }
1177 } 1187 }
1178 1188
1179 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1189 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1180 /* 1190 /*
1181 * For a SYNC request, send it to the hardware immediately. For 1191 * For a SYNC request, send it to the hardware immediately. For
1182 * an ASYNC request, just ensure that we run it later on. The 1192 * an ASYNC request, just ensure that we run it later on. The
1183 * latter allows for merging opportunities and more efficient 1193 * latter allows for merging opportunities and more efficient
1184 * dispatching. 1194 * dispatching.
1185 */ 1195 */
1186 run_queue: 1196 run_queue:
1187 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1197 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1188 } 1198 }
1189 done: 1199 done:
1190 blk_mq_put_ctx(data.ctx); 1200 blk_mq_put_ctx(data.ctx);
1191 } 1201 }
1192 1202
1193 /* 1203 /*
1194 * Single hardware queue variant. This will attempt to use any per-process 1204 * Single hardware queue variant. This will attempt to use any per-process
1195 * plug for merging and IO deferral. 1205 * plug for merging and IO deferral.
1196 */ 1206 */
1197 static void blk_sq_make_request(struct request_queue *q, struct bio *bio) 1207 static void blk_sq_make_request(struct request_queue *q, struct bio *bio)
1198 { 1208 {
1199 const int is_sync = rw_is_sync(bio->bi_rw); 1209 const int is_sync = rw_is_sync(bio->bi_rw);
1200 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1210 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
1201 unsigned int use_plug, request_count = 0; 1211 unsigned int use_plug, request_count = 0;
1202 struct blk_map_ctx data; 1212 struct blk_map_ctx data;
1203 struct request *rq; 1213 struct request *rq;
1204 1214
1205 /* 1215 /*
1206 * Only use the per-process plug for async, non-flush IO; sync and 1216 * Only use the per-process plug for async, non-flush IO; sync and
1207 * flush/FUA requests bypass the plug and go to the hardware queue. 1217 * flush/FUA requests bypass the plug and go to the hardware queue.
1208 */ 1218 */
1209 use_plug = !is_flush_fua && !is_sync; 1219 use_plug = !is_flush_fua && !is_sync;
1210 1220
1211 blk_queue_bounce(q, &bio); 1221 blk_queue_bounce(q, &bio);
1212 1222
1213 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1223 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
1214 bio_endio(bio, -EIO); 1224 bio_endio(bio, -EIO);
1215 return; 1225 return;
1216 } 1226 }
1217 1227
1218 if (use_plug && !blk_queue_nomerges(q) && 1228 if (use_plug && !blk_queue_nomerges(q) &&
1219 blk_attempt_plug_merge(q, bio, &request_count)) 1229 blk_attempt_plug_merge(q, bio, &request_count))
1220 return; 1230 return;
1221 1231
1222 rq = blk_mq_map_request(q, bio, &data); 1232 rq = blk_mq_map_request(q, bio, &data);
1223 if (unlikely(!rq)) 1233 if (unlikely(!rq))
1224 return; 1234 return;
1225 1235
1226 if (unlikely(is_flush_fua)) { 1236 if (unlikely(is_flush_fua)) {
1227 blk_mq_bio_to_request(rq, bio); 1237 blk_mq_bio_to_request(rq, bio);
1228 blk_insert_flush(rq); 1238 blk_insert_flush(rq);
1229 goto run_queue; 1239 goto run_queue;
1230 } 1240 }
1231 1241
1232 /* 1242 /*
1233 * A task plug currently exists. Since this is completely lockless, 1243 * A task plug currently exists. Since this is completely lockless,
1234 * utilize that to temporarily store requests until the task is 1244 * utilize that to temporarily store requests until the task is
1235 * either done or scheduled away. 1245 * either done or scheduled away.
1236 */ 1246 */
1237 if (use_plug) { 1247 if (use_plug) {
1238 struct blk_plug *plug = current->plug; 1248 struct blk_plug *plug = current->plug;
1239 1249
1240 if (plug) { 1250 if (plug) {
1241 blk_mq_bio_to_request(rq, bio); 1251 blk_mq_bio_to_request(rq, bio);
1242 if (list_empty(&plug->mq_list)) 1252 if (list_empty(&plug->mq_list))
1243 trace_block_plug(q); 1253 trace_block_plug(q);
1244 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 1254 else if (request_count >= BLK_MAX_REQUEST_COUNT) {
1245 blk_flush_plug_list(plug, false); 1255 blk_flush_plug_list(plug, false);
1246 trace_block_plug(q); 1256 trace_block_plug(q);
1247 } 1257 }
1248 list_add_tail(&rq->queuelist, &plug->mq_list); 1258 list_add_tail(&rq->queuelist, &plug->mq_list);
1249 blk_mq_put_ctx(data.ctx); 1259 blk_mq_put_ctx(data.ctx);
1250 return; 1260 return;
1251 } 1261 }
1252 } 1262 }
1253 1263
1254 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1264 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) {
1255 /* 1265 /*
1256 * For a SYNC request, send it to the hardware immediately. For 1266 * For a SYNC request, send it to the hardware immediately. For
1257 * an ASYNC request, just ensure that we run it later on. The 1267 * an ASYNC request, just ensure that we run it later on. The
1258 * latter allows for merging opportunities and more efficient 1268 * latter allows for merging opportunities and more efficient
1259 * dispatching. 1269 * dispatching.
1260 */ 1270 */
1261 run_queue: 1271 run_queue:
1262 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1272 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua);
1263 } 1273 }
1264 1274
1265 blk_mq_put_ctx(data.ctx); 1275 blk_mq_put_ctx(data.ctx);
1266 } 1276 }
1267 1277
1268 /* 1278 /*
1269 * Default mapping to a software queue, since we use one per CPU. 1279 * Default mapping to a software queue, since we use one per CPU.
1270 */ 1280 */
1271 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 1281 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
1272 { 1282 {
1273 return q->queue_hw_ctx[q->mq_map[cpu]]; 1283 return q->queue_hw_ctx[q->mq_map[cpu]];
1274 } 1284 }
1275 EXPORT_SYMBOL(blk_mq_map_queue); 1285 EXPORT_SYMBOL(blk_mq_map_queue);
1276 1286
1277 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, 1287 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set,
1278 struct blk_mq_tags *tags, unsigned int hctx_idx) 1288 struct blk_mq_tags *tags, unsigned int hctx_idx)
1279 { 1289 {
1280 struct page *page; 1290 struct page *page;
1281 1291
1282 if (tags->rqs && set->ops->exit_request) { 1292 if (tags->rqs && set->ops->exit_request) {
1283 int i; 1293 int i;
1284 1294
1285 for (i = 0; i < tags->nr_tags; i++) { 1295 for (i = 0; i < tags->nr_tags; i++) {
1286 if (!tags->rqs[i]) 1296 if (!tags->rqs[i])
1287 continue; 1297 continue;
1288 set->ops->exit_request(set->driver_data, tags->rqs[i], 1298 set->ops->exit_request(set->driver_data, tags->rqs[i],
1289 hctx_idx, i); 1299 hctx_idx, i);
1290 tags->rqs[i] = NULL; 1300 tags->rqs[i] = NULL;
1291 } 1301 }
1292 } 1302 }
1293 1303
1294 while (!list_empty(&tags->page_list)) { 1304 while (!list_empty(&tags->page_list)) {
1295 page = list_first_entry(&tags->page_list, struct page, lru); 1305 page = list_first_entry(&tags->page_list, struct page, lru);
1296 list_del_init(&page->lru); 1306 list_del_init(&page->lru);
1297 __free_pages(page, page->private); 1307 __free_pages(page, page->private);
1298 } 1308 }
1299 1309
1300 kfree(tags->rqs); 1310 kfree(tags->rqs);
1301 1311
1302 blk_mq_free_tags(tags); 1312 blk_mq_free_tags(tags);
1303 } 1313 }
1304 1314
1305 static size_t order_to_size(unsigned int order) 1315 static size_t order_to_size(unsigned int order)
1306 { 1316 {
1307 return (size_t)PAGE_SIZE << order; 1317 return (size_t)PAGE_SIZE << order;
1308 } 1318 }
1309 1319
1310 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, 1320 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set,
1311 unsigned int hctx_idx) 1321 unsigned int hctx_idx)
1312 { 1322 {
1313 struct blk_mq_tags *tags; 1323 struct blk_mq_tags *tags;
1314 unsigned int i, j, entries_per_page, max_order = 4; 1324 unsigned int i, j, entries_per_page, max_order = 4;
1315 size_t rq_size, left; 1325 size_t rq_size, left;
1316 1326
1317 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1327 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags,
1318 set->numa_node); 1328 set->numa_node);
1319 if (!tags) 1329 if (!tags)
1320 return NULL; 1330 return NULL;
1321 1331
1322 INIT_LIST_HEAD(&tags->page_list); 1332 INIT_LIST_HEAD(&tags->page_list);
1323 1333
1324 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1334 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *),
1325 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1335 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1326 set->numa_node); 1336 set->numa_node);
1327 if (!tags->rqs) { 1337 if (!tags->rqs) {
1328 blk_mq_free_tags(tags); 1338 blk_mq_free_tags(tags);
1329 return NULL; 1339 return NULL;
1330 } 1340 }
1331 1341
1332 /* 1342 /*
1333 * rq_size is the size of the request plus driver payload, rounded 1343 * rq_size is the size of the request plus driver payload, rounded
1334 * to the cacheline size 1344 * to the cacheline size
1335 */ 1345 */
1336 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1346 rq_size = round_up(sizeof(struct request) + set->cmd_size,
1337 cache_line_size()); 1347 cache_line_size());
1338 left = rq_size * set->queue_depth; 1348 left = rq_size * set->queue_depth;
1339 1349
1340 for (i = 0; i < set->queue_depth; ) { 1350 for (i = 0; i < set->queue_depth; ) {
1341 int this_order = max_order; 1351 int this_order = max_order;
1342 struct page *page; 1352 struct page *page;
1343 int to_do; 1353 int to_do;
1344 void *p; 1354 void *p;
1345 1355
1346 while (left < order_to_size(this_order - 1) && this_order) 1356 while (left < order_to_size(this_order - 1) && this_order)
1347 this_order--; 1357 this_order--;
1348 1358
1349 do { 1359 do {
1350 page = alloc_pages_node(set->numa_node, 1360 page = alloc_pages_node(set->numa_node,
1351 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1361 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY,
1352 this_order); 1362 this_order);
1353 if (page) 1363 if (page)
1354 break; 1364 break;
1355 if (!this_order--) 1365 if (!this_order--)
1356 break; 1366 break;
1357 if (order_to_size(this_order) < rq_size) 1367 if (order_to_size(this_order) < rq_size)
1358 break; 1368 break;
1359 } while (1); 1369 } while (1);
1360 1370
1361 if (!page) 1371 if (!page)
1362 goto fail; 1372 goto fail;
1363 1373
1364 page->private = this_order; 1374 page->private = this_order;
1365 list_add_tail(&page->lru, &tags->page_list); 1375 list_add_tail(&page->lru, &tags->page_list);
1366 1376
1367 p = page_address(page); 1377 p = page_address(page);
1368 entries_per_page = order_to_size(this_order) / rq_size; 1378 entries_per_page = order_to_size(this_order) / rq_size;
1369 to_do = min(entries_per_page, set->queue_depth - i); 1379 to_do = min(entries_per_page, set->queue_depth - i);
1370 left -= to_do * rq_size; 1380 left -= to_do * rq_size;
1371 for (j = 0; j < to_do; j++) { 1381 for (j = 0; j < to_do; j++) {
1372 tags->rqs[i] = p; 1382 tags->rqs[i] = p;
1373 tags->rqs[i]->atomic_flags = 0; 1383 tags->rqs[i]->atomic_flags = 0;
1374 tags->rqs[i]->cmd_flags = 0; 1384 tags->rqs[i]->cmd_flags = 0;
1375 if (set->ops->init_request) { 1385 if (set->ops->init_request) {
1376 if (set->ops->init_request(set->driver_data, 1386 if (set->ops->init_request(set->driver_data,
1377 tags->rqs[i], hctx_idx, i, 1387 tags->rqs[i], hctx_idx, i,
1378 set->numa_node)) { 1388 set->numa_node)) {
1379 tags->rqs[i] = NULL; 1389 tags->rqs[i] = NULL;
1380 goto fail; 1390 goto fail;
1381 } 1391 }
1382 } 1392 }
1383 1393
1384 p += rq_size; 1394 p += rq_size;
1385 i++; 1395 i++;
1386 } 1396 }
1387 } 1397 }
1388 1398
1389 return tags; 1399 return tags;
1390 1400
1391 fail: 1401 fail:
1392 blk_mq_free_rq_map(set, tags, hctx_idx); 1402 blk_mq_free_rq_map(set, tags, hctx_idx);
1393 return NULL; 1403 return NULL;
1394 } 1404 }
1395 1405
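Worked example with hypothetical numbers, not from this patch: if sizeof(struct request) + cmd_size rounds up to 512 bytes and queue_depth is 128, left starts at 64 KiB; the inner while loop leaves this_order at 4 (64 KiB is not smaller than the 32 KiB an order-3 allocation would provide), and a single order-4 block holds 65536 / 512 = 128 requests, so the whole map fits in one allocation. When an allocation fails, the code retries at progressively lower orders until order_to_size(this_order) would no longer hold even a single rq_size.
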
1396 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) 1406 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap)
1397 { 1407 {
1398 kfree(bitmap->map); 1408 kfree(bitmap->map);
1399 } 1409 }
1400 1410
1401 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) 1411 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node)
1402 { 1412 {
1403 unsigned int bpw = 8, total, num_maps, i; 1413 unsigned int bpw = 8, total, num_maps, i;
1404 1414
1405 bitmap->bits_per_word = bpw; 1415 bitmap->bits_per_word = bpw;
1406 1416
1407 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; 1417 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw;
1408 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), 1418 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap),
1409 GFP_KERNEL, node); 1419 GFP_KERNEL, node);
1410 if (!bitmap->map) 1420 if (!bitmap->map)
1411 return -ENOMEM; 1421 return -ENOMEM;
1412 1422
1413 bitmap->map_size = num_maps; 1423 bitmap->map_size = num_maps;
1414 1424
1415 total = nr_cpu_ids; 1425 total = nr_cpu_ids;
1416 for (i = 0; i < num_maps; i++) { 1426 for (i = 0; i < num_maps; i++) {
1417 bitmap->map[i].depth = min(total, bitmap->bits_per_word); 1427 bitmap->map[i].depth = min(total, bitmap->bits_per_word);
1418 total -= bitmap->map[i].depth; 1428 total -= bitmap->map[i].depth;
1419 } 1429 }
1420 1430
1421 return 0; 1431 return 0;
1422 } 1432 }
1423 1433
1424 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) 1434 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu)
1425 { 1435 {
1426 struct request_queue *q = hctx->queue; 1436 struct request_queue *q = hctx->queue;
1427 struct blk_mq_ctx *ctx; 1437 struct blk_mq_ctx *ctx;
1428 LIST_HEAD(tmp); 1438 LIST_HEAD(tmp);
1429 1439
1430 /* 1440 /*
1431 * Move ctx entries to new CPU, if this one is going away. 1441 * Move ctx entries to new CPU, if this one is going away.
1432 */ 1442 */
1433 ctx = __blk_mq_get_ctx(q, cpu); 1443 ctx = __blk_mq_get_ctx(q, cpu);
1434 1444
1435 spin_lock(&ctx->lock); 1445 spin_lock(&ctx->lock);
1436 if (!list_empty(&ctx->rq_list)) { 1446 if (!list_empty(&ctx->rq_list)) {
1437 list_splice_init(&ctx->rq_list, &tmp); 1447 list_splice_init(&ctx->rq_list, &tmp);
1438 blk_mq_hctx_clear_pending(hctx, ctx); 1448 blk_mq_hctx_clear_pending(hctx, ctx);
1439 } 1449 }
1440 spin_unlock(&ctx->lock); 1450 spin_unlock(&ctx->lock);
1441 1451
1442 if (list_empty(&tmp)) 1452 if (list_empty(&tmp))
1443 return NOTIFY_OK; 1453 return NOTIFY_OK;
1444 1454
1445 ctx = blk_mq_get_ctx(q); 1455 ctx = blk_mq_get_ctx(q);
1446 spin_lock(&ctx->lock); 1456 spin_lock(&ctx->lock);
1447 1457
1448 while (!list_empty(&tmp)) { 1458 while (!list_empty(&tmp)) {
1449 struct request *rq; 1459 struct request *rq;
1450 1460
1451 rq = list_first_entry(&tmp, struct request, queuelist); 1461 rq = list_first_entry(&tmp, struct request, queuelist);
1452 rq->mq_ctx = ctx; 1462 rq->mq_ctx = ctx;
1453 list_move_tail(&rq->queuelist, &ctx->rq_list); 1463 list_move_tail(&rq->queuelist, &ctx->rq_list);
1454 } 1464 }
1455 1465
1456 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1466 hctx = q->mq_ops->map_queue(q, ctx->cpu);
1457 blk_mq_hctx_mark_pending(hctx, ctx); 1467 blk_mq_hctx_mark_pending(hctx, ctx);
1458 1468
1459 spin_unlock(&ctx->lock); 1469 spin_unlock(&ctx->lock);
1460 1470
1461 blk_mq_run_hw_queue(hctx, true); 1471 blk_mq_run_hw_queue(hctx, true);
1462 blk_mq_put_ctx(ctx); 1472 blk_mq_put_ctx(ctx);
1463 return NOTIFY_OK; 1473 return NOTIFY_OK;
1464 } 1474 }
1465 1475
1466 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) 1476 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu)
1467 { 1477 {
1468 struct request_queue *q = hctx->queue; 1478 struct request_queue *q = hctx->queue;
1469 struct blk_mq_tag_set *set = q->tag_set; 1479 struct blk_mq_tag_set *set = q->tag_set;
1470 1480
1471 if (set->tags[hctx->queue_num]) 1481 if (set->tags[hctx->queue_num])
1472 return NOTIFY_OK; 1482 return NOTIFY_OK;
1473 1483
1474 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); 1484 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num);
1475 if (!set->tags[hctx->queue_num]) 1485 if (!set->tags[hctx->queue_num])
1476 return NOTIFY_STOP; 1486 return NOTIFY_STOP;
1477 1487
1478 hctx->tags = set->tags[hctx->queue_num]; 1488 hctx->tags = set->tags[hctx->queue_num];
1479 return NOTIFY_OK; 1489 return NOTIFY_OK;
1480 } 1490 }
1481 1491
1482 static int blk_mq_hctx_notify(void *data, unsigned long action, 1492 static int blk_mq_hctx_notify(void *data, unsigned long action,
1483 unsigned int cpu) 1493 unsigned int cpu)
1484 { 1494 {
1485 struct blk_mq_hw_ctx *hctx = data; 1495 struct blk_mq_hw_ctx *hctx = data;
1486 1496
1487 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1497 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN)
1488 return blk_mq_hctx_cpu_offline(hctx, cpu); 1498 return blk_mq_hctx_cpu_offline(hctx, cpu);
1489 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 1499 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
1490 return blk_mq_hctx_cpu_online(hctx, cpu); 1500 return blk_mq_hctx_cpu_online(hctx, cpu);
1491 1501
1492 return NOTIFY_OK; 1502 return NOTIFY_OK;
1493 } 1503 }
1494 1504
1495 static void blk_mq_exit_hctx(struct request_queue *q, 1505 static void blk_mq_exit_hctx(struct request_queue *q,
1496 struct blk_mq_tag_set *set, 1506 struct blk_mq_tag_set *set,
1497 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1507 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
1498 { 1508 {
1499 unsigned flush_start_tag = set->queue_depth; 1509 unsigned flush_start_tag = set->queue_depth;
1500 1510
1501 blk_mq_tag_idle(hctx); 1511 blk_mq_tag_idle(hctx);
1502 1512
1503 if (set->ops->exit_request) 1513 if (set->ops->exit_request)
1504 set->ops->exit_request(set->driver_data, 1514 set->ops->exit_request(set->driver_data,
1505 hctx->fq->flush_rq, hctx_idx, 1515 hctx->fq->flush_rq, hctx_idx,
1506 flush_start_tag + hctx_idx); 1516 flush_start_tag + hctx_idx);
1507 1517
1508 if (set->ops->exit_hctx) 1518 if (set->ops->exit_hctx)
1509 set->ops->exit_hctx(hctx, hctx_idx); 1519 set->ops->exit_hctx(hctx, hctx_idx);
1510 1520
1511 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1521 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1512 blk_free_flush_queue(hctx->fq); 1522 blk_free_flush_queue(hctx->fq);
1513 kfree(hctx->ctxs); 1523 kfree(hctx->ctxs);
1514 blk_mq_free_bitmap(&hctx->ctx_map); 1524 blk_mq_free_bitmap(&hctx->ctx_map);
1515 } 1525 }
1516 1526
1517 static void blk_mq_exit_hw_queues(struct request_queue *q, 1527 static void blk_mq_exit_hw_queues(struct request_queue *q,
1518 struct blk_mq_tag_set *set, int nr_queue) 1528 struct blk_mq_tag_set *set, int nr_queue)
1519 { 1529 {
1520 struct blk_mq_hw_ctx *hctx; 1530 struct blk_mq_hw_ctx *hctx;
1521 unsigned int i; 1531 unsigned int i;
1522 1532
1523 queue_for_each_hw_ctx(q, hctx, i) { 1533 queue_for_each_hw_ctx(q, hctx, i) {
1524 if (i == nr_queue) 1534 if (i == nr_queue)
1525 break; 1535 break;
1526 blk_mq_exit_hctx(q, set, hctx, i); 1536 blk_mq_exit_hctx(q, set, hctx, i);
1527 } 1537 }
1528 } 1538 }
1529 1539
1530 static void blk_mq_free_hw_queues(struct request_queue *q, 1540 static void blk_mq_free_hw_queues(struct request_queue *q,
1531 struct blk_mq_tag_set *set) 1541 struct blk_mq_tag_set *set)
1532 { 1542 {
1533 struct blk_mq_hw_ctx *hctx; 1543 struct blk_mq_hw_ctx *hctx;
1534 unsigned int i; 1544 unsigned int i;
1535 1545
1536 queue_for_each_hw_ctx(q, hctx, i) { 1546 queue_for_each_hw_ctx(q, hctx, i) {
1537 free_cpumask_var(hctx->cpumask); 1547 free_cpumask_var(hctx->cpumask);
1538 kfree(hctx); 1548 kfree(hctx);
1539 } 1549 }
1540 } 1550 }
1541 1551
1542 static int blk_mq_init_hctx(struct request_queue *q, 1552 static int blk_mq_init_hctx(struct request_queue *q,
1543 struct blk_mq_tag_set *set, 1553 struct blk_mq_tag_set *set,
1544 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1554 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
1545 { 1555 {
1546 int node; 1556 int node;
1547 unsigned flush_start_tag = set->queue_depth; 1557 unsigned flush_start_tag = set->queue_depth;
1548 1558
1549 node = hctx->numa_node; 1559 node = hctx->numa_node;
1550 if (node == NUMA_NO_NODE) 1560 if (node == NUMA_NO_NODE)
1551 node = hctx->numa_node = set->numa_node; 1561 node = hctx->numa_node = set->numa_node;
1552 1562
1553 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1563 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn);
1554 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1564 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn);
1555 spin_lock_init(&hctx->lock); 1565 spin_lock_init(&hctx->lock);
1556 INIT_LIST_HEAD(&hctx->dispatch); 1566 INIT_LIST_HEAD(&hctx->dispatch);
1557 hctx->queue = q; 1567 hctx->queue = q;
1558 hctx->queue_num = hctx_idx; 1568 hctx->queue_num = hctx_idx;
1559 hctx->flags = set->flags; 1569 hctx->flags = set->flags;
1560 hctx->cmd_size = set->cmd_size; 1570 hctx->cmd_size = set->cmd_size;
1561 1571
1562 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1572 blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
1563 blk_mq_hctx_notify, hctx); 1573 blk_mq_hctx_notify, hctx);
1564 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1574 blk_mq_register_cpu_notifier(&hctx->cpu_notifier);
1565 1575
1566 hctx->tags = set->tags[hctx_idx]; 1576 hctx->tags = set->tags[hctx_idx];
1567 1577
1568 /* 1578 /*
1569 * Allocate space for all possible cpus to avoid allocation at 1579 * Allocate space for all possible cpus to avoid allocation at
1570 * runtime 1580 * runtime
1571 */ 1581 */
1572 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1582 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
1573 GFP_KERNEL, node); 1583 GFP_KERNEL, node);
1574 if (!hctx->ctxs) 1584 if (!hctx->ctxs)
1575 goto unregister_cpu_notifier; 1585 goto unregister_cpu_notifier;
1576 1586
1577 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1587 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node))
1578 goto free_ctxs; 1588 goto free_ctxs;
1579 1589
1580 hctx->nr_ctx = 0; 1590 hctx->nr_ctx = 0;
1581 1591
1582 if (set->ops->init_hctx && 1592 if (set->ops->init_hctx &&
1583 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1593 set->ops->init_hctx(hctx, set->driver_data, hctx_idx))
1584 goto free_bitmap; 1594 goto free_bitmap;
1585 1595
1586 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1596 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size);
1587 if (!hctx->fq) 1597 if (!hctx->fq)
1588 goto exit_hctx; 1598 goto exit_hctx;
1589 1599
1590 if (set->ops->init_request && 1600 if (set->ops->init_request &&
1591 set->ops->init_request(set->driver_data, 1601 set->ops->init_request(set->driver_data,
1592 hctx->fq->flush_rq, hctx_idx, 1602 hctx->fq->flush_rq, hctx_idx,
1593 flush_start_tag + hctx_idx, node)) 1603 flush_start_tag + hctx_idx, node))
1594 goto free_fq; 1604 goto free_fq;
1595 1605
1596 return 0; 1606 return 0;
1597 1607
1598 free_fq: 1608 free_fq:
1599 kfree(hctx->fq); 1609 kfree(hctx->fq);
1600 exit_hctx: 1610 exit_hctx:
1601 if (set->ops->exit_hctx) 1611 if (set->ops->exit_hctx)
1602 set->ops->exit_hctx(hctx, hctx_idx); 1612 set->ops->exit_hctx(hctx, hctx_idx);
1603 free_bitmap: 1613 free_bitmap:
1604 blk_mq_free_bitmap(&hctx->ctx_map); 1614 blk_mq_free_bitmap(&hctx->ctx_map);
1605 free_ctxs: 1615 free_ctxs:
1606 kfree(hctx->ctxs); 1616 kfree(hctx->ctxs);
1607 unregister_cpu_notifier: 1617 unregister_cpu_notifier:
1608 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1618 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
1609 1619
1610 return -1; 1620 return -1;
1611 } 1621 }
1612 1622
1613 static int blk_mq_init_hw_queues(struct request_queue *q, 1623 static int blk_mq_init_hw_queues(struct request_queue *q,
1614 struct blk_mq_tag_set *set) 1624 struct blk_mq_tag_set *set)
1615 { 1625 {
1616 struct blk_mq_hw_ctx *hctx; 1626 struct blk_mq_hw_ctx *hctx;
1617 unsigned int i; 1627 unsigned int i;
1618 1628
1619 /* 1629 /*
1620 * Initialize hardware queues 1630 * Initialize hardware queues
1621 */ 1631 */
1622 queue_for_each_hw_ctx(q, hctx, i) { 1632 queue_for_each_hw_ctx(q, hctx, i) {
1623 if (blk_mq_init_hctx(q, set, hctx, i)) 1633 if (blk_mq_init_hctx(q, set, hctx, i))
1624 break; 1634 break;
1625 } 1635 }
1626 1636
1627 if (i == q->nr_hw_queues) 1637 if (i == q->nr_hw_queues)
1628 return 0; 1638 return 0;
1629 1639
1630 /* 1640 /*
1631 * Init failed 1641 * Init failed
1632 */ 1642 */
1633 blk_mq_exit_hw_queues(q, set, i); 1643 blk_mq_exit_hw_queues(q, set, i);
1634 1644
1635 return 1; 1645 return 1;
1636 } 1646 }
1637 1647
1638 static void blk_mq_init_cpu_queues(struct request_queue *q, 1648 static void blk_mq_init_cpu_queues(struct request_queue *q,
1639 unsigned int nr_hw_queues) 1649 unsigned int nr_hw_queues)
1640 { 1650 {
1641 unsigned int i; 1651 unsigned int i;
1642 1652
1643 for_each_possible_cpu(i) { 1653 for_each_possible_cpu(i) {
1644 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1654 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
1645 struct blk_mq_hw_ctx *hctx; 1655 struct blk_mq_hw_ctx *hctx;
1646 1656
1647 memset(__ctx, 0, sizeof(*__ctx)); 1657 memset(__ctx, 0, sizeof(*__ctx));
1648 __ctx->cpu = i; 1658 __ctx->cpu = i;
1649 spin_lock_init(&__ctx->lock); 1659 spin_lock_init(&__ctx->lock);
1650 INIT_LIST_HEAD(&__ctx->rq_list); 1660 INIT_LIST_HEAD(&__ctx->rq_list);
1651 __ctx->queue = q; 1661 __ctx->queue = q;
1652 1662
1653 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1663 /* If the cpu isn't online, the cpu is mapped to first hctx */
1654 if (!cpu_online(i)) 1664 if (!cpu_online(i))
1655 continue; 1665 continue;
1656 1666
1657 hctx = q->mq_ops->map_queue(q, i); 1667 hctx = q->mq_ops->map_queue(q, i);
1658 cpumask_set_cpu(i, hctx->cpumask); 1668 cpumask_set_cpu(i, hctx->cpumask);
1659 hctx->nr_ctx++; 1669 hctx->nr_ctx++;
1660 1670
1661 /* 1671 /*
1662 * Set local node, IFF we have more than one hw queue. If 1672 * Set local node, IFF we have more than one hw queue. If
1663 * not, we remain on the home node of the device 1673 * not, we remain on the home node of the device
1664 */ 1674 */
1665 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1675 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
1666 hctx->numa_node = cpu_to_node(i); 1676 hctx->numa_node = cpu_to_node(i);
1667 } 1677 }
1668 } 1678 }
1669 1679
1670 static void blk_mq_map_swqueue(struct request_queue *q) 1680 static void blk_mq_map_swqueue(struct request_queue *q)
1671 { 1681 {
1672 unsigned int i; 1682 unsigned int i;
1673 struct blk_mq_hw_ctx *hctx; 1683 struct blk_mq_hw_ctx *hctx;
1674 struct blk_mq_ctx *ctx; 1684 struct blk_mq_ctx *ctx;
1675 1685
1676 queue_for_each_hw_ctx(q, hctx, i) { 1686 queue_for_each_hw_ctx(q, hctx, i) {
1677 cpumask_clear(hctx->cpumask); 1687 cpumask_clear(hctx->cpumask);
1678 hctx->nr_ctx = 0; 1688 hctx->nr_ctx = 0;
1679 } 1689 }
1680 1690
1681 /* 1691 /*
1682 * Map software to hardware queues 1692 * Map software to hardware queues
1683 */ 1693 */
1684 queue_for_each_ctx(q, ctx, i) { 1694 queue_for_each_ctx(q, ctx, i) {
1685 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1695 /* If the cpu isn't online, the cpu is mapped to first hctx */
1686 if (!cpu_online(i)) 1696 if (!cpu_online(i))
1687 continue; 1697 continue;
1688 1698
1689 hctx = q->mq_ops->map_queue(q, i); 1699 hctx = q->mq_ops->map_queue(q, i);
1690 cpumask_set_cpu(i, hctx->cpumask); 1700 cpumask_set_cpu(i, hctx->cpumask);
1691 ctx->index_hw = hctx->nr_ctx; 1701 ctx->index_hw = hctx->nr_ctx;
1692 hctx->ctxs[hctx->nr_ctx++] = ctx; 1702 hctx->ctxs[hctx->nr_ctx++] = ctx;
1693 } 1703 }
1694 1704
1695 queue_for_each_hw_ctx(q, hctx, i) { 1705 queue_for_each_hw_ctx(q, hctx, i) {
1696 /* 1706 /*
1697 * If no software queues are mapped to this hardware queue, 1707 * If no software queues are mapped to this hardware queue,
1698 * disable it and free the request entries. 1708 * disable it and free the request entries.
1699 */ 1709 */
1700 if (!hctx->nr_ctx) { 1710 if (!hctx->nr_ctx) {
1701 struct blk_mq_tag_set *set = q->tag_set; 1711 struct blk_mq_tag_set *set = q->tag_set;
1702 1712
1703 if (set->tags[i]) { 1713 if (set->tags[i]) {
1704 blk_mq_free_rq_map(set, set->tags[i], i); 1714 blk_mq_free_rq_map(set, set->tags[i], i);
1705 set->tags[i] = NULL; 1715 set->tags[i] = NULL;
1706 hctx->tags = NULL; 1716 hctx->tags = NULL;
1707 } 1717 }
1708 continue; 1718 continue;
1709 } 1719 }
1710 1720
1711 /* 1721 /*
1712 * Initialize batch round-robin counts 1722 * Initialize batch round-robin counts
1713 */ 1723 */
1714 hctx->next_cpu = cpumask_first(hctx->cpumask); 1724 hctx->next_cpu = cpumask_first(hctx->cpumask);
1715 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1725 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
1716 } 1726 }
1717 } 1727 }
1718 1728
1719 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) 1729 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set)
1720 { 1730 {
1721 struct blk_mq_hw_ctx *hctx; 1731 struct blk_mq_hw_ctx *hctx;
1722 struct request_queue *q; 1732 struct request_queue *q;
1723 bool shared; 1733 bool shared;
1724 int i; 1734 int i;
1725 1735
1726 if (set->tag_list.next == set->tag_list.prev) 1736 if (set->tag_list.next == set->tag_list.prev)
1727 shared = false; 1737 shared = false;
1728 else 1738 else
1729 shared = true; 1739 shared = true;
1730 1740
1731 list_for_each_entry(q, &set->tag_list, tag_set_list) { 1741 list_for_each_entry(q, &set->tag_list, tag_set_list) {
1732 blk_mq_freeze_queue(q); 1742 blk_mq_freeze_queue(q);
1733 1743
1734 queue_for_each_hw_ctx(q, hctx, i) { 1744 queue_for_each_hw_ctx(q, hctx, i) {
1735 if (shared) 1745 if (shared)
1736 hctx->flags |= BLK_MQ_F_TAG_SHARED; 1746 hctx->flags |= BLK_MQ_F_TAG_SHARED;
1737 else 1747 else
1738 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1748 hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
1739 } 1749 }
1740 blk_mq_unfreeze_queue(q); 1750 blk_mq_unfreeze_queue(q);
1741 } 1751 }
1742 } 1752 }
1743 1753
1744 static void blk_mq_del_queue_tag_set(struct request_queue *q) 1754 static void blk_mq_del_queue_tag_set(struct request_queue *q)
1745 { 1755 {
1746 struct blk_mq_tag_set *set = q->tag_set; 1756 struct blk_mq_tag_set *set = q->tag_set;
1747 1757
1748 mutex_lock(&set->tag_list_lock); 1758 mutex_lock(&set->tag_list_lock);
1749 list_del_init(&q->tag_set_list); 1759 list_del_init(&q->tag_set_list);
1750 blk_mq_update_tag_set_depth(set); 1760 blk_mq_update_tag_set_depth(set);
1751 mutex_unlock(&set->tag_list_lock); 1761 mutex_unlock(&set->tag_list_lock);
1752 } 1762 }
1753 1763
1754 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1764 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
1755 struct request_queue *q) 1765 struct request_queue *q)
1756 { 1766 {
1757 q->tag_set = set; 1767 q->tag_set = set;
1758 1768
1759 mutex_lock(&set->tag_list_lock); 1769 mutex_lock(&set->tag_list_lock);
1760 list_add_tail(&q->tag_set_list, &set->tag_list); 1770 list_add_tail(&q->tag_set_list, &set->tag_list);
1761 blk_mq_update_tag_set_depth(set); 1771 blk_mq_update_tag_set_depth(set);
1762 mutex_unlock(&set->tag_list_lock); 1772 mutex_unlock(&set->tag_list_lock);
1763 } 1773 }
1764 1774
1765 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1775 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
1766 { 1776 {
1767 struct blk_mq_hw_ctx **hctxs; 1777 struct blk_mq_hw_ctx **hctxs;
1768 struct blk_mq_ctx __percpu *ctx; 1778 struct blk_mq_ctx __percpu *ctx;
1769 struct request_queue *q; 1779 struct request_queue *q;
1770 unsigned int *map; 1780 unsigned int *map;
1771 int i; 1781 int i;
1772 1782
1773 ctx = alloc_percpu(struct blk_mq_ctx); 1783 ctx = alloc_percpu(struct blk_mq_ctx);
1774 if (!ctx) 1784 if (!ctx)
1775 return ERR_PTR(-ENOMEM); 1785 return ERR_PTR(-ENOMEM);
1776 1786
1777 /* 1787 /*
1778 * If a crashdump is active, then we are potentially in a very 1788 * If a crashdump is active, then we are potentially in a very
1779 * memory constrained environment. Limit us to 1 queue and 1789 * memory constrained environment. Limit us to 1 queue and
1780 * 64 tags to prevent using too much memory. 1790 * 64 tags to prevent using too much memory.
1781 */ 1791 */
1782 if (is_kdump_kernel()) { 1792 if (is_kdump_kernel()) {
1783 set->nr_hw_queues = 1; 1793 set->nr_hw_queues = 1;
1784 set->queue_depth = min(64U, set->queue_depth); 1794 set->queue_depth = min(64U, set->queue_depth);
1785 } 1795 }
1786 1796
1787 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1797 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
1788 set->numa_node); 1798 set->numa_node);
1789 1799
1790 if (!hctxs) 1800 if (!hctxs)
1791 goto err_percpu; 1801 goto err_percpu;
1792 1802
1793 map = blk_mq_make_queue_map(set); 1803 map = blk_mq_make_queue_map(set);
1794 if (!map) 1804 if (!map)
1795 goto err_map; 1805 goto err_map;
1796 1806
1797 for (i = 0; i < set->nr_hw_queues; i++) { 1807 for (i = 0; i < set->nr_hw_queues; i++) {
1798 int node = blk_mq_hw_queue_to_node(map, i); 1808 int node = blk_mq_hw_queue_to_node(map, i);
1799 1809
1800 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), 1810 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
1801 GFP_KERNEL, node); 1811 GFP_KERNEL, node);
1802 if (!hctxs[i]) 1812 if (!hctxs[i])
1803 goto err_hctxs; 1813 goto err_hctxs;
1804 1814
1805 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 1815 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
1806 node)) 1816 node))
1807 goto err_hctxs; 1817 goto err_hctxs;
1808 1818
1809 atomic_set(&hctxs[i]->nr_active, 0); 1819 atomic_set(&hctxs[i]->nr_active, 0);
1810 hctxs[i]->numa_node = node; 1820 hctxs[i]->numa_node = node;
1811 hctxs[i]->queue_num = i; 1821 hctxs[i]->queue_num = i;
1812 } 1822 }
1813 1823
1814 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 1824 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
1815 if (!q) 1825 if (!q)
1816 goto err_hctxs; 1826 goto err_hctxs;
1817 1827
1818 /* 1828 /*
1819 * Init percpu_ref in atomic mode so that it's faster to shutdown. 1829 * Init percpu_ref in atomic mode so that it's faster to shutdown.
1820 * See blk_register_queue() for details. 1830 * See blk_register_queue() for details.
1821 */ 1831 */
1822 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, 1832 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release,
1823 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 1833 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
1824 goto err_map; 1834 goto err_map;
1825 1835
1826 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1836 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
1827 blk_queue_rq_timeout(q, 30000); 1837 blk_queue_rq_timeout(q, 30000);
1828 1838
1829 q->nr_queues = nr_cpu_ids; 1839 q->nr_queues = nr_cpu_ids;
1830 q->nr_hw_queues = set->nr_hw_queues; 1840 q->nr_hw_queues = set->nr_hw_queues;
1831 q->mq_map = map; 1841 q->mq_map = map;
1832 1842
1833 q->queue_ctx = ctx; 1843 q->queue_ctx = ctx;
1834 q->queue_hw_ctx = hctxs; 1844 q->queue_hw_ctx = hctxs;
1835 1845
1836 q->mq_ops = set->ops; 1846 q->mq_ops = set->ops;
1837 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1847 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
1838 1848
1839 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 1849 if (!(set->flags & BLK_MQ_F_SG_MERGE))
1840 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 1850 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;
1841 1851
1842 q->sg_reserved_size = INT_MAX; 1852 q->sg_reserved_size = INT_MAX;
1843 1853
1844 INIT_WORK(&q->requeue_work, blk_mq_requeue_work); 1854 INIT_WORK(&q->requeue_work, blk_mq_requeue_work);
1845 INIT_LIST_HEAD(&q->requeue_list); 1855 INIT_LIST_HEAD(&q->requeue_list);
1846 spin_lock_init(&q->requeue_lock); 1856 spin_lock_init(&q->requeue_lock);
1847 1857
1848 if (q->nr_hw_queues > 1) 1858 if (q->nr_hw_queues > 1)
1849 blk_queue_make_request(q, blk_mq_make_request); 1859 blk_queue_make_request(q, blk_mq_make_request);
1850 else 1860 else
1851 blk_queue_make_request(q, blk_sq_make_request); 1861 blk_queue_make_request(q, blk_sq_make_request);
1852 1862
1853 if (set->timeout) 1863 if (set->timeout)
1854 blk_queue_rq_timeout(q, set->timeout); 1864 blk_queue_rq_timeout(q, set->timeout);
1855 1865
1856 /* 1866 /*
1857 * Do this after blk_queue_make_request() overrides it... 1867 * Do this after blk_queue_make_request() overrides it...
1858 */ 1868 */
1859 q->nr_requests = set->queue_depth; 1869 q->nr_requests = set->queue_depth;
1860 1870
1861 if (set->ops->complete) 1871 if (set->ops->complete)
1862 blk_queue_softirq_done(q, set->ops->complete); 1872 blk_queue_softirq_done(q, set->ops->complete);
1863 1873
1864 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 1874 blk_mq_init_cpu_queues(q, set->nr_hw_queues);
1865 1875
1866 if (blk_mq_init_hw_queues(q, set)) 1876 if (blk_mq_init_hw_queues(q, set))
1867 goto err_hw; 1877 goto err_hw;
1868 1878
1869 mutex_lock(&all_q_mutex); 1879 mutex_lock(&all_q_mutex);
1870 list_add_tail(&q->all_q_node, &all_q_list); 1880 list_add_tail(&q->all_q_node, &all_q_list);
1871 mutex_unlock(&all_q_mutex); 1881 mutex_unlock(&all_q_mutex);
1872 1882
1873 blk_mq_add_queue_tag_set(set, q); 1883 blk_mq_add_queue_tag_set(set, q);
1874 1884
1875 blk_mq_map_swqueue(q); 1885 blk_mq_map_swqueue(q);
1876 1886
1877 return q; 1887 return q;
1878 1888
1879 err_hw: 1889 err_hw:
1880 blk_cleanup_queue(q); 1890 blk_cleanup_queue(q);
1881 err_hctxs: 1891 err_hctxs:
1882 kfree(map); 1892 kfree(map);
1883 for (i = 0; i < set->nr_hw_queues; i++) { 1893 for (i = 0; i < set->nr_hw_queues; i++) {
1884 if (!hctxs[i]) 1894 if (!hctxs[i])
1885 break; 1895 break;
1886 free_cpumask_var(hctxs[i]->cpumask); 1896 free_cpumask_var(hctxs[i]->cpumask);
1887 kfree(hctxs[i]); 1897 kfree(hctxs[i]);
1888 } 1898 }
1889 err_map: 1899 err_map:
1890 kfree(hctxs); 1900 kfree(hctxs);
1891 err_percpu: 1901 err_percpu:
1892 free_percpu(ctx); 1902 free_percpu(ctx);
1893 return ERR_PTR(-ENOMEM); 1903 return ERR_PTR(-ENOMEM);
1894 } 1904 }
1895 EXPORT_SYMBOL(blk_mq_init_queue); 1905 EXPORT_SYMBOL(blk_mq_init_queue);
1896 1906
1897 void blk_mq_free_queue(struct request_queue *q) 1907 void blk_mq_free_queue(struct request_queue *q)
1898 { 1908 {
1899 struct blk_mq_tag_set *set = q->tag_set; 1909 struct blk_mq_tag_set *set = q->tag_set;
1900 1910
1901 blk_mq_del_queue_tag_set(q); 1911 blk_mq_del_queue_tag_set(q);
1902 1912
1903 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 1913 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
1904 blk_mq_free_hw_queues(q, set); 1914 blk_mq_free_hw_queues(q, set);
1905 1915
1906 percpu_ref_exit(&q->mq_usage_counter); 1916 percpu_ref_exit(&q->mq_usage_counter);
1907 1917
1908 free_percpu(q->queue_ctx); 1918 free_percpu(q->queue_ctx);
1909 kfree(q->queue_hw_ctx); 1919 kfree(q->queue_hw_ctx);
1910 kfree(q->mq_map); 1920 kfree(q->mq_map);
1911 1921
1912 q->queue_ctx = NULL; 1922 q->queue_ctx = NULL;
1913 q->queue_hw_ctx = NULL; 1923 q->queue_hw_ctx = NULL;
1914 q->mq_map = NULL; 1924 q->mq_map = NULL;
1915 1925
1916 mutex_lock(&all_q_mutex); 1926 mutex_lock(&all_q_mutex);
1917 list_del_init(&q->all_q_node); 1927 list_del_init(&q->all_q_node);
1918 mutex_unlock(&all_q_mutex); 1928 mutex_unlock(&all_q_mutex);
1919 } 1929 }
1920 1930
1921 /* Basically redo blk_mq_init_queue with queue frozen */ 1931 /* Basically redo blk_mq_init_queue with queue frozen */
1922 static void blk_mq_queue_reinit(struct request_queue *q) 1932 static void blk_mq_queue_reinit(struct request_queue *q)
1923 { 1933 {
1924 blk_mq_freeze_queue(q); 1934 WARN_ON_ONCE(!q->mq_freeze_depth);
1925 1935
1926 blk_mq_sysfs_unregister(q); 1936 blk_mq_sysfs_unregister(q);
1927 1937
1928 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 1938 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);
1929 1939
1930 /* 1940 /*
1931 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 1941 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
1932 * we should change hctx numa_node according to new topology (this 1942 * we should change hctx numa_node according to new topology (this
1933 * involves freeing and re-allocating memory; is it worth doing?) 1943 * involves freeing and re-allocating memory; is it worth doing?)
1934 */ 1944 */
1935 1945
1936 blk_mq_map_swqueue(q); 1946 blk_mq_map_swqueue(q);
1937 1947
1938 blk_mq_sysfs_register(q); 1948 blk_mq_sysfs_register(q);
1939
1940 blk_mq_unfreeze_queue(q);
1941 } 1949 }
1942 1950
1943 static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 1951 static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
1944 unsigned long action, void *hcpu) 1952 unsigned long action, void *hcpu)
1945 { 1953 {
1946 struct request_queue *q; 1954 struct request_queue *q;
1947 1955
1948 /* 1956 /*
1949 * Before new mappings are established, a hot-added CPU might already 1957 * Before new mappings are established, a hot-added CPU might already
1950 * start handling requests. This doesn't break anything, as we map 1958 * start handling requests. This doesn't break anything, as we map
1951 * offline CPUs to the first hardware queue. We will re-init the queue 1959 * offline CPUs to the first hardware queue. We will re-init the queue
1952 * below to get optimal settings. 1960 * below to get optimal settings.
1953 */ 1961 */
1954 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 1962 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
1955 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 1963 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
1956 return NOTIFY_OK; 1964 return NOTIFY_OK;
1957 1965
1958 mutex_lock(&all_q_mutex); 1966 mutex_lock(&all_q_mutex);
1967
1968 /*
1969 * We need to freeze and reinit all existing queues. Freezing
1970 * involves synchronous wait for an RCU grace period and doing it
1971 * one by one may take a long time. Start freezing all queues in
1972 * one swoop and then wait for the completions so that freezing can
1973 * take place in parallel.
1974 */
1959 list_for_each_entry(q, &all_q_list, all_q_node) 1975 list_for_each_entry(q, &all_q_list, all_q_node)
1976 blk_mq_freeze_queue_start(q);
1977 list_for_each_entry(q, &all_q_list, all_q_node)
1978 blk_mq_freeze_queue_wait(q);
1979
1980 list_for_each_entry(q, &all_q_list, all_q_node)
1960 blk_mq_queue_reinit(q); 1981 blk_mq_queue_reinit(q);
1982
1983 list_for_each_entry(q, &all_q_list, all_q_node)
1984 blk_mq_unfreeze_queue(q);
1985
1961 mutex_unlock(&all_q_mutex); 1986 mutex_unlock(&all_q_mutex);
1962 return NOTIFY_OK; 1987 return NOTIFY_OK;
1963 } 1988 }
1964 1989
1965 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 1990 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
1966 { 1991 {
1967 int i; 1992 int i;
1968 1993
1969 for (i = 0; i < set->nr_hw_queues; i++) { 1994 for (i = 0; i < set->nr_hw_queues; i++) {
1970 set->tags[i] = blk_mq_init_rq_map(set, i); 1995 set->tags[i] = blk_mq_init_rq_map(set, i);
1971 if (!set->tags[i]) 1996 if (!set->tags[i])
1972 goto out_unwind; 1997 goto out_unwind;
1973 } 1998 }
1974 1999
1975 return 0; 2000 return 0;
1976 2001
1977 out_unwind: 2002 out_unwind:
1978 while (--i >= 0) 2003 while (--i >= 0)
1979 blk_mq_free_rq_map(set, set->tags[i], i); 2004 blk_mq_free_rq_map(set, set->tags[i], i);
1980 2005
1981 return -ENOMEM; 2006 return -ENOMEM;
1982 } 2007 }
1983 2008
1984 /* 2009 /*
1985 * Allocate the request maps associated with this tag_set. Note that this 2010 * Allocate the request maps associated with this tag_set. Note that this
1986 * may reduce the depth asked for, if memory is tight. set->queue_depth 2011 * may reduce the depth asked for, if memory is tight. set->queue_depth
1987 * will be updated to reflect the allocated depth. 2012 * will be updated to reflect the allocated depth.
1988 */ 2013 */
1989 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2014 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
1990 { 2015 {
1991 unsigned int depth; 2016 unsigned int depth;
1992 int err; 2017 int err;
1993 2018
1994 depth = set->queue_depth; 2019 depth = set->queue_depth;
1995 do { 2020 do {
1996 err = __blk_mq_alloc_rq_maps(set); 2021 err = __blk_mq_alloc_rq_maps(set);
1997 if (!err) 2022 if (!err)
1998 break; 2023 break;
1999 2024
2000 set->queue_depth >>= 1; 2025 set->queue_depth >>= 1;
2001 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2026 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
2002 err = -ENOMEM; 2027 err = -ENOMEM;
2003 break; 2028 break;
2004 } 2029 }
2005 } while (set->queue_depth); 2030 } while (set->queue_depth);
2006 2031
2007 if (!set->queue_depth || err) { 2032 if (!set->queue_depth || err) {
2008 pr_err("blk-mq: failed to allocate request map\n"); 2033 pr_err("blk-mq: failed to allocate request map\n");
2009 return -ENOMEM; 2034 return -ENOMEM;
2010 } 2035 }
2011 2036
2012 if (depth != set->queue_depth) 2037 if (depth != set->queue_depth)
2013 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2038 pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
2014 depth, set->queue_depth); 2039 depth, set->queue_depth);
2015 2040
2016 return 0; 2041 return 0;
2017 } 2042 }
2018 2043
2019 /* 2044 /*
2020 * Alloc a tag set to be associated with one or more request queues. 2045 * Alloc a tag set to be associated with one or more request queues.
2021 * May fail with EINVAL for various error conditions. May adjust the 2046 * May fail with EINVAL for various error conditions. May adjust the
2022 * requested depth down, if it is too large. In that case, the set 2047 * requested depth down, if it is too large. In that case, the set
2023 * value will be stored in set->queue_depth. 2048 * value will be stored in set->queue_depth.
2024 */ 2049 */
2025 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2050 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
2026 { 2051 {
2027 if (!set->nr_hw_queues) 2052 if (!set->nr_hw_queues)
2028 return -EINVAL; 2053 return -EINVAL;
2029 if (!set->queue_depth) 2054 if (!set->queue_depth)
2030 return -EINVAL; 2055 return -EINVAL;
2031 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2056 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
2032 return -EINVAL; 2057 return -EINVAL;
2033 2058
2034 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) 2059 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue)
2035 return -EINVAL; 2060 return -EINVAL;
2036 2061
2037 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2062 if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
2038 pr_info("blk-mq: reduced tag depth to %u\n", 2063 pr_info("blk-mq: reduced tag depth to %u\n",
2039 BLK_MQ_MAX_DEPTH); 2064 BLK_MQ_MAX_DEPTH);
2040 set->queue_depth = BLK_MQ_MAX_DEPTH; 2065 set->queue_depth = BLK_MQ_MAX_DEPTH;
2041 } 2066 }
2042 2067
2043 set->tags = kmalloc_node(set->nr_hw_queues * 2068 set->tags = kmalloc_node(set->nr_hw_queues *
2044 sizeof(struct blk_mq_tags *), 2069 sizeof(struct blk_mq_tags *),
2045 GFP_KERNEL, set->numa_node); 2070 GFP_KERNEL, set->numa_node);
2046 if (!set->tags) 2071 if (!set->tags)
2047 return -ENOMEM; 2072 return -ENOMEM;
2048 2073
2049 if (blk_mq_alloc_rq_maps(set)) 2074 if (blk_mq_alloc_rq_maps(set))
2050 goto enomem; 2075 goto enomem;
2051 2076
2052 mutex_init(&set->tag_list_lock); 2077 mutex_init(&set->tag_list_lock);
2053 INIT_LIST_HEAD(&set->tag_list); 2078 INIT_LIST_HEAD(&set->tag_list);
2054 2079
2055 return 0; 2080 return 0;
2056 enomem: 2081 enomem:
2057 kfree(set->tags); 2082 kfree(set->tags);
2058 set->tags = NULL; 2083 set->tags = NULL;
2059 return -ENOMEM; 2084 return -ENOMEM;
2060 } 2085 }
2061 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 2086 EXPORT_SYMBOL(blk_mq_alloc_tag_set);
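
For context, a minimal driver-side sketch of the contract documented above: fill in a blk_mq_tag_set, let blk_mq_alloc_tag_set() possibly shrink the requested depth, then build the queue with blk_mq_init_queue(). The example_* names and the extern ops table are hypothetical; only the blk_mq_tag_set fields and the blk_mq_*() calls are taken from this file and its header.

#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/string.h>

/* Hypothetical: defined elsewhere with the driver's queue_rq/map_queue. */
extern struct blk_mq_ops example_mq_ops;

struct example_dev {
	struct blk_mq_tag_set tag_set;
	struct request_queue *queue;
};

static int example_setup_queue(struct example_dev *dev)
{
	int ret;

	memset(&dev->tag_set, 0, sizeof(dev->tag_set));
	dev->tag_set.ops = &example_mq_ops;
	dev->tag_set.nr_hw_queues = 1;
	dev->tag_set.queue_depth = 128;	/* may be reduced, see above */
	dev->tag_set.numa_node = NUMA_NO_NODE;
	dev->tag_set.cmd_size = 0;	/* optional per-request driver payload */
	dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;

	ret = blk_mq_alloc_tag_set(&dev->tag_set);
	if (ret)
		return ret;

	dev->queue = blk_mq_init_queue(&dev->tag_set);
	if (IS_ERR(dev->queue)) {
		blk_mq_free_tag_set(&dev->tag_set);
		return PTR_ERR(dev->queue);
	}
	return 0;
}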
2062 2087
2063 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 2088 void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
2064 { 2089 {
2065 int i; 2090 int i;
2066 2091
2067 for (i = 0; i < set->nr_hw_queues; i++) { 2092 for (i = 0; i < set->nr_hw_queues; i++) {
2068 if (set->tags[i]) 2093 if (set->tags[i])
2069 blk_mq_free_rq_map(set, set->tags[i], i); 2094 blk_mq_free_rq_map(set, set->tags[i], i);
2070 } 2095 }
2071 2096
2072 kfree(set->tags); 2097 kfree(set->tags);
2073 set->tags = NULL; 2098 set->tags = NULL;
2074 } 2099 }
2075 EXPORT_SYMBOL(blk_mq_free_tag_set); 2100 EXPORT_SYMBOL(blk_mq_free_tag_set);
2076 2101
2077 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 2102 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
2078 { 2103 {
2079 struct blk_mq_tag_set *set = q->tag_set; 2104 struct blk_mq_tag_set *set = q->tag_set;
2080 struct blk_mq_hw_ctx *hctx; 2105 struct blk_mq_hw_ctx *hctx;
2081 int i, ret; 2106 int i, ret;
2082 2107
2083 if (!set || nr > set->queue_depth) 2108 if (!set || nr > set->queue_depth)
2084 return -EINVAL; 2109 return -EINVAL;
2085 2110
2086 ret = 0; 2111 ret = 0;
2087 queue_for_each_hw_ctx(q, hctx, i) { 2112 queue_for_each_hw_ctx(q, hctx, i) {
2088 ret = blk_mq_tag_update_depth(hctx->tags, nr); 2113 ret = blk_mq_tag_update_depth(hctx->tags, nr);
2089 if (ret) 2114 if (ret)
2090 break; 2115 break;
2091 } 2116 }
2092 2117
2093 if (!ret) 2118 if (!ret)
2094 q->nr_requests = nr; 2119 q->nr_requests = nr;
2095 2120
2096 return ret; 2121 return ret;
2097 } 2122 }
2098 2123
2099 void blk_mq_disable_hotplug(void) 2124 void blk_mq_disable_hotplug(void)
2100 { 2125 {
2101 mutex_lock(&all_q_mutex); 2126 mutex_lock(&all_q_mutex);
2102 } 2127 }
2103 2128
2104 void blk_mq_enable_hotplug(void) 2129 void blk_mq_enable_hotplug(void)
2105 { 2130 {
2106 mutex_unlock(&all_q_mutex); 2131 mutex_unlock(&all_q_mutex);
2107 } 2132 }
2108 2133
2109 static int __init blk_mq_init(void) 2134 static int __init blk_mq_init(void)
2110 { 2135 {
2111 blk_mq_cpu_init(); 2136 blk_mq_cpu_init();
2112 2137
1 /* 1 /*
2 * fs/ioprio.c 2 * fs/ioprio.c
3 * 3 *
4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk> 4 * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
5 * 5 *
6 * Helper functions for setting/querying io priorities of processes. The 6 * Helper functions for setting/querying io priorities of processes. The
7 * system calls closely mimic getpriority/setpriority; see the man page for 7 * system calls closely mimic getpriority/setpriority; see the man page for
8 * those. The prio argument is a composite of prio class and prio data, where 8 * those. The prio argument is a composite of prio class and prio data, where
9 * the data argument has meaning within that class. The standard scheduling 9 * the data argument has meaning within that class. The standard scheduling
10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7 10 * classes have 8 distinct prio levels, with 0 being the highest prio and 7
11 * being the lowest. 11 * being the lowest.
12 * 12 *
13 * IOW, setting BE scheduling class with prio 2 is done ala: 13 * IOW, setting BE scheduling class with prio 2 is done ala:
14 * 14 *
15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; 15 * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
16 * 16 *
17 * ioprio_set(PRIO_PROCESS, pid, prio); 17 * ioprio_set(PRIO_PROCESS, pid, prio);
18 * 18 *
19 * See also Documentation/block/ioprio.txt 19 * See also Documentation/block/ioprio.txt
20 * 20 *
21 */ 21 */
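
Glibc does not ship an ioprio_set() wrapper, so a userspace caller goes through syscall(2). The sketch below is a compilable variant of the example above; the IOPRIO_* constants are restated locally and their values are assumed to follow include/linux/ioprio.h.

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;

	/* who == 0 means the calling process */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, prio) < 0) {
		perror("ioprio_set");
		return 1;
	}
	printf("ioprio set to 0x%x (BE, level 2)\n", prio);
	return 0;
}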
22 #include <linux/gfp.h> 22 #include <linux/gfp.h>
23 #include <linux/kernel.h> 23 #include <linux/kernel.h>
24 #include <linux/export.h> 24 #include <linux/export.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 #include <linux/blkdev.h> 26 #include <linux/blkdev.h>
27 #include <linux/capability.h> 27 #include <linux/capability.h>
28 #include <linux/syscalls.h> 28 #include <linux/syscalls.h>
29 #include <linux/security.h> 29 #include <linux/security.h>
30 #include <linux/pid_namespace.h> 30 #include <linux/pid_namespace.h>
31 31
32 int set_task_ioprio(struct task_struct *task, int ioprio) 32 int set_task_ioprio(struct task_struct *task, int ioprio)
33 { 33 {
34 int err; 34 int err;
35 struct io_context *ioc; 35 struct io_context *ioc;
36 const struct cred *cred = current_cred(), *tcred; 36 const struct cred *cred = current_cred(), *tcred;
37 37
38 rcu_read_lock(); 38 rcu_read_lock();
39 tcred = __task_cred(task); 39 tcred = __task_cred(task);
40 if (!uid_eq(tcred->uid, cred->euid) && 40 if (!uid_eq(tcred->uid, cred->euid) &&
41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { 41 !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
42 rcu_read_unlock(); 42 rcu_read_unlock();
43 return -EPERM; 43 return -EPERM;
44 } 44 }
45 rcu_read_unlock(); 45 rcu_read_unlock();
46 46
47 err = security_task_setioprio(task, ioprio); 47 err = security_task_setioprio(task, ioprio);
48 if (err) 48 if (err)
49 return err; 49 return err;
50 50
51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); 51 ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
52 if (ioc) { 52 if (ioc) {
53 ioc->ioprio = ioprio; 53 ioc->ioprio = ioprio;
54 put_io_context(ioc); 54 put_io_context(ioc);
55 } 55 }
56 56
57 return err; 57 return err;
58 } 58 }
59 EXPORT_SYMBOL_GPL(set_task_ioprio); 59 EXPORT_SYMBOL_GPL(set_task_ioprio);
60 60
61 SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) 61 SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
62 { 62 {
63 int class = IOPRIO_PRIO_CLASS(ioprio); 63 int class = IOPRIO_PRIO_CLASS(ioprio);
64 int data = IOPRIO_PRIO_DATA(ioprio); 64 int data = IOPRIO_PRIO_DATA(ioprio);
65 struct task_struct *p, *g; 65 struct task_struct *p, *g;
66 struct user_struct *user; 66 struct user_struct *user;
67 struct pid *pgrp; 67 struct pid *pgrp;
68 kuid_t uid; 68 kuid_t uid;
69 int ret; 69 int ret;
70 70
71 switch (class) { 71 switch (class) {
72 case IOPRIO_CLASS_RT: 72 case IOPRIO_CLASS_RT:
73 if (!capable(CAP_SYS_ADMIN)) 73 if (!capable(CAP_SYS_ADMIN))
74 return -EPERM; 74 return -EPERM;
75 /* fall through, rt has prio field too */ 75 /* fall through, rt has prio field too */
76 case IOPRIO_CLASS_BE: 76 case IOPRIO_CLASS_BE:
77 if (data >= IOPRIO_BE_NR || data < 0) 77 if (data >= IOPRIO_BE_NR || data < 0)
78 return -EINVAL; 78 return -EINVAL;
79 79
80 break; 80 break;
81 case IOPRIO_CLASS_IDLE: 81 case IOPRIO_CLASS_IDLE:
82 break; 82 break;
83 case IOPRIO_CLASS_NONE: 83 case IOPRIO_CLASS_NONE:
84 if (data) 84 if (data)
85 return -EINVAL; 85 return -EINVAL;
86 break; 86 break;
87 default: 87 default:
88 return -EINVAL; 88 return -EINVAL;
89 } 89 }
90 90
91 ret = -ESRCH; 91 ret = -ESRCH;
92 rcu_read_lock(); 92 rcu_read_lock();
93 switch (which) { 93 switch (which) {
94 case IOPRIO_WHO_PROCESS: 94 case IOPRIO_WHO_PROCESS:
95 if (!who) 95 if (!who)
96 p = current; 96 p = current;
97 else 97 else
98 p = find_task_by_vpid(who); 98 p = find_task_by_vpid(who);
99 if (p) 99 if (p)
100 ret = set_task_ioprio(p, ioprio); 100 ret = set_task_ioprio(p, ioprio);
101 break; 101 break;
102 case IOPRIO_WHO_PGRP: 102 case IOPRIO_WHO_PGRP:
103 if (!who) 103 if (!who)
104 pgrp = task_pgrp(current); 104 pgrp = task_pgrp(current);
105 else 105 else
106 pgrp = find_vpid(who); 106 pgrp = find_vpid(who);
107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 107 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
108 ret = set_task_ioprio(p, ioprio); 108 ret = set_task_ioprio(p, ioprio);
109 if (ret) 109 if (ret)
110 break; 110 break;
111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 111 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
112 break; 112 break;
113 case IOPRIO_WHO_USER: 113 case IOPRIO_WHO_USER:
114 uid = make_kuid(current_user_ns(), who); 114 uid = make_kuid(current_user_ns(), who);
115 if (!uid_valid(uid)) 115 if (!uid_valid(uid))
116 break; 116 break;
117 if (!who) 117 if (!who)
118 user = current_user(); 118 user = current_user();
119 else 119 else
120 user = find_user(uid); 120 user = find_user(uid);
121 121
122 if (!user) 122 if (!user)
123 break; 123 break;
124 124
125 do_each_thread(g, p) { 125 do_each_thread(g, p) {
126 if (!uid_eq(task_uid(p), uid)) 126 if (!uid_eq(task_uid(p), uid))
127 continue; 127 continue;
128 ret = set_task_ioprio(p, ioprio); 128 ret = set_task_ioprio(p, ioprio);
129 if (ret) 129 if (ret)
130 goto free_uid; 130 goto free_uid;
131 } while_each_thread(g, p); 131 } while_each_thread(g, p);
132 free_uid: 132 free_uid:
133 if (who) 133 if (who)
134 free_uid(user); 134 free_uid(user);
135 break; 135 break;
136 default: 136 default:
137 ret = -EINVAL; 137 ret = -EINVAL;
138 } 138 }
139 139
140 rcu_read_unlock(); 140 rcu_read_unlock();
141 return ret; 141 return ret;
142 } 142 }
143 143
144 static int get_task_ioprio(struct task_struct *p) 144 static int get_task_ioprio(struct task_struct *p)
145 { 145 {
146 int ret; 146 int ret;
147 147
148 ret = security_task_getioprio(p); 148 ret = security_task_getioprio(p);
149 if (ret) 149 if (ret)
150 goto out; 150 goto out;
151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); 151 ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
152 if (p->io_context) 152 if (p->io_context)
153 ret = p->io_context->ioprio; 153 ret = p->io_context->ioprio;
154 out: 154 out:
155 return ret; 155 return ret;
156 } 156 }
157 157
158 int ioprio_best(unsigned short aprio, unsigned short bprio) 158 int ioprio_best(unsigned short aprio, unsigned short bprio)
159 { 159 {
160 unsigned short aclass = IOPRIO_PRIO_CLASS(aprio); 160 unsigned short aclass;
161 unsigned short bclass = IOPRIO_PRIO_CLASS(bprio); 161 unsigned short bclass;
162 162
163 if (aclass == IOPRIO_CLASS_NONE) 163 if (!ioprio_valid(aprio))
164 aclass = IOPRIO_CLASS_BE; 164 aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
165 if (bclass == IOPRIO_CLASS_NONE) 165 if (!ioprio_valid(bprio))
166 bclass = IOPRIO_CLASS_BE; 166 bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
167 167
168 aclass = IOPRIO_PRIO_CLASS(aprio);
169 bclass = IOPRIO_PRIO_CLASS(bprio);
168 if (aclass == bclass) 170 if (aclass == bclass)
169 return min(aprio, bprio); 171 return min(aprio, bprio);
170 if (aclass > bclass) 172 if (aclass > bclass)
171 return bprio; 173 return bprio;
172 else 174 else
173 return aprio; 175 return aprio;
174 } 176 }
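
A worked example of why the normalisation above matters when two merging requests carry different priorities (constants as in include/linux/ioprio.h: IOPRIO_CLASS_SHIFT is 13, IOPRIO_NORM is 4). Take aprio = 0 (class NONE, i.e. never set) and bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 2) = (2 << 13) | 2 = 16386. The old code only patched up aclass, so both classes compared as BE and min(0, 16386) returned 0: the merged request silently lost its BE/2 priority. The new code first rewrites aprio itself to IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM) = 16388, so min(16388, 16386) = 16386 and the explicit BE/2 priority survives the merge.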
175 177
176 SYSCALL_DEFINE2(ioprio_get, int, which, int, who) 178 SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
177 { 179 {
178 struct task_struct *g, *p; 180 struct task_struct *g, *p;
179 struct user_struct *user; 181 struct user_struct *user;
180 struct pid *pgrp; 182 struct pid *pgrp;
181 kuid_t uid; 183 kuid_t uid;
182 int ret = -ESRCH; 184 int ret = -ESRCH;
183 int tmpio; 185 int tmpio;
184 186
185 rcu_read_lock(); 187 rcu_read_lock();
186 switch (which) { 188 switch (which) {
187 case IOPRIO_WHO_PROCESS: 189 case IOPRIO_WHO_PROCESS:
188 if (!who) 190 if (!who)
189 p = current; 191 p = current;
190 else 192 else
191 p = find_task_by_vpid(who); 193 p = find_task_by_vpid(who);
192 if (p) 194 if (p)
193 ret = get_task_ioprio(p); 195 ret = get_task_ioprio(p);
194 break; 196 break;
195 case IOPRIO_WHO_PGRP: 197 case IOPRIO_WHO_PGRP:
196 if (!who) 198 if (!who)
197 pgrp = task_pgrp(current); 199 pgrp = task_pgrp(current);
198 else 200 else
199 pgrp = find_vpid(who); 201 pgrp = find_vpid(who);
200 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { 202 do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
201 tmpio = get_task_ioprio(p); 203 tmpio = get_task_ioprio(p);
202 if (tmpio < 0) 204 if (tmpio < 0)
203 continue; 205 continue;
204 if (ret == -ESRCH) 206 if (ret == -ESRCH)
205 ret = tmpio; 207 ret = tmpio;
206 else 208 else
207 ret = ioprio_best(ret, tmpio); 209 ret = ioprio_best(ret, tmpio);
208 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); 210 } while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
209 break; 211 break;
210 case IOPRIO_WHO_USER: 212 case IOPRIO_WHO_USER:
211 uid = make_kuid(current_user_ns(), who); 213 uid = make_kuid(current_user_ns(), who);
212 if (!who) 214 if (!who)
213 user = current_user(); 215 user = current_user();
214 else 216 else
215 user = find_user(uid); 217 user = find_user(uid);
216 218
217 if (!user) 219 if (!user)
218 break; 220 break;
219 221
220 do_each_thread(g, p) { 222 do_each_thread(g, p) {
221 if (!uid_eq(task_uid(p), user->uid)) 223 if (!uid_eq(task_uid(p), user->uid))
222 continue; 224 continue;
223 tmpio = get_task_ioprio(p); 225 tmpio = get_task_ioprio(p);
224 if (tmpio < 0) 226 if (tmpio < 0)
225 continue; 227 continue;
226 if (ret == -ESRCH) 228 if (ret == -ESRCH)
227 ret = tmpio; 229 ret = tmpio;
228 else 230 else
229 ret = ioprio_best(ret, tmpio); 231 ret = ioprio_best(ret, tmpio);
230 } while_each_thread(g, p); 232 } while_each_thread(g, p);
231 233
232 if (who) 234 if (who)
233 free_uid(user); 235 free_uid(user);
234 break; 236 break;
235 default: 237 default:
236 ret = -EINVAL; 238 ret = -EINVAL;
237 } 239 }
238 240
239 rcu_read_unlock(); 241 rcu_read_unlock();
240 return ret; 242 return ret;
241 } 243 }
242 244
1 /* 1 /*
2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 2 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
3 * 3 *
4 * This program is free software; you can redistribute it and/or modify 4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License version 2 as 5 * it under the terms of the GNU General Public License version 2 as
6 * published by the Free Software Foundation. 6 * published by the Free Software Foundation.
7 * 7 *
8 * This program is distributed in the hope that it will be useful, 8 * This program is distributed in the hope that it will be useful,
9 * but WITHOUT ANY WARRANTY; without even the implied warranty of 9 * but WITHOUT ANY WARRANTY; without even the implied warranty of
10 * 10 *
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details. 12 * GNU General Public License for more details.
13 * 13 *
14 * You should have received a copy of the GNU General Public License 14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
17 * 17 *
18 */ 18 */
19 #include <linux/kernel.h> 19 #include <linux/kernel.h>
20 #include <linux/errno.h> 20 #include <linux/errno.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/module.h> 22 #include <linux/module.h>
23 #include <linux/blkdev.h> 23 #include <linux/blkdev.h>
24 #include <linux/capability.h> 24 #include <linux/capability.h>
25 #include <linux/completion.h> 25 #include <linux/completion.h>
26 #include <linux/cdrom.h> 26 #include <linux/cdrom.h>
27 #include <linux/ratelimit.h> 27 #include <linux/ratelimit.h>
28 #include <linux/slab.h> 28 #include <linux/slab.h>
29 #include <linux/times.h> 29 #include <linux/times.h>
30 #include <linux/uio.h> 30 #include <linux/uio.h>
31 #include <asm/uaccess.h> 31 #include <asm/uaccess.h>
32 32
33 #include <scsi/scsi.h> 33 #include <scsi/scsi.h>
34 #include <scsi/scsi_ioctl.h> 34 #include <scsi/scsi_ioctl.h>
35 #include <scsi/scsi_cmnd.h> 35 #include <scsi/scsi_cmnd.h>
36 36
37 struct blk_cmd_filter { 37 struct blk_cmd_filter {
38 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; 38 unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
39 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; 39 unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
40 }; 40 };
41 41
42 static struct blk_cmd_filter blk_default_cmd_filter; 42 static struct blk_cmd_filter blk_default_cmd_filter;
43 43
44 /* Command group 3 is reserved and should never be used. */ 44 /* Command group 3 is reserved and should never be used. */
45 const unsigned char scsi_command_size_tbl[8] = 45 const unsigned char scsi_command_size_tbl[8] =
46 { 46 {
47 6, 10, 10, 12, 47 6, 10, 10, 12,
48 16, 12, 10, 10 48 16, 12, 10, 10
49 }; 49 };
50 EXPORT_SYMBOL(scsi_command_size_tbl); 50 EXPORT_SYMBOL(scsi_command_size_tbl);
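
The table above is indexed by the command group, i.e. the top three bits of the opcode; the COMMAND_SIZE() macro in <scsi/scsi.h> expands to scsi_command_size_tbl[((opcode) >> 5) & 7]. As a worked example, READ_10 (0x28) sits in group 1, so COMMAND_SIZE(READ_10) is 10 bytes, while READ_16 (0x88) sits in group 4 and yields 16.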
51 51
52 #include <scsi/sg.h> 52 #include <scsi/sg.h>
53 53
54 static int sg_get_version(int __user *p) 54 static int sg_get_version(int __user *p)
55 { 55 {
56 static const int sg_version_num = 30527; 56 static const int sg_version_num = 30527;
57 return put_user(sg_version_num, p); 57 return put_user(sg_version_num, p);
58 } 58 }
59 59
60 static int scsi_get_idlun(struct request_queue *q, int __user *p) 60 static int scsi_get_idlun(struct request_queue *q, int __user *p)
61 { 61 {
62 return put_user(0, p); 62 return put_user(0, p);
63 } 63 }
64 64
65 static int scsi_get_bus(struct request_queue *q, int __user *p) 65 static int scsi_get_bus(struct request_queue *q, int __user *p)
66 { 66 {
67 return put_user(0, p); 67 return put_user(0, p);
68 } 68 }
69 69
70 static int sg_get_timeout(struct request_queue *q) 70 static int sg_get_timeout(struct request_queue *q)
71 { 71 {
72 return jiffies_to_clock_t(q->sg_timeout); 72 return jiffies_to_clock_t(q->sg_timeout);
73 } 73 }
74 74
75 static int sg_set_timeout(struct request_queue *q, int __user *p) 75 static int sg_set_timeout(struct request_queue *q, int __user *p)
76 { 76 {
77 int timeout, err = get_user(timeout, p); 77 int timeout, err = get_user(timeout, p);
78 78
79 if (!err) 79 if (!err)
80 q->sg_timeout = clock_t_to_jiffies(timeout); 80 q->sg_timeout = clock_t_to_jiffies(timeout);
81 81
82 return err; 82 return err;
83 } 83 }
84 84
85 static int max_sectors_bytes(struct request_queue *q) 85 static int max_sectors_bytes(struct request_queue *q)
86 { 86 {
87 unsigned int max_sectors = queue_max_sectors(q); 87 unsigned int max_sectors = queue_max_sectors(q);
88 88
89 max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); 89 max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
90 90
91 return max_sectors << 9; 91 return max_sectors << 9;
92 } 92 }
93 93
94 static int sg_get_reserved_size(struct request_queue *q, int __user *p) 94 static int sg_get_reserved_size(struct request_queue *q, int __user *p)
95 { 95 {
96 int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); 96 int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
97 97
98 return put_user(val, p); 98 return put_user(val, p);
99 } 99 }
100 100
101 static int sg_set_reserved_size(struct request_queue *q, int __user *p) 101 static int sg_set_reserved_size(struct request_queue *q, int __user *p)
102 { 102 {
103 int size, err = get_user(size, p); 103 int size, err = get_user(size, p);
104 104
105 if (err) 105 if (err)
106 return err; 106 return err;
107 107
108 if (size < 0) 108 if (size < 0)
109 return -EINVAL; 109 return -EINVAL;
110 110
111 q->sg_reserved_size = min(size, max_sectors_bytes(q)); 111 q->sg_reserved_size = min(size, max_sectors_bytes(q));
112 return 0; 112 return 0;
113 } 113 }
114 114
115 /* 115 /*
116 * will always return that we are ATAPI even for a real SCSI drive; I'm not 116 * will always return that we are ATAPI even for a real SCSI drive; I'm not
117 * so sure this is worth doing anything about (why would you care??) 117 * so sure this is worth doing anything about (why would you care??)
118 */ 118 */
119 static int sg_emulated_host(struct request_queue *q, int __user *p) 119 static int sg_emulated_host(struct request_queue *q, int __user *p)
120 { 120 {
121 return put_user(1, p); 121 return put_user(1, p);
122 } 122 }
123 123
124 static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) 124 static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
125 { 125 {
126 /* Basic read-only commands */ 126 /* Basic read-only commands */
127 __set_bit(TEST_UNIT_READY, filter->read_ok); 127 __set_bit(TEST_UNIT_READY, filter->read_ok);
128 __set_bit(REQUEST_SENSE, filter->read_ok); 128 __set_bit(REQUEST_SENSE, filter->read_ok);
129 __set_bit(READ_6, filter->read_ok); 129 __set_bit(READ_6, filter->read_ok);
130 __set_bit(READ_10, filter->read_ok); 130 __set_bit(READ_10, filter->read_ok);
131 __set_bit(READ_12, filter->read_ok); 131 __set_bit(READ_12, filter->read_ok);
132 __set_bit(READ_16, filter->read_ok); 132 __set_bit(READ_16, filter->read_ok);
133 __set_bit(READ_BUFFER, filter->read_ok); 133 __set_bit(READ_BUFFER, filter->read_ok);
134 __set_bit(READ_DEFECT_DATA, filter->read_ok); 134 __set_bit(READ_DEFECT_DATA, filter->read_ok);
135 __set_bit(READ_CAPACITY, filter->read_ok); 135 __set_bit(READ_CAPACITY, filter->read_ok);
136 __set_bit(READ_LONG, filter->read_ok); 136 __set_bit(READ_LONG, filter->read_ok);
137 __set_bit(INQUIRY, filter->read_ok); 137 __set_bit(INQUIRY, filter->read_ok);
138 __set_bit(MODE_SENSE, filter->read_ok); 138 __set_bit(MODE_SENSE, filter->read_ok);
139 __set_bit(MODE_SENSE_10, filter->read_ok); 139 __set_bit(MODE_SENSE_10, filter->read_ok);
140 __set_bit(LOG_SENSE, filter->read_ok); 140 __set_bit(LOG_SENSE, filter->read_ok);
141 __set_bit(START_STOP, filter->read_ok); 141 __set_bit(START_STOP, filter->read_ok);
142 __set_bit(GPCMD_VERIFY_10, filter->read_ok); 142 __set_bit(GPCMD_VERIFY_10, filter->read_ok);
143 __set_bit(VERIFY_16, filter->read_ok); 143 __set_bit(VERIFY_16, filter->read_ok);
144 __set_bit(REPORT_LUNS, filter->read_ok); 144 __set_bit(REPORT_LUNS, filter->read_ok);
145 __set_bit(SERVICE_ACTION_IN, filter->read_ok); 145 __set_bit(SERVICE_ACTION_IN, filter->read_ok);
146 __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); 146 __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
147 __set_bit(MAINTENANCE_IN, filter->read_ok); 147 __set_bit(MAINTENANCE_IN, filter->read_ok);
148 __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); 148 __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
149 149
150 /* Audio CD commands */ 150 /* Audio CD commands */
151 __set_bit(GPCMD_PLAY_CD, filter->read_ok); 151 __set_bit(GPCMD_PLAY_CD, filter->read_ok);
152 __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); 152 __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
153 __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); 153 __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
154 __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); 154 __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
155 __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); 155 __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
156 156
157 /* CD/DVD data reading */ 157 /* CD/DVD data reading */
158 __set_bit(GPCMD_READ_CD, filter->read_ok); 158 __set_bit(GPCMD_READ_CD, filter->read_ok);
159 __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); 159 __set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
160 __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); 160 __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
161 __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); 161 __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
162 __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); 162 __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
163 __set_bit(GPCMD_READ_HEADER, filter->read_ok); 163 __set_bit(GPCMD_READ_HEADER, filter->read_ok);
164 __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); 164 __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
165 __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); 165 __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
166 __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); 166 __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
167 __set_bit(GPCMD_REPORT_KEY, filter->read_ok); 167 __set_bit(GPCMD_REPORT_KEY, filter->read_ok);
168 __set_bit(GPCMD_SCAN, filter->read_ok); 168 __set_bit(GPCMD_SCAN, filter->read_ok);
169 __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); 169 __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
170 __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); 170 __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
171 __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); 171 __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
172 __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); 172 __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
173 __set_bit(GPCMD_SEEK, filter->read_ok); 173 __set_bit(GPCMD_SEEK, filter->read_ok);
174 __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); 174 __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
175 175
176 /* Basic writing commands */ 176 /* Basic writing commands */
177 __set_bit(WRITE_6, filter->write_ok); 177 __set_bit(WRITE_6, filter->write_ok);
178 __set_bit(WRITE_10, filter->write_ok); 178 __set_bit(WRITE_10, filter->write_ok);
179 __set_bit(WRITE_VERIFY, filter->write_ok); 179 __set_bit(WRITE_VERIFY, filter->write_ok);
180 __set_bit(WRITE_12, filter->write_ok); 180 __set_bit(WRITE_12, filter->write_ok);
181 __set_bit(WRITE_VERIFY_12, filter->write_ok); 181 __set_bit(WRITE_VERIFY_12, filter->write_ok);
182 __set_bit(WRITE_16, filter->write_ok); 182 __set_bit(WRITE_16, filter->write_ok);
183 __set_bit(WRITE_LONG, filter->write_ok); 183 __set_bit(WRITE_LONG, filter->write_ok);
184 __set_bit(WRITE_LONG_2, filter->write_ok); 184 __set_bit(WRITE_LONG_2, filter->write_ok);
185 __set_bit(ERASE, filter->write_ok); 185 __set_bit(ERASE, filter->write_ok);
186 __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); 186 __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
187 __set_bit(MODE_SELECT, filter->write_ok); 187 __set_bit(MODE_SELECT, filter->write_ok);
188 __set_bit(LOG_SELECT, filter->write_ok); 188 __set_bit(LOG_SELECT, filter->write_ok);
189 __set_bit(GPCMD_BLANK, filter->write_ok); 189 __set_bit(GPCMD_BLANK, filter->write_ok);
190 __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); 190 __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
191 __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); 191 __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
192 __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); 192 __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
193 __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); 193 __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
194 __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); 194 __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
195 __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); 195 __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
196 __set_bit(GPCMD_SEND_EVENT, filter->write_ok); 196 __set_bit(GPCMD_SEND_EVENT, filter->write_ok);
197 __set_bit(GPCMD_SEND_KEY, filter->write_ok); 197 __set_bit(GPCMD_SEND_KEY, filter->write_ok);
198 __set_bit(GPCMD_SEND_OPC, filter->write_ok); 198 __set_bit(GPCMD_SEND_OPC, filter->write_ok);
199 __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); 199 __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
200 __set_bit(GPCMD_SET_SPEED, filter->write_ok); 200 __set_bit(GPCMD_SET_SPEED, filter->write_ok);
201 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); 201 __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
202 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); 202 __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
203 __set_bit(GPCMD_SET_STREAMING, filter->write_ok); 203 __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
204 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); 204 __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
205 } 205 }
206 206
207 int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm) 207 int blk_verify_command(unsigned char *cmd, fmode_t has_write_perm)
208 { 208 {
209 struct blk_cmd_filter *filter = &blk_default_cmd_filter; 209 struct blk_cmd_filter *filter = &blk_default_cmd_filter;
210 210
211 /* root can do any command. */ 211 /* root can do any command. */
212 if (capable(CAP_SYS_RAWIO)) 212 if (capable(CAP_SYS_RAWIO))
213 return 0; 213 return 0;
214 214
215 /* Anybody who can open the device can do a read-safe command */ 215 /* Anybody who can open the device can do a read-safe command */
216 if (test_bit(cmd[0], filter->read_ok)) 216 if (test_bit(cmd[0], filter->read_ok))
217 return 0; 217 return 0;
218 218
219 /* Write-safe commands require a writable open */ 219 /* Write-safe commands require a writable open */
220 if (test_bit(cmd[0], filter->write_ok) && has_write_perm) 220 if (test_bit(cmd[0], filter->write_ok) && has_write_perm)
221 return 0; 221 return 0;
222 222
223 return -EPERM; 223 return -EPERM;
224 } 224 }
225 EXPORT_SYMBOL(blk_verify_command); 225 EXPORT_SYMBOL(blk_verify_command);
226 226
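blk_verify_command() above encodes a three-tier policy: CAP_SYS_RAWIO bypasses the filter, read-safe opcodes only require an open descriptor, and write-safe opcodes additionally require a writable open. A minimal sketch of how a driver-style caller might use it before queueing a user-supplied CDB is shown below, mirroring what blk_fill_sghdr_rq() does next; the helper name and its argument list are hypothetical.

/* Hypothetical helper (not part of this file): copy a user-supplied CDB
 * into a request and let the default command filter decide whether the
 * current open mode may issue it. */
static int example_fill_user_cdb(struct request *rq,
				 const unsigned char __user *ucdb,
				 unsigned int len, fmode_t mode)
{
	if (len > BLK_MAX_CDB)
		return -EINVAL;
	if (copy_from_user(rq->cmd, ucdb, len))
		return -EFAULT;
	rq->cmd_len = len;

	/* 0 if permitted, -EPERM otherwise */
	return blk_verify_command(rq->cmd, mode & FMODE_WRITE);
}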
227 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, 227 static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
228 struct sg_io_hdr *hdr, fmode_t mode) 228 struct sg_io_hdr *hdr, fmode_t mode)
229 { 229 {
230 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len)) 230 if (copy_from_user(rq->cmd, hdr->cmdp, hdr->cmd_len))
231 return -EFAULT; 231 return -EFAULT;
232 if (blk_verify_command(rq->cmd, mode & FMODE_WRITE)) 232 if (blk_verify_command(rq->cmd, mode & FMODE_WRITE))
233 return -EPERM; 233 return -EPERM;
234 234
235 /* 235 /*
236 * fill in request structure 236 * fill in request structure
237 */ 237 */
238 rq->cmd_len = hdr->cmd_len; 238 rq->cmd_len = hdr->cmd_len;
239 239
240 rq->timeout = msecs_to_jiffies(hdr->timeout); 240 rq->timeout = msecs_to_jiffies(hdr->timeout);
241 if (!rq->timeout) 241 if (!rq->timeout)
242 rq->timeout = q->sg_timeout; 242 rq->timeout = q->sg_timeout;
243 if (!rq->timeout) 243 if (!rq->timeout)
244 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 244 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
245 if (rq->timeout < BLK_MIN_SG_TIMEOUT) 245 if (rq->timeout < BLK_MIN_SG_TIMEOUT)
246 rq->timeout = BLK_MIN_SG_TIMEOUT; 246 rq->timeout = BLK_MIN_SG_TIMEOUT;
247 247
248 return 0; 248 return 0;
249 } 249 }
250 250
251 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, 251 static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
252 struct bio *bio) 252 struct bio *bio)
253 { 253 {
254 int r, ret = 0; 254 int r, ret = 0;
255 255
256 /* 256 /*
257 * fill in all the output members 257 * fill in all the output members
258 */ 258 */
259 hdr->status = rq->errors & 0xff; 259 hdr->status = rq->errors & 0xff;
260 hdr->masked_status = status_byte(rq->errors); 260 hdr->masked_status = status_byte(rq->errors);
261 hdr->msg_status = msg_byte(rq->errors); 261 hdr->msg_status = msg_byte(rq->errors);
262 hdr->host_status = host_byte(rq->errors); 262 hdr->host_status = host_byte(rq->errors);
263 hdr->driver_status = driver_byte(rq->errors); 263 hdr->driver_status = driver_byte(rq->errors);
264 hdr->info = 0; 264 hdr->info = 0;
265 if (hdr->masked_status || hdr->host_status || hdr->driver_status) 265 if (hdr->masked_status || hdr->host_status || hdr->driver_status)
266 hdr->info |= SG_INFO_CHECK; 266 hdr->info |= SG_INFO_CHECK;
267 hdr->resid = rq->resid_len; 267 hdr->resid = rq->resid_len;
268 hdr->sb_len_wr = 0; 268 hdr->sb_len_wr = 0;
269 269
270 if (rq->sense_len && hdr->sbp) { 270 if (rq->sense_len && hdr->sbp) {
271 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len); 271 int len = min((unsigned int) hdr->mx_sb_len, rq->sense_len);
272 272
273 if (!copy_to_user(hdr->sbp, rq->sense, len)) 273 if (!copy_to_user(hdr->sbp, rq->sense, len))
274 hdr->sb_len_wr = len; 274 hdr->sb_len_wr = len;
275 else 275 else
276 ret = -EFAULT; 276 ret = -EFAULT;
277 } 277 }
278 278
279 r = blk_rq_unmap_user(bio); 279 r = blk_rq_unmap_user(bio);
280 if (!ret) 280 if (!ret)
281 ret = r; 281 ret = r;
282 282
283 return ret; 283 return ret;
284 } 284 }
285 285
286 static int sg_io(struct request_queue *q, struct gendisk *bd_disk, 286 static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
287 struct sg_io_hdr *hdr, fmode_t mode) 287 struct sg_io_hdr *hdr, fmode_t mode)
288 { 288 {
289 unsigned long start_time; 289 unsigned long start_time;
290 ssize_t ret = 0; 290 ssize_t ret = 0;
291 int writing = 0; 291 int writing = 0;
292 int at_head = 0; 292 int at_head = 0;
293 struct request *rq; 293 struct request *rq;
294 char sense[SCSI_SENSE_BUFFERSIZE]; 294 char sense[SCSI_SENSE_BUFFERSIZE];
295 struct bio *bio; 295 struct bio *bio;
296 296
297 if (hdr->interface_id != 'S') 297 if (hdr->interface_id != 'S')
298 return -EINVAL; 298 return -EINVAL;
299 299
300 if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) 300 if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
301 return -EIO; 301 return -EIO;
302 302
303 if (hdr->dxfer_len) 303 if (hdr->dxfer_len)
304 switch (hdr->dxfer_direction) { 304 switch (hdr->dxfer_direction) {
305 default: 305 default:
306 return -EINVAL; 306 return -EINVAL;
307 case SG_DXFER_TO_DEV: 307 case SG_DXFER_TO_DEV:
308 writing = 1; 308 writing = 1;
309 break; 309 break;
310 case SG_DXFER_TO_FROM_DEV: 310 case SG_DXFER_TO_FROM_DEV:
311 case SG_DXFER_FROM_DEV: 311 case SG_DXFER_FROM_DEV:
312 break; 312 break;
313 } 313 }
314 if (hdr->flags & SG_FLAG_Q_AT_HEAD) 314 if (hdr->flags & SG_FLAG_Q_AT_HEAD)
315 at_head = 1; 315 at_head = 1;
316 316
317 ret = -ENOMEM; 317 ret = -ENOMEM;
318 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL); 318 rq = blk_get_request(q, writing ? WRITE : READ, GFP_KERNEL);
319 if (IS_ERR(rq)) 319 if (IS_ERR(rq))
320 return PTR_ERR(rq); 320 return PTR_ERR(rq);
321 blk_rq_set_block_pc(rq); 321 blk_rq_set_block_pc(rq);
322 322
323 if (hdr->cmd_len > BLK_MAX_CDB) { 323 if (hdr->cmd_len > BLK_MAX_CDB) {
324 rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); 324 rq->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
325 if (!rq->cmd) 325 if (!rq->cmd)
326 goto out_put_request; 326 goto out_put_request;
327 } 327 }
328 328
329 ret = -EFAULT; 329 ret = -EFAULT;
330 if (blk_fill_sghdr_rq(q, rq, hdr, mode)) 330 if (blk_fill_sghdr_rq(q, rq, hdr, mode))
331 goto out_free_cdb; 331 goto out_free_cdb;
332 332
333 ret = 0; 333 ret = 0;
334 if (hdr->iovec_count) { 334 if (hdr->iovec_count) {
335 size_t iov_data_len; 335 size_t iov_data_len;
336 struct iovec *iov = NULL; 336 struct iovec *iov = NULL;
337 337
338 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count, 338 ret = rw_copy_check_uvector(-1, hdr->dxferp, hdr->iovec_count,
339 0, NULL, &iov); 339 0, NULL, &iov);
340 if (ret < 0) { 340 if (ret < 0) {
341 kfree(iov); 341 kfree(iov);
342 goto out_free_cdb; 342 goto out_free_cdb;
343 } 343 }
344 344
345 iov_data_len = ret; 345 iov_data_len = ret;
346 ret = 0; 346 ret = 0;
347 347
348 /* SG_IO howto says that the shorter of the two wins */ 348 /* SG_IO howto says that the shorter of the two wins */
349 if (hdr->dxfer_len < iov_data_len) { 349 if (hdr->dxfer_len < iov_data_len) {
350 hdr->iovec_count = iov_shorten(iov, 350 hdr->iovec_count = iov_shorten(iov,
351 hdr->iovec_count, 351 hdr->iovec_count,
352 hdr->dxfer_len); 352 hdr->dxfer_len);
353 iov_data_len = hdr->dxfer_len; 353 iov_data_len = hdr->dxfer_len;
354 } 354 }
355 355
356 ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov, 356 ret = blk_rq_map_user_iov(q, rq, NULL, (struct sg_iovec *) iov,
357 hdr->iovec_count, 357 hdr->iovec_count,
358 iov_data_len, GFP_KERNEL); 358 iov_data_len, GFP_KERNEL);
359 kfree(iov); 359 kfree(iov);
360 } else if (hdr->dxfer_len) 360 } else if (hdr->dxfer_len)
361 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, 361 ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
362 GFP_KERNEL); 362 GFP_KERNEL);
363 363
364 if (ret) 364 if (ret)
365 goto out_free_cdb; 365 goto out_free_cdb;
366 366
367 bio = rq->bio; 367 bio = rq->bio;
368 memset(sense, 0, sizeof(sense)); 368 memset(sense, 0, sizeof(sense));
369 rq->sense = sense; 369 rq->sense = sense;
370 rq->sense_len = 0; 370 rq->sense_len = 0;
371 rq->retries = 0; 371 rq->retries = 0;
372 372
373 start_time = jiffies; 373 start_time = jiffies;
374 374
375 /* ignore return value. All information is passed back to caller 375 /* ignore return value. All information is passed back to caller
376 * (if the caller doesn't check, that is their problem). 376 * (if the caller doesn't check, that is their problem).
377 * N.B. a non-zero SCSI status is _not_ necessarily an error. 377 * N.B. a non-zero SCSI status is _not_ necessarily an error.
378 */ 378 */
379 blk_execute_rq(q, bd_disk, rq, at_head); 379 blk_execute_rq(q, bd_disk, rq, at_head);
380 380
381 hdr->duration = jiffies_to_msecs(jiffies - start_time); 381 hdr->duration = jiffies_to_msecs(jiffies - start_time);
382 382
383 ret = blk_complete_sghdr_rq(rq, hdr, bio); 383 ret = blk_complete_sghdr_rq(rq, hdr, bio);
384 384
385 out_free_cdb: 385 out_free_cdb:
386 if (rq->cmd != rq->__cmd) 386 if (rq->cmd != rq->__cmd)
387 kfree(rq->cmd); 387 kfree(rq->cmd);
388 out_put_request: 388 out_put_request:
389 blk_put_request(rq); 389 blk_put_request(rq);
390 return ret; 390 return ret;
391 } 391 }
392 392
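The sg_io() helper above is what ultimately services the SG_IO case of scsi_cmd_ioctl() further down: it validates interface_id and the transfer direction, maps the user buffer, runs the request and copies status and sense back via blk_complete_sghdr_rq(). As context only, here is a minimal userspace sketch of driving this path; the device node, the INQUIRY command, the buffer sizes and the timeout are illustrative assumptions, and error handling is kept to a bare minimum.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY, 96-byte allocation */
	unsigned char data[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sda", O_RDONLY);	/* arbitrary device node */

	if (fd < 0)
		return 1;

	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';			/* checked first by sg_io() above */
	hdr.cmd_len = sizeof(cdb);
	hdr.cmdp = cdb;
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.dxfer_len = sizeof(data);
	hdr.dxferp = data;
	hdr.mx_sb_len = sizeof(sense);
	hdr.sbp = sense;
	hdr.timeout = 5000;			/* ms, copied into rq->timeout */

	if (ioctl(fd, SG_IO, &hdr) < 0)
		perror("SG_IO");
	else
		printf("status 0x%x, %d bytes of sense\n", hdr.status, hdr.sb_len_wr);

	close(fd);
	return 0;
}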
393 /** 393 /**
394 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl 394 * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
395 * @mode: open mode of the file this ioctl operates on 395 * @mode: open mode of the file this ioctl operates on
396 * @q: request queue to send scsi commands down 396 * @q: request queue to send scsi commands down
397 * @disk: gendisk to operate on (optional) 397 * @disk: gendisk to operate on (optional)
398 * @sic: userspace structure describing the command to perform 398 * @sic: userspace structure describing the command to perform
399 * 399 *
400 * Send down the scsi command described by @sic to the device below 400 * Send down the scsi command described by @sic to the device below
401 * the request queue @q. @mode is used to perform fine-grained 401 * the request queue @q. @mode is used to perform fine-grained
402 * permission checks: callers without FMODE_WRITE may only send down 402 * permission checks: callers without FMODE_WRITE may only send down
403 * non-destructive SCSI commands. If the caller has a struct gendisk 403 * non-destructive SCSI commands. If the caller has a struct gendisk
404 * available it should be passed in as @disk to allow the low level 404 * available it should be passed in as @disk to allow the low level
405 * driver to use the information contained in it. A NULL @disk 405 * driver to use the information contained in it. A NULL @disk
406 * is only allowed if the caller knows that the low level driver doesn't 406 * is only allowed if the caller knows that the low level driver doesn't
407 * need it (e.g. in the scsi subsystem). 407 * need it (e.g. in the scsi subsystem).
408 * 408 *
409 * Notes: 409 * Notes:
410 * - This interface is deprecated - users should use the SG_IO 410 * - This interface is deprecated - users should use the SG_IO
411 * interface instead, as this is a more flexible approach to 411 * interface instead, as this is a more flexible approach to
412 * performing SCSI commands on a device. 412 * performing SCSI commands on a device.
413 * - The SCSI command length is determined by examining the 1st byte 413 * - The SCSI command length is determined by examining the 1st byte
414 * of the given command. There is no way to override this. 414 * of the given command. There is no way to override this.
415 * - Data transfers are limited to PAGE_SIZE 415 * - Data transfers are limited to PAGE_SIZE
416 * - The command-plus-data buffer at @sic->data must be at least OMAX_SB_LEN 416 * - The command-plus-data buffer at @sic->data must be at least OMAX_SB_LEN
417 * bytes long to accommodate the sense buffer when an error occurs. 417 * bytes long to accommodate the sense buffer when an error occurs.
418 * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that 418 * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
419 * old code will not be surprised. 419 * old code will not be surprised.
420 * - If a Unix error occurs (e.g. ENOMEM) then the user will receive 420 * - If a Unix error occurs (e.g. ENOMEM) then the user will receive
421 * a negative return and the Unix error code in 'errno'. 421 * a negative return and the Unix error code in 'errno'.
422 * If the SCSI command succeeds then 0 is returned. 422 * If the SCSI command succeeds then 0 is returned.
423 * Positive numbers returned are the compacted SCSI error codes (4 423 * Positive numbers returned are the compacted SCSI error codes (4
424 * bytes in one int) where the lowest byte is the SCSI status. 424 * bytes in one int) where the lowest byte is the SCSI status.
425 */ 425 */
426 #define OMAX_SB_LEN 16 /* For backward compatibility */ 426 #define OMAX_SB_LEN 16 /* For backward compatibility */
427 int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, 427 int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
428 struct scsi_ioctl_command __user *sic) 428 struct scsi_ioctl_command __user *sic)
429 { 429 {
430 struct request *rq; 430 struct request *rq;
431 int err; 431 int err;
432 unsigned int in_len, out_len, bytes, opcode, cmdlen; 432 unsigned int in_len, out_len, bytes, opcode, cmdlen;
433 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE]; 433 char *buffer = NULL, sense[SCSI_SENSE_BUFFERSIZE];
434 434
435 if (!sic) 435 if (!sic)
436 return -EINVAL; 436 return -EINVAL;
437 437
438 /* 438 /*
439 * get in and out lengths, verify they don't exceed a page worth of data 439 * get in and out lengths, verify they don't exceed a page worth of data
440 */ 440 */
441 if (get_user(in_len, &sic->inlen)) 441 if (get_user(in_len, &sic->inlen))
442 return -EFAULT; 442 return -EFAULT;
443 if (get_user(out_len, &sic->outlen)) 443 if (get_user(out_len, &sic->outlen))
444 return -EFAULT; 444 return -EFAULT;
445 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) 445 if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
446 return -EINVAL; 446 return -EINVAL;
447 if (get_user(opcode, sic->data)) 447 if (get_user(opcode, sic->data))
448 return -EFAULT; 448 return -EFAULT;
449 449
450 bytes = max(in_len, out_len); 450 bytes = max(in_len, out_len);
451 if (bytes) { 451 if (bytes) {
452 buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER | __GFP_NOWARN); 452 buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER | __GFP_NOWARN);
453 if (!buffer) 453 if (!buffer)
454 return -ENOMEM; 454 return -ENOMEM;
455 455
456 } 456 }
457 457
458 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT); 458 rq = blk_get_request(q, in_len ? WRITE : READ, __GFP_WAIT);
459 if (IS_ERR(rq)) { 459 if (IS_ERR(rq)) {
460 err = PTR_ERR(rq); 460 err = PTR_ERR(rq);
461 goto error; 461 goto error_free_buffer;
462 } 462 }
463 blk_rq_set_block_pc(rq); 463 blk_rq_set_block_pc(rq);
464 464
465 cmdlen = COMMAND_SIZE(opcode); 465 cmdlen = COMMAND_SIZE(opcode);
466 466
467 /* 467 /*
468 * get command and data to send to device, if any 468 * get command and data to send to device, if any
469 */ 469 */
470 err = -EFAULT; 470 err = -EFAULT;
471 rq->cmd_len = cmdlen; 471 rq->cmd_len = cmdlen;
472 if (copy_from_user(rq->cmd, sic->data, cmdlen)) 472 if (copy_from_user(rq->cmd, sic->data, cmdlen))
473 goto error; 473 goto error;
474 474
475 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) 475 if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
476 goto error; 476 goto error;
477 477
478 err = blk_verify_command(rq->cmd, mode & FMODE_WRITE); 478 err = blk_verify_command(rq->cmd, mode & FMODE_WRITE);
479 if (err) 479 if (err)
480 goto error; 480 goto error;
481 481
482 /* default, possibly overridden later */ 482 /* default, possibly overridden later */
483 rq->retries = 5; 483 rq->retries = 5;
484 484
485 switch (opcode) { 485 switch (opcode) {
486 case SEND_DIAGNOSTIC: 486 case SEND_DIAGNOSTIC:
487 case FORMAT_UNIT: 487 case FORMAT_UNIT:
488 rq->timeout = FORMAT_UNIT_TIMEOUT; 488 rq->timeout = FORMAT_UNIT_TIMEOUT;
489 rq->retries = 1; 489 rq->retries = 1;
490 break; 490 break;
491 case START_STOP: 491 case START_STOP:
492 rq->timeout = START_STOP_TIMEOUT; 492 rq->timeout = START_STOP_TIMEOUT;
493 break; 493 break;
494 case MOVE_MEDIUM: 494 case MOVE_MEDIUM:
495 rq->timeout = MOVE_MEDIUM_TIMEOUT; 495 rq->timeout = MOVE_MEDIUM_TIMEOUT;
496 break; 496 break;
497 case READ_ELEMENT_STATUS: 497 case READ_ELEMENT_STATUS:
498 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; 498 rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
499 break; 499 break;
500 case READ_DEFECT_DATA: 500 case READ_DEFECT_DATA:
501 rq->timeout = READ_DEFECT_DATA_TIMEOUT; 501 rq->timeout = READ_DEFECT_DATA_TIMEOUT;
502 rq->retries = 1; 502 rq->retries = 1;
503 break; 503 break;
504 default: 504 default:
505 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 505 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
506 break; 506 break;
507 } 507 }
508 508
509 if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) { 509 if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, __GFP_WAIT)) {
510 err = DRIVER_ERROR << 24; 510 err = DRIVER_ERROR << 24;
511 goto error; 511 goto error;
512 } 512 }
513 513
514 memset(sense, 0, sizeof(sense)); 514 memset(sense, 0, sizeof(sense));
515 rq->sense = sense; 515 rq->sense = sense;
516 rq->sense_len = 0; 516 rq->sense_len = 0;
517 517
518 blk_execute_rq(q, disk, rq, 0); 518 blk_execute_rq(q, disk, rq, 0);
519 519
520 err = rq->errors & 0xff; /* only 8 bit SCSI status */ 520 err = rq->errors & 0xff; /* only 8 bit SCSI status */
521 if (err) { 521 if (err) {
522 if (rq->sense_len && rq->sense) { 522 if (rq->sense_len && rq->sense) {
523 bytes = (OMAX_SB_LEN > rq->sense_len) ? 523 bytes = (OMAX_SB_LEN > rq->sense_len) ?
524 rq->sense_len : OMAX_SB_LEN; 524 rq->sense_len : OMAX_SB_LEN;
525 if (copy_to_user(sic->data, rq->sense, bytes)) 525 if (copy_to_user(sic->data, rq->sense, bytes))
526 err = -EFAULT; 526 err = -EFAULT;
527 } 527 }
528 } else { 528 } else {
529 if (copy_to_user(sic->data, buffer, out_len)) 529 if (copy_to_user(sic->data, buffer, out_len))
530 err = -EFAULT; 530 err = -EFAULT;
531 } 531 }
532 532
533 error: 533 error:
534 blk_put_request(rq);
535
536 error_free_buffer:
534 kfree(buffer); 537 kfree(buffer);
535 if (rq) 538
536 blk_put_request(rq);
537 return err; 539 return err;
538 } 540 }
539 EXPORT_SYMBOL_GPL(sg_scsi_ioctl); 541 EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
540 542
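As the notes above explain, the deprecated interface packs everything into one buffer: two length words followed by the CDB, any out-data, and (on error) up to OMAX_SB_LEN bytes of sense data. A hedged userspace sketch of that layout follows; the struct is declared locally because the layout, not a particular header, is the point, and the device node and buffer size are arbitrary assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/scsi_ioctl.h>		/* SCSI_IOCTL_SEND_COMMAND */

/* Local copy of the layout consumed above: two length words followed by
 * the CDB plus any in/out data (and, on error, up to OMAX_SB_LEN bytes
 * of sense data). */
struct send_command_buf {
	unsigned int  inlen;		/* bytes of data sent to the device */
	unsigned int  outlen;		/* bytes of data expected back */
	unsigned char data[64];		/* CDB, then data / sense */
};

int main(void)
{
	struct send_command_buf c;
	int fd, ret;

	memset(&c, 0, sizeof(c));
	c.data[0] = 0x00;		/* TEST UNIT READY: 6-byte CDB, no data */

	fd = open("/dev/sda", O_RDONLY);	/* arbitrary device node */
	if (fd < 0)
		return 1;

	/* 0 on success, <0 with errno set, or the compacted SCSI result
	 * (lowest byte = SCSI status) as documented above. */
	ret = ioctl(fd, SCSI_IOCTL_SEND_COMMAND, &c);
	printf("SCSI_IOCTL_SEND_COMMAND returned %d\n", ret);
	close(fd);
	return 0;
}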
541 /* Send basic block requests */ 543 /* Send basic block requests */
542 static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, 544 static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
543 int cmd, int data) 545 int cmd, int data)
544 { 546 {
545 struct request *rq; 547 struct request *rq;
546 int err; 548 int err;
547 549
548 rq = blk_get_request(q, WRITE, __GFP_WAIT); 550 rq = blk_get_request(q, WRITE, __GFP_WAIT);
549 if (IS_ERR(rq)) 551 if (IS_ERR(rq))
550 return PTR_ERR(rq); 552 return PTR_ERR(rq);
551 blk_rq_set_block_pc(rq); 553 blk_rq_set_block_pc(rq);
552 rq->timeout = BLK_DEFAULT_SG_TIMEOUT; 554 rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
553 rq->cmd[0] = cmd; 555 rq->cmd[0] = cmd;
554 rq->cmd[4] = data; 556 rq->cmd[4] = data;
555 rq->cmd_len = 6; 557 rq->cmd_len = 6;
556 err = blk_execute_rq(q, bd_disk, rq, 0); 558 err = blk_execute_rq(q, bd_disk, rq, 0);
557 blk_put_request(rq); 559 blk_put_request(rq);
558 560
559 return err; 561 return err;
560 } 562 }
561 563
562 static inline int blk_send_start_stop(struct request_queue *q, 564 static inline int blk_send_start_stop(struct request_queue *q,
563 struct gendisk *bd_disk, int data) 565 struct gendisk *bd_disk, int data)
564 { 566 {
565 return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); 567 return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
566 } 568 }
567 569
568 int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, 570 int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
569 unsigned int cmd, void __user *arg) 571 unsigned int cmd, void __user *arg)
570 { 572 {
571 int err; 573 int err;
572 574
573 if (!q) 575 if (!q)
574 return -ENXIO; 576 return -ENXIO;
575 577
576 switch (cmd) { 578 switch (cmd) {
577 /* 579 /*
578 * new sgv3 interface 580 * new sgv3 interface
579 */ 581 */
580 case SG_GET_VERSION_NUM: 582 case SG_GET_VERSION_NUM:
581 err = sg_get_version(arg); 583 err = sg_get_version(arg);
582 break; 584 break;
583 case SCSI_IOCTL_GET_IDLUN: 585 case SCSI_IOCTL_GET_IDLUN:
584 err = scsi_get_idlun(q, arg); 586 err = scsi_get_idlun(q, arg);
585 break; 587 break;
586 case SCSI_IOCTL_GET_BUS_NUMBER: 588 case SCSI_IOCTL_GET_BUS_NUMBER:
587 err = scsi_get_bus(q, arg); 589 err = scsi_get_bus(q, arg);
588 break; 590 break;
589 case SG_SET_TIMEOUT: 591 case SG_SET_TIMEOUT:
590 err = sg_set_timeout(q, arg); 592 err = sg_set_timeout(q, arg);
591 break; 593 break;
592 case SG_GET_TIMEOUT: 594 case SG_GET_TIMEOUT:
593 err = sg_get_timeout(q); 595 err = sg_get_timeout(q);
594 break; 596 break;
595 case SG_GET_RESERVED_SIZE: 597 case SG_GET_RESERVED_SIZE:
596 err = sg_get_reserved_size(q, arg); 598 err = sg_get_reserved_size(q, arg);
597 break; 599 break;
598 case SG_SET_RESERVED_SIZE: 600 case SG_SET_RESERVED_SIZE:
599 err = sg_set_reserved_size(q, arg); 601 err = sg_set_reserved_size(q, arg);
600 break; 602 break;
601 case SG_EMULATED_HOST: 603 case SG_EMULATED_HOST:
602 err = sg_emulated_host(q, arg); 604 err = sg_emulated_host(q, arg);
603 break; 605 break;
604 case SG_IO: { 606 case SG_IO: {
605 struct sg_io_hdr hdr; 607 struct sg_io_hdr hdr;
606 608
607 err = -EFAULT; 609 err = -EFAULT;
608 if (copy_from_user(&hdr, arg, sizeof(hdr))) 610 if (copy_from_user(&hdr, arg, sizeof(hdr)))
609 break; 611 break;
610 err = sg_io(q, bd_disk, &hdr, mode); 612 err = sg_io(q, bd_disk, &hdr, mode);
611 if (err == -EFAULT) 613 if (err == -EFAULT)
612 break; 614 break;
613 615
614 if (copy_to_user(arg, &hdr, sizeof(hdr))) 616 if (copy_to_user(arg, &hdr, sizeof(hdr)))
615 err = -EFAULT; 617 err = -EFAULT;
616 break; 618 break;
617 } 619 }
618 case CDROM_SEND_PACKET: { 620 case CDROM_SEND_PACKET: {
619 struct cdrom_generic_command cgc; 621 struct cdrom_generic_command cgc;
620 struct sg_io_hdr hdr; 622 struct sg_io_hdr hdr;
621 623
622 err = -EFAULT; 624 err = -EFAULT;
623 if (copy_from_user(&cgc, arg, sizeof(cgc))) 625 if (copy_from_user(&cgc, arg, sizeof(cgc)))
624 break; 626 break;
625 cgc.timeout = clock_t_to_jiffies(cgc.timeout); 627 cgc.timeout = clock_t_to_jiffies(cgc.timeout);
626 memset(&hdr, 0, sizeof(hdr)); 628 memset(&hdr, 0, sizeof(hdr));
627 hdr.interface_id = 'S'; 629 hdr.interface_id = 'S';
628 hdr.cmd_len = sizeof(cgc.cmd); 630 hdr.cmd_len = sizeof(cgc.cmd);
629 hdr.dxfer_len = cgc.buflen; 631 hdr.dxfer_len = cgc.buflen;
630 err = 0; 632 err = 0;
631 switch (cgc.data_direction) { 633 switch (cgc.data_direction) {
632 case CGC_DATA_UNKNOWN: 634 case CGC_DATA_UNKNOWN:
633 hdr.dxfer_direction = SG_DXFER_UNKNOWN; 635 hdr.dxfer_direction = SG_DXFER_UNKNOWN;
634 break; 636 break;
635 case CGC_DATA_WRITE: 637 case CGC_DATA_WRITE:
636 hdr.dxfer_direction = SG_DXFER_TO_DEV; 638 hdr.dxfer_direction = SG_DXFER_TO_DEV;
637 break; 639 break;
638 case CGC_DATA_READ: 640 case CGC_DATA_READ:
639 hdr.dxfer_direction = SG_DXFER_FROM_DEV; 641 hdr.dxfer_direction = SG_DXFER_FROM_DEV;
640 break; 642 break;
641 case CGC_DATA_NONE: 643 case CGC_DATA_NONE:
642 hdr.dxfer_direction = SG_DXFER_NONE; 644 hdr.dxfer_direction = SG_DXFER_NONE;
643 break; 645 break;
644 default: 646 default:
645 err = -EINVAL; 647 err = -EINVAL;
646 } 648 }
647 if (err) 649 if (err)
648 break; 650 break;
649 651
650 hdr.dxferp = cgc.buffer; 652 hdr.dxferp = cgc.buffer;
651 hdr.sbp = cgc.sense; 653 hdr.sbp = cgc.sense;
652 if (hdr.sbp) 654 if (hdr.sbp)
653 hdr.mx_sb_len = sizeof(struct request_sense); 655 hdr.mx_sb_len = sizeof(struct request_sense);
654 hdr.timeout = jiffies_to_msecs(cgc.timeout); 656 hdr.timeout = jiffies_to_msecs(cgc.timeout);
655 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; 657 hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
656 hdr.cmd_len = sizeof(cgc.cmd); 658 hdr.cmd_len = sizeof(cgc.cmd);
657 659
658 err = sg_io(q, bd_disk, &hdr, mode); 660 err = sg_io(q, bd_disk, &hdr, mode);
659 if (err == -EFAULT) 661 if (err == -EFAULT)
660 break; 662 break;
661 663
662 if (hdr.status) 664 if (hdr.status)
663 err = -EIO; 665 err = -EIO;
664 666
665 cgc.stat = err; 667 cgc.stat = err;
666 cgc.buflen = hdr.resid; 668 cgc.buflen = hdr.resid;
667 if (copy_to_user(arg, &cgc, sizeof(cgc))) 669 if (copy_to_user(arg, &cgc, sizeof(cgc)))
668 err = -EFAULT; 670 err = -EFAULT;
669 671
670 break; 672 break;
671 } 673 }
672 674
673 /* 675 /*
674 * old junk scsi send command ioctl 676 * old junk scsi send command ioctl
675 */ 677 */
676 case SCSI_IOCTL_SEND_COMMAND: 678 case SCSI_IOCTL_SEND_COMMAND:
677 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); 679 printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
678 err = -EINVAL; 680 err = -EINVAL;
679 if (!arg) 681 if (!arg)
680 break; 682 break;
681 683
682 err = sg_scsi_ioctl(q, bd_disk, mode, arg); 684 err = sg_scsi_ioctl(q, bd_disk, mode, arg);
683 break; 685 break;
684 case CDROMCLOSETRAY: 686 case CDROMCLOSETRAY:
685 err = blk_send_start_stop(q, bd_disk, 0x03); 687 err = blk_send_start_stop(q, bd_disk, 0x03);
686 break; 688 break;
687 case CDROMEJECT: 689 case CDROMEJECT:
688 err = blk_send_start_stop(q, bd_disk, 0x02); 690 err = blk_send_start_stop(q, bd_disk, 0x02);
689 break; 691 break;
690 default: 692 default:
691 err = -ENOTTY; 693 err = -ENOTTY;
692 } 694 }
693 695
694 return err; 696 return err;
695 } 697 }
696 EXPORT_SYMBOL(scsi_cmd_ioctl); 698 EXPORT_SYMBOL(scsi_cmd_ioctl);
697 699
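The CDROM_SEND_PACKET branch above is just a translation layer: it converts a cdrom_generic_command into an sg_io_hdr (direction, timeout, sense pointer) and reuses sg_io(). A short userspace sketch of the caller's side follows; the /dev/sr0 node, the TEST UNIT READY packet and the timeout value are illustrative assumptions.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/cdrom.h>

int main(void)
{
	struct cdrom_generic_command cgc;
	int fd = open("/dev/sr0", O_RDONLY | O_NONBLOCK);	/* arbitrary device node */

	if (fd < 0)
		return 1;

	memset(&cgc, 0, sizeof(cgc));
	cgc.cmd[0] = 0x00;			/* TEST UNIT READY, no payload */
	cgc.data_direction = CGC_DATA_NONE;	/* mapped to SG_DXFER_NONE above */
	cgc.timeout = 500;			/* USER_HZ ticks; see clock_t_to_jiffies() above */

	if (ioctl(fd, CDROM_SEND_PACKET, &cgc) < 0)
		perror("CDROM_SEND_PACKET");
	else
		printf("stat=%d residual=%u\n", cgc.stat, cgc.buflen);

	close(fd);
	return 0;
}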
698 int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) 700 int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
699 { 701 {
700 if (bd && bd == bd->bd_contains) 702 if (bd && bd == bd->bd_contains)
701 return 0; 703 return 0;
702 704
703 /* Actually none of these is particularly useful on a partition, 705 /* Actually none of these is particularly useful on a partition,
704 * but they are safe. 706 * but they are safe.
705 */ 707 */
706 switch (cmd) { 708 switch (cmd) {
707 case SCSI_IOCTL_GET_IDLUN: 709 case SCSI_IOCTL_GET_IDLUN:
708 case SCSI_IOCTL_GET_BUS_NUMBER: 710 case SCSI_IOCTL_GET_BUS_NUMBER:
709 case SCSI_IOCTL_GET_PCI: 711 case SCSI_IOCTL_GET_PCI:
710 case SCSI_IOCTL_PROBE_HOST: 712 case SCSI_IOCTL_PROBE_HOST:
711 case SG_GET_VERSION_NUM: 713 case SG_GET_VERSION_NUM:
712 case SG_SET_TIMEOUT: 714 case SG_SET_TIMEOUT:
713 case SG_GET_TIMEOUT: 715 case SG_GET_TIMEOUT:
714 case SG_GET_RESERVED_SIZE: 716 case SG_GET_RESERVED_SIZE:
715 case SG_SET_RESERVED_SIZE: 717 case SG_SET_RESERVED_SIZE:
716 case SG_EMULATED_HOST: 718 case SG_EMULATED_HOST:
717 return 0; 719 return 0;
718 case CDROM_GET_CAPABILITY: 720 case CDROM_GET_CAPABILITY:
719 /* Keep this until we remove the printk below. udev sends it 721 /* Keep this until we remove the printk below. udev sends it
720 * and we do not want to spam dmesg about it. CD-ROMs do 722 * and we do not want to spam dmesg about it. CD-ROMs do
721 * not have partitions, so we get here only for disks. 723 * not have partitions, so we get here only for disks.
722 */ 724 */
723 return -ENOIOCTLCMD; 725 return -ENOIOCTLCMD;
724 default: 726 default:
725 break; 727 break;
726 } 728 }
727 729
728 if (capable(CAP_SYS_RAWIO)) 730 if (capable(CAP_SYS_RAWIO))
729 return 0; 731 return 0;
730 732
731 /* In particular, rule out all resets and host-specific ioctls. */ 733 /* In particular, rule out all resets and host-specific ioctls. */
732 printk_ratelimited(KERN_WARNING 734 printk_ratelimited(KERN_WARNING
733 "%s: sending ioctl %x to a partition!\n", current->comm, cmd); 735 "%s: sending ioctl %x to a partition!\n", current->comm, cmd);
734 736
735 return -ENOIOCTLCMD; 737 return -ENOIOCTLCMD;
736 } 738 }
737 EXPORT_SYMBOL(scsi_verify_blk_ioctl); 739 EXPORT_SYMBOL(scsi_verify_blk_ioctl);
738 740
739 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, 741 int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
740 unsigned int cmd, void __user *arg) 742 unsigned int cmd, void __user *arg)
741 { 743 {
742 int ret; 744 int ret;
743 745
744 ret = scsi_verify_blk_ioctl(bd, cmd); 746 ret = scsi_verify_blk_ioctl(bd, cmd);
745 if (ret < 0) 747 if (ret < 0)
746 return ret; 748 return ret;
747 749
748 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); 750 return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
749 } 751 }
750 EXPORT_SYMBOL(scsi_cmd_blk_ioctl); 752 EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
751 753
752 static int __init blk_scsi_ioctl_init(void) 754 static int __init blk_scsi_ioctl_init(void)
753 { 755 {
754 blk_set_cmd_filter_defaults(&blk_default_cmd_filter); 756 blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
755 return 0; 757 return 0;
756 } 758 }
757 fs_initcall(blk_scsi_ioctl_init); 759 fs_initcall(blk_scsi_ioctl_init);