Commit bf2de6f5a4faf0197268f18d08969b003b87b6e8

Authored by Jens Axboe
1 parent c07e2b4129

block: Initial support for data-less (or empty) barrier support

This implements functionality to pass down or insert a barrier
in a queue, without having any data attached to it. The ->prepare_flush_fn()
infrastructure from data barriers is reused to provide this
functionality.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
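
For context, the point of a data-less barrier is to let a caller ask for ordering or a cache flush without writing any payload. Below is a minimal sketch of the caller side, using the era's existing blkdev_issue_flush() API; the conversion of such flushes to empty barriers is completed by follow-up patches in this series, not by this commit alone.

#include <linux/blkdev.h>

/*
 * Hedged sketch: ask the block layer to flush a device's write cache.
 * With empty-barrier support in place, a request like this no longer
 * needs a data payload attached to it.
 */
static int flush_device_cache(struct block_device *bdev)
{
	sector_t error_sector;
	int ret;

	ret = blkdev_issue_flush(bdev, &error_sector);
	if (ret == -EOPNOTSUPP)
		ret = 0;	/* queue advertises no ordering; nothing to flush */

	return ret;
}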

Showing 5 changed files with 71 additions and 21 deletions (inline diff: original line numbers and text on the left, updated on the right)

block/elevator.c

1 /* 1 /*
2 * Block device elevator/IO-scheduler. 2 * Block device elevator/IO-scheduler.
3 * 3 *
4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * 5 *
6 * 30042000 Jens Axboe <axboe@kernel.dk> : 6 * 30042000 Jens Axboe <axboe@kernel.dk> :
7 * 7 *
8 * Split the elevator a bit so that it is possible to choose a different 8 * Split the elevator a bit so that it is possible to choose a different
9 * one or even write a new "plug in". There are three pieces: 9 * one or even write a new "plug in". There are three pieces:
10 * - elevator_fn, inserts a new request in the queue list 10 * - elevator_fn, inserts a new request in the queue list
11 * - elevator_merge_fn, decides whether a new buffer can be merged with 11 * - elevator_merge_fn, decides whether a new buffer can be merged with
12 * an existing request 12 * an existing request
13 * - elevator_dequeue_fn, called when a request is taken off the active list 13 * - elevator_dequeue_fn, called when a request is taken off the active list
14 * 14 *
15 * 20082000 Dave Jones <davej@suse.de> : 15 * 20082000 Dave Jones <davej@suse.de> :
16 * Removed tests for max-bomb-segments, which was breaking elvtune 16 * Removed tests for max-bomb-segments, which was breaking elvtune
17 * when run without -bN 17 * when run without -bN
18 * 18 *
19 * Jens: 19 * Jens:
20 * - Rework again to work with bio instead of buffer_heads 20 * - Rework again to work with bio instead of buffer_heads
21 * - loose bi_dev comparisons, partition handling is right now 21 * - loose bi_dev comparisons, partition handling is right now
22 * - completely modularize elevator setup and teardown 22 * - completely modularize elevator setup and teardown
23 * 23 *
24 */ 24 */
25 #include <linux/kernel.h> 25 #include <linux/kernel.h>
26 #include <linux/fs.h> 26 #include <linux/fs.h>
27 #include <linux/blkdev.h> 27 #include <linux/blkdev.h>
28 #include <linux/elevator.h> 28 #include <linux/elevator.h>
29 #include <linux/bio.h> 29 #include <linux/bio.h>
30 #include <linux/module.h> 30 #include <linux/module.h>
31 #include <linux/slab.h> 31 #include <linux/slab.h>
32 #include <linux/init.h> 32 #include <linux/init.h>
33 #include <linux/compiler.h> 33 #include <linux/compiler.h>
34 #include <linux/delay.h> 34 #include <linux/delay.h>
35 #include <linux/blktrace_api.h> 35 #include <linux/blktrace_api.h>
36 #include <linux/hash.h> 36 #include <linux/hash.h>
37 37
38 #include <asm/uaccess.h> 38 #include <asm/uaccess.h>
39 39
40 static DEFINE_SPINLOCK(elv_list_lock); 40 static DEFINE_SPINLOCK(elv_list_lock);
41 static LIST_HEAD(elv_list); 41 static LIST_HEAD(elv_list);
42 42
43 /* 43 /*
44 * Merge hash stuff. 44 * Merge hash stuff.
45 */ 45 */
46 static const int elv_hash_shift = 6; 46 static const int elv_hash_shift = 6;
47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3) 47 #define ELV_HASH_BLOCK(sec) ((sec) >> 3)
48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) 48 #define ELV_HASH_FN(sec) (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift))
49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift) 49 #define ELV_HASH_ENTRIES (1 << elv_hash_shift)
50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) 50 #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors)
51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) 51 #define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash))
52 52
53 /* 53 /*
54 * Query io scheduler to see if the current process issuing bio may be 54 * Query io scheduler to see if the current process issuing bio may be
55 * merged with rq. 55 * merged with rq.
56 */ 56 */
57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio) 57 static int elv_iosched_allow_merge(struct request *rq, struct bio *bio)
58 { 58 {
59 struct request_queue *q = rq->q; 59 struct request_queue *q = rq->q;
60 elevator_t *e = q->elevator; 60 elevator_t *e = q->elevator;
61 61
62 if (e->ops->elevator_allow_merge_fn) 62 if (e->ops->elevator_allow_merge_fn)
63 return e->ops->elevator_allow_merge_fn(q, rq, bio); 63 return e->ops->elevator_allow_merge_fn(q, rq, bio);
64 64
65 return 1; 65 return 1;
66 } 66 }
67 67
68 /* 68 /*
69 * can we safely merge with this request? 69 * can we safely merge with this request?
70 */ 70 */
71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio) 71 inline int elv_rq_merge_ok(struct request *rq, struct bio *bio)
72 { 72 {
73 if (!rq_mergeable(rq)) 73 if (!rq_mergeable(rq))
74 return 0; 74 return 0;
75 75
76 /* 76 /*
77 * different data direction or already started, don't merge 77 * different data direction or already started, don't merge
78 */ 78 */
79 if (bio_data_dir(bio) != rq_data_dir(rq)) 79 if (bio_data_dir(bio) != rq_data_dir(rq))
80 return 0; 80 return 0;
81 81
82 /* 82 /*
83 * must be same device and not a special request 83 * must be same device and not a special request
84 */ 84 */
85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special) 85 if (rq->rq_disk != bio->bi_bdev->bd_disk || rq->special)
86 return 0; 86 return 0;
87 87
88 if (!elv_iosched_allow_merge(rq, bio)) 88 if (!elv_iosched_allow_merge(rq, bio))
89 return 0; 89 return 0;
90 90
91 return 1; 91 return 1;
92 } 92 }
93 EXPORT_SYMBOL(elv_rq_merge_ok); 93 EXPORT_SYMBOL(elv_rq_merge_ok);
94 94
95 static inline int elv_try_merge(struct request *__rq, struct bio *bio) 95 static inline int elv_try_merge(struct request *__rq, struct bio *bio)
96 { 96 {
97 int ret = ELEVATOR_NO_MERGE; 97 int ret = ELEVATOR_NO_MERGE;
98 98
99 /* 99 /*
100 * we can merge and sequence is ok, check if it's possible 100 * we can merge and sequence is ok, check if it's possible
101 */ 101 */
102 if (elv_rq_merge_ok(__rq, bio)) { 102 if (elv_rq_merge_ok(__rq, bio)) {
103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector) 103 if (__rq->sector + __rq->nr_sectors == bio->bi_sector)
104 ret = ELEVATOR_BACK_MERGE; 104 ret = ELEVATOR_BACK_MERGE;
105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector) 105 else if (__rq->sector - bio_sectors(bio) == bio->bi_sector)
106 ret = ELEVATOR_FRONT_MERGE; 106 ret = ELEVATOR_FRONT_MERGE;
107 } 107 }
108 108
109 return ret; 109 return ret;
110 } 110 }
111 111
112 static struct elevator_type *elevator_find(const char *name) 112 static struct elevator_type *elevator_find(const char *name)
113 { 113 {
114 struct elevator_type *e; 114 struct elevator_type *e;
115 115
116 list_for_each_entry(e, &elv_list, list) { 116 list_for_each_entry(e, &elv_list, list) {
117 if (!strcmp(e->elevator_name, name)) 117 if (!strcmp(e->elevator_name, name))
118 return e; 118 return e;
119 } 119 }
120 120
121 return NULL; 121 return NULL;
122 } 122 }
123 123
124 static void elevator_put(struct elevator_type *e) 124 static void elevator_put(struct elevator_type *e)
125 { 125 {
126 module_put(e->elevator_owner); 126 module_put(e->elevator_owner);
127 } 127 }
128 128
129 static struct elevator_type *elevator_get(const char *name) 129 static struct elevator_type *elevator_get(const char *name)
130 { 130 {
131 struct elevator_type *e; 131 struct elevator_type *e;
132 132
133 spin_lock(&elv_list_lock); 133 spin_lock(&elv_list_lock);
134 134
135 e = elevator_find(name); 135 e = elevator_find(name);
136 if (e && !try_module_get(e->elevator_owner)) 136 if (e && !try_module_get(e->elevator_owner))
137 e = NULL; 137 e = NULL;
138 138
139 spin_unlock(&elv_list_lock); 139 spin_unlock(&elv_list_lock);
140 140
141 return e; 141 return e;
142 } 142 }
143 143
144 static void *elevator_init_queue(struct request_queue *q, 144 static void *elevator_init_queue(struct request_queue *q,
145 struct elevator_queue *eq) 145 struct elevator_queue *eq)
146 { 146 {
147 return eq->ops->elevator_init_fn(q); 147 return eq->ops->elevator_init_fn(q);
148 } 148 }
149 149
150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq, 150 static void elevator_attach(struct request_queue *q, struct elevator_queue *eq,
151 void *data) 151 void *data)
152 { 152 {
153 q->elevator = eq; 153 q->elevator = eq;
154 eq->elevator_data = data; 154 eq->elevator_data = data;
155 } 155 }
156 156
157 static char chosen_elevator[16]; 157 static char chosen_elevator[16];
158 158
159 static int __init elevator_setup(char *str) 159 static int __init elevator_setup(char *str)
160 { 160 {
161 /* 161 /*
162 * Be backwards-compatible with previous kernels, so users 162 * Be backwards-compatible with previous kernels, so users
163 * won't get the wrong elevator. 163 * won't get the wrong elevator.
164 */ 164 */
165 if (!strcmp(str, "as")) 165 if (!strcmp(str, "as"))
166 strcpy(chosen_elevator, "anticipatory"); 166 strcpy(chosen_elevator, "anticipatory");
167 else 167 else
168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1); 168 strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
169 return 1; 169 return 1;
170 } 170 }
171 171
172 __setup("elevator=", elevator_setup); 172 __setup("elevator=", elevator_setup);
173 173
174 static struct kobj_type elv_ktype; 174 static struct kobj_type elv_ktype;
175 175
176 static elevator_t *elevator_alloc(struct request_queue *q, 176 static elevator_t *elevator_alloc(struct request_queue *q,
177 struct elevator_type *e) 177 struct elevator_type *e)
178 { 178 {
179 elevator_t *eq; 179 elevator_t *eq;
180 int i; 180 int i;
181 181
182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node); 182 eq = kmalloc_node(sizeof(elevator_t), GFP_KERNEL | __GFP_ZERO, q->node);
183 if (unlikely(!eq)) 183 if (unlikely(!eq))
184 goto err; 184 goto err;
185 185
186 eq->ops = &e->ops; 186 eq->ops = &e->ops;
187 eq->elevator_type = e; 187 eq->elevator_type = e;
188 kobject_init(&eq->kobj); 188 kobject_init(&eq->kobj);
189 kobject_set_name(&eq->kobj, "%s", "iosched"); 189 kobject_set_name(&eq->kobj, "%s", "iosched");
190 eq->kobj.ktype = &elv_ktype; 190 eq->kobj.ktype = &elv_ktype;
191 mutex_init(&eq->sysfs_lock); 191 mutex_init(&eq->sysfs_lock);
192 192
193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES, 193 eq->hash = kmalloc_node(sizeof(struct hlist_head) * ELV_HASH_ENTRIES,
194 GFP_KERNEL, q->node); 194 GFP_KERNEL, q->node);
195 if (!eq->hash) 195 if (!eq->hash)
196 goto err; 196 goto err;
197 197
198 for (i = 0; i < ELV_HASH_ENTRIES; i++) 198 for (i = 0; i < ELV_HASH_ENTRIES; i++)
199 INIT_HLIST_HEAD(&eq->hash[i]); 199 INIT_HLIST_HEAD(&eq->hash[i]);
200 200
201 return eq; 201 return eq;
202 err: 202 err:
203 kfree(eq); 203 kfree(eq);
204 elevator_put(e); 204 elevator_put(e);
205 return NULL; 205 return NULL;
206 } 206 }
207 207
208 static void elevator_release(struct kobject *kobj) 208 static void elevator_release(struct kobject *kobj)
209 { 209 {
210 elevator_t *e = container_of(kobj, elevator_t, kobj); 210 elevator_t *e = container_of(kobj, elevator_t, kobj);
211 211
212 elevator_put(e->elevator_type); 212 elevator_put(e->elevator_type);
213 kfree(e->hash); 213 kfree(e->hash);
214 kfree(e); 214 kfree(e);
215 } 215 }
216 216
217 int elevator_init(struct request_queue *q, char *name) 217 int elevator_init(struct request_queue *q, char *name)
218 { 218 {
219 struct elevator_type *e = NULL; 219 struct elevator_type *e = NULL;
220 struct elevator_queue *eq; 220 struct elevator_queue *eq;
221 int ret = 0; 221 int ret = 0;
222 void *data; 222 void *data;
223 223
224 INIT_LIST_HEAD(&q->queue_head); 224 INIT_LIST_HEAD(&q->queue_head);
225 q->last_merge = NULL; 225 q->last_merge = NULL;
226 q->end_sector = 0; 226 q->end_sector = 0;
227 q->boundary_rq = NULL; 227 q->boundary_rq = NULL;
228 228
229 if (name && !(e = elevator_get(name))) 229 if (name && !(e = elevator_get(name)))
230 return -EINVAL; 230 return -EINVAL;
231 231
232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator))) 232 if (!e && *chosen_elevator && !(e = elevator_get(chosen_elevator)))
233 printk("I/O scheduler %s not found\n", chosen_elevator); 233 printk("I/O scheduler %s not found\n", chosen_elevator);
234 234
235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) { 235 if (!e && !(e = elevator_get(CONFIG_DEFAULT_IOSCHED))) {
236 printk("Default I/O scheduler not found, using no-op\n"); 236 printk("Default I/O scheduler not found, using no-op\n");
237 e = elevator_get("noop"); 237 e = elevator_get("noop");
238 } 238 }
239 239
240 eq = elevator_alloc(q, e); 240 eq = elevator_alloc(q, e);
241 if (!eq) 241 if (!eq)
242 return -ENOMEM; 242 return -ENOMEM;
243 243
244 data = elevator_init_queue(q, eq); 244 data = elevator_init_queue(q, eq);
245 if (!data) { 245 if (!data) {
246 kobject_put(&eq->kobj); 246 kobject_put(&eq->kobj);
247 return -ENOMEM; 247 return -ENOMEM;
248 } 248 }
249 249
250 elevator_attach(q, eq, data); 250 elevator_attach(q, eq, data);
251 return ret; 251 return ret;
252 } 252 }
253 253
254 EXPORT_SYMBOL(elevator_init); 254 EXPORT_SYMBOL(elevator_init);
255 255
256 void elevator_exit(elevator_t *e) 256 void elevator_exit(elevator_t *e)
257 { 257 {
258 mutex_lock(&e->sysfs_lock); 258 mutex_lock(&e->sysfs_lock);
259 if (e->ops->elevator_exit_fn) 259 if (e->ops->elevator_exit_fn)
260 e->ops->elevator_exit_fn(e); 260 e->ops->elevator_exit_fn(e);
261 e->ops = NULL; 261 e->ops = NULL;
262 mutex_unlock(&e->sysfs_lock); 262 mutex_unlock(&e->sysfs_lock);
263 263
264 kobject_put(&e->kobj); 264 kobject_put(&e->kobj);
265 } 265 }
266 266
267 EXPORT_SYMBOL(elevator_exit); 267 EXPORT_SYMBOL(elevator_exit);
268 268
269 static void elv_activate_rq(struct request_queue *q, struct request *rq) 269 static void elv_activate_rq(struct request_queue *q, struct request *rq)
270 { 270 {
271 elevator_t *e = q->elevator; 271 elevator_t *e = q->elevator;
272 272
273 if (e->ops->elevator_activate_req_fn) 273 if (e->ops->elevator_activate_req_fn)
274 e->ops->elevator_activate_req_fn(q, rq); 274 e->ops->elevator_activate_req_fn(q, rq);
275 } 275 }
276 276
277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq) 277 static void elv_deactivate_rq(struct request_queue *q, struct request *rq)
278 { 278 {
279 elevator_t *e = q->elevator; 279 elevator_t *e = q->elevator;
280 280
281 if (e->ops->elevator_deactivate_req_fn) 281 if (e->ops->elevator_deactivate_req_fn)
282 e->ops->elevator_deactivate_req_fn(q, rq); 282 e->ops->elevator_deactivate_req_fn(q, rq);
283 } 283 }
284 284
285 static inline void __elv_rqhash_del(struct request *rq) 285 static inline void __elv_rqhash_del(struct request *rq)
286 { 286 {
287 hlist_del_init(&rq->hash); 287 hlist_del_init(&rq->hash);
288 } 288 }
289 289
290 static void elv_rqhash_del(struct request_queue *q, struct request *rq) 290 static void elv_rqhash_del(struct request_queue *q, struct request *rq)
291 { 291 {
292 if (ELV_ON_HASH(rq)) 292 if (ELV_ON_HASH(rq))
293 __elv_rqhash_del(rq); 293 __elv_rqhash_del(rq);
294 } 294 }
295 295
296 static void elv_rqhash_add(struct request_queue *q, struct request *rq) 296 static void elv_rqhash_add(struct request_queue *q, struct request *rq)
297 { 297 {
298 elevator_t *e = q->elevator; 298 elevator_t *e = q->elevator;
299 299
300 BUG_ON(ELV_ON_HASH(rq)); 300 BUG_ON(ELV_ON_HASH(rq));
301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]); 301 hlist_add_head(&rq->hash, &e->hash[ELV_HASH_FN(rq_hash_key(rq))]);
302 } 302 }
303 303
304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq) 304 static void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
305 { 305 {
306 __elv_rqhash_del(rq); 306 __elv_rqhash_del(rq);
307 elv_rqhash_add(q, rq); 307 elv_rqhash_add(q, rq);
308 } 308 }
309 309
310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) 310 static struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
311 { 311 {
312 elevator_t *e = q->elevator; 312 elevator_t *e = q->elevator;
313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)]; 313 struct hlist_head *hash_list = &e->hash[ELV_HASH_FN(offset)];
314 struct hlist_node *entry, *next; 314 struct hlist_node *entry, *next;
315 struct request *rq; 315 struct request *rq;
316 316
317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) { 317 hlist_for_each_entry_safe(rq, entry, next, hash_list, hash) {
318 BUG_ON(!ELV_ON_HASH(rq)); 318 BUG_ON(!ELV_ON_HASH(rq));
319 319
320 if (unlikely(!rq_mergeable(rq))) { 320 if (unlikely(!rq_mergeable(rq))) {
321 __elv_rqhash_del(rq); 321 __elv_rqhash_del(rq);
322 continue; 322 continue;
323 } 323 }
324 324
325 if (rq_hash_key(rq) == offset) 325 if (rq_hash_key(rq) == offset)
326 return rq; 326 return rq;
327 } 327 }
328 328
329 return NULL; 329 return NULL;
330 } 330 }
331 331
332 /* 332 /*
333 * RB-tree support functions for inserting/lookup/removal of requests 333 * RB-tree support functions for inserting/lookup/removal of requests
334 * in a sorted RB tree. 334 * in a sorted RB tree.
335 */ 335 */
336 struct request *elv_rb_add(struct rb_root *root, struct request *rq) 336 struct request *elv_rb_add(struct rb_root *root, struct request *rq)
337 { 337 {
338 struct rb_node **p = &root->rb_node; 338 struct rb_node **p = &root->rb_node;
339 struct rb_node *parent = NULL; 339 struct rb_node *parent = NULL;
340 struct request *__rq; 340 struct request *__rq;
341 341
342 while (*p) { 342 while (*p) {
343 parent = *p; 343 parent = *p;
344 __rq = rb_entry(parent, struct request, rb_node); 344 __rq = rb_entry(parent, struct request, rb_node);
345 345
346 if (rq->sector < __rq->sector) 346 if (rq->sector < __rq->sector)
347 p = &(*p)->rb_left; 347 p = &(*p)->rb_left;
348 else if (rq->sector > __rq->sector) 348 else if (rq->sector > __rq->sector)
349 p = &(*p)->rb_right; 349 p = &(*p)->rb_right;
350 else 350 else
351 return __rq; 351 return __rq;
352 } 352 }
353 353
354 rb_link_node(&rq->rb_node, parent, p); 354 rb_link_node(&rq->rb_node, parent, p);
355 rb_insert_color(&rq->rb_node, root); 355 rb_insert_color(&rq->rb_node, root);
356 return NULL; 356 return NULL;
357 } 357 }
358 358
359 EXPORT_SYMBOL(elv_rb_add); 359 EXPORT_SYMBOL(elv_rb_add);
360 360
361 void elv_rb_del(struct rb_root *root, struct request *rq) 361 void elv_rb_del(struct rb_root *root, struct request *rq)
362 { 362 {
363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); 363 BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
364 rb_erase(&rq->rb_node, root); 364 rb_erase(&rq->rb_node, root);
365 RB_CLEAR_NODE(&rq->rb_node); 365 RB_CLEAR_NODE(&rq->rb_node);
366 } 366 }
367 367
368 EXPORT_SYMBOL(elv_rb_del); 368 EXPORT_SYMBOL(elv_rb_del);
369 369
370 struct request *elv_rb_find(struct rb_root *root, sector_t sector) 370 struct request *elv_rb_find(struct rb_root *root, sector_t sector)
371 { 371 {
372 struct rb_node *n = root->rb_node; 372 struct rb_node *n = root->rb_node;
373 struct request *rq; 373 struct request *rq;
374 374
375 while (n) { 375 while (n) {
376 rq = rb_entry(n, struct request, rb_node); 376 rq = rb_entry(n, struct request, rb_node);
377 377
378 if (sector < rq->sector) 378 if (sector < rq->sector)
379 n = n->rb_left; 379 n = n->rb_left;
380 else if (sector > rq->sector) 380 else if (sector > rq->sector)
381 n = n->rb_right; 381 n = n->rb_right;
382 else 382 else
383 return rq; 383 return rq;
384 } 384 }
385 385
386 return NULL; 386 return NULL;
387 } 387 }
388 388
389 EXPORT_SYMBOL(elv_rb_find); 389 EXPORT_SYMBOL(elv_rb_find);
390 390
391 /* 391 /*
392 * Insert rq into dispatch queue of q. Queue lock must be held on 392 * Insert rq into dispatch queue of q. Queue lock must be held on
393 * entry. rq is sort insted into the dispatch queue. To be used by 393 * entry. rq is sort insted into the dispatch queue. To be used by
394 * specific elevators. 394 * specific elevators.
395 */ 395 */
396 void elv_dispatch_sort(struct request_queue *q, struct request *rq) 396 void elv_dispatch_sort(struct request_queue *q, struct request *rq)
397 { 397 {
398 sector_t boundary; 398 sector_t boundary;
399 struct list_head *entry; 399 struct list_head *entry;
400 400
401 if (q->last_merge == rq) 401 if (q->last_merge == rq)
402 q->last_merge = NULL; 402 q->last_merge = NULL;
403 403
404 elv_rqhash_del(q, rq); 404 elv_rqhash_del(q, rq);
405 405
406 q->nr_sorted--; 406 q->nr_sorted--;
407 407
408 boundary = q->end_sector; 408 boundary = q->end_sector;
409 409
410 list_for_each_prev(entry, &q->queue_head) { 410 list_for_each_prev(entry, &q->queue_head) {
411 struct request *pos = list_entry_rq(entry); 411 struct request *pos = list_entry_rq(entry);
412 412
413 if (rq_data_dir(rq) != rq_data_dir(pos)) 413 if (rq_data_dir(rq) != rq_data_dir(pos))
414 break; 414 break;
415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED)) 415 if (pos->cmd_flags & (REQ_SOFTBARRIER|REQ_HARDBARRIER|REQ_STARTED))
416 break; 416 break;
417 if (rq->sector >= boundary) { 417 if (rq->sector >= boundary) {
418 if (pos->sector < boundary) 418 if (pos->sector < boundary)
419 continue; 419 continue;
420 } else { 420 } else {
421 if (pos->sector >= boundary) 421 if (pos->sector >= boundary)
422 break; 422 break;
423 } 423 }
424 if (rq->sector >= pos->sector) 424 if (rq->sector >= pos->sector)
425 break; 425 break;
426 } 426 }
427 427
428 list_add(&rq->queuelist, entry); 428 list_add(&rq->queuelist, entry);
429 } 429 }
430 430
431 EXPORT_SYMBOL(elv_dispatch_sort); 431 EXPORT_SYMBOL(elv_dispatch_sort);
432 432
433 /* 433 /*
434 * Insert rq into dispatch queue of q. Queue lock must be held on 434 * Insert rq into dispatch queue of q. Queue lock must be held on
435 * entry. rq is added to the back of the dispatch queue. To be used by 435 * entry. rq is added to the back of the dispatch queue. To be used by
436 * specific elevators. 436 * specific elevators.
437 */ 437 */
438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq) 438 void elv_dispatch_add_tail(struct request_queue *q, struct request *rq)
439 { 439 {
440 if (q->last_merge == rq) 440 if (q->last_merge == rq)
441 q->last_merge = NULL; 441 q->last_merge = NULL;
442 442
443 elv_rqhash_del(q, rq); 443 elv_rqhash_del(q, rq);
444 444
445 q->nr_sorted--; 445 q->nr_sorted--;
446 446
447 q->end_sector = rq_end_sector(rq); 447 q->end_sector = rq_end_sector(rq);
448 q->boundary_rq = rq; 448 q->boundary_rq = rq;
449 list_add_tail(&rq->queuelist, &q->queue_head); 449 list_add_tail(&rq->queuelist, &q->queue_head);
450 } 450 }
451 451
452 EXPORT_SYMBOL(elv_dispatch_add_tail); 452 EXPORT_SYMBOL(elv_dispatch_add_tail);
453 453
454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio) 454 int elv_merge(struct request_queue *q, struct request **req, struct bio *bio)
455 { 455 {
456 elevator_t *e = q->elevator; 456 elevator_t *e = q->elevator;
457 struct request *__rq; 457 struct request *__rq;
458 int ret; 458 int ret;
459 459
460 /* 460 /*
461 * First try one-hit cache. 461 * First try one-hit cache.
462 */ 462 */
463 if (q->last_merge) { 463 if (q->last_merge) {
464 ret = elv_try_merge(q->last_merge, bio); 464 ret = elv_try_merge(q->last_merge, bio);
465 if (ret != ELEVATOR_NO_MERGE) { 465 if (ret != ELEVATOR_NO_MERGE) {
466 *req = q->last_merge; 466 *req = q->last_merge;
467 return ret; 467 return ret;
468 } 468 }
469 } 469 }
470 470
471 /* 471 /*
472 * See if our hash lookup can find a potential backmerge. 472 * See if our hash lookup can find a potential backmerge.
473 */ 473 */
474 __rq = elv_rqhash_find(q, bio->bi_sector); 474 __rq = elv_rqhash_find(q, bio->bi_sector);
475 if (__rq && elv_rq_merge_ok(__rq, bio)) { 475 if (__rq && elv_rq_merge_ok(__rq, bio)) {
476 *req = __rq; 476 *req = __rq;
477 return ELEVATOR_BACK_MERGE; 477 return ELEVATOR_BACK_MERGE;
478 } 478 }
479 479
480 if (e->ops->elevator_merge_fn) 480 if (e->ops->elevator_merge_fn)
481 return e->ops->elevator_merge_fn(q, req, bio); 481 return e->ops->elevator_merge_fn(q, req, bio);
482 482
483 return ELEVATOR_NO_MERGE; 483 return ELEVATOR_NO_MERGE;
484 } 484 }
485 485
486 void elv_merged_request(struct request_queue *q, struct request *rq, int type) 486 void elv_merged_request(struct request_queue *q, struct request *rq, int type)
487 { 487 {
488 elevator_t *e = q->elevator; 488 elevator_t *e = q->elevator;
489 489
490 if (e->ops->elevator_merged_fn) 490 if (e->ops->elevator_merged_fn)
491 e->ops->elevator_merged_fn(q, rq, type); 491 e->ops->elevator_merged_fn(q, rq, type);
492 492
493 if (type == ELEVATOR_BACK_MERGE) 493 if (type == ELEVATOR_BACK_MERGE)
494 elv_rqhash_reposition(q, rq); 494 elv_rqhash_reposition(q, rq);
495 495
496 q->last_merge = rq; 496 q->last_merge = rq;
497 } 497 }
498 498
499 void elv_merge_requests(struct request_queue *q, struct request *rq, 499 void elv_merge_requests(struct request_queue *q, struct request *rq,
500 struct request *next) 500 struct request *next)
501 { 501 {
502 elevator_t *e = q->elevator; 502 elevator_t *e = q->elevator;
503 503
504 if (e->ops->elevator_merge_req_fn) 504 if (e->ops->elevator_merge_req_fn)
505 e->ops->elevator_merge_req_fn(q, rq, next); 505 e->ops->elevator_merge_req_fn(q, rq, next);
506 506
507 elv_rqhash_reposition(q, rq); 507 elv_rqhash_reposition(q, rq);
508 elv_rqhash_del(q, next); 508 elv_rqhash_del(q, next);
509 509
510 q->nr_sorted--; 510 q->nr_sorted--;
511 q->last_merge = rq; 511 q->last_merge = rq;
512 } 512 }
513 513
514 void elv_requeue_request(struct request_queue *q, struct request *rq) 514 void elv_requeue_request(struct request_queue *q, struct request *rq)
515 { 515 {
516 /* 516 /*
517 * it already went through dequeue, we need to decrement the 517 * it already went through dequeue, we need to decrement the
518 * in_flight count again 518 * in_flight count again
519 */ 519 */
520 if (blk_account_rq(rq)) { 520 if (blk_account_rq(rq)) {
521 q->in_flight--; 521 q->in_flight--;
522 if (blk_sorted_rq(rq)) 522 if (blk_sorted_rq(rq))
523 elv_deactivate_rq(q, rq); 523 elv_deactivate_rq(q, rq);
524 } 524 }
525 525
526 rq->cmd_flags &= ~REQ_STARTED; 526 rq->cmd_flags &= ~REQ_STARTED;
527 527
528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE); 528 elv_insert(q, rq, ELEVATOR_INSERT_REQUEUE);
529 } 529 }
530 530
531 static void elv_drain_elevator(struct request_queue *q) 531 static void elv_drain_elevator(struct request_queue *q)
532 { 532 {
533 static int printed; 533 static int printed;
534 while (q->elevator->ops->elevator_dispatch_fn(q, 1)) 534 while (q->elevator->ops->elevator_dispatch_fn(q, 1))
535 ; 535 ;
536 if (q->nr_sorted == 0) 536 if (q->nr_sorted == 0)
537 return; 537 return;
538 if (printed++ < 10) { 538 if (printed++ < 10) {
539 printk(KERN_ERR "%s: forced dispatching is broken " 539 printk(KERN_ERR "%s: forced dispatching is broken "
540 "(nr_sorted=%u), please report this\n", 540 "(nr_sorted=%u), please report this\n",
541 q->elevator->elevator_type->elevator_name, q->nr_sorted); 541 q->elevator->elevator_type->elevator_name, q->nr_sorted);
542 } 542 }
543 } 543 }
544 544
545 void elv_insert(struct request_queue *q, struct request *rq, int where) 545 void elv_insert(struct request_queue *q, struct request *rq, int where)
546 { 546 {
547 struct list_head *pos; 547 struct list_head *pos;
548 unsigned ordseq; 548 unsigned ordseq;
549 int unplug_it = 1; 549 int unplug_it = 1;
550 550
551 blk_add_trace_rq(q, rq, BLK_TA_INSERT); 551 blk_add_trace_rq(q, rq, BLK_TA_INSERT);
552 552
553 rq->q = q; 553 rq->q = q;
554 554
555 switch (where) { 555 switch (where) {
556 case ELEVATOR_INSERT_FRONT: 556 case ELEVATOR_INSERT_FRONT:
557 rq->cmd_flags |= REQ_SOFTBARRIER; 557 rq->cmd_flags |= REQ_SOFTBARRIER;
558 558
559 list_add(&rq->queuelist, &q->queue_head); 559 list_add(&rq->queuelist, &q->queue_head);
560 break; 560 break;
561 561
562 case ELEVATOR_INSERT_BACK: 562 case ELEVATOR_INSERT_BACK:
563 rq->cmd_flags |= REQ_SOFTBARRIER; 563 rq->cmd_flags |= REQ_SOFTBARRIER;
564 elv_drain_elevator(q); 564 elv_drain_elevator(q);
565 list_add_tail(&rq->queuelist, &q->queue_head); 565 list_add_tail(&rq->queuelist, &q->queue_head);
566 /* 566 /*
567 * We kick the queue here for the following reasons. 567 * We kick the queue here for the following reasons.
568 * - The elevator might have returned NULL previously 568 * - The elevator might have returned NULL previously
569 * to delay requests and returned them now. As the 569 * to delay requests and returned them now. As the
570 * queue wasn't empty before this request, ll_rw_blk 570 * queue wasn't empty before this request, ll_rw_blk
571 * won't run the queue on return, resulting in hang. 571 * won't run the queue on return, resulting in hang.
572 * - Usually, back inserted requests won't be merged 572 * - Usually, back inserted requests won't be merged
573 * with anything. There's no point in delaying queue 573 * with anything. There's no point in delaying queue
574 * processing. 574 * processing.
575 */ 575 */
576 blk_remove_plug(q); 576 blk_remove_plug(q);
577 q->request_fn(q); 577 q->request_fn(q);
578 break; 578 break;
579 579
580 case ELEVATOR_INSERT_SORT: 580 case ELEVATOR_INSERT_SORT:
581 BUG_ON(!blk_fs_request(rq)); 581 BUG_ON(!blk_fs_request(rq));
582 rq->cmd_flags |= REQ_SORTED; 582 rq->cmd_flags |= REQ_SORTED;
583 q->nr_sorted++; 583 q->nr_sorted++;
584 if (rq_mergeable(rq)) { 584 if (rq_mergeable(rq)) {
585 elv_rqhash_add(q, rq); 585 elv_rqhash_add(q, rq);
586 if (!q->last_merge) 586 if (!q->last_merge)
587 q->last_merge = rq; 587 q->last_merge = rq;
588 } 588 }
589 589
590 /* 590 /*
591 * Some ioscheds (cfq) run q->request_fn directly, so 591 * Some ioscheds (cfq) run q->request_fn directly, so
592 * rq cannot be accessed after calling 592 * rq cannot be accessed after calling
593 * elevator_add_req_fn. 593 * elevator_add_req_fn.
594 */ 594 */
595 q->elevator->ops->elevator_add_req_fn(q, rq); 595 q->elevator->ops->elevator_add_req_fn(q, rq);
596 break; 596 break;
597 597
598 case ELEVATOR_INSERT_REQUEUE: 598 case ELEVATOR_INSERT_REQUEUE:
599 /* 599 /*
600 * If ordered flush isn't in progress, we do front 600 * If ordered flush isn't in progress, we do front
601 * insertion; otherwise, requests should be requeued 601 * insertion; otherwise, requests should be requeued
602 * in ordseq order. 602 * in ordseq order.
603 */ 603 */
604 rq->cmd_flags |= REQ_SOFTBARRIER; 604 rq->cmd_flags |= REQ_SOFTBARRIER;
605 605
606 /* 606 /*
607 * Most requeues happen because of a busy condition, 607 * Most requeues happen because of a busy condition,
608 * don't force unplug of the queue for that case. 608 * don't force unplug of the queue for that case.
609 */ 609 */
610 unplug_it = 0; 610 unplug_it = 0;
611 611
612 if (q->ordseq == 0) { 612 if (q->ordseq == 0) {
613 list_add(&rq->queuelist, &q->queue_head); 613 list_add(&rq->queuelist, &q->queue_head);
614 break; 614 break;
615 } 615 }
616 616
617 ordseq = blk_ordered_req_seq(rq); 617 ordseq = blk_ordered_req_seq(rq);
618 618
619 list_for_each(pos, &q->queue_head) { 619 list_for_each(pos, &q->queue_head) {
620 struct request *pos_rq = list_entry_rq(pos); 620 struct request *pos_rq = list_entry_rq(pos);
621 if (ordseq <= blk_ordered_req_seq(pos_rq)) 621 if (ordseq <= blk_ordered_req_seq(pos_rq))
622 break; 622 break;
623 } 623 }
624 624
625 list_add_tail(&rq->queuelist, pos); 625 list_add_tail(&rq->queuelist, pos);
626 break; 626 break;
627 627
628 default: 628 default:
629 printk(KERN_ERR "%s: bad insertion point %d\n", 629 printk(KERN_ERR "%s: bad insertion point %d\n",
630 __FUNCTION__, where); 630 __FUNCTION__, where);
631 BUG(); 631 BUG();
632 } 632 }
633 633
634 if (unplug_it && blk_queue_plugged(q)) { 634 if (unplug_it && blk_queue_plugged(q)) {
635 int nrq = q->rq.count[READ] + q->rq.count[WRITE] 635 int nrq = q->rq.count[READ] + q->rq.count[WRITE]
636 - q->in_flight; 636 - q->in_flight;
637 637
638 if (nrq >= q->unplug_thresh) 638 if (nrq >= q->unplug_thresh)
639 __generic_unplug_device(q); 639 __generic_unplug_device(q);
640 } 640 }
641 } 641 }
642 642
643 void __elv_add_request(struct request_queue *q, struct request *rq, int where, 643 void __elv_add_request(struct request_queue *q, struct request *rq, int where,
644 int plug) 644 int plug)
645 { 645 {
646 if (q->ordcolor) 646 if (q->ordcolor)
647 rq->cmd_flags |= REQ_ORDERED_COLOR; 647 rq->cmd_flags |= REQ_ORDERED_COLOR;
648 648
649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) { 649 if (rq->cmd_flags & (REQ_SOFTBARRIER | REQ_HARDBARRIER)) {
650 /* 650 /*
651 * toggle ordered color 651 * toggle ordered color
652 */ 652 */
653 if (blk_barrier_rq(rq)) 653 if (blk_barrier_rq(rq))
654 q->ordcolor ^= 1; 654 q->ordcolor ^= 1;
655 655
656 /* 656 /*
657 * barriers implicitly indicate back insertion 657 * barriers implicitly indicate back insertion
658 */ 658 */
659 if (where == ELEVATOR_INSERT_SORT) 659 if (where == ELEVATOR_INSERT_SORT)
660 where = ELEVATOR_INSERT_BACK; 660 where = ELEVATOR_INSERT_BACK;
661 661
662 /* 662 /*
663 * this request is scheduling boundary, update 663 * this request is scheduling boundary, update
664 * end_sector 664 * end_sector
665 */ 665 */
666 if (blk_fs_request(rq)) { 666 if (blk_fs_request(rq)) {
667 q->end_sector = rq_end_sector(rq); 667 q->end_sector = rq_end_sector(rq);
668 q->boundary_rq = rq; 668 q->boundary_rq = rq;
669 } 669 }
670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT) 670 } else if (!(rq->cmd_flags & REQ_ELVPRIV) && where == ELEVATOR_INSERT_SORT)
671 where = ELEVATOR_INSERT_BACK; 671 where = ELEVATOR_INSERT_BACK;
672 672
673 if (plug) 673 if (plug)
674 blk_plug_device(q); 674 blk_plug_device(q);
675 675
676 elv_insert(q, rq, where); 676 elv_insert(q, rq, where);
677 } 677 }
678 678
679 EXPORT_SYMBOL(__elv_add_request); 679 EXPORT_SYMBOL(__elv_add_request);
680 680
681 void elv_add_request(struct request_queue *q, struct request *rq, int where, 681 void elv_add_request(struct request_queue *q, struct request *rq, int where,
682 int plug) 682 int plug)
683 { 683 {
684 unsigned long flags; 684 unsigned long flags;
685 685
686 spin_lock_irqsave(q->queue_lock, flags); 686 spin_lock_irqsave(q->queue_lock, flags);
687 __elv_add_request(q, rq, where, plug); 687 __elv_add_request(q, rq, where, plug);
688 spin_unlock_irqrestore(q->queue_lock, flags); 688 spin_unlock_irqrestore(q->queue_lock, flags);
689 } 689 }
690 690
691 EXPORT_SYMBOL(elv_add_request); 691 EXPORT_SYMBOL(elv_add_request);
692 692
693 static inline struct request *__elv_next_request(struct request_queue *q) 693 static inline struct request *__elv_next_request(struct request_queue *q)
694 { 694 {
695 struct request *rq; 695 struct request *rq;
696 696
697 while (1) { 697 while (1) {
698 while (!list_empty(&q->queue_head)) { 698 while (!list_empty(&q->queue_head)) {
699 rq = list_entry_rq(q->queue_head.next); 699 rq = list_entry_rq(q->queue_head.next);
700 if (blk_do_ordered(q, &rq)) 700 if (blk_do_ordered(q, &rq))
701 return rq; 701 return rq;
702 } 702 }
703 703
704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) 704 if (!q->elevator->ops->elevator_dispatch_fn(q, 0))
705 return NULL; 705 return NULL;
706 } 706 }
707 } 707 }
708 708
709 struct request *elv_next_request(struct request_queue *q) 709 struct request *elv_next_request(struct request_queue *q)
710 { 710 {
711 struct request *rq; 711 struct request *rq;
712 int ret; 712 int ret;
713 713
714 while ((rq = __elv_next_request(q)) != NULL) { 714 while ((rq = __elv_next_request(q)) != NULL) {
715 /*
716 * Kill the empty barrier place holder, the driver must
717 * not ever see it.
718 */
719 if (blk_empty_barrier(rq)) {
720 end_queued_request(rq, 1);
721 continue;
722 }
715 if (!(rq->cmd_flags & REQ_STARTED)) { 723 if (!(rq->cmd_flags & REQ_STARTED)) {
716 /* 724 /*
717 * This is the first time the device driver 725 * This is the first time the device driver
718 * sees this request (possibly after 726 * sees this request (possibly after
719 * requeueing). Notify IO scheduler. 727 * requeueing). Notify IO scheduler.
720 */ 728 */
721 if (blk_sorted_rq(rq)) 729 if (blk_sorted_rq(rq))
722 elv_activate_rq(q, rq); 730 elv_activate_rq(q, rq);
723 731
724 /* 732 /*
725 * just mark as started even if we don't start 733 * just mark as started even if we don't start
726 * it, a request that has been delayed should 734 * it, a request that has been delayed should
727 * not be passed by new incoming requests 735 * not be passed by new incoming requests
728 */ 736 */
729 rq->cmd_flags |= REQ_STARTED; 737 rq->cmd_flags |= REQ_STARTED;
730 blk_add_trace_rq(q, rq, BLK_TA_ISSUE); 738 blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
731 } 739 }
732 740
733 if (!q->boundary_rq || q->boundary_rq == rq) { 741 if (!q->boundary_rq || q->boundary_rq == rq) {
734 q->end_sector = rq_end_sector(rq); 742 q->end_sector = rq_end_sector(rq);
735 q->boundary_rq = NULL; 743 q->boundary_rq = NULL;
736 } 744 }
737 745
738 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn) 746 if ((rq->cmd_flags & REQ_DONTPREP) || !q->prep_rq_fn)
739 break; 747 break;
740 748
741 ret = q->prep_rq_fn(q, rq); 749 ret = q->prep_rq_fn(q, rq);
742 if (ret == BLKPREP_OK) { 750 if (ret == BLKPREP_OK) {
743 break; 751 break;
744 } else if (ret == BLKPREP_DEFER) { 752 } else if (ret == BLKPREP_DEFER) {
745 /* 753 /*
746 * the request may have been (partially) prepped. 754 * the request may have been (partially) prepped.
747 * we need to keep this request in the front to 755 * we need to keep this request in the front to
748 * avoid resource deadlock. REQ_STARTED will 756 * avoid resource deadlock. REQ_STARTED will
749 * prevent other fs requests from passing this one. 757 * prevent other fs requests from passing this one.
750 */ 758 */
751 rq = NULL; 759 rq = NULL;
752 break; 760 break;
753 } else if (ret == BLKPREP_KILL) { 761 } else if (ret == BLKPREP_KILL) {
754 rq->cmd_flags |= REQ_QUIET; 762 rq->cmd_flags |= REQ_QUIET;
755 end_queued_request(rq, 0); 763 end_queued_request(rq, 0);
756 } else { 764 } else {
757 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__, 765 printk(KERN_ERR "%s: bad return=%d\n", __FUNCTION__,
758 ret); 766 ret);
759 break; 767 break;
760 } 768 }
761 } 769 }
762 770
763 return rq; 771 return rq;
764 } 772 }
765 773
766 EXPORT_SYMBOL(elv_next_request); 774 EXPORT_SYMBOL(elv_next_request);
767 775
768 void elv_dequeue_request(struct request_queue *q, struct request *rq) 776 void elv_dequeue_request(struct request_queue *q, struct request *rq)
769 { 777 {
770 BUG_ON(list_empty(&rq->queuelist)); 778 BUG_ON(list_empty(&rq->queuelist));
771 BUG_ON(ELV_ON_HASH(rq)); 779 BUG_ON(ELV_ON_HASH(rq));
772 780
773 list_del_init(&rq->queuelist); 781 list_del_init(&rq->queuelist);
774 782
775 /* 783 /*
776 * the time frame between a request being removed from the lists 784 * the time frame between a request being removed from the lists
777 * and to it is freed is accounted as io that is in progress at 785 * and to it is freed is accounted as io that is in progress at
778 * the driver side. 786 * the driver side.
779 */ 787 */
780 if (blk_account_rq(rq)) 788 if (blk_account_rq(rq))
781 q->in_flight++; 789 q->in_flight++;
782 } 790 }
783 791
784 EXPORT_SYMBOL(elv_dequeue_request); 792 EXPORT_SYMBOL(elv_dequeue_request);
785 793
786 int elv_queue_empty(struct request_queue *q) 794 int elv_queue_empty(struct request_queue *q)
787 { 795 {
788 elevator_t *e = q->elevator; 796 elevator_t *e = q->elevator;
789 797
790 if (!list_empty(&q->queue_head)) 798 if (!list_empty(&q->queue_head))
791 return 0; 799 return 0;
792 800
793 if (e->ops->elevator_queue_empty_fn) 801 if (e->ops->elevator_queue_empty_fn)
794 return e->ops->elevator_queue_empty_fn(q); 802 return e->ops->elevator_queue_empty_fn(q);
795 803
796 return 1; 804 return 1;
797 } 805 }
798 806
799 EXPORT_SYMBOL(elv_queue_empty); 807 EXPORT_SYMBOL(elv_queue_empty);
800 808
801 struct request *elv_latter_request(struct request_queue *q, struct request *rq) 809 struct request *elv_latter_request(struct request_queue *q, struct request *rq)
802 { 810 {
803 elevator_t *e = q->elevator; 811 elevator_t *e = q->elevator;
804 812
805 if (e->ops->elevator_latter_req_fn) 813 if (e->ops->elevator_latter_req_fn)
806 return e->ops->elevator_latter_req_fn(q, rq); 814 return e->ops->elevator_latter_req_fn(q, rq);
807 return NULL; 815 return NULL;
808 } 816 }
809 817
810 struct request *elv_former_request(struct request_queue *q, struct request *rq) 818 struct request *elv_former_request(struct request_queue *q, struct request *rq)
811 { 819 {
812 elevator_t *e = q->elevator; 820 elevator_t *e = q->elevator;
813 821
814 if (e->ops->elevator_former_req_fn) 822 if (e->ops->elevator_former_req_fn)
815 return e->ops->elevator_former_req_fn(q, rq); 823 return e->ops->elevator_former_req_fn(q, rq);
816 return NULL; 824 return NULL;
817 } 825 }
818 826
819 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask) 827 int elv_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
820 { 828 {
821 elevator_t *e = q->elevator; 829 elevator_t *e = q->elevator;
822 830
823 if (e->ops->elevator_set_req_fn) 831 if (e->ops->elevator_set_req_fn)
824 return e->ops->elevator_set_req_fn(q, rq, gfp_mask); 832 return e->ops->elevator_set_req_fn(q, rq, gfp_mask);
825 833
826 rq->elevator_private = NULL; 834 rq->elevator_private = NULL;
827 return 0; 835 return 0;
828 } 836 }
829 837
830 void elv_put_request(struct request_queue *q, struct request *rq) 838 void elv_put_request(struct request_queue *q, struct request *rq)
831 { 839 {
832 elevator_t *e = q->elevator; 840 elevator_t *e = q->elevator;
833 841
834 if (e->ops->elevator_put_req_fn) 842 if (e->ops->elevator_put_req_fn)
835 e->ops->elevator_put_req_fn(rq); 843 e->ops->elevator_put_req_fn(rq);
836 } 844 }
837 845
838 int elv_may_queue(struct request_queue *q, int rw) 846 int elv_may_queue(struct request_queue *q, int rw)
839 { 847 {
840 elevator_t *e = q->elevator; 848 elevator_t *e = q->elevator;
841 849
842 if (e->ops->elevator_may_queue_fn) 850 if (e->ops->elevator_may_queue_fn)
843 return e->ops->elevator_may_queue_fn(q, rw); 851 return e->ops->elevator_may_queue_fn(q, rw);
844 852
845 return ELV_MQUEUE_MAY; 853 return ELV_MQUEUE_MAY;
846 } 854 }
847 855
848 void elv_completed_request(struct request_queue *q, struct request *rq) 856 void elv_completed_request(struct request_queue *q, struct request *rq)
849 { 857 {
850 elevator_t *e = q->elevator; 858 elevator_t *e = q->elevator;
851 859
852 /* 860 /*
853 * request is released from the driver, io must be done 861 * request is released from the driver, io must be done
854 */ 862 */
855 if (blk_account_rq(rq)) { 863 if (blk_account_rq(rq)) {
856 q->in_flight--; 864 q->in_flight--;
857 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) 865 if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn)
858 e->ops->elevator_completed_req_fn(q, rq); 866 e->ops->elevator_completed_req_fn(q, rq);
859 } 867 }
860 868
861 /* 869 /*
862 * Check if the queue is waiting for fs requests to be 870 * Check if the queue is waiting for fs requests to be
863 * drained for flush sequence. 871 * drained for flush sequence.
864 */ 872 */
865 if (unlikely(q->ordseq)) { 873 if (unlikely(q->ordseq)) {
866 struct request *first_rq = list_entry_rq(q->queue_head.next); 874 struct request *first_rq = list_entry_rq(q->queue_head.next);
867 if (q->in_flight == 0 && 875 if (q->in_flight == 0 &&
868 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && 876 blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN &&
869 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) { 877 blk_ordered_req_seq(first_rq) > QUEUE_ORDSEQ_DRAIN) {
870 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); 878 blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0);
871 q->request_fn(q); 879 q->request_fn(q);
872 } 880 }
873 } 881 }
874 } 882 }
875 883
876 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) 884 #define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)
877 885
878 static ssize_t 886 static ssize_t
879 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 887 elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
880 { 888 {
881 elevator_t *e = container_of(kobj, elevator_t, kobj); 889 elevator_t *e = container_of(kobj, elevator_t, kobj);
882 struct elv_fs_entry *entry = to_elv(attr); 890 struct elv_fs_entry *entry = to_elv(attr);
883 ssize_t error; 891 ssize_t error;
884 892
885 if (!entry->show) 893 if (!entry->show)
886 return -EIO; 894 return -EIO;
887 895
888 mutex_lock(&e->sysfs_lock); 896 mutex_lock(&e->sysfs_lock);
889 error = e->ops ? entry->show(e, page) : -ENOENT; 897 error = e->ops ? entry->show(e, page) : -ENOENT;
890 mutex_unlock(&e->sysfs_lock); 898 mutex_unlock(&e->sysfs_lock);
891 return error; 899 return error;
892 } 900 }
893 901
894 static ssize_t 902 static ssize_t
895 elv_attr_store(struct kobject *kobj, struct attribute *attr, 903 elv_attr_store(struct kobject *kobj, struct attribute *attr,
896 const char *page, size_t length) 904 const char *page, size_t length)
897 { 905 {
898 elevator_t *e = container_of(kobj, elevator_t, kobj); 906 elevator_t *e = container_of(kobj, elevator_t, kobj);
899 struct elv_fs_entry *entry = to_elv(attr); 907 struct elv_fs_entry *entry = to_elv(attr);
900 ssize_t error; 908 ssize_t error;
901 909
902 if (!entry->store) 910 if (!entry->store)
903 return -EIO; 911 return -EIO;
904 912
905 mutex_lock(&e->sysfs_lock); 913 mutex_lock(&e->sysfs_lock);
906 error = e->ops ? entry->store(e, page, length) : -ENOENT; 914 error = e->ops ? entry->store(e, page, length) : -ENOENT;
907 mutex_unlock(&e->sysfs_lock); 915 mutex_unlock(&e->sysfs_lock);
908 return error; 916 return error;
909 } 917 }
910 918
911 static struct sysfs_ops elv_sysfs_ops = { 919 static struct sysfs_ops elv_sysfs_ops = {
912 .show = elv_attr_show, 920 .show = elv_attr_show,
913 .store = elv_attr_store, 921 .store = elv_attr_store,
914 }; 922 };
915 923
916 static struct kobj_type elv_ktype = { 924 static struct kobj_type elv_ktype = {
917 .sysfs_ops = &elv_sysfs_ops, 925 .sysfs_ops = &elv_sysfs_ops,
918 .release = elevator_release, 926 .release = elevator_release,
919 }; 927 };
920 928
921 int elv_register_queue(struct request_queue *q) 929 int elv_register_queue(struct request_queue *q)
922 { 930 {
923 elevator_t *e = q->elevator; 931 elevator_t *e = q->elevator;
924 int error; 932 int error;
925 933
926 e->kobj.parent = &q->kobj; 934 e->kobj.parent = &q->kobj;
927 935
928 error = kobject_add(&e->kobj); 936 error = kobject_add(&e->kobj);
929 if (!error) { 937 if (!error) {
930 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs; 938 struct elv_fs_entry *attr = e->elevator_type->elevator_attrs;
931 if (attr) { 939 if (attr) {
932 while (attr->attr.name) { 940 while (attr->attr.name) {
933 if (sysfs_create_file(&e->kobj, &attr->attr)) 941 if (sysfs_create_file(&e->kobj, &attr->attr))
934 break; 942 break;
935 attr++; 943 attr++;
936 } 944 }
937 } 945 }
938 kobject_uevent(&e->kobj, KOBJ_ADD); 946 kobject_uevent(&e->kobj, KOBJ_ADD);
939 } 947 }
940 return error; 948 return error;
941 } 949 }
942 950
943 static void __elv_unregister_queue(elevator_t *e) 951 static void __elv_unregister_queue(elevator_t *e)
944 { 952 {
945 kobject_uevent(&e->kobj, KOBJ_REMOVE); 953 kobject_uevent(&e->kobj, KOBJ_REMOVE);
946 kobject_del(&e->kobj); 954 kobject_del(&e->kobj);
947 } 955 }
948 956
949 void elv_unregister_queue(struct request_queue *q) 957 void elv_unregister_queue(struct request_queue *q)
950 { 958 {
951 if (q) 959 if (q)
952 __elv_unregister_queue(q->elevator); 960 __elv_unregister_queue(q->elevator);
953 } 961 }
954 962
955 int elv_register(struct elevator_type *e) 963 int elv_register(struct elevator_type *e)
956 { 964 {
957 char *def = ""; 965 char *def = "";
958 966
959 spin_lock(&elv_list_lock); 967 spin_lock(&elv_list_lock);
960 BUG_ON(elevator_find(e->elevator_name)); 968 BUG_ON(elevator_find(e->elevator_name));
961 list_add_tail(&e->list, &elv_list); 969 list_add_tail(&e->list, &elv_list);
962 spin_unlock(&elv_list_lock); 970 spin_unlock(&elv_list_lock);
963 971
964 if (!strcmp(e->elevator_name, chosen_elevator) || 972 if (!strcmp(e->elevator_name, chosen_elevator) ||
965 (!*chosen_elevator && 973 (!*chosen_elevator &&
966 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED))) 974 !strcmp(e->elevator_name, CONFIG_DEFAULT_IOSCHED)))
967 def = " (default)"; 975 def = " (default)";
968 976
969 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def); 977 printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name, def);
970 return 0; 978 return 0;
971 } 979 }
972 EXPORT_SYMBOL_GPL(elv_register); 980 EXPORT_SYMBOL_GPL(elv_register);
973 981
974 void elv_unregister(struct elevator_type *e) 982 void elv_unregister(struct elevator_type *e)
975 { 983 {
976 struct task_struct *g, *p; 984 struct task_struct *g, *p;
977 985
978 /* 986 /*
979 * Iterate every thread in the process to remove the io contexts. 987 * Iterate every thread in the process to remove the io contexts.
980 */ 988 */
981 if (e->ops.trim) { 989 if (e->ops.trim) {
982 read_lock(&tasklist_lock); 990 read_lock(&tasklist_lock);
983 do_each_thread(g, p) { 991 do_each_thread(g, p) {
984 task_lock(p); 992 task_lock(p);
985 if (p->io_context) 993 if (p->io_context)
986 e->ops.trim(p->io_context); 994 e->ops.trim(p->io_context);
987 task_unlock(p); 995 task_unlock(p);
988 } while_each_thread(g, p); 996 } while_each_thread(g, p);
989 read_unlock(&tasklist_lock); 997 read_unlock(&tasklist_lock);
990 } 998 }
991 999
992 spin_lock(&elv_list_lock); 1000 spin_lock(&elv_list_lock);
993 list_del_init(&e->list); 1001 list_del_init(&e->list);
994 spin_unlock(&elv_list_lock); 1002 spin_unlock(&elv_list_lock);
995 } 1003 }
996 EXPORT_SYMBOL_GPL(elv_unregister); 1004 EXPORT_SYMBOL_GPL(elv_unregister);
997 1005
998 /* 1006 /*
999 * switch to new_e io scheduler. be careful not to introduce deadlocks - 1007 * switch to new_e io scheduler. be careful not to introduce deadlocks -
1000 * we don't free the old io scheduler, before we have allocated what we 1008 * we don't free the old io scheduler, before we have allocated what we
1001 * need for the new one. this way we have a chance of going back to the old 1009 * need for the new one. this way we have a chance of going back to the old
1002 * one, if the new one fails init for some reason. 1010 * one, if the new one fails init for some reason.
1003 */ 1011 */
1004 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) 1012 static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
1005 { 1013 {
1006 elevator_t *old_elevator, *e; 1014 elevator_t *old_elevator, *e;
1007 void *data; 1015 void *data;
1008 1016
1009 /* 1017 /*
1010 * Allocate new elevator 1018 * Allocate new elevator
1011 */ 1019 */
1012 e = elevator_alloc(q, new_e); 1020 e = elevator_alloc(q, new_e);
1013 if (!e) 1021 if (!e)
1014 return 0; 1022 return 0;
1015 1023
1016 data = elevator_init_queue(q, e); 1024 data = elevator_init_queue(q, e);
1017 if (!data) { 1025 if (!data) {
1018 kobject_put(&e->kobj); 1026 kobject_put(&e->kobj);
1019 return 0; 1027 return 0;
1020 } 1028 }
1021 1029
1022 /* 1030 /*
1023 * Turn on BYPASS and drain all requests w/ elevator private data 1031 * Turn on BYPASS and drain all requests w/ elevator private data
1024 */ 1032 */
1025 spin_lock_irq(q->queue_lock); 1033 spin_lock_irq(q->queue_lock);
1026 1034
1027 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1035 set_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1028 1036
1029 elv_drain_elevator(q); 1037 elv_drain_elevator(q);
1030 1038
1031 while (q->rq.elvpriv) { 1039 while (q->rq.elvpriv) {
1032 blk_remove_plug(q); 1040 blk_remove_plug(q);
1033 q->request_fn(q); 1041 q->request_fn(q);
1034 spin_unlock_irq(q->queue_lock); 1042 spin_unlock_irq(q->queue_lock);
1035 msleep(10); 1043 msleep(10);
1036 spin_lock_irq(q->queue_lock); 1044 spin_lock_irq(q->queue_lock);
1037 elv_drain_elevator(q); 1045 elv_drain_elevator(q);
1038 } 1046 }
1039 1047
1040 /* 1048 /*
1041 * Remember old elevator. 1049 * Remember old elevator.
1042 */ 1050 */
1043 old_elevator = q->elevator; 1051 old_elevator = q->elevator;
1044 1052
1045 /* 1053 /*
1046 * attach and start new elevator 1054 * attach and start new elevator
1047 */ 1055 */
1048 elevator_attach(q, e, data); 1056 elevator_attach(q, e, data);
1049 1057
1050 spin_unlock_irq(q->queue_lock); 1058 spin_unlock_irq(q->queue_lock);
1051 1059
1052 __elv_unregister_queue(old_elevator); 1060 __elv_unregister_queue(old_elevator);
1053 1061
1054 if (elv_register_queue(q)) 1062 if (elv_register_queue(q))
1055 goto fail_register; 1063 goto fail_register;
1056 1064
1057 /* 1065 /*
1058 * finally exit old elevator and turn off BYPASS. 1066 * finally exit old elevator and turn off BYPASS.
1059 */ 1067 */
1060 elevator_exit(old_elevator); 1068 elevator_exit(old_elevator);
1061 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1069 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1062 return 1; 1070 return 1;
1063 1071
1064 fail_register: 1072 fail_register:
1065 /* 1073 /*
1066 * switch failed, exit the new io scheduler and reattach the old 1074 * switch failed, exit the new io scheduler and reattach the old
1067 * one again (along with re-adding the sysfs dir) 1075 * one again (along with re-adding the sysfs dir)
1068 */ 1076 */
1069 elevator_exit(e); 1077 elevator_exit(e);
1070 q->elevator = old_elevator; 1078 q->elevator = old_elevator;
1071 elv_register_queue(q); 1079 elv_register_queue(q);
1072 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 1080 clear_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
1073 return 0; 1081 return 0;
1074 } 1082 }
1075 1083
1076 ssize_t elv_iosched_store(struct request_queue *q, const char *name, 1084 ssize_t elv_iosched_store(struct request_queue *q, const char *name,
1077 size_t count) 1085 size_t count)
1078 { 1086 {
1079 char elevator_name[ELV_NAME_MAX]; 1087 char elevator_name[ELV_NAME_MAX];
1080 size_t len; 1088 size_t len;
1081 struct elevator_type *e; 1089 struct elevator_type *e;
1082 1090
1083 elevator_name[sizeof(elevator_name) - 1] = '\0'; 1091 elevator_name[sizeof(elevator_name) - 1] = '\0';
1084 strncpy(elevator_name, name, sizeof(elevator_name) - 1); 1092 strncpy(elevator_name, name, sizeof(elevator_name) - 1);
1085 len = strlen(elevator_name); 1093 len = strlen(elevator_name);
1086 1094
1087 if (len && elevator_name[len - 1] == '\n') 1095 if (len && elevator_name[len - 1] == '\n')
1088 elevator_name[len - 1] = '\0'; 1096 elevator_name[len - 1] = '\0';
1089 1097
1090 e = elevator_get(elevator_name); 1098 e = elevator_get(elevator_name);
1091 if (!e) { 1099 if (!e) {
1092 printk(KERN_ERR "elevator: type %s not found\n", elevator_name); 1100 printk(KERN_ERR "elevator: type %s not found\n", elevator_name);
1093 return -EINVAL; 1101 return -EINVAL;
1094 } 1102 }
1095 1103
1096 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) { 1104 if (!strcmp(elevator_name, q->elevator->elevator_type->elevator_name)) {
1097 elevator_put(e); 1105 elevator_put(e);
1098 return count; 1106 return count;
1099 } 1107 }
1100 1108
1101 if (!elevator_switch(q, e)) 1109 if (!elevator_switch(q, e))
1102 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name); 1110 printk(KERN_ERR "elevator: switch to %s failed\n",elevator_name);
1103 return count; 1111 return count;
1104 } 1112 }
1105 1113
1106 ssize_t elv_iosched_show(struct request_queue *q, char *name) 1114 ssize_t elv_iosched_show(struct request_queue *q, char *name)
1107 { 1115 {
1108 elevator_t *e = q->elevator; 1116 elevator_t *e = q->elevator;
1109 struct elevator_type *elv = e->elevator_type; 1117 struct elevator_type *elv = e->elevator_type;
1110 struct elevator_type *__e; 1118 struct elevator_type *__e;
1111 int len = 0; 1119 int len = 0;
1112 1120
1113 spin_lock(&elv_list_lock); 1121 spin_lock(&elv_list_lock);
1114 list_for_each_entry(__e, &elv_list, list) { 1122 list_for_each_entry(__e, &elv_list, list) {
1115 if (!strcmp(elv->elevator_name, __e->elevator_name)) 1123 if (!strcmp(elv->elevator_name, __e->elevator_name))
1116 len += sprintf(name+len, "[%s] ", elv->elevator_name); 1124 len += sprintf(name+len, "[%s] ", elv->elevator_name);
1117 else 1125 else
1118 len += sprintf(name+len, "%s ", __e->elevator_name); 1126 len += sprintf(name+len, "%s ", __e->elevator_name);
1119 } 1127 }
1120 spin_unlock(&elv_list_lock); 1128 spin_unlock(&elv_list_lock);
1121 1129
1122 len += sprintf(len+name, "\n"); 1130 len += sprintf(len+name, "\n");
1123 return len; 1131 return len;
1124 } 1132 }
1125 1133
1126 struct request *elv_rb_former_request(struct request_queue *q, 1134 struct request *elv_rb_former_request(struct request_queue *q,
1127 struct request *rq) 1135 struct request *rq)
1128 { 1136 {
1129 struct rb_node *rbprev = rb_prev(&rq->rb_node); 1137 struct rb_node *rbprev = rb_prev(&rq->rb_node);
1130 1138
1131 if (rbprev) 1139 if (rbprev)
1132 return rb_entry_rq(rbprev); 1140 return rb_entry_rq(rbprev);
1133 1141
1134 return NULL; 1142 return NULL;
1135 } 1143 }
1136 1144
1137 EXPORT_SYMBOL(elv_rb_former_request); 1145 EXPORT_SYMBOL(elv_rb_former_request);
1138 1146
1139 struct request *elv_rb_latter_request(struct request_queue *q, 1147 struct request *elv_rb_latter_request(struct request_queue *q,
1140 struct request *rq) 1148 struct request *rq)
1141 { 1149 {
1142 struct rb_node *rbnext = rb_next(&rq->rb_node); 1150 struct rb_node *rbnext = rb_next(&rq->rb_node);
1143 1151
1144 if (rbnext) 1152 if (rbnext)
1145 return rb_entry_rq(rbnext); 1153 return rb_entry_rq(rbnext);
1146 1154
1147 return NULL; 1155 return NULL;
1148 } 1156 }
1149 1157
1150 EXPORT_SYMBOL(elv_rb_latter_request); 1158 EXPORT_SYMBOL(elv_rb_latter_request);
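As a usage sketch only (not part of this diff; the hook names follow the elevator_ops layout of this kernel generation), an rbtree-based scheduler wires these two helpers straight into its elevator_type so the core can find the requests adjacent to a given one in sector order:

static struct elevator_type iosched_example = {
	.ops = {
		/* ... merge/dispatch hooks omitted ... */
		.elevator_former_req_fn	= elv_rb_former_request,
		.elevator_latter_req_fn	= elv_rb_latter_request,
	},
	.elevator_name	= "example",
	.elevator_owner	= THIS_MODULE,
};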
1151 1159
1 /* 1 /*
2 * Copyright (C) 1991, 1992 Linus Torvalds 2 * Copyright (C) 1991, 1992 Linus Torvalds
3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics 3 * Copyright (C) 1994, Karl Keyte: Added support for disk statistics
4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE 4 * Elevator latency, (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de> 5 * Queue request tables / lock, selectable elevator, Jens Axboe <axboe@suse.de>
6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000 6 * kernel-doc documentation started by NeilBrown <neilb@cse.unsw.edu.au> - July2000
7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001 7 * bio rewrite, highmem i/o, etc, Jens Axboe <axboe@suse.de> - may 2001
8 */ 8 */
9 9
10 /* 10 /*
11 * This handles all read/write requests to block devices 11 * This handles all read/write requests to block devices
12 */ 12 */
13 #include <linux/kernel.h> 13 #include <linux/kernel.h>
14 #include <linux/module.h> 14 #include <linux/module.h>
15 #include <linux/backing-dev.h> 15 #include <linux/backing-dev.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/blkdev.h> 17 #include <linux/blkdev.h>
18 #include <linux/highmem.h> 18 #include <linux/highmem.h>
19 #include <linux/mm.h> 19 #include <linux/mm.h>
20 #include <linux/kernel_stat.h> 20 #include <linux/kernel_stat.h>
21 #include <linux/string.h> 21 #include <linux/string.h>
22 #include <linux/init.h> 22 #include <linux/init.h>
23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */ 23 #include <linux/bootmem.h> /* for max_pfn/max_low_pfn */
24 #include <linux/completion.h> 24 #include <linux/completion.h>
25 #include <linux/slab.h> 25 #include <linux/slab.h>
26 #include <linux/swap.h> 26 #include <linux/swap.h>
27 #include <linux/writeback.h> 27 #include <linux/writeback.h>
28 #include <linux/task_io_accounting_ops.h> 28 #include <linux/task_io_accounting_ops.h>
29 #include <linux/interrupt.h> 29 #include <linux/interrupt.h>
30 #include <linux/cpu.h> 30 #include <linux/cpu.h>
31 #include <linux/blktrace_api.h> 31 #include <linux/blktrace_api.h>
32 #include <linux/fault-inject.h> 32 #include <linux/fault-inject.h>
33 33
34 /* 34 /*
35 * for max sense size 35 * for max sense size
36 */ 36 */
37 #include <scsi/scsi_cmnd.h> 37 #include <scsi/scsi_cmnd.h>
38 38
39 static void blk_unplug_work(struct work_struct *work); 39 static void blk_unplug_work(struct work_struct *work);
40 static void blk_unplug_timeout(unsigned long data); 40 static void blk_unplug_timeout(unsigned long data);
41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io); 41 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io);
42 static void init_request_from_bio(struct request *req, struct bio *bio); 42 static void init_request_from_bio(struct request *req, struct bio *bio);
43 static int __make_request(struct request_queue *q, struct bio *bio); 43 static int __make_request(struct request_queue *q, struct bio *bio);
44 static struct io_context *current_io_context(gfp_t gfp_flags, int node); 44 static struct io_context *current_io_context(gfp_t gfp_flags, int node);
45 static void blk_recalc_rq_segments(struct request *rq); 45 static void blk_recalc_rq_segments(struct request *rq);
46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 46 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
47 struct bio *bio); 47 struct bio *bio);
48 48
49 /* 49 /*
50 * For the allocated request tables 50 * For the allocated request tables
51 */ 51 */
52 static struct kmem_cache *request_cachep; 52 static struct kmem_cache *request_cachep;
53 53
54 /* 54 /*
55 * For queue allocation 55 * For queue allocation
56 */ 56 */
57 static struct kmem_cache *requestq_cachep; 57 static struct kmem_cache *requestq_cachep;
58 58
59 /* 59 /*
60 * For io context allocations 60 * For io context allocations
61 */ 61 */
62 static struct kmem_cache *iocontext_cachep; 62 static struct kmem_cache *iocontext_cachep;
63 63
64 /* 64 /*
65 * Controlling structure to kblockd 65 * Controlling structure to kblockd
66 */ 66 */
67 static struct workqueue_struct *kblockd_workqueue; 67 static struct workqueue_struct *kblockd_workqueue;
68 68
69 unsigned long blk_max_low_pfn, blk_max_pfn; 69 unsigned long blk_max_low_pfn, blk_max_pfn;
70 70
71 EXPORT_SYMBOL(blk_max_low_pfn); 71 EXPORT_SYMBOL(blk_max_low_pfn);
72 EXPORT_SYMBOL(blk_max_pfn); 72 EXPORT_SYMBOL(blk_max_pfn);
73 73
74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done); 74 static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
75 75
76 /* Amount of time in which a process may batch requests */ 76 /* Amount of time in which a process may batch requests */
77 #define BLK_BATCH_TIME (HZ/50UL) 77 #define BLK_BATCH_TIME (HZ/50UL)
78 78
79 /* Number of requests a "batching" process may submit */ 79 /* Number of requests a "batching" process may submit */
80 #define BLK_BATCH_REQ 32 80 #define BLK_BATCH_REQ 32
81 81
82 /* 82 /*
83 * Return the threshold (number of used requests) at which the queue is 83 * Return the threshold (number of used requests) at which the queue is
84 * considered to be congested. It includes a little hysteresis to keep the 84 * considered to be congested. It includes a little hysteresis to keep the
85 * context switch rate down. 85 * context switch rate down.
86 */ 86 */
87 static inline int queue_congestion_on_threshold(struct request_queue *q) 87 static inline int queue_congestion_on_threshold(struct request_queue *q)
88 { 88 {
89 return q->nr_congestion_on; 89 return q->nr_congestion_on;
90 } 90 }
91 91
92 /* 92 /*
93 * The threshold at which a queue is considered to be uncongested 93 * The threshold at which a queue is considered to be uncongested
94 */ 94 */
95 static inline int queue_congestion_off_threshold(struct request_queue *q) 95 static inline int queue_congestion_off_threshold(struct request_queue *q)
96 { 96 {
97 return q->nr_congestion_off; 97 return q->nr_congestion_off;
98 } 98 }
99 99
100 static void blk_queue_congestion_threshold(struct request_queue *q) 100 static void blk_queue_congestion_threshold(struct request_queue *q)
101 { 101 {
102 int nr; 102 int nr;
103 103
104 nr = q->nr_requests - (q->nr_requests / 8) + 1; 104 nr = q->nr_requests - (q->nr_requests / 8) + 1;
105 if (nr > q->nr_requests) 105 if (nr > q->nr_requests)
106 nr = q->nr_requests; 106 nr = q->nr_requests;
107 q->nr_congestion_on = nr; 107 q->nr_congestion_on = nr;
108 108
109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1; 109 nr = q->nr_requests - (q->nr_requests / 8) - (q->nr_requests / 16) - 1;
110 if (nr < 1) 110 if (nr < 1)
111 nr = 1; 111 nr = 1;
112 q->nr_congestion_off = nr; 112 q->nr_congestion_off = nr;
113 } 113 }
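As a worked example, with the usual BLKDEV_MAX_RQ of 128 requests (the default installed by blk_queue_make_request() below) this gives nr_congestion_on = 128 - 16 + 1 = 113 and nr_congestion_off = 128 - 16 - 8 - 1 = 103, so a queue reported congested at 113 used requests is not reported uncongested again until it drops back to 103.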
114 114
115 /** 115 /**
116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info 116 * blk_get_backing_dev_info - get the address of a queue's backing_dev_info
117 * @bdev: device 117 * @bdev: device
118 * 118 *
119 * Locates the passed device's request queue and returns the address of its 119 * Locates the passed device's request queue and returns the address of its
120 * backing_dev_info 120 * backing_dev_info
121 * 121 *
122 * Will return NULL if the request queue cannot be located. 122 * Will return NULL if the request queue cannot be located.
123 */ 123 */
124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) 124 struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev)
125 { 125 {
126 struct backing_dev_info *ret = NULL; 126 struct backing_dev_info *ret = NULL;
127 struct request_queue *q = bdev_get_queue(bdev); 127 struct request_queue *q = bdev_get_queue(bdev);
128 128
129 if (q) 129 if (q)
130 ret = &q->backing_dev_info; 130 ret = &q->backing_dev_info;
131 return ret; 131 return ret;
132 } 132 }
133 EXPORT_SYMBOL(blk_get_backing_dev_info); 133 EXPORT_SYMBOL(blk_get_backing_dev_info);
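As an illustrative caller (not part of this change; the helper name is made up), a holder of a block_device can reach the queue's readahead setting through this accessor:

static unsigned long example_ra_pages(struct block_device *bdev)
{
	struct backing_dev_info *bdi = blk_get_backing_dev_info(bdev);

	/* 0 if no request queue is attached to the device */
	return bdi ? bdi->ra_pages : 0;
}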
134 134
135 /** 135 /**
136 * blk_queue_prep_rq - set a prepare_request function for queue 136 * blk_queue_prep_rq - set a prepare_request function for queue
137 * @q: queue 137 * @q: queue
138 * @pfn: prepare_request function 138 * @pfn: prepare_request function
139 * 139 *
140 * It's possible for a queue to register a prepare_request callback which 140 * It's possible for a queue to register a prepare_request callback which
141 * is invoked before the request is handed to the request_fn. The goal of 141 * is invoked before the request is handed to the request_fn. The goal of
142 * the function is to prepare a request for I/O, it can be used to build a 142 * the function is to prepare a request for I/O, it can be used to build a
143 * cdb from the request data for instance. 143 * cdb from the request data for instance.
144 * 144 *
145 */ 145 */
146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn) 146 void blk_queue_prep_rq(struct request_queue *q, prep_rq_fn *pfn)
147 { 147 {
148 q->prep_rq_fn = pfn; 148 q->prep_rq_fn = pfn;
149 } 149 }
150 150
151 EXPORT_SYMBOL(blk_queue_prep_rq); 151 EXPORT_SYMBOL(blk_queue_prep_rq);
152 152
153 /** 153 /**
154 * blk_queue_merge_bvec - set a merge_bvec function for queue 154 * blk_queue_merge_bvec - set a merge_bvec function for queue
155 * @q: queue 155 * @q: queue
156 * @mbfn: merge_bvec_fn 156 * @mbfn: merge_bvec_fn
157 * 157 *
158 * Usually queues have static limitations on the max sectors or segments that 158 * Usually queues have static limitations on the max sectors or segments that
159 * we can put in a request. Stacking drivers may have some settings that 159 * we can put in a request. Stacking drivers may have some settings that
160 * are dynamic, and thus we have to query the queue whether it is ok to 160 * are dynamic, and thus we have to query the queue whether it is ok to
161 * add a new bio_vec to a bio at a given offset or not. If the block device 161 * add a new bio_vec to a bio at a given offset or not. If the block device
162 * has such limitations, it needs to register a merge_bvec_fn to control 162 * has such limitations, it needs to register a merge_bvec_fn to control
163 * the size of bio's sent to it. Note that a block device *must* allow a 163 * the size of bio's sent to it. Note that a block device *must* allow a
164 * single page to be added to an empty bio. The block device driver may want 164 * single page to be added to an empty bio. The block device driver may want
165 * to use the bio_split() function to deal with these bio's. By default 165 * to use the bio_split() function to deal with these bio's. By default
166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are 166 * no merge_bvec_fn is defined for a queue, and only the fixed limits are
167 * honored. 167 * honored.
168 */ 168 */
169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn) 169 void blk_queue_merge_bvec(struct request_queue *q, merge_bvec_fn *mbfn)
170 { 170 {
171 q->merge_bvec_fn = mbfn; 171 q->merge_bvec_fn = mbfn;
172 } 172 }
173 173
174 EXPORT_SYMBOL(blk_queue_merge_bvec); 174 EXPORT_SYMBOL(blk_queue_merge_bvec);
175 175
176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn) 176 void blk_queue_softirq_done(struct request_queue *q, softirq_done_fn *fn)
177 { 177 {
178 q->softirq_done_fn = fn; 178 q->softirq_done_fn = fn;
179 } 179 }
180 180
181 EXPORT_SYMBOL(blk_queue_softirq_done); 181 EXPORT_SYMBOL(blk_queue_softirq_done);
182 182
183 /** 183 /**
184 * blk_queue_make_request - define an alternate make_request function for a device 184 * blk_queue_make_request - define an alternate make_request function for a device
185 * @q: the request queue for the device to be affected 185 * @q: the request queue for the device to be affected
186 * @mfn: the alternate make_request function 186 * @mfn: the alternate make_request function
187 * 187 *
188 * Description: 188 * Description:
189 * The normal way for &struct bios to be passed to a device 189 * The normal way for &struct bios to be passed to a device
190 * driver is for them to be collected into requests on a request 190 * driver is for them to be collected into requests on a request
191 * queue, and then to allow the device driver to select requests 191 * queue, and then to allow the device driver to select requests
192 * off that queue when it is ready. This works well for many block 192 * off that queue when it is ready. This works well for many block
193 * devices. However some block devices (typically virtual devices 193 * devices. However some block devices (typically virtual devices
194 * such as md or lvm) do not benefit from the processing on the 194 * such as md or lvm) do not benefit from the processing on the
195 * request queue, and are served best by having the requests passed 195 * request queue, and are served best by having the requests passed
196 * directly to them. This can be achieved by providing a function 196 * directly to them. This can be achieved by providing a function
197 * to blk_queue_make_request(). 197 * to blk_queue_make_request().
198 * 198 *
199 * Caveat: 199 * Caveat:
200 * The driver that does this *must* be able to deal appropriately 200 * The driver that does this *must* be able to deal appropriately
201 * with buffers in "highmemory". This can be accomplished by either calling 201 * with buffers in "highmemory". This can be accomplished by either calling
202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling 202 * __bio_kmap_atomic() to get a temporary kernel mapping, or by calling
203 * blk_queue_bounce() to create a buffer in normal memory. 203 * blk_queue_bounce() to create a buffer in normal memory.
204 **/ 204 **/
205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn) 205 void blk_queue_make_request(struct request_queue * q, make_request_fn * mfn)
206 { 206 {
207 /* 207 /*
208 * set defaults 208 * set defaults
209 */ 209 */
210 q->nr_requests = BLKDEV_MAX_RQ; 210 q->nr_requests = BLKDEV_MAX_RQ;
211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 211 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 212 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
213 q->make_request_fn = mfn; 213 q->make_request_fn = mfn;
214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; 214 q->backing_dev_info.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
215 q->backing_dev_info.state = 0; 215 q->backing_dev_info.state = 0;
216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; 216 q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY;
217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS); 217 blk_queue_max_sectors(q, SAFE_MAX_SECTORS);
218 blk_queue_hardsect_size(q, 512); 218 blk_queue_hardsect_size(q, 512);
219 blk_queue_dma_alignment(q, 511); 219 blk_queue_dma_alignment(q, 511);
220 blk_queue_congestion_threshold(q); 220 blk_queue_congestion_threshold(q);
221 q->nr_batching = BLK_BATCH_REQ; 221 q->nr_batching = BLK_BATCH_REQ;
222 222
223 q->unplug_thresh = 4; /* hmm */ 223 q->unplug_thresh = 4; /* hmm */
224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */ 224 q->unplug_delay = (3 * HZ) / 1000; /* 3 milliseconds */
225 if (q->unplug_delay == 0) 225 if (q->unplug_delay == 0)
226 q->unplug_delay = 1; 226 q->unplug_delay = 1;
227 227
228 INIT_WORK(&q->unplug_work, blk_unplug_work); 228 INIT_WORK(&q->unplug_work, blk_unplug_work);
229 229
230 q->unplug_timer.function = blk_unplug_timeout; 230 q->unplug_timer.function = blk_unplug_timeout;
231 q->unplug_timer.data = (unsigned long)q; 231 q->unplug_timer.data = (unsigned long)q;
232 232
233 /* 233 /*
234 * by default assume old behaviour and bounce for any highmem page 234 * by default assume old behaviour and bounce for any highmem page
235 */ 235 */
236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH); 236 blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
237 } 237 }
238 238
239 EXPORT_SYMBOL(blk_queue_make_request); 239 EXPORT_SYMBOL(blk_queue_make_request);
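A minimal sketch of the stacking-driver pattern described above (function names are illustrative, not from this patch): allocate a bare queue and point it at a make_request function that handles each bio directly instead of queueing requests.

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * A real virtual driver (md/lvm style) would remap bi_sector and
	 * bi_bdev and resubmit the bio; here it is simply completed.
	 */
	bio_endio(bio, 0);
	return 0;
}

static struct request_queue *example_init_queue(void)
{
	struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

	if (q)
		blk_queue_make_request(q, example_make_request);
	return q;
}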
240 240
241 static void rq_init(struct request_queue *q, struct request *rq) 241 static void rq_init(struct request_queue *q, struct request *rq)
242 { 242 {
243 INIT_LIST_HEAD(&rq->queuelist); 243 INIT_LIST_HEAD(&rq->queuelist);
244 INIT_LIST_HEAD(&rq->donelist); 244 INIT_LIST_HEAD(&rq->donelist);
245 245
246 rq->errors = 0; 246 rq->errors = 0;
247 rq->bio = rq->biotail = NULL; 247 rq->bio = rq->biotail = NULL;
248 INIT_HLIST_NODE(&rq->hash); 248 INIT_HLIST_NODE(&rq->hash);
249 RB_CLEAR_NODE(&rq->rb_node); 249 RB_CLEAR_NODE(&rq->rb_node);
250 rq->ioprio = 0; 250 rq->ioprio = 0;
251 rq->buffer = NULL; 251 rq->buffer = NULL;
252 rq->ref_count = 1; 252 rq->ref_count = 1;
253 rq->q = q; 253 rq->q = q;
254 rq->special = NULL; 254 rq->special = NULL;
255 rq->data_len = 0; 255 rq->data_len = 0;
256 rq->data = NULL; 256 rq->data = NULL;
257 rq->nr_phys_segments = 0; 257 rq->nr_phys_segments = 0;
258 rq->sense = NULL; 258 rq->sense = NULL;
259 rq->end_io = NULL; 259 rq->end_io = NULL;
260 rq->end_io_data = NULL; 260 rq->end_io_data = NULL;
261 rq->completion_data = NULL; 261 rq->completion_data = NULL;
262 rq->next_rq = NULL; 262 rq->next_rq = NULL;
263 } 263 }
264 264
265 /** 265 /**
266 * blk_queue_ordered - does this queue support ordered writes 266 * blk_queue_ordered - does this queue support ordered writes
267 * @q: the request queue 267 * @q: the request queue
268 * @ordered: one of QUEUE_ORDERED_* 268 * @ordered: one of QUEUE_ORDERED_*
269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes 269 * @prepare_flush_fn: rq setup helper for cache flush ordered writes
270 * 270 *
271 * Description: 271 * Description:
272 * For journalled file systems, doing ordered writes on a commit 272 * For journalled file systems, doing ordered writes on a commit
273 * block instead of explicitly doing wait_on_buffer (which is bad 273 * block instead of explicitly doing wait_on_buffer (which is bad
274 * for performance) can be a big win. Block drivers supporting this 274 * for performance) can be a big win. Block drivers supporting this
275 * feature should call this function and indicate so. 275 * feature should call this function and indicate so.
276 * 276 *
277 **/ 277 **/
278 int blk_queue_ordered(struct request_queue *q, unsigned ordered, 278 int blk_queue_ordered(struct request_queue *q, unsigned ordered,
279 prepare_flush_fn *prepare_flush_fn) 279 prepare_flush_fn *prepare_flush_fn)
280 { 280 {
281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) && 281 if (ordered & (QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH) &&
282 prepare_flush_fn == NULL) { 282 prepare_flush_fn == NULL) {
283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n"); 283 printk(KERN_ERR "blk_queue_ordered: prepare_flush_fn required\n");
284 return -EINVAL; 284 return -EINVAL;
285 } 285 }
286 286
287 if (ordered != QUEUE_ORDERED_NONE && 287 if (ordered != QUEUE_ORDERED_NONE &&
288 ordered != QUEUE_ORDERED_DRAIN && 288 ordered != QUEUE_ORDERED_DRAIN &&
289 ordered != QUEUE_ORDERED_DRAIN_FLUSH && 289 ordered != QUEUE_ORDERED_DRAIN_FLUSH &&
290 ordered != QUEUE_ORDERED_DRAIN_FUA && 290 ordered != QUEUE_ORDERED_DRAIN_FUA &&
291 ordered != QUEUE_ORDERED_TAG && 291 ordered != QUEUE_ORDERED_TAG &&
292 ordered != QUEUE_ORDERED_TAG_FLUSH && 292 ordered != QUEUE_ORDERED_TAG_FLUSH &&
293 ordered != QUEUE_ORDERED_TAG_FUA) { 293 ordered != QUEUE_ORDERED_TAG_FUA) {
294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered); 294 printk(KERN_ERR "blk_queue_ordered: bad value %d\n", ordered);
295 return -EINVAL; 295 return -EINVAL;
296 } 296 }
297 297
298 q->ordered = ordered; 298 q->ordered = ordered;
299 q->next_ordered = ordered; 299 q->next_ordered = ordered;
300 q->prepare_flush_fn = prepare_flush_fn; 300 q->prepare_flush_fn = prepare_flush_fn;
301 301
302 return 0; 302 return 0;
303 } 303 }
304 304
305 EXPORT_SYMBOL(blk_queue_ordered); 305 EXPORT_SYMBOL(blk_queue_ordered);
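As an illustrative sketch (modeled loosely on SCSI-disk-style drivers of this period; the opcode, timeout and function names are assumptions, not taken from this patch), a driver with a write-back cache advertises ordered support by passing a prepare_flush_fn that turns the pre/post flush request into a cache-flush command:

static void example_prepare_flush(struct request_queue *q, struct request *rq)
{
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;
	rq->cmd[0] = 0x35;	/* SYNCHRONIZE CACHE(10), SCSI-style example */
	rq->cmd_len = 10;
}

static void example_enable_barriers(struct request_queue *q)
{
	/* drain the queue and flush the cache around each barrier write */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, example_prepare_flush);
}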
306 306
307 /** 307 /**
308 * blk_queue_issue_flush_fn - set function for issuing a flush 308 * blk_queue_issue_flush_fn - set function for issuing a flush
309 * @q: the request queue 309 * @q: the request queue
310 * @iff: the function to be called issuing the flush 310 * @iff: the function to be called issuing the flush
311 * 311 *
312 * Description: 312 * Description:
313 * If a driver supports issuing a flush command, the support is notified 313 * If a driver supports issuing a flush command, the support is notified
314 * to the block layer by defining it through this call. 314 * to the block layer by defining it through this call.
315 * 315 *
316 **/ 316 **/
317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff) 317 void blk_queue_issue_flush_fn(struct request_queue *q, issue_flush_fn *iff)
318 { 318 {
319 q->issue_flush_fn = iff; 319 q->issue_flush_fn = iff;
320 } 320 }
321 321
322 EXPORT_SYMBOL(blk_queue_issue_flush_fn); 322 EXPORT_SYMBOL(blk_queue_issue_flush_fn);
323 323
324 /* 324 /*
325 * Cache flushing for ordered writes handling 325 * Cache flushing for ordered writes handling
326 */ 326 */
327 inline unsigned blk_ordered_cur_seq(struct request_queue *q) 327 inline unsigned blk_ordered_cur_seq(struct request_queue *q)
328 { 328 {
329 if (!q->ordseq) 329 if (!q->ordseq)
330 return 0; 330 return 0;
331 return 1 << ffz(q->ordseq); 331 return 1 << ffz(q->ordseq);
332 } 332 }
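Put another way, q->ordseq accumulates one bit per completed stage and ffz() returns the lowest bit still clear, so the value handed back is the first stage that has not finished yet; once STARTED and DRAIN have been recorded, for instance, the current sequence is PREFLUSH (assuming, as elsewhere in this tree, that the QUEUE_ORDSEQ_* flags are ascending single bits).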
333 333
334 unsigned blk_ordered_req_seq(struct request *rq) 334 unsigned blk_ordered_req_seq(struct request *rq)
335 { 335 {
336 struct request_queue *q = rq->q; 336 struct request_queue *q = rq->q;
337 337
338 BUG_ON(q->ordseq == 0); 338 BUG_ON(q->ordseq == 0);
339 339
340 if (rq == &q->pre_flush_rq) 340 if (rq == &q->pre_flush_rq)
341 return QUEUE_ORDSEQ_PREFLUSH; 341 return QUEUE_ORDSEQ_PREFLUSH;
342 if (rq == &q->bar_rq) 342 if (rq == &q->bar_rq)
343 return QUEUE_ORDSEQ_BAR; 343 return QUEUE_ORDSEQ_BAR;
344 if (rq == &q->post_flush_rq) 344 if (rq == &q->post_flush_rq)
345 return QUEUE_ORDSEQ_POSTFLUSH; 345 return QUEUE_ORDSEQ_POSTFLUSH;
346 346
347 /* 347 /*
348 * !fs requests don't need to follow barrier ordering. Always 348 * !fs requests don't need to follow barrier ordering. Always
349 * put them at the front. This fixes the following deadlock. 349 * put them at the front. This fixes the following deadlock.
350 * 350 *
351 * http://thread.gmane.org/gmane.linux.kernel/537473 351 * http://thread.gmane.org/gmane.linux.kernel/537473
352 */ 352 */
353 if (!blk_fs_request(rq)) 353 if (!blk_fs_request(rq))
354 return QUEUE_ORDSEQ_DRAIN; 354 return QUEUE_ORDSEQ_DRAIN;
355 355
356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) == 356 if ((rq->cmd_flags & REQ_ORDERED_COLOR) ==
357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR)) 357 (q->orig_bar_rq->cmd_flags & REQ_ORDERED_COLOR))
358 return QUEUE_ORDSEQ_DRAIN; 358 return QUEUE_ORDSEQ_DRAIN;
359 else 359 else
360 return QUEUE_ORDSEQ_DONE; 360 return QUEUE_ORDSEQ_DONE;
361 } 361 }
362 362
363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) 363 void blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error)
364 { 364 {
365 struct request *rq; 365 struct request *rq;
366 int uptodate; 366 int uptodate;
367 367
368 if (error && !q->orderr) 368 if (error && !q->orderr)
369 q->orderr = error; 369 q->orderr = error;
370 370
371 BUG_ON(q->ordseq & seq); 371 BUG_ON(q->ordseq & seq);
372 q->ordseq |= seq; 372 q->ordseq |= seq;
373 373
374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE) 374 if (blk_ordered_cur_seq(q) != QUEUE_ORDSEQ_DONE)
375 return; 375 return;
376 376
377 /* 377 /*
378 * Okay, sequence complete. 378 * Okay, sequence complete.
379 */ 379 */
380 uptodate = 1; 380 uptodate = 1;
381 if (q->orderr) 381 if (q->orderr)
382 uptodate = q->orderr; 382 uptodate = q->orderr;
383 383
384 q->ordseq = 0; 384 q->ordseq = 0;
385 rq = q->orig_bar_rq; 385 rq = q->orig_bar_rq;
386 386
387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors); 387 end_that_request_first(rq, uptodate, rq->hard_nr_sectors);
388 end_that_request_last(rq, uptodate); 388 end_that_request_last(rq, uptodate);
389 } 389 }
390 390
391 static void pre_flush_end_io(struct request *rq, int error) 391 static void pre_flush_end_io(struct request *rq, int error)
392 { 392 {
393 elv_completed_request(rq->q, rq); 393 elv_completed_request(rq->q, rq);
394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error); 394 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_PREFLUSH, error);
395 } 395 }
396 396
397 static void bar_end_io(struct request *rq, int error) 397 static void bar_end_io(struct request *rq, int error)
398 { 398 {
399 elv_completed_request(rq->q, rq); 399 elv_completed_request(rq->q, rq);
400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error); 400 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_BAR, error);
401 } 401 }
402 402
403 static void post_flush_end_io(struct request *rq, int error) 403 static void post_flush_end_io(struct request *rq, int error)
404 { 404 {
405 elv_completed_request(rq->q, rq); 405 elv_completed_request(rq->q, rq);
406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error); 406 blk_ordered_complete_seq(rq->q, QUEUE_ORDSEQ_POSTFLUSH, error);
407 } 407 }
408 408
409 static void queue_flush(struct request_queue *q, unsigned which) 409 static void queue_flush(struct request_queue *q, unsigned which)
410 { 410 {
411 struct request *rq; 411 struct request *rq;
412 rq_end_io_fn *end_io; 412 rq_end_io_fn *end_io;
413 413
414 if (which == QUEUE_ORDERED_PREFLUSH) { 414 if (which == QUEUE_ORDERED_PREFLUSH) {
415 rq = &q->pre_flush_rq; 415 rq = &q->pre_flush_rq;
416 end_io = pre_flush_end_io; 416 end_io = pre_flush_end_io;
417 } else { 417 } else {
418 rq = &q->post_flush_rq; 418 rq = &q->post_flush_rq;
419 end_io = post_flush_end_io; 419 end_io = post_flush_end_io;
420 } 420 }
421 421
422 rq->cmd_flags = REQ_HARDBARRIER; 422 rq->cmd_flags = REQ_HARDBARRIER;
423 rq_init(q, rq); 423 rq_init(q, rq);
424 rq->elevator_private = NULL; 424 rq->elevator_private = NULL;
425 rq->elevator_private2 = NULL; 425 rq->elevator_private2 = NULL;
426 rq->rq_disk = q->bar_rq.rq_disk; 426 rq->rq_disk = q->bar_rq.rq_disk;
427 rq->end_io = end_io; 427 rq->end_io = end_io;
428 q->prepare_flush_fn(q, rq); 428 q->prepare_flush_fn(q, rq);
429 429
430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 430 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
431 } 431 }
432 432
433 static inline struct request *start_ordered(struct request_queue *q, 433 static inline struct request *start_ordered(struct request_queue *q,
434 struct request *rq) 434 struct request *rq)
435 { 435 {
436 q->orderr = 0; 436 q->orderr = 0;
437 q->ordered = q->next_ordered; 437 q->ordered = q->next_ordered;
438 q->ordseq |= QUEUE_ORDSEQ_STARTED; 438 q->ordseq |= QUEUE_ORDSEQ_STARTED;
439 439
440 /* 440 /*
441 * Prep proxy barrier request. 441 * Prep proxy barrier request.
442 */ 442 */
443 blkdev_dequeue_request(rq); 443 blkdev_dequeue_request(rq);
444 q->orig_bar_rq = rq; 444 q->orig_bar_rq = rq;
445 rq = &q->bar_rq; 445 rq = &q->bar_rq;
446 rq->cmd_flags = 0; 446 rq->cmd_flags = 0;
447 rq_init(q, rq); 447 rq_init(q, rq);
448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE) 448 if (bio_data_dir(q->orig_bar_rq->bio) == WRITE)
449 rq->cmd_flags |= REQ_RW; 449 rq->cmd_flags |= REQ_RW;
450 if (q->ordered & QUEUE_ORDERED_FUA) 450 if (q->ordered & QUEUE_ORDERED_FUA)
451 rq->cmd_flags |= REQ_FUA; 451 rq->cmd_flags |= REQ_FUA;
452 rq->elevator_private = NULL; 452 rq->elevator_private = NULL;
453 rq->elevator_private2 = NULL; 453 rq->elevator_private2 = NULL;
454 init_request_from_bio(rq, q->orig_bar_rq->bio); 454 init_request_from_bio(rq, q->orig_bar_rq->bio);
455 rq->end_io = bar_end_io; 455 rq->end_io = bar_end_io;
456 456
457 /* 457 /*
458 * Queue ordered sequence. As we stack them at the head, we 458 * Queue ordered sequence. As we stack them at the head, we
459 * need to queue in reverse order. Note that we rely on that 459 * need to queue in reverse order. Note that we rely on that
460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs 460 * no fs request uses ELEVATOR_INSERT_FRONT and thus no fs
461 * request gets inbetween ordered sequence. 461 * request gets inbetween ordered sequence. If this request is
462 * an empty barrier, we don't need to do a postflush ever since
463 * there will be no data written between the pre and post flush.
464 * Hence a single flush will suffice.
462 */ 465 */
463 if (q->ordered & QUEUE_ORDERED_POSTFLUSH) 466 if ((q->ordered & QUEUE_ORDERED_POSTFLUSH) && !blk_empty_barrier(rq))
464 queue_flush(q, QUEUE_ORDERED_POSTFLUSH); 467 queue_flush(q, QUEUE_ORDERED_POSTFLUSH);
465 else 468 else
466 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH; 469 q->ordseq |= QUEUE_ORDSEQ_POSTFLUSH;
467 470
468 elv_insert(q, rq, ELEVATOR_INSERT_FRONT); 471 elv_insert(q, rq, ELEVATOR_INSERT_FRONT);
469 472
470 if (q->ordered & QUEUE_ORDERED_PREFLUSH) { 473 if (q->ordered & QUEUE_ORDERED_PREFLUSH) {
471 queue_flush(q, QUEUE_ORDERED_PREFLUSH); 474 queue_flush(q, QUEUE_ORDERED_PREFLUSH);
472 rq = &q->pre_flush_rq; 475 rq = &q->pre_flush_rq;
473 } else 476 } else
474 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH; 477 q->ordseq |= QUEUE_ORDSEQ_PREFLUSH;
475 478
476 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0) 479 if ((q->ordered & QUEUE_ORDERED_TAG) || q->in_flight == 0)
477 q->ordseq |= QUEUE_ORDSEQ_DRAIN; 480 q->ordseq |= QUEUE_ORDSEQ_DRAIN;
478 else 481 else
479 rq = NULL; 482 rq = NULL;
480 483
481 return rq; 484 return rq;
482 } 485 }
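The !blk_empty_barrier(rq) test above carries the point of this patch: a barrier with no data attached needs no separate post-flush, because no data is written between the pre- and post-flush, so a single flush suffices. The helper itself is added by one of the header changes in this commit; a sketch of what it has to check (built only from fields already used in this file) would be:

/* sketch only -- the real helper is introduced elsewhere in this commit */
static inline int blk_empty_barrier_sketch(struct request *rq)
{
	return blk_fs_request(rq) && blk_barrier_rq(rq) &&
		!rq->hard_nr_sectors;
}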
483 486
484 int blk_do_ordered(struct request_queue *q, struct request **rqp) 487 int blk_do_ordered(struct request_queue *q, struct request **rqp)
485 { 488 {
486 struct request *rq = *rqp; 489 struct request *rq = *rqp;
487 int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq); 490 const int is_barrier = blk_fs_request(rq) && blk_barrier_rq(rq);
488 491
489 if (!q->ordseq) { 492 if (!q->ordseq) {
490 if (!is_barrier) 493 if (!is_barrier)
491 return 1; 494 return 1;
492 495
493 if (q->next_ordered != QUEUE_ORDERED_NONE) { 496 if (q->next_ordered != QUEUE_ORDERED_NONE) {
494 *rqp = start_ordered(q, rq); 497 *rqp = start_ordered(q, rq);
495 return 1; 498 return 1;
496 } else { 499 } else {
497 /* 500 /*
498 * This can happen when the queue switches to 501 * This can happen when the queue switches to
499 * ORDERED_NONE while this request is on it. 502 * ORDERED_NONE while this request is on it.
500 */ 503 */
501 blkdev_dequeue_request(rq); 504 blkdev_dequeue_request(rq);
502 end_that_request_first(rq, -EOPNOTSUPP, 505 end_that_request_first(rq, -EOPNOTSUPP,
503 rq->hard_nr_sectors); 506 rq->hard_nr_sectors);
504 end_that_request_last(rq, -EOPNOTSUPP); 507 end_that_request_last(rq, -EOPNOTSUPP);
505 *rqp = NULL; 508 *rqp = NULL;
506 return 0; 509 return 0;
507 } 510 }
508 } 511 }
509 512
510 /* 513 /*
511 * Ordered sequence in progress 514 * Ordered sequence in progress
512 */ 515 */
513 516
514 /* Special requests are not subject to ordering rules. */ 517 /* Special requests are not subject to ordering rules. */
515 if (!blk_fs_request(rq) && 518 if (!blk_fs_request(rq) &&
516 rq != &q->pre_flush_rq && rq != &q->post_flush_rq) 519 rq != &q->pre_flush_rq && rq != &q->post_flush_rq)
517 return 1; 520 return 1;
518 521
519 if (q->ordered & QUEUE_ORDERED_TAG) { 522 if (q->ordered & QUEUE_ORDERED_TAG) {
520 /* Ordered by tag. Blocking the next barrier is enough. */ 523 /* Ordered by tag. Blocking the next barrier is enough. */
521 if (is_barrier && rq != &q->bar_rq) 524 if (is_barrier && rq != &q->bar_rq)
522 *rqp = NULL; 525 *rqp = NULL;
523 } else { 526 } else {
524 /* Ordered by draining. Wait for turn. */ 527 /* Ordered by draining. Wait for turn. */
525 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q)); 528 WARN_ON(blk_ordered_req_seq(rq) < blk_ordered_cur_seq(q));
526 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q)) 529 if (blk_ordered_req_seq(rq) > blk_ordered_cur_seq(q))
527 *rqp = NULL; 530 *rqp = NULL;
528 } 531 }
529 532
530 return 1; 533 return 1;
531 } 534 }
532 535
533 static void req_bio_endio(struct request *rq, struct bio *bio, 536 static void req_bio_endio(struct request *rq, struct bio *bio,
534 unsigned int nbytes, int error) 537 unsigned int nbytes, int error)
535 { 538 {
536 struct request_queue *q = rq->q; 539 struct request_queue *q = rq->q;
537 540
538 if (&q->bar_rq != rq) { 541 if (&q->bar_rq != rq) {
539 if (error) 542 if (error)
540 clear_bit(BIO_UPTODATE, &bio->bi_flags); 543 clear_bit(BIO_UPTODATE, &bio->bi_flags);
541 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) 544 else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
542 error = -EIO; 545 error = -EIO;
543 546
544 if (unlikely(nbytes > bio->bi_size)) { 547 if (unlikely(nbytes > bio->bi_size)) {
545 printk("%s: want %u bytes done, only %u left\n", 548 printk("%s: want %u bytes done, only %u left\n",
546 __FUNCTION__, nbytes, bio->bi_size); 549 __FUNCTION__, nbytes, bio->bi_size);
547 nbytes = bio->bi_size; 550 nbytes = bio->bi_size;
548 } 551 }
549 552
550 bio->bi_size -= nbytes; 553 bio->bi_size -= nbytes;
551 bio->bi_sector += (nbytes >> 9); 554 bio->bi_sector += (nbytes >> 9);
552 if (bio->bi_size == 0) 555 if (bio->bi_size == 0)
553 bio_endio(bio, error); 556 bio_endio(bio, error);
554 } else { 557 } else {
555 558
556 /* 559 /*
557 * Okay, this is the barrier request in progress, just 560 * Okay, this is the barrier request in progress, just
558 * record the error; 561 * record the error;
559 */ 562 */
560 if (error && !q->orderr) 563 if (error && !q->orderr)
561 q->orderr = error; 564 q->orderr = error;
562 } 565 }
563 } 566 }
564 567
565 /** 568 /**
566 * blk_queue_bounce_limit - set bounce buffer limit for queue 569 * blk_queue_bounce_limit - set bounce buffer limit for queue
567 * @q: the request queue for the device 570 * @q: the request queue for the device
568 * @dma_addr: bus address limit 571 * @dma_addr: bus address limit
569 * 572 *
570 * Description: 573 * Description:
571 * Different hardware can have different requirements as to what pages 574 * Different hardware can have different requirements as to what pages
572 * it can do I/O directly to. A low level driver can call 575 * it can do I/O directly to. A low level driver can call
573 * blk_queue_bounce_limit to have lower memory pages allocated as bounce 576 * blk_queue_bounce_limit to have lower memory pages allocated as bounce
574 * buffers for doing I/O to pages residing above @page. 577 * buffers for doing I/O to pages residing above @page.
575 **/ 578 **/
576 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr) 579 void blk_queue_bounce_limit(struct request_queue *q, u64 dma_addr)
577 { 580 {
578 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT; 581 unsigned long bounce_pfn = dma_addr >> PAGE_SHIFT;
579 int dma = 0; 582 int dma = 0;
580 583
581 q->bounce_gfp = GFP_NOIO; 584 q->bounce_gfp = GFP_NOIO;
582 #if BITS_PER_LONG == 64 585 #if BITS_PER_LONG == 64
583 /* Assume anything <= 4GB can be handled by IOMMU. 586 /* Assume anything <= 4GB can be handled by IOMMU.
584 Actually some IOMMUs can handle everything, but I don't 587 Actually some IOMMUs can handle everything, but I don't
585 know of a way to test this here. */ 588 know of a way to test this here. */
586 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) 589 if (bounce_pfn < (min_t(u64,0xffffffff,BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
587 dma = 1; 590 dma = 1;
588 q->bounce_pfn = max_low_pfn; 591 q->bounce_pfn = max_low_pfn;
589 #else 592 #else
590 if (bounce_pfn < blk_max_low_pfn) 593 if (bounce_pfn < blk_max_low_pfn)
591 dma = 1; 594 dma = 1;
592 q->bounce_pfn = bounce_pfn; 595 q->bounce_pfn = bounce_pfn;
593 #endif 596 #endif
594 if (dma) { 597 if (dma) {
595 init_emergency_isa_pool(); 598 init_emergency_isa_pool();
596 q->bounce_gfp = GFP_NOIO | GFP_DMA; 599 q->bounce_gfp = GFP_NOIO | GFP_DMA;
597 q->bounce_pfn = bounce_pfn; 600 q->bounce_pfn = bounce_pfn;
598 } 601 }
599 } 602 }
600 603
601 EXPORT_SYMBOL(blk_queue_bounce_limit); 604 EXPORT_SYMBOL(blk_queue_bounce_limit);
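A usage sketch (illustrative, not from this patch): a device that can only DMA below 4GB caps its bounce limit accordingly, and the block layer then bounces any page above that boundary through GFP_NOIO buffers as set up above.

static void example_set_dma_limit(struct request_queue *q)
{
	/* bounce anything above the low 4GB the device can address */
	blk_queue_bounce_limit(q, 0xffffffffULL);
}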
602 605
603 /** 606 /**
604 * blk_queue_max_sectors - set max sectors for a request for this queue 607 * blk_queue_max_sectors - set max sectors for a request for this queue
605 * @q: the request queue for the device 608 * @q: the request queue for the device
606 * @max_sectors: max sectors in the usual 512b unit 609 * @max_sectors: max sectors in the usual 512b unit
607 * 610 *
608 * Description: 611 * Description:
609 * Enables a low level driver to set an upper limit on the size of 612 * Enables a low level driver to set an upper limit on the size of
610 * received requests. 613 * received requests.
611 **/ 614 **/
612 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) 615 void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors)
613 { 616 {
614 if ((max_sectors << 9) < PAGE_CACHE_SIZE) { 617 if ((max_sectors << 9) < PAGE_CACHE_SIZE) {
615 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9); 618 max_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
616 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors); 619 printk("%s: set to minimum %d\n", __FUNCTION__, max_sectors);
617 } 620 }
618 621
619 if (BLK_DEF_MAX_SECTORS > max_sectors) 622 if (BLK_DEF_MAX_SECTORS > max_sectors)
620 q->max_hw_sectors = q->max_sectors = max_sectors; 623 q->max_hw_sectors = q->max_sectors = max_sectors;
621 else { 624 else {
622 q->max_sectors = BLK_DEF_MAX_SECTORS; 625 q->max_sectors = BLK_DEF_MAX_SECTORS;
623 q->max_hw_sectors = max_sectors; 626 q->max_hw_sectors = max_sectors;
624 } 627 }
625 } 628 }
626 629
627 EXPORT_SYMBOL(blk_queue_max_sectors); 630 EXPORT_SYMBOL(blk_queue_max_sectors);
628 631
629 /** 632 /**
630 * blk_queue_max_phys_segments - set max phys segments for a request for this queue 633 * blk_queue_max_phys_segments - set max phys segments for a request for this queue
631 * @q: the request queue for the device 634 * @q: the request queue for the device
632 * @max_segments: max number of segments 635 * @max_segments: max number of segments
633 * 636 *
634 * Description: 637 * Description:
635 * Enables a low level driver to set an upper limit on the number of 638 * Enables a low level driver to set an upper limit on the number of
636 * physical data segments in a request. This would be the largest sized 639 * physical data segments in a request. This would be the largest sized
637 * scatter list the driver could handle. 640 * scatter list the driver could handle.
638 **/ 641 **/
639 void blk_queue_max_phys_segments(struct request_queue *q, 642 void blk_queue_max_phys_segments(struct request_queue *q,
640 unsigned short max_segments) 643 unsigned short max_segments)
641 { 644 {
642 if (!max_segments) { 645 if (!max_segments) {
643 max_segments = 1; 646 max_segments = 1;
644 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 647 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
645 } 648 }
646 649
647 q->max_phys_segments = max_segments; 650 q->max_phys_segments = max_segments;
648 } 651 }
649 652
650 EXPORT_SYMBOL(blk_queue_max_phys_segments); 653 EXPORT_SYMBOL(blk_queue_max_phys_segments);
651 654
652 /** 655 /**
653 * blk_queue_max_hw_segments - set max hw segments for a request for this queue 656 * blk_queue_max_hw_segments - set max hw segments for a request for this queue
654 * @q: the request queue for the device 657 * @q: the request queue for the device
655 * @max_segments: max number of segments 658 * @max_segments: max number of segments
656 * 659 *
657 * Description: 660 * Description:
658 * Enables a low level driver to set an upper limit on the number of 661 * Enables a low level driver to set an upper limit on the number of
659 * hw data segments in a request. This would be the largest number of 662 * hw data segments in a request. This would be the largest number of
660 * address/length pairs the host adapter can actually give at once 663 * address/length pairs the host adapter can actually give at once
661 * to the device. 664 * to the device.
662 **/ 665 **/
663 void blk_queue_max_hw_segments(struct request_queue *q, 666 void blk_queue_max_hw_segments(struct request_queue *q,
664 unsigned short max_segments) 667 unsigned short max_segments)
665 { 668 {
666 if (!max_segments) { 669 if (!max_segments) {
667 max_segments = 1; 670 max_segments = 1;
668 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments); 671 printk("%s: set to minimum %d\n", __FUNCTION__, max_segments);
669 } 672 }
670 673
671 q->max_hw_segments = max_segments; 674 q->max_hw_segments = max_segments;
672 } 675 }
673 676
674 EXPORT_SYMBOL(blk_queue_max_hw_segments); 677 EXPORT_SYMBOL(blk_queue_max_hw_segments);
675 678
676 /** 679 /**
677 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg 680 * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg
678 * @q: the request queue for the device 681 * @q: the request queue for the device
679 * @max_size: max size of segment in bytes 682 * @max_size: max size of segment in bytes
680 * 683 *
681 * Description: 684 * Description:
682 * Enables a low level driver to set an upper limit on the size of a 685 * Enables a low level driver to set an upper limit on the size of a
683 * coalesced segment 686 * coalesced segment
684 **/ 687 **/
685 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) 688 void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
686 { 689 {
687 if (max_size < PAGE_CACHE_SIZE) { 690 if (max_size < PAGE_CACHE_SIZE) {
688 max_size = PAGE_CACHE_SIZE; 691 max_size = PAGE_CACHE_SIZE;
689 printk("%s: set to minimum %d\n", __FUNCTION__, max_size); 692 printk("%s: set to minimum %d\n", __FUNCTION__, max_size);
690 } 693 }
691 694
692 q->max_segment_size = max_size; 695 q->max_segment_size = max_size;
693 } 696 }
694 697
695 EXPORT_SYMBOL(blk_queue_max_segment_size); 698 EXPORT_SYMBOL(blk_queue_max_segment_size);
696 699
697 /** 700 /**
698 * blk_queue_hardsect_size - set hardware sector size for the queue 701 * blk_queue_hardsect_size - set hardware sector size for the queue
699 * @q: the request queue for the device 702 * @q: the request queue for the device
700 * @size: the hardware sector size, in bytes 703 * @size: the hardware sector size, in bytes
701 * 704 *
702 * Description: 705 * Description:
703 * This should typically be set to the lowest possible sector size 706 * This should typically be set to the lowest possible sector size
704 * that the hardware can operate on (possibly without reverting to 707 * that the hardware can operate on (possibly without reverting to
705 * even internal read-modify-write operations). Usually the default 708 * even internal read-modify-write operations). Usually the default
706 * of 512 covers most hardware. 709 * of 512 covers most hardware.
707 **/ 710 **/
708 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) 711 void blk_queue_hardsect_size(struct request_queue *q, unsigned short size)
709 { 712 {
710 q->hardsect_size = size; 713 q->hardsect_size = size;
711 } 714 }
712 715
713 EXPORT_SYMBOL(blk_queue_hardsect_size); 716 EXPORT_SYMBOL(blk_queue_hardsect_size);
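Pulling the setters above together, a typical low-level driver initialization (all values here are illustrative) looks roughly like:

static void example_set_limits(struct request_queue *q)
{
	blk_queue_max_sectors(q, 256);		/* 128KB per request */
	blk_queue_max_phys_segments(q, 32);
	blk_queue_max_hw_segments(q, 32);
	blk_queue_max_segment_size(q, 65536);
	blk_queue_hardsect_size(q, 512);
}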
714 717
715 /* 718 /*
716 * Returns the minimum that is _not_ zero, unless both are zero. 719 * Returns the minimum that is _not_ zero, unless both are zero.
717 */ 720 */
718 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r)) 721 #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
719 722
720 /** 723 /**
721 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers 724 * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
722 * @t: the stacking driver (top) 725 * @t: the stacking driver (top)
723 * @b: the underlying device (bottom) 726 * @b: the underlying device (bottom)
724 **/ 727 **/
725 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) 728 void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
726 { 729 {
727 /* zero is "infinity" */ 730 /* zero is "infinity" */
728 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors); 731 t->max_sectors = min_not_zero(t->max_sectors,b->max_sectors);
729 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors); 732 t->max_hw_sectors = min_not_zero(t->max_hw_sectors,b->max_hw_sectors);
730 733
731 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments); 734 t->max_phys_segments = min(t->max_phys_segments,b->max_phys_segments);
732 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments); 735 t->max_hw_segments = min(t->max_hw_segments,b->max_hw_segments);
733 t->max_segment_size = min(t->max_segment_size,b->max_segment_size); 736 t->max_segment_size = min(t->max_segment_size,b->max_segment_size);
734 t->hardsect_size = max(t->hardsect_size,b->hardsect_size); 737 t->hardsect_size = max(t->hardsect_size,b->hardsect_size);
735 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) 738 if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags))
736 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags); 739 clear_bit(QUEUE_FLAG_CLUSTER, &t->queue_flags);
737 } 740 }
738 741
739 EXPORT_SYMBOL(blk_queue_stack_limits); 742 EXPORT_SYMBOL(blk_queue_stack_limits);
740 743
741 /** 744 /**
742 * blk_queue_segment_boundary - set boundary rules for segment merging 745 * blk_queue_segment_boundary - set boundary rules for segment merging
743 * @q: the request queue for the device 746 * @q: the request queue for the device
744 * @mask: the memory boundary mask 747 * @mask: the memory boundary mask
745 **/ 748 **/
746 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) 749 void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
747 { 750 {
748 if (mask < PAGE_CACHE_SIZE - 1) { 751 if (mask < PAGE_CACHE_SIZE - 1) {
749 mask = PAGE_CACHE_SIZE - 1; 752 mask = PAGE_CACHE_SIZE - 1;
750 printk("%s: set to minimum %lx\n", __FUNCTION__, mask); 753 printk("%s: set to minimum %lx\n", __FUNCTION__, mask);
751 } 754 }
752 755
753 q->seg_boundary_mask = mask; 756 q->seg_boundary_mask = mask;
754 } 757 }
755 758
756 EXPORT_SYMBOL(blk_queue_segment_boundary); 759 EXPORT_SYMBOL(blk_queue_segment_boundary);
757 760
758 /** 761 /**
759 * blk_queue_dma_alignment - set dma length and memory alignment 762 * blk_queue_dma_alignment - set dma length and memory alignment
760 * @q: the request queue for the device 763 * @q: the request queue for the device
761 * @mask: alignment mask 764 * @mask: alignment mask
762 * 765 *
763 * description: 766 * description:
764 * set required memory and length alignment for direct dma transactions. 767 * set required memory and length alignment for direct dma transactions.
765 * this is used when building direct io requests for the queue. 768 * this is used when building direct io requests for the queue.
766 * 769 *
767 **/ 770 **/
768 void blk_queue_dma_alignment(struct request_queue *q, int mask) 771 void blk_queue_dma_alignment(struct request_queue *q, int mask)
769 { 772 {
770 q->dma_alignment = mask; 773 q->dma_alignment = mask;
771 } 774 }
772 775
773 EXPORT_SYMBOL(blk_queue_dma_alignment); 776 EXPORT_SYMBOL(blk_queue_dma_alignment);
774 777
775 /** 778 /**
776 * blk_queue_find_tag - find a request by its tag and queue 779 * blk_queue_find_tag - find a request by its tag and queue
777 * @q: The request queue for the device 780 * @q: The request queue for the device
778 * @tag: The tag of the request 781 * @tag: The tag of the request
779 * 782 *
780 * Notes: 783 * Notes:
781 * Should be used when a device returns a tag and you want to match 784 * Should be used when a device returns a tag and you want to match
782 * it with a request. 785 * it with a request.
783 * 786 *
784 * no locks need be held. 787 * no locks need be held.
785 **/ 788 **/
786 struct request *blk_queue_find_tag(struct request_queue *q, int tag) 789 struct request *blk_queue_find_tag(struct request_queue *q, int tag)
787 { 790 {
788 return blk_map_queue_find_tag(q->queue_tags, tag); 791 return blk_map_queue_find_tag(q->queue_tags, tag);
789 } 792 }
790 793
791 EXPORT_SYMBOL(blk_queue_find_tag); 794 EXPORT_SYMBOL(blk_queue_find_tag);
792 795
793 /** 796 /**
794 * __blk_free_tags - release a given set of tag maintenance info 797 * __blk_free_tags - release a given set of tag maintenance info
795 * @bqt: the tag map to free 798 * @bqt: the tag map to free
796 * 799 *
797 * Tries to free the specified @bqt@. Returns true if it was 800 * Tries to free the specified @bqt@. Returns true if it was
798 * actually freed and false if there are still references using it 801 * actually freed and false if there are still references using it
799 */ 802 */
800 static int __blk_free_tags(struct blk_queue_tag *bqt) 803 static int __blk_free_tags(struct blk_queue_tag *bqt)
801 { 804 {
802 int retval; 805 int retval;
803 806
804 retval = atomic_dec_and_test(&bqt->refcnt); 807 retval = atomic_dec_and_test(&bqt->refcnt);
805 if (retval) { 808 if (retval) {
806 BUG_ON(bqt->busy); 809 BUG_ON(bqt->busy);
807 BUG_ON(!list_empty(&bqt->busy_list)); 810 BUG_ON(!list_empty(&bqt->busy_list));
808 811
809 kfree(bqt->tag_index); 812 kfree(bqt->tag_index);
810 bqt->tag_index = NULL; 813 bqt->tag_index = NULL;
811 814
812 kfree(bqt->tag_map); 815 kfree(bqt->tag_map);
813 bqt->tag_map = NULL; 816 bqt->tag_map = NULL;
814 817
815 kfree(bqt); 818 kfree(bqt);
816 819
817 } 820 }
818 821
819 return retval; 822 return retval;
820 } 823 }
821 824
822 /** 825 /**
823 * __blk_queue_free_tags - release tag maintenance info 826 * __blk_queue_free_tags - release tag maintenance info
824 * @q: the request queue for the device 827 * @q: the request queue for the device
825 * 828 *
826 * Notes: 829 * Notes:
827 * blk_cleanup_queue() will take care of calling this function, if tagging 830 * blk_cleanup_queue() will take care of calling this function, if tagging
828 * has been used. So there's no need to call this directly. 831 * has been used. So there's no need to call this directly.
829 **/ 832 **/
830 static void __blk_queue_free_tags(struct request_queue *q) 833 static void __blk_queue_free_tags(struct request_queue *q)
831 { 834 {
832 struct blk_queue_tag *bqt = q->queue_tags; 835 struct blk_queue_tag *bqt = q->queue_tags;
833 836
834 if (!bqt) 837 if (!bqt)
835 return; 838 return;
836 839
837 __blk_free_tags(bqt); 840 __blk_free_tags(bqt);
838 841
839 q->queue_tags = NULL; 842 q->queue_tags = NULL;
840 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED); 843 q->queue_flags &= ~(1 << QUEUE_FLAG_QUEUED);
841 } 844 }
842 845
843 846
844 /** 847 /**
845 * blk_free_tags - release a given set of tag maintenance info 848 * blk_free_tags - release a given set of tag maintenance info
846 * @bqt: the tag map to free 849 * @bqt: the tag map to free
847 * 850 *
848 * For externally managed @bqt@ frees the map. Callers of this 851 * For externally managed @bqt@ frees the map. Callers of this
849 * function must guarantee to have released all the queues that 852 * function must guarantee to have released all the queues that
850 * might have been using this tag map. 853 * might have been using this tag map.
851 */ 854 */
852 void blk_free_tags(struct blk_queue_tag *bqt) 855 void blk_free_tags(struct blk_queue_tag *bqt)
853 { 856 {
854 if (unlikely(!__blk_free_tags(bqt))) 857 if (unlikely(!__blk_free_tags(bqt)))
855 BUG(); 858 BUG();
856 } 859 }
857 EXPORT_SYMBOL(blk_free_tags); 860 EXPORT_SYMBOL(blk_free_tags);
858 861
859 /** 862 /**
860 * blk_queue_free_tags - release tag maintenance info 863 * blk_queue_free_tags - release tag maintenance info
861 * @q: the request queue for the device 864 * @q: the request queue for the device
862 * 865 *
863 * Notes: 866 * Notes:
864 * This is used to disable tagged queuing to a device, yet leave 867 * This is used to disable tagged queuing to a device, yet leave
865 * queue in function. 868 * queue in function.
866 **/ 869 **/
867 void blk_queue_free_tags(struct request_queue *q) 870 void blk_queue_free_tags(struct request_queue *q)
868 { 871 {
869 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 872 clear_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
870 } 873 }
871 874
872 EXPORT_SYMBOL(blk_queue_free_tags); 875 EXPORT_SYMBOL(blk_queue_free_tags);
873 876
874 static int 877 static int
875 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth) 878 init_tag_map(struct request_queue *q, struct blk_queue_tag *tags, int depth)
876 { 879 {
877 struct request **tag_index; 880 struct request **tag_index;
878 unsigned long *tag_map; 881 unsigned long *tag_map;
879 int nr_ulongs; 882 int nr_ulongs;
880 883
881 if (q && depth > q->nr_requests * 2) { 884 if (q && depth > q->nr_requests * 2) {
882 depth = q->nr_requests * 2; 885 depth = q->nr_requests * 2;
883 printk(KERN_ERR "%s: adjusted depth to %d\n", 886 printk(KERN_ERR "%s: adjusted depth to %d\n",
884 __FUNCTION__, depth); 887 __FUNCTION__, depth);
885 } 888 }
886 889
887 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC); 890 tag_index = kzalloc(depth * sizeof(struct request *), GFP_ATOMIC);
888 if (!tag_index) 891 if (!tag_index)
889 goto fail; 892 goto fail;
890 893
891 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG; 894 nr_ulongs = ALIGN(depth, BITS_PER_LONG) / BITS_PER_LONG;
892 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC); 895 tag_map = kzalloc(nr_ulongs * sizeof(unsigned long), GFP_ATOMIC);
893 if (!tag_map) 896 if (!tag_map)
894 goto fail; 897 goto fail;
895 898
896 tags->real_max_depth = depth; 899 tags->real_max_depth = depth;
897 tags->max_depth = depth; 900 tags->max_depth = depth;
898 tags->tag_index = tag_index; 901 tags->tag_index = tag_index;
899 tags->tag_map = tag_map; 902 tags->tag_map = tag_map;
900 903
901 return 0; 904 return 0;
902 fail: 905 fail:
903 kfree(tag_index); 906 kfree(tag_index);
904 return -ENOMEM; 907 return -ENOMEM;
905 } 908 }
906 909
907 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q, 910 static struct blk_queue_tag *__blk_queue_init_tags(struct request_queue *q,
908 int depth) 911 int depth)
909 { 912 {
910 struct blk_queue_tag *tags; 913 struct blk_queue_tag *tags;
911 914
912 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC); 915 tags = kmalloc(sizeof(struct blk_queue_tag), GFP_ATOMIC);
913 if (!tags) 916 if (!tags)
914 goto fail; 917 goto fail;
915 918
916 if (init_tag_map(q, tags, depth)) 919 if (init_tag_map(q, tags, depth))
917 goto fail; 920 goto fail;
918 921
919 INIT_LIST_HEAD(&tags->busy_list); 922 INIT_LIST_HEAD(&tags->busy_list);
920 tags->busy = 0; 923 tags->busy = 0;
921 atomic_set(&tags->refcnt, 1); 924 atomic_set(&tags->refcnt, 1);
922 return tags; 925 return tags;
923 fail: 926 fail:
924 kfree(tags); 927 kfree(tags);
925 return NULL; 928 return NULL;
926 } 929 }
927 930
928 /** 931 /**
929 * blk_init_tags - initialize the tag info for an external tag map 932 * blk_init_tags - initialize the tag info for an external tag map
930 * @depth: the maximum queue depth supported 933 * @depth: the maximum queue depth supported
931 * @tags: the tag to use 934 * @tags: the tag to use
932 **/ 935 **/
933 struct blk_queue_tag *blk_init_tags(int depth) 936 struct blk_queue_tag *blk_init_tags(int depth)
934 { 937 {
935 return __blk_queue_init_tags(NULL, depth); 938 return __blk_queue_init_tags(NULL, depth);
936 } 939 }
937 EXPORT_SYMBOL(blk_init_tags); 940 EXPORT_SYMBOL(blk_init_tags);
938 941
939 /** 942 /**
940 * blk_queue_init_tags - initialize the queue tag info 943 * blk_queue_init_tags - initialize the queue tag info
941 * @q: the request queue for the device 944 * @q: the request queue for the device
942 * @depth: the maximum queue depth supported 945 * @depth: the maximum queue depth supported
943 * @tags: the tag to use 946 * @tags: the tag to use
944 **/ 947 **/
945 int blk_queue_init_tags(struct request_queue *q, int depth, 948 int blk_queue_init_tags(struct request_queue *q, int depth,
946 struct blk_queue_tag *tags) 949 struct blk_queue_tag *tags)
947 { 950 {
948 int rc; 951 int rc;
949 952
950 BUG_ON(tags && q->queue_tags && tags != q->queue_tags); 953 BUG_ON(tags && q->queue_tags && tags != q->queue_tags);
951 954
952 if (!tags && !q->queue_tags) { 955 if (!tags && !q->queue_tags) {
953 tags = __blk_queue_init_tags(q, depth); 956 tags = __blk_queue_init_tags(q, depth);
954 957
955 if (!tags) 958 if (!tags)
956 goto fail; 959 goto fail;
957 } else if (q->queue_tags) { 960 } else if (q->queue_tags) {
958 if ((rc = blk_queue_resize_tags(q, depth))) 961 if ((rc = blk_queue_resize_tags(q, depth)))
959 return rc; 962 return rc;
960 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags); 963 set_bit(QUEUE_FLAG_QUEUED, &q->queue_flags);
961 return 0; 964 return 0;
962 } else 965 } else
963 atomic_inc(&tags->refcnt); 966 atomic_inc(&tags->refcnt);
964 967
965 /* 968 /*
966 * assign it, all done 969 * assign it, all done
967 */ 970 */
968 q->queue_tags = tags; 971 q->queue_tags = tags;
969 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED); 972 q->queue_flags |= (1 << QUEUE_FLAG_QUEUED);
970 return 0; 973 return 0;
971 fail: 974 fail:
972 kfree(tags); 975 kfree(tags);
973 return -ENOMEM; 976 return -ENOMEM;
974 } 977 }
975 978
976 EXPORT_SYMBOL(blk_queue_init_tags); 979 EXPORT_SYMBOL(blk_queue_init_tags);
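
For illustration only (nothing below is part of this commit; the foo_* names and the depth are hypothetical), a controller driver would typically enable tagging right after setting up its queue, optionally sharing one tag map across several queues via blk_init_tags():

#include <linux/blkdev.h>

#define FOO_QUEUE_DEPTH	64	/* hypothetical hardware queue depth */

/* per-HBA tag map shared by all queues of the controller (assumption) */
static struct blk_queue_tag *foo_shared_tags;

static int foo_enable_tagging(struct request_queue *q)
{
	/* allocate the shared map once, reuse it for every queue */
	if (!foo_shared_tags) {
		foo_shared_tags = blk_init_tags(FOO_QUEUE_DEPTH);
		if (!foo_shared_tags)
			return -ENOMEM;
	}

	/* attach the map to this queue and mark it as tagged */
	return blk_queue_init_tags(q, FOO_QUEUE_DEPTH, foo_shared_tags);
}

With a shared map, the second and later calls only bump the reference count, as the q->queue_tags branch above shows.
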
977 980
978 /** 981 /**
979 * blk_queue_resize_tags - change the queueing depth 982 * blk_queue_resize_tags - change the queueing depth
980 * @q: the request queue for the device 983 * @q: the request queue for the device
981 * @new_depth: the new max command queueing depth 984 * @new_depth: the new max command queueing depth
982 * 985 *
983 * Notes: 986 * Notes:
984 * Must be called with the queue lock held. 987 * Must be called with the queue lock held.
985 **/ 988 **/
986 int blk_queue_resize_tags(struct request_queue *q, int new_depth) 989 int blk_queue_resize_tags(struct request_queue *q, int new_depth)
987 { 990 {
988 struct blk_queue_tag *bqt = q->queue_tags; 991 struct blk_queue_tag *bqt = q->queue_tags;
989 struct request **tag_index; 992 struct request **tag_index;
990 unsigned long *tag_map; 993 unsigned long *tag_map;
991 int max_depth, nr_ulongs; 994 int max_depth, nr_ulongs;
992 995
993 if (!bqt) 996 if (!bqt)
994 return -ENXIO; 997 return -ENXIO;
995 998
996 /* 999 /*
997 	 * if we already have a large enough real_max_depth, just 1000 	 * if we already have a large enough real_max_depth, just
998 * adjust max_depth. *NOTE* as requests with tag value 1001 * adjust max_depth. *NOTE* as requests with tag value
999 * between new_depth and real_max_depth can be in-flight, tag 1002 * between new_depth and real_max_depth can be in-flight, tag
1000 * map can not be shrunk blindly here. 1003 * map can not be shrunk blindly here.
1001 */ 1004 */
1002 if (new_depth <= bqt->real_max_depth) { 1005 if (new_depth <= bqt->real_max_depth) {
1003 bqt->max_depth = new_depth; 1006 bqt->max_depth = new_depth;
1004 return 0; 1007 return 0;
1005 } 1008 }
1006 1009
1007 /* 1010 /*
1008 * Currently cannot replace a shared tag map with a new 1011 * Currently cannot replace a shared tag map with a new
1009 * one, so error out if this is the case 1012 * one, so error out if this is the case
1010 */ 1013 */
1011 if (atomic_read(&bqt->refcnt) != 1) 1014 if (atomic_read(&bqt->refcnt) != 1)
1012 return -EBUSY; 1015 return -EBUSY;
1013 1016
1014 /* 1017 /*
1015 * save the old state info, so we can copy it back 1018 * save the old state info, so we can copy it back
1016 */ 1019 */
1017 tag_index = bqt->tag_index; 1020 tag_index = bqt->tag_index;
1018 tag_map = bqt->tag_map; 1021 tag_map = bqt->tag_map;
1019 max_depth = bqt->real_max_depth; 1022 max_depth = bqt->real_max_depth;
1020 1023
1021 if (init_tag_map(q, bqt, new_depth)) 1024 if (init_tag_map(q, bqt, new_depth))
1022 return -ENOMEM; 1025 return -ENOMEM;
1023 1026
1024 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *)); 1027 memcpy(bqt->tag_index, tag_index, max_depth * sizeof(struct request *));
1025 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG; 1028 nr_ulongs = ALIGN(max_depth, BITS_PER_LONG) / BITS_PER_LONG;
1026 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long)); 1029 memcpy(bqt->tag_map, tag_map, nr_ulongs * sizeof(unsigned long));
1027 1030
1028 kfree(tag_index); 1031 kfree(tag_index);
1029 kfree(tag_map); 1032 kfree(tag_map);
1030 return 0; 1033 return 0;
1031 } 1034 }
1032 1035
1033 EXPORT_SYMBOL(blk_queue_resize_tags); 1036 EXPORT_SYMBOL(blk_queue_resize_tags);
1034 1037
1035 /** 1038 /**
1036 * blk_queue_end_tag - end tag operations for a request 1039 * blk_queue_end_tag - end tag operations for a request
1037 * @q: the request queue for the device 1040 * @q: the request queue for the device
1038 * @rq: the request that has completed 1041 * @rq: the request that has completed
1039 * 1042 *
1040 * Description: 1043 * Description:
1041 * Typically called when end_that_request_first() returns 0, meaning 1044 * Typically called when end_that_request_first() returns 0, meaning
1042 * all transfers have been done for a request. It's important to call 1045 * all transfers have been done for a request. It's important to call
1043 * this function before end_that_request_last(), as that will put the 1046 * this function before end_that_request_last(), as that will put the
1044 * request back on the free list thus corrupting the internal tag list. 1047 * request back on the free list thus corrupting the internal tag list.
1045 * 1048 *
1046 * Notes: 1049 * Notes:
1047 * queue lock must be held. 1050 * queue lock must be held.
1048 **/ 1051 **/
1049 void blk_queue_end_tag(struct request_queue *q, struct request *rq) 1052 void blk_queue_end_tag(struct request_queue *q, struct request *rq)
1050 { 1053 {
1051 struct blk_queue_tag *bqt = q->queue_tags; 1054 struct blk_queue_tag *bqt = q->queue_tags;
1052 int tag = rq->tag; 1055 int tag = rq->tag;
1053 1056
1054 BUG_ON(tag == -1); 1057 BUG_ON(tag == -1);
1055 1058
1056 if (unlikely(tag >= bqt->real_max_depth)) 1059 if (unlikely(tag >= bqt->real_max_depth))
1057 /* 1060 /*
1058 * This can happen after tag depth has been reduced. 1061 * This can happen after tag depth has been reduced.
1059 * FIXME: how about a warning or info message here? 1062 * FIXME: how about a warning or info message here?
1060 */ 1063 */
1061 return; 1064 return;
1062 1065
1063 list_del_init(&rq->queuelist); 1066 list_del_init(&rq->queuelist);
1064 rq->cmd_flags &= ~REQ_QUEUED; 1067 rq->cmd_flags &= ~REQ_QUEUED;
1065 rq->tag = -1; 1068 rq->tag = -1;
1066 1069
1067 if (unlikely(bqt->tag_index[tag] == NULL)) 1070 if (unlikely(bqt->tag_index[tag] == NULL))
1068 printk(KERN_ERR "%s: tag %d is missing\n", 1071 printk(KERN_ERR "%s: tag %d is missing\n",
1069 __FUNCTION__, tag); 1072 __FUNCTION__, tag);
1070 1073
1071 bqt->tag_index[tag] = NULL; 1074 bqt->tag_index[tag] = NULL;
1072 1075
1073 /* 1076 /*
1074 * We use test_and_clear_bit's memory ordering properties here. 1077 * We use test_and_clear_bit's memory ordering properties here.
1075 * The tag_map bit acts as a lock for tag_index[bit], so we need 1078 * The tag_map bit acts as a lock for tag_index[bit], so we need
1076 	 * a barrier before clearing the bit (precisely: release semantics). 1079 	 * a barrier before clearing the bit (precisely: release semantics).
1077 * Could use clear_bit_unlock when it is merged. 1080 * Could use clear_bit_unlock when it is merged.
1078 */ 1081 */
1079 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) { 1082 if (unlikely(!test_and_clear_bit(tag, bqt->tag_map))) {
1080 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n", 1083 printk(KERN_ERR "%s: attempt to clear non-busy tag (%d)\n",
1081 __FUNCTION__, tag); 1084 __FUNCTION__, tag);
1082 return; 1085 return;
1083 } 1086 }
1084 1087
1085 bqt->busy--; 1088 bqt->busy--;
1086 } 1089 }
1087 1090
1088 EXPORT_SYMBOL(blk_queue_end_tag); 1091 EXPORT_SYMBOL(blk_queue_end_tag);
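
As a sketch of the ordering described above (hypothetical driver completion path, queue lock held; end_that_request_first()/end_that_request_last() are assumed to be the completion helpers of this kernel generation):

#include <linux/blkdev.h>

/* called with q->queue_lock held, e.g. from the driver's IRQ handler */
static void foo_complete_rq(struct request_queue *q, struct request *rq,
			    int uptodate)
{
	/* finish all sectors; returns 0 when nothing is left to transfer */
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		/* release the tag while rq is still valid ... */
		blk_queue_end_tag(q, rq);
		/* ... and only then put the request back on the free list */
		end_that_request_last(rq, uptodate);
	}
}
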
1089 1092
1090 /** 1093 /**
1091 * blk_queue_start_tag - find a free tag and assign it 1094 * blk_queue_start_tag - find a free tag and assign it
1092 * @q: the request queue for the device 1095 * @q: the request queue for the device
1093 * @rq: the block request that needs tagging 1096 * @rq: the block request that needs tagging
1094 * 1097 *
1095 * Description: 1098 * Description:
1096 * This can either be used as a stand-alone helper, or possibly be 1099 * This can either be used as a stand-alone helper, or possibly be
1097 * assigned as the queue &prep_rq_fn (in which case &struct request 1100 * assigned as the queue &prep_rq_fn (in which case &struct request
1098 * automagically gets a tag assigned). Note that this function 1101 * automagically gets a tag assigned). Note that this function
1099 * assumes that any type of request can be queued! if this is not 1102 * assumes that any type of request can be queued! if this is not
1100 * true for your device, you must check the request type before 1103 * true for your device, you must check the request type before
1101 * calling this function. The request will also be removed from 1104 * calling this function. The request will also be removed from
1102 	 * the request queue, so it's the driver's responsibility to re-add 1105 	 * the request queue, so it's the driver's responsibility to re-add
1103 * it if it should need to be restarted for some reason. 1106 * it if it should need to be restarted for some reason.
1104 * 1107 *
1105 * Notes: 1108 * Notes:
1106 * queue lock must be held. 1109 * queue lock must be held.
1107 **/ 1110 **/
1108 int blk_queue_start_tag(struct request_queue *q, struct request *rq) 1111 int blk_queue_start_tag(struct request_queue *q, struct request *rq)
1109 { 1112 {
1110 struct blk_queue_tag *bqt = q->queue_tags; 1113 struct blk_queue_tag *bqt = q->queue_tags;
1111 int tag; 1114 int tag;
1112 1115
1113 if (unlikely((rq->cmd_flags & REQ_QUEUED))) { 1116 if (unlikely((rq->cmd_flags & REQ_QUEUED))) {
1114 printk(KERN_ERR 1117 printk(KERN_ERR
1115 "%s: request %p for device [%s] already tagged %d", 1118 "%s: request %p for device [%s] already tagged %d",
1116 __FUNCTION__, rq, 1119 __FUNCTION__, rq,
1117 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag); 1120 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->tag);
1118 BUG(); 1121 BUG();
1119 } 1122 }
1120 1123
1121 /* 1124 /*
1122 * Protect against shared tag maps, as we may not have exclusive 1125 * Protect against shared tag maps, as we may not have exclusive
1123 * access to the tag map. 1126 * access to the tag map.
1124 */ 1127 */
1125 do { 1128 do {
1126 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth); 1129 tag = find_first_zero_bit(bqt->tag_map, bqt->max_depth);
1127 if (tag >= bqt->max_depth) 1130 if (tag >= bqt->max_depth)
1128 return 1; 1131 return 1;
1129 1132
1130 } while (test_and_set_bit(tag, bqt->tag_map)); 1133 } while (test_and_set_bit(tag, bqt->tag_map));
1131 /* 1134 /*
1132 * We rely on test_and_set_bit providing lock memory ordering semantics 1135 * We rely on test_and_set_bit providing lock memory ordering semantics
1133 * (could use test_and_set_bit_lock when it is merged). 1136 * (could use test_and_set_bit_lock when it is merged).
1134 */ 1137 */
1135 1138
1136 rq->cmd_flags |= REQ_QUEUED; 1139 rq->cmd_flags |= REQ_QUEUED;
1137 rq->tag = tag; 1140 rq->tag = tag;
1138 bqt->tag_index[tag] = rq; 1141 bqt->tag_index[tag] = rq;
1139 blkdev_dequeue_request(rq); 1142 blkdev_dequeue_request(rq);
1140 list_add(&rq->queuelist, &bqt->busy_list); 1143 list_add(&rq->queuelist, &bqt->busy_list);
1141 bqt->busy++; 1144 bqt->busy++;
1142 return 0; 1145 return 0;
1143 } 1146 }
1144 1147
1145 EXPORT_SYMBOL(blk_queue_start_tag); 1148 EXPORT_SYMBOL(blk_queue_start_tag);
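
The submission side, sketched as a request_fn for a hypothetical tagged driver (foo_issue_to_hardware() is a placeholder, not a real interface); a non-zero return from blk_queue_start_tag() simply means the tag space is currently exhausted:

#include <linux/blkdev.h>

/* placeholder for the driver's own submission routine (hypothetical) */
static void foo_issue_to_hardware(struct request_queue *q, struct request *rq);

/* request_fn for a tagged queue; the block layer holds q->queue_lock here */
static void foo_tagged_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/*
		 * no free tag: leave rq on the queue, it is retried when a
		 * completion frees a tag and the queue is run again
		 */
		if (blk_queue_start_tag(q, rq))
			break;

		/* rq->tag is now set and rq has been dequeued for us */
		foo_issue_to_hardware(q, rq);
	}
}

Because blk_queue_start_tag() dequeues the request itself, the driver must not call blkdev_dequeue_request() again for it.
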
1146 1149
1147 /** 1150 /**
1148 * blk_queue_invalidate_tags - invalidate all pending tags 1151 * blk_queue_invalidate_tags - invalidate all pending tags
1149 * @q: the request queue for the device 1152 * @q: the request queue for the device
1150 * 1153 *
1151 * Description: 1154 * Description:
1152 * Hardware conditions may dictate a need to stop all pending requests. 1155 * Hardware conditions may dictate a need to stop all pending requests.
1153 * In this case, we will safely clear the block side of the tag queue and 1156 * In this case, we will safely clear the block side of the tag queue and
1154 	 * re-add all requests to the request queue in the right order. 1157 	 * re-add all requests to the request queue in the right order.
1155 * 1158 *
1156 * Notes: 1159 * Notes:
1157 * queue lock must be held. 1160 * queue lock must be held.
1158 **/ 1161 **/
1159 void blk_queue_invalidate_tags(struct request_queue *q) 1162 void blk_queue_invalidate_tags(struct request_queue *q)
1160 { 1163 {
1161 struct blk_queue_tag *bqt = q->queue_tags; 1164 struct blk_queue_tag *bqt = q->queue_tags;
1162 struct list_head *tmp, *n; 1165 struct list_head *tmp, *n;
1163 struct request *rq; 1166 struct request *rq;
1164 1167
1165 list_for_each_safe(tmp, n, &bqt->busy_list) { 1168 list_for_each_safe(tmp, n, &bqt->busy_list) {
1166 rq = list_entry_rq(tmp); 1169 rq = list_entry_rq(tmp);
1167 1170
1168 if (rq->tag == -1) { 1171 if (rq->tag == -1) {
1169 printk(KERN_ERR 1172 printk(KERN_ERR
1170 "%s: bad tag found on list\n", __FUNCTION__); 1173 "%s: bad tag found on list\n", __FUNCTION__);
1171 list_del_init(&rq->queuelist); 1174 list_del_init(&rq->queuelist);
1172 rq->cmd_flags &= ~REQ_QUEUED; 1175 rq->cmd_flags &= ~REQ_QUEUED;
1173 } else 1176 } else
1174 blk_queue_end_tag(q, rq); 1177 blk_queue_end_tag(q, rq);
1175 1178
1176 rq->cmd_flags &= ~REQ_STARTED; 1179 rq->cmd_flags &= ~REQ_STARTED;
1177 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0); 1180 __elv_add_request(q, rq, ELEVATOR_INSERT_BACK, 0);
1178 } 1181 }
1179 } 1182 }
1180 1183
1181 EXPORT_SYMBOL(blk_queue_invalidate_tags); 1184 EXPORT_SYMBOL(blk_queue_invalidate_tags);
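
A sketch of the intended use after, say, a controller reset (the surrounding error-recovery context is assumed, not taken from this file):

#include <linux/blkdev.h>

/* hypothetical error-recovery path after a controller reset */
static void foo_after_controller_reset(struct request_queue *q)
{
	unsigned long flags;

	spin_lock_irqsave(q->queue_lock, flags);
	/* return every tagged request to the queue, with tags cleared */
	blk_queue_invalidate_tags(q);
	spin_unlock_irqrestore(q->queue_lock, flags);

	/* reissue the work; blk_run_queue() takes the lock itself */
	blk_run_queue(q);
}
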
1182 1185
1183 void blk_dump_rq_flags(struct request *rq, char *msg) 1186 void blk_dump_rq_flags(struct request *rq, char *msg)
1184 { 1187 {
1185 int bit; 1188 int bit;
1186 1189
1187 printk("%s: dev %s: type=%x, flags=%x\n", msg, 1190 printk("%s: dev %s: type=%x, flags=%x\n", msg,
1188 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, 1191 rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type,
1189 rq->cmd_flags); 1192 rq->cmd_flags);
1190 1193
1191 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector, 1194 printk("\nsector %llu, nr/cnr %lu/%u\n", (unsigned long long)rq->sector,
1192 rq->nr_sectors, 1195 rq->nr_sectors,
1193 rq->current_nr_sectors); 1196 rq->current_nr_sectors);
1194 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len); 1197 printk("bio %p, biotail %p, buffer %p, data %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data, rq->data_len);
1195 1198
1196 if (blk_pc_request(rq)) { 1199 if (blk_pc_request(rq)) {
1197 printk("cdb: "); 1200 printk("cdb: ");
1198 for (bit = 0; bit < sizeof(rq->cmd); bit++) 1201 for (bit = 0; bit < sizeof(rq->cmd); bit++)
1199 printk("%02x ", rq->cmd[bit]); 1202 printk("%02x ", rq->cmd[bit]);
1200 printk("\n"); 1203 printk("\n");
1201 } 1204 }
1202 } 1205 }
1203 1206
1204 EXPORT_SYMBOL(blk_dump_rq_flags); 1207 EXPORT_SYMBOL(blk_dump_rq_flags);
1205 1208
1206 void blk_recount_segments(struct request_queue *q, struct bio *bio) 1209 void blk_recount_segments(struct request_queue *q, struct bio *bio)
1207 { 1210 {
1208 struct request rq; 1211 struct request rq;
1209 struct bio *nxt = bio->bi_next; 1212 struct bio *nxt = bio->bi_next;
1210 rq.q = q; 1213 rq.q = q;
1211 rq.bio = rq.biotail = bio; 1214 rq.bio = rq.biotail = bio;
1212 bio->bi_next = NULL; 1215 bio->bi_next = NULL;
1213 blk_recalc_rq_segments(&rq); 1216 blk_recalc_rq_segments(&rq);
1214 bio->bi_next = nxt; 1217 bio->bi_next = nxt;
1215 bio->bi_phys_segments = rq.nr_phys_segments; 1218 bio->bi_phys_segments = rq.nr_phys_segments;
1216 bio->bi_hw_segments = rq.nr_hw_segments; 1219 bio->bi_hw_segments = rq.nr_hw_segments;
1217 bio->bi_flags |= (1 << BIO_SEG_VALID); 1220 bio->bi_flags |= (1 << BIO_SEG_VALID);
1218 } 1221 }
1219 EXPORT_SYMBOL(blk_recount_segments); 1222 EXPORT_SYMBOL(blk_recount_segments);
1220 1223
1221 static void blk_recalc_rq_segments(struct request *rq) 1224 static void blk_recalc_rq_segments(struct request *rq)
1222 { 1225 {
1223 int nr_phys_segs; 1226 int nr_phys_segs;
1224 int nr_hw_segs; 1227 int nr_hw_segs;
1225 unsigned int phys_size; 1228 unsigned int phys_size;
1226 unsigned int hw_size; 1229 unsigned int hw_size;
1227 struct bio_vec *bv, *bvprv = NULL; 1230 struct bio_vec *bv, *bvprv = NULL;
1228 int seg_size; 1231 int seg_size;
1229 int hw_seg_size; 1232 int hw_seg_size;
1230 int cluster; 1233 int cluster;
1231 struct req_iterator iter; 1234 struct req_iterator iter;
1232 int high, highprv = 1; 1235 int high, highprv = 1;
1233 struct request_queue *q = rq->q; 1236 struct request_queue *q = rq->q;
1234 1237
1235 if (!rq->bio) 1238 if (!rq->bio)
1236 return; 1239 return;
1237 1240
1238 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1241 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1239 hw_seg_size = seg_size = 0; 1242 hw_seg_size = seg_size = 0;
1240 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0; 1243 phys_size = hw_size = nr_phys_segs = nr_hw_segs = 0;
1241 rq_for_each_segment(bv, rq, iter) { 1244 rq_for_each_segment(bv, rq, iter) {
1242 /* 1245 /*
1243 * the trick here is making sure that a high page is never 1246 * the trick here is making sure that a high page is never
1244 * considered part of another segment, since that might 1247 * considered part of another segment, since that might
1245 * change with the bounce page. 1248 * change with the bounce page.
1246 */ 1249 */
1247 high = page_to_pfn(bv->bv_page) > q->bounce_pfn; 1250 high = page_to_pfn(bv->bv_page) > q->bounce_pfn;
1248 if (high || highprv) 1251 if (high || highprv)
1249 goto new_hw_segment; 1252 goto new_hw_segment;
1250 if (cluster) { 1253 if (cluster) {
1251 if (seg_size + bv->bv_len > q->max_segment_size) 1254 if (seg_size + bv->bv_len > q->max_segment_size)
1252 goto new_segment; 1255 goto new_segment;
1253 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) 1256 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv))
1254 goto new_segment; 1257 goto new_segment;
1255 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv)) 1258 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bv))
1256 goto new_segment; 1259 goto new_segment;
1257 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1260 if (BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1258 goto new_hw_segment; 1261 goto new_hw_segment;
1259 1262
1260 seg_size += bv->bv_len; 1263 seg_size += bv->bv_len;
1261 hw_seg_size += bv->bv_len; 1264 hw_seg_size += bv->bv_len;
1262 bvprv = bv; 1265 bvprv = bv;
1263 continue; 1266 continue;
1264 } 1267 }
1265 new_segment: 1268 new_segment:
1266 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) && 1269 if (BIOVEC_VIRT_MERGEABLE(bvprv, bv) &&
1267 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len)) 1270 !BIOVEC_VIRT_OVERSIZE(hw_seg_size + bv->bv_len))
1268 hw_seg_size += bv->bv_len; 1271 hw_seg_size += bv->bv_len;
1269 else { 1272 else {
1270 new_hw_segment: 1273 new_hw_segment:
1271 if (nr_hw_segs == 1 && 1274 if (nr_hw_segs == 1 &&
1272 hw_seg_size > rq->bio->bi_hw_front_size) 1275 hw_seg_size > rq->bio->bi_hw_front_size)
1273 rq->bio->bi_hw_front_size = hw_seg_size; 1276 rq->bio->bi_hw_front_size = hw_seg_size;
1274 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len; 1277 hw_seg_size = BIOVEC_VIRT_START_SIZE(bv) + bv->bv_len;
1275 nr_hw_segs++; 1278 nr_hw_segs++;
1276 } 1279 }
1277 1280
1278 nr_phys_segs++; 1281 nr_phys_segs++;
1279 bvprv = bv; 1282 bvprv = bv;
1280 seg_size = bv->bv_len; 1283 seg_size = bv->bv_len;
1281 highprv = high; 1284 highprv = high;
1282 } 1285 }
1283 1286
1284 if (nr_hw_segs == 1 && 1287 if (nr_hw_segs == 1 &&
1285 hw_seg_size > rq->bio->bi_hw_front_size) 1288 hw_seg_size > rq->bio->bi_hw_front_size)
1286 rq->bio->bi_hw_front_size = hw_seg_size; 1289 rq->bio->bi_hw_front_size = hw_seg_size;
1287 if (hw_seg_size > rq->biotail->bi_hw_back_size) 1290 if (hw_seg_size > rq->biotail->bi_hw_back_size)
1288 rq->biotail->bi_hw_back_size = hw_seg_size; 1291 rq->biotail->bi_hw_back_size = hw_seg_size;
1289 rq->nr_phys_segments = nr_phys_segs; 1292 rq->nr_phys_segments = nr_phys_segs;
1290 rq->nr_hw_segments = nr_hw_segs; 1293 rq->nr_hw_segments = nr_hw_segs;
1291 } 1294 }
1292 1295
1293 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, 1296 static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio,
1294 struct bio *nxt) 1297 struct bio *nxt)
1295 { 1298 {
1296 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER))) 1299 if (!(q->queue_flags & (1 << QUEUE_FLAG_CLUSTER)))
1297 return 0; 1300 return 0;
1298 1301
1299 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt))) 1302 if (!BIOVEC_PHYS_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)))
1300 return 0; 1303 return 0;
1301 if (bio->bi_size + nxt->bi_size > q->max_segment_size) 1304 if (bio->bi_size + nxt->bi_size > q->max_segment_size)
1302 return 0; 1305 return 0;
1303 1306
1304 /* 1307 /*
1305 	 * bio and nxt are contiguous in memory, check if the queue allows 1308 	 * bio and nxt are contiguous in memory, check if the queue allows
1306 * these two to be merged into one 1309 * these two to be merged into one
1307 */ 1310 */
1308 if (BIO_SEG_BOUNDARY(q, bio, nxt)) 1311 if (BIO_SEG_BOUNDARY(q, bio, nxt))
1309 return 1; 1312 return 1;
1310 1313
1311 return 0; 1314 return 0;
1312 } 1315 }
1313 1316
1314 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio, 1317 static int blk_hw_contig_segment(struct request_queue *q, struct bio *bio,
1315 struct bio *nxt) 1318 struct bio *nxt)
1316 { 1319 {
1317 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1320 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1318 blk_recount_segments(q, bio); 1321 blk_recount_segments(q, bio);
1319 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID))) 1322 if (unlikely(!bio_flagged(nxt, BIO_SEG_VALID)))
1320 blk_recount_segments(q, nxt); 1323 blk_recount_segments(q, nxt);
1321 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) || 1324 if (!BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(nxt)) ||
1322 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size)) 1325 BIOVEC_VIRT_OVERSIZE(bio->bi_hw_back_size + nxt->bi_hw_front_size))
1323 return 0; 1326 return 0;
1324 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size) 1327 if (bio->bi_hw_back_size + nxt->bi_hw_front_size > q->max_segment_size)
1325 return 0; 1328 return 0;
1326 1329
1327 return 1; 1330 return 1;
1328 } 1331 }
1329 1332
1330 /* 1333 /*
1331 * map a request to a scatterlist, return number of sg entries set up. Caller 1334 * map a request to a scatterlist, return number of sg entries set up. Caller
1332 * must make sure sg can hold rq->nr_phys_segments entries 1335 * must make sure sg can hold rq->nr_phys_segments entries
1333 */ 1336 */
1334 int blk_rq_map_sg(struct request_queue *q, struct request *rq, 1337 int blk_rq_map_sg(struct request_queue *q, struct request *rq,
1335 struct scatterlist *sg) 1338 struct scatterlist *sg)
1336 { 1339 {
1337 struct bio_vec *bvec, *bvprv; 1340 struct bio_vec *bvec, *bvprv;
1338 struct req_iterator iter; 1341 struct req_iterator iter;
1339 int nsegs, cluster; 1342 int nsegs, cluster;
1340 1343
1341 nsegs = 0; 1344 nsegs = 0;
1342 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER); 1345 cluster = q->queue_flags & (1 << QUEUE_FLAG_CLUSTER);
1343 1346
1344 /* 1347 /*
1345 * for each bio in rq 1348 * for each bio in rq
1346 */ 1349 */
1347 bvprv = NULL; 1350 bvprv = NULL;
1348 rq_for_each_segment(bvec, rq, iter) { 1351 rq_for_each_segment(bvec, rq, iter) {
1349 int nbytes = bvec->bv_len; 1352 int nbytes = bvec->bv_len;
1350 1353
1351 if (bvprv && cluster) { 1354 if (bvprv && cluster) {
1352 if (sg[nsegs - 1].length + nbytes > q->max_segment_size) 1355 if (sg[nsegs - 1].length + nbytes > q->max_segment_size)
1353 goto new_segment; 1356 goto new_segment;
1354 1357
1355 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) 1358 if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec))
1356 goto new_segment; 1359 goto new_segment;
1357 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec)) 1360 if (!BIOVEC_SEG_BOUNDARY(q, bvprv, bvec))
1358 goto new_segment; 1361 goto new_segment;
1359 1362
1360 sg[nsegs - 1].length += nbytes; 1363 sg[nsegs - 1].length += nbytes;
1361 } else { 1364 } else {
1362 new_segment: 1365 new_segment:
1363 memset(&sg[nsegs],0,sizeof(struct scatterlist)); 1366 memset(&sg[nsegs],0,sizeof(struct scatterlist));
1364 sg[nsegs].page = bvec->bv_page; 1367 sg[nsegs].page = bvec->bv_page;
1365 sg[nsegs].length = nbytes; 1368 sg[nsegs].length = nbytes;
1366 sg[nsegs].offset = bvec->bv_offset; 1369 sg[nsegs].offset = bvec->bv_offset;
1367 1370
1368 nsegs++; 1371 nsegs++;
1369 } 1372 }
1370 bvprv = bvec; 1373 bvprv = bvec;
1371 } /* segments in rq */ 1374 } /* segments in rq */
1372 1375
1373 return nsegs; 1376 return nsegs;
1374 } 1377 }
1375 1378
1376 EXPORT_SYMBOL(blk_rq_map_sg); 1379 EXPORT_SYMBOL(blk_rq_map_sg);
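
A sketch of the calling convention (hypothetical driver; foo_add_dma_descriptor() is a made-up hook, and the .page/.offset/.length fields match the 2.6-era struct scatterlist used above):

#include <linux/blkdev.h>
#include <linux/scatterlist.h>

/* hypothetical hook that programs one DMA descriptor */
static void foo_add_dma_descriptor(struct page *page, unsigned int offset,
				   unsigned int len);

/* caller provides an sg table with at least rq->nr_phys_segments entries */
static int foo_map_request(struct request_queue *q, struct request *rq,
			   struct scatterlist *sg)
{
	int i, nsegs;

	nsegs = blk_rq_map_sg(q, rq, sg);

	for (i = 0; i < nsegs; i++)
		foo_add_dma_descriptor(sg[i].page, sg[i].offset, sg[i].length);

	return nsegs;
}

Since rq->nr_phys_segments never exceeds the limit the driver configured with blk_queue_max_phys_segments(), sizing the table from that limit is sufficient.
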
1377 1380
1378 /* 1381 /*
1379 * the standard queue merge functions, can be overridden with device 1382 * the standard queue merge functions, can be overridden with device
1380 * specific ones if so desired 1383 * specific ones if so desired
1381 */ 1384 */
1382 1385
1383 static inline int ll_new_mergeable(struct request_queue *q, 1386 static inline int ll_new_mergeable(struct request_queue *q,
1384 struct request *req, 1387 struct request *req,
1385 struct bio *bio) 1388 struct bio *bio)
1386 { 1389 {
1387 int nr_phys_segs = bio_phys_segments(q, bio); 1390 int nr_phys_segs = bio_phys_segments(q, bio);
1388 1391
1389 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1392 if (req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1390 req->cmd_flags |= REQ_NOMERGE; 1393 req->cmd_flags |= REQ_NOMERGE;
1391 if (req == q->last_merge) 1394 if (req == q->last_merge)
1392 q->last_merge = NULL; 1395 q->last_merge = NULL;
1393 return 0; 1396 return 0;
1394 } 1397 }
1395 1398
1396 /* 1399 /*
1397 * A hw segment is just getting larger, bump just the phys 1400 * A hw segment is just getting larger, bump just the phys
1398 * counter. 1401 * counter.
1399 */ 1402 */
1400 req->nr_phys_segments += nr_phys_segs; 1403 req->nr_phys_segments += nr_phys_segs;
1401 return 1; 1404 return 1;
1402 } 1405 }
1403 1406
1404 static inline int ll_new_hw_segment(struct request_queue *q, 1407 static inline int ll_new_hw_segment(struct request_queue *q,
1405 struct request *req, 1408 struct request *req,
1406 struct bio *bio) 1409 struct bio *bio)
1407 { 1410 {
1408 int nr_hw_segs = bio_hw_segments(q, bio); 1411 int nr_hw_segs = bio_hw_segments(q, bio);
1409 int nr_phys_segs = bio_phys_segments(q, bio); 1412 int nr_phys_segs = bio_phys_segments(q, bio);
1410 1413
1411 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments 1414 if (req->nr_hw_segments + nr_hw_segs > q->max_hw_segments
1412 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { 1415 || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) {
1413 req->cmd_flags |= REQ_NOMERGE; 1416 req->cmd_flags |= REQ_NOMERGE;
1414 if (req == q->last_merge) 1417 if (req == q->last_merge)
1415 q->last_merge = NULL; 1418 q->last_merge = NULL;
1416 return 0; 1419 return 0;
1417 } 1420 }
1418 1421
1419 /* 1422 /*
1420 * This will form the start of a new hw segment. Bump both 1423 * This will form the start of a new hw segment. Bump both
1421 * counters. 1424 * counters.
1422 */ 1425 */
1423 req->nr_hw_segments += nr_hw_segs; 1426 req->nr_hw_segments += nr_hw_segs;
1424 req->nr_phys_segments += nr_phys_segs; 1427 req->nr_phys_segments += nr_phys_segs;
1425 return 1; 1428 return 1;
1426 } 1429 }
1427 1430
1428 static int ll_back_merge_fn(struct request_queue *q, struct request *req, 1431 static int ll_back_merge_fn(struct request_queue *q, struct request *req,
1429 struct bio *bio) 1432 struct bio *bio)
1430 { 1433 {
1431 unsigned short max_sectors; 1434 unsigned short max_sectors;
1432 int len; 1435 int len;
1433 1436
1434 if (unlikely(blk_pc_request(req))) 1437 if (unlikely(blk_pc_request(req)))
1435 max_sectors = q->max_hw_sectors; 1438 max_sectors = q->max_hw_sectors;
1436 else 1439 else
1437 max_sectors = q->max_sectors; 1440 max_sectors = q->max_sectors;
1438 1441
1439 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1442 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1440 req->cmd_flags |= REQ_NOMERGE; 1443 req->cmd_flags |= REQ_NOMERGE;
1441 if (req == q->last_merge) 1444 if (req == q->last_merge)
1442 q->last_merge = NULL; 1445 q->last_merge = NULL;
1443 return 0; 1446 return 0;
1444 } 1447 }
1445 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID))) 1448 if (unlikely(!bio_flagged(req->biotail, BIO_SEG_VALID)))
1446 blk_recount_segments(q, req->biotail); 1449 blk_recount_segments(q, req->biotail);
1447 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1450 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1448 blk_recount_segments(q, bio); 1451 blk_recount_segments(q, bio);
1449 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size; 1452 len = req->biotail->bi_hw_back_size + bio->bi_hw_front_size;
1450 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) && 1453 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(req->biotail), __BVEC_START(bio)) &&
1451 !BIOVEC_VIRT_OVERSIZE(len)) { 1454 !BIOVEC_VIRT_OVERSIZE(len)) {
1452 int mergeable = ll_new_mergeable(q, req, bio); 1455 int mergeable = ll_new_mergeable(q, req, bio);
1453 1456
1454 if (mergeable) { 1457 if (mergeable) {
1455 if (req->nr_hw_segments == 1) 1458 if (req->nr_hw_segments == 1)
1456 req->bio->bi_hw_front_size = len; 1459 req->bio->bi_hw_front_size = len;
1457 if (bio->bi_hw_segments == 1) 1460 if (bio->bi_hw_segments == 1)
1458 bio->bi_hw_back_size = len; 1461 bio->bi_hw_back_size = len;
1459 } 1462 }
1460 return mergeable; 1463 return mergeable;
1461 } 1464 }
1462 1465
1463 return ll_new_hw_segment(q, req, bio); 1466 return ll_new_hw_segment(q, req, bio);
1464 } 1467 }
1465 1468
1466 static int ll_front_merge_fn(struct request_queue *q, struct request *req, 1469 static int ll_front_merge_fn(struct request_queue *q, struct request *req,
1467 struct bio *bio) 1470 struct bio *bio)
1468 { 1471 {
1469 unsigned short max_sectors; 1472 unsigned short max_sectors;
1470 int len; 1473 int len;
1471 1474
1472 if (unlikely(blk_pc_request(req))) 1475 if (unlikely(blk_pc_request(req)))
1473 max_sectors = q->max_hw_sectors; 1476 max_sectors = q->max_hw_sectors;
1474 else 1477 else
1475 max_sectors = q->max_sectors; 1478 max_sectors = q->max_sectors;
1476 1479
1477 1480
1478 if (req->nr_sectors + bio_sectors(bio) > max_sectors) { 1481 if (req->nr_sectors + bio_sectors(bio) > max_sectors) {
1479 req->cmd_flags |= REQ_NOMERGE; 1482 req->cmd_flags |= REQ_NOMERGE;
1480 if (req == q->last_merge) 1483 if (req == q->last_merge)
1481 q->last_merge = NULL; 1484 q->last_merge = NULL;
1482 return 0; 1485 return 0;
1483 } 1486 }
1484 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size; 1487 len = bio->bi_hw_back_size + req->bio->bi_hw_front_size;
1485 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) 1488 if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
1486 blk_recount_segments(q, bio); 1489 blk_recount_segments(q, bio);
1487 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID))) 1490 if (unlikely(!bio_flagged(req->bio, BIO_SEG_VALID)))
1488 blk_recount_segments(q, req->bio); 1491 blk_recount_segments(q, req->bio);
1489 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) && 1492 if (BIOVEC_VIRT_MERGEABLE(__BVEC_END(bio), __BVEC_START(req->bio)) &&
1490 !BIOVEC_VIRT_OVERSIZE(len)) { 1493 !BIOVEC_VIRT_OVERSIZE(len)) {
1491 int mergeable = ll_new_mergeable(q, req, bio); 1494 int mergeable = ll_new_mergeable(q, req, bio);
1492 1495
1493 if (mergeable) { 1496 if (mergeable) {
1494 if (bio->bi_hw_segments == 1) 1497 if (bio->bi_hw_segments == 1)
1495 bio->bi_hw_front_size = len; 1498 bio->bi_hw_front_size = len;
1496 if (req->nr_hw_segments == 1) 1499 if (req->nr_hw_segments == 1)
1497 req->biotail->bi_hw_back_size = len; 1500 req->biotail->bi_hw_back_size = len;
1498 } 1501 }
1499 return mergeable; 1502 return mergeable;
1500 } 1503 }
1501 1504
1502 return ll_new_hw_segment(q, req, bio); 1505 return ll_new_hw_segment(q, req, bio);
1503 } 1506 }
1504 1507
1505 static int ll_merge_requests_fn(struct request_queue *q, struct request *req, 1508 static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
1506 struct request *next) 1509 struct request *next)
1507 { 1510 {
1508 int total_phys_segments; 1511 int total_phys_segments;
1509 int total_hw_segments; 1512 int total_hw_segments;
1510 1513
1511 /* 1514 /*
1512 	 * First check if either of the requests are re-queued 1515 	 * First check if either of the requests are re-queued
1513 * requests. Can't merge them if they are. 1516 * requests. Can't merge them if they are.
1514 */ 1517 */
1515 if (req->special || next->special) 1518 if (req->special || next->special)
1516 return 0; 1519 return 0;
1517 1520
1518 /* 1521 /*
1519 * Will it become too large? 1522 * Will it become too large?
1520 */ 1523 */
1521 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) 1524 if ((req->nr_sectors + next->nr_sectors) > q->max_sectors)
1522 return 0; 1525 return 0;
1523 1526
1524 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; 1527 total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
1525 if (blk_phys_contig_segment(q, req->biotail, next->bio)) 1528 if (blk_phys_contig_segment(q, req->biotail, next->bio))
1526 total_phys_segments--; 1529 total_phys_segments--;
1527 1530
1528 if (total_phys_segments > q->max_phys_segments) 1531 if (total_phys_segments > q->max_phys_segments)
1529 return 0; 1532 return 0;
1530 1533
1531 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments; 1534 total_hw_segments = req->nr_hw_segments + next->nr_hw_segments;
1532 if (blk_hw_contig_segment(q, req->biotail, next->bio)) { 1535 if (blk_hw_contig_segment(q, req->biotail, next->bio)) {
1533 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size; 1536 int len = req->biotail->bi_hw_back_size + next->bio->bi_hw_front_size;
1534 /* 1537 /*
1535 * propagate the combined length to the end of the requests 1538 * propagate the combined length to the end of the requests
1536 */ 1539 */
1537 if (req->nr_hw_segments == 1) 1540 if (req->nr_hw_segments == 1)
1538 req->bio->bi_hw_front_size = len; 1541 req->bio->bi_hw_front_size = len;
1539 if (next->nr_hw_segments == 1) 1542 if (next->nr_hw_segments == 1)
1540 next->biotail->bi_hw_back_size = len; 1543 next->biotail->bi_hw_back_size = len;
1541 total_hw_segments--; 1544 total_hw_segments--;
1542 } 1545 }
1543 1546
1544 if (total_hw_segments > q->max_hw_segments) 1547 if (total_hw_segments > q->max_hw_segments)
1545 return 0; 1548 return 0;
1546 1549
1547 /* Merge is OK... */ 1550 /* Merge is OK... */
1548 req->nr_phys_segments = total_phys_segments; 1551 req->nr_phys_segments = total_phys_segments;
1549 req->nr_hw_segments = total_hw_segments; 1552 req->nr_hw_segments = total_hw_segments;
1550 return 1; 1553 return 1;
1551 } 1554 }
1552 1555
1553 /* 1556 /*
1554 * "plug" the device if there are no outstanding requests: this will 1557 * "plug" the device if there are no outstanding requests: this will
1555 * force the transfer to start only after we have put all the requests 1558 * force the transfer to start only after we have put all the requests
1556 * on the list. 1559 * on the list.
1557 * 1560 *
1558 * This is called with interrupts off and no requests on the queue and 1561 * This is called with interrupts off and no requests on the queue and
1559 * with the queue lock held. 1562 * with the queue lock held.
1560 */ 1563 */
1561 void blk_plug_device(struct request_queue *q) 1564 void blk_plug_device(struct request_queue *q)
1562 { 1565 {
1563 WARN_ON(!irqs_disabled()); 1566 WARN_ON(!irqs_disabled());
1564 1567
1565 /* 1568 /*
1566 * don't plug a stopped queue, it must be paired with blk_start_queue() 1569 * don't plug a stopped queue, it must be paired with blk_start_queue()
1567 * which will restart the queueing 1570 * which will restart the queueing
1568 */ 1571 */
1569 if (blk_queue_stopped(q)) 1572 if (blk_queue_stopped(q))
1570 return; 1573 return;
1571 1574
1572 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) { 1575 if (!test_and_set_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) {
1573 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay); 1576 mod_timer(&q->unplug_timer, jiffies + q->unplug_delay);
1574 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG); 1577 blk_add_trace_generic(q, NULL, 0, BLK_TA_PLUG);
1575 } 1578 }
1576 } 1579 }
1577 1580
1578 EXPORT_SYMBOL(blk_plug_device); 1581 EXPORT_SYMBOL(blk_plug_device);
1579 1582
1580 /* 1583 /*
1581 * remove the queue from the plugged list, if present. called with 1584 * remove the queue from the plugged list, if present. called with
1582 * queue lock held and interrupts disabled. 1585 * queue lock held and interrupts disabled.
1583 */ 1586 */
1584 int blk_remove_plug(struct request_queue *q) 1587 int blk_remove_plug(struct request_queue *q)
1585 { 1588 {
1586 WARN_ON(!irqs_disabled()); 1589 WARN_ON(!irqs_disabled());
1587 1590
1588 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags)) 1591 if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
1589 return 0; 1592 return 0;
1590 1593
1591 del_timer(&q->unplug_timer); 1594 del_timer(&q->unplug_timer);
1592 return 1; 1595 return 1;
1593 } 1596 }
1594 1597
1595 EXPORT_SYMBOL(blk_remove_plug); 1598 EXPORT_SYMBOL(blk_remove_plug);
1596 1599
1597 /* 1600 /*
1598 * remove the plug and let it rip.. 1601 * remove the plug and let it rip..
1599 */ 1602 */
1600 void __generic_unplug_device(struct request_queue *q) 1603 void __generic_unplug_device(struct request_queue *q)
1601 { 1604 {
1602 if (unlikely(blk_queue_stopped(q))) 1605 if (unlikely(blk_queue_stopped(q)))
1603 return; 1606 return;
1604 1607
1605 if (!blk_remove_plug(q)) 1608 if (!blk_remove_plug(q))
1606 return; 1609 return;
1607 1610
1608 q->request_fn(q); 1611 q->request_fn(q);
1609 } 1612 }
1610 EXPORT_SYMBOL(__generic_unplug_device); 1613 EXPORT_SYMBOL(__generic_unplug_device);
1611 1614
1612 /** 1615 /**
1613 * generic_unplug_device - fire a request queue 1616 * generic_unplug_device - fire a request queue
1614 * @q: The &struct request_queue in question 1617 * @q: The &struct request_queue in question
1615 * 1618 *
1616 * Description: 1619 * Description:
1617 * Linux uses plugging to build bigger request queues before letting 1620 * Linux uses plugging to build bigger request queues before letting
1618 * the device have at them. If a queue is plugged, the I/O scheduler 1621 * the device have at them. If a queue is plugged, the I/O scheduler
1619 * is still adding and merging requests on the queue. Once the queue 1622 * is still adding and merging requests on the queue. Once the queue
1620 * gets unplugged, the request_fn defined for the queue is invoked and 1623 * gets unplugged, the request_fn defined for the queue is invoked and
1621 * transfers started. 1624 * transfers started.
1622 **/ 1625 **/
1623 void generic_unplug_device(struct request_queue *q) 1626 void generic_unplug_device(struct request_queue *q)
1624 { 1627 {
1625 spin_lock_irq(q->queue_lock); 1628 spin_lock_irq(q->queue_lock);
1626 __generic_unplug_device(q); 1629 __generic_unplug_device(q);
1627 spin_unlock_irq(q->queue_lock); 1630 spin_unlock_irq(q->queue_lock);
1628 } 1631 }
1629 EXPORT_SYMBOL(generic_unplug_device); 1632 EXPORT_SYMBOL(generic_unplug_device);
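
For callers outside the block layer, a minimal sketch of forcing an unplug after submitting a batch of I/O (the helper name is hypothetical):

#include <linux/fs.h>
#include <linux/blkdev.h>

/*
 * hypothetical: a submitter that wants its freshly queued bios dispatched
 * now instead of waiting for the unplug timer
 */
static void foo_kick_device(struct block_device *bdev)
{
	struct request_queue *q = bdev_get_queue(bdev);

	if (q)
		generic_unplug_device(q);	/* takes the queue lock itself */
}
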
1630 1633
1631 static void blk_backing_dev_unplug(struct backing_dev_info *bdi, 1634 static void blk_backing_dev_unplug(struct backing_dev_info *bdi,
1632 struct page *page) 1635 struct page *page)
1633 { 1636 {
1634 struct request_queue *q = bdi->unplug_io_data; 1637 struct request_queue *q = bdi->unplug_io_data;
1635 1638
1636 /* 1639 /*
1637 * devices don't necessarily have an ->unplug_fn defined 1640 * devices don't necessarily have an ->unplug_fn defined
1638 */ 1641 */
1639 if (q->unplug_fn) { 1642 if (q->unplug_fn) {
1640 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1643 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1641 q->rq.count[READ] + q->rq.count[WRITE]); 1644 q->rq.count[READ] + q->rq.count[WRITE]);
1642 1645
1643 q->unplug_fn(q); 1646 q->unplug_fn(q);
1644 } 1647 }
1645 } 1648 }
1646 1649
1647 static void blk_unplug_work(struct work_struct *work) 1650 static void blk_unplug_work(struct work_struct *work)
1648 { 1651 {
1649 struct request_queue *q = 1652 struct request_queue *q =
1650 container_of(work, struct request_queue, unplug_work); 1653 container_of(work, struct request_queue, unplug_work);
1651 1654
1652 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL, 1655 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_IO, NULL,
1653 q->rq.count[READ] + q->rq.count[WRITE]); 1656 q->rq.count[READ] + q->rq.count[WRITE]);
1654 1657
1655 q->unplug_fn(q); 1658 q->unplug_fn(q);
1656 } 1659 }
1657 1660
1658 static void blk_unplug_timeout(unsigned long data) 1661 static void blk_unplug_timeout(unsigned long data)
1659 { 1662 {
1660 struct request_queue *q = (struct request_queue *)data; 1663 struct request_queue *q = (struct request_queue *)data;
1661 1664
1662 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL, 1665 blk_add_trace_pdu_int(q, BLK_TA_UNPLUG_TIMER, NULL,
1663 q->rq.count[READ] + q->rq.count[WRITE]); 1666 q->rq.count[READ] + q->rq.count[WRITE]);
1664 1667
1665 kblockd_schedule_work(&q->unplug_work); 1668 kblockd_schedule_work(&q->unplug_work);
1666 } 1669 }
1667 1670
1668 /** 1671 /**
1669 * blk_start_queue - restart a previously stopped queue 1672 * blk_start_queue - restart a previously stopped queue
1670 * @q: The &struct request_queue in question 1673 * @q: The &struct request_queue in question
1671 * 1674 *
1672 * Description: 1675 * Description:
1673 * blk_start_queue() will clear the stop flag on the queue, and call 1676 * blk_start_queue() will clear the stop flag on the queue, and call
1674 * the request_fn for the queue if it was in a stopped state when 1677 * the request_fn for the queue if it was in a stopped state when
1675 * entered. Also see blk_stop_queue(). Queue lock must be held. 1678 * entered. Also see blk_stop_queue(). Queue lock must be held.
1676 **/ 1679 **/
1677 void blk_start_queue(struct request_queue *q) 1680 void blk_start_queue(struct request_queue *q)
1678 { 1681 {
1679 WARN_ON(!irqs_disabled()); 1682 WARN_ON(!irqs_disabled());
1680 1683
1681 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1684 clear_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1682 1685
1683 /* 1686 /*
1684 * one level of recursion is ok and is much faster than kicking 1687 * one level of recursion is ok and is much faster than kicking
1685 * the unplug handling 1688 * the unplug handling
1686 */ 1689 */
1687 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1690 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1688 q->request_fn(q); 1691 q->request_fn(q);
1689 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1692 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1690 } else { 1693 } else {
1691 blk_plug_device(q); 1694 blk_plug_device(q);
1692 kblockd_schedule_work(&q->unplug_work); 1695 kblockd_schedule_work(&q->unplug_work);
1693 } 1696 }
1694 } 1697 }
1695 1698
1696 EXPORT_SYMBOL(blk_start_queue); 1699 EXPORT_SYMBOL(blk_start_queue);
1697 1700
1698 /** 1701 /**
1699 * blk_stop_queue - stop a queue 1702 * blk_stop_queue - stop a queue
1700 * @q: The &struct request_queue in question 1703 * @q: The &struct request_queue in question
1701 * 1704 *
1702 * Description: 1705 * Description:
1703 * The Linux block layer assumes that a block driver will consume all 1706 * The Linux block layer assumes that a block driver will consume all
1704 * entries on the request queue when the request_fn strategy is called. 1707 * entries on the request queue when the request_fn strategy is called.
1705 * Often this will not happen, because of hardware limitations (queue 1708 * Often this will not happen, because of hardware limitations (queue
1706 * depth settings). If a device driver gets a 'queue full' response, 1709 * depth settings). If a device driver gets a 'queue full' response,
1707 * or if it simply chooses not to queue more I/O at one point, it can 1710 * or if it simply chooses not to queue more I/O at one point, it can
1708 * call this function to prevent the request_fn from being called until 1711 * call this function to prevent the request_fn from being called until
1709 * the driver has signalled it's ready to go again. This happens by calling 1712 * the driver has signalled it's ready to go again. This happens by calling
1710 * blk_start_queue() to restart queue operations. Queue lock must be held. 1713 * blk_start_queue() to restart queue operations. Queue lock must be held.
1711 **/ 1714 **/
1712 void blk_stop_queue(struct request_queue *q) 1715 void blk_stop_queue(struct request_queue *q)
1713 { 1716 {
1714 blk_remove_plug(q); 1717 blk_remove_plug(q);
1715 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags); 1718 set_bit(QUEUE_FLAG_STOPPED, &q->queue_flags);
1716 } 1719 }
1717 EXPORT_SYMBOL(blk_stop_queue); 1720 EXPORT_SYMBOL(blk_stop_queue);
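
A minimal sketch of the stop/restart pairing described above (hypothetical helpers, both assumed to run with the queue lock held):

#include <linux/blkdev.h>

static void foo_on_queue_full(struct request_queue *q)
{
	/* hardware reported "queue full": suppress further request_fn calls */
	blk_stop_queue(q);
}

static void foo_on_command_completed(struct request_queue *q, int hw_has_room)
{
	/* a slot freed up: let the block layer call request_fn again */
	if (hw_has_room)
		blk_start_queue(q);
}
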
1718 1721
1719 /** 1722 /**
1720 * blk_sync_queue - cancel any pending callbacks on a queue 1723 * blk_sync_queue - cancel any pending callbacks on a queue
1721 * @q: the queue 1724 * @q: the queue
1722 * 1725 *
1723 * Description: 1726 * Description:
1724 * The block layer may perform asynchronous callback activity 1727 * The block layer may perform asynchronous callback activity
1725 * on a queue, such as calling the unplug function after a timeout. 1728 * on a queue, such as calling the unplug function after a timeout.
1726 * A block device may call blk_sync_queue to ensure that any 1729 * A block device may call blk_sync_queue to ensure that any
1727 * such activity is cancelled, thus allowing it to release resources 1730 * such activity is cancelled, thus allowing it to release resources
1728 * that the callbacks might use. The caller must already have made sure 1731 * that the callbacks might use. The caller must already have made sure
1729 * that its ->make_request_fn will not re-add plugging prior to calling 1732 * that its ->make_request_fn will not re-add plugging prior to calling
1730 * this function. 1733 * this function.
1731 * 1734 *
1732 */ 1735 */
1733 void blk_sync_queue(struct request_queue *q) 1736 void blk_sync_queue(struct request_queue *q)
1734 { 1737 {
1735 del_timer_sync(&q->unplug_timer); 1738 del_timer_sync(&q->unplug_timer);
1736 } 1739 }
1737 EXPORT_SYMBOL(blk_sync_queue); 1740 EXPORT_SYMBOL(blk_sync_queue);
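
A teardown sketch along the lines of this description (struct foo_dev and the call site are hypothetical):

#include <linux/blkdev.h>
#include <linux/slab.h>

/* hypothetical per-device structure */
struct foo_dev {
	struct request_queue *queue;
};

static void foo_teardown(struct foo_dev *fd)
{
	/* stop new request_fn invocations */
	spin_lock_irq(fd->queue->queue_lock);
	blk_stop_queue(fd->queue);
	spin_unlock_irq(fd->queue->queue_lock);

	/* cancel any pending unplug timer/work before fd goes away */
	blk_sync_queue(fd->queue);

	blk_cleanup_queue(fd->queue);
	kfree(fd);
}
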
1738 1741
1739 /** 1742 /**
1740 * blk_run_queue - run a single device queue 1743 * blk_run_queue - run a single device queue
1741 * @q: The queue to run 1744 * @q: The queue to run
1742 */ 1745 */
1743 void blk_run_queue(struct request_queue *q) 1746 void blk_run_queue(struct request_queue *q)
1744 { 1747 {
1745 unsigned long flags; 1748 unsigned long flags;
1746 1749
1747 spin_lock_irqsave(q->queue_lock, flags); 1750 spin_lock_irqsave(q->queue_lock, flags);
1748 blk_remove_plug(q); 1751 blk_remove_plug(q);
1749 1752
1750 /* 1753 /*
1751 * Only recurse once to avoid overrunning the stack, let the unplug 1754 * Only recurse once to avoid overrunning the stack, let the unplug
1752 * handling reinvoke the handler shortly if we already got there. 1755 * handling reinvoke the handler shortly if we already got there.
1753 */ 1756 */
1754 if (!elv_queue_empty(q)) { 1757 if (!elv_queue_empty(q)) {
1755 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) { 1758 if (!test_and_set_bit(QUEUE_FLAG_REENTER, &q->queue_flags)) {
1756 q->request_fn(q); 1759 q->request_fn(q);
1757 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags); 1760 clear_bit(QUEUE_FLAG_REENTER, &q->queue_flags);
1758 } else { 1761 } else {
1759 blk_plug_device(q); 1762 blk_plug_device(q);
1760 kblockd_schedule_work(&q->unplug_work); 1763 kblockd_schedule_work(&q->unplug_work);
1761 } 1764 }
1762 } 1765 }
1763 1766
1764 spin_unlock_irqrestore(q->queue_lock, flags); 1767 spin_unlock_irqrestore(q->queue_lock, flags);
1765 } 1768 }
1766 EXPORT_SYMBOL(blk_run_queue); 1769 EXPORT_SYMBOL(blk_run_queue);
1767 1770
1768 /** 1771 /**
1769 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed 1772 * blk_cleanup_queue: - release a &struct request_queue when it is no longer needed
1770 * @kobj: the kobj belonging to the request queue to be released 1773 * @kobj: the kobj belonging to the request queue to be released
1771 * 1774 *
1772 * Description: 1775 * Description:
1773 * blk_cleanup_queue is the pair to blk_init_queue() or 1776 * blk_cleanup_queue is the pair to blk_init_queue() or
1774 * blk_queue_make_request(). It should be called when a request queue is 1777 * blk_queue_make_request(). It should be called when a request queue is
1775 * being released; typically when a block device is being de-registered. 1778 * being released; typically when a block device is being de-registered.
1776 * Currently, its primary task is to free all the &struct request 1779 * Currently, its primary task is to free all the &struct request
1777 * structures that were allocated to the queue and the queue itself. 1780 * structures that were allocated to the queue and the queue itself.
1778 * 1781 *
1779 * Caveat: 1782 * Caveat:
1780 * Hopefully the low level driver will have finished any 1783 * Hopefully the low level driver will have finished any
1781 * outstanding requests first... 1784 * outstanding requests first...
1782 **/ 1785 **/
1783 static void blk_release_queue(struct kobject *kobj) 1786 static void blk_release_queue(struct kobject *kobj)
1784 { 1787 {
1785 struct request_queue *q = 1788 struct request_queue *q =
1786 container_of(kobj, struct request_queue, kobj); 1789 container_of(kobj, struct request_queue, kobj);
1787 struct request_list *rl = &q->rq; 1790 struct request_list *rl = &q->rq;
1788 1791
1789 blk_sync_queue(q); 1792 blk_sync_queue(q);
1790 1793
1791 if (rl->rq_pool) 1794 if (rl->rq_pool)
1792 mempool_destroy(rl->rq_pool); 1795 mempool_destroy(rl->rq_pool);
1793 1796
1794 if (q->queue_tags) 1797 if (q->queue_tags)
1795 __blk_queue_free_tags(q); 1798 __blk_queue_free_tags(q);
1796 1799
1797 blk_trace_shutdown(q); 1800 blk_trace_shutdown(q);
1798 1801
1799 kmem_cache_free(requestq_cachep, q); 1802 kmem_cache_free(requestq_cachep, q);
1800 } 1803 }
1801 1804
1802 void blk_put_queue(struct request_queue *q) 1805 void blk_put_queue(struct request_queue *q)
1803 { 1806 {
1804 kobject_put(&q->kobj); 1807 kobject_put(&q->kobj);
1805 } 1808 }
1806 EXPORT_SYMBOL(blk_put_queue); 1809 EXPORT_SYMBOL(blk_put_queue);
1807 1810
1808 void blk_cleanup_queue(struct request_queue * q) 1811 void blk_cleanup_queue(struct request_queue * q)
1809 { 1812 {
1810 mutex_lock(&q->sysfs_lock); 1813 mutex_lock(&q->sysfs_lock);
1811 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags); 1814 set_bit(QUEUE_FLAG_DEAD, &q->queue_flags);
1812 mutex_unlock(&q->sysfs_lock); 1815 mutex_unlock(&q->sysfs_lock);
1813 1816
1814 if (q->elevator) 1817 if (q->elevator)
1815 elevator_exit(q->elevator); 1818 elevator_exit(q->elevator);
1816 1819
1817 blk_put_queue(q); 1820 blk_put_queue(q);
1818 } 1821 }
1819 1822
1820 EXPORT_SYMBOL(blk_cleanup_queue); 1823 EXPORT_SYMBOL(blk_cleanup_queue);
1821 1824
1822 static int blk_init_free_list(struct request_queue *q) 1825 static int blk_init_free_list(struct request_queue *q)
1823 { 1826 {
1824 struct request_list *rl = &q->rq; 1827 struct request_list *rl = &q->rq;
1825 1828
1826 rl->count[READ] = rl->count[WRITE] = 0; 1829 rl->count[READ] = rl->count[WRITE] = 0;
1827 rl->starved[READ] = rl->starved[WRITE] = 0; 1830 rl->starved[READ] = rl->starved[WRITE] = 0;
1828 rl->elvpriv = 0; 1831 rl->elvpriv = 0;
1829 init_waitqueue_head(&rl->wait[READ]); 1832 init_waitqueue_head(&rl->wait[READ]);
1830 init_waitqueue_head(&rl->wait[WRITE]); 1833 init_waitqueue_head(&rl->wait[WRITE]);
1831 1834
1832 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab, 1835 rl->rq_pool = mempool_create_node(BLKDEV_MIN_RQ, mempool_alloc_slab,
1833 mempool_free_slab, request_cachep, q->node); 1836 mempool_free_slab, request_cachep, q->node);
1834 1837
1835 if (!rl->rq_pool) 1838 if (!rl->rq_pool)
1836 return -ENOMEM; 1839 return -ENOMEM;
1837 1840
1838 return 0; 1841 return 0;
1839 } 1842 }
1840 1843
1841 struct request_queue *blk_alloc_queue(gfp_t gfp_mask) 1844 struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
1842 { 1845 {
1843 return blk_alloc_queue_node(gfp_mask, -1); 1846 return blk_alloc_queue_node(gfp_mask, -1);
1844 } 1847 }
1845 EXPORT_SYMBOL(blk_alloc_queue); 1848 EXPORT_SYMBOL(blk_alloc_queue);
1846 1849
1847 static struct kobj_type queue_ktype; 1850 static struct kobj_type queue_ktype;
1848 1851
1849 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) 1852 struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
1850 { 1853 {
1851 struct request_queue *q; 1854 struct request_queue *q;
1852 1855
1853 q = kmem_cache_alloc_node(requestq_cachep, 1856 q = kmem_cache_alloc_node(requestq_cachep,
1854 gfp_mask | __GFP_ZERO, node_id); 1857 gfp_mask | __GFP_ZERO, node_id);
1855 if (!q) 1858 if (!q)
1856 return NULL; 1859 return NULL;
1857 1860
1858 init_timer(&q->unplug_timer); 1861 init_timer(&q->unplug_timer);
1859 1862
1860 kobject_set_name(&q->kobj, "%s", "queue"); 1863 kobject_set_name(&q->kobj, "%s", "queue");
1861 q->kobj.ktype = &queue_ktype; 1864 q->kobj.ktype = &queue_ktype;
1862 kobject_init(&q->kobj); 1865 kobject_init(&q->kobj);
1863 1866
1864 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug; 1867 q->backing_dev_info.unplug_io_fn = blk_backing_dev_unplug;
1865 q->backing_dev_info.unplug_io_data = q; 1868 q->backing_dev_info.unplug_io_data = q;
1866 1869
1867 mutex_init(&q->sysfs_lock); 1870 mutex_init(&q->sysfs_lock);
1868 1871
1869 return q; 1872 return q;
1870 } 1873 }
1871 EXPORT_SYMBOL(blk_alloc_queue_node); 1874 EXPORT_SYMBOL(blk_alloc_queue_node);
1872 1875
1873 /** 1876 /**
1874 * blk_init_queue - prepare a request queue for use with a block device 1877 * blk_init_queue - prepare a request queue for use with a block device
1875 * @rfn: The function to be called to process requests that have been 1878 * @rfn: The function to be called to process requests that have been
1876 * placed on the queue. 1879 * placed on the queue.
1877 * @lock: Request queue spin lock 1880 * @lock: Request queue spin lock
1878 * 1881 *
1879 * Description: 1882 * Description:
1880 * If a block device wishes to use the standard request handling procedures, 1883 * If a block device wishes to use the standard request handling procedures,
1881 * which sorts requests and coalesces adjacent requests, then it must 1884 * which sorts requests and coalesces adjacent requests, then it must
1882 * call blk_init_queue(). The function @rfn will be called when there 1885 * call blk_init_queue(). The function @rfn will be called when there
1883 * are requests on the queue that need to be processed. If the device 1886 * are requests on the queue that need to be processed. If the device
1884 * supports plugging, then @rfn may not be called immediately when requests 1887 * supports plugging, then @rfn may not be called immediately when requests
1885 * are available on the queue, but may be called at some time later instead. 1888 * are available on the queue, but may be called at some time later instead.
1886 * Plugged queues are generally unplugged when a buffer belonging to one 1889 * Plugged queues are generally unplugged when a buffer belonging to one
1887 * of the requests on the queue is needed, or due to memory pressure. 1890 * of the requests on the queue is needed, or due to memory pressure.
1888 * 1891 *
1889 * @rfn is not required, or even expected, to remove all requests off the 1892 * @rfn is not required, or even expected, to remove all requests off the
1890 * queue, but only as many as it can handle at a time. If it does leave 1893 * queue, but only as many as it can handle at a time. If it does leave
1891 * requests on the queue, it is responsible for arranging that the requests 1894 * requests on the queue, it is responsible for arranging that the requests
1892 * get dealt with eventually. 1895 * get dealt with eventually.
1893 * 1896 *
1894 * The queue spin lock must be held while manipulating the requests on the 1897 * The queue spin lock must be held while manipulating the requests on the
1895 * request queue; this lock will be taken also from interrupt context, so irq 1898 * request queue; this lock will be taken also from interrupt context, so irq
1896 * disabling is needed for it. 1899 * disabling is needed for it.
1897 * 1900 *
1898 * Function returns a pointer to the initialized request queue, or NULL if 1901 * Function returns a pointer to the initialized request queue, or NULL if
1899 * it didn't succeed. 1902 * it didn't succeed.
1900 * 1903 *
1901 * Note: 1904 * Note:
1902 * blk_init_queue() must be paired with a blk_cleanup_queue() call 1905 * blk_init_queue() must be paired with a blk_cleanup_queue() call
1903 * when the block device is deactivated (such as at module unload). 1906 * when the block device is deactivated (such as at module unload).
1904 **/ 1907 **/
1905 1908
1906 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock) 1909 struct request_queue *blk_init_queue(request_fn_proc *rfn, spinlock_t *lock)
1907 { 1910 {
1908 return blk_init_queue_node(rfn, lock, -1); 1911 return blk_init_queue_node(rfn, lock, -1);
1909 } 1912 }
1910 EXPORT_SYMBOL(blk_init_queue); 1913 EXPORT_SYMBOL(blk_init_queue);
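A minimal sketch (not part of this commit) of how a driver might pair blk_init_queue() with blk_cleanup_queue() in the way the kernel-doc above describes. All mydev_* names are hypothetical, and the request function simply completes every request in-line, in the classic sbull style for this era of the API.

#include <linux/module.h>
#include <linux/init.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>

static DEFINE_SPINLOCK(mydev_lock);
static struct request_queue *mydev_queue;

/* entered with mydev_lock held, possibly from interrupt context */
static void mydev_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		if (!blk_fs_request(rq)) {
			end_request(rq, 0);	/* fail non-fs requests */
			continue;
		}
		/* transfer rq->sector / rq->current_nr_sectors / rq->buffer here */
		end_request(rq, 1);		/* pretend the transfer succeeded */
	}
}

static int __init mydev_init(void)
{
	mydev_queue = blk_init_queue(mydev_request_fn, &mydev_lock);
	if (!mydev_queue)
		return -ENOMEM;
	return 0;
}

static void __exit mydev_exit(void)
{
	blk_cleanup_queue(mydev_queue);		/* must pair with blk_init_queue() */
}

module_init(mydev_init);
module_exit(mydev_exit);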
1911 1914
1912 struct request_queue * 1915 struct request_queue *
1913 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id) 1916 blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
1914 { 1917 {
1915 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id); 1918 struct request_queue *q = blk_alloc_queue_node(GFP_KERNEL, node_id);
1916 1919
1917 if (!q) 1920 if (!q)
1918 return NULL; 1921 return NULL;
1919 1922
1920 q->node = node_id; 1923 q->node = node_id;
1921 if (blk_init_free_list(q)) { 1924 if (blk_init_free_list(q)) {
1922 kmem_cache_free(requestq_cachep, q); 1925 kmem_cache_free(requestq_cachep, q);
1923 return NULL; 1926 return NULL;
1924 } 1927 }
1925 1928
1926 /* 1929 /*
1927 * if caller didn't supply a lock, they get per-queue locking with 1930 * if caller didn't supply a lock, they get per-queue locking with
1928 * our embedded lock 1931 * our embedded lock
1929 */ 1932 */
1930 if (!lock) { 1933 if (!lock) {
1931 spin_lock_init(&q->__queue_lock); 1934 spin_lock_init(&q->__queue_lock);
1932 lock = &q->__queue_lock; 1935 lock = &q->__queue_lock;
1933 } 1936 }
1934 1937
1935 q->request_fn = rfn; 1938 q->request_fn = rfn;
1936 q->prep_rq_fn = NULL; 1939 q->prep_rq_fn = NULL;
1937 q->unplug_fn = generic_unplug_device; 1940 q->unplug_fn = generic_unplug_device;
1938 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER); 1941 q->queue_flags = (1 << QUEUE_FLAG_CLUSTER);
1939 q->queue_lock = lock; 1942 q->queue_lock = lock;
1940 1943
1941 blk_queue_segment_boundary(q, 0xffffffff); 1944 blk_queue_segment_boundary(q, 0xffffffff);
1942 1945
1943 blk_queue_make_request(q, __make_request); 1946 blk_queue_make_request(q, __make_request);
1944 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE); 1947 blk_queue_max_segment_size(q, MAX_SEGMENT_SIZE);
1945 1948
1946 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS); 1949 blk_queue_max_hw_segments(q, MAX_HW_SEGMENTS);
1947 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS); 1950 blk_queue_max_phys_segments(q, MAX_PHYS_SEGMENTS);
1948 1951
1949 q->sg_reserved_size = INT_MAX; 1952 q->sg_reserved_size = INT_MAX;
1950 1953
1951 /* 1954 /*
1952 * all done 1955 * all done
1953 */ 1956 */
1954 if (!elevator_init(q, NULL)) { 1957 if (!elevator_init(q, NULL)) {
1955 blk_queue_congestion_threshold(q); 1958 blk_queue_congestion_threshold(q);
1956 return q; 1959 return q;
1957 } 1960 }
1958 1961
1959 blk_put_queue(q); 1962 blk_put_queue(q);
1960 return NULL; 1963 return NULL;
1961 } 1964 }
1962 EXPORT_SYMBOL(blk_init_queue_node); 1965 EXPORT_SYMBOL(blk_init_queue_node);
1963 1966
1964 int blk_get_queue(struct request_queue *q) 1967 int blk_get_queue(struct request_queue *q)
1965 { 1968 {
1966 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) { 1969 if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
1967 kobject_get(&q->kobj); 1970 kobject_get(&q->kobj);
1968 return 0; 1971 return 0;
1969 } 1972 }
1970 1973
1971 return 1; 1974 return 1;
1972 } 1975 }
1973 1976
1974 EXPORT_SYMBOL(blk_get_queue); 1977 EXPORT_SYMBOL(blk_get_queue);
1975 1978
1976 static inline void blk_free_request(struct request_queue *q, struct request *rq) 1979 static inline void blk_free_request(struct request_queue *q, struct request *rq)
1977 { 1980 {
1978 if (rq->cmd_flags & REQ_ELVPRIV) 1981 if (rq->cmd_flags & REQ_ELVPRIV)
1979 elv_put_request(q, rq); 1982 elv_put_request(q, rq);
1980 mempool_free(rq, q->rq.rq_pool); 1983 mempool_free(rq, q->rq.rq_pool);
1981 } 1984 }
1982 1985
1983 static struct request * 1986 static struct request *
1984 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask) 1987 blk_alloc_request(struct request_queue *q, int rw, int priv, gfp_t gfp_mask)
1985 { 1988 {
1986 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask); 1989 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
1987 1990
1988 if (!rq) 1991 if (!rq)
1989 return NULL; 1992 return NULL;
1990 1993
1991 /* 1994 /*
1992 * first three bits are identical in rq->cmd_flags and bio->bi_rw, 1995 * first three bits are identical in rq->cmd_flags and bio->bi_rw,
1993 * see bio.h and blkdev.h 1996 * see bio.h and blkdev.h
1994 */ 1997 */
1995 rq->cmd_flags = rw | REQ_ALLOCED; 1998 rq->cmd_flags = rw | REQ_ALLOCED;
1996 1999
1997 if (priv) { 2000 if (priv) {
1998 if (unlikely(elv_set_request(q, rq, gfp_mask))) { 2001 if (unlikely(elv_set_request(q, rq, gfp_mask))) {
1999 mempool_free(rq, q->rq.rq_pool); 2002 mempool_free(rq, q->rq.rq_pool);
2000 return NULL; 2003 return NULL;
2001 } 2004 }
2002 rq->cmd_flags |= REQ_ELVPRIV; 2005 rq->cmd_flags |= REQ_ELVPRIV;
2003 } 2006 }
2004 2007
2005 return rq; 2008 return rq;
2006 } 2009 }
2007 2010
2008 /* 2011 /*
2009 * ioc_batching returns true if the ioc is a valid batching request and 2012 * ioc_batching returns true if the ioc is a valid batching request and
2010 * should be given priority access to a request. 2013 * should be given priority access to a request.
2011 */ 2014 */
2012 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc) 2015 static inline int ioc_batching(struct request_queue *q, struct io_context *ioc)
2013 { 2016 {
2014 if (!ioc) 2017 if (!ioc)
2015 return 0; 2018 return 0;
2016 2019
2017 /* 2020 /*
2018 * Make sure the process is able to allocate at least 1 request 2021 * Make sure the process is able to allocate at least 1 request
2019 * even if the batch times out, otherwise we could theoretically 2022 * even if the batch times out, otherwise we could theoretically
2020 * lose wakeups. 2023 * lose wakeups.
2021 */ 2024 */
2022 return ioc->nr_batch_requests == q->nr_batching || 2025 return ioc->nr_batch_requests == q->nr_batching ||
2023 (ioc->nr_batch_requests > 0 2026 (ioc->nr_batch_requests > 0
2024 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME)); 2027 && time_before(jiffies, ioc->last_waited + BLK_BATCH_TIME));
2025 } 2028 }
2026 2029
2027 /* 2030 /*
2028 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This 2031 * ioc_set_batching sets ioc to be a new "batcher" if it is not one. This
2029 * will cause the process to be a "batcher" on all queues in the system. This 2032 * will cause the process to be a "batcher" on all queues in the system. This
2030 * is the behaviour we want though - once it gets a wakeup it should be given 2033 * is the behaviour we want though - once it gets a wakeup it should be given
2031 * a nice run. 2034 * a nice run.
2032 */ 2035 */
2033 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc) 2036 static void ioc_set_batching(struct request_queue *q, struct io_context *ioc)
2034 { 2037 {
2035 if (!ioc || ioc_batching(q, ioc)) 2038 if (!ioc || ioc_batching(q, ioc))
2036 return; 2039 return;
2037 2040
2038 ioc->nr_batch_requests = q->nr_batching; 2041 ioc->nr_batch_requests = q->nr_batching;
2039 ioc->last_waited = jiffies; 2042 ioc->last_waited = jiffies;
2040 } 2043 }
2041 2044
2042 static void __freed_request(struct request_queue *q, int rw) 2045 static void __freed_request(struct request_queue *q, int rw)
2043 { 2046 {
2044 struct request_list *rl = &q->rq; 2047 struct request_list *rl = &q->rq;
2045 2048
2046 if (rl->count[rw] < queue_congestion_off_threshold(q)) 2049 if (rl->count[rw] < queue_congestion_off_threshold(q))
2047 blk_clear_queue_congested(q, rw); 2050 blk_clear_queue_congested(q, rw);
2048 2051
2049 if (rl->count[rw] + 1 <= q->nr_requests) { 2052 if (rl->count[rw] + 1 <= q->nr_requests) {
2050 if (waitqueue_active(&rl->wait[rw])) 2053 if (waitqueue_active(&rl->wait[rw]))
2051 wake_up(&rl->wait[rw]); 2054 wake_up(&rl->wait[rw]);
2052 2055
2053 blk_clear_queue_full(q, rw); 2056 blk_clear_queue_full(q, rw);
2054 } 2057 }
2055 } 2058 }
2056 2059
2057 /* 2060 /*
2058 * A request has just been released. Account for it, update the full and 2061 * A request has just been released. Account for it, update the full and
2059 * congestion status, wake up any waiters. Called under q->queue_lock. 2062 * congestion status, wake up any waiters. Called under q->queue_lock.
2060 */ 2063 */
2061 static void freed_request(struct request_queue *q, int rw, int priv) 2064 static void freed_request(struct request_queue *q, int rw, int priv)
2062 { 2065 {
2063 struct request_list *rl = &q->rq; 2066 struct request_list *rl = &q->rq;
2064 2067
2065 rl->count[rw]--; 2068 rl->count[rw]--;
2066 if (priv) 2069 if (priv)
2067 rl->elvpriv--; 2070 rl->elvpriv--;
2068 2071
2069 __freed_request(q, rw); 2072 __freed_request(q, rw);
2070 2073
2071 if (unlikely(rl->starved[rw ^ 1])) 2074 if (unlikely(rl->starved[rw ^ 1]))
2072 __freed_request(q, rw ^ 1); 2075 __freed_request(q, rw ^ 1);
2073 } 2076 }
2074 2077
2075 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist) 2078 #define blkdev_free_rq(list) list_entry((list)->next, struct request, queuelist)
2076 /* 2079 /*
2077 * Get a free request, queue_lock must be held. 2080 * Get a free request, queue_lock must be held.
2078 * Returns NULL on failure, with queue_lock held. 2081 * Returns NULL on failure, with queue_lock held.
2079 * Returns !NULL on success, with queue_lock *not held*. 2082 * Returns !NULL on success, with queue_lock *not held*.
2080 */ 2083 */
2081 static struct request *get_request(struct request_queue *q, int rw_flags, 2084 static struct request *get_request(struct request_queue *q, int rw_flags,
2082 struct bio *bio, gfp_t gfp_mask) 2085 struct bio *bio, gfp_t gfp_mask)
2083 { 2086 {
2084 struct request *rq = NULL; 2087 struct request *rq = NULL;
2085 struct request_list *rl = &q->rq; 2088 struct request_list *rl = &q->rq;
2086 struct io_context *ioc = NULL; 2089 struct io_context *ioc = NULL;
2087 const int rw = rw_flags & 0x01; 2090 const int rw = rw_flags & 0x01;
2088 int may_queue, priv; 2091 int may_queue, priv;
2089 2092
2090 may_queue = elv_may_queue(q, rw_flags); 2093 may_queue = elv_may_queue(q, rw_flags);
2091 if (may_queue == ELV_MQUEUE_NO) 2094 if (may_queue == ELV_MQUEUE_NO)
2092 goto rq_starved; 2095 goto rq_starved;
2093 2096
2094 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) { 2097 if (rl->count[rw]+1 >= queue_congestion_on_threshold(q)) {
2095 if (rl->count[rw]+1 >= q->nr_requests) { 2098 if (rl->count[rw]+1 >= q->nr_requests) {
2096 ioc = current_io_context(GFP_ATOMIC, q->node); 2099 ioc = current_io_context(GFP_ATOMIC, q->node);
2097 /* 2100 /*
2098 * The queue will fill after this allocation, so set 2101 * The queue will fill after this allocation, so set
2099 * it as full, and mark this process as "batching". 2102 * it as full, and mark this process as "batching".
2100 * This process will be allowed to complete a batch of 2103 * This process will be allowed to complete a batch of
2101 * requests, others will be blocked. 2104 * requests, others will be blocked.
2102 */ 2105 */
2103 if (!blk_queue_full(q, rw)) { 2106 if (!blk_queue_full(q, rw)) {
2104 ioc_set_batching(q, ioc); 2107 ioc_set_batching(q, ioc);
2105 blk_set_queue_full(q, rw); 2108 blk_set_queue_full(q, rw);
2106 } else { 2109 } else {
2107 if (may_queue != ELV_MQUEUE_MUST 2110 if (may_queue != ELV_MQUEUE_MUST
2108 && !ioc_batching(q, ioc)) { 2111 && !ioc_batching(q, ioc)) {
2109 /* 2112 /*
2110 * The queue is full and the allocating 2113 * The queue is full and the allocating
2111 * process is not a "batcher", and not 2114 * process is not a "batcher", and not
2112 * exempted by the IO scheduler 2115 * exempted by the IO scheduler
2113 */ 2116 */
2114 goto out; 2117 goto out;
2115 } 2118 }
2116 } 2119 }
2117 } 2120 }
2118 blk_set_queue_congested(q, rw); 2121 blk_set_queue_congested(q, rw);
2119 } 2122 }
2120 2123
2121 /* 2124 /*
2122 * Only allow batching queuers to allocate up to 50% over the defined 2125 * Only allow batching queuers to allocate up to 50% over the defined
2123 * limit of requests, otherwise we could have thousands of requests 2126 * limit of requests, otherwise we could have thousands of requests
2124 * allocated with any setting of ->nr_requests 2127 * allocated with any setting of ->nr_requests
2125 */ 2128 */
2126 if (rl->count[rw] >= (3 * q->nr_requests / 2)) 2129 if (rl->count[rw] >= (3 * q->nr_requests / 2))
2127 goto out; 2130 goto out;
2128 2131
2129 rl->count[rw]++; 2132 rl->count[rw]++;
2130 rl->starved[rw] = 0; 2133 rl->starved[rw] = 0;
2131 2134
2132 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags); 2135 priv = !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags);
2133 if (priv) 2136 if (priv)
2134 rl->elvpriv++; 2137 rl->elvpriv++;
2135 2138
2136 spin_unlock_irq(q->queue_lock); 2139 spin_unlock_irq(q->queue_lock);
2137 2140
2138 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask); 2141 rq = blk_alloc_request(q, rw_flags, priv, gfp_mask);
2139 if (unlikely(!rq)) { 2142 if (unlikely(!rq)) {
2140 /* 2143 /*
2141 * Allocation failed presumably due to memory. Undo anything 2144 * Allocation failed presumably due to memory. Undo anything
2142 * we might have messed up. 2145 * we might have messed up.
2143 * 2146 *
2144 * Allocating task should really be put onto the front of the 2147 * Allocating task should really be put onto the front of the
2145 * wait queue, but this is pretty rare. 2148 * wait queue, but this is pretty rare.
2146 */ 2149 */
2147 spin_lock_irq(q->queue_lock); 2150 spin_lock_irq(q->queue_lock);
2148 freed_request(q, rw, priv); 2151 freed_request(q, rw, priv);
2149 2152
2150 /* 2153 /*
2151 * in the very unlikely event that allocation failed and no 2154 * in the very unlikely event that allocation failed and no
2152 * requests for this direction was pending, mark us starved 2155 * requests for this direction was pending, mark us starved
2153 * so that freeing of a request in the other direction will 2156 * so that freeing of a request in the other direction will
2154 * notice us. another possible fix would be to split the 2157 * notice us. another possible fix would be to split the
2155 * rq mempool into READ and WRITE 2158 * rq mempool into READ and WRITE
2156 */ 2159 */
2157 rq_starved: 2160 rq_starved:
2158 if (unlikely(rl->count[rw] == 0)) 2161 if (unlikely(rl->count[rw] == 0))
2159 rl->starved[rw] = 1; 2162 rl->starved[rw] = 1;
2160 2163
2161 goto out; 2164 goto out;
2162 } 2165 }
2163 2166
2164 /* 2167 /*
2165 * ioc may be NULL here, and ioc_batching will be false. That's 2168 * ioc may be NULL here, and ioc_batching will be false. That's
2166 * OK, if the queue is under the request limit then requests need 2169 * OK, if the queue is under the request limit then requests need
2167 * not count toward the nr_batch_requests limit. There will always 2170 * not count toward the nr_batch_requests limit. There will always
2168 * be some limit enforced by BLK_BATCH_TIME. 2171 * be some limit enforced by BLK_BATCH_TIME.
2169 */ 2172 */
2170 if (ioc_batching(q, ioc)) 2173 if (ioc_batching(q, ioc))
2171 ioc->nr_batch_requests--; 2174 ioc->nr_batch_requests--;
2172 2175
2173 rq_init(q, rq); 2176 rq_init(q, rq);
2174 2177
2175 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ); 2178 blk_add_trace_generic(q, bio, rw, BLK_TA_GETRQ);
2176 out: 2179 out:
2177 return rq; 2180 return rq;
2178 } 2181 }
2179 2182
2180 /* 2183 /*
2181 * No available requests for this queue, unplug the device and wait for some 2184 * No available requests for this queue, unplug the device and wait for some
2182 * requests to become available. 2185 * requests to become available.
2183 * 2186 *
2184 * Called with q->queue_lock held, and returns with it unlocked. 2187 * Called with q->queue_lock held, and returns with it unlocked.
2185 */ 2188 */
2186 static struct request *get_request_wait(struct request_queue *q, int rw_flags, 2189 static struct request *get_request_wait(struct request_queue *q, int rw_flags,
2187 struct bio *bio) 2190 struct bio *bio)
2188 { 2191 {
2189 const int rw = rw_flags & 0x01; 2192 const int rw = rw_flags & 0x01;
2190 struct request *rq; 2193 struct request *rq;
2191 2194
2192 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2195 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2193 while (!rq) { 2196 while (!rq) {
2194 DEFINE_WAIT(wait); 2197 DEFINE_WAIT(wait);
2195 struct request_list *rl = &q->rq; 2198 struct request_list *rl = &q->rq;
2196 2199
2197 prepare_to_wait_exclusive(&rl->wait[rw], &wait, 2200 prepare_to_wait_exclusive(&rl->wait[rw], &wait,
2198 TASK_UNINTERRUPTIBLE); 2201 TASK_UNINTERRUPTIBLE);
2199 2202
2200 rq = get_request(q, rw_flags, bio, GFP_NOIO); 2203 rq = get_request(q, rw_flags, bio, GFP_NOIO);
2201 2204
2202 if (!rq) { 2205 if (!rq) {
2203 struct io_context *ioc; 2206 struct io_context *ioc;
2204 2207
2205 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ); 2208 blk_add_trace_generic(q, bio, rw, BLK_TA_SLEEPRQ);
2206 2209
2207 __generic_unplug_device(q); 2210 __generic_unplug_device(q);
2208 spin_unlock_irq(q->queue_lock); 2211 spin_unlock_irq(q->queue_lock);
2209 io_schedule(); 2212 io_schedule();
2210 2213
2211 /* 2214 /*
2212 * After sleeping, we become a "batching" process and 2215 * After sleeping, we become a "batching" process and
2213 * will be able to allocate at least one request, and 2216 * will be able to allocate at least one request, and
2214 * up to a big batch of them for a small period time. 2217 * up to a big batch of them for a small period time.
2215 * See ioc_batching, ioc_set_batching 2218 * See ioc_batching, ioc_set_batching
2216 */ 2219 */
2217 ioc = current_io_context(GFP_NOIO, q->node); 2220 ioc = current_io_context(GFP_NOIO, q->node);
2218 ioc_set_batching(q, ioc); 2221 ioc_set_batching(q, ioc);
2219 2222
2220 spin_lock_irq(q->queue_lock); 2223 spin_lock_irq(q->queue_lock);
2221 } 2224 }
2222 finish_wait(&rl->wait[rw], &wait); 2225 finish_wait(&rl->wait[rw], &wait);
2223 } 2226 }
2224 2227
2225 return rq; 2228 return rq;
2226 } 2229 }
2227 2230
2228 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) 2231 struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask)
2229 { 2232 {
2230 struct request *rq; 2233 struct request *rq;
2231 2234
2232 BUG_ON(rw != READ && rw != WRITE); 2235 BUG_ON(rw != READ && rw != WRITE);
2233 2236
2234 spin_lock_irq(q->queue_lock); 2237 spin_lock_irq(q->queue_lock);
2235 if (gfp_mask & __GFP_WAIT) { 2238 if (gfp_mask & __GFP_WAIT) {
2236 rq = get_request_wait(q, rw, NULL); 2239 rq = get_request_wait(q, rw, NULL);
2237 } else { 2240 } else {
2238 rq = get_request(q, rw, NULL, gfp_mask); 2241 rq = get_request(q, rw, NULL, gfp_mask);
2239 if (!rq) 2242 if (!rq)
2240 spin_unlock_irq(q->queue_lock); 2243 spin_unlock_irq(q->queue_lock);
2241 } 2244 }
2242 /* q->queue_lock is unlocked at this point */ 2245 /* q->queue_lock is unlocked at this point */
2243 2246
2244 return rq; 2247 return rq;
2245 } 2248 }
2246 EXPORT_SYMBOL(blk_get_request); 2249 EXPORT_SYMBOL(blk_get_request);
2247 2250
2248 /** 2251 /**
2249 * blk_start_queueing - initiate dispatch of requests to device 2252 * blk_start_queueing - initiate dispatch of requests to device
2250 * @q: request queue to kick into gear 2253 * @q: request queue to kick into gear
2251 * 2254 *
2252 * This is basically a helper to remove the need to know whether a queue 2255 * This is basically a helper to remove the need to know whether a queue
2253 * is plugged or not if someone just wants to initiate dispatch of requests 2256 * is plugged or not if someone just wants to initiate dispatch of requests
2254 * for this queue. 2257 * for this queue.
2255 * 2258 *
2256 * The queue lock must be held with interrupts disabled. 2259 * The queue lock must be held with interrupts disabled.
2257 */ 2260 */
2258 void blk_start_queueing(struct request_queue *q) 2261 void blk_start_queueing(struct request_queue *q)
2259 { 2262 {
2260 if (!blk_queue_plugged(q)) 2263 if (!blk_queue_plugged(q))
2261 q->request_fn(q); 2264 q->request_fn(q);
2262 else 2265 else
2263 __generic_unplug_device(q); 2266 __generic_unplug_device(q);
2264 } 2267 }
2265 EXPORT_SYMBOL(blk_start_queueing); 2268 EXPORT_SYMBOL(blk_start_queueing);
2266 2269
2267 /** 2270 /**
2268 * blk_requeue_request - put a request back on queue 2271 * blk_requeue_request - put a request back on queue
2269 * @q: request queue where request should be inserted 2272 * @q: request queue where request should be inserted
2270 * @rq: request to be inserted 2273 * @rq: request to be inserted
2271 * 2274 *
2272 * Description: 2275 * Description:
2273 * Drivers often keep queueing requests until the hardware cannot accept 2276 * Drivers often keep queueing requests until the hardware cannot accept
2274 * more, when that condition happens we need to put the request back 2277 * more, when that condition happens we need to put the request back
2275 * on the queue. Must be called with queue lock held. 2278 * on the queue. Must be called with queue lock held.
2276 */ 2279 */
2277 void blk_requeue_request(struct request_queue *q, struct request *rq) 2280 void blk_requeue_request(struct request_queue *q, struct request *rq)
2278 { 2281 {
2279 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); 2282 blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
2280 2283
2281 if (blk_rq_tagged(rq)) 2284 if (blk_rq_tagged(rq))
2282 blk_queue_end_tag(q, rq); 2285 blk_queue_end_tag(q, rq);
2283 2286
2284 elv_requeue_request(q, rq); 2287 elv_requeue_request(q, rq);
2285 } 2288 }
2286 2289
2287 EXPORT_SYMBOL(blk_requeue_request); 2290 EXPORT_SYMBOL(blk_requeue_request);
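A sketch of the requeue pattern the kernel-doc above describes: the strategy routine dequeues a request, the hardware refuses it, so the request goes back on the queue and the queue is stopped until the device frees up. mydev_hw_busy() and mydev_issue() are hypothetical hardware hooks.

#include <linux/blkdev.h>

/* hypothetical hardware hooks */
extern int mydev_hw_busy(void);
extern void mydev_issue(struct request *rq);

static void mydev_strategy(struct request_queue *q)
{
	struct request *rq;

	/* entered with q->queue_lock held */
	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);

		if (mydev_hw_busy()) {
			/*
			 * hardware cannot accept more: put the request back,
			 * stop the queue, and restart it with blk_start_queue()
			 * from the completion interrupt
			 */
			blk_requeue_request(q, rq);
			blk_stop_queue(q);
			break;
		}

		mydev_issue(rq);	/* hand off to the hardware */
	}
}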
2288 2291
2289 /** 2292 /**
2290 * blk_insert_request - insert a special request in to a request queue 2293 * blk_insert_request - insert a special request in to a request queue
2291 * @q: request queue where request should be inserted 2294 * @q: request queue where request should be inserted
2292 * @rq: request to be inserted 2295 * @rq: request to be inserted
2293 * @at_head: insert request at head or tail of queue 2296 * @at_head: insert request at head or tail of queue
2294 * @data: private data 2297 * @data: private data
2295 * 2298 *
2296 * Description: 2299 * Description:
2297 * Many block devices need to execute commands asynchronously, so they don't 2300 * Many block devices need to execute commands asynchronously, so they don't
2298 * block the whole kernel from preemption during request execution. This is 2301 * block the whole kernel from preemption during request execution. This is
2299 * accomplished normally by inserting artificial requests tagged as 2302 * accomplished normally by inserting artificial requests tagged as
2300 * REQ_SPECIAL in to the corresponding request queue, and letting them be 2303 * REQ_SPECIAL in to the corresponding request queue, and letting them be
2301 * scheduled for actual execution by the request queue. 2304 * scheduled for actual execution by the request queue.
2302 * 2305 *
2303 * We have the option of inserting the head or the tail of the queue. 2306 * We have the option of inserting the head or the tail of the queue.
2304 * Typically we use the tail for new ioctls and so forth. We use the head 2307 * Typically we use the tail for new ioctls and so forth. We use the head
2305 * of the queue for things like a QUEUE_FULL message from a device, or a 2308 * of the queue for things like a QUEUE_FULL message from a device, or a
2306 * host that is unable to accept a particular command. 2309 * host that is unable to accept a particular command.
2307 */ 2310 */
2308 void blk_insert_request(struct request_queue *q, struct request *rq, 2311 void blk_insert_request(struct request_queue *q, struct request *rq,
2309 int at_head, void *data) 2312 int at_head, void *data)
2310 { 2313 {
2311 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2314 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2312 unsigned long flags; 2315 unsigned long flags;
2313 2316
2314 /* 2317 /*
2315 * tell I/O scheduler that this isn't a regular read/write (ie it 2318 * tell I/O scheduler that this isn't a regular read/write (ie it
2316 * must not attempt merges on this) and that it acts as a soft 2319 * must not attempt merges on this) and that it acts as a soft
2317 * barrier 2320 * barrier
2318 */ 2321 */
2319 rq->cmd_type = REQ_TYPE_SPECIAL; 2322 rq->cmd_type = REQ_TYPE_SPECIAL;
2320 rq->cmd_flags |= REQ_SOFTBARRIER; 2323 rq->cmd_flags |= REQ_SOFTBARRIER;
2321 2324
2322 rq->special = data; 2325 rq->special = data;
2323 2326
2324 spin_lock_irqsave(q->queue_lock, flags); 2327 spin_lock_irqsave(q->queue_lock, flags);
2325 2328
2326 /* 2329 /*
2327 * If command is tagged, release the tag 2330 * If command is tagged, release the tag
2328 */ 2331 */
2329 if (blk_rq_tagged(rq)) 2332 if (blk_rq_tagged(rq))
2330 blk_queue_end_tag(q, rq); 2333 blk_queue_end_tag(q, rq);
2331 2334
2332 drive_stat_acct(rq, rq->nr_sectors, 1); 2335 drive_stat_acct(rq, rq->nr_sectors, 1);
2333 __elv_add_request(q, rq, where, 0); 2336 __elv_add_request(q, rq, where, 0);
2334 blk_start_queueing(q); 2337 blk_start_queueing(q);
2335 spin_unlock_irqrestore(q->queue_lock, flags); 2338 spin_unlock_irqrestore(q->queue_lock, flags);
2336 } 2339 }
2337 2340
2338 EXPORT_SYMBOL(blk_insert_request); 2341 EXPORT_SYMBOL(blk_insert_request);
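A sketch of the use case described above: pushing a driver-private command to the head of the queue, e.g. in response to a QUEUE_FULL condition. The my_special_cmd cookie is hypothetical; the driver's request function would recognise it via rq->special.

#include <linux/blkdev.h>

static void mydev_insert_special(struct request_queue *q, void *my_special_cmd)
{
	struct request *rq;

	/* may sleep; see blk_get_request() above */
	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return;

	/*
	 * blk_insert_request() marks the request REQ_TYPE_SPECIAL and
	 * REQ_SOFTBARRIER, stores the cookie in rq->special and kicks
	 * the queue; at_head = 1 puts it at the front.
	 */
	blk_insert_request(q, rq, 1, my_special_cmd);
}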
2339 2342
2340 static int __blk_rq_unmap_user(struct bio *bio) 2343 static int __blk_rq_unmap_user(struct bio *bio)
2341 { 2344 {
2342 int ret = 0; 2345 int ret = 0;
2343 2346
2344 if (bio) { 2347 if (bio) {
2345 if (bio_flagged(bio, BIO_USER_MAPPED)) 2348 if (bio_flagged(bio, BIO_USER_MAPPED))
2346 bio_unmap_user(bio); 2349 bio_unmap_user(bio);
2347 else 2350 else
2348 ret = bio_uncopy_user(bio); 2351 ret = bio_uncopy_user(bio);
2349 } 2352 }
2350 2353
2351 return ret; 2354 return ret;
2352 } 2355 }
2353 2356
2354 int blk_rq_append_bio(struct request_queue *q, struct request *rq, 2357 int blk_rq_append_bio(struct request_queue *q, struct request *rq,
2355 struct bio *bio) 2358 struct bio *bio)
2356 { 2359 {
2357 if (!rq->bio) 2360 if (!rq->bio)
2358 blk_rq_bio_prep(q, rq, bio); 2361 blk_rq_bio_prep(q, rq, bio);
2359 else if (!ll_back_merge_fn(q, rq, bio)) 2362 else if (!ll_back_merge_fn(q, rq, bio))
2360 return -EINVAL; 2363 return -EINVAL;
2361 else { 2364 else {
2362 rq->biotail->bi_next = bio; 2365 rq->biotail->bi_next = bio;
2363 rq->biotail = bio; 2366 rq->biotail = bio;
2364 2367
2365 rq->data_len += bio->bi_size; 2368 rq->data_len += bio->bi_size;
2366 } 2369 }
2367 return 0; 2370 return 0;
2368 } 2371 }
2369 EXPORT_SYMBOL(blk_rq_append_bio); 2372 EXPORT_SYMBOL(blk_rq_append_bio);
2370 2373
2371 static int __blk_rq_map_user(struct request_queue *q, struct request *rq, 2374 static int __blk_rq_map_user(struct request_queue *q, struct request *rq,
2372 void __user *ubuf, unsigned int len) 2375 void __user *ubuf, unsigned int len)
2373 { 2376 {
2374 unsigned long uaddr; 2377 unsigned long uaddr;
2375 struct bio *bio, *orig_bio; 2378 struct bio *bio, *orig_bio;
2376 int reading, ret; 2379 int reading, ret;
2377 2380
2378 reading = rq_data_dir(rq) == READ; 2381 reading = rq_data_dir(rq) == READ;
2379 2382
2380 /* 2383 /*
2381 * if alignment requirement is satisfied, map in user pages for 2384 * if alignment requirement is satisfied, map in user pages for
2382 * direct dma. else, set up kernel bounce buffers 2385 * direct dma. else, set up kernel bounce buffers
2383 */ 2386 */
2384 uaddr = (unsigned long) ubuf; 2387 uaddr = (unsigned long) ubuf;
2385 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q))) 2388 if (!(uaddr & queue_dma_alignment(q)) && !(len & queue_dma_alignment(q)))
2386 bio = bio_map_user(q, NULL, uaddr, len, reading); 2389 bio = bio_map_user(q, NULL, uaddr, len, reading);
2387 else 2390 else
2388 bio = bio_copy_user(q, uaddr, len, reading); 2391 bio = bio_copy_user(q, uaddr, len, reading);
2389 2392
2390 if (IS_ERR(bio)) 2393 if (IS_ERR(bio))
2391 return PTR_ERR(bio); 2394 return PTR_ERR(bio);
2392 2395
2393 orig_bio = bio; 2396 orig_bio = bio;
2394 blk_queue_bounce(q, &bio); 2397 blk_queue_bounce(q, &bio);
2395 2398
2396 /* 2399 /*
2397 * We link the bounce buffer in and could have to traverse it 2400 * We link the bounce buffer in and could have to traverse it
2398 * later so we have to get a ref to prevent it from being freed 2401 * later so we have to get a ref to prevent it from being freed
2399 */ 2402 */
2400 bio_get(bio); 2403 bio_get(bio);
2401 2404
2402 ret = blk_rq_append_bio(q, rq, bio); 2405 ret = blk_rq_append_bio(q, rq, bio);
2403 if (!ret) 2406 if (!ret)
2404 return bio->bi_size; 2407 return bio->bi_size;
2405 2408
2406 /* if it was bounced we must call the end io function */ 2409 /* if it was bounced we must call the end io function */
2407 bio_endio(bio, 0); 2410 bio_endio(bio, 0);
2408 __blk_rq_unmap_user(orig_bio); 2411 __blk_rq_unmap_user(orig_bio);
2409 bio_put(bio); 2412 bio_put(bio);
2410 return ret; 2413 return ret;
2411 } 2414 }
2412 2415
2413 /** 2416 /**
2414 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage 2417 * blk_rq_map_user - map user data to a request, for REQ_BLOCK_PC usage
2415 * @q: request queue where request should be inserted 2418 * @q: request queue where request should be inserted
2416 * @rq: request structure to fill 2419 * @rq: request structure to fill
2417 * @ubuf: the user buffer 2420 * @ubuf: the user buffer
2418 * @len: length of user data 2421 * @len: length of user data
2419 * 2422 *
2420 * Description: 2423 * Description:
2421 * Data will be mapped directly for zero copy io, if possible. Otherwise 2424 * Data will be mapped directly for zero copy io, if possible. Otherwise
2422 * a kernel bounce buffer is used. 2425 * a kernel bounce buffer is used.
2423 * 2426 *
2424 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2427 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2425 * still in process context. 2428 * still in process context.
2426 * 2429 *
2427 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2430 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2428 * before being submitted to the device, as pages mapped may be out of 2431 * before being submitted to the device, as pages mapped may be out of
2429 * reach. It's the caller's responsibility to make sure this happens. The 2432 * reach. It's the caller's responsibility to make sure this happens. The
2430 * original bio must be passed back in to blk_rq_unmap_user() for proper 2433 * original bio must be passed back in to blk_rq_unmap_user() for proper
2431 * unmapping. 2434 * unmapping.
2432 */ 2435 */
2433 int blk_rq_map_user(struct request_queue *q, struct request *rq, 2436 int blk_rq_map_user(struct request_queue *q, struct request *rq,
2434 void __user *ubuf, unsigned long len) 2437 void __user *ubuf, unsigned long len)
2435 { 2438 {
2436 unsigned long bytes_read = 0; 2439 unsigned long bytes_read = 0;
2437 struct bio *bio = NULL; 2440 struct bio *bio = NULL;
2438 int ret; 2441 int ret;
2439 2442
2440 if (len > (q->max_hw_sectors << 9)) 2443 if (len > (q->max_hw_sectors << 9))
2441 return -EINVAL; 2444 return -EINVAL;
2442 if (!len || !ubuf) 2445 if (!len || !ubuf)
2443 return -EINVAL; 2446 return -EINVAL;
2444 2447
2445 while (bytes_read != len) { 2448 while (bytes_read != len) {
2446 unsigned long map_len, end, start; 2449 unsigned long map_len, end, start;
2447 2450
2448 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE); 2451 map_len = min_t(unsigned long, len - bytes_read, BIO_MAX_SIZE);
2449 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1) 2452 end = ((unsigned long)ubuf + map_len + PAGE_SIZE - 1)
2450 >> PAGE_SHIFT; 2453 >> PAGE_SHIFT;
2451 start = (unsigned long)ubuf >> PAGE_SHIFT; 2454 start = (unsigned long)ubuf >> PAGE_SHIFT;
2452 2455
2453 /* 2456 /*
2454 * A bad offset could cause us to require BIO_MAX_PAGES + 1 2457 * A bad offset could cause us to require BIO_MAX_PAGES + 1
2455 * pages. If this happens we just lower the requested 2458 * pages. If this happens we just lower the requested
2456 * mapping len by a page so that we can fit 2459 * mapping len by a page so that we can fit
2457 */ 2460 */
2458 if (end - start > BIO_MAX_PAGES) 2461 if (end - start > BIO_MAX_PAGES)
2459 map_len -= PAGE_SIZE; 2462 map_len -= PAGE_SIZE;
2460 2463
2461 ret = __blk_rq_map_user(q, rq, ubuf, map_len); 2464 ret = __blk_rq_map_user(q, rq, ubuf, map_len);
2462 if (ret < 0) 2465 if (ret < 0)
2463 goto unmap_rq; 2466 goto unmap_rq;
2464 if (!bio) 2467 if (!bio)
2465 bio = rq->bio; 2468 bio = rq->bio;
2466 bytes_read += ret; 2469 bytes_read += ret;
2467 ubuf += ret; 2470 ubuf += ret;
2468 } 2471 }
2469 2472
2470 rq->buffer = rq->data = NULL; 2473 rq->buffer = rq->data = NULL;
2471 return 0; 2474 return 0;
2472 unmap_rq: 2475 unmap_rq:
2473 blk_rq_unmap_user(bio); 2476 blk_rq_unmap_user(bio);
2474 return ret; 2477 return ret;
2475 } 2478 }
2476 2479
2477 EXPORT_SYMBOL(blk_rq_map_user); 2480 EXPORT_SYMBOL(blk_rq_map_user);
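A sketch of the full map/execute/unmap cycle the kernel-doc above calls for, in the style of an SG_IO passthrough. The INQUIRY CDB and the mydev_inquiry() name are illustrative; note that the bio pointer is saved right after mapping so the original chain can be handed back to blk_rq_unmap_user(), as the documentation requires.

#include <linux/blkdev.h>
#include <linux/genhd.h>

static int mydev_inquiry(struct request_queue *q, struct gendisk *disk,
			 void __user *ubuf, unsigned long len)
{
	struct request *rq;
	struct bio *bio;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->cmd[0] = 0x12;			/* INQUIRY */
	rq->cmd[4] = len;
	rq->cmd_len = 6;
	rq->timeout = 10 * HZ;

	err = blk_rq_map_user(q, rq, ubuf, len);
	if (err)
		goto out;

	bio = rq->bio;				/* io completion may change rq->bio */
	err = blk_execute_rq(q, disk, rq, 0);

	if (blk_rq_unmap_user(bio))
		err = err ? err : -EFAULT;
out:
	blk_put_request(rq);
	return err;
}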
2478 2481
2479 /** 2482 /**
2480 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage 2483 * blk_rq_map_user_iov - map user data to a request, for REQ_BLOCK_PC usage
2481 * @q: request queue where request should be inserted 2484 * @q: request queue where request should be inserted
2482 * @rq: request to map data to 2485 * @rq: request to map data to
2483 * @iov: pointer to the iovec 2486 * @iov: pointer to the iovec
2484 * @iov_count: number of elements in the iovec 2487 * @iov_count: number of elements in the iovec
2485 * @len: I/O byte count 2488 * @len: I/O byte count
2486 * 2489 *
2487 * Description: 2490 * Description:
2488 * Data will be mapped directly for zero copy io, if possible. Otherwise 2491 * Data will be mapped directly for zero copy io, if possible. Otherwise
2489 * a kernel bounce buffer is used. 2492 * a kernel bounce buffer is used.
2490 * 2493 *
2491 * A matching blk_rq_unmap_user() must be issued at the end of io, while 2494 * A matching blk_rq_unmap_user() must be issued at the end of io, while
2492 * still in process context. 2495 * still in process context.
2493 * 2496 *
2494 * Note: The mapped bio may need to be bounced through blk_queue_bounce() 2497 * Note: The mapped bio may need to be bounced through blk_queue_bounce()
2495 * before being submitted to the device, as pages mapped may be out of 2498 * before being submitted to the device, as pages mapped may be out of
2496 * reach. It's the caller's responsibility to make sure this happens. The 2499 * reach. It's the caller's responsibility to make sure this happens. The
2497 * original bio must be passed back in to blk_rq_unmap_user() for proper 2500 * original bio must be passed back in to blk_rq_unmap_user() for proper
2498 * unmapping. 2501 * unmapping.
2499 */ 2502 */
2500 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, 2503 int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
2501 struct sg_iovec *iov, int iov_count, unsigned int len) 2504 struct sg_iovec *iov, int iov_count, unsigned int len)
2502 { 2505 {
2503 struct bio *bio; 2506 struct bio *bio;
2504 2507
2505 if (!iov || iov_count <= 0) 2508 if (!iov || iov_count <= 0)
2506 return -EINVAL; 2509 return -EINVAL;
2507 2510
2508 /* we don't allow misaligned data like bio_map_user() does. If the 2511 /* we don't allow misaligned data like bio_map_user() does. If the
2509 * user is using sg, they're expected to know the alignment constraints 2512 * user is using sg, they're expected to know the alignment constraints
2510 * and respect them accordingly */ 2513 * and respect them accordingly */
2511 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ); 2514 bio = bio_map_user_iov(q, NULL, iov, iov_count, rq_data_dir(rq)== READ);
2512 if (IS_ERR(bio)) 2515 if (IS_ERR(bio))
2513 return PTR_ERR(bio); 2516 return PTR_ERR(bio);
2514 2517
2515 if (bio->bi_size != len) { 2518 if (bio->bi_size != len) {
2516 bio_endio(bio, 0); 2519 bio_endio(bio, 0);
2517 bio_unmap_user(bio); 2520 bio_unmap_user(bio);
2518 return -EINVAL; 2521 return -EINVAL;
2519 } 2522 }
2520 2523
2521 bio_get(bio); 2524 bio_get(bio);
2522 blk_rq_bio_prep(q, rq, bio); 2525 blk_rq_bio_prep(q, rq, bio);
2523 rq->buffer = rq->data = NULL; 2526 rq->buffer = rq->data = NULL;
2524 return 0; 2527 return 0;
2525 } 2528 }
2526 2529
2527 EXPORT_SYMBOL(blk_rq_map_user_iov); 2530 EXPORT_SYMBOL(blk_rq_map_user_iov);
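A small sketch showing the extra constraint this variant has over blk_rq_map_user(): @len must equal the total size of the iovec, or the mapping is rejected with -EINVAL (see the bi_size check above). The wrapper name is hypothetical, and the iovec is assumed to have been copied into kernel memory already.

#include <linux/blkdev.h>
#include <scsi/sg.h>			/* struct sg_iovec */

static int mydev_map_iovec(struct request_queue *q, struct request *rq,
			   struct sg_iovec *iov, int count)
{
	unsigned int len = 0;
	int i;

	for (i = 0; i < count; i++)
		len += iov[i].iov_len;

	/* each iov entry must also respect queue_dma_alignment(q) */
	return blk_rq_map_user_iov(q, rq, iov, count, len);
}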
2528 2531
2529 /** 2532 /**
2530 * blk_rq_unmap_user - unmap a request with user data 2533 * blk_rq_unmap_user - unmap a request with user data
2531 * @bio: start of bio list 2534 * @bio: start of bio list
2532 * 2535 *
2533 * Description: 2536 * Description:
2534 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must 2537 * Unmap a rq previously mapped by blk_rq_map_user(). The caller must
2535 * supply the original rq->bio from the blk_rq_map_user() return, since 2538 * supply the original rq->bio from the blk_rq_map_user() return, since
2536 * the io completion may have changed rq->bio. 2539 * the io completion may have changed rq->bio.
2537 */ 2540 */
2538 int blk_rq_unmap_user(struct bio *bio) 2541 int blk_rq_unmap_user(struct bio *bio)
2539 { 2542 {
2540 struct bio *mapped_bio; 2543 struct bio *mapped_bio;
2541 int ret = 0, ret2; 2544 int ret = 0, ret2;
2542 2545
2543 while (bio) { 2546 while (bio) {
2544 mapped_bio = bio; 2547 mapped_bio = bio;
2545 if (unlikely(bio_flagged(bio, BIO_BOUNCED))) 2548 if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
2546 mapped_bio = bio->bi_private; 2549 mapped_bio = bio->bi_private;
2547 2550
2548 ret2 = __blk_rq_unmap_user(mapped_bio); 2551 ret2 = __blk_rq_unmap_user(mapped_bio);
2549 if (ret2 && !ret) 2552 if (ret2 && !ret)
2550 ret = ret2; 2553 ret = ret2;
2551 2554
2552 mapped_bio = bio; 2555 mapped_bio = bio;
2553 bio = bio->bi_next; 2556 bio = bio->bi_next;
2554 bio_put(mapped_bio); 2557 bio_put(mapped_bio);
2555 } 2558 }
2556 2559
2557 return ret; 2560 return ret;
2558 } 2561 }
2559 2562
2560 EXPORT_SYMBOL(blk_rq_unmap_user); 2563 EXPORT_SYMBOL(blk_rq_unmap_user);
2561 2564
2562 /** 2565 /**
2563 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage 2566 * blk_rq_map_kern - map kernel data to a request, for REQ_BLOCK_PC usage
2564 * @q: request queue where request should be inserted 2567 * @q: request queue where request should be inserted
2565 * @rq: request to fill 2568 * @rq: request to fill
2566 * @kbuf: the kernel buffer 2569 * @kbuf: the kernel buffer
2567 * @len: length of user data 2570 * @len: length of user data
2568 * @gfp_mask: memory allocation flags 2571 * @gfp_mask: memory allocation flags
2569 */ 2572 */
2570 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, 2573 int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
2571 unsigned int len, gfp_t gfp_mask) 2574 unsigned int len, gfp_t gfp_mask)
2572 { 2575 {
2573 struct bio *bio; 2576 struct bio *bio;
2574 2577
2575 if (len > (q->max_hw_sectors << 9)) 2578 if (len > (q->max_hw_sectors << 9))
2576 return -EINVAL; 2579 return -EINVAL;
2577 if (!len || !kbuf) 2580 if (!len || !kbuf)
2578 return -EINVAL; 2581 return -EINVAL;
2579 2582
2580 bio = bio_map_kern(q, kbuf, len, gfp_mask); 2583 bio = bio_map_kern(q, kbuf, len, gfp_mask);
2581 if (IS_ERR(bio)) 2584 if (IS_ERR(bio))
2582 return PTR_ERR(bio); 2585 return PTR_ERR(bio);
2583 2586
2584 if (rq_data_dir(rq) == WRITE) 2587 if (rq_data_dir(rq) == WRITE)
2585 bio->bi_rw |= (1 << BIO_RW); 2588 bio->bi_rw |= (1 << BIO_RW);
2586 2589
2587 blk_rq_bio_prep(q, rq, bio); 2590 blk_rq_bio_prep(q, rq, bio);
2588 blk_queue_bounce(q, &rq->bio); 2591 blk_queue_bounce(q, &rq->bio);
2589 rq->buffer = rq->data = NULL; 2592 rq->buffer = rq->data = NULL;
2590 return 0; 2593 return 0;
2591 } 2594 }
2592 2595
2593 EXPORT_SYMBOL(blk_rq_map_kern); 2596 EXPORT_SYMBOL(blk_rq_map_kern);
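The same passthrough pattern with a kernel buffer, as a sketch only; the MODE SENSE helper and its CDB are hypothetical. The bio's data direction follows the rw value given to blk_get_request(), per the WRITE check in blk_rq_map_kern() above.

#include <linux/blkdev.h>
#include <linux/genhd.h>

static int mydev_mode_sense(struct request_queue *q, struct gendisk *disk,
			    void *buf, unsigned int len)
{
	struct request *rq;
	int err;

	rq = blk_get_request(q, READ, GFP_KERNEL);	/* READ: device -> buf */
	if (!rq)
		return -ENOMEM;

	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->cmd[0] = 0x1a;			/* MODE SENSE(6) */
	rq->cmd[4] = len;
	rq->cmd_len = 6;
	rq->timeout = 10 * HZ;

	err = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (!err)
		err = blk_execute_rq(q, disk, rq, 0);

	blk_put_request(rq);
	return err;
}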
2594 2597
2595 /** 2598 /**
2596 * blk_execute_rq_nowait - insert a request into queue for execution 2599 * blk_execute_rq_nowait - insert a request into queue for execution
2597 * @q: queue to insert the request in 2600 * @q: queue to insert the request in
2598 * @bd_disk: matching gendisk 2601 * @bd_disk: matching gendisk
2599 * @rq: request to insert 2602 * @rq: request to insert
2600 * @at_head: insert request at head or tail of queue 2603 * @at_head: insert request at head or tail of queue
2601 * @done: I/O completion handler 2604 * @done: I/O completion handler
2602 * 2605 *
2603 * Description: 2606 * Description:
2604 * Insert a fully prepared request at the back of the io scheduler queue 2607 * Insert a fully prepared request at the back of the io scheduler queue
2605 * for execution. Don't wait for completion. 2608 * for execution. Don't wait for completion.
2606 */ 2609 */
2607 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, 2610 void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
2608 struct request *rq, int at_head, 2611 struct request *rq, int at_head,
2609 rq_end_io_fn *done) 2612 rq_end_io_fn *done)
2610 { 2613 {
2611 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; 2614 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
2612 2615
2613 rq->rq_disk = bd_disk; 2616 rq->rq_disk = bd_disk;
2614 rq->cmd_flags |= REQ_NOMERGE; 2617 rq->cmd_flags |= REQ_NOMERGE;
2615 rq->end_io = done; 2618 rq->end_io = done;
2616 WARN_ON(irqs_disabled()); 2619 WARN_ON(irqs_disabled());
2617 spin_lock_irq(q->queue_lock); 2620 spin_lock_irq(q->queue_lock);
2618 __elv_add_request(q, rq, where, 1); 2621 __elv_add_request(q, rq, where, 1);
2619 __generic_unplug_device(q); 2622 __generic_unplug_device(q);
2620 spin_unlock_irq(q->queue_lock); 2623 spin_unlock_irq(q->queue_lock);
2621 } 2624 }
2622 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); 2625 EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
2623 2626
2624 /** 2627 /**
2625 * blk_execute_rq - insert a request into queue for execution 2628 * blk_execute_rq - insert a request into queue for execution
2626 * @q: queue to insert the request in 2629 * @q: queue to insert the request in
2627 * @bd_disk: matching gendisk 2630 * @bd_disk: matching gendisk
2628 * @rq: request to insert 2631 * @rq: request to insert
2629 * @at_head: insert request at head or tail of queue 2632 * @at_head: insert request at head or tail of queue
2630 * 2633 *
2631 * Description: 2634 * Description:
2632 * Insert a fully prepared request at the back of the io scheduler queue 2635 * Insert a fully prepared request at the back of the io scheduler queue
2633 * for execution and wait for completion. 2636 * for execution and wait for completion.
2634 */ 2637 */
2635 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, 2638 int blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
2636 struct request *rq, int at_head) 2639 struct request *rq, int at_head)
2637 { 2640 {
2638 DECLARE_COMPLETION_ONSTACK(wait); 2641 DECLARE_COMPLETION_ONSTACK(wait);
2639 char sense[SCSI_SENSE_BUFFERSIZE]; 2642 char sense[SCSI_SENSE_BUFFERSIZE];
2640 int err = 0; 2643 int err = 0;
2641 2644
2642 /* 2645 /*
2643 * we need an extra reference to the request, so we can look at 2646 * we need an extra reference to the request, so we can look at
2644 * it after io completion 2647 * it after io completion
2645 */ 2648 */
2646 rq->ref_count++; 2649 rq->ref_count++;
2647 2650
2648 if (!rq->sense) { 2651 if (!rq->sense) {
2649 memset(sense, 0, sizeof(sense)); 2652 memset(sense, 0, sizeof(sense));
2650 rq->sense = sense; 2653 rq->sense = sense;
2651 rq->sense_len = 0; 2654 rq->sense_len = 0;
2652 } 2655 }
2653 2656
2654 rq->end_io_data = &wait; 2657 rq->end_io_data = &wait;
2655 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); 2658 blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
2656 wait_for_completion(&wait); 2659 wait_for_completion(&wait);
2657 2660
2658 if (rq->errors) 2661 if (rq->errors)
2659 err = -EIO; 2662 err = -EIO;
2660 2663
2661 return err; 2664 return err;
2662 } 2665 }
2663 2666
2664 EXPORT_SYMBOL(blk_execute_rq); 2667 EXPORT_SYMBOL(blk_execute_rq);
2665 2668
2666 /** 2669 /**
2667 * blkdev_issue_flush - queue a flush 2670 * blkdev_issue_flush - queue a flush
2668 * @bdev: blockdev to issue flush for 2671 * @bdev: blockdev to issue flush for
2669 * @error_sector: error sector 2672 * @error_sector: error sector
2670 * 2673 *
2671 * Description: 2674 * Description:
2672 * Issue a flush for the block device in question. Caller can supply 2675 * Issue a flush for the block device in question. Caller can supply
2673 * room for storing the error offset in case of a flush error, if they 2676 * room for storing the error offset in case of a flush error, if they
2674 * wish to. Caller must run wait_for_completion() on its own. 2677 * wish to. Caller must run wait_for_completion() on its own.
2675 */ 2678 */
2676 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) 2679 int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector)
2677 { 2680 {
2678 struct request_queue *q; 2681 struct request_queue *q;
2679 2682
2680 if (bdev->bd_disk == NULL) 2683 if (bdev->bd_disk == NULL)
2681 return -ENXIO; 2684 return -ENXIO;
2682 2685
2683 q = bdev_get_queue(bdev); 2686 q = bdev_get_queue(bdev);
2684 if (!q) 2687 if (!q)
2685 return -ENXIO; 2688 return -ENXIO;
2686 if (!q->issue_flush_fn) 2689 if (!q->issue_flush_fn)
2687 return -EOPNOTSUPP; 2690 return -EOPNOTSUPP;
2688 2691
2689 return q->issue_flush_fn(q, bdev->bd_disk, error_sector); 2692 return q->issue_flush_fn(q, bdev->bd_disk, error_sector);
2690 } 2693 }
2691 2694
2692 EXPORT_SYMBOL(blkdev_issue_flush); 2695 EXPORT_SYMBOL(blkdev_issue_flush);
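A sketch of a typical caller, e.g. a filesystem flushing the device cache around a commit; the myfs_* name is hypothetical. Callers conventionally treat -EOPNOTSUPP (no ->issue_flush_fn) as "nothing to flush" rather than a hard error.

#include <linux/fs.h>
#include <linux/blkdev.h>

static int myfs_flush_device(struct block_device *bdev)
{
	sector_t error_sector;
	int err;

	err = blkdev_issue_flush(bdev, &error_sector);
	if (err == -EOPNOTSUPP)
		err = 0;	/* no write cache / no flush method: not fatal */

	return err;
}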
2693 2696
2694 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io) 2697 static void drive_stat_acct(struct request *rq, int nr_sectors, int new_io)
2695 { 2698 {
2696 int rw = rq_data_dir(rq); 2699 int rw = rq_data_dir(rq);
2697 2700
2698 if (!blk_fs_request(rq) || !rq->rq_disk) 2701 if (!blk_fs_request(rq) || !rq->rq_disk)
2699 return; 2702 return;
2700 2703
2701 if (!new_io) { 2704 if (!new_io) {
2702 __disk_stat_inc(rq->rq_disk, merges[rw]); 2705 __disk_stat_inc(rq->rq_disk, merges[rw]);
2703 } else { 2706 } else {
2704 disk_round_stats(rq->rq_disk); 2707 disk_round_stats(rq->rq_disk);
2705 rq->rq_disk->in_flight++; 2708 rq->rq_disk->in_flight++;
2706 } 2709 }
2707 } 2710 }
2708 2711
2709 /* 2712 /*
2710 * add-request adds a request to the linked list. 2713 * add-request adds a request to the linked list.
2711 * queue lock is held and interrupts disabled, as we muck with the 2714 * queue lock is held and interrupts disabled, as we muck with the
2712 * request queue list. 2715 * request queue list.
2713 */ 2716 */
2714 static inline void add_request(struct request_queue * q, struct request * req) 2717 static inline void add_request(struct request_queue * q, struct request * req)
2715 { 2718 {
2716 drive_stat_acct(req, req->nr_sectors, 1); 2719 drive_stat_acct(req, req->nr_sectors, 1);
2717 2720
2718 /* 2721 /*
2719 * elevator indicated where it wants this request to be 2722 * elevator indicated where it wants this request to be
2720 * inserted at elevator_merge time 2723 * inserted at elevator_merge time
2721 */ 2724 */
2722 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0); 2725 __elv_add_request(q, req, ELEVATOR_INSERT_SORT, 0);
2723 } 2726 }
2724 2727
2725 /* 2728 /*
2726 * disk_round_stats() - Round off the performance stats on a struct 2729 * disk_round_stats() - Round off the performance stats on a struct
2727 * disk_stats. 2730 * disk_stats.
2728 * 2731 *
2729 * The average IO queue length and utilisation statistics are maintained 2732 * The average IO queue length and utilisation statistics are maintained
2730 * by observing the current state of the queue length and the amount of 2733 * by observing the current state of the queue length and the amount of
2731 * time it has been in this state for. 2734 * time it has been in this state for.
2732 * 2735 *
2733 * Normally, that accounting is done on IO completion, but that can result 2736 * Normally, that accounting is done on IO completion, but that can result
2734 * in more than a second's worth of IO being accounted for within any one 2737 * in more than a second's worth of IO being accounted for within any one
2735 * second, leading to >100% utilisation. To deal with that, we call this 2738 * second, leading to >100% utilisation. To deal with that, we call this
2736 * function to do a round-off before returning the results when reading 2739 * function to do a round-off before returning the results when reading
2737 * /proc/diskstats. This accounts immediately for all queue usage up to 2740 * /proc/diskstats. This accounts immediately for all queue usage up to
2738 * the current jiffies and restarts the counters again. 2741 * the current jiffies and restarts the counters again.
2739 */ 2742 */
2740 void disk_round_stats(struct gendisk *disk) 2743 void disk_round_stats(struct gendisk *disk)
2741 { 2744 {
2742 unsigned long now = jiffies; 2745 unsigned long now = jiffies;
2743 2746
2744 if (now == disk->stamp) 2747 if (now == disk->stamp)
2745 return; 2748 return;
2746 2749
2747 if (disk->in_flight) { 2750 if (disk->in_flight) {
2748 __disk_stat_add(disk, time_in_queue, 2751 __disk_stat_add(disk, time_in_queue,
2749 disk->in_flight * (now - disk->stamp)); 2752 disk->in_flight * (now - disk->stamp));
2750 __disk_stat_add(disk, io_ticks, (now - disk->stamp)); 2753 __disk_stat_add(disk, io_ticks, (now - disk->stamp));
2751 } 2754 }
2752 disk->stamp = now; 2755 disk->stamp = now;
2753 } 2756 }
2754 2757
2755 EXPORT_SYMBOL_GPL(disk_round_stats); 2758 EXPORT_SYMBOL_GPL(disk_round_stats);
2756 2759
2757 /* 2760 /*
2758 * queue lock must be held 2761 * queue lock must be held
2759 */ 2762 */
2760 void __blk_put_request(struct request_queue *q, struct request *req) 2763 void __blk_put_request(struct request_queue *q, struct request *req)
2761 { 2764 {
2762 if (unlikely(!q)) 2765 if (unlikely(!q))
2763 return; 2766 return;
2764 if (unlikely(--req->ref_count)) 2767 if (unlikely(--req->ref_count))
2765 return; 2768 return;
2766 2769
2767 elv_completed_request(q, req); 2770 elv_completed_request(q, req);
2768 2771
2769 /* 2772 /*
2770 * Request may not have originated from ll_rw_blk. if not, 2773 * Request may not have originated from ll_rw_blk. if not,
2771 * it didn't come out of our reserved rq pools 2774 * it didn't come out of our reserved rq pools
2772 */ 2775 */
2773 if (req->cmd_flags & REQ_ALLOCED) { 2776 if (req->cmd_flags & REQ_ALLOCED) {
2774 int rw = rq_data_dir(req); 2777 int rw = rq_data_dir(req);
2775 int priv = req->cmd_flags & REQ_ELVPRIV; 2778 int priv = req->cmd_flags & REQ_ELVPRIV;
2776 2779
2777 BUG_ON(!list_empty(&req->queuelist)); 2780 BUG_ON(!list_empty(&req->queuelist));
2778 BUG_ON(!hlist_unhashed(&req->hash)); 2781 BUG_ON(!hlist_unhashed(&req->hash));
2779 2782
2780 blk_free_request(q, req); 2783 blk_free_request(q, req);
2781 freed_request(q, rw, priv); 2784 freed_request(q, rw, priv);
2782 } 2785 }
2783 } 2786 }
2784 2787
2785 EXPORT_SYMBOL_GPL(__blk_put_request); 2788 EXPORT_SYMBOL_GPL(__blk_put_request);
2786 2789
2787 void blk_put_request(struct request *req) 2790 void blk_put_request(struct request *req)
2788 { 2791 {
2789 unsigned long flags; 2792 unsigned long flags;
2790 struct request_queue *q = req->q; 2793 struct request_queue *q = req->q;
2791 2794
2792 /* 2795 /*
2793 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the 2796 * Gee, IDE calls in w/ NULL q. Fix IDE and remove the
2794 * following if (q) test. 2797 * following if (q) test.
2795 */ 2798 */
2796 if (q) { 2799 if (q) {
2797 spin_lock_irqsave(q->queue_lock, flags); 2800 spin_lock_irqsave(q->queue_lock, flags);
2798 __blk_put_request(q, req); 2801 __blk_put_request(q, req);
2799 spin_unlock_irqrestore(q->queue_lock, flags); 2802 spin_unlock_irqrestore(q->queue_lock, flags);
2800 } 2803 }
2801 } 2804 }
2802 2805
2803 EXPORT_SYMBOL(blk_put_request); 2806 EXPORT_SYMBOL(blk_put_request);
2804 2807
2805 /** 2808 /**
2806 * blk_end_sync_rq - executes a completion event on a request 2809 * blk_end_sync_rq - executes a completion event on a request
2807 * @rq: request to complete 2810 * @rq: request to complete
2808 * @error: end io status of the request 2811 * @error: end io status of the request
2809 */ 2812 */
2810 void blk_end_sync_rq(struct request *rq, int error) 2813 void blk_end_sync_rq(struct request *rq, int error)
2811 { 2814 {
2812 struct completion *waiting = rq->end_io_data; 2815 struct completion *waiting = rq->end_io_data;
2813 2816
2814 rq->end_io_data = NULL; 2817 rq->end_io_data = NULL;
2815 __blk_put_request(rq->q, rq); 2818 __blk_put_request(rq->q, rq);
2816 2819
2817 /* 2820 /*
2818 * complete last, if this is a stack request the process (and thus 2821 * complete last, if this is a stack request the process (and thus
2819 * the rq pointer) could be invalid right after this complete() 2822 * the rq pointer) could be invalid right after this complete()
2820 */ 2823 */
2821 complete(waiting); 2824 complete(waiting);
2822 } 2825 }
2823 EXPORT_SYMBOL(blk_end_sync_rq); 2826 EXPORT_SYMBOL(blk_end_sync_rq);
2824 2827
2825 /* 2828 /*
2826 * Has to be called with the request spinlock acquired 2829 * Has to be called with the request spinlock acquired
2827 */ 2830 */
2828 static int attempt_merge(struct request_queue *q, struct request *req, 2831 static int attempt_merge(struct request_queue *q, struct request *req,
2829 struct request *next) 2832 struct request *next)
2830 { 2833 {
2831 if (!rq_mergeable(req) || !rq_mergeable(next)) 2834 if (!rq_mergeable(req) || !rq_mergeable(next))
2832 return 0; 2835 return 0;
2833 2836
2834 /* 2837 /*
2835 * not contiguous 2838 * not contiguous
2836 */ 2839 */
2837 if (req->sector + req->nr_sectors != next->sector) 2840 if (req->sector + req->nr_sectors != next->sector)
2838 return 0; 2841 return 0;
2839 2842
2840 if (rq_data_dir(req) != rq_data_dir(next) 2843 if (rq_data_dir(req) != rq_data_dir(next)
2841 || req->rq_disk != next->rq_disk 2844 || req->rq_disk != next->rq_disk
2842 || next->special) 2845 || next->special)
2843 return 0; 2846 return 0;
2844 2847
2845 /* 2848 /*
2846 * If we are allowed to merge, then append bio list 2849 * If we are allowed to merge, then append bio list
2847 * from next to rq and release next. merge_requests_fn 2850 * from next to rq and release next. merge_requests_fn
2848 * will have updated segment counts, update sector 2851 * will have updated segment counts, update sector
2849 * counts here. 2852 * counts here.
2850 */ 2853 */
2851 if (!ll_merge_requests_fn(q, req, next)) 2854 if (!ll_merge_requests_fn(q, req, next))
2852 return 0; 2855 return 0;
2853 2856
2854 /* 2857 /*
2855 * At this point we have either done a back merge 2858 * At this point we have either done a back merge
2856 * or front merge. We need the smaller start_time of 2859 * or front merge. We need the smaller start_time of
2857 * the merged requests to be the current request 2860 * the merged requests to be the current request
2858 * for accounting purposes. 2861 * for accounting purposes.
2859 */ 2862 */
2860 if (time_after(req->start_time, next->start_time)) 2863 if (time_after(req->start_time, next->start_time))
2861 req->start_time = next->start_time; 2864 req->start_time = next->start_time;
2862 2865
2863 req->biotail->bi_next = next->bio; 2866 req->biotail->bi_next = next->bio;
2864 req->biotail = next->biotail; 2867 req->biotail = next->biotail;
2865 2868
2866 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; 2869 req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors;
2867 2870
2868 elv_merge_requests(q, req, next); 2871 elv_merge_requests(q, req, next);
2869 2872
2870 if (req->rq_disk) { 2873 if (req->rq_disk) {
2871 disk_round_stats(req->rq_disk); 2874 disk_round_stats(req->rq_disk);
2872 req->rq_disk->in_flight--; 2875 req->rq_disk->in_flight--;
2873 } 2876 }
2874 2877
2875 req->ioprio = ioprio_best(req->ioprio, next->ioprio); 2878 req->ioprio = ioprio_best(req->ioprio, next->ioprio);
2876 2879
2877 __blk_put_request(q, next); 2880 __blk_put_request(q, next);
2878 return 1; 2881 return 1;
2879 } 2882 }
2880 2883
2881 static inline int attempt_back_merge(struct request_queue *q, 2884 static inline int attempt_back_merge(struct request_queue *q,
2882 struct request *rq) 2885 struct request *rq)
2883 { 2886 {
2884 struct request *next = elv_latter_request(q, rq); 2887 struct request *next = elv_latter_request(q, rq);
2885 2888
2886 if (next) 2889 if (next)
2887 return attempt_merge(q, rq, next); 2890 return attempt_merge(q, rq, next);
2888 2891
2889 return 0; 2892 return 0;
2890 } 2893 }
2891 2894
2892 static inline int attempt_front_merge(struct request_queue *q, 2895 static inline int attempt_front_merge(struct request_queue *q,
2893 struct request *rq) 2896 struct request *rq)
2894 { 2897 {
2895 struct request *prev = elv_former_request(q, rq); 2898 struct request *prev = elv_former_request(q, rq);
2896 2899
2897 if (prev) 2900 if (prev)
2898 return attempt_merge(q, prev, rq); 2901 return attempt_merge(q, prev, rq);
2899 2902
2900 return 0; 2903 return 0;
2901 } 2904 }
2902 2905
2903 static void init_request_from_bio(struct request *req, struct bio *bio) 2906 static void init_request_from_bio(struct request *req, struct bio *bio)
2904 { 2907 {
2905 req->cmd_type = REQ_TYPE_FS; 2908 req->cmd_type = REQ_TYPE_FS;
2906 2909
2907 /* 2910 /*
2908 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST) 2911 * inherit FAILFAST from bio (for read-ahead, and explicit FAILFAST)
2909 */ 2912 */
2910 if (bio_rw_ahead(bio) || bio_failfast(bio)) 2913 if (bio_rw_ahead(bio) || bio_failfast(bio))
2911 req->cmd_flags |= REQ_FAILFAST; 2914 req->cmd_flags |= REQ_FAILFAST;
2912 2915
2913 /* 2916 /*
2914 * REQ_BARRIER implies no merging, but let's make it explicit 2917 * REQ_BARRIER implies no merging, but let's make it explicit
2915 */ 2918 */
2916 if (unlikely(bio_barrier(bio))) 2919 if (unlikely(bio_barrier(bio)))
2917 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); 2920 req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE);
2918 2921
2919 if (bio_sync(bio)) 2922 if (bio_sync(bio))
2920 req->cmd_flags |= REQ_RW_SYNC; 2923 req->cmd_flags |= REQ_RW_SYNC;
2921 if (bio_rw_meta(bio)) 2924 if (bio_rw_meta(bio))
2922 req->cmd_flags |= REQ_RW_META; 2925 req->cmd_flags |= REQ_RW_META;
2923 2926
2924 req->errors = 0; 2927 req->errors = 0;
2925 req->hard_sector = req->sector = bio->bi_sector; 2928 req->hard_sector = req->sector = bio->bi_sector;
2926 req->ioprio = bio_prio(bio); 2929 req->ioprio = bio_prio(bio);
2927 req->start_time = jiffies; 2930 req->start_time = jiffies;
2928 blk_rq_bio_prep(req->q, req, bio); 2931 blk_rq_bio_prep(req->q, req, bio);
2929 } 2932 }
2930 2933
2931 static int __make_request(struct request_queue *q, struct bio *bio) 2934 static int __make_request(struct request_queue *q, struct bio *bio)
2932 { 2935 {
2933 struct request *req; 2936 struct request *req;
2934 int el_ret, nr_sectors, barrier, err; 2937 int el_ret, nr_sectors, barrier, err;
2935 const unsigned short prio = bio_prio(bio); 2938 const unsigned short prio = bio_prio(bio);
2936 const int sync = bio_sync(bio); 2939 const int sync = bio_sync(bio);
2937 int rw_flags; 2940 int rw_flags;
2938 2941
2939 nr_sectors = bio_sectors(bio); 2942 nr_sectors = bio_sectors(bio);
2940 2943
2941 /* 2944 /*
2942 * low level driver can indicate that it wants pages above a 2945 * low level driver can indicate that it wants pages above a
2943 * certain limit bounced to low memory (ie for highmem, or even 2946 * certain limit bounced to low memory (ie for highmem, or even
2944 * ISA dma in theory) 2947 * ISA dma in theory)
2945 */ 2948 */
2946 blk_queue_bounce(q, &bio); 2949 blk_queue_bounce(q, &bio);
2947 2950
2948 barrier = bio_barrier(bio); 2951 barrier = bio_barrier(bio);
2949 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) { 2952 if (unlikely(barrier) && (q->next_ordered == QUEUE_ORDERED_NONE)) {
2950 err = -EOPNOTSUPP; 2953 err = -EOPNOTSUPP;
2951 goto end_io; 2954 goto end_io;
2952 } 2955 }
2953 2956
2954 spin_lock_irq(q->queue_lock); 2957 spin_lock_irq(q->queue_lock);
2955 2958
2956 if (unlikely(barrier) || elv_queue_empty(q)) 2959 if (unlikely(barrier) || elv_queue_empty(q))
2957 goto get_rq; 2960 goto get_rq;
2958 2961
2959 el_ret = elv_merge(q, &req, bio); 2962 el_ret = elv_merge(q, &req, bio);
2960 switch (el_ret) { 2963 switch (el_ret) {
2961 case ELEVATOR_BACK_MERGE: 2964 case ELEVATOR_BACK_MERGE:
2962 BUG_ON(!rq_mergeable(req)); 2965 BUG_ON(!rq_mergeable(req));
2963 2966
2964 if (!ll_back_merge_fn(q, req, bio)) 2967 if (!ll_back_merge_fn(q, req, bio))
2965 break; 2968 break;
2966 2969
2967 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); 2970 blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
2968 2971
2969 req->biotail->bi_next = bio; 2972 req->biotail->bi_next = bio;
2970 req->biotail = bio; 2973 req->biotail = bio;
2971 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 2974 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2972 req->ioprio = ioprio_best(req->ioprio, prio); 2975 req->ioprio = ioprio_best(req->ioprio, prio);
2973 drive_stat_acct(req, nr_sectors, 0); 2976 drive_stat_acct(req, nr_sectors, 0);
2974 if (!attempt_back_merge(q, req)) 2977 if (!attempt_back_merge(q, req))
2975 elv_merged_request(q, req, el_ret); 2978 elv_merged_request(q, req, el_ret);
2976 goto out; 2979 goto out;
2977 2980
2978 case ELEVATOR_FRONT_MERGE: 2981 case ELEVATOR_FRONT_MERGE:
2979 BUG_ON(!rq_mergeable(req)); 2982 BUG_ON(!rq_mergeable(req));
2980 2983
2981 if (!ll_front_merge_fn(q, req, bio)) 2984 if (!ll_front_merge_fn(q, req, bio))
2982 break; 2985 break;
2983 2986
2984 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); 2987 blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
2985 2988
2986 bio->bi_next = req->bio; 2989 bio->bi_next = req->bio;
2987 req->bio = bio; 2990 req->bio = bio;
2988 2991
2989 /* 2992 /*
2990 * may not be valid. if the low level driver said 2993 * may not be valid. if the low level driver said
2991 * it didn't need a bounce buffer then it better 2994 * it didn't need a bounce buffer then it better
2992 * not touch req->buffer either... 2995 * not touch req->buffer either...
2993 */ 2996 */
2994 req->buffer = bio_data(bio); 2997 req->buffer = bio_data(bio);
2995 req->current_nr_sectors = bio_cur_sectors(bio); 2998 req->current_nr_sectors = bio_cur_sectors(bio);
2996 req->hard_cur_sectors = req->current_nr_sectors; 2999 req->hard_cur_sectors = req->current_nr_sectors;
2997 req->sector = req->hard_sector = bio->bi_sector; 3000 req->sector = req->hard_sector = bio->bi_sector;
2998 req->nr_sectors = req->hard_nr_sectors += nr_sectors; 3001 req->nr_sectors = req->hard_nr_sectors += nr_sectors;
2999 req->ioprio = ioprio_best(req->ioprio, prio); 3002 req->ioprio = ioprio_best(req->ioprio, prio);
3000 drive_stat_acct(req, nr_sectors, 0); 3003 drive_stat_acct(req, nr_sectors, 0);
3001 if (!attempt_front_merge(q, req)) 3004 if (!attempt_front_merge(q, req))
3002 elv_merged_request(q, req, el_ret); 3005 elv_merged_request(q, req, el_ret);
3003 goto out; 3006 goto out;
3004 3007
3005 /* ELV_NO_MERGE: elevator says don't/can't merge. */ 3008 /* ELV_NO_MERGE: elevator says don't/can't merge. */
3006 default: 3009 default:
3007 ; 3010 ;
3008 } 3011 }
3009 3012
3010 get_rq: 3013 get_rq:
3011 /* 3014 /*
3012 * This sync check and mask will be re-done in init_request_from_bio(), 3015 * This sync check and mask will be re-done in init_request_from_bio(),
3013 * but we need to set it earlier to expose the sync flag to the 3016 * but we need to set it earlier to expose the sync flag to the
3014 * rq allocator and io schedulers. 3017 * rq allocator and io schedulers.
3015 */ 3018 */
3016 rw_flags = bio_data_dir(bio); 3019 rw_flags = bio_data_dir(bio);
3017 if (sync) 3020 if (sync)
3018 rw_flags |= REQ_RW_SYNC; 3021 rw_flags |= REQ_RW_SYNC;
3019 3022
3020 /* 3023 /*
3021 * Grab a free request. This may sleep but cannot fail. 3024 * Grab a free request. This may sleep but cannot fail.
3022 * Returns with the queue unlocked. 3025 * Returns with the queue unlocked.
3023 */ 3026 */
3024 req = get_request_wait(q, rw_flags, bio); 3027 req = get_request_wait(q, rw_flags, bio);
3025 3028
3026 /* 3029 /*
3027 * After dropping the lock and possibly sleeping here, our request 3030 * After dropping the lock and possibly sleeping here, our request
3028 * may now be mergeable after it had proven unmergeable (above). 3031 * may now be mergeable after it had proven unmergeable (above).
3029 * We don't worry about that case for efficiency. It won't happen 3032 * We don't worry about that case for efficiency. It won't happen
3030 * often, and the elevators are able to handle it. 3033 * often, and the elevators are able to handle it.
3031 */ 3034 */
3032 init_request_from_bio(req, bio); 3035 init_request_from_bio(req, bio);
3033 3036
3034 spin_lock_irq(q->queue_lock); 3037 spin_lock_irq(q->queue_lock);
3035 if (elv_queue_empty(q)) 3038 if (elv_queue_empty(q))
3036 blk_plug_device(q); 3039 blk_plug_device(q);
3037 add_request(q, req); 3040 add_request(q, req);
3038 out: 3041 out:
3039 if (sync) 3042 if (sync)
3040 __generic_unplug_device(q); 3043 __generic_unplug_device(q);
3041 3044
3042 spin_unlock_irq(q->queue_lock); 3045 spin_unlock_irq(q->queue_lock);
3043 return 0; 3046 return 0;
3044 3047
3045 end_io: 3048 end_io:
3046 bio_endio(bio, err); 3049 bio_endio(bio, err);
3047 return 0; 3050 return 0;
3048 } 3051 }
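Editor's note: __make_request() still fails any barrier with -EOPNOTSUPP while q->next_ordered is QUEUE_ORDERED_NONE, so empty barriers only get through on queues that registered an ordered mode and a prepare_flush_fn. A hedged sketch of that registration; my_prepare_flush/my_init_queue are invented names and the 0x35 SYNCHRONIZE CACHE opcode is merely one example of what a driver might issue.

#include <linux/blkdev.h>

/* Illustrative: turn the queue's inserted flush request into a cache flush */
static void my_prepare_flush(struct request_queue *q, struct request *rq)
{
	memset(rq->cmd, 0, sizeof(rq->cmd));
	rq->cmd_type = REQ_TYPE_BLOCK_PC;
	rq->timeout = 60 * HZ;		/* arbitrary example timeout */
	rq->cmd[0] = 0x35;		/* e.g. SCSI SYNCHRONIZE CACHE */
	rq->cmd_len = 10;
}

static void my_init_queue(struct request_queue *q)
{
	/* advertise ordered support so barriers are not rejected above */
	blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH, my_prepare_flush);
}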
3049 3052
3050 /* 3053 /*
3051 * If bio->bi_dev is a partition, remap the location 3054 * If bio->bi_dev is a partition, remap the location
3052 */ 3055 */
3053 static inline void blk_partition_remap(struct bio *bio) 3056 static inline void blk_partition_remap(struct bio *bio)
3054 { 3057 {
3055 struct block_device *bdev = bio->bi_bdev; 3058 struct block_device *bdev = bio->bi_bdev;
3056 3059
3057 if (bdev != bdev->bd_contains) { 3060 if (bio_sectors(bio) && bdev != bdev->bd_contains) {
3058 struct hd_struct *p = bdev->bd_part; 3061 struct hd_struct *p = bdev->bd_part;
3059 const int rw = bio_data_dir(bio); 3062 const int rw = bio_data_dir(bio);
3060 3063
3061 p->sectors[rw] += bio_sectors(bio); 3064 p->sectors[rw] += bio_sectors(bio);
3062 p->ios[rw]++; 3065 p->ios[rw]++;
3063 3066
3064 bio->bi_sector += p->start_sect; 3067 bio->bi_sector += p->start_sect;
3065 bio->bi_bdev = bdev->bd_contains; 3068 bio->bi_bdev = bdev->bd_contains;
3066 3069
3067 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio, 3070 blk_add_trace_remap(bdev_get_queue(bio->bi_bdev), bio,
3068 bdev->bd_dev, bio->bi_sector, 3071 bdev->bd_dev, bio->bi_sector,
3069 bio->bi_sector - p->start_sect); 3072 bio->bi_sector - p->start_sect);
3070 } 3073 }
3071 } 3074 }
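Editor's note: the new bio_sectors() test above keeps a zero-length barrier from being remapped or charged to per-partition statistics, since it carries no sectors to account. The predicates used further down, bio_empty_barrier() on bios and blk_empty_barrier() on requests, are presumably defined in the header half of this diff; roughly, they test for a barrier that carries no data, along these lines (a sketch, not the exact header text).

/* rough equivalents of the helpers this file relies on */
#define bio_empty_barrier(bio)	(bio_barrier(bio) && !bio_sectors(bio))
#define blk_empty_barrier(rq)	(blk_barrier_rq(rq) && blk_fs_request(rq) && \
				 !(rq)->hard_nr_sectors)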
3072 3075
3073 static void handle_bad_sector(struct bio *bio) 3076 static void handle_bad_sector(struct bio *bio)
3074 { 3077 {
3075 char b[BDEVNAME_SIZE]; 3078 char b[BDEVNAME_SIZE];
3076 3079
3077 printk(KERN_INFO "attempt to access beyond end of device\n"); 3080 printk(KERN_INFO "attempt to access beyond end of device\n");
3078 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n", 3081 printk(KERN_INFO "%s: rw=%ld, want=%Lu, limit=%Lu\n",
3079 bdevname(bio->bi_bdev, b), 3082 bdevname(bio->bi_bdev, b),
3080 bio->bi_rw, 3083 bio->bi_rw,
3081 (unsigned long long)bio->bi_sector + bio_sectors(bio), 3084 (unsigned long long)bio->bi_sector + bio_sectors(bio),
3082 (long long)(bio->bi_bdev->bd_inode->i_size >> 9)); 3085 (long long)(bio->bi_bdev->bd_inode->i_size >> 9));
3083 3086
3084 set_bit(BIO_EOF, &bio->bi_flags); 3087 set_bit(BIO_EOF, &bio->bi_flags);
3085 } 3088 }
3086 3089
3087 #ifdef CONFIG_FAIL_MAKE_REQUEST 3090 #ifdef CONFIG_FAIL_MAKE_REQUEST
3088 3091
3089 static DECLARE_FAULT_ATTR(fail_make_request); 3092 static DECLARE_FAULT_ATTR(fail_make_request);
3090 3093
3091 static int __init setup_fail_make_request(char *str) 3094 static int __init setup_fail_make_request(char *str)
3092 { 3095 {
3093 return setup_fault_attr(&fail_make_request, str); 3096 return setup_fault_attr(&fail_make_request, str);
3094 } 3097 }
3095 __setup("fail_make_request=", setup_fail_make_request); 3098 __setup("fail_make_request=", setup_fail_make_request);
3096 3099
3097 static int should_fail_request(struct bio *bio) 3100 static int should_fail_request(struct bio *bio)
3098 { 3101 {
3099 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) || 3102 if ((bio->bi_bdev->bd_disk->flags & GENHD_FL_FAIL) ||
3100 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail)) 3103 (bio->bi_bdev->bd_part && bio->bi_bdev->bd_part->make_it_fail))
3101 return should_fail(&fail_make_request, bio->bi_size); 3104 return should_fail(&fail_make_request, bio->bi_size);
3102 3105
3103 return 0; 3106 return 0;
3104 } 3107 }
3105 3108
3106 static int __init fail_make_request_debugfs(void) 3109 static int __init fail_make_request_debugfs(void)
3107 { 3110 {
3108 return init_fault_attr_dentries(&fail_make_request, 3111 return init_fault_attr_dentries(&fail_make_request,
3109 "fail_make_request"); 3112 "fail_make_request");
3110 } 3113 }
3111 3114
3112 late_initcall(fail_make_request_debugfs); 3115 late_initcall(fail_make_request_debugfs);
3113 3116
3114 #else /* CONFIG_FAIL_MAKE_REQUEST */ 3117 #else /* CONFIG_FAIL_MAKE_REQUEST */
3115 3118
3116 static inline int should_fail_request(struct bio *bio) 3119 static inline int should_fail_request(struct bio *bio)
3117 { 3120 {
3118 return 0; 3121 return 0;
3119 } 3122 }
3120 3123
3121 #endif /* CONFIG_FAIL_MAKE_REQUEST */ 3124 #endif /* CONFIG_FAIL_MAKE_REQUEST */
3122 3125
3123 /* 3126 /*
3124 * Check whether this bio extends beyond the end of the device. 3127 * Check whether this bio extends beyond the end of the device.
3125 */ 3128 */
3126 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors) 3129 static inline int bio_check_eod(struct bio *bio, unsigned int nr_sectors)
3127 { 3130 {
3128 sector_t maxsector; 3131 sector_t maxsector;
3129 3132
3130 if (!nr_sectors) 3133 if (!nr_sectors)
3131 return 0; 3134 return 0;
3132 3135
3133 /* Test device or partition size, when known. */ 3136 /* Test device or partition size, when known. */
3134 maxsector = bio->bi_bdev->bd_inode->i_size >> 9; 3137 maxsector = bio->bi_bdev->bd_inode->i_size >> 9;
3135 if (maxsector) { 3138 if (maxsector) {
3136 sector_t sector = bio->bi_sector; 3139 sector_t sector = bio->bi_sector;
3137 3140
3138 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) { 3141 if (maxsector < nr_sectors || maxsector - nr_sectors < sector) {
3139 /* 3142 /*
3140 * This may well happen - the kernel calls bread() 3143 * This may well happen - the kernel calls bread()
3141 * without checking the size of the device, e.g., when 3144 * without checking the size of the device, e.g., when
3142 * mounting a device. 3145 * mounting a device.
3143 */ 3146 */
3144 handle_bad_sector(bio); 3147 handle_bad_sector(bio);
3145 return 1; 3148 return 1;
3146 } 3149 }
3147 } 3150 }
3148 3151
3149 return 0; 3152 return 0;
3150 } 3153 }
3151 3154
3152 /** 3155 /**
3153 * generic_make_request: hand a buffer to its device driver for I/O 3156 * generic_make_request: hand a buffer to its device driver for I/O
3154 * @bio: The bio describing the location in memory and on the device. 3157 * @bio: The bio describing the location in memory and on the device.
3155 * 3158 *
3156 * generic_make_request() is used to make I/O requests of block 3159 * generic_make_request() is used to make I/O requests of block
3157 * devices. It is passed a &struct bio, which describes the I/O that needs 3160 * devices. It is passed a &struct bio, which describes the I/O that needs
3158 * to be done. 3161 * to be done.
3159 * 3162 *
3160 * generic_make_request() does not return any status. The 3163 * generic_make_request() does not return any status. The
3161 * success/failure status of the request, along with notification of 3164 * success/failure status of the request, along with notification of
3162 * completion, is delivered asynchronously through the bio->bi_end_io 3165 * completion, is delivered asynchronously through the bio->bi_end_io
3163 * function described (one day) elsewhere. 3166 * function described (one day) elsewhere.
3164 * 3167 *
3165 * The caller of generic_make_request must make sure that bi_io_vec 3168 * The caller of generic_make_request must make sure that bi_io_vec
3166 * are set to describe the memory buffer, and that bi_dev and bi_sector are 3169 * are set to describe the memory buffer, and that bi_dev and bi_sector are
3167 * set to describe the device address, and the 3170 * set to describe the device address, and the
3168 * bi_end_io and optionally bi_private are set to describe how 3171 * bi_end_io and optionally bi_private are set to describe how
3169 * completion notification should be signaled. 3172 * completion notification should be signaled.
3170 * 3173 *
3171 * generic_make_request and the drivers it calls may use bi_next if this 3174 * generic_make_request and the drivers it calls may use bi_next if this
3172 * bio happens to be merged with someone else, and may change bi_dev and 3175 * bio happens to be merged with someone else, and may change bi_dev and
3173 * bi_sector for remaps as it sees fit. So the values of these fields 3176 * bi_sector for remaps as it sees fit. So the values of these fields
3174 * should NOT be depended on after the call to generic_make_request. 3177 * should NOT be depended on after the call to generic_make_request.
3175 */ 3178 */
3176 static inline void __generic_make_request(struct bio *bio) 3179 static inline void __generic_make_request(struct bio *bio)
3177 { 3180 {
3178 struct request_queue *q; 3181 struct request_queue *q;
3179 sector_t old_sector; 3182 sector_t old_sector;
3180 int ret, nr_sectors = bio_sectors(bio); 3183 int ret, nr_sectors = bio_sectors(bio);
3181 dev_t old_dev; 3184 dev_t old_dev;
3182 3185
3183 might_sleep(); 3186 might_sleep();
3184 3187
3185 if (bio_check_eod(bio, nr_sectors)) 3188 if (bio_check_eod(bio, nr_sectors))
3186 goto end_io; 3189 goto end_io;
3187 3190
3188 /* 3191 /*
3189 * Resolve the mapping until finished. (drivers are 3192 * Resolve the mapping until finished. (drivers are
3190 * still free to implement/resolve their own stacking 3193 * still free to implement/resolve their own stacking
3191 * by explicitly returning 0) 3194 * by explicitly returning 0)
3192 * 3195 *
3193 * NOTE: we don't repeat the blk_size check for each new device. 3196 * NOTE: we don't repeat the blk_size check for each new device.
3194 * Stacking drivers are expected to know what they are doing. 3197 * Stacking drivers are expected to know what they are doing.
3195 */ 3198 */
3196 old_sector = -1; 3199 old_sector = -1;
3197 old_dev = 0; 3200 old_dev = 0;
3198 do { 3201 do {
3199 char b[BDEVNAME_SIZE]; 3202 char b[BDEVNAME_SIZE];
3200 3203
3201 q = bdev_get_queue(bio->bi_bdev); 3204 q = bdev_get_queue(bio->bi_bdev);
3202 if (!q) { 3205 if (!q) {
3203 printk(KERN_ERR 3206 printk(KERN_ERR
3204 "generic_make_request: Trying to access " 3207 "generic_make_request: Trying to access "
3205 "nonexistent block-device %s (%Lu)\n", 3208 "nonexistent block-device %s (%Lu)\n",
3206 bdevname(bio->bi_bdev, b), 3209 bdevname(bio->bi_bdev, b),
3207 (long long) bio->bi_sector); 3210 (long long) bio->bi_sector);
3208 end_io: 3211 end_io:
3209 bio_endio(bio, -EIO); 3212 bio_endio(bio, -EIO);
3210 break; 3213 break;
3211 } 3214 }
3212 3215
3213 if (unlikely(nr_sectors > q->max_hw_sectors)) { 3216 if (unlikely(nr_sectors > q->max_hw_sectors)) {
3214 printk("bio too big device %s (%u > %u)\n", 3217 printk("bio too big device %s (%u > %u)\n",
3215 bdevname(bio->bi_bdev, b), 3218 bdevname(bio->bi_bdev, b),
3216 bio_sectors(bio), 3219 bio_sectors(bio),
3217 q->max_hw_sectors); 3220 q->max_hw_sectors);
3218 goto end_io; 3221 goto end_io;
3219 } 3222 }
3220 3223
3221 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) 3224 if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
3222 goto end_io; 3225 goto end_io;
3223 3226
3224 if (should_fail_request(bio)) 3227 if (should_fail_request(bio))
3225 goto end_io; 3228 goto end_io;
3226 3229
3227 /* 3230 /*
3228 * If this device has partitions, remap block n 3231 * If this device has partitions, remap block n
3229 * of partition p to block n+start(p) of the disk. 3232 * of partition p to block n+start(p) of the disk.
3230 */ 3233 */
3231 blk_partition_remap(bio); 3234 blk_partition_remap(bio);
3232 3235
3233 if (old_sector != -1) 3236 if (old_sector != -1)
3234 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector, 3237 blk_add_trace_remap(q, bio, old_dev, bio->bi_sector,
3235 old_sector); 3238 old_sector);
3236 3239
3237 blk_add_trace_bio(q, bio, BLK_TA_QUEUE); 3240 blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
3238 3241
3239 old_sector = bio->bi_sector; 3242 old_sector = bio->bi_sector;
3240 old_dev = bio->bi_bdev->bd_dev; 3243 old_dev = bio->bi_bdev->bd_dev;
3241 3244
3242 if (bio_check_eod(bio, nr_sectors)) 3245 if (bio_check_eod(bio, nr_sectors))
3243 goto end_io; 3246 goto end_io;
3244 3247
3245 ret = q->make_request_fn(q, bio); 3248 ret = q->make_request_fn(q, bio);
3246 } while (ret); 3249 } while (ret);
3247 } 3250 }
3248 3251
3249 /* 3252 /*
3250 * We only want one ->make_request_fn to be active at a time, 3253 * We only want one ->make_request_fn to be active at a time,
3251 * else stack usage with stacked devices could be a problem. 3254 * else stack usage with stacked devices could be a problem.
3252 * So use current->bio_{list,tail} to keep a list of requests 3255 * So use current->bio_{list,tail} to keep a list of requests
3253 * submitted by a make_request_fn function. 3256 * submitted by a make_request_fn function.
3254 * current->bio_tail is also used as a flag to say if 3257 * current->bio_tail is also used as a flag to say if
3255 * generic_make_request is currently active in this task or not. 3258 * generic_make_request is currently active in this task or not.
3256 * If it is NULL, then no make_request is active. If it is non-NULL, 3259 * If it is NULL, then no make_request is active. If it is non-NULL,
3257 * then a make_request is active, and new requests should be added 3260 * then a make_request is active, and new requests should be added
3258 * at the tail 3261 * at the tail
3259 */ 3262 */
3260 void generic_make_request(struct bio *bio) 3263 void generic_make_request(struct bio *bio)
3261 { 3264 {
3262 if (current->bio_tail) { 3265 if (current->bio_tail) {
3263 /* make_request is active */ 3266 /* make_request is active */
3264 *(current->bio_tail) = bio; 3267 *(current->bio_tail) = bio;
3265 bio->bi_next = NULL; 3268 bio->bi_next = NULL;
3266 current->bio_tail = &bio->bi_next; 3269 current->bio_tail = &bio->bi_next;
3267 return; 3270 return;
3268 } 3271 }
3269 /* following loop may be a bit non-obvious, and so deserves some 3272 /* following loop may be a bit non-obvious, and so deserves some
3270 * explanation. 3273 * explanation.
3271 * Before entering the loop, bio->bi_next is NULL (as all callers 3274 * Before entering the loop, bio->bi_next is NULL (as all callers
3272 * ensure that) so we have a list with a single bio. 3275 * ensure that) so we have a list with a single bio.
3273 * We pretend that we have just taken it off a longer list, so 3276 * We pretend that we have just taken it off a longer list, so
3274 * we assign bio_list to the next (which is NULL) and bio_tail 3277 * we assign bio_list to the next (which is NULL) and bio_tail
3275 * to &bio_list, thus initialising the bio_list of new bios to be 3278 * to &bio_list, thus initialising the bio_list of new bios to be
3276 * added. __generic_make_request may indeed add some more bios 3279 * added. __generic_make_request may indeed add some more bios
3277 * through a recursive call to generic_make_request. If it 3280 * through a recursive call to generic_make_request. If it
3278 * did, we find a non-NULL value in bio_list and re-enter the loop 3281 * did, we find a non-NULL value in bio_list and re-enter the loop
3279 * from the top. In this case we really did just take the bio 3282 * from the top. In this case we really did just take the bio
3280 * off the top of the list (no pretending) and so fixup bio_list and 3283 * off the top of the list (no pretending) and so fixup bio_list and
3281 * bio_tail or bi_next, and call into __generic_make_request again. 3284 * bio_tail or bi_next, and call into __generic_make_request again.
3282 * 3285 *
3283 * The loop was structured like this to make only one call to 3286 * The loop was structured like this to make only one call to
3284 * __generic_make_request (which is important as it is large and 3287 * __generic_make_request (which is important as it is large and
3285 * inlined) and to keep the structure simple. 3288 * inlined) and to keep the structure simple.
3286 */ 3289 */
3287 BUG_ON(bio->bi_next); 3290 BUG_ON(bio->bi_next);
3288 do { 3291 do {
3289 current->bio_list = bio->bi_next; 3292 current->bio_list = bio->bi_next;
3290 if (bio->bi_next == NULL) 3293 if (bio->bi_next == NULL)
3291 current->bio_tail = &current->bio_list; 3294 current->bio_tail = &current->bio_list;
3292 else 3295 else
3293 bio->bi_next = NULL; 3296 bio->bi_next = NULL;
3294 __generic_make_request(bio); 3297 __generic_make_request(bio);
3295 bio = current->bio_list; 3298 bio = current->bio_list;
3296 } while (bio); 3299 } while (bio);
3297 current->bio_tail = NULL; /* deactivate */ 3300 current->bio_tail = NULL; /* deactivate */
3298 } 3301 }
3299 3302
3300 EXPORT_SYMBOL(generic_make_request); 3303 EXPORT_SYMBOL(generic_make_request);
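Editor's note: for stacked devices the comment above is the whole point: a make_request_fn may itself call generic_make_request(), and the bio simply joins current->bio_list instead of growing the stack. A minimal sketch of such a remapping driver, with a hypothetical struct my_target standing in for whatever per-device state a real driver keeps.

struct my_target {
	struct block_device *backing_bdev;
	sector_t start_sect;
};

/* Illustrative stacking driver: remap the bio and resubmit it */
static int my_stacked_make_request(struct request_queue *q, struct bio *bio)
{
	struct my_target *t = q->queuedata;

	bio->bi_bdev = t->backing_bdev;
	bio->bi_sector += t->start_sect;

	/*
	 * No deep recursion here: generic_make_request() is already active
	 * for this task, so the bio is queued on current->bio_list and picked
	 * up by the loop above once this function returns.
	 */
	generic_make_request(bio);
	return 0;
}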
3301 3304
3302 /** 3305 /**
3303 * submit_bio: submit a bio to the block device layer for I/O 3306 * submit_bio: submit a bio to the block device layer for I/O
3304 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead) 3307 * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
3305 * @bio: The &struct bio which describes the I/O 3308 * @bio: The &struct bio which describes the I/O
3306 * 3309 *
3307 * submit_bio() is very similar in purpose to generic_make_request(), and 3310 * submit_bio() is very similar in purpose to generic_make_request(), and
3308 * uses that function to do most of the work. Both are fairly rough 3311 * uses that function to do most of the work. Both are fairly rough
3309 * interfaces; @bio must be set up and ready for I/O. 3312 * interfaces; @bio must be set up and ready for I/O.
3310 * 3313 *
3311 */ 3314 */
3312 void submit_bio(int rw, struct bio *bio) 3315 void submit_bio(int rw, struct bio *bio)
3313 { 3316 {
3314 int count = bio_sectors(bio); 3317 int count = bio_sectors(bio);
3315 3318
3316 BIO_BUG_ON(!bio->bi_size);
3317 BIO_BUG_ON(!bio->bi_io_vec);
3318 bio->bi_rw |= rw; 3319 bio->bi_rw |= rw;
3319 if (rw & WRITE) {
3320 count_vm_events(PGPGOUT, count);
3321 } else {
3322 task_io_account_read(bio->bi_size);
3323 count_vm_events(PGPGIN, count);
3324 }
3325 3320
3326 if (unlikely(block_dump)) { 3321 /*
3327 char b[BDEVNAME_SIZE]; 3322 * If it's a regular read/write or a barrier with data attached,
3328 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n", 3323 * go through the normal accounting stuff before submission.
3329 current->comm, current->pid, 3324 */
3330 (rw & WRITE) ? "WRITE" : "READ", 3325 if (!bio_empty_barrier(bio)) {
3331 (unsigned long long)bio->bi_sector, 3326
3332 bdevname(bio->bi_bdev,b)); 3327 BIO_BUG_ON(!bio->bi_size);
3328 BIO_BUG_ON(!bio->bi_io_vec);
3329
3330 if (rw & WRITE) {
3331 count_vm_events(PGPGOUT, count);
3332 } else {
3333 task_io_account_read(bio->bi_size);
3334 count_vm_events(PGPGIN, count);
3335 }
3336
3337 if (unlikely(block_dump)) {
3338 char b[BDEVNAME_SIZE];
3339 printk(KERN_DEBUG "%s(%d): %s block %Lu on %s\n",
3340 current->comm, current->pid,
3341 (rw & WRITE) ? "WRITE" : "READ",
3342 (unsigned long long)bio->bi_sector,
3343 bdevname(bio->bi_bdev,b));
3344 }
3333 } 3345 }
3334 3346
3335 generic_make_request(bio); 3347 generic_make_request(bio);
3336 } 3348 }
3337 3349
3338 EXPORT_SYMBOL(submit_bio); 3350 EXPORT_SYMBOL(submit_bio);
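Editor's note: with the empty-barrier path above, a cache flush can be pushed down the stack as a bio that has the barrier bit set and no data at all. A sketch of how a caller might do that; the helper and end_io names are invented, and the BIO_UPTODATE/bi_sector handling follows the convention this patch establishes in the completion path further down.

#include <linux/bio.h>
#include <linux/completion.h>

static void my_empty_barrier_end_io(struct bio *bio, int err)
{
	complete((struct completion *)bio->bi_private);
}

/* Illustrative: flush @bdev by submitting a data-less barrier */
static int my_issue_empty_barrier(struct block_device *bdev)
{
	DECLARE_COMPLETION_ONSTACK(wait);
	struct bio *bio;
	int ret = 0;

	bio = bio_alloc(GFP_KERNEL, 0);		/* zero bio_vecs: no payload */
	bio->bi_end_io = my_empty_barrier_end_io;
	bio->bi_private = &wait;
	bio->bi_bdev = bdev;

	submit_bio(1 << BIO_RW_BARRIER, bio);
	wait_for_completion(&wait);

	if (!bio_flagged(bio, BIO_UPTODATE))
		ret = -EIO;	/* bio->bi_sector may now hold the error location */

	bio_put(bio);
	return ret;
}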
3339 3351
3340 static void blk_recalc_rq_sectors(struct request *rq, int nsect) 3352 static void blk_recalc_rq_sectors(struct request *rq, int nsect)
3341 { 3353 {
3342 if (blk_fs_request(rq)) { 3354 if (blk_fs_request(rq)) {
3343 rq->hard_sector += nsect; 3355 rq->hard_sector += nsect;
3344 rq->hard_nr_sectors -= nsect; 3356 rq->hard_nr_sectors -= nsect;
3345 3357
3346 /* 3358 /*
3347 * Move the I/O submission pointers ahead if required. 3359 * Move the I/O submission pointers ahead if required.
3348 */ 3360 */
3349 if ((rq->nr_sectors >= rq->hard_nr_sectors) && 3361 if ((rq->nr_sectors >= rq->hard_nr_sectors) &&
3350 (rq->sector <= rq->hard_sector)) { 3362 (rq->sector <= rq->hard_sector)) {
3351 rq->sector = rq->hard_sector; 3363 rq->sector = rq->hard_sector;
3352 rq->nr_sectors = rq->hard_nr_sectors; 3364 rq->nr_sectors = rq->hard_nr_sectors;
3353 rq->hard_cur_sectors = bio_cur_sectors(rq->bio); 3365 rq->hard_cur_sectors = bio_cur_sectors(rq->bio);
3354 rq->current_nr_sectors = rq->hard_cur_sectors; 3366 rq->current_nr_sectors = rq->hard_cur_sectors;
3355 rq->buffer = bio_data(rq->bio); 3367 rq->buffer = bio_data(rq->bio);
3356 } 3368 }
3357 3369
3358 /* 3370 /*
3359 * if total number of sectors is less than the first segment 3371 * if total number of sectors is less than the first segment
3360 * size, something has gone terribly wrong 3372 * size, something has gone terribly wrong
3361 */ 3373 */
3362 if (rq->nr_sectors < rq->current_nr_sectors) { 3374 if (rq->nr_sectors < rq->current_nr_sectors) {
3363 printk("blk: request botched\n"); 3375 printk("blk: request botched\n");
3364 rq->nr_sectors = rq->current_nr_sectors; 3376 rq->nr_sectors = rq->current_nr_sectors;
3365 } 3377 }
3366 } 3378 }
3367 } 3379 }
3368 3380
3369 static int __end_that_request_first(struct request *req, int uptodate, 3381 static int __end_that_request_first(struct request *req, int uptodate,
3370 int nr_bytes) 3382 int nr_bytes)
3371 { 3383 {
3372 int total_bytes, bio_nbytes, error, next_idx = 0; 3384 int total_bytes, bio_nbytes, error, next_idx = 0;
3373 struct bio *bio; 3385 struct bio *bio;
3374 3386
3375 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE); 3387 blk_add_trace_rq(req->q, req, BLK_TA_COMPLETE);
3376 3388
3377 /* 3389 /*
3378 * extend uptodate bool to allow < 0 value to be direct io error 3390 * extend uptodate bool to allow < 0 value to be direct io error
3379 */ 3391 */
3380 error = 0; 3392 error = 0;
3381 if (end_io_error(uptodate)) 3393 if (end_io_error(uptodate))
3382 error = !uptodate ? -EIO : uptodate; 3394 error = !uptodate ? -EIO : uptodate;
3383 3395
3384 /* 3396 /*
3385 * for a REQ_BLOCK_PC request, we want to carry any eventual 3397 * for a REQ_BLOCK_PC request, we want to carry any eventual
3386 * sense key with us all the way through 3398 * sense key with us all the way through
3387 */ 3399 */
3388 if (!blk_pc_request(req)) 3400 if (!blk_pc_request(req))
3389 req->errors = 0; 3401 req->errors = 0;
3390 3402
3391 if (!uptodate) { 3403 if (!uptodate) {
3392 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET)) 3404 if (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))
3393 printk("end_request: I/O error, dev %s, sector %llu\n", 3405 printk("end_request: I/O error, dev %s, sector %llu\n",
3394 req->rq_disk ? req->rq_disk->disk_name : "?", 3406 req->rq_disk ? req->rq_disk->disk_name : "?",
3395 (unsigned long long)req->sector); 3407 (unsigned long long)req->sector);
3396 } 3408 }
3397 3409
3398 if (blk_fs_request(req) && req->rq_disk) { 3410 if (blk_fs_request(req) && req->rq_disk) {
3399 const int rw = rq_data_dir(req); 3411 const int rw = rq_data_dir(req);
3400 3412
3401 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9); 3413 disk_stat_add(req->rq_disk, sectors[rw], nr_bytes >> 9);
3402 } 3414 }
3403 3415
3404 total_bytes = bio_nbytes = 0; 3416 total_bytes = bio_nbytes = 0;
3405 while ((bio = req->bio) != NULL) { 3417 while ((bio = req->bio) != NULL) {
3406 int nbytes; 3418 int nbytes;
3419
3420 /*
3421 * For an empty barrier request, the low level driver must
3422 * store a potential error location in ->sector. We pass
3423 * that back up in ->bi_sector.
3424 */
3425 if (blk_empty_barrier(req))
3426 bio->bi_sector = req->sector;
3407 3427
3408 if (nr_bytes >= bio->bi_size) { 3428 if (nr_bytes >= bio->bi_size) {
3409 req->bio = bio->bi_next; 3429 req->bio = bio->bi_next;
3410 nbytes = bio->bi_size; 3430 nbytes = bio->bi_size;
3411 req_bio_endio(req, bio, nbytes, error); 3431 req_bio_endio(req, bio, nbytes, error);
3412 next_idx = 0; 3432 next_idx = 0;
3413 bio_nbytes = 0; 3433 bio_nbytes = 0;
3414 } else { 3434 } else {
3415 int idx = bio->bi_idx + next_idx; 3435 int idx = bio->bi_idx + next_idx;
3416 3436
3417 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) { 3437 if (unlikely(bio->bi_idx >= bio->bi_vcnt)) {
3418 blk_dump_rq_flags(req, "__end_that"); 3438 blk_dump_rq_flags(req, "__end_that");
3419 printk("%s: bio idx %d >= vcnt %d\n", 3439 printk("%s: bio idx %d >= vcnt %d\n",
3420 __FUNCTION__, 3440 __FUNCTION__,
3421 bio->bi_idx, bio->bi_vcnt); 3441 bio->bi_idx, bio->bi_vcnt);
3422 break; 3442 break;
3423 } 3443 }
3424 3444
3425 nbytes = bio_iovec_idx(bio, idx)->bv_len; 3445 nbytes = bio_iovec_idx(bio, idx)->bv_len;
3426 BIO_BUG_ON(nbytes > bio->bi_size); 3446 BIO_BUG_ON(nbytes > bio->bi_size);
3427 3447
3428 /* 3448 /*
3429 * not a complete bvec done 3449 * not a complete bvec done
3430 */ 3450 */
3431 if (unlikely(nbytes > nr_bytes)) { 3451 if (unlikely(nbytes > nr_bytes)) {
3432 bio_nbytes += nr_bytes; 3452 bio_nbytes += nr_bytes;
3433 total_bytes += nr_bytes; 3453 total_bytes += nr_bytes;
3434 break; 3454 break;
3435 } 3455 }
3436 3456
3437 /* 3457 /*
3438 * advance to the next vector 3458 * advance to the next vector
3439 */ 3459 */
3440 next_idx++; 3460 next_idx++;
3441 bio_nbytes += nbytes; 3461 bio_nbytes += nbytes;
3442 } 3462 }
3443 3463
3444 total_bytes += nbytes; 3464 total_bytes += nbytes;
3445 nr_bytes -= nbytes; 3465 nr_bytes -= nbytes;
3446 3466
3447 if ((bio = req->bio)) { 3467 if ((bio = req->bio)) {
3448 /* 3468 /*
3449 * end more in this run, or just return 'not-done' 3469 * end more in this run, or just return 'not-done'
3450 */ 3470 */
3451 if (unlikely(nr_bytes <= 0)) 3471 if (unlikely(nr_bytes <= 0))
3452 break; 3472 break;
3453 } 3473 }
3454 } 3474 }
3455 3475
3456 /* 3476 /*
3457 * completely done 3477 * completely done
3458 */ 3478 */
3459 if (!req->bio) 3479 if (!req->bio)
3460 return 0; 3480 return 0;
3461 3481
3462 /* 3482 /*
3463 * if the request wasn't completed, update state 3483 * if the request wasn't completed, update state
3464 */ 3484 */
3465 if (bio_nbytes) { 3485 if (bio_nbytes) {
3466 req_bio_endio(req, bio, bio_nbytes, error); 3486 req_bio_endio(req, bio, bio_nbytes, error);
3467 bio->bi_idx += next_idx; 3487 bio->bi_idx += next_idx;
3468 bio_iovec(bio)->bv_offset += nr_bytes; 3488 bio_iovec(bio)->bv_offset += nr_bytes;
3469 bio_iovec(bio)->bv_len -= nr_bytes; 3489 bio_iovec(bio)->bv_len -= nr_bytes;
3470 } 3490 }
3471 3491
3472 blk_recalc_rq_sectors(req, total_bytes >> 9); 3492 blk_recalc_rq_sectors(req, total_bytes >> 9);
3473 blk_recalc_rq_segments(req); 3493 blk_recalc_rq_segments(req);
3474 return 1; 3494 return 1;
3475 } 3495 }
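Editor's note: the empty-barrier hunk above is the completion half of the error reporting: whatever a driver left in req->sector is copied into bio->bi_sector, so the submitter can see roughly where a failed flush went wrong. A hedged sketch of the driver side; the function name is illustrative, and end_queued_request() (defined below) expects the queue lock to be held.

/* Illustrative: fail an empty barrier and report the offending sector */
static void my_fail_empty_barrier(struct request *rq, sector_t bad_sector)
{
	if (blk_empty_barrier(rq))
		rq->sector = bad_sector;	/* surfaces as bio->bi_sector above */

	end_queued_request(rq, -EIO);		/* queue lock held by the caller */
}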
3476 3496
3477 /** 3497 /**
3478 * end_that_request_first - end I/O on a request 3498 * end_that_request_first - end I/O on a request
3479 * @req: the request being processed 3499 * @req: the request being processed
3480 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3500 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3481 * @nr_sectors: number of sectors to end I/O on 3501 * @nr_sectors: number of sectors to end I/O on
3482 * 3502 *
3483 * Description: 3503 * Description:
3484 * Ends I/O on a number of sectors attached to @req, and sets it up 3504 * Ends I/O on a number of sectors attached to @req, and sets it up
3485 * for the next range of segments (if any) in the cluster. 3505 * for the next range of segments (if any) in the cluster.
3486 * 3506 *
3487 * Return: 3507 * Return:
3488 * 0 - we are done with this request, call end_that_request_last() 3508 * 0 - we are done with this request, call end_that_request_last()
3489 * 1 - still buffers pending for this request 3509 * 1 - still buffers pending for this request
3490 **/ 3510 **/
3491 int end_that_request_first(struct request *req, int uptodate, int nr_sectors) 3511 int end_that_request_first(struct request *req, int uptodate, int nr_sectors)
3492 { 3512 {
3493 return __end_that_request_first(req, uptodate, nr_sectors << 9); 3513 return __end_that_request_first(req, uptodate, nr_sectors << 9);
3494 } 3514 }
3495 3515
3496 EXPORT_SYMBOL(end_that_request_first); 3516 EXPORT_SYMBOL(end_that_request_first);
3497 3517
3498 /** 3518 /**
3499 * end_that_request_chunk - end I/O on a request 3519 * end_that_request_chunk - end I/O on a request
3500 * @req: the request being processed 3520 * @req: the request being processed
3501 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error 3521 * @uptodate: 1 for success, 0 for I/O error, < 0 for specific error
3502 * @nr_bytes: number of bytes to complete 3522 * @nr_bytes: number of bytes to complete
3503 * 3523 *
3504 * Description: 3524 * Description:
3505 * Ends I/O on a number of bytes attached to @req, and sets it up 3525 * Ends I/O on a number of bytes attached to @req, and sets it up
3506 * for the next range of segments (if any). Like end_that_request_first(), 3526 * for the next range of segments (if any). Like end_that_request_first(),
3507 * but deals with bytes instead of sectors. 3527 * but deals with bytes instead of sectors.
3508 * 3528 *
3509 * Return: 3529 * Return:
3510 * 0 - we are done with this request, call end_that_request_last() 3530 * 0 - we are done with this request, call end_that_request_last()
3511 * 1 - still buffers pending for this request 3531 * 1 - still buffers pending for this request
3512 **/ 3532 **/
3513 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes) 3533 int end_that_request_chunk(struct request *req, int uptodate, int nr_bytes)
3514 { 3534 {
3515 return __end_that_request_first(req, uptodate, nr_bytes); 3535 return __end_that_request_first(req, uptodate, nr_bytes);
3516 } 3536 }
3517 3537
3518 EXPORT_SYMBOL(end_that_request_chunk); 3538 EXPORT_SYMBOL(end_that_request_chunk);
3519 3539
3520 /* 3540 /*
3521 * splice the completion data to a local structure and hand off to 3541 * splice the completion data to a local structure and hand off to
3522 * process_completion_queue() to complete the requests 3542 * process_completion_queue() to complete the requests
3523 */ 3543 */
3524 static void blk_done_softirq(struct softirq_action *h) 3544 static void blk_done_softirq(struct softirq_action *h)
3525 { 3545 {
3526 struct list_head *cpu_list, local_list; 3546 struct list_head *cpu_list, local_list;
3527 3547
3528 local_irq_disable(); 3548 local_irq_disable();
3529 cpu_list = &__get_cpu_var(blk_cpu_done); 3549 cpu_list = &__get_cpu_var(blk_cpu_done);
3530 list_replace_init(cpu_list, &local_list); 3550 list_replace_init(cpu_list, &local_list);
3531 local_irq_enable(); 3551 local_irq_enable();
3532 3552
3533 while (!list_empty(&local_list)) { 3553 while (!list_empty(&local_list)) {
3534 struct request *rq = list_entry(local_list.next, struct request, donelist); 3554 struct request *rq = list_entry(local_list.next, struct request, donelist);
3535 3555
3536 list_del_init(&rq->donelist); 3556 list_del_init(&rq->donelist);
3537 rq->q->softirq_done_fn(rq); 3557 rq->q->softirq_done_fn(rq);
3538 } 3558 }
3539 } 3559 }
3540 3560
3541 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action, 3561 static int __cpuinit blk_cpu_notify(struct notifier_block *self, unsigned long action,
3542 void *hcpu) 3562 void *hcpu)
3543 { 3563 {
3544 /* 3564 /*
3545 * If a CPU goes away, splice its entries to the current CPU 3565 * If a CPU goes away, splice its entries to the current CPU
3546 * and trigger a run of the softirq 3566 * and trigger a run of the softirq
3547 */ 3567 */
3548 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { 3568 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
3549 int cpu = (unsigned long) hcpu; 3569 int cpu = (unsigned long) hcpu;
3550 3570
3551 local_irq_disable(); 3571 local_irq_disable();
3552 list_splice_init(&per_cpu(blk_cpu_done, cpu), 3572 list_splice_init(&per_cpu(blk_cpu_done, cpu),
3553 &__get_cpu_var(blk_cpu_done)); 3573 &__get_cpu_var(blk_cpu_done));
3554 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3574 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3555 local_irq_enable(); 3575 local_irq_enable();
3556 } 3576 }
3557 3577
3558 return NOTIFY_OK; 3578 return NOTIFY_OK;
3559 } 3579 }
3560 3580
3561 3581
3562 static struct notifier_block blk_cpu_notifier __cpuinitdata = { 3582 static struct notifier_block blk_cpu_notifier __cpuinitdata = {
3563 .notifier_call = blk_cpu_notify, 3583 .notifier_call = blk_cpu_notify,
3564 }; 3584 };
3565 3585
3566 /** 3586 /**
3567 * blk_complete_request - end I/O on a request 3587 * blk_complete_request - end I/O on a request
3568 * @req: the request being processed 3588 * @req: the request being processed
3569 * 3589 *
3570 * Description: 3590 * Description:
3571 * Ends all I/O on a request. It does not handle partial completions, 3591 * Ends all I/O on a request. It does not handle partial completions,
3572 * unless the driver actually implements this in its completion callback 3592 * unless the driver actually implements this in its completion callback
3573 * through requeueing. The actual completion happens out-of-order, 3593 * through requeueing. The actual completion happens out-of-order,
3574 * through a softirq handler. The user must have registered a completion 3594 * through a softirq handler. The user must have registered a completion
3575 * callback through blk_queue_softirq_done(). 3595 * callback through blk_queue_softirq_done().
3576 **/ 3596 **/
3577 3597
3578 void blk_complete_request(struct request *req) 3598 void blk_complete_request(struct request *req)
3579 { 3599 {
3580 struct list_head *cpu_list; 3600 struct list_head *cpu_list;
3581 unsigned long flags; 3601 unsigned long flags;
3582 3602
3583 BUG_ON(!req->q->softirq_done_fn); 3603 BUG_ON(!req->q->softirq_done_fn);
3584 3604
3585 local_irq_save(flags); 3605 local_irq_save(flags);
3586 3606
3587 cpu_list = &__get_cpu_var(blk_cpu_done); 3607 cpu_list = &__get_cpu_var(blk_cpu_done);
3588 list_add_tail(&req->donelist, cpu_list); 3608 list_add_tail(&req->donelist, cpu_list);
3589 raise_softirq_irqoff(BLOCK_SOFTIRQ); 3609 raise_softirq_irqoff(BLOCK_SOFTIRQ);
3590 3610
3591 local_irq_restore(flags); 3611 local_irq_restore(flags);
3592 } 3612 }
3593 3613
3594 EXPORT_SYMBOL(blk_complete_request); 3614 EXPORT_SYMBOL(blk_complete_request);
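Editor's note: a sketch of how a driver ties into this softirq completion path; my_softirq_done/my_register_softirq_done are invented names, the byte count mirrors rq_byte_size() further down, and partial-completion/requeue handling is omitted.

#include <linux/blkdev.h>

/* Illustrative: completion work deferred from hard irq to BLOCK_SOFTIRQ */
static void my_softirq_done(struct request *rq)
{
	struct request_queue *q = rq->q;
	unsigned int nbytes = blk_fs_request(rq) ?
				rq->hard_nr_sectors << 9 : rq->data_len;
	unsigned long flags;

	if (!end_that_request_chunk(rq, 1, nbytes)) {
		spin_lock_irqsave(q->queue_lock, flags);
		end_that_request_last(rq, 1);
		spin_unlock_irqrestore(q->queue_lock, flags);
	}
}

static void my_register_softirq_done(struct request_queue *q)
{
	blk_queue_softirq_done(q, my_softirq_done);
}

/*
 * The interrupt handler then only calls blk_complete_request(rq) for a
 * request the hardware has finished and the driver has already dequeued.
 */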
3595 3615
3596 /* 3616 /*
3597 * queue lock must be held 3617 * queue lock must be held
3598 */ 3618 */
3599 void end_that_request_last(struct request *req, int uptodate) 3619 void end_that_request_last(struct request *req, int uptodate)
3600 { 3620 {
3601 struct gendisk *disk = req->rq_disk; 3621 struct gendisk *disk = req->rq_disk;
3602 int error; 3622 int error;
3603 3623
3604 /* 3624 /*
3605 * extend uptodate bool to allow < 0 value to be direct io error 3625 * extend uptodate bool to allow < 0 value to be direct io error
3606 */ 3626 */
3607 error = 0; 3627 error = 0;
3608 if (end_io_error(uptodate)) 3628 if (end_io_error(uptodate))
3609 error = !uptodate ? -EIO : uptodate; 3629 error = !uptodate ? -EIO : uptodate;
3610 3630
3611 if (unlikely(laptop_mode) && blk_fs_request(req)) 3631 if (unlikely(laptop_mode) && blk_fs_request(req))
3612 laptop_io_completion(); 3632 laptop_io_completion();
3613 3633
3614 /* 3634 /*
3615 * Account IO completion. bar_rq isn't accounted as a normal 3635 * Account IO completion. bar_rq isn't accounted as a normal
3616 * IO on queueing nor completion. Accounting the containing 3636 * IO on queueing nor completion. Accounting the containing
3617 * request is enough. 3637 * request is enough.
3618 */ 3638 */
3619 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) { 3639 if (disk && blk_fs_request(req) && req != &req->q->bar_rq) {
3620 unsigned long duration = jiffies - req->start_time; 3640 unsigned long duration = jiffies - req->start_time;
3621 const int rw = rq_data_dir(req); 3641 const int rw = rq_data_dir(req);
3622 3642
3623 __disk_stat_inc(disk, ios[rw]); 3643 __disk_stat_inc(disk, ios[rw]);
3624 __disk_stat_add(disk, ticks[rw], duration); 3644 __disk_stat_add(disk, ticks[rw], duration);
3625 disk_round_stats(disk); 3645 disk_round_stats(disk);
3626 disk->in_flight--; 3646 disk->in_flight--;
3627 } 3647 }
3628 if (req->end_io) 3648 if (req->end_io)
3629 req->end_io(req, error); 3649 req->end_io(req, error);
3630 else 3650 else
3631 __blk_put_request(req->q, req); 3651 __blk_put_request(req->q, req);
3632 } 3652 }
3633 3653
3634 EXPORT_SYMBOL(end_that_request_last); 3654 EXPORT_SYMBOL(end_that_request_last);
3635 3655
3636 static inline void __end_request(struct request *rq, int uptodate, 3656 static inline void __end_request(struct request *rq, int uptodate,
3637 unsigned int nr_bytes, int dequeue) 3657 unsigned int nr_bytes, int dequeue)
3638 { 3658 {
3639 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) { 3659 if (!end_that_request_chunk(rq, uptodate, nr_bytes)) {
3640 if (dequeue) 3660 if (dequeue)
3641 blkdev_dequeue_request(rq); 3661 blkdev_dequeue_request(rq);
3642 add_disk_randomness(rq->rq_disk); 3662 add_disk_randomness(rq->rq_disk);
3643 end_that_request_last(rq, uptodate); 3663 end_that_request_last(rq, uptodate);
3644 } 3664 }
3645 } 3665 }
3646 3666
3647 static unsigned int rq_byte_size(struct request *rq) 3667 static unsigned int rq_byte_size(struct request *rq)
3648 { 3668 {
3649 if (blk_fs_request(rq)) 3669 if (blk_fs_request(rq))
3650 return rq->hard_nr_sectors << 9; 3670 return rq->hard_nr_sectors << 9;
3651 3671
3652 return rq->data_len; 3672 return rq->data_len;
3653 } 3673 }
3654 3674
3655 /** 3675 /**
3656 * end_queued_request - end all I/O on a queued request 3676 * end_queued_request - end all I/O on a queued request
3657 * @rq: the request being processed 3677 * @rq: the request being processed
3658 * @uptodate: error value or 0/1 uptodate flag 3678 * @uptodate: error value or 0/1 uptodate flag
3659 * 3679 *
3660 * Description: 3680 * Description:
3661 * Ends all I/O on a request, and removes it from the block layer queues. 3681 * Ends all I/O on a request, and removes it from the block layer queues.
3662 * Not suitable for normal IO completion, unless the driver still has 3682 * Not suitable for normal IO completion, unless the driver still has
3663 * the request attached to the block layer. 3683 * the request attached to the block layer.
3664 * 3684 *
3665 **/ 3685 **/
3666 void end_queued_request(struct request *rq, int uptodate) 3686 void end_queued_request(struct request *rq, int uptodate)
3667 { 3687 {
3668 __end_request(rq, uptodate, rq_byte_size(rq), 1); 3688 __end_request(rq, uptodate, rq_byte_size(rq), 1);
3669 } 3689 }
3670 EXPORT_SYMBOL(end_queued_request); 3690 EXPORT_SYMBOL(end_queued_request);
3671 3691
3672 /** 3692 /**
3673 * end_dequeued_request - end all I/O on a dequeued request 3693 * end_dequeued_request - end all I/O on a dequeued request
3674 * @rq: the request being processed 3694 * @rq: the request being processed
3675 * @uptodate: error value or 0/1 uptodate flag 3695 * @uptodate: error value or 0/1 uptodate flag
3676 * 3696 *
3677 * Description: 3697 * Description:
3678 * Ends all I/O on a request. The request must already have been 3698 * Ends all I/O on a request. The request must already have been
3679 * dequeued using blkdev_dequeue_request(), as is normally the case 3699 * dequeued using blkdev_dequeue_request(), as is normally the case
3680 * for most drivers. 3700 * for most drivers.
3681 * 3701 *
3682 **/ 3702 **/
3683 void end_dequeued_request(struct request *rq, int uptodate) 3703 void end_dequeued_request(struct request *rq, int uptodate)
3684 { 3704 {
3685 __end_request(rq, uptodate, rq_byte_size(rq), 0); 3705 __end_request(rq, uptodate, rq_byte_size(rq), 0);
3686 } 3706 }
3687 EXPORT_SYMBOL(end_dequeued_request); 3707 EXPORT_SYMBOL(end_dequeued_request);
3688 3708
3689 3709
3690 /** 3710 /**
3691 * end_request - end I/O on the current segment of the request 3711 * end_request - end I/O on the current segment of the request
3692 * @rq: the request being processed 3712 * @rq: the request being processed
3693 * @uptodate: error value or 0/1 uptodate flag 3713 * @uptodate: error value or 0/1 uptodate flag
3694 * 3714 *
3695 * Description: 3715 * Description:
3696 * Ends I/O on the current segment of a request. If that is the only 3716 * Ends I/O on the current segment of a request. If that is the only
3697 * remaining segment, the request is also completed and freed. 3717 * remaining segment, the request is also completed and freed.
3698 * 3718 *
3699 * This is a remnant of how older block drivers handled IO completions. 3719 * This is a remnant of how older block drivers handled IO completions.
3700 * Modern drivers typically end IO on the full request in one go, unless 3720 * Modern drivers typically end IO on the full request in one go, unless
3701 * they have a residual value to account for. For that case this function 3721 * they have a residual value to account for. For that case this function
3702 * isn't really useful, unless the residual just happens to be the 3722 * isn't really useful, unless the residual just happens to be the
3703 * full current segment. In other words, don't use this function in new 3723 * full current segment. In other words, don't use this function in new
3704 * code. Either use end_request_completely(), or the 3724 * code. Either use end_request_completely(), or the
3705 * end_that_request_chunk() (along with end_that_request_last()) for 3725 * end_that_request_chunk() (along with end_that_request_last()) for
3706 * partial completions. 3726 * partial completions.
3707 * 3727 *
3708 **/ 3728 **/
3709 void end_request(struct request *req, int uptodate) 3729 void end_request(struct request *req, int uptodate)
3710 { 3730 {
3711 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1); 3731 __end_request(req, uptodate, req->hard_cur_sectors << 9, 1);
3712 } 3732 }
3713 EXPORT_SYMBOL(end_request); 3733 EXPORT_SYMBOL(end_request);
3714 3734
3715 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq, 3735 static void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
3716 struct bio *bio) 3736 struct bio *bio)
3717 { 3737 {
3718 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */ 3738 /* first two bits are identical in rq->cmd_flags and bio->bi_rw */
3719 rq->cmd_flags |= (bio->bi_rw & 3); 3739 rq->cmd_flags |= (bio->bi_rw & 3);
3720 3740
3721 rq->nr_phys_segments = bio_phys_segments(q, bio); 3741 rq->nr_phys_segments = bio_phys_segments(q, bio);
3722 rq->nr_hw_segments = bio_hw_segments(q, bio); 3742 rq->nr_hw_segments = bio_hw_segments(q, bio);
3723 rq->current_nr_sectors = bio_cur_sectors(bio); 3743 rq->current_nr_sectors = bio_cur_sectors(bio);
3724 rq->hard_cur_sectors = rq->current_nr_sectors; 3744 rq->hard_cur_sectors = rq->current_nr_sectors;
3725 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); 3745 rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio);
3726 rq->buffer = bio_data(bio); 3746 rq->buffer = bio_data(bio);
3727 rq->data_len = bio->bi_size; 3747 rq->data_len = bio->bi_size;
3728 3748
3729 rq->bio = rq->biotail = bio; 3749 rq->bio = rq->biotail = bio;
3730 3750
3731 if (bio->bi_bdev) 3751 if (bio->bi_bdev)
3732 rq->rq_disk = bio->bi_bdev->bd_disk; 3752 rq->rq_disk = bio->bi_bdev->bd_disk;
3733 } 3753 }
3734 3754
3735 int kblockd_schedule_work(struct work_struct *work) 3755 int kblockd_schedule_work(struct work_struct *work)
3736 { 3756 {
3737 return queue_work(kblockd_workqueue, work); 3757 return queue_work(kblockd_workqueue, work);
3738 } 3758 }
3739 3759
3740 EXPORT_SYMBOL(kblockd_schedule_work); 3760 EXPORT_SYMBOL(kblockd_schedule_work);
3741 3761
3742 void kblockd_flush_work(struct work_struct *work) 3762 void kblockd_flush_work(struct work_struct *work)
3743 { 3763 {
3744 cancel_work_sync(work); 3764 cancel_work_sync(work);
3745 } 3765 }
3746 EXPORT_SYMBOL(kblockd_flush_work); 3766 EXPORT_SYMBOL(kblockd_flush_work);
3747 3767
3748 int __init blk_dev_init(void) 3768 int __init blk_dev_init(void)
3749 { 3769 {
3750 int i; 3770 int i;
3751 3771
3752 kblockd_workqueue = create_workqueue("kblockd"); 3772 kblockd_workqueue = create_workqueue("kblockd");
3753 if (!kblockd_workqueue) 3773 if (!kblockd_workqueue)
3754 panic("Failed to create kblockd\n"); 3774 panic("Failed to create kblockd\n");
3755 3775
3756 request_cachep = kmem_cache_create("blkdev_requests", 3776 request_cachep = kmem_cache_create("blkdev_requests",
3757 sizeof(struct request), 0, SLAB_PANIC, NULL); 3777 sizeof(struct request), 0, SLAB_PANIC, NULL);
3758 3778
3759 requestq_cachep = kmem_cache_create("blkdev_queue", 3779 requestq_cachep = kmem_cache_create("blkdev_queue",
3760 sizeof(struct request_queue), 0, SLAB_PANIC, NULL); 3780 sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
3761 3781
3762 iocontext_cachep = kmem_cache_create("blkdev_ioc", 3782 iocontext_cachep = kmem_cache_create("blkdev_ioc",
3763 sizeof(struct io_context), 0, SLAB_PANIC, NULL); 3783 sizeof(struct io_context), 0, SLAB_PANIC, NULL);
3764 3784
3765 for_each_possible_cpu(i) 3785 for_each_possible_cpu(i)
3766 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); 3786 INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
3767 3787
3768 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL); 3788 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq, NULL);
3769 register_hotcpu_notifier(&blk_cpu_notifier); 3789 register_hotcpu_notifier(&blk_cpu_notifier);
3770 3790
3771 blk_max_low_pfn = max_low_pfn - 1; 3791 blk_max_low_pfn = max_low_pfn - 1;
3772 blk_max_pfn = max_pfn - 1; 3792 blk_max_pfn = max_pfn - 1;
3773 3793
3774 return 0; 3794 return 0;
3775 } 3795 }
3776 3796
3777 /* 3797 /*
3778 * IO Context helper functions 3798 * IO Context helper functions
3779 */ 3799 */
3780 void put_io_context(struct io_context *ioc) 3800 void put_io_context(struct io_context *ioc)
3781 { 3801 {
3782 if (ioc == NULL) 3802 if (ioc == NULL)
3783 return; 3803 return;
3784 3804
3785 BUG_ON(atomic_read(&ioc->refcount) == 0); 3805 BUG_ON(atomic_read(&ioc->refcount) == 0);
3786 3806
3787 if (atomic_dec_and_test(&ioc->refcount)) { 3807 if (atomic_dec_and_test(&ioc->refcount)) {
3788 struct cfq_io_context *cic; 3808 struct cfq_io_context *cic;
3789 3809
3790 rcu_read_lock(); 3810 rcu_read_lock();
3791 if (ioc->aic && ioc->aic->dtor) 3811 if (ioc->aic && ioc->aic->dtor)
3792 ioc->aic->dtor(ioc->aic); 3812 ioc->aic->dtor(ioc->aic);
3793 if (ioc->cic_root.rb_node != NULL) { 3813 if (ioc->cic_root.rb_node != NULL) {
3794 struct rb_node *n = rb_first(&ioc->cic_root); 3814 struct rb_node *n = rb_first(&ioc->cic_root);
3795 3815
3796 cic = rb_entry(n, struct cfq_io_context, rb_node); 3816 cic = rb_entry(n, struct cfq_io_context, rb_node);
3797 cic->dtor(ioc); 3817 cic->dtor(ioc);
3798 } 3818 }
3799 rcu_read_unlock(); 3819 rcu_read_unlock();
3800 3820
3801 kmem_cache_free(iocontext_cachep, ioc); 3821 kmem_cache_free(iocontext_cachep, ioc);
3802 } 3822 }
3803 } 3823 }
3804 EXPORT_SYMBOL(put_io_context); 3824 EXPORT_SYMBOL(put_io_context);
3805 3825
3806 /* Called by the exiting task */ 3826 /* Called by the exiting task */
3807 void exit_io_context(void) 3827 void exit_io_context(void)
3808 { 3828 {
3809 struct io_context *ioc; 3829 struct io_context *ioc;
3810 struct cfq_io_context *cic; 3830 struct cfq_io_context *cic;
3811 3831
3812 task_lock(current); 3832 task_lock(current);
3813 ioc = current->io_context; 3833 ioc = current->io_context;
3814 current->io_context = NULL; 3834 current->io_context = NULL;
3815 task_unlock(current); 3835 task_unlock(current);
3816 3836
3817 ioc->task = NULL; 3837 ioc->task = NULL;
3818 if (ioc->aic && ioc->aic->exit) 3838 if (ioc->aic && ioc->aic->exit)
3819 ioc->aic->exit(ioc->aic); 3839 ioc->aic->exit(ioc->aic);
3820 if (ioc->cic_root.rb_node != NULL) { 3840 if (ioc->cic_root.rb_node != NULL) {
3821 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node); 3841 cic = rb_entry(rb_first(&ioc->cic_root), struct cfq_io_context, rb_node);
3822 cic->exit(ioc); 3842 cic->exit(ioc);
3823 } 3843 }
3824 3844
3825 put_io_context(ioc); 3845 put_io_context(ioc);
3826 } 3846 }
3827 3847
3828 /* 3848 /*
3829 * If the current task has no IO context then create one and initialise it. 3849 * If the current task has no IO context then create one and initialise it.
3830 * Otherwise, return its existing IO context. 3850 * Otherwise, return its existing IO context.
3831 * 3851 *
3832 * This returned IO context doesn't have a specifically elevated refcount, 3852 * This returned IO context doesn't have a specifically elevated refcount,
3833 * but since the current task itself holds a reference, the context can be 3853 * but since the current task itself holds a reference, the context can be
3834 * used in general code, so long as it stays within `current` context. 3854 * used in general code, so long as it stays within `current` context.
3835 */ 3855 */
3836 static struct io_context *current_io_context(gfp_t gfp_flags, int node) 3856 static struct io_context *current_io_context(gfp_t gfp_flags, int node)
3837 { 3857 {
3838 struct task_struct *tsk = current; 3858 struct task_struct *tsk = current;
3839 struct io_context *ret; 3859 struct io_context *ret;
3840 3860
3841 ret = tsk->io_context; 3861 ret = tsk->io_context;
3842 if (likely(ret)) 3862 if (likely(ret))
3843 return ret; 3863 return ret;
3844 3864
3845 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node); 3865 ret = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
3846 if (ret) { 3866 if (ret) {
3847 atomic_set(&ret->refcount, 1); 3867 atomic_set(&ret->refcount, 1);
3848 ret->task = current; 3868 ret->task = current;
3849 ret->ioprio_changed = 0; 3869 ret->ioprio_changed = 0;
3850 ret->last_waited = jiffies; /* doesn't matter... */ 3870 ret->last_waited = jiffies; /* doesn't matter... */
3851 ret->nr_batch_requests = 0; /* because this is 0 */ 3871 ret->nr_batch_requests = 0; /* because this is 0 */
3852 ret->aic = NULL; 3872 ret->aic = NULL;
3853 ret->cic_root.rb_node = NULL; 3873 ret->cic_root.rb_node = NULL;
3854 ret->ioc_data = NULL; 3874 ret->ioc_data = NULL;
3855 /* make sure set_task_ioprio() sees the settings above */ 3875 /* make sure set_task_ioprio() sees the settings above */
3856 smp_wmb(); 3876 smp_wmb();
3857 tsk->io_context = ret; 3877 tsk->io_context = ret;
3858 } 3878 }
3859 3879
3860 return ret; 3880 return ret;
3861 } 3881 }
3862 3882
3863 /* 3883 /*
3864 * If the current task has no IO context then create one and initialise it. 3884 * If the current task has no IO context then create one and initialise it.
3865 * If it does have a context, take a ref on it. 3885 * If it does have a context, take a ref on it.
3866 * 3886 *
3867 * This is always called in the context of the task which submitted the I/O. 3887 * This is always called in the context of the task which submitted the I/O.
3868 */ 3888 */
3869 struct io_context *get_io_context(gfp_t gfp_flags, int node) 3889 struct io_context *get_io_context(gfp_t gfp_flags, int node)
3870 { 3890 {
3871 struct io_context *ret; 3891 struct io_context *ret;
3872 ret = current_io_context(gfp_flags, node); 3892 ret = current_io_context(gfp_flags, node);
3873 if (likely(ret)) 3893 if (likely(ret))
3874 atomic_inc(&ret->refcount); 3894 atomic_inc(&ret->refcount);
3875 return ret; 3895 return ret;
3876 } 3896 }
3877 EXPORT_SYMBOL(get_io_context); 3897 EXPORT_SYMBOL(get_io_context);
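The put_io_context()/get_io_context() pair above defines the io_context lifetime rule: take a reference when stashing the pointer, drop it when done, and put_io_context(NULL) is a no-op. A minimal sketch of that pairing, assuming a hypothetical struct example_op that caches the submitter's io_context across an asynchronous operation:

    #include <linux/blkdev.h>

    struct example_op {
        struct io_context *ioc;
    };

    static void example_start_op(struct example_op *op)
    {
        /* may sleep and allocate; -1 means no NUMA node preference */
        op->ioc = get_io_context(GFP_KERNEL, -1);
    }

    static void example_finish_op(struct example_op *op)
    {
        put_io_context(op->ioc);    /* tolerates a NULL pointer */
        op->ioc = NULL;
    }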
3878 3898
3879 void copy_io_context(struct io_context **pdst, struct io_context **psrc) 3899 void copy_io_context(struct io_context **pdst, struct io_context **psrc)
3880 { 3900 {
3881 struct io_context *src = *psrc; 3901 struct io_context *src = *psrc;
3882 struct io_context *dst = *pdst; 3902 struct io_context *dst = *pdst;
3883 3903
3884 if (src) { 3904 if (src) {
3885 BUG_ON(atomic_read(&src->refcount) == 0); 3905 BUG_ON(atomic_read(&src->refcount) == 0);
3886 atomic_inc(&src->refcount); 3906 atomic_inc(&src->refcount);
3887 put_io_context(dst); 3907 put_io_context(dst);
3888 *pdst = src; 3908 *pdst = src;
3889 } 3909 }
3890 } 3910 }
3891 EXPORT_SYMBOL(copy_io_context); 3911 EXPORT_SYMBOL(copy_io_context);
3892 3912
3893 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2) 3913 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2)
3894 { 3914 {
3895 struct io_context *temp; 3915 struct io_context *temp;
3896 temp = *ioc1; 3916 temp = *ioc1;
3897 *ioc1 = *ioc2; 3917 *ioc1 = *ioc2;
3898 *ioc2 = temp; 3918 *ioc2 = temp;
3899 } 3919 }
3900 EXPORT_SYMBOL(swap_io_context); 3920 EXPORT_SYMBOL(swap_io_context);
3901 3921
3902 /* 3922 /*
3903 * sysfs parts below 3923 * sysfs parts below
3904 */ 3924 */
3905 struct queue_sysfs_entry { 3925 struct queue_sysfs_entry {
3906 struct attribute attr; 3926 struct attribute attr;
3907 ssize_t (*show)(struct request_queue *, char *); 3927 ssize_t (*show)(struct request_queue *, char *);
3908 ssize_t (*store)(struct request_queue *, const char *, size_t); 3928 ssize_t (*store)(struct request_queue *, const char *, size_t);
3909 }; 3929 };
3910 3930
3911 static ssize_t 3931 static ssize_t
3912 queue_var_show(unsigned int var, char *page) 3932 queue_var_show(unsigned int var, char *page)
3913 { 3933 {
3914 return sprintf(page, "%d\n", var); 3934 return sprintf(page, "%d\n", var);
3915 } 3935 }
3916 3936
3917 static ssize_t 3937 static ssize_t
3918 queue_var_store(unsigned long *var, const char *page, size_t count) 3938 queue_var_store(unsigned long *var, const char *page, size_t count)
3919 { 3939 {
3920 char *p = (char *) page; 3940 char *p = (char *) page;
3921 3941
3922 *var = simple_strtoul(p, &p, 10); 3942 *var = simple_strtoul(p, &p, 10);
3923 return count; 3943 return count;
3924 } 3944 }
3925 3945
3926 static ssize_t queue_requests_show(struct request_queue *q, char *page) 3946 static ssize_t queue_requests_show(struct request_queue *q, char *page)
3927 { 3947 {
3928 return queue_var_show(q->nr_requests, (page)); 3948 return queue_var_show(q->nr_requests, (page));
3929 } 3949 }
3930 3950
3931 static ssize_t 3951 static ssize_t
3932 queue_requests_store(struct request_queue *q, const char *page, size_t count) 3952 queue_requests_store(struct request_queue *q, const char *page, size_t count)
3933 { 3953 {
3934 struct request_list *rl = &q->rq; 3954 struct request_list *rl = &q->rq;
3935 unsigned long nr; 3955 unsigned long nr;
3936 int ret = queue_var_store(&nr, page, count); 3956 int ret = queue_var_store(&nr, page, count);
3937 if (nr < BLKDEV_MIN_RQ) 3957 if (nr < BLKDEV_MIN_RQ)
3938 nr = BLKDEV_MIN_RQ; 3958 nr = BLKDEV_MIN_RQ;
3939 3959
3940 spin_lock_irq(q->queue_lock); 3960 spin_lock_irq(q->queue_lock);
3941 q->nr_requests = nr; 3961 q->nr_requests = nr;
3942 blk_queue_congestion_threshold(q); 3962 blk_queue_congestion_threshold(q);
3943 3963
3944 if (rl->count[READ] >= queue_congestion_on_threshold(q)) 3964 if (rl->count[READ] >= queue_congestion_on_threshold(q))
3945 blk_set_queue_congested(q, READ); 3965 blk_set_queue_congested(q, READ);
3946 else if (rl->count[READ] < queue_congestion_off_threshold(q)) 3966 else if (rl->count[READ] < queue_congestion_off_threshold(q))
3947 blk_clear_queue_congested(q, READ); 3967 blk_clear_queue_congested(q, READ);
3948 3968
3949 if (rl->count[WRITE] >= queue_congestion_on_threshold(q)) 3969 if (rl->count[WRITE] >= queue_congestion_on_threshold(q))
3950 blk_set_queue_congested(q, WRITE); 3970 blk_set_queue_congested(q, WRITE);
3951 else if (rl->count[WRITE] < queue_congestion_off_threshold(q)) 3971 else if (rl->count[WRITE] < queue_congestion_off_threshold(q))
3952 blk_clear_queue_congested(q, WRITE); 3972 blk_clear_queue_congested(q, WRITE);
3953 3973
3954 if (rl->count[READ] >= q->nr_requests) { 3974 if (rl->count[READ] >= q->nr_requests) {
3955 blk_set_queue_full(q, READ); 3975 blk_set_queue_full(q, READ);
3956 } else if (rl->count[READ]+1 <= q->nr_requests) { 3976 } else if (rl->count[READ]+1 <= q->nr_requests) {
3957 blk_clear_queue_full(q, READ); 3977 blk_clear_queue_full(q, READ);
3958 wake_up(&rl->wait[READ]); 3978 wake_up(&rl->wait[READ]);
3959 } 3979 }
3960 3980
3961 if (rl->count[WRITE] >= q->nr_requests) { 3981 if (rl->count[WRITE] >= q->nr_requests) {
3962 blk_set_queue_full(q, WRITE); 3982 blk_set_queue_full(q, WRITE);
3963 } else if (rl->count[WRITE]+1 <= q->nr_requests) { 3983 } else if (rl->count[WRITE]+1 <= q->nr_requests) {
3964 blk_clear_queue_full(q, WRITE); 3984 blk_clear_queue_full(q, WRITE);
3965 wake_up(&rl->wait[WRITE]); 3985 wake_up(&rl->wait[WRITE]);
3966 } 3986 }
3967 spin_unlock_irq(q->queue_lock); 3987 spin_unlock_irq(q->queue_lock);
3968 return ret; 3988 return ret;
3969 } 3989 }
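queue_requests_store() is reached by writing to the nr_requests sysfs attribute; the new value is clamped to BLKDEV_MIN_RQ and the congestion and queue-full thresholds are recomputed immediately, as above. A hedged userspace sketch that exercises this store path (the device path is only an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        /* example device; substitute the queue you actually want to tune */
        const char *path = "/sys/block/sda/queue/nr_requests";
        const char *val = "256\n";
        int fd = open(path, O_WRONLY);

        if (fd < 0) {
            perror("open");
            return 1;
        }
        if (write(fd, val, strlen(val)) < 0)
            perror("write");
        close(fd);
        return 0;
    }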
3970 3990
3971 static ssize_t queue_ra_show(struct request_queue *q, char *page) 3991 static ssize_t queue_ra_show(struct request_queue *q, char *page)
3972 { 3992 {
3973 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 3993 int ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
3974 3994
3975 return queue_var_show(ra_kb, (page)); 3995 return queue_var_show(ra_kb, (page));
3976 } 3996 }
3977 3997
3978 static ssize_t 3998 static ssize_t
3979 queue_ra_store(struct request_queue *q, const char *page, size_t count) 3999 queue_ra_store(struct request_queue *q, const char *page, size_t count)
3980 { 4000 {
3981 unsigned long ra_kb; 4001 unsigned long ra_kb;
3982 ssize_t ret = queue_var_store(&ra_kb, page, count); 4002 ssize_t ret = queue_var_store(&ra_kb, page, count);
3983 4003
3984 spin_lock_irq(q->queue_lock); 4004 spin_lock_irq(q->queue_lock);
3985 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); 4005 q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10);
3986 spin_unlock_irq(q->queue_lock); 4006 spin_unlock_irq(q->queue_lock);
3987 4007
3988 return ret; 4008 return ret;
3989 } 4009 }
3990 4010
3991 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) 4011 static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
3992 { 4012 {
3993 int max_sectors_kb = q->max_sectors >> 1; 4013 int max_sectors_kb = q->max_sectors >> 1;
3994 4014
3995 return queue_var_show(max_sectors_kb, (page)); 4015 return queue_var_show(max_sectors_kb, (page));
3996 } 4016 }
3997 4017
3998 static ssize_t 4018 static ssize_t
3999 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) 4019 queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
4000 { 4020 {
4001 unsigned long max_sectors_kb, 4021 unsigned long max_sectors_kb,
4002 max_hw_sectors_kb = q->max_hw_sectors >> 1, 4022 max_hw_sectors_kb = q->max_hw_sectors >> 1,
4003 page_kb = 1 << (PAGE_CACHE_SHIFT - 10); 4023 page_kb = 1 << (PAGE_CACHE_SHIFT - 10);
4004 ssize_t ret = queue_var_store(&max_sectors_kb, page, count); 4024 ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
4005 int ra_kb; 4025 int ra_kb;
4006 4026
4007 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) 4027 if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
4008 return -EINVAL; 4028 return -EINVAL;
4009 /* 4029 /*
4010 * Take the queue lock to update the readahead and max_sectors 4030 * Take the queue lock to update the readahead and max_sectors
4011 * values synchronously: 4031 * values synchronously:
4012 */ 4032 */
4013 spin_lock_irq(q->queue_lock); 4033 spin_lock_irq(q->queue_lock);
4014 /* 4034 /*
4015 * Trim readahead window as well, if necessary: 4035 * Trim readahead window as well, if necessary:
4016 */ 4036 */
4017 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10); 4037 ra_kb = q->backing_dev_info.ra_pages << (PAGE_CACHE_SHIFT - 10);
4018 if (ra_kb > max_sectors_kb) 4038 if (ra_kb > max_sectors_kb)
4019 q->backing_dev_info.ra_pages = 4039 q->backing_dev_info.ra_pages =
4020 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10); 4040 max_sectors_kb >> (PAGE_CACHE_SHIFT - 10);
4021 4041
4022 q->max_sectors = max_sectors_kb << 1; 4042 q->max_sectors = max_sectors_kb << 1;
4023 spin_unlock_irq(q->queue_lock); 4043 spin_unlock_irq(q->queue_lock);
4024 4044
4025 return ret; 4045 return ret;
4026 } 4046 }
4027 4047
4028 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) 4048 static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
4029 { 4049 {
4030 int max_hw_sectors_kb = q->max_hw_sectors >> 1; 4050 int max_hw_sectors_kb = q->max_hw_sectors >> 1;
4031 4051
4032 return queue_var_show(max_hw_sectors_kb, (page)); 4052 return queue_var_show(max_hw_sectors_kb, (page));
4033 } 4053 }
4034 4054
4035 4055
4036 static struct queue_sysfs_entry queue_requests_entry = { 4056 static struct queue_sysfs_entry queue_requests_entry = {
4037 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR }, 4057 .attr = {.name = "nr_requests", .mode = S_IRUGO | S_IWUSR },
4038 .show = queue_requests_show, 4058 .show = queue_requests_show,
4039 .store = queue_requests_store, 4059 .store = queue_requests_store,
4040 }; 4060 };
4041 4061
4042 static struct queue_sysfs_entry queue_ra_entry = { 4062 static struct queue_sysfs_entry queue_ra_entry = {
4043 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR }, 4063 .attr = {.name = "read_ahead_kb", .mode = S_IRUGO | S_IWUSR },
4044 .show = queue_ra_show, 4064 .show = queue_ra_show,
4045 .store = queue_ra_store, 4065 .store = queue_ra_store,
4046 }; 4066 };
4047 4067
4048 static struct queue_sysfs_entry queue_max_sectors_entry = { 4068 static struct queue_sysfs_entry queue_max_sectors_entry = {
4049 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR }, 4069 .attr = {.name = "max_sectors_kb", .mode = S_IRUGO | S_IWUSR },
4050 .show = queue_max_sectors_show, 4070 .show = queue_max_sectors_show,
4051 .store = queue_max_sectors_store, 4071 .store = queue_max_sectors_store,
4052 }; 4072 };
4053 4073
4054 static struct queue_sysfs_entry queue_max_hw_sectors_entry = { 4074 static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
4055 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO }, 4075 .attr = {.name = "max_hw_sectors_kb", .mode = S_IRUGO },
4056 .show = queue_max_hw_sectors_show, 4076 .show = queue_max_hw_sectors_show,
4057 }; 4077 };
4058 4078
4059 static struct queue_sysfs_entry queue_iosched_entry = { 4079 static struct queue_sysfs_entry queue_iosched_entry = {
4060 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR }, 4080 .attr = {.name = "scheduler", .mode = S_IRUGO | S_IWUSR },
4061 .show = elv_iosched_show, 4081 .show = elv_iosched_show,
4062 .store = elv_iosched_store, 4082 .store = elv_iosched_store,
4063 }; 4083 };
4064 4084
4065 static struct attribute *default_attrs[] = { 4085 static struct attribute *default_attrs[] = {
4066 &queue_requests_entry.attr, 4086 &queue_requests_entry.attr,
4067 &queue_ra_entry.attr, 4087 &queue_ra_entry.attr,
4068 &queue_max_hw_sectors_entry.attr, 4088 &queue_max_hw_sectors_entry.attr,
4069 &queue_max_sectors_entry.attr, 4089 &queue_max_sectors_entry.attr,
4070 &queue_iosched_entry.attr, 4090 &queue_iosched_entry.attr,
4071 NULL, 4091 NULL,
4072 }; 4092 };
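default_attrs[] is the extension point for queue tunables: each queue_sysfs_entry pairs a sysfs file name with show/store callbacks, and queue_attr_show()/queue_attr_store() below dispatch to them under q->sysfs_lock. A sketch of what a new read-only attribute could look like, following the same pattern (the attribute and its helper are invented for illustration and are not part of this change):

    /* Hypothetical read-only attribute exposing the hardware segment limit. */
    static ssize_t queue_max_hw_segments_show(struct request_queue *q, char *page)
    {
        return queue_var_show(q->max_hw_segments, page);
    }

    static struct queue_sysfs_entry queue_max_hw_segments_entry = {
        .attr = { .name = "max_hw_segments", .mode = S_IRUGO },
        .show = queue_max_hw_segments_show,
    };

    /* the entry would then be listed in default_attrs[] before the NULL */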
4073 4093
4074 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) 4094 #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
4075 4095
4076 static ssize_t 4096 static ssize_t
4077 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) 4097 queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
4078 { 4098 {
4079 struct queue_sysfs_entry *entry = to_queue(attr); 4099 struct queue_sysfs_entry *entry = to_queue(attr);
4080 struct request_queue *q = 4100 struct request_queue *q =
4081 container_of(kobj, struct request_queue, kobj); 4101 container_of(kobj, struct request_queue, kobj);
4082 ssize_t res; 4102 ssize_t res;
4083 4103
4084 if (!entry->show) 4104 if (!entry->show)
4085 return -EIO; 4105 return -EIO;
4086 mutex_lock(&q->sysfs_lock); 4106 mutex_lock(&q->sysfs_lock);
4087 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4107 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4088 mutex_unlock(&q->sysfs_lock); 4108 mutex_unlock(&q->sysfs_lock);
4089 return -ENOENT; 4109 return -ENOENT;
4090 } 4110 }
4091 res = entry->show(q, page); 4111 res = entry->show(q, page);
4092 mutex_unlock(&q->sysfs_lock); 4112 mutex_unlock(&q->sysfs_lock);
4093 return res; 4113 return res;
4094 } 4114 }
4095 4115
4096 static ssize_t 4116 static ssize_t
4097 queue_attr_store(struct kobject *kobj, struct attribute *attr, 4117 queue_attr_store(struct kobject *kobj, struct attribute *attr,
4098 const char *page, size_t length) 4118 const char *page, size_t length)
4099 { 4119 {
4100 struct queue_sysfs_entry *entry = to_queue(attr); 4120 struct queue_sysfs_entry *entry = to_queue(attr);
4101 struct request_queue *q = container_of(kobj, struct request_queue, kobj); 4121 struct request_queue *q = container_of(kobj, struct request_queue, kobj);
4102 4122
4103 ssize_t res; 4123 ssize_t res;
4104 4124
4105 if (!entry->store) 4125 if (!entry->store)
4106 return -EIO; 4126 return -EIO;
4107 mutex_lock(&q->sysfs_lock); 4127 mutex_lock(&q->sysfs_lock);
4108 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) { 4128 if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
4109 mutex_unlock(&q->sysfs_lock); 4129 mutex_unlock(&q->sysfs_lock);
4110 return -ENOENT; 4130 return -ENOENT;
4111 } 4131 }
4112 res = entry->store(q, page, length); 4132 res = entry->store(q, page, length);
4113 mutex_unlock(&q->sysfs_lock); 4133 mutex_unlock(&q->sysfs_lock);
4114 return res; 4134 return res;
4115 } 4135 }
4116 4136
4117 static struct sysfs_ops queue_sysfs_ops = { 4137 static struct sysfs_ops queue_sysfs_ops = {
4118 .show = queue_attr_show, 4138 .show = queue_attr_show,
4119 .store = queue_attr_store, 4139 .store = queue_attr_store,
4120 }; 4140 };
4121 4141
4122 static struct kobj_type queue_ktype = { 4142 static struct kobj_type queue_ktype = {
4123 .sysfs_ops = &queue_sysfs_ops, 4143 .sysfs_ops = &queue_sysfs_ops,
4124 .default_attrs = default_attrs, 4144 .default_attrs = default_attrs,
4125 .release = blk_release_queue, 4145 .release = blk_release_queue,
4126 }; 4146 };
4127 4147
4128 int blk_register_queue(struct gendisk *disk) 4148 int blk_register_queue(struct gendisk *disk)
4129 { 4149 {
4130 int ret; 4150 int ret;
4131 4151
4132 struct request_queue *q = disk->queue; 4152 struct request_queue *q = disk->queue;
4133 4153
4134 if (!q || !q->request_fn) 4154 if (!q || !q->request_fn)
4135 return -ENXIO; 4155 return -ENXIO;
4136 4156
4137 q->kobj.parent = kobject_get(&disk->kobj); 4157 q->kobj.parent = kobject_get(&disk->kobj);
4138 4158
4139 ret = kobject_add(&q->kobj); 4159 ret = kobject_add(&q->kobj);
4140 if (ret < 0) 4160 if (ret < 0)
4141 return ret; 4161 return ret;
4142 4162
4143 kobject_uevent(&q->kobj, KOBJ_ADD); 4163 kobject_uevent(&q->kobj, KOBJ_ADD);
4144 4164
4145 ret = elv_register_queue(q); 4165 ret = elv_register_queue(q);
4146 if (ret) { 4166 if (ret) {
4147 kobject_uevent(&q->kobj, KOBJ_REMOVE); 4167 kobject_uevent(&q->kobj, KOBJ_REMOVE);
4148 kobject_del(&q->kobj); 4168 kobject_del(&q->kobj);
4149 return ret; 4169 return ret;
4150 } 4170 }
4151 4171
4152 return 0; 4172 return 0;
4153 } 4173 }
4154 4174
4155 void blk_unregister_queue(struct gendisk *disk) 4175 void blk_unregister_queue(struct gendisk *disk)
4156 { 4176 {
4157 struct request_queue *q = disk->queue; 4177 struct request_queue *q = disk->queue;
4158 4178
4159 if (q && q->request_fn) { 4179 if (q && q->request_fn) {
include/linux/bio.h
1 /* 1 /*
2 * 2.5 block I/O model 2 * 2.5 block I/O model
3 * 3 *
4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de> 4 * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify 6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as 7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation. 8 * published by the Free Software Foundation.
9 * 9 *
10 * This program is distributed in the hope that it will be useful, 10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of 11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 12
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details. 14 * GNU General Public License for more details.
15 * 15 *
16 * You should have received a copy of the GNU General Public Licens 16 * You should have received a copy of the GNU General Public Licens
17 * along with this program; if not, write to the Free Software 17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- 18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-
19 */ 19 */
20 #ifndef __LINUX_BIO_H 20 #ifndef __LINUX_BIO_H
21 #define __LINUX_BIO_H 21 #define __LINUX_BIO_H
22 22
23 #include <linux/highmem.h> 23 #include <linux/highmem.h>
24 #include <linux/mempool.h> 24 #include <linux/mempool.h>
25 #include <linux/ioprio.h> 25 #include <linux/ioprio.h>
26 26
27 #ifdef CONFIG_BLOCK 27 #ifdef CONFIG_BLOCK
28 28
29 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */ 29 /* Platforms may set this to teach the BIO layer about IOMMU hardware. */
30 #include <asm/io.h> 30 #include <asm/io.h>
31 31
32 #if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY) 32 #if defined(BIO_VMERGE_MAX_SIZE) && defined(BIO_VMERGE_BOUNDARY)
33 #define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1)) 33 #define BIOVEC_VIRT_START_SIZE(x) (bvec_to_phys(x) & (BIO_VMERGE_BOUNDARY - 1))
34 #define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE) 34 #define BIOVEC_VIRT_OVERSIZE(x) ((x) > BIO_VMERGE_MAX_SIZE)
35 #else 35 #else
36 #define BIOVEC_VIRT_START_SIZE(x) 0 36 #define BIOVEC_VIRT_START_SIZE(x) 0
37 #define BIOVEC_VIRT_OVERSIZE(x) 0 37 #define BIOVEC_VIRT_OVERSIZE(x) 0
38 #endif 38 #endif
39 39
40 #ifndef BIO_VMERGE_BOUNDARY 40 #ifndef BIO_VMERGE_BOUNDARY
41 #define BIO_VMERGE_BOUNDARY 0 41 #define BIO_VMERGE_BOUNDARY 0
42 #endif 42 #endif
43 43
44 #define BIO_DEBUG 44 #define BIO_DEBUG
45 45
46 #ifdef BIO_DEBUG 46 #ifdef BIO_DEBUG
47 #define BIO_BUG_ON BUG_ON 47 #define BIO_BUG_ON BUG_ON
48 #else 48 #else
49 #define BIO_BUG_ON 49 #define BIO_BUG_ON
50 #endif 50 #endif
51 51
52 #define BIO_MAX_PAGES 256 52 #define BIO_MAX_PAGES 256
53 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT) 53 #define BIO_MAX_SIZE (BIO_MAX_PAGES << PAGE_CACHE_SHIFT)
54 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9) 54 #define BIO_MAX_SECTORS (BIO_MAX_SIZE >> 9)
55 55
56 /* 56 /*
57 * was unsigned short, but we might as well be ready for > 64kB I/O pages 57 * was unsigned short, but we might as well be ready for > 64kB I/O pages
58 */ 58 */
59 struct bio_vec { 59 struct bio_vec {
60 struct page *bv_page; 60 struct page *bv_page;
61 unsigned int bv_len; 61 unsigned int bv_len;
62 unsigned int bv_offset; 62 unsigned int bv_offset;
63 }; 63 };
64 64
65 struct bio_set; 65 struct bio_set;
66 struct bio; 66 struct bio;
67 typedef void (bio_end_io_t) (struct bio *, int); 67 typedef void (bio_end_io_t) (struct bio *, int);
68 typedef void (bio_destructor_t) (struct bio *); 68 typedef void (bio_destructor_t) (struct bio *);
69 69
70 /* 70 /*
71 * main unit of I/O for the block layer and lower layers (ie drivers and 71 * main unit of I/O for the block layer and lower layers (ie drivers and
72 * stacking drivers) 72 * stacking drivers)
73 */ 73 */
74 struct bio { 74 struct bio {
75 sector_t bi_sector; /* device address in 512 byte 75 sector_t bi_sector; /* device address in 512 byte
76 sectors */ 76 sectors */
77 struct bio *bi_next; /* request queue link */ 77 struct bio *bi_next; /* request queue link */
78 struct block_device *bi_bdev; 78 struct block_device *bi_bdev;
79 unsigned long bi_flags; /* status, command, etc */ 79 unsigned long bi_flags; /* status, command, etc */
80 unsigned long bi_rw; /* bottom bits READ/WRITE, 80 unsigned long bi_rw; /* bottom bits READ/WRITE,
81 * top bits priority 81 * top bits priority
82 */ 82 */
83 83
84 unsigned short bi_vcnt; /* how many bio_vec's */ 84 unsigned short bi_vcnt; /* how many bio_vec's */
85 unsigned short bi_idx; /* current index into bvl_vec */ 85 unsigned short bi_idx; /* current index into bvl_vec */
86 86
87 /* Number of segments in this BIO after 87 /* Number of segments in this BIO after
88 * physical address coalescing is performed. 88 * physical address coalescing is performed.
89 */ 89 */
90 unsigned short bi_phys_segments; 90 unsigned short bi_phys_segments;
91 91
92 /* Number of segments after physical and DMA remapping 92 /* Number of segments after physical and DMA remapping
93 * hardware coalescing is performed. 93 * hardware coalescing is performed.
94 */ 94 */
95 unsigned short bi_hw_segments; 95 unsigned short bi_hw_segments;
96 96
97 unsigned int bi_size; /* residual I/O count */ 97 unsigned int bi_size; /* residual I/O count */
98 98
99 /* 99 /*
100 * To keep track of the max hw size, we account for the 100 * To keep track of the max hw size, we account for the
101 * sizes of the first and last virtually mergeable segments 101 * sizes of the first and last virtually mergeable segments
102 * in this bio 102 * in this bio
103 */ 103 */
104 unsigned int bi_hw_front_size; 104 unsigned int bi_hw_front_size;
105 unsigned int bi_hw_back_size; 105 unsigned int bi_hw_back_size;
106 106
107 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */ 107 unsigned int bi_max_vecs; /* max bvl_vecs we can hold */
108 108
109 struct bio_vec *bi_io_vec; /* the actual vec list */ 109 struct bio_vec *bi_io_vec; /* the actual vec list */
110 110
111 bio_end_io_t *bi_end_io; 111 bio_end_io_t *bi_end_io;
112 atomic_t bi_cnt; /* pin count */ 112 atomic_t bi_cnt; /* pin count */
113 113
114 void *bi_private; 114 void *bi_private;
115 115
116 bio_destructor_t *bi_destructor; /* destructor */ 116 bio_destructor_t *bi_destructor; /* destructor */
117 }; 117 };
118 118
119 /* 119 /*
120 * bio flags 120 * bio flags
121 */ 121 */
122 #define BIO_UPTODATE 0 /* ok after I/O completion */ 122 #define BIO_UPTODATE 0 /* ok after I/O completion */
123 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */ 123 #define BIO_RW_BLOCK 1 /* RW_AHEAD set, and read/write would block */
124 #define BIO_EOF 2 /* out-of-bounds error */ 124 #define BIO_EOF 2 /* out-of-bounds error */
125 #define BIO_SEG_VALID 3 /* nr_hw_seg valid */ 125 #define BIO_SEG_VALID 3 /* nr_hw_seg valid */
126 #define BIO_CLONED 4 /* doesn't own data */ 126 #define BIO_CLONED 4 /* doesn't own data */
127 #define BIO_BOUNCED 5 /* bio is a bounce bio */ 127 #define BIO_BOUNCED 5 /* bio is a bounce bio */
128 #define BIO_USER_MAPPED 6 /* contains user pages */ 128 #define BIO_USER_MAPPED 6 /* contains user pages */
129 #define BIO_EOPNOTSUPP 7 /* not supported */ 129 #define BIO_EOPNOTSUPP 7 /* not supported */
130 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) 130 #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag)))
131 131
132 /* 132 /*
133 * top 4 bits of bio flags indicate the pool this bio came from 133 * top 4 bits of bio flags indicate the pool this bio came from
134 */ 134 */
135 #define BIO_POOL_BITS (4) 135 #define BIO_POOL_BITS (4)
136 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS) 136 #define BIO_POOL_OFFSET (BITS_PER_LONG - BIO_POOL_BITS)
137 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET) 137 #define BIO_POOL_MASK (1UL << BIO_POOL_OFFSET)
138 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET) 138 #define BIO_POOL_IDX(bio) ((bio)->bi_flags >> BIO_POOL_OFFSET)
139 139
140 /* 140 /*
141 * bio bi_rw flags 141 * bio bi_rw flags
142 * 142 *
143 * bit 0 -- read (not set) or write (set) 143 * bit 0 -- read (not set) or write (set)
144 * bit 1 -- rw-ahead when set 144 * bit 1 -- rw-ahead when set
145 * bit 2 -- barrier 145 * bit 2 -- barrier
146 * bit 3 -- fail fast, don't want low level driver retries 146 * bit 3 -- fail fast, don't want low level driver retries
147 * bit 4 -- synchronous I/O hint: the block layer will unplug immediately 147 * bit 4 -- synchronous I/O hint: the block layer will unplug immediately
148 */ 148 */
149 #define BIO_RW 0 149 #define BIO_RW 0
150 #define BIO_RW_AHEAD 1 150 #define BIO_RW_AHEAD 1
151 #define BIO_RW_BARRIER 2 151 #define BIO_RW_BARRIER 2
152 #define BIO_RW_FAILFAST 3 152 #define BIO_RW_FAILFAST 3
153 #define BIO_RW_SYNC 4 153 #define BIO_RW_SYNC 4
154 #define BIO_RW_META 5 154 #define BIO_RW_META 5
155 155
156 /* 156 /*
157 * upper 16 bits of bi_rw define the io priority of this bio 157 * upper 16 bits of bi_rw define the io priority of this bio
158 */ 158 */
159 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS) 159 #define BIO_PRIO_SHIFT (8 * sizeof(unsigned long) - IOPRIO_BITS)
160 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT) 160 #define bio_prio(bio) ((bio)->bi_rw >> BIO_PRIO_SHIFT)
161 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio)) 161 #define bio_prio_valid(bio) ioprio_valid(bio_prio(bio))
162 162
163 #define bio_set_prio(bio, prio) do { \ 163 #define bio_set_prio(bio, prio) do { \
164 WARN_ON(prio >= (1 << IOPRIO_BITS)); \ 164 WARN_ON(prio >= (1 << IOPRIO_BITS)); \
165 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \ 165 (bio)->bi_rw &= ((1UL << BIO_PRIO_SHIFT) - 1); \
166 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \ 166 (bio)->bi_rw |= ((unsigned long) (prio) << BIO_PRIO_SHIFT); \
167 } while (0) 167 } while (0)
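bio_prio()/bio_set_prio() pack an ioprio value (class plus class data, IOPRIO_BITS wide) into the top bits of bi_rw, leaving the low bits for the RW/barrier/sync flags. A small sketch, assuming the helpers from <linux/ioprio.h> (example_tag_bio is illustrative):

    #include <linux/kernel.h>
    #include <linux/bio.h>
    #include <linux/ioprio.h>

    /* Tag a bio as best-effort priority level 4 and read the value back. */
    static void example_tag_bio(struct bio *bio)
    {
        bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4));

        if (bio_prio_valid(bio))
            printk(KERN_DEBUG "ioprio class %d, data %d\n",
                   IOPRIO_PRIO_CLASS(bio_prio(bio)),
                   IOPRIO_PRIO_DATA(bio_prio(bio)));
    }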
168 168
169 /* 169 /*
170 * various member access, note that bio_data should of course not be used 170 * various member access, note that bio_data should of course not be used
171 * on highmem page vectors 171 * on highmem page vectors
172 */ 172 */
173 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)])) 173 #define bio_iovec_idx(bio, idx) (&((bio)->bi_io_vec[(idx)]))
174 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx) 174 #define bio_iovec(bio) bio_iovec_idx((bio), (bio)->bi_idx)
175 #define bio_page(bio) bio_iovec((bio))->bv_page 175 #define bio_page(bio) bio_iovec((bio))->bv_page
176 #define bio_offset(bio) bio_iovec((bio))->bv_offset 176 #define bio_offset(bio) bio_iovec((bio))->bv_offset
177 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx) 177 #define bio_segments(bio) ((bio)->bi_vcnt - (bio)->bi_idx)
178 #define bio_sectors(bio) ((bio)->bi_size >> 9) 178 #define bio_sectors(bio) ((bio)->bi_size >> 9)
179 #define bio_cur_sectors(bio) (bio_iovec(bio)->bv_len >> 9)
180 #define bio_data(bio) (page_address(bio_page((bio))) + bio_offset((bio)))
181 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER)) 179 #define bio_barrier(bio) ((bio)->bi_rw & (1 << BIO_RW_BARRIER))
182 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC)) 180 #define bio_sync(bio) ((bio)->bi_rw & (1 << BIO_RW_SYNC))
183 #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST)) 181 #define bio_failfast(bio) ((bio)->bi_rw & (1 << BIO_RW_FAILFAST))
184 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD)) 182 #define bio_rw_ahead(bio) ((bio)->bi_rw & (1 << BIO_RW_AHEAD))
185 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META)) 183 #define bio_rw_meta(bio) ((bio)->bi_rw & (1 << BIO_RW_META))
184 #define bio_empty_barrier(bio) (bio_barrier(bio) && !(bio)->bi_size)
185
186 static inline unsigned int bio_cur_sectors(struct bio *bio)
187 {
188 if (bio->bi_vcnt)
189 return bio_iovec(bio)->bv_len >> 9;
190
191 return 0;
192 }
193
194 static inline void *bio_data(struct bio *bio)
195 {
196 if (bio->bi_vcnt)
197 return page_address(bio_page(bio)) + bio_offset(bio);
198
199 return NULL;
200 }
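bio_empty_barrier() together with the bi_vcnt checks now in bio_cur_sectors() and bio_data() is what lets a barrier bio carry no payload at all. A hedged sketch of how a submitter might issue such a data-less barrier (the function and its completion callback are illustrative; error handling is minimal):

    #include <linux/bio.h>
    #include <linux/fs.h>
    #include <linux/errno.h>

    static void example_empty_barrier_end_io(struct bio *bio, int err)
    {
        /* err of -EOPNOTSUPP would mean the queue cannot do barriers */
        bio_put(bio);
    }

    /* Submit a data-less barrier to bdev; returns 0 or -ENOMEM. */
    static int example_issue_empty_barrier(struct block_device *bdev)
    {
        struct bio *bio = bio_alloc(GFP_KERNEL, 0);    /* zero biovecs */

        if (!bio)
            return -ENOMEM;

        bio->bi_bdev = bdev;
        bio->bi_end_io = example_empty_barrier_end_io;
        submit_bio(1 << BIO_RW_BARRIER, bio);
        return 0;
    }

Such a bio has bi_size == 0, so bio_empty_barrier() is true and bio_cur_sectors()/bio_data() return 0 and NULL instead of dereferencing a first segment that does not exist.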
186 201
187 /* 202 /*
188 * will die 203 * will die
189 */ 204 */
190 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio))) 205 #define bio_to_phys(bio) (page_to_phys(bio_page((bio))) + (unsigned long) bio_offset((bio)))
191 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset) 206 #define bvec_to_phys(bv) (page_to_phys((bv)->bv_page) + (unsigned long) (bv)->bv_offset)
192 207
193 /* 208 /*
194 * queues that have highmem support enabled may still need to revert to 209 * queues that have highmem support enabled may still need to revert to
195 * PIO transfers occasionally and thus map high pages temporarily. For 210 * PIO transfers occasionally and thus map high pages temporarily. For
196 * permanent PIO fall back, user is probably better off disabling highmem 211 * permanent PIO fall back, user is probably better off disabling highmem
197 * I/O completely on that queue (see ide-dma for example) 212 * I/O completely on that queue (see ide-dma for example)
198 */ 213 */
199 #define __bio_kmap_atomic(bio, idx, kmtype) \ 214 #define __bio_kmap_atomic(bio, idx, kmtype) \
200 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \ 215 (kmap_atomic(bio_iovec_idx((bio), (idx))->bv_page, kmtype) + \
201 bio_iovec_idx((bio), (idx))->bv_offset) 216 bio_iovec_idx((bio), (idx))->bv_offset)
202 217
203 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype) 218 #define __bio_kunmap_atomic(addr, kmtype) kunmap_atomic(addr, kmtype)
204 219
205 /* 220 /*
206 * merge helpers etc 221 * merge helpers etc
207 */ 222 */
208 223
209 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1) 224 #define __BVEC_END(bio) bio_iovec_idx((bio), (bio)->bi_vcnt - 1)
210 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx) 225 #define __BVEC_START(bio) bio_iovec_idx((bio), (bio)->bi_idx)
211 226
212 /* 227 /*
213 * allow arch override, for eg virtualized architectures (put in asm/io.h) 228 * allow arch override, for eg virtualized architectures (put in asm/io.h)
214 */ 229 */
215 #ifndef BIOVEC_PHYS_MERGEABLE 230 #ifndef BIOVEC_PHYS_MERGEABLE
216 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \ 231 #define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
217 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2))) 232 ((bvec_to_phys((vec1)) + (vec1)->bv_len) == bvec_to_phys((vec2)))
218 #endif 233 #endif
219 234
220 #define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \ 235 #define BIOVEC_VIRT_MERGEABLE(vec1, vec2) \
221 ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0) 236 ((((bvec_to_phys((vec1)) + (vec1)->bv_len) | bvec_to_phys((vec2))) & (BIO_VMERGE_BOUNDARY - 1)) == 0)
222 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ 237 #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \
223 (((addr1) | (mask)) == (((addr2) - 1) | (mask))) 238 (((addr1) | (mask)) == (((addr2) - 1) | (mask)))
224 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ 239 #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
225 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) 240 __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask)
226 #define BIO_SEG_BOUNDARY(q, b1, b2) \ 241 #define BIO_SEG_BOUNDARY(q, b1, b2) \
227 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) 242 BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2)))
228 243
229 #define bio_io_error(bio) bio_endio((bio), -EIO) 244 #define bio_io_error(bio) bio_endio((bio), -EIO)
230 245
231 /* 246 /*
232 * drivers should not use the __ version unless they _really_ want to 247 * drivers should not use the __ version unless they _really_ want to
233 * run through the entire bio and not just pending pieces 248 * run through the entire bio and not just pending pieces
234 */ 249 */
235 #define __bio_for_each_segment(bvl, bio, i, start_idx) \ 250 #define __bio_for_each_segment(bvl, bio, i, start_idx) \
236 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \ 251 for (bvl = bio_iovec_idx((bio), (start_idx)), i = (start_idx); \
237 i < (bio)->bi_vcnt; \ 252 i < (bio)->bi_vcnt; \
238 bvl++, i++) 253 bvl++, i++)
239 254
240 #define bio_for_each_segment(bvl, bio, i) \ 255 #define bio_for_each_segment(bvl, bio, i) \
241 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx) 256 __bio_for_each_segment(bvl, bio, i, (bio)->bi_idx)
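bio_for_each_segment() walks the still-pending bio_vecs starting at bi_idx, handing back a pointer to each segment and its index; the __ variant starts at an explicit index and so can revisit already-completed pieces. A brief sketch (example_pending_bytes is illustrative):

    #include <linux/bio.h>

    /* Count the payload bytes still pending in a bio. */
    static unsigned int example_pending_bytes(struct bio *bio)
    {
        struct bio_vec *bvec;
        unsigned int bytes = 0;
        int i;

        bio_for_each_segment(bvec, bio, i)
            bytes += bvec->bv_len;

        return bytes;    /* matches bio->bi_size for a freshly built bio */
    }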
242 257
243 /* 258 /*
244 * get a reference to a bio, so it won't disappear. the intended use is 259 * get a reference to a bio, so it won't disappear. the intended use is
245 * something like: 260 * something like:
246 * 261 *
247 * bio_get(bio); 262 * bio_get(bio);
248 * submit_bio(rw, bio); 263 * submit_bio(rw, bio);
249 * if (bio->bi_flags ...) 264 * if (bio->bi_flags ...)
250 * do_something 265 * do_something
251 * bio_put(bio); 266 * bio_put(bio);
252 * 267 *
253 * without the bio_get(), it could potentially complete I/O before submit_bio 268 * without the bio_get(), it could potentially complete I/O before submit_bio
254 * returns. and then bio would be freed memory when if (bio->bi_flags ...) 269 * returns. and then bio would be freed memory when if (bio->bi_flags ...)
255 * runs 270 * runs
256 */ 271 */
257 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt) 272 #define bio_get(bio) atomic_inc(&(bio)->bi_cnt)
258 273
259 274
260 /* 275 /*
261 * A bio_pair is used when we need to split a bio. 276 * A bio_pair is used when we need to split a bio.
262 * This can only happen for a bio that refers to just one 277 * This can only happen for a bio that refers to just one
263 * page of data, and in the unusual situation when the 278 * page of data, and in the unusual situation when the
264 * page crosses a chunk/device boundary 279 * page crosses a chunk/device boundary
265 * 280 *
266 * The address of the master bio is stored in bio1.bi_private 281 * The address of the master bio is stored in bio1.bi_private
267 * The address of the pool the pair was allocated from is stored 282 * The address of the pool the pair was allocated from is stored
268 * in bio2.bi_private 283 * in bio2.bi_private
269 */ 284 */
270 struct bio_pair { 285 struct bio_pair {
271 struct bio bio1, bio2; 286 struct bio bio1, bio2;
272 struct bio_vec bv1, bv2; 287 struct bio_vec bv1, bv2;
273 atomic_t cnt; 288 atomic_t cnt;
274 int error; 289 int error;
275 }; 290 };
276 extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, 291 extern struct bio_pair *bio_split(struct bio *bi, mempool_t *pool,
277 int first_sectors); 292 int first_sectors);
278 extern mempool_t *bio_split_pool; 293 extern mempool_t *bio_split_pool;
279 extern void bio_pair_release(struct bio_pair *dbio); 294 extern void bio_pair_release(struct bio_pair *dbio);
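A sketch of the split-and-resubmit pattern the bio_pair comment describes, roughly as a striping driver might use it when a single-page bio straddles a chunk boundary (the function and the boundary computation are illustrative; chunk_sects is assumed to be a power of two):

    #include <linux/bio.h>
    #include <linux/blkdev.h>

    /*
     * Split 'bio' so that the first fragment ends at the next chunk_sects
     * boundary, then resubmit both halves and drop the pair reference.
     */
    static void example_split_and_resubmit(struct bio *bio, unsigned chunk_sects)
    {
        struct bio_pair *bp = bio_split(bio, bio_split_pool,
                chunk_sects - (bio->bi_sector & (chunk_sects - 1)));

        generic_make_request(&bp->bio1);
        generic_make_request(&bp->bio2);
        bio_pair_release(bp);
    }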
280 295
281 extern struct bio_set *bioset_create(int, int); 296 extern struct bio_set *bioset_create(int, int);
282 extern void bioset_free(struct bio_set *); 297 extern void bioset_free(struct bio_set *);
283 298
284 extern struct bio *bio_alloc(gfp_t, int); 299 extern struct bio *bio_alloc(gfp_t, int);
285 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *); 300 extern struct bio *bio_alloc_bioset(gfp_t, int, struct bio_set *);
286 extern void bio_put(struct bio *); 301 extern void bio_put(struct bio *);
287 extern void bio_free(struct bio *, struct bio_set *); 302 extern void bio_free(struct bio *, struct bio_set *);
288 303
289 extern void bio_endio(struct bio *, int); 304 extern void bio_endio(struct bio *, int);
290 struct request_queue; 305 struct request_queue;
291 extern int bio_phys_segments(struct request_queue *, struct bio *); 306 extern int bio_phys_segments(struct request_queue *, struct bio *);
292 extern int bio_hw_segments(struct request_queue *, struct bio *); 307 extern int bio_hw_segments(struct request_queue *, struct bio *);
293 308
294 extern void __bio_clone(struct bio *, struct bio *); 309 extern void __bio_clone(struct bio *, struct bio *);
295 extern struct bio *bio_clone(struct bio *, gfp_t); 310 extern struct bio *bio_clone(struct bio *, gfp_t);
296 311
297 extern void bio_init(struct bio *); 312 extern void bio_init(struct bio *);
298 313
299 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int); 314 extern int bio_add_page(struct bio *, struct page *, unsigned int,unsigned int);
300 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *, 315 extern int bio_add_pc_page(struct request_queue *, struct bio *, struct page *,
301 unsigned int, unsigned int); 316 unsigned int, unsigned int);
302 extern int bio_get_nr_vecs(struct block_device *); 317 extern int bio_get_nr_vecs(struct block_device *);
303 extern struct bio *bio_map_user(struct request_queue *, struct block_device *, 318 extern struct bio *bio_map_user(struct request_queue *, struct block_device *,
304 unsigned long, unsigned int, int); 319 unsigned long, unsigned int, int);
305 struct sg_iovec; 320 struct sg_iovec;
306 extern struct bio *bio_map_user_iov(struct request_queue *, 321 extern struct bio *bio_map_user_iov(struct request_queue *,
307 struct block_device *, 322 struct block_device *,
308 struct sg_iovec *, int, int); 323 struct sg_iovec *, int, int);
309 extern void bio_unmap_user(struct bio *); 324 extern void bio_unmap_user(struct bio *);
310 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int, 325 extern struct bio *bio_map_kern(struct request_queue *, void *, unsigned int,
311 gfp_t); 326 gfp_t);
312 extern void bio_set_pages_dirty(struct bio *bio); 327 extern void bio_set_pages_dirty(struct bio *bio);
313 extern void bio_check_pages_dirty(struct bio *bio); 328 extern void bio_check_pages_dirty(struct bio *bio);
314 extern void bio_release_pages(struct bio *bio); 329 extern void bio_release_pages(struct bio *bio);
315 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int); 330 extern struct bio *bio_copy_user(struct request_queue *, unsigned long, unsigned int, int);
316 extern int bio_uncopy_user(struct bio *); 331 extern int bio_uncopy_user(struct bio *);
317 void zero_fill_bio(struct bio *bio); 332 void zero_fill_bio(struct bio *bio);
318 333
319 #ifdef CONFIG_HIGHMEM 334 #ifdef CONFIG_HIGHMEM
320 /* 335 /*
321 * remember to add offset! and never ever reenable interrupts between a 336 * remember to add offset! and never ever reenable interrupts between a
322 * bvec_kmap_irq and bvec_kunmap_irq!! 337 * bvec_kmap_irq and bvec_kunmap_irq!!
323 * 338 *
324 * This function MUST be inlined - it plays with the CPU interrupt flags. 339 * This function MUST be inlined - it plays with the CPU interrupt flags.
325 */ 340 */
326 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) 341 static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags)
327 { 342 {
328 unsigned long addr; 343 unsigned long addr;
329 344
330 /* 345 /*
331 * might not be a highmem page, but the preempt/irq count 346 * might not be a highmem page, but the preempt/irq count
332 * balancing is a lot nicer this way 347 * balancing is a lot nicer this way
333 */ 348 */
334 local_irq_save(*flags); 349 local_irq_save(*flags);
335 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ); 350 addr = (unsigned long) kmap_atomic(bvec->bv_page, KM_BIO_SRC_IRQ);
336 351
337 BUG_ON(addr & ~PAGE_MASK); 352 BUG_ON(addr & ~PAGE_MASK);
338 353
339 return (char *) addr + bvec->bv_offset; 354 return (char *) addr + bvec->bv_offset;
340 } 355 }
341 356
342 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) 357 static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags)
343 { 358 {
344 unsigned long ptr = (unsigned long) buffer & PAGE_MASK; 359 unsigned long ptr = (unsigned long) buffer & PAGE_MASK;
345 360
346 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ); 361 kunmap_atomic((void *) ptr, KM_BIO_SRC_IRQ);
347 local_irq_restore(*flags); 362 local_irq_restore(*flags);
348 } 363 }
349 364
350 #else 365 #else
351 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset) 366 #define bvec_kmap_irq(bvec, flags) (page_address((bvec)->bv_page) + (bvec)->bv_offset)
352 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0) 367 #define bvec_kunmap_irq(buf, flags) do { *(flags) = 0; } while (0)
353 #endif 368 #endif
354 369
355 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx, 370 static inline char *__bio_kmap_irq(struct bio *bio, unsigned short idx,
356 unsigned long *flags) 371 unsigned long *flags)
357 { 372 {
358 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags); 373 return bvec_kmap_irq(bio_iovec_idx(bio, idx), flags);
359 } 374 }
360 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags) 375 #define __bio_kunmap_irq(buf, flags) bvec_kunmap_irq(buf, flags)
361 376
362 #define bio_kmap_irq(bio, flags) \ 377 #define bio_kmap_irq(bio, flags) \
363 __bio_kmap_irq((bio), (bio)->bi_idx, (flags)) 378 __bio_kmap_irq((bio), (bio)->bi_idx, (flags))
364 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags) 379 #define bio_kunmap_irq(buf,flags) __bio_kunmap_irq(buf, flags)
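On highmem configurations the mapping must stay atomic, which is why the helpers above save and restore the interrupt flags around kmap_atomic(); the lowmem fallbacks reduce to plain page_address() arithmetic. A short sketch, assuming the caller only needs a quick peek at the current segment (example_peek_first_byte is illustrative):

    #include <linux/bio.h>

    /* Read the first byte of the current segment of a bio. */
    static unsigned char example_peek_first_byte(struct bio *bio)
    {
        unsigned long flags;
        char *buf = bio_kmap_irq(bio, &flags);
        unsigned char first = buf[0];

        bio_kunmap_irq(buf, &flags);
        return first;
    }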
365 380
366 #endif /* CONFIG_BLOCK */ 381 #endif /* CONFIG_BLOCK */
include/linux/blkdev.h
1 #ifndef _LINUX_BLKDEV_H 1 #ifndef _LINUX_BLKDEV_H
2 #define _LINUX_BLKDEV_H 2 #define _LINUX_BLKDEV_H
3 3
4 #ifdef CONFIG_BLOCK 4 #ifdef CONFIG_BLOCK
5 5
6 #include <linux/sched.h> 6 #include <linux/sched.h>
7 #include <linux/major.h> 7 #include <linux/major.h>
8 #include <linux/genhd.h> 8 #include <linux/genhd.h>
9 #include <linux/list.h> 9 #include <linux/list.h>
10 #include <linux/timer.h> 10 #include <linux/timer.h>
11 #include <linux/workqueue.h> 11 #include <linux/workqueue.h>
12 #include <linux/pagemap.h> 12 #include <linux/pagemap.h>
13 #include <linux/backing-dev.h> 13 #include <linux/backing-dev.h>
14 #include <linux/wait.h> 14 #include <linux/wait.h>
15 #include <linux/mempool.h> 15 #include <linux/mempool.h>
16 #include <linux/bio.h> 16 #include <linux/bio.h>
17 #include <linux/module.h> 17 #include <linux/module.h>
18 #include <linux/stringify.h> 18 #include <linux/stringify.h>
19 #include <linux/bsg.h> 19 #include <linux/bsg.h>
20 20
21 #include <asm/scatterlist.h> 21 #include <asm/scatterlist.h>
22 22
23 struct scsi_ioctl_command; 23 struct scsi_ioctl_command;
24 24
25 struct request_queue; 25 struct request_queue;
26 typedef struct request_queue request_queue_t __deprecated; 26 typedef struct request_queue request_queue_t __deprecated;
27 struct elevator_queue; 27 struct elevator_queue;
28 typedef struct elevator_queue elevator_t; 28 typedef struct elevator_queue elevator_t;
29 struct request_pm_state; 29 struct request_pm_state;
30 struct blk_trace; 30 struct blk_trace;
31 struct request; 31 struct request;
32 struct sg_io_hdr; 32 struct sg_io_hdr;
33 33
34 #define BLKDEV_MIN_RQ 4 34 #define BLKDEV_MIN_RQ 4
35 #define BLKDEV_MAX_RQ 128 /* Default maximum */ 35 #define BLKDEV_MAX_RQ 128 /* Default maximum */
36 36
37 /* 37 /*
38 * This is the per-process anticipatory I/O scheduler state. 38 * This is the per-process anticipatory I/O scheduler state.
39 */ 39 */
40 struct as_io_context { 40 struct as_io_context {
41 spinlock_t lock; 41 spinlock_t lock;
42 42
43 void (*dtor)(struct as_io_context *aic); /* destructor */ 43 void (*dtor)(struct as_io_context *aic); /* destructor */
44 void (*exit)(struct as_io_context *aic); /* called on task exit */ 44 void (*exit)(struct as_io_context *aic); /* called on task exit */
45 45
46 unsigned long state; 46 unsigned long state;
47 atomic_t nr_queued; /* queued reads & sync writes */ 47 atomic_t nr_queued; /* queued reads & sync writes */
48 atomic_t nr_dispatched; /* number of requests gone to the drivers */ 48 atomic_t nr_dispatched; /* number of requests gone to the drivers */
49 49
50 /* IO History tracking */ 50 /* IO History tracking */
51 /* Thinktime */ 51 /* Thinktime */
52 unsigned long last_end_request; 52 unsigned long last_end_request;
53 unsigned long ttime_total; 53 unsigned long ttime_total;
54 unsigned long ttime_samples; 54 unsigned long ttime_samples;
55 unsigned long ttime_mean; 55 unsigned long ttime_mean;
56 /* Layout pattern */ 56 /* Layout pattern */
57 unsigned int seek_samples; 57 unsigned int seek_samples;
58 sector_t last_request_pos; 58 sector_t last_request_pos;
59 u64 seek_total; 59 u64 seek_total;
60 sector_t seek_mean; 60 sector_t seek_mean;
61 }; 61 };
62 62
63 struct cfq_queue; 63 struct cfq_queue;
64 struct cfq_io_context { 64 struct cfq_io_context {
65 struct rb_node rb_node; 65 struct rb_node rb_node;
66 void *key; 66 void *key;
67 67
68 struct cfq_queue *cfqq[2]; 68 struct cfq_queue *cfqq[2];
69 69
70 struct io_context *ioc; 70 struct io_context *ioc;
71 71
72 unsigned long last_end_request; 72 unsigned long last_end_request;
73 sector_t last_request_pos; 73 sector_t last_request_pos;
74 74
75 unsigned long ttime_total; 75 unsigned long ttime_total;
76 unsigned long ttime_samples; 76 unsigned long ttime_samples;
77 unsigned long ttime_mean; 77 unsigned long ttime_mean;
78 78
79 unsigned int seek_samples; 79 unsigned int seek_samples;
80 u64 seek_total; 80 u64 seek_total;
81 sector_t seek_mean; 81 sector_t seek_mean;
82 82
83 struct list_head queue_list; 83 struct list_head queue_list;
84 84
85 void (*dtor)(struct io_context *); /* destructor */ 85 void (*dtor)(struct io_context *); /* destructor */
86 void (*exit)(struct io_context *); /* called on task exit */ 86 void (*exit)(struct io_context *); /* called on task exit */
87 }; 87 };
88 88
89 /* 89 /*
90 * This is the per-process I/O subsystem state. It is refcounted and 90 * This is the per-process I/O subsystem state. It is refcounted and
91 * kmalloc'ed. Currently all fields are modified in process io context 91 * kmalloc'ed. Currently all fields are modified in process io context
92 * (apart from the atomic refcount), so require no locking. 92 * (apart from the atomic refcount), so require no locking.
93 */ 93 */
94 struct io_context { 94 struct io_context {
95 atomic_t refcount; 95 atomic_t refcount;
96 struct task_struct *task; 96 struct task_struct *task;
97 97
98 unsigned int ioprio_changed; 98 unsigned int ioprio_changed;
99 99
100 /* 100 /*
101 * For request batching 101 * For request batching
102 */ 102 */
103 unsigned long last_waited; /* Time last woken after wait for request */ 103 unsigned long last_waited; /* Time last woken after wait for request */
104 int nr_batch_requests; /* Number of requests left in the batch */ 104 int nr_batch_requests; /* Number of requests left in the batch */
105 105
106 struct as_io_context *aic; 106 struct as_io_context *aic;
107 struct rb_root cic_root; 107 struct rb_root cic_root;
108 void *ioc_data; 108 void *ioc_data;
109 }; 109 };
110 110
111 void put_io_context(struct io_context *ioc); 111 void put_io_context(struct io_context *ioc);
112 void exit_io_context(void); 112 void exit_io_context(void);
113 struct io_context *get_io_context(gfp_t gfp_flags, int node); 113 struct io_context *get_io_context(gfp_t gfp_flags, int node);
114 void copy_io_context(struct io_context **pdst, struct io_context **psrc); 114 void copy_io_context(struct io_context **pdst, struct io_context **psrc);
115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2); 115 void swap_io_context(struct io_context **ioc1, struct io_context **ioc2);
116 116
117 struct request; 117 struct request;
118 typedef void (rq_end_io_fn)(struct request *, int); 118 typedef void (rq_end_io_fn)(struct request *, int);
119 119
120 struct request_list { 120 struct request_list {
121 int count[2]; 121 int count[2];
122 int starved[2]; 122 int starved[2];
123 int elvpriv; 123 int elvpriv;
124 mempool_t *rq_pool; 124 mempool_t *rq_pool;
125 wait_queue_head_t wait[2]; 125 wait_queue_head_t wait[2];
126 }; 126 };
127 127
128 /* 128 /*
129 * request command types 129 * request command types
130 */ 130 */
131 enum rq_cmd_type_bits { 131 enum rq_cmd_type_bits {
132 REQ_TYPE_FS = 1, /* fs request */ 132 REQ_TYPE_FS = 1, /* fs request */
133 REQ_TYPE_BLOCK_PC, /* scsi command */ 133 REQ_TYPE_BLOCK_PC, /* scsi command */
134 REQ_TYPE_SENSE, /* sense request */ 134 REQ_TYPE_SENSE, /* sense request */
135 REQ_TYPE_PM_SUSPEND, /* suspend request */ 135 REQ_TYPE_PM_SUSPEND, /* suspend request */
136 REQ_TYPE_PM_RESUME, /* resume request */ 136 REQ_TYPE_PM_RESUME, /* resume request */
137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */ 137 REQ_TYPE_PM_SHUTDOWN, /* shutdown request */
138 REQ_TYPE_FLUSH, /* flush request */ 138 REQ_TYPE_FLUSH, /* flush request */
139 REQ_TYPE_SPECIAL, /* driver defined type */ 139 REQ_TYPE_SPECIAL, /* driver defined type */
140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */ 140 REQ_TYPE_LINUX_BLOCK, /* generic block layer message */
141 /* 141 /*
142 * for ATA/ATAPI devices. this really doesn't belong here, ide should 142 * for ATA/ATAPI devices. this really doesn't belong here, ide should
143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver 143 * use REQ_TYPE_SPECIAL and use rq->cmd[0] with the range of driver
144 * private REQ_LB opcodes to differentiate what type of request this is 144 * private REQ_LB opcodes to differentiate what type of request this is
145 */ 145 */
146 REQ_TYPE_ATA_CMD, 146 REQ_TYPE_ATA_CMD,
147 REQ_TYPE_ATA_TASK, 147 REQ_TYPE_ATA_TASK,
148 REQ_TYPE_ATA_TASKFILE, 148 REQ_TYPE_ATA_TASKFILE,
149 REQ_TYPE_ATA_PC, 149 REQ_TYPE_ATA_PC,
150 }; 150 };
151 151
152 /* 152 /*
153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being 153 * For request of type REQ_TYPE_LINUX_BLOCK, rq->cmd[0] is the opcode being
154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a 154 * sent down (similar to how REQ_TYPE_BLOCK_PC means that ->cmd[] holds a
155 * SCSI cdb). 155 * SCSI cdb).
156 * 156 *
157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need, 157 * 0x00 -> 0x3f are driver private, to be used for whatever purpose they need,
158 * typically to differentiate REQ_TYPE_SPECIAL requests. 158 * typically to differentiate REQ_TYPE_SPECIAL requests.
159 * 159 *
160 */ 160 */
161 enum { 161 enum {
162 /* 162 /*
163 * just examples for now 163 * just examples for now
164 */ 164 */
165 REQ_LB_OP_EJECT = 0x40, /* eject request */ 165 REQ_LB_OP_EJECT = 0x40, /* eject request */
166 REQ_LB_OP_FLUSH = 0x41, /* flush device */ 166 REQ_LB_OP_FLUSH = 0x41, /* flush device */
167 }; 167 };
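For REQ_TYPE_LINUX_BLOCK requests the driver is expected to look at rq->cmd[0] for the generic opcode, as the comment above explains. A hedged sketch of how a driver's dispatch path might recognise the example opcodes (the handler names are invented):

    #include <linux/blkdev.h>

    /* Illustrative dispatch of a generic block-layer message request. */
    static void example_handle_lb_request(struct request *rq)
    {
        if (rq->cmd_type != REQ_TYPE_LINUX_BLOCK)
            return;

        switch (rq->cmd[0]) {
        case REQ_LB_OP_EJECT:
            /* example_do_eject(rq); */
            break;
        case REQ_LB_OP_FLUSH:
            /* example_do_cache_flush(rq); */
            break;
        }
    }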
168 168
169 /* 169 /*
170 * request type modified bits. first three bits match BIO_RW* bits, important 170 * request type modified bits. first three bits match BIO_RW* bits, important
171 */ 171 */
172 enum rq_flag_bits { 172 enum rq_flag_bits {
173 __REQ_RW, /* not set, read. set, write */ 173 __REQ_RW, /* not set, read. set, write */
174 __REQ_FAILFAST, /* no low level driver retries */ 174 __REQ_FAILFAST, /* no low level driver retries */
175 __REQ_SORTED, /* elevator knows about this request */ 175 __REQ_SORTED, /* elevator knows about this request */
176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */ 176 __REQ_SOFTBARRIER, /* may not be passed by ioscheduler */
177 __REQ_HARDBARRIER, /* may not be passed by drive either */ 177 __REQ_HARDBARRIER, /* may not be passed by drive either */
178 __REQ_FUA, /* forced unit access */ 178 __REQ_FUA, /* forced unit access */
179 __REQ_NOMERGE, /* don't touch this for merging */ 179 __REQ_NOMERGE, /* don't touch this for merging */
180 __REQ_STARTED, /* drive already may have started this one */ 180 __REQ_STARTED, /* drive already may have started this one */
181 __REQ_DONTPREP, /* don't call prep for this one */ 181 __REQ_DONTPREP, /* don't call prep for this one */
182 __REQ_QUEUED, /* uses queueing */ 182 __REQ_QUEUED, /* uses queueing */
183 __REQ_ELVPRIV, /* elevator private data attached */ 183 __REQ_ELVPRIV, /* elevator private data attached */
184 __REQ_FAILED, /* set if the request failed */ 184 __REQ_FAILED, /* set if the request failed */
185 __REQ_QUIET, /* don't worry about errors */ 185 __REQ_QUIET, /* don't worry about errors */
186 __REQ_PREEMPT, /* set for "ide_preempt" requests */ 186 __REQ_PREEMPT, /* set for "ide_preempt" requests */
187 __REQ_ORDERED_COLOR, /* is before or after barrier */ 187 __REQ_ORDERED_COLOR, /* is before or after barrier */
188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */ 188 __REQ_RW_SYNC, /* request is sync (O_DIRECT) */
189 __REQ_ALLOCED, /* request came from our alloc pool */ 189 __REQ_ALLOCED, /* request came from our alloc pool */
190 __REQ_RW_META, /* metadata io request */ 190 __REQ_RW_META, /* metadata io request */
191 __REQ_NR_BITS, /* stops here */ 191 __REQ_NR_BITS, /* stops here */
192 }; 192 };
193 193
194 #define REQ_RW (1 << __REQ_RW) 194 #define REQ_RW (1 << __REQ_RW)
195 #define REQ_FAILFAST (1 << __REQ_FAILFAST) 195 #define REQ_FAILFAST (1 << __REQ_FAILFAST)
196 #define REQ_SORTED (1 << __REQ_SORTED) 196 #define REQ_SORTED (1 << __REQ_SORTED)
197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER) 197 #define REQ_SOFTBARRIER (1 << __REQ_SOFTBARRIER)
198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER) 198 #define REQ_HARDBARRIER (1 << __REQ_HARDBARRIER)
199 #define REQ_FUA (1 << __REQ_FUA) 199 #define REQ_FUA (1 << __REQ_FUA)
200 #define REQ_NOMERGE (1 << __REQ_NOMERGE) 200 #define REQ_NOMERGE (1 << __REQ_NOMERGE)
201 #define REQ_STARTED (1 << __REQ_STARTED) 201 #define REQ_STARTED (1 << __REQ_STARTED)
202 #define REQ_DONTPREP (1 << __REQ_DONTPREP) 202 #define REQ_DONTPREP (1 << __REQ_DONTPREP)
203 #define REQ_QUEUED (1 << __REQ_QUEUED) 203 #define REQ_QUEUED (1 << __REQ_QUEUED)
204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV) 204 #define REQ_ELVPRIV (1 << __REQ_ELVPRIV)
205 #define REQ_FAILED (1 << __REQ_FAILED) 205 #define REQ_FAILED (1 << __REQ_FAILED)
206 #define REQ_QUIET (1 << __REQ_QUIET) 206 #define REQ_QUIET (1 << __REQ_QUIET)
207 #define REQ_PREEMPT (1 << __REQ_PREEMPT) 207 #define REQ_PREEMPT (1 << __REQ_PREEMPT)
208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR) 208 #define REQ_ORDERED_COLOR (1 << __REQ_ORDERED_COLOR)
209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC) 209 #define REQ_RW_SYNC (1 << __REQ_RW_SYNC)
210 #define REQ_ALLOCED (1 << __REQ_ALLOCED) 210 #define REQ_ALLOCED (1 << __REQ_ALLOCED)
211 #define REQ_RW_META (1 << __REQ_RW_META) 211 #define REQ_RW_META (1 << __REQ_RW_META)
212 212
213 #define BLK_MAX_CDB 16 213 #define BLK_MAX_CDB 16
214 214
215 /* 215 /*
216 * try to put the fields that are referenced together in the same cacheline 216 * try to put the fields that are referenced together in the same cacheline
217 */ 217 */
218 struct request { 218 struct request {
219 struct list_head queuelist; 219 struct list_head queuelist;
220 struct list_head donelist; 220 struct list_head donelist;
221 221
222 struct request_queue *q; 222 struct request_queue *q;
223 223
224 unsigned int cmd_flags; 224 unsigned int cmd_flags;
225 enum rq_cmd_type_bits cmd_type; 225 enum rq_cmd_type_bits cmd_type;
226 226
227 /* Maintain bio traversal state for part by part I/O submission. 227 /* Maintain bio traversal state for part by part I/O submission.
228 * hard_* are block layer internals, no driver should touch them! 228 * hard_* are block layer internals, no driver should touch them!
229 */ 229 */
230 230
231 sector_t sector; /* next sector to submit */ 231 sector_t sector; /* next sector to submit */
232 sector_t hard_sector; /* next sector to complete */ 232 sector_t hard_sector; /* next sector to complete */
233 unsigned long nr_sectors; /* no. of sectors left to submit */ 233 unsigned long nr_sectors; /* no. of sectors left to submit */
234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */ 234 unsigned long hard_nr_sectors; /* no. of sectors left to complete */
235 /* no. of sectors left to submit in the current segment */ 235 /* no. of sectors left to submit in the current segment */
236 unsigned int current_nr_sectors; 236 unsigned int current_nr_sectors;
237 237
238 /* no. of sectors left to complete in the current segment */ 238 /* no. of sectors left to complete in the current segment */
239 unsigned int hard_cur_sectors; 239 unsigned int hard_cur_sectors;
240 240
241 struct bio *bio; 241 struct bio *bio;
242 struct bio *biotail; 242 struct bio *biotail;
243 243
244 struct hlist_node hash; /* merge hash */ 244 struct hlist_node hash; /* merge hash */
245 /* 245 /*
246 * The rb_node is only used inside the io scheduler, requests 246 * The rb_node is only used inside the io scheduler, requests
247 * are pruned when moved to the dispatch queue. So let the 247 * are pruned when moved to the dispatch queue. So let the
248 * completion_data share space with the rb_node. 248 * completion_data share space with the rb_node.
249 */ 249 */
250 union { 250 union {
251 struct rb_node rb_node; /* sort/lookup */ 251 struct rb_node rb_node; /* sort/lookup */
252 void *completion_data; 252 void *completion_data;
253 }; 253 };
254 254
255 /* 255 /*
256 * two pointers are available for the IO schedulers, if they need 256 * two pointers are available for the IO schedulers, if they need
257 * more they have to dynamically allocate it. 257 * more they have to dynamically allocate it.
258 */ 258 */
259 void *elevator_private; 259 void *elevator_private;
260 void *elevator_private2; 260 void *elevator_private2;
261 261
262 struct gendisk *rq_disk; 262 struct gendisk *rq_disk;
263 unsigned long start_time; 263 unsigned long start_time;
264 264
265 /* Number of scatter-gather DMA addr+len pairs after 265 /* Number of scatter-gather DMA addr+len pairs after
266 * physical address coalescing is performed. 266 * physical address coalescing is performed.
267 */ 267 */
268 unsigned short nr_phys_segments; 268 unsigned short nr_phys_segments;
269 269
270 /* Number of scatter-gather addr+len pairs after 270 /* Number of scatter-gather addr+len pairs after
271 * physical and DMA remapping hardware coalescing is performed. 271 * physical and DMA remapping hardware coalescing is performed.
272 * This is the number of scatter-gather entries the driver 272 * This is the number of scatter-gather entries the driver
273 * will actually have to deal with after DMA mapping is done. 273 * will actually have to deal with after DMA mapping is done.
274 */ 274 */
275 unsigned short nr_hw_segments; 275 unsigned short nr_hw_segments;
276 276
277 unsigned short ioprio; 277 unsigned short ioprio;
278 278
279 void *special; 279 void *special;
280 char *buffer; 280 char *buffer;
281 281
282 int tag; 282 int tag;
283 int errors; 283 int errors;
284 284
285 int ref_count; 285 int ref_count;
286 286
287 /* 287 /*
288 * when request is used as a packet command carrier 288 * when request is used as a packet command carrier
289 */ 289 */
290 unsigned int cmd_len; 290 unsigned int cmd_len;
291 unsigned char cmd[BLK_MAX_CDB]; 291 unsigned char cmd[BLK_MAX_CDB];
292 292
293 unsigned int data_len; 293 unsigned int data_len;
294 unsigned int sense_len; 294 unsigned int sense_len;
295 void *data; 295 void *data;
296 void *sense; 296 void *sense;
297 297
298 unsigned int timeout; 298 unsigned int timeout;
299 int retries; 299 int retries;
300 300
301 /* 301 /*
302 * completion callback. 302 * completion callback.
303 */ 303 */
304 rq_end_io_fn *end_io; 304 rq_end_io_fn *end_io;
305 void *end_io_data; 305 void *end_io_data;
306 306
307 /* for bidi */ 307 /* for bidi */
308 struct request *next_rq; 308 struct request *next_rq;
309 }; 309 };
310 310
311 /* 311 /*
312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME 312 * State information carried for REQ_TYPE_PM_SUSPEND and REQ_TYPE_PM_RESUME
313 * requests. Some step values could eventually be made generic. 313 * requests. Some step values could eventually be made generic.
314 */ 314 */
315 struct request_pm_state 315 struct request_pm_state
316 { 316 {
317 /* PM state machine step value, currently driver specific */ 317 /* PM state machine step value, currently driver specific */
318 int pm_step; 318 int pm_step;
319 /* requested PM state value (S1, S2, S3, S4, ...) */ 319 /* requested PM state value (S1, S2, S3, S4, ...) */
320 u32 pm_state; 320 u32 pm_state;
321 void* data; /* for driver use */ 321 void* data; /* for driver use */
322 }; 322 };
323 323
324 #include <linux/elevator.h> 324 #include <linux/elevator.h>
325 325
326 typedef void (request_fn_proc) (struct request_queue *q); 326 typedef void (request_fn_proc) (struct request_queue *q);
327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); 327 typedef int (make_request_fn) (struct request_queue *q, struct bio *bio);
328 typedef int (prep_rq_fn) (struct request_queue *, struct request *); 328 typedef int (prep_rq_fn) (struct request_queue *, struct request *);
329 typedef void (unplug_fn) (struct request_queue *); 329 typedef void (unplug_fn) (struct request_queue *);
330 330
331 struct bio_vec; 331 struct bio_vec;
332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *); 332 typedef int (merge_bvec_fn) (struct request_queue *, struct bio *, struct bio_vec *);
333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *); 333 typedef int (issue_flush_fn) (struct request_queue *, struct gendisk *, sector_t *);
334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *); 334 typedef void (prepare_flush_fn) (struct request_queue *, struct request *);
335 typedef void (softirq_done_fn)(struct request *); 335 typedef void (softirq_done_fn)(struct request *);
336 336
337 enum blk_queue_state { 337 enum blk_queue_state {
338 Queue_down, 338 Queue_down,
339 Queue_up, 339 Queue_up,
340 }; 340 };
341 341
342 struct blk_queue_tag { 342 struct blk_queue_tag {
343 struct request **tag_index; /* map of busy tags */ 343 struct request **tag_index; /* map of busy tags */
344 unsigned long *tag_map; /* bit map of free/busy tags */ 344 unsigned long *tag_map; /* bit map of free/busy tags */
345 struct list_head busy_list; /* fifo list of busy tags */ 345 struct list_head busy_list; /* fifo list of busy tags */
346 int busy; /* current depth */ 346 int busy; /* current depth */
347 int max_depth; /* what we will send to device */ 347 int max_depth; /* what we will send to device */
348 int real_max_depth; /* what the array can hold */ 348 int real_max_depth; /* what the array can hold */
349 atomic_t refcnt; /* map can be shared */ 349 atomic_t refcnt; /* map can be shared */
350 }; 350 };
351 351
352 struct request_queue 352 struct request_queue
353 { 353 {
354 /* 354 /*
355 * Together with queue_head for cacheline sharing 355 * Together with queue_head for cacheline sharing
356 */ 356 */
357 struct list_head queue_head; 357 struct list_head queue_head;
358 struct request *last_merge; 358 struct request *last_merge;
359 elevator_t *elevator; 359 elevator_t *elevator;
360 360
361 /* 361 /*
362 * the queue request freelist, one for reads and one for writes 362 * the queue request freelist, one for reads and one for writes
363 */ 363 */
364 struct request_list rq; 364 struct request_list rq;
365 365
366 request_fn_proc *request_fn; 366 request_fn_proc *request_fn;
367 make_request_fn *make_request_fn; 367 make_request_fn *make_request_fn;
368 prep_rq_fn *prep_rq_fn; 368 prep_rq_fn *prep_rq_fn;
369 unplug_fn *unplug_fn; 369 unplug_fn *unplug_fn;
370 merge_bvec_fn *merge_bvec_fn; 370 merge_bvec_fn *merge_bvec_fn;
371 issue_flush_fn *issue_flush_fn; 371 issue_flush_fn *issue_flush_fn;
372 prepare_flush_fn *prepare_flush_fn; 372 prepare_flush_fn *prepare_flush_fn;
373 softirq_done_fn *softirq_done_fn; 373 softirq_done_fn *softirq_done_fn;
374 374
375 /* 375 /*
376 * Dispatch queue sorting 376 * Dispatch queue sorting
377 */ 377 */
378 sector_t end_sector; 378 sector_t end_sector;
379 struct request *boundary_rq; 379 struct request *boundary_rq;
380 380
381 /* 381 /*
382 * Auto-unplugging state 382 * Auto-unplugging state
383 */ 383 */
384 struct timer_list unplug_timer; 384 struct timer_list unplug_timer;
385 int unplug_thresh; /* After this many requests */ 385 int unplug_thresh; /* After this many requests */
386 unsigned long unplug_delay; /* After this many jiffies */ 386 unsigned long unplug_delay; /* After this many jiffies */
387 struct work_struct unplug_work; 387 struct work_struct unplug_work;
388 388
389 struct backing_dev_info backing_dev_info; 389 struct backing_dev_info backing_dev_info;
390 390
391 /* 391 /*
392 * The queue owner gets to use this for whatever they like. 392 * The queue owner gets to use this for whatever they like.
393 * ll_rw_blk doesn't touch it. 393 * ll_rw_blk doesn't touch it.
394 */ 394 */
395 void *queuedata; 395 void *queuedata;
396 396
397 /* 397 /*
398 * queue needs bounce pages for pages above this limit 398 * queue needs bounce pages for pages above this limit
399 */ 399 */
400 unsigned long bounce_pfn; 400 unsigned long bounce_pfn;
401 gfp_t bounce_gfp; 401 gfp_t bounce_gfp;
402 402
403 /* 403 /*
404 * various queue flags, see QUEUE_* below 404 * various queue flags, see QUEUE_* below
405 */ 405 */
406 unsigned long queue_flags; 406 unsigned long queue_flags;
407 407
408 /* 408 /*
409 * protects queue structures from reentrancy. ->__queue_lock should 409 * protects queue structures from reentrancy. ->__queue_lock should
410 * _never_ be used directly, it is queue private. always use 410 * _never_ be used directly, it is queue private. always use
411 * ->queue_lock. 411 * ->queue_lock.
412 */ 412 */
413 spinlock_t __queue_lock; 413 spinlock_t __queue_lock;
414 spinlock_t *queue_lock; 414 spinlock_t *queue_lock;
415 415
416 /* 416 /*
417 * queue kobject 417 * queue kobject
418 */ 418 */
419 struct kobject kobj; 419 struct kobject kobj;
420 420
421 /* 421 /*
422 * queue settings 422 * queue settings
423 */ 423 */
424 unsigned long nr_requests; /* Max # of requests */ 424 unsigned long nr_requests; /* Max # of requests */
425 unsigned int nr_congestion_on; 425 unsigned int nr_congestion_on;
426 unsigned int nr_congestion_off; 426 unsigned int nr_congestion_off;
427 unsigned int nr_batching; 427 unsigned int nr_batching;
428 428
429 unsigned int max_sectors; 429 unsigned int max_sectors;
430 unsigned int max_hw_sectors; 430 unsigned int max_hw_sectors;
431 unsigned short max_phys_segments; 431 unsigned short max_phys_segments;
432 unsigned short max_hw_segments; 432 unsigned short max_hw_segments;
433 unsigned short hardsect_size; 433 unsigned short hardsect_size;
434 unsigned int max_segment_size; 434 unsigned int max_segment_size;
435 435
436 unsigned long seg_boundary_mask; 436 unsigned long seg_boundary_mask;
437 unsigned int dma_alignment; 437 unsigned int dma_alignment;
438 438
439 struct blk_queue_tag *queue_tags; 439 struct blk_queue_tag *queue_tags;
440 440
441 unsigned int nr_sorted; 441 unsigned int nr_sorted;
442 unsigned int in_flight; 442 unsigned int in_flight;
443 443
444 /* 444 /*
445 * sg stuff 445 * sg stuff
446 */ 446 */
447 unsigned int sg_timeout; 447 unsigned int sg_timeout;
448 unsigned int sg_reserved_size; 448 unsigned int sg_reserved_size;
449 int node; 449 int node;
450 #ifdef CONFIG_BLK_DEV_IO_TRACE 450 #ifdef CONFIG_BLK_DEV_IO_TRACE
451 struct blk_trace *blk_trace; 451 struct blk_trace *blk_trace;
452 #endif 452 #endif
453 /* 453 /*
454 * reserved for flush operations 454 * reserved for flush operations
455 */ 455 */
456 unsigned int ordered, next_ordered, ordseq; 456 unsigned int ordered, next_ordered, ordseq;
457 int orderr, ordcolor; 457 int orderr, ordcolor;
458 struct request pre_flush_rq, bar_rq, post_flush_rq; 458 struct request pre_flush_rq, bar_rq, post_flush_rq;
459 struct request *orig_bar_rq; 459 struct request *orig_bar_rq;
460 460
461 struct mutex sysfs_lock; 461 struct mutex sysfs_lock;
462 462
463 #if defined(CONFIG_BLK_DEV_BSG) 463 #if defined(CONFIG_BLK_DEV_BSG)
464 struct bsg_class_device bsg_dev; 464 struct bsg_class_device bsg_dev;
465 #endif 465 #endif
466 }; 466 };
467 467
468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */ 468 #define QUEUE_FLAG_CLUSTER 0 /* cluster several segments into 1 */
469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */ 469 #define QUEUE_FLAG_QUEUED 1 /* uses generic tag queueing */
470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */ 470 #define QUEUE_FLAG_STOPPED 2 /* queue is stopped */
471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */ 471 #define QUEUE_FLAG_READFULL 3 /* read queue has been filled */
472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */ 472 #define QUEUE_FLAG_WRITEFULL 4 /* write queue has been filled */
473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */ 473 #define QUEUE_FLAG_DEAD 5 /* queue being torn down */
474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */ 474 #define QUEUE_FLAG_REENTER 6 /* Re-entrancy avoidance */
475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */ 475 #define QUEUE_FLAG_PLUGGED 7 /* queue is plugged */
476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */ 476 #define QUEUE_FLAG_ELVSWITCH 8 /* don't use elevator, just do FIFO */
477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */ 477 #define QUEUE_FLAG_BIDI 9 /* queue supports bidi requests */
478 478
479 enum { 479 enum {
480 /* 480 /*
481 * Hardbarrier is supported with one of the following methods. 481 * Hardbarrier is supported with one of the following methods.
482 * 482 *
483 * NONE : hardbarrier unsupported 483 * NONE : hardbarrier unsupported
484 * DRAIN : ordering by draining is enough 484 * DRAIN : ordering by draining is enough
485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes 485 * DRAIN_FLUSH : ordering by draining w/ pre and post flushes
486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write 486 * DRAIN_FUA : ordering by draining w/ pre flush and FUA write
487 * TAG : ordering by tag is enough 487 * TAG : ordering by tag is enough
488 * TAG_FLUSH : ordering by tag w/ pre and post flushes 488 * TAG_FLUSH : ordering by tag w/ pre and post flushes
489 * TAG_FUA : ordering by tag w/ pre flush and FUA write 489 * TAG_FUA : ordering by tag w/ pre flush and FUA write
490 */ 490 */
491 QUEUE_ORDERED_NONE = 0x00, 491 QUEUE_ORDERED_NONE = 0x00,
492 QUEUE_ORDERED_DRAIN = 0x01, 492 QUEUE_ORDERED_DRAIN = 0x01,
493 QUEUE_ORDERED_TAG = 0x02, 493 QUEUE_ORDERED_TAG = 0x02,
494 494
495 QUEUE_ORDERED_PREFLUSH = 0x10, 495 QUEUE_ORDERED_PREFLUSH = 0x10,
496 QUEUE_ORDERED_POSTFLUSH = 0x20, 496 QUEUE_ORDERED_POSTFLUSH = 0x20,
497 QUEUE_ORDERED_FUA = 0x40, 497 QUEUE_ORDERED_FUA = 0x40,
498 498
499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | 499 QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN |
500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 500 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | 501 QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN |
502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 502 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | 503 QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG |
504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, 504 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH,
505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | 505 QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG |
506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, 506 QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA,
507 507
508 /* 508 /*
509 * Ordered operation sequence 509 * Ordered operation sequence
510 */ 510 */
511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */ 511 QUEUE_ORDSEQ_STARTED = 0x01, /* flushing in progress */
512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */ 512 QUEUE_ORDSEQ_DRAIN = 0x02, /* waiting for the queue to be drained */
513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */ 513 QUEUE_ORDSEQ_PREFLUSH = 0x04, /* pre-flushing in progress */
514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */ 514 QUEUE_ORDSEQ_BAR = 0x08, /* original barrier req in progress */
515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */ 515 QUEUE_ORDSEQ_POSTFLUSH = 0x10, /* post-flushing in progress */
516 QUEUE_ORDSEQ_DONE = 0x20, 516 QUEUE_ORDSEQ_DONE = 0x20,
517 }; 517 };
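As a usage sketch (not taken from this patch): a driver for a device with a write-back cache would register one of the modes above at probe time together with a prepare_flush_fn that tags the generated flush requests, for instance with the REQ_LB_OP_FLUSH opcode defined earlier. All my_* names are hypothetical.

/* hypothetical flush callback: mark the flush request so the driver's
 * request_fn can recognize it as a cache-flush command */
static void my_prepare_flush(struct request_queue *q, struct request *rq)
{
	rq->cmd_type = REQ_TYPE_LINUX_BLOCK;
	rq->cmd[0] = REQ_LB_OP_FLUSH;
}

/* hypothetical probe-time setup: order barriers by draining the queue
 * and bracketing the barrier with explicit pre- and post-flushes */
static int my_setup_barriers(struct request_queue *q)
{
	return blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH,
				 my_prepare_flush);
}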
518 518
519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags) 519 #define blk_queue_plugged(q) test_bit(QUEUE_FLAG_PLUGGED, &(q)->queue_flags)
520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags) 520 #define blk_queue_tagged(q) test_bit(QUEUE_FLAG_QUEUED, &(q)->queue_flags)
521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags) 521 #define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
522 #define blk_queue_flushing(q) ((q)->ordseq) 522 #define blk_queue_flushing(q) ((q)->ordseq)
523 523
524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) 524 #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS)
525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) 525 #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC)
526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL) 526 #define blk_special_request(rq) ((rq)->cmd_type == REQ_TYPE_SPECIAL)
527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE) 527 #define blk_sense_request(rq) ((rq)->cmd_type == REQ_TYPE_SENSE)
528 528
529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST) 529 #define blk_noretry_request(rq) ((rq)->cmd_flags & REQ_FAILFAST)
530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED) 530 #define blk_rq_started(rq) ((rq)->cmd_flags & REQ_STARTED)
531 531
532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq)) 532 #define blk_account_rq(rq) (blk_rq_started(rq) && blk_fs_request(rq))
533 533
534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND) 534 #define blk_pm_suspend_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_SUSPEND)
535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME) 535 #define blk_pm_resume_request(rq) ((rq)->cmd_type == REQ_TYPE_PM_RESUME)
536 #define blk_pm_request(rq) \ 536 #define blk_pm_request(rq) \
537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq)) 537 (blk_pm_suspend_request(rq) || blk_pm_resume_request(rq))
538 538
539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED) 539 #define blk_sorted_rq(rq) ((rq)->cmd_flags & REQ_SORTED)
540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER) 540 #define blk_barrier_rq(rq) ((rq)->cmd_flags & REQ_HARDBARRIER)
541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) 541 #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA)
542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) 542 #define blk_bidi_rq(rq) ((rq)->next_rq != NULL)
543 #define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors)
543 544
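blk_empty_barrier() is the test introduced by this change: true for a filesystem barrier request that carries no sectors. Purely as an illustration, for a driver that handles barriers itself rather than via the generic ordered-sequence code, a request_fn might turn such a request straight into a cache flush; my_flush_device_cache and my_start_data_io are hypothetical helpers.

/* illustrative request_fn fragment, queue lock held by the caller */
static void my_barrier_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		blkdev_dequeue_request(rq);

		if (blk_empty_barrier(rq)) {
			my_flush_device_cache();	/* hypothetical */
			end_dequeued_request(rq, 1);	/* done, no data moved */
			continue;
		}

		my_start_data_io(rq);			/* hypothetical */
	}
}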
544 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist) 545 #define list_entry_rq(ptr) list_entry((ptr), struct request, queuelist)
545 546
546 #define rq_data_dir(rq) ((rq)->cmd_flags & 1) 547 #define rq_data_dir(rq) ((rq)->cmd_flags & 1)
547 548
548 /* 549 /*
549 * We regard a request as sync if it's a READ or a SYNC write. 550 * We regard a request as sync if it's a READ or a SYNC write.
550 */ 551 */
551 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC) 552 #define rq_is_sync(rq) (rq_data_dir((rq)) == READ || (rq)->cmd_flags & REQ_RW_SYNC)
552 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META) 553 #define rq_is_meta(rq) ((rq)->cmd_flags & REQ_RW_META)
553 554
554 static inline int blk_queue_full(struct request_queue *q, int rw) 555 static inline int blk_queue_full(struct request_queue *q, int rw)
555 { 556 {
556 if (rw == READ) 557 if (rw == READ)
557 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 558 return test_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
558 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 559 return test_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
559 } 560 }
560 561
561 static inline void blk_set_queue_full(struct request_queue *q, int rw) 562 static inline void blk_set_queue_full(struct request_queue *q, int rw)
562 { 563 {
563 if (rw == READ) 564 if (rw == READ)
564 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 565 set_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
565 else 566 else
566 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 567 set_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
567 } 568 }
568 569
569 static inline void blk_clear_queue_full(struct request_queue *q, int rw) 570 static inline void blk_clear_queue_full(struct request_queue *q, int rw)
570 { 571 {
571 if (rw == READ) 572 if (rw == READ)
572 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags); 573 clear_bit(QUEUE_FLAG_READFULL, &q->queue_flags);
573 else 574 else
574 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags); 575 clear_bit(QUEUE_FLAG_WRITEFULL, &q->queue_flags);
575 } 576 }
576 577
577 578
578 /* 579 /*
579 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may 580 * mergeable request must not have _NOMERGE or _BARRIER bit set, nor may
580 * it already be started by driver. 581 * it already be started by driver.
581 */ 582 */
582 #define RQ_NOMERGE_FLAGS \ 583 #define RQ_NOMERGE_FLAGS \
583 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER) 584 (REQ_NOMERGE | REQ_STARTED | REQ_HARDBARRIER | REQ_SOFTBARRIER)
584 #define rq_mergeable(rq) \ 585 #define rq_mergeable(rq) \
585 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq))) 586 (!((rq)->cmd_flags & RQ_NOMERGE_FLAGS) && blk_fs_request((rq)))
586 587
587 /* 588 /*
588 * q->prep_rq_fn return values 589 * q->prep_rq_fn return values
589 */ 590 */
590 #define BLKPREP_OK 0 /* serve it */ 591 #define BLKPREP_OK 0 /* serve it */
591 #define BLKPREP_KILL 1 /* fatal error, kill */ 592 #define BLKPREP_KILL 1 /* fatal error, kill */
592 #define BLKPREP_DEFER 2 /* leave on queue */ 593 #define BLKPREP_DEFER 2 /* leave on queue */
593 594
594 extern unsigned long blk_max_low_pfn, blk_max_pfn; 595 extern unsigned long blk_max_low_pfn, blk_max_pfn;
595 596
596 /* 597 /*
597 * standard bounce addresses: 598 * standard bounce addresses:
598 * 599 *
599 * BLK_BOUNCE_HIGH : bounce all highmem pages 600 * BLK_BOUNCE_HIGH : bounce all highmem pages
600 * BLK_BOUNCE_ANY : don't bounce anything 601 * BLK_BOUNCE_ANY : don't bounce anything
601 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary 602 * BLK_BOUNCE_ISA : bounce pages above ISA DMA boundary
602 */ 603 */
603 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT) 604 #define BLK_BOUNCE_HIGH ((u64)blk_max_low_pfn << PAGE_SHIFT)
604 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT) 605 #define BLK_BOUNCE_ANY ((u64)blk_max_pfn << PAGE_SHIFT)
605 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD) 606 #define BLK_BOUNCE_ISA (ISA_DMA_THRESHOLD)
606 607
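For context, a driver whose hardware cannot reach highmem picks one of these limits when it sets up its queue; the block layer then bounces any offending pages through blk_queue_bounce() on the submission path. A minimal sketch with a hypothetical function name:

/* hypothetical: restrict DMA to lowmem, bouncing anything above it */
static void my_set_dma_limits(struct request_queue *q)
{
	blk_queue_bounce_limit(q, BLK_BOUNCE_HIGH);
}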
607 /* 608 /*
608 * default timeout for SG_IO if none specified 609 * default timeout for SG_IO if none specified
609 */ 610 */
610 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ) 611 #define BLK_DEFAULT_SG_TIMEOUT (60 * HZ)
611 612
612 #ifdef CONFIG_BOUNCE 613 #ifdef CONFIG_BOUNCE
613 extern int init_emergency_isa_pool(void); 614 extern int init_emergency_isa_pool(void);
614 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); 615 extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
615 #else 616 #else
616 static inline int init_emergency_isa_pool(void) 617 static inline int init_emergency_isa_pool(void)
617 { 618 {
618 return 0; 619 return 0;
619 } 620 }
620 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) 621 static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
621 { 622 {
622 } 623 }
623 #endif /* CONFIG_BOUNCE */ 624 #endif /* CONFIG_BOUNCE */
624 625
625 struct req_iterator { 626 struct req_iterator {
626 int i; 627 int i;
627 struct bio *bio; 628 struct bio *bio;
628 }; 629 };
629 630
630 /* This should not be used directly - use rq_for_each_segment */ 631 /* This should not be used directly - use rq_for_each_segment */
631 #define __rq_for_each_bio(_bio, rq) \ 632 #define __rq_for_each_bio(_bio, rq) \
632 if ((rq->bio)) \ 633 if ((rq->bio)) \
633 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next) 634 for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
634 635
635 #define rq_for_each_segment(bvl, _rq, _iter) \ 636 #define rq_for_each_segment(bvl, _rq, _iter) \
636 __rq_for_each_bio(_iter.bio, _rq) \ 637 __rq_for_each_bio(_iter.bio, _rq) \
637 bio_for_each_segment(bvl, _iter.bio, _iter.i) 638 bio_for_each_segment(bvl, _iter.bio, _iter.i)
638 639
639 #define rq_iter_last(rq, _iter) \ 640 #define rq_iter_last(rq, _iter) \
640 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) 641 (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1)
641 642
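The iterator above is used roughly as follows. This is a sketch that assumes the request's pages are lowmem-mappable (for instance after a bounce limit like the one set earlier); my_copy_segment is a hypothetical per-segment worker.

/* walk every bio_vec of a request, segment by segment */
static void my_transfer_request(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec *bvec;

	rq_for_each_segment(bvec, rq, iter) {
		void *addr = page_address(bvec->bv_page) + bvec->bv_offset;

		my_copy_segment(addr, bvec->bv_len);	/* hypothetical */
	}
}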
642 extern int blk_register_queue(struct gendisk *disk); 643 extern int blk_register_queue(struct gendisk *disk);
643 extern void blk_unregister_queue(struct gendisk *disk); 644 extern void blk_unregister_queue(struct gendisk *disk);
644 extern void register_disk(struct gendisk *dev); 645 extern void register_disk(struct gendisk *dev);
645 extern void generic_make_request(struct bio *bio); 646 extern void generic_make_request(struct bio *bio);
646 extern void blk_put_request(struct request *); 647 extern void blk_put_request(struct request *);
647 extern void __blk_put_request(struct request_queue *, struct request *); 648 extern void __blk_put_request(struct request_queue *, struct request *);
648 extern void blk_end_sync_rq(struct request *rq, int error); 649 extern void blk_end_sync_rq(struct request *rq, int error);
649 extern struct request *blk_get_request(struct request_queue *, int, gfp_t); 650 extern struct request *blk_get_request(struct request_queue *, int, gfp_t);
650 extern void blk_insert_request(struct request_queue *, struct request *, int, void *); 651 extern void blk_insert_request(struct request_queue *, struct request *, int, void *);
651 extern void blk_requeue_request(struct request_queue *, struct request *); 652 extern void blk_requeue_request(struct request_queue *, struct request *);
652 extern void blk_plug_device(struct request_queue *); 653 extern void blk_plug_device(struct request_queue *);
653 extern int blk_remove_plug(struct request_queue *); 654 extern int blk_remove_plug(struct request_queue *);
654 extern void blk_recount_segments(struct request_queue *, struct bio *); 655 extern void blk_recount_segments(struct request_queue *, struct bio *);
655 extern int scsi_cmd_ioctl(struct file *, struct request_queue *, 656 extern int scsi_cmd_ioctl(struct file *, struct request_queue *,
656 struct gendisk *, unsigned int, void __user *); 657 struct gendisk *, unsigned int, void __user *);
657 extern int sg_scsi_ioctl(struct file *, struct request_queue *, 658 extern int sg_scsi_ioctl(struct file *, struct request_queue *,
658 struct gendisk *, struct scsi_ioctl_command __user *); 659 struct gendisk *, struct scsi_ioctl_command __user *);
659 660
660 /* 661 /*
661 * Temporary export, until SCSI gets fixed up. 662 * Temporary export, until SCSI gets fixed up.
662 */ 663 */
663 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq, 664 extern int blk_rq_append_bio(struct request_queue *q, struct request *rq,
664 struct bio *bio); 665 struct bio *bio);
665 666
666 /* 667 /*
667 * A queue has just exited congestion. Note this in the global counter of 668 * A queue has just exited congestion. Note this in the global counter of
668 * congested queues, and wake up anyone who was waiting for requests to be 669 * congested queues, and wake up anyone who was waiting for requests to be
669 * put back. 670 * put back.
670 */ 671 */
671 static inline void blk_clear_queue_congested(struct request_queue *q, int rw) 672 static inline void blk_clear_queue_congested(struct request_queue *q, int rw)
672 { 673 {
673 clear_bdi_congested(&q->backing_dev_info, rw); 674 clear_bdi_congested(&q->backing_dev_info, rw);
674 } 675 }
675 676
676 /* 677 /*
677 * A queue has just entered congestion. Flag that in the queue's VM-visible 678 * A queue has just entered congestion. Flag that in the queue's VM-visible
678 * state flags and increment the global counter of congested queues. 679 * state flags and increment the global counter of congested queues.
679 */ 680 */
680 static inline void blk_set_queue_congested(struct request_queue *q, int rw) 681 static inline void blk_set_queue_congested(struct request_queue *q, int rw)
681 { 682 {
682 set_bdi_congested(&q->backing_dev_info, rw); 683 set_bdi_congested(&q->backing_dev_info, rw);
683 } 684 }
684 685
685 extern void blk_start_queue(struct request_queue *q); 686 extern void blk_start_queue(struct request_queue *q);
686 extern void blk_stop_queue(struct request_queue *q); 687 extern void blk_stop_queue(struct request_queue *q);
687 extern void blk_sync_queue(struct request_queue *q); 688 extern void blk_sync_queue(struct request_queue *q);
688 extern void __blk_stop_queue(struct request_queue *q); 689 extern void __blk_stop_queue(struct request_queue *q);
689 extern void blk_run_queue(struct request_queue *); 690 extern void blk_run_queue(struct request_queue *);
690 extern void blk_start_queueing(struct request_queue *); 691 extern void blk_start_queueing(struct request_queue *);
691 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long); 692 extern int blk_rq_map_user(struct request_queue *, struct request *, void __user *, unsigned long);
692 extern int blk_rq_unmap_user(struct bio *); 693 extern int blk_rq_unmap_user(struct bio *);
693 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t); 694 extern int blk_rq_map_kern(struct request_queue *, struct request *, void *, unsigned int, gfp_t);
694 extern int blk_rq_map_user_iov(struct request_queue *, struct request *, 695 extern int blk_rq_map_user_iov(struct request_queue *, struct request *,
695 struct sg_iovec *, int, unsigned int); 696 struct sg_iovec *, int, unsigned int);
696 extern int blk_execute_rq(struct request_queue *, struct gendisk *, 697 extern int blk_execute_rq(struct request_queue *, struct gendisk *,
697 struct request *, int); 698 struct request *, int);
698 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *, 699 extern void blk_execute_rq_nowait(struct request_queue *, struct gendisk *,
699 struct request *, int, rq_end_io_fn *); 700 struct request *, int, rq_end_io_fn *);
700 extern int blk_verify_command(unsigned char *, int); 701 extern int blk_verify_command(unsigned char *, int);
701 702
702 static inline struct request_queue *bdev_get_queue(struct block_device *bdev) 703 static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
703 { 704 {
704 return bdev->bd_disk->queue; 705 return bdev->bd_disk->queue;
705 } 706 }
706 707
707 static inline void blk_run_backing_dev(struct backing_dev_info *bdi, 708 static inline void blk_run_backing_dev(struct backing_dev_info *bdi,
708 struct page *page) 709 struct page *page)
709 { 710 {
710 if (bdi && bdi->unplug_io_fn) 711 if (bdi && bdi->unplug_io_fn)
711 bdi->unplug_io_fn(bdi, page); 712 bdi->unplug_io_fn(bdi, page);
712 } 713 }
713 714
714 static inline void blk_run_address_space(struct address_space *mapping) 715 static inline void blk_run_address_space(struct address_space *mapping)
715 { 716 {
716 if (mapping) 717 if (mapping)
717 blk_run_backing_dev(mapping->backing_dev_info, NULL); 718 blk_run_backing_dev(mapping->backing_dev_info, NULL);
718 } 719 }
719 720
720 /* 721 /*
721 * end_request() and friends. Must be called with the request queue spinlock 722 * end_request() and friends. Must be called with the request queue spinlock
722 * acquired. All functions called within end_request() _must_be_ atomic. 723 * acquired. All functions called within end_request() _must_be_ atomic.
723 * 724 *
724 * Several drivers define their own end_request and call 725 * Several drivers define their own end_request and call
725 * end_that_request_first() and end_that_request_last() 726 * end_that_request_first() and end_that_request_last()
726 * for parts of the original function. This prevents 727 * for parts of the original function. This prevents
727 * code duplication in drivers. 728 * code duplication in drivers.
728 */ 729 */
729 extern int end_that_request_first(struct request *, int, int); 730 extern int end_that_request_first(struct request *, int, int);
730 extern int end_that_request_chunk(struct request *, int, int); 731 extern int end_that_request_chunk(struct request *, int, int);
731 extern void end_that_request_last(struct request *, int); 732 extern void end_that_request_last(struct request *, int);
732 extern void end_request(struct request *, int); 733 extern void end_request(struct request *, int);
733 extern void end_queued_request(struct request *, int); 734 extern void end_queued_request(struct request *, int);
734 extern void end_dequeued_request(struct request *, int); 735 extern void end_dequeued_request(struct request *, int);
735 extern void blk_complete_request(struct request *); 736 extern void blk_complete_request(struct request *);
736 737
737 /* 738 /*
738 * end_that_request_first/chunk() takes an uptodate argument. we account 739 * end_that_request_first/chunk() takes an uptodate argument. we account
739 * any value <= 0 as an io error. 0 means -EIO for compatibility reasons, 740 * any value <= 0 as an io error. 0 means -EIO for compatibility reasons,
740 * any other < 0 value is the direct error type. An uptodate value of 741 * any other < 0 value is the direct error type. An uptodate value of
741 * 1 indicates successful io completion 742 * 1 indicates successful io completion
742 */ 743 */
743 #define end_io_error(uptodate) (unlikely((uptodate) <= 0)) 744 #define end_io_error(uptodate) (unlikely((uptodate) <= 0))
744 745
745 static inline void blkdev_dequeue_request(struct request *req) 746 static inline void blkdev_dequeue_request(struct request *req)
746 { 747 {
747 elv_dequeue_request(req->q, req); 748 elv_dequeue_request(req->q, req);
748 } 749 }
749 750
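Tying the completion helpers above together: for a request that is still on the queue, a driver-private routine typically finishes all remaining sectors and then retires the request, with the queue lock held as required. my_complete_request is a hypothetical name; end_request() itself follows the same pattern, one hard_cur_sectors chunk at a time.

/* complete the whole request; uptodate follows the convention above
 * (1 = success, <= 0 = error), caller holds the queue lock */
static void my_complete_request(struct request *rq, int uptodate)
{
	if (!end_that_request_first(rq, uptodate, rq->hard_nr_sectors)) {
		blkdev_dequeue_request(rq);
		end_that_request_last(rq, uptodate);
	}
}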
750 /* 751 /*
751 * Access functions for manipulating queue properties 752 * Access functions for manipulating queue properties
752 */ 753 */
753 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn, 754 extern struct request_queue *blk_init_queue_node(request_fn_proc *rfn,
754 spinlock_t *lock, int node_id); 755 spinlock_t *lock, int node_id);
755 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *); 756 extern struct request_queue *blk_init_queue(request_fn_proc *, spinlock_t *);
756 extern void blk_cleanup_queue(struct request_queue *); 757 extern void blk_cleanup_queue(struct request_queue *);
757 extern void blk_queue_make_request(struct request_queue *, make_request_fn *); 758 extern void blk_queue_make_request(struct request_queue *, make_request_fn *);
758 extern void blk_queue_bounce_limit(struct request_queue *, u64); 759 extern void blk_queue_bounce_limit(struct request_queue *, u64);
759 extern void blk_queue_max_sectors(struct request_queue *, unsigned int); 760 extern void blk_queue_max_sectors(struct request_queue *, unsigned int);
760 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); 761 extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short);
761 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); 762 extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short);
762 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); 763 extern void blk_queue_max_segment_size(struct request_queue *, unsigned int);
763 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); 764 extern void blk_queue_hardsect_size(struct request_queue *, unsigned short);
764 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); 765 extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b);
765 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long); 766 extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
766 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn); 767 extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
767 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); 768 extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
768 extern void blk_queue_dma_alignment(struct request_queue *, int); 769 extern void blk_queue_dma_alignment(struct request_queue *, int);
769 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); 770 extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *);
770 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); 771 extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
771 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); 772 extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *);
772 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *); 773 extern void blk_queue_issue_flush_fn(struct request_queue *, issue_flush_fn *);
773 extern int blk_do_ordered(struct request_queue *, struct request **); 774 extern int blk_do_ordered(struct request_queue *, struct request **);
774 extern unsigned blk_ordered_cur_seq(struct request_queue *); 775 extern unsigned blk_ordered_cur_seq(struct request_queue *);
775 extern unsigned blk_ordered_req_seq(struct request *); 776 extern unsigned blk_ordered_req_seq(struct request *);
776 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); 777 extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int);
777 778
778 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); 779 extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
779 extern void blk_dump_rq_flags(struct request *, char *); 780 extern void blk_dump_rq_flags(struct request *, char *);
780 extern void generic_unplug_device(struct request_queue *); 781 extern void generic_unplug_device(struct request_queue *);
781 extern void __generic_unplug_device(struct request_queue *); 782 extern void __generic_unplug_device(struct request_queue *);
782 extern long nr_blockdev_pages(void); 783 extern long nr_blockdev_pages(void);
783 784
784 int blk_get_queue(struct request_queue *); 785 int blk_get_queue(struct request_queue *);
785 struct request_queue *blk_alloc_queue(gfp_t); 786 struct request_queue *blk_alloc_queue(gfp_t);
786 struct request_queue *blk_alloc_queue_node(gfp_t, int); 787 struct request_queue *blk_alloc_queue_node(gfp_t, int);
787 extern void blk_put_queue(struct request_queue *); 788 extern void blk_put_queue(struct request_queue *);
788 789
789 /* 790 /*
790 * tag stuff 791 * tag stuff
791 */ 792 */
792 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy) 793 #define blk_queue_tag_depth(q) ((q)->queue_tags->busy)
793 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth) 794 #define blk_queue_tag_queue(q) ((q)->queue_tags->busy < (q)->queue_tags->max_depth)
794 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED) 795 #define blk_rq_tagged(rq) ((rq)->cmd_flags & REQ_QUEUED)
795 extern int blk_queue_start_tag(struct request_queue *, struct request *); 796 extern int blk_queue_start_tag(struct request_queue *, struct request *);
796 extern struct request *blk_queue_find_tag(struct request_queue *, int); 797 extern struct request *blk_queue_find_tag(struct request_queue *, int);
797 extern void blk_queue_end_tag(struct request_queue *, struct request *); 798 extern void blk_queue_end_tag(struct request_queue *, struct request *);
798 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *); 799 extern int blk_queue_init_tags(struct request_queue *, int, struct blk_queue_tag *);
799 extern void blk_queue_free_tags(struct request_queue *); 800 extern void blk_queue_free_tags(struct request_queue *);
800 extern int blk_queue_resize_tags(struct request_queue *, int); 801 extern int blk_queue_resize_tags(struct request_queue *, int);
801 extern void blk_queue_invalidate_tags(struct request_queue *); 802 extern void blk_queue_invalidate_tags(struct request_queue *);
802 extern struct blk_queue_tag *blk_init_tags(int); 803 extern struct blk_queue_tag *blk_init_tags(int);
803 extern void blk_free_tags(struct blk_queue_tag *); 804 extern void blk_free_tags(struct blk_queue_tag *);
804 805
805 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt, 806 static inline struct request *blk_map_queue_find_tag(struct blk_queue_tag *bqt,
806 int tag) 807 int tag)
807 { 808 {
808 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth)) 809 if (unlikely(bqt == NULL || tag >= bqt->real_max_depth))
809 return NULL; 810 return NULL;
810 return bqt->tag_index[tag]; 811 return bqt->tag_index[tag];
811 } 812 }
812 813
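As a usage sketch of the tagging interface (hypothetical names, not from this patch): after a one-time blk_queue_init_tags(q, depth, NULL) at setup, the request_fn asks for a tag per request before handing it to the hardware; blk_queue_start_tag() assigns rq->tag and dequeues the request on success.

/* hypothetical request_fn for hardware that completes commands by tag */
static void my_tag_request_fn(struct request_queue *q)
{
	struct request *rq;

	while ((rq = elv_next_request(q)) != NULL) {
		/* non-zero return: no free tag right now, retry later */
		if (blk_queue_start_tag(q, rq))
			break;

		my_hw_submit(rq);	/* hypothetical; rq->tag is valid here */
	}
}

On completion the driver calls blk_queue_end_tag() with the queue lock held, then finishes the request as sketched earlier.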
813 extern int blkdev_issue_flush(struct block_device *, sector_t *); 814 extern int blkdev_issue_flush(struct block_device *, sector_t *);
814 815
815 #define MAX_PHYS_SEGMENTS 128 816 #define MAX_PHYS_SEGMENTS 128
816 #define MAX_HW_SEGMENTS 128 817 #define MAX_HW_SEGMENTS 128
817 #define SAFE_MAX_SECTORS 255 818 #define SAFE_MAX_SECTORS 255
818 #define BLK_DEF_MAX_SECTORS 1024 819 #define BLK_DEF_MAX_SECTORS 1024
819 820
820 #define MAX_SEGMENT_SIZE 65536 821 #define MAX_SEGMENT_SIZE 65536
821 822
822 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) 823 #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist)
823 824
824 static inline int queue_hardsect_size(struct request_queue *q) 825 static inline int queue_hardsect_size(struct request_queue *q)
825 { 826 {
826 int retval = 512; 827 int retval = 512;
827 828
828 if (q && q->hardsect_size) 829 if (q && q->hardsect_size)
829 retval = q->hardsect_size; 830 retval = q->hardsect_size;
830 831
831 return retval; 832 return retval;
832 } 833 }
833 834
834 static inline int bdev_hardsect_size(struct block_device *bdev) 835 static inline int bdev_hardsect_size(struct block_device *bdev)
835 { 836 {
836 return queue_hardsect_size(bdev_get_queue(bdev)); 837 return queue_hardsect_size(bdev_get_queue(bdev));
837 } 838 }
838 839
839 static inline int queue_dma_alignment(struct request_queue *q) 840 static inline int queue_dma_alignment(struct request_queue *q)
840 { 841 {
841 int retval = 511; 842 int retval = 511;
842 843
843 if (q && q->dma_alignment) 844 if (q && q->dma_alignment)
844 retval = q->dma_alignment; 845 retval = q->dma_alignment;
845 846
846 return retval; 847 return retval;
847 } 848 }
848 849
849 /* assumes size > 256 */ 850 /* assumes size > 256 */
850 static inline unsigned int blksize_bits(unsigned int size) 851 static inline unsigned int blksize_bits(unsigned int size)
851 { 852 {
852 unsigned int bits = 8; 853 unsigned int bits = 8;
853 do { 854 do {
854 bits++; 855 bits++;
855 size >>= 1; 856 size >>= 1;
856 } while (size > 256); 857 } while (size > 256);
857 return bits; 858 return bits;
858 } 859 }
859 860
860 static inline unsigned int block_size(struct block_device *bdev) 861 static inline unsigned int block_size(struct block_device *bdev)
861 { 862 {
862 return bdev->bd_block_size; 863 return bdev->bd_block_size;
863 } 864 }
864 865
865 typedef struct {struct page *v;} Sector; 866 typedef struct {struct page *v;} Sector;
866 867
867 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *); 868 unsigned char *read_dev_sector(struct block_device *, sector_t, Sector *);
868 869
869 static inline void put_dev_sector(Sector p) 870 static inline void put_dev_sector(Sector p)
870 { 871 {
871 page_cache_release(p.v); 872 page_cache_release(p.v);
872 } 873 }
873 874
874 struct work_struct; 875 struct work_struct;
875 int kblockd_schedule_work(struct work_struct *work); 876 int kblockd_schedule_work(struct work_struct *work);
876 void kblockd_flush_work(struct work_struct *work); 877 void kblockd_flush_work(struct work_struct *work);
877 878
878 #define MODULE_ALIAS_BLOCKDEV(major,minor) \ 879 #define MODULE_ALIAS_BLOCKDEV(major,minor) \
879 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) 880 MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor))
880 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \ 881 #define MODULE_ALIAS_BLOCKDEV_MAJOR(major) \
881 MODULE_ALIAS("block-major-" __stringify(major) "-*") 882 MODULE_ALIAS("block-major-" __stringify(major) "-*")
882 883
883 884
884 #else /* CONFIG_BLOCK */ 885 #else /* CONFIG_BLOCK */
885 /* 886 /*
886 * stubs for when the block layer is configured out 887 * stubs for when the block layer is configured out
887 */ 888 */
888 #define buffer_heads_over_limit 0 889 #define buffer_heads_over_limit 0
889 890
890 static inline long nr_blockdev_pages(void) 891 static inline long nr_blockdev_pages(void)
891 { 892 {
892 return 0; 893 return 0;
893 } 894 }
894 895
895 static inline void exit_io_context(void) 896 static inline void exit_io_context(void)
896 { 897 {
897 } 898 }
898 899
899 #endif /* CONFIG_BLOCK */ 900 #endif /* CONFIG_BLOCK */
900 901
901 #endif 902 #endif
902 903
1 /* bounce buffer handling for block devices 1 /* bounce buffer handling for block devices
2 * 2 *
3 * - Split from highmem.c 3 * - Split from highmem.c
4 */ 4 */
5 5
6 #include <linux/mm.h> 6 #include <linux/mm.h>
7 #include <linux/module.h> 7 #include <linux/module.h>
8 #include <linux/swap.h> 8 #include <linux/swap.h>
9 #include <linux/bio.h> 9 #include <linux/bio.h>
10 #include <linux/pagemap.h> 10 #include <linux/pagemap.h>
11 #include <linux/mempool.h> 11 #include <linux/mempool.h>
12 #include <linux/blkdev.h> 12 #include <linux/blkdev.h>
13 #include <linux/init.h> 13 #include <linux/init.h>
14 #include <linux/hash.h> 14 #include <linux/hash.h>
15 #include <linux/highmem.h> 15 #include <linux/highmem.h>
16 #include <linux/blktrace_api.h> 16 #include <linux/blktrace_api.h>
17 #include <asm/tlbflush.h> 17 #include <asm/tlbflush.h>
18 18
19 #define POOL_SIZE 64 19 #define POOL_SIZE 64
20 #define ISA_POOL_SIZE 16 20 #define ISA_POOL_SIZE 16
21 21
22 static mempool_t *page_pool, *isa_page_pool; 22 static mempool_t *page_pool, *isa_page_pool;
23 23
24 #ifdef CONFIG_HIGHMEM 24 #ifdef CONFIG_HIGHMEM
25 static __init int init_emergency_pool(void) 25 static __init int init_emergency_pool(void)
26 { 26 {
27 struct sysinfo i; 27 struct sysinfo i;
28 si_meminfo(&i); 28 si_meminfo(&i);
29 si_swapinfo(&i); 29 si_swapinfo(&i);
30 30
31 if (!i.totalhigh) 31 if (!i.totalhigh)
32 return 0; 32 return 0;
33 33
34 page_pool = mempool_create_page_pool(POOL_SIZE, 0); 34 page_pool = mempool_create_page_pool(POOL_SIZE, 0);
35 BUG_ON(!page_pool); 35 BUG_ON(!page_pool);
36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE); 36 printk("highmem bounce pool size: %d pages\n", POOL_SIZE);
37 37
38 return 0; 38 return 0;
39 } 39 }
40 40
41 __initcall(init_emergency_pool); 41 __initcall(init_emergency_pool);
42 42
43 /* 43 /*
44 * highmem version, map in to vec 44 * highmem version, map in to vec
45 */ 45 */
46 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) 46 static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
47 { 47 {
48 unsigned long flags; 48 unsigned long flags;
49 unsigned char *vto; 49 unsigned char *vto;
50 50
51 local_irq_save(flags); 51 local_irq_save(flags);
52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ); 52 vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
53 memcpy(vto + to->bv_offset, vfrom, to->bv_len); 53 memcpy(vto + to->bv_offset, vfrom, to->bv_len);
54 kunmap_atomic(vto, KM_BOUNCE_READ); 54 kunmap_atomic(vto, KM_BOUNCE_READ);
55 local_irq_restore(flags); 55 local_irq_restore(flags);
56 } 56 }
57 57
58 #else /* CONFIG_HIGHMEM */ 58 #else /* CONFIG_HIGHMEM */
59 59
60 #define bounce_copy_vec(to, vfrom) \ 60 #define bounce_copy_vec(to, vfrom) \
61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) 61 memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
62 62
63 #endif /* CONFIG_HIGHMEM */ 63 #endif /* CONFIG_HIGHMEM */
64 64
65 /* 65 /*
66 * allocate pages in the DMA region for the ISA pool 66 * allocate pages in the DMA region for the ISA pool
67 */ 67 */
68 static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) 68 static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
69 { 69 {
70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data); 70 return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
71 } 71 }
72 72
73 /* 73 /*
74 * gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA 74 * gets called "every" time someone initializes a queue with BLK_BOUNCE_ISA
75 * as the max address, so check if the pool has already been created. 75 * as the max address, so check if the pool has already been created.
76 */ 76 */
77 int init_emergency_isa_pool(void) 77 int init_emergency_isa_pool(void)
78 { 78 {
79 if (isa_page_pool) 79 if (isa_page_pool)
80 return 0; 80 return 0;
81 81
82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa, 82 isa_page_pool = mempool_create(ISA_POOL_SIZE, mempool_alloc_pages_isa,
83 mempool_free_pages, (void *) 0); 83 mempool_free_pages, (void *) 0);
84 BUG_ON(!isa_page_pool); 84 BUG_ON(!isa_page_pool);
85 85
86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE); 86 printk("isa bounce pool size: %d pages\n", ISA_POOL_SIZE);
87 return 0; 87 return 0;
88 } 88 }
89 89
90 /* 90 /*
91 * Simple bounce buffer support for highmem pages. Depending on the 91 * Simple bounce buffer support for highmem pages. Depending on the
92 * queue gfp mask set, *to may or may not be a highmem page. Always 92 * queue gfp mask set, *to may or may not be a highmem page. Always
93 * kmap it; it will do the Right Thing. 93 * kmap it; it will do the Right Thing.
94 */ 94 */
95 static void copy_to_high_bio_irq(struct bio *to, struct bio *from) 95 static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
96 { 96 {
97 unsigned char *vfrom; 97 unsigned char *vfrom;
98 struct bio_vec *tovec, *fromvec; 98 struct bio_vec *tovec, *fromvec;
99 int i; 99 int i;
100 100
101 __bio_for_each_segment(tovec, to, i, 0) { 101 __bio_for_each_segment(tovec, to, i, 0) {
102 fromvec = from->bi_io_vec + i; 102 fromvec = from->bi_io_vec + i;
103 103
104 /* 104 /*
105 * not bounced 105 * not bounced
106 */ 106 */
107 if (tovec->bv_page == fromvec->bv_page) 107 if (tovec->bv_page == fromvec->bv_page)
108 continue; 108 continue;
109 109
110 /* 110 /*
111 * fromvec->bv_offset and fromvec->bv_len might have been 111 * fromvec->bv_offset and fromvec->bv_len might have been
112 * modified by the block layer, so use the original copy, 112 * modified by the block layer, so use the original copy,
113 * bounce_copy_vec already uses tovec->bv_len 113 * bounce_copy_vec already uses tovec->bv_len
114 */ 114 */
115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset; 115 vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
116 116
117 flush_dcache_page(tovec->bv_page); 117 flush_dcache_page(tovec->bv_page);
118 bounce_copy_vec(tovec, vfrom); 118 bounce_copy_vec(tovec, vfrom);
119 } 119 }
120 } 120 }
121 121
122 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err) 122 static void bounce_end_io(struct bio *bio, mempool_t *pool, int err)
123 { 123 {
124 struct bio *bio_orig = bio->bi_private; 124 struct bio *bio_orig = bio->bi_private;
125 struct bio_vec *bvec, *org_vec; 125 struct bio_vec *bvec, *org_vec;
126 int i; 126 int i;
127 127
128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags)) 128 if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags); 129 set_bit(BIO_EOPNOTSUPP, &bio_orig->bi_flags);
130 130
131 /* 131 /*
132 * free up bounce indirect pages used 132 * free up bounce indirect pages used
133 */ 133 */
134 __bio_for_each_segment(bvec, bio, i, 0) { 134 __bio_for_each_segment(bvec, bio, i, 0) {
135 org_vec = bio_orig->bi_io_vec + i; 135 org_vec = bio_orig->bi_io_vec + i;
136 if (bvec->bv_page == org_vec->bv_page) 136 if (bvec->bv_page == org_vec->bv_page)
137 continue; 137 continue;
138 138
139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE); 139 dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
140 mempool_free(bvec->bv_page, pool); 140 mempool_free(bvec->bv_page, pool);
141 } 141 }
142 142
143 bio_endio(bio_orig, err); 143 bio_endio(bio_orig, err);
144 bio_put(bio); 144 bio_put(bio);
145 } 145 }
146 146
147 static void bounce_end_io_write(struct bio *bio, int err) 147 static void bounce_end_io_write(struct bio *bio, int err)
148 { 148 {
149 bounce_end_io(bio, page_pool, err); 149 bounce_end_io(bio, page_pool, err);
150 } 150 }
151 151
152 static void bounce_end_io_write_isa(struct bio *bio, int err) 152 static void bounce_end_io_write_isa(struct bio *bio, int err)
153 { 153 {
154 154
155 bounce_end_io(bio, isa_page_pool, err); 155 bounce_end_io(bio, isa_page_pool, err);
156 } 156 }
157 157
158 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err) 158 static void __bounce_end_io_read(struct bio *bio, mempool_t *pool, int err)
159 { 159 {
160 struct bio *bio_orig = bio->bi_private; 160 struct bio *bio_orig = bio->bi_private;
161 161
162 if (test_bit(BIO_UPTODATE, &bio->bi_flags)) 162 if (test_bit(BIO_UPTODATE, &bio->bi_flags))
163 copy_to_high_bio_irq(bio_orig, bio); 163 copy_to_high_bio_irq(bio_orig, bio);
164 164
165 bounce_end_io(bio, pool, err); 165 bounce_end_io(bio, pool, err);
166 } 166 }
167 167
168 static void bounce_end_io_read(struct bio *bio, int err) 168 static void bounce_end_io_read(struct bio *bio, int err)
169 { 169 {
170 __bounce_end_io_read(bio, page_pool, err); 170 __bounce_end_io_read(bio, page_pool, err);
171 } 171 }
172 172
173 static void bounce_end_io_read_isa(struct bio *bio, int err) 173 static void bounce_end_io_read_isa(struct bio *bio, int err)
174 { 174 {
175 __bounce_end_io_read(bio, isa_page_pool, err); 175 __bounce_end_io_read(bio, isa_page_pool, err);
176 } 176 }
177 177
178 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, 178 static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
179 mempool_t *pool) 179 mempool_t *pool)
180 { 180 {
181 struct page *page; 181 struct page *page;
182 struct bio *bio = NULL; 182 struct bio *bio = NULL;
183 int i, rw = bio_data_dir(*bio_orig); 183 int i, rw = bio_data_dir(*bio_orig);
184 struct bio_vec *to, *from; 184 struct bio_vec *to, *from;
185 185
186 bio_for_each_segment(from, *bio_orig, i) { 186 bio_for_each_segment(from, *bio_orig, i) {
187 page = from->bv_page; 187 page = from->bv_page;
188 188
189 /* 189 /*
190 * is destination page below bounce pfn? 190 * is destination page below bounce pfn?
191 */ 191 */
192 if (page_to_pfn(page) <= q->bounce_pfn) 192 if (page_to_pfn(page) <= q->bounce_pfn)
193 continue; 193 continue;
194 194
195 /* 195 /*
196 * irk, bounce it 196 * irk, bounce it
197 */ 197 */
198 if (!bio) 198 if (!bio)
199 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt); 199 bio = bio_alloc(GFP_NOIO, (*bio_orig)->bi_vcnt);
200 200
201 to = bio->bi_io_vec + i; 201 to = bio->bi_io_vec + i;
202 202
203 to->bv_page = mempool_alloc(pool, q->bounce_gfp); 203 to->bv_page = mempool_alloc(pool, q->bounce_gfp);
204 to->bv_len = from->bv_len; 204 to->bv_len = from->bv_len;
205 to->bv_offset = from->bv_offset; 205 to->bv_offset = from->bv_offset;
206 inc_zone_page_state(to->bv_page, NR_BOUNCE); 206 inc_zone_page_state(to->bv_page, NR_BOUNCE);
207 207
208 if (rw == WRITE) { 208 if (rw == WRITE) {
209 char *vto, *vfrom; 209 char *vto, *vfrom;
210 210
211 flush_dcache_page(from->bv_page); 211 flush_dcache_page(from->bv_page);
212 vto = page_address(to->bv_page) + to->bv_offset; 212 vto = page_address(to->bv_page) + to->bv_offset;
213 vfrom = kmap(from->bv_page) + from->bv_offset; 213 vfrom = kmap(from->bv_page) + from->bv_offset;
214 memcpy(vto, vfrom, to->bv_len); 214 memcpy(vto, vfrom, to->bv_len);
215 kunmap(from->bv_page); 215 kunmap(from->bv_page);
216 } 216 }
217 } 217 }
218 218
219 /* 219 /*
220 * no pages bounced 220 * no pages bounced
221 */ 221 */
222 if (!bio) 222 if (!bio)
223 return; 223 return;
224 224
225 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE); 225 blk_add_trace_bio(q, *bio_orig, BLK_TA_BOUNCE);
226 226
227 /* 227 /*
228 * at least one page was bounced, fill in possible non-highmem 228 * at least one page was bounced, fill in possible non-highmem
229 * pages 229 * pages
230 */ 230 */
231 __bio_for_each_segment(from, *bio_orig, i, 0) { 231 __bio_for_each_segment(from, *bio_orig, i, 0) {
232 to = bio_iovec_idx(bio, i); 232 to = bio_iovec_idx(bio, i);
233 if (!to->bv_page) { 233 if (!to->bv_page) {
234 to->bv_page = from->bv_page; 234 to->bv_page = from->bv_page;
235 to->bv_len = from->bv_len; 235 to->bv_len = from->bv_len;
236 to->bv_offset = from->bv_offset; 236 to->bv_offset = from->bv_offset;
237 } 237 }
238 } 238 }
239 239
240 bio->bi_bdev = (*bio_orig)->bi_bdev; 240 bio->bi_bdev = (*bio_orig)->bi_bdev;
241 bio->bi_flags |= (1 << BIO_BOUNCED); 241 bio->bi_flags |= (1 << BIO_BOUNCED);
242 bio->bi_sector = (*bio_orig)->bi_sector; 242 bio->bi_sector = (*bio_orig)->bi_sector;
243 bio->bi_rw = (*bio_orig)->bi_rw; 243 bio->bi_rw = (*bio_orig)->bi_rw;
244 244
245 bio->bi_vcnt = (*bio_orig)->bi_vcnt; 245 bio->bi_vcnt = (*bio_orig)->bi_vcnt;
246 bio->bi_idx = (*bio_orig)->bi_idx; 246 bio->bi_idx = (*bio_orig)->bi_idx;
247 bio->bi_size = (*bio_orig)->bi_size; 247 bio->bi_size = (*bio_orig)->bi_size;
248 248
249 if (pool == page_pool) { 249 if (pool == page_pool) {
250 bio->bi_end_io = bounce_end_io_write; 250 bio->bi_end_io = bounce_end_io_write;
251 if (rw == READ) 251 if (rw == READ)
252 bio->bi_end_io = bounce_end_io_read; 252 bio->bi_end_io = bounce_end_io_read;
253 } else { 253 } else {
254 bio->bi_end_io = bounce_end_io_write_isa; 254 bio->bi_end_io = bounce_end_io_write_isa;
255 if (rw == READ) 255 if (rw == READ)
256 bio->bi_end_io = bounce_end_io_read_isa; 256 bio->bi_end_io = bounce_end_io_read_isa;
257 } 257 }
258 258
259 bio->bi_private = *bio_orig; 259 bio->bi_private = *bio_orig;
260 *bio_orig = bio; 260 *bio_orig = bio;
261 } 261 }
262 262
263 void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) 263 void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
264 { 264 {
265 mempool_t *pool; 265 mempool_t *pool;
266 266
267 /* 267 /*
268 * Data-less bio, nothing to bounce
269 */
270 if (bio_empty_barrier(*bio_orig))
271 return;
272
273 /*
268 * for non-isa bounce case, just check if the bounce pfn is equal 274 * for non-isa bounce case, just check if the bounce pfn is equal
269 * to or bigger than the highest pfn in the system -- in that case, 275 * to or bigger than the highest pfn in the system -- in that case,
270 * don't waste time iterating over bio segments 276 * don't waste time iterating over bio segments
271 */ 277 */
272 if (!(q->bounce_gfp & GFP_DMA)) { 278 if (!(q->bounce_gfp & GFP_DMA)) {
273 if (q->bounce_pfn >= blk_max_pfn) 279 if (q->bounce_pfn >= blk_max_pfn)
274 return; 280 return;
275 pool = page_pool; 281 pool = page_pool;
276 } else { 282 } else {
277 BUG_ON(!isa_page_pool); 283 BUG_ON(!isa_page_pool);
278 pool = isa_page_pool; 284 pool = isa_page_pool;
279 } 285 }
280 286
281 /* 287 /*
282 * slow path 288 * slow path
283 */ 289 */
284 __blk_queue_bounce(q, bio_orig, pool); 290 __blk_queue_bounce(q, bio_orig, pool);
285 } 291 }
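The check added at the top of blk_queue_bounce() is the part of this commit that matters in this file: an empty barrier carries no data pages, so there is nothing to bounce and the bio is passed through untouched. bio_empty_barrier() is introduced elsewhere in this commit; assuming it simply tests for a barrier bio with no payload, it would be roughly:

/*
 * Assumed shape of the new helper (defined in the bio header, not in
 * this file): a barrier request that carries no data.
 */
#define bio_empty_barrier(bio)	(bio_barrier(bio) && !(bio)->bi_size)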
286 292
287 EXPORT_SYMBOL(blk_queue_bounce); 293 EXPORT_SYMBOL(blk_queue_bounce);
288 294
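For context, blk_queue_bounce() is called while a bio is being turned into a request, before any DMA mapping; the bio pointer is passed by reference so it can be swapped for the bounce clone. A sketch of a hypothetical call site (the function name is illustrative; the pattern matches how the exported function above is meant to be used):

#include <linux/blkdev.h>
#include <linux/bio.h>

static int example_make_request(struct request_queue *q, struct bio *bio)
{
	/*
	 * May replace bio with a bounced clone whose pages all sit at or
	 * below q->bounce_pfn; the clone's end_io copies data back for
	 * reads and frees the bounce pages on completion.
	 */
	blk_queue_bounce(q, &bio);

	/* continue building/queueing the request with the (possibly new) bio */
	return 0;
}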