Commit b3c9dd182ed3bdcdaf0e42625a35924b0497afdc

Authored by Linus Torvalds

Merge branch 'for-3.3/core' of git://git.kernel.dk/linux-block

* 'for-3.3/core' of git://git.kernel.dk/linux-block: (37 commits)
  Revert "block: recursive merge requests"
  block: Stop using macro stubs for the bio data integrity calls
  blockdev: convert some macros to static inlines
  fs: remove unneeded plug in mpage_readpages()
  block: Add BLKROTATIONAL ioctl
  block: Introduce blk_set_stacking_limits function
  block: remove WARN_ON_ONCE() in exit_io_context()
  block: an exiting task should be allowed to create io_context
  block: ioc_cgroup_changed() needs to be exported
  block: recursive merge requests
  block, cfq: fix empty queue crash caused by request merge
  block, cfq: move icq creation and rq->elv.icq association to block core
  block, cfq: restructure io_cq creation path for io_context interface cleanup
  block, cfq: move io_cq exit/release to blk-ioc.c
  block, cfq: move icq cache management to block core
  block, cfq: move io_cq lookup to blk-ioc.c
  block, cfq: move cfqd->icq_list to request_queue and add request->elv.icq
  block, cfq: reorganize cfq_io_context into generic and cfq specific parts
  block: remove elevator_queue->ops
  block: reorder elevator switch sequence
  ...

Fix up conflicts in:
 - block/blk-cgroup.c
	Switch from can_attach_task to can_attach
 - block/cfq-iosched.c
	conflict with now removed cic index changes (we now use q->id instead)

28 changed files (side-by-side diff excerpt below):

block/blk-cgroup.c
... ... @@ -1655,11 +1655,12 @@
1655 1655 struct io_context *ioc;
1656 1656  
1657 1657 cgroup_taskset_for_each(task, cgrp, tset) {
1658   - task_lock(task);
1659   - ioc = task->io_context;
1660   - if (ioc)
1661   - ioc->cgroup_changed = 1;
1662   - task_unlock(task);
  1658 + /* we don't lose anything even if ioc allocation fails */
  1659 + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
  1660 + if (ioc) {
  1661 + ioc_cgroup_changed(ioc);
  1662 + put_io_context(ioc, NULL);
  1663 + }
1663 1664 }
1664 1665 }
1665 1666  
block/blk-core.c
... ... @@ -39,6 +39,8 @@
39 39 EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
40 40 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
41 41  
  42 +DEFINE_IDA(blk_queue_ida);
  43 +
42 44 /*
43 45 * For the allocated request tables
44 46 */
... ... @@ -358,7 +360,8 @@
358 360 void blk_drain_queue(struct request_queue *q, bool drain_all)
359 361 {
360 362 while (true) {
361   - int nr_rqs;
  363 + bool drain = false;
  364 + int i;
362 365  
363 366 spin_lock_irq(q->queue_lock);
364 367  
... ... @@ -375,14 +378,25 @@
375 378 if (!list_empty(&q->queue_head))
376 379 __blk_run_queue(q);
377 380  
378   - if (drain_all)
379   - nr_rqs = q->rq.count[0] + q->rq.count[1];
380   - else
381   - nr_rqs = q->rq.elvpriv;
  381 + drain |= q->rq.elvpriv;
382 382  
  383 + /*
  384 + * Unfortunately, requests are queued at and tracked from
  385 + * multiple places and there's no single counter which can
  386 + * be drained. Check all the queues and counters.
  387 + */
  388 + if (drain_all) {
  389 + drain |= !list_empty(&q->queue_head);
  390 + for (i = 0; i < 2; i++) {
  391 + drain |= q->rq.count[i];
  392 + drain |= q->in_flight[i];
  393 + drain |= !list_empty(&q->flush_queue[i]);
  394 + }
  395 + }
  396 +
383 397 spin_unlock_irq(q->queue_lock);
384 398  
385   - if (!nr_rqs)
  399 + if (!drain)
386 400 break;
387 401 msleep(10);
388 402 }
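
The loop above has to poll several independent counters because pending work is tracked in multiple places. As a point of reference, here is a minimal sketch of how a teardown path could drive it once the queue has been marked dead; my_cleanup_queue() is an illustrative name, blk_drain_queue() itself is block-internal, and the real cleanup path does additional work:

	/* Illustrative only: refuse new requests, then wait for the drain. */
	static void my_cleanup_queue(struct request_queue *q)
	{
		spin_lock_irq(q->queue_lock);
		queue_flag_set(QUEUE_FLAG_DEAD, q);	/* blk_queue_dead(q) is now true */
		spin_unlock_irq(q->queue_lock);

		/* drain_all == true: also waits on rq counts, in-flight and flush queues */
		blk_drain_queue(q, true);
	}
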
... ... @@ -469,6 +483,10 @@
469 483 if (!q)
470 484 return NULL;
471 485  
  486 + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
  487 + if (q->id < 0)
  488 + goto fail_q;
  489 +
472 490 q->backing_dev_info.ra_pages =
473 491 (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
474 492 q->backing_dev_info.state = 0;
... ... @@ -477,20 +495,17 @@
477 495 q->node = node_id;
478 496  
479 497 err = bdi_init(&q->backing_dev_info);
480   - if (err) {
481   - kmem_cache_free(blk_requestq_cachep, q);
482   - return NULL;
483   - }
  498 + if (err)
  499 + goto fail_id;
484 500  
485   - if (blk_throtl_init(q)) {
486   - kmem_cache_free(blk_requestq_cachep, q);
487   - return NULL;
488   - }
  501 + if (blk_throtl_init(q))
  502 + goto fail_id;
489 503  
490 504 setup_timer(&q->backing_dev_info.laptop_mode_wb_timer,
491 505 laptop_mode_timer_fn, (unsigned long) q);
492 506 setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q);
493 507 INIT_LIST_HEAD(&q->timeout_list);
  508 + INIT_LIST_HEAD(&q->icq_list);
494 509 INIT_LIST_HEAD(&q->flush_queue[0]);
495 510 INIT_LIST_HEAD(&q->flush_queue[1]);
496 511 INIT_LIST_HEAD(&q->flush_data_in_flight);
... ... @@ -508,6 +523,12 @@
508 523 q->queue_lock = &q->__queue_lock;
509 524  
510 525 return q;
  526 +
  527 +fail_id:
  528 + ida_simple_remove(&blk_queue_ida, q->id);
  529 +fail_q:
  530 + kmem_cache_free(blk_requestq_cachep, q);
  531 + return NULL;
511 532 }
512 533 EXPORT_SYMBOL(blk_alloc_queue_node);
513 534  
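
The id allocated above comes from the new blk_queue_ida and is what the icq code further down keys its radix trees on, replacing cfq's old cic index as noted in the conflict fix-up. A minimal sketch of the ida_simple_get()/ida_simple_remove() pairing, with illustrative my_* names; in the real tree the release belongs in the queue teardown path:

	static DEFINE_IDA(my_ida);		/* mirrors blk_queue_ida */

	static int my_assign_id(void)
	{
		/* smallest free id >= 0, or a negative errno on failure */
		return ida_simple_get(&my_ida, 0, 0, GFP_KERNEL);
	}

	static void my_release_id(int id)
	{
		ida_simple_remove(&my_ida, id);
	}
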
... ... @@ -605,26 +626,31 @@
605 626 }
606 627 EXPORT_SYMBOL(blk_init_allocated_queue);
607 628  
608   -int blk_get_queue(struct request_queue *q)
  629 +bool blk_get_queue(struct request_queue *q)
609 630 {
610   - if (likely(!test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
611   - kobject_get(&q->kobj);
612   - return 0;
  631 + if (likely(!blk_queue_dead(q))) {
  632 + __blk_get_queue(q);
  633 + return true;
613 634 }
614 635  
615   - return 1;
  636 + return false;
616 637 }
617 638 EXPORT_SYMBOL(blk_get_queue);
618 639  
619 640 static inline void blk_free_request(struct request_queue *q, struct request *rq)
620 641 {
621   - if (rq->cmd_flags & REQ_ELVPRIV)
  642 + if (rq->cmd_flags & REQ_ELVPRIV) {
622 643 elv_put_request(q, rq);
  644 + if (rq->elv.icq)
  645 + put_io_context(rq->elv.icq->ioc, q);
  646 + }
  647 +
623 648 mempool_free(rq, q->rq.rq_pool);
624 649 }
625 650  
626 651 static struct request *
627   -blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
  652 +blk_alloc_request(struct request_queue *q, struct io_cq *icq,
  653 + unsigned int flags, gfp_t gfp_mask)
628 654 {
629 655 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
630 656  
... ... @@ -635,10 +661,15 @@
635 661  
636 662 rq->cmd_flags = flags | REQ_ALLOCED;
637 663  
638   - if ((flags & REQ_ELVPRIV) &&
639   - unlikely(elv_set_request(q, rq, gfp_mask))) {
640   - mempool_free(rq, q->rq.rq_pool);
641   - return NULL;
  664 + if (flags & REQ_ELVPRIV) {
  665 + rq->elv.icq = icq;
  666 + if (unlikely(elv_set_request(q, rq, gfp_mask))) {
  667 + mempool_free(rq, q->rq.rq_pool);
  668 + return NULL;
  669 + }
  670 + /* @rq->elv.icq holds on to io_context until @rq is freed */
  671 + if (icq)
  672 + get_io_context(icq->ioc);
642 673 }
643 674  
644 675 return rq;
... ... @@ -750,11 +781,17 @@
750 781 {
751 782 struct request *rq = NULL;
752 783 struct request_list *rl = &q->rq;
753   - struct io_context *ioc = NULL;
  784 + struct elevator_type *et;
  785 + struct io_context *ioc;
  786 + struct io_cq *icq = NULL;
754 787 const bool is_sync = rw_is_sync(rw_flags) != 0;
  788 + bool retried = false;
755 789 int may_queue;
  790 +retry:
  791 + et = q->elevator->type;
  792 + ioc = current->io_context;
756 793  
757   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  794 + if (unlikely(blk_queue_dead(q)))
758 795 return NULL;
759 796  
760 797 may_queue = elv_may_queue(q, rw_flags);
... ... @@ -763,8 +800,21 @@
763 800  
764 801 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
765 802 if (rl->count[is_sync]+1 >= q->nr_requests) {
766   - ioc = current_io_context(GFP_ATOMIC, q->node);
767 803 /*
  804 + * We want ioc to record batching state. If it's
  805 + * not already there, creating a new one requires
  806 + * dropping queue_lock, which in turn requires
  807 + * retesting conditions to avoid queue hang.
  808 + */
  809 + if (!ioc && !retried) {
  810 + spin_unlock_irq(q->queue_lock);
  811 + create_io_context(current, gfp_mask, q->node);
  812 + spin_lock_irq(q->queue_lock);
  813 + retried = true;
  814 + goto retry;
  815 + }
  816 +
  817 + /*
768 818 * The queue will fill after this allocation, so set
769 819 * it as full, and mark this process as "batching".
770 820 * This process will be allowed to complete a batch of
... ... @@ -799,17 +849,36 @@
799 849 rl->count[is_sync]++;
800 850 rl->starved[is_sync] = 0;
801 851  
  852 + /*
  853 + * Decide whether the new request will be managed by elevator. If
  854 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
  855 + * prevent the current elevator from being destroyed until the new
  856 + * request is freed. This guarantees icq's won't be destroyed and
  857 + * makes creating new ones safe.
  858 + *
  859 + * Also, lookup icq while holding queue_lock. If it doesn't exist,
  860 + * it will be created after releasing queue_lock.
  861 + */
802 862 if (blk_rq_should_init_elevator(bio) &&
803 863 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
804 864 rw_flags |= REQ_ELVPRIV;
805 865 rl->elvpriv++;
  866 + if (et->icq_cache && ioc)
  867 + icq = ioc_lookup_icq(ioc, q);
806 868 }
807 869  
808 870 if (blk_queue_io_stat(q))
809 871 rw_flags |= REQ_IO_STAT;
810 872 spin_unlock_irq(q->queue_lock);
811 873  
812   - rq = blk_alloc_request(q, rw_flags, gfp_mask);
  874 + /* create icq if missing */
  875 + if (unlikely(et->icq_cache && !icq))
  876 + icq = ioc_create_icq(q, gfp_mask);
  877 +
  878 + /* rqs are guaranteed to have icq on elv_set_request() if requested */
  879 + if (likely(!et->icq_cache || icq))
  880 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
  881 +
813 882 if (unlikely(!rq)) {
814 883 /*
815 884 * Allocation failed presumably due to memory. Undo anything
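
Pulling the icq handling from the get_request() hunks above into one place, here is a condensed sketch of the fast and slow paths for an elevator that declares an icq_cache (cfq in this series); my_get_icq() is an illustrative helper, not a function from the patch:

	/* Find or create the io_cq linking %current->io_context and @q. */
	static struct io_cq *my_get_icq(struct request_queue *q, gfp_t gfp_mask)
	{
		struct elevator_type *et = q->elevator->type;
		struct io_context *ioc = current->io_context;
		struct io_cq *icq = NULL;

		/* fast path: look up an existing icq while holding queue_lock */
		spin_lock_irq(q->queue_lock);
		if (et->icq_cache && ioc)
			icq = ioc_lookup_icq(ioc, q);
		spin_unlock_irq(q->queue_lock);

		/* slow path: allocate and link a new icq without queue_lock */
		if (et->icq_cache && !icq)
			icq = ioc_create_icq(q, gfp_mask);

		return icq;	/* NULL means the caller must bail out */
	}
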
... ... @@ -871,10 +940,9 @@
871 940 rq = get_request(q, rw_flags, bio, GFP_NOIO);
872 941 while (!rq) {
873 942 DEFINE_WAIT(wait);
874   - struct io_context *ioc;
875 943 struct request_list *rl = &q->rq;
876 944  
877   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)))
  945 + if (unlikely(blk_queue_dead(q)))
878 946 return NULL;
879 947  
880 948 prepare_to_wait_exclusive(&rl->wait[is_sync], &wait,
... ... @@ -891,8 +959,8 @@
891 959 * up to a big batch of them for a small period time.
892 960 * See ioc_batching, ioc_set_batching
893 961 */
894   - ioc = current_io_context(GFP_NOIO, q->node);
895   - ioc_set_batching(q, ioc);
  962 + create_io_context(current, GFP_NOIO, q->node);
  963 + ioc_set_batching(q, current->io_context);
896 964  
897 965 spin_lock_irq(q->queue_lock);
898 966 finish_wait(&rl->wait[is_sync], &wait);
... ... @@ -1009,54 +1077,6 @@
1009 1077 __elv_add_request(q, rq, where);
1010 1078 }
1011 1079  
1012   -/**
1013   - * blk_insert_request - insert a special request into a request queue
1014   - * @q: request queue where request should be inserted
1015   - * @rq: request to be inserted
1016   - * @at_head: insert request at head or tail of queue
1017   - * @data: private data
1018   - *
1019   - * Description:
1020   - * Many block devices need to execute commands asynchronously, so they don't
1021   - * block the whole kernel from preemption during request execution. This is
1022   - * accomplished normally by inserting aritficial requests tagged as
1023   - * REQ_TYPE_SPECIAL in to the corresponding request queue, and letting them
1024   - * be scheduled for actual execution by the request queue.
1025   - *
1026   - * We have the option of inserting the head or the tail of the queue.
1027   - * Typically we use the tail for new ioctls and so forth. We use the head
1028   - * of the queue for things like a QUEUE_FULL message from a device, or a
1029   - * host that is unable to accept a particular command.
1030   - */
1031   -void blk_insert_request(struct request_queue *q, struct request *rq,
1032   - int at_head, void *data)
1033   -{
1034   - int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
1035   - unsigned long flags;
1036   -
1037   - /*
1038   - * tell I/O scheduler that this isn't a regular read/write (ie it
1039   - * must not attempt merges on this) and that it acts as a soft
1040   - * barrier
1041   - */
1042   - rq->cmd_type = REQ_TYPE_SPECIAL;
1043   -
1044   - rq->special = data;
1045   -
1046   - spin_lock_irqsave(q->queue_lock, flags);
1047   -
1048   - /*
1049   - * If command is tagged, release the tag
1050   - */
1051   - if (blk_rq_tagged(rq))
1052   - blk_queue_end_tag(q, rq);
1053   -
1054   - add_acct_request(q, rq, where);
1055   - __blk_run_queue(q);
1056   - spin_unlock_irqrestore(q->queue_lock, flags);
1057   -}
1058   -EXPORT_SYMBOL(blk_insert_request);
1059   -
1060 1080 static void part_round_stats_single(int cpu, struct hd_struct *part,
1061 1081 unsigned long now)
1062 1082 {
... ... @@ -1766,6 +1786,10 @@
1766 1786 return -EIO;
1767 1787  
1768 1788 spin_lock_irqsave(q->queue_lock, flags);
  1789 + if (unlikely(blk_queue_dead(q))) {
  1790 + spin_unlock_irqrestore(q->queue_lock, flags);
  1791 + return -ENODEV;
  1792 + }
1769 1793  
1770 1794 /*
1771 1795 * Submitting request must be dequeued before calling this function
... ... @@ -2740,6 +2764,14 @@
2740 2764 trace_block_unplug(q, depth, !from_schedule);
2741 2765  
2742 2766 /*
  2767 + * Don't mess with dead queue.
  2768 + */
  2769 + if (unlikely(blk_queue_dead(q))) {
  2770 + spin_unlock(q->queue_lock);
  2771 + return;
  2772 + }
  2773 +
  2774 + /*
2743 2775 * If we are punting this to kblockd, then we can safely drop
2744 2776 * the queue_lock before waking kblockd (which needs to take
2745 2777 * this lock).
... ... @@ -2815,6 +2847,15 @@
2815 2847 depth = 0;
2816 2848 spin_lock(q->queue_lock);
2817 2849 }
  2850 +
  2851 + /*
  2852 + * Short-circuit if @q is dead
  2853 + */
  2854 + if (unlikely(blk_queue_dead(q))) {
  2855 + __blk_end_request_all(rq, -ENODEV);
  2856 + continue;
  2857 + }
  2858 +
2818 2859 /*
2819 2860 * rq is already accounted, so use raw insert
2820 2861 */
block/blk-exec.c
... ... @@ -50,7 +50,11 @@
50 50 {
51 51 int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK;
52 52  
53   - if (unlikely(test_bit(QUEUE_FLAG_DEAD, &q->queue_flags))) {
  53 + WARN_ON(irqs_disabled());
  54 + spin_lock_irq(q->queue_lock);
  55 +
  56 + if (unlikely(blk_queue_dead(q))) {
  57 + spin_unlock_irq(q->queue_lock);
54 58 rq->errors = -ENXIO;
55 59 if (rq->end_io)
56 60 rq->end_io(rq, rq->errors);
... ... @@ -59,8 +63,6 @@
59 63  
60 64 rq->rq_disk = bd_disk;
61 65 rq->end_io = done;
62   - WARN_ON(irqs_disabled());
63   - spin_lock_irq(q->queue_lock);
64 66 __elv_add_request(q, rq, where);
65 67 __blk_run_queue(q);
66 68 /* the queue is stopped so it won't be run */
block/blk-ioc.c
... ... @@ -16,53 +16,214 @@
16 16 */
17 17 static struct kmem_cache *iocontext_cachep;
18 18  
19   -static void cfq_dtor(struct io_context *ioc)
  19 +/**
  20 + * get_io_context - increment reference count to io_context
  21 + * @ioc: io_context to get
  22 + *
  23 + * Increment reference count to @ioc.
  24 + */
  25 +void get_io_context(struct io_context *ioc)
20 26 {
21   - if (!hlist_empty(&ioc->cic_list)) {
22   - struct cfq_io_context *cic;
  27 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  28 + atomic_long_inc(&ioc->refcount);
  29 +}
  30 +EXPORT_SYMBOL(get_io_context);
23 31  
24   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
25   - cic_list);
26   - cic->dtor(ioc);
  32 +/*
  33 + * Releasing ioc may nest into another put_io_context() leading to nested
  34 + * fast path release. As the ioc's can't be the same, this is okay but
  35 + * makes lockdep whine. Keep track of nesting and use it as subclass.
  36 + */
  37 +#ifdef CONFIG_LOCKDEP
  38 +#define ioc_release_depth(q) ((q) ? (q)->ioc_release_depth : 0)
  39 +#define ioc_release_depth_inc(q) (q)->ioc_release_depth++
  40 +#define ioc_release_depth_dec(q) (q)->ioc_release_depth--
  41 +#else
  42 +#define ioc_release_depth(q) 0
  43 +#define ioc_release_depth_inc(q) do { } while (0)
  44 +#define ioc_release_depth_dec(q) do { } while (0)
  45 +#endif
  46 +
  47 +static void icq_free_icq_rcu(struct rcu_head *head)
  48 +{
  49 + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head);
  50 +
  51 + kmem_cache_free(icq->__rcu_icq_cache, icq);
  52 +}
  53 +
  54 +/*
  55 + * Exit and free an icq. Called with both ioc and q locked.
  56 + */
  57 +static void ioc_exit_icq(struct io_cq *icq)
  58 +{
  59 + struct io_context *ioc = icq->ioc;
  60 + struct request_queue *q = icq->q;
  61 + struct elevator_type *et = q->elevator->type;
  62 +
  63 + lockdep_assert_held(&ioc->lock);
  64 + lockdep_assert_held(q->queue_lock);
  65 +
  66 + radix_tree_delete(&ioc->icq_tree, icq->q->id);
  67 + hlist_del_init(&icq->ioc_node);
  68 + list_del_init(&icq->q_node);
  69 +
  70 + /*
  71 + * Both setting lookup hint to and clearing it from @icq are done
  72 + * under queue_lock. If it's not pointing to @icq now, it never
  73 + * will. Hint assignment itself can race safely.
  74 + */
  75 + if (rcu_dereference_raw(ioc->icq_hint) == icq)
  76 + rcu_assign_pointer(ioc->icq_hint, NULL);
  77 +
  78 + if (et->ops.elevator_exit_icq_fn) {
  79 + ioc_release_depth_inc(q);
  80 + et->ops.elevator_exit_icq_fn(icq);
  81 + ioc_release_depth_dec(q);
27 82 }
  83 +
  84 + /*
  85 + * @icq->q might have gone away by the time RCU callback runs
  86 + * making it impossible to determine icq_cache. Record it in @icq.
  87 + */
  88 + icq->__rcu_icq_cache = et->icq_cache;
  89 + call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
28 90 }
29 91  
30 92 /*
31   - * IO Context helper functions. put_io_context() returns 1 if there are no
32   - * more users of this io context, 0 otherwise.
  93 + * Slow path for ioc release in put_io_context(). Performs double-lock
  94 + * dancing to unlink all icq's and then frees ioc.
33 95 */
34   -int put_io_context(struct io_context *ioc)
  96 +static void ioc_release_fn(struct work_struct *work)
35 97 {
36   - if (ioc == NULL)
37   - return 1;
  98 + struct io_context *ioc = container_of(work, struct io_context,
  99 + release_work);
  100 + struct request_queue *last_q = NULL;
38 101  
39   - BUG_ON(atomic_long_read(&ioc->refcount) == 0);
  102 + spin_lock_irq(&ioc->lock);
40 103  
41   - if (atomic_long_dec_and_test(&ioc->refcount)) {
42   - rcu_read_lock();
43   - cfq_dtor(ioc);
44   - rcu_read_unlock();
  104 + while (!hlist_empty(&ioc->icq_list)) {
  105 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  106 + struct io_cq, ioc_node);
  107 + struct request_queue *this_q = icq->q;
45 108  
46   - kmem_cache_free(iocontext_cachep, ioc);
47   - return 1;
  109 + if (this_q != last_q) {
  110 + /*
  111 + * Need to switch to @this_q. Once we release
  112 + * @ioc->lock, it can go away along with @cic.
  113 + * Hold on to it.
  114 + */
  115 + __blk_get_queue(this_q);
  116 +
  117 + /*
  118 + * blk_put_queue() might sleep thanks to kobject
  119 + * idiocy. Always release both locks, put and
  120 + * restart.
  121 + */
  122 + if (last_q) {
  123 + spin_unlock(last_q->queue_lock);
  124 + spin_unlock_irq(&ioc->lock);
  125 + blk_put_queue(last_q);
  126 + } else {
  127 + spin_unlock_irq(&ioc->lock);
  128 + }
  129 +
  130 + last_q = this_q;
  131 + spin_lock_irq(this_q->queue_lock);
  132 + spin_lock(&ioc->lock);
  133 + continue;
  134 + }
  135 + ioc_exit_icq(icq);
48 136 }
49   - return 0;
  137 +
  138 + if (last_q) {
  139 + spin_unlock(last_q->queue_lock);
  140 + spin_unlock_irq(&ioc->lock);
  141 + blk_put_queue(last_q);
  142 + } else {
  143 + spin_unlock_irq(&ioc->lock);
  144 + }
  145 +
  146 + kmem_cache_free(iocontext_cachep, ioc);
50 147 }
51   -EXPORT_SYMBOL(put_io_context);
52 148  
53   -static void cfq_exit(struct io_context *ioc)
  149 +/**
  150 + * put_io_context - put a reference of io_context
  151 + * @ioc: io_context to put
  152 + * @locked_q: request_queue the caller is holding queue_lock of (hint)
  153 + *
  154 + * Decrement reference count of @ioc and release it if the count reaches
  155 + * zero. If the caller is holding queue_lock of a queue, it can indicate
  156 + * that with @locked_q. This is an optimization hint and the caller is
  157 + * allowed to pass in %NULL even when it's holding a queue_lock.
  158 + */
  159 +void put_io_context(struct io_context *ioc, struct request_queue *locked_q)
54 160 {
55   - rcu_read_lock();
  161 + struct request_queue *last_q = locked_q;
  162 + unsigned long flags;
56 163  
57   - if (!hlist_empty(&ioc->cic_list)) {
58   - struct cfq_io_context *cic;
  164 + if (ioc == NULL)
  165 + return;
59 166  
60   - cic = hlist_entry(ioc->cic_list.first, struct cfq_io_context,
61   - cic_list);
62   - cic->exit(ioc);
  167 + BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
  168 + if (locked_q)
  169 + lockdep_assert_held(locked_q->queue_lock);
  170 +
  171 + if (!atomic_long_dec_and_test(&ioc->refcount))
  172 + return;
  173 +
  174 + /*
  175 + * Destroy @ioc. This is a bit messy because icq's are chained
  176 + * from both ioc and queue, and ioc->lock nests inside queue_lock.
  177 + * The inner ioc->lock should be held to walk our icq_list and then
  178 + * for each icq the outer matching queue_lock should be grabbed.
  179 + * ie. We need to do reverse-order double lock dancing.
  180 + *
  181 + * Another twist is that we are often called with one of the
  182 + * matching queue_locks held as indicated by @locked_q, which
  183 + * prevents performing double-lock dance for other queues.
  184 + *
  185 + * So, we do it in two stages. The fast path uses the queue_lock
  186 + * the caller is holding and, if other queues need to be accessed,
  187 + * uses trylock to avoid introducing locking dependency. This can
  188 + * handle most cases, especially if @ioc was performing IO on only
  189 + * single device.
  190 + *
  191 + * If trylock doesn't cut it, we defer to @ioc->release_work which
  192 + * can do all the double-locking dancing.
  193 + */
  194 + spin_lock_irqsave_nested(&ioc->lock, flags,
  195 + ioc_release_depth(locked_q));
  196 +
  197 + while (!hlist_empty(&ioc->icq_list)) {
  198 + struct io_cq *icq = hlist_entry(ioc->icq_list.first,
  199 + struct io_cq, ioc_node);
  200 + struct request_queue *this_q = icq->q;
  201 +
  202 + if (this_q != last_q) {
  203 + if (last_q && last_q != locked_q)
  204 + spin_unlock(last_q->queue_lock);
  205 + last_q = NULL;
  206 +
  207 + if (!spin_trylock(this_q->queue_lock))
  208 + break;
  209 + last_q = this_q;
  210 + continue;
  211 + }
  212 + ioc_exit_icq(icq);
63 213 }
64   - rcu_read_unlock();
  214 +
  215 + if (last_q && last_q != locked_q)
  216 + spin_unlock(last_q->queue_lock);
  217 +
  218 + spin_unlock_irqrestore(&ioc->lock, flags);
  219 +
  220 + /* if no icq is left, we're done; otherwise, kick release_work */
  221 + if (hlist_empty(&ioc->icq_list))
  222 + kmem_cache_free(iocontext_cachep, ioc);
  223 + else
  224 + schedule_work(&ioc->release_work);
65 225 }
  226 +EXPORT_SYMBOL(put_io_context);
66 227  
67 228 /* Called by the exiting task */
68 229 void exit_io_context(struct task_struct *task)
... ... @@ -74,86 +235,240 @@
74 235 task->io_context = NULL;
75 236 task_unlock(task);
76 237  
77   - if (atomic_dec_and_test(&ioc->nr_tasks))
78   - cfq_exit(ioc);
  238 + atomic_dec(&ioc->nr_tasks);
  239 + put_io_context(ioc, NULL);
  240 +}
79 241  
80   - put_io_context(ioc);
  242 +/**
  243 + * ioc_clear_queue - break any ioc association with the specified queue
  244 + * @q: request_queue being cleared
  245 + *
  246 + * Walk @q->icq_list and exit all io_cq's. Must be called with @q locked.
  247 + */
  248 +void ioc_clear_queue(struct request_queue *q)
  249 +{
  250 + lockdep_assert_held(q->queue_lock);
  251 +
  252 + while (!list_empty(&q->icq_list)) {
  253 + struct io_cq *icq = list_entry(q->icq_list.next,
  254 + struct io_cq, q_node);
  255 + struct io_context *ioc = icq->ioc;
  256 +
  257 + spin_lock(&ioc->lock);
  258 + ioc_exit_icq(icq);
  259 + spin_unlock(&ioc->lock);
  260 + }
81 261 }
82 262  
83   -struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
  263 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
  264 + int node)
84 265 {
85 266 struct io_context *ioc;
86 267  
87   - ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags, node);
88   - if (ioc) {
89   - atomic_long_set(&ioc->refcount, 1);
90   - atomic_set(&ioc->nr_tasks, 1);
91   - spin_lock_init(&ioc->lock);
92   - ioc->ioprio_changed = 0;
93   - ioc->ioprio = 0;
94   - ioc->last_waited = 0; /* doesn't matter... */
95   - ioc->nr_batch_requests = 0; /* because this is 0 */
96   - INIT_RADIX_TREE(&ioc->radix_root, GFP_ATOMIC | __GFP_HIGH);
97   - INIT_HLIST_HEAD(&ioc->cic_list);
98   - ioc->ioc_data = NULL;
99   -#if defined(CONFIG_BLK_CGROUP) || defined(CONFIG_BLK_CGROUP_MODULE)
100   - ioc->cgroup_changed = 0;
101   -#endif
102   - }
  268 + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
  269 + node);
  270 + if (unlikely(!ioc))
  271 + return;
103 272  
104   - return ioc;
  273 + /* initialize */
  274 + atomic_long_set(&ioc->refcount, 1);
  275 + atomic_set(&ioc->nr_tasks, 1);
  276 + spin_lock_init(&ioc->lock);
  277 + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
  278 + INIT_HLIST_HEAD(&ioc->icq_list);
  279 + INIT_WORK(&ioc->release_work, ioc_release_fn);
  280 +
  281 + /*
  282 + * Try to install. ioc shouldn't be installed if someone else
  283 + * already did or @task, which isn't %current, is exiting. Note
  284 + * that we need to allow ioc creation on exiting %current as exit
  285 + * path may issue IOs from e.g. exit_files(). The exit path is
  286 + * responsible for not issuing IO after exit_io_context().
  287 + */
  288 + task_lock(task);
  289 + if (!task->io_context &&
  290 + (task == current || !(task->flags & PF_EXITING)))
  291 + task->io_context = ioc;
  292 + else
  293 + kmem_cache_free(iocontext_cachep, ioc);
  294 + task_unlock(task);
105 295 }
106 296  
107   -/*
108   - * If the current task has no IO context then create one and initialise it.
109   - * Otherwise, return its existing IO context.
  297 +/**
  298 + * get_task_io_context - get io_context of a task
  299 + * @task: task of interest
  300 + * @gfp_flags: allocation flags, used if allocation is necessary
  301 + * @node: allocation node, used if allocation is necessary
110 302 *
111   - * This returned IO context doesn't have a specifically elevated refcount,
112   - * but since the current task itself holds a reference, the context can be
113   - * used in general code, so long as it stays within `current` context.
  303 + * Return io_context of @task. If it doesn't exist, it is created with
  304 + * @gfp_flags and @node. The returned io_context has its reference count
  305 + * incremented.
  306 + *
  307 + * This function always goes through task_lock() and it's better to use
  308 + * %current->io_context + get_io_context() for %current.
114 309 */
115   -struct io_context *current_io_context(gfp_t gfp_flags, int node)
  310 +struct io_context *get_task_io_context(struct task_struct *task,
  311 + gfp_t gfp_flags, int node)
116 312 {
117   - struct task_struct *tsk = current;
118   - struct io_context *ret;
  313 + struct io_context *ioc;
119 314  
120   - ret = tsk->io_context;
121   - if (likely(ret))
122   - return ret;
  315 + might_sleep_if(gfp_flags & __GFP_WAIT);
123 316  
124   - ret = alloc_io_context(gfp_flags, node);
125   - if (ret) {
126   - /* make sure set_task_ioprio() sees the settings above */
127   - smp_wmb();
128   - tsk->io_context = ret;
129   - }
  317 + do {
  318 + task_lock(task);
  319 + ioc = task->io_context;
  320 + if (likely(ioc)) {
  321 + get_io_context(ioc);
  322 + task_unlock(task);
  323 + return ioc;
  324 + }
  325 + task_unlock(task);
  326 + } while (create_io_context(task, gfp_flags, node));
130 327  
131   - return ret;
  328 + return NULL;
132 329 }
  330 +EXPORT_SYMBOL(get_task_io_context);
133 331  
134   -/*
135   - * If the current task has no IO context then create one and initialise it.
136   - * If it does have a context, take a ref on it.
  332 +/**
  333 + * ioc_lookup_icq - lookup io_cq from ioc
  334 + * @ioc: the associated io_context
  335 + * @q: the associated request_queue
137 336 *
138   - * This is always called in the context of the task which submitted the I/O.
  337 + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
  338 + * with @q->queue_lock held.
139 339 */
140   -struct io_context *get_io_context(gfp_t gfp_flags, int node)
  340 +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
141 341 {
142   - struct io_context *ioc = NULL;
  342 + struct io_cq *icq;
143 343  
  344 + lockdep_assert_held(q->queue_lock);
  345 +
144 346 /*
145   - * Check for unlikely race with exiting task. ioc ref count is
146   - * zero when ioc is being detached.
  347 + * icq's are indexed from @ioc using radix tree and hint pointer,
  348 + * both of which are protected with RCU. All removals are done
  349 + * holding both q and ioc locks, and we're holding q lock - if we
  350 + * find a icq which points to us, it's guaranteed to be valid.
147 351 */
148   - do {
149   - ioc = current_io_context(gfp_flags, node);
150   - if (unlikely(!ioc))
151   - break;
152   - } while (!atomic_long_inc_not_zero(&ioc->refcount));
  352 + rcu_read_lock();
  353 + icq = rcu_dereference(ioc->icq_hint);
  354 + if (icq && icq->q == q)
  355 + goto out;
153 356  
154   - return ioc;
  357 + icq = radix_tree_lookup(&ioc->icq_tree, q->id);
  358 + if (icq && icq->q == q)
  359 + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */
  360 + else
  361 + icq = NULL;
  362 +out:
  363 + rcu_read_unlock();
  364 + return icq;
155 365 }
156   -EXPORT_SYMBOL(get_io_context);
  366 +EXPORT_SYMBOL(ioc_lookup_icq);
  367 +
  368 +/**
  369 + * ioc_create_icq - create and link io_cq
  370 + * @q: request_queue of interest
  371 + * @gfp_mask: allocation mask
  372 + *
  373 + * Make sure io_cq linking %current->io_context and @q exists. If either
  374 + * io_context and/or icq don't exist, they will be created using @gfp_mask.
  375 + *
  376 + * The caller is responsible for ensuring @ioc won't go away and @q is
  377 + * alive and will stay alive until this function returns.
  378 + */
  379 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
  380 +{
  381 + struct elevator_type *et = q->elevator->type;
  382 + struct io_context *ioc;
  383 + struct io_cq *icq;
  384 +
  385 + /* allocate stuff */
  386 + ioc = create_io_context(current, gfp_mask, q->node);
  387 + if (!ioc)
  388 + return NULL;
  389 +
  390 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
  391 + q->node);
  392 + if (!icq)
  393 + return NULL;
  394 +
  395 + if (radix_tree_preload(gfp_mask) < 0) {
  396 + kmem_cache_free(et->icq_cache, icq);
  397 + return NULL;
  398 + }
  399 +
  400 + icq->ioc = ioc;
  401 + icq->q = q;
  402 + INIT_LIST_HEAD(&icq->q_node);
  403 + INIT_HLIST_NODE(&icq->ioc_node);
  404 +
  405 + /* lock both q and ioc and try to link @icq */
  406 + spin_lock_irq(q->queue_lock);
  407 + spin_lock(&ioc->lock);
  408 +
  409 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
  410 + hlist_add_head(&icq->ioc_node, &ioc->icq_list);
  411 + list_add(&icq->q_node, &q->icq_list);
  412 + if (et->ops.elevator_init_icq_fn)
  413 + et->ops.elevator_init_icq_fn(icq);
  414 + } else {
  415 + kmem_cache_free(et->icq_cache, icq);
  416 + icq = ioc_lookup_icq(ioc, q);
  417 + if (!icq)
  418 + printk(KERN_ERR "cfq: icq link failed!\n");
  419 + }
  420 +
  421 + spin_unlock(&ioc->lock);
  422 + spin_unlock_irq(q->queue_lock);
  423 + radix_tree_preload_end();
  424 + return icq;
  425 +}
  426 +
  427 +void ioc_set_changed(struct io_context *ioc, int which)
  428 +{
  429 + struct io_cq *icq;
  430 + struct hlist_node *n;
  431 +
  432 + hlist_for_each_entry(icq, n, &ioc->icq_list, ioc_node)
  433 + set_bit(which, &icq->changed);
  434 +}
  435 +
  436 +/**
  437 + * ioc_ioprio_changed - notify ioprio change
  438 + * @ioc: io_context of interest
  439 + * @ioprio: new ioprio
  440 + *
  441 + * @ioc's ioprio has changed to @ioprio. Set %ICQ_IOPRIO_CHANGED for all
  442 + * icq's. iosched is responsible for checking the bit and applying it on
  443 + * request issue path.
  444 + */
  445 +void ioc_ioprio_changed(struct io_context *ioc, int ioprio)
  446 +{
  447 + unsigned long flags;
  448 +
  449 + spin_lock_irqsave(&ioc->lock, flags);
  450 + ioc->ioprio = ioprio;
  451 + ioc_set_changed(ioc, ICQ_IOPRIO_CHANGED);
  452 + spin_unlock_irqrestore(&ioc->lock, flags);
  453 +}
  454 +
  455 +/**
  456 + * ioc_cgroup_changed - notify cgroup change
  457 + * @ioc: io_context of interest
  458 + *
  459 + * @ioc's cgroup has changed. Set %ICQ_CGROUP_CHANGED for all icq's.
  460 + * iosched is responsible for checking the bit and applying it on request
  461 + * issue path.
  462 + */
  463 +void ioc_cgroup_changed(struct io_context *ioc)
  464 +{
  465 + unsigned long flags;
  466 +
  467 + spin_lock_irqsave(&ioc->lock, flags);
  468 + ioc_set_changed(ioc, ICQ_CGROUP_CHANGED);
  469 + spin_unlock_irqrestore(&ioc->lock, flags);
  470 +}
  471 +EXPORT_SYMBOL(ioc_cgroup_changed);
157 472  
158 473 static int __init blk_ioc_init(void)
159 474 {
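
With the old cic dtor/exit hooks gone, external users go through get_task_io_context()/put_io_context() plus the *_changed() notifiers, as in the blk-cgroup.c hunk at the top. A sketch of an ioprio-side caller in the same style; my_notify_ioprio() is an illustrative wrapper, and only the get_task_io_context()/ioc_ioprio_changed()/put_io_context() calls come from this diff:

	/* Illustrative: flag @task's icq's that its ioprio became @ioprio. */
	static void my_notify_ioprio(struct task_struct *task, int ioprio)
	{
		struct io_context *ioc;

		/* GFP_ATOMIC: allocation may fail, in which case there is nothing to flag */
		ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
		if (ioc) {
			ioc_ioprio_changed(ioc, ioprio);
			put_io_context(ioc, NULL);	/* no queue_lock held here */
		}
	}
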
block/blk-settings.c
... ... @@ -104,9 +104,7 @@
104 104 * @lim: the queue_limits structure to reset
105 105 *
106 106 * Description:
107   - * Returns a queue_limit struct to its default state. Can be used by
108   - * stacking drivers like DM that stage table swaps and reuse an
109   - * existing device queue.
  107 + * Returns a queue_limit struct to its default state.
110 108 */
111 109 void blk_set_default_limits(struct queue_limits *lim)
112 110 {
... ... @@ -114,13 +112,12 @@
114 112 lim->max_integrity_segments = 0;
115 113 lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
116 114 lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
117   - lim->max_sectors = BLK_DEF_MAX_SECTORS;
118   - lim->max_hw_sectors = INT_MAX;
  115 + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
119 116 lim->max_discard_sectors = 0;
120 117 lim->discard_granularity = 0;
121 118 lim->discard_alignment = 0;
122 119 lim->discard_misaligned = 0;
123   - lim->discard_zeroes_data = 1;
  120 + lim->discard_zeroes_data = 0;
124 121 lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
125 122 lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
126 123 lim->alignment_offset = 0;
... ... @@ -131,6 +128,27 @@
131 128 EXPORT_SYMBOL(blk_set_default_limits);
132 129  
133 130 /**
  131 + * blk_set_stacking_limits - set default limits for stacking devices
  132 + * @lim: the queue_limits structure to reset
  133 + *
  134 + * Description:
  135 + * Returns a queue_limit struct to its default state. Should be used
  136 + * by stacking drivers like DM that have no internal limits.
  137 + */
  138 +void blk_set_stacking_limits(struct queue_limits *lim)
  139 +{
  140 + blk_set_default_limits(lim);
  141 +
  142 + /* Inherit limits from component devices */
  143 + lim->discard_zeroes_data = 1;
  144 + lim->max_segments = USHRT_MAX;
  145 + lim->max_hw_sectors = UINT_MAX;
  146 +
  147 + lim->max_sectors = BLK_DEF_MAX_SECTORS;
  148 +}
  149 +EXPORT_SYMBOL(blk_set_stacking_limits);
  150 +
  151 +/**
134 152 * blk_queue_make_request - define an alternate make_request function for a device
135 153 * @q: the request queue for the device to be affected
136 154 * @mfn: the alternate make_request function
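
blk_set_stacking_limits() deliberately starts a stacking driver wide open so that stacking in component limits can only tighten them. A rough sketch of how such a driver might use it together with the existing bdev_stack_limits() helper (which is not part of this diff); my_stacked_setup() and the parts[] array are illustrative assumptions:

	/* Derive a stacked queue's limits from its component block devices. */
	static void my_stacked_setup(struct request_queue *q,
				     struct block_device **parts, int nr)
	{
		struct queue_limits lim;
		int i;

		blk_set_stacking_limits(&lim);		/* permissive starting point */

		for (i = 0; i < nr; i++)		/* shrink to the common subset */
			if (bdev_stack_limits(&lim, parts[i], 0))
				pr_warn("component %d is misaligned\n", i);

		q->limits = lim;			/* publish the combined limits */
	}
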
... ... @@ -165,8 +183,6 @@
165 183 q->nr_batching = BLK_BATCH_REQ;
166 184  
167 185 blk_set_default_limits(&q->limits);
168   - blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
169   - q->limits.discard_zeroes_data = 0;
170 186  
171 187 /*
172 188 * by default assume old behaviour and bounce for any highmem page
block/blk-sysfs.c
... ... @@ -425,7 +425,7 @@
425 425 if (!entry->show)
426 426 return -EIO;
427 427 mutex_lock(&q->sysfs_lock);
428   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  428 + if (blk_queue_dead(q)) {
429 429 mutex_unlock(&q->sysfs_lock);
430 430 return -ENOENT;
431 431 }
... ... @@ -447,7 +447,7 @@
447 447  
448 448 q = container_of(kobj, struct request_queue, kobj);
449 449 mutex_lock(&q->sysfs_lock);
450   - if (test_bit(QUEUE_FLAG_DEAD, &q->queue_flags)) {
  450 + if (blk_queue_dead(q)) {
451 451 mutex_unlock(&q->sysfs_lock);
452 452 return -ENOENT;
453 453 }
... ... @@ -479,8 +479,12 @@
479 479  
480 480 blk_sync_queue(q);
481 481  
482   - if (q->elevator)
  482 + if (q->elevator) {
  483 + spin_lock_irq(q->queue_lock);
  484 + ioc_clear_queue(q);