Commit f1f8cc94651738b418ba54c039df536303b91704

Authored by Tejun Heo
Committed by Jens Axboe
1 parent 9b84cacd01

block, cfq: move icq creation and rq->elv.icq association to block core

The block layer now knows everything necessary to create icq's and
associate them with requests.  Move ioc_create_icq() to blk-ioc.c and
update get_request() so that, if elevator_type->icq_size is set,
requests are automatically associated with their matching icq's before
elv_set_request().  The io_context reference is also managed by the
block core on request alloc/free.
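
In short, get_request() now looks up or creates the icq itself and
hands it to blk_alloc_request(), which sets rq->elv.icq before calling
elv_set_request() and takes the io_context reference.  A condensed
sketch of that path (accounting and error handling trimmed; the real
code is in the blk-core.c hunks below):

    spin_lock_irq(q->queue_lock);
    if (blk_rq_should_init_elevator(bio) &&
        !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
            rw_flags |= REQ_ELVPRIV;
            rl->elvpriv++;
            /* look up an existing icq while still holding queue_lock */
            if (et->icq_cache && ioc)
                    icq = ioc_lookup_icq(ioc, q);
    }
    spin_unlock_irq(q->queue_lock);

    /* create the icq outside queue_lock if it doesn't exist yet */
    if (unlikely(et->icq_cache && !icq))
            icq = ioc_create_icq(q, gfp_mask);

    /* rq->elv.icq is set before elv_set_request() in here and the
     * matching put_io_context() happens in blk_free_request() */
    if (likely(!et->icq_cache || icq))
            rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);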

* Only the ioprio/cgroup changed-notification handling remains from
  cfq_get_cic().  It is collapsed into cfq_set_request().

* This removes queue kicking on icq allocation failure (for now).  As
  icq allocation failure is rare and the only effect queue kicking
  achieved was possibly accelerating queue processing, this change
  shouldn't be noticeable.

  There is a larger underlying problem.  Unlike request allocation,
  icq allocation is not guaranteed to eventually succeed after
  retries.  The number of icq's is unbounded, so a mempool can't be
  the solution either.  This effectively adds an allocation dependency
  to the memory free path and thus the possibility of deadlock.

  This usually wouldn't happen because icq allocation is not a hot
  path and, even when the condition triggers, it's highly unlikely
  that none of the writeback workers already has an icq.

  However, this is still possible, especially if the elevator is being
  switched under high memory pressure, so we'd better get it fixed.
  Probably the only solution is to bypass the elevator and append to
  the dispatch queue on any elevator allocation failure (a rough
  sketch of that idea follows this list).

* Comment added to explain how icq's are managed and synchronized.
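
A rough, purely illustrative sketch of that bypass idea (not
implemented by this commit; unwinding the elvpriv accounting is
glossed over):

    /* hypothetical fallback: if icq allocation fails, give up on
     * elevator private data so the request bypasses the elevator and
     * goes straight to the dispatch queue instead of waiting on (and
     * possibly deadlocking with) memory reclaim */
    if (unlikely(et->icq_cache && !icq)) {
            icq = ioc_create_icq(q, gfp_mask);
            if (!icq) {
                    rw_flags &= ~REQ_ELVPRIV;
                    /* rl->elvpriv would also need to be decremented
                     * again under queue_lock */
            }
    }
    rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);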

This completes the cleanup of the io_context interface.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Showing 6 changed files with 173 additions and 136 deletions

block/blk-core.c
... ... @@ -640,13 +640,18 @@
640 640  
641 641 static inline void blk_free_request(struct request_queue *q, struct request *rq)
642 642 {
643   - if (rq->cmd_flags & REQ_ELVPRIV)
  643 + if (rq->cmd_flags & REQ_ELVPRIV) {
644 644 elv_put_request(q, rq);
  645 + if (rq->elv.icq)
  646 + put_io_context(rq->elv.icq->ioc, q);
  647 + }
  648 +
645 649 mempool_free(rq, q->rq.rq_pool);
646 650 }
647 651  
648 652 static struct request *
649   -blk_alloc_request(struct request_queue *q, unsigned int flags, gfp_t gfp_mask)
  653 +blk_alloc_request(struct request_queue *q, struct io_cq *icq,
  654 + unsigned int flags, gfp_t gfp_mask)
650 655 {
651 656 struct request *rq = mempool_alloc(q->rq.rq_pool, gfp_mask);
652 657  
... ... @@ -657,10 +662,15 @@
657 662  
658 663 rq->cmd_flags = flags | REQ_ALLOCED;
659 664  
660   - if ((flags & REQ_ELVPRIV) &&
661   - unlikely(elv_set_request(q, rq, gfp_mask))) {
662   - mempool_free(rq, q->rq.rq_pool);
663   - return NULL;
  665 + if (flags & REQ_ELVPRIV) {
  666 + rq->elv.icq = icq;
  667 + if (unlikely(elv_set_request(q, rq, gfp_mask))) {
  668 + mempool_free(rq, q->rq.rq_pool);
  669 + return NULL;
  670 + }
  671 + /* @rq->elv.icq holds on to io_context until @rq is freed */
  672 + if (icq)
  673 + get_io_context(icq->ioc);
664 674 }
665 675  
666 676 return rq;
667 677  
668 678  
... ... @@ -772,11 +782,14 @@
772 782 {
773 783 struct request *rq = NULL;
774 784 struct request_list *rl = &q->rq;
  785 + struct elevator_type *et;
775 786 struct io_context *ioc;
  787 + struct io_cq *icq = NULL;
776 788 const bool is_sync = rw_is_sync(rw_flags) != 0;
777 789 bool retried = false;
778 790 int may_queue;
779 791 retry:
  792 + et = q->elevator->type;
780 793 ioc = current->io_context;
781 794  
782 795 if (unlikely(blk_queue_dead(q)))
783 796  
784 797  
... ... @@ -837,17 +850,36 @@
837 850 rl->count[is_sync]++;
838 851 rl->starved[is_sync] = 0;
839 852  
  853 + /*
  854 + * Decide whether the new request will be managed by elevator. If
  855 + * so, mark @rw_flags and increment elvpriv. Non-zero elvpriv will
  856 + * prevent the current elevator from being destroyed until the new
  857 + * request is freed. This guarantees icq's won't be destroyed and
  858 + * makes creating new ones safe.
  859 + *
  860 + * Also, lookup icq while holding queue_lock. If it doesn't exist,
  861 + * it will be created after releasing queue_lock.
  862 + */
840 863 if (blk_rq_should_init_elevator(bio) &&
841 864 !test_bit(QUEUE_FLAG_ELVSWITCH, &q->queue_flags)) {
842 865 rw_flags |= REQ_ELVPRIV;
843 866 rl->elvpriv++;
  867 + if (et->icq_cache && ioc)
  868 + icq = ioc_lookup_icq(ioc, q);
844 869 }
845 870  
846 871 if (blk_queue_io_stat(q))
847 872 rw_flags |= REQ_IO_STAT;
848 873 spin_unlock_irq(q->queue_lock);
849 874  
850   - rq = blk_alloc_request(q, rw_flags, gfp_mask);
  875 + /* create icq if missing */
  876 + if (unlikely(et->icq_cache && !icq))
  877 + icq = ioc_create_icq(q, gfp_mask);
  878 +
  879 + /* rqs are guaranteed to have icq on elv_set_request() if requested */
  880 + if (likely(!et->icq_cache || icq))
  881 + rq = blk_alloc_request(q, icq, rw_flags, gfp_mask);
  882 +
851 883 if (unlikely(!rq)) {
852 884 /*
853 885 * Allocation failed presumably due to memory. Undo anything
block/blk-ioc.c
... ... @@ -289,7 +289,6 @@
289 289 kmem_cache_free(iocontext_cachep, ioc);
290 290 task_unlock(task);
291 291 }
292   -EXPORT_SYMBOL(create_io_context_slowpath);
293 292  
294 293 /**
295 294 * get_task_io_context - get io_context of a task
... ... @@ -361,6 +360,65 @@
361 360 return icq;
362 361 }
363 362 EXPORT_SYMBOL(ioc_lookup_icq);
  363 +
  364 +/**
  365 + * ioc_create_icq - create and link io_cq
  366 + * @q: request_queue of interest
  367 + * @gfp_mask: allocation mask
  368 + *
  369 + * Make sure io_cq linking %current->io_context and @q exists. If either
  370 + * io_context and/or icq don't exist, they will be created using @gfp_mask.
  371 + *
  372 + * The caller is responsible for ensuring @ioc won't go away and @q is
  373 + * alive and will stay alive until this function returns.
  374 + */
  375 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
  376 +{
  377 + struct elevator_type *et = q->elevator->type;
  378 + struct io_context *ioc;
  379 + struct io_cq *icq;
  380 +
  381 + /* allocate stuff */
  382 + ioc = create_io_context(current, gfp_mask, q->node);
  383 + if (!ioc)
  384 + return NULL;
  385 +
  386 + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
  387 + q->node);
  388 + if (!icq)
  389 + return NULL;
  390 +
  391 + if (radix_tree_preload(gfp_mask) < 0) {
  392 + kmem_cache_free(et->icq_cache, icq);
  393 + return NULL;
  394 + }
  395 +
  396 + icq->ioc = ioc;
  397 + icq->q = q;
  398 + INIT_LIST_HEAD(&icq->q_node);
  399 + INIT_HLIST_NODE(&icq->ioc_node);
  400 +
  401 + /* lock both q and ioc and try to link @icq */
  402 + spin_lock_irq(q->queue_lock);
  403 + spin_lock(&ioc->lock);
  404 +
  405 + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
  406 + hlist_add_head(&icq->ioc_node, &ioc->icq_list);
  407 + list_add(&icq->q_node, &q->icq_list);
  408 + if (et->ops.elevator_init_icq_fn)
  409 + et->ops.elevator_init_icq_fn(icq);
  410 + } else {
  411 + kmem_cache_free(et->icq_cache, icq);
  412 + icq = ioc_lookup_icq(ioc, q);
  413 + if (!icq)
  414 + printk(KERN_ERR "cfq: icq link failed!\n");
  415 + }
  416 +
  417 + spin_unlock(&ioc->lock);
  418 + spin_unlock_irq(q->queue_lock);
  419 + radix_tree_preload_end();
  420 + return icq;
  421 +}
364 422  
365 423 void ioc_set_changed(struct io_context *ioc, int which)
366 424 {
block/blk.h
... ... @@ -200,6 +200,7 @@
200 200 */
201 201 void get_io_context(struct io_context *ioc);
202 202 struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
  203 +struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask);
203 204 void ioc_clear_queue(struct request_queue *q);
204 205  
205 206 void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
block/cfq-iosched.c
... ... @@ -2935,117 +2935,6 @@
2935 2935 return cfqq;
2936 2936 }
2937 2937  
2938   -/**
2939   - * ioc_create_icq - create and link io_cq
2940   - * @q: request_queue of interest
2941   - * @gfp_mask: allocation mask
2942   - *
2943   - * Make sure io_cq linking %current->io_context and @q exists. If either
2944   - * io_context and/or icq don't exist, they will be created using @gfp_mask.
2945   - *
2946   - * The caller is responsible for ensuring @ioc won't go away and @q is
2947   - * alive and will stay alive until this function returns.
2948   - */
2949   -static struct io_cq *ioc_create_icq(struct request_queue *q, gfp_t gfp_mask)
2950   -{
2951   - struct elevator_type *et = q->elevator->type;
2952   - struct io_context *ioc;
2953   - struct io_cq *icq;
2954   -
2955   - /* allocate stuff */
2956   - ioc = create_io_context(current, gfp_mask, q->node);
2957   - if (!ioc)
2958   - return NULL;
2959   -
2960   - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
2961   - q->node);
2962   - if (!icq)
2963   - return NULL;
2964   -
2965   - if (radix_tree_preload(gfp_mask) < 0) {
2966   - kmem_cache_free(et->icq_cache, icq);
2967   - return NULL;
2968   - }
2969   -
2970   - icq->ioc = ioc;
2971   - icq->q = q;
2972   - INIT_LIST_HEAD(&icq->q_node);
2973   - INIT_HLIST_NODE(&icq->ioc_node);
2974   -
2975   - /* lock both q and ioc and try to link @icq */
2976   - spin_lock_irq(q->queue_lock);
2977   - spin_lock(&ioc->lock);
2978   -
2979   - if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) {
2980   - hlist_add_head(&icq->ioc_node, &ioc->icq_list);
2981   - list_add(&icq->q_node, &q->icq_list);
2982   - if (et->ops.elevator_init_icq_fn)
2983   - et->ops.elevator_init_icq_fn(icq);
2984   - } else {
2985   - kmem_cache_free(et->icq_cache, icq);
2986   - icq = ioc_lookup_icq(ioc, q);
2987   - if (!icq)
2988   - printk(KERN_ERR "cfq: icq link failed!\n");
2989   - }
2990   -
2991   - spin_unlock(&ioc->lock);
2992   - spin_unlock_irq(q->queue_lock);
2993   - radix_tree_preload_end();
2994   - return icq;
2995   -}
2996   -
2997   -/**
2998   - * cfq_get_cic - acquire cfq_io_cq and bump refcnt on io_context
2999   - * @cfqd: cfqd to setup cic for
3000   - * @gfp_mask: allocation mask
3001   - *
3002   - * Return cfq_io_cq associating @cfqd and %current->io_context and
3003   - * bump refcnt on io_context. If ioc or cic doesn't exist, they're created
3004   - * using @gfp_mask.
3005   - *
3006   - * Must be called under queue_lock which may be released and re-acquired.
3007   - * This function also may sleep depending on @gfp_mask.
3008   - */
3009   -static struct cfq_io_cq *cfq_get_cic(struct cfq_data *cfqd, gfp_t gfp_mask)
3010   -{
3011   - struct request_queue *q = cfqd->queue;
3012   - struct cfq_io_cq *cic = NULL;
3013   - struct io_context *ioc;
3014   -
3015   - lockdep_assert_held(q->queue_lock);
3016   -
3017   - while (true) {
3018   - /* fast path */
3019   - ioc = current->io_context;
3020   - if (likely(ioc)) {
3021   - cic = cfq_cic_lookup(cfqd, ioc);
3022   - if (likely(cic))
3023   - break;
3024   - }
3025   -
3026   - /* slow path - unlock, create missing ones and retry */
3027   - spin_unlock_irq(q->queue_lock);
3028   - cic = icq_to_cic(ioc_create_icq(q, gfp_mask));
3029   - spin_lock_irq(q->queue_lock);
3030   - if (!cic)
3031   - return NULL;
3032   - }
3033   -
3034   - /* bump @ioc's refcnt and handle changed notifications */
3035   - get_io_context(ioc);
3036   -
3037   - if (unlikely(cic->icq.changed)) {
3038   - if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
3039   - changed_ioprio(cic);
3040   -#ifdef CONFIG_CFQ_GROUP_IOSCHED
3041   - if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
3042   - changed_cgroup(cic);
3043   -#endif
3044   - }
3045   -
3046   - return cic;
3047   -}
3048   -
3049 2938 static void
3050 2939 __cfq_update_io_thinktime(struct cfq_ttime *ttime, unsigned long slice_idle)
3051 2940 {
... ... @@ -3524,8 +3413,6 @@
3524 3413 BUG_ON(!cfqq->allocated[rw]);
3525 3414 cfqq->allocated[rw]--;
3526 3415  
3527   - put_io_context(RQ_CIC(rq)->icq.ioc, cfqq->cfqd->queue);
3528   -
3529 3416 /* Put down rq reference on cfqg */
3530 3417 cfq_put_cfqg(RQ_CFQG(rq));
3531 3418 rq->elv.priv[0] = NULL;
... ... @@ -3574,7 +3461,7 @@
3574 3461 cfq_set_request(struct request_queue *q, struct request *rq, gfp_t gfp_mask)
3575 3462 {
3576 3463 struct cfq_data *cfqd = q->elevator->elevator_data;
3577   - struct cfq_io_cq *cic;
  3464 + struct cfq_io_cq *cic = icq_to_cic(rq->elv.icq);
3578 3465 const int rw = rq_data_dir(rq);
3579 3466 const bool is_sync = rq_is_sync(rq);
3580 3467 struct cfq_queue *cfqq;
3581 3468  
... ... @@ -3582,10 +3469,17 @@
3582 3469 might_sleep_if(gfp_mask & __GFP_WAIT);
3583 3470  
3584 3471 spin_lock_irq(q->queue_lock);
3585   - cic = cfq_get_cic(cfqd, gfp_mask);
3586   - if (!cic)
3587   - goto queue_fail;
3588 3472  
  3473 + /* handle changed notifications */
  3474 + if (unlikely(cic->icq.changed)) {
  3475 + if (test_and_clear_bit(ICQ_IOPRIO_CHANGED, &cic->icq.changed))
  3476 + changed_ioprio(cic);
  3477 +#ifdef CONFIG_CFQ_GROUP_IOSCHED
  3478 + if (test_and_clear_bit(ICQ_CGROUP_CHANGED, &cic->icq.changed))
  3479 + changed_cgroup(cic);
  3480 +#endif
  3481 + }
  3482 +
3589 3483 new_queue:
3590 3484 cfqq = cic_to_cfqq(cic, is_sync);
3591 3485 if (!cfqq || cfqq == &cfqd->oom_cfqq) {
3592 3486  
... ... @@ -3615,17 +3509,10 @@
3615 3509 cfqq->allocated[rw]++;
3616 3510  
3617 3511 cfqq->ref++;
3618   - rq->elv.icq = &cic->icq;
3619 3512 rq->elv.priv[0] = cfqq;
3620 3513 rq->elv.priv[1] = cfq_ref_get_cfqg(cfqq->cfqg);
3621 3514 spin_unlock_irq(q->queue_lock);
3622 3515 return 0;
3623   -
3624   -queue_fail:
3625   - cfq_schedule_dispatch(cfqd);
3626   - spin_unlock_irq(q->queue_lock);
3627   - cfq_log(cfqd, "set_request fail");
3628   - return 1;
3629 3516 }
3630 3517  
3631 3518 static void cfq_kick_queue(struct work_struct *work)
include/linux/elevator.h
... ... @@ -60,8 +60,8 @@
60 60 elevator_request_list_fn *elevator_former_req_fn;
61 61 elevator_request_list_fn *elevator_latter_req_fn;
62 62  
63   - elevator_init_icq_fn *elevator_init_icq_fn;
64   - elevator_exit_icq_fn *elevator_exit_icq_fn;
  63 + elevator_init_icq_fn *elevator_init_icq_fn; /* see iocontext.h */
  64 + elevator_exit_icq_fn *elevator_exit_icq_fn; /* ditto */
65 65  
66 66 elevator_set_req_fn *elevator_set_req_fn;
67 67 elevator_put_req_fn *elevator_put_req_fn;
... ... @@ -90,8 +90,8 @@
90 90  
91 91 /* fields provided by elevator implementation */
92 92 struct elevator_ops ops;
93   - size_t icq_size;
94   - size_t icq_align;
  93 + size_t icq_size; /* see iocontext.h */
  94 + size_t icq_align; /* ditto */
95 95 struct elv_fs_entry *elevator_attrs;
96 96 char elevator_name[ELV_NAME_MAX];
97 97 struct module *elevator_owner;
include/linux/iocontext.h
... ... @@ -10,6 +10,65 @@
10 10 ICQ_CGROUP_CHANGED,
11 11 };
12 12  
  13 +/*
  14 + * An io_cq (icq) is association between an io_context (ioc) and a
  15 + * request_queue (q). This is used by elevators which need to track
  16 + * information per ioc - q pair.
  17 + *
  18 + * Elevator can request use of icq by setting elevator_type->icq_size and
  19 + * ->icq_align. Both size and align must be larger than that of struct
  20 + * io_cq and elevator can use the tail area for private information. The
  21 + * recommended way to do this is defining a struct which contains io_cq as
  22 + * the first member followed by private members and using its size and
  23 + * align. For example,
  24 + *
  25 + * struct snail_io_cq {
  26 + * struct io_cq icq;
  27 + * int poke_snail;
  28 + * int feed_snail;
  29 + * };
  30 + *
  31 + * struct elevator_type snail_elv_type {
  32 + * .ops = { ... },
  33 + * .icq_size = sizeof(struct snail_io_cq),
  34 + * .icq_align = __alignof__(struct snail_io_cq),
  35 + * ...
  36 + * };
  37 + *
  38 + * If icq_size is set, block core will manage icq's. All requests will
  39 + * have its ->elv.icq field set before elevator_ops->elevator_set_req_fn()
  40 + * is called and be holding a reference to the associated io_context.
  41 + *
  42 + * Whenever a new icq is created, elevator_ops->elevator_init_icq_fn() is
  43 + * called and, on destruction, ->elevator_exit_icq_fn(). Both functions
  44 + * are called with both the associated io_context and queue locks held.
  45 + *
  46 + * Elevator is allowed to lookup icq using ioc_lookup_icq() while holding
  47 + * queue lock but the returned icq is valid only until the queue lock is
  48 + * released. Elevators can not and should not try to create or destroy
  49 + * icq's.
  50 + *
  51 + * As icq's are linked from both ioc and q, the locking rules are a bit
  52 + * complex.
  53 + *
  54 + * - ioc lock nests inside q lock.
  55 + *
  56 + * - ioc->icq_list and icq->ioc_node are protected by ioc lock.
  57 + * q->icq_list and icq->q_node by q lock.
  58 + *
  59 + * - ioc->icq_tree and ioc->icq_hint are protected by ioc lock, while icq
  60 + * itself is protected by q lock. However, both the indexes and icq
  61 + * itself are also RCU managed and lookup can be performed holding only
  62 + * the q lock.
  63 + *
  64 + * - icq's are not reference counted. They are destroyed when either the
  65 + * ioc or q goes away. Each request with icq set holds an extra
  66 + * reference to ioc to ensure it stays until the request is completed.
  67 + *
  68 + * - Linking and unlinking icq's are performed while holding both ioc and q
  69 + * locks. Due to the lock ordering, q exit is simple but ioc exit
  70 + * requires reverse-order double lock dance.
  71 + */
13 72 struct io_cq {
14 73 struct request_queue *q;
15 74 struct io_context *ioc;