Commit f2dbd76a0a994bc1d5a3d0e7c844cc373832e86c

Authored by Tejun Heo
Committed by Jens Axboe
1 parent 1238033c79

block, cfq: replace current_io_context() with create_io_context()

When called under queue_lock, current_io_context() triggers a lockdep
warning if it hits the allocation path.  This is because io_context
installation is protected by task_lock, which is not IRQ safe, so it
triggers an irq-unsafe-lock -> irq -> irq-safe-lock -> irq-unsafe-lock
deadlock warning.
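
To make the complaint concrete, here is a minimal sketch of the
offending lock nesting, assuming a request_queue *q and the pre-patch
current_io_context(); it is illustrative only, not code from the tree:

  spin_lock_irq(q->queue_lock);                  /* IRQ-safe lock held */
  ioc = current_io_context(GFP_ATOMIC, q->node); /* may hit allocation path */
  /*
   * The allocation path ends in task_lock(current), an IRQ-unsafe
   * spinlock, so lockdep records "IRQ-safe queue_lock -> IRQ-unsafe
   * task_lock".  A task holding task_lock with IRQs enabled can be
   * interrupted by an IRQ that needs queue_lock, closing the
   * irq-unsafe-lock -> irq -> irq-safe-lock -> irq-unsafe-lock cycle.
   */
  spin_unlock_irq(q->queue_lock);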

Given this restriction, an accessor and creator rolled into one doesn't
work well.  Drop current_io_context() and let users access
task->io_context directly inside queue_lock, combined with explicit
creation using create_io_context().
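
A minimal sketch of the resulting caller-side pattern under queue_lock;
the helper name example_get_ioc is made up, while create_io_context()
and task->io_context are the interfaces this patch settles on:

  static struct io_context *example_get_ioc(struct request_queue *q,
                                            gfp_t gfp_mask)
  {
          struct io_context *ioc = current->io_context; /* plain access under queue_lock */

          if (!ioc) {
                  /* creation needs IRQ-unsafe task_lock, so drop queue_lock first */
                  spin_unlock_irq(q->queue_lock);
                  create_io_context(current, gfp_mask, q->node);
                  spin_lock_irq(q->queue_lock);
                  /* caller must re-test anything it checked under queue_lock */
                  ioc = current->io_context; /* still NULL if allocation failed */
          }
          return ioc;
  }

get_request() below follows exactly this shape, with a retry of its
congestion checks after reacquiring queue_lock.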

Future ioc updates will further consolidate ioc access and the create
interface will be unexported.

While at it, relocate ioc internal interface declarations in blk.h and
add section comments before and after.

This patch does not introduce any functional change.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

Showing 4 changed files with 71 additions and 54 deletions

block/blk-core.c
... ... @@ -771,9 +771,12 @@
771 771 {
772 772 struct request *rq = NULL;
773 773 struct request_list *rl = &q->rq;
774   - struct io_context *ioc = NULL;
  774 + struct io_context *ioc;
775 775 const bool is_sync = rw_is_sync(rw_flags) != 0;
  776 + bool retried = false;
776 777 int may_queue;
  778 +retry:
  779 + ioc = current->io_context;
777 780  
778 781 if (unlikely(blk_queue_dead(q)))
779 782 return NULL;
780 783  
... ... @@ -784,8 +787,21 @@
784 787  
785 788 if (rl->count[is_sync]+1 >= queue_congestion_on_threshold(q)) {
786 789 if (rl->count[is_sync]+1 >= q->nr_requests) {
787   - ioc = current_io_context(GFP_ATOMIC, q->node);
788 790 /*
  791 + * We want ioc to record batching state. If it's
  792 + * not already there, creating a new one requires
  793 + * dropping queue_lock, which in turn requires
  794 + * retesting conditions to avoid queue hang.
  795 + */
  796 + if (!ioc && !retried) {
  797 + spin_unlock_irq(q->queue_lock);
  798 + create_io_context(current, gfp_mask, q->node);
  799 + spin_lock_irq(q->queue_lock);
  800 + retried = true;
  801 + goto retry;
  802 + }
  803 +
  804 + /*
789 805 * The queue will fill after this allocation, so set
790 806 * it as full, and mark this process as "batching".
791 807 * This process will be allowed to complete a batch of
... ... @@ -892,7 +908,6 @@
892 908 rq = get_request(q, rw_flags, bio, GFP_NOIO);
893 909 while (!rq) {
894 910 DEFINE_WAIT(wait);
895   - struct io_context *ioc;
896 911 struct request_list *rl = &q->rq;
897 912  
898 913 if (unlikely(blk_queue_dead(q)))
... ... @@ -912,8 +927,8 @@
912 927 * up to a big batch of them for a small period time.
913 928 * See ioc_batching, ioc_set_batching
914 929 */
915   - ioc = current_io_context(GFP_NOIO, q->node);
916   - ioc_set_batching(q, ioc);
  930 + create_io_context(current, GFP_NOIO, q->node);
  931 + ioc_set_batching(q, current->io_context);
917 932  
918 933 spin_lock_irq(q->queue_lock);
919 934 finish_wait(&rl->wait[is_sync], &wait);
block/blk-ioc.c
... ... @@ -205,16 +205,15 @@
205 205 put_io_context(ioc, NULL);
206 206 }
207 207  
208   -static struct io_context *create_task_io_context(struct task_struct *task,
209   - gfp_t gfp_flags, int node,
210   - bool take_ref)
  208 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_flags,
  209 + int node)
211 210 {
212 211 struct io_context *ioc;
213 212  
214 213 ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
215 214 node);
216 215 if (unlikely(!ioc))
217   - return NULL;
  216 + return;
218 217  
219 218 /* initialize */
220 219 atomic_long_set(&ioc->refcount, 1);
... ... @@ -226,44 +225,15 @@
226 225  
227 226 /* try to install, somebody might already have beaten us to it */
228 227 task_lock(task);
229   -
230   - if (!task->io_context && !(task->flags & PF_EXITING)) {
  228 + if (!task->io_context && !(task->flags & PF_EXITING))
231 229 task->io_context = ioc;
232   - } else {
  230 + else
233 231 kmem_cache_free(iocontext_cachep, ioc);
234   - ioc = task->io_context;
235   - }
236   -
237   - if (ioc && take_ref)
238   - get_io_context(ioc);
239   -
240 232 task_unlock(task);
241   - return ioc;
242 233 }
  234 +EXPORT_SYMBOL(create_io_context_slowpath);
243 235  
244 236 /**
245   - * current_io_context - get io_context of %current
246   - * @gfp_flags: allocation flags, used if allocation is necessary
247   - * @node: allocation node, used if allocation is necessary
248   - *
249   - * Return io_context of %current. If it doesn't exist, it is created with
250   - * @gfp_flags and @node. The returned io_context does NOT have its
251   - * reference count incremented. Because io_context is exited only on task
252   - * exit, %current can be sure that the returned io_context is valid and
253   - * alive as long as it is executing.
254   - */
255   -struct io_context *current_io_context(gfp_t gfp_flags, int node)
256   -{
257   - might_sleep_if(gfp_flags & __GFP_WAIT);
258   -
259   - if (current->io_context)
260   - return current->io_context;
261   -
262   - return create_task_io_context(current, gfp_flags, node, false);
263   -}
264   -EXPORT_SYMBOL(current_io_context);
265   -
266   -/**
267 237 * get_task_io_context - get io_context of a task
268 238 * @task: task of interest
269 239 * @gfp_flags: allocation flags, used if allocation is necessary
... ... @@ -274,7 +244,7 @@
274 244 * incremented.
275 245 *
276 246 * This function always goes through task_lock() and it's better to use
277   - * current_io_context() + get_io_context() for %current.
  247 + * %current->io_context + get_io_context() for %current.
278 248 */
279 249 struct io_context *get_task_io_context(struct task_struct *task,
280 250 gfp_t gfp_flags, int node)
... ... @@ -283,16 +253,18 @@
283 253  
284 254 might_sleep_if(gfp_flags & __GFP_WAIT);
285 255  
286   - task_lock(task);
287   - ioc = task->io_context;
288   - if (likely(ioc)) {
289   - get_io_context(ioc);
  256 + do {
  257 + task_lock(task);
  258 + ioc = task->io_context;
  259 + if (likely(ioc)) {
  260 + get_io_context(ioc);
  261 + task_unlock(task);
  262 + return ioc;
  263 + }
290 264 task_unlock(task);
291   - return ioc;
292   - }
293   - task_unlock(task);
  265 + } while (create_io_context(task, gfp_flags, node));
294 266  
295   - return create_task_io_context(task, gfp_flags, node, true);
  267 + return NULL;
296 268 }
297 269 EXPORT_SYMBOL(get_task_io_context);
298 270  
block/blk.h
... ... @@ -127,9 +127,6 @@
127 127 }
128 128 #endif
129 129  
130   -void get_io_context(struct io_context *ioc);
131   -struct io_context *current_io_context(gfp_t gfp_flags, int node);
132   -
133 130 int ll_back_merge_fn(struct request_queue *q, struct request *req,
134 131 struct bio *bio);
135 132 int ll_front_merge_fn(struct request_queue *q, struct request *req,
... ... @@ -198,6 +195,39 @@
198 195 (rq->cmd_flags & REQ_DISCARD));
199 196 }
200 197  
  198 +/*
  199 + * Internal io_context interface
  200 + */
  201 +void get_io_context(struct io_context *ioc);
  202 +
  203 +void create_io_context_slowpath(struct task_struct *task, gfp_t gfp_mask,
  204 + int node);
  205 +
  206 +/**
  207 + * create_io_context - try to create task->io_context
  208 + * @task: target task
  209 + * @gfp_mask: allocation mask
  210 + * @node: allocation node
  211 + *
  212 + * If @task->io_context is %NULL, allocate a new io_context and install it.
  213 + * Returns the current @task->io_context which may be %NULL if allocation
  214 + * failed.
  215 + *
  216 + * Note that this function can't be called with IRQ disabled because
  217 + * task_lock which protects @task->io_context is IRQ-unsafe.
  218 + */
  219 +static inline struct io_context *create_io_context(struct task_struct *task,
  220 + gfp_t gfp_mask, int node)
  221 +{
  222 + WARN_ON_ONCE(irqs_disabled());
  223 + if (unlikely(!task->io_context))
  224 + create_io_context_slowpath(task, gfp_mask, node);
  225 + return task->io_context;
  226 +}
  227 +
  228 +/*
  229 + * Internal throttling interface
  230 + */
201 231 #ifdef CONFIG_BLK_DEV_THROTTLING
202 232 extern bool blk_throtl_bio(struct request_queue *q, struct bio *bio);
203 233 extern void blk_throtl_drain(struct request_queue *q);
block/cfq-iosched.c
... ... @@ -3012,7 +3012,7 @@
3012 3012 might_sleep_if(gfp_mask & __GFP_WAIT);
3013 3013  
3014 3014 /* allocate stuff */
3015   - ioc = current_io_context(gfp_mask, q->node);
  3015 + ioc = create_io_context(current, gfp_mask, q->node);
3016 3016 if (!ioc)
3017 3017 goto out;
3018 3018