Commit e34ecee2ae791df674dfb466ce40692ca6218e43

Authored by Kent Overstreet
1 parent d0e639c9e0

aio: Fix a trinity splat

aio kiocb refcounting was broken - it relied on tracking the number of
available ring buffer entries, which it needs to do anyway; then at
shutdown time it would wait for completions to be delivered until the
number of available ring buffer entries equalled what it was
initialized to.

The problem with that scheme is that the ring buffer is mapped writable
into userspace, so userspace could futz with the head and tail pointers
to make the kernel see extra completions and cause free_ioctx() to
return while there were still outstanding kiocbs. Which would be bad.
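
To make the exposure concrete, here is a minimal userspace sketch of the
kind of futzing described above. It is illustrative only: it assumes the
struct aio_ring header layout from fs/aio.c of this era, and that the
aio_context_t handle returned by the io_setup() syscall is the userspace
address of the ring mapping; the struct is re-declared locally because
the kernel's definition is not exported to userspace.

  #include <linux/aio_abi.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Partial re-declaration of the kernel's ring header (assumed layout). */
  struct aio_ring {
  	unsigned	id;
  	unsigned	nr;
  	unsigned	head;
  	unsigned	tail;
  	/* ... remaining header fields and the io_event array ... */
  };

  int main(void)
  {
  	aio_context_t ctx = 0;

  	if (syscall(__NR_io_setup, 128, &ctx))
  		return 1;

  	/* The ring header is mapped writable at the ctx handle address. */
  	struct aio_ring *ring = (struct aio_ring *)ctx;

  	/*
  	 * Forge a completion: nothing stops userspace from moving head or
  	 * tail, and before this fix the kernel trusted those counters when
  	 * deciding that all outstanding kiocbs had completed.
  	 */
  	ring->tail = ring->head + 1;

  	return syscall(__NR_io_destroy, ctx);
  }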

The fix is to refcount the kiocbs directly - which is more
straightforward, and with the new percpu refcounting code it doesn't
cost us any cacheline bouncing, which was the whole point of the
original scheme.
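
For reference when reading the diff below, here is a condensed sketch of
how a percpu_ref-per-request scheme of this shape fits together. The
foo_* names are hypothetical, error handling is trimmed, and it assumes
the two-argument percpu_ref_init() of this kernel era (the signature
grew extra arguments later); it is a sketch of the pattern, not an
excerpt from fs/aio.c.

  #include <linux/kernel.h>
  #include <linux/percpu-refcount.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct foo_ctx {
  	struct percpu_ref	reqs;		/* one ref per in-flight request */
  	struct work_struct	free_work;
  };

  static void foo_free(struct work_struct *work)
  {
  	struct foo_ctx *ctx = container_of(work, struct foo_ctx, free_work);

  	kfree(ctx);
  }

  /* Release callback: runs once the base ref and every request ref are gone. */
  static void foo_free_reqs(struct percpu_ref *ref)
  {
  	struct foo_ctx *ctx = container_of(ref, struct foo_ctx, reqs);

  	INIT_WORK(&ctx->free_work, foo_free);
  	schedule_work(&ctx->free_work);
  }

  static struct foo_ctx *foo_alloc(void)
  {
  	struct foo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

  	if (ctx && percpu_ref_init(&ctx->reqs, foo_free_reqs)) {
  		kfree(ctx);
  		ctx = NULL;
  	}
  	return ctx;
  }

  static void foo_submit(struct foo_ctx *ctx)
  {
  	percpu_ref_get(&ctx->reqs);	/* each outstanding request pins the ctx */
  }

  static void foo_complete(struct foo_ctx *ctx)
  {
  	percpu_ref_put(&ctx->reqs);	/* completion drops the request's ref */
  }

  static void foo_shutdown(struct foo_ctx *ctx)
  {
  	percpu_ref_kill(&ctx->reqs);	/* stop new requests; switch to atomic mode */
  	percpu_ref_put(&ctx->reqs);	/* drop the initial ref taken at init */
  }

The patch below follows the same shape: the get happens in aio_get_req(),
the put in aio_complete(), and the kill/put pair in free_ioctx_users()
once the users ref has already gone away.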

Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't
subtracting from aio_nr if ioctx_add_table() failed.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>

Showing 1 changed file (fs/aio.c) with 48 additions and 81 deletions

... ... @@ -80,6 +80,8 @@
80 80 struct percpu_ref users;
81 81 atomic_t dead;
82 82  
  83 + struct percpu_ref reqs;
  84 +
83 85 unsigned long user_id;
84 86  
85 87 struct __percpu kioctx_cpu *cpu;
... ... @@ -107,7 +109,6 @@
107 109 struct page **ring_pages;
108 110 long nr_pages;
109 111  
110   - struct rcu_head rcu_head;
111 112 struct work_struct free_work;
112 113  
113 114 struct {
... ... @@ -412,26 +413,34 @@
412 413 return cancel(kiocb);
413 414 }
414 415  
415   -static void free_ioctx_rcu(struct rcu_head *head)
  416 +static void free_ioctx(struct work_struct *work)
416 417 {
417   - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
  418 + struct kioctx *ctx = container_of(work, struct kioctx, free_work);
418 419  
  420 + pr_debug("freeing %p\n", ctx);
  421 +
  422 + aio_free_ring(ctx);
419 423 free_percpu(ctx->cpu);
420 424 kmem_cache_free(kioctx_cachep, ctx);
421 425 }
422 426  
  427 +static void free_ioctx_reqs(struct percpu_ref *ref)
  428 +{
  429 + struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
  430 +
  431 + INIT_WORK(&ctx->free_work, free_ioctx);
  432 + schedule_work(&ctx->free_work);
  433 +}
  434 +
423 435 /*
424 436 * When this function runs, the kioctx has been removed from the "hash table"
425 437 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
426 438 * now it's safe to cancel any that need to be.
427 439 */
428   -static void free_ioctx(struct work_struct *work)
  440 +static void free_ioctx_users(struct percpu_ref *ref)
429 441 {
430   - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
431   - struct aio_ring *ring;
  442 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
432 443 struct kiocb *req;
433   - unsigned cpu, avail;
434   - DEFINE_WAIT(wait);
435 444  
436 445 spin_lock_irq(&ctx->ctx_lock);
437 446  
... ... @@ -445,56 +454,10 @@
445 454  
446 455 spin_unlock_irq(&ctx->ctx_lock);
447 456  
448   - for_each_possible_cpu(cpu) {
449   - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
450   -
451   - atomic_add(kcpu->reqs_available, &ctx->reqs_available);
452   - kcpu->reqs_available = 0;
453   - }
454   -
455   - while (1) {
456   - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
457   -
458   - ring = kmap_atomic(ctx->ring_pages[0]);
459   - avail = (ring->head <= ring->tail)
460   - ? ring->tail - ring->head
461   - : ctx->nr_events - ring->head + ring->tail;
462   -
463   - atomic_add(avail, &ctx->reqs_available);
464   - ring->head = ring->tail;
465   - kunmap_atomic(ring);
466   -
467   - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
468   - break;
469   -
470   - schedule();
471   - }
472   - finish_wait(&ctx->wait, &wait);
473   -
474   - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
475   -
476   - aio_free_ring(ctx);
477   -
478   - pr_debug("freeing %p\n", ctx);
479   -
480   - /*
481   - * Here the call_rcu() is between the wait_event() for reqs_active to
482   - * hit 0, and freeing the ioctx.
483   - *
484   - * aio_complete() decrements reqs_active, but it has to touch the ioctx
485   - * after to issue a wakeup so we use rcu.
486   - */
487   - call_rcu(&ctx->rcu_head, free_ioctx_rcu);
  457 + percpu_ref_kill(&ctx->reqs);
  458 + percpu_ref_put(&ctx->reqs);
488 459 }
489 460  
490   -static void free_ioctx_ref(struct percpu_ref *ref)
491   -{
492   - struct kioctx *ctx = container_of(ref, struct kioctx, users);
493   -
494   - INIT_WORK(&ctx->free_work, free_ioctx);
495   - schedule_work(&ctx->free_work);
496   -}
497   -
498 461 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
499 462 {
500 463 unsigned i, new_nr;
... ... @@ -551,6 +514,16 @@
551 514 }
552 515 }
553 516  
  517 +static void aio_nr_sub(unsigned nr)
  518 +{
  519 + spin_lock(&aio_nr_lock);
  520 + if (WARN_ON(aio_nr - nr > aio_nr))
  521 + aio_nr = 0;
  522 + else
  523 + aio_nr -= nr;
  524 + spin_unlock(&aio_nr_lock);
  525 +}
  526 +
554 527 /* ioctx_alloc
555 528 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
556 529 */
557 530  
... ... @@ -588,9 +561,12 @@
588 561  
589 562 ctx->max_reqs = nr_events;
590 563  
591   - if (percpu_ref_init(&ctx->users, free_ioctx_ref))
592   - goto out_freectx;
  564 + if (percpu_ref_init(&ctx->users, free_ioctx_users))
  565 + goto err;
593 566  
  567 + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
  568 + goto err;
  569 +
594 570 spin_lock_init(&ctx->ctx_lock);
595 571 spin_lock_init(&ctx->completion_lock);
596 572 mutex_init(&ctx->ring_lock);
... ... @@ -600,10 +576,10 @@
600 576  
601 577 ctx->cpu = alloc_percpu(struct kioctx_cpu);
602 578 if (!ctx->cpu)
603   - goto out_freeref;
  579 + goto err;
604 580  
605 581 if (aio_setup_ring(ctx) < 0)
606   - goto out_freepcpu;
  582 + goto err;
607 583  
608 584 atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
609 585 ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
... ... @@ -615,7 +591,8 @@
615 591 if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
616 592 aio_nr + nr_events < aio_nr) {
617 593 spin_unlock(&aio_nr_lock);
618   - goto out_cleanup;
  594 + err = -EAGAIN;
  595 + goto err;
619 596 }
620 597 aio_nr += ctx->max_reqs;
621 598 spin_unlock(&aio_nr_lock);
... ... @@ -624,23 +601,19 @@
624 601  
625 602 err = ioctx_add_table(ctx, mm);
626 603 if (err)
627   - goto out_cleanup_put;
  604 + goto err_cleanup;
628 605  
629 606 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
630 607 ctx, ctx->user_id, mm, ctx->nr_events);
631 608 return ctx;
632 609  
633   -out_cleanup_put:
634   - percpu_ref_put(&ctx->users);
635   -out_cleanup:
636   - err = -EAGAIN;
  610 +err_cleanup:
  611 + aio_nr_sub(ctx->max_reqs);
  612 +err:
637 613 aio_free_ring(ctx);
638   -out_freepcpu:
639 614 free_percpu(ctx->cpu);
640   -out_freeref:
  615 + free_percpu(ctx->reqs.pcpu_count);
641 616 free_percpu(ctx->users.pcpu_count);
642   -out_freectx:
643   - put_aio_ring_file(ctx);
644 617 kmem_cache_free(kioctx_cachep, ctx);
645 618 pr_debug("error allocating ioctx %d\n", err);
646 619 return ERR_PTR(err);
... ... @@ -675,10 +648,7 @@
675 648 * -EAGAIN with no ioctxs actually in use (as far as userspace
676 649 * could tell).
677 650 */
678   - spin_lock(&aio_nr_lock);
679   - BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
680   - aio_nr -= ctx->max_reqs;
681   - spin_unlock(&aio_nr_lock);
  651 + aio_nr_sub(ctx->max_reqs);
682 652  
683 653 if (ctx->mmap_size)
684 654 vm_munmap(ctx->mmap_base, ctx->mmap_size);
... ... @@ -810,6 +780,8 @@
810 780 if (unlikely(!req))
811 781 goto out_put;
812 782  
  783 + percpu_ref_get(&ctx->reqs);
  784 +
813 785 req->ki_ctx = ctx;
814 786 return req;
815 787 out_put:
... ... @@ -879,12 +851,6 @@
879 851 return;
880 852 }
881 853  
882   - /*
883   - * Take rcu_read_lock() in case the kioctx is being destroyed, as we
884   - * need to issue a wakeup after incrementing reqs_available.
885   - */
886   - rcu_read_lock();
887   -
888 854 if (iocb->ki_list.next) {
889 855 unsigned long flags;
890 856  
... ... @@ -959,7 +925,7 @@
959 925 if (waitqueue_active(&ctx->wait))
960 926 wake_up(&ctx->wait);
961 927  
962   - rcu_read_unlock();
  928 + percpu_ref_put(&ctx->reqs);
963 929 }
964 930 EXPORT_SYMBOL(aio_complete);
965 931  
... ... @@ -1370,6 +1336,7 @@
1370 1336 return 0;
1371 1337 out_put_req:
1372 1338 put_reqs_available(ctx, 1);
  1339 + percpu_ref_put(&ctx->reqs);
1373 1340 kiocb_free(req);
1374 1341 return ret;
1375 1342 }