Commit e34ecee2ae791df674dfb466ce40692ca6218e43

Authored by Kent Overstreet
1 parent d0e639c9e0

aio: Fix a trinity splat

aio kiocb refcounting was broken - it relied on tracking the number of
available ring buffer entries, which it needs to do anyway; then at
shutdown time it would wait for completions to be delivered until the
number of available ring buffer entries equalled what it was
initialized to.

The problem with that scheme is that the ring buffer is mapped writable
into userspace, so userspace could futz with the head and tail pointers
to make the kernel see extra completions and cause free_ioctx() to
return while there were still outstanding kiocbs. Which would be bad.
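
To make the exposure concrete, here is a minimal userspace sketch of the
kind of futzing described above. It is illustrative only: it assumes the
struct aio_ring header layout from fs/aio.c of this era, and that the
aio_context_t handle returned by the io_setup() syscall is the userspace
address of the ring mapping; the struct is re-declared locally because
the kernel's definition is not exported to userspace.

  #include <linux/aio_abi.h>
  #include <sys/syscall.h>
  #include <unistd.h>

  /* Partial re-declaration of the kernel's ring header (assumed layout). */
  struct aio_ring {
  	unsigned	id;
  	unsigned	nr;
  	unsigned	head;
  	unsigned	tail;
  	/* ... remaining header fields and the io_event array ... */
  };

  int main(void)
  {
  	aio_context_t ctx = 0;

  	if (syscall(__NR_io_setup, 128, &ctx))
  		return 1;

  	/* The ring header is mapped writable at the ctx handle address. */
  	struct aio_ring *ring = (struct aio_ring *)ctx;

  	/*
  	 * Forge a completion: nothing stops userspace from moving head or
  	 * tail, and before this fix the kernel trusted those counters when
  	 * deciding that all outstanding kiocbs had completed.
  	 */
  	ring->tail = ring->head + 1;

  	return syscall(__NR_io_destroy, ctx);
  }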

The fix is to refcount the kiocbs directly - which is more
straightforward, and with the new percpu refcounting code it doesn't
cost us any cacheline bouncing, which was the whole point of the
original scheme.
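
For reference when reading the diff below, here is a condensed sketch of
how a percpu_ref-per-request scheme of this shape fits together. The
foo_* names are hypothetical, error handling is trimmed, and it assumes
the two-argument percpu_ref_init() of this kernel era (the signature
grew extra arguments later); it is a sketch of the pattern, not an
excerpt from fs/aio.c.

  #include <linux/kernel.h>
  #include <linux/percpu-refcount.h>
  #include <linux/slab.h>
  #include <linux/workqueue.h>

  struct foo_ctx {
  	struct percpu_ref	reqs;		/* one ref per in-flight request */
  	struct work_struct	free_work;
  };

  static void foo_free(struct work_struct *work)
  {
  	struct foo_ctx *ctx = container_of(work, struct foo_ctx, free_work);

  	kfree(ctx);
  }

  /* Release callback: runs once the base ref and every request ref are gone. */
  static void foo_free_reqs(struct percpu_ref *ref)
  {
  	struct foo_ctx *ctx = container_of(ref, struct foo_ctx, reqs);

  	INIT_WORK(&ctx->free_work, foo_free);
  	schedule_work(&ctx->free_work);
  }

  static struct foo_ctx *foo_alloc(void)
  {
  	struct foo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

  	if (ctx && percpu_ref_init(&ctx->reqs, foo_free_reqs)) {
  		kfree(ctx);
  		ctx = NULL;
  	}
  	return ctx;
  }

  static void foo_submit(struct foo_ctx *ctx)
  {
  	percpu_ref_get(&ctx->reqs);	/* each outstanding request pins the ctx */
  }

  static void foo_complete(struct foo_ctx *ctx)
  {
  	percpu_ref_put(&ctx->reqs);	/* completion drops the request's ref */
  }

  static void foo_shutdown(struct foo_ctx *ctx)
  {
  	percpu_ref_kill(&ctx->reqs);	/* stop new requests; switch to atomic mode */
  	percpu_ref_put(&ctx->reqs);	/* drop the initial ref taken at init */
  }

The patch below follows the same shape: the get happens in aio_get_req(),
the put in aio_complete(), and the kill/put pair in free_ioctx_users()
once the users ref has already gone away.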

Also clean up ioctx_alloc()'s error path and fix a bug where it wasn't
subtracting from aio_nr if ioctx_add_table() failed.

Signed-off-by: Kent Overstreet <kmo@daterainc.com>

Showing 1 changed file (fs/aio.c) with 48 additions and 81 deletions

... ... @@ -80,6 +80,8 @@
80 80 struct percpu_ref users;
81 81 atomic_t dead;
82 82  
  83 + struct percpu_ref reqs;
  84 +
83 85 unsigned long user_id;
84 86  
85 87 struct __percpu kioctx_cpu *cpu;
... ... @@ -107,7 +109,6 @@
107 109 struct page **ring_pages;
108 110 long nr_pages;
109 111  
110   - struct rcu_head rcu_head;
111 112 struct work_struct free_work;
112 113  
113 114 struct {
... ... @@ -412,26 +413,34 @@
412 413 return cancel(kiocb);
413 414 }
414 415  
415   -static void free_ioctx_rcu(struct rcu_head *head)
  416 +static void free_ioctx(struct work_struct *work)
416 417 {
417   - struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
  418 + struct kioctx *ctx = container_of(work, struct kioctx, free_work);
418 419  
  420 + pr_debug("freeing %p\n", ctx);
  421 +
  422 + aio_free_ring(ctx);
419 423 free_percpu(ctx->cpu);
420 424 kmem_cache_free(kioctx_cachep, ctx);
421 425 }
422 426  
  427 +static void free_ioctx_reqs(struct percpu_ref *ref)
  428 +{
  429 + struct kioctx *ctx = container_of(ref, struct kioctx, reqs);
  430 +
  431 + INIT_WORK(&ctx->free_work, free_ioctx);
  432 + schedule_work(&ctx->free_work);
  433 +}
  434 +
423 435 /*
424 436 * When this function runs, the kioctx has been removed from the "hash table"
425 437 * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
426 438 * now it's safe to cancel any that need to be.
427 439 */
428   -static void free_ioctx(struct work_struct *work)
  440 +static void free_ioctx_users(struct percpu_ref *ref)
429 441 {
430   - struct kioctx *ctx = container_of(work, struct kioctx, free_work);
431   - struct aio_ring *ring;
  442 + struct kioctx *ctx = container_of(ref, struct kioctx, users);
432 443 struct kiocb *req;
433   - unsigned cpu, avail;
434   - DEFINE_WAIT(wait);
435 444  
436 445 spin_lock_irq(&ctx->ctx_lock);
437 446  
... ... @@ -445,56 +454,10 @@
445 454  
446 455 spin_unlock_irq(&ctx->ctx_lock);
447 456  
448   - for_each_possible_cpu(cpu) {
449   - struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
450   -
451   - atomic_add(kcpu->reqs_available, &ctx->reqs_available);
452   - kcpu->reqs_available = 0;
453   - }
454   -
455   - while (1) {
456   - prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
457   -
458   - ring = kmap_atomic(ctx->ring_pages[0]);
459   - avail = (ring->head <= ring->tail)
460   - ? ring->tail - ring->head
461   - : ctx->nr_events - ring->head + ring->tail;
462   -
463   - atomic_add(avail, &ctx->reqs_available);
464   - ring->head = ring->tail;
465   - kunmap_atomic(ring);
466   -
467   - if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
468   - break;
469   -
470   - schedule();
471   - }
472   - finish_wait(&ctx->wait, &wait);
473   -
474   - WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
475   -
476   - aio_free_ring(ctx);
477   -
478   - pr_debug("freeing %p\n", ctx);
479   -
480   - /*
481   - * Here the call_rcu() is between the wait_event() for reqs_active to
482   - * hit 0, and freeing the ioctx.
483   - *
484   - * aio_complete() decrements reqs_active, but it has to touch the ioctx
485   - * after to issue a wakeup so we use rcu.
486   - */
487   - call_rcu(&ctx->rcu_head, free_ioctx_rcu);
  457 + percpu_ref_kill(&ctx->reqs);
  458 + percpu_ref_put(&ctx->reqs);
488 459 }
489 460  
490   -static void free_ioctx_ref(struct percpu_ref *ref)
491   -{
492   - struct kioctx *ctx = container_of(ref, struct kioctx, users);
493   -
494   - INIT_WORK(&ctx->free_work, free_ioctx);
495   - schedule_work(&ctx->free_work);
496   -}
497   -
498 461 static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
499 462 {
500 463 unsigned i, new_nr;
... ... @@ -551,6 +514,16 @@
551 514 }
552 515 }
553 516  
  517 +static void aio_nr_sub(unsigned nr)
  518 +{
  519 + spin_lock(&aio_nr_lock);
  520 + if (WARN_ON(aio_nr - nr > aio_nr))
  521 + aio_nr = 0;
  522 + else
  523 + aio_nr -= nr;
  524 + spin_unlock(&aio_nr_lock);
  525 +}
  526 +
554 527 /* ioctx_alloc
555 528 * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed.
556 529 */
557 530  
... ... @@ -588,9 +561,12 @@
588 561  
589 562 ctx->max_reqs = nr_events;
590 563  
591   - if (percpu_ref_init(&ctx->users, free_ioctx_ref))
592   - goto out_freectx;
  564 + if (percpu_ref_init(&ctx->users, free_ioctx_users))
  565 + goto err;
593 566  
  567 + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs))
  568 + goto err;
  569 +
594 570 spin_lock_init(&ctx->ctx_lock);
595 571 spin_lock_init(&ctx->completion_lock);
596 572 mutex_init(&ctx->ring_lock);
... ... @@ -600,10 +576,10 @@
600 576  
601 577 ctx->cpu = alloc_percpu(struct kioctx_cpu);
602 578 if (!ctx->cpu)
603   - goto out_freeref;
  579 + goto err;
604 580  
605 581 if (aio_setup_ring(ctx) < 0)
606   - goto out_freepcpu;
  582 + goto err;
607 583  
608 584 atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
609 585 ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
... ... @@ -615,7 +591,8 @@
615 591 if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
616 592 aio_nr + nr_events < aio_nr) {
617 593 spin_unlock(&aio_nr_lock);
618   - goto out_cleanup;
  594 + err = -EAGAIN;
  595 + goto err;
619 596 }
620 597 aio_nr += ctx->max_reqs;
621 598 spin_unlock(&aio_nr_lock);
... ... @@ -624,23 +601,19 @@
624 601  
625 602 err = ioctx_add_table(ctx, mm);
626 603 if (err)
627   - goto out_cleanup_put;
  604 + goto err_cleanup;
628 605  
629 606 pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
630 607 ctx, ctx->user_id, mm, ctx->nr_events);
631 608 return ctx;
632 609  
633   -out_cleanup_put:
634   - percpu_ref_put(&ctx->users);
635   -out_cleanup:
636   - err = -EAGAIN;
  610 +err_cleanup:
  611 + aio_nr_sub(ctx->max_reqs);
  612 +err:
637 613 aio_free_ring(ctx);
638   -out_freepcpu:
639 614 free_percpu(ctx->cpu);
640   -out_freeref:
  615 + free_percpu(ctx->reqs.pcpu_count);
641 616 free_percpu(ctx->users.pcpu_count);
642   -out_freectx:
643   - put_aio_ring_file(ctx);
644 617 kmem_cache_free(kioctx_cachep, ctx);
645 618 pr_debug("error allocating ioctx %d\n", err);
646 619 return ERR_PTR(err);
... ... @@ -675,10 +648,7 @@
675 648 * -EAGAIN with no ioctxs actually in use (as far as userspace
676 649 * could tell).
677 650 */
678   - spin_lock(&aio_nr_lock);
679   - BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
680   - aio_nr -= ctx->max_reqs;
681   - spin_unlock(&aio_nr_lock);
  651 + aio_nr_sub(ctx->max_reqs);
682 652  
683 653 if (ctx->mmap_size)
684 654 vm_munmap(ctx->mmap_base, ctx->mmap_size);
... ... @@ -810,6 +780,8 @@
810 780 if (unlikely(!req))
811 781 goto out_put;
812 782  
  783 + percpu_ref_get(&ctx->reqs);
  784 +
813 785 req->ki_ctx = ctx;
814 786 return req;
815 787 out_put:
... ... @@ -879,12 +851,6 @@
879 851 return;
880 852 }
881 853  
882   - /*
883   - * Take rcu_read_lock() in case the kioctx is being destroyed, as we
884   - * need to issue a wakeup after incrementing reqs_available.
885   - */
886   - rcu_read_lock();
887   -
888 854 if (iocb->ki_list.next) {
889 855 unsigned long flags;
890 856  
... ... @@ -959,7 +925,7 @@
959 925 if (waitqueue_active(&ctx->wait))
960 926 wake_up(&ctx->wait);
961 927  
962   - rcu_read_unlock();
  928 + percpu_ref_put(&ctx->reqs);
963 929 }
964 930 EXPORT_SYMBOL(aio_complete);
965 931  
... ... @@ -1370,6 +1336,7 @@
1370 1336 return 0;
1371 1337 out_put_req:
1372 1338 put_reqs_available(ctx, 1);
  1339 + percpu_ref_put(&ctx->reqs);
1373 1340 kiocb_free(req);
1374 1341 return ret;
1375 1342 }