Commit a31ad380bed817aa25f8830ad23e1a0480fef797

Authored by Kent Overstreet
Committed by Linus Torvalds
1 parent 774a08b354

aio: make aio_read_evt() more efficient, convert to hrtimers

Previously, aio_read_evt() pulled a single completion off the
ringbuffer at a time, locking and unlocking each time.  Change it to
pull off as many events as it can at a time, and copy them directly to
userspace.
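
From userspace, this is the io_getevents() read path: a caller asking
for a batch of completions now gets them with one lock/unlock and
page-sized copy_to_user() calls instead of one round trip per event.
A minimal, hypothetical raw-syscall caller for illustration (real
applications would typically use libaio):

    #include <linux/aio_abi.h>      /* aio_context_t, struct io_event */
    #include <sys/syscall.h>
    #include <unistd.h>

    /* Reap between 1 and 64 completions in one syscall; with this patch
     * the kernel copies the whole batch straight out of the ring pages. */
    static long reap_events(aio_context_t ctx, struct io_event *events)
    {
            return syscall(__NR_io_getevents, ctx, 1L, 64L, events, NULL);
    }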

This also fixes a bug where, if copying an event to userspace failed,
we'd lose the event: the old code advanced the ring head before
attempting the copy_to_user(), so a faulting copy dropped the
completion.  The head is now only updated after a successful copy.

Also convert it to wait_event_interruptible_hrtimeout(), which
simplifies it quite a bit.
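
For reference, wait_event_interruptible_hrtimeout(wq, condition,
timeout) comes from the companion wait.h patch in this series: it
sleeps until the condition is true or a high-resolution timer fires,
evaluating to 0 if the condition came true, -ETIME on timeout, and
-ERESTARTSYS on a signal.  A simplified sketch (not the exact
<linux/wait.h> text):

    #define wait_event_interruptible_hrtimeout(wq, condition, timeout)   \
    ({                                                                    \
            long __ret = 0;                                               \
            if (!(condition))       /* fast path: already true */         \
                    __ret = __wait_event_hrtimeout(wq, condition,         \
                                       timeout, TASK_INTERRUPTIBLE);      \
            __ret;  /* 0, -ETIME, or -ERESTARTSYS */                      \
    })

read_events() can discard that value: aio_read_events() accumulates
progress (or an error code) in 'ret' either way, and a pending signal
is turned into -EINTR only when no events were copied.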

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 1 changed file with 90 additions and 150 deletions

@@ -63,7 +63,7 @@
 	unsigned long		mmap_size;
 
	struct page		**ring_pages;
-	spinlock_t		ring_lock;
+	struct mutex		ring_lock;
 	long			nr_pages;
 
 	unsigned		nr, tail;
@@ -344,7 +344,7 @@
 	atomic_set(&ctx->users, 2);
 	atomic_set(&ctx->dead, 0);
 	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
+	mutex_init(&ctx->ring_info.ring_lock);
 	init_waitqueue_head(&ctx->wait);
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
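The lock-type change above is what makes the direct-to-userspace copy
below possible: the new read path calls copy_to_user() while holding
ring_lock, and copy_to_user() may fault in user pages and sleep, which
is forbidden in the atomic context a spinlock creates but fine under a
mutex.  Schematically (illustrative, not from the patch):

    spin_lock(&info->ring_lock);
    copy_to_user(buf, ev, sizeof(*ev));     /* BUG: may sleep while atomic */
    spin_unlock(&info->ring_lock);

    mutex_lock(&info->ring_lock);
    copy_to_user(buf, ev, sizeof(*ev));     /* OK: mutex holders may sleep */
    mutex_unlock(&info->ring_lock);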
@@ -748,187 +748,127 @@
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_evt
- *	Pull an event off of the ioctx's event ring.  Returns the number of
- *	events fetched (0 or 1 ;-)
- * FIXME: make this use cmpxchg.
- * TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ *	Pull an event off of the ioctx's event ring.  Returns the number of
+ *	events fetched
  */
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+				 struct io_event __user *event, long nr)
 {
-	struct aio_ring_info *info = &ioctx->ring_info;
+	struct aio_ring_info *info = &ctx->ring_info;
 	struct aio_ring *ring;
-	unsigned long head;
-	int ret = 0;
+	unsigned head, pos;
+	long ret = 0;
+	int copy_ret;
 
+	mutex_lock(&info->ring_lock);
+
 	ring = kmap_atomic(info->ring_pages[0]);
-	pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr);
+	head = ring->head;
+	kunmap_atomic(ring);
 
-	if (ring->head == ring->tail)
+	pr_debug("h%u t%u m%u\n", head, info->tail, info->nr);
+
+	if (head == info->tail)
 		goto out;
 
-	spin_lock(&info->ring_lock);
-
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
-		put_aio_ring_event(evp);
+	while (ret < nr) {
+		long avail;
+		struct io_event *ev;
+		struct page *page;
+
+		avail = (head <= info->tail ? info->tail : info->nr) - head;
+		if (head == info->tail)
+			break;
+
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+			      ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+		pos = head + AIO_EVENTS_OFFSET;
+		page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+		pos %= AIO_EVENTS_PER_PAGE;
+
+		ev = kmap(page);
+		copy_ret = copy_to_user(event + ret, ev + pos,
+					sizeof(*ev) * avail);
+		kunmap(page);
+
+		if (unlikely(copy_ret)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret += avail;
+		head += avail;
+		head %= info->nr;
 	}
-	spin_unlock(&info->ring_lock);
 
-out:
+	ring = kmap_atomic(info->ring_pages[0]);
+	ring->head = head;
 	kunmap_atomic(ring);
-	pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail);
+
+	pr_debug("%li h%u t%u\n", ret, head, info->tail);
+out:
+	mutex_unlock(&info->ring_lock);
+
 	return ret;
 }
 
-struct aio_timeout {
-	struct timer_list	timer;
-	int			timed_out;
-	struct task_struct	*p;
-};
-
-static void timeout_func(unsigned long data)
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event, long *i)
 {
-	struct aio_timeout *to = (struct aio_timeout *)data;
+	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
 
-	to->timed_out = 1;
-	wake_up_process(to->p);
-}
+	if (ret > 0)
+		*i += ret;
 
-static inline void init_timeout(struct aio_timeout *to)
-{
-	setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
-	to->timed_out = 0;
-	to->p = current;
-}
+	if (unlikely(atomic_read(&ctx->dead)))
+		ret = -EINVAL;
 
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
-			       const struct timespec *ts)
-{
-	to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
-	if (time_after(to->timer.expires, jiffies))
-		add_timer(&to->timer);
-	else
-		to->timed_out = 1;
-}
+	if (!*i)
+		*i = ret;
 
-static inline void clear_timeout(struct aio_timeout *to)
-{
-	del_singleshot_timer_sync(&to->timer);
+	return ret < 0 || *i >= min_nr;
 }
 
-static int read_events(struct kioctx *ctx,
-			long min_nr, long nr,
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
 {
-	long start_jiffies = jiffies;
-	struct task_struct	*tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	int			ret;
-	int			i = 0;
-	struct io_event		ent;
-	struct aio_timeout	to;
+	ktime_t until = { .tv64 = KTIME_MAX };
+	long ret = 0;
 
-	/* needed to zero any padding within an entry (there shouldn't be
-	 * any, but C is fun!
-	 */
-	memset(&ent, 0, sizeof(ent));
-	ret = 0;
-	while (likely(i < nr)) {
-		ret = aio_read_evt(ctx, &ent);
-		if (unlikely(ret <= 0))
-			break;
-
-		pr_debug("%Lx %Lx %Lx %Lx\n",
-			 ent.data, ent.obj, ent.res, ent.res2);
-
-		/* Could we split the check in two? */
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
-		ret = 0;
-
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
-
-	if (min_nr <= i)
-		return i;
-	if (ret)
-		return ret;
-
-	/* End fast path */
-
-	init_timeout(&to);
 	if (timeout) {
 		struct timespec	ts;
-		ret = -EFAULT;
+
 		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-			goto out;
+			return -EFAULT;
 
-		set_timeout(start_jiffies, &to, &ts);
+		until = timespec_to_ktime(ts);
 	}
 
-	while (likely(i < nr)) {
-		add_wait_queue_exclusive(&ctx->wait, &wait);
-		do {
-			set_task_state(tsk, TASK_INTERRUPTIBLE);
-			ret = aio_read_evt(ctx, &ent);
-			if (ret)
-				break;
-			if (min_nr <= i)
-				break;
-			if (unlikely(atomic_read(&ctx->dead))) {
-				ret = -EINVAL;
-				break;
-			}
-			if (to.timed_out)	/* Only check after read evt */
-				break;
-			/* Try to only show up in io wait if there are ops
-			 * in flight */
-			if (atomic_read(&ctx->reqs_active))
-				io_schedule();
-			else
-				schedule();
-			if (signal_pending(tsk)) {
-				ret = -EINTR;
-				break;
-			}
-			/*ret = aio_read_evt(ctx, &ent);*/
-		} while (1) ;
+	/*
+	 * Note that aio_read_events() is being called as the conditional - i.e.
+	 * we're calling it after prepare_to_wait() has set task state to
+	 * TASK_INTERRUPTIBLE.
+	 *
+	 * But aio_read_events() can block, and if it blocks it's going to flip
+	 * the task state back to TASK_RUNNING.
+	 *
+	 * This should be ok, provided it doesn't flip the state back to
+	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
+	 * will only happen if the mutex_lock() call blocks, and we then find
+	 * the ringbuffer empty. So in practice we should be ok, but it's
+	 * something to be aware of when touching this code.
+	 */
+	wait_event_interruptible_hrtimeout(ctx->wait,
+			aio_read_events(ctx, min_nr, nr, event, &ret), until);
 
-		set_task_state(tsk, TASK_RUNNING);
-		remove_wait_queue(&ctx->wait, &wait);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
 
-		if (unlikely(ret <= 0))
-			break;
-
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
-
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
-
-	if (timeout)
-		clear_timeout(&to);
-out:
-	destroy_timer_on_stack(&to.timer);
-	return i ? i : ret;
+	return ret;
 }
 
 /* sys_io_setup:
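
The page arithmetic in aio_read_events_ring() relies on the ring
layout macros already defined near the top of fs/aio.c (quoted here
for context; not changed by this patch):

    #define AIO_EVENTS_PER_PAGE     (PAGE_SIZE / sizeof(struct io_event))
    #define AIO_EVENTS_FIRST_PAGE   ((PAGE_SIZE - sizeof(struct aio_ring)) / \
                                     sizeof(struct io_event))
    #define AIO_EVENTS_OFFSET       (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)

With 4096-byte pages and the 32-byte struct io_event, 128 events fit
per page, but page 0 also holds the struct aio_ring header, so logical
slot 'head' lives at linear position head + AIO_EVENTS_OFFSET.  That
is why the copy loop clamps 'avail' to the remainder of the current
page: each kmap()/copy_to_user()/kunmap() round then touches exactly
one ring page.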