Commit a31ad380bed817aa25f8830ad23e1a0480fef797
Committed by: Linus Torvalds
1 parent: 774a08b354
Exists in: master and 20 other branches
aio: make aio_read_evt() more efficient, convert to hrtimers
Previously, aio_read_evt() pulled a single completion off the ringbuffer at a time, locking and unlocking each time. Change it to pull off as many events as it can at a time, and copy them directly to userspace.

This also fixes a bug: if copying the event to userspace failed, we'd lose the event.

Also convert it to wait_event_interruptible_hrtimeout(), which simplifies it quite a bit.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
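For context, the code being reworked here is what services the io_getevents(2) syscall. A minimal userspace exerciser, using the real libaio API (io_setup/io_submit/io_getevents); the filename, sizes, and counts are arbitrary placeholders, not anything from the patch:

/*
 * Hypothetical exerciser for the read_events() path (not part of the patch).
 * Build with: gcc -o getevents getevents.c -laio
 * Kernel AIO is only truly asynchronous for O_DIRECT I/O.
 */
#define _GNU_SOURCE
#include <libaio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	io_context_t ctx;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event events[8];
	struct timespec timeout = { .tv_sec = 1, .tv_nsec = 0 };
	static char buf[4096] __attribute__((aligned(4096)));
	int fd, ret;

	memset(&ctx, 0, sizeof(ctx));	/* ctx must be zeroed before io_setup() */
	ret = io_setup(8, &ctx);
	if (ret < 0) {
		fprintf(stderr, "io_setup: %d\n", ret);
		return 1;
	}

	fd = open("/tmp/testfile", O_RDONLY | O_DIRECT);	/* placeholder path */
	if (fd < 0) {
		perror("open");
		return 1;
	}

	io_prep_pread(&cb, fd, buf, sizeof(buf), 0);
	ret = io_submit(ctx, 1, cbs);
	if (ret != 1) {
		fprintf(stderr, "io_submit: %d\n", ret);
		return 1;
	}

	/*
	 * min_nr = 1, nr = 8: with this patch the kernel can copy several
	 * completions to userspace per ring_lock acquisition, and the
	 * timeout is serviced by an hrtimer instead of a jiffies timer.
	 */
	ret = io_getevents(ctx, 1, 8, events, &timeout);
	printf("io_getevents returned %d\n", ret);

	io_destroy(ctx);
	return 0;
}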
Showing 1 changed file with 90 additions and 150 deletions
fs/aio.c
@@ -63,7 +63,7 @@
 	unsigned long		mmap_size;
 
	struct page		**ring_pages;
-	spinlock_t		ring_lock;
+	struct mutex		ring_lock;
	long			nr_pages;
 
	unsigned		nr, tail;
@@ -344,7 +344,7 @@
	atomic_set(&ctx->users, 2);
	atomic_set(&ctx->dead, 0);
	spin_lock_init(&ctx->ctx_lock);
-	spin_lock_init(&ctx->ring_info.ring_lock);
+	mutex_init(&ctx->ring_info.ring_lock);
	init_waitqueue_head(&ctx->wait);
 
	INIT_LIST_HEAD(&ctx->active_reqs);
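The ring_lock conversion in the two hunks above is what makes the rewritten read path below legal: it copies events to userspace while holding the lock, and copy_to_user() may fault and sleep, which is forbidden under a spinlock but fine under a mutex. A schematic contrast (illustrative only, not code from the patch):

/* Illegal: copy_to_user() may fault and sleep inside a spinlock. */
spin_lock(&info->ring_lock);
copy_to_user(event, ev, sizeof(*ev) * avail);
spin_unlock(&info->ring_lock);

/* Fine: a mutex is a sleeping lock, so faulting under it is allowed.
 * This is also why the copy loop below maps the event page with kmap()
 * rather than kmap_atomic() around the copy. */
mutex_lock(&info->ring_lock);
copy_to_user(event, ev, sizeof(*ev) * avail);
mutex_unlock(&info->ring_lock);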
@@ -748,187 +748,127 @@
 }
 EXPORT_SYMBOL(aio_complete);
 
-/* aio_read_evt
- *	Pull an event off of the ioctx's event ring.  Returns the number of
- *	events fetched (0 or 1 ;-)
- *	FIXME: make this use cmpxchg.
- *	TODO: make the ringbuffer user mmap()able (requires FIXME).
+/* aio_read_events
+ *	Pull an event off of the ioctx's event ring.  Returns the number of
+ *	events fetched
  */
-static int aio_read_evt(struct kioctx *ioctx, struct io_event *ent)
+static long aio_read_events_ring(struct kioctx *ctx,
+				 struct io_event __user *event, long nr)
 {
-	struct aio_ring_info *info = &ioctx->ring_info;
+	struct aio_ring_info *info = &ctx->ring_info;
	struct aio_ring *ring;
-	unsigned long head;
-	int ret = 0;
+	unsigned head, pos;
+	long ret = 0;
+	int copy_ret;
 
+	mutex_lock(&info->ring_lock);
+
	ring = kmap_atomic(info->ring_pages[0]);
-	pr_debug("h%u t%u m%u\n", ring->head, ring->tail, ring->nr);
+	head = ring->head;
+	kunmap_atomic(ring);
 
-	if (ring->head == ring->tail)
+	pr_debug("h%u t%u m%u\n", head, info->tail, info->nr);
+
+	if (head == info->tail)
		goto out;
 
-	spin_lock(&info->ring_lock);
+	while (ret < nr) {
+		long avail;
+		struct io_event *ev;
+		struct page *page;
 
-	head = ring->head % info->nr;
-	if (head != ring->tail) {
-		struct io_event *evp = aio_ring_event(info, head);
-		*ent = *evp;
-		head = (head + 1) % info->nr;
-		smp_mb(); /* finish reading the event before updatng the head */
-		ring->head = head;
-		ret = 1;
-		put_aio_ring_event(evp);
+		avail = (head <= info->tail ? info->tail : info->nr) - head;
+		if (head == info->tail)
+			break;
+
+		avail = min(avail, nr - ret);
+		avail = min_t(long, avail, AIO_EVENTS_PER_PAGE -
+			      ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));
+
+		pos = head + AIO_EVENTS_OFFSET;
+		page = info->ring_pages[pos / AIO_EVENTS_PER_PAGE];
+		pos %= AIO_EVENTS_PER_PAGE;
+
+		ev = kmap(page);
+		copy_ret = copy_to_user(event + ret, ev + pos,
+					sizeof(*ev) * avail);
+		kunmap(page);
+
+		if (unlikely(copy_ret)) {
+			ret = -EFAULT;
+			goto out;
+		}
+
+		ret += avail;
+		head += avail;
+		head %= info->nr;
	}
-	spin_unlock(&info->ring_lock);
 
-out:
+	ring = kmap_atomic(info->ring_pages[0]);
+	ring->head = head;
	kunmap_atomic(ring);
-	pr_debug("%d h%u t%u\n", ret, ring->head, ring->tail);
+
+	pr_debug("%li h%u t%u\n", ret, head, info->tail);
+out:
+	mutex_unlock(&info->ring_lock);
+
	return ret;
 }
 
-struct aio_timeout {
-	struct timer_list	timer;
-	int			timed_out;
-	struct task_struct	*p;
-};
-
-static void timeout_func(unsigned long data)
+static bool aio_read_events(struct kioctx *ctx, long min_nr, long nr,
+			    struct io_event __user *event, long *i)
 {
-	struct aio_timeout *to = (struct aio_timeout *)data;
+	long ret = aio_read_events_ring(ctx, event + *i, nr - *i);
 
-	to->timed_out = 1;
-	wake_up_process(to->p);
-}
+	if (ret > 0)
+		*i += ret;
 
-static inline void init_timeout(struct aio_timeout *to)
-{
-	setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to);
-	to->timed_out = 0;
-	to->p = current;
-}
+	if (unlikely(atomic_read(&ctx->dead)))
+		ret = -EINVAL;
 
-static inline void set_timeout(long start_jiffies, struct aio_timeout *to,
-			       const struct timespec *ts)
-{
-	to->timer.expires = start_jiffies + timespec_to_jiffies(ts);
-	if (time_after(to->timer.expires, jiffies))
-		add_timer(&to->timer);
-	else
-		to->timed_out = 1;
-}
+	if (!*i)
+		*i = ret;
 
-static inline void clear_timeout(struct aio_timeout *to)
-{
-	del_singleshot_timer_sync(&to->timer);
+	return ret < 0 || *i >= min_nr;
 }
 
-static int read_events(struct kioctx *ctx,
-			long min_nr, long nr,
+static long read_events(struct kioctx *ctx, long min_nr, long nr,
			struct io_event __user *event,
			struct timespec __user *timeout)
 {
-	long			start_jiffies = jiffies;
-	struct task_struct	*tsk = current;
-	DECLARE_WAITQUEUE(wait, tsk);
-	int			ret;
-	int			i = 0;
-	struct io_event		ent;
-	struct aio_timeout	to;
+	ktime_t until = { .tv64 = KTIME_MAX };
+	long ret = 0;
 
-	/* needed to zero any padding within an entry (there shouldn't be
-	 * any, but C is fun!
-	 */
-	memset(&ent, 0, sizeof(ent));
-	ret = 0;
-	while (likely(i < nr)) {
-		ret = aio_read_evt(ctx, &ent);
-		if (unlikely(ret <= 0))
-			break;
-
-		pr_debug("%Lx %Lx %Lx %Lx\n",
-			 ent.data, ent.obj, ent.res, ent.res2);
-
-		/* Could we split the check in two? */
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
-		ret = 0;
-
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
-
-	if (min_nr <= i)
-		return i;
-	if (ret)
-		return ret;
-
-	/* End fast path */
-
-	init_timeout(&to);
	if (timeout) {
		struct timespec	ts;
-		ret = -EFAULT;
+
		if (unlikely(copy_from_user(&ts, timeout, sizeof(ts))))
-			goto out;
+			return -EFAULT;
 
-		set_timeout(start_jiffies, &to, &ts);
+		until = timespec_to_ktime(ts);
	}
 
-	while (likely(i < nr)) {
-		add_wait_queue_exclusive(&ctx->wait, &wait);
-		do {
-			set_task_state(tsk, TASK_INTERRUPTIBLE);
-			ret = aio_read_evt(ctx, &ent);
-			if (ret)
-				break;
-			if (min_nr <= i)
-				break;
-			if (unlikely(atomic_read(&ctx->dead))) {
-				ret = -EINVAL;
-				break;
-			}
-			if (to.timed_out)	/* Only check after read evt */
-				break;
-			/* Try to only show up in io wait if there are ops
-			 *  in flight */
-			if (atomic_read(&ctx->reqs_active))
-				io_schedule();
-			else
-				schedule();
-			if (signal_pending(tsk)) {
-				ret = -EINTR;
-				break;
-			}
-			/*ret = aio_read_evt(ctx, &ent);*/
-		} while (1) ;
+	/*
+	 * Note that aio_read_events() is being called as the conditional - i.e.
+	 * we're calling it after prepare_to_wait() has set task state to
+	 * TASK_INTERRUPTIBLE.
+	 *
+	 * But aio_read_events() can block, and if it blocks it's going to flip
+	 * the task state back to TASK_RUNNING.
+	 *
+	 * This should be ok, provided it doesn't flip the state back to
+	 * TASK_RUNNING and return 0 too much - that causes us to spin. That
+	 * will only happen if the mutex_lock() call blocks, and we then find
+	 * the ringbuffer empty. So in practice we should be ok, but it's
+	 * something to be aware of when touching this code.
+	 */
+	wait_event_interruptible_hrtimeout(ctx->wait,
+			aio_read_events(ctx, min_nr, nr, event, &ret), until);
 
-		set_task_state(tsk, TASK_RUNNING);
-		remove_wait_queue(&ctx->wait, &wait);
+	if (!ret && signal_pending(current))
+		ret = -EINTR;
 
-		if (unlikely(ret <= 0))
-			break;
-
-		ret = -EFAULT;
-		if (unlikely(copy_to_user(event, &ent, sizeof(ent)))) {
-			pr_debug("lost an event due to EFAULT.\n");
-			break;
-		}
-
-		/* Good, event copied to userland, update counts. */
-		event ++;
-		i ++;
-	}
-
-	if (timeout)
-		clear_timeout(&to);
-out:
-	destroy_timer_on_stack(&to.timer);
-	return i ? i : ret;
+	return ret;
 }
 
 /* sys_io_setup:
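The per-iteration clamp in aio_read_events_ring() bounds each copy three ways: by the contiguous events up to the tail (or the end of the ring, if the data wraps), by how many events the caller still wants, and by the page boundary, since each copy_to_user() works on one kmap()ed page. A standalone sketch of the same arithmetic; the constants mirror fs/aio.c assuming 4 KiB pages and a 32-byte struct io_event, while the head/tail/nr values are made up for illustration:

/* Standalone model of the copy-size clamp in aio_read_events_ring(). */
#include <stdio.h>

#define PAGE_SIZE		4096
#define IO_EVENT_SIZE		32	/* sizeof(struct io_event) */
#define AIO_RING_HEADER		32	/* sizeof(struct aio_ring) */
#define AIO_EVENTS_PER_PAGE	(PAGE_SIZE / IO_EVENT_SIZE)			/* 128 */
#define AIO_EVENTS_FIRST_PAGE	((PAGE_SIZE - AIO_RING_HEADER) / IO_EVENT_SIZE)	/* 127 */
#define AIO_EVENTS_OFFSET	(AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)	/* 1 */

static long minl(long a, long b) { return a < b ? a : b; }

int main(void)
{
	/* hypothetical ring state: 4 ring pages hold 127 + 3*128 = 511 events */
	unsigned head = 120, tail = 300, nr = 511;
	long want = 64, ret = 0;

	while (ret < want) {
		long avail;
		unsigned pos;

		/* contiguous events up to the tail, or to the end of the ring */
		avail = (head <= tail ? tail : nr) - head;
		if (head == tail)
			break;

		avail = minl(avail, want - ret);
		/* ...and never cross a page boundary in a single copy */
		avail = minl(avail, AIO_EVENTS_PER_PAGE -
			     ((head + AIO_EVENTS_OFFSET) % AIO_EVENTS_PER_PAGE));

		/* the +1 offset skips the struct aio_ring header in page 0 */
		pos = head + AIO_EVENTS_OFFSET;
		printf("copy %ld events from page %u, slot %u\n",
		       avail, pos / AIO_EVENTS_PER_PAGE, pos % AIO_EVENTS_PER_PAGE);

		ret += avail;
		head = (head + avail) % nr;
	}
	printf("copied %ld events, head now %u\n", ret, head);
	return 0;
}

With these inputs the first pass copies only 7 events (it hits the end of page 0 at slot 127), and the second copies the remaining 57 starting at slot 0 of page 1, which is exactly the batching behavior the old one-event-at-a-time aio_read_evt() lacked.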
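The long block comment ahead of the wait_event_interruptible_hrtimeout() call refers to that macro's internal loop. The macro comes from the parent commit (774a08b354, "wait: add wait_event_hrtimeout()"); the following is a simplified sketch of its expansion, not the verbatim body from include/linux/wait.h:

/*
 * Approximate expansion of wait_event_interruptible_hrtimeout(wq,
 * condition, timeout) - simplified for illustration.
 */
int __ret = 0;
DEFINE_WAIT(__wait);
struct hrtimer_sleeper __t;

hrtimer_init_on_stack(&__t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer_init_sleeper(&__t, current);
if (timeout.tv64 != KTIME_MAX)
	hrtimer_start_range_ns(&__t.timer, timeout,
			       current->timer_slack_ns, HRTIMER_MODE_REL);

for (;;) {
	/*
	 * Task state becomes TASK_INTERRUPTIBLE *before* the condition is
	 * evaluated. This is the window the comment in read_events() is
	 * about: aio_read_events() may block in mutex_lock() and flip the
	 * state back to TASK_RUNNING before returning 0.
	 */
	prepare_to_wait(&wq, &__wait, TASK_INTERRUPTIBLE);
	if (condition)				/* aio_read_events(...) */
		break;
	if (signal_pending(current)) {
		__ret = -ERESTARTSYS;
		break;
	}
	if (!__t.task) {			/* sleeper fired: timed out */
		__ret = -ETIME;
		break;
	}
	schedule();
}

hrtimer_cancel(&__t.timer);
destroy_hrtimer_on_stack(&__t.timer);
finish_wait(&wq, &__wait);

read_events() ignores the macro's return value: aio_read_events() accumulates the running count in ret via its long *i argument, and the -EINTR case is rechecked explicitly with signal_pending(current) afterwards.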
mentioned in commit aa011e
mentioned in commit edfbbf