Commit 0460fef2a9215680f7f85415b57731b7e0fdf673

Authored by Kent Overstreet
Committed by Linus Torvalds
1 parent 21b40200cf

aio: use cancellation list lazily

Cancelling kiocbs requires adding them to a per-kioctx linked list,
which is one of the few things we need to take the kioctx lock for in
the fast path.  But most kiocbs can't be cancelled - so if we just do
this lazily, we can avoid quite a bit of locking overhead.
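
As a concrete sketch of the driver-facing side of this (mirroring the
drivers/usb/gadget/inode.c hunk below): a driver that wants its kiocbs
to be cancellable now registers the callback through
kiocb_set_cancel_fn(), and only that call puts the kiocb on the
per-kioctx list.  my_cancel() and my_submit() are hypothetical names
used for illustration, not part of this commit:

    /* hypothetical driver code -- illustration only */
    static int my_cancel(struct kiocb *iocb, struct io_event *ev)
    {
            /* kiocb_cancel() has already zeroed *ev and filled in
             * ev->obj and ev->data; just tear down the request */
            return 0;
    }

    static ssize_t my_submit(struct kiocb *iocb)
    {
            /*
             * This is the only point where the kiocb is added to
             * ctx->active_reqs (under ctx_lock); kiocbs that never
             * call this stay off the list entirely.
             */
            kiocb_set_cancel_fn(iocb, my_cancel);
            /* ... queue the actual I/O ... */
            return -EIOCBQUEUED;
    }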

While we're at it, instead of using a flag bit, switch to using ki_cancel
itself to indicate that a kiocb has been cancelled/completed.  This lets
us get rid of ki_flags entirely.
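
The resulting ki_cancel lifecycle is a three-state machine: NULL (not
cancellable) -> a registered cancel function (cancellable) ->
KIOCB_CANCELLED (cancelled or completed).  Here is a minimal standalone
C11/stdatomic model of the two racing transitions, assuming only what
the diff below shows (kiocb_cancel() claims the callback with cmpxchg(),
aio_complete() marks completion with xchg()); it is an illustration of
the synchronization scheme, not kernel code:

    #include <stdatomic.h>

    #define KIOCB_CANCELLED ((void *)(~0UL))

    /* stand-in for kiocb->ki_cancel: NULL -> callback -> KIOCB_CANCELLED
     * (in the kernel the intermediate value is a function pointer) */
    static _Atomic(void *) ki_cancel;

    /*
     * Cancel side (models kiocb_cancel()): only swap in KIOCB_CANCELLED
     * if a real callback was registered; whoever wins the cmpxchg owns
     * the callback and is the one who runs it.
     */
    static void *claim_cancel(void)
    {
            void *cur = atomic_load(&ki_cancel);

            do {
                    if (!cur || cur == KIOCB_CANCELLED)
                            return NULL;    /* not cancellable, or already done */
            } while (!atomic_compare_exchange_weak(&ki_cancel, &cur,
                                                   KIOCB_CANCELLED));
            return cur;
    }

    /*
     * Completion side (models aio_complete()): unconditionally mark the
     * kiocb done; if it was already KIOCB_CANCELLED, the canceller
     * delivered the completion event to userspace and a second event
     * must not be queued.
     */
    static int event_already_delivered(void)
    {
            return atomic_exchange(&ki_cancel, KIOCB_CANCELLED)
                    == KIOCB_CANCELLED;
    }

Each transition can happen at most once per kiocb, which is why the
completion path no longer needs ctx_lock for this check.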

[akpm@linux-foundation.org: remove buggy BUG()]
Signed-off-by: Kent Overstreet <koverstreet@google.com>
Cc: Zach Brown <zab@redhat.com>
Cc: Felipe Balbi <balbi@ti.com>
Cc: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Cc: Mark Fasheh <mfasheh@suse.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Asai Thambi S P <asamymuthupa@micron.com>
Cc: Selvan Mani <smani@micron.com>
Cc: Sam Bradshaw <sbradshaw@micron.com>
Cc: Jeff Moyer <jmoyer@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Benjamin LaHaise <bcrl@kvack.org>
Reviewed-by: "Theodore Ts'o" <tytso@mit.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

Showing 3 changed files with 81 additions and 55 deletions

drivers/usb/gadget/inode.c
... ... @@ -533,7 +533,6 @@
533 533 local_irq_disable();
534 534 epdata = priv->epdata;
535 535 // spin_lock(&epdata->dev->lock);
536   - kiocbSetCancelled(iocb);
537 536 if (likely(epdata && epdata->ep && priv->req))
538 537 value = usb_ep_dequeue (epdata->ep, priv->req);
539 538 else
... ... @@ -663,7 +662,7 @@
663 662 goto fail;
664 663 }
665 664  
666   - iocb->ki_cancel = ep_aio_cancel;
  665 + kiocb_set_cancel_fn(iocb, ep_aio_cancel);
667 666 get_ep(epdata);
668 667 priv->epdata = epdata;
669 668 priv->actual = 0;

fs/aio.c
... ... @@ -97,6 +97,8 @@
97 97  
98 98 struct aio_ring_info ring_info;
99 99  
  100 + spinlock_t completion_lock;
  101 +
100 102 struct rcu_head rcu_head;
101 103 struct work_struct rcu_work;
102 104 };
... ... @@ -220,26 +222,52 @@
220 222 #define AIO_EVENTS_FIRST_PAGE ((PAGE_SIZE - sizeof(struct aio_ring)) / sizeof(struct io_event))
221 223 #define AIO_EVENTS_OFFSET (AIO_EVENTS_PER_PAGE - AIO_EVENTS_FIRST_PAGE)
222 224  
  225 +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
  226 +{
  227 + struct kioctx *ctx = req->ki_ctx;
  228 + unsigned long flags;
  229 +
  230 + spin_lock_irqsave(&ctx->ctx_lock, flags);
  231 +
  232 + if (!req->ki_list.next)
  233 + list_add(&req->ki_list, &ctx->active_reqs);
  234 +
  235 + req->ki_cancel = cancel;
  236 +
  237 + spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  238 +}
  239 +EXPORT_SYMBOL(kiocb_set_cancel_fn);
  240 +
223 241 static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
224 242 struct io_event *res)
225 243 {
226   - int (*cancel)(struct kiocb *, struct io_event *);
  244 + kiocb_cancel_fn *old, *cancel;
227 245 int ret = -EINVAL;
228 246  
229   - cancel = kiocb->ki_cancel;
230   - kiocbSetCancelled(kiocb);
231   - if (cancel) {
232   - atomic_inc(&kiocb->ki_users);
233   - spin_unlock_irq(&ctx->ctx_lock);
  247 + /*
  248 + * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
  249 + * actually has a cancel function, hence the cmpxchg()
  250 + */
234 251  
235   - memset(res, 0, sizeof(*res));
236   - res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
237   - res->data = kiocb->ki_user_data;
238   - ret = cancel(kiocb, res);
  252 + cancel = ACCESS_ONCE(kiocb->ki_cancel);
  253 + do {
  254 + if (!cancel || cancel == KIOCB_CANCELLED)
  255 + return ret;
239 256  
240   - spin_lock_irq(&ctx->ctx_lock);
241   - }
  257 + old = cancel;
  258 + cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
  259 + } while (cancel != old);
242 260  
  261 + atomic_inc(&kiocb->ki_users);
  262 + spin_unlock_irq(&ctx->ctx_lock);
  263 +
  264 + memset(res, 0, sizeof(*res));
  265 + res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
  266 + res->data = kiocb->ki_user_data;
  267 + ret = cancel(kiocb, res);
  268 +
  269 + spin_lock_irq(&ctx->ctx_lock);
  270 +
243 271 return ret;
244 272 }
245 273  
... ... @@ -326,6 +354,7 @@
326 354 atomic_set(&ctx->users, 2);
327 355 atomic_set(&ctx->dead, 0);
328 356 spin_lock_init(&ctx->ctx_lock);
  357 + spin_lock_init(&ctx->completion_lock);
329 358 mutex_init(&ctx->ring_info.ring_lock);
330 359 init_waitqueue_head(&ctx->wait);
331 360  
... ... @@ -468,20 +497,12 @@
468 497 {
469 498 struct kiocb *req = NULL;
470 499  
471   - req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL);
  500 + req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
472 501 if (unlikely(!req))
473 502 return NULL;
474 503  
475   - req->ki_flags = 0;
476 504 atomic_set(&req->ki_users, 2);
477   - req->ki_key = 0;
478 505 req->ki_ctx = ctx;
479   - req->ki_cancel = NULL;
480   - req->ki_retry = NULL;
481   - req->ki_dtor = NULL;
482   - req->private = NULL;
483   - req->ki_iovec = NULL;
484   - req->ki_eventfd = NULL;
485 506  
486 507 return req;
487 508 }
... ... @@ -512,7 +533,6 @@
512 533 spin_lock_irq(&ctx->ctx_lock);
513 534 list_for_each_entry_safe(req, n, &batch->head, ki_batch) {
514 535 list_del(&req->ki_batch);
515   - list_del(&req->ki_list);
516 536 kmem_cache_free(kiocb_cachep, req);
517 537 atomic_dec(&ctx->reqs_active);
518 538 }
... ... @@ -559,10 +579,7 @@
559 579 }
560 580  
561 581 batch->count -= allocated;
562   - list_for_each_entry(req, &batch->head, ki_batch) {
563   - list_add(&req->ki_list, &ctx->active_reqs);
564   - atomic_inc(&ctx->reqs_active);
565   - }
  582 + atomic_add(allocated, &ctx->reqs_active);
566 583  
567 584 kunmap_atomic(ring);
568 585 spin_unlock_irq(&ctx->ctx_lock);
... ... @@ -653,25 +670,34 @@
653 670 info = &ctx->ring_info;
654 671  
655 672 /*
656   - * Add a completion event to the ring buffer. Must be done holding
657   - * ctx->ctx_lock to prevent other code from messing with the tail
658   - * pointer since we might be called from irq context.
659   - *
660 673 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
661 674 * need to issue a wakeup after decrementing reqs_active.
662 675 */
663 676 rcu_read_lock();
664   - spin_lock_irqsave(&ctx->ctx_lock, flags);
665 677  
666   - list_del(&iocb->ki_list); /* remove from active_reqs */
  678 + if (iocb->ki_list.next) {
  679 + unsigned long flags;
667 680  
  681 + spin_lock_irqsave(&ctx->ctx_lock, flags);
  682 + list_del(&iocb->ki_list);
  683 + spin_unlock_irqrestore(&ctx->ctx_lock, flags);
  684 + }
  685 +
668 686 /*
669 687 * cancelled requests don't get events, userland was given one
670 688 * when the event got cancelled.
671 689 */
672   - if (kiocbIsCancelled(iocb))
  690 + if (unlikely(xchg(&iocb->ki_cancel,
  691 + KIOCB_CANCELLED) == KIOCB_CANCELLED))
673 692 goto put_rq;
674 693  
  694 + /*
  695 + * Add a completion event to the ring buffer. Must be done holding
  696 + * ctx->ctx_lock to prevent other code from messing with the tail
  697 + * pointer since we might be called from irq context.
  698 + */
  699 + spin_lock_irqsave(&ctx->completion_lock, flags);
  700 +
675 701 tail = info->tail;
676 702 pos = tail + AIO_EVENTS_OFFSET;
677 703  
... ... @@ -705,6 +731,8 @@
705 731 kunmap_atomic(ring);
706 732 flush_dcache_page(info->ring_pages[0]);
707 733  
  734 + spin_unlock_irqrestore(&ctx->completion_lock, flags);
  735 +
708 736 pr_debug("added to ring %p at [%u]\n", iocb, tail);
709 737  
710 738 /*
... ... @@ -731,7 +759,6 @@
731 759 if (waitqueue_active(&ctx->wait))
732 760 wake_up(&ctx->wait);
733 761  
734   - spin_unlock_irqrestore(&ctx->ctx_lock, flags);
735 762 rcu_read_unlock();
736 763 }
737 764 EXPORT_SYMBOL(aio_complete);
... ... @@ -1216,15 +1243,10 @@
1216 1243 req->ki_opcode = iocb->aio_lio_opcode;
1217 1244  
1218 1245 ret = aio_setup_iocb(req, compat);
1219   -
1220 1246 if (ret)
1221 1247 goto out_put_req;
1222 1248  
1223   - if (unlikely(kiocbIsCancelled(req)))
1224   - ret = -EINTR;
1225   - else
1226   - ret = req->ki_retry(req);
1227   -
  1249 + ret = req->ki_retry(req);
1228 1250 if (ret != -EIOCBQUEUED) {
1229 1251 /*
1230 1252 * There's no easy way to restart the syscall since other AIO's
... ... @@ -1241,10 +1263,6 @@
1241 1263 return 0;
1242 1264  
1243 1265 out_put_req:
1244   - spin_lock_irq(&ctx->ctx_lock);
1245   - list_del(&req->ki_list);
1246   - spin_unlock_irq(&ctx->ctx_lock);
1247   -
1248 1266 atomic_dec(&ctx->reqs_active);
1249 1267 aio_put_req(req); /* drop extra ref to req */
1250 1268 aio_put_req(req); /* drop i/o ref to req */

include/linux/aio.h
... ... @@ -10,18 +10,25 @@
10 10 #include <linux/atomic.h>
11 11  
12 12 struct kioctx;
  13 +struct kiocb;
13 14  
14 15 #define KIOCB_SYNC_KEY (~0U)
15 16  
16   -/* ki_flags bits */
17   -#define KIF_CANCELLED 2
  17 +/*
  18 + * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
  19 + * cancelled or completed (this makes a certain amount of sense because
  20 + * successful cancellation - io_cancel() - does deliver the completion to
  21 + * userspace).
  22 + *
  23 + * And since most things don't implement kiocb cancellation and we'd really like
  24 + * kiocb completion to be lockless when possible, we use ki_cancel to
  25 + * synchronize cancellation and completion - we only set it to KIOCB_CANCELLED
  26 + * with xchg() or cmpxchg(), see batch_complete_aio() and kiocb_cancel().
  27 + */
  28 +#define KIOCB_CANCELLED ((void *) (~0ULL))
18 29  
19   -#define kiocbSetCancelled(iocb) set_bit(KIF_CANCELLED, &(iocb)->ki_flags)
  30 +typedef int (kiocb_cancel_fn)(struct kiocb *, struct io_event *);
20 31  
21   -#define kiocbClearCancelled(iocb) clear_bit(KIF_CANCELLED, &(iocb)->ki_flags)
22   -
23   -#define kiocbIsCancelled(iocb) test_bit(KIF_CANCELLED, &(iocb)->ki_flags)
24   -
25 32 /* is there a better place to document function pointer methods? */
26 33 /**
27 34 * ki_retry - iocb forward progress callback
... ... @@ -48,13 +55,12 @@
48 55 * calls may result in undefined behaviour.
49 56 */
50 57 struct kiocb {
51   - unsigned long ki_flags;
52 58 atomic_t ki_users;
53 59 unsigned ki_key; /* id of this request */
54 60  
55 61 struct file *ki_filp;
56 62 struct kioctx *ki_ctx; /* may be NULL for sync ops */
57   - int (*ki_cancel)(struct kiocb *, struct io_event *);
  63 + kiocb_cancel_fn *ki_cancel;
58 64 ssize_t (*ki_retry)(struct kiocb *);
59 65 void (*ki_dtor)(struct kiocb *);
60 66  
... ... @@ -112,6 +118,7 @@
112 118 extern void exit_aio(struct mm_struct *mm);
113 119 extern long do_io_submit(aio_context_t ctx_id, long nr,
114 120 struct iocb __user *__user *iocbpp, bool compat);
  121 +void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
115 122 #else
116 123 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
117 124 static inline void aio_put_req(struct kiocb *iocb) { }
... ... @@ -121,6 +128,8 @@
121 128 static inline long do_io_submit(aio_context_t ctx_id, long nr,
122 129 struct iocb __user * __user *iocbpp,
123 130 bool compat) { return 0; }
  131 +static inline void kiocb_set_cancel_fn(struct kiocb *req,
  132 + kiocb_cancel_fn *cancel) { }
124 133 #endif /* CONFIG_AIO */
125 134  
126 135 static inline struct kiocb *list_kiocb(struct list_head *h)