Commit cfb1e33eed48165763edc7a4a067cf5f74898d0b

Authored by Jeff Moyer
Committed by Jens Axboe
1 parent 1af60fbd75

aio: implement request batching

Hi,

Some workloads issue batches of small I/O, and the performance is poor
due to the call to blk_run_address_space for every single iocb.  Nathan
Roberts pointed this out, and suggested that by deferring this call
until all I/Os in the iocb array are submitted to the block layer, we
can realize some impressive performance gains (up to 30% for sequential
4k reads in batches of 16).

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>

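The mechanics of the batching are easiest to see outside the kernel. Below is a minimal userspace sketch in C of the same pattern, not the kernel code itself: while a batch is being submitted, each distinct mapping is remembered at most once in a small hash, and only after the whole batch has been pushed down is each mapping kicked exactly once. The names flush_mapping(), batch_add(), batch_flush_and_free(), hash_bucket() and the fake file_a/file_b "mappings" are invented for illustration; in the patch the kick is blk_run_address_space() and the hash entries come from a mempool.

/*
 * Minimal userspace sketch of the batching idea (not the kernel code):
 * remember each distinct mapping seen during a batch, then flush each
 * one exactly once when the batch is complete.
 */
#include <stdio.h>
#include <stdlib.h>

#define BATCH_HASH_BITS  3
#define BATCH_HASH_SIZE  (1 << BATCH_HASH_BITS)

struct batch_entry {
        const void *mapping;
        struct batch_entry *next;
};

static struct batch_entry *batch_hash[BATCH_HASH_SIZE];

/* crude pointer hash; the kernel uses hash_ptr() from <linux/hash.h> */
static unsigned hash_bucket(const void *p)
{
        return ((unsigned long)p >> 4) & (BATCH_HASH_SIZE - 1);
}

/* stand-in for blk_run_address_space(): kick the queue behind one mapping */
static void flush_mapping(const void *mapping)
{
        printf("flushing mapping %p\n", mapping);
}

/* remember a mapping at most once per batch */
static void batch_add(const void *mapping)
{
        unsigned b = hash_bucket(mapping);
        struct batch_entry *e;

        for (e = batch_hash[b]; e; e = e->next)
                if (e->mapping == mapping)
                        return;

        /* the kernel version draws entries from a mempool instead */
        e = malloc(sizeof(*e));
        e->mapping = mapping;
        e->next = batch_hash[b];
        batch_hash[b] = e;
}

/* once the whole batch is submitted, flush each distinct mapping once */
static void batch_flush_and_free(void)
{
        int i;

        for (i = 0; i < BATCH_HASH_SIZE; i++) {
                while (batch_hash[i]) {
                        struct batch_entry *e = batch_hash[i];

                        batch_hash[i] = e->next;
                        flush_mapping(e->mapping);
                        free(e);
                }
        }
}

int main(void)
{
        int file_a, file_b;     /* addresses double as fake "mappings" */
        int i;

        /* a batch of 16 small I/Os spread over two files */
        for (i = 0; i < 16; i++)
                batch_add(i & 1 ? &file_a : &file_b);
        batch_flush_and_free();
        return 0;
}

With 16 iocbs spread over two files, the sketch prints two flush lines instead of sixteen, which is the saving the patch is after.
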
Showing 2 changed files with 63 additions and 6 deletions

fs/aio.c:

@@ -32,6 +32,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -60,6 +63,14 @@
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS     3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE     (1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+        struct hlist_node list;
+        struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);
 
@@ -73,6 +84,8 @@
         kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
         aio_wq = create_workqueue("aio");
+        abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+        BUG_ON(!abe_pool);
 
         pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 
@@ -1531,8 +1544,44 @@
         return 1;
 }
 
+static void aio_batch_add(struct address_space *mapping,
+                          struct hlist_head *batch_hash)
+{
+        struct aio_batch_entry *abe;
+        struct hlist_node *pos;
+        unsigned bucket;
+
+        bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+        hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+                if (abe->mapping == mapping)
+                        return;
+        }
+
+        abe = mempool_alloc(abe_pool, GFP_KERNEL);
+        BUG_ON(!igrab(mapping->host));
+        abe->mapping = mapping;
+        hlist_add_head(&abe->list, &batch_hash[bucket]);
+        return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+        struct aio_batch_entry *abe;
+        struct hlist_node *pos, *n;
+        int i;
+
+        for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+                hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+                        blk_run_address_space(abe->mapping);
+                        iput(abe->mapping->host);
+                        hlist_del(&abe->list);
+                        mempool_free(abe, abe_pool);
+                }
+        }
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-                         struct iocb *iocb)
+                         struct iocb *iocb, struct hlist_head *batch_hash)
 {
         struct kiocb *req;
         struct file *file;
@@ -1608,6 +1657,12 @@
                         ;
         }
         spin_unlock_irq(&ctx->ctx_lock);
+        if (req->ki_opcode == IOCB_CMD_PREAD ||
+            req->ki_opcode == IOCB_CMD_PREADV ||
+            req->ki_opcode == IOCB_CMD_PWRITE ||
+            req->ki_opcode == IOCB_CMD_PWRITEV)
+                aio_batch_add(file->f_mapping, batch_hash);
+
         aio_put_req(req);       /* drop extra ref to req */
         return 0;
 
@@ -1635,6 +1690,7 @@
         struct kioctx *ctx;
         long ret = 0;
         int i;
+        struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
         if (unlikely(nr < 0))
                 return -EINVAL;
@@ -1666,10 +1722,11 @@
                         break;
                 }
 
-                ret = io_submit_one(ctx, user_iocb, &tmp);
+                ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
                 if (ret)
                         break;
         }
+        aio_batch_free(batch_hash);
 
         put_ioctx(ctx);
         return i ? i : ret;
fs/direct-io.c:

@@ -1028,9 +1028,6 @@
         if (dio->bio)
                 dio_bio_submit(dio);
 
-        /* All IO is now issued, send it on its way */
-        blk_run_address_space(inode->i_mapping);
-
         /*
          * It is possible that, we return short IO due to end of file.
          * In that case, we need to release all the pages we got hold on.
@@ -1057,8 +1054,11 @@
             ((rw & READ) || (dio->result == dio->size)))
                 ret = -EIOCBQUEUED;
 
-        if (ret != -EIOCBQUEUED)
+        if (ret != -EIOCBQUEUED) {
+                /* All IO is now issued, send it on its way */
+                blk_run_address_space(inode->i_mapping);
                 dio_await_completion(dio);
+        }
 
         /*
          * Sync will always be dropping the final ref and completing the