Commit cfb1e33eed48165763edc7a4a067cf5f74898d0b
Committed by Jens Axboe
1 parent 1af60fbd75
Exists in master and in 20 other branches
aio: implement request batching
Hi,

Some workloads issue batches of small I/O, and the performance is poor
due to the call to blk_run_address_space for every single iocb. Nathan
Roberts pointed this out, and suggested that by deferring this call
until all I/Os in the iocb array are submitted to the block layer, we
can realize some impressive performance gains (up to 30% for sequential
4k reads in batches of 16).

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
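For context on the workload being optimized, here is a minimal userspace sketch of the pattern the commit message describes: a batch of sixteen 4k O_DIRECT reads handed to the kernel in a single io_submit() call, so that with this patch the block device queue is run once for the whole iocb array instead of once per iocb. It assumes the libaio wrappers (io_setup, io_prep_pread, io_submit, io_getevents); the device path is illustrative and error handling is omitted.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <libaio.h>
#include <fcntl.h>
#include <stdlib.h>

#define BATCH	16
#define BLKSZ	4096

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cbs[BATCH], *cbp[BATCH];
	struct io_event events[BATCH];
	void *buf;
	int i;
	int fd = open("/dev/sdb", O_RDONLY | O_DIRECT);	/* illustrative device */

	io_setup(BATCH, &ctx);
	for (i = 0; i < BATCH; i++) {
		posix_memalign(&buf, BLKSZ, BLKSZ);	/* O_DIRECT wants aligned buffers */
		io_prep_pread(&cbs[i], fd, buf, BLKSZ, (long long)i * BLKSZ);
		cbp[i] = &cbs[i];
	}

	/*
	 * One syscall for all 16 iocbs: io_submit_one() still runs per iocb,
	 * but the queue unplug is now deferred to aio_batch_free() afterwards.
	 */
	io_submit(ctx, BATCH, cbp);
	io_getevents(ctx, BATCH, BATCH, events, NULL);
	io_destroy(ctx);
	return 0;
}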
Showing 2 changed files with 63 additions and 6 deletions
fs/aio.c

@@ -32,6 +32,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>

@@ -60,6 +63,14 @@
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);

@@ -73,6 +84,8 @@
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 

@@ -1531,8 +1544,44 @@
 	return 1;
 }
 
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
+
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb)
+			 struct iocb *iocb, struct hlist_head *batch_hash)
 {
 	struct kiocb *req;
 	struct file *file;

@@ -1608,6 +1657,12 @@
 		;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
+	if (req->ki_opcode == IOCB_CMD_PREAD ||
+	    req->ki_opcode == IOCB_CMD_PREADV ||
+	    req->ki_opcode == IOCB_CMD_PWRITE ||
+	    req->ki_opcode == IOCB_CMD_PWRITEV)
+		aio_batch_add(file->f_mapping, batch_hash);
+
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
 

@@ -1635,6 +1690,7 @@
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
 	if (unlikely(nr < 0))
 		return -EINVAL;

@@ -1666,10 +1722,11 @@
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
 		if (ret)
 			break;
 	}
+	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
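The heart of the fs/aio.c change is the small on-stack hash that records each distinct address_space touched by the batch, so aio_batch_free() unplugs every backing queue exactly once. Below is a hypothetical, self-contained userspace model of that dedup-then-flush pattern; entry, batch_add, batch_free, and bucket_of are stand-ins invented here for the kernel's hlist, hash_ptr, and mempool machinery.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HASH_BITS 3			/* 8 buckets, as in the patch */
#define HASH_SIZE (1 << HASH_BITS)

struct entry {
	struct entry *next;
	void *key;
};

static unsigned bucket_of(void *p)	/* crude stand-in for hash_ptr() */
{
	return ((uintptr_t)p >> 4) & (HASH_SIZE - 1);
}

/* Record @key once per batch; duplicates are silently ignored. */
static void batch_add(void *key, struct entry **hash)
{
	unsigned b = bucket_of(key);
	struct entry *e;

	for (e = hash[b]; e; e = e->next)
		if (e->key == key)
			return;

	e = malloc(sizeof(*e));
	e->key = key;
	e->next = hash[b];
	hash[b] = e;
}

/* Visit each recorded key exactly once; flush() models blk_run_address_space(). */
static void batch_free(struct entry **hash, void (*flush)(void *))
{
	int i;

	for (i = 0; i < HASH_SIZE; i++) {
		while (hash[i]) {
			struct entry *e = hash[i];

			hash[i] = e->next;
			flush(e->key);
			free(e);
		}
	}
}

static void print_key(void *key)
{
	printf("unplug %p\n", key);
}

int main(void)
{
	struct entry *hash[HASH_SIZE] = { 0 };
	int a, b;

	batch_add(&a, hash);
	batch_add(&b, hash);
	batch_add(&a, hash);		/* duplicate mapping in the same batch: ignored */
	batch_free(hash, print_key);	/* prints exactly two lines */
	return 0;
}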
fs/direct-io.c

@@ -1028,9 +1028,6 @@
 	if (dio->bio)
 		dio_bio_submit(dio);
 
-	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
-
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.

@@ -1057,8 +1054,11 @@
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED)
+	if (ret != -EIOCBQUEUED) {
+		/* All IO is now issued, send it on its way */
+		blk_run_address_space(inode->i_mapping);
 		dio_await_completion(dio);
+	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the