Commit cfb1e33eed48165763edc7a4a067cf5f74898d0b
Committed by Jens Axboe
1 parent 1af60fbd75
Exists in master and in 20 other branches
aio: implement request batching
Hi,

Some workloads issue batches of small I/O, and the performance is poor
due to the call to blk_run_address_space for every single iocb. Nathan
Roberts pointed this out, and suggested that by deferring this call
until all I/Os in the iocb array are submitted to the block layer, we
can realize some impressive performance gains (up to 30% for sequential
4k reads in batches of 16).

Signed-off-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
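For context on the workload being optimized, here is a minimal userspace sketch of the pattern the commit message describes: a batch of sixteen 4k O_DIRECT reads handed to the kernel in a single io_submit() call, so that with this patch the block device queue is run once for the whole iocb array instead of once per iocb. It assumes the libaio wrappers (io_setup, io_prep_pread, io_submit, io_getevents); the device path is illustrative and error handling is omitted.

#define _GNU_SOURCE		/* for O_DIRECT */
#include <libaio.h>
#include <fcntl.h>
#include <stdlib.h>

#define BATCH	16
#define BLKSZ	4096

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cbs[BATCH], *cbp[BATCH];
	struct io_event events[BATCH];
	void *buf;
	int i;
	int fd = open("/dev/sdb", O_RDONLY | O_DIRECT);	/* illustrative device */

	io_setup(BATCH, &ctx);
	for (i = 0; i < BATCH; i++) {
		posix_memalign(&buf, BLKSZ, BLKSZ);	/* O_DIRECT wants aligned buffers */
		io_prep_pread(&cbs[i], fd, buf, BLKSZ, (long long)i * BLKSZ);
		cbp[i] = &cbs[i];
	}

	/*
	 * One syscall for all 16 iocbs: io_submit_one() still runs per iocb,
	 * but the queue unplug is now deferred to aio_batch_free() afterwards.
	 */
	io_submit(ctx, BATCH, cbp);
	io_getevents(ctx, BATCH, BATCH, events, NULL);
	io_destroy(ctx);
	return 0;
}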
Showing 2 changed files with 63 additions and 6 deletions
fs/aio.c

@@ -32,6 +32,9 @@
 #include <linux/workqueue.h>
 #include <linux/security.h>
 #include <linux/eventfd.h>
+#include <linux/blkdev.h>
+#include <linux/mempool.h>
+#include <linux/hash.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>

@@ -60,6 +63,14 @@
 static DEFINE_SPINLOCK(fput_lock);
 static LIST_HEAD(fput_head);
 
+#define AIO_BATCH_HASH_BITS	3 /* allocated on-stack, so don't go crazy */
+#define AIO_BATCH_HASH_SIZE	(1 << AIO_BATCH_HASH_BITS)
+struct aio_batch_entry {
+	struct hlist_node list;
+	struct address_space *mapping;
+};
+mempool_t *abe_pool;
+
 static void aio_kick_handler(struct work_struct *);
 static void aio_queue_work(struct kioctx *);

@@ -73,6 +84,8 @@
 	kioctx_cachep = KMEM_CACHE(kioctx,SLAB_HWCACHE_ALIGN|SLAB_PANIC);
 
 	aio_wq = create_workqueue("aio");
+	abe_pool = mempool_create_kmalloc_pool(1, sizeof(struct aio_batch_entry));
+	BUG_ON(!abe_pool);
 
 	pr_debug("aio_setup: sizeof(struct page) = %d\n", (int)sizeof(struct page));
 

@@ -1531,8 +1544,44 @@
 	return 1;
 }
 
+static void aio_batch_add(struct address_space *mapping,
+			  struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos;
+	unsigned bucket;
+
+	bucket = hash_ptr(mapping, AIO_BATCH_HASH_BITS);
+	hlist_for_each_entry(abe, pos, &batch_hash[bucket], list) {
+		if (abe->mapping == mapping)
+			return;
+	}
+
+	abe = mempool_alloc(abe_pool, GFP_KERNEL);
+	BUG_ON(!igrab(mapping->host));
+	abe->mapping = mapping;
+	hlist_add_head(&abe->list, &batch_hash[bucket]);
+	return;
+}
+
+static void aio_batch_free(struct hlist_head *batch_hash)
+{
+	struct aio_batch_entry *abe;
+	struct hlist_node *pos, *n;
+	int i;
+
+	for (i = 0; i < AIO_BATCH_HASH_SIZE; i++) {
+		hlist_for_each_entry_safe(abe, pos, n, &batch_hash[i], list) {
+			blk_run_address_space(abe->mapping);
+			iput(abe->mapping->host);
+			hlist_del(&abe->list);
+			mempool_free(abe, abe_pool);
+		}
+	}
+}
+
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
-			 struct iocb *iocb)
+			 struct iocb *iocb, struct hlist_head *batch_hash)
 {
 	struct kiocb *req;
 	struct file *file;

@@ -1608,6 +1657,12 @@
 		;
 	}
 	spin_unlock_irq(&ctx->ctx_lock);
+	if (req->ki_opcode == IOCB_CMD_PREAD ||
+	    req->ki_opcode == IOCB_CMD_PREADV ||
+	    req->ki_opcode == IOCB_CMD_PWRITE ||
+	    req->ki_opcode == IOCB_CMD_PWRITEV)
+		aio_batch_add(file->f_mapping, batch_hash);
+
 	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
 

@@ -1635,6 +1690,7 @@
 	struct kioctx *ctx;
 	long ret = 0;
 	int i;
+	struct hlist_head batch_hash[AIO_BATCH_HASH_SIZE] = { { 0, }, };
 
 	if (unlikely(nr < 0))
 		return -EINVAL;

@@ -1666,10 +1722,11 @@
 			break;
 		}
 
-		ret = io_submit_one(ctx, user_iocb, &tmp);
+		ret = io_submit_one(ctx, user_iocb, &tmp, batch_hash);
 		if (ret)
 			break;
 	}
+	aio_batch_free(batch_hash);
 
 	put_ioctx(ctx);
 	return i ? i : ret;
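The heart of the fs/aio.c change is the small on-stack hash that records each distinct address_space touched by the batch, so aio_batch_free() unplugs every backing queue exactly once. Below is a hypothetical, self-contained userspace model of that dedup-then-flush pattern; entry, batch_add, batch_free, and bucket_of are stand-ins invented here for the kernel's hlist, hash_ptr, and mempool machinery.

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HASH_BITS 3			/* 8 buckets, as in the patch */
#define HASH_SIZE (1 << HASH_BITS)

struct entry {
	struct entry *next;
	void *key;
};

static unsigned bucket_of(void *p)	/* crude stand-in for hash_ptr() */
{
	return ((uintptr_t)p >> 4) & (HASH_SIZE - 1);
}

/* Record @key once per batch; duplicates are silently ignored. */
static void batch_add(void *key, struct entry **hash)
{
	unsigned b = bucket_of(key);
	struct entry *e;

	for (e = hash[b]; e; e = e->next)
		if (e->key == key)
			return;

	e = malloc(sizeof(*e));
	e->key = key;
	e->next = hash[b];
	hash[b] = e;
}

/* Visit each recorded key exactly once; flush() models blk_run_address_space(). */
static void batch_free(struct entry **hash, void (*flush)(void *))
{
	int i;

	for (i = 0; i < HASH_SIZE; i++) {
		while (hash[i]) {
			struct entry *e = hash[i];

			hash[i] = e->next;
			flush(e->key);
			free(e);
		}
	}
}

static void print_key(void *key)
{
	printf("unplug %p\n", key);
}

int main(void)
{
	struct entry *hash[HASH_SIZE] = { 0 };
	int a, b;

	batch_add(&a, hash);
	batch_add(&b, hash);
	batch_add(&a, hash);		/* duplicate mapping in the same batch: ignored */
	batch_free(hash, print_key);	/* prints exactly two lines */
	return 0;
}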
fs/direct-io.c

@@ -1028,9 +1028,6 @@
 	if (dio->bio)
 		dio_bio_submit(dio);
 
-	/* All IO is now issued, send it on its way */
-	blk_run_address_space(inode->i_mapping);
-
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.

@@ -1057,8 +1054,11 @@
 	    ((rw & READ) || (dio->result == dio->size)))
 		ret = -EIOCBQUEUED;
 
-	if (ret != -EIOCBQUEUED)
+	if (ret != -EIOCBQUEUED) {
+		/* All IO is now issued, send it on its way */
+		blk_run_address_space(inode->i_mapping);
 		dio_await_completion(dio);
+	}
 
 	/*
 	 * Sync will always be dropping the final ref and completing the