Commit b8af67e2681c693a21f3933e3bdfce4cf66596d3
Committed by
Linus Torvalds
1 parent
98d5ce0d00
Exists in
master
and in
4 other branches
fs/block_dev.c: fix performance regression in O_DIRECT|O_SYNC writes to block devices
We are seeing a large regression in database performance on recent kernels. The database opens a block device with O_DIRECT|O_SYNC and a number of threads write to different regions of the file at the same time. A simple test case is below. I haven't defined DEVICE since getting it wrong will destroy your data :) On an 3 disk LVM with a 64k chunk size we see about 17MB/sec and only a few threads in IO wait: procs -----io---- -system-- -----cpu------ r b bi bo in cs us sy id wa st 0 3 0 16170 656 2259 0 0 86 14 0 0 2 0 16704 695 2408 0 0 92 8 0 0 2 0 17308 744 2653 0 0 86 14 0 0 2 0 17933 759 2777 0 0 89 10 0 Most threads are blocking in vfs_fsync_range, which has: mutex_lock(&mapping->host->i_mutex); err = fop->fsync(file, dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); commit 148f948ba877f4d3cdef036b1ff6d9f68986706a (vfs: Introduce new helpers for syncing after writing to O_SYNC file or IS_SYNC inode) offers some explanation of what is going on: Use these new helpers for syncing from generic VFS functions. This makes O_SYNC writes to block devices acquire i_mutex for syncing. If we really care about this, we can make block_fsync() drop the i_mutex and reacquire it before it returns. Thanks Jan for such a good commit message! As well as dropping i_mutex, Christoph suggests we should remove the call to sync_blockdev(): > sync_blockdev is an overcomplicated alias for filemap_write_and_wait on > the block device inode, which is exactly what we did just before calling > into ->fsync The patch below incorporates both suggestions. With it the testcase improves from 17MB/s to 68M/sec: procs -----io---- -system-- -----cpu------ r b bi bo in cs us sy id wa st 0 7 0 65536 1000 3878 0 0 70 30 0 0 34 0 69632 1016 3921 0 1 46 53 0 0 57 0 69632 1000 3921 0 0 55 45 0 0 53 0 69640 754 4111 0 0 81 19 0 Testcase: #define _GNU_SOURCE #include <stdio.h> #include <pthread.h> #include <unistd.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> #define NR_THREADS 64 #define BUFSIZE (64 * 1024) #define DEVICE "/dev/mapper/XXXXXX" #define ALIGN(VAL, SIZE) (((VAL)+(SIZE)-1) & ~((SIZE)-1)) static int fd; static void *doit(void *arg) { unsigned long offset = (long)arg; char *b, *buf; b = malloc(BUFSIZE + 1024); buf = (char *)ALIGN((unsigned long)b, 1024); memset(buf, 0, BUFSIZE); while (1) pwrite(fd, buf, BUFSIZE, offset); } int main(int argc, char *argv[]) { int flags = O_RDWR|O_DIRECT; int i; unsigned long offset = 0; if (argc > 1 && !strcmp(argv[1], "O_SYNC")) flags |= O_SYNC; fd = open(DEVICE, flags); if (fd == -1) { perror("open"); exit(1); } for (i = 0; i < NR_THREADS-1; i++) { pthread_t tid; pthread_create(&tid, NULL, doit, (void *)offset); offset += BUFSIZE; } doit((void *)offset); return 0; } Signed-off-by: Anton Blanchard <anton@samba.org> Acked-by: Jan Kara <jack@suse.cz> Cc: Christoph Hellwig <hch@lst.de> Cc: Alexander Viro <viro@zeniv.linux.org.uk> Cc: Jens Axboe <jens.axboe@oracle.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 1 changed file with 12 additions and 5 deletions Side-by-side Diff
fs/block_dev.c
... | ... | @@ -406,16 +406,23 @@ |
406 | 406 | |
407 | 407 | int blkdev_fsync(struct file *filp, struct dentry *dentry, int datasync) |
408 | 408 | { |
409 | - struct block_device *bdev = I_BDEV(filp->f_mapping->host); | |
409 | + struct inode *bd_inode = filp->f_mapping->host; | |
410 | + struct block_device *bdev = I_BDEV(bd_inode); | |
410 | 411 | int error; |
411 | 412 | |
412 | - error = sync_blockdev(bdev); | |
413 | - if (error) | |
414 | - return error; | |
415 | - | |
413 | + /* | |
414 | + * There is no need to serialise calls to blkdev_issue_flush with | |
415 | + * i_mutex and doing so causes performance issues with concurrent | |
416 | + * O_SYNC writers to a block device. | |
417 | + */ | |
418 | + mutex_unlock(&bd_inode->i_mutex); | |
419 | + | |
416 | 420 | error = blkdev_issue_flush(bdev, NULL); |
417 | 421 | if (error == -EOPNOTSUPP) |
418 | 422 | error = 0; |
423 | + | |
424 | + mutex_lock(&bd_inode->i_mutex); | |
425 | + | |
419 | 426 | return error; |
420 | 427 | } |
421 | 428 | EXPORT_SYMBOL(blkdev_fsync); |