Commit f79e2abb9bd452d97295f34376dedbec9686b986
Committed by
Linus Torvalds
1 parent
d6dfd1310d
Exists in
master
and in
4 other branches
[PATCH] sys_sync_file_range()
Remove the recently-added LINUX_FADV_ASYNC_WRITE and LINUX_FADV_WRITE_WAIT fadvise() additions, do it in a new sys_sync_file_range() syscall instead. Reasons: - It's more flexible. Things which would require two or three syscalls with fadvise() can be done in a single syscall. - Using fadvise() in this manner is something not covered by POSIX. The patch wires up the syscall for x86. The syscall is implemented in the new fs/sync.c. The intention is that we can move sys_fsync(), sys_fdatasync() and perhaps sys_sync() into there later. Documentation for the syscall is in fs/sync.c. A test app (sync_file_range.c) is in http://www.zip.com.au/~akpm/linux/patches/stuff/ext3-tools.tar.gz. The available-to-GPL-modules do_sync_file_range() is for knfsd: "A COMMIT can say NFS_DATA_SYNC or NFS_FILE_SYNC. I can skip the ->fsync call for NFS_DATA_SYNC which is hopefully the more common." Note: the `async' writeout mode SYNC_FILE_RANGE_WRITE will turn synchronous if the queue is congested. This is trivial to fix: add a new flag bit, set wbc->nonblocking. But I'm not sure that we want to expose implementation details down to that level. Note: it's notable that we can sync an fd which wasn't opened for writing. Same with fsync() and fdatasync(). Note: the code takes some care to handle attempts to sync file contents outside the 16TB offset on 32-bit machines. It makes such attempts appear to succeed, for best 32-bit/64-bit compatibility. Perhaps it should make such requests fail... Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Michael Kerrisk <mtk-manpages@gmx.net> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 8 changed files with 177 additions and 28 deletions Side-by-side Diff
arch/i386/kernel/syscall_table.S
fs/Makefile
... | ... | @@ -10,7 +10,7 @@ |
10 | 10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ |
11 | 11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ |
12 | 12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ |
13 | - ioprio.o pnode.o drop_caches.o splice.o | |
13 | + ioprio.o pnode.o drop_caches.o splice.o sync.o | |
14 | 14 | |
15 | 15 | obj-$(CONFIG_INOTIFY) += inotify.o |
16 | 16 | obj-$(CONFIG_EPOLL) += eventpoll.o |
fs/sync.c
1 | +/* | |
2 | + * High-level sync()-related operations | |
3 | + */ | |
4 | + | |
5 | +#include <linux/kernel.h> | |
6 | +#include <linux/file.h> | |
7 | +#include <linux/fs.h> | |
8 | +#include <linux/module.h> | |
9 | +#include <linux/writeback.h> | |
10 | +#include <linux/syscalls.h> | |
11 | +#include <linux/linkage.h> | |
12 | +#include <linux/pagemap.h> | |
13 | + | |
14 | +#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ | |
15 | + SYNC_FILE_RANGE_WAIT_AFTER) | |
16 | + | |
17 | +/* | |
18 | + * sys_sync_file_range() permits finely controlled syncing over a segment of | |
19 | + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is | |
20 | + * zero then sys_sync_file_range() will operate from offset out to EOF. | |
21 | + * | |
22 | + * The flag bits are: | |
23 | + * | |
24 | + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range | |
25 | + * before performing the write. | |
26 | + * | |
27 | + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the | |
28 | + * range which are not presently under writeback. | |
29 | + * | |
30 | + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range | |
31 | + * after performing the write. | |
32 | + * | |
33 | + * Useful combinations of the flag bits are: | |
34 | + * | |
35 | + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages | |
36 | + * in the range which were dirty on entry to sys_sync_file_range() are placed | |
37 | + * under writeout. This is a start-write-for-data-integrity operation. | |
38 | + * | |
39 | + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which | |
40 | + * are not presently under writeout. This is an asynchronous flush-to-disk | |
41 | + * operation. Not suitable for data integrity operations. | |
42 | + * | |
43 | + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for | |
44 | + * completion of writeout of all pages in the range. This will be used after an | |
45 | + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait | |
46 | + * for that operation to complete and to return the result. | |
47 | + * | |
48 | + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: | |
49 | + * a traditional sync() operation. This is a write-for-data-integrity operation | |
50 | + * which will ensure that all pages in the range which were dirty on entry to | |
51 | + * sys_sync_file_range() are committed to disk. | |
52 | + * | |
53 | + * | |
54 | + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any | |
55 | + * I/O errors or ENOSPC conditions and will return those to the caller, after | |
56 | + * clearing the EIO and ENOSPC flags in the address_space. | |
57 | + * | |
58 | + * It should be noted that none of these operations write out the file's | |
59 | + * metadata. So unless the application is strictly performing overwrites of | |
60 | + * already-instantiated disk blocks, there are no guarantees here that the data | |
61 | + * will be available after a crash. | |
62 | + */ | |
63 | +asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, | |
64 | + int flags) | |
65 | +{ | |
66 | + int ret; | |
67 | + struct file *file; | |
68 | + loff_t endbyte; /* inclusive */ | |
69 | + int fput_needed; | |
70 | + umode_t i_mode; | |
71 | + | |
72 | + ret = -EINVAL; | |
73 | + if (flags & ~VALID_FLAGS) | |
74 | + goto out; | |
75 | + | |
76 | + endbyte = offset + nbytes; | |
77 | + | |
78 | + if ((s64)offset < 0) | |
79 | + goto out; | |
80 | + if ((s64)endbyte < 0) | |
81 | + goto out; | |
82 | + if (endbyte < offset) | |
83 | + goto out; | |
84 | + | |
85 | + if (sizeof(pgoff_t) == 4) { | |
86 | + if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | |
87 | + /* | |
88 | + * The range starts outside a 32 bit machine's | |
89 | + * pagecache addressing capabilities. Let it "succeed" | |
90 | + */ | |
91 | + ret = 0; | |
92 | + goto out; | |
93 | + } | |
94 | + if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { | |
95 | + /* | |
96 | + * Out to EOF | |
97 | + */ | |
98 | + nbytes = 0; | |
99 | + } | |
100 | + } | |
101 | + | |
102 | + if (nbytes == 0) | |
103 | + endbyte = -1; | |
104 | + else | |
105 | + endbyte--; /* inclusive */ | |
106 | + | |
107 | + ret = -EBADF; | |
108 | + file = fget_light(fd, &fput_needed); | |
109 | + if (!file) | |
110 | + goto out; | |
111 | + | |
112 | + i_mode = file->f_dentry->d_inode->i_mode; | |
113 | + ret = -ESPIPE; | |
114 | + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && | |
115 | + !S_ISLNK(i_mode)) | |
116 | + goto out_put; | |
117 | + | |
118 | + ret = do_sync_file_range(file, offset, endbyte, flags); | |
119 | +out_put: | |
120 | + fput_light(file, fput_needed); | |
121 | +out: | |
122 | + return ret; | |
123 | +} | |
124 | + | |
125 | +/* | |
126 | + * `endbyte' is inclusive | |
127 | + */ | |
128 | +int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | |
129 | + int flags) | |
130 | +{ | |
131 | + int ret; | |
132 | + struct address_space *mapping; | |
133 | + | |
134 | + mapping = file->f_mapping; | |
135 | + if (!mapping) { | |
136 | + ret = -EINVAL; | |
137 | + goto out; | |
138 | + } | |
139 | + | |
140 | + ret = 0; | |
141 | + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { | |
142 | + ret = wait_on_page_writeback_range(mapping, | |
143 | + offset >> PAGE_CACHE_SHIFT, | |
144 | + endbyte >> PAGE_CACHE_SHIFT); | |
145 | + if (ret < 0) | |
146 | + goto out; | |
147 | + } | |
148 | + | |
149 | + if (flags & SYNC_FILE_RANGE_WRITE) { | |
150 | + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | |
151 | + WB_SYNC_NONE); | |
152 | + if (ret < 0) | |
153 | + goto out; | |
154 | + } | |
155 | + | |
156 | + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { | |
157 | + ret = wait_on_page_writeback_range(mapping, | |
158 | + offset >> PAGE_CACHE_SHIFT, | |
159 | + endbyte >> PAGE_CACHE_SHIFT); | |
160 | + } | |
161 | +out: | |
162 | + return ret; | |
163 | +} | |
164 | +EXPORT_SYMBOL_GPL(do_sync_file_range); |
include/asm-i386/unistd.h
... | ... | @@ -319,8 +319,9 @@ |
319 | 319 | #define __NR_set_robust_list 311 |
320 | 320 | #define __NR_get_robust_list 312 |
321 | 321 | #define __NR_sys_splice 313 |
322 | +#define __NR_sys_sync_file_range 314 | |
322 | 323 | |
323 | -#define NR_syscalls 314 | |
324 | +#define NR_syscalls 315 | |
324 | 325 | |
325 | 326 | /* |
326 | 327 | * user-visible error numbers are in the range -1 - -128: see |
include/linux/fadvise.h
... | ... | @@ -18,11 +18,5 @@ |
18 | 18 | #define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */ |
19 | 19 | #endif |
20 | 20 | |
21 | -/* | |
22 | - * Linux-specific fadvise() extensions: | |
23 | - */ | |
24 | -#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */ | |
25 | -#define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */ | |
26 | - | |
27 | 21 | #endif /* FADVISE_H_INCLUDED */ |
include/linux/fs.h
... | ... | @@ -757,6 +757,13 @@ |
757 | 757 | extern int fcntl_setlease(unsigned int fd, struct file *filp, long arg); |
758 | 758 | extern int fcntl_getlease(struct file *filp); |
759 | 759 | |
760 | +/* fs/sync.c */ | |
761 | +#define SYNC_FILE_RANGE_WAIT_BEFORE 1 | |
762 | +#define SYNC_FILE_RANGE_WRITE 2 | |
763 | +#define SYNC_FILE_RANGE_WAIT_AFTER 4 | |
764 | +extern int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, | |
765 | + int flags); | |
766 | + | |
760 | 767 | /* fs/locks.c */ |
761 | 768 | extern void locks_init_lock(struct file_lock *); |
762 | 769 | extern void locks_copy_lock(struct file_lock *, struct file_lock *); |
include/linux/syscalls.h
... | ... | @@ -571,6 +571,8 @@ |
571 | 571 | asmlinkage long sys_unshare(unsigned long unshare_flags); |
572 | 572 | asmlinkage long sys_splice(int fdin, int fdout, size_t len, |
573 | 573 | unsigned int flags); |
574 | +asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, | |
575 | + int flags); | |
574 | 576 | |
575 | 577 | #endif |
mm/fadvise.c
... | ... | @@ -35,17 +35,6 @@ |
35 | 35 | * |
36 | 36 | * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk. |
37 | 37 | * |
38 | - * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently | |
39 | - * dirty pages at the disk. | |
40 | - * | |
41 | - * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push | |
42 | - * all of the currently dirty pages at the disk, wait until they have been | |
43 | - * written. | |
44 | - * | |
45 | - * It should be noted that none of these operations write out the file's | |
46 | - * metadata. So unless the application is strictly performing overwrites of | |
47 | - * already-instantiated disk blocks, there are no guarantees here that the data | |
48 | - * will be available after a crash. | |
49 | 38 | */ |
50 | 39 | asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) |
51 | 40 | { |
... | ... | @@ -128,15 +117,6 @@ |
128 | 117 | if (end_index >= start_index) |
129 | 118 | invalidate_mapping_pages(mapping, start_index, |
130 | 119 | end_index); |
131 | - break; | |
132 | - case LINUX_FADV_ASYNC_WRITE: | |
133 | - ret = __filemap_fdatawrite_range(mapping, offset, endbyte, | |
134 | - WB_SYNC_NONE); | |
135 | - break; | |
136 | - case LINUX_FADV_WRITE_WAIT: | |
137 | - ret = wait_on_page_writeback_range(mapping, | |
138 | - offset >> PAGE_CACHE_SHIFT, | |
139 | - endbyte >> PAGE_CACHE_SHIFT); | |
140 | 120 | break; |
141 | 121 | default: |
142 | 122 | ret = -EINVAL; |