Commit ebcf28e1c7a295f3321249dd235ad2e45938fdd9

Authored by Andrew Morton
Committed by Linus Torvalds
1 parent 469eb4d038

[PATCH] fadvise(): write commands

Add two new Linux-specific fadvise() extensions:

LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
offsets `offset' and `offset+len'.  Any pages which are currently under
writeout are skipped, whether or not they are dirty.

LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
offsets `offset' and `offset+len'.

By combining these two operations the application may do several things:

LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently dirty
pages at the disk.

LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push all
of the currently dirty pages at the disk, wait until they have been written.

It should be noted that none of these operations write out the file's
metadata.  So unless the application is strictly performing overwrites of
already-instantiated disk blocks, there are no guarantees here that the data
will be available after a crash.

To complete this suite of operations I guess we should have a "sync file
metadata only" operation.  This gives applications access to all the building
blocks needed for all sorts of sync operations.  But sync-metadata doesn't fit
well with the fadvise() interface.  Probably it should be a new syscall:
sys_fmetadatasync().

The patch also diddles with the meaning of `endbyte' in sys_fadvise64_64().
It is made to represent the last affected byte in the file (ie: it is
inclusive).  Generally, all these byterange and pagerange functions are
inclusive so we can easily represent EOF with -1.

As Ulrich notes, these two functions are somewhat abusive of the fadvise()
concept, which appears to be "set the future policy for this fd".

But these commands are a perfect fit with the fadvise() implementation, and
several of the existing fadvise() commands are synchronous and don't affect
future policy either.   I think we can live with the slight incongruity.

Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 4 changed files with 57 additions and 10 deletions Side-by-side Diff

include/linux/fadvise.h
... ... @@ -18,5 +18,11 @@
18 18 #define POSIX_FADV_NOREUSE 5 /* Data will be accessed once. */
19 19 #endif
20 20  
  21 +/*
  22 + * Linux-specific fadvise() extensions:
  23 + */
  24 +#define LINUX_FADV_ASYNC_WRITE 32 /* Start writeout on range */
  25 +#define LINUX_FADV_WRITE_WAIT 33 /* Wait upon writeout to range */
  26 +
21 27 #endif /* FADVISE_H_INCLUDED */
... ... @@ -1473,6 +1473,11 @@
1473 1473 extern int filemap_write_and_wait(struct address_space *mapping);
1474 1474 extern int filemap_write_and_wait_range(struct address_space *mapping,
1475 1475 loff_t lstart, loff_t lend);
  1476 +extern int wait_on_page_writeback_range(struct address_space *mapping,
  1477 + pgoff_t start, pgoff_t end);
  1478 +extern int __filemap_fdatawrite_range(struct address_space *mapping,
  1479 + loff_t start, loff_t end, int sync_mode);
  1480 +
1476 1481 extern void sync_supers(void);
1477 1482 extern void sync_filesystems(int wait);
1478 1483 extern void emergency_sync(void);
... ... @@ -15,6 +15,7 @@
15 15 #include <linux/backing-dev.h>
16 16 #include <linux/pagevec.h>
17 17 #include <linux/fadvise.h>
  18 +#include <linux/writeback.h>
18 19 #include <linux/syscalls.h>
19 20  
20 21 #include <asm/unistd.h>
21 22  
... ... @@ -22,13 +23,36 @@
22 23 /*
23 24 * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
24 25 * deactivate the pages and clear PG_Referenced.
  26 + *
  27 + * LINUX_FADV_ASYNC_WRITE: start async writeout of any dirty pages between file
  28 + * offsets `offset' and `offset+len' inclusive. Any pages which are currently
  29 + * under writeout are skipped, whether or not they are dirty.
  30 + *
  31 + * LINUX_FADV_WRITE_WAIT: wait upon writeout of any dirty pages between file
  32 + * offsets `offset' and `offset+len'.
  33 + *
  34 + * By combining these two operations the application may do several things:
  35 + *
  36 + * LINUX_FADV_ASYNC_WRITE: push some or all of the dirty pages at the disk.
  37 + *
  38 + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE: push all of the currently
  39 + * dirty pages at the disk.
  40 + *
  41 + * LINUX_FADV_WRITE_WAIT, LINUX_FADV_ASYNC_WRITE, LINUX_FADV_WRITE_WAIT: push
  42 + * all of the currently dirty pages at the disk, wait until they have been
  43 + * written.
  44 + *
  45 + * It should be noted that none of these operations write out the file's
  46 + * metadata. So unless the application is strictly performing overwrites of
  47 + * already-instantiated disk blocks, there are no guarantees here that the data
  48 + * will be available after a crash.
25 49 */
26 50 asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice)
27 51 {
28 52 struct file *file = fget(fd);
29 53 struct address_space *mapping;
30 54 struct backing_dev_info *bdi;
31   - loff_t endbyte;
  55 + loff_t endbyte; /* inclusive */
32 56 pgoff_t start_index;
33 57 pgoff_t end_index;
34 58 unsigned long nrpages;
... ... @@ -56,6 +80,8 @@
56 80 endbyte = offset + len;
57 81 if (!len || endbyte < len)
58 82 endbyte = -1;
  83 + else
  84 + endbyte--; /* inclusive */
59 85  
60 86 bdi = mapping->backing_dev_info;
61 87  
... ... @@ -78,7 +104,7 @@
78 104  
79 105 /* First and last PARTIAL page! */
80 106 start_index = offset >> PAGE_CACHE_SHIFT;
81   - end_index = (endbyte-1) >> PAGE_CACHE_SHIFT;
  107 + end_index = endbyte >> PAGE_CACHE_SHIFT;
82 108  
83 109 /* Careful about overflow on the "+1" */
84 110 nrpages = end_index - start_index + 1;
85 111  
... ... @@ -96,11 +122,21 @@
96 122 filemap_flush(mapping);
97 123  
98 124 /* First and last FULL page! */
99   - start_index = (offset + (PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
  125 + start_index = (offset+(PAGE_CACHE_SIZE-1)) >> PAGE_CACHE_SHIFT;
100 126 end_index = (endbyte >> PAGE_CACHE_SHIFT);
101 127  
102   - if (end_index > start_index)
103   - invalidate_mapping_pages(mapping, start_index, end_index-1);
  128 + if (end_index >= start_index)
  129 + invalidate_mapping_pages(mapping, start_index,
  130 + end_index);
  131 + break;
  132 + case LINUX_FADV_ASYNC_WRITE:
  133 + ret = __filemap_fdatawrite_range(mapping, offset, endbyte,
  134 + WB_SYNC_NONE);
  135 + break;
  136 + case LINUX_FADV_WRITE_WAIT:
  137 + ret = wait_on_page_writeback_range(mapping,
  138 + offset >> PAGE_CACHE_SHIFT,
  139 + endbyte >> PAGE_CACHE_SHIFT);
104 140 break;
105 141 default:
106 142 ret = -EINVAL;
... ... @@ -183,8 +183,8 @@
183 183 * these two operations is that if a dirty page/buffer is encountered, it must
184 184 * be waited upon, and not just skipped over.
185 185 */
186   -static int __filemap_fdatawrite_range(struct address_space *mapping,
187   - loff_t start, loff_t end, int sync_mode)
  186 +int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  187 + loff_t end, int sync_mode)
188 188 {
189 189 int ret;
190 190 struct writeback_control wbc = {
... ... @@ -213,8 +213,8 @@
213 213 }
214 214 EXPORT_SYMBOL(filemap_fdatawrite);
215 215  
216   -static int filemap_fdatawrite_range(struct address_space *mapping,
217   - loff_t start, loff_t end)
  216 +static int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
  217 + loff_t end)
218 218 {
219 219 return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
220 220 }
... ... @@ -233,7 +233,7 @@
233 233 * Wait for writeback to complete against pages indexed by start->end
234 234 * inclusive
235 235 */
236   -static int wait_on_page_writeback_range(struct address_space *mapping,
  236 +int wait_on_page_writeback_range(struct address_space *mapping,
237 237 pgoff_t start, pgoff_t end)
238 238 {
239 239 struct pagevec pvec;