Commit 5274f052e7b3dbd81935772eb551dfd0325dfa9d

Authored by Jens Axboe
Committed by Linus Torvalds
1 parent 5d4fe2c1ce

[PATCH] Introduce sys_splice() system call

This adds support for the sys_splice system call. Using a pipe as a
transport, it can connect to files or sockets (latter as output only).

From the splice.c comments:

   "splice": joining two ropes together by interweaving their strands.

   This is the "extended pipe" functionality, where a pipe is used as
   an arbitrary in-memory buffer. Think of a pipe as a small kernel
   buffer that you can use to transfer data from one end to the other.

   The traditional unix read/write is extended with a "splice()" operation
   that transfers data buffers to or from a pipe buffer.

   Named by Larry McVoy, original implementation from Linus, extended by
   Jens to support splicing to files and fixing the initial implementation
   bugs.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

Showing 15 changed files with 669 additions and 11 deletions Side-by-side Diff

arch/i386/kernel/syscall_table.S
... ... @@ -312,4 +312,5 @@
312 312 .long sys_unshare /* 310 */
313 313 .long sys_set_robust_list
314 314 .long sys_get_robust_list
  315 + .long sys_splice
arch/ia64/kernel/entry.S
... ... @@ -1605,6 +1605,7 @@
1605 1605 data8 sys_ni_syscall // reserved for pselect
1606 1606 data8 sys_ni_syscall // 1295 reserved for ppoll
1607 1607 data8 sys_unshare
  1608 + data8 sys_splice
1608 1609  
1609 1610 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
... ... @@ -10,7 +10,7 @@
10 10 ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \
11 11 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
12 12 seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \
13   - ioprio.o pnode.o drop_caches.o
  13 + ioprio.o pnode.o drop_caches.o splice.o
14 14  
15 15 obj-$(CONFIG_INOTIFY) += inotify.o
16 16 obj-$(CONFIG_EPOLL) += eventpoll.o
... ... @@ -53,6 +53,8 @@
53 53 .readv = generic_file_readv,
54 54 .writev = generic_file_writev,
55 55 .sendfile = generic_file_sendfile,
  56 + .splice_read = generic_file_splice_read,
  57 + .splice_write = generic_file_splice_write,
56 58 };
57 59  
58 60 #ifdef CONFIG_EXT2_FS_XIP
... ... @@ -119,6 +119,8 @@
119 119 .release = ext3_release_file,
120 120 .fsync = ext3_sync_file,
121 121 .sendfile = generic_file_sendfile,
  122 + .splice_read = generic_file_splice_read,
  123 + .splice_write = generic_file_splice_write,
122 124 };
123 125  
124 126 struct inode_operations ext3_file_inode_operations = {
... ... @@ -15,6 +15,7 @@
15 15 #include <linux/pipe_fs_i.h>
16 16 #include <linux/uio.h>
17 17 #include <linux/highmem.h>
  18 +#include <linux/pagemap.h>
18 19  
19 20 #include <asm/uaccess.h>
20 21 #include <asm/ioctls.h>
21 22  
... ... @@ -94,11 +95,20 @@
94 95 {
95 96 struct page *page = buf->page;
96 97  
97   - if (info->tmp_page) {
98   - __free_page(page);
  98 + /*
  99 + * If nobody else uses this page, and we don't already have a
  100 + * temporary page, let's keep track of it as a one-deep
  101 + * allocation cache
  102 + */
  103 + if (page_count(page) == 1 && !info->tmp_page) {
  104 + info->tmp_page = page;
99 105 return;
100 106 }
101   - info->tmp_page = page;
  107 +
  108 + /*
  109 + * Otherwise just release our reference to it
  110 + */
  111 + page_cache_release(page);
102 112 }
103 113  
104 114 static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf)
... ... @@ -152,6 +162,11 @@
152 162 chars = total_len;
153 163  
154 164 addr = ops->map(filp, info, buf);
  165 + if (IS_ERR(addr)) {
  166 + if (!ret)
  167 + ret = PTR_ERR(addr);
  168 + break;
  169 + }
155 170 error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars);
156 171 ops->unmap(info, buf);
157 172 if (unlikely(error)) {
... ... @@ -254,8 +269,16 @@
254 269 struct pipe_buf_operations *ops = buf->ops;
255 270 int offset = buf->offset + buf->len;
256 271 if (ops->can_merge && offset + chars <= PAGE_SIZE) {
257   - void *addr = ops->map(filp, info, buf);
258   - int error = pipe_iov_copy_from_user(offset + addr, iov, chars);
  272 + void *addr;
  273 + int error;
  274 +
  275 + addr = ops->map(filp, info, buf);
  276 + if (IS_ERR(addr)) {
  277 + error = PTR_ERR(addr);
  278 + goto out;
  279 + }
  280 + error = pipe_iov_copy_from_user(offset + addr, iov,
  281 + chars);
259 282 ops->unmap(info, buf);
260 283 ret = error;
261 284 do_wakeup = 1;
... ... @@ -1576,6 +1576,8 @@
1576 1576 .sendfile = generic_file_sendfile,
1577 1577 .aio_read = generic_file_aio_read,
1578 1578 .aio_write = reiserfs_aio_write,
  1579 + .splice_read = generic_file_splice_read,
  1580 + .splice_write = generic_file_splice_write,
1579 1581 };
1580 1582  
1581 1583 struct inode_operations reiserfs_file_inode_operations = {
  1 +/*
  2 + * "splice": joining two ropes together by interweaving their strands.
  3 + *
  4 + * This is the "extended pipe" functionality, where a pipe is used as
  5 + * an arbitrary in-memory buffer. Think of a pipe as a small kernel
  6 + * buffer that you can use to transfer data from one end to the other.
  7 + *
  8 + * The traditional unix read/write is extended with a "splice()" operation
  9 + * that transfers data buffers to or from a pipe buffer.
  10 + *
  11 + * Named by Larry McVoy, original implementation from Linus, extended by
  12 + * Jens to support splicing to files and fixing the initial implementation
  13 + * bugs.
  14 + *
  15 + * Copyright (C) 2005 Jens Axboe <axboe@suse.de>
  16 + * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org>
  17 + *
  18 + */
  19 +#include <linux/fs.h>
  20 +#include <linux/file.h>
  21 +#include <linux/pagemap.h>
  22 +#include <linux/pipe_fs_i.h>
  23 +#include <linux/mm_inline.h>
  24 +
  25 +/*
  26 + * Passed to the actors
  27 + */
  28 +struct splice_desc {
  29 + unsigned int len, total_len; /* current and remaining length */
  30 + unsigned int flags; /* splice flags */
  31 + struct file *file; /* file to read/write */
  32 + loff_t pos; /* file position */
  33 +};
  34 +
  35 +static void page_cache_pipe_buf_release(struct pipe_inode_info *info,
  36 + struct pipe_buffer *buf)
  37 +{
  38 + page_cache_release(buf->page);
  39 + buf->page = NULL;
  40 +}
  41 +
  42 +static void *page_cache_pipe_buf_map(struct file *file,
  43 + struct pipe_inode_info *info,
  44 + struct pipe_buffer *buf)
  45 +{
  46 + struct page *page = buf->page;
  47 +
  48 + lock_page(page);
  49 +
  50 + if (!PageUptodate(page)) {
  51 + unlock_page(page);
  52 + return ERR_PTR(-EIO);
  53 + }
  54 +
  55 + if (!page->mapping) {
  56 + unlock_page(page);
  57 + return ERR_PTR(-ENODATA);
  58 + }
  59 +
  60 + return kmap(buf->page);
  61 +}
  62 +
  63 +static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info,
  64 + struct pipe_buffer *buf)
  65 +{
  66 + unlock_page(buf->page);
  67 + kunmap(buf->page);
  68 +}
  69 +
  70 +static struct pipe_buf_operations page_cache_pipe_buf_ops = {
  71 + .can_merge = 0,
  72 + .map = page_cache_pipe_buf_map,
  73 + .unmap = page_cache_pipe_buf_unmap,
  74 + .release = page_cache_pipe_buf_release,
  75 +};
  76 +
  77 +static ssize_t move_to_pipe(struct inode *inode, struct page **pages,
  78 + int nr_pages, unsigned long offset,
  79 + unsigned long len)
  80 +{
  81 + struct pipe_inode_info *info;
  82 + int ret, do_wakeup, i;
  83 +
  84 + ret = 0;
  85 + do_wakeup = 0;
  86 + i = 0;
  87 +
  88 + mutex_lock(PIPE_MUTEX(*inode));
  89 +
  90 + info = inode->i_pipe;
  91 + for (;;) {
  92 + int bufs;
  93 +
  94 + if (!PIPE_READERS(*inode)) {
  95 + send_sig(SIGPIPE, current, 0);
  96 + if (!ret)
  97 + ret = -EPIPE;
  98 + break;
  99 + }
  100 +
  101 + bufs = info->nrbufs;
  102 + if (bufs < PIPE_BUFFERS) {
  103 + int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1);
  104 + struct pipe_buffer *buf = info->bufs + newbuf;
  105 + struct page *page = pages[i++];
  106 + unsigned long this_len;
  107 +
  108 + this_len = PAGE_CACHE_SIZE - offset;
  109 + if (this_len > len)
  110 + this_len = len;
  111 +
  112 + buf->page = page;
  113 + buf->offset = offset;
  114 + buf->len = this_len;
  115 + buf->ops = &page_cache_pipe_buf_ops;
  116 + info->nrbufs = ++bufs;
  117 + do_wakeup = 1;
  118 +
  119 + ret += this_len;
  120 + len -= this_len;
  121 + offset = 0;
  122 + if (!--nr_pages)
  123 + break;
  124 + if (!len)
  125 + break;
  126 + if (bufs < PIPE_BUFFERS)
  127 + continue;
  128 +
  129 + break;
  130 + }
  131 +
  132 + if (signal_pending(current)) {
  133 + if (!ret)
  134 + ret = -ERESTARTSYS;
  135 + break;
  136 + }
  137 +
  138 + if (do_wakeup) {
  139 + wake_up_interruptible_sync(PIPE_WAIT(*inode));
  140 + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO,
  141 + POLL_IN);
  142 + do_wakeup = 0;
  143 + }
  144 +
  145 + PIPE_WAITING_WRITERS(*inode)++;
  146 + pipe_wait(inode);
  147 + PIPE_WAITING_WRITERS(*inode)--;
  148 + }
  149 +
  150 + mutex_unlock(PIPE_MUTEX(*inode));
  151 +
  152 + if (do_wakeup) {
  153 + wake_up_interruptible(PIPE_WAIT(*inode));
  154 + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
  155 + }
  156 +
  157 + while (i < nr_pages)
  158 + page_cache_release(pages[i++]);
  159 +
  160 + return ret;
  161 +}
  162 +
  163 +static int __generic_file_splice_read(struct file *in, struct inode *pipe,
  164 + size_t len)
  165 +{
  166 + struct address_space *mapping = in->f_mapping;
  167 + unsigned int offset, nr_pages;
  168 + struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS];
  169 + struct page *page;
  170 + pgoff_t index, pidx;
  171 + int i, j;
  172 +
  173 + index = in->f_pos >> PAGE_CACHE_SHIFT;
  174 + offset = in->f_pos & ~PAGE_CACHE_MASK;
  175 + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
  176 +
  177 + if (nr_pages > PIPE_BUFFERS)
  178 + nr_pages = PIPE_BUFFERS;
  179 +
  180 + /*
  181 + * initiate read-ahead on this page range
  182 + */
  183 + do_page_cache_readahead(mapping, in, index, nr_pages);
  184 +
  185 + /*
  186 + * Get as many pages from the page cache as possible..
  187 + * Start IO on the page cache entries we create (we
  188 + * can assume that any pre-existing ones we find have
  189 + * already had IO started on them).
  190 + */
  191 + i = find_get_pages(mapping, index, nr_pages, pages);
  192 +
  193 + /*
  194 + * common case - we found all pages and they are contiguous,
  195 + * kick them off
  196 + */
  197 + if (i && (pages[i - 1]->index == index + i - 1))
  198 + goto splice_them;
  199 +
  200 + /*
  201 + * fill shadow[] with pages at the right locations, so we only
  202 + * have to fill holes
  203 + */
  204 + memset(shadow, 0, i * sizeof(struct page *));
  205 + for (j = 0, pidx = index; j < i; pidx++, j++)
  206 + shadow[pages[j]->index - pidx] = pages[j];
  207 +
  208 + /*
  209 + * now fill in the holes
  210 + */
  211 + for (i = 0, pidx = index; i < nr_pages; pidx++, i++) {
  212 + int error;
  213 +
  214 + if (shadow[i])
  215 + continue;
  216 +
  217 + /*
  218 + * no page there, look one up / create it
  219 + */
  220 + page = find_or_create_page(mapping, pidx,
  221 + mapping_gfp_mask(mapping));
  222 + if (!page)
  223 + break;
  224 +
  225 + if (PageUptodate(page))
  226 + unlock_page(page);
  227 + else {
  228 + error = mapping->a_ops->readpage(in, page);
  229 +
  230 + if (unlikely(error)) {
  231 + page_cache_release(page);
  232 + break;
  233 + }
  234 + }
  235 + shadow[i] = page;
  236 + }
  237 +
  238 + if (!i) {
  239 + for (i = 0; i < nr_pages; i++) {
  240 + if (shadow[i])
  241 + page_cache_release(shadow[i]);
  242 + }
  243 + return 0;
  244 + }
  245 +
  246 + memcpy(pages, shadow, i * sizeof(struct page *));
  247 +
  248 + /*
  249 + * Now we splice them into the pipe..
  250 + */
  251 +splice_them:
  252 + return move_to_pipe(pipe, pages, i, offset, len);
  253 +}
  254 +
  255 +ssize_t generic_file_splice_read(struct file *in, struct inode *pipe,
  256 + size_t len, unsigned int flags)
  257 +{
  258 + ssize_t spliced;
  259 + int ret;
  260 +
  261 + ret = 0;
  262 + spliced = 0;
  263 + while (len) {
  264 + ret = __generic_file_splice_read(in, pipe, len);
  265 +
  266 + if (ret <= 0)
  267 + break;
  268 +
  269 + in->f_pos += ret;
  270 + len -= ret;
  271 + spliced += ret;
  272 + }
  273 +
  274 + if (spliced)
  275 + return spliced;
  276 +
  277 + return ret;
  278 +}
  279 +
  280 +/*
  281 + * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage().
  282 + */
  283 +static int pipe_to_sendpage(struct pipe_inode_info *info,
  284 + struct pipe_buffer *buf, struct splice_desc *sd)
  285 +{
  286 + struct file *file = sd->file;
  287 + loff_t pos = sd->pos;
  288 + unsigned int offset;
  289 + ssize_t ret;
  290 + void *ptr;
  291 +
  292 + /*
  293 + * sub-optimal, but we are limited by the pipe ->map. we don't
  294 + * need a kmap'ed buffer here, we just want to make sure we
  295 + * have the page pinned if the pipe page originates from the
  296 + * page cache
  297 + */
  298 + ptr = buf->ops->map(file, info, buf);
  299 + if (IS_ERR(ptr))
  300 + return PTR_ERR(ptr);
  301 +
  302 + offset = pos & ~PAGE_CACHE_MASK;
  303 +
  304 + ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,
  305 + sd->len < sd->total_len);
  306 +
  307 + buf->ops->unmap(info, buf);
  308 + if (ret == sd->len)
  309 + return 0;
  310 +
  311 + return -EIO;
  312 +}
  313 +
  314 +/*
  315 + * This is a little more tricky than the file -> pipe splicing. There are
  316 + * basically three cases:
  317 + *
  318 + * - Destination page already exists in the address space and there
  319 + * are users of it. For that case we have no other option that
  320 + * copying the data. Tough luck.
  321 + * - Destination page already exists in the address space, but there
  322 + * are no users of it. Make sure it's uptodate, then drop it. Fall
  323 + * through to last case.
  324 + * - Destination page does not exist, we can add the pipe page to
  325 + * the page cache and avoid the copy.
  326 + *
  327 + * For now we just do the slower thing and always copy pages over, it's
  328 + * easier than migrating pages from the pipe to the target file. For the
  329 + * case of doing file | file splicing, the migrate approach had some LRU
  330 + * nastiness...
  331 + */
  332 +static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf,
  333 + struct splice_desc *sd)
  334 +{
  335 + struct file *file = sd->file;
  336 + struct address_space *mapping = file->f_mapping;
  337 + unsigned int offset;
  338 + struct page *page;
  339 + char *src, *dst;
  340 + pgoff_t index;
  341 + int ret;
  342 +
  343 + /*
  344 + * after this, page will be locked and unmapped
  345 + */
  346 + src = buf->ops->map(file, info, buf);
  347 + if (IS_ERR(src))
  348 + return PTR_ERR(src);
  349 +
  350 + index = sd->pos >> PAGE_CACHE_SHIFT;
  351 + offset = sd->pos & ~PAGE_CACHE_MASK;
  352 +
  353 +find_page:
  354 + ret = -ENOMEM;
  355 + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping));
  356 + if (!page)
  357 + goto out;
  358 +
  359 + /*
  360 + * If the page is uptodate, it is also locked. If it isn't
  361 + * uptodate, we can mark it uptodate if we are filling the
  362 + * full page. Otherwise we need to read it in first...
  363 + */
  364 + if (!PageUptodate(page)) {
  365 + if (sd->len < PAGE_CACHE_SIZE) {
  366 + ret = mapping->a_ops->readpage(file, page);
  367 + if (unlikely(ret))
  368 + goto out;
  369 +
  370 + lock_page(page);
  371 +
  372 + if (!PageUptodate(page)) {
  373 + /*
  374 + * page got invalidated, repeat
  375 + */
  376 + if (!page->mapping) {
  377 + unlock_page(page);
  378 + page_cache_release(page);
  379 + goto find_page;
  380 + }
  381 + ret = -EIO;
  382 + goto out;
  383 + }
  384 + } else {
  385 + WARN_ON(!PageLocked(page));
  386 + SetPageUptodate(page);
  387 + }
  388 + }
  389 +
  390 + ret = mapping->a_ops->prepare_write(file, page, 0, sd->len);
  391 + if (ret)
  392 + goto out;
  393 +
  394 + dst = kmap_atomic(page, KM_USER0);
  395 + memcpy(dst + offset, src + buf->offset, sd->len);
  396 + flush_dcache_page(page);
  397 + kunmap_atomic(dst, KM_USER0);
  398 +
  399 + ret = mapping->a_ops->commit_write(file, page, 0, sd->len);
  400 + if (ret < 0)
  401 + goto out;
  402 +
  403 + set_page_dirty(page);
  404 + ret = write_one_page(page, 0);
  405 +out:
  406 + if (ret < 0)
  407 + unlock_page(page);
  408 + page_cache_release(page);
  409 + buf->ops->unmap(info, buf);
  410 + return ret;
  411 +}
  412 +
  413 +typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *,
  414 + struct splice_desc *);
  415 +
  416 +static ssize_t move_from_pipe(struct inode *inode, struct file *out,
  417 + size_t len, unsigned int flags,
  418 + splice_actor *actor)
  419 +{
  420 + struct pipe_inode_info *info;
  421 + int ret, do_wakeup, err;
  422 + struct splice_desc sd;
  423 +
  424 + ret = 0;
  425 + do_wakeup = 0;
  426 +
  427 + sd.total_len = len;
  428 + sd.flags = flags;
  429 + sd.file = out;
  430 + sd.pos = out->f_pos;
  431 +
  432 + mutex_lock(PIPE_MUTEX(*inode));
  433 +
  434 + info = inode->i_pipe;
  435 + for (;;) {
  436 + int bufs = info->nrbufs;
  437 +
  438 + if (bufs) {
  439 + int curbuf = info->curbuf;
  440 + struct pipe_buffer *buf = info->bufs + curbuf;
  441 + struct pipe_buf_operations *ops = buf->ops;
  442 +
  443 + sd.len = buf->len;
  444 + if (sd.len > sd.total_len)
  445 + sd.len = sd.total_len;
  446 +
  447 + err = actor(info, buf, &sd);
  448 + if (err) {
  449 + if (!ret && err != -ENODATA)
  450 + ret = err;
  451 +
  452 + break;
  453 + }
  454 +
  455 + ret += sd.len;
  456 + buf->offset += sd.len;
  457 + buf->len -= sd.len;
  458 + if (!buf->len) {
  459 + buf->ops = NULL;
  460 + ops->release(info, buf);
  461 + curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1);
  462 + info->curbuf = curbuf;
  463 + info->nrbufs = --bufs;
  464 + do_wakeup = 1;
  465 + }
  466 +
  467 + sd.pos += sd.len;
  468 + sd.total_len -= sd.len;
  469 + if (!sd.total_len)
  470 + break;
  471 + }
  472 +
  473 + if (bufs)
  474 + continue;
  475 + if (!PIPE_WRITERS(*inode))
  476 + break;
  477 + if (!PIPE_WAITING_WRITERS(*inode)) {
  478 + if (ret)
  479 + break;
  480 + }
  481 +
  482 + if (signal_pending(current)) {
  483 + if (!ret)
  484 + ret = -ERESTARTSYS;
  485 + break;
  486 + }
  487 +
  488 + if (do_wakeup) {
  489 + wake_up_interruptible_sync(PIPE_WAIT(*inode));
  490 + kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT);
  491 + do_wakeup = 0;
  492 + }
  493 +
  494 + pipe_wait(inode);
  495 + }
  496 +
  497 + mutex_unlock(PIPE_MUTEX(*inode));
  498 +
  499 + if (do_wakeup) {
  500 + wake_up_interruptible(PIPE_WAIT(*inode));
  501 + kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
  502 + }
  503 +
  504 + mutex_lock(&out->f_mapping->host->i_mutex);
  505 + out->f_pos = sd.pos;
  506 + mutex_unlock(&out->f_mapping->host->i_mutex);
  507 + return ret;
  508 +
  509 +}
  510 +
  511 +ssize_t generic_file_splice_write(struct inode *inode, struct file *out,
  512 + size_t len, unsigned int flags)
  513 +{
  514 + return move_from_pipe(inode, out, len, flags, pipe_to_file);
  515 +}
  516 +
  517 +ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
  518 + size_t len, unsigned int flags)
  519 +{
  520 + return move_from_pipe(inode, out, len, flags, pipe_to_sendpage);
  521 +}
  522 +
  523 +static long do_splice_from(struct inode *pipe, struct file *out, size_t len,
  524 + unsigned int flags)
  525 +{
  526 + loff_t pos;
  527 + int ret;
  528 +
  529 + if (!out->f_op || !out->f_op->splice_write)
  530 + return -EINVAL;
  531 +
  532 + if (!(out->f_mode & FMODE_WRITE))
  533 + return -EBADF;
  534 +
  535 + pos = out->f_pos;
  536 + ret = rw_verify_area(WRITE, out, &pos, len);
  537 + if (unlikely(ret < 0))
  538 + return ret;
  539 +
  540 + return out->f_op->splice_write(pipe, out, len, flags);
  541 +}
  542 +
  543 +static long do_splice_to(struct file *in, struct inode *pipe, size_t len,
  544 + unsigned int flags)
  545 +{
  546 + loff_t pos, isize, left;
  547 + int ret;
  548 +
  549 + if (!in->f_op || !in->f_op->splice_read)
  550 + return -EINVAL;
  551 +
  552 + if (!(in->f_mode & FMODE_READ))
  553 + return -EBADF;
  554 +
  555 + pos = in->f_pos;
  556 + ret = rw_verify_area(READ, in, &pos, len);
  557 + if (unlikely(ret < 0))
  558 + return ret;
  559 +
  560 + isize = i_size_read(in->f_mapping->host);
  561 + if (unlikely(in->f_pos >= isize))
  562 + return 0;
  563 +
  564 + left = isize - in->f_pos;
  565 + if (left < len)
  566 + len = left;
  567 +
  568 + return in->f_op->splice_read(in, pipe, len, flags);
  569 +}
  570 +
  571 +static long do_splice(struct file *in, struct file *out, size_t len,
  572 + unsigned int flags)
  573 +{
  574 + struct inode *pipe;
  575 +
  576 + pipe = in->f_dentry->d_inode;
  577 + if (pipe->i_pipe)
  578 + return do_splice_from(pipe, out, len, flags);
  579 +
  580 + pipe = out->f_dentry->d_inode;
  581 + if (pipe->i_pipe)
  582 + return do_splice_to(in, pipe, len, flags);
  583 +
  584 + return -EINVAL;
  585 +}
  586 +
  587 +asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags)
  588 +{
  589 + long error;
  590 + struct file *in, *out;
  591 + int fput_in, fput_out;
  592 +
  593 + if (unlikely(!len))
  594 + return 0;
  595 +
  596 + error = -EBADF;
  597 + in = fget_light(fdin, &fput_in);
  598 + if (in) {
  599 + if (in->f_mode & FMODE_READ) {
  600 + out = fget_light(fdout, &fput_out);
  601 + if (out) {
  602 + if (out->f_mode & FMODE_WRITE)
  603 + error = do_splice(in, out, len, flags);
  604 + fput_light(out, fput_out);
  605 + }
  606 + }
  607 +
  608 + fput_light(in, fput_in);
  609 + }
  610 +
  611 + return error;
  612 +}
include/asm-i386/unistd.h
... ... @@ -318,8 +318,9 @@
318 318 #define __NR_unshare 310
319 319 #define __NR_set_robust_list 311
320 320 #define __NR_get_robust_list 312
  321 +#define __NR_sys_splice 313
321 322  
322   -#define NR_syscalls 313
  323 +#define NR_syscalls 314
323 324  
324 325 /*
325 326 * user-visible error numbers are in the range -1 - -128: see
include/asm-ia64/unistd.h
... ... @@ -285,12 +285,13 @@
285 285 #define __NR_faccessat 1293
286 286 /* 1294, 1295 reserved for pselect/ppoll */
287 287 #define __NR_unshare 1296
  288 +#define __NR_splice 1297
288 289  
289 290 #ifdef __KERNEL__
290 291  
291 292 #include <linux/config.h>
292 293  
293   -#define NR_syscalls 273 /* length of syscall table */
  294 +#define NR_syscalls 274 /* length of syscall table */
294 295  
295 296 #define __ARCH_WANT_SYS_RT_SIGACTION
296 297  
include/asm-powerpc/unistd.h
... ... @@ -301,8 +301,9 @@
301 301 #define __NR_pselect6 280
302 302 #define __NR_ppoll 281
303 303 #define __NR_unshare 282
  304 +#define __NR_splice 283
304 305  
305   -#define __NR_syscalls 283
  306 +#define __NR_syscalls 284
306 307  
307 308 #ifdef __KERNEL__
308 309 #define __NR__exit __NR_exit
include/asm-x86_64/unistd.h
... ... @@ -609,8 +609,10 @@
609 609 __SYSCALL(__NR_set_robust_list, sys_set_robust_list)
610 610 #define __NR_get_robust_list 274
611 611 __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
  612 +#define __NR_splice 275
  613 +__SYSCALL(__NR_splice, sys_splice)
612 614  
613   -#define __NR_syscall_max __NR_get_robust_list
  615 +#define __NR_syscall_max __NR_splice
614 616  
615 617 #ifndef __NO_STUBS
616 618  
... ... @@ -1032,6 +1032,8 @@
1032 1032 int (*check_flags)(int);
1033 1033 int (*dir_notify)(struct file *filp, unsigned long arg);
1034 1034 int (*flock) (struct file *, int, struct file_lock *);
  1035 + ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int);
  1036 + ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int);
1035 1037 };
1036 1038  
1037 1039 struct inode_operations {
... ... @@ -1609,6 +1611,8 @@
1609 1611 extern void do_generic_mapping_read(struct address_space *mapping,
1610 1612 struct file_ra_state *, struct file *,
1611 1613 loff_t *, read_descriptor_t *, read_actor_t);
  1614 +extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int);
  1615 +extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int);
1612 1616 extern void
1613 1617 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
1614 1618 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
include/linux/syscalls.h
... ... @@ -569,6 +569,8 @@
569 569 asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename,
570 570 int flags, int mode);
571 571 asmlinkage long sys_unshare(unsigned long unshare_flags);
  572 +asmlinkage long sys_splice(int fdin, int fdout, size_t len,
  573 + unsigned int flags);
572 574  
573 575 #endif
... ... @@ -119,7 +119,10 @@
119 119 static ssize_t sock_sendpage(struct file *file, struct page *page,
120 120 int offset, size_t size, loff_t *ppos, int more);
121 121  
  122 +extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out,
  123 + size_t len, unsigned int flags);
122 124  
  125 +
123 126 /*
124 127 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
125 128 * in the operation structures but are done directly via the socketcall() multiplexor.
... ... @@ -141,7 +144,8 @@
141 144 .fasync = sock_fasync,
142 145 .readv = sock_readv,
143 146 .writev = sock_writev,
144   - .sendpage = sock_sendpage
  147 + .sendpage = sock_sendpage,
  148 + .splice_write = generic_splice_sendpage,
145 149 };
146 150  
147 151 /*