Commit b92ce55893745e011edae70830b8bc863be881f9

Authored by Jens Axboe
1 parent 529565dcb1

[PATCH] splice: add direct fd <-> fd splicing support

Direct splicing is more efficient for sendfile() emulation. We cache an
internal, private pipe in the task (current->splice_pipe) and use it as
the intermediate area for pages. Direct splicing is not available from
sys_splice(); it is only meant to be used for sendfile() emulation.

Additional patch from Ingo Molnar to avoid the PIPE_BUFFERS release loop
on return from do_splice_direct() in the normal fast path.

Signed-off-by: Jens Axboe <axboe@suse.de>

Showing 6 changed files with 150 additions and 21 deletions Side-by-side Diff

... ... @@ -691,12 +691,10 @@
691 691 return info;
692 692 }
693 693  
694   -void free_pipe_info(struct inode *inode)
  694 +void __free_pipe_info(struct pipe_inode_info *info)
695 695 {
696 696 int i;
697   - struct pipe_inode_info *info = inode->i_pipe;
698 697  
699   - inode->i_pipe = NULL;
700 698 for (i = 0; i < PIPE_BUFFERS; i++) {
701 699 struct pipe_buffer *buf = info->bufs + i;
702 700 if (buf->ops)
... ... @@ -705,6 +703,12 @@
705 703 if (info->tmp_page)
706 704 __free_page(info->tmp_page);
707 705 kfree(info);
  706 +}
  707 +
  708 +void free_pipe_info(struct inode *inode)
  709 +{
  710 + __free_pipe_info(inode->i_pipe);
  711 + inode->i_pipe = NULL;
708 712 }
709 713  
710 714 static struct vfsmount *pipe_mnt __read_mostly;
... ... @@ -680,8 +680,7 @@
680 680 * Attempt to initiate a splice from pipe to file.
681 681 */
682 682 static long do_splice_from(struct pipe_inode_info *pipe, struct file *out,
683   - loff_t __user *off_out, size_t len,
684   - unsigned int flags)
  683 + size_t len, unsigned int flags)
685 684 {
686 685 loff_t pos;
687 686 int ret;
... ... @@ -692,9 +691,6 @@
692 691 if (!(out->f_mode & FMODE_WRITE))
693 692 return -EBADF;
694 693  
695   - if (off_out && copy_from_user(&out->f_pos, off_out, sizeof(loff_t)))
696   - return -EFAULT;
697   -
698 694 pos = out->f_pos;
699 695  
700 696 ret = rw_verify_area(WRITE, out, &pos, len);
... ... @@ -707,9 +703,8 @@
707 703 /*
708 704 * Attempt to initiate a splice from a file to a pipe.
709 705 */
710   -static long do_splice_to(struct file *in, loff_t __user *off_in,
711   - struct pipe_inode_info *pipe, size_t len,
712   - unsigned int flags)
  706 +static long do_splice_to(struct file *in, struct pipe_inode_info *pipe,
  707 + size_t len, unsigned int flags)
713 708 {
714 709 loff_t pos, isize, left;
715 710 int ret;
... ... @@ -720,9 +715,6 @@
720 715 if (!(in->f_mode & FMODE_READ))
721 716 return -EBADF;
722 717  
723   - if (off_in && copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
724   - return -EFAULT;
725   -
726 718 pos = in->f_pos;
727 719  
728 720 ret = rw_verify_area(READ, in, &pos, len);
... ... @@ -740,6 +732,118 @@
740 732 return in->f_op->splice_read(in, pipe, len, flags);
741 733 }
742 734  
  735 +long do_splice_direct(struct file *in, struct file *out, size_t len,
  736 + unsigned int flags)
  737 +{
  738 + struct pipe_inode_info *pipe;
  739 + long ret, bytes;
  740 + umode_t i_mode;
  741 + int i;
  742 +
  743 + /*
  744 + * We require the input to be a regular file or block device, as we
  745 + * don't want to randomly drop data for e.g. socket -> socket splicing.
  746 + * Use the piped splicing for that!
  747 + */
  748 + i_mode = in->f_dentry->d_inode->i_mode;
  749 + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
  750 + return -EINVAL;
  751 +
  752 + /*
  753 + * neither in nor out is a pipe, set up an internal pipe cached on the
  754 + * task and transfer the wanted data from 'in' to 'out' through that
  755 + */
  756 + pipe = current->splice_pipe;
  757 + if (!pipe) {
  758 + pipe = alloc_pipe_info(NULL);
  759 + if (!pipe)
  760 + return -ENOMEM;
  761 +
  762 + /*
  763 + * We don't have an immediate reader, but we'll read the stuff
  764 + * out of the pipe right after the move_to_pipe(). So set
  765 + * PIPE_READERS appropriately.
  766 + */
  767 + pipe->readers = 1;
  768 +
  769 + current->splice_pipe = pipe;
  770 + }
  771 +
  772 + /*
  773 + * do the splice
  774 + */
  775 + ret = 0;
  776 + bytes = 0;
  777 +
  778 + while (len) {
  779 + size_t read_len, max_read_len;
  780 +
  781 + /*
  782 + * Do at most PIPE_BUFFERS pages worth of transfer:
  783 + */
  784 + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE));
  785 +
  786 + ret = do_splice_to(in, pipe, max_read_len, flags);
  787 + if (unlikely(ret < 0))
  788 + goto out_release;
  789 +
  790 + read_len = ret;
  791 +
  792 + /*
  793 + * NOTE: nonblocking mode only applies to the input. We
  794 + * must not do the output in nonblocking mode as then we
  795 + * could get stuck data in the internal pipe:
  796 + */
  797 + ret = do_splice_from(pipe, out, read_len,
  798 + flags & ~SPLICE_F_NONBLOCK);
  799 + if (unlikely(ret < 0))
  800 + goto out_release;
  801 +
  802 + bytes += ret;
  803 + len -= ret;
  804 +
  805 + /*
  806 + * In nonblocking mode, if we got back a short read then
  807 + * that was due to either an IO error or due to the
  808 + * pagecache entry not being there. In the IO error case
  809 + * the _next_ splice attempt will produce a clean IO error
  810 + * return value (not a short read), so in both cases it's
  811 + * correct to break out of the loop here:
  812 + */
  813 + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len))
  814 + break;
  815 + }
  816 +
  817 + pipe->nrbufs = pipe->curbuf = 0;
  818 +
  819 + return bytes;
  820 +
  821 +out_release:
  822 + /*
  823 + * If we did an incomplete transfer we must release
  824 + * the pipe buffers in question:
  825 + */
  826 + for (i = 0; i < PIPE_BUFFERS; i++) {
  827 + struct pipe_buffer *buf = pipe->bufs + i;
  828 +
  829 + if (buf->ops) {
  830 + buf->ops->release(pipe, buf);
  831 + buf->ops = NULL;
  832 + }
  833 + }
  834 + pipe->nrbufs = pipe->curbuf = 0;
  835 +
  836 + /*
  837 + * If we transferred some data, return the number of bytes:
  838 + */
  839 + if (bytes > 0)
  840 + return bytes;
  841 +
  842 + return ret;
  843 +}
  844 +
  845 +EXPORT_SYMBOL(do_splice_direct);
  846 +
743 847 /*
744 848 * Determine where to splice to/from.
745 849 */
746 850  
747 851  
748 852  
749 853  
... ... @@ -749,25 +853,33 @@
749 853 {
750 854 struct pipe_inode_info *pipe;
751 855  
752   - if (off_out && out->f_op->llseek == no_llseek)
753   - return -EINVAL;
754   - if (off_in && in->f_op->llseek == no_llseek)
755   - return -EINVAL;
756   -
757 856 pipe = in->f_dentry->d_inode->i_pipe;
758 857 if (pipe) {
759 858 if (off_in)
760 859 return -ESPIPE;
  860 + if (off_out) {
  861 + if (out->f_op->llseek == no_llseek)
  862 + return -EINVAL;
  863 + if (copy_from_user(&out->f_pos, off_out,
  864 + sizeof(loff_t)))
  865 + return -EFAULT;
  866 + }
761 867  
762   - return do_splice_from(pipe, out, off_out, len, flags);
  868 + return do_splice_from(pipe, out, len, flags);
763 869 }
764 870  
765 871 pipe = out->f_dentry->d_inode->i_pipe;
766 872 if (pipe) {
767 873 if (off_out)
768 874 return -ESPIPE;
  875 + if (off_in) {
  876 + if (in->f_op->llseek == no_llseek)
  877 + return -EINVAL;
  878 + if (copy_from_user(&in->f_pos, off_in, sizeof(loff_t)))
  879 + return -EFAULT;
  880 + }
769 881  
770   - return do_splice_to(in, off_in, pipe, len, flags);
  882 + return do_splice_to(in, pipe, len, flags);
771 883 }
772 884  
773 885 return -EINVAL;
... ... @@ -1613,6 +1613,8 @@
1613 1613 loff_t *, read_descriptor_t *, read_actor_t);
1614 1614 extern ssize_t generic_file_splice_read(struct file *, struct pipe_inode_info *, size_t, unsigned int);
1615 1615 extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, size_t, unsigned int);
  1616 +extern long do_splice_direct(struct file *in, struct file *out,
  1617 + size_t len, unsigned int flags);
1616 1618 extern void
1617 1619 file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping);
1618 1620 extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov,
include/linux/pipe_fs_i.h
... ... @@ -58,6 +58,7 @@
58 58  
59 59 struct pipe_inode_info * alloc_pipe_info(struct inode * inode);
60 60 void free_pipe_info(struct inode * inode);
  61 +void __free_pipe_info(struct pipe_inode_info *);
61 62  
62 63 /*
63 64 * splice is tied to pipes as a transport (at least for now), so we'll just
include/linux/sched.h
... ... @@ -684,6 +684,7 @@
684 684  
685 685 struct audit_context; /* See audit.c */
686 686 struct mempolicy;
  687 +struct pipe_inode_info;
687 688  
688 689 enum sleep_type {
689 690 SLEEP_NORMAL,
... ... @@ -882,6 +883,11 @@
882 883  
883 884 atomic_t fs_excl; /* holding fs exclusive resources */
884 885 struct rcu_head rcu;
  886 +
  887 + /*
  888 + * cache last used pipe for splice
  889 + */
  890 + struct pipe_inode_info *splice_pipe;
885 891 };
886 892  
887 893 static inline pid_t process_group(struct task_struct *tsk)
... ... @@ -34,6 +34,7 @@
34 34 #include <linux/mutex.h>
35 35 #include <linux/futex.h>
36 36 #include <linux/compat.h>
  37 +#include <linux/pipe_fs_i.h>
37 38  
38 39 #include <asm/uaccess.h>
39 40 #include <asm/unistd.h>
... ... @@ -940,6 +941,9 @@
940 941  
941 942 if (tsk->io_context)
942 943 exit_io_context();
  944 +
  945 + if (tsk->splice_pipe)
  946 + __free_pipe_info(tsk->splice_pipe);
943 947  
944 948 /* PF_DEAD causes final put_task_struct after we schedule. */
945 949 preempt_disable();