Commit 70524490ee2ea1bbf6cee6c106597b3ac25a3fc2

Authored by Jens Axboe
1 parent cbb7e577e7

[PATCH] splice: add support for sys_tee()

Basically an in-kernel implementation of tee, which uses splice and the
pipe buffers as an intelligent way to pass data around by reference.

Where the user space tee consumes the input and produces a stdout and
file output, this syscall merely duplicates the data inside a pipe to
another pipe. No data is copied, the output just grabs a reference to the
input pipe data.

Signed-off-by: Jens Axboe <axboe@suse.de>

Showing 11 changed files with 208 additions and 4 deletions Side-by-side Diff

arch/i386/kernel/syscall_table.S
... ... @@ -314,4 +314,5 @@
314 314 .long sys_get_robust_list
315 315 .long sys_splice
316 316 .long sys_sync_file_range
  317 + .long sys_tee /* 315 */
arch/ia64/kernel/entry.S
... ... @@ -1609,6 +1609,7 @@
1609 1609 data8 sys_set_robust_list
1610 1610 data8 sys_get_robust_list
1611 1611 data8 sys_sync_file_range // 1300
  1612 + data8 sys_tee
1612 1613  
1613 1614 .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls
arch/powerpc/kernel/systbl.S
... ... @@ -323,4 +323,5 @@
323 323 COMPAT_SYS(ppoll)
324 324 SYSCALL(unshare)
325 325 SYSCALL(splice)
  326 +SYSCALL(tee)
fs/pipe.c
... ... @@ -131,12 +131,19 @@
131 131 return 0;
132 132 }
133 133  
  134 +static void anon_pipe_buf_get(struct pipe_inode_info *info,
  135 + struct pipe_buffer *buf)
  136 +{
  137 + page_cache_get(buf->page);
  138 +}
  139 +
134 140 static struct pipe_buf_operations anon_pipe_buf_ops = {
135 141 .can_merge = 1,
136 142 .map = anon_pipe_buf_map,
137 143 .unmap = anon_pipe_buf_unmap,
138 144 .release = anon_pipe_buf_release,
139 145 .steal = anon_pipe_buf_steal,
  146 + .get = anon_pipe_buf_get,
140 147 };
141 148  
142 149 static ssize_t
fs/splice.c
... ... @@ -125,12 +125,19 @@
125 125 kunmap(buf->page);
126 126 }
127 127  
  128 +static void page_cache_pipe_buf_get(struct pipe_inode_info *info,
  129 + struct pipe_buffer *buf)
  130 +{
  131 + page_cache_get(buf->page);
  132 +}
  133 +
128 134 static struct pipe_buf_operations page_cache_pipe_buf_ops = {
129 135 .can_merge = 0,
130 136 .map = page_cache_pipe_buf_map,
131 137 .unmap = page_cache_pipe_buf_unmap,
132 138 .release = page_cache_pipe_buf_release,
133 139 .steal = page_cache_pipe_buf_steal,
  140 + .get = page_cache_pipe_buf_get,
134 141 };
135 142  
136 143 /*
... ... @@ -960,6 +967,185 @@
960 967  
961 968 fput_light(in, fput_in);
962 969 }
  970 +
  971 + return error;
  972 +}
  973 +
  974 +/*
  975 + * Link contents of ipipe to opipe.
  976 + */
  977 +static int link_pipe(struct pipe_inode_info *ipipe,
  978 + struct pipe_inode_info *opipe,
  979 + size_t len, unsigned int flags)
  980 +{
  981 + struct pipe_buffer *ibuf, *obuf;
  982 + int ret = 0, do_wakeup = 0, i;
  983 +
  984 + /*
  985 + * Potential ABBA deadlock, work around it by ordering lock
  986 + * grabbing by inode address. Otherwise two different processes
  987 + * could deadlock (one doing tee from A -> B, the other from B -> A).
  988 + */
  989 + if (ipipe->inode < opipe->inode) {
  990 + mutex_lock(&ipipe->inode->i_mutex);
  991 + mutex_lock(&opipe->inode->i_mutex);
  992 + } else {
  993 + mutex_lock(&opipe->inode->i_mutex);
  994 + mutex_lock(&ipipe->inode->i_mutex);
  995 + }
  996 +
  997 + for (i = 0;; i++) {
  998 + if (!opipe->readers) {
  999 + send_sig(SIGPIPE, current, 0);
  1000 + if (!ret)
  1001 + ret = -EPIPE;
  1002 + break;
  1003 + }
  1004 + if (ipipe->nrbufs - i) {
  1005 + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
  1006 +
  1007 + /*
  1008 + * If we have room, fill this buffer
  1009 + */
  1010 + if (opipe->nrbufs < PIPE_BUFFERS) {
  1011 + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);
  1012 +
  1013 + /*
  1014 + * Get a reference to this pipe buffer,
  1015 + * so we can copy the contents over.
  1016 + */
  1017 + ibuf->ops->get(ipipe, ibuf);
  1018 +
  1019 + obuf = opipe->bufs + nbuf;
  1020 + *obuf = *ibuf;
  1021 +
  1022 + if (obuf->len > len)
  1023 + obuf->len = len;
  1024 +
  1025 + opipe->nrbufs++;
  1026 + do_wakeup = 1;
  1027 + ret += obuf->len;
  1028 + len -= obuf->len;
  1029 +
  1030 + if (!len)
  1031 + break;
  1032 + if (opipe->nrbufs < PIPE_BUFFERS)
  1033 + continue;
  1034 + }
  1035 +
  1036 + /*
  1037 + * We have input available, but no output room.
  1038 + * If we already copied data, return that.
  1039 + */
  1040 + if (flags & SPLICE_F_NONBLOCK) {
  1041 + if (!ret)
  1042 + ret = -EAGAIN;
  1043 + break;
  1044 + }
  1045 + if (signal_pending(current)) {
  1046 + if (!ret)
  1047 + ret = -ERESTARTSYS;
  1048 + break;
  1049 + }
  1050 + if (do_wakeup) {
  1051 + smp_mb();
  1052 + if (waitqueue_active(&opipe->wait))
  1053 + wake_up_interruptible(&opipe->wait);
  1054 + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
  1055 + do_wakeup = 0;
  1056 + }
  1057 +
  1058 + opipe->waiting_writers++;
  1059 + pipe_wait(opipe);
  1060 + opipe->waiting_writers--;
  1061 + continue;
  1062 + }
  1063 +
  1064 + /*
  1065 + * No input buffers, do the usual checks for available
  1066 + * writers and blocking and wait if necessary
  1067 + */
  1068 + if (!ipipe->writers)
  1069 + break;
  1070 + if (!ipipe->waiting_writers) {
  1071 + if (ret)
  1072 + break;
  1073 + }
  1074 + if (flags & SPLICE_F_NONBLOCK) {
  1075 + if (!ret)
  1076 + ret = -EAGAIN;
  1077 + break;
  1078 + }
  1079 + if (signal_pending(current)) {
  1080 + if (!ret)
  1081 + ret = -ERESTARTSYS;
  1082 + break;
  1083 + }
  1084 +
  1085 + if (waitqueue_active(&ipipe->wait))
  1086 + wake_up_interruptible_sync(&ipipe->wait);
  1087 + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT);
  1088 +
  1089 + pipe_wait(ipipe);
  1090 + }
  1091 +
  1092 + mutex_unlock(&ipipe->inode->i_mutex);
  1093 + mutex_unlock(&opipe->inode->i_mutex);
  1094 +
  1095 + if (do_wakeup) {
  1096 + smp_mb();
  1097 + if (waitqueue_active(&opipe->wait))
  1098 + wake_up_interruptible(&opipe->wait);
  1099 + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
  1100 + }
  1101 +
  1102 + return ret;
  1103 +}
  1104 +
  1105 +/*
  1106 + * This is a tee(1) implementation that works on pipes. It doesn't copy
  1107 + * any data, it simply references the 'in' pages on the 'out' pipe.
  1108 + * The 'flags' used are the SPLICE_F_* variants, currently the only
  1109 + * applicable one is SPLICE_F_NONBLOCK.
  1110 + */
  1111 +static long do_tee(struct file *in, struct file *out, size_t len,
  1112 + unsigned int flags)
  1113 +{
  1114 + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe;
  1115 + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe;
  1116 +
  1117 + /*
  1118 + * Link ipipe to the output pipe by reference; the input is not consumed.
  1119 + */
  1120 + if (ipipe && opipe)
  1121 + return link_pipe(ipipe, opipe, len, flags);
  1122 +
  1123 + return -EINVAL;
  1124 +}
  1125 +
  1126 +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags)
  1127 +{
  1128 + struct file *in;
  1129 + int error, fput_in;
  1130 +
  1131 + if (unlikely(!len))
  1132 + return 0;
  1133 +
  1134 + error = -EBADF;
  1135 + in = fget_light(fdin, &fput_in);
  1136 + if (in) {
  1137 + if (in->f_mode & FMODE_READ) {
  1138 + int fput_out;
  1139 + struct file *out = fget_light(fdout, &fput_out);
  1140 +
  1141 + if (out) {
  1142 + if (out->f_mode & FMODE_WRITE)
  1143 + error = do_tee(in, out, len, flags);
  1144 + fput_light(out, fput_out);
  1145 + }
  1146 + }
  1147 + fput_light(in, fput_in);
  1148 + }
963 1149  
964 1150 return error;
965 1151 }
include/asm-i386/unistd.h
... ... @@ -320,8 +320,9 @@
320 320 #define __NR_get_robust_list 312
321 321 #define __NR_splice 313
322 322 #define __NR_sync_file_range 314
  323 +#define __NR_tee 315
323 324  
324   -#define NR_syscalls 315
  325 +#define NR_syscalls 316
325 326  
326 327 /*
327 328 * user-visible error numbers are in the range -1 - -128: see
include/asm-ia64/unistd.h
... ... @@ -289,12 +289,13 @@
289 289 #define __NR_set_robust_list 1298
290 290 #define __NR_get_robust_list 1299
291 291 #define __NR_sync_file_range 1300
  292 +#define __NR_tee 1301
292 293  
293 294 #ifdef __KERNEL__
294 295  
295 296 #include <linux/config.h>
296 297  
297   -#define NR_syscalls 277 /* length of syscall table */
  298 +#define NR_syscalls 278 /* length of syscall table */
298 299  
299 300 #define __ARCH_WANT_SYS_RT_SIGACTION
300 301  
include/asm-powerpc/unistd.h
... ... @@ -302,8 +302,9 @@
302 302 #define __NR_ppoll 281
303 303 #define __NR_unshare 282
304 304 #define __NR_splice 283
  305 +#define __NR_tee 284
305 306  
306   -#define __NR_syscalls 284
  307 +#define __NR_syscalls 285
307 308  
308 309 #ifdef __KERNEL__
309 310 #define __NR__exit __NR_exit
include/asm-x86_64/unistd.h
... ... @@ -611,8 +611,10 @@
611 611 __SYSCALL(__NR_get_robust_list, sys_get_robust_list)
612 612 #define __NR_splice 275
613 613 __SYSCALL(__NR_splice, sys_splice)
  614 +#define __NR_tee 276
  615 +__SYSCALL(__NR_tee, sys_tee)
614 616  
615   -#define __NR_syscall_max __NR_splice
  617 +#define __NR_syscall_max __NR_tee
616 618  
617 619 #ifndef __NO_STUBS
618 620  
include/linux/pipe_fs_i.h
... ... @@ -21,6 +21,7 @@
21 21 void (*unmap)(struct pipe_inode_info *, struct pipe_buffer *);
22 22 void (*release)(struct pipe_inode_info *, struct pipe_buffer *);
23 23 int (*steal)(struct pipe_inode_info *, struct pipe_buffer *);
  24 + void (*get)(struct pipe_inode_info *, struct pipe_buffer *);
24 25 };
25 26  
26 27 struct pipe_inode_info {
include/linux/syscalls.h
... ... @@ -574,6 +574,8 @@
574 574 int fd_out, loff_t __user *off_out,
575 575 size_t len, unsigned int flags);
576 576  
  577 +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
  578 +
577 579 asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
578 580 unsigned int flags);
579 581