Commit 5274f052e7b3dbd81935772eb551dfd0325dfa9d
Committed by Linus Torvalds
1 parent: 5d4fe2c1ce
Exists in master and in 4 other branches
[PATCH] Introduce sys_splice() system call
This adds support for the sys_splice system call. Using a pipe as a transport, it can connect to files or sockets (the latter as output only).

From the splice.c comments:

"splice": joining two ropes together by interweaving their strands.

This is the "extended pipe" functionality, where a pipe is used as an arbitrary in-memory buffer. Think of a pipe as a small kernel buffer that you can use to transfer data from one end to the other.

The traditional unix read/write is extended with a "splice()" operation that transfers data buffers to or from a pipe buffer.

Named by Larry McVoy, original implementation from Linus, extended by Jens to support splicing to files and fixing the initial implementation bugs.

Signed-off-by: Jens Axboe <axboe@suse.de>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Showing 15 changed files with 669 additions and 11 deletions (side-by-side diff)
- arch/i386/kernel/syscall_table.S
- arch/ia64/kernel/entry.S
- fs/Makefile
- fs/ext2/file.c
- fs/ext3/file.c
- fs/pipe.c
- fs/reiserfs/file.c
- fs/splice.c
- include/asm-i386/unistd.h
- include/asm-ia64/unistd.h
- include/asm-powerpc/unistd.h
- include/asm-x86_64/unistd.h
- include/linux/fs.h
- include/linux/syscalls.h
- net/socket.c
arch/i386/kernel/syscall_table.S
arch/ia64/kernel/entry.S
... | ... | @@ -1605,6 +1605,7 @@ |
1605 | 1605 | data8 sys_ni_syscall // reserved for pselect |
1606 | 1606 | data8 sys_ni_syscall // 1295 reserved for ppoll |
1607 | 1607 | data8 sys_unshare |
1608 | + data8 sys_splice | |
1608 | 1609 | |
1609 | 1610 | .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls |
fs/Makefile
... | ... | @@ -10,7 +10,7 @@ |
10 | 10 | ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ |
11 | 11 | attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ |
12 | 12 | seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ |
13 | - ioprio.o pnode.o drop_caches.o | |
13 | + ioprio.o pnode.o drop_caches.o splice.o | |
14 | 14 | |
15 | 15 | obj-$(CONFIG_INOTIFY) += inotify.o |
16 | 16 | obj-$(CONFIG_EPOLL) += eventpoll.o |
fs/ext2/file.c
fs/ext3/file.c
... | ... | @@ -119,6 +119,8 @@ |
119 | 119 | .release = ext3_release_file, |
120 | 120 | .fsync = ext3_sync_file, |
121 | 121 | .sendfile = generic_file_sendfile, |
122 | + .splice_read = generic_file_splice_read, | |
123 | + .splice_write = generic_file_splice_write, | |
122 | 124 | }; |
123 | 125 | |
124 | 126 | struct inode_operations ext3_file_inode_operations = { |
fs/pipe.c
... | ... | @@ -15,6 +15,7 @@ |
15 | 15 | #include <linux/pipe_fs_i.h> |
16 | 16 | #include <linux/uio.h> |
17 | 17 | #include <linux/highmem.h> |
18 | +#include <linux/pagemap.h> | |
18 | 19 | |
19 | 20 | #include <asm/uaccess.h> |
20 | 21 | #include <asm/ioctls.h> |
21 | 22 | |
... | ... | @@ -94,11 +95,20 @@ |
94 | 95 | { |
95 | 96 | struct page *page = buf->page; |
96 | 97 | |
97 | - if (info->tmp_page) { | |
98 | - __free_page(page); | |
98 | + /* | |
99 | + * If nobody else uses this page, and we don't already have a | |
100 | + * temporary page, let's keep track of it as a one-deep | |
101 | + * allocation cache | |
102 | + */ | |
103 | + if (page_count(page) == 1 && !info->tmp_page) { | |
104 | + info->tmp_page = page; | |
99 | 105 | return; |
100 | 106 | } |
101 | - info->tmp_page = page; | |
107 | + | |
108 | + /* | |
109 | + * Otherwise just release our reference to it | |
110 | + */ | |
111 | + page_cache_release(page); | |
102 | 112 | } |
103 | 113 | |
104 | 114 | static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) |
... | ... | @@ -152,6 +162,11 @@ |
152 | 162 | chars = total_len; |
153 | 163 | |
154 | 164 | addr = ops->map(filp, info, buf); |
165 | + if (IS_ERR(addr)) { | |
166 | + if (!ret) | |
167 | + ret = PTR_ERR(addr); | |
168 | + break; | |
169 | + } | |
155 | 170 | error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); |
156 | 171 | ops->unmap(info, buf); |
157 | 172 | if (unlikely(error)) { |
... | ... | @@ -254,8 +269,16 @@ |
254 | 269 | struct pipe_buf_operations *ops = buf->ops; |
255 | 270 | int offset = buf->offset + buf->len; |
256 | 271 | if (ops->can_merge && offset + chars <= PAGE_SIZE) { |
257 | - void *addr = ops->map(filp, info, buf); | |
258 | - int error = pipe_iov_copy_from_user(offset + addr, iov, chars); | |
272 | + void *addr; | |
273 | + int error; | |
274 | + | |
275 | + addr = ops->map(filp, info, buf); | |
276 | + if (IS_ERR(addr)) { | |
277 | + error = PTR_ERR(addr); | |
278 | + goto out; | |
279 | + } | |
280 | + error = pipe_iov_copy_from_user(offset + addr, iov, | |
281 | + chars); | |
259 | 282 | ops->unmap(info, buf); |
260 | 283 | ret = error; |
261 | 284 | do_wakeup = 1; |
fs/reiserfs/file.c
... | ... | @@ -1576,6 +1576,8 @@ |
1576 | 1576 | .sendfile = generic_file_sendfile, |
1577 | 1577 | .aio_read = generic_file_aio_read, |
1578 | 1578 | .aio_write = reiserfs_aio_write, |
1579 | + .splice_read = generic_file_splice_read, | |
1580 | + .splice_write = generic_file_splice_write, | |
1579 | 1581 | }; |
1580 | 1582 | |
1581 | 1583 | struct inode_operations reiserfs_file_inode_operations = { |
fs/splice.c
1 | +/* | |
2 | + * "splice": joining two ropes together by interweaving their strands. | |
3 | + * | |
4 | + * This is the "extended pipe" functionality, where a pipe is used as | |
5 | + * an arbitrary in-memory buffer. Think of a pipe as a small kernel | |
6 | + * buffer that you can use to transfer data from one end to the other. | |
7 | + * | |
8 | + * The traditional unix read/write is extended with a "splice()" operation | |
9 | + * that transfers data buffers to or from a pipe buffer. | |
10 | + * | |
11 | + * Named by Larry McVoy, original implementation from Linus, extended by | |
12 | + * Jens to support splicing to files and fixing the initial implementation | |
13 | + * bugs. | |
14 | + * | |
15 | + * Copyright (C) 2005 Jens Axboe <axboe@suse.de> | |
16 | + * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> | |
17 | + * | |
18 | + */ | |
19 | +#include <linux/fs.h> | |
20 | +#include <linux/file.h> | |
21 | +#include <linux/pagemap.h> | |
22 | +#include <linux/pipe_fs_i.h> | |
23 | +#include <linux/mm_inline.h> | |
24 | + | |
25 | +/* | |
26 | + * Passed to the actors | |
27 | + */ | |
28 | +struct splice_desc { | |
29 | + unsigned int len, total_len; /* current and remaining length */ | |
30 | + unsigned int flags; /* splice flags */ | |
31 | + struct file *file; /* file to read/write */ | |
32 | + loff_t pos; /* file position */ | |
33 | +}; | |
34 | + | |
35 | +static void page_cache_pipe_buf_release(struct pipe_inode_info *info, | |
36 | + struct pipe_buffer *buf) | |
37 | +{ | |
38 | + page_cache_release(buf->page); | |
39 | + buf->page = NULL; | |
40 | +} | |
41 | + | |
42 | +static void *page_cache_pipe_buf_map(struct file *file, | |
43 | + struct pipe_inode_info *info, | |
44 | + struct pipe_buffer *buf) | |
45 | +{ | |
46 | + struct page *page = buf->page; | |
47 | + | |
48 | + lock_page(page); | |
49 | + | |
50 | + if (!PageUptodate(page)) { | |
51 | + unlock_page(page); | |
52 | + return ERR_PTR(-EIO); | |
53 | + } | |
54 | + | |
55 | + if (!page->mapping) { | |
56 | + unlock_page(page); | |
57 | + return ERR_PTR(-ENODATA); | |
58 | + } | |
59 | + | |
60 | + return kmap(buf->page); | |
61 | +} | |
62 | + | |
63 | +static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, | |
64 | + struct pipe_buffer *buf) | |
65 | +{ | |
66 | + unlock_page(buf->page); | |
67 | + kunmap(buf->page); | |
68 | +} | |
69 | + | |
70 | +static struct pipe_buf_operations page_cache_pipe_buf_ops = { | |
71 | + .can_merge = 0, | |
72 | + .map = page_cache_pipe_buf_map, | |
73 | + .unmap = page_cache_pipe_buf_unmap, | |
74 | + .release = page_cache_pipe_buf_release, | |
75 | +}; | |
76 | + | |
77 | +static ssize_t move_to_pipe(struct inode *inode, struct page **pages, | |
78 | + int nr_pages, unsigned long offset, | |
79 | + unsigned long len) | |
80 | +{ | |
81 | + struct pipe_inode_info *info; | |
82 | + int ret, do_wakeup, i; | |
83 | + | |
84 | + ret = 0; | |
85 | + do_wakeup = 0; | |
86 | + i = 0; | |
87 | + | |
88 | + mutex_lock(PIPE_MUTEX(*inode)); | |
89 | + | |
90 | + info = inode->i_pipe; | |
91 | + for (;;) { | |
92 | + int bufs; | |
93 | + | |
94 | + if (!PIPE_READERS(*inode)) { | |
95 | + send_sig(SIGPIPE, current, 0); | |
96 | + if (!ret) | |
97 | + ret = -EPIPE; | |
98 | + break; | |
99 | + } | |
100 | + | |
101 | + bufs = info->nrbufs; | |
102 | + if (bufs < PIPE_BUFFERS) { | |
103 | + int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); | |
104 | + struct pipe_buffer *buf = info->bufs + newbuf; | |
105 | + struct page *page = pages[i++]; | |
106 | + unsigned long this_len; | |
107 | + | |
108 | + this_len = PAGE_CACHE_SIZE - offset; | |
109 | + if (this_len > len) | |
110 | + this_len = len; | |
111 | + | |
112 | + buf->page = page; | |
113 | + buf->offset = offset; | |
114 | + buf->len = this_len; | |
115 | + buf->ops = &page_cache_pipe_buf_ops; | |
116 | + info->nrbufs = ++bufs; | |
117 | + do_wakeup = 1; | |
118 | + | |
119 | + ret += this_len; | |
120 | + len -= this_len; | |
121 | + offset = 0; | |
122 | + if (!--nr_pages) | |
123 | + break; | |
124 | + if (!len) | |
125 | + break; | |
126 | + if (bufs < PIPE_BUFFERS) | |
127 | + continue; | |
128 | + | |
129 | + break; | |
130 | + } | |
131 | + | |
132 | + if (signal_pending(current)) { | |
133 | + if (!ret) | |
134 | + ret = -ERESTARTSYS; | |
135 | + break; | |
136 | + } | |
137 | + | |
138 | + if (do_wakeup) { | |
139 | + wake_up_interruptible_sync(PIPE_WAIT(*inode)); | |
140 | + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, | |
141 | + POLL_IN); | |
142 | + do_wakeup = 0; | |
143 | + } | |
144 | + | |
145 | + PIPE_WAITING_WRITERS(*inode)++; | |
146 | + pipe_wait(inode); | |
147 | + PIPE_WAITING_WRITERS(*inode)--; | |
148 | + } | |
149 | + | |
150 | + mutex_unlock(PIPE_MUTEX(*inode)); | |
151 | + | |
152 | + if (do_wakeup) { | |
153 | + wake_up_interruptible(PIPE_WAIT(*inode)); | |
154 | + kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); | |
155 | + } | |
156 | + | |
157 | + while (i < nr_pages) | |
158 | + page_cache_release(pages[i++]); | |
159 | + | |
160 | + return ret; | |
161 | +} | |
162 | + | |
163 | +static int __generic_file_splice_read(struct file *in, struct inode *pipe, | |
164 | + size_t len) | |
165 | +{ | |
166 | + struct address_space *mapping = in->f_mapping; | |
167 | + unsigned int offset, nr_pages; | |
168 | + struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; | |
169 | + struct page *page; | |
170 | + pgoff_t index, pidx; | |
171 | + int i, j; | |
172 | + | |
173 | + index = in->f_pos >> PAGE_CACHE_SHIFT; | |
174 | + offset = in->f_pos & ~PAGE_CACHE_MASK; | |
175 | + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | |
176 | + | |
177 | + if (nr_pages > PIPE_BUFFERS) | |
178 | + nr_pages = PIPE_BUFFERS; | |
179 | + | |
180 | + /* | |
181 | + * initiate read-ahead on this page range | |
182 | + */ | |
183 | + do_page_cache_readahead(mapping, in, index, nr_pages); | |
184 | + | |
185 | + /* | |
186 | + * Get as many pages from the page cache as possible.. | |
187 | + * Start IO on the page cache entries we create (we | |
188 | + * can assume that any pre-existing ones we find have | |
189 | + * already had IO started on them). | |
190 | + */ | |
191 | + i = find_get_pages(mapping, index, nr_pages, pages); | |
192 | + | |
193 | + /* | |
194 | + * common case - we found all pages and they are contiguous, | |
195 | + * kick them off | |
196 | + */ | |
197 | + if (i && (pages[i - 1]->index == index + i - 1)) | |
198 | + goto splice_them; | |
199 | + | |
200 | + /* | |
201 | + * fill shadow[] with pages at the right locations, so we only | |
202 | + * have to fill holes | |
203 | + */ | |
204 | + memset(shadow, 0, i * sizeof(struct page *)); | |
205 | + for (j = 0, pidx = index; j < i; pidx++, j++) | |
206 | + shadow[pages[j]->index - pidx] = pages[j]; | |
207 | + | |
208 | + /* | |
209 | + * now fill in the holes | |
210 | + */ | |
211 | + for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { | |
212 | + int error; | |
213 | + | |
214 | + if (shadow[i]) | |
215 | + continue; | |
216 | + | |
217 | + /* | |
218 | + * no page there, look one up / create it | |
219 | + */ | |
220 | + page = find_or_create_page(mapping, pidx, | |
221 | + mapping_gfp_mask(mapping)); | |
222 | + if (!page) | |
223 | + break; | |
224 | + | |
225 | + if (PageUptodate(page)) | |
226 | + unlock_page(page); | |
227 | + else { | |
228 | + error = mapping->a_ops->readpage(in, page); | |
229 | + | |
230 | + if (unlikely(error)) { | |
231 | + page_cache_release(page); | |
232 | + break; | |
233 | + } | |
234 | + } | |
235 | + shadow[i] = page; | |
236 | + } | |
237 | + | |
238 | + if (!i) { | |
239 | + for (i = 0; i < nr_pages; i++) { | |
240 | + if (shadow[i]) | |
241 | + page_cache_release(shadow[i]); | |
242 | + } | |
243 | + return 0; | |
244 | + } | |
245 | + | |
246 | + memcpy(pages, shadow, i * sizeof(struct page *)); | |
247 | + | |
248 | + /* | |
249 | + * Now we splice them into the pipe.. | |
250 | + */ | |
251 | +splice_them: | |
252 | + return move_to_pipe(pipe, pages, i, offset, len); | |
253 | +} | |
254 | + | |
255 | +ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, | |
256 | + size_t len, unsigned int flags) | |
257 | +{ | |
258 | + ssize_t spliced; | |
259 | + int ret; | |
260 | + | |
261 | + ret = 0; | |
262 | + spliced = 0; | |
263 | + while (len) { | |
264 | + ret = __generic_file_splice_read(in, pipe, len); | |
265 | + | |
266 | + if (ret <= 0) | |
267 | + break; | |
268 | + | |
269 | + in->f_pos += ret; | |
270 | + len -= ret; | |
271 | + spliced += ret; | |
272 | + } | |
273 | + | |
274 | + if (spliced) | |
275 | + return spliced; | |
276 | + | |
277 | + return ret; | |
278 | +} | |
279 | + | |
280 | +/* | |
281 | + * Send 'len' bytes to socket from 'file' at position 'pos' using sendpage(). | |
282 | + */ | |
283 | +static int pipe_to_sendpage(struct pipe_inode_info *info, | |
284 | + struct pipe_buffer *buf, struct splice_desc *sd) | |
285 | +{ | |
286 | + struct file *file = sd->file; | |
287 | + loff_t pos = sd->pos; | |
288 | + unsigned int offset; | |
289 | + ssize_t ret; | |
290 | + void *ptr; | |
291 | + | |
292 | + /* | |
293 | + * sub-optimal, but we are limited by the pipe ->map. we don't | |
294 | + * need a kmap'ed buffer here, we just want to make sure we | |
295 | + * have the page pinned if the pipe page originates from the | |
296 | + * page cache | |
297 | + */ | |
298 | + ptr = buf->ops->map(file, info, buf); | |
299 | + if (IS_ERR(ptr)) | |
300 | + return PTR_ERR(ptr); | |
301 | + | |
302 | + offset = pos & ~PAGE_CACHE_MASK; | |
303 | + | |
304 | + ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos, | |
305 | + sd->len < sd->total_len); | |
306 | + | |
307 | + buf->ops->unmap(info, buf); | |
308 | + if (ret == sd->len) | |
309 | + return 0; | |
310 | + | |
311 | + return -EIO; | |
312 | +} | |
313 | + | |
314 | +/* | |
315 | + * This is a little more tricky than the file -> pipe splicing. There are | |
316 | + * basically three cases: | |
317 | + * | |
318 | + * - Destination page already exists in the address space and there | |
319 | + * are users of it. For that case we have no other option than | |
320 | + * copying the data. Tough luck. | |
321 | + * - Destination page already exists in the address space, but there | |
322 | + * are no users of it. Make sure it's uptodate, then drop it. Fall | |
323 | + * through to last case. | |
324 | + * - Destination page does not exist, we can add the pipe page to | |
325 | + * the page cache and avoid the copy. | |
326 | + * | |
327 | + * For now we just do the slower thing and always copy pages over, it's | |
328 | + * easier than migrating pages from the pipe to the target file. For the | |
329 | + * case of doing file | file splicing, the migrate approach had some LRU | |
330 | + * nastiness... | |
331 | + */ | |
332 | +static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, | |
333 | + struct splice_desc *sd) | |
334 | +{ | |
335 | + struct file *file = sd->file; | |
336 | + struct address_space *mapping = file->f_mapping; | |
337 | + unsigned int offset; | |
338 | + struct page *page; | |
339 | + char *src, *dst; | |
340 | + pgoff_t index; | |
341 | + int ret; | |
342 | + | |
343 | + /* | |
344 | + * after this, page will be locked and unmapped | |
345 | + */ | |
346 | + src = buf->ops->map(file, info, buf); | |
347 | + if (IS_ERR(src)) | |
348 | + return PTR_ERR(src); | |
349 | + | |
350 | + index = sd->pos >> PAGE_CACHE_SHIFT; | |
351 | + offset = sd->pos & ~PAGE_CACHE_MASK; | |
352 | + | |
353 | +find_page: | |
354 | + ret = -ENOMEM; | |
355 | + page = find_or_create_page(mapping, index, mapping_gfp_mask(mapping)); | |
356 | + if (!page) | |
357 | + goto out; | |
358 | + | |
359 | + /* | |
360 | + * If the page is uptodate, it is also locked. If it isn't | |
361 | + * uptodate, we can mark it uptodate if we are filling the | |
362 | + * full page. Otherwise we need to read it in first... | |
363 | + */ | |
364 | + if (!PageUptodate(page)) { | |
365 | + if (sd->len < PAGE_CACHE_SIZE) { | |
366 | + ret = mapping->a_ops->readpage(file, page); | |
367 | + if (unlikely(ret)) | |
368 | + goto out; | |
369 | + | |
370 | + lock_page(page); | |
371 | + | |
372 | + if (!PageUptodate(page)) { | |
373 | + /* | |
374 | + * page got invalidated, repeat | |
375 | + */ | |
376 | + if (!page->mapping) { | |
377 | + unlock_page(page); | |
378 | + page_cache_release(page); | |
379 | + goto find_page; | |
380 | + } | |
381 | + ret = -EIO; | |
382 | + goto out; | |
383 | + } | |
384 | + } else { | |
385 | + WARN_ON(!PageLocked(page)); | |
386 | + SetPageUptodate(page); | |
387 | + } | |
388 | + } | |
389 | + | |
390 | + ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); | |
391 | + if (ret) | |
392 | + goto out; | |
393 | + | |
394 | + dst = kmap_atomic(page, KM_USER0); | |
395 | + memcpy(dst + offset, src + buf->offset, sd->len); | |
396 | + flush_dcache_page(page); | |
397 | + kunmap_atomic(dst, KM_USER0); | |
398 | + | |
399 | + ret = mapping->a_ops->commit_write(file, page, 0, sd->len); | |
400 | + if (ret < 0) | |
401 | + goto out; | |
402 | + | |
403 | + set_page_dirty(page); | |
404 | + ret = write_one_page(page, 0); | |
405 | +out: | |
406 | + if (ret < 0) | |
407 | + unlock_page(page); | |
408 | + page_cache_release(page); | |
409 | + buf->ops->unmap(info, buf); | |
410 | + return ret; | |
411 | +} | |
412 | + | |
413 | +typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, | |
414 | + struct splice_desc *); | |
415 | + | |
416 | +static ssize_t move_from_pipe(struct inode *inode, struct file *out, | |
417 | + size_t len, unsigned int flags, | |
418 | + splice_actor *actor) | |
419 | +{ | |
420 | + struct pipe_inode_info *info; | |
421 | + int ret, do_wakeup, err; | |
422 | + struct splice_desc sd; | |
423 | + | |
424 | + ret = 0; | |
425 | + do_wakeup = 0; | |
426 | + | |
427 | + sd.total_len = len; | |
428 | + sd.flags = flags; | |
429 | + sd.file = out; | |
430 | + sd.pos = out->f_pos; | |
431 | + | |
432 | + mutex_lock(PIPE_MUTEX(*inode)); | |
433 | + | |
434 | + info = inode->i_pipe; | |
435 | + for (;;) { | |
436 | + int bufs = info->nrbufs; | |
437 | + | |
438 | + if (bufs) { | |
439 | + int curbuf = info->curbuf; | |
440 | + struct pipe_buffer *buf = info->bufs + curbuf; | |
441 | + struct pipe_buf_operations *ops = buf->ops; | |
442 | + | |
443 | + sd.len = buf->len; | |
444 | + if (sd.len > sd.total_len) | |
445 | + sd.len = sd.total_len; | |
446 | + | |
447 | + err = actor(info, buf, &sd); | |
448 | + if (err) { | |
449 | + if (!ret && err != -ENODATA) | |
450 | + ret = err; | |
451 | + | |
452 | + break; | |
453 | + } | |
454 | + | |
455 | + ret += sd.len; | |
456 | + buf->offset += sd.len; | |
457 | + buf->len -= sd.len; | |
458 | + if (!buf->len) { | |
459 | + buf->ops = NULL; | |
460 | + ops->release(info, buf); | |
461 | + curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); | |
462 | + info->curbuf = curbuf; | |
463 | + info->nrbufs = --bufs; | |
464 | + do_wakeup = 1; | |
465 | + } | |
466 | + | |
467 | + sd.pos += sd.len; | |
468 | + sd.total_len -= sd.len; | |
469 | + if (!sd.total_len) | |
470 | + break; | |
471 | + } | |
472 | + | |
473 | + if (bufs) | |
474 | + continue; | |
475 | + if (!PIPE_WRITERS(*inode)) | |
476 | + break; | |
477 | + if (!PIPE_WAITING_WRITERS(*inode)) { | |
478 | + if (ret) | |
479 | + break; | |
480 | + } | |
481 | + | |
482 | + if (signal_pending(current)) { | |
483 | + if (!ret) | |
484 | + ret = -ERESTARTSYS; | |
485 | + break; | |
486 | + } | |
487 | + | |
488 | + if (do_wakeup) { | |
489 | + wake_up_interruptible_sync(PIPE_WAIT(*inode)); | |
490 | + kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); | |
491 | + do_wakeup = 0; | |
492 | + } | |
493 | + | |
494 | + pipe_wait(inode); | |
495 | + } | |
496 | + | |
497 | + mutex_unlock(PIPE_MUTEX(*inode)); | |
498 | + | |
499 | + if (do_wakeup) { | |
500 | + wake_up_interruptible(PIPE_WAIT(*inode)); | |
501 | + kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); | |
502 | + } | |
503 | + | |
504 | + mutex_lock(&out->f_mapping->host->i_mutex); | |
505 | + out->f_pos = sd.pos; | |
506 | + mutex_unlock(&out->f_mapping->host->i_mutex); | |
507 | + return ret; | |
508 | + | |
509 | +} | |
510 | + | |
511 | +ssize_t generic_file_splice_write(struct inode *inode, struct file *out, | |
512 | + size_t len, unsigned int flags) | |
513 | +{ | |
514 | + return move_from_pipe(inode, out, len, flags, pipe_to_file); | |
515 | +} | |
516 | + | |
517 | +ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, | |
518 | + size_t len, unsigned int flags) | |
519 | +{ | |
520 | + return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); | |
521 | +} | |
522 | + | |
523 | +static long do_splice_from(struct inode *pipe, struct file *out, size_t len, | |
524 | + unsigned int flags) | |
525 | +{ | |
526 | + loff_t pos; | |
527 | + int ret; | |
528 | + | |
529 | + if (!out->f_op || !out->f_op->splice_write) | |
530 | + return -EINVAL; | |
531 | + | |
532 | + if (!(out->f_mode & FMODE_WRITE)) | |
533 | + return -EBADF; | |
534 | + | |
535 | + pos = out->f_pos; | |
536 | + ret = rw_verify_area(WRITE, out, &pos, len); | |
537 | + if (unlikely(ret < 0)) | |
538 | + return ret; | |
539 | + | |
540 | + return out->f_op->splice_write(pipe, out, len, flags); | |
541 | +} | |
542 | + | |
543 | +static long do_splice_to(struct file *in, struct inode *pipe, size_t len, | |
544 | + unsigned int flags) | |
545 | +{ | |
546 | + loff_t pos, isize, left; | |
547 | + int ret; | |
548 | + | |
549 | + if (!in->f_op || !in->f_op->splice_read) | |
550 | + return -EINVAL; | |
551 | + | |
552 | + if (!(in->f_mode & FMODE_READ)) | |
553 | + return -EBADF; | |
554 | + | |
555 | + pos = in->f_pos; | |
556 | + ret = rw_verify_area(READ, in, &pos, len); | |
557 | + if (unlikely(ret < 0)) | |
558 | + return ret; | |
559 | + | |
560 | + isize = i_size_read(in->f_mapping->host); | |
561 | + if (unlikely(in->f_pos >= isize)) | |
562 | + return 0; | |
563 | + | |
564 | + left = isize - in->f_pos; | |
565 | + if (left < len) | |
566 | + len = left; | |
567 | + | |
568 | + return in->f_op->splice_read(in, pipe, len, flags); | |
569 | +} | |
570 | + | |
571 | +static long do_splice(struct file *in, struct file *out, size_t len, | |
572 | + unsigned int flags) | |
573 | +{ | |
574 | + struct inode *pipe; | |
575 | + | |
576 | + pipe = in->f_dentry->d_inode; | |
577 | + if (pipe->i_pipe) | |
578 | + return do_splice_from(pipe, out, len, flags); | |
579 | + | |
580 | + pipe = out->f_dentry->d_inode; | |
581 | + if (pipe->i_pipe) | |
582 | + return do_splice_to(in, pipe, len, flags); | |
583 | + | |
584 | + return -EINVAL; | |
585 | +} | |
586 | + | |
587 | +asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) | |
588 | +{ | |
589 | + long error; | |
590 | + struct file *in, *out; | |
591 | + int fput_in, fput_out; | |
592 | + | |
593 | + if (unlikely(!len)) | |
594 | + return 0; | |
595 | + | |
596 | + error = -EBADF; | |
597 | + in = fget_light(fdin, &fput_in); | |
598 | + if (in) { | |
599 | + if (in->f_mode & FMODE_READ) { | |
600 | + out = fget_light(fdout, &fput_out); | |
601 | + if (out) { | |
602 | + if (out->f_mode & FMODE_WRITE) | |
603 | + error = do_splice(in, out, len, flags); | |
604 | + fput_light(out, fput_out); | |
605 | + } | |
606 | + } | |
607 | + | |
608 | + fput_light(in, fput_in); | |
609 | + } | |
610 | + | |
611 | + return error; | |
612 | +} |
include/asm-i386/unistd.h
... | ... | @@ -318,8 +318,9 @@ |
318 | 318 | #define __NR_unshare 310 |
319 | 319 | #define __NR_set_robust_list 311 |
320 | 320 | #define __NR_get_robust_list 312 |
321 | +#define __NR_sys_splice 313 | |
321 | 322 | |
322 | -#define NR_syscalls 313 | |
323 | +#define NR_syscalls 314 | |
323 | 324 | |
324 | 325 | /* |
325 | 326 | * user-visible error numbers are in the range -1 - -128: see |
include/asm-ia64/unistd.h
... | ... | @@ -285,12 +285,13 @@ |
285 | 285 | #define __NR_faccessat 1293 |
286 | 286 | /* 1294, 1295 reserved for pselect/ppoll */ |
287 | 287 | #define __NR_unshare 1296 |
288 | +#define __NR_splice 1297 | |
288 | 289 | |
289 | 290 | #ifdef __KERNEL__ |
290 | 291 | |
291 | 292 | #include <linux/config.h> |
292 | 293 | |
293 | -#define NR_syscalls 273 /* length of syscall table */ | |
294 | +#define NR_syscalls 274 /* length of syscall table */ | |
294 | 295 | |
295 | 296 | #define __ARCH_WANT_SYS_RT_SIGACTION |
296 | 297 |
include/asm-powerpc/unistd.h
include/asm-x86_64/unistd.h
... | ... | @@ -609,8 +609,10 @@ |
609 | 609 | __SYSCALL(__NR_set_robust_list, sys_set_robust_list) |
610 | 610 | #define __NR_get_robust_list 274 |
611 | 611 | __SYSCALL(__NR_get_robust_list, sys_get_robust_list) |
612 | +#define __NR_splice 275 | |
613 | +__SYSCALL(__NR_splice, sys_splice) | |
612 | 614 | |
613 | -#define __NR_syscall_max __NR_get_robust_list | |
615 | +#define __NR_syscall_max __NR_splice | |
614 | 616 | |
615 | 617 | #ifndef __NO_STUBS |
616 | 618 |
include/linux/fs.h
... | ... | @@ -1032,6 +1032,8 @@ |
1032 | 1032 | int (*check_flags)(int); |
1033 | 1033 | int (*dir_notify)(struct file *filp, unsigned long arg); |
1034 | 1034 | int (*flock) (struct file *, int, struct file_lock *); |
1035 | + ssize_t (*splice_write)(struct inode *, struct file *, size_t, unsigned int); | |
1036 | + ssize_t (*splice_read)(struct file *, struct inode *, size_t, unsigned int); | |
1035 | 1037 | }; |
1036 | 1038 | |
1037 | 1039 | struct inode_operations { |
... | ... | @@ -1609,6 +1611,8 @@ |
1609 | 1611 | extern void do_generic_mapping_read(struct address_space *mapping, |
1610 | 1612 | struct file_ra_state *, struct file *, |
1611 | 1613 | loff_t *, read_descriptor_t *, read_actor_t); |
1614 | +extern ssize_t generic_file_splice_read(struct file *, struct inode *, size_t, unsigned int); | |
1615 | +extern ssize_t generic_file_splice_write(struct inode *, struct file *, size_t, unsigned int); | |
1612 | 1616 | extern void |
1613 | 1617 | file_ra_state_init(struct file_ra_state *ra, struct address_space *mapping); |
1614 | 1618 | extern ssize_t generic_file_readv(struct file *filp, const struct iovec *iov, |
include/linux/syscalls.h
... | ... | @@ -569,6 +569,8 @@ |
569 | 569 | asmlinkage long compat_sys_openat(unsigned int dfd, const char __user *filename, |
570 | 570 | int flags, int mode); |
571 | 571 | asmlinkage long sys_unshare(unsigned long unshare_flags); |
572 | +asmlinkage long sys_splice(int fdin, int fdout, size_t len, | |
573 | + unsigned int flags); | |
572 | 574 | |
573 | 575 | #endif |
net/socket.c
... | ... | @@ -119,7 +119,10 @@ |
119 | 119 | static ssize_t sock_sendpage(struct file *file, struct page *page, |
120 | 120 | int offset, size_t size, loff_t *ppos, int more); |
121 | 121 | |
122 | +extern ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, | |
123 | + size_t len, unsigned int flags); | |
122 | 124 | |
125 | + | |
123 | 126 | /* |
124 | 127 | * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear |
125 | 128 | * in the operation structures but are done directly via the socketcall() multiplexor. |
... | ... | @@ -141,7 +144,8 @@ |
141 | 144 | .fasync = sock_fasync, |
142 | 145 | .readv = sock_readv, |
143 | 146 | .writev = sock_writev, |
144 | - .sendpage = sock_sendpage | |
147 | + .sendpage = sock_sendpage, | |
148 | + .splice_write = generic_splice_sendpage, | |
145 | 149 | }; |
146 | 150 | |
147 | 151 | /* |