Commit 7bfac9ecf0585962fe13584f5cf526d8c8e76f17
Committed by
Linus Torvalds
1 parent
612392307c
Exists in
master
and in
7 other branches
splice: fix deadlock in splicing to file
There's a possible deadlock in generic_file_splice_write(), splice_from_pipe() and ocfs2_file_splice_write(): - task A calls generic_file_splice_write() - this calls inode_double_lock(), which locks i_mutex on both pipe->inode and target inode - ordering depends on inode pointers, can happen that pipe->inode is locked first - __splice_from_pipe() needs more data, calls pipe_wait() - this releases lock on pipe->inode, goes to interruptible sleep - task B calls generic_file_splice_write(), similarly to the first - this locks pipe->inode, then tries to lock inode, but that is already held by task A - task A is interrupted, it tries to lock pipe->inode, but fails, as it is already held by task B - ABBA deadlock Fix this by explicitly ordering locks: the outer lock must be on target inode and the inner lock (which is later unlocked and relocked) must be on pipe->inode. This is OK, pipe inodes and target inodes form two nonoverlapping sets, generic_file_splice_write() and friends are not called with a target which is a pipe. Signed-off-by: Miklos Szeredi <mszeredi@suse.cz> Acked-by: Mark Fasheh <mfasheh@suse.com> Acked-by: Jens Axboe <jens.axboe@oracle.com> Cc: stable@kernel.org Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Showing 2 changed files with 26 additions and 7 deletions Inline Diff
fs/ocfs2/file.c
1 | /* -*- mode: c; c-basic-offset: 8; -*- | 1 | /* -*- mode: c; c-basic-offset: 8; -*- |
2 | * vim: noexpandtab sw=8 ts=8 sts=0: | 2 | * vim: noexpandtab sw=8 ts=8 sts=0: |
3 | * | 3 | * |
4 | * file.c | 4 | * file.c |
5 | * | 5 | * |
6 | * File open, close, extend, truncate | 6 | * File open, close, extend, truncate |
7 | * | 7 | * |
8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. | 8 | * Copyright (C) 2002, 2004 Oracle. All rights reserved. |
9 | * | 9 | * |
10 | * This program is free software; you can redistribute it and/or | 10 | * This program is free software; you can redistribute it and/or |
11 | * modify it under the terms of the GNU General Public | 11 | * modify it under the terms of the GNU General Public |
12 | * License as published by the Free Software Foundation; either | 12 | * License as published by the Free Software Foundation; either |
13 | * version 2 of the License, or (at your option) any later version. | 13 | * version 2 of the License, or (at your option) any later version. |
14 | * | 14 | * |
15 | * This program is distributed in the hope that it will be useful, | 15 | * This program is distributed in the hope that it will be useful, |
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | 16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | 17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
18 | * General Public License for more details. | 18 | * General Public License for more details. |
19 | * | 19 | * |
20 | * You should have received a copy of the GNU General Public | 20 | * You should have received a copy of the GNU General Public |
21 | * License along with this program; if not, write to the | 21 | * License along with this program; if not, write to the |
22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, | 22 | * Free Software Foundation, Inc., 59 Temple Place - Suite 330, |
23 | * Boston, MA 021110-1307, USA. | 23 | * Boston, MA 021110-1307, USA. |
24 | */ | 24 | */ |
25 | 25 | ||
26 | #include <linux/capability.h> | 26 | #include <linux/capability.h> |
27 | #include <linux/fs.h> | 27 | #include <linux/fs.h> |
28 | #include <linux/types.h> | 28 | #include <linux/types.h> |
29 | #include <linux/slab.h> | 29 | #include <linux/slab.h> |
30 | #include <linux/highmem.h> | 30 | #include <linux/highmem.h> |
31 | #include <linux/pagemap.h> | 31 | #include <linux/pagemap.h> |
32 | #include <linux/uio.h> | 32 | #include <linux/uio.h> |
33 | #include <linux/sched.h> | 33 | #include <linux/sched.h> |
34 | #include <linux/splice.h> | 34 | #include <linux/splice.h> |
35 | #include <linux/mount.h> | 35 | #include <linux/mount.h> |
36 | #include <linux/writeback.h> | 36 | #include <linux/writeback.h> |
37 | #include <linux/falloc.h> | 37 | #include <linux/falloc.h> |
38 | #include <linux/quotaops.h> | 38 | #include <linux/quotaops.h> |
39 | 39 | ||
40 | #define MLOG_MASK_PREFIX ML_INODE | 40 | #define MLOG_MASK_PREFIX ML_INODE |
41 | #include <cluster/masklog.h> | 41 | #include <cluster/masklog.h> |
42 | 42 | ||
43 | #include "ocfs2.h" | 43 | #include "ocfs2.h" |
44 | 44 | ||
45 | #include "alloc.h" | 45 | #include "alloc.h" |
46 | #include "aops.h" | 46 | #include "aops.h" |
47 | #include "dir.h" | 47 | #include "dir.h" |
48 | #include "dlmglue.h" | 48 | #include "dlmglue.h" |
49 | #include "extent_map.h" | 49 | #include "extent_map.h" |
50 | #include "file.h" | 50 | #include "file.h" |
51 | #include "sysfile.h" | 51 | #include "sysfile.h" |
52 | #include "inode.h" | 52 | #include "inode.h" |
53 | #include "ioctl.h" | 53 | #include "ioctl.h" |
54 | #include "journal.h" | 54 | #include "journal.h" |
55 | #include "locks.h" | 55 | #include "locks.h" |
56 | #include "mmap.h" | 56 | #include "mmap.h" |
57 | #include "suballoc.h" | 57 | #include "suballoc.h" |
58 | #include "super.h" | 58 | #include "super.h" |
59 | #include "xattr.h" | 59 | #include "xattr.h" |
60 | #include "acl.h" | 60 | #include "acl.h" |
61 | #include "quota.h" | 61 | #include "quota.h" |
62 | 62 | ||
63 | #include "buffer_head_io.h" | 63 | #include "buffer_head_io.h" |
64 | 64 | ||
65 | static int ocfs2_sync_inode(struct inode *inode) | 65 | static int ocfs2_sync_inode(struct inode *inode) |
66 | { | 66 | { |
67 | filemap_fdatawrite(inode->i_mapping); | 67 | filemap_fdatawrite(inode->i_mapping); |
68 | return sync_mapping_buffers(inode->i_mapping); | 68 | return sync_mapping_buffers(inode->i_mapping); |
69 | } | 69 | } |
70 | 70 | ||
71 | static int ocfs2_init_file_private(struct inode *inode, struct file *file) | 71 | static int ocfs2_init_file_private(struct inode *inode, struct file *file) |
72 | { | 72 | { |
73 | struct ocfs2_file_private *fp; | 73 | struct ocfs2_file_private *fp; |
74 | 74 | ||
75 | fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); | 75 | fp = kzalloc(sizeof(struct ocfs2_file_private), GFP_KERNEL); |
76 | if (!fp) | 76 | if (!fp) |
77 | return -ENOMEM; | 77 | return -ENOMEM; |
78 | 78 | ||
79 | fp->fp_file = file; | 79 | fp->fp_file = file; |
80 | mutex_init(&fp->fp_mutex); | 80 | mutex_init(&fp->fp_mutex); |
81 | ocfs2_file_lock_res_init(&fp->fp_flock, fp); | 81 | ocfs2_file_lock_res_init(&fp->fp_flock, fp); |
82 | file->private_data = fp; | 82 | file->private_data = fp; |
83 | 83 | ||
84 | return 0; | 84 | return 0; |
85 | } | 85 | } |
86 | 86 | ||
87 | static void ocfs2_free_file_private(struct inode *inode, struct file *file) | 87 | static void ocfs2_free_file_private(struct inode *inode, struct file *file) |
88 | { | 88 | { |
89 | struct ocfs2_file_private *fp = file->private_data; | 89 | struct ocfs2_file_private *fp = file->private_data; |
90 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 90 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
91 | 91 | ||
92 | if (fp) { | 92 | if (fp) { |
93 | ocfs2_simple_drop_lockres(osb, &fp->fp_flock); | 93 | ocfs2_simple_drop_lockres(osb, &fp->fp_flock); |
94 | ocfs2_lock_res_free(&fp->fp_flock); | 94 | ocfs2_lock_res_free(&fp->fp_flock); |
95 | kfree(fp); | 95 | kfree(fp); |
96 | file->private_data = NULL; | 96 | file->private_data = NULL; |
97 | } | 97 | } |
98 | } | 98 | } |
99 | 99 | ||
100 | static int ocfs2_file_open(struct inode *inode, struct file *file) | 100 | static int ocfs2_file_open(struct inode *inode, struct file *file) |
101 | { | 101 | { |
102 | int status; | 102 | int status; |
103 | int mode = file->f_flags; | 103 | int mode = file->f_flags; |
104 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 104 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
105 | 105 | ||
106 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 106 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
107 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); | 107 | file->f_path.dentry->d_name.len, file->f_path.dentry->d_name.name); |
108 | 108 | ||
109 | spin_lock(&oi->ip_lock); | 109 | spin_lock(&oi->ip_lock); |
110 | 110 | ||
111 | /* Check that the inode hasn't been wiped from disk by another | 111 | /* Check that the inode hasn't been wiped from disk by another |
112 | * node. If it hasn't then we're safe as long as we hold the | 112 | * node. If it hasn't then we're safe as long as we hold the |
113 | * spin lock until our increment of open count. */ | 113 | * spin lock until our increment of open count. */ |
114 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { | 114 | if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_DELETED) { |
115 | spin_unlock(&oi->ip_lock); | 115 | spin_unlock(&oi->ip_lock); |
116 | 116 | ||
117 | status = -ENOENT; | 117 | status = -ENOENT; |
118 | goto leave; | 118 | goto leave; |
119 | } | 119 | } |
120 | 120 | ||
121 | if (mode & O_DIRECT) | 121 | if (mode & O_DIRECT) |
122 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; | 122 | oi->ip_flags |= OCFS2_INODE_OPEN_DIRECT; |
123 | 123 | ||
124 | oi->ip_open_count++; | 124 | oi->ip_open_count++; |
125 | spin_unlock(&oi->ip_lock); | 125 | spin_unlock(&oi->ip_lock); |
126 | 126 | ||
127 | status = ocfs2_init_file_private(inode, file); | 127 | status = ocfs2_init_file_private(inode, file); |
128 | if (status) { | 128 | if (status) { |
129 | /* | 129 | /* |
130 | * We want to set open count back if we're failing the | 130 | * We want to set open count back if we're failing the |
131 | * open. | 131 | * open. |
132 | */ | 132 | */ |
133 | spin_lock(&oi->ip_lock); | 133 | spin_lock(&oi->ip_lock); |
134 | oi->ip_open_count--; | 134 | oi->ip_open_count--; |
135 | spin_unlock(&oi->ip_lock); | 135 | spin_unlock(&oi->ip_lock); |
136 | } | 136 | } |
137 | 137 | ||
138 | leave: | 138 | leave: |
139 | mlog_exit(status); | 139 | mlog_exit(status); |
140 | return status; | 140 | return status; |
141 | } | 141 | } |
142 | 142 | ||
143 | static int ocfs2_file_release(struct inode *inode, struct file *file) | 143 | static int ocfs2_file_release(struct inode *inode, struct file *file) |
144 | { | 144 | { |
145 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 145 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
146 | 146 | ||
147 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, | 147 | mlog_entry("(0x%p, 0x%p, '%.*s')\n", inode, file, |
148 | file->f_path.dentry->d_name.len, | 148 | file->f_path.dentry->d_name.len, |
149 | file->f_path.dentry->d_name.name); | 149 | file->f_path.dentry->d_name.name); |
150 | 150 | ||
151 | spin_lock(&oi->ip_lock); | 151 | spin_lock(&oi->ip_lock); |
152 | if (!--oi->ip_open_count) | 152 | if (!--oi->ip_open_count) |
153 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; | 153 | oi->ip_flags &= ~OCFS2_INODE_OPEN_DIRECT; |
154 | spin_unlock(&oi->ip_lock); | 154 | spin_unlock(&oi->ip_lock); |
155 | 155 | ||
156 | ocfs2_free_file_private(inode, file); | 156 | ocfs2_free_file_private(inode, file); |
157 | 157 | ||
158 | mlog_exit(0); | 158 | mlog_exit(0); |
159 | 159 | ||
160 | return 0; | 160 | return 0; |
161 | } | 161 | } |
162 | 162 | ||
/* ->open() for directories: only the per-open private state is needed. */
static int ocfs2_dir_open(struct inode *inode, struct file *file)
{
	return ocfs2_init_file_private(inode, file);
}
167 | 167 | ||
/* ->release() for directories: free the per-open private state. */
static int ocfs2_dir_release(struct inode *inode, struct file *file)
{
	ocfs2_free_file_private(inode, file);
	return 0;
}
173 | 173 | ||
174 | static int ocfs2_sync_file(struct file *file, | 174 | static int ocfs2_sync_file(struct file *file, |
175 | struct dentry *dentry, | 175 | struct dentry *dentry, |
176 | int datasync) | 176 | int datasync) |
177 | { | 177 | { |
178 | int err = 0; | 178 | int err = 0; |
179 | journal_t *journal; | 179 | journal_t *journal; |
180 | struct inode *inode = dentry->d_inode; | 180 | struct inode *inode = dentry->d_inode; |
181 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 181 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
182 | 182 | ||
183 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, | 183 | mlog_entry("(0x%p, 0x%p, %d, '%.*s')\n", file, dentry, datasync, |
184 | dentry->d_name.len, dentry->d_name.name); | 184 | dentry->d_name.len, dentry->d_name.name); |
185 | 185 | ||
186 | err = ocfs2_sync_inode(dentry->d_inode); | 186 | err = ocfs2_sync_inode(dentry->d_inode); |
187 | if (err) | 187 | if (err) |
188 | goto bail; | 188 | goto bail; |
189 | 189 | ||
190 | journal = osb->journal->j_journal; | 190 | journal = osb->journal->j_journal; |
191 | err = jbd2_journal_force_commit(journal); | 191 | err = jbd2_journal_force_commit(journal); |
192 | 192 | ||
193 | bail: | 193 | bail: |
194 | mlog_exit(err); | 194 | mlog_exit(err); |
195 | 195 | ||
196 | return (err < 0) ? -EIO : 0; | 196 | return (err < 0) ? -EIO : 0; |
197 | } | 197 | } |
198 | 198 | ||
199 | int ocfs2_should_update_atime(struct inode *inode, | 199 | int ocfs2_should_update_atime(struct inode *inode, |
200 | struct vfsmount *vfsmnt) | 200 | struct vfsmount *vfsmnt) |
201 | { | 201 | { |
202 | struct timespec now; | 202 | struct timespec now; |
203 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 203 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
204 | 204 | ||
205 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 205 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
206 | return 0; | 206 | return 0; |
207 | 207 | ||
208 | if ((inode->i_flags & S_NOATIME) || | 208 | if ((inode->i_flags & S_NOATIME) || |
209 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) | 209 | ((inode->i_sb->s_flags & MS_NODIRATIME) && S_ISDIR(inode->i_mode))) |
210 | return 0; | 210 | return 0; |
211 | 211 | ||
212 | /* | 212 | /* |
213 | * We can be called with no vfsmnt structure - NFSD will | 213 | * We can be called with no vfsmnt structure - NFSD will |
214 | * sometimes do this. | 214 | * sometimes do this. |
215 | * | 215 | * |
216 | * Note that our action here is different than touch_atime() - | 216 | * Note that our action here is different than touch_atime() - |
217 | * if we can't tell whether this is a noatime mount, then we | 217 | * if we can't tell whether this is a noatime mount, then we |
218 | * don't know whether to trust the value of s_atime_quantum. | 218 | * don't know whether to trust the value of s_atime_quantum. |
219 | */ | 219 | */ |
220 | if (vfsmnt == NULL) | 220 | if (vfsmnt == NULL) |
221 | return 0; | 221 | return 0; |
222 | 222 | ||
223 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || | 223 | if ((vfsmnt->mnt_flags & MNT_NOATIME) || |
224 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) | 224 | ((vfsmnt->mnt_flags & MNT_NODIRATIME) && S_ISDIR(inode->i_mode))) |
225 | return 0; | 225 | return 0; |
226 | 226 | ||
227 | if (vfsmnt->mnt_flags & MNT_RELATIME) { | 227 | if (vfsmnt->mnt_flags & MNT_RELATIME) { |
228 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || | 228 | if ((timespec_compare(&inode->i_atime, &inode->i_mtime) <= 0) || |
229 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) | 229 | (timespec_compare(&inode->i_atime, &inode->i_ctime) <= 0)) |
230 | return 1; | 230 | return 1; |
231 | 231 | ||
232 | return 0; | 232 | return 0; |
233 | } | 233 | } |
234 | 234 | ||
235 | now = CURRENT_TIME; | 235 | now = CURRENT_TIME; |
236 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) | 236 | if ((now.tv_sec - inode->i_atime.tv_sec <= osb->s_atime_quantum)) |
237 | return 0; | 237 | return 0; |
238 | else | 238 | else |
239 | return 1; | 239 | return 1; |
240 | } | 240 | } |
241 | 241 | ||
242 | int ocfs2_update_inode_atime(struct inode *inode, | 242 | int ocfs2_update_inode_atime(struct inode *inode, |
243 | struct buffer_head *bh) | 243 | struct buffer_head *bh) |
244 | { | 244 | { |
245 | int ret; | 245 | int ret; |
246 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 246 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
247 | handle_t *handle; | 247 | handle_t *handle; |
248 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; | 248 | struct ocfs2_dinode *di = (struct ocfs2_dinode *) bh->b_data; |
249 | 249 | ||
250 | mlog_entry_void(); | 250 | mlog_entry_void(); |
251 | 251 | ||
252 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 252 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
253 | if (IS_ERR(handle)) { | 253 | if (IS_ERR(handle)) { |
254 | ret = PTR_ERR(handle); | 254 | ret = PTR_ERR(handle); |
255 | mlog_errno(ret); | 255 | mlog_errno(ret); |
256 | goto out; | 256 | goto out; |
257 | } | 257 | } |
258 | 258 | ||
259 | ret = ocfs2_journal_access_di(handle, inode, bh, | 259 | ret = ocfs2_journal_access_di(handle, inode, bh, |
260 | OCFS2_JOURNAL_ACCESS_WRITE); | 260 | OCFS2_JOURNAL_ACCESS_WRITE); |
261 | if (ret) { | 261 | if (ret) { |
262 | mlog_errno(ret); | 262 | mlog_errno(ret); |
263 | goto out_commit; | 263 | goto out_commit; |
264 | } | 264 | } |
265 | 265 | ||
266 | /* | 266 | /* |
267 | * Don't use ocfs2_mark_inode_dirty() here as we don't always | 267 | * Don't use ocfs2_mark_inode_dirty() here as we don't always |
268 | * have i_mutex to guard against concurrent changes to other | 268 | * have i_mutex to guard against concurrent changes to other |
269 | * inode fields. | 269 | * inode fields. |
270 | */ | 270 | */ |
271 | inode->i_atime = CURRENT_TIME; | 271 | inode->i_atime = CURRENT_TIME; |
272 | di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); | 272 | di->i_atime = cpu_to_le64(inode->i_atime.tv_sec); |
273 | di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); | 273 | di->i_atime_nsec = cpu_to_le32(inode->i_atime.tv_nsec); |
274 | 274 | ||
275 | ret = ocfs2_journal_dirty(handle, bh); | 275 | ret = ocfs2_journal_dirty(handle, bh); |
276 | if (ret < 0) | 276 | if (ret < 0) |
277 | mlog_errno(ret); | 277 | mlog_errno(ret); |
278 | 278 | ||
279 | out_commit: | 279 | out_commit: |
280 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); | 280 | ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); |
281 | out: | 281 | out: |
282 | mlog_exit(ret); | 282 | mlog_exit(ret); |
283 | return ret; | 283 | return ret; |
284 | } | 284 | } |
285 | 285 | ||
286 | static int ocfs2_set_inode_size(handle_t *handle, | 286 | static int ocfs2_set_inode_size(handle_t *handle, |
287 | struct inode *inode, | 287 | struct inode *inode, |
288 | struct buffer_head *fe_bh, | 288 | struct buffer_head *fe_bh, |
289 | u64 new_i_size) | 289 | u64 new_i_size) |
290 | { | 290 | { |
291 | int status; | 291 | int status; |
292 | 292 | ||
293 | mlog_entry_void(); | 293 | mlog_entry_void(); |
294 | i_size_write(inode, new_i_size); | 294 | i_size_write(inode, new_i_size); |
295 | inode->i_blocks = ocfs2_inode_sector_count(inode); | 295 | inode->i_blocks = ocfs2_inode_sector_count(inode); |
296 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 296 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
297 | 297 | ||
298 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); | 298 | status = ocfs2_mark_inode_dirty(handle, inode, fe_bh); |
299 | if (status < 0) { | 299 | if (status < 0) { |
300 | mlog_errno(status); | 300 | mlog_errno(status); |
301 | goto bail; | 301 | goto bail; |
302 | } | 302 | } |
303 | 303 | ||
304 | bail: | 304 | bail: |
305 | mlog_exit(status); | 305 | mlog_exit(status); |
306 | return status; | 306 | return status; |
307 | } | 307 | } |
308 | 308 | ||
309 | int ocfs2_simple_size_update(struct inode *inode, | 309 | int ocfs2_simple_size_update(struct inode *inode, |
310 | struct buffer_head *di_bh, | 310 | struct buffer_head *di_bh, |
311 | u64 new_i_size) | 311 | u64 new_i_size) |
312 | { | 312 | { |
313 | int ret; | 313 | int ret; |
314 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 314 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
315 | handle_t *handle = NULL; | 315 | handle_t *handle = NULL; |
316 | 316 | ||
317 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 317 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
318 | if (IS_ERR(handle)) { | 318 | if (IS_ERR(handle)) { |
319 | ret = PTR_ERR(handle); | 319 | ret = PTR_ERR(handle); |
320 | mlog_errno(ret); | 320 | mlog_errno(ret); |
321 | goto out; | 321 | goto out; |
322 | } | 322 | } |
323 | 323 | ||
324 | ret = ocfs2_set_inode_size(handle, inode, di_bh, | 324 | ret = ocfs2_set_inode_size(handle, inode, di_bh, |
325 | new_i_size); | 325 | new_i_size); |
326 | if (ret < 0) | 326 | if (ret < 0) |
327 | mlog_errno(ret); | 327 | mlog_errno(ret); |
328 | 328 | ||
329 | ocfs2_commit_trans(osb, handle); | 329 | ocfs2_commit_trans(osb, handle); |
330 | out: | 330 | out: |
331 | return ret; | 331 | return ret; |
332 | } | 332 | } |
333 | 333 | ||
334 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, | 334 | static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, |
335 | struct inode *inode, | 335 | struct inode *inode, |
336 | struct buffer_head *fe_bh, | 336 | struct buffer_head *fe_bh, |
337 | u64 new_i_size) | 337 | u64 new_i_size) |
338 | { | 338 | { |
339 | int status; | 339 | int status; |
340 | handle_t *handle; | 340 | handle_t *handle; |
341 | struct ocfs2_dinode *di; | 341 | struct ocfs2_dinode *di; |
342 | u64 cluster_bytes; | 342 | u64 cluster_bytes; |
343 | 343 | ||
344 | mlog_entry_void(); | 344 | mlog_entry_void(); |
345 | 345 | ||
346 | /* TODO: This needs to actually orphan the inode in this | 346 | /* TODO: This needs to actually orphan the inode in this |
347 | * transaction. */ | 347 | * transaction. */ |
348 | 348 | ||
349 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 349 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
350 | if (IS_ERR(handle)) { | 350 | if (IS_ERR(handle)) { |
351 | status = PTR_ERR(handle); | 351 | status = PTR_ERR(handle); |
352 | mlog_errno(status); | 352 | mlog_errno(status); |
353 | goto out; | 353 | goto out; |
354 | } | 354 | } |
355 | 355 | ||
356 | status = ocfs2_journal_access_di(handle, inode, fe_bh, | 356 | status = ocfs2_journal_access_di(handle, inode, fe_bh, |
357 | OCFS2_JOURNAL_ACCESS_WRITE); | 357 | OCFS2_JOURNAL_ACCESS_WRITE); |
358 | if (status < 0) { | 358 | if (status < 0) { |
359 | mlog_errno(status); | 359 | mlog_errno(status); |
360 | goto out_commit; | 360 | goto out_commit; |
361 | } | 361 | } |
362 | 362 | ||
363 | /* | 363 | /* |
364 | * Do this before setting i_size. | 364 | * Do this before setting i_size. |
365 | */ | 365 | */ |
366 | cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); | 366 | cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); |
367 | status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, | 367 | status = ocfs2_zero_range_for_truncate(inode, handle, new_i_size, |
368 | cluster_bytes); | 368 | cluster_bytes); |
369 | if (status) { | 369 | if (status) { |
370 | mlog_errno(status); | 370 | mlog_errno(status); |
371 | goto out_commit; | 371 | goto out_commit; |
372 | } | 372 | } |
373 | 373 | ||
374 | i_size_write(inode, new_i_size); | 374 | i_size_write(inode, new_i_size); |
375 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 375 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
376 | 376 | ||
377 | di = (struct ocfs2_dinode *) fe_bh->b_data; | 377 | di = (struct ocfs2_dinode *) fe_bh->b_data; |
378 | di->i_size = cpu_to_le64(new_i_size); | 378 | di->i_size = cpu_to_le64(new_i_size); |
379 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); | 379 | di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); |
380 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); | 380 | di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); |
381 | 381 | ||
382 | status = ocfs2_journal_dirty(handle, fe_bh); | 382 | status = ocfs2_journal_dirty(handle, fe_bh); |
383 | if (status < 0) | 383 | if (status < 0) |
384 | mlog_errno(status); | 384 | mlog_errno(status); |
385 | 385 | ||
386 | out_commit: | 386 | out_commit: |
387 | ocfs2_commit_trans(osb, handle); | 387 | ocfs2_commit_trans(osb, handle); |
388 | out: | 388 | out: |
389 | 389 | ||
390 | mlog_exit(status); | 390 | mlog_exit(status); |
391 | return status; | 391 | return status; |
392 | } | 392 | } |
393 | 393 | ||
394 | static int ocfs2_truncate_file(struct inode *inode, | 394 | static int ocfs2_truncate_file(struct inode *inode, |
395 | struct buffer_head *di_bh, | 395 | struct buffer_head *di_bh, |
396 | u64 new_i_size) | 396 | u64 new_i_size) |
397 | { | 397 | { |
398 | int status = 0; | 398 | int status = 0; |
399 | struct ocfs2_dinode *fe = NULL; | 399 | struct ocfs2_dinode *fe = NULL; |
400 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 400 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
401 | struct ocfs2_truncate_context *tc = NULL; | 401 | struct ocfs2_truncate_context *tc = NULL; |
402 | 402 | ||
403 | mlog_entry("(inode = %llu, new_i_size = %llu\n", | 403 | mlog_entry("(inode = %llu, new_i_size = %llu\n", |
404 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 404 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
405 | (unsigned long long)new_i_size); | 405 | (unsigned long long)new_i_size); |
406 | 406 | ||
407 | /* We trust di_bh because it comes from ocfs2_inode_lock(), which | 407 | /* We trust di_bh because it comes from ocfs2_inode_lock(), which |
408 | * already validated it */ | 408 | * already validated it */ |
409 | fe = (struct ocfs2_dinode *) di_bh->b_data; | 409 | fe = (struct ocfs2_dinode *) di_bh->b_data; |
410 | 410 | ||
411 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), | 411 | mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), |
412 | "Inode %llu, inode i_size = %lld != di " | 412 | "Inode %llu, inode i_size = %lld != di " |
413 | "i_size = %llu, i_flags = 0x%x\n", | 413 | "i_size = %llu, i_flags = 0x%x\n", |
414 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 414 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
415 | i_size_read(inode), | 415 | i_size_read(inode), |
416 | (unsigned long long)le64_to_cpu(fe->i_size), | 416 | (unsigned long long)le64_to_cpu(fe->i_size), |
417 | le32_to_cpu(fe->i_flags)); | 417 | le32_to_cpu(fe->i_flags)); |
418 | 418 | ||
419 | if (new_i_size > le64_to_cpu(fe->i_size)) { | 419 | if (new_i_size > le64_to_cpu(fe->i_size)) { |
420 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", | 420 | mlog(0, "asked to truncate file with size (%llu) to size (%llu)!\n", |
421 | (unsigned long long)le64_to_cpu(fe->i_size), | 421 | (unsigned long long)le64_to_cpu(fe->i_size), |
422 | (unsigned long long)new_i_size); | 422 | (unsigned long long)new_i_size); |
423 | status = -EINVAL; | 423 | status = -EINVAL; |
424 | mlog_errno(status); | 424 | mlog_errno(status); |
425 | goto bail; | 425 | goto bail; |
426 | } | 426 | } |
427 | 427 | ||
428 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", | 428 | mlog(0, "inode %llu, i_size = %llu, new_i_size = %llu\n", |
429 | (unsigned long long)le64_to_cpu(fe->i_blkno), | 429 | (unsigned long long)le64_to_cpu(fe->i_blkno), |
430 | (unsigned long long)le64_to_cpu(fe->i_size), | 430 | (unsigned long long)le64_to_cpu(fe->i_size), |
431 | (unsigned long long)new_i_size); | 431 | (unsigned long long)new_i_size); |
432 | 432 | ||
433 | /* lets handle the simple truncate cases before doing any more | 433 | /* lets handle the simple truncate cases before doing any more |
434 | * cluster locking. */ | 434 | * cluster locking. */ |
435 | if (new_i_size == le64_to_cpu(fe->i_size)) | 435 | if (new_i_size == le64_to_cpu(fe->i_size)) |
436 | goto bail; | 436 | goto bail; |
437 | 437 | ||
438 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 438 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
439 | 439 | ||
440 | /* | 440 | /* |
441 | * The inode lock forced other nodes to sync and drop their | 441 | * The inode lock forced other nodes to sync and drop their |
442 | * pages, which (correctly) happens even if we have a truncate | 442 | * pages, which (correctly) happens even if we have a truncate |
443 | * without allocation change - ocfs2 cluster sizes can be much | 443 | * without allocation change - ocfs2 cluster sizes can be much |
444 | * greater than page size, so we have to truncate them | 444 | * greater than page size, so we have to truncate them |
445 | * anyway. | 445 | * anyway. |
446 | */ | 446 | */ |
447 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); | 447 | unmap_mapping_range(inode->i_mapping, new_i_size + PAGE_SIZE - 1, 0, 1); |
448 | truncate_inode_pages(inode->i_mapping, new_i_size); | 448 | truncate_inode_pages(inode->i_mapping, new_i_size); |
449 | 449 | ||
450 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 450 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
451 | status = ocfs2_truncate_inline(inode, di_bh, new_i_size, | 451 | status = ocfs2_truncate_inline(inode, di_bh, new_i_size, |
452 | i_size_read(inode), 1); | 452 | i_size_read(inode), 1); |
453 | if (status) | 453 | if (status) |
454 | mlog_errno(status); | 454 | mlog_errno(status); |
455 | 455 | ||
456 | goto bail_unlock_sem; | 456 | goto bail_unlock_sem; |
457 | } | 457 | } |
458 | 458 | ||
459 | /* alright, we're going to need to do a full blown alloc size | 459 | /* alright, we're going to need to do a full blown alloc size |
460 | * change. Orphan the inode so that recovery can complete the | 460 | * change. Orphan the inode so that recovery can complete the |
461 | * truncate if necessary. This does the task of marking | 461 | * truncate if necessary. This does the task of marking |
462 | * i_size. */ | 462 | * i_size. */ |
463 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); | 463 | status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size); |
464 | if (status < 0) { | 464 | if (status < 0) { |
465 | mlog_errno(status); | 465 | mlog_errno(status); |
466 | goto bail_unlock_sem; | 466 | goto bail_unlock_sem; |
467 | } | 467 | } |
468 | 468 | ||
469 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); | 469 | status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc); |
470 | if (status < 0) { | 470 | if (status < 0) { |
471 | mlog_errno(status); | 471 | mlog_errno(status); |
472 | goto bail_unlock_sem; | 472 | goto bail_unlock_sem; |
473 | } | 473 | } |
474 | 474 | ||
475 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); | 475 | status = ocfs2_commit_truncate(osb, inode, di_bh, tc); |
476 | if (status < 0) { | 476 | if (status < 0) { |
477 | mlog_errno(status); | 477 | mlog_errno(status); |
478 | goto bail_unlock_sem; | 478 | goto bail_unlock_sem; |
479 | } | 479 | } |
480 | 480 | ||
481 | /* TODO: orphan dir cleanup here. */ | 481 | /* TODO: orphan dir cleanup here. */ |
482 | bail_unlock_sem: | 482 | bail_unlock_sem: |
483 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 483 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
484 | 484 | ||
485 | bail: | 485 | bail: |
486 | 486 | ||
487 | mlog_exit(status); | 487 | mlog_exit(status); |
488 | return status; | 488 | return status; |
489 | } | 489 | } |
490 | 490 | ||
491 | /* | 491 | /* |
492 | * extend file allocation only here. | 492 | * extend file allocation only here. |
493 | * we'll update all the disk stuff, and oip->alloc_size | 493 | * we'll update all the disk stuff, and oip->alloc_size |
494 | * | 494 | * |
495 | * expect stuff to be locked, a transaction started and enough data / | 495 | * expect stuff to be locked, a transaction started and enough data / |
496 | * metadata reservations in the contexts. | 496 | * metadata reservations in the contexts. |
497 | * | 497 | * |
498 | * Will return -EAGAIN, and a reason if a restart is needed. | 498 | * Will return -EAGAIN, and a reason if a restart is needed. |
499 | * If passed in, *reason will always be set, even in error. | 499 | * If passed in, *reason will always be set, even in error. |
500 | */ | 500 | */ |
501 | int ocfs2_add_inode_data(struct ocfs2_super *osb, | 501 | int ocfs2_add_inode_data(struct ocfs2_super *osb, |
502 | struct inode *inode, | 502 | struct inode *inode, |
503 | u32 *logical_offset, | 503 | u32 *logical_offset, |
504 | u32 clusters_to_add, | 504 | u32 clusters_to_add, |
505 | int mark_unwritten, | 505 | int mark_unwritten, |
506 | struct buffer_head *fe_bh, | 506 | struct buffer_head *fe_bh, |
507 | handle_t *handle, | 507 | handle_t *handle, |
508 | struct ocfs2_alloc_context *data_ac, | 508 | struct ocfs2_alloc_context *data_ac, |
509 | struct ocfs2_alloc_context *meta_ac, | 509 | struct ocfs2_alloc_context *meta_ac, |
510 | enum ocfs2_alloc_restarted *reason_ret) | 510 | enum ocfs2_alloc_restarted *reason_ret) |
511 | { | 511 | { |
512 | int ret; | 512 | int ret; |
513 | struct ocfs2_extent_tree et; | 513 | struct ocfs2_extent_tree et; |
514 | 514 | ||
515 | ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); | 515 | ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); |
516 | ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, | 516 | ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, |
517 | clusters_to_add, mark_unwritten, | 517 | clusters_to_add, mark_unwritten, |
518 | &et, handle, | 518 | &et, handle, |
519 | data_ac, meta_ac, reason_ret); | 519 | data_ac, meta_ac, reason_ret); |
520 | 520 | ||
521 | return ret; | 521 | return ret; |
522 | } | 522 | } |
523 | 523 | ||
/*
 * Add @clusters_to_add clusters to @inode's allocation, starting at
 * logical cluster @logical_start.  Acquires the allocators, starts a
 * journal transaction and reserves quota internally, looping via the
 * restart_all / restarted_transaction labels until the full request is
 * satisfied or an error occurs.  Returns 0 or a negative errno.
 */
static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start,
				     u32 clusters_to_add, int mark_unwritten)
{
	int status = 0;
	int restart_func = 0;	/* set when allocators must be re-taken */
	int credits;
	u32 prev_clusters;
	struct buffer_head *bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	handle_t *handle = NULL;
	struct ocfs2_alloc_context *data_ac = NULL;
	struct ocfs2_alloc_context *meta_ac = NULL;
	enum ocfs2_alloc_restarted why;
	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
	struct ocfs2_extent_tree et;
	int did_quota = 0;	/* nonzero while a quota reservation is held */

	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);

	/*
	 * This function only exists for file systems which don't
	 * support holes.
	 */
	BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb));

	status = ocfs2_read_inode_block(inode, &bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}
	fe = (struct ocfs2_dinode *) bh->b_data;

restart_all:
	/* In-memory and on-disk cluster counts must agree here. */
	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);

	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
	     "clusters_to_add = %u\n",
	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
	     (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters),
	     clusters_to_add);
	ocfs2_init_dinode_extent_tree(&et, inode, bh);
	status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0,
				       &data_ac, &meta_ac);
	if (status) {
		mlog_errno(status);
		goto leave;
	}

	credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list,
					    clusters_to_add);
	handle = ocfs2_start_trans(osb, credits);
	if (IS_ERR(handle)) {
		status = PTR_ERR(handle);
		handle = NULL;
		mlog_errno(status);
		goto leave;
	}

restarted_transaction:
	/* Reserve quota up front for the whole remaining request; any
	 * unused part is released after the allocation below. */
	if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb,
	    clusters_to_add))) {
		status = -EDQUOT;
		goto leave;
	}
	did_quota = 1;

	/* reserve a write to the file entry early on - that we if we
	 * run out of credits in the allocation path, we can still
	 * update i_size. */
	status = ocfs2_journal_access_di(handle, inode, bh,
					 OCFS2_JOURNAL_ACCESS_WRITE);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	prev_clusters = OCFS2_I(inode)->ip_clusters;

	/* May make partial progress and ask for a restart via @why. */
	status = ocfs2_add_inode_data(osb,
				      inode,
				      &logical_start,
				      clusters_to_add,
				      mark_unwritten,
				      bh,
				      handle,
				      data_ac,
				      meta_ac,
				      &why);
	if ((status < 0) && (status != -EAGAIN)) {
		if (status != -ENOSPC)
			mlog_errno(status);
		goto leave;
	}

	status = ocfs2_journal_dirty(handle, bh);
	if (status < 0) {
		mlog_errno(status);
		goto leave;
	}

	/* Account for whatever was actually allocated this pass. */
	spin_lock(&OCFS2_I(inode)->ip_lock);
	clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters);
	spin_unlock(&OCFS2_I(inode)->ip_lock);
	/* Release unused quota reservation */
	vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	did_quota = 0;

	if (why != RESTART_NONE && clusters_to_add) {
		if (why == RESTART_META) {
			/* Need fresh allocators: commit and loop back to
			 * restart_all from the cleanup path below. */
			mlog(0, "restarting function.\n");
			restart_func = 1;
		} else {
			BUG_ON(why != RESTART_TRANS);

			mlog(0, "restarting transaction.\n");
			/* TODO: This can be more intelligent. */
			credits = ocfs2_calc_extend_credits(osb->sb,
							    &fe->id2.i_list,
							    clusters_to_add);
			status = ocfs2_extend_trans(handle, credits);
			if (status < 0) {
				/* handle still has to be committed at
				 * this point. */
				status = -ENOMEM;
				mlog_errno(status);
				goto leave;
			}
			goto restarted_transaction;
		}
	}

	mlog(0, "fe: i_clusters = %u, i_size=%llu\n",
	     le32_to_cpu(fe->i_clusters),
	     (unsigned long long)le64_to_cpu(fe->i_size));
	mlog(0, "inode: ip_clusters=%u, i_size=%lld\n",
	     OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode));

leave:
	/* Unwind in reverse order: quota, transaction, allocators. */
	if (status < 0 && did_quota)
		vfs_dq_free_space(inode,
			ocfs2_clusters_to_bytes(osb->sb, clusters_to_add));
	if (handle) {
		ocfs2_commit_trans(osb, handle);
		handle = NULL;
	}
	if (data_ac) {
		ocfs2_free_alloc_context(data_ac);
		data_ac = NULL;
	}
	if (meta_ac) {
		ocfs2_free_alloc_context(meta_ac);
		meta_ac = NULL;
	}
	/* RESTART_META path: everything is torn down, go take the
	 * allocators and a new transaction again. */
	if ((!status) && restart_func) {
		restart_func = 0;
		goto restart_all;
	}
	brelse(bh);
	bh = NULL;

	mlog_exit(status);
	return status;
}
688 | 688 | ||
/* Some parts of this taken from generic_cont_expand, which turned out
 * to be too fragile to do exactly what we need without us having to
 * worry about recursive locking in ->write_begin() and ->write_end().
 *
 * Zero the tail of the block containing byte offset @size by doing a
 * zero-length prepare/commit on that page.  Deliberately does NOT
 * update i_size.  Returns 0 or a negative errno.
 */
static int ocfs2_write_zero_page(struct inode *inode,
				 u64 size)
{
	struct address_space *mapping = inode->i_mapping;
	struct page *page;
	unsigned long index;
	unsigned int offset;
	handle_t *handle = NULL;
	int ret;

	offset = (size & (PAGE_CACHE_SIZE-1)); /* Within page */
	/* ugh. in prepare/commit_write, if from==to==start of block, we
	** skip the prepare. make sure we never send an offset for the start
	** of a block
	*/
	if ((offset & (inode->i_sb->s_blocksize - 1)) == 0) {
		offset++;
	}
	index = size >> PAGE_CACHE_SHIFT;

	page = grab_cache_page(mapping, index);
	if (!page) {
		ret = -ENOMEM;
		mlog_errno(ret);
		goto out;
	}

	/* from == to == offset: zero-length write that still maps and
	 * zeroes the enclosing block as a side effect. */
	ret = ocfs2_prepare_write_nolock(inode, page, offset, offset);
	if (ret < 0) {
		mlog_errno(ret);
		goto out_unlock;
	}

	/* Ordered-data mode needs the page walk inside a transaction. */
	if (ocfs2_should_order_data(inode)) {
		handle = ocfs2_start_walk_page_trans(inode, page, offset,
						     offset);
		if (IS_ERR(handle)) {
			ret = PTR_ERR(handle);
			handle = NULL;
			goto out_unlock;
		}
	}

	/* must not update i_size! */
	ret = block_commit_write(page, offset, offset);
	if (ret < 0)
		mlog_errno(ret);
	else
		ret = 0;

	if (handle)
		ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out_unlock:
	unlock_page(page);
	page_cache_release(page);
out:
	return ret;
}
750 | 750 | ||
751 | static int ocfs2_zero_extend(struct inode *inode, | 751 | static int ocfs2_zero_extend(struct inode *inode, |
752 | u64 zero_to_size) | 752 | u64 zero_to_size) |
753 | { | 753 | { |
754 | int ret = 0; | 754 | int ret = 0; |
755 | u64 start_off; | 755 | u64 start_off; |
756 | struct super_block *sb = inode->i_sb; | 756 | struct super_block *sb = inode->i_sb; |
757 | 757 | ||
758 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); | 758 | start_off = ocfs2_align_bytes_to_blocks(sb, i_size_read(inode)); |
759 | while (start_off < zero_to_size) { | 759 | while (start_off < zero_to_size) { |
760 | ret = ocfs2_write_zero_page(inode, start_off); | 760 | ret = ocfs2_write_zero_page(inode, start_off); |
761 | if (ret < 0) { | 761 | if (ret < 0) { |
762 | mlog_errno(ret); | 762 | mlog_errno(ret); |
763 | goto out; | 763 | goto out; |
764 | } | 764 | } |
765 | 765 | ||
766 | start_off += sb->s_blocksize; | 766 | start_off += sb->s_blocksize; |
767 | 767 | ||
768 | /* | 768 | /* |
769 | * Very large extends have the potential to lock up | 769 | * Very large extends have the potential to lock up |
770 | * the cpu for extended periods of time. | 770 | * the cpu for extended periods of time. |
771 | */ | 771 | */ |
772 | cond_resched(); | 772 | cond_resched(); |
773 | } | 773 | } |
774 | 774 | ||
775 | out: | 775 | out: |
776 | return ret; | 776 | return ret; |
777 | } | 777 | } |
778 | 778 | ||
779 | int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) | 779 | int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to) |
780 | { | 780 | { |
781 | int ret; | 781 | int ret; |
782 | u32 clusters_to_add; | 782 | u32 clusters_to_add; |
783 | struct ocfs2_inode_info *oi = OCFS2_I(inode); | 783 | struct ocfs2_inode_info *oi = OCFS2_I(inode); |
784 | 784 | ||
785 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); | 785 | clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); |
786 | if (clusters_to_add < oi->ip_clusters) | 786 | if (clusters_to_add < oi->ip_clusters) |
787 | clusters_to_add = 0; | 787 | clusters_to_add = 0; |
788 | else | 788 | else |
789 | clusters_to_add -= oi->ip_clusters; | 789 | clusters_to_add -= oi->ip_clusters; |
790 | 790 | ||
791 | if (clusters_to_add) { | 791 | if (clusters_to_add) { |
792 | ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, | 792 | ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, |
793 | clusters_to_add, 0); | 793 | clusters_to_add, 0); |
794 | if (ret) { | 794 | if (ret) { |
795 | mlog_errno(ret); | 795 | mlog_errno(ret); |
796 | goto out; | 796 | goto out; |
797 | } | 797 | } |
798 | } | 798 | } |
799 | 799 | ||
800 | /* | 800 | /* |
801 | * Call this even if we don't add any clusters to the tree. We | 801 | * Call this even if we don't add any clusters to the tree. We |
802 | * still need to zero the area between the old i_size and the | 802 | * still need to zero the area between the old i_size and the |
803 | * new i_size. | 803 | * new i_size. |
804 | */ | 804 | */ |
805 | ret = ocfs2_zero_extend(inode, zero_to); | 805 | ret = ocfs2_zero_extend(inode, zero_to); |
806 | if (ret < 0) | 806 | if (ret < 0) |
807 | mlog_errno(ret); | 807 | mlog_errno(ret); |
808 | 808 | ||
809 | out: | 809 | out: |
810 | return ret; | 810 | return ret; |
811 | } | 811 | } |
812 | 812 | ||
/*
 * Grow @inode to @new_i_size bytes, converting inline data to extents
 * or allocating clusters (no-holes filesystems) as needed, then update
 * the disk inode's size.  Caller holds i_mutex and the cluster lock
 * covering @di_bh.  Returns 0 or a negative errno.
 */
static int ocfs2_extend_file(struct inode *inode,
			     struct buffer_head *di_bh,
			     u64 new_i_size)
{
	int ret = 0;
	struct ocfs2_inode_info *oi = OCFS2_I(inode);

	BUG_ON(!di_bh);

	/* setattr sometimes calls us like this. */
	if (new_i_size == 0)
		goto out;

	if (i_size_read(inode) == new_i_size)
		goto out;
	BUG_ON(new_i_size < i_size_read(inode));

	/*
	 * Fall through for converting inline data, even if the fs
	 * supports sparse files.
	 *
	 * The check for inline data here is legal - nobody can add
	 * the feature since we have i_mutex. We must check it again
	 * after acquiring ip_alloc_sem though, as paths like mmap
	 * might have raced us to converting the inode to extents.
	 */
	if (!(oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
	    && ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		goto out_update_size;

	/*
	 * The alloc sem blocks people in read/write from reading our
	 * allocation until we're done changing it. We depend on
	 * i_mutex to block other extend/truncate calls while we're
	 * here.
	 */
	down_write(&oi->ip_alloc_sem);

	if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
		/*
		 * We can optimize small extends by keeping the inodes
		 * inline data.
		 */
		if (ocfs2_size_fits_inline_data(di_bh, new_i_size)) {
			up_write(&oi->ip_alloc_sem);
			goto out_update_size;
		}

		ret = ocfs2_convert_inline_data_to_extents(inode, di_bh);
		if (ret) {
			up_write(&oi->ip_alloc_sem);

			mlog_errno(ret);
			goto out;
		}
	}

	/* No-holes filesystems must back the whole new size with
	 * allocated (and zeroed) clusters. */
	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
		ret = ocfs2_extend_no_holes(inode, new_i_size, new_i_size);

	up_write(&oi->ip_alloc_sem);

	if (ret < 0) {
		mlog_errno(ret);
		goto out;
	}

out_update_size:
	/* Finally record the new size in the dinode. */
	ret = ocfs2_simple_size_update(inode, di_bh, new_i_size);
	if (ret < 0)
		mlog_errno(ret);

out:
	return ret;
}
888 | 888 | ||
889 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) | 889 | int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) |
890 | { | 890 | { |
891 | int status = 0, size_change; | 891 | int status = 0, size_change; |
892 | struct inode *inode = dentry->d_inode; | 892 | struct inode *inode = dentry->d_inode; |
893 | struct super_block *sb = inode->i_sb; | 893 | struct super_block *sb = inode->i_sb; |
894 | struct ocfs2_super *osb = OCFS2_SB(sb); | 894 | struct ocfs2_super *osb = OCFS2_SB(sb); |
895 | struct buffer_head *bh = NULL; | 895 | struct buffer_head *bh = NULL; |
896 | handle_t *handle = NULL; | 896 | handle_t *handle = NULL; |
897 | int locked[MAXQUOTAS] = {0, 0}; | 897 | int locked[MAXQUOTAS] = {0, 0}; |
898 | int credits, qtype; | 898 | int credits, qtype; |
899 | struct ocfs2_mem_dqinfo *oinfo; | 899 | struct ocfs2_mem_dqinfo *oinfo; |
900 | 900 | ||
901 | mlog_entry("(0x%p, '%.*s')\n", dentry, | 901 | mlog_entry("(0x%p, '%.*s')\n", dentry, |
902 | dentry->d_name.len, dentry->d_name.name); | 902 | dentry->d_name.len, dentry->d_name.name); |
903 | 903 | ||
904 | /* ensuring we don't even attempt to truncate a symlink */ | 904 | /* ensuring we don't even attempt to truncate a symlink */ |
905 | if (S_ISLNK(inode->i_mode)) | 905 | if (S_ISLNK(inode->i_mode)) |
906 | attr->ia_valid &= ~ATTR_SIZE; | 906 | attr->ia_valid &= ~ATTR_SIZE; |
907 | 907 | ||
908 | if (attr->ia_valid & ATTR_MODE) | 908 | if (attr->ia_valid & ATTR_MODE) |
909 | mlog(0, "mode change: %d\n", attr->ia_mode); | 909 | mlog(0, "mode change: %d\n", attr->ia_mode); |
910 | if (attr->ia_valid & ATTR_UID) | 910 | if (attr->ia_valid & ATTR_UID) |
911 | mlog(0, "uid change: %d\n", attr->ia_uid); | 911 | mlog(0, "uid change: %d\n", attr->ia_uid); |
912 | if (attr->ia_valid & ATTR_GID) | 912 | if (attr->ia_valid & ATTR_GID) |
913 | mlog(0, "gid change: %d\n", attr->ia_gid); | 913 | mlog(0, "gid change: %d\n", attr->ia_gid); |
914 | if (attr->ia_valid & ATTR_SIZE) | 914 | if (attr->ia_valid & ATTR_SIZE) |
915 | mlog(0, "size change...\n"); | 915 | mlog(0, "size change...\n"); |
916 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) | 916 | if (attr->ia_valid & (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME)) |
917 | mlog(0, "time change...\n"); | 917 | mlog(0, "time change...\n"); |
918 | 918 | ||
919 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ | 919 | #define OCFS2_VALID_ATTRS (ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_SIZE \ |
920 | | ATTR_GID | ATTR_UID | ATTR_MODE) | 920 | | ATTR_GID | ATTR_UID | ATTR_MODE) |
921 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { | 921 | if (!(attr->ia_valid & OCFS2_VALID_ATTRS)) { |
922 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); | 922 | mlog(0, "can't handle attrs: 0x%x\n", attr->ia_valid); |
923 | return 0; | 923 | return 0; |
924 | } | 924 | } |
925 | 925 | ||
926 | status = inode_change_ok(inode, attr); | 926 | status = inode_change_ok(inode, attr); |
927 | if (status) | 927 | if (status) |
928 | return status; | 928 | return status; |
929 | 929 | ||
930 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; | 930 | size_change = S_ISREG(inode->i_mode) && attr->ia_valid & ATTR_SIZE; |
931 | if (size_change) { | 931 | if (size_change) { |
932 | status = ocfs2_rw_lock(inode, 1); | 932 | status = ocfs2_rw_lock(inode, 1); |
933 | if (status < 0) { | 933 | if (status < 0) { |
934 | mlog_errno(status); | 934 | mlog_errno(status); |
935 | goto bail; | 935 | goto bail; |
936 | } | 936 | } |
937 | } | 937 | } |
938 | 938 | ||
939 | status = ocfs2_inode_lock(inode, &bh, 1); | 939 | status = ocfs2_inode_lock(inode, &bh, 1); |
940 | if (status < 0) { | 940 | if (status < 0) { |
941 | if (status != -ENOENT) | 941 | if (status != -ENOENT) |
942 | mlog_errno(status); | 942 | mlog_errno(status); |
943 | goto bail_unlock_rw; | 943 | goto bail_unlock_rw; |
944 | } | 944 | } |
945 | 945 | ||
946 | if (size_change && attr->ia_size != i_size_read(inode)) { | 946 | if (size_change && attr->ia_size != i_size_read(inode)) { |
947 | if (attr->ia_size > sb->s_maxbytes) { | 947 | if (attr->ia_size > sb->s_maxbytes) { |
948 | status = -EFBIG; | 948 | status = -EFBIG; |
949 | goto bail_unlock; | 949 | goto bail_unlock; |
950 | } | 950 | } |
951 | 951 | ||
952 | if (i_size_read(inode) > attr->ia_size) { | 952 | if (i_size_read(inode) > attr->ia_size) { |
953 | if (ocfs2_should_order_data(inode)) { | 953 | if (ocfs2_should_order_data(inode)) { |
954 | status = ocfs2_begin_ordered_truncate(inode, | 954 | status = ocfs2_begin_ordered_truncate(inode, |
955 | attr->ia_size); | 955 | attr->ia_size); |
956 | if (status) | 956 | if (status) |
957 | goto bail_unlock; | 957 | goto bail_unlock; |
958 | } | 958 | } |
959 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); | 959 | status = ocfs2_truncate_file(inode, bh, attr->ia_size); |
960 | } else | 960 | } else |
961 | status = ocfs2_extend_file(inode, bh, attr->ia_size); | 961 | status = ocfs2_extend_file(inode, bh, attr->ia_size); |
962 | if (status < 0) { | 962 | if (status < 0) { |
963 | if (status != -ENOSPC) | 963 | if (status != -ENOSPC) |
964 | mlog_errno(status); | 964 | mlog_errno(status); |
965 | status = -ENOSPC; | 965 | status = -ENOSPC; |
966 | goto bail_unlock; | 966 | goto bail_unlock; |
967 | } | 967 | } |
968 | } | 968 | } |
969 | 969 | ||
970 | if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || | 970 | if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || |
971 | (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { | 971 | (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { |
972 | credits = OCFS2_INODE_UPDATE_CREDITS; | 972 | credits = OCFS2_INODE_UPDATE_CREDITS; |
973 | if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid | 973 | if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid |
974 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, | 974 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, |
975 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { | 975 | OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { |
976 | oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; | 976 | oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; |
977 | status = ocfs2_lock_global_qf(oinfo, 1); | 977 | status = ocfs2_lock_global_qf(oinfo, 1); |
978 | if (status < 0) | 978 | if (status < 0) |
979 | goto bail_unlock; | 979 | goto bail_unlock; |
980 | credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + | 980 | credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + |
981 | ocfs2_calc_qdel_credits(sb, USRQUOTA); | 981 | ocfs2_calc_qdel_credits(sb, USRQUOTA); |
982 | locked[USRQUOTA] = 1; | 982 | locked[USRQUOTA] = 1; |
983 | } | 983 | } |
984 | if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid | 984 | if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid |
985 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, | 985 | && OCFS2_HAS_RO_COMPAT_FEATURE(sb, |
986 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { | 986 | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { |
987 | oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; | 987 | oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; |
988 | status = ocfs2_lock_global_qf(oinfo, 1); | 988 | status = ocfs2_lock_global_qf(oinfo, 1); |
989 | if (status < 0) | 989 | if (status < 0) |
990 | goto bail_unlock; | 990 | goto bail_unlock; |
991 | credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + | 991 | credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + |
992 | ocfs2_calc_qdel_credits(sb, GRPQUOTA); | 992 | ocfs2_calc_qdel_credits(sb, GRPQUOTA); |
993 | locked[GRPQUOTA] = 1; | 993 | locked[GRPQUOTA] = 1; |
994 | } | 994 | } |
995 | handle = ocfs2_start_trans(osb, credits); | 995 | handle = ocfs2_start_trans(osb, credits); |
996 | if (IS_ERR(handle)) { | 996 | if (IS_ERR(handle)) { |
997 | status = PTR_ERR(handle); | 997 | status = PTR_ERR(handle); |
998 | mlog_errno(status); | 998 | mlog_errno(status); |
999 | goto bail_unlock; | 999 | goto bail_unlock; |
1000 | } | 1000 | } |
1001 | status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; | 1001 | status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; |
1002 | if (status < 0) | 1002 | if (status < 0) |
1003 | goto bail_commit; | 1003 | goto bail_commit; |
1004 | } else { | 1004 | } else { |
1005 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1005 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1006 | if (IS_ERR(handle)) { | 1006 | if (IS_ERR(handle)) { |
1007 | status = PTR_ERR(handle); | 1007 | status = PTR_ERR(handle); |
1008 | mlog_errno(status); | 1008 | mlog_errno(status); |
1009 | goto bail_unlock; | 1009 | goto bail_unlock; |
1010 | } | 1010 | } |
1011 | } | 1011 | } |
1012 | 1012 | ||
1013 | /* | 1013 | /* |
1014 | * This will intentionally not wind up calling vmtruncate(), | 1014 | * This will intentionally not wind up calling vmtruncate(), |
1015 | * since all the work for a size change has been done above. | 1015 | * since all the work for a size change has been done above. |
1016 | * Otherwise, we could get into problems with truncate as | 1016 | * Otherwise, we could get into problems with truncate as |
1017 | * ip_alloc_sem is used there to protect against i_size | 1017 | * ip_alloc_sem is used there to protect against i_size |
1018 | * changes. | 1018 | * changes. |
1019 | */ | 1019 | */ |
1020 | status = inode_setattr(inode, attr); | 1020 | status = inode_setattr(inode, attr); |
1021 | if (status < 0) { | 1021 | if (status < 0) { |
1022 | mlog_errno(status); | 1022 | mlog_errno(status); |
1023 | goto bail_commit; | 1023 | goto bail_commit; |
1024 | } | 1024 | } |
1025 | 1025 | ||
1026 | status = ocfs2_mark_inode_dirty(handle, inode, bh); | 1026 | status = ocfs2_mark_inode_dirty(handle, inode, bh); |
1027 | if (status < 0) | 1027 | if (status < 0) |
1028 | mlog_errno(status); | 1028 | mlog_errno(status); |
1029 | 1029 | ||
1030 | bail_commit: | 1030 | bail_commit: |
1031 | ocfs2_commit_trans(osb, handle); | 1031 | ocfs2_commit_trans(osb, handle); |
1032 | bail_unlock: | 1032 | bail_unlock: |
1033 | for (qtype = 0; qtype < MAXQUOTAS; qtype++) { | 1033 | for (qtype = 0; qtype < MAXQUOTAS; qtype++) { |
1034 | if (!locked[qtype]) | 1034 | if (!locked[qtype]) |
1035 | continue; | 1035 | continue; |
1036 | oinfo = sb_dqinfo(sb, qtype)->dqi_priv; | 1036 | oinfo = sb_dqinfo(sb, qtype)->dqi_priv; |
1037 | ocfs2_unlock_global_qf(oinfo, 1); | 1037 | ocfs2_unlock_global_qf(oinfo, 1); |
1038 | } | 1038 | } |
1039 | ocfs2_inode_unlock(inode, 1); | 1039 | ocfs2_inode_unlock(inode, 1); |
1040 | bail_unlock_rw: | 1040 | bail_unlock_rw: |
1041 | if (size_change) | 1041 | if (size_change) |
1042 | ocfs2_rw_unlock(inode, 1); | 1042 | ocfs2_rw_unlock(inode, 1); |
1043 | bail: | 1043 | bail: |
1044 | brelse(bh); | 1044 | brelse(bh); |
1045 | 1045 | ||
1046 | if (!status && attr->ia_valid & ATTR_MODE) { | 1046 | if (!status && attr->ia_valid & ATTR_MODE) { |
1047 | status = ocfs2_acl_chmod(inode); | 1047 | status = ocfs2_acl_chmod(inode); |
1048 | if (status < 0) | 1048 | if (status < 0) |
1049 | mlog_errno(status); | 1049 | mlog_errno(status); |
1050 | } | 1050 | } |
1051 | 1051 | ||
1052 | mlog_exit(status); | 1052 | mlog_exit(status); |
1053 | return status; | 1053 | return status; |
1054 | } | 1054 | } |
1055 | 1055 | ||
1056 | int ocfs2_getattr(struct vfsmount *mnt, | 1056 | int ocfs2_getattr(struct vfsmount *mnt, |
1057 | struct dentry *dentry, | 1057 | struct dentry *dentry, |
1058 | struct kstat *stat) | 1058 | struct kstat *stat) |
1059 | { | 1059 | { |
1060 | struct inode *inode = dentry->d_inode; | 1060 | struct inode *inode = dentry->d_inode; |
1061 | struct super_block *sb = dentry->d_inode->i_sb; | 1061 | struct super_block *sb = dentry->d_inode->i_sb; |
1062 | struct ocfs2_super *osb = sb->s_fs_info; | 1062 | struct ocfs2_super *osb = sb->s_fs_info; |
1063 | int err; | 1063 | int err; |
1064 | 1064 | ||
1065 | mlog_entry_void(); | 1065 | mlog_entry_void(); |
1066 | 1066 | ||
1067 | err = ocfs2_inode_revalidate(dentry); | 1067 | err = ocfs2_inode_revalidate(dentry); |
1068 | if (err) { | 1068 | if (err) { |
1069 | if (err != -ENOENT) | 1069 | if (err != -ENOENT) |
1070 | mlog_errno(err); | 1070 | mlog_errno(err); |
1071 | goto bail; | 1071 | goto bail; |
1072 | } | 1072 | } |
1073 | 1073 | ||
1074 | generic_fillattr(inode, stat); | 1074 | generic_fillattr(inode, stat); |
1075 | 1075 | ||
1076 | /* We set the blksize from the cluster size for performance */ | 1076 | /* We set the blksize from the cluster size for performance */ |
1077 | stat->blksize = osb->s_clustersize; | 1077 | stat->blksize = osb->s_clustersize; |
1078 | 1078 | ||
1079 | bail: | 1079 | bail: |
1080 | mlog_exit(err); | 1080 | mlog_exit(err); |
1081 | 1081 | ||
1082 | return err; | 1082 | return err; |
1083 | } | 1083 | } |
1084 | 1084 | ||
1085 | int ocfs2_permission(struct inode *inode, int mask) | 1085 | int ocfs2_permission(struct inode *inode, int mask) |
1086 | { | 1086 | { |
1087 | int ret; | 1087 | int ret; |
1088 | 1088 | ||
1089 | mlog_entry_void(); | 1089 | mlog_entry_void(); |
1090 | 1090 | ||
1091 | ret = ocfs2_inode_lock(inode, NULL, 0); | 1091 | ret = ocfs2_inode_lock(inode, NULL, 0); |
1092 | if (ret) { | 1092 | if (ret) { |
1093 | if (ret != -ENOENT) | 1093 | if (ret != -ENOENT) |
1094 | mlog_errno(ret); | 1094 | mlog_errno(ret); |
1095 | goto out; | 1095 | goto out; |
1096 | } | 1096 | } |
1097 | 1097 | ||
1098 | ret = generic_permission(inode, mask, ocfs2_check_acl); | 1098 | ret = generic_permission(inode, mask, ocfs2_check_acl); |
1099 | 1099 | ||
1100 | ocfs2_inode_unlock(inode, 0); | 1100 | ocfs2_inode_unlock(inode, 0); |
1101 | out: | 1101 | out: |
1102 | mlog_exit(ret); | 1102 | mlog_exit(ret); |
1103 | return ret; | 1103 | return ret; |
1104 | } | 1104 | } |
1105 | 1105 | ||
1106 | static int __ocfs2_write_remove_suid(struct inode *inode, | 1106 | static int __ocfs2_write_remove_suid(struct inode *inode, |
1107 | struct buffer_head *bh) | 1107 | struct buffer_head *bh) |
1108 | { | 1108 | { |
1109 | int ret; | 1109 | int ret; |
1110 | handle_t *handle; | 1110 | handle_t *handle; |
1111 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1111 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1112 | struct ocfs2_dinode *di; | 1112 | struct ocfs2_dinode *di; |
1113 | 1113 | ||
1114 | mlog_entry("(Inode %llu, mode 0%o)\n", | 1114 | mlog_entry("(Inode %llu, mode 0%o)\n", |
1115 | (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); | 1115 | (unsigned long long)OCFS2_I(inode)->ip_blkno, inode->i_mode); |
1116 | 1116 | ||
1117 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1117 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1118 | if (IS_ERR(handle)) { | 1118 | if (IS_ERR(handle)) { |
1119 | ret = PTR_ERR(handle); | 1119 | ret = PTR_ERR(handle); |
1120 | mlog_errno(ret); | 1120 | mlog_errno(ret); |
1121 | goto out; | 1121 | goto out; |
1122 | } | 1122 | } |
1123 | 1123 | ||
1124 | ret = ocfs2_journal_access_di(handle, inode, bh, | 1124 | ret = ocfs2_journal_access_di(handle, inode, bh, |
1125 | OCFS2_JOURNAL_ACCESS_WRITE); | 1125 | OCFS2_JOURNAL_ACCESS_WRITE); |
1126 | if (ret < 0) { | 1126 | if (ret < 0) { |
1127 | mlog_errno(ret); | 1127 | mlog_errno(ret); |
1128 | goto out_trans; | 1128 | goto out_trans; |
1129 | } | 1129 | } |
1130 | 1130 | ||
1131 | inode->i_mode &= ~S_ISUID; | 1131 | inode->i_mode &= ~S_ISUID; |
1132 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) | 1132 | if ((inode->i_mode & S_ISGID) && (inode->i_mode & S_IXGRP)) |
1133 | inode->i_mode &= ~S_ISGID; | 1133 | inode->i_mode &= ~S_ISGID; |
1134 | 1134 | ||
1135 | di = (struct ocfs2_dinode *) bh->b_data; | 1135 | di = (struct ocfs2_dinode *) bh->b_data; |
1136 | di->i_mode = cpu_to_le16(inode->i_mode); | 1136 | di->i_mode = cpu_to_le16(inode->i_mode); |
1137 | 1137 | ||
1138 | ret = ocfs2_journal_dirty(handle, bh); | 1138 | ret = ocfs2_journal_dirty(handle, bh); |
1139 | if (ret < 0) | 1139 | if (ret < 0) |
1140 | mlog_errno(ret); | 1140 | mlog_errno(ret); |
1141 | 1141 | ||
1142 | out_trans: | 1142 | out_trans: |
1143 | ocfs2_commit_trans(osb, handle); | 1143 | ocfs2_commit_trans(osb, handle); |
1144 | out: | 1144 | out: |
1145 | mlog_exit(ret); | 1145 | mlog_exit(ret); |
1146 | return ret; | 1146 | return ret; |
1147 | } | 1147 | } |
1148 | 1148 | ||
1149 | /* | 1149 | /* |
1150 | * Will look for holes and unwritten extents in the range starting at | 1150 | * Will look for holes and unwritten extents in the range starting at |
1151 | * pos for count bytes (inclusive). | 1151 | * pos for count bytes (inclusive). |
1152 | */ | 1152 | */ |
1153 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, | 1153 | static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos, |
1154 | size_t count) | 1154 | size_t count) |
1155 | { | 1155 | { |
1156 | int ret = 0; | 1156 | int ret = 0; |
1157 | unsigned int extent_flags; | 1157 | unsigned int extent_flags; |
1158 | u32 cpos, clusters, extent_len, phys_cpos; | 1158 | u32 cpos, clusters, extent_len, phys_cpos; |
1159 | struct super_block *sb = inode->i_sb; | 1159 | struct super_block *sb = inode->i_sb; |
1160 | 1160 | ||
1161 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; | 1161 | cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits; |
1162 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; | 1162 | clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos; |
1163 | 1163 | ||
1164 | while (clusters) { | 1164 | while (clusters) { |
1165 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, | 1165 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len, |
1166 | &extent_flags); | 1166 | &extent_flags); |
1167 | if (ret < 0) { | 1167 | if (ret < 0) { |
1168 | mlog_errno(ret); | 1168 | mlog_errno(ret); |
1169 | goto out; | 1169 | goto out; |
1170 | } | 1170 | } |
1171 | 1171 | ||
1172 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { | 1172 | if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) { |
1173 | ret = 1; | 1173 | ret = 1; |
1174 | break; | 1174 | break; |
1175 | } | 1175 | } |
1176 | 1176 | ||
1177 | if (extent_len > clusters) | 1177 | if (extent_len > clusters) |
1178 | extent_len = clusters; | 1178 | extent_len = clusters; |
1179 | 1179 | ||
1180 | clusters -= extent_len; | 1180 | clusters -= extent_len; |
1181 | cpos += extent_len; | 1181 | cpos += extent_len; |
1182 | } | 1182 | } |
1183 | out: | 1183 | out: |
1184 | return ret; | 1184 | return ret; |
1185 | } | 1185 | } |
1186 | 1186 | ||
1187 | static int ocfs2_write_remove_suid(struct inode *inode) | 1187 | static int ocfs2_write_remove_suid(struct inode *inode) |
1188 | { | 1188 | { |
1189 | int ret; | 1189 | int ret; |
1190 | struct buffer_head *bh = NULL; | 1190 | struct buffer_head *bh = NULL; |
1191 | 1191 | ||
1192 | ret = ocfs2_read_inode_block(inode, &bh); | 1192 | ret = ocfs2_read_inode_block(inode, &bh); |
1193 | if (ret < 0) { | 1193 | if (ret < 0) { |
1194 | mlog_errno(ret); | 1194 | mlog_errno(ret); |
1195 | goto out; | 1195 | goto out; |
1196 | } | 1196 | } |
1197 | 1197 | ||
1198 | ret = __ocfs2_write_remove_suid(inode, bh); | 1198 | ret = __ocfs2_write_remove_suid(inode, bh); |
1199 | out: | 1199 | out: |
1200 | brelse(bh); | 1200 | brelse(bh); |
1201 | return ret; | 1201 | return ret; |
1202 | } | 1202 | } |
1203 | 1203 | ||
1204 | /* | 1204 | /* |
1205 | * Allocate enough extents to cover the region starting at byte offset | 1205 | * Allocate enough extents to cover the region starting at byte offset |
1206 | * start for len bytes. Existing extents are skipped, any extents | 1206 | * start for len bytes. Existing extents are skipped, any extents |
1207 | * added are marked as "unwritten". | 1207 | * added are marked as "unwritten". |
1208 | */ | 1208 | */ |
1209 | static int ocfs2_allocate_unwritten_extents(struct inode *inode, | 1209 | static int ocfs2_allocate_unwritten_extents(struct inode *inode, |
1210 | u64 start, u64 len) | 1210 | u64 start, u64 len) |
1211 | { | 1211 | { |
1212 | int ret; | 1212 | int ret; |
1213 | u32 cpos, phys_cpos, clusters, alloc_size; | 1213 | u32 cpos, phys_cpos, clusters, alloc_size; |
1214 | u64 end = start + len; | 1214 | u64 end = start + len; |
1215 | struct buffer_head *di_bh = NULL; | 1215 | struct buffer_head *di_bh = NULL; |
1216 | 1216 | ||
1217 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 1217 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
1218 | ret = ocfs2_read_inode_block(inode, &di_bh); | 1218 | ret = ocfs2_read_inode_block(inode, &di_bh); |
1219 | if (ret) { | 1219 | if (ret) { |
1220 | mlog_errno(ret); | 1220 | mlog_errno(ret); |
1221 | goto out; | 1221 | goto out; |
1222 | } | 1222 | } |
1223 | 1223 | ||
1224 | /* | 1224 | /* |
1225 | * Nothing to do if the requested reservation range | 1225 | * Nothing to do if the requested reservation range |
1226 | * fits within the inode. | 1226 | * fits within the inode. |
1227 | */ | 1227 | */ |
1228 | if (ocfs2_size_fits_inline_data(di_bh, end)) | 1228 | if (ocfs2_size_fits_inline_data(di_bh, end)) |
1229 | goto out; | 1229 | goto out; |
1230 | 1230 | ||
1231 | ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); | 1231 | ret = ocfs2_convert_inline_data_to_extents(inode, di_bh); |
1232 | if (ret) { | 1232 | if (ret) { |
1233 | mlog_errno(ret); | 1233 | mlog_errno(ret); |
1234 | goto out; | 1234 | goto out; |
1235 | } | 1235 | } |
1236 | } | 1236 | } |
1237 | 1237 | ||
1238 | /* | 1238 | /* |
1239 | * We consider both start and len to be inclusive. | 1239 | * We consider both start and len to be inclusive. |
1240 | */ | 1240 | */ |
1241 | cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; | 1241 | cpos = start >> OCFS2_SB(inode->i_sb)->s_clustersize_bits; |
1242 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); | 1242 | clusters = ocfs2_clusters_for_bytes(inode->i_sb, start + len); |
1243 | clusters -= cpos; | 1243 | clusters -= cpos; |
1244 | 1244 | ||
1245 | while (clusters) { | 1245 | while (clusters) { |
1246 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, | 1246 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, |
1247 | &alloc_size, NULL); | 1247 | &alloc_size, NULL); |
1248 | if (ret) { | 1248 | if (ret) { |
1249 | mlog_errno(ret); | 1249 | mlog_errno(ret); |
1250 | goto out; | 1250 | goto out; |
1251 | } | 1251 | } |
1252 | 1252 | ||
1253 | /* | 1253 | /* |
1254 | * Hole or existing extent len can be arbitrary, so | 1254 | * Hole or existing extent len can be arbitrary, so |
1255 | * cap it to our own allocation request. | 1255 | * cap it to our own allocation request. |
1256 | */ | 1256 | */ |
1257 | if (alloc_size > clusters) | 1257 | if (alloc_size > clusters) |
1258 | alloc_size = clusters; | 1258 | alloc_size = clusters; |
1259 | 1259 | ||
1260 | if (phys_cpos) { | 1260 | if (phys_cpos) { |
1261 | /* | 1261 | /* |
1262 | * We already have an allocation at this | 1262 | * We already have an allocation at this |
1263 | * region so we can safely skip it. | 1263 | * region so we can safely skip it. |
1264 | */ | 1264 | */ |
1265 | goto next; | 1265 | goto next; |
1266 | } | 1266 | } |
1267 | 1267 | ||
1268 | ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); | 1268 | ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); |
1269 | if (ret) { | 1269 | if (ret) { |
1270 | if (ret != -ENOSPC) | 1270 | if (ret != -ENOSPC) |
1271 | mlog_errno(ret); | 1271 | mlog_errno(ret); |
1272 | goto out; | 1272 | goto out; |
1273 | } | 1273 | } |
1274 | 1274 | ||
1275 | next: | 1275 | next: |
1276 | cpos += alloc_size; | 1276 | cpos += alloc_size; |
1277 | clusters -= alloc_size; | 1277 | clusters -= alloc_size; |
1278 | } | 1278 | } |
1279 | 1279 | ||
1280 | ret = 0; | 1280 | ret = 0; |
1281 | out: | 1281 | out: |
1282 | 1282 | ||
1283 | brelse(di_bh); | 1283 | brelse(di_bh); |
1284 | return ret; | 1284 | return ret; |
1285 | } | 1285 | } |
1286 | 1286 | ||
1287 | /* | 1287 | /* |
1288 | * Truncate a byte range, avoiding pages within partial clusters. This | 1288 | * Truncate a byte range, avoiding pages within partial clusters. This |
1289 | * preserves those pages for the zeroing code to write to. | 1289 | * preserves those pages for the zeroing code to write to. |
1290 | */ | 1290 | */ |
1291 | static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, | 1291 | static void ocfs2_truncate_cluster_pages(struct inode *inode, u64 byte_start, |
1292 | u64 byte_len) | 1292 | u64 byte_len) |
1293 | { | 1293 | { |
1294 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1294 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1295 | loff_t start, end; | 1295 | loff_t start, end; |
1296 | struct address_space *mapping = inode->i_mapping; | 1296 | struct address_space *mapping = inode->i_mapping; |
1297 | 1297 | ||
1298 | start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); | 1298 | start = (loff_t)ocfs2_align_bytes_to_clusters(inode->i_sb, byte_start); |
1299 | end = byte_start + byte_len; | 1299 | end = byte_start + byte_len; |
1300 | end = end & ~(osb->s_clustersize - 1); | 1300 | end = end & ~(osb->s_clustersize - 1); |
1301 | 1301 | ||
1302 | if (start < end) { | 1302 | if (start < end) { |
1303 | unmap_mapping_range(mapping, start, end - start, 0); | 1303 | unmap_mapping_range(mapping, start, end - start, 0); |
1304 | truncate_inode_pages_range(mapping, start, end - 1); | 1304 | truncate_inode_pages_range(mapping, start, end - 1); |
1305 | } | 1305 | } |
1306 | } | 1306 | } |
1307 | 1307 | ||
1308 | static int ocfs2_zero_partial_clusters(struct inode *inode, | 1308 | static int ocfs2_zero_partial_clusters(struct inode *inode, |
1309 | u64 start, u64 len) | 1309 | u64 start, u64 len) |
1310 | { | 1310 | { |
1311 | int ret = 0; | 1311 | int ret = 0; |
1312 | u64 tmpend, end = start + len; | 1312 | u64 tmpend, end = start + len; |
1313 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1313 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1314 | unsigned int csize = osb->s_clustersize; | 1314 | unsigned int csize = osb->s_clustersize; |
1315 | handle_t *handle; | 1315 | handle_t *handle; |
1316 | 1316 | ||
1317 | /* | 1317 | /* |
1318 | * The "start" and "end" values are NOT necessarily part of | 1318 | * The "start" and "end" values are NOT necessarily part of |
1319 | * the range whose allocation is being deleted. Rather, this | 1319 | * the range whose allocation is being deleted. Rather, this |
1320 | * is what the user passed in with the request. We must zero | 1320 | * is what the user passed in with the request. We must zero |
1321 | * partial clusters here. There's no need to worry about | 1321 | * partial clusters here. There's no need to worry about |
1322 | * physical allocation - the zeroing code knows to skip holes. | 1322 | * physical allocation - the zeroing code knows to skip holes. |
1323 | */ | 1323 | */ |
1324 | mlog(0, "byte start: %llu, end: %llu\n", | 1324 | mlog(0, "byte start: %llu, end: %llu\n", |
1325 | (unsigned long long)start, (unsigned long long)end); | 1325 | (unsigned long long)start, (unsigned long long)end); |
1326 | 1326 | ||
1327 | /* | 1327 | /* |
1328 | * If both edges are on a cluster boundary then there's no | 1328 | * If both edges are on a cluster boundary then there's no |
1329 | * zeroing required as the region is part of the allocation to | 1329 | * zeroing required as the region is part of the allocation to |
1330 | * be truncated. | 1330 | * be truncated. |
1331 | */ | 1331 | */ |
1332 | if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) | 1332 | if ((start & (csize - 1)) == 0 && (end & (csize - 1)) == 0) |
1333 | goto out; | 1333 | goto out; |
1334 | 1334 | ||
1335 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1335 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1336 | if (IS_ERR(handle)) { | 1336 | if (IS_ERR(handle)) { |
1337 | ret = PTR_ERR(handle); | 1337 | ret = PTR_ERR(handle); |
1338 | mlog_errno(ret); | 1338 | mlog_errno(ret); |
1339 | goto out; | 1339 | goto out; |
1340 | } | 1340 | } |
1341 | 1341 | ||
1342 | /* | 1342 | /* |
1343 | * We want to get the byte offset of the end of the 1st cluster. | 1343 | * We want to get the byte offset of the end of the 1st cluster. |
1344 | */ | 1344 | */ |
1345 | tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); | 1345 | tmpend = (u64)osb->s_clustersize + (start & ~(osb->s_clustersize - 1)); |
1346 | if (tmpend > end) | 1346 | if (tmpend > end) |
1347 | tmpend = end; | 1347 | tmpend = end; |
1348 | 1348 | ||
1349 | mlog(0, "1st range: start: %llu, tmpend: %llu\n", | 1349 | mlog(0, "1st range: start: %llu, tmpend: %llu\n", |
1350 | (unsigned long long)start, (unsigned long long)tmpend); | 1350 | (unsigned long long)start, (unsigned long long)tmpend); |
1351 | 1351 | ||
1352 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); | 1352 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, tmpend); |
1353 | if (ret) | 1353 | if (ret) |
1354 | mlog_errno(ret); | 1354 | mlog_errno(ret); |
1355 | 1355 | ||
1356 | if (tmpend < end) { | 1356 | if (tmpend < end) { |
1357 | /* | 1357 | /* |
1358 | * This may make start and end equal, but the zeroing | 1358 | * This may make start and end equal, but the zeroing |
1359 | * code will skip any work in that case so there's no | 1359 | * code will skip any work in that case so there's no |
1360 | * need to catch it up here. | 1360 | * need to catch it up here. |
1361 | */ | 1361 | */ |
1362 | start = end & ~(osb->s_clustersize - 1); | 1362 | start = end & ~(osb->s_clustersize - 1); |
1363 | 1363 | ||
1364 | mlog(0, "2nd range: start: %llu, end: %llu\n", | 1364 | mlog(0, "2nd range: start: %llu, end: %llu\n", |
1365 | (unsigned long long)start, (unsigned long long)end); | 1365 | (unsigned long long)start, (unsigned long long)end); |
1366 | 1366 | ||
1367 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); | 1367 | ret = ocfs2_zero_range_for_truncate(inode, handle, start, end); |
1368 | if (ret) | 1368 | if (ret) |
1369 | mlog_errno(ret); | 1369 | mlog_errno(ret); |
1370 | } | 1370 | } |
1371 | 1371 | ||
1372 | ocfs2_commit_trans(osb, handle); | 1372 | ocfs2_commit_trans(osb, handle); |
1373 | out: | 1373 | out: |
1374 | return ret; | 1374 | return ret; |
1375 | } | 1375 | } |
1376 | 1376 | ||
1377 | static int ocfs2_remove_inode_range(struct inode *inode, | 1377 | static int ocfs2_remove_inode_range(struct inode *inode, |
1378 | struct buffer_head *di_bh, u64 byte_start, | 1378 | struct buffer_head *di_bh, u64 byte_start, |
1379 | u64 byte_len) | 1379 | u64 byte_len) |
1380 | { | 1380 | { |
1381 | int ret = 0; | 1381 | int ret = 0; |
1382 | u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; | 1382 | u32 trunc_start, trunc_len, cpos, phys_cpos, alloc_size; |
1383 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1383 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1384 | struct ocfs2_cached_dealloc_ctxt dealloc; | 1384 | struct ocfs2_cached_dealloc_ctxt dealloc; |
1385 | struct address_space *mapping = inode->i_mapping; | 1385 | struct address_space *mapping = inode->i_mapping; |
1386 | struct ocfs2_extent_tree et; | 1386 | struct ocfs2_extent_tree et; |
1387 | 1387 | ||
1388 | ocfs2_init_dinode_extent_tree(&et, inode, di_bh); | 1388 | ocfs2_init_dinode_extent_tree(&et, inode, di_bh); |
1389 | ocfs2_init_dealloc_ctxt(&dealloc); | 1389 | ocfs2_init_dealloc_ctxt(&dealloc); |
1390 | 1390 | ||
1391 | if (byte_len == 0) | 1391 | if (byte_len == 0) |
1392 | return 0; | 1392 | return 0; |
1393 | 1393 | ||
1394 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 1394 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
1395 | ret = ocfs2_truncate_inline(inode, di_bh, byte_start, | 1395 | ret = ocfs2_truncate_inline(inode, di_bh, byte_start, |
1396 | byte_start + byte_len, 0); | 1396 | byte_start + byte_len, 0); |
1397 | if (ret) { | 1397 | if (ret) { |
1398 | mlog_errno(ret); | 1398 | mlog_errno(ret); |
1399 | goto out; | 1399 | goto out; |
1400 | } | 1400 | } |
1401 | /* | 1401 | /* |
1402 | * There's no need to get fancy with the page cache | 1402 | * There's no need to get fancy with the page cache |
1403 | * truncate of an inline-data inode. We're talking | 1403 | * truncate of an inline-data inode. We're talking |
1404 | * about less than a page here, which will be cached | 1404 | * about less than a page here, which will be cached |
1405 | * in the dinode buffer anyway. | 1405 | * in the dinode buffer anyway. |
1406 | */ | 1406 | */ |
1407 | unmap_mapping_range(mapping, 0, 0, 0); | 1407 | unmap_mapping_range(mapping, 0, 0, 0); |
1408 | truncate_inode_pages(mapping, 0); | 1408 | truncate_inode_pages(mapping, 0); |
1409 | goto out; | 1409 | goto out; |
1410 | } | 1410 | } |
1411 | 1411 | ||
1412 | trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); | 1412 | trunc_start = ocfs2_clusters_for_bytes(osb->sb, byte_start); |
1413 | trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; | 1413 | trunc_len = (byte_start + byte_len) >> osb->s_clustersize_bits; |
1414 | if (trunc_len >= trunc_start) | 1414 | if (trunc_len >= trunc_start) |
1415 | trunc_len -= trunc_start; | 1415 | trunc_len -= trunc_start; |
1416 | else | 1416 | else |
1417 | trunc_len = 0; | 1417 | trunc_len = 0; |
1418 | 1418 | ||
1419 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", | 1419 | mlog(0, "Inode: %llu, start: %llu, len: %llu, cstart: %u, clen: %u\n", |
1420 | (unsigned long long)OCFS2_I(inode)->ip_blkno, | 1420 | (unsigned long long)OCFS2_I(inode)->ip_blkno, |
1421 | (unsigned long long)byte_start, | 1421 | (unsigned long long)byte_start, |
1422 | (unsigned long long)byte_len, trunc_start, trunc_len); | 1422 | (unsigned long long)byte_len, trunc_start, trunc_len); |
1423 | 1423 | ||
1424 | ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); | 1424 | ret = ocfs2_zero_partial_clusters(inode, byte_start, byte_len); |
1425 | if (ret) { | 1425 | if (ret) { |
1426 | mlog_errno(ret); | 1426 | mlog_errno(ret); |
1427 | goto out; | 1427 | goto out; |
1428 | } | 1428 | } |
1429 | 1429 | ||
1430 | cpos = trunc_start; | 1430 | cpos = trunc_start; |
1431 | while (trunc_len) { | 1431 | while (trunc_len) { |
1432 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, | 1432 | ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, |
1433 | &alloc_size, NULL); | 1433 | &alloc_size, NULL); |
1434 | if (ret) { | 1434 | if (ret) { |
1435 | mlog_errno(ret); | 1435 | mlog_errno(ret); |
1436 | goto out; | 1436 | goto out; |
1437 | } | 1437 | } |
1438 | 1438 | ||
1439 | if (alloc_size > trunc_len) | 1439 | if (alloc_size > trunc_len) |
1440 | alloc_size = trunc_len; | 1440 | alloc_size = trunc_len; |
1441 | 1441 | ||
1442 | /* Only do work for non-holes */ | 1442 | /* Only do work for non-holes */ |
1443 | if (phys_cpos != 0) { | 1443 | if (phys_cpos != 0) { |
1444 | ret = ocfs2_remove_btree_range(inode, &et, cpos, | 1444 | ret = ocfs2_remove_btree_range(inode, &et, cpos, |
1445 | phys_cpos, alloc_size, | 1445 | phys_cpos, alloc_size, |
1446 | &dealloc); | 1446 | &dealloc); |
1447 | if (ret) { | 1447 | if (ret) { |
1448 | mlog_errno(ret); | 1448 | mlog_errno(ret); |
1449 | goto out; | 1449 | goto out; |
1450 | } | 1450 | } |
1451 | } | 1451 | } |
1452 | 1452 | ||
1453 | cpos += alloc_size; | 1453 | cpos += alloc_size; |
1454 | trunc_len -= alloc_size; | 1454 | trunc_len -= alloc_size; |
1455 | } | 1455 | } |
1456 | 1456 | ||
1457 | ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); | 1457 | ocfs2_truncate_cluster_pages(inode, byte_start, byte_len); |
1458 | 1458 | ||
1459 | out: | 1459 | out: |
1460 | ocfs2_schedule_truncate_log_flush(osb, 1); | 1460 | ocfs2_schedule_truncate_log_flush(osb, 1); |
1461 | ocfs2_run_deallocs(osb, &dealloc); | 1461 | ocfs2_run_deallocs(osb, &dealloc); |
1462 | 1462 | ||
1463 | return ret; | 1463 | return ret; |
1464 | } | 1464 | } |
1465 | 1465 | ||
1466 | /* | 1466 | /* |
1467 | * Parts of this function taken from xfs_change_file_space() | 1467 | * Parts of this function taken from xfs_change_file_space() |
1468 | */ | 1468 | */ |
1469 | static int __ocfs2_change_file_space(struct file *file, struct inode *inode, | 1469 | static int __ocfs2_change_file_space(struct file *file, struct inode *inode, |
1470 | loff_t f_pos, unsigned int cmd, | 1470 | loff_t f_pos, unsigned int cmd, |
1471 | struct ocfs2_space_resv *sr, | 1471 | struct ocfs2_space_resv *sr, |
1472 | int change_size) | 1472 | int change_size) |
1473 | { | 1473 | { |
1474 | int ret; | 1474 | int ret; |
1475 | s64 llen; | 1475 | s64 llen; |
1476 | loff_t size; | 1476 | loff_t size; |
1477 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1477 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1478 | struct buffer_head *di_bh = NULL; | 1478 | struct buffer_head *di_bh = NULL; |
1479 | handle_t *handle; | 1479 | handle_t *handle; |
1480 | unsigned long long max_off = inode->i_sb->s_maxbytes; | 1480 | unsigned long long max_off = inode->i_sb->s_maxbytes; |
1481 | 1481 | ||
1482 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) | 1482 | if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) |
1483 | return -EROFS; | 1483 | return -EROFS; |
1484 | 1484 | ||
1485 | mutex_lock(&inode->i_mutex); | 1485 | mutex_lock(&inode->i_mutex); |
1486 | 1486 | ||
1487 | /* | 1487 | /* |
1488 | * This prevents concurrent writes on other nodes | 1488 | * This prevents concurrent writes on other nodes |
1489 | */ | 1489 | */ |
1490 | ret = ocfs2_rw_lock(inode, 1); | 1490 | ret = ocfs2_rw_lock(inode, 1); |
1491 | if (ret) { | 1491 | if (ret) { |
1492 | mlog_errno(ret); | 1492 | mlog_errno(ret); |
1493 | goto out; | 1493 | goto out; |
1494 | } | 1494 | } |
1495 | 1495 | ||
1496 | ret = ocfs2_inode_lock(inode, &di_bh, 1); | 1496 | ret = ocfs2_inode_lock(inode, &di_bh, 1); |
1497 | if (ret) { | 1497 | if (ret) { |
1498 | mlog_errno(ret); | 1498 | mlog_errno(ret); |
1499 | goto out_rw_unlock; | 1499 | goto out_rw_unlock; |
1500 | } | 1500 | } |
1501 | 1501 | ||
1502 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { | 1502 | if (inode->i_flags & (S_IMMUTABLE|S_APPEND)) { |
1503 | ret = -EPERM; | 1503 | ret = -EPERM; |
1504 | goto out_inode_unlock; | 1504 | goto out_inode_unlock; |
1505 | } | 1505 | } |
1506 | 1506 | ||
1507 | switch (sr->l_whence) { | 1507 | switch (sr->l_whence) { |
1508 | case 0: /*SEEK_SET*/ | 1508 | case 0: /*SEEK_SET*/ |
1509 | break; | 1509 | break; |
1510 | case 1: /*SEEK_CUR*/ | 1510 | case 1: /*SEEK_CUR*/ |
1511 | sr->l_start += f_pos; | 1511 | sr->l_start += f_pos; |
1512 | break; | 1512 | break; |
1513 | case 2: /*SEEK_END*/ | 1513 | case 2: /*SEEK_END*/ |
1514 | sr->l_start += i_size_read(inode); | 1514 | sr->l_start += i_size_read(inode); |
1515 | break; | 1515 | break; |
1516 | default: | 1516 | default: |
1517 | ret = -EINVAL; | 1517 | ret = -EINVAL; |
1518 | goto out_inode_unlock; | 1518 | goto out_inode_unlock; |
1519 | } | 1519 | } |
1520 | sr->l_whence = 0; | 1520 | sr->l_whence = 0; |
1521 | 1521 | ||
1522 | llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; | 1522 | llen = sr->l_len > 0 ? sr->l_len - 1 : sr->l_len; |
1523 | 1523 | ||
1524 | if (sr->l_start < 0 | 1524 | if (sr->l_start < 0 |
1525 | || sr->l_start > max_off | 1525 | || sr->l_start > max_off |
1526 | || (sr->l_start + llen) < 0 | 1526 | || (sr->l_start + llen) < 0 |
1527 | || (sr->l_start + llen) > max_off) { | 1527 | || (sr->l_start + llen) > max_off) { |
1528 | ret = -EINVAL; | 1528 | ret = -EINVAL; |
1529 | goto out_inode_unlock; | 1529 | goto out_inode_unlock; |
1530 | } | 1530 | } |
1531 | size = sr->l_start + sr->l_len; | 1531 | size = sr->l_start + sr->l_len; |
1532 | 1532 | ||
1533 | if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { | 1533 | if (cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) { |
1534 | if (sr->l_len <= 0) { | 1534 | if (sr->l_len <= 0) { |
1535 | ret = -EINVAL; | 1535 | ret = -EINVAL; |
1536 | goto out_inode_unlock; | 1536 | goto out_inode_unlock; |
1537 | } | 1537 | } |
1538 | } | 1538 | } |
1539 | 1539 | ||
1540 | if (file && should_remove_suid(file->f_path.dentry)) { | 1540 | if (file && should_remove_suid(file->f_path.dentry)) { |
1541 | ret = __ocfs2_write_remove_suid(inode, di_bh); | 1541 | ret = __ocfs2_write_remove_suid(inode, di_bh); |
1542 | if (ret) { | 1542 | if (ret) { |
1543 | mlog_errno(ret); | 1543 | mlog_errno(ret); |
1544 | goto out_inode_unlock; | 1544 | goto out_inode_unlock; |
1545 | } | 1545 | } |
1546 | } | 1546 | } |
1547 | 1547 | ||
1548 | down_write(&OCFS2_I(inode)->ip_alloc_sem); | 1548 | down_write(&OCFS2_I(inode)->ip_alloc_sem); |
1549 | switch (cmd) { | 1549 | switch (cmd) { |
1550 | case OCFS2_IOC_RESVSP: | 1550 | case OCFS2_IOC_RESVSP: |
1551 | case OCFS2_IOC_RESVSP64: | 1551 | case OCFS2_IOC_RESVSP64: |
1552 | /* | 1552 | /* |
1553 | * This takes unsigned offsets, but the signed ones we | 1553 | * This takes unsigned offsets, but the signed ones we |
1554 | * pass have been checked against overflow above. | 1554 | * pass have been checked against overflow above. |
1555 | */ | 1555 | */ |
1556 | ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, | 1556 | ret = ocfs2_allocate_unwritten_extents(inode, sr->l_start, |
1557 | sr->l_len); | 1557 | sr->l_len); |
1558 | break; | 1558 | break; |
1559 | case OCFS2_IOC_UNRESVSP: | 1559 | case OCFS2_IOC_UNRESVSP: |
1560 | case OCFS2_IOC_UNRESVSP64: | 1560 | case OCFS2_IOC_UNRESVSP64: |
1561 | ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, | 1561 | ret = ocfs2_remove_inode_range(inode, di_bh, sr->l_start, |
1562 | sr->l_len); | 1562 | sr->l_len); |
1563 | break; | 1563 | break; |
1564 | default: | 1564 | default: |
1565 | ret = -EINVAL; | 1565 | ret = -EINVAL; |
1566 | } | 1566 | } |
1567 | up_write(&OCFS2_I(inode)->ip_alloc_sem); | 1567 | up_write(&OCFS2_I(inode)->ip_alloc_sem); |
1568 | if (ret) { | 1568 | if (ret) { |
1569 | mlog_errno(ret); | 1569 | mlog_errno(ret); |
1570 | goto out_inode_unlock; | 1570 | goto out_inode_unlock; |
1571 | } | 1571 | } |
1572 | 1572 | ||
1573 | /* | 1573 | /* |
1574 | * We update c/mtime for these changes | 1574 | * We update c/mtime for these changes |
1575 | */ | 1575 | */ |
1576 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); | 1576 | handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); |
1577 | if (IS_ERR(handle)) { | 1577 | if (IS_ERR(handle)) { |
1578 | ret = PTR_ERR(handle); | 1578 | ret = PTR_ERR(handle); |
1579 | mlog_errno(ret); | 1579 | mlog_errno(ret); |
1580 | goto out_inode_unlock; | 1580 | goto out_inode_unlock; |
1581 | } | 1581 | } |
1582 | 1582 | ||
1583 | if (change_size && i_size_read(inode) < size) | 1583 | if (change_size && i_size_read(inode) < size) |
1584 | i_size_write(inode, size); | 1584 | i_size_write(inode, size); |
1585 | 1585 | ||
1586 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; | 1586 | inode->i_ctime = inode->i_mtime = CURRENT_TIME; |
1587 | ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); | 1587 | ret = ocfs2_mark_inode_dirty(handle, inode, di_bh); |
1588 | if (ret < 0) | 1588 | if (ret < 0) |
1589 | mlog_errno(ret); | 1589 | mlog_errno(ret); |
1590 | 1590 | ||
1591 | ocfs2_commit_trans(osb, handle); | 1591 | ocfs2_commit_trans(osb, handle); |
1592 | 1592 | ||
1593 | out_inode_unlock: | 1593 | out_inode_unlock: |
1594 | brelse(di_bh); | 1594 | brelse(di_bh); |
1595 | ocfs2_inode_unlock(inode, 1); | 1595 | ocfs2_inode_unlock(inode, 1); |
1596 | out_rw_unlock: | 1596 | out_rw_unlock: |
1597 | ocfs2_rw_unlock(inode, 1); | 1597 | ocfs2_rw_unlock(inode, 1); |
1598 | 1598 | ||
1599 | out: | 1599 | out: |
1600 | mutex_unlock(&inode->i_mutex); | 1600 | mutex_unlock(&inode->i_mutex); |
1601 | return ret; | 1601 | return ret; |
1602 | } | 1602 | } |
1603 | 1603 | ||
1604 | int ocfs2_change_file_space(struct file *file, unsigned int cmd, | 1604 | int ocfs2_change_file_space(struct file *file, unsigned int cmd, |
1605 | struct ocfs2_space_resv *sr) | 1605 | struct ocfs2_space_resv *sr) |
1606 | { | 1606 | { |
1607 | struct inode *inode = file->f_path.dentry->d_inode; | 1607 | struct inode *inode = file->f_path.dentry->d_inode; |
1608 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1608 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1609 | 1609 | ||
1610 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && | 1610 | if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && |
1611 | !ocfs2_writes_unwritten_extents(osb)) | 1611 | !ocfs2_writes_unwritten_extents(osb)) |
1612 | return -ENOTTY; | 1612 | return -ENOTTY; |
1613 | else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && | 1613 | else if ((cmd == OCFS2_IOC_UNRESVSP || cmd == OCFS2_IOC_UNRESVSP64) && |
1614 | !ocfs2_sparse_alloc(osb)) | 1614 | !ocfs2_sparse_alloc(osb)) |
1615 | return -ENOTTY; | 1615 | return -ENOTTY; |
1616 | 1616 | ||
1617 | if (!S_ISREG(inode->i_mode)) | 1617 | if (!S_ISREG(inode->i_mode)) |
1618 | return -EINVAL; | 1618 | return -EINVAL; |
1619 | 1619 | ||
1620 | if (!(file->f_mode & FMODE_WRITE)) | 1620 | if (!(file->f_mode & FMODE_WRITE)) |
1621 | return -EBADF; | 1621 | return -EBADF; |
1622 | 1622 | ||
1623 | return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); | 1623 | return __ocfs2_change_file_space(file, inode, file->f_pos, cmd, sr, 0); |
1624 | } | 1624 | } |
1625 | 1625 | ||
1626 | static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, | 1626 | static long ocfs2_fallocate(struct inode *inode, int mode, loff_t offset, |
1627 | loff_t len) | 1627 | loff_t len) |
1628 | { | 1628 | { |
1629 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1629 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1630 | struct ocfs2_space_resv sr; | 1630 | struct ocfs2_space_resv sr; |
1631 | int change_size = 1; | 1631 | int change_size = 1; |
1632 | 1632 | ||
1633 | if (!ocfs2_writes_unwritten_extents(osb)) | 1633 | if (!ocfs2_writes_unwritten_extents(osb)) |
1634 | return -EOPNOTSUPP; | 1634 | return -EOPNOTSUPP; |
1635 | 1635 | ||
1636 | if (S_ISDIR(inode->i_mode)) | 1636 | if (S_ISDIR(inode->i_mode)) |
1637 | return -ENODEV; | 1637 | return -ENODEV; |
1638 | 1638 | ||
1639 | if (mode & FALLOC_FL_KEEP_SIZE) | 1639 | if (mode & FALLOC_FL_KEEP_SIZE) |
1640 | change_size = 0; | 1640 | change_size = 0; |
1641 | 1641 | ||
1642 | sr.l_whence = 0; | 1642 | sr.l_whence = 0; |
1643 | sr.l_start = (s64)offset; | 1643 | sr.l_start = (s64)offset; |
1644 | sr.l_len = (s64)len; | 1644 | sr.l_len = (s64)len; |
1645 | 1645 | ||
1646 | return __ocfs2_change_file_space(NULL, inode, offset, | 1646 | return __ocfs2_change_file_space(NULL, inode, offset, |
1647 | OCFS2_IOC_RESVSP64, &sr, change_size); | 1647 | OCFS2_IOC_RESVSP64, &sr, change_size); |
1648 | } | 1648 | } |
1649 | 1649 | ||
1650 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, | 1650 | static int ocfs2_prepare_inode_for_write(struct dentry *dentry, |
1651 | loff_t *ppos, | 1651 | loff_t *ppos, |
1652 | size_t count, | 1652 | size_t count, |
1653 | int appending, | 1653 | int appending, |
1654 | int *direct_io) | 1654 | int *direct_io) |
1655 | { | 1655 | { |
1656 | int ret = 0, meta_level = 0; | 1656 | int ret = 0, meta_level = 0; |
1657 | struct inode *inode = dentry->d_inode; | 1657 | struct inode *inode = dentry->d_inode; |
1658 | loff_t saved_pos, end; | 1658 | loff_t saved_pos, end; |
1659 | 1659 | ||
1660 | /* | 1660 | /* |
1661 | * We start with a read level meta lock and only jump to an ex | 1661 | * We start with a read level meta lock and only jump to an ex |
1662 | * if we need to make modifications here. | 1662 | * if we need to make modifications here. |
1663 | */ | 1663 | */ |
1664 | for(;;) { | 1664 | for(;;) { |
1665 | ret = ocfs2_inode_lock(inode, NULL, meta_level); | 1665 | ret = ocfs2_inode_lock(inode, NULL, meta_level); |
1666 | if (ret < 0) { | 1666 | if (ret < 0) { |
1667 | meta_level = -1; | 1667 | meta_level = -1; |
1668 | mlog_errno(ret); | 1668 | mlog_errno(ret); |
1669 | goto out; | 1669 | goto out; |
1670 | } | 1670 | } |
1671 | 1671 | ||
1672 | /* Clear suid / sgid if necessary. We do this here | 1672 | /* Clear suid / sgid if necessary. We do this here |
1673 | * instead of later in the write path because | 1673 | * instead of later in the write path because |
1674 | * remove_suid() calls ->setattr without any hint that | 1674 | * remove_suid() calls ->setattr without any hint that |
1675 | * we may have already done our cluster locking. Since | 1675 | * we may have already done our cluster locking. Since |
1676 | * ocfs2_setattr() *must* take cluster locks to | 1676 | * ocfs2_setattr() *must* take cluster locks to |
1677 | * proceeed, this will lead us to recursively lock the | 1677 | * proceeed, this will lead us to recursively lock the |
1678 | * inode. There's also the dinode i_size state which | 1678 | * inode. There's also the dinode i_size state which |
1679 | * can be lost via setattr during extending writes (we | 1679 | * can be lost via setattr during extending writes (we |
1680 | * set inode->i_size at the end of a write. */ | 1680 | * set inode->i_size at the end of a write. */ |
1681 | if (should_remove_suid(dentry)) { | 1681 | if (should_remove_suid(dentry)) { |
1682 | if (meta_level == 0) { | 1682 | if (meta_level == 0) { |
1683 | ocfs2_inode_unlock(inode, meta_level); | 1683 | ocfs2_inode_unlock(inode, meta_level); |
1684 | meta_level = 1; | 1684 | meta_level = 1; |
1685 | continue; | 1685 | continue; |
1686 | } | 1686 | } |
1687 | 1687 | ||
1688 | ret = ocfs2_write_remove_suid(inode); | 1688 | ret = ocfs2_write_remove_suid(inode); |
1689 | if (ret < 0) { | 1689 | if (ret < 0) { |
1690 | mlog_errno(ret); | 1690 | mlog_errno(ret); |
1691 | goto out_unlock; | 1691 | goto out_unlock; |
1692 | } | 1692 | } |
1693 | } | 1693 | } |
1694 | 1694 | ||
1695 | /* work on a copy of ppos until we're sure that we won't have | 1695 | /* work on a copy of ppos until we're sure that we won't have |
1696 | * to recalculate it due to relocking. */ | 1696 | * to recalculate it due to relocking. */ |
1697 | if (appending) { | 1697 | if (appending) { |
1698 | saved_pos = i_size_read(inode); | 1698 | saved_pos = i_size_read(inode); |
1699 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); | 1699 | mlog(0, "O_APPEND: inode->i_size=%llu\n", saved_pos); |
1700 | } else { | 1700 | } else { |
1701 | saved_pos = *ppos; | 1701 | saved_pos = *ppos; |
1702 | } | 1702 | } |
1703 | 1703 | ||
1704 | end = saved_pos + count; | 1704 | end = saved_pos + count; |
1705 | 1705 | ||
1706 | /* | 1706 | /* |
1707 | * Skip the O_DIRECT checks if we don't need | 1707 | * Skip the O_DIRECT checks if we don't need |
1708 | * them. | 1708 | * them. |
1709 | */ | 1709 | */ |
1710 | if (!direct_io || !(*direct_io)) | 1710 | if (!direct_io || !(*direct_io)) |
1711 | break; | 1711 | break; |
1712 | 1712 | ||
1713 | /* | 1713 | /* |
1714 | * There's no sane way to do direct writes to an inode | 1714 | * There's no sane way to do direct writes to an inode |
1715 | * with inline data. | 1715 | * with inline data. |
1716 | */ | 1716 | */ |
1717 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { | 1717 | if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { |
1718 | *direct_io = 0; | 1718 | *direct_io = 0; |
1719 | break; | 1719 | break; |
1720 | } | 1720 | } |
1721 | 1721 | ||
1722 | /* | 1722 | /* |
1723 | * Allowing concurrent direct writes means | 1723 | * Allowing concurrent direct writes means |
1724 | * i_size changes wouldn't be synchronized, so | 1724 | * i_size changes wouldn't be synchronized, so |
1725 | * one node could wind up truncating another | 1725 | * one node could wind up truncating another |
1726 | * nodes writes. | 1726 | * nodes writes. |
1727 | */ | 1727 | */ |
1728 | if (end > i_size_read(inode)) { | 1728 | if (end > i_size_read(inode)) { |
1729 | *direct_io = 0; | 1729 | *direct_io = 0; |
1730 | break; | 1730 | break; |
1731 | } | 1731 | } |
1732 | 1732 | ||
1733 | /* | 1733 | /* |
1734 | * We don't fill holes during direct io, so | 1734 | * We don't fill holes during direct io, so |
1735 | * check for them here. If any are found, the | 1735 | * check for them here. If any are found, the |
1736 | * caller will have to retake some cluster | 1736 | * caller will have to retake some cluster |
1737 | * locks and initiate the io as buffered. | 1737 | * locks and initiate the io as buffered. |
1738 | */ | 1738 | */ |
1739 | ret = ocfs2_check_range_for_holes(inode, saved_pos, count); | 1739 | ret = ocfs2_check_range_for_holes(inode, saved_pos, count); |
1740 | if (ret == 1) { | 1740 | if (ret == 1) { |
1741 | *direct_io = 0; | 1741 | *direct_io = 0; |
1742 | ret = 0; | 1742 | ret = 0; |
1743 | } else if (ret < 0) | 1743 | } else if (ret < 0) |
1744 | mlog_errno(ret); | 1744 | mlog_errno(ret); |
1745 | break; | 1745 | break; |
1746 | } | 1746 | } |
1747 | 1747 | ||
1748 | if (appending) | 1748 | if (appending) |
1749 | *ppos = saved_pos; | 1749 | *ppos = saved_pos; |
1750 | 1750 | ||
1751 | out_unlock: | 1751 | out_unlock: |
1752 | ocfs2_inode_unlock(inode, meta_level); | 1752 | ocfs2_inode_unlock(inode, meta_level); |
1753 | 1753 | ||
1754 | out: | 1754 | out: |
1755 | return ret; | 1755 | return ret; |
1756 | } | 1756 | } |
1757 | 1757 | ||
1758 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, | 1758 | static ssize_t ocfs2_file_aio_write(struct kiocb *iocb, |
1759 | const struct iovec *iov, | 1759 | const struct iovec *iov, |
1760 | unsigned long nr_segs, | 1760 | unsigned long nr_segs, |
1761 | loff_t pos) | 1761 | loff_t pos) |
1762 | { | 1762 | { |
1763 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; | 1763 | int ret, direct_io, appending, rw_level, have_alloc_sem = 0; |
1764 | int can_do_direct; | 1764 | int can_do_direct; |
1765 | ssize_t written = 0; | 1765 | ssize_t written = 0; |
1766 | size_t ocount; /* original count */ | 1766 | size_t ocount; /* original count */ |
1767 | size_t count; /* after file limit checks */ | 1767 | size_t count; /* after file limit checks */ |
1768 | loff_t old_size, *ppos = &iocb->ki_pos; | 1768 | loff_t old_size, *ppos = &iocb->ki_pos; |
1769 | u32 old_clusters; | 1769 | u32 old_clusters; |
1770 | struct file *file = iocb->ki_filp; | 1770 | struct file *file = iocb->ki_filp; |
1771 | struct inode *inode = file->f_path.dentry->d_inode; | 1771 | struct inode *inode = file->f_path.dentry->d_inode; |
1772 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); | 1772 | struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); |
1773 | 1773 | ||
1774 | mlog_entry("(0x%p, %u, '%.*s')\n", file, | 1774 | mlog_entry("(0x%p, %u, '%.*s')\n", file, |
1775 | (unsigned int)nr_segs, | 1775 | (unsigned int)nr_segs, |
1776 | file->f_path.dentry->d_name.len, | 1776 | file->f_path.dentry->d_name.len, |
1777 | file->f_path.dentry->d_name.name); | 1777 | file->f_path.dentry->d_name.name); |
1778 | 1778 | ||
1779 | if (iocb->ki_left == 0) | 1779 | if (iocb->ki_left == 0) |
1780 | return 0; | 1780 | return 0; |
1781 | 1781 | ||
1782 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); | 1782 | vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); |
1783 | 1783 | ||
1784 | appending = file->f_flags & O_APPEND ? 1 : 0; | 1784 | appending = file->f_flags & O_APPEND ? 1 : 0; |
1785 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; | 1785 | direct_io = file->f_flags & O_DIRECT ? 1 : 0; |
1786 | 1786 | ||
1787 | mutex_lock(&inode->i_mutex); | 1787 | mutex_lock(&inode->i_mutex); |
1788 | 1788 | ||
1789 | relock: | 1789 | relock: |
1790 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ | 1790 | /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ |
1791 | if (direct_io) { | 1791 | if (direct_io) { |
1792 | down_read(&inode->i_alloc_sem); | 1792 | down_read(&inode->i_alloc_sem); |
1793 | have_alloc_sem = 1; | 1793 | have_alloc_sem = 1; |
1794 | } | 1794 | } |
1795 | 1795 | ||
1796 | /* concurrent O_DIRECT writes are allowed */ | 1796 | /* concurrent O_DIRECT writes are allowed */ |
1797 | rw_level = !direct_io; | 1797 | rw_level = !direct_io; |
1798 | ret = ocfs2_rw_lock(inode, rw_level); | 1798 | ret = ocfs2_rw_lock(inode, rw_level); |
1799 | if (ret < 0) { | 1799 | if (ret < 0) { |
1800 | mlog_errno(ret); | 1800 | mlog_errno(ret); |
1801 | goto out_sems; | 1801 | goto out_sems; |
1802 | } | 1802 | } |
1803 | 1803 | ||
1804 | can_do_direct = direct_io; | 1804 | can_do_direct = direct_io; |
1805 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, | 1805 | ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos, |
1806 | iocb->ki_left, appending, | 1806 | iocb->ki_left, appending, |
1807 | &can_do_direct); | 1807 | &can_do_direct); |
1808 | if (ret < 0) { | 1808 | if (ret < 0) { |
1809 | mlog_errno(ret); | 1809 | mlog_errno(ret); |
1810 | goto out; | 1810 | goto out; |
1811 | } | 1811 | } |
1812 | 1812 | ||
1813 | /* | 1813 | /* |
1814 | * We can't complete the direct I/O as requested, fall back to | 1814 | * We can't complete the direct I/O as requested, fall back to |
1815 | * buffered I/O. | 1815 | * buffered I/O. |
1816 | */ | 1816 | */ |
1817 | if (direct_io && !can_do_direct) { | 1817 | if (direct_io && !can_do_direct) { |
1818 | ocfs2_rw_unlock(inode, rw_level); | 1818 | ocfs2_rw_unlock(inode, rw_level); |
1819 | up_read(&inode->i_alloc_sem); | 1819 | up_read(&inode->i_alloc_sem); |
1820 | 1820 | ||
1821 | have_alloc_sem = 0; | 1821 | have_alloc_sem = 0; |
1822 | rw_level = -1; | 1822 | rw_level = -1; |
1823 | 1823 | ||
1824 | direct_io = 0; | 1824 | direct_io = 0; |
1825 | goto relock; | 1825 | goto relock; |
1826 | } | 1826 | } |
1827 | 1827 | ||
1828 | /* | 1828 | /* |
1829 | * To later detect whether a journal commit for sync writes is | 1829 | * To later detect whether a journal commit for sync writes is |
1830 | * necessary, we sample i_size, and cluster count here. | 1830 | * necessary, we sample i_size, and cluster count here. |
1831 | */ | 1831 | */ |
1832 | old_size = i_size_read(inode); | 1832 | old_size = i_size_read(inode); |
1833 | old_clusters = OCFS2_I(inode)->ip_clusters; | 1833 | old_clusters = OCFS2_I(inode)->ip_clusters; |
1834 | 1834 | ||
1835 | /* communicate with ocfs2_dio_end_io */ | 1835 | /* communicate with ocfs2_dio_end_io */ |
1836 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 1836 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
1837 | 1837 | ||
1838 | if (direct_io) { | 1838 | if (direct_io) { |
1839 | ret = generic_segment_checks(iov, &nr_segs, &ocount, | 1839 | ret = generic_segment_checks(iov, &nr_segs, &ocount, |
1840 | VERIFY_READ); | 1840 | VERIFY_READ); |
1841 | if (ret) | 1841 | if (ret) |
1842 | goto out_dio; | 1842 | goto out_dio; |
1843 | 1843 | ||
1844 | ret = generic_write_checks(file, ppos, &count, | 1844 | ret = generic_write_checks(file, ppos, &count, |
1845 | S_ISBLK(inode->i_mode)); | 1845 | S_ISBLK(inode->i_mode)); |
1846 | if (ret) | 1846 | if (ret) |
1847 | goto out_dio; | 1847 | goto out_dio; |
1848 | 1848 | ||
1849 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, | 1849 | written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos, |
1850 | ppos, count, ocount); | 1850 | ppos, count, ocount); |
1851 | if (written < 0) { | 1851 | if (written < 0) { |
1852 | /* | 1852 | /* |
1853 | * direct write may have instantiated a few | 1853 | * direct write may have instantiated a few |
1854 | * blocks outside i_size. Trim these off again. | 1854 | * blocks outside i_size. Trim these off again. |
1855 | * Don't need i_size_read because we hold i_mutex. | 1855 | * Don't need i_size_read because we hold i_mutex. |
1856 | */ | 1856 | */ |
1857 | if (*ppos + count > inode->i_size) | 1857 | if (*ppos + count > inode->i_size) |
1858 | vmtruncate(inode, inode->i_size); | 1858 | vmtruncate(inode, inode->i_size); |
1859 | ret = written; | 1859 | ret = written; |
1860 | goto out_dio; | 1860 | goto out_dio; |
1861 | } | 1861 | } |
1862 | } else { | 1862 | } else { |
1863 | written = generic_file_aio_write_nolock(iocb, iov, nr_segs, | 1863 | written = generic_file_aio_write_nolock(iocb, iov, nr_segs, |
1864 | *ppos); | 1864 | *ppos); |
1865 | } | 1865 | } |
1866 | 1866 | ||
1867 | out_dio: | 1867 | out_dio: |
1868 | /* buffered aio wouldn't have proper lock coverage today */ | 1868 | /* buffered aio wouldn't have proper lock coverage today */ |
1869 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); | 1869 | BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT)); |
1870 | 1870 | ||
1871 | if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { | 1871 | if ((file->f_flags & O_SYNC && !direct_io) || IS_SYNC(inode)) { |
1872 | /* | 1872 | /* |
1873 | * The generic write paths have handled getting data | 1873 | * The generic write paths have handled getting data |
1874 | * to disk, but since we don't make use of the dirty | 1874 | * to disk, but since we don't make use of the dirty |
1875 | * inode list, a manual journal commit is necessary | 1875 | * inode list, a manual journal commit is necessary |
1876 | * here. | 1876 | * here. |
1877 | */ | 1877 | */ |
1878 | if (old_size != i_size_read(inode) || | 1878 | if (old_size != i_size_read(inode) || |
1879 | old_clusters != OCFS2_I(inode)->ip_clusters) { | 1879 | old_clusters != OCFS2_I(inode)->ip_clusters) { |
1880 | ret = jbd2_journal_force_commit(osb->journal->j_journal); | 1880 | ret = jbd2_journal_force_commit(osb->journal->j_journal); |
1881 | if (ret < 0) | 1881 | if (ret < 0) |
1882 | written = ret; | 1882 | written = ret; |
1883 | } | 1883 | } |
1884 | } | 1884 | } |
1885 | 1885 | ||
1886 | /* | 1886 | /* |
1887 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io | 1887 | * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io |
1888 | * function pointer which is called when o_direct io completes so that | 1888 | * function pointer which is called when o_direct io completes so that |
1889 | * it can unlock our rw lock. (it's the clustered equivalent of | 1889 | * it can unlock our rw lock. (it's the clustered equivalent of |
1890 | * i_alloc_sem; protects truncate from racing with pending ios). | 1890 | * i_alloc_sem; protects truncate from racing with pending ios). |
1891 | * Unfortunately there are error cases which call end_io and others | 1891 | * Unfortunately there are error cases which call end_io and others |
1892 | * that don't. so we don't have to unlock the rw_lock if either an | 1892 | * that don't. so we don't have to unlock the rw_lock if either an |
1893 | * async dio is going to do it in the future or an end_io after an | 1893 | * async dio is going to do it in the future or an end_io after an |
1894 | * error has already done it. | 1894 | * error has already done it. |
1895 | */ | 1895 | */ |
1896 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 1896 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
1897 | rw_level = -1; | 1897 | rw_level = -1; |
1898 | have_alloc_sem = 0; | 1898 | have_alloc_sem = 0; |
1899 | } | 1899 | } |
1900 | 1900 | ||
1901 | out: | 1901 | out: |
1902 | if (rw_level != -1) | 1902 | if (rw_level != -1) |
1903 | ocfs2_rw_unlock(inode, rw_level); | 1903 | ocfs2_rw_unlock(inode, rw_level); |
1904 | 1904 | ||
1905 | out_sems: | 1905 | out_sems: |
1906 | if (have_alloc_sem) | 1906 | if (have_alloc_sem) |
1907 | up_read(&inode->i_alloc_sem); | 1907 | up_read(&inode->i_alloc_sem); |
1908 | 1908 | ||
1909 | mutex_unlock(&inode->i_mutex); | 1909 | mutex_unlock(&inode->i_mutex); |
1910 | 1910 | ||
1911 | mlog_exit(ret); | 1911 | mlog_exit(ret); |
1912 | return written ? written : ret; | 1912 | return written ? written : ret; |
1913 | } | 1913 | } |
1914 | 1914 | ||
1915 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, | 1915 | static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe, |
1916 | struct file *out, | 1916 | struct file *out, |
1917 | loff_t *ppos, | 1917 | loff_t *ppos, |
1918 | size_t len, | 1918 | size_t len, |
1919 | unsigned int flags) | 1919 | unsigned int flags) |
1920 | { | 1920 | { |
1921 | int ret; | 1921 | int ret; |
1922 | struct inode *inode = out->f_path.dentry->d_inode; | 1922 | struct inode *inode = out->f_path.dentry->d_inode; |
1923 | 1923 | ||
1924 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, | 1924 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", out, pipe, |
1925 | (unsigned int)len, | 1925 | (unsigned int)len, |
1926 | out->f_path.dentry->d_name.len, | 1926 | out->f_path.dentry->d_name.len, |
1927 | out->f_path.dentry->d_name.name); | 1927 | out->f_path.dentry->d_name.name); |
1928 | 1928 | ||
1929 | inode_double_lock(inode, pipe->inode); | 1929 | mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); |
1930 | 1930 | ||
1931 | ret = ocfs2_rw_lock(inode, 1); | 1931 | ret = ocfs2_rw_lock(inode, 1); |
1932 | if (ret < 0) { | 1932 | if (ret < 0) { |
1933 | mlog_errno(ret); | 1933 | mlog_errno(ret); |
1934 | goto out; | 1934 | goto out; |
1935 | } | 1935 | } |
1936 | 1936 | ||
1937 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, | 1937 | ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0, |
1938 | NULL); | 1938 | NULL); |
1939 | if (ret < 0) { | 1939 | if (ret < 0) { |
1940 | mlog_errno(ret); | 1940 | mlog_errno(ret); |
1941 | goto out_unlock; | 1941 | goto out_unlock; |
1942 | } | 1942 | } |
1943 | 1943 | ||
1944 | if (pipe->inode) | ||
1945 | mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD); | ||
1944 | ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); | 1946 | ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); |
1947 | if (pipe->inode) | ||
1948 | mutex_unlock(&pipe->inode->i_mutex); | ||
1945 | 1949 | ||
1946 | out_unlock: | 1950 | out_unlock: |
1947 | ocfs2_rw_unlock(inode, 1); | 1951 | ocfs2_rw_unlock(inode, 1); |
1948 | out: | 1952 | out: |
1949 | inode_double_unlock(inode, pipe->inode); | 1953 | mutex_unlock(&inode->i_mutex); |
1950 | 1954 | ||
1951 | mlog_exit(ret); | 1955 | mlog_exit(ret); |
1952 | return ret; | 1956 | return ret; |
1953 | } | 1957 | } |
1954 | 1958 | ||
1955 | static ssize_t ocfs2_file_splice_read(struct file *in, | 1959 | static ssize_t ocfs2_file_splice_read(struct file *in, |
1956 | loff_t *ppos, | 1960 | loff_t *ppos, |
1957 | struct pipe_inode_info *pipe, | 1961 | struct pipe_inode_info *pipe, |
1958 | size_t len, | 1962 | size_t len, |
1959 | unsigned int flags) | 1963 | unsigned int flags) |
1960 | { | 1964 | { |
1961 | int ret = 0; | 1965 | int ret = 0; |
1962 | struct inode *inode = in->f_path.dentry->d_inode; | 1966 | struct inode *inode = in->f_path.dentry->d_inode; |
1963 | 1967 | ||
1964 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, | 1968 | mlog_entry("(0x%p, 0x%p, %u, '%.*s')\n", in, pipe, |
1965 | (unsigned int)len, | 1969 | (unsigned int)len, |
1966 | in->f_path.dentry->d_name.len, | 1970 | in->f_path.dentry->d_name.len, |
1967 | in->f_path.dentry->d_name.name); | 1971 | in->f_path.dentry->d_name.name); |
1968 | 1972 | ||
1969 | /* | 1973 | /* |
1970 | * See the comment in ocfs2_file_aio_read() | 1974 | * See the comment in ocfs2_file_aio_read() |
1971 | */ | 1975 | */ |
1972 | ret = ocfs2_inode_lock(inode, NULL, 0); | 1976 | ret = ocfs2_inode_lock(inode, NULL, 0); |
1973 | if (ret < 0) { | 1977 | if (ret < 0) { |
1974 | mlog_errno(ret); | 1978 | mlog_errno(ret); |
1975 | goto bail; | 1979 | goto bail; |
1976 | } | 1980 | } |
1977 | ocfs2_inode_unlock(inode, 0); | 1981 | ocfs2_inode_unlock(inode, 0); |
1978 | 1982 | ||
1979 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); | 1983 | ret = generic_file_splice_read(in, ppos, pipe, len, flags); |
1980 | 1984 | ||
1981 | bail: | 1985 | bail: |
1982 | mlog_exit(ret); | 1986 | mlog_exit(ret); |
1983 | return ret; | 1987 | return ret; |
1984 | } | 1988 | } |
1985 | 1989 | ||
1986 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, | 1990 | static ssize_t ocfs2_file_aio_read(struct kiocb *iocb, |
1987 | const struct iovec *iov, | 1991 | const struct iovec *iov, |
1988 | unsigned long nr_segs, | 1992 | unsigned long nr_segs, |
1989 | loff_t pos) | 1993 | loff_t pos) |
1990 | { | 1994 | { |
1991 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; | 1995 | int ret = 0, rw_level = -1, have_alloc_sem = 0, lock_level = 0; |
1992 | struct file *filp = iocb->ki_filp; | 1996 | struct file *filp = iocb->ki_filp; |
1993 | struct inode *inode = filp->f_path.dentry->d_inode; | 1997 | struct inode *inode = filp->f_path.dentry->d_inode; |
1994 | 1998 | ||
1995 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, | 1999 | mlog_entry("(0x%p, %u, '%.*s')\n", filp, |
1996 | (unsigned int)nr_segs, | 2000 | (unsigned int)nr_segs, |
1997 | filp->f_path.dentry->d_name.len, | 2001 | filp->f_path.dentry->d_name.len, |
1998 | filp->f_path.dentry->d_name.name); | 2002 | filp->f_path.dentry->d_name.name); |
1999 | 2003 | ||
2000 | if (!inode) { | 2004 | if (!inode) { |
2001 | ret = -EINVAL; | 2005 | ret = -EINVAL; |
2002 | mlog_errno(ret); | 2006 | mlog_errno(ret); |
2003 | goto bail; | 2007 | goto bail; |
2004 | } | 2008 | } |
2005 | 2009 | ||
2006 | /* | 2010 | /* |
2007 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads | 2011 | * buffered reads protect themselves in ->readpage(). O_DIRECT reads |
2008 | * need locks to protect pending reads from racing with truncate. | 2012 | * need locks to protect pending reads from racing with truncate. |
2009 | */ | 2013 | */ |
2010 | if (filp->f_flags & O_DIRECT) { | 2014 | if (filp->f_flags & O_DIRECT) { |
2011 | down_read(&inode->i_alloc_sem); | 2015 | down_read(&inode->i_alloc_sem); |
2012 | have_alloc_sem = 1; | 2016 | have_alloc_sem = 1; |
2013 | 2017 | ||
2014 | ret = ocfs2_rw_lock(inode, 0); | 2018 | ret = ocfs2_rw_lock(inode, 0); |
2015 | if (ret < 0) { | 2019 | if (ret < 0) { |
2016 | mlog_errno(ret); | 2020 | mlog_errno(ret); |
2017 | goto bail; | 2021 | goto bail; |
2018 | } | 2022 | } |
2019 | rw_level = 0; | 2023 | rw_level = 0; |
2020 | /* communicate with ocfs2_dio_end_io */ | 2024 | /* communicate with ocfs2_dio_end_io */ |
2021 | ocfs2_iocb_set_rw_locked(iocb, rw_level); | 2025 | ocfs2_iocb_set_rw_locked(iocb, rw_level); |
2022 | } | 2026 | } |
2023 | 2027 | ||
2024 | /* | 2028 | /* |
2025 | * We're fine letting folks race truncates and extending | 2029 | * We're fine letting folks race truncates and extending |
2026 | * writes with read across the cluster, just like they can | 2030 | * writes with read across the cluster, just like they can |
2027 | * locally. Hence no rw_lock during read. | 2031 | * locally. Hence no rw_lock during read. |
2028 | * | 2032 | * |
2029 | * Take and drop the meta data lock to update inode fields | 2033 | * Take and drop the meta data lock to update inode fields |
2030 | * like i_size. This allows the checks down below | 2034 | * like i_size. This allows the checks down below |
2031 | * generic_file_aio_read() a chance of actually working. | 2035 | * generic_file_aio_read() a chance of actually working. |
2032 | */ | 2036 | */ |
2033 | ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); | 2037 | ret = ocfs2_inode_lock_atime(inode, filp->f_vfsmnt, &lock_level); |
2034 | if (ret < 0) { | 2038 | if (ret < 0) { |
2035 | mlog_errno(ret); | 2039 | mlog_errno(ret); |
2036 | goto bail; | 2040 | goto bail; |
2037 | } | 2041 | } |
2038 | ocfs2_inode_unlock(inode, lock_level); | 2042 | ocfs2_inode_unlock(inode, lock_level); |
2039 | 2043 | ||
2040 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); | 2044 | ret = generic_file_aio_read(iocb, iov, nr_segs, iocb->ki_pos); |
2041 | if (ret == -EINVAL) | 2045 | if (ret == -EINVAL) |
2042 | mlog(0, "generic_file_aio_read returned -EINVAL\n"); | 2046 | mlog(0, "generic_file_aio_read returned -EINVAL\n"); |
2043 | 2047 | ||
2044 | /* buffered aio wouldn't have proper lock coverage today */ | 2048 | /* buffered aio wouldn't have proper lock coverage today */ |
2045 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); | 2049 | BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT)); |
2046 | 2050 | ||
2047 | /* see ocfs2_file_aio_write */ | 2051 | /* see ocfs2_file_aio_write */ |
2048 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { | 2052 | if (ret == -EIOCBQUEUED || !ocfs2_iocb_is_rw_locked(iocb)) { |
2049 | rw_level = -1; | 2053 | rw_level = -1; |
2050 | have_alloc_sem = 0; | 2054 | have_alloc_sem = 0; |
2051 | } | 2055 | } |
2052 | 2056 | ||
2053 | bail: | 2057 | bail: |
2054 | if (have_alloc_sem) | 2058 | if (have_alloc_sem) |
2055 | up_read(&inode->i_alloc_sem); | 2059 | up_read(&inode->i_alloc_sem); |
2056 | if (rw_level != -1) | 2060 | if (rw_level != -1) |
2057 | ocfs2_rw_unlock(inode, rw_level); | 2061 | ocfs2_rw_unlock(inode, rw_level); |
2058 | mlog_exit(ret); | 2062 | mlog_exit(ret); |
2059 | 2063 | ||
2060 | return ret; | 2064 | return ret; |
2061 | } | 2065 | } |
2062 | 2066 | ||
/* Inode operations for ocfs2 regular files. */
const struct inode_operations ocfs2_file_iops = {
	.setattr = ocfs2_setattr,
	.getattr = ocfs2_getattr,
	.permission = ocfs2_permission,
	.setxattr = generic_setxattr,
	.getxattr = generic_getxattr,
	.listxattr = ocfs2_listxattr,
	.removexattr = generic_removexattr,
	.fallocate = ocfs2_fallocate,
	.fiemap = ocfs2_fiemap,
};
2074 | 2078 | ||
/*
 * Inode operations for special files (devices, fifos, sockets); only the
 * basic attribute/permission hooks apply here.
 */
const struct inode_operations ocfs2_special_file_iops = {
	.setattr = ocfs2_setattr,
	.getattr = ocfs2_getattr,
	.permission = ocfs2_permission,
};
2080 | 2084 | ||
2081 | /* | 2085 | /* |
2082 | * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with | 2086 | * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with |
2083 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! | 2087 | * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! |
2084 | */ | 2088 | */ |
/* File operations for regular files when cluster POSIX locks (->lock) are available. */
const struct file_operations ocfs2_fops = {
	.llseek = generic_file_llseek,
	.read = do_sync_read,
	.write = do_sync_write,
	.mmap = ocfs2_mmap,
	.fsync = ocfs2_sync_file,
	.release = ocfs2_file_release,
	.open = ocfs2_file_open,
	.aio_read = ocfs2_file_aio_read,
	.aio_write = ocfs2_file_aio_write,
	.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = ocfs2_compat_ioctl,
#endif
	.lock = ocfs2_lock,
	.flock = ocfs2_flock,
	.splice_read = ocfs2_file_splice_read,
	.splice_write = ocfs2_file_splice_write,
};
2104 | 2108 | ||
/* Directory file operations when cluster POSIX locks (->lock) are available. */
const struct file_operations ocfs2_dops = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.readdir = ocfs2_readdir,
	.fsync = ocfs2_sync_file,
	.release = ocfs2_dir_release,
	.open = ocfs2_dir_open,
	.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = ocfs2_compat_ioctl,
#endif
	.lock = ocfs2_lock,
	.flock = ocfs2_flock,
};
2119 | 2123 | ||
2120 | /* | 2124 | /* |
2121 | * POSIX-lockless variants of our file_operations. | 2125 | * POSIX-lockless variants of our file_operations. |
2122 | * | 2126 | * |
2123 | * These will be used if the underlying cluster stack does not support | 2127 | * These will be used if the underlying cluster stack does not support |
2124 | * posix file locking, if the user passes the "localflocks" mount | 2128 | * posix file locking, if the user passes the "localflocks" mount |
2125 | * option, or if we have a local-only fs. | 2129 | * option, or if we have a local-only fs. |
2126 | * | 2130 | * |
2127 | * ocfs2_flock is in here because all stacks handle UNIX file locks, | 2131 | * ocfs2_flock is in here because all stacks handle UNIX file locks, |
2128 | * so we still want it in the case of no stack support for | 2132 | * so we still want it in the case of no stack support for |
2129 | * plocks. Internally, it will do the right thing when asked to ignore | 2133 | * plocks. Internally, it will do the right thing when asked to ignore |
2130 | * the cluster. | 2134 | * the cluster. |
2131 | */ | 2135 | */ |
/* Same as ocfs2_fops, minus the ->lock handler (no cluster POSIX lock support). */
const struct file_operations ocfs2_fops_no_plocks = {
	.llseek = generic_file_llseek,
	.read = do_sync_read,
	.write = do_sync_write,
	.mmap = ocfs2_mmap,
	.fsync = ocfs2_sync_file,
	.release = ocfs2_file_release,
	.open = ocfs2_file_open,
	.aio_read = ocfs2_file_aio_read,
	.aio_write = ocfs2_file_aio_write,
	.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = ocfs2_compat_ioctl,
#endif
	.flock = ocfs2_flock,
	.splice_read = ocfs2_file_splice_read,
	.splice_write = ocfs2_file_splice_write,
};
2150 | 2154 | ||
/* Same as ocfs2_dops, minus the ->lock handler (no cluster POSIX lock support). */
const struct file_operations ocfs2_dops_no_plocks = {
	.llseek = generic_file_llseek,
	.read = generic_read_dir,
	.readdir = ocfs2_readdir,
	.fsync = ocfs2_sync_file,
	.release = ocfs2_dir_release,
	.open = ocfs2_dir_open,
	.unlocked_ioctl = ocfs2_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl = ocfs2_compat_ioctl,
#endif
	.flock = ocfs2_flock,
};
2164 | 2168 |
fs/splice.c
1 | /* | 1 | /* |
2 | * "splice": joining two ropes together by interweaving their strands. | 2 | * "splice": joining two ropes together by interweaving their strands. |
3 | * | 3 | * |
4 | * This is the "extended pipe" functionality, where a pipe is used as | 4 | * This is the "extended pipe" functionality, where a pipe is used as |
5 | * an arbitrary in-memory buffer. Think of a pipe as a small kernel | 5 | * an arbitrary in-memory buffer. Think of a pipe as a small kernel |
6 | * buffer that you can use to transfer data from one end to the other. | 6 | * buffer that you can use to transfer data from one end to the other. |
7 | * | 7 | * |
8 | * The traditional unix read/write is extended with a "splice()" operation | 8 | * The traditional unix read/write is extended with a "splice()" operation |
9 | * that transfers data buffers to or from a pipe buffer. | 9 | * that transfers data buffers to or from a pipe buffer. |
10 | * | 10 | * |
11 | * Named by Larry McVoy, original implementation from Linus, extended by | 11 | * Named by Larry McVoy, original implementation from Linus, extended by |
12 | * Jens to support splicing to files, network, direct splicing, etc and | 12 | * Jens to support splicing to files, network, direct splicing, etc and |
13 | * fixing lots of bugs. | 13 | * fixing lots of bugs. |
14 | * | 14 | * |
15 | * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> | 15 | * Copyright (C) 2005-2006 Jens Axboe <axboe@kernel.dk> |
16 | * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> | 16 | * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> |
17 | * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> | 17 | * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> |
18 | * | 18 | * |
19 | */ | 19 | */ |
20 | #include <linux/fs.h> | 20 | #include <linux/fs.h> |
21 | #include <linux/file.h> | 21 | #include <linux/file.h> |
22 | #include <linux/pagemap.h> | 22 | #include <linux/pagemap.h> |
23 | #include <linux/splice.h> | 23 | #include <linux/splice.h> |
24 | #include <linux/memcontrol.h> | 24 | #include <linux/memcontrol.h> |
25 | #include <linux/mm_inline.h> | 25 | #include <linux/mm_inline.h> |
26 | #include <linux/swap.h> | 26 | #include <linux/swap.h> |
27 | #include <linux/writeback.h> | 27 | #include <linux/writeback.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/module.h> | 29 | #include <linux/module.h> |
30 | #include <linux/syscalls.h> | 30 | #include <linux/syscalls.h> |
31 | #include <linux/uio.h> | 31 | #include <linux/uio.h> |
32 | #include <linux/security.h> | 32 | #include <linux/security.h> |
33 | 33 | ||
34 | /* | 34 | /* |
35 | * Attempt to steal a page from a pipe buffer. This should perhaps go into | 35 | * Attempt to steal a page from a pipe buffer. This should perhaps go into |
36 | * a vm helper function, it's already simplified quite a bit by the | 36 | * a vm helper function, it's already simplified quite a bit by the |
37 | * addition of remove_mapping(). If success is returned, the caller may | 37 | * addition of remove_mapping(). If success is returned, the caller may |
38 | * attempt to reuse this page for another destination. | 38 | * attempt to reuse this page for another destination. |
39 | */ | 39 | */ |
static int page_cache_pipe_buf_steal(struct pipe_inode_info *pipe,
				     struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	struct address_space *mapping;

	lock_page(page);

	mapping = page_mapping(page);
	if (mapping) {
		WARN_ON(!PageUptodate(page));

		/*
		 * At least for ext2 with nobh option, we need to wait on
		 * writeback completing on this page, since we'll remove it
		 * from the pagecache.  Otherwise truncate won't wait on the
		 * page, allowing the disk blocks to be reused by someone else
		 * before we actually wrote our data to them. fs corruption
		 * ensues.
		 */
		wait_on_page_writeback(page);

		/* Private (fs) data still attached and unreleasable: can't steal. */
		if (page_has_private(page) &&
		    !try_to_release_page(page, GFP_KERNEL))
			goto out_unlock;

		/*
		 * If we succeeded in removing the mapping, set LRU flag
		 * and return good.  Note: the page stays locked on this
		 * success path; 0 means the caller now owns the page.
		 */
		if (remove_mapping(mapping, page)) {
			buf->flags |= PIPE_BUF_FLAG_LRU;
			return 0;
		}
	}

	/*
	 * Raced with truncate or failed to remove page from current
	 * address space, unlock and return failure (non-zero).
	 */
out_unlock:
	unlock_page(page);
	return 1;
}
84 | 84 | ||
85 | static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, | 85 | static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, |
86 | struct pipe_buffer *buf) | 86 | struct pipe_buffer *buf) |
87 | { | 87 | { |
88 | page_cache_release(buf->page); | 88 | page_cache_release(buf->page); |
89 | buf->flags &= ~PIPE_BUF_FLAG_LRU; | 89 | buf->flags &= ~PIPE_BUF_FLAG_LRU; |
90 | } | 90 | } |
91 | 91 | ||
92 | /* | 92 | /* |
93 | * Check whether the contents of buf is OK to access. Since the content | 93 | * Check whether the contents of buf is OK to access. Since the content |
94 | * is a page cache page, IO may be in flight. | 94 | * is a page cache page, IO may be in flight. |
95 | */ | 95 | */ |
/*
 * Returns 0 when the page is uptodate and safe to access, -ENODATA if it
 * was truncated away, or -EIO if the read from disk failed.
 */
static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
				       struct pipe_buffer *buf)
{
	struct page *page = buf->page;
	int err;

	if (!PageUptodate(page)) {
		/* Lock the page to serialize against truncate and in-flight IO. */
		lock_page(page);

		/*
		 * Page got truncated/unhashed. This will cause a 0-byte
		 * splice, if this is the first page.
		 */
		if (!page->mapping) {
			err = -ENODATA;
			goto error;
		}

		/*
		 * Uh oh, read-error from disk.
		 */
		if (!PageUptodate(page)) {
			err = -EIO;
			goto error;
		}

		/*
		 * Page is ok afterall, we are done.
		 */
		unlock_page(page);
	}

	return 0;
error:
	unlock_page(page);
	return err;
}
133 | 133 | ||
/* Pipe buffer ops for buffers whose pages come from the page cache. */
static const struct pipe_buf_operations page_cache_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = page_cache_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = page_cache_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
143 | 143 | ||
144 | static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, | 144 | static int user_page_pipe_buf_steal(struct pipe_inode_info *pipe, |
145 | struct pipe_buffer *buf) | 145 | struct pipe_buffer *buf) |
146 | { | 146 | { |
147 | if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) | 147 | if (!(buf->flags & PIPE_BUF_FLAG_GIFT)) |
148 | return 1; | 148 | return 1; |
149 | 149 | ||
150 | buf->flags |= PIPE_BUF_FLAG_LRU; | 150 | buf->flags |= PIPE_BUF_FLAG_LRU; |
151 | return generic_pipe_buf_steal(pipe, buf); | 151 | return generic_pipe_buf_steal(pipe, buf); |
152 | } | 152 | } |
153 | 153 | ||
/* Pipe buffer ops for buffers whose pages were mapped from user memory (vmsplice). */
static const struct pipe_buf_operations user_page_pipe_buf_ops = {
	.can_merge = 0,
	.map = generic_pipe_buf_map,
	.unmap = generic_pipe_buf_unmap,
	.confirm = generic_pipe_buf_confirm,
	.release = page_cache_pipe_buf_release,
	.steal = user_page_pipe_buf_steal,
	.get = generic_pipe_buf_get,
};
163 | 163 | ||
164 | /** | 164 | /** |
165 | * splice_to_pipe - fill passed data into a pipe | 165 | * splice_to_pipe - fill passed data into a pipe |
166 | * @pipe: pipe to fill | 166 | * @pipe: pipe to fill |
167 | * @spd: data to fill | 167 | * @spd: data to fill |
168 | * | 168 | * |
169 | * Description: | 169 | * Description: |
170 | * @spd contains a map of pages and len/offset tuples, along with | 170 | * @spd contains a map of pages and len/offset tuples, along with |
171 | * the struct pipe_buf_operations associated with these pages. This | 171 | * the struct pipe_buf_operations associated with these pages. This |
172 | * function will link that data to the pipe. | 172 | * function will link that data to the pipe. |
173 | * | 173 | * |
174 | */ | 174 | */ |
ssize_t splice_to_pipe(struct pipe_inode_info *pipe,
		       struct splice_pipe_desc *spd)
{
	/* Remember the original count so unconsumed pages can be released below. */
	unsigned int spd_pages = spd->nr_pages;
	int ret, do_wakeup, page_nr;

	ret = 0;		/* bytes linked into the pipe, or -errno */
	do_wakeup = 0;
	page_nr = 0;		/* next spd entry to consume */

	/* Anonymous pipes may lack an inode; only lock when one exists. */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	for (;;) {
		/* No readers left: writing would never be consumed. */
		if (!pipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		if (pipe->nrbufs < PIPE_BUFFERS) {
			/* Slot index wraps within the circular buffer array. */
			int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1);
			struct pipe_buffer *buf = pipe->bufs + newbuf;

			buf->page = spd->pages[page_nr];
			buf->offset = spd->partial[page_nr].offset;
			buf->len = spd->partial[page_nr].len;
			buf->private = spd->partial[page_nr].private;
			buf->ops = spd->ops;
			if (spd->flags & SPLICE_F_GIFT)
				buf->flags |= PIPE_BUF_FLAG_GIFT;

			pipe->nrbufs++;
			page_nr++;
			ret += buf->len;

			if (pipe->inode)
				do_wakeup = 1;

			if (!--spd->nr_pages)
				break;
			if (pipe->nrbufs < PIPE_BUFFERS)
				continue;

			break;
		}

		/* Pipe full and caller asked not to block. */
		if (spd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake readers before sleeping so they can drain the pipe. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
			do_wakeup = 0;
		}

		/*
		 * NOTE: pipe_wait() drops and re-takes the pipe inode mutex
		 * while sleeping, which matters for any outer lock ordering.
		 */
		pipe->waiting_writers++;
		pipe_wait(pipe);
		pipe->waiting_writers--;
	}

	if (pipe->inode) {
		mutex_unlock(&pipe->inode->i_mutex);

		/* Final reader wakeup for data added since the last one. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible(&pipe->wait);
			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
		}
	}

	/* Release any pages that never made it into the pipe. */
	while (page_nr < spd_pages)
		spd->spd_release(spd, page_nr++);

	return ret;
}
264 | 264 | ||
265 | static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) | 265 | static void spd_release_page(struct splice_pipe_desc *spd, unsigned int i) |
266 | { | 266 | { |
267 | page_cache_release(spd->pages[i]); | 267 | page_cache_release(spd->pages[i]); |
268 | } | 268 | } |
269 | 269 | ||
270 | static int | 270 | static int |
271 | __generic_file_splice_read(struct file *in, loff_t *ppos, | 271 | __generic_file_splice_read(struct file *in, loff_t *ppos, |
272 | struct pipe_inode_info *pipe, size_t len, | 272 | struct pipe_inode_info *pipe, size_t len, |
273 | unsigned int flags) | 273 | unsigned int flags) |
274 | { | 274 | { |
275 | struct address_space *mapping = in->f_mapping; | 275 | struct address_space *mapping = in->f_mapping; |
276 | unsigned int loff, nr_pages, req_pages; | 276 | unsigned int loff, nr_pages, req_pages; |
277 | struct page *pages[PIPE_BUFFERS]; | 277 | struct page *pages[PIPE_BUFFERS]; |
278 | struct partial_page partial[PIPE_BUFFERS]; | 278 | struct partial_page partial[PIPE_BUFFERS]; |
279 | struct page *page; | 279 | struct page *page; |
280 | pgoff_t index, end_index; | 280 | pgoff_t index, end_index; |
281 | loff_t isize; | 281 | loff_t isize; |
282 | int error, page_nr; | 282 | int error, page_nr; |
283 | struct splice_pipe_desc spd = { | 283 | struct splice_pipe_desc spd = { |
284 | .pages = pages, | 284 | .pages = pages, |
285 | .partial = partial, | 285 | .partial = partial, |
286 | .flags = flags, | 286 | .flags = flags, |
287 | .ops = &page_cache_pipe_buf_ops, | 287 | .ops = &page_cache_pipe_buf_ops, |
288 | .spd_release = spd_release_page, | 288 | .spd_release = spd_release_page, |
289 | }; | 289 | }; |
290 | 290 | ||
291 | index = *ppos >> PAGE_CACHE_SHIFT; | 291 | index = *ppos >> PAGE_CACHE_SHIFT; |
292 | loff = *ppos & ~PAGE_CACHE_MASK; | 292 | loff = *ppos & ~PAGE_CACHE_MASK; |
293 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 293 | req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
294 | nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); | 294 | nr_pages = min(req_pages, (unsigned)PIPE_BUFFERS); |
295 | 295 | ||
296 | /* | 296 | /* |
297 | * Lookup the (hopefully) full range of pages we need. | 297 | * Lookup the (hopefully) full range of pages we need. |
298 | */ | 298 | */ |
299 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); | 299 | spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, pages); |
300 | index += spd.nr_pages; | 300 | index += spd.nr_pages; |
301 | 301 | ||
302 | /* | 302 | /* |
303 | * If find_get_pages_contig() returned fewer pages than we needed, | 303 | * If find_get_pages_contig() returned fewer pages than we needed, |
304 | * readahead/allocate the rest and fill in the holes. | 304 | * readahead/allocate the rest and fill in the holes. |
305 | */ | 305 | */ |
306 | if (spd.nr_pages < nr_pages) | 306 | if (spd.nr_pages < nr_pages) |
307 | page_cache_sync_readahead(mapping, &in->f_ra, in, | 307 | page_cache_sync_readahead(mapping, &in->f_ra, in, |
308 | index, req_pages - spd.nr_pages); | 308 | index, req_pages - spd.nr_pages); |
309 | 309 | ||
310 | error = 0; | 310 | error = 0; |
311 | while (spd.nr_pages < nr_pages) { | 311 | while (spd.nr_pages < nr_pages) { |
312 | /* | 312 | /* |
313 | * Page could be there, find_get_pages_contig() breaks on | 313 | * Page could be there, find_get_pages_contig() breaks on |
314 | * the first hole. | 314 | * the first hole. |
315 | */ | 315 | */ |
316 | page = find_get_page(mapping, index); | 316 | page = find_get_page(mapping, index); |
317 | if (!page) { | 317 | if (!page) { |
318 | /* | 318 | /* |
319 | * page didn't exist, allocate one. | 319 | * page didn't exist, allocate one. |
320 | */ | 320 | */ |
321 | page = page_cache_alloc_cold(mapping); | 321 | page = page_cache_alloc_cold(mapping); |
322 | if (!page) | 322 | if (!page) |
323 | break; | 323 | break; |
324 | 324 | ||
325 | error = add_to_page_cache_lru(page, mapping, index, | 325 | error = add_to_page_cache_lru(page, mapping, index, |
326 | mapping_gfp_mask(mapping)); | 326 | mapping_gfp_mask(mapping)); |
327 | if (unlikely(error)) { | 327 | if (unlikely(error)) { |
328 | page_cache_release(page); | 328 | page_cache_release(page); |
329 | if (error == -EEXIST) | 329 | if (error == -EEXIST) |
330 | continue; | 330 | continue; |
331 | break; | 331 | break; |
332 | } | 332 | } |
333 | /* | 333 | /* |
334 | * add_to_page_cache() locks the page, unlock it | 334 | * add_to_page_cache() locks the page, unlock it |
335 | * to avoid convoluting the logic below even more. | 335 | * to avoid convoluting the logic below even more. |
336 | */ | 336 | */ |
337 | unlock_page(page); | 337 | unlock_page(page); |
338 | } | 338 | } |
339 | 339 | ||
340 | pages[spd.nr_pages++] = page; | 340 | pages[spd.nr_pages++] = page; |
341 | index++; | 341 | index++; |
342 | } | 342 | } |
343 | 343 | ||
344 | /* | 344 | /* |
345 | * Now loop over the map and see if we need to start IO on any | 345 | * Now loop over the map and see if we need to start IO on any |
346 | * pages, fill in the partial map, etc. | 346 | * pages, fill in the partial map, etc. |
347 | */ | 347 | */ |
348 | index = *ppos >> PAGE_CACHE_SHIFT; | 348 | index = *ppos >> PAGE_CACHE_SHIFT; |
349 | nr_pages = spd.nr_pages; | 349 | nr_pages = spd.nr_pages; |
350 | spd.nr_pages = 0; | 350 | spd.nr_pages = 0; |
351 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { | 351 | for (page_nr = 0; page_nr < nr_pages; page_nr++) { |
352 | unsigned int this_len; | 352 | unsigned int this_len; |
353 | 353 | ||
354 | if (!len) | 354 | if (!len) |
355 | break; | 355 | break; |
356 | 356 | ||
357 | /* | 357 | /* |
358 | * this_len is the max we'll use from this page | 358 | * this_len is the max we'll use from this page |
359 | */ | 359 | */ |
360 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); | 360 | this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); |
361 | page = pages[page_nr]; | 361 | page = pages[page_nr]; |
362 | 362 | ||
363 | if (PageReadahead(page)) | 363 | if (PageReadahead(page)) |
364 | page_cache_async_readahead(mapping, &in->f_ra, in, | 364 | page_cache_async_readahead(mapping, &in->f_ra, in, |
365 | page, index, req_pages - page_nr); | 365 | page, index, req_pages - page_nr); |
366 | 366 | ||
367 | /* | 367 | /* |
368 | * If the page isn't uptodate, we may need to start io on it | 368 | * If the page isn't uptodate, we may need to start io on it |
369 | */ | 369 | */ |
370 | if (!PageUptodate(page)) { | 370 | if (!PageUptodate(page)) { |
371 | /* | 371 | /* |
372 | * If in nonblock mode then dont block on waiting | 372 | * If in nonblock mode then dont block on waiting |
373 | * for an in-flight io page | 373 | * for an in-flight io page |
374 | */ | 374 | */ |
375 | if (flags & SPLICE_F_NONBLOCK) { | 375 | if (flags & SPLICE_F_NONBLOCK) { |
376 | if (!trylock_page(page)) { | 376 | if (!trylock_page(page)) { |
377 | error = -EAGAIN; | 377 | error = -EAGAIN; |
378 | break; | 378 | break; |
379 | } | 379 | } |
380 | } else | 380 | } else |
381 | lock_page(page); | 381 | lock_page(page); |
382 | 382 | ||
383 | /* | 383 | /* |
384 | * Page was truncated, or invalidated by the | 384 | * Page was truncated, or invalidated by the |
385 | * filesystem. Redo the find/create, but this time the | 385 | * filesystem. Redo the find/create, but this time the |
386 | * page is kept locked, so there's no chance of another | 386 | * page is kept locked, so there's no chance of another |
387 | * race with truncate/invalidate. | 387 | * race with truncate/invalidate. |
388 | */ | 388 | */ |
389 | if (!page->mapping) { | 389 | if (!page->mapping) { |
390 | unlock_page(page); | 390 | unlock_page(page); |
391 | page = find_or_create_page(mapping, index, | 391 | page = find_or_create_page(mapping, index, |
392 | mapping_gfp_mask(mapping)); | 392 | mapping_gfp_mask(mapping)); |
393 | 393 | ||
394 | if (!page) { | 394 | if (!page) { |
395 | error = -ENOMEM; | 395 | error = -ENOMEM; |
396 | break; | 396 | break; |
397 | } | 397 | } |
398 | page_cache_release(pages[page_nr]); | 398 | page_cache_release(pages[page_nr]); |
399 | pages[page_nr] = page; | 399 | pages[page_nr] = page; |
400 | } | 400 | } |
401 | /* | 401 | /* |
402 | * page was already under io and is now done, great | 402 | * page was already under io and is now done, great |
403 | */ | 403 | */ |
404 | if (PageUptodate(page)) { | 404 | if (PageUptodate(page)) { |
405 | unlock_page(page); | 405 | unlock_page(page); |
406 | goto fill_it; | 406 | goto fill_it; |
407 | } | 407 | } |
408 | 408 | ||
409 | /* | 409 | /* |
410 | * need to read in the page | 410 | * need to read in the page |
411 | */ | 411 | */ |
412 | error = mapping->a_ops->readpage(in, page); | 412 | error = mapping->a_ops->readpage(in, page); |
413 | if (unlikely(error)) { | 413 | if (unlikely(error)) { |
414 | /* | 414 | /* |
415 | * We really should re-lookup the page here, | 415 | * We really should re-lookup the page here, |
416 | * but it complicates things a lot. Instead | 416 | * but it complicates things a lot. Instead |
417 | * lets just do what we already stored, and | 417 | * lets just do what we already stored, and |
418 | * we'll get it the next time we are called. | 418 | * we'll get it the next time we are called. |
419 | */ | 419 | */ |
420 | if (error == AOP_TRUNCATED_PAGE) | 420 | if (error == AOP_TRUNCATED_PAGE) |
421 | error = 0; | 421 | error = 0; |
422 | 422 | ||
423 | break; | 423 | break; |
424 | } | 424 | } |
425 | } | 425 | } |
426 | fill_it: | 426 | fill_it: |
427 | /* | 427 | /* |
428 | * i_size must be checked after PageUptodate. | 428 | * i_size must be checked after PageUptodate. |
429 | */ | 429 | */ |
430 | isize = i_size_read(mapping->host); | 430 | isize = i_size_read(mapping->host); |
431 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; | 431 | end_index = (isize - 1) >> PAGE_CACHE_SHIFT; |
432 | if (unlikely(!isize || index > end_index)) | 432 | if (unlikely(!isize || index > end_index)) |
433 | break; | 433 | break; |
434 | 434 | ||
435 | /* | 435 | /* |
436 | * if this is the last page, see if we need to shrink | 436 | * if this is the last page, see if we need to shrink |
437 | * the length and stop | 437 | * the length and stop |
438 | */ | 438 | */ |
439 | if (end_index == index) { | 439 | if (end_index == index) { |
440 | unsigned int plen; | 440 | unsigned int plen; |
441 | 441 | ||
442 | /* | 442 | /* |
443 | * max good bytes in this page | 443 | * max good bytes in this page |
444 | */ | 444 | */ |
445 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; | 445 | plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; |
446 | if (plen <= loff) | 446 | if (plen <= loff) |
447 | break; | 447 | break; |
448 | 448 | ||
449 | /* | 449 | /* |
450 | * force quit after adding this page | 450 | * force quit after adding this page |
451 | */ | 451 | */ |
452 | this_len = min(this_len, plen - loff); | 452 | this_len = min(this_len, plen - loff); |
453 | len = this_len; | 453 | len = this_len; |
454 | } | 454 | } |
455 | 455 | ||
456 | partial[page_nr].offset = loff; | 456 | partial[page_nr].offset = loff; |
457 | partial[page_nr].len = this_len; | 457 | partial[page_nr].len = this_len; |
458 | len -= this_len; | 458 | len -= this_len; |
459 | loff = 0; | 459 | loff = 0; |
460 | spd.nr_pages++; | 460 | spd.nr_pages++; |
461 | index++; | 461 | index++; |
462 | } | 462 | } |
463 | 463 | ||
464 | /* | 464 | /* |
465 | * Release any pages at the end, if we quit early. 'page_nr' is how far | 465 | * Release any pages at the end, if we quit early. 'page_nr' is how far |
466 | * we got, 'nr_pages' is how many pages are in the map. | 466 | * we got, 'nr_pages' is how many pages are in the map. |
467 | */ | 467 | */ |
468 | while (page_nr < nr_pages) | 468 | while (page_nr < nr_pages) |
469 | page_cache_release(pages[page_nr++]); | 469 | page_cache_release(pages[page_nr++]); |
470 | in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; | 470 | in->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; |
471 | 471 | ||
472 | if (spd.nr_pages) | 472 | if (spd.nr_pages) |
473 | return splice_to_pipe(pipe, &spd); | 473 | return splice_to_pipe(pipe, &spd); |
474 | 474 | ||
475 | return error; | 475 | return error; |
476 | } | 476 | } |
477 | 477 | ||
478 | /** | 478 | /** |
479 | * generic_file_splice_read - splice data from file to a pipe | 479 | * generic_file_splice_read - splice data from file to a pipe |
480 | * @in: file to splice from | 480 | * @in: file to splice from |
481 | * @ppos: position in @in | 481 | * @ppos: position in @in |
482 | * @pipe: pipe to splice to | 482 | * @pipe: pipe to splice to |
483 | * @len: number of bytes to splice | 483 | * @len: number of bytes to splice |
484 | * @flags: splice modifier flags | 484 | * @flags: splice modifier flags |
485 | * | 485 | * |
486 | * Description: | 486 | * Description: |
487 | * Will read pages from given file and fill them into a pipe. Can be | 487 | * Will read pages from given file and fill them into a pipe. Can be |
488 | * used as long as the address_space operations for the source implements | 488 | * used as long as the address_space operations for the source implements |
489 | * a readpage() hook. | 489 | * a readpage() hook. |
490 | * | 490 | * |
491 | */ | 491 | */ |
492 | ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, | 492 | ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, |
493 | struct pipe_inode_info *pipe, size_t len, | 493 | struct pipe_inode_info *pipe, size_t len, |
494 | unsigned int flags) | 494 | unsigned int flags) |
495 | { | 495 | { |
496 | loff_t isize, left; | 496 | loff_t isize, left; |
497 | int ret; | 497 | int ret; |
498 | 498 | ||
499 | isize = i_size_read(in->f_mapping->host); | 499 | isize = i_size_read(in->f_mapping->host); |
500 | if (unlikely(*ppos >= isize)) | 500 | if (unlikely(*ppos >= isize)) |
501 | return 0; | 501 | return 0; |
502 | 502 | ||
503 | left = isize - *ppos; | 503 | left = isize - *ppos; |
504 | if (unlikely(left < len)) | 504 | if (unlikely(left < len)) |
505 | len = left; | 505 | len = left; |
506 | 506 | ||
507 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); | 507 | ret = __generic_file_splice_read(in, ppos, pipe, len, flags); |
508 | if (ret > 0) | 508 | if (ret > 0) |
509 | *ppos += ret; | 509 | *ppos += ret; |
510 | 510 | ||
511 | return ret; | 511 | return ret; |
512 | } | 512 | } |
513 | 513 | ||
514 | EXPORT_SYMBOL(generic_file_splice_read); | 514 | EXPORT_SYMBOL(generic_file_splice_read); |
515 | 515 | ||
516 | /* | 516 | /* |
517 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' | 517 | * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' |
518 | * using sendpage(). Return the number of bytes sent. | 518 | * using sendpage(). Return the number of bytes sent. |
519 | */ | 519 | */ |
520 | static int pipe_to_sendpage(struct pipe_inode_info *pipe, | 520 | static int pipe_to_sendpage(struct pipe_inode_info *pipe, |
521 | struct pipe_buffer *buf, struct splice_desc *sd) | 521 | struct pipe_buffer *buf, struct splice_desc *sd) |
522 | { | 522 | { |
523 | struct file *file = sd->u.file; | 523 | struct file *file = sd->u.file; |
524 | loff_t pos = sd->pos; | 524 | loff_t pos = sd->pos; |
525 | int ret, more; | 525 | int ret, more; |
526 | 526 | ||
527 | ret = buf->ops->confirm(pipe, buf); | 527 | ret = buf->ops->confirm(pipe, buf); |
528 | if (!ret) { | 528 | if (!ret) { |
529 | more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; | 529 | more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; |
530 | 530 | ||
531 | ret = file->f_op->sendpage(file, buf->page, buf->offset, | 531 | ret = file->f_op->sendpage(file, buf->page, buf->offset, |
532 | sd->len, &pos, more); | 532 | sd->len, &pos, more); |
533 | } | 533 | } |
534 | 534 | ||
535 | return ret; | 535 | return ret; |
536 | } | 536 | } |
537 | 537 | ||
/*
 * This is a little more tricky than the file -> pipe splicing. There are
 * basically three cases:
 *
 *	- Destination page already exists in the address space and there
 *	  are users of it. For that case we have no other option that
 *	  copying the data. Tough luck.
 *	- Destination page already exists in the address space, but there
 *	  are no users of it. Make sure it's uptodate, then drop it. Fall
 *	  through to last case.
 *	- Destination page does not exist, we can add the pipe page to
 *	  the page cache and avoid the copy.
 *
 * If asked to move pages to the output file (SPLICE_F_MOVE is set in
 * sd->flags), we attempt to migrate pages from the pipe to the output
 * file address space page cache. This is possible if no one else has
 * the pipe page referenced outside of the pipe and page cache. If
 * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create
 * a new page in the output file page cache and fill/dirty that.
 */
static int pipe_to_file(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	struct file *file = sd->u.file;
	struct address_space *mapping = file->f_mapping;
	unsigned int offset, this_len;
	struct page *page;
	void *fsdata;
	int ret;

	/*
	 * make sure the data in this buffer is uptodate
	 */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/* Offset of sd->pos within its page. */
	offset = sd->pos & ~PAGE_CACHE_MASK;

	/* Never copy past the end of the destination page. */
	this_len = sd->len;
	if (this_len + offset > PAGE_CACHE_SIZE)
		this_len = PAGE_CACHE_SIZE - offset;

	ret = pagecache_write_begin(file, mapping, sd->pos, this_len,
				AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata);
	if (unlikely(ret))
		goto out;

	/*
	 * If write_begin handed back the very pipe page (the steal path),
	 * no copy is needed; otherwise copy from the pipe buffer into the
	 * pagecache page.
	 */
	if (buf->page != page) {
		/*
		 * Careful, ->map() uses KM_USER0!
		 */
		char *src = buf->ops->map(pipe, buf, 1);
		char *dst = kmap_atomic(page, KM_USER1);

		memcpy(dst + offset, src + buf->offset, this_len);
		flush_dcache_page(page);
		kunmap_atomic(dst, KM_USER1);
		buf->ops->unmap(pipe, buf, src);
	}
	ret = pagecache_write_end(file, mapping, sd->pos, this_len, this_len,
				page, fsdata);
out:
	return ret;
}
603 | 603 | ||
/**
 * __splice_from_pipe - splice data from a pipe to given actor
 * @pipe:	pipe to splice from
 * @sd:		information to @actor
 * @actor:	handler that splices the data
 *
 * Description:
 *    This function does little more than loop over the pipe and call
 *    @actor to do the actual moving of a single struct pipe_buffer to
 *    the desired destination. See pipe_to_file, pipe_to_sendpage, or
 *    pipe_to_user.
 *
 *    The caller is responsible for any locking (e.g. pipe->inode->i_mutex);
 *    note that pipe_wait() below drops and retakes that lock while sleeping.
 */
ssize_t __splice_from_pipe(struct pipe_inode_info *pipe, struct splice_desc *sd,
			   splice_actor *actor)
{
	int ret, do_wakeup, err;

	ret = 0;
	do_wakeup = 0;

	for (;;) {
		if (pipe->nrbufs) {
			/* Consume from the head of the circular buffer ring. */
			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
			const struct pipe_buf_operations *ops = buf->ops;

			sd->len = buf->len;
			if (sd->len > sd->total_len)
				sd->len = sd->total_len;

			err = actor(pipe, buf, sd);
			if (err <= 0) {
				/*
				 * Report the error only if nothing was
				 * spliced yet; -ENODATA just means "retry".
				 */
				if (!ret && err != -ENODATA)
					ret = err;

				break;
			}

			/* Advance both the buffer and the splice descriptor. */
			ret += err;
			buf->offset += err;
			buf->len -= err;

			sd->len -= err;
			sd->pos += err;
			sd->total_len -= err;
			if (sd->len)
				continue;

			/* Buffer fully consumed: release it and move the ring head. */
			if (!buf->len) {
				buf->ops = NULL;
				ops->release(pipe, buf);
				pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1);
				pipe->nrbufs--;
				if (pipe->inode)
					do_wakeup = 1;
			}

			if (!sd->total_len)
				break;
		}

		if (pipe->nrbufs)
			continue;
		/* No writers left: whatever we have is all there will be. */
		if (!pipe->writers)
			break;
		/* No writer is blocked on us; return partial progress if any. */
		if (!pipe->waiting_writers) {
			if (ret)
				break;
		}

		if (sd->flags & SPLICE_F_NONBLOCK) {
			if (!ret)
				ret = -EAGAIN;
			break;
		}

		if (signal_pending(current)) {
			if (!ret)
				ret = -ERESTARTSYS;
			break;
		}

		/* Wake blocked writers before we sleep waiting for more data. */
		if (do_wakeup) {
			smp_mb();
			if (waitqueue_active(&pipe->wait))
				wake_up_interruptible_sync(&pipe->wait);
			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
			do_wakeup = 0;
		}

		pipe_wait(pipe);
	}

	/* Final wakeup for writers if we freed any buffers on the way out. */
	if (do_wakeup) {
		smp_mb();
		if (waitqueue_active(&pipe->wait))
			wake_up_interruptible(&pipe->wait);
		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	}

	return ret;
}
EXPORT_SYMBOL(__splice_from_pipe);
707 | 707 | ||
/**
 * splice_from_pipe - splice data from a pipe to a file
 * @pipe:	pipe to splice from
 * @out:	file to splice to
 * @ppos:	position in @out
 * @len:	how many bytes to splice
 * @flags:	splice modifier flags
 * @actor:	handler that splices the data
 *
 * Description:
 *    See __splice_from_pipe. This function locks the input and output inodes,
 *    otherwise it's identical to __splice_from_pipe().
 *
 */
ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
			 loff_t *ppos, size_t len, unsigned int flags,
			 splice_actor *actor)
{
	ssize_t ret;
	struct inode *inode = out->f_mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};

	/*
	 * The actor worker might be calling ->write_begin and
	 * ->write_end. Most of the time, these expect i_mutex to
	 * be held. Since this may result in an ABBA deadlock with
	 * pipe->inode, we have to order lock acquiry here.
	 *
	 * Outer lock must be inode->i_mutex, as pipe_wait() will
	 * release and reacquire pipe->inode->i_mutex, AND inode must
	 * never be a pipe.
	 */
	WARN_ON(S_ISFIFO(inode->i_mode));
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
	ret = __splice_from_pipe(pipe, &sd, actor);
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
	mutex_unlock(&inode->i_mutex);

	return ret;
}
747 | 756 | ||
748 | /** | 757 | /** |
749 | * generic_file_splice_write_nolock - generic_file_splice_write without mutexes | 758 | * generic_file_splice_write_nolock - generic_file_splice_write without mutexes |
750 | * @pipe: pipe info | 759 | * @pipe: pipe info |
751 | * @out: file to write to | 760 | * @out: file to write to |
752 | * @ppos: position in @out | 761 | * @ppos: position in @out |
753 | * @len: number of bytes to splice | 762 | * @len: number of bytes to splice |
754 | * @flags: splice modifier flags | 763 | * @flags: splice modifier flags |
755 | * | 764 | * |
756 | * Description: | 765 | * Description: |
757 | * Will either move or copy pages (determined by @flags options) from | 766 | * Will either move or copy pages (determined by @flags options) from |
758 | * the given pipe inode to the given file. The caller is responsible | 767 | * the given pipe inode to the given file. The caller is responsible |
759 | * for acquiring i_mutex on both inodes. | 768 | * for acquiring i_mutex on both inodes. |
760 | * | 769 | * |
761 | */ | 770 | */ |
762 | ssize_t | 771 | ssize_t |
763 | generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, | 772 | generic_file_splice_write_nolock(struct pipe_inode_info *pipe, struct file *out, |
764 | loff_t *ppos, size_t len, unsigned int flags) | 773 | loff_t *ppos, size_t len, unsigned int flags) |
765 | { | 774 | { |
766 | struct address_space *mapping = out->f_mapping; | 775 | struct address_space *mapping = out->f_mapping; |
767 | struct inode *inode = mapping->host; | 776 | struct inode *inode = mapping->host; |
768 | struct splice_desc sd = { | 777 | struct splice_desc sd = { |
769 | .total_len = len, | 778 | .total_len = len, |
770 | .flags = flags, | 779 | .flags = flags, |
771 | .pos = *ppos, | 780 | .pos = *ppos, |
772 | .u.file = out, | 781 | .u.file = out, |
773 | }; | 782 | }; |
774 | ssize_t ret; | 783 | ssize_t ret; |
775 | int err; | 784 | int err; |
776 | 785 | ||
777 | err = file_remove_suid(out); | 786 | err = file_remove_suid(out); |
778 | if (unlikely(err)) | 787 | if (unlikely(err)) |
779 | return err; | 788 | return err; |
780 | 789 | ||
781 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); | 790 | ret = __splice_from_pipe(pipe, &sd, pipe_to_file); |
782 | if (ret > 0) { | 791 | if (ret > 0) { |
783 | unsigned long nr_pages; | 792 | unsigned long nr_pages; |
784 | 793 | ||
785 | *ppos += ret; | 794 | *ppos += ret; |
786 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; | 795 | nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; |
787 | 796 | ||
788 | /* | 797 | /* |
789 | * If file or inode is SYNC and we actually wrote some data, | 798 | * If file or inode is SYNC and we actually wrote some data, |
790 | * sync it. | 799 | * sync it. |
791 | */ | 800 | */ |
792 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { | 801 | if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { |
793 | err = generic_osync_inode(inode, mapping, | 802 | err = generic_osync_inode(inode, mapping, |
794 | OSYNC_METADATA|OSYNC_DATA); | 803 | OSYNC_METADATA|OSYNC_DATA); |
795 | 804 | ||
796 | if (err) | 805 | if (err) |
797 | ret = err; | 806 | ret = err; |
798 | } | 807 | } |
799 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); | 808 | balance_dirty_pages_ratelimited_nr(mapping, nr_pages); |
800 | } | 809 | } |
801 | 810 | ||
802 | return ret; | 811 | return ret; |
803 | } | 812 | } |
804 | 813 | ||
805 | EXPORT_SYMBOL(generic_file_splice_write_nolock); | 814 | EXPORT_SYMBOL(generic_file_splice_write_nolock); |
806 | 815 | ||
/**
 * generic_file_splice_write - splice data from a pipe to a file
 * @pipe:	pipe info
 * @out:	file to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will either move or copy pages (determined by @flags options) from
 *    the given pipe inode to the given file.
 *
 */
ssize_t
generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
			  loff_t *ppos, size_t len, unsigned int flags)
{
	struct address_space *mapping = out->f_mapping;
	struct inode *inode = mapping->host;
	struct splice_desc sd = {
		.total_len = len,
		.flags = flags,
		.pos = *ppos,
		.u.file = out,
	};
	ssize_t ret;

	/*
	 * Lock ordering mirrors splice_from_pipe(): the target inode's
	 * i_mutex is the outer lock and pipe->inode->i_mutex the inner one,
	 * because pipe_wait() drops and retakes the pipe mutex. The target
	 * must never itself be a pipe.
	 */
	WARN_ON(S_ISFIFO(inode->i_mode));
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	ret = file_remove_suid(out);
	if (likely(!ret)) {
		/* Only take the pipe mutex once suid removal succeeded. */
		if (pipe->inode)
			mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
		ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
		if (pipe->inode)
			mutex_unlock(&pipe->inode->i_mutex);
	}
	mutex_unlock(&inode->i_mutex);
	if (ret > 0) {
		unsigned long nr_pages;

		*ppos += ret;
		nr_pages = (ret + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;

		/*
		 * If file or inode is SYNC and we actually wrote some data,
		 * sync it.
		 */
		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
			int err;

			mutex_lock(&inode->i_mutex);
			err = generic_osync_inode(inode, mapping,
						  OSYNC_METADATA|OSYNC_DATA);
			mutex_unlock(&inode->i_mutex);

			if (err)
				ret = err;
		}
		balance_dirty_pages_ratelimited_nr(mapping, nr_pages);
	}

	return ret;
}

EXPORT_SYMBOL(generic_file_splice_write);
867 | 882 | ||
/**
 * generic_splice_sendpage - splice data from a pipe to a socket
 * @pipe:	pipe to splice from
 * @out:	socket to write to
 * @ppos:	position in @out
 * @len:	number of bytes to splice
 * @flags:	splice modifier flags
 *
 * Description:
 *    Will send @len bytes from the pipe to a network socket. No data copying
 *    is involved.
 *
 */
ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out,
				loff_t *ppos, size_t len, unsigned int flags)
{
	/* Thin wrapper: locked splice loop with the sendpage() actor. */
	return splice_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage);
}

EXPORT_SYMBOL(generic_splice_sendpage);
888 | 903 | ||
889 | /* | 904 | /* |
890 | * Attempt to initiate a splice from pipe to file. | 905 | * Attempt to initiate a splice from pipe to file. |
891 | */ | 906 | */ |
892 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, | 907 | static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, |
893 | loff_t *ppos, size_t len, unsigned int flags) | 908 | loff_t *ppos, size_t len, unsigned int flags) |
894 | { | 909 | { |
895 | int ret; | 910 | int ret; |
896 | 911 | ||
897 | if (unlikely(!out->f_op || !out->f_op->splice_write)) | 912 | if (unlikely(!out->f_op || !out->f_op->splice_write)) |
898 | return -EINVAL; | 913 | return -EINVAL; |
899 | 914 | ||
900 | if (unlikely(!(out->f_mode & FMODE_WRITE))) | 915 | if (unlikely(!(out->f_mode & FMODE_WRITE))) |
901 | return -EBADF; | 916 | return -EBADF; |
902 | 917 | ||
903 | if (unlikely(out->f_flags & O_APPEND)) | 918 | if (unlikely(out->f_flags & O_APPEND)) |
904 | return -EINVAL; | 919 | return -EINVAL; |
905 | 920 | ||
906 | ret = rw_verify_area(WRITE, out, ppos, len); | 921 | ret = rw_verify_area(WRITE, out, ppos, len); |
907 | if (unlikely(ret < 0)) | 922 | if (unlikely(ret < 0)) |
908 | return ret; | 923 | return ret; |
909 | 924 | ||
910 | return out->f_op->splice_write(pipe, out, ppos, len, flags); | 925 | return out->f_op->splice_write(pipe, out, ppos, len, flags); |
911 | } | 926 | } |
912 | 927 | ||
913 | /* | 928 | /* |
914 | * Attempt to initiate a splice from a file to a pipe. | 929 | * Attempt to initiate a splice from a file to a pipe. |
915 | */ | 930 | */ |
916 | static long do_splice_to(struct file *in, loff_t *ppos, | 931 | static long do_splice_to(struct file *in, loff_t *ppos, |
917 | struct pipe_inode_info *pipe, size_t len, | 932 | struct pipe_inode_info *pipe, size_t len, |
918 | unsigned int flags) | 933 | unsigned int flags) |
919 | { | 934 | { |
920 | int ret; | 935 | int ret; |
921 | 936 | ||
922 | if (unlikely(!in->f_op || !in->f_op->splice_read)) | 937 | if (unlikely(!in->f_op || !in->f_op->splice_read)) |
923 | return -EINVAL; | 938 | return -EINVAL; |
924 | 939 | ||
925 | if (unlikely(!(in->f_mode & FMODE_READ))) | 940 | if (unlikely(!(in->f_mode & FMODE_READ))) |
926 | return -EBADF; | 941 | return -EBADF; |
927 | 942 | ||
928 | ret = rw_verify_area(READ, in, ppos, len); | 943 | ret = rw_verify_area(READ, in, ppos, len); |
929 | if (unlikely(ret < 0)) | 944 | if (unlikely(ret < 0)) |
930 | return ret; | 945 | return ret; |
931 | 946 | ||
932 | return in->f_op->splice_read(in, ppos, pipe, len, flags); | 947 | return in->f_op->splice_read(in, ppos, pipe, len, flags); |
933 | } | 948 | } |
934 | 949 | ||
/**
 * splice_direct_to_actor - splices data directly between two non-pipes
 * @in: file to splice from
 * @sd: actor information on where to splice to
 * @actor: handles the data splicing
 *
 * Description:
 *    This is a special case helper to splice directly between two
 *    points, without requiring an explicit pipe. Internally an allocated
 *    pipe is cached in the process, and reused during the lifetime of
 *    that process.
 *
 *    Returns the number of bytes spliced, or a negative errno if nothing
 *    was transferred.
 */
ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd,
			       splice_direct_actor *actor)
{
	struct pipe_inode_info *pipe;
	long ret, bytes;
	umode_t i_mode;
	size_t len;
	int i, flags;

	/*
	 * We require the input being a regular file, as we don't want to
	 * randomly drop data for eg socket -> socket splicing. Use the
	 * piped splicing for that!
	 */
	i_mode = in->f_path.dentry->d_inode->i_mode;
	if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode)))
		return -EINVAL;

	/*
	 * neither in nor out is a pipe, setup an internal pipe attached to
	 * 'out' and transfer the wanted data from 'in' to 'out' through that
	 */
	pipe = current->splice_pipe;
	if (unlikely(!pipe)) {
		pipe = alloc_pipe_info(NULL);
		if (!pipe)
			return -ENOMEM;

		/*
		 * We don't have an immediate reader, but we'll read the stuff
		 * out of the pipe right after the splice_to_pipe(). So set
		 * PIPE_READERS appropriately.
		 */
		pipe->readers = 1;

		/* Cache the pipe on the task for reuse on later calls. */
		current->splice_pipe = pipe;
	}

	/*
	 * Do the splice.
	 */
	ret = 0;
	bytes = 0;
	len = sd->total_len;
	flags = sd->flags;

	/*
	 * Don't block on output, we have to drain the direct pipe.
	 */
	sd->flags &= ~SPLICE_F_NONBLOCK;

	while (len) {
		size_t read_len;
		loff_t pos = sd->pos, prev_pos = pos;

		/* Pull the next chunk from 'in' into the internal pipe. */
		ret = do_splice_to(in, &pos, pipe, len, flags);
		if (unlikely(ret <= 0))
			goto out_release;

		read_len = ret;
		sd->total_len = read_len;

		/*
		 * NOTE: nonblocking mode only applies to the input. We
		 * must not do the output in nonblocking mode as then we
		 * could get stuck data in the internal pipe:
		 */
		ret = actor(pipe, sd);
		if (unlikely(ret <= 0)) {
			/* Output failed: rewind to before this chunk. */
			sd->pos = prev_pos;
			goto out_release;
		}

		bytes += ret;
		len -= ret;
		sd->pos = pos;

		if (ret < read_len) {
			/*
			 * Short write: the actor consumed only part of what
			 * was read, rewind the position past the consumed
			 * part and bail out.
			 */
			sd->pos = prev_pos + ret;
			goto out_release;
		}
	}

done:
	/* Reset the cached pipe to empty for the next user. */
	pipe->nrbufs = pipe->curbuf = 0;
	file_accessed(in);
	return bytes;

out_release:
	/*
	 * If we did an incomplete transfer we must release
	 * the pipe buffers in question:
	 */
	for (i = 0; i < PIPE_BUFFERS; i++) {
		struct pipe_buffer *buf = pipe->bufs + i;

		if (buf->ops) {
			buf->ops->release(pipe, buf);
			buf->ops = NULL;
		}
	}

	/* If nothing was transferred, propagate the error instead. */
	if (!bytes)
		bytes = ret;

	goto done;
}
EXPORT_SYMBOL(splice_direct_to_actor);
1056 | 1071 | ||
1057 | static int direct_splice_actor(struct pipe_inode_info *pipe, | 1072 | static int direct_splice_actor(struct pipe_inode_info *pipe, |
1058 | struct splice_desc *sd) | 1073 | struct splice_desc *sd) |
1059 | { | 1074 | { |
1060 | struct file *file = sd->u.file; | 1075 | struct file *file = sd->u.file; |
1061 | 1076 | ||
1062 | return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); | 1077 | return do_splice_from(pipe, file, &sd->pos, sd->total_len, sd->flags); |
1063 | } | 1078 | } |
1064 | 1079 | ||
1065 | /** | 1080 | /** |
1066 | * do_splice_direct - splices data directly between two files | 1081 | * do_splice_direct - splices data directly between two files |
1067 | * @in: file to splice from | 1082 | * @in: file to splice from |
1068 | * @ppos: input file offset | 1083 | * @ppos: input file offset |
1069 | * @out: file to splice to | 1084 | * @out: file to splice to |
1070 | * @len: number of bytes to splice | 1085 | * @len: number of bytes to splice |
1071 | * @flags: splice modifier flags | 1086 | * @flags: splice modifier flags |
1072 | * | 1087 | * |
1073 | * Description: | 1088 | * Description: |
1074 | * For use by do_sendfile(). splice can easily emulate sendfile, but | 1089 | * For use by do_sendfile(). splice can easily emulate sendfile, but |
1075 | * doing it in the application would incur an extra system call | 1090 | * doing it in the application would incur an extra system call |
1076 | * (splice in + splice out, as compared to just sendfile()). So this helper | 1091 | * (splice in + splice out, as compared to just sendfile()). So this helper |
1077 | * can splice directly through a process-private pipe. | 1092 | * can splice directly through a process-private pipe. |
1078 | * | 1093 | * |
1079 | */ | 1094 | */ |
1080 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, | 1095 | long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, |
1081 | size_t len, unsigned int flags) | 1096 | size_t len, unsigned int flags) |
1082 | { | 1097 | { |
1083 | struct splice_desc sd = { | 1098 | struct splice_desc sd = { |
1084 | .len = len, | 1099 | .len = len, |
1085 | .total_len = len, | 1100 | .total_len = len, |
1086 | .flags = flags, | 1101 | .flags = flags, |
1087 | .pos = *ppos, | 1102 | .pos = *ppos, |
1088 | .u.file = out, | 1103 | .u.file = out, |
1089 | }; | 1104 | }; |
1090 | long ret; | 1105 | long ret; |
1091 | 1106 | ||
1092 | ret = splice_direct_to_actor(in, &sd, direct_splice_actor); | 1107 | ret = splice_direct_to_actor(in, &sd, direct_splice_actor); |
1093 | if (ret > 0) | 1108 | if (ret > 0) |
1094 | *ppos = sd.pos; | 1109 | *ppos = sd.pos; |
1095 | 1110 | ||
1096 | return ret; | 1111 | return ret; |
1097 | } | 1112 | } |
1098 | 1113 | ||
1099 | /* | 1114 | /* |
1100 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same | 1115 | * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same |
1101 | * location, so checking ->i_pipe is not enough to verify that this is a | 1116 | * location, so checking ->i_pipe is not enough to verify that this is a |
1102 | * pipe. | 1117 | * pipe. |
1103 | */ | 1118 | */ |
1104 | static inline struct pipe_inode_info *pipe_info(struct inode *inode) | 1119 | static inline struct pipe_inode_info *pipe_info(struct inode *inode) |
1105 | { | 1120 | { |
1106 | if (S_ISFIFO(inode->i_mode)) | 1121 | if (S_ISFIFO(inode->i_mode)) |
1107 | return inode->i_pipe; | 1122 | return inode->i_pipe; |
1108 | 1123 | ||
1109 | return NULL; | 1124 | return NULL; |
1110 | } | 1125 | } |
1111 | 1126 | ||
/*
 * Determine where to splice to/from.  Exactly one end must be a pipe:
 * pipe->file uses do_splice_from(), file->pipe uses do_splice_to().
 * An explicit user offset (off_in/off_out) may only be given for the
 * non-pipe end.  Returns bytes spliced or a negative errno.
 */
static long do_splice(struct file *in, loff_t __user *off_in,
		      struct file *out, loff_t __user *off_out,
		      size_t len, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	loff_t offset, *off;
	long ret;

	/* pipe -> file? */
	pipe = pipe_info(in->f_path.dentry->d_inode);
	if (pipe) {
		/* a pipe has no seekable position of its own */
		if (off_in)
			return -ESPIPE;
		if (off_out) {
			/* target must be seekable to honor an offset */
			if (out->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_out, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &out->f_pos;

		ret = do_splice_from(pipe, out, off, len, flags);

		/* hand the updated offset back to userspace */
		if (off_out && copy_to_user(off_out, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* file -> pipe? */
	pipe = pipe_info(out->f_path.dentry->d_inode);
	if (pipe) {
		if (off_out)
			return -ESPIPE;
		if (off_in) {
			/* source must be seekable to honor an offset */
			if (in->f_op->llseek == no_llseek)
				return -EINVAL;
			if (copy_from_user(&offset, off_in, sizeof(loff_t)))
				return -EFAULT;
			off = &offset;
		} else
			off = &in->f_pos;

		ret = do_splice_to(in, off, pipe, len, flags);

		/* hand the updated offset back to userspace */
		if (off_in && copy_to_user(off_in, off, sizeof(loff_t)))
			ret = -EFAULT;

		return ret;
	}

	/* neither end is a pipe */
	return -EINVAL;
}
1167 | 1182 | ||
/*
 * Map an iov into an array of pages and offset/length tupples. With the
 * partial_page structure, we can map several non-contiguous ranges into
 * our ones pages[] map instead of splitting that operation into pieces.
 * Could easily be exported as a generic helper for other users, in which
 * case one would probably want to add a 'max_nr_pages' parameter as well.
 *
 * Returns the number of partial-page buffers filled in, or a negative
 * errno if nothing could be mapped.
 */
static int get_iovec_page_array(const struct iovec __user *iov,
				unsigned int nr_vecs, struct page **pages,
				struct partial_page *partial, int aligned)
{
	int buffers = 0, error = 0;

	while (nr_vecs) {
		unsigned long off, npages;
		struct iovec entry;
		void __user *base;
		size_t len;
		int i;

		error = -EFAULT;
		if (copy_from_user(&entry, iov, sizeof(entry)))
			break;

		base = entry.iov_base;
		len = entry.iov_len;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		error = 0;
		if (unlikely(!len))
			break;
		error = -EFAULT;
		if (!access_ok(VERIFY_READ, base, len))
			break;

		/*
		 * Get this base offset and number of pages, then map
		 * in the user pages.
		 */
		off = (unsigned long) base & ~PAGE_MASK;

		/*
		 * If asked for alignment, the offset must be zero and the
		 * length a multiple of the PAGE_SIZE.
		 */
		error = -EINVAL;
		if (aligned && (off || len & ~PAGE_MASK))
			break;

		/* Clamp to the room left in the caller's pages[] array. */
		npages = (off + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
		if (npages > PIPE_BUFFERS - buffers)
			npages = PIPE_BUFFERS - buffers;

		/* On success, 'error' holds the number of pages pinned. */
		error = get_user_pages_fast((unsigned long)base, npages,
					0, &pages[buffers]);

		if (unlikely(error <= 0))
			break;

		/*
		 * Fill this contiguous range into the partial page map.
		 */
		for (i = 0; i < error; i++) {
			const int plen = min_t(size_t, len, PAGE_SIZE - off);

			partial[buffers].offset = off;
			partial[buffers].len = plen;

			/* only the first page has a nonzero offset */
			off = 0;
			len -= plen;
			buffers++;
		}

		/*
		 * We didn't complete this iov, stop here since it probably
		 * means we have to move some of this into a pipe to
		 * be able to continue.
		 */
		if (len)
			break;

		/*
		 * Don't continue if we mapped fewer pages than we asked for,
		 * or if we mapped the max number of pages that we have
		 * room for.
		 */
		if (error < npages || buffers == PIPE_BUFFERS)
			break;

		nr_vecs--;
		iov++;
	}

	/* Partial success wins over a trailing error. */
	if (buffers)
		return buffers;

	return error;
}
1268 | 1283 | ||
/*
 * Splice actor: copy one pipe buffer to the user address held in
 * sd->u.userptr.  Tries a fast atomic copy first, falling back to a
 * sleeping copy.  Returns bytes copied (advancing the user pointer) or
 * a negative errno.
 */
static int pipe_to_user(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
			struct splice_desc *sd)
{
	char *src;
	int ret;

	/* Make sure the buffer contents are ready to be read. */
	ret = buf->ops->confirm(pipe, buf);
	if (unlikely(ret))
		return ret;

	/*
	 * See if we can use the atomic maps, by prefaulting in the
	 * pages and doing an atomic copy
	 */
	if (!fault_in_pages_writeable(sd->u.userptr, sd->len)) {
		src = buf->ops->map(pipe, buf, 1);
		ret = __copy_to_user_inatomic(sd->u.userptr, src + buf->offset,
							sd->len);
		buf->ops->unmap(pipe, buf, src);
		if (!ret) {
			ret = sd->len;
			goto out;
		}
		/* atomic copy faulted; fall through to the slow path */
	}

	/*
	 * No dice, use slow non-atomic map and copy
	 */
	src = buf->ops->map(pipe, buf, 0);

	ret = sd->len;
	if (copy_to_user(sd->u.userptr, src + buf->offset, sd->len))
		ret = -EFAULT;

	buf->ops->unmap(pipe, buf, src);
out:
	if (ret > 0)
		sd->u.userptr += ret;
	return ret;
}
1309 | 1324 | ||
/*
 * For lack of a better implementation, implement vmsplice() to userspace
 * as a simple copy of the pipes pages to the user iov.
 *
 * Returns total bytes copied, or a negative errno if nothing was copied.
 */
static long vmsplice_to_user(struct file *file, const struct iovec __user *iov,
			     unsigned long nr_segs, unsigned int flags)
{
	struct pipe_inode_info *pipe;
	struct splice_desc sd;
	ssize_t size;
	int error;
	long ret;

	pipe = pipe_info(file->f_path.dentry->d_inode);
	if (!pipe)
		return -EBADF;

	/* Lock only when the pipe has a backing inode. */
	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	error = ret = 0;
	while (nr_segs) {
		void __user *base;
		size_t len;

		/*
		 * Get user address base and length for this iovec.
		 */
		error = get_user(base, &iov->iov_base);
		if (unlikely(error))
			break;
		error = get_user(len, &iov->iov_len);
		if (unlikely(error))
			break;

		/*
		 * Sanity check this iovec. 0 read succeeds.
		 */
		if (unlikely(!len))
			break;
		if (unlikely(!base)) {
			error = -EFAULT;
			break;
		}

		if (unlikely(!access_ok(VERIFY_WRITE, base, len))) {
			error = -EFAULT;
			break;
		}

		/* Splice up to 'len' bytes of pipe data to this segment. */
		sd.len = 0;
		sd.total_len = len;
		sd.flags = flags;
		sd.u.userptr = base;
		sd.pos = 0;

		size = __splice_from_pipe(pipe, &sd, pipe_to_user);
		if (size < 0) {
			/* report the error only if nothing was copied yet */
			if (!ret)
				ret = size;

			break;
		}

		ret += size;

		/* partial segment fill means the pipe ran dry; stop */
		if (size < len)
			break;

		nr_segs--;
		iov++;
	}

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

	/* no data copied: surface any pending error instead */
	if (!ret)
		ret = error;

	return ret;
}
1391 | 1406 | ||
1392 | /* | 1407 | /* |
1393 | * vmsplice splices a user address range into a pipe. It can be thought of | 1408 | * vmsplice splices a user address range into a pipe. It can be thought of |
1394 | * as splice-from-memory, where the regular splice is splice-from-file (or | 1409 | * as splice-from-memory, where the regular splice is splice-from-file (or |
1395 | * to file). In both cases the output is a pipe, naturally. | 1410 | * to file). In both cases the output is a pipe, naturally. |
1396 | */ | 1411 | */ |
1397 | static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, | 1412 | static long vmsplice_to_pipe(struct file *file, const struct iovec __user *iov, |
1398 | unsigned long nr_segs, unsigned int flags) | 1413 | unsigned long nr_segs, unsigned int flags) |
1399 | { | 1414 | { |
1400 | struct pipe_inode_info *pipe; | 1415 | struct pipe_inode_info *pipe; |
1401 | struct page *pages[PIPE_BUFFERS]; | 1416 | struct page *pages[PIPE_BUFFERS]; |
1402 | struct partial_page partial[PIPE_BUFFERS]; | 1417 | struct partial_page partial[PIPE_BUFFERS]; |
1403 | struct splice_pipe_desc spd = { | 1418 | struct splice_pipe_desc spd = { |
1404 | .pages = pages, | 1419 | .pages = pages, |
1405 | .partial = partial, | 1420 | .partial = partial, |
1406 | .flags = flags, | 1421 | .flags = flags, |
1407 | .ops = &user_page_pipe_buf_ops, | 1422 | .ops = &user_page_pipe_buf_ops, |
1408 | .spd_release = spd_release_page, | 1423 | .spd_release = spd_release_page, |
1409 | }; | 1424 | }; |
1410 | 1425 | ||
1411 | pipe = pipe_info(file->f_path.dentry->d_inode); | 1426 | pipe = pipe_info(file->f_path.dentry->d_inode); |
1412 | if (!pipe) | 1427 | if (!pipe) |
1413 | return -EBADF; | 1428 | return -EBADF; |
1414 | 1429 | ||
1415 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, | 1430 | spd.nr_pages = get_iovec_page_array(iov, nr_segs, pages, partial, |
1416 | flags & SPLICE_F_GIFT); | 1431 | flags & SPLICE_F_GIFT); |
1417 | if (spd.nr_pages <= 0) | 1432 | if (spd.nr_pages <= 0) |
1418 | return spd.nr_pages; | 1433 | return spd.nr_pages; |
1419 | 1434 | ||
1420 | return splice_to_pipe(pipe, &spd); | 1435 | return splice_to_pipe(pipe, &spd); |
1421 | } | 1436 | } |
1422 | 1437 | ||
1423 | /* | 1438 | /* |
1424 | * Note that vmsplice only really supports true splicing _from_ user memory | 1439 | * Note that vmsplice only really supports true splicing _from_ user memory |
1425 | * to a pipe, not the other way around. Splicing from user memory is a simple | 1440 | * to a pipe, not the other way around. Splicing from user memory is a simple |
1426 | * operation that can be supported without any funky alignment restrictions | 1441 | * operation that can be supported without any funky alignment restrictions |
1427 | * or nasty vm tricks. We simply map in the user memory and fill them into | 1442 | * or nasty vm tricks. We simply map in the user memory and fill them into |
1428 | * a pipe. The reverse isn't quite as easy, though. There are two possible | 1443 | * a pipe. The reverse isn't quite as easy, though. There are two possible |
1429 | * solutions for that: | 1444 | * solutions for that: |
1430 | * | 1445 | * |
1431 | * - memcpy() the data internally, at which point we might as well just | 1446 | * - memcpy() the data internally, at which point we might as well just |
1432 | * do a regular read() on the buffer anyway. | 1447 | * do a regular read() on the buffer anyway. |
1433 | * - Lots of nasty vm tricks, that are neither fast nor flexible (it | 1448 | * - Lots of nasty vm tricks, that are neither fast nor flexible (it |
1434 | * has restriction limitations on both ends of the pipe). | 1449 | * has restriction limitations on both ends of the pipe). |
1435 | * | 1450 | * |
1436 | * Currently we punt and implement it as a normal copy, see pipe_to_user(). | 1451 | * Currently we punt and implement it as a normal copy, see pipe_to_user(). |
1437 | * | 1452 | * |
1438 | */ | 1453 | */ |
1439 | SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, | 1454 | SYSCALL_DEFINE4(vmsplice, int, fd, const struct iovec __user *, iov, |
1440 | unsigned long, nr_segs, unsigned int, flags) | 1455 | unsigned long, nr_segs, unsigned int, flags) |
1441 | { | 1456 | { |
1442 | struct file *file; | 1457 | struct file *file; |
1443 | long error; | 1458 | long error; |
1444 | int fput; | 1459 | int fput; |
1445 | 1460 | ||
1446 | if (unlikely(nr_segs > UIO_MAXIOV)) | 1461 | if (unlikely(nr_segs > UIO_MAXIOV)) |
1447 | return -EINVAL; | 1462 | return -EINVAL; |
1448 | else if (unlikely(!nr_segs)) | 1463 | else if (unlikely(!nr_segs)) |
1449 | return 0; | 1464 | return 0; |
1450 | 1465 | ||
1451 | error = -EBADF; | 1466 | error = -EBADF; |
1452 | file = fget_light(fd, &fput); | 1467 | file = fget_light(fd, &fput); |
1453 | if (file) { | 1468 | if (file) { |
1454 | if (file->f_mode & FMODE_WRITE) | 1469 | if (file->f_mode & FMODE_WRITE) |
1455 | error = vmsplice_to_pipe(file, iov, nr_segs, flags); | 1470 | error = vmsplice_to_pipe(file, iov, nr_segs, flags); |
1456 | else if (file->f_mode & FMODE_READ) | 1471 | else if (file->f_mode & FMODE_READ) |
1457 | error = vmsplice_to_user(file, iov, nr_segs, flags); | 1472 | error = vmsplice_to_user(file, iov, nr_segs, flags); |
1458 | 1473 | ||
1459 | fput_light(file, fput); | 1474 | fput_light(file, fput); |
1460 | } | 1475 | } |
1461 | 1476 | ||
1462 | return error; | 1477 | return error; |
1463 | } | 1478 | } |
1464 | 1479 | ||
1465 | SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, | 1480 | SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, |
1466 | int, fd_out, loff_t __user *, off_out, | 1481 | int, fd_out, loff_t __user *, off_out, |
1467 | size_t, len, unsigned int, flags) | 1482 | size_t, len, unsigned int, flags) |
1468 | { | 1483 | { |
1469 | long error; | 1484 | long error; |
1470 | struct file *in, *out; | 1485 | struct file *in, *out; |
1471 | int fput_in, fput_out; | 1486 | int fput_in, fput_out; |
1472 | 1487 | ||
1473 | if (unlikely(!len)) | 1488 | if (unlikely(!len)) |
1474 | return 0; | 1489 | return 0; |
1475 | 1490 | ||
1476 | error = -EBADF; | 1491 | error = -EBADF; |
1477 | in = fget_light(fd_in, &fput_in); | 1492 | in = fget_light(fd_in, &fput_in); |
1478 | if (in) { | 1493 | if (in) { |
1479 | if (in->f_mode & FMODE_READ) { | 1494 | if (in->f_mode & FMODE_READ) { |
1480 | out = fget_light(fd_out, &fput_out); | 1495 | out = fget_light(fd_out, &fput_out); |
1481 | if (out) { | 1496 | if (out) { |
1482 | if (out->f_mode & FMODE_WRITE) | 1497 | if (out->f_mode & FMODE_WRITE) |
1483 | error = do_splice(in, off_in, | 1498 | error = do_splice(in, off_in, |
1484 | out, off_out, | 1499 | out, off_out, |
1485 | len, flags); | 1500 | len, flags); |
1486 | fput_light(out, fput_out); | 1501 | fput_light(out, fput_out); |
1487 | } | 1502 | } |
1488 | } | 1503 | } |
1489 | 1504 | ||
1490 | fput_light(in, fput_in); | 1505 | fput_light(in, fput_in); |
1491 | } | 1506 | } |
1492 | 1507 | ||
1493 | return error; | 1508 | return error; |
1494 | } | 1509 | } |
1495 | 1510 | ||
1496 | /* | 1511 | /* |
1497 | * Make sure there's data to read. Wait for input if we can, otherwise | 1512 | * Make sure there's data to read. Wait for input if we can, otherwise |
1498 | * return an appropriate error. | 1513 | * return an appropriate error. |
1499 | */ | 1514 | */ |
1500 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1515 | static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
1501 | { | 1516 | { |
1502 | int ret; | 1517 | int ret; |
1503 | 1518 | ||
1504 | /* | 1519 | /* |
1505 | * Check ->nrbufs without the inode lock first. This function | 1520 | * Check ->nrbufs without the inode lock first. This function |
1506 | * is speculative anyways, so missing one is ok. | 1521 | * is speculative anyways, so missing one is ok. |
1507 | */ | 1522 | */ |
1508 | if (pipe->nrbufs) | 1523 | if (pipe->nrbufs) |
1509 | return 0; | 1524 | return 0; |
1510 | 1525 | ||
1511 | ret = 0; | 1526 | ret = 0; |
1512 | mutex_lock(&pipe->inode->i_mutex); | 1527 | mutex_lock(&pipe->inode->i_mutex); |
1513 | 1528 | ||
1514 | while (!pipe->nrbufs) { | 1529 | while (!pipe->nrbufs) { |
1515 | if (signal_pending(current)) { | 1530 | if (signal_pending(current)) { |
1516 | ret = -ERESTARTSYS; | 1531 | ret = -ERESTARTSYS; |
1517 | break; | 1532 | break; |
1518 | } | 1533 | } |
1519 | if (!pipe->writers) | 1534 | if (!pipe->writers) |
1520 | break; | 1535 | break; |
1521 | if (!pipe->waiting_writers) { | 1536 | if (!pipe->waiting_writers) { |
1522 | if (flags & SPLICE_F_NONBLOCK) { | 1537 | if (flags & SPLICE_F_NONBLOCK) { |
1523 | ret = -EAGAIN; | 1538 | ret = -EAGAIN; |
1524 | break; | 1539 | break; |
1525 | } | 1540 | } |
1526 | } | 1541 | } |
1527 | pipe_wait(pipe); | 1542 | pipe_wait(pipe); |
1528 | } | 1543 | } |
1529 | 1544 | ||
1530 | mutex_unlock(&pipe->inode->i_mutex); | 1545 | mutex_unlock(&pipe->inode->i_mutex); |
1531 | return ret; | 1546 | return ret; |
1532 | } | 1547 | } |
1533 | 1548 | ||
1534 | /* | 1549 | /* |
1535 | * Make sure there's writeable room. Wait for room if we can, otherwise | 1550 | * Make sure there's writeable room. Wait for room if we can, otherwise |
1536 | * return an appropriate error. | 1551 | * return an appropriate error. |
1537 | */ | 1552 | */ |
1538 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) | 1553 | static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) |
1539 | { | 1554 | { |
1540 | int ret; | 1555 | int ret; |
1541 | 1556 | ||
1542 | /* | 1557 | /* |
1543 | * Check ->nrbufs without the inode lock first. This function | 1558 | * Check ->nrbufs without the inode lock first. This function |
1544 | * is speculative anyways, so missing one is ok. | 1559 | * is speculative anyways, so missing one is ok. |
1545 | */ | 1560 | */ |
1546 | if (pipe->nrbufs < PIPE_BUFFERS) | 1561 | if (pipe->nrbufs < PIPE_BUFFERS) |
1547 | return 0; | 1562 | return 0; |
1548 | 1563 | ||
1549 | ret = 0; | 1564 | ret = 0; |
1550 | mutex_lock(&pipe->inode->i_mutex); | 1565 | mutex_lock(&pipe->inode->i_mutex); |
1551 | 1566 | ||
1552 | while (pipe->nrbufs >= PIPE_BUFFERS) { | 1567 | while (pipe->nrbufs >= PIPE_BUFFERS) { |
1553 | if (!pipe->readers) { | 1568 | if (!pipe->readers) { |
1554 | send_sig(SIGPIPE, current, 0); | 1569 | send_sig(SIGPIPE, current, 0); |
1555 | ret = -EPIPE; | 1570 | ret = -EPIPE; |
1556 | break; | 1571 | break; |
1557 | } | 1572 | } |
1558 | if (flags & SPLICE_F_NONBLOCK) { | 1573 | if (flags & SPLICE_F_NONBLOCK) { |
1559 | ret = -EAGAIN; | 1574 | ret = -EAGAIN; |
1560 | break; | 1575 | break; |
1561 | } | 1576 | } |
1562 | if (signal_pending(current)) { | 1577 | if (signal_pending(current)) { |
1563 | ret = -ERESTARTSYS; | 1578 | ret = -ERESTARTSYS; |
1564 | break; | 1579 | break; |
1565 | } | 1580 | } |
1566 | pipe->waiting_writers++; | 1581 | pipe->waiting_writers++; |
1567 | pipe_wait(pipe); | 1582 | pipe_wait(pipe); |
1568 | pipe->waiting_writers--; | 1583 | pipe->waiting_writers--; |
1569 | } | 1584 | } |
1570 | 1585 | ||
1571 | mutex_unlock(&pipe->inode->i_mutex); | 1586 | mutex_unlock(&pipe->inode->i_mutex); |
1572 | return ret; | 1587 | return ret; |
1573 | } | 1588 | } |
1574 | 1589 | ||
/*
 * Link contents of ipipe to opipe: duplicate up to @len bytes worth of
 * ipipe's buffers onto the tail of opipe by taking extra references on
 * the underlying pages — no data is copied.  Returns the number of bytes
 * linked, -EPIPE if opipe has no readers, or -EAGAIN (NONBLOCK only)
 * when nothing could be linked but a writer may yet produce data.
 */
static int link_pipe(struct pipe_inode_info *ipipe,
		     struct pipe_inode_info *opipe,
		     size_t len, unsigned int flags)
{
	struct pipe_buffer *ibuf, *obuf;
	int ret = 0, i = 0, nbuf;

	/*
	 * Potential ABBA deadlock, work around it by ordering lock
	 * grabbing by inode address. Otherwise two different processes
	 * could deadlock (one doing tee from A -> B, the other from B -> A).
	 */
	inode_double_lock(ipipe->inode, opipe->inode);

	do {
		/* Re-check readers each pass; they can leave while we loop. */
		if (!opipe->readers) {
			send_sig(SIGPIPE, current, 0);
			if (!ret)
				ret = -EPIPE;
			break;
		}

		/*
		 * If we have iterated all input buffers or ran out of
		 * output room, break.
		 */
		if (i >= ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS)
			break;

		/* i-th unconsumed input buffer, and the next free output slot. */
		ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1));
		nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1);

		/*
		 * Get a reference to this pipe buffer,
		 * so we can copy the contents over.
		 */
		ibuf->ops->get(ipipe, ibuf);

		/* Shallow-copy the buffer descriptor; the page is now shared. */
		obuf = opipe->bufs + nbuf;
		*obuf = *ibuf;

		/*
		 * Don't inherit the gift flag, we need to
		 * prevent multiple steals of this page.
		 */
		obuf->flags &= ~PIPE_BUF_FLAG_GIFT;

		/* Clamp the final buffer so we never link more than @len bytes. */
		if (obuf->len > len)
			obuf->len = len;

		opipe->nrbufs++;
		ret += obuf->len;
		len -= obuf->len;
		i++;
	} while (len);

	/*
	 * return EAGAIN if we have the potential of some data in the
	 * future, otherwise just return 0
	 */
	if (!ret && ipipe->waiting_writers && (flags & SPLICE_F_NONBLOCK))
		ret = -EAGAIN;

	inode_double_unlock(ipipe->inode, opipe->inode);

	/*
	 * If we put data in the output pipe, wakeup any potential readers.
	 */
	if (ret > 0) {
		smp_mb();
		if (waitqueue_active(&opipe->wait))
			wake_up_interruptible(&opipe->wait);
		kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN);
	}

	return ret;
}
1655 | 1670 | ||
1656 | /* | 1671 | /* |
1657 | * This is a tee(1) implementation that works on pipes. It doesn't copy | 1672 | * This is a tee(1) implementation that works on pipes. It doesn't copy |
1658 | * any data, it simply references the 'in' pages on the 'out' pipe. | 1673 | * any data, it simply references the 'in' pages on the 'out' pipe. |
1659 | * The 'flags' used are the SPLICE_F_* variants, currently the only | 1674 | * The 'flags' used are the SPLICE_F_* variants, currently the only |
1660 | * applicable one is SPLICE_F_NONBLOCK. | 1675 | * applicable one is SPLICE_F_NONBLOCK. |
1661 | */ | 1676 | */ |
1662 | static long do_tee(struct file *in, struct file *out, size_t len, | 1677 | static long do_tee(struct file *in, struct file *out, size_t len, |
1663 | unsigned int flags) | 1678 | unsigned int flags) |
1664 | { | 1679 | { |
1665 | struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); | 1680 | struct pipe_inode_info *ipipe = pipe_info(in->f_path.dentry->d_inode); |
1666 | struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); | 1681 | struct pipe_inode_info *opipe = pipe_info(out->f_path.dentry->d_inode); |
1667 | int ret = -EINVAL; | 1682 | int ret = -EINVAL; |
1668 | 1683 | ||
1669 | /* | 1684 | /* |
1670 | * Duplicate the contents of ipipe to opipe without actually | 1685 | * Duplicate the contents of ipipe to opipe without actually |
1671 | * copying the data. | 1686 | * copying the data. |
1672 | */ | 1687 | */ |
1673 | if (ipipe && opipe && ipipe != opipe) { | 1688 | if (ipipe && opipe && ipipe != opipe) { |
1674 | /* | 1689 | /* |
1675 | * Keep going, unless we encounter an error. The ipipe/opipe | 1690 | * Keep going, unless we encounter an error. The ipipe/opipe |
1676 | * ordering doesn't really matter. | 1691 | * ordering doesn't really matter. |
1677 | */ | 1692 | */ |
1678 | ret = link_ipipe_prep(ipipe, flags); | 1693 | ret = link_ipipe_prep(ipipe, flags); |
1679 | if (!ret) { | 1694 | if (!ret) { |
1680 | ret = link_opipe_prep(opipe, flags); | 1695 | ret = link_opipe_prep(opipe, flags); |
1681 | if (!ret) | 1696 | if (!ret) |
1682 | ret = link_pipe(ipipe, opipe, len, flags); | 1697 | ret = link_pipe(ipipe, opipe, len, flags); |
1683 | } | 1698 | } |
1684 | } | 1699 | } |
1685 | 1700 | ||
1686 | return ret; | 1701 | return ret; |
1687 | } | 1702 | } |
1688 | 1703 | ||
1689 | SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) | 1704 | SYSCALL_DEFINE4(tee, int, fdin, int, fdout, size_t, len, unsigned int, flags) |
1690 | { | 1705 | { |
1691 | struct file *in; | 1706 | struct file *in; |
1692 | int error, fput_in; | 1707 | int error, fput_in; |
1693 | 1708 | ||
1694 | if (unlikely(!len)) | 1709 | if (unlikely(!len)) |
1695 | return 0; | 1710 | return 0; |
1696 | 1711 | ||
1697 | error = -EBADF; | 1712 | error = -EBADF; |
1698 | in = fget_light(fdin, &fput_in); | 1713 | in = fget_light(fdin, &fput_in); |
1699 | if (in) { | 1714 | if (in) { |
1700 | if (in->f_mode & FMODE_READ) { | 1715 | if (in->f_mode & FMODE_READ) { |
1701 | int fput_out; | 1716 | int fput_out; |
1702 | struct file *out = fget_light(fdout, &fput_out); | 1717 | struct file *out = fget_light(fdout, &fput_out); |
1703 | 1718 | ||
1704 | if (out) { | 1719 | if (out) { |
1705 | if (out->f_mode & FMODE_WRITE) | 1720 | if (out->f_mode & FMODE_WRITE) |
1706 | error = do_tee(in, out, len, flags); | 1721 | error = do_tee(in, out, len, flags); |
1707 | fput_light(out, fput_out); | 1722 | fput_light(out, fput_out); |
1708 | } | 1723 | } |
1709 | } | 1724 | } |
1710 | fput_light(in, fput_in); | 1725 | fput_light(in, fput_in); |
1711 | } | 1726 | } |
1712 | 1727 | ||
1713 | return error; | 1728 | return error; |
1714 | } | 1729 | } |
1715 | 1730 |